Sai16216 committed on
Commit
9fabae0
·
verified ·
1 Parent(s): 1167820

Delete app_ex.py

Browse files
Files changed (1) hide show
  1. app_ex.py +0 -269
app_ex.py DELETED
@@ -1,269 +0,0 @@
1
- import os
2
- import re
3
- import gc
4
- import torch
5
- import gradio as gr
6
- import numpy as np
7
- import faiss
8
- import nltk
9
- from dotenv import load_dotenv
10
- from PyPDF2 import PdfReader
11
- from transformers import (
12
- MarianMTModel,
13
- MarianTokenizer,
14
- AutoTokenizer,
15
- AutoModelForSeq2SeqLM,
16
- pipeline,
17
- )
18
- from sentence_transformers import SentenceTransformer
19
-
20
# Download the NLTK "punkt_tab" resource at import time; the English sentence
# splitter below goes through nltk.sent_tokenize.
nltk.download("punkt_tab")

# Load variables from a .env file (if any) into the process environment.
load_dotenv()
# The translation and summarization models below are moved to this device.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Embeddings & QA
# Sentence embedder used to index PDF chunks and to embed incoming questions.
embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
# Extractive QA pipeline. NOTE(review): no device is passed here, unlike the
# models below — confirm whether leaving it on the default device is intended.
qa_pipeline = pipeline("question-answering", model="deepset/roberta-base-squad2")

# Translation models:
# English -> Hindi (fine-tuned Marian model; used for summary -> Hindi)
# NOTE(review): "saved_model_nlp" is presumably a local model directory — confirm.
en_hi_model_name = "saved_model_nlp"
translator_en_hi_model = MarianMTModel.from_pretrained(en_hi_model_name).to(device)
translator_en_hi_tokenizer = MarianTokenizer.from_pretrained(en_hi_model_name)

# Hindi -> English (Helsinki model to convert input Hindi PDF to English)
hi_en_model_name = "Helsinki-NLP/opus-mt-hi-en"
translator_hi_en_model = MarianMTModel.from_pretrained(hi_en_model_name).to(device)
translator_hi_en_tokenizer = MarianTokenizer.from_pretrained(hi_en_model_name)

# Summarizer
# NOTE(review): the variables say "bart", but the checkpoint name suggests an
# LED (Longformer Encoder-Decoder) model — confirm and consider renaming.
bart_model_name = "pszemraj/led-large-book-summary"
bart_tokenizer = AutoTokenizer.from_pretrained(bart_model_name)
bart_model = AutoModelForSeq2SeqLM.from_pretrained(bart_model_name).to(device)

# Mutable state for the QA tab: written by upload_pdf, read by get_answer.
pdf_text = ""      # full text of the most recently uploaded PDF
text_chunks = []   # overlapping character chunks of pdf_text
index = None       # FAISS index over the chunk embeddings; None until a PDF is processed
48
-
49
- # QA
50
def extract_text_from_pdf(file_path):
    """Return the concatenated text of every page in the PDF at *file_path*.

    Each page's text is followed by a newline. Pages for which the reader
    returns no text (``None`` or an empty string) are skipped entirely.
    """
    document = PdfReader(file_path)
    extracted_pages = (page.extract_text() for page in document.pages)
    return "".join(page_text + "\n" for page_text in extracted_pages if page_text)
58
-
59
-
60
def chunk_text(text, chunk_size=500, overlap=100):
    """Split *text* into fixed-size character windows that overlap.

    Consecutive chunks start ``chunk_size - overlap`` characters apart, so
    each chunk begins with the last ``overlap`` characters of the previous
    one. The final chunk may be shorter than ``chunk_size``.

    Args:
        text: The string to split. An empty string yields an empty list.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters shared by neighbouring chunks; must
            satisfy ``0 <= overlap < chunk_size``.

    Returns:
        A list of chunk strings in document order.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range. Without
            this check ``overlap >= chunk_size`` would never advance the
            window (an infinite loop), and a negative overlap would silently
            skip text between chunks.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    step = chunk_size - overlap
    # Slicing clamps at the end of the string, so no explicit min() is needed.
    return [text[start:start + chunk_size] for start in range(0, len(text), step)]
69
-
70
-
71
def build_faiss_index(chunks, embedder):
    """Embed *chunks* and store the vectors in a flat L2 FAISS index.

    Returns a ``(index, embeddings)`` tuple, where ``embeddings`` is the
    float32 matrix of the encoded chunks.
    """
    raw_vectors = embedder.encode(chunks)
    vector_dim = raw_vectors.shape[1]
    vectors = np.array(raw_vectors, dtype=np.float32)

    flat_index = faiss.IndexFlatL2(vector_dim)
    flat_index.add(vectors)
    return flat_index, vectors
77
-
78
-
79
def is_devanagari(text: str, threshold: float = 0.02) -> bool:
    """Tell whether *text* should be treated as a Hindi/Devanagari document.

    Computes the fraction of characters in *text* that fall in the Unicode
    range U+0900..U+097F and returns True when that fraction is strictly
    greater than *threshold*. Empty input is never Devanagari.
    """
    if text:
        devanagari_hits = sum(1 for _ in re.finditer(r"[\u0900-\u097F]", text))
        return devanagari_hits / len(text) > threshold
    return False
88
-
89
-
90
def sentence_tokenize_english(text: str):
    """Split English *text* into sentences using ``nltk.sent_tokenize``."""
    sentences = nltk.sent_tokenize(text)
    return sentences
92
-
93
-
94
def sentence_tokenize_hindi(text: str):
    """Split Hindi *text* into sentences.

    A sentence boundary is a danda, full stop, question mark or exclamation
    mark followed by at least one whitespace character; the terminator and
    the whitespace are dropped. Pieces are stripped, and blank pieces are
    discarded.
    """
    stripped_pieces = (piece.strip() for piece in re.split(r"[।\.\?\!]\s+", text))
    return [sentence for sentence in stripped_pieces if sentence]
98
-
99
-
100
def _batch_translate(sentences, tokenizer, model, batch_size):
    """Translate *sentences* in order, ``batch_size`` at a time, with one tokenizer/model pair."""
    translated = []
    for start in range(0, len(sentences), batch_size):
        batch = sentences[start:start + batch_size]
        # Pad to the longest sentence in the batch; anything beyond 512 tokens is truncated.
        encoded = tokenizer(
            batch,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=512,
        ).to(device)
        with torch.no_grad():
            generated = model.generate(**encoded, max_length=512)
        translated.extend(
            tokenizer.decode(token_ids, skip_special_tokens=True) for token_ids in generated
        )
    return translated


def batch_translate_hi_to_en(sentences, batch_size=16):
    """
    Translate a list of Hindi sentences -> English using Helsinki model in batches.
    Returns list of translated strings in same order.
    """
    if not sentences:
        return []
    return _batch_translate(
        sentences, translator_hi_en_tokenizer, translator_hi_en_model, batch_size
    )


def batch_translate_en_to_hi(sentences, batch_size=16):
    """
    Translate a list of English sentences -> Hindi using your saved_model_nlp (Marian).
    Returns list of translated strings in same order.
    """
    if not sentences:
        return []
    return _batch_translate(
        sentences, translator_en_hi_tokenizer, translator_en_hi_model, batch_size
    )
129
-
130
-
131
- # Upload + Process PDF(QA)
132
def upload_pdf(file):
    """Extract, chunk and index an uploaded PDF for question answering.

    On success the module-level ``pdf_text``, ``text_chunks`` and ``index``
    are replaced together. On failure they are reset together, so that
    ``get_answer`` never pairs an index from an earlier PDF with the chunks
    of a newer one.

    Args:
        file: The uploaded file: either an object exposing the path as
            ``.name`` or a plain path string. ``None`` means nothing was
            uploaded.

    Returns:
        A status message for display in the UI.
    """
    global pdf_text, text_chunks, index

    if file is None:
        return "❌ Please select a PDF file first."

    # Accept both a temp-file wrapper (path in .name) and a bare path string.
    path = getattr(file, "name", file)

    new_text = extract_text_from_pdf(path)
    # Whitespace-only extractions carry nothing worth indexing.
    new_chunks = chunk_text(new_text) if new_text.strip() else []
    if not new_chunks:
        pdf_text, text_chunks, index = "", [], None
        return "❌ Empty PDF or could not extract text."

    new_index, _ = build_faiss_index(new_chunks, embedder)
    # Commit all three pieces of state only once everything has succeeded.
    pdf_text, text_chunks, index = new_text, new_chunks, new_index
    return "✅ PDF uploaded and processed successfully! Ready for questions."
140
-
141
-
142
- # Answer Questions
143
def get_answer(question):
    """Answer *question* from the most recently indexed PDF.

    Retrieves up to three chunks nearest to the question embedding, joins
    them into a context and runs the extractive QA pipeline over it.

    Args:
        question: The user's question. Blank input is rejected up front
            rather than being passed on to the QA pipeline.

    Returns:
        A Markdown string with the answer, its confidence score and the
        first 500 characters of the retrieved context, or an error message
        when there is no question or no indexed PDF.
    """
    if not question or not question.strip():
        return "❌ Please enter a question."
    if index is None or not text_chunks:
        return "❌ Please upload a PDF first."

    # Never ask for more neighbours than there are chunks: FAISS pads missing
    # results with -1, which would otherwise index text_chunks from the end.
    top_k = min(3, len(text_chunks))
    q_emb = embedder.encode([question])
    _, neighbor_ids = index.search(np.array(q_emb, dtype=np.float32), k=top_k)
    relevant_text = " ".join(
        text_chunks[i] for i in neighbor_ids[0] if 0 <= i < len(text_chunks)
    )

    result = qa_pipeline(question=question, context=relevant_text)
    answer = result.get("answer", "")
    confidence = round(result.get("score", 0.0), 3)

    return (
        f"**Answer:** {answer}\n\n"
        f"**Confidence:** {confidence}\n\n"
        f"**Context Extract:**\n{relevant_text[:500]}..."
    )
161
-
162
-
163
- # BART Summarization(English)
164
def bart_summarize(text):
    """Summarize English *text* with the module-level summarization model.

    The input is truncated to 4096 tokens. Generation uses beam search with
    4 beams and produces between 80 and 2000 tokens. Length limits are
    passed to ``generate`` directly, so the shared model config is not
    mutated on each call.

    Args:
        text: English source text to summarize.

    Returns:
        The decoded summary string with special tokens removed.
    """
    inputs = bart_tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=4096,
    ).to(device)
    with torch.no_grad():
        summary_ids = bart_model.generate(
            inputs["input_ids"],
            # Pass the tokenizer's mask explicitly rather than having generate() infer one.
            attention_mask=inputs["attention_mask"],
            max_length=2000,
            min_length=80,
            num_beams=4,
            length_penalty=2.0,
        )
    return bart_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
181
-
182
-
183
def summarize_pdf_with_options(pdf_file, output_lang="english"):
    """Summarize the first 10 pages of a PDF in English or Hindi.

    Pipeline: extract text -> if the text is detected as Devanagari,
    translate it sentence by sentence to English -> summarize the first
    5000 characters -> split the summary into sentences -> optionally
    translate each sentence to Hindi.

    Args:
        pdf_file: Path or file object accepted by ``PdfReader``. ``None``
            means nothing was uploaded.
        output_lang: Any value starting with "eng" (case-insensitive)
            selects English output; anything else selects Hindi.

    Returns:
        The summary with one sentence per line, or a user-facing error
        message. Exceptions raised while processing are reported in the
        returned string rather than propagated.
    """
    if pdf_file is None:
        return "❌ Please upload a PDF first."

    try:
        reader = PdfReader(pdf_file)
        # Only the first 10 pages are used. Join pages with a newline so the
        # last word of one page is not glued to the first word of the next.
        text = "\n".join(page.extract_text() or "" for page in reader.pages[:10])

        if not text.strip():
            return "❌ Could not extract text from the PDF."

        # Hindi documents are translated to English sentence by sentence,
        # because the summarizer is fed English text.
        if is_devanagari(text):
            hindi_sentences = sentence_tokenize_hindi(text)
            english_source_text = " ".join(batch_translate_hi_to_en(hindi_sentences))
        else:
            english_source_text = text

        if not english_source_text.strip():
            return "❌ Could not extract text from the PDF."

        english_summary = bart_summarize(english_source_text[:5000])
        summary_sentences = sentence_tokenize_english(english_summary)

        # Hindi output: translate the English summary sentence by sentence.
        if not (output_lang or "english").lower().startswith("eng"):
            summary_sentences = batch_translate_en_to_hi(summary_sentences)

        # One sentence per line, blank entries dropped.
        return "\n".join(s.strip() for s in summary_sentences if s.strip())

    except Exception as e:
        # UI boundary: surface the failure to the user instead of crashing the handler.
        return f"⚠️ Error processing PDF: {e}"
230
-
231
- # UI
232
# Two-tab Gradio app: question answering over an uploaded PDF, and PDF
# summarization with a choice of English or Hindi output.
with gr.Blocks() as demo:
    gr.Markdown("# 📄 PDF Assist (QA + BART Summarizer — English/Hindi)")

    # PDF Question Answering
    with gr.Tab("🤖 PDF Question Answering"):
        gr.Markdown("Ask questions about your uploaded PDF document.")

        # Restricted to PDFs, consistent with the summarizer tab's upload widget.
        pdf_file = gr.File(label="📄 Upload PDF", file_types=[".pdf"])
        upload_btn = gr.Button("Process PDF")
        status = gr.Markdown()

        question_box = gr.Textbox(label="Ask a question")
        ask_btn = gr.Button("Get Answer")
        output_box = gr.Markdown()

        # Processing the PDF builds the index that get_answer searches.
        upload_btn.click(upload_pdf, inputs=pdf_file, outputs=status)
        ask_btn.click(get_answer, inputs=question_box, outputs=output_box)

    # Academic PDF Summarizer
    with gr.Tab("📚 Academic PDF Summarizer (English ↔ Hindi)"):
        gr.Markdown(
            "Upload an academic PDF (English or Hindi). The app auto-detects script. "
            "Choose output language"
        )

        pdf_input = gr.File(label="📎 Upload a PDF", file_types=[".pdf"])
        # The handler only checks whether the chosen label starts with "eng".
        output_choice = gr.Radio(
            choices=["English summary", "Hindi summary"],
            value="English summary",
            label="Choose output language",
        )
        summarize_btn = gr.Button("📑 Summarize")
        summarize_out = gr.Textbox(label="📘 Summary", lines=20)

        summarize_btn.click(
            fn=summarize_pdf_with_options,
            inputs=[pdf_input, output_choice],
            outputs=summarize_out,
        )

if __name__ == "__main__":
    # NOTE(review): share=True requests a public share link — confirm this is
    # wanted wherever the app is deployed.
    demo.launch(share=True)