Sai16216 committed · verified
Commit 1167820 · 1 Parent(s): 3badcdb

Create app.py

Files changed (1): app.py +278 -0
app.py ADDED
@@ -0,0 +1,278 @@
+ import os
+ import re
+ import gc
+ import torch
+ import gradio as gr
+ import numpy as np
+ import faiss
+ import nltk
+ from dotenv import load_dotenv
+ from PyPDF2 import PdfReader
+ from transformers import (
+     MarianMTModel,
+     MarianTokenizer,
+     AutoTokenizer,
+     AutoModelForSeq2SeqLM,
+     pipeline,
+ )
+ from sentence_transformers import SentenceTransformer
+
+ nltk.download("punkt_tab")
+
+ load_dotenv()
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+
+ # Embeddings & QA
+ embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
+ qa_pipeline = pipeline("question-answering", model="deepset/roberta-base-squad2")
+
+ # Translation models:
+ # English -> Hindi (fine-tuned Marian model; used for summary -> Hindi)
+ en_hi_model_name = "saved_model_nlp"
+ translator_en_hi_model = MarianMTModel.from_pretrained(en_hi_model_name).to(device)
+ translator_en_hi_tokenizer = MarianTokenizer.from_pretrained(en_hi_model_name)
+
+ # Hindi -> English (Helsinki model to convert the input Hindi PDF to English)
+ hi_en_model_name = "Helsinki-NLP/opus-mt-hi-en"
+ translator_hi_en_model = MarianMTModel.from_pretrained(hi_en_model_name).to(device)
+ translator_hi_en_tokenizer = MarianTokenizer.from_pretrained(hi_en_model_name)
+
+ # Summarizer (LED long-document model)
+ bart_model_name = "pszemraj/led-large-book-summary"
+ bart_tokenizer = AutoTokenizer.from_pretrained(bart_model_name)
+ bart_model = AutoModelForSeq2SeqLM.from_pretrained(bart_model_name).to(device)
+
+ pdf_text = ""
+ text_chunks = []
+ index = None
+
+ # QA
+ def extract_text_from_pdf(file_path):
+     reader = PdfReader(file_path)
+     text = ""
+     for page in reader.pages:
+         page_text = page.extract_text()
+         if page_text:
+             text += page_text + "\n"
+     doc_is_hindi = is_devanagari(text)
+     if doc_is_hindi:
+         # split into Hindi sentences
+         hindi_sentences = sentence_tokenize_hindi(text)
+         # translate in batches to English
+         english_sentences = batch_translate_hi_to_en(hindi_sentences)
+         english_source_text = " ".join(english_sentences)
+     else:
+         english_source_text = text
+     return english_source_text
+
+
+ def chunk_text(text, chunk_size=500, overlap=100):
+     chunks = []
+     start = 0
+     while start < len(text):
+         end = min(start + chunk_size, len(text))
+         chunk = text[start:end]
+         chunks.append(chunk)
+         start += chunk_size - overlap
+     return chunks
+
+
+ def build_faiss_index(chunks, embedder):
+     embeddings = embedder.encode(chunks)
+     dim = embeddings.shape[1]
+     index = faiss.IndexFlatL2(dim)
+     index.add(np.array(embeddings, dtype=np.float32))
+     return index, np.array(embeddings, dtype=np.float32)
+
+
+ def is_devanagari(text: str, threshold: float = 0.02) -> bool:
+     """
+     Return True if the share of Devanagari characters in the text exceeds the
+     threshold, in which case the document is treated as Hindi/Devanagari.
+     """
+     if not text:
+         return False
+     devanagari_count = len(re.findall(r"[\u0900-\u097F]", text))
+     return (devanagari_count / max(1, len(text))) > threshold
+
+
+ def sentence_tokenize_english(text: str):
+     return nltk.sent_tokenize(text)
+
+
+ def sentence_tokenize_hindi(text: str):
+     parts = re.split(r"[।\.\?\!]\s+", text)
+     parts = [p.strip() for p in parts if p and p.strip()]
+     return parts
+
+
+ def batch_translate_hi_to_en(sentences, batch_size=16):
+     """
+     Translate a list of Hindi sentences -> English using the Helsinki model in batches.
+     Returns the translated strings in the same order.
+     """
+     out = []
+     for i in range(0, len(sentences), batch_size):
+         batch = sentences[i : i + batch_size]
+         toks = translator_hi_en_tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)
+         with torch.no_grad():
+             gen = translator_hi_en_model.generate(**toks, max_length=512)
+         decoded = [translator_hi_en_tokenizer.decode(g, skip_special_tokens=True) for g in gen]
+         out.extend(decoded)
+     return out
+
+
+ def batch_translate_en_to_hi(sentences, batch_size=16):
+     """
+     Translate a list of English sentences -> Hindi using the fine-tuned saved_model_nlp (Marian).
+     """
+     out = []
+     for i in range(0, len(sentences), batch_size):
+         batch = sentences[i : i + batch_size]
+         toks = translator_en_hi_tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)
+         with torch.no_grad():
+             gen = translator_en_hi_model.generate(**toks, max_length=512)
+         decoded = [translator_en_hi_tokenizer.decode(g, skip_special_tokens=True) for g in gen]
+         out.extend(decoded)
+     return out
+
+
+ # Upload + Process PDF (QA)
+ def upload_pdf(file):
+     global pdf_text, text_chunks, index
+     pdf_text = extract_text_from_pdf(file.name)
+     text_chunks = chunk_text(pdf_text)
+     if len(text_chunks) == 0:
+         return "❌ Empty PDF or could not extract text."
+     index, _ = build_faiss_index(text_chunks, embedder)
+     return "✅ PDF uploaded and processed successfully! Ready for questions."
+
+
+ # Answer Questions
+ def get_answer(question):
+     global pdf_text, text_chunks, index
+     if index is None:
+         return "❌ Please upload a PDF first."
+
+     q_emb = embedder.encode([question])
+     D, I = index.search(np.array(q_emb, dtype=np.float32), k=3)
+     relevant_text = " ".join([text_chunks[i] for i in I[0]])
+
+     result = qa_pipeline(question=question, context=relevant_text)
+     answer = result.get("answer", "")
+     confidence = round(result.get("score", 0.0), 3)
+
+     return (
+         f"**Answer:** {answer}\n\n"
+         f"**Confidence:** {confidence}\n\n"
+         f"**Context Extract:**\n{relevant_text[:500]}..."
+     )
+
+
+ # BART Summarization (English)
+ def bart_summarize(text):
+     inputs = bart_tokenizer(
+         text,
+         return_tensors="pt",
+         truncation=True,
+         max_length=4096,
+     ).to(device)
+     bart_model.config.max_length = 4096
+     with torch.no_grad():
+         summary_ids = bart_model.generate(
+             inputs["input_ids"],
+             max_length=2000,
+             min_length=80,
+             num_beams=4,
+             length_penalty=2.0,
+         )
+     return bart_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
+
+
+ def summarize_pdf_with_options(pdf_file, output_lang="english"):
+     """
+     output_lang: "english" or "hindi"
+     """
+     try:
+         # Extract text
+         reader = PdfReader(pdf_file)
+         text = ""
+         for page in reader.pages[:10]:  # first 10 pages
+             text += page.extract_text() or ""
+
+         if not text.strip():
+             return "❌ Could not extract text from the PDF."
+
+         # Detect Devanagari (Hindi)
+         doc_is_hindi = is_devanagari(text)
+
+         # If the document is Hindi -> translate the whole text to English sentence-wise first
+         if doc_is_hindi:
+             # split into Hindi sentences
+             hindi_sentences = sentence_tokenize_hindi(text)
+             # translate in batches to English
+             english_sentences = batch_translate_hi_to_en(hindi_sentences)
+             # join for summarization
+             english_source_text = " ".join(english_sentences)
+         else:
+             english_source_text = text
+
+         # Summarize the English source text using BART
+         english_summary = bart_summarize(english_source_text[:5000])
+
+         # Sentence-tokenize the English summary
+         english_sentences_out = sentence_tokenize_english(english_summary)
+
+         if output_lang.lower().startswith("eng"):
+             # each sentence on a new line
+             lines = [s.strip() for s in english_sentences_out if s.strip()]
+             return "\n".join(lines)
+
+         # If the user wants Hindi output -> translate each English summary sentence to Hindi
+         else:
+             hindi_translations = batch_translate_en_to_hi(english_sentences_out)
+             lines = [s.strip() for s in hindi_translations if s.strip()]
+             return "\n".join(lines)
+
+     except Exception as e:
+         return f"⚠️ Error processing PDF: {e}"
+
+ # UI
+ with gr.Blocks() as demo:
+     gr.Markdown("# 📄 PDF Assist (QA + BART Summarizer — English/Hindi)")
+
+     # PDF Question Answering
+     with gr.Tab("🤖 PDF Question Answering"):
+         gr.Markdown("Ask questions about your uploaded PDF document.")
+
+         pdf_file = gr.File(label="📄 Upload PDF")
+         upload_btn = gr.Button("Process PDF")
+         status = gr.Markdown()
+
+         question_box = gr.Textbox(label="Ask a question")
+         ask_btn = gr.Button("Get Answer")
+         output_box = gr.Markdown()
+
+         upload_btn.click(upload_pdf, inputs=pdf_file, outputs=status)
+         ask_btn.click(get_answer, inputs=question_box, outputs=output_box)
+
+     # Academic PDF Summarizer
+     with gr.Tab("📚 Academic PDF Summarizer (English ↔ Hindi)"):
+         gr.Markdown(
+             "Upload an academic PDF (English or Hindi). The app auto-detects the script. "
+             "Choose the output language."
+         )
+
+         pdf_input = gr.File(label="📎 Upload a PDF", file_types=[".pdf"])
+         output_choice = gr.Radio(choices=["English summary", "Hindi summary"], value="English summary", label="Choose output language")
+         summarize_btn = gr.Button("📑 Summarize")
+         summarize_out = gr.Textbox(label="📘 Summary", lines=20)
+
+         summarize_btn.click(
+             fn=summarize_pdf_with_options,
+             inputs=[pdf_input, output_choice],
+             outputs=summarize_out,
+         )
+
+ if __name__ == "__main__":
+     demo.launch(share=True)
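
For a quick check outside the Gradio UI, the summarizer entry point above can be called directly. A minimal sketch, assuming the models above load successfully and a local file named sample.pdf exists (the file name and the direct call are illustrative, not part of the commit):

    # Hypothetical usage sketch; sample.pdf is a placeholder path.
    from app import summarize_pdf_with_options

    print(summarize_pdf_with_options("sample.pdf", "English summary"))  # summary, one sentence per line
    print(summarize_pdf_with_options("sample.pdf", "Hindi summary"))    # the same summary translated to Hindi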