import os
import re
import gc
import torch
import gradio as gr
import numpy as np
import faiss
import nltk
from dotenv import load_dotenv
from PyPDF2 import PdfReader
from transformers import (
    MarianMTModel,
    MarianTokenizer,
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    pipeline,
)
from sentence_transformers import SentenceTransformer

nltk.download("punkt_tab")
load_dotenv()
device = "cuda" if torch.cuda.is_available() else "cpu"

# Embeddings & QA
embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
qa_pipeline = pipeline("question-answering", model="deepset/roberta-base-squad2")

# Translation models:
# English -> Hindi (fine-tuned Marian model; used for summary -> Hindi)
en_hi_model_name = "saved_model_nlp"
translator_en_hi_model = MarianMTModel.from_pretrained(en_hi_model_name).to(device)
translator_en_hi_tokenizer = MarianTokenizer.from_pretrained(en_hi_model_name)

# Hindi -> English (Helsinki-NLP model used to convert an input Hindi PDF to English)
hi_en_model_name = "Helsinki-NLP/opus-mt-hi-en"
translator_hi_en_model = MarianMTModel.from_pretrained(hi_en_model_name).to(device)
translator_hi_en_tokenizer = MarianTokenizer.from_pretrained(hi_en_model_name)
# Summarizer (pszemraj/led-large-book-summary is a LED long-document checkpoint;
# the "bart_" variable names are kept from the original code)
bart_model_name = "pszemraj/led-large-book-summary"
bart_tokenizer = AutoTokenizer.from_pretrained(bart_model_name)
bart_model = AutoModelForSeq2SeqLM.from_pretrained(bart_model_name).to(device)

# Global state shared by the Gradio callbacks
pdf_text = ""
text_chunks = []
index = None
# QA: extract text from the uploaded PDF (translating Hindi documents to English)
def extract_text_from_pdf(file_path):
    reader = PdfReader(file_path)
    text = ""
    for page in reader.pages:
        page_text = page.extract_text()
        if page_text:
            text += page_text + "\n"

    doc_is_hindi = is_devanagari(text)
    if doc_is_hindi:
        # split into Hindi sentences
        hindi_sentences = sentence_tokenize_hindi(text)
        # translate in batches to English
        english_sentences = batch_translate_hi_to_en(hindi_sentences)
        english_source_text = " ".join(english_sentences)
    else:
        english_source_text = text
    return english_source_text
def chunk_text(text, chunk_size=500, overlap=100):
    chunks = []
    start = 0
    while start < len(text):
        end = min(start + chunk_size, len(text))
        chunk = text[start:end]
        chunks.append(chunk)
        start += chunk_size - overlap
    return chunks
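# With the defaults above, chunk starts advance by chunk_size - overlap = 400 characters,
# so a 1,200-character text yields chunks covering [0:500], [400:900] and [800:1200],
# each sharing 100 characters with its neighbour.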
def build_faiss_index(chunks, embedder):
    embeddings = embedder.encode(chunks)
    dim = embeddings.shape[1]
    index = faiss.IndexFlatL2(dim)
    index.add(np.array(embeddings, dtype=np.float32))
    return index, np.array(embeddings, dtype=np.float32)
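# IndexFlatL2 does exact (brute-force) L2 search, which is fine for the handful of chunks
# a single PDF produces; FAISS expects float32 arrays, hence the explicit casts.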
def is_devanagari(text: str, threshold: float = 0.02) -> bool:
    """
    Return True if the share of Devanagari characters in the text exceeds the
    threshold, in which case the document is treated as Hindi/Devanagari.
    """
    if not text:
        return False
    devanagari_count = len(re.findall(r"[\u0900-\u097F]", text))
    return (devanagari_count / max(1, len(text))) > threshold


def sentence_tokenize_english(text: str):
    return nltk.sent_tokenize(text)


def sentence_tokenize_hindi(text: str):
    # split on the Devanagari danda (।) as well as ., ? and !
    parts = re.split(r"[।\.\?\!]\s+", text)
    parts = [p.strip() for p in parts if p and p.strip()]
    return parts
def batch_translate_hi_to_en(sentences, batch_size=16):
    """
    Translate a list of Hindi sentences -> English with the Helsinki model, in batches.
    Returns a list of translated strings in the same order.
    """
    out = []
    for i in range(0, len(sentences), batch_size):
        batch = sentences[i : i + batch_size]
        toks = translator_hi_en_tokenizer(
            batch, return_tensors="pt", padding=True, truncation=True, max_length=512
        ).to(device)
        with torch.no_grad():
            gen = translator_hi_en_model.generate(**toks, max_length=512)
        decoded = [translator_hi_en_tokenizer.decode(g, skip_special_tokens=True) for g in gen]
        out.extend(decoded)
    return out
def batch_translate_en_to_hi(sentences, batch_size=16):
    """
    Translate a list of English sentences -> Hindi with the fine-tuned Marian model
    (saved_model_nlp), in batches.
    """
    out = []
    for i in range(0, len(sentences), batch_size):
        batch = sentences[i : i + batch_size]
        toks = translator_en_hi_tokenizer(
            batch, return_tensors="pt", padding=True, truncation=True, max_length=512
        ).to(device)
        with torch.no_grad():
            gen = translator_en_hi_model.generate(**toks, max_length=512)
        decoded = [translator_en_hi_tokenizer.decode(g, skip_special_tokens=True) for g in gen]
        out.extend(decoded)
    return out
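# Both translation helpers truncate each batch at 512 tokens and run under torch.no_grad(),
# so extremely long "sentences" (e.g. from failed sentence splitting) are silently cut short.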
# Upload + process PDF (QA)
def upload_pdf(file):
    global pdf_text, text_chunks, index
    pdf_text = extract_text_from_pdf(file.name)
    text_chunks = chunk_text(pdf_text)
    if len(text_chunks) == 0:
        return "❌ Empty PDF or could not extract text."
    index, _ = build_faiss_index(text_chunks, embedder)
    return "✅ PDF uploaded and processed successfully! Ready for questions."
# Answer questions
def get_answer(question):
    global pdf_text, text_chunks, index
    if index is None:
        return "❌ Please upload a PDF first."
    q_emb = embedder.encode([question])
    # retrieve the closest chunks (cap k so FAISS never returns -1 ids for very short documents)
    k = min(3, len(text_chunks))
    D, I = index.search(np.array(q_emb, dtype=np.float32), k=k)
    relevant_text = " ".join([text_chunks[i] for i in I[0]])
    result = qa_pipeline(question=question, context=relevant_text)
    answer = result.get("answer", "")
    confidence = round(result.get("score", 0.0), 3)
    return (
        f"**Answer:** {answer}\n\n"
        f"**Confidence:** {confidence}\n\n"
        f"**Context Extract:**\n{relevant_text[:500]}..."
    )
# BART/LED summarization (English)
def bart_summarize(text):
    inputs = bart_tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=4096,
    ).to(device)
    bart_model.config.max_length = 4096
    with torch.no_grad():
        summary_ids = bart_model.generate(
            inputs["input_ids"],
            max_length=2000,
            min_length=80,
            num_beams=4,
            length_penalty=2.0,
        )
    return bart_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
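# Note: the encoder input is truncated to 4,096 tokens above; LED checkpoints in this family
# are typically trained with a 16,384-token context, so the 4,096 limit is a speed/memory
# trade-off rather than a hard model constraint.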
def summarize_pdf_with_options(pdf_file, output_lang="english"):
    """
    output_lang: "english" or "hindi"
    """
    try:
        # Extract text from the first 10 pages
        reader = PdfReader(pdf_file)
        text = ""
        for page in reader.pages[:10]:
            text += page.extract_text() or ""
        if not text.strip():
            return "❌ Could not extract text from the PDF."

        # Detect Devanagari (Hindi)
        doc_is_hindi = is_devanagari(text)

        # If it is a Hindi document, translate the whole document to English sentence-wise first
        if doc_is_hindi:
            # split into Hindi sentences
            hindi_sentences = sentence_tokenize_hindi(text)
            # translate in batches to English
            english_sentences = batch_translate_hi_to_en(hindi_sentences)
            # join for summarization
            english_source_text = " ".join(english_sentences)
        else:
            english_source_text = text

        # Summarize the English source text
        english_summary = bart_summarize(english_source_text[:5000])

        # Sentence-tokenize the English summary
        english_sentences_out = sentence_tokenize_english(english_summary)

        if output_lang.lower().startswith("eng"):
            # one sentence per line
            lines = [s.strip() for s in english_sentences_out if s.strip()]
            return "\n".join(lines)
        else:
            # Hindi output: translate each English summary sentence to Hindi
            hindi_translations = batch_translate_en_to_hi(english_sentences_out)
            lines = [s.strip() for s in hindi_translations if s.strip()]
            return "\n".join(lines)
    except Exception as e:
        return f"⚠️ Error processing PDF: {e}"
# UI
with gr.Blocks() as demo:
    gr.Markdown("# 📄 PDF Assist (QA + BART Summarizer: English/Hindi)")

    # PDF Question Answering
    with gr.Tab("🤖 PDF Question Answering"):
        gr.Markdown("Ask questions about your uploaded PDF document.")
        pdf_file = gr.File(label="📄 Upload PDF")
        upload_btn = gr.Button("Process PDF")
        status = gr.Markdown()
        question_box = gr.Textbox(label="Ask a question")
        ask_btn = gr.Button("Get Answer")
        output_box = gr.Markdown()

        upload_btn.click(upload_pdf, inputs=pdf_file, outputs=status)
        ask_btn.click(get_answer, inputs=question_box, outputs=output_box)

    # Academic PDF Summarizer
    with gr.Tab("📚 Academic PDF Summarizer (English → Hindi)"):
        gr.Markdown(
            "Upload an academic PDF (English or Hindi). The app auto-detects the script. "
            "Choose the output language below."
        )
        pdf_input = gr.File(label="📄 Upload a PDF", file_types=[".pdf"])
        output_choice = gr.Radio(
            choices=["English summary", "Hindi summary"],
            value="English summary",
            label="Choose output language",
        )
        summarize_btn = gr.Button("📝 Summarize")
        summarize_out = gr.Textbox(label="📝 Summary", lines=20)

        summarize_btn.click(
            fn=summarize_pdf_with_options,
            inputs=[pdf_input, output_choice],
            outputs=summarize_out,
        )

if __name__ == "__main__":
    demo.launch(share=True)