Spaces:

Sai16216
/

PDF_Assist

Sleeping

App Files Files Community

Sai16216 committed 9 days ago

Commit

1167820

verified ·

1 Parent(s): 3badcdb

Create app.py

Browse files

Files changed (1) hide show

app.py +278 -0

app.py ADDED Viewed

	@@ -0,0 +1,278 @@

+import os
+import re
+import gc
+import torch
+import gradio as gr
+import numpy as np
+import faiss
+import nltk
+from dotenv import load_dotenv
+from PyPDF2 import PdfReader
+from transformers import (
+    MarianMTModel,
+    MarianTokenizer,
+    AutoTokenizer,
+    AutoModelForSeq2SeqLM,
+    pipeline,
+)
+from sentence_transformers import SentenceTransformer
+# Download the NLTK "punkt_tab" resource at import time; nltk.sent_tokenize is
+# called later in this file for English sentence splitting.
+nltk.download("punkt_tab")
+# Load environment variables via python-dotenv.
+load_dotenv()
+# Target device for the translation and summarization models below.
+device = "cuda" if torch.cuda.is_available() else "cpu"
+# Embeddings & QA
+# Sentence embedder used to build and query the FAISS index.
+embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
+# Extractive question-answering pipeline.
+# NOTE(review): no `device` argument is passed here, unlike the models below,
+# which are moved to `device` explicitly — confirm that is intended.
+qa_pipeline = pipeline("question-answering", model="deepset/roberta-base-squad2")
+# Translation models:
+# English -> Hindi (fine-tuned Marian model; used for summary -> Hindi)
+# "saved_model_nlp" is presumably a model directory shipped next to this file — TODO confirm.
+en_hi_model_name = "saved_model_nlp"
+translator_en_hi_model = MarianMTModel.from_pretrained(en_hi_model_name).to(device)
+translator_en_hi_tokenizer = MarianTokenizer.from_pretrained(en_hi_model_name)
+# Hindi -> English (Helsinki model to convert input Hindi PDF to English)
+hi_en_model_name = "Helsinki-NLP/opus-mt-hi-en"
+translator_hi_en_model = MarianMTModel.from_pretrained(hi_en_model_name).to(device)
+translator_hi_en_tokenizer = MarianTokenizer.from_pretrained(hi_en_model_name)
+# Summarizer. The variables are named "bart_*", but the checkpoint name points to an
+# LED (Longformer Encoder-Decoder) model — NOTE(review): confirm and consider renaming.
+bart_model_name = "pszemraj/led-large-book-summary"
+bart_tokenizer = AutoTokenizer.from_pretrained(bart_model_name)
+bart_model = AutoModelForSeq2SeqLM.from_pretrained(bart_model_name).to(device)
+# Module-level state written by upload_pdf() and read by get_answer().
+pdf_text = ""  # English text of the most recently processed PDF
+text_chunks = []  # character chunks of pdf_text
+index = None  # FAISS index over text_chunks; None until a PDF has been processed
+# QA
+def extract_text_from_pdf(file_path):
+    reader = PdfReader(file_path)
+    text = ""
+    for page in reader.pages:
+        page_text = page.extract_text()
+        if page_text:
+            text += page_text + "\n"
+    doc_is_hindi = is_devanagari(text)
+    if doc_is_hindi:
+        # split into Hindi sentences
+        hindi_sentences = sentence_tokenize_hindi(text)
+        # translate in batches to English
+        english_sentences = batch_translate_hi_to_en(hindi_sentences)
+        english_source_text = " ".join(english_sentences)
+    else:
+        english_source_text = text
+    return english_source_text
+def chunk_text(text, chunk_size=500, overlap=100):
+    chunks = []
+    start = 0
+    while start < len(text):
+        end = min(start + chunk_size, len(text))
+        chunk = text[start:end]
+        chunks.append(chunk)
+        start += chunk_size - overlap
+    return chunks
+def build_faiss_index(chunks, embedder):
+    embeddings = embedder.encode(chunks)
+    dim = embeddings.shape[1]
+    index = faiss.IndexFlatL2(dim)
+    index.add(np.array(embeddings, dtype=np.float32))
+    return index, np.array(embeddings, dtype=np.float32)
+def is_devanagari(text: str, threshold: float = 0.02) -> bool:
+    """
+    Percentage of Devanagari characters in text.
+    If above threshold -> consider the document as Hindi/Devanagari.
+    """
+    if not text:
+        return False
+    devanagari_count = len(re.findall(r"[\u0900-\u097F]", text))
+    return (devanagari_count / max(1, len(text))) > threshold
+def sentence_tokenize_english(text: str):
+    return nltk.sent_tokenize(text)
+def sentence_tokenize_hindi(text: str):
+    parts = re.split(r"[।\.\?\!]\s+", text)
+    parts = [p.strip() for p in parts if p and p.strip()]
+    return parts
+def batch_translate_hi_to_en(sentences, batch_size=16):
+    """
+    Translate a list of Hindi sentences -> English using Helsinki model in batches.
+    Returns list of translated strings in same order.
+    """
+    out = []
+    for i in range(0, len(sentences), batch_size):
+        batch = sentences[i : i + batch_size]
+        toks = translator_hi_en_tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)
+        with torch.no_grad():
+            gen = translator_hi_en_model.generate(**toks, max_length=512)
+        decoded = [translator_hi_en_tokenizer.decode(g, skip_special_tokens=True) for g in gen]
+        out.extend(decoded)
+    return out
+def batch_translate_en_to_hi(sentences, batch_size=16):
+    """
+    Translate a list of English sentences -> Hindi using your saved_model_nlp (Marian).
+    """
+    out = []
+    for i in range(0, len(sentences), batch_size):
+        batch = sentences[i : i + batch_size]
+        toks = translator_en_hi_tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)
+        with torch.no_grad():
+            gen = translator_en_hi_model.generate(**toks, max_length=512)
+        decoded = [translator_en_hi_tokenizer.decode(g, skip_special_tokens=True) for g in gen]
+        out.extend(decoded)
+    return out
+# Upload + Process PDF(QA)
+def upload_pdf(file):
+    global pdf_text, text_chunks, index
+    pdf_text = extract_text_from_pdf(file.name)
+    text_chunks = chunk_text(pdf_text)
+    if len(text_chunks) == 0:
+        return "❌ Empty PDF or could not extract text."
+    index, _ = build_faiss_index(text_chunks, embedder)
+    return "✅ PDF uploaded and processed successfully! Ready for questions."
+# Answer Questions
+def get_answer(question):
+    global pdf_text, text_chunks, index
+    if index is None:
+        return "❌ Please upload a PDF first."
+    q_emb = embedder.encode([question])
+    D, I = index.search(np.array(q_emb, dtype=np.float32), k=3)
+    relevant_text = " ".join([text_chunks[i] for i in I[0]])
+    result = qa_pipeline(question=question, context=relevant_text)
+    answer = result.get("answer", "")
+    confidence = round(result.get("score", 0.0), 3)
+    return (
+        f"**Answer:** {answer}\n\n"
+        f"**Confidence:** {confidence}\n\n"
+        f"**Context Extract:**\n{relevant_text[:500]}..."
+    )
+# Summarization (English) with the LED checkpoint loaded above; the "bart_*" names do not match the model
+def bart_summarize(text):
+    inputs = bart_tokenizer(
+        text,
+        return_tensors="pt",
+        truncation=True,
+        max_length=4096,
+    ).to(device)
+    bart_model.config.max_length = 4096
+    with torch.no_grad():
+        summary_ids = bart_model.generate(
+            inputs["input_ids"],
+            max_length=2000,
+            min_length=80,
+            num_beams=4,
+            length_penalty=2.0,
+        )
+    return bart_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
+def summarize_pdf_with_options(pdf_file, output_lang="english"):
+    """
+    output_lang: "english" or "hindi"
+    """
+    try:
+        # Extract text
+        reader = PdfReader(pdf_file)
+        text = ""
+        for page in reader.pages[:10]:  # first 10 pages
+            text += page.extract_text() or ""
+        if not text.strip():
+            return "❌ Could not extract text from the PDF."
+        # Detect Devanagari(Hindi)
+        doc_is_hindi = is_devanagari(text)
+        # If Hindi document->translate whole doc to English sentence-wise first
+        if doc_is_hindi:
+            # split into Hindi sentences
+            hindi_sentences = sentence_tokenize_hindi(text)
+            # translate in batches to English
+            english_sentences = batch_translate_hi_to_en(hindi_sentences)
+            # join for summarization
+            english_source_text = " ".join(english_sentences)
+        else:
+            english_source_text = text
+        # Summarize English source text using BART
+        english_summary = bart_summarize(english_source_text[:5000])
+        # Sentence-tokenize the English summary
+        english_sentences_out = sentence_tokenize_english(english_summary)
+        if output_lang.lower().startswith("eng"):
+            # each sentence in a new line
+            lines = [s.strip() for s in english_sentences_out if s.strip()]
+            return "\n".join(lines)
+        # If user wants Hindi output -> translate each English sentence sentence-wise to Hindi
+        else:
+            hindi_translations = batch_translate_en_to_hi(english_sentences_out)
+            lines = [s.strip() for s in hindi_translations if s.strip()]
+            return "\n".join(lines)
+    except Exception as e:
+        return f"⚠️ Error processing PDF: {e}"
+# UI
+# Build the two-tab Gradio interface: one tab for question answering over an
+# uploaded PDF, one for summarizing a PDF in English or Hindi.
+with gr.Blocks() as demo:
+    gr.Markdown("# 📄 PDF Assist (QA + BART Summarizer — English/Hindi)")
+    # PDF Question Answering
+    with gr.Tab("🤖 PDF Question Answering"):
+        gr.Markdown("Ask questions about your uploaded PDF document.")
+        pdf_file = gr.File(label="📄 Upload PDF")
+        upload_btn = gr.Button("Process PDF")
+        status = gr.Markdown()
+        question_box = gr.Textbox(label="Ask a question")
+        ask_btn = gr.Button("Get Answer")
+        output_box = gr.Markdown()
+        # "Process PDF" runs upload_pdf and shows its status message;
+        # "Get Answer" runs get_answer on the question text.
+        upload_btn.click(upload_pdf, inputs=pdf_file, outputs=status)
+        ask_btn.click(get_answer, inputs=question_box, outputs=output_box)
+    # Academic PDF Summarizer
+    with gr.Tab("📚 Academic PDF Summarizer (English ↔ Hindi)"):
+        gr.Markdown(
+            "Upload an academic PDF (English or Hindi). The app auto-detects script. "
+            "Choose output language"
+        )
+        pdf_input = gr.File(label="📎 Upload a PDF", file_types=[".pdf"])
+        # The selected label is passed as `output_lang`; summarize_pdf_with_options
+        # only checks whether it starts with "eng" (case-insensitive).
+        output_choice = gr.Radio(choices=["English summary", "Hindi summary"], value="English summary", label="Choose output language")
+        summarize_btn = gr.Button("📑 Summarize")
+        summarize_out = gr.Textbox(label="📘 Summary", lines=20)
+        summarize_btn.click(
+            fn=summarize_pdf_with_options,
+            inputs=[pdf_input, output_choice],
+            outputs=summarize_out,
+        )
+# Start the app only when this file is run as a script.
+# NOTE(review): share=True asks Gradio for a public share link — confirm it is
+# wanted in the environment where this app is deployed.
+if __name__ == "__main__":
+    demo.launch(share=True)