Spaces:

Sai16216
/

PDF_Assist

Sleeping

App Files Files Community

Sai16216 committed 9 days ago

Commit

9fabae0

verified ·

1 Parent(s): 1167820

Delete app_ex.py

Browse files

Files changed (1) hide show

app_ex.py +0 -269

app_ex.py DELETED Viewed

@@ -1,269 +0,0 @@
-import os
-import re
-import gc
-import torch
-import gradio as gr
-import numpy as np
-import faiss
-import nltk
-from dotenv import load_dotenv
-from PyPDF2 import PdfReader
-from transformers import (
-    MarianMTModel,
-    MarianTokenizer,
-    AutoTokenizer,
-    AutoModelForSeq2SeqLM,
-    pipeline,
-)
-from sentence_transformers import SentenceTransformer
-# Import-time setup: everything below runs once when the module is loaded,
-# including downloading/loading all models.
-# Fetch the NLTK "punkt_tab" resource (presumably what nltk.sent_tokenize,
-# used by sentence_tokenize_english, needs -- TODO confirm).
-nltk.download("punkt_tab")
-# Load variables from a .env file; no environment variable is read in the
-# code visible here.
-load_dotenv()
-# Device used for the translation and summarization models below.
-device = "cuda" if torch.cuda.is_available() else "cpu"
-# Embeddings & QA
-# `embedder` encodes PDF chunks and questions for FAISS retrieval;
-# `qa_pipeline` extracts an answer from the retrieved context.
-# No device argument is passed to either of them here.
-embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
-qa_pipeline = pipeline("question-answering", model="deepset/roberta-base-squad2")
-# Translation models:
-# English -> Hindi (fine-tuned Marian model; used for summary -> Hindi)
-# NOTE(review): "saved_model_nlp" looks like a local directory rather than a
-# hub id -- confirm it is shipped alongside this file.
-en_hi_model_name = "saved_model_nlp"
-translator_en_hi_model = MarianMTModel.from_pretrained(en_hi_model_name).to(device)
-translator_en_hi_tokenizer = MarianTokenizer.from_pretrained(en_hi_model_name)
-# Hindi -> English (Helsinki model to convert input Hindi PDF to English)
-hi_en_model_name = "Helsinki-NLP/opus-mt-hi-en"
-translator_hi_en_model = MarianMTModel.from_pretrained(hi_en_model_name).to(device)
-translator_hi_en_tokenizer = MarianTokenizer.from_pretrained(hi_en_model_name)
-# Summarizer used by bart_summarize().
-# NOTE(review): the checkpoint name suggests an LED model rather than plain
-# BART; the bart_* variable names are kept because bart_summarize() uses them.
-bart_model_name = "pszemraj/led-large-book-summary"
-bart_tokenizer = AutoTokenizer.from_pretrained(bart_model_name)
-bart_model = AutoModelForSeq2SeqLM.from_pretrained(bart_model_name).to(device)
-# Module-level QA state, written by upload_pdf() and read by get_answer().
-# `index` stays None until a PDF has been processed successfully.
-pdf_text = ""
-text_chunks = []
-index = None
-# QA
-def extract_text_from_pdf(file_path):
-    """Return the concatenated text of every page of the PDF at ``file_path``.
-    Each page that yields non-empty text contributes that text followed by a
-    newline; pages for which extraction returns nothing are skipped.
-    """
-    page_texts = (page.extract_text() for page in PdfReader(file_path).pages)
-    return "".join(f"{extracted}\n" for extracted in page_texts if extracted)
-def chunk_text(text, chunk_size=500, overlap=100):
-    """Split ``text`` into fixed-size character windows that overlap.
-    Consecutive chunks start ``chunk_size - overlap`` characters apart, so each
-    chunk repeats the last ``overlap`` characters of the previous one. The
-    final chunk may be shorter than ``chunk_size``.
-    Args:
-        text: String to split; an empty string yields an empty list.
-        chunk_size: Maximum number of characters per chunk (must be > 0).
-        overlap: Characters shared between neighbouring chunks (must be
-            smaller than ``chunk_size``).
-    Returns:
-        List of chunk strings in document order.
-    Raises:
-        ValueError: If ``chunk_size`` is not positive or ``overlap`` is not
-            smaller than ``chunk_size``.
-    """
-    if chunk_size <= 0:
-        raise ValueError("chunk_size must be a positive integer")
-    # A non-positive step would never advance the window: the original loop
-    # spun forever (and grew the list without bound) in that case.
-    if overlap >= chunk_size:
-        raise ValueError("overlap must be smaller than chunk_size")
-    step = chunk_size - overlap
-    # Slicing past the end of a string is safe, so no explicit min() is needed.
-    return [text[start : start + chunk_size] for start in range(0, len(text), step)]
-def build_faiss_index(chunks, embedder):
-    """Embed ``chunks`` with ``embedder`` and load the vectors into a flat L2 index.
-    Returns an ``(index, vectors)`` tuple: the populated ``faiss.IndexFlatL2``
-    and a float32 copy of the embedding matrix produced by ``embedder.encode``.
-    """
-    raw_vectors = embedder.encode(chunks)
-    # The second axis of the encoder output is the embedding dimension.
-    l2_index = faiss.IndexFlatL2(raw_vectors.shape[1])
-    l2_index.add(np.array(raw_vectors, dtype=np.float32))
-    return l2_index, np.array(raw_vectors, dtype=np.float32)
-def is_devanagari(text: str, threshold: float = 0.02) -> bool:
-    """Report whether enough of ``text`` is Devanagari to treat it as Hindi.
-    Counts characters in the Unicode range U+0900-U+097F and returns True when
-    their share of all characters in ``text`` is strictly greater than
-    ``threshold``. Empty or missing text returns False.
-    """
-    if not text:
-        return False
-    devanagari_chars = sum(1 for ch in text if "\u0900" <= ch <= "\u097F")
-    total_chars = max(1, len(text))
-    return devanagari_chars / total_chars > threshold
-def sentence_tokenize_english(text: str):
-    """Split English ``text`` into sentences using ``nltk.sent_tokenize``."""
-    sentences = nltk.sent_tokenize(text)
-    return sentences
-def sentence_tokenize_hindi(text: str):
-    """Split Hindi ``text`` into trimmed, non-empty sentence strings.
-    A boundary is a danda, ``.``, ``?`` or ``!`` followed by whitespace; the
-    matched terminator and whitespace are dropped. A terminator that is not
-    followed by whitespace (for example at the very end of the text) stays
-    attached to its sentence.
-    """
-    sentences = []
-    for piece in re.split(r"[।\.\?\!]\s+", text):
-        trimmed = piece.strip()
-        if trimmed:
-            sentences.append(trimmed)
-    return sentences
-def batch_translate_hi_to_en(sentences, batch_size=16):
-    """Translate Hindi sentences to English with the module-level Hindi->English model.
-    Sentences are processed ``batch_size`` at a time (tokenized with padding
-    and truncation to 512 tokens, generated without gradients). The returned
-    list has one translation per input sentence, in the same order.
-    """
-    translations = []
-    for start in range(0, len(sentences), batch_size):
-        encoded = translator_hi_en_tokenizer(
-            sentences[start : start + batch_size],
-            return_tensors="pt",
-            padding=True,
-            truncation=True,
-            max_length=512,
-        ).to(device)
-        with torch.no_grad():
-            generated = translator_hi_en_model.generate(**encoded, max_length=512)
-        translations.extend(
-            translator_hi_en_tokenizer.decode(seq, skip_special_tokens=True)
-            for seq in generated
-        )
-    return translations
-def batch_translate_en_to_hi(sentences, batch_size=16):
-    """Translate English sentences to Hindi with the module-level English->Hindi model.
-    Sentences are processed ``batch_size`` at a time (tokenized with padding
-    and truncation to 512 tokens, generated without gradients). The returned
-    list has one translation per input sentence, in the same order.
-    """
-    translations = []
-    for start in range(0, len(sentences), batch_size):
-        encoded = translator_en_hi_tokenizer(
-            sentences[start : start + batch_size],
-            return_tensors="pt",
-            padding=True,
-            truncation=True,
-            max_length=512,
-        ).to(device)
-        with torch.no_grad():
-            generated = translator_en_hi_model.generate(**encoded, max_length=512)
-        translations.extend(
-            translator_en_hi_tokenizer.decode(seq, skip_special_tokens=True)
-            for seq in generated
-        )
-    return translations
-# Upload + Process PDF(QA)
-def upload_pdf(file):
-    """Extract, chunk and index an uploaded PDF for question answering.
-    Updates the module globals ``pdf_text``, ``text_chunks`` and ``index``.
-    Args:
-        file: The uploaded file from the Gradio ``File`` component: either an
-            object exposing the path as ``.name`` or a plain path string.
-            ``None`` (nothing uploaded) is reported to the user.
-    Returns:
-        A status message for display in the UI.
-    """
-    global pdf_text, text_chunks, index
-    if file is None:
-        # Nothing was uploaded; leave any previously processed PDF untouched.
-        return "❌ Please upload a PDF first."
-    # Invalidate the old index before touching the other globals, so a failed
-    # or empty upload can never leave get_answer() searching a stale index
-    # whose labels no longer match text_chunks.
-    index = None
-    pdf_text = extract_text_from_pdf(getattr(file, "name", file))
-    text_chunks = chunk_text(pdf_text)
-    if not text_chunks:
-        return "❌ Empty PDF or could not extract text."
-    index, _ = build_faiss_index(text_chunks, embedder)
-    return "✅ PDF uploaded and processed successfully! Ready for questions."
-# Answer Questions
-def get_answer(question):
-    """Answer ``question`` from the most relevant chunks of the uploaded PDF.
-    Retrieves up to three nearest chunks from the FAISS index, joins them into
-    a context and runs the extractive QA pipeline on it.
-    Args:
-        question: The user's question text.
-    Returns:
-        A Markdown string with the answer, its confidence score and the first
-        500 characters of the context, or a message asking for a question or
-        for a PDF upload.
-    """
-    if not question or not question.strip():
-        return "❌ Please enter a question."
-    if index is None or not text_chunks:
-        return "❌ Please upload a PDF first."
-    q_emb = embedder.encode([question])
-    # FAISS fills missing neighbours with label -1 when k exceeds the number of
-    # stored vectors; -1 would silently select text_chunks[-1]. Cap k and drop
-    # any out-of-range label.
-    top_k = min(3, len(text_chunks))
-    _, labels = index.search(np.array(q_emb, dtype=np.float32), k=top_k)
-    relevant_text = " ".join(
-        text_chunks[i] for i in labels[0] if 0 <= i < len(text_chunks)
-    )
-    result = qa_pipeline(question=question, context=relevant_text)
-    answer = result.get("answer", "")
-    confidence = round(result.get("score", 0.0), 3)
-    return (
-        f"**Answer:** {answer}\n\n"
-        f"**Confidence:** {confidence}\n\n"
-        f"**Context Extract:**\n{relevant_text[:500]}..."
-    )
-# BART Summarization(English)
-def bart_summarize(text):
-    """Summarize English ``text`` with the module-level summarization model.
-    The input is tokenized with truncation at 4096 tokens. Generation uses
-    4 beams, a length penalty of 2.0 and an output length between 80 and 2000
-    tokens. Returns the first generated sequence decoded to a string.
-    """
-    encoded = bart_tokenizer(
-        text, return_tensors="pt", truncation=True, max_length=4096
-    ).to(device)
-    # Kept from the original: the model config is updated on every call.
-    bart_model.config.max_length = 4096
-    generation_kwargs = dict(
-        max_length=2000,
-        min_length=80,
-        num_beams=4,
-        length_penalty=2.0,
-    )
-    with torch.no_grad():
-        summary_ids = bart_model.generate(encoded["input_ids"], **generation_kwargs)
-    best_sequence = summary_ids[0]
-    return bart_tokenizer.decode(best_sequence, skip_special_tokens=True)
-def summarize_pdf_with_options(pdf_file, output_lang="english"):
-    """Summarize the first 10 pages of a PDF in English or Hindi.
-    A document detected as Devanagari is first translated sentence by sentence
-    to English. The first 5000 characters of the English text are summarized,
-    and the summary is returned one sentence per line. If ``output_lang`` does
-    not start with "eng" (case-insensitive), each summary sentence is
-    translated to Hindi before being returned. Any exception is reported as an
-    error string instead of being raised.
-    """
-    try:
-        reader = PdfReader(pdf_file)
-        # Only the first 10 pages are read.
-        text = "".join(page.extract_text() or "" for page in reader.pages[:10])
-        if not text.strip():
-            return "❌ Could not extract text from the PDF."
-        if is_devanagari(text):
-            # Hindi source: translate sentence-wise, then rejoin for the summarizer.
-            translated = batch_translate_hi_to_en(sentence_tokenize_hindi(text))
-            english_source_text = " ".join(translated)
-        else:
-            english_source_text = text
-        english_summary = bart_summarize(english_source_text[:5000])
-        summary_sentences = sentence_tokenize_english(english_summary)
-        if not output_lang.lower().startswith("eng"):
-            summary_sentences = batch_translate_en_to_hi(summary_sentences)
-        # One non-empty, trimmed sentence per output line.
-        return "\n".join(s.strip() for s in summary_sentences if s.strip())
-    except Exception as e:
-        return f"⚠️ Error processing PDF: {e}"
-# UI
-# Build the two-tab Gradio interface. Components appear in the order they are
-# created; `demo` is launched from the __main__ guard at the bottom.
-with gr.Blocks() as demo:
-    gr.Markdown("# 📄 PDF Assist (QA + BART Summarizer — English/Hindi)")
-    # PDF Question Answering
-    with gr.Tab("🤖 PDF Question Answering"):
-        gr.Markdown("Ask questions about your uploaded PDF document.")
-        pdf_file = gr.File(label="📄 Upload PDF")
-        upload_btn = gr.Button("Process PDF")
-        status = gr.Markdown()
-        question_box = gr.Textbox(label="Ask a question")
-        ask_btn = gr.Button("Get Answer")
-        output_box = gr.Markdown()
-        # upload_pdf stores the chunks and FAISS index in module globals and
-        # returns a status string shown in `status`.
-        upload_btn.click(upload_pdf, inputs=pdf_file, outputs=status)
-        # get_answer reads those globals and returns Markdown for `output_box`.
-        ask_btn.click(get_answer, inputs=question_box, outputs=output_box)
-    # Academic PDF Summarizer
-    with gr.Tab("📚 Academic PDF Summarizer (English ↔ Hindi)"):
-        gr.Markdown(
-            "Upload an academic PDF (English or Hindi). The app auto-detects script. "
-            "Choose output language"
-        )
-        pdf_input = gr.File(label="📎 Upload a PDF", file_types=[".pdf"])
-        # The selected label is passed as `output_lang`; summarize_pdf_with_options
-        # only checks whether it starts with "eng" (case-insensitive), so
-        # "Hindi summary" selects the Hindi branch.
-        output_choice = gr.Radio(choices=["English summary", "Hindi summary"], value="English summary", label="Choose output language")
-        summarize_btn = gr.Button("📑 Summarize")
-        summarize_out = gr.Textbox(label="📘 Summary", lines=20)
-        summarize_btn.click(
-            fn=summarize_pdf_with_options,
-            inputs=[pdf_input, output_choice],
-            outputs=summarize_out,
-        )
-# Start the app only when this file is run as a script.
-if __name__ == "__main__":
-    # share=True requests a public Gradio share link (see the Gradio launch docs).
-    demo.launch(share=True)