PDF_Assist / app.py
Sai16216's picture
Create app.py
1167820 verified
import os
import re
import gc
import torch
import gradio as gr
import numpy as np
import faiss
import nltk
from dotenv import load_dotenv
from PyPDF2 import PdfReader
from transformers import (
MarianMTModel,
MarianTokenizer,
AutoTokenizer,
AutoModelForSeq2SeqLM,
pipeline,
)
from sentence_transformers import SentenceTransformer
# --- Import-time setup: everything below runs once when the module is loaded. ---
# Download the "punkt_tab" NLTK resource (presumably required by the
# nltk.sent_tokenize call in sentence_tokenize_english -- confirm for the
# installed NLTK version).
nltk.download("punkt_tab")
# Load variables from a .env file into the process environment.
load_dotenv()
# Torch device string used for the translation and summarization models below.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Embeddings & QA
# Sentence embedder used to build and query the FAISS index.
embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
# Extractive question-answering pipeline; no device argument is passed here.
qa_pipeline = pipeline("question-answering", model="deepset/roberta-base-squad2")
# Translation models:
# English -> Hindi (fine-tuned Marian model; used for summary -> Hindi)
# NOTE(review): "saved_model_nlp" looks like a local directory rather than a
# hub id -- confirm it is shipped with the app.
en_hi_model_name = "saved_model_nlp"
translator_en_hi_model = MarianMTModel.from_pretrained(en_hi_model_name).to(device)
translator_en_hi_tokenizer = MarianTokenizer.from_pretrained(en_hi_model_name)
# Hindi -> English (Helsinki model to convert input Hindi PDF to English)
hi_en_model_name = "Helsinki-NLP/opus-mt-hi-en"
translator_hi_en_model = MarianMTModel.from_pretrained(hi_en_model_name).to(device)
translator_hi_en_tokenizer = MarianTokenizer.from_pretrained(hi_en_model_name)
# Summarizer. The variables are named bart_*, but the checkpoint name below
# refers to an LED model (NOTE(review): naming is historical -- confirm).
bart_model_name = "pszemraj/led-large-book-summary"
bart_tokenizer = AutoTokenizer.from_pretrained(bart_model_name)
bart_model = AutoModelForSeq2SeqLM.from_pretrained(bart_model_name).to(device)
# Module-level state shared by upload_pdf (writer) and get_answer (reader):
# the English text of the uploaded PDF, its character chunks, and the FAISS
# index built over those chunks (None until a PDF has been processed).
pdf_text = ""
text_chunks = []
index = None
# QA
def extract_text_from_pdf(file_path):
    """Return the text of a PDF as English.

    Every page is read with ``PdfReader``; pages that yield no text are
    skipped and each remaining page is terminated with a newline. When
    ``is_devanagari`` classifies the combined text as Hindi, it is split into
    sentences, translated to English in batches, and the translations are
    joined with single spaces.

    Args:
        file_path: Path (or stream) accepted by ``PdfReader``.

    Returns:
        The extracted text, translated to English if it was detected as Hindi.
    """
    extracted_pages = (page.extract_text() for page in PdfReader(file_path).pages)
    raw_text = "".join(f"{content}\n" for content in extracted_pages if content)

    # Non-Hindi documents are returned untouched.
    if not is_devanagari(raw_text):
        return raw_text

    # Hindi document: sentence-split, translate batch-wise, then re-join.
    translated = batch_translate_hi_to_en(sentence_tokenize_hindi(raw_text))
    return " ".join(translated)
def chunk_text(text, chunk_size=500, overlap=100):
    """Split ``text`` into overlapping character windows.

    Consecutive windows start ``chunk_size - overlap`` characters apart.
    Chunking stops as soon as a window reaches the end of the text, so no
    trailing chunk is emitted that is merely a suffix of the previous one.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters shared by neighbouring chunks; must be
            smaller than ``chunk_size``.

    Returns:
        A list of chunks in document order; ``[]`` for empty text.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is not
            smaller than ``chunk_size`` (the window would never advance).
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be positive")
    if overlap >= chunk_size:
        # A non-positive stride would never move past the start of the text.
        raise ValueError("overlap must be smaller than chunk_size")

    stride = chunk_size - overlap
    total = len(text)
    chunks = []
    for start in range(0, total, stride):
        chunks.append(text[start:start + chunk_size])
        if start + chunk_size >= total:
            # This window already covers the tail; any further window would
            # only repeat characters contained in this one.
            break
    return chunks
def build_faiss_index(chunks, embedder):
    """Embed ``chunks`` and store the vectors in a flat L2 FAISS index.

    Args:
        chunks: Sequence of text chunks to embed.
        embedder: Object whose ``encode(chunks)`` returns a 2-D array of
            embeddings (one row per chunk).

    Returns:
        A ``(index, embeddings)`` tuple: the populated ``faiss.IndexFlatL2``
        and the float32 embedding matrix that was added to it.
    """
    raw_vectors = embedder.encode(chunks)
    flat_index = faiss.IndexFlatL2(raw_vectors.shape[1])

    # FAISS works on float32 data, so convert once and reuse the result.
    vectors = np.array(raw_vectors, dtype=np.float32)
    flat_index.add(vectors)
    return flat_index, vectors
def is_devanagari(text: str, threshold: float = 0.02) -> bool:
    """Decide whether ``text`` should be treated as Hindi/Devanagari.

    The share of characters in the Unicode range U+0900..U+097F is compared
    against ``threshold``; the text counts as Devanagari only when the share
    is strictly greater.

    Args:
        text: Text to inspect.
        threshold: Minimum (exclusive) fraction of Devanagari characters.

    Returns:
        ``True`` if the fraction exceeds ``threshold``; ``False`` otherwise,
        including for empty text.
    """
    if not text:
        return False
    hits = sum(1 for _ in re.finditer(r"[\u0900-\u097F]", text))
    share = hits / max(1, len(text))
    return share > threshold
def sentence_tokenize_english(text: str):
    """Split English ``text`` into sentences via ``nltk.sent_tokenize``.

    Args:
        text: English text to split.

    Returns:
        The list of sentences produced by NLTK.
    """
    sentences = nltk.sent_tokenize(text)
    return sentences
def sentence_tokenize_hindi(text: str):
    """Split Hindi text into sentences on danda and Latin terminators.

    A sentence boundary is a danda (U+0964), a double danda (U+0965), or one
    of ``.``, ``?``, ``!`` followed by whitespace. The terminator at a
    boundary is dropped; a terminator at the very end of the text (with no
    whitespace after it) is kept on the last sentence.

    Args:
        text: Text to split.

    Returns:
        A list of stripped, non-empty sentence strings.
    """
    # The danda characters are written as escapes so the pattern cannot be
    # corrupted by a mis-decoded source file.
    pieces = re.split(r"[\u0964\u0965.?!]\s+", text)
    return [piece.strip() for piece in pieces if piece and piece.strip()]
def batch_translate_hi_to_en(sentences, batch_size=16):
    """Translate Hindi sentences to English, ``batch_size`` at a time.

    Uses the module-level Hindi->English Marian model and tokenizer. Inputs
    are padded per batch and truncated to 512 tokens; generation is capped at
    512 tokens as well.

    Args:
        sentences: List of Hindi sentence strings.
        batch_size: Number of sentences sent through the model per step.

    Returns:
        The translated strings, in the same order as ``sentences``.
    """
    translated = []
    for offset in range(0, len(sentences), batch_size):
        group = sentences[offset:offset + batch_size]
        encoded = translator_hi_en_tokenizer(
            group,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=512,
        ).to(device)
        # Inference only: gradient tracking is not needed.
        with torch.no_grad():
            generated = translator_hi_en_model.generate(**encoded, max_length=512)
        translated.extend(
            translator_hi_en_tokenizer.decode(token_ids, skip_special_tokens=True)
            for token_ids in generated
        )
    return translated
def batch_translate_en_to_hi(sentences, batch_size=16):
    """Translate English sentences to Hindi, ``batch_size`` at a time.

    Uses the module-level English->Hindi Marian model and tokenizer (loaded
    from ``saved_model_nlp``). Inputs are padded per batch and truncated to
    512 tokens; generation is capped at 512 tokens as well.

    Args:
        sentences: List of English sentence strings.
        batch_size: Number of sentences sent through the model per step.

    Returns:
        The translated strings, in the same order as ``sentences``.
    """
    results = []
    position = 0
    while position < len(sentences):
        current = sentences[position:position + batch_size]
        model_inputs = translator_en_hi_tokenizer(
            current,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=512,
        ).to(device)
        # Inference only: gradient tracking is not needed.
        with torch.no_grad():
            output_ids = translator_en_hi_model.generate(**model_inputs, max_length=512)
        for sequence in output_ids:
            results.append(
                translator_en_hi_tokenizer.decode(sequence, skip_special_tokens=True)
            )
        position += batch_size
    return results
# Upload + Process PDF(QA)
def upload_pdf(file):
    """Extract, chunk and index an uploaded PDF for question answering.

    On success the module-level ``pdf_text``, ``text_chunks`` and ``index``
    are replaced together. If no text could be extracted they are reset, so a
    previously built index can never be paired with an empty chunk list.

    Args:
        file: The value delivered by the upload component, or ``None`` when
            nothing was uploaded.

    Returns:
        A status message for display in the UI.
    """
    global pdf_text, text_chunks, index

    if file is None:
        return "❌ Please upload a PDF first."

    # NOTE(review): depending on the Gradio version the upload arrives either
    # as an object exposing `.name` or as a plain path string -- confirm
    # against the deployed version. Both forms are accepted here.
    file_path = getattr(file, "name", file)

    extracted = extract_text_from_pdf(file_path)
    chunks = chunk_text(extracted)
    if not chunks:
        # Drop any earlier document so get_answer asks for a new upload
        # instead of querying a stale index.
        pdf_text, text_chunks, index = "", [], None
        return "❌ Empty PDF or could not extract text."

    new_index, _ = build_faiss_index(chunks, embedder)
    # Commit only after every step succeeded.
    pdf_text, text_chunks, index = extracted, chunks, new_index
    return "✅ PDF uploaded and processed successfully! Ready for questions."
# Answer Questions
def get_answer(question):
    """Answer ``question`` from the most relevant chunks of the uploaded PDF.

    The question is embedded, up to three nearest chunks are fetched from the
    module-level FAISS ``index``, and the extractive QA pipeline is run over
    their concatenation.

    Args:
        question: The user's question.

    Returns:
        A Markdown string with the answer, its confidence score and the first
        500 characters of the retrieved context, or an error message.
    """
    if index is None or not text_chunks:
        return "❌ Please upload a PDF first."
    if not question or not question.strip():
        return "❌ Please enter a question."

    # Never request more neighbours than there are chunks.
    top_k = min(3, len(text_chunks))
    q_emb = embedder.encode([question])
    _, neighbour_ids = index.search(np.array(q_emb, dtype=np.float32), k=top_k)

    # FAISS reports -1 for missing neighbours; without this filter Python's
    # negative indexing would silently return the last chunk.
    hits = [text_chunks[i] for i in neighbour_ids[0] if 0 <= i < len(text_chunks)]
    if not hits:
        return "❌ No relevant text found in the PDF."

    relevant_text = " ".join(hits)
    result = qa_pipeline(question=question, context=relevant_text)
    answer = result.get("answer", "")
    confidence = round(result.get("score", 0.0), 3)
    return (
        f"**Answer:** {answer}\n\n"
        f"**Confidence:** {confidence}\n\n"
        f"**Context Extract:**\n{relevant_text[:500]}..."
    )
# BART Summarization(English)
def bart_summarize(text):
    """Summarize English ``text`` with the module-level summarization model.

    Despite the ``bart_`` naming, the checkpoint loaded at module level is
    ``pszemraj/led-large-book-summary``. The input is truncated to 4096
    tokens; the summary is generated with 4-beam search and is between 80 and
    2000 tokens long.

    Args:
        text: English source text.

    Returns:
        The decoded summary string with special tokens removed.
    """
    inputs = bart_tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=4096,
    ).to(device)
    # Generation limits are passed to generate() directly. The shared
    # model.config is deliberately not mutated: controlling generation through
    # the model config is deprecated, and the explicit max_length below takes
    # precedence anyway.
    with torch.no_grad():
        summary_ids = bart_model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_length=2000,
            min_length=80,
            num_beams=4,
            length_penalty=2.0,
        )
    return bart_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
def summarize_pdf_with_options(pdf_file, output_lang="english"):
    """Summarize the first 10 pages of a PDF in English or Hindi.

    Hindi (Devanagari) documents are first translated to English sentence by
    sentence. The first 5000 characters of the English text are summarized,
    and the summary is returned one sentence per line, translated to Hindi
    unless ``output_lang`` starts with "eng" (case-insensitive).

    Args:
        pdf_file: The uploaded PDF (object exposing ``.name``, a path string,
            or a stream), or ``None`` when nothing was uploaded.
        output_lang: "english" or "hindi"; anything not starting with "eng"
            selects Hindi. ``None`` falls back to English.

    Returns:
        The summary text, or a user-facing error message. Exceptions raised
        during processing are reported in the returned string, not raised.
    """
    if pdf_file is None:
        return "❌ Please upload a PDF first."

    try:
        # Same handling as upload_pdf: prefer the on-disk path when the upload
        # object exposes one, otherwise pass the value through unchanged.
        reader = PdfReader(getattr(pdf_file, "name", pdf_file))
        text = "".join(page.extract_text() or "" for page in reader.pages[:10])
        if not text.strip():
            return "❌ Could not extract text from the PDF."

        if is_devanagari(text):
            # Hindi source: translate sentence-wise before summarizing.
            hindi_sentences = sentence_tokenize_hindi(text)
            english_source_text = " ".join(batch_translate_hi_to_en(hindi_sentences))
        else:
            english_source_text = text

        english_summary = bart_summarize(english_source_text[:5000])
        sentences = sentence_tokenize_english(english_summary)

        wants_english = (output_lang or "english").lower().startswith("eng")
        if not wants_english:
            sentences = batch_translate_en_to_hi(sentences)

        # One sentence per line, blank entries dropped.
        return "\n".join(s.strip() for s in sentences if s.strip())
    except Exception as e:
        # UI boundary: surface the failure as text instead of crashing.
        return f"⚠️ Error processing PDF: {e}"
# UI
# Two-tab Gradio UI: question answering over an uploaded PDF, and PDF
# summarization with a choice of output language.
with gr.Blocks() as demo:
    gr.Markdown("# 📄 PDF Assist (QA + BART Summarizer — English/Hindi)")

    # PDF Question Answering
    with gr.Tab("🤖 PDF Question Answering"):
        gr.Markdown("Ask questions about your uploaded PDF document.")
        # Restricted to PDFs, matching the uploader on the summarizer tab.
        pdf_file = gr.File(label="📄 Upload PDF", file_types=[".pdf"])
        upload_btn = gr.Button("Process PDF")
        status = gr.Markdown()
        question_box = gr.Textbox(label="Ask a question")
        ask_btn = gr.Button("Get Answer")
        output_box = gr.Markdown()

        upload_btn.click(upload_pdf, inputs=pdf_file, outputs=status)
        ask_btn.click(get_answer, inputs=question_box, outputs=output_box)

    # Academic PDF Summarizer
    with gr.Tab("📚 Academic PDF Summarizer (English ↔ Hindi)"):
        gr.Markdown(
            "Upload an academic PDF (English or Hindi). The app auto-detects script. "
            "Choose output language"
        )
        pdf_input = gr.File(label="📎 Upload a PDF", file_types=[".pdf"])
        # The handler only checks whether the choice starts with "eng".
        output_choice = gr.Radio(
            choices=["English summary", "Hindi summary"],
            value="English summary",
            label="Choose output language",
        )
        summarize_btn = gr.Button("📑 Summarize")
        summarize_out = gr.Textbox(label="📘 Summary", lines=20)

        summarize_btn.click(
            fn=summarize_pdf_with_options,
            inputs=[pdf_input, output_choice],
            outputs=summarize_out,
        )

if __name__ == "__main__":
    demo.launch(share=True)