"""Extract text paragraphs from DOCX and PDF documents."""

import os

import docx
import nltk
import pdfplumber
from nltk.tokenize import sent_tokenize

# Fetch the sentence tokenizer model only when it is missing; the original
# unconditional nltk.download("punkt") performed a network round-trip on
# every import of this module.
try:
    nltk.data.find("tokenizers/punkt")
except LookupError:
    nltk.download("punkt")


def extract_text_from_docx(path):
    """Return the non-empty, stripped paragraphs of a .docx file.

    Args:
        path: Filesystem path to a .docx document.

    Returns:
        A list of paragraph strings; [] when the file has no text or
        extraction fails (the error is printed, not raised).
    """
    try:
        doc = docx.Document(path)
        # An empty result is already [], so no extra fallback is needed.
        return [para.text.strip() for para in doc.paragraphs if para.text.strip()]
    except Exception as e:
        print(f"❌ Failed to extract DOCX: {e}")
        return []


def extract_text_from_pdf(path):
    """Return paragraph-like text chunks extracted from a PDF.

    Prefers blank-line-separated paragraphs; if the extracted text contains
    none, falls back to chunks of up to 5 sentences each.

    Args:
        path: Filesystem path to a .pdf document.

    Returns:
        A list of text chunks; [] when no text is found or extraction fails
        (the error is printed, not raised).
    """
    try:
        with pdfplumber.open(path) as pdf:
            # Call extract_text() exactly once per page — it is an expensive
            # layout-analysis pass; the original invoked it twice per page.
            page_texts = (page.extract_text() for page in pdf.pages)
            all_text = "\n".join(text for text in page_texts if text)
    except Exception as e:
        print(f"❌ Failed to extract PDF: {e}")
        return []

    if not all_text.strip():
        return []

    # Try splitting by paragraphs (blank-line separated).
    paragraphs = [p.strip() for p in all_text.split("\n\n") if p.strip()]
    if paragraphs:
        return paragraphs

    # Fallback: break into chunks of up to 5 sentences.
    sentences = sent_tokenize(all_text)
    return [" ".join(sentences[i:i + 5]) for i in range(0, len(sentences), 5)]


def extract_paragraphs(path):
    """Dispatch to the extractor matching the file extension.

    Args:
        path: Filesystem path to a .docx or .pdf document.

    Returns:
        A list of paragraph strings (possibly empty).

    Raises:
        ValueError: If the extension is neither .docx nor .pdf.
    """
    ext = os.path.splitext(path)[-1].lower()
    if ext == ".docx":
        return extract_text_from_docx(path)
    if ext == ".pdf":
        return extract_text_from_pdf(path)
    raise ValueError(f"Unsupported file type: {ext}")