import os

import docx
import nltk
import pdfplumber
from nltk.tokenize import sent_tokenize

# sent_tokenize relies on NLTK's Punkt models; fetch them once at import time.
nltk.download("punkt")

def extract_text_from_docx(path):
    """Return the non-empty paragraphs of a .docx file, or [] on failure."""
    try:
        doc = docx.Document(path)
        # Keep only paragraphs that still contain text after stripping whitespace.
        return [para.text.strip() for para in doc.paragraphs if para.text.strip()]
    except Exception as e:
        print(f"❌ Failed to extract DOCX: {e}")
        return []

def extract_text_from_pdf(path):
    """Return paragraph-like chunks of a PDF's text, or [] on failure."""
    try:
        with pdfplumber.open(path) as pdf:
            # Extract each page's text once, skipping pages with nothing extractable.
            all_text = "\n".join(
                text for page in pdf.pages if (text := page.extract_text())
            )
    except Exception as e:
        print(f"❌ Failed to extract PDF: {e}")
        return []

    if not all_text.strip():
        return []

    # Try splitting by paragraphs (blank lines) first.
    paragraphs = [p.strip() for p in all_text.split("\n\n") if p.strip()]
    if paragraphs:
        return paragraphs

    # Fallback: break the text into chunks of up to 5 sentences.
    sentences = sent_tokenize(all_text)
    return [" ".join(sentences[i:i + 5]) for i in range(0, len(sentences), 5)]

def extract_paragraphs(path):
    ext = os.path.splitext(path)[-1].lower()
    if ext == ".docx":
        return extract_text_from_docx(path)
    elif ext == ".pdf":
        return extract_text_from_pdf(path)
    else:
        raise ValueError(f"Unsupported file type: {ext}")
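
# Usage sketch: "sample.pdf" is a hypothetical filename used only to illustrate
# the dispatcher; swap in any local .docx or .pdf path.
if __name__ == "__main__":
    for chunk in extract_paragraphs("sample.pdf"):
        print(chunk[:80])  # preview the first 80 characters of each chunk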