"""Extract text paragraphs from DOCX and PDF documents."""

import os

import docx
import nltk
import pdfplumber
from nltk.tokenize import sent_tokenize

# Fetch the sentence tokenizer model only when it is missing; the original
# unconditional nltk.download("punkt") performed a network round-trip on
# every import of this module.
try:
    nltk.data.find("tokenizers/punkt")
except LookupError:
    nltk.download("punkt")


def extract_text_from_docx(path):
    """Return the non-empty, stripped paragraphs of a .docx file.

    Args:
        path: Filesystem path to a .docx document.

    Returns:
        A list of paragraph strings; [] when the file has no text or
        extraction fails (the error is printed, not raised).
    """
    try:
        doc = docx.Document(path)
        # An empty result is already [], so no extra fallback is needed.
        return [para.text.strip() for para in doc.paragraphs if para.text.strip()]
    except Exception as e:
        print(f"❌ Failed to extract DOCX: {e}")
        return []


def extract_text_from_pdf(path):
    """Return paragraph-like text chunks extracted from a PDF.

    Prefers blank-line-separated paragraphs; if the extracted text contains
    none, falls back to chunks of up to 5 sentences each.

    Args:
        path: Filesystem path to a .pdf document.

    Returns:
        A list of text chunks; [] when no text is found or extraction fails
        (the error is printed, not raised).
    """
    try:
        with pdfplumber.open(path) as pdf:
            # Call extract_text() exactly once per page — it is an expensive
            # layout-analysis pass; the original invoked it twice per page.
            page_texts = (page.extract_text() for page in pdf.pages)
            all_text = "\n".join(text for text in page_texts if text)
    except Exception as e:
        print(f"❌ Failed to extract PDF: {e}")
        return []

    if not all_text.strip():
        return []

    # Try splitting by paragraphs (blank-line separated).
    paragraphs = [p.strip() for p in all_text.split("\n\n") if p.strip()]
    if paragraphs:
        return paragraphs

    # Fallback: break into chunks of up to 5 sentences.
    sentences = sent_tokenize(all_text)
    return [" ".join(sentences[i:i + 5]) for i in range(0, len(sentences), 5)]


def extract_paragraphs(path):
    """Dispatch to the extractor matching the file extension.

    Args:
        path: Filesystem path to a .docx or .pdf document.

    Returns:
        A list of paragraph strings (possibly empty).

    Raises:
        ValueError: If the extension is neither .docx nor .pdf.
    """
    ext = os.path.splitext(path)[-1].lower()
    if ext == ".docx":
        return extract_text_from_docx(path)
    if ext == ".pdf":
        return extract_text_from_pdf(path)
    raise ValueError(f"Unsupported file type: {ext}")