# StealthWriter / detector / preprocess.py
# AlyanAkram's picture
# Upload 11 files
# a53dc0a verified
import os
import docx
import pdfplumber
import nltk
nltk.download("punkt")
from nltk.tokenize import sent_tokenize
def extract_text_from_docx(path):
    """Return the non-empty paragraphs of a DOCX file as stripped strings.

    Args:
        path: Location of the ``.docx`` file, passed to ``docx.Document``.

    Returns:
        A list with one whitespace-stripped string per non-blank paragraph,
        in document order. Any failure while opening or reading the document
        is reported on stdout and yields an empty list instead of raising.
    """
    try:
        document = docx.Document(path)
        collected = []
        for paragraph in document.paragraphs:
            cleaned = paragraph.text.strip()
            # Paragraphs that are empty or whitespace-only are dropped.
            if cleaned:
                collected.append(cleaned)
        return collected
    except Exception as e:
        print(f"❌ Failed to extract DOCX: {e}")
        return []
def extract_text_from_pdf(path):
    """Extract the text of a PDF and split it into paragraph-sized chunks.

    The text of every page is joined with newlines. The result is split on
    blank lines; if that yields more than one paragraph, those paragraphs are
    returned. Otherwise (the text contains no blank-line separators) the text
    is tokenized into sentences and grouped into chunks of up to five
    sentences each.

    Args:
        path: Location of the PDF file, passed to ``pdfplumber.open``.

    Returns:
        A list of non-empty strings. An empty list is returned when the file
        cannot be read (the error is printed to stdout) or when no text could
        be extracted from any page.
    """
    sentences_per_chunk = 5

    try:
        with pdfplumber.open(path) as pdf:
            # Extract each page exactly once; the same result is used both to
            # filter out empty pages and to build the joined text.
            page_texts = [page.extract_text() for page in pdf.pages]
    except Exception as e:
        print(f"❌ Failed to extract PDF: {e}")
        return []

    all_text = "\n".join(text for text in page_texts if text)
    if not all_text.strip():
        return []

    # Try splitting by paragraphs (blank-line separated).
    paragraphs = [p.strip() for p in all_text.split("\n\n") if p.strip()]
    if len(paragraphs) > 1:
        return paragraphs

    # Fallback: the split produced a single block, so break the text into
    # sentence chunks instead. Non-empty text always yields at least one
    # paragraph, so testing only for "any paragraphs" would never reach here.
    try:
        sentences = sent_tokenize(all_text)
    except LookupError as e:
        # Tokenizer data is not installed; keep the best-effort behavior of
        # returning the unsplit text rather than raising.
        print(f"⚠️ Sentence tokenizer unavailable, returning unsplit text: {e}")
        return paragraphs

    chunks = [
        " ".join(sentences[i:i + sentences_per_chunk])
        for i in range(0, len(sentences), sentences_per_chunk)
    ]
    return chunks if chunks else paragraphs
def extract_paragraphs(path):
    """Dispatch paragraph extraction based on the file extension of *path*.

    Args:
        path: File path whose extension (compared case-insensitively)
            selects the extractor.

    Returns:
        The list produced by ``extract_text_from_docx`` for ``.docx`` files
        or by ``extract_text_from_pdf`` for ``.pdf`` files.

    Raises:
        ValueError: If the extension is neither ``.docx`` nor ``.pdf``.
    """
    _, suffix = os.path.splitext(path)
    ext = suffix.lower()
    if ext == ".pdf":
        return extract_text_from_pdf(path)
    if ext == ".docx":
        return extract_text_from_docx(path)
    raise ValueError(f"Unsupported file type: {ext}")