Spaces:

AlyanAkram
/

StealthWriter

Running

StealthWriter / detector /preprocess.py

Upload 11 files

a53dc0a verified 2 months ago

1.38 kB

	import os
	import docx
	import pdfplumber
	import nltk

	# Download the Punkt sentence-tokenizer data at import time so that
	# sent_tokenize (used below to chunk PDF text) has its models available.
	# NOTE(review): recent NLTK releases look up the "punkt_tab" resource
	# instead of "punkt" — confirm which one the deployed NLTK version needs.
	nltk.download("punkt")
	from nltk.tokenize import sent_tokenize

	def extract_text_from_docx(path):
	    """Return the non-empty paragraphs of a DOCX file as stripped strings.

	    Args:
	        path: Location of the document, passed to ``docx.Document``.

	    Returns:
	        A list with one stripped string per paragraph that contains
	        non-whitespace text. The list is empty when there are no such
	        paragraphs or when the file cannot be opened or parsed (the error
	        is printed rather than raised).
	    """
	    try:
	        doc = docx.Document(path)
	        # Strip each paragraph's text once, then drop the blank ones.
	        stripped = (para.text.strip() for para in doc.paragraphs)
	        return [text for text in stripped if text]
	    except Exception as e:
	        # Best-effort extraction: report the problem and return nothing.
	        print(f"❌ Failed to extract DOCX: {e}")
	        return []

	def extract_text_from_pdf(path):
	    """Extract a PDF's text as a list of paragraph-sized strings.

	    The text of every page that yields any is joined with newlines. If the
	    joined text contains blank-line paragraph breaks it is split on them;
	    otherwise it is grouped into chunks of up to five sentences.

	    Args:
	        path: Location of the PDF, passed to ``pdfplumber.open``.

	    Returns:
	        A list of non-empty strings. The list is empty when the file cannot
	        be read (the error is printed rather than raised) or when no text
	        could be extracted.
	    """
	    try:
	        with pdfplumber.open(path) as pdf:
	            # Extract each page exactly once; extraction is the costly step.
	            page_texts = [page.extract_text() for page in pdf.pages]
	    except Exception as e:
	        print(f"❌ Failed to extract PDF: {e}")
	        return []
	    all_text = "\n".join(text for text in page_texts if text)
	    if not all_text.strip():
	        return []
	    # Try splitting by paragraphs (blank-line separated).
	    paragraphs = [p.strip() for p in all_text.split("\n\n") if p.strip()]
	    # Non-blank text always produces at least one piece, so only more than
	    # one piece shows that real paragraph breaks were present.
	    if len(paragraphs) > 1:
	        return paragraphs
	    # Fallback: no paragraph breaks, so group into chunks of up to 5 sentences.
	    try:
	        sentences = sent_tokenize(all_text)
	    except LookupError as e:
	        # Tokenizer data is unavailable; keep the text as a single paragraph
	        # instead of failing the whole extraction.
	        print(f"❌ Sentence tokenizer unavailable: {type(e).__name__}")
	        return paragraphs
	    return [" ".join(sentences[i:i + 5]) for i in range(0, len(sentences), 5)]

	def extract_paragraphs(path):
	    """Extract paragraphs from a document, choosing the reader by extension.

	    The extension is compared case-insensitively. ``.pdf`` files go to
	    ``extract_text_from_pdf`` and ``.docx`` files to
	    ``extract_text_from_docx``.

	    Args:
	        path: Path of the document to read.

	    Returns:
	        Whatever the selected extractor returns for ``path``.

	    Raises:
	        ValueError: If the extension is neither ``.docx`` nor ``.pdf``.
	    """
	    _, suffix = os.path.splitext(path)
	    suffix = suffix.lower()
	    if suffix == ".pdf":
	        return extract_text_from_pdf(path)
	    if suffix == ".docx":
	        return extract_text_from_docx(path)
	    raise ValueError(f"Unsupported file type: {suffix}")