Spaces:

AlyanAkram
/

StealthWriter

Running

StealthWriter / detector /create_dataset.py

Upload 11 files

a53dc0a verified 2 months ago

1.26 kB

	import os
	import json
	from preprocess import extract_paragraphs

	# Build a labelled paragraph dataset from the documents under `root_dir`.
	#
	# Expected layout: one sub-folder per class (named after a key of LABELS,
	# compared case-insensitively), each holding .pdf / .docx files. Every
	# non-blank paragraph of every document becomes one {"text", "label"} sample,
	# and the full list is written to ai_training_dataset.json.

	# Map each class sub-folder name (lower-cased) to its integer label.
	# Sub-folders whose names are not listed here are skipped by the loop below.
	LABELS = {
	    "ai": 1,
	    "human": 0,
	    "mixed": 2,  # third class; remove this entry to skip the "mixed" folder
	}

	# File types handed to extract_paragraphs(). Matched against the lower-cased
	# file name so that "Report.PDF" or "essay.DOCX" are not silently ignored.
	DOCUMENT_EXTENSIONS = (".pdf", ".docx")

	root_dir = "training_data"
	dataset = []

	# os.listdir() returns entries in arbitrary order; sorting both levels makes
	# the generated dataset reproducible between runs and machines.
	for label_folder in sorted(os.listdir(root_dir)):
	    folder_path = os.path.join(root_dir, label_folder)
	    if not os.path.isdir(folder_path):
	        continue

	    label = LABELS.get(label_folder.lower())
	    if label is None:
	        # Not a known class folder.
	        continue

	    for filename in sorted(os.listdir(folder_path)):
	        if not filename.lower().endswith(DOCUMENT_EXTENSIONS):
	            continue

	        file_path = os.path.join(folder_path, filename)
	        print(f"📄 Extracting: {file_path}")
	        try:
	            # NOTE(review): extract_paragraphs comes from the project's
	            # `preprocess` module; it is assumed to yield strings - confirm.
	            # The iteration stays inside the `try` so a failure raised while
	            # consuming the result is reported the same way as a failure of
	            # the call itself.
	            paragraphs = extract_paragraphs(file_path)
	            for para in paragraphs:
	                text = para.strip()
	                if text:
	                    dataset.append({"text": text, "label": label})
	        except Exception as e:
	            # Deliberate best-effort: one unreadable document must not abort
	            # the whole run, so report it and move on to the next file.
	            print(f"❌ Failed: {file_path} — {e}")

	# Save dataset. ensure_ascii=False keeps non-ASCII text readable in the JSON,
	# which is why the file is opened explicitly as UTF-8.
	with open("ai_training_dataset.json", "w", encoding="utf-8") as f:
	    json.dump(dataset, f, indent=2, ensure_ascii=False)

	print(f"\n✅ Saved {len(dataset)} samples.")