# StealthWriter / detector / create_dataset.py
# AlyanAkram's picture
# Upload 11 files
# a53dc0a verified
import os
import json
from preprocess import extract_paragraphs
# Label mapping: training sub-folder name -> integer class id stored with
# each sample. Folder names are lower-cased before being looked up here.
LABELS = {
    "ai": 1,
    "human": 0,
    "mixed": 2,  # third class, used for 3-class classification
}
# Root folder expected to contain one sub-folder per class name in LABELS.
root_dir = "training_data"

# Document types handed to extract_paragraphs; matched case-insensitively so
# files named e.g. "REPORT.PDF" are not silently skipped.
SUPPORTED_EXTENSIONS = (".pdf", ".docx")

# Accumulated samples, each {"text": <stripped paragraph>, "label": <int>}.
dataset = []

# os.listdir returns entries in arbitrary order; sorting keeps the sample
# order (and therefore the output file) reproducible between runs.
for label_folder in sorted(os.listdir(root_dir)):
    folder_path = os.path.join(root_dir, label_folder)
    if not os.path.isdir(folder_path):
        continue

    # Folders whose (lower-cased) name is not a known class are ignored.
    label = LABELS.get(label_folder.lower())
    if label is None:
        continue

    for filename in sorted(os.listdir(folder_path)):
        if not filename.lower().endswith(SUPPORTED_EXTENSIONS):
            continue

        file_path = os.path.join(folder_path, filename)
        print(f"📄 Extracting: {file_path}")

        # Best-effort: a document that cannot be processed is reported and
        # skipped so one bad file does not abort the whole run. The loop stays
        # inside the try so a failure while consuming the paragraphs is
        # handled the same way as a failure of the extraction call itself.
        try:
            paragraphs = extract_paragraphs(file_path)
            for para in paragraphs:
                text = para.strip()
                if text:  # drop empty / whitespace-only paragraphs
                    dataset.append({
                        "text": text,
                        "label": label,
                    })
        except Exception as e:
            print(f"❌ Failed: {file_path} — {e}")
# Save dataset
# ensure_ascii=False writes non-ASCII characters as-is instead of \uXXXX
# escapes, which is why the file is opened explicitly as UTF-8.
with open("ai_training_dataset.json", "w", encoding="utf-8") as f:
    json.dump(dataset, f, indent=2, ensure_ascii=False)

# Reported after the with-block so the file is closed before success is claimed.
print(f"\n✅ Saved {len(dataset)} samples.")