"""Build a labelled paragraph dataset from documents under ``training_data``.

Each immediate sub-folder of ``root_dir`` whose lower-cased name is a key of
``LABELS`` is scanned for ``.pdf`` / ``.docx`` files. Every non-blank
paragraph yielded by ``extract_paragraphs`` becomes one
``{"text": ..., "label": ...}`` record, and all records are written to
``ai_training_dataset.json`` in the current working directory.
"""

import json
import os

from preprocess import extract_paragraphs

# Folder name (matched case-insensitively) -> integer class label.
LABELS = {
    "ai": 1,
    "human": 0,
    # Third class. Remove this entry to skip the "mixed" folder entirely and
    # build a binary ai/human dataset instead.
    "mixed": 2,
}

root_dir = "training_data"
dataset = []

# os.listdir() returns entries in arbitrary order; sorting keeps the order of
# the generated dataset reproducible across runs and platforms.
for label_folder in sorted(os.listdir(root_dir)):
    folder_path = os.path.join(root_dir, label_folder)
    if not os.path.isdir(folder_path):
        continue

    label = LABELS.get(label_folder.lower())
    if label is None:
        # Folder name is not a known class; ignore it.
        continue

    for filename in sorted(os.listdir(folder_path)):
        # Compare the extension case-insensitively so "REPORT.PDF" is not
        # silently skipped.
        if not filename.lower().endswith((".pdf", ".docx")):
            continue

        file_path = os.path.join(folder_path, filename)
        # Status symbols are written as \N{...} escapes so the source file
        # stays ASCII-only and cannot be mangled by a wrong editor encoding.
        print(f"\N{PAGE FACING UP} Extracting: {file_path}")

        # Best-effort per file: one unreadable document must not abort the
        # whole run, so any failure is reported and the loop moves on.
        # NOTE(review): assumes extract_paragraphs returns an iterable of
        # str - confirm against preprocess.py.
        try:
            for para in extract_paragraphs(file_path):
                text = para.strip()
                if text:
                    dataset.append({"text": text, "label": label})
        except Exception as e:
            print(f"\N{CROSS MARK} Failed: {file_path} \N{EM DASH} {e}")

# Save dataset
with open("ai_training_dataset.json", "w", encoding="utf-8") as f:
    json.dump(dataset, f, indent=2, ensure_ascii=False)

print(f"\n\N{WHITE HEAVY CHECK MARK} Saved {len(dataset)} samples.")