Spaces:
Running
Running
import os | |
import json | |
from preprocess import extract_paragraphs | |
# Label mapping: lower-cased class-folder name -> integer class id stored in
# each sample's "label" field. Folder names that are not keys here are skipped
# by the extraction loop.
LABELS = {
    "ai": 1,
    "human": 0,
    "mixed": 2,  # third class, so the dataset is 3-class whenever a "mixed" folder exists
}
# Directory expected to hold one sub-folder per class. A sub-folder is used
# only when its lower-cased name is a key of LABELS. A missing directory is
# deliberately left to raise from os.listdir() rather than yield an empty dataset.
root_dir = "training_data"

# File extensions that are handed to extract_paragraphs(); all other files are
# skipped.
SUPPORTED_EXTENSIONS = (".pdf", ".docx")

# Accumulates one {"text": <stripped paragraph>, "label": <int>} record per
# non-blank paragraph.
dataset = []

# os.listdir() returns entries in arbitrary order, so sort at both levels to
# make the resulting dataset reproducible from run to run.
for label_folder in sorted(os.listdir(root_dir)):
    folder_path = os.path.join(root_dir, label_folder)
    if not os.path.isdir(folder_path):
        continue

    label = LABELS.get(label_folder.lower())
    if label is None:
        # Folder name is not a known class; ignore it.
        continue

    for filename in sorted(os.listdir(folder_path)):
        # Compare case-insensitively so e.g. "REPORT.PDF" is not silently
        # dropped. NOTE(review): assumes extract_paragraphs() copes with
        # upper-case extensions; if it does not, the file is reported as
        # failed below instead of being skipped without a trace.
        if not filename.lower().endswith(SUPPORTED_EXTENSIONS):
            continue

        file_path = os.path.join(folder_path, filename)
        # NOTE(review): the leading glyphs in the messages below look
        # mis-encoded (probably emoji originally); kept unchanged.
        print(f"π Extracting: {file_path}")
        try:
            paragraphs = extract_paragraphs(file_path)
            # Build the file's samples in full before touching `dataset`, so a
            # file that fails part-way contributes nothing.
            samples = [
                {"text": text, "label": label}
                for text in (para.strip() for para in paragraphs)
                if text
            ]
        except Exception as e:
            # Best effort: one unreadable document must not abort the whole
            # run, so report it and move on to the next file.
            print(f"β Failed: {file_path} β {e}")
        else:
            dataset.extend(samples)
# Save dataset
# The file is written as UTF-8 with ensure_ascii=False so non-ASCII characters
# in the extracted text are stored as-is instead of as \uXXXX escapes.
output_path = "ai_training_dataset.json"
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(dataset, f, indent=2, ensure_ascii=False)

# Report only after the file has been closed.
print(f"\nβ Saved {len(dataset)} samples.")