Spaces:
Running
Running
File size: 1,255 Bytes
a53dc0a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 |
import os
import json
from preprocess import extract_paragraphs
# Class-label mapping: training sub-folder name -> integer class id.
# Folder names are lower-cased before the lookup, so keys must be lower-case.
LABELS = {
    "ai": 1,
    "human": 0,
    "mixed": 2,  # third class, used for 3-class classification
}
# Root directory expected to hold one sub-folder per class label (see LABELS).
root_dir = "training_data"
# One {"text": ..., "label": ...} record per non-empty extracted paragraph.
dataset = []

# sorted() makes the traversal order -- and therefore the dataset order --
# reproducible; os.listdir() returns entries in arbitrary order.
for label_folder in sorted(os.listdir(root_dir)):
    folder_path = os.path.join(root_dir, label_folder)
    if not os.path.isdir(folder_path):
        continue

    # Folder names are matched case-insensitively; folders without a known
    # label are skipped.
    label = LABELS.get(label_folder.lower())
    if label is None:
        continue

    for filename in sorted(os.listdir(folder_path)):
        # Compare the extension case-insensitively so files such as
        # "REPORT.PDF" are not silently dropped.
        if not filename.lower().endswith((".pdf", ".docx")):
            continue

        file_path = os.path.join(folder_path, filename)
        print(f"Extracting: {file_path}")
        try:
            for para in extract_paragraphs(file_path):
                text = para.strip()
                # Whitespace-only paragraphs carry no training signal.
                if text:
                    dataset.append({"text": text, "label": label})
        except Exception as e:
            # Best-effort: one unreadable document must not abort the whole
            # run, so report it and continue with the next file.
            print(f"Failed: {file_path} - {e}")
# Save dataset
# ensure_ascii=False writes non-ASCII characters as-is instead of \uXXXX
# escapes, so the file is opened explicitly as UTF-8.
with open("ai_training_dataset.json", "w", encoding="utf-8") as f:
    json.dump(dataset, f, indent=2, ensure_ascii=False)

print(f"\nSaved {len(dataset)} samples.")
|