import json
import torch
from datasets import Dataset
import evaluate
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments

# Load dataset from JSON.
# Expected format: a list of objects like {"text": "...", "label": 0|1|2}.
with open("ai_training_dataset.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Check that all labels are integers (0, 1, or 2)
for item in data:
    item["label"] = int(item["label"])  # ensure type is correct

# Convert to a HuggingFace Dataset and hold out 20% for evaluation
dataset = Dataset.from_list(data)
dataset = dataset.train_test_split(test_size=0.2)
train_dataset = dataset["train"]
eval_dataset = dataset["test"]

# Load tokenizer
model_name = "roberta-base"
tokenizer = RobertaTokenizer.from_pretrained(model_name)

# Tokenization function
def tokenize(example):
    return tokenizer(example["text"], truncation=True, padding="max_length", max_length=512)

# Tokenize datasets
train_dataset = train_dataset.map(tokenize, batched=True)
eval_dataset = eval_dataset.map(tokenize, batched=True)

# Keep only model-required fields (the Trainer's default collator renames "label" to "labels")
train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])
eval_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

# Load model with 3 output labels
model = RobertaForSequenceClassification.from_pretrained(model_name, num_labels=3)

# Optional: define metrics
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred  # logits arrive as a NumPy array
    preds = torch.argmax(torch.tensor(logits), dim=1)
    return accuracy.compute(predictions=preds, references=labels)

# Training configuration
training_args = TrainingArguments(
    output_dir="./models/roberta-detector",
    evaluation_strategy="epoch",  # MUST match save_strategy for load_best_model_at_end;
                                  # note: renamed to eval_strategy in transformers >= 4.41
    save_strategy="epoch",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    logging_steps=10,
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    report_to="none",  # prevents WandB issues
)

# Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)

# Train
trainer.train()

# Save model + tokenizer
model.save_pretrained("./models/roberta-detector")
tokenizer.save_pretrained("./models/roberta-detector")

print("✅ Model trained and saved.")
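
# --- Optional: quick inference sanity check ----------------------------------
# A minimal sketch of reloading the saved checkpoint and classifying one text.
# ASSUMPTION: the id2label mapping below is hypothetical, for illustration only;
# the script above never defines what labels 0, 1, and 2 actually mean.
clf_model = RobertaForSequenceClassification.from_pretrained("./models/roberta-detector")
clf_tokenizer = RobertaTokenizer.from_pretrained("./models/roberta-detector")
clf_model.eval()

id2label = {0: "human", 1: "ai", 2: "mixed"}  # hypothetical mapping

sample = "Some text whose origin we want to classify."
inputs = clf_tokenizer(sample, truncation=True, max_length=512, return_tensors="pt")
with torch.no_grad():
    logits = clf_model(**inputs).logits
pred = int(torch.argmax(logits, dim=-1))
print(f"Predicted class: {pred} ({id2label.get(pred, 'unknown')})")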