""" tool_trainer.py - Fine-tune SmolLM3-3B for dynamic function calling using LoFT + DPO This script loads SmolLM3-3B, attaches a LoRA adapter (rank 8), and trains it using Direct Preference Optimization (DPO) on our preference pairs to teach JSON-only responses. Key hyperparameters: - LoRA rank: 8 (small adapter for efficiency) - DPO beta: 0.1 (controls how strongly we prefer chosen over rejected) - Epochs: 3 (enough to learn pattern without overfitting) """ import json import torch from transformers import ( AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer ) from peft import LoraConfig, get_peft_model, TaskType from trl import DPOTrainer from datasets import Dataset import os def load_preference_pairs(file_path="tool_pairs.jsonl"): """Load and parse the JSONL preference pairs.""" pairs = [] with open(file_path, 'r') as f: for line in f: pairs.append(json.loads(line.strip())) return pairs def format_for_dpo(pairs): """Convert our pairs to DPO trainer format.""" formatted = [] for pair in pairs: formatted.append({ "prompt": pair["prompt"], "chosen": pair["chosen"], "rejected": pair["rejected"] }) return formatted def main(): print("šŸš€ Starting Dynamic Function-Calling Agent Training") print("=" * 60) # 1. Load the base model and tokenizer print("šŸ“„ Loading SmolLM3-3B model and tokenizer...") model_name = "HuggingFaceTB/SmolLM2-1.7B-Instruct" # Using available model tokenizer = AutoTokenizer.from_pretrained(model_name) if tokenizer.pad_token is None: tokenizer.pad_token = tokenizer.eos_token model = AutoModelForCausalLM.from_pretrained( model_name, torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32, device_map="auto" if torch.cuda.is_available() else None, trust_remote_code=True ) print(f"āœ… Loaded model: {model_name}") print(f"šŸ”§ Model dtype: {model.dtype}") print(f"šŸ’¾ Model size: ~{sum(p.numel() for p in model.parameters()) / 1e6:.1f}M parameters") # 2. Set up LoRA configuration print("\nšŸ”© Setting up LoRA adapter (rank 8)...") lora_config = LoraConfig( r=8, # Low rank - small adapter lora_alpha=16, # Scaling factor (typically 2x rank) target_modules=["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "up_proj", "down_proj"], lora_dropout=0.1, # Prevent overfitting bias="none", task_type=TaskType.CAUSAL_LM ) model = get_peft_model(model, lora_config) trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad) total_params = sum(p.numel() for p in model.parameters()) print(f"āœ… LoRA adapter attached") print(f"šŸŽÆ Trainable parameters: {trainable_params:,} ({trainable_params/total_params*100:.2f}%)") # 3. Load and prepare training data print("\nšŸ“Š Loading preference pairs...") pairs = load_preference_pairs() formatted_pairs = format_for_dpo(pairs) train_dataset = Dataset.from_list(formatted_pairs) print(f"āœ… Loaded {len(pairs)} preference pairs") print("šŸ“ Sample pair:") print(f" Prompt: {pairs[0]['prompt'][:100]}...") print(f" Chosen: {pairs[0]['chosen']}") print(f" Rejected: {pairs[0]['rejected'][:50]}...") # 4. 


def main():
    print("šŸš€ Starting Dynamic Function-Calling Agent Training")
    print("=" * 60)

    # 1. Load the base model and tokenizer
    print("šŸ“„ Loading SmolLM2-1.7B-Instruct model and tokenizer...")
    model_name = "HuggingFaceTB/SmolLM2-1.7B-Instruct"  # Using available model
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
        device_map="auto" if torch.cuda.is_available() else None,
        trust_remote_code=True
    )

    print(f"āœ… Loaded model: {model_name}")
    print(f"šŸ”§ Model dtype: {model.dtype}")
    print(f"šŸ’¾ Model size: ~{sum(p.numel() for p in model.parameters()) / 1e6:.1f}M parameters")

    # 2. Set up LoRA configuration
    print("\nšŸ”© Setting up LoRA adapter (rank 8)...")
    lora_config = LoraConfig(
        r=8,               # Low rank - small adapter
        lora_alpha=16,     # Scaling factor (typically 2x rank)
        target_modules=["q_proj", "v_proj", "k_proj", "o_proj",
                        "gate_proj", "up_proj", "down_proj"],
        lora_dropout=0.1,  # Prevent overfitting
        bias="none",
        task_type=TaskType.CAUSAL_LM
    )

    model = get_peft_model(model, lora_config)
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    total_params = sum(p.numel() for p in model.parameters())

    print("āœ… LoRA adapter attached")
    print(f"šŸŽÆ Trainable parameters: {trainable_params:,} ({trainable_params/total_params*100:.2f}%)")

    # 3. Load and prepare training data
    print("\nšŸ“Š Loading preference pairs...")
    pairs = load_preference_pairs()
    formatted_pairs = format_for_dpo(pairs)
    train_dataset = Dataset.from_list(formatted_pairs)

    print(f"āœ… Loaded {len(pairs)} preference pairs")
    print("šŸ“ Sample pair:")
    print(f"   Prompt: {pairs[0]['prompt'][:100]}...")
    print(f"   Chosen: {pairs[0]['chosen']}")
    print(f"   Rejected: {pairs[0]['rejected'][:50]}...")

    # 4. Set up training arguments
    print("\nāš™ļø Configuring training (3 epochs, β=0.1)...")
    training_args = DPOConfig(
        output_dir="./smollm_tool_adapter",
        num_train_epochs=3,
        per_device_train_batch_size=1,   # Small batch for memory efficiency
        gradient_accumulation_steps=4,   # Effective batch size = 4
        learning_rate=5e-5,
        warmup_steps=10,
        logging_steps=1,
        save_steps=50,
        eval_strategy="no",              # Renamed from evaluation_strategy in newer transformers
        remove_unused_columns=False,
        fp16=torch.cuda.is_available(),  # Use fp16 if a GPU is available
        dataloader_pin_memory=False,
        report_to="none",                # Disable wandb/tensorboard logging
        # DPO-specific settings live on DPOConfig in recent TRL releases,
        # rather than being passed directly to DPOTrainer:
        beta=0.1,                        # How strongly to prefer chosen over rejected
        max_length=512,                  # Max sequence length
        max_prompt_length=400,           # Max prompt length
    )

    # 5. Initialize DPO trainer
    print("šŸ‹ļø Initializing DPO trainer...")
    dpo_trainer = DPOTrainer(
        model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=train_dataset,   # Reuse the small training set so evaluate() has data
        processing_class=tokenizer,   # Replaces the deprecated `tokenizer=` argument
    )
    print("āœ… DPO trainer ready")

    # 6. Start training
    print("\nšŸŽÆ Starting training...")
    print("ā±ļø This should take ~8 minutes on M4 Max, longer on CPU")

    # Get initial loss for comparison
    initial_logs = dpo_trainer.evaluate()
    initial_loss = initial_logs.get('eval_loss', 'N/A')
    print(f"šŸ“Š Initial loss: {initial_loss}")

    # Train the model
    train_result = dpo_trainer.train()

    # Get final loss
    final_logs = dpo_trainer.evaluate()
    final_loss = final_logs.get('eval_loss', train_result.training_loss)

    print("\nšŸŽ‰ Training completed!")
    print(f"šŸ“Š Final training loss: {train_result.training_loss:.4f}")
    print(f"šŸ“ˆ Loss improvement: {initial_loss} → {final_loss:.4f}")

    # 7. Save the fine-tuned adapter
    print("\nšŸ’¾ Saving model adapter...")
    model.save_pretrained("./smollm_tool_adapter")
    tokenizer.save_pretrained("./smollm_tool_adapter")

    print("āœ… Model saved to './smollm_tool_adapter'")
    print("šŸ Training complete! Ready for testing.")

    return model, tokenizer


if __name__ == "__main__":
    model, tokenizer = main()
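

# Optional: a minimal sketch of how the saved adapter could be reloaded for
# inference. This helper (its name, and the generation settings max_new_tokens
# and do_sample) is an illustrative assumption and is never called by the
# training run above; only the adapter directory and base model name come from
# the script itself.
def load_adapter_for_inference(adapter_dir="./smollm_tool_adapter",
                               base_model_name="HuggingFaceTB/SmolLM2-1.7B-Instruct"):
    """Reattach the trained LoRA adapter to the base model and return a generate() helper."""
    from peft import PeftModel

    tok = AutoTokenizer.from_pretrained(adapter_dir)  # tokenizer was saved alongside the adapter
    base = AutoModelForCausalLM.from_pretrained(
        base_model_name,
        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
        device_map="auto" if torch.cuda.is_available() else None,
    )
    tuned = PeftModel.from_pretrained(base, adapter_dir)
    tuned.eval()
    device = next(tuned.parameters()).device

    def generate(prompt, max_new_tokens=128):
        # Greedy decoding keeps the JSON-only output deterministic
        inputs = tok(prompt, return_tensors="pt").to(device)
        with torch.no_grad():
            output = tuned.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=False)
        # Strip the prompt tokens and return only the generated completion
        return tok.decode(output[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)

    return tuned, tok, generate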