"""
tool_trainer.py - Fine-tune SmolLM3-3B for dynamic function calling using LoFT + DPO
This script loads SmolLM3-3B, attaches a LoRA adapter (rank 8), and trains it using
Direct Preference Optimization (DPO) on our preference pairs to teach JSON-only responses.
Key hyperparameters:
- LoRA rank: 8 (small adapter for efficiency)
- DPO beta: 0.1 (controls how strongly we prefer chosen over rejected)
- Epochs: 3 (enough to learn pattern without overfitting)
"""
import json

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import LoraConfig, get_peft_model, TaskType
from trl import DPOConfig, DPOTrainer
from datasets import Dataset


def load_preference_pairs(file_path="tool_pairs.jsonl"):
    """Load and parse the JSONL preference pairs."""
    pairs = []
    with open(file_path, 'r') as f:
        for line in f:
            line = line.strip()
            if line:  # Skip blank lines so a trailing newline doesn't break json.loads
                pairs.append(json.loads(line))
    return pairs
def format_for_dpo(pairs):
"""Convert our pairs to DPO trainer format."""
formatted = []
for pair in pairs:
formatted.append({
"prompt": pair["prompt"],
"chosen": pair["chosen"],
"rejected": pair["rejected"]
})
return formatted
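
# Illustrative example of one tool_pairs.jsonl record, showing the three fields the helpers
# above rely on (the field names match the code; the values are made up for illustration):
#
#   {"prompt": "Call the weather tool for Paris",
#    "chosen": "{\"name\": \"get_weather\", \"arguments\": {\"city\": \"Paris\"}}",
#    "rejected": "Sure! The weather in Paris is probably sunny."}
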
def main():
print("πŸš€ Starting Dynamic Function-Calling Agent Training")
print("=" * 60)
# 1. Load the base model and tokenizer
print("πŸ“₯ Loading SmolLM3-3B model and tokenizer...")
model_name = "HuggingFaceTB/SmolLM2-1.7B-Instruct" # Using available model
tokenizer = AutoTokenizer.from_pretrained(model_name)
if tokenizer.pad_token is None:
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(
model_name,
torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
device_map="auto" if torch.cuda.is_available() else None,
trust_remote_code=True
)
print(f"βœ… Loaded model: {model_name}")
print(f"πŸ”§ Model dtype: {model.dtype}")
print(f"πŸ’Ύ Model size: ~{sum(p.numel() for p in model.parameters()) / 1e6:.1f}M parameters")
# 2. Set up LoRA configuration
print("\nπŸ”© Setting up LoRA adapter (rank 8)...")
lora_config = LoraConfig(
r=8, # Low rank - small adapter
lora_alpha=16, # Scaling factor (typically 2x rank)
target_modules=["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
lora_dropout=0.1, # Prevent overfitting
bias="none",
task_type=TaskType.CAUSAL_LM
)
model = get_peft_model(model, lora_config)
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
total_params = sum(p.numel() for p in model.parameters())
print(f"βœ… LoRA adapter attached")
print(f"🎯 Trainable parameters: {trainable_params:,} ({trainable_params/total_params*100:.2f}%)")
# 3. Load and prepare training data
print("\nπŸ“Š Loading preference pairs...")
pairs = load_preference_pairs()
formatted_pairs = format_for_dpo(pairs)
train_dataset = Dataset.from_list(formatted_pairs)
print(f"βœ… Loaded {len(pairs)} preference pairs")
print("πŸ“ Sample pair:")
print(f" Prompt: {pairs[0]['prompt'][:100]}...")
print(f" Chosen: {pairs[0]['chosen']}")
print(f" Rejected: {pairs[0]['rejected'][:50]}...")
# 4. Set up training arguments
print("\nβš™οΈ Configuring training (3 epochs, Ξ²=0.1)...")
training_args = TrainingArguments(
output_dir="./smollm_tool_adapter",
num_train_epochs=3,
per_device_train_batch_size=1, # Small batch for memory efficiency
gradient_accumulation_steps=4, # Effective batch size = 4
learning_rate=5e-5,
warmup_steps=10,
logging_steps=1,
save_steps=50,
eval_strategy="no", # Updated parameter name
remove_unused_columns=False,
fp16=torch.cuda.is_available(), # Use fp16 if GPU available
dataloader_pin_memory=False,
report_to=None # Disable wandb logging
)
# 5. Initialize DPO trainer
print("πŸ‹οΈ Initializing DPO trainer...")
dpo_trainer = DPOTrainer(
model,
args=training_args,
train_dataset=train_dataset,
processing_class=tokenizer, # Updated parameter name
beta=0.1, # DPO hyperparameter - how strongly to prefer chosen
max_length=512, # Max sequence length
max_prompt_length=400, # Max prompt length
)
print("βœ… DPO trainer ready")
# 6. Start training
print("\n🎯 Starting training...")
print("⏱️ This should take ~8 minutes on M4 Max, longer on CPU")
    # Train the model (no eval split is configured, so dpo_trainer.evaluate() would fail;
    # we track progress through the losses logged every step instead)
    train_result = dpo_trainer.train()

    # Compare the first and last logged losses from the trainer state
    logged_losses = [log["loss"] for log in dpo_trainer.state.log_history if "loss" in log]
    initial_loss = logged_losses[0] if logged_losses else train_result.training_loss
    final_loss = logged_losses[-1] if logged_losses else train_result.training_loss

    print("\n🎉 Training completed!")
    print(f"📊 Final training loss: {train_result.training_loss:.4f}")
    print(f"📈 Loss improvement: {initial_loss:.4f} → {final_loss:.4f}")
# 7. Save the fine-tuned adapter
print("\nπŸ’Ύ Saving model adapter...")
model.save_pretrained("./smollm_tool_adapter")
tokenizer.save_pretrained("./smollm_tool_adapter")
print("βœ… Model saved to './smollm_tool_adapter'")
print("🏁 Training complete! Ready for testing.")
return model, tokenizer
if __name__ == "__main__":
model, tokenizer = main()
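
# Minimal sketch for loading the saved adapter later for inference (assumes the same base
# checkpoint and PEFT's standard PeftModel loading API; adjust paths/dtypes to your setup):
#
#   from peft import PeftModel
#   base = AutoModelForCausalLM.from_pretrained("HuggingFaceTB/SmolLM2-1.7B-Instruct")
#   tuned = PeftModel.from_pretrained(base, "./smollm_tool_adapter")
#   tuned.eval()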