"""
tool_trainer.py - Fine-tune SmolLM3-3B for dynamic function calling using LoFT + DPO
This script loads SmolLM3-3B, attaches a LoRA adapter (rank 8), and trains it using
Direct Preference Optimization (DPO) on our preference pairs to teach JSON-only responses.
Key hyperparameters:
- LoRA rank: 8 (small adapter for efficiency)
- DPO beta: 0.1 (controls how strongly we prefer chosen over rejected)
- Epochs: 3 (enough to learn pattern without overfitting)
"""

import json

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import LoraConfig, get_peft_model, TaskType
from trl import DPOConfig, DPOTrainer
from datasets import Dataset


def load_preference_pairs(file_path="tool_pairs.jsonl"):
    """Load and parse the JSONL preference pairs."""
    pairs = []
    with open(file_path, 'r') as f:
        for line in f:
            pairs.append(json.loads(line.strip()))
    return pairs
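
# Each line of tool_pairs.jsonl is expected to provide the three keys consumed below.
# A hypothetical record (illustrative values, not taken from the real dataset) might look like:
#   {"prompt": "Call the weather tool for Paris",
#    "chosen": "{\"name\": \"get_weather\", \"arguments\": {\"city\": \"Paris\"}}",
#    "rejected": "Sure! Let me check the weather for you..."}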


def format_for_dpo(pairs):
    """Convert our pairs to DPO trainer format."""
    formatted = []
    for pair in pairs:
        formatted.append({
            "prompt": pair["prompt"],
            "chosen": pair["chosen"],
            "rejected": pair["rejected"]
        })
    return formatted
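
# For reference, DPO (Rafailov et al., 2023) minimizes
#   -log sigmoid( beta * [ (log pi_theta(chosen|prompt)   - log pi_ref(chosen|prompt))
#                        - (log pi_theta(rejected|prompt) - log pi_ref(rejected|prompt)) ] )
# so each record above supplies exactly the (prompt, chosen, rejected) triple the loss needs,
# and beta (0.1 here) controls how strongly the policy is pushed away from the reference model.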


def main():
    print("Starting Dynamic Function-Calling Agent Training")
    print("=" * 60)

    # 1. Load the base model and tokenizer
    print("Loading base model and tokenizer...")
    model_name = "HuggingFaceTB/SmolLM2-1.7B-Instruct"  # Available stand-in for SmolLM3-3B
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
        device_map="auto" if torch.cuda.is_available() else None,
        trust_remote_code=True
    )
    print(f"Loaded model: {model_name}")
    print(f"Model dtype: {model.dtype}")
    print(f"Model size: ~{sum(p.numel() for p in model.parameters()) / 1e6:.1f}M parameters")

    # 2. Set up LoRA configuration
    print("\nSetting up LoRA adapter (rank 8)...")
    lora_config = LoraConfig(
        r=8,               # Low rank - small adapter
        lora_alpha=16,     # Scaling factor (typically 2x rank)
        target_modules=["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
        lora_dropout=0.1,  # Prevent overfitting
        bias="none",
        task_type=TaskType.CAUSAL_LM
    )
    model = get_peft_model(model, lora_config)

    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    total_params = sum(p.numel() for p in model.parameters())
    print("LoRA adapter attached")
    print(f"Trainable parameters: {trainable_params:,} ({trainable_params/total_params*100:.2f}%)")

    # 3. Load and prepare training data
    print("\nLoading preference pairs...")
    pairs = load_preference_pairs()
    formatted_pairs = format_for_dpo(pairs)
    train_dataset = Dataset.from_list(formatted_pairs)
    print(f"Loaded {len(pairs)} preference pairs")
    print("Sample pair:")
    print(f"  Prompt: {pairs[0]['prompt'][:100]}...")
    print(f"  Chosen: {pairs[0]['chosen']}")
    print(f"  Rejected: {pairs[0]['rejected'][:50]}...")

    # 4. Set up training configuration (DPOConfig carries both Trainer and DPO-specific settings)
    print("\nConfiguring training (3 epochs, β=0.1)...")
    training_args = DPOConfig(
        output_dir="./smollm_tool_adapter",
        num_train_epochs=3,
        per_device_train_batch_size=1,   # Small batch for memory efficiency
        gradient_accumulation_steps=4,   # Effective batch size = 4
        learning_rate=5e-5,
        warmup_steps=10,
        logging_steps=1,
        save_steps=50,
        eval_strategy="no",              # No automatic eval loop during training
        beta=0.1,                        # DPO hyperparameter - how strongly to prefer chosen
        max_length=512,                  # Max sequence length
        max_prompt_length=400,           # Max prompt length
        remove_unused_columns=False,
        fp16=torch.cuda.is_available(),  # Use fp16 if GPU available
        dataloader_pin_memory=False,
        report_to="none"                 # Disable wandb/other logging integrations
    )
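
    # Rough schedule: one optimizer step covers per_device_train_batch_size * gradient_accumulation_steps
    # = 4 preference pairs, so total steps ≈ 3 * ceil(len(pairs) / 4); warmup_steps=10 assumes the
    # dataset is large enough that 10 steps is only a small fraction of training.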

    # 5. Initialize DPO trainer
    print("Initializing DPO trainer...")
    dpo_trainer = DPOTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=train_dataset,      # Reused for the before/after loss check below (no held-out set)
        processing_class=tokenizer,
    )
    print("DPO trainer ready")
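
    # Note: no explicit ref_model is passed. For a PEFT/LoRA model, trl computes the DPO
    # reference log-probs by running the same base weights with the adapter disabled, so a
    # second full copy of the model is not kept in memory.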

    # 6. Start training
    print("\nStarting training...")
    print("This should take ~8 minutes on an M4 Max, longer on CPU")

    # Get initial loss for comparison (evaluated on the training pairs)
    initial_logs = dpo_trainer.evaluate()
    initial_loss = initial_logs.get("eval_loss", float("nan"))
    print(f"Initial loss: {initial_loss:.4f}")

    # Train the model
    train_result = dpo_trainer.train()

    # Get final loss
    final_logs = dpo_trainer.evaluate()
    final_loss = final_logs.get("eval_loss", train_result.training_loss)

    print("\nTraining completed!")
    print(f"Final training loss: {train_result.training_loss:.4f}")
    print(f"Loss improvement: {initial_loss:.4f} → {final_loss:.4f}")

    # 7. Save the fine-tuned adapter
    print("\nSaving model adapter...")
    model.save_pretrained("./smollm_tool_adapter")
    tokenizer.save_pretrained("./smollm_tool_adapter")
    print("Model saved to './smollm_tool_adapter'")
    print("Training complete! Ready for testing.")

    return model, tokenizer
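

# To reload the trained adapter for inference later (a sketch assuming the standard peft API and
# the same base checkpoint used above; paths and device handling may need adjusting):
#   from peft import PeftModel
#   base = AutoModelForCausalLM.from_pretrained("HuggingFaceTB/SmolLM2-1.7B-Instruct")
#   tuned_model = PeftModel.from_pretrained(base, "./smollm_tool_adapter")
#   tuned_tokenizer = AutoTokenizer.from_pretrained("./smollm_tool_adapter")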

if __name__ == "__main__":
    model, tokenizer = main()