"""
tool_trainer.py - Fine-tune SmolLM3-3B for dynamic function calling using LoFT + DPO

This script loads SmolLM3-3B, attaches a LoRA adapter (rank 8), and trains it using
Direct Preference Optimization (DPO) on our preference pairs to teach JSON-only responses.

Key hyperparameters:
- LoRA rank: 8 (small adapter for efficiency)
- DPO beta: 0.1 (controls how strongly we prefer chosen over rejected)
- Epochs: 3 (enough to learn pattern without overfitting)
"""

import json

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import LoraConfig, get_peft_model, TaskType
from trl import DPOConfig, DPOTrainer
from datasets import Dataset

def load_preference_pairs(file_path="tool_pairs.jsonl"):
    """Load and parse the JSONL preference pairs."""
    pairs = []
    with open(file_path, "r") as f:
        for line in f:
            line = line.strip()
            if line:  # skip blank lines so json.loads doesn't choke on them
                pairs.append(json.loads(line))
    return pairs
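
# Illustrative record shape for tool_pairs.jsonl (one JSON object per line). The prompt and
# response text below are hypothetical; DPO only requires the three keys shown:
# {"prompt": "Call a tool: what's the weather in Paris?",
#  "chosen": "{\"name\": \"get_weather\", \"arguments\": {\"city\": \"Paris\"}}",
#  "rejected": "Sure! Let me check the weather for you..."}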

def format_for_dpo(pairs):
    """Convert our pairs to DPO trainer format."""
    formatted = []
    for pair in pairs:
        formatted.append({
            "prompt": pair["prompt"],
            "chosen": pair["chosen"], 
            "rejected": pair["rejected"]
        })
    return formatted

def main():
    print("πŸš€ Starting Dynamic Function-Calling Agent Training")
    print("=" * 60)
    
    # 1. Load the base model and tokenizer
    print("πŸ“₯ Loading SmolLM3-3B model and tokenizer...")
    model_name = "HuggingFaceTB/SmolLM2-1.7B-Instruct"  # Using available model
    
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
        device_map="auto" if torch.cuda.is_available() else None,
        trust_remote_code=True
    )
    
    print(f"βœ… Loaded model: {model_name}")
    print(f"πŸ”§ Model dtype: {model.dtype}")
    print(f"πŸ’Ύ Model size: ~{sum(p.numel() for p in model.parameters()) / 1e6:.1f}M parameters")
    
    # 2. Set up LoRA configuration
    print("\nπŸ”© Setting up LoRA adapter (rank 8)...")
    lora_config = LoraConfig(
        r=8,                    # Low rank - small adapter
        lora_alpha=16,          # Scaling factor (typically 2x rank)
        target_modules=["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
        lora_dropout=0.1,       # Prevent overfitting
        bias="none",
        task_type=TaskType.CAUSAL_LM
    )
    
    model = get_peft_model(model, lora_config)
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    total_params = sum(p.numel() for p in model.parameters())
    
    print(f"βœ… LoRA adapter attached")
    print(f"🎯 Trainable parameters: {trainable_params:,} ({trainable_params/total_params*100:.2f}%)")
    
    # 3. Load and prepare training data
    print("\nπŸ“Š Loading preference pairs...")
    pairs = load_preference_pairs()
    formatted_pairs = format_for_dpo(pairs)
    train_dataset = Dataset.from_list(formatted_pairs)
    
    print(f"βœ… Loaded {len(pairs)} preference pairs")
    print("πŸ“ Sample pair:")
    print(f"   Prompt: {pairs[0]['prompt'][:100]}...")
    print(f"   Chosen: {pairs[0]['chosen']}")
    print(f"   Rejected: {pairs[0]['rejected'][:50]}...")
    
    # 4. Set up the training configuration
    print("\nβš™οΈ Configuring training (3 epochs, Ξ²=0.1)...")
    training_args = DPOConfig(
        output_dir="./smollm_tool_adapter",
        num_train_epochs=3,
        per_device_train_batch_size=1,      # Small batch for memory efficiency
        gradient_accumulation_steps=4,       # Effective batch size = 1 x 4 = 4
        learning_rate=5e-5,
        warmup_steps=10,
        logging_steps=1,
        save_steps=50,
        eval_strategy="no",                  # No eval dataset is configured
        remove_unused_columns=False,
        fp16=torch.cuda.is_available(),      # Use fp16 if a GPU is available
        dataloader_pin_memory=False,
        report_to="none",                    # Disable wandb/tensorboard logging
        beta=0.1,                            # DPO hyperparameter - how strongly to prefer chosen over rejected
        max_length=512,                      # Max total sequence length (prompt + response)
        max_prompt_length=400,               # Max prompt length
    )
    
    # 5. Initialize DPO trainer
    print("πŸ‹οΈ Initializing DPO trainer...")
    # No explicit ref_model is passed: with a PEFT model, TRL derives the frozen reference
    # policy by running the base model with the LoRA adapter disabled.
    dpo_trainer = DPOTrainer(
        model,
        args=training_args,
        train_dataset=train_dataset,
        processing_class=tokenizer,          # Recent TRL versions take processing_class instead of tokenizer
    )
    
    print("βœ… DPO trainer ready")
    
    # 6. Start training
    print("\n🎯 Starting training...")
    print("⏱️  This should take ~8 minutes on M4 Max, longer on CPU")
    
    # Train the model. No eval dataset is configured (eval_strategy="no"), so instead of
    # calling evaluate() we read the logged training losses for a before/after comparison.
    train_result = dpo_trainer.train()
    
    # logging_steps=1 means every optimizer step records a "loss" entry in the log history
    loss_history = [log["loss"] for log in dpo_trainer.state.log_history if "loss" in log]
    initial_loss = loss_history[0] if loss_history else float("nan")
    final_loss = loss_history[-1] if loss_history else train_result.training_loss
    
    print("\nπŸŽ‰ Training completed!")
    print(f"πŸ“Š Final training loss: {train_result.training_loss:.4f}")
    print(f"πŸ“ˆ Loss improvement: {initial_loss:.4f} β†’ {final_loss:.4f}")
    
    # 7. Save the fine-tuned adapter
    print("\nπŸ’Ύ Saving model adapter...")
    model.save_pretrained("./smollm_tool_adapter")
    tokenizer.save_pretrained("./smollm_tool_adapter")
    
    print("βœ… Model saved to './smollm_tool_adapter'")
    print("🏁 Training complete! Ready for testing.")
    
    return model, tokenizer

if __name__ == "__main__":
    model, tokenizer = main()
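
# Usage sketch (illustrative, not executed here): re-loading the saved adapter for inference.
# Assumes the adapter directory written above; PeftModel re-attaches the LoRA weights to the base model.
#
#   from transformers import AutoModelForCausalLM, AutoTokenizer
#   from peft import PeftModel
#
#   base = AutoModelForCausalLM.from_pretrained("HuggingFaceTB/SmolLM2-1.7B-Instruct")
#   tok = AutoTokenizer.from_pretrained("./smollm_tool_adapter")
#   tuned = PeftModel.from_pretrained(base, "./smollm_tool_adapter")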