"""
tool_trainer.py - Fine-tune SmolLM2-1.7B-Instruct for dynamic function calling using LoRA + DPO

This script loads SmolLM2-1.7B-Instruct, attaches a LoRA adapter (rank 8), and trains it using
Direct Preference Optimization (DPO) on our preference pairs to teach JSON-only responses.

Key hyperparameters:
- LoRA rank: 8 (small adapter for efficiency)
- DPO beta: 0.1 (controls how strongly we prefer chosen over rejected)
- Epochs: 3 (enough to learn the pattern without overfitting)
"""

import json
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import LoraConfig, get_peft_model, TaskType
from trl import DPOConfig, DPOTrainer
from datasets import Dataset


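# Illustrative only: the record shape that load_preference_pairs() below assumes
# for each line of tool_pairs.jsonl. The tool name and texts are made-up examples;
# the script only relies on the "prompt", "chosen", and "rejected" keys.
#
#   {"prompt": "Tools: get_weather(city). User: What's the weather in Paris?",
#    "chosen": "{\"name\": \"get_weather\", \"arguments\": {\"city\": \"Paris\"}}",
#    "rejected": "Sure! Let me check the weather in Paris for you."}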
def load_preference_pairs(file_path="tool_pairs.jsonl"):
    """Load and parse the JSONL preference pairs."""
    pairs = []
    with open(file_path, 'r') as f:
        for line in f:
            pairs.append(json.loads(line.strip()))
    return pairs


def format_for_dpo(pairs):
    """Convert our pairs to DPO trainer format."""
    formatted = []
    for pair in pairs:
        formatted.append({
            "prompt": pair["prompt"],
            "chosen": pair["chosen"],
            "rejected": pair["rejected"]
        })
    return formatted


def main():
    print("🚀 Starting Dynamic Function-Calling Agent Training")
    print("=" * 60)

    print("📥 Loading SmolLM2-1.7B-Instruct model and tokenizer...")
    model_name = "HuggingFaceTB/SmolLM2-1.7B-Instruct"

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
        device_map="auto" if torch.cuda.is_available() else None,
        trust_remote_code=True
    )

    print(f"✅ Loaded model: {model_name}")
    print(f"🔧 Model dtype: {model.dtype}")
    print(f"💾 Model size: ~{sum(p.numel() for p in model.parameters()) / 1e6:.1f}M parameters")

    print("\n🔩 Setting up LoRA adapter (rank 8)...")
    lora_config = LoraConfig(
        r=8,
        lora_alpha=16,
        target_modules=["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
        lora_dropout=0.1,
        bias="none",
        task_type=TaskType.CAUSAL_LM
    )

    model = get_peft_model(model, lora_config)
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    total_params = sum(p.numel() for p in model.parameters())

    print("✅ LoRA adapter attached")
    print(f"🎯 Trainable parameters: {trainable_params:,} ({trainable_params/total_params*100:.2f}%)")

    print("\n📊 Loading preference pairs...")
    pairs = load_preference_pairs()
    formatted_pairs = format_for_dpo(pairs)
    train_dataset = Dataset.from_list(formatted_pairs)

    print(f"✅ Loaded {len(pairs)} preference pairs")
    print("📝 Sample pair:")
    print(f"   Prompt: {pairs[0]['prompt'][:100]}...")
    print(f"   Chosen: {pairs[0]['chosen']}")
    print(f"   Rejected: {pairs[0]['rejected'][:50]}...")

    print("\n⚙️ Configuring training (3 epochs, β=0.1)...")
    # DPO-specific knobs (beta, sequence lengths) live on DPOConfig, which
    # extends TrainingArguments, so a single config object covers everything.
    training_args = DPOConfig(
        output_dir="./smollm_tool_adapter",
        num_train_epochs=3,
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        learning_rate=5e-5,
        warmup_steps=10,
        logging_steps=1,
        save_steps=50,
        eval_strategy="no",
        remove_unused_columns=False,
        fp16=torch.cuda.is_available(),
        dataloader_pin_memory=False,
        report_to="none",
        beta=0.1,
        max_length=512,
        max_prompt_length=400,
    )

    print("🏗️ Initializing DPO trainer...")
    dpo_trainer = DPOTrainer(
        model,
        args=training_args,
        train_dataset=train_dataset,
        processing_class=tokenizer,
    )

    print("✅ DPO trainer ready")

    print("\n🎯 Starting training...")
    print("⏱️ This should take ~8 minutes on M4 Max, longer on CPU")

    train_result = dpo_trainer.train()

    # No eval split is configured (eval_strategy="no"), so instead of calling
    # evaluate(), read the per-step training losses from the trainer's log
    # history (populated because logging_steps=1).
    step_losses = [log["loss"] for log in dpo_trainer.state.log_history if "loss" in log]
    initial_loss = step_losses[0] if step_losses else train_result.training_loss
    final_loss = step_losses[-1] if step_losses else train_result.training_loss

    print("\n🎉 Training completed!")
    print(f"📊 Final training loss: {train_result.training_loss:.4f}")
    print(f"📉 Loss improvement: {initial_loss:.4f} → {final_loss:.4f}")

    print("\n💾 Saving model adapter...")
    model.save_pretrained("./smollm_tool_adapter")
    tokenizer.save_pretrained("./smollm_tool_adapter")

    print("✅ Model saved to './smollm_tool_adapter'")
    print("🎉 Training complete! Ready for testing.")

    return model, tokenizer

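
# Optional smoke test (an illustrative sketch, not called by the training run):
# reload the base checkpoint with the saved adapter and check that the completion
# parses as JSON. `test_adapter` is a new helper added here for convenience; the
# prompt you pass should follow whatever format tool_pairs.jsonl uses.
def test_adapter(prompt, adapter_dir="./smollm_tool_adapter"):
    from peft import PeftModel

    tok = AutoTokenizer.from_pretrained(adapter_dir)
    base = AutoModelForCausalLM.from_pretrained("HuggingFaceTB/SmolLM2-1.7B-Instruct")
    tuned = PeftModel.from_pretrained(base, adapter_dir)

    inputs = tok(prompt, return_tensors="pt")
    output = tuned.generate(**inputs, max_new_tokens=128, do_sample=False)
    completion = tok.decode(output[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)

    # json.loads raises ValueError if the model drifted back to free-text answers
    return json.loads(completion)
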

if __name__ == "__main__":
    model, tokenizer = main()