"""
tool_trainer_m4_max.py - Optimized training for Apple Silicon M4 Max + SmolLM3-3B

This script is specifically optimized for:
- the Apple Silicon M4 Max with its 40-core GPU
- SmolLM3-3B (a larger, more capable model)
- a large training dataset (100+ examples)
- aggressive but stable hyperparameters for fast, high-quality training
"""

import json
import os
import time

import torch
import torch.backends.mps
from datasets import Dataset
from peft import LoraConfig, get_peft_model, TaskType
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
)


def setup_mps_optimization():
    """Configure optimal settings for the M4 Max."""
    print("🔧 Configuring M4 Max optimizations...")

    if torch.backends.mps.is_available():
        print("✅ MPS (Metal Performance Shaders) is available")
        print("🚀 Using all 40 GPU cores of the M4 Max")
        device = torch.device("mps")
    else:
        print("⚠️ MPS not available, falling back to CPU")
        device = torch.device("cpu")

    # Disable the MPS memory high-watermark cap and silence the tokenizers
    # fork-parallelism warning raised by the Trainer's dataloader workers.
    os.environ["PYTORCH_MPS_HIGH_WATERMARK_RATIO"] = "0.0"
    os.environ["TOKENIZERS_PARALLELISM"] = "false"

    return device


def load_training_data(file_path="tool_pairs_enhanced.jsonl"):
    """Load the comprehensive training dataset."""
    pairs = []
    with open(file_path, "r") as f:
        for line in f:
            pairs.append(json.loads(line.strip()))
    return pairs
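
# Illustrative sketch of the expected JSONL rows (the exact file contents are an
# assumption; only the "prompt" and "chosen" keys are required by format_for_sft):
# {"prompt": "<|im_start|>user\nWhat's the weather in Paris?<|im_end|>\n<|im_start|>assistant\n",
#  "chosen": "{\"name\": \"get_weather\", \"arguments\": {\"city\": \"Paris\"}}"}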


def format_for_sft(pairs, tokenizer):
    """Convert pairs to SFT format optimized for function calling."""
    formatted = []
    for pair in pairs:
        # Concatenate prompt and chosen completion, terminated with EOS.
        full_text = pair["prompt"] + pair["chosen"] + tokenizer.eos_token
        formatted.append({"text": full_text})
    return formatted


def tokenize_function(examples, tokenizer, max_length=512):
    """Tokenize with consistent padding for variable-length sequences."""
    tokenized = tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=max_length,
        return_tensors=None,
    )
    # For causal-LM fine-tuning the labels are a copy of the input ids.
    # pad_token aliases eos_token here, so padded positions stay in the loss
    # and the model keeps learning to emit EOS at the end of each completion.
    tokenized["labels"] = [ids.copy() for ids in tokenized["input_ids"]]
    return tokenized
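
# If a dedicated pad token were available, padding could instead be masked out
# of the loss (a sketch, not used here precisely because pad_token == eos_token):
#
#   tokenized["labels"] = [
#       [tok if tok != tokenizer.pad_token_id else -100 for tok in ids]
#       for ids in tokenized["input_ids"]
#   ]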


def main():
    print("🚀 M4 Max Optimized Training: SmolLM3-3B Function Calling")
    print("=" * 70)

    device = setup_mps_optimization()
    start_time = time.time()

    print("📥 Loading SmolLM3-3B model and tokenizer...")
    model_name = "HuggingFaceTB/SmolLM3-3B"

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    # Right padding is the standard choice for causal-LM training.
    tokenizer.padding_side = "right"

    # float32 avoids fp16 numerical instability on MPS; eager attention is the
    # conservative choice on Apple Silicon.
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float32,
        trust_remote_code=True,
        attn_implementation="eager",
    )

    if str(device) == "mps":
        model = model.to(device)

    print(f"✅ Loaded model: {model_name}")
    print(f"🔧 Model dtype: {model.dtype}")
    print(f"💾 Model size: ~{sum(p.numel() for p in model.parameters()) / 1e9:.1f}B parameters")
    print(f"🎯 Device: {device}")

    print("\n🔩 Setting up LoRA adapter (rank 16 for SmolLM3-3B)...")
    lora_config = LoraConfig(
        r=16,
        lora_alpha=32,
        # LoRA on the attention and MLP projections; the embeddings and LM head
        # are fully fine-tuned via modules_to_save below, so they must not also
        # be listed as LoRA target_modules.
        target_modules=[
            "q_proj", "v_proj", "k_proj", "o_proj",
            "gate_proj", "up_proj", "down_proj",
        ],
        lora_dropout=0.05,
        bias="none",
        task_type=TaskType.CAUSAL_LM,
        modules_to_save=["embed_tokens", "lm_head"],
    )

    model = get_peft_model(model, lora_config)
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    total_params = sum(p.numel() for p in model.parameters())

    print("✅ LoRA adapter attached")
    print(f"🎯 Trainable parameters: {trainable_params:,} ({trainable_params/total_params*100:.2f}%)")

    print("\n📊 Loading comprehensive training dataset...")
    pairs = load_training_data()
    formatted_pairs = format_for_sft(pairs, tokenizer)

    print(f"✅ Loaded {len(pairs)} training pairs")
    print(f"📈 Dataset is {len(pairs)/8:.1f}x larger than the previous 8-example set!")

    train_dataset = Dataset.from_list(formatted_pairs)
    tokenized_dataset = train_dataset.map(
        lambda x: tokenize_function(x, tokenizer),
        batched=True,
        remove_columns=train_dataset.column_names,
        num_proc=1,  # single process: tokenizer parallelism is disabled above
    )

    print(f"📝 Tokenized dataset: {len(tokenized_dataset)} examples")

    print("\n⚙️ Configuring M4 Max optimized training...")
    training_args = TrainingArguments(
        output_dir="./smollm3_tool_adapter",
        num_train_epochs=5,
        per_device_train_batch_size=4,
        gradient_accumulation_steps=2,
        learning_rate=3e-4,
        weight_decay=0.01,
        warmup_steps=50,
        logging_steps=5,
        save_steps=25,
        save_total_limit=3,
        remove_unused_columns=False,
        fp16=False,  # fp16 autocast is not supported for MPS training
        dataloader_pin_memory=False,  # pinned memory is a CUDA feature
        report_to="none",  # disable wandb/tensorboard integrations
        logging_dir="./logs",
        gradient_checkpointing=True,
        optim="adamw_torch",
        lr_scheduler_type="cosine",
        save_strategy="steps",
        eval_strategy="no",
        load_best_model_at_end=False,
    )
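
    # Effective batch size = per_device_train_batch_size * gradient_accumulation_steps
    # = 4 * 2 = 8 sequences per optimizer step. Assuming the ~100 examples the
    # docstring mentions, that is ceil(100 / 8) * 5 epochs = 65 optimizer steps,
    # so warmup_steps=50 covers most of the run; worth revisiting as the
    # dataset grows.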

    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False,  # causal LM objective, not masked LM
        pad_to_multiple_of=8,
    )
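
    # With padding="max_length" at 512 tokens (already a multiple of 8), the
    # collator's pad_to_multiple_of is effectively a no-op here; it would only
    # matter if the tokenization step switched to dynamic padding.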

    print("🏗️ Initializing M4 Max optimized trainer...")
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset,
        data_collator=data_collator,
        # remove_unused_columns is a TrainingArguments option (set above), not
        # a Trainer keyword argument, so it is not repeated here.
    )

    print("✅ Trainer ready for M4 Max acceleration")

    print("\n🎯 Starting accelerated training on M4 Max...")
    print("⏱️ Expected time: ~3-5 minutes with 40 GPU cores")
    print("📉 Monitoring loss for quality improvement...")

    train_result = trainer.train()

    end_time = time.time()
    training_time = end_time - start_time

    print("\n🎉 M4 Max training completed!")
    print(f"📉 Final training loss: {train_result.training_loss:.4f}")
    print(f"⏱️ Total training time: {training_time:.1f} seconds")
    # Throughput over 5 epochs; note training_time also includes model loading.
    print(f"🚀 Training speed: {len(pairs) * 5 / training_time:.1f} examples/second")

    print("\n💾 Saving optimized model adapter...")
    model.save_pretrained("./smollm3_tool_adapter")
    tokenizer.save_pretrained("./smollm3_tool_adapter")

    print("✅ Model saved to './smollm3_tool_adapter'")
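
    # To reload the adapter later (a sketch; assumes the same base checkpoint):
    #
    #   from peft import PeftModel
    #   base = AutoModelForCausalLM.from_pretrained("HuggingFaceTB/SmolLM3-3B")
    #   model = PeftModel.from_pretrained(base, "./smollm3_tool_adapter")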

    print("\n🧪 Enhanced functionality test...")
    test_schemas = [
        {
            "schema": {
                "name": "get_stock_price",
                "description": "Get current stock price",
                "parameters": {
                    "type": "object",
                    "properties": {"ticker": {"type": "string"}},
                    "required": ["ticker"]
                }
            },
            "question": "What's Google stock price?",
            "expected_ticker": "GOOGL"
        },
        {
            "schema": {
                "name": "process_payment",
                "description": "Process a payment transaction",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "amount": {"type": "number"},
                        "currency": {"type": "string"},
                        "recipient": {"type": "string"}
                    },
                    "required": ["amount", "recipient"]
                }
            },
            "question": "Send $150 to Alice",
            "expected": "process_payment"
        }
    ]

    model.eval()
    for i, test in enumerate(test_schemas, 1):
        test_prompt = f"""<|im_start|>system
You are a helpful assistant that calls functions by responding with valid JSON when given a schema. Always respond with JSON function calls only, never prose.<|im_end|>

<schema>
{json.dumps(test['schema'], indent=2)}
</schema>

<|im_start|>user
{test['question']}<|im_end|>
<|im_start|>assistant
"""

        inputs = tokenizer(test_prompt, return_tensors="pt")
        if str(device) == "mps":
            # BatchEncoding.to() moves every tensor while keeping attribute
            # access (inputs.input_ids) working below.
            inputs = inputs.to(device)

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=80,
                temperature=0.1,
                do_sample=True,
                pad_token_id=tokenizer.eos_token_id,
            )

        # Decode only the newly generated tokens, not the echoed prompt.
        response = tokenizer.decode(outputs[0][len(inputs.input_ids[0]):], skip_special_tokens=True)
        print(f"🧪 Test {i}: {test['question']}")
        print(f"🤖 Response: {response.strip()}")

        try:
            json_response = json.loads(response.strip())
            print(f"✅ Valid JSON: {json_response}")
        except json.JSONDecodeError:
            print("❌ Invalid JSON")
        print("-" * 50)

    print("\n🎉 M4 Max Optimized Training Complete!")
    print(f"📊 Loss reduction with {len(pairs)} examples should be significant")
    print("🎯 Ready for comprehensive testing with schema_tester.py")

    return model, tokenizer


if __name__ == "__main__":
    model, tokenizer = main()