"""
tool_trainer_intensive.py - Intensive Training for 80% Target
This trainer implements:
1. 10+ epochs (vs 3 before)
2. Better learning rate schedule
3. Optimized training parameters
4. Progress monitoring for 80% target
"""
import torch
from transformers import (
    AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments,
    DataCollatorForLanguageModeling
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from datasets import Dataset
import json
import time


def load_training_data(file_path="tool_pairs_massive.jsonl"):
    """Load the massive training dataset."""
    pairs = []
    with open(file_path, 'r') as f:
        for line in f:
            pairs.append(json.loads(line.strip()))
    return pairs
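
# Data format note (assumed from usage below): each JSONL line is an object
# with at least "prompt" and "chosen" string fields, e.g.
# {"prompt": "...", "chosen": "..."}; any extra keys are ignored by this script.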


def format_training_data(pairs, tokenizer):
    """Format training data for the model."""
    formatted = []
    for pair in pairs:
        # Create training example: prompt + chosen response
        full_text = pair["prompt"] + pair["chosen"] + tokenizer.eos_token
        formatted.append({"text": full_text})
    return formatted


def tokenize_function(examples, tokenizer, max_length=400):
    """Tokenize with optimized settings for intensive training."""
    tokenized = tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=max_length,
        return_tensors=None
    )
    # For causal LM, labels are the same as input_ids
    tokenized["labels"] = tokenized["input_ids"]
    return tokenized
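
# Note: labels mirror input_ids, so the causal-LM loss is computed over the
# prompt tokens as well as the chosen response (no prompt masking). Padding to
# max_length=400 assumes every prompt + response pair fits within 400 tokens.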


def main():
    print("INTENSIVE Training: SmolLM3-3B for 80% Target")
    print("=" * 60)

    # Setup device
    device = "mps" if torch.backends.mps.is_available() else "cpu"
    print(f"Using device: {device}")
    start_time = time.time()

    # 1. Load model and tokenizer
    print("Loading SmolLM3-3B...")
    model_name = "HuggingFaceTB/SmolLM3-3B"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float32,
        device_map={"": device} if device == "mps" else "auto"
    )
    print(f"Model loaded: {model.num_parameters() / 1e9:.1f}B params")

    # 2. Setup LoRA with higher rank for better capacity
    print("Setting up enhanced LoRA (rank 32)...")
    lora_config = LoraConfig(
        r=32,  # Increased from 16 for better capacity
        lora_alpha=64,  # Increased proportionally
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
        lora_dropout=0.1,
        bias="none",
        task_type="CAUSAL_LM"
    )
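    # LoRA adds a trainable low-rank update B @ A to each targeted projection
    # (A: r x d_in, B: d_out x r, r=32 here); only A and B are updated, so the
    # trainable-parameter count scales linearly with r, roughly doubling
    # adapter capacity versus the earlier rank-16 run mentioned above.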
    model = get_peft_model(model, lora_config)
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    total_params = sum(p.numel() for p in model.parameters())
    print(f"Trainable: {trainable_params:,} parameters ({100 * trainable_params / total_params:.2f}%)")

    # 3. Load massive training data
    print("Loading massive training data...")
    pairs = load_training_data()
    print(f"{len(pairs)} training examples ready")

    # 4. Format and tokenize
    print("Tokenizing massive dataset...")
    formatted_data = format_training_data(pairs, tokenizer)
    dataset = Dataset.from_list(formatted_data)
    tokenized_dataset = dataset.map(
        lambda x: tokenize_function(x, tokenizer),
        batched=True,
        remove_columns=dataset.column_names
    )
    print(f"Tokenized {len(tokenized_dataset)} examples")

    # 5. Setup intensive training arguments
    print("Configuring intensive training...")
    training_args = TrainingArguments(
        output_dir="./smollm3_intensive",
        num_train_epochs=12,  # Much longer training
        per_device_train_batch_size=2,  # Smaller batch for stability
        gradient_accumulation_steps=4,  # Effective batch size = 8
        warmup_steps=100,  # Longer warmup
        learning_rate=3e-5,  # Slightly higher learning rate
        lr_scheduler_type="cosine",  # Better learning schedule
        weight_decay=0.01,
        logging_steps=10,
        save_steps=100,
        save_total_limit=3,
        push_to_hub=False,
        report_to="none",  # Explicitly disable trackers (report_to=None does not mean "none")
        dataloader_pin_memory=False,
        fp16=False,  # Stability over speed
        gradient_checkpointing=True,  # Memory efficiency
        max_grad_norm=1.0,  # Gradient clipping
        adam_epsilon=1e-8,
        adam_beta1=0.9,
        adam_beta2=0.999,
    )
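
    # Rough budget: optimizer steps ~= 12 epochs * ceil(N / 8), where N is the
    # number of training examples and 8 is the effective batch size
    # (2 per device * 4 gradient-accumulation steps); warmup_steps=100 assumes
    # N is large enough that warmup remains a small fraction of training.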

    # 6. Data collator
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False,
        pad_to_multiple_of=8,
    )
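    # With mlm=False the collator rebuilds labels from input_ids and sets
    # padding positions to -100 so they are ignored by the loss; because
    # pad_token was set to eos_token above, EOS positions are masked as well.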

    # 7. Initialize intensive trainer
    print("Initializing intensive trainer...")
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset,
        data_collator=data_collator,
    )

    # 8. Start intensive training
    print("Starting INTENSIVE training...")
    print(f"Dataset: {len(pairs)} examples")
    print("Epochs: 12 (vs 3 before)")
    print("Learning rate: 3e-5 with cosine schedule")
    print("Expected time: ~10-15 minutes")
    print("Monitoring for dramatic improvement...")
    train_result = trainer.train()
    training_time = time.time() - start_time
    print("\nINTENSIVE Training completed!")
    print(f"Final loss: {train_result.training_loss:.4f}")
    print(f"Training time: {training_time:.1f}s")

    # 9. Save the intensively trained model
    print("Saving intensively trained model...")
    model.save_pretrained("./smollm3_intensive")
    tokenizer.save_pretrained("./smollm3_intensive")
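    # save_pretrained on a PEFT-wrapped model writes only the LoRA adapter
    # weights and config, not the full SmolLM3-3B base weights; reloading
    # requires the base model plus this adapter directory.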

    # 10. Quick validation test
    print("Quick validation test...")
    model.eval()
    test_input = "Get weather for New York"
    inputs = tokenizer(test_input, return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=100,
            temperature=0.1,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id
        )
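    # Decode only the newly generated tokens (everything after the prompt).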
    response = tokenizer.decode(outputs[0][inputs['input_ids'].shape[1]:], skip_special_tokens=True)
    print(f"Model response: {response}")

    # Try to parse as JSON
    try:
        parsed = json.loads(response.strip())
        print(f"Valid JSON! {parsed}")
    except json.JSONDecodeError as e:
        print(f"JSON error: {e}")

    print("\nIntensive training complete!")
    print("Ready for 80% target evaluation")
    return model, tokenizer


if __name__ == "__main__":
    model, tokenizer = main()