"""
tool_trainer_intensive.py - Intensive Training for 80% Target

This trainer implements:
1. 10+ epochs (vs 3 before)
2. Better learning rate schedule
3. Optimized training parameters
4. Progress monitoring for 80% target
"""

import torch
from transformers import (
    AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments,
    DataCollatorForLanguageModeling
)
from peft import LoraConfig, get_peft_model
from datasets import Dataset
import json
import time

def load_training_data(file_path="tool_pairs_massive.jsonl"):
    """Load the massive training dataset."""
    pairs = []
    with open(file_path, 'r') as f:
        for line in f:
            line = line.strip()
            if line:  # skip blank lines so json.loads never sees an empty string
                pairs.append(json.loads(line))
    return pairs

def format_training_data(pairs, tokenizer):
    """Format training data for the model."""
    formatted = []
    for pair in pairs:
        # Create training example: prompt + chosen response
        full_text = pair["prompt"] + pair["chosen"] + tokenizer.eos_token
        formatted.append({"text": full_text})
    return formatted

def tokenize_function(examples, tokenizer, max_length=400):
    """Tokenize with optimized settings for intensive training."""
    tokenized = tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=max_length,
        return_tensors=None
    )
    
    # For causal LM, labels mirror input_ids (copied so the two keys
    # don't alias the same list object)
    tokenized["labels"] = tokenized["input_ids"].copy()
    return tokenized

def main():
    print("πŸš€ INTENSIVE Training: SmolLM3-3B for 80% Target")
    print("=" * 60)
    
    # Setup device
    device = "mps" if torch.backends.mps.is_available() else "cpu"
    print(f"βœ… Using device: {device}")
    
    start_time = time.time()
    
    # 1. Load model and tokenizer
    print("πŸ“₯ Loading SmolLM3-3B...")
    model_name = "HuggingFaceTB/SmolLM3-3B"
    
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float32,
        device_map={"": device} if device == "mps" else "auto"
    )
    
    print(f"βœ… Model loaded: {model.num_parameters() / 1e9:.1f}B params")
    
    # 2. Setup LoRA with higher rank for better capacity
    print("πŸ”© Setting up enhanced LoRA (rank 32)...")
    lora_config = LoraConfig(
        r=32,  # Increased from 16 for better capacity
        lora_alpha=64,  # Increased proportionally
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
        lora_dropout=0.1,
        bias="none",
        task_type="CAUSAL_LM"
    )
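    # Note: these target_modules follow the Llama-style projection names that
    # SmolLM3 is expected to expose; if PEFT reports that no target modules
    # were found, inspect model.named_modules() and adjust the list.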
    
    model = get_peft_model(model, lora_config)
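    # Safeguard for gradient_checkpointing=True below: with the base weights
    # frozen, some transformers/peft version combinations otherwise raise
    # "element 0 of tensors does not require grad" during backward.
    model.enable_input_require_grads()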
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    total_params = sum(p.numel() for p in model.parameters())
    print(f"🎯 Trainable: {trainable_params:,} parameters ({100 * trainable_params / total_params:.2f}%)")
    
    # 3. Load massive training data
    print("πŸ“Š Loading massive training data...")
    pairs = load_training_data()
    print(f"βœ… {len(pairs)} training examples ready")
    
    # 4. Format and tokenize
    print("πŸ”€ Tokenizing massive dataset...")
    formatted_data = format_training_data(pairs, tokenizer)
    dataset = Dataset.from_list(formatted_data)
    
    tokenized_dataset = dataset.map(
        lambda x: tokenize_function(x, tokenizer),
        batched=True,
        remove_columns=dataset.column_names
    )
    print(f"πŸ“Š Tokenized {len(tokenized_dataset)} examples")
    
    # 5. Setup intensive training arguments
    print("βš™οΈ Configuring intensive training...")
    training_args = TrainingArguments(
        output_dir="./smollm3_intensive",
        num_train_epochs=12,  # Much longer training
        per_device_train_batch_size=2,  # Smaller batch for stability
        gradient_accumulation_steps=4,  # Effective batch size = 8
        warmup_steps=100,  # Longer warmup
        learning_rate=3e-5,  # Slightly higher learning rate
        lr_scheduler_type="cosine",  # Better learning schedule
        weight_decay=0.01,
        logging_steps=10,
        save_steps=100,
        save_total_limit=3,
        push_to_hub=False,
        report_to="none",  # explicitly disable external loggers (passing None may not)
        dataloader_pin_memory=False,
        fp16=False,  # Stability over speed
        gradient_checkpointing=True,  # Memory efficiency
        max_grad_norm=1.0,  # Gradient clipping
        adam_epsilon=1e-8,
        adam_beta1=0.9,
        adam_beta2=0.999,
    )
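    # Rough schedule check: effective batch size is 2 * 4 = 8, and the 100
    # warmup steps assume the 12-epoch run is much longer than 100 steps.
    steps_per_epoch = (len(pairs) + 7) // 8
    print(f"πŸ“Š ~{steps_per_epoch} optimizer steps per epoch, ~{12 * steps_per_epoch} total")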
    
    # 6. Data collator
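    # With mlm=False the collator rebuilds labels from input_ids and masks pad
    # positions with -100 (current transformers behavior); because pad_token is
    # set to eos_token above, the trailing EOS label gets masked as well.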
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False,
        pad_to_multiple_of=8,
    )
    
    # 7. Initialize intensive trainer
    print("πŸ‹οΈ Initializing intensive trainer...")
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset,
        data_collator=data_collator,
    )
    
    # 8. Start intensive training
    print("🎯 Starting INTENSIVE training...")
    print(f"πŸ“Š Dataset: {len(pairs)} examples")
    print(f"πŸ“Š Epochs: 12 (vs 3 before)")
    print(f"πŸ“Š Learning rate: 3e-5 with cosine schedule")
    print(f"⏱️ Expected time: ~10-15 minutes")
    print("πŸ“ˆ Monitoring for dramatic improvement...")
    
    train_result = trainer.train()
    
    training_time = time.time() - start_time
    print(f"\nπŸŽ‰ INTENSIVE Training completed!")
    print(f"πŸ“Š Final loss: {train_result.training_loss:.4f}")
    print(f"⏱️ Training time: {training_time:.1f}s")
    
    # 9. Save the intensively trained model
    print("πŸ’Ύ Saving intensively trained model...")
    model.save_pretrained("./smollm3_intensive")
    tokenizer.save_pretrained("./smollm3_intensive")
    
    # 10. Quick validation test
    print("πŸ§ͺ Quick validation test...")
    model.eval()
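    # Smoke test only: this raw prompt may not match the prompt format used in
    # tool_pairs_massive.jsonl, so treat the output as a rough signal rather
    # than the 80% evaluation itself.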
    test_input = "Get weather for New York"
    inputs = tokenizer(test_input, return_tensors="pt").to(device)
    
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=100,
            temperature=0.1,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id
        )
    
    response = tokenizer.decode(outputs[0][inputs['input_ids'].shape[1]:], skip_special_tokens=True)
    print(f"πŸ€– Model response: {response}")
    
    # Try to parse as JSON
    try:
        parsed = json.loads(response.strip())
        print(f"βœ… Valid JSON! {parsed}")
    except json.JSONDecodeError as e:
        print(f"❌ JSON error: {e}")
    
    print(f"\nπŸ† Intensive training complete!")
    print(f"πŸ“ˆ Ready for 80% target evaluation")
    
    return model, tokenizer

if __name__ == "__main__":
    model, tokenizer = main()