# /// script
# requires-python = ">=3.10"
# dependencies = [
#     "torch>=2.0.0",
#     "transformers>=4.50.0",
#     "datasets>=2.14.0",
#     "peft>=0.7.0",
#     "accelerate>=0.25.0",
#     "trackio",
#     "huggingface_hub",
# ]
# ///
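# The block above is PEP 723 inline script metadata; a compatible launcher
# (e.g. `uv run train_h100.py`) resolves the dependencies automatically.
# Running with plain `python` also works if the packages are preinstalled.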
"""
LoRA Fine-tuning: Add Tool Calling to Synthia-S1-27b
Using pre-tokenized data from Codyfederer/synthia-tool-calling-tokenized
Optimized for H100 80GB
"""
import os
from dataclasses import dataclass
from typing import Any, Dict, List
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
)
from peft import LoraConfig, get_peft_model
import torch
import trackio
from huggingface_hub import whoami


@dataclass
class DataCollatorForPreTokenized:
    """Data collator for pre-tokenized datasets with padding."""

    pad_token_id: int

    def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, torch.Tensor]:
        # Find max length in batch
        max_length = max(len(f["input_ids"]) for f in features)
        batch = {
            "input_ids": [],
            "attention_mask": [],
            "labels": [],
        }
        for feature in features:
            input_ids = feature["input_ids"]
            attention_mask = feature["attention_mask"]
            labels = feature.get("labels", input_ids.copy())
            # Calculate padding needed
            padding_length = max_length - len(input_ids)
            # Pad sequences (right padding)
            batch["input_ids"].append(input_ids + [self.pad_token_id] * padding_length)
            batch["attention_mask"].append(attention_mask + [0] * padding_length)
            batch["labels"].append(labels + [-100] * padding_length)  # -100 is ignored by loss
        # Convert to tensors
        return {k: torch.tensor(v, dtype=torch.long) for k, v in batch.items()}
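
# Illustrative padding behavior (comment only, not executed): with pad_token_id=0,
# a batch of [{"input_ids": [5, 6, 7], "attention_mask": [1, 1, 1]},
#             {"input_ids": [8, 9],    "attention_mask": [1, 1]}]
# collates to right-padded tensors:
#   input_ids      -> [[5, 6, 7], [8, 9,    0]]
#   attention_mask -> [[1, 1, 1], [1, 1,    0]]
#   labels         -> [[5, 6, 7], [8, 9, -100]]  # padded positions excluded from the loss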
# Configuration
BASE_MODEL = "Tesslate/Synthia-S1-27b"
OUTPUT_MODEL = "Synthia-S1-27b-tool-calling"
TOKENIZED_DATASET = "Codyfederer/synthia-tool-calling-tokenized"
MAX_SEQ_LENGTH = 4096
# H100 optimized parameters
BATCH_SIZE = 4 # Higher batch size for H100 80GB
GRADIENT_ACCUMULATION = 8 # Effective batch = 32
LEARNING_RATE = 2e-4
NUM_EPOCHS = 1
LORA_R = 64
LORA_ALPHA = 128
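# For reference: the effective batch per optimizer step is BATCH_SIZE * GRADIENT_ACCUMULATION
# = 4 * 8 = 32 sequences, and PEFT scales the LoRA update by LORA_ALPHA / LORA_R = 128 / 64 = 2.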
print("=" * 60)
print("Tool Calling Fine-tuning for Synthia-S1-27b (H100)")
print("=" * 60)
# Initialize Trackio
trackio.init(project="synthia-tool-calling")
# Get HF username
try:
    username = whoami()["name"]
    hub_model_id = f"{username}/{OUTPUT_MODEL}"
    print(f"Will push to: {hub_model_id}")
except Exception as e:
    print(f"Error getting username: {e}")
    raise
# Load tokenizer
print(f"\nLoading tokenizer from {BASE_MODEL}...")
tokenizer = AutoTokenizer.from_pretrained(
    BASE_MODEL,
    trust_remote_code=True,
    padding_side="right",
)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id
print(f"Vocab size: {len(tokenizer):,}")
# Load pre-tokenized dataset
print(f"\nLoading pre-tokenized dataset: {TOKENIZED_DATASET}")
tokenized_ds = load_dataset(TOKENIZED_DATASET)
train_dataset = tokenized_ds["train"]
eval_dataset = tokenized_ds.get("test", tokenized_ds.get("validation"))
print(f"Train samples: {len(train_dataset):,}")
if eval_dataset:
    print(f"Eval samples: {len(eval_dataset):,}")
# Truncate to MAX_SEQ_LENGTH
def truncate_example(example):
    return {
        "input_ids": example["input_ids"][:MAX_SEQ_LENGTH],
        "attention_mask": example["attention_mask"][:MAX_SEQ_LENGTH],
        "labels": example["labels"][:MAX_SEQ_LENGTH]
        if "labels" in example
        else example["input_ids"][:MAX_SEQ_LENGTH],
    }
print(f"Truncating to max_length={MAX_SEQ_LENGTH}...")
train_dataset = train_dataset.map(truncate_example, desc="Truncating train")
if eval_dataset:
    eval_dataset = eval_dataset.map(truncate_example, desc="Truncating eval")
# Load model
print(f"\nLoading model: {BASE_MODEL}...")
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    device_map="auto",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    attn_implementation="sdpa",
)
print(f"Model loaded. Parameters: {model.num_parameters():,}")
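# Rough memory estimate (assumption, single 80 GB H100): 27B parameters in bf16
# is about 54 GB of weights, leaving headroom for the LoRA adapters, the optimizer
# states of the adapters only, and activations (kept small via gradient checkpointing below).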
# Configure LoRA
print(f"\nConfiguring LoRA (r={LORA_R}, alpha={LORA_ALPHA})...")
lora_config = LoraConfig(
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    lora_dropout=0.05,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    bias="none",
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()
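# Rough sizing note (not measured here): each targeted Linear of shape (d_out, d_in)
# gains two LoRA matrices, A (r x d_in) and B (d_out x r), i.e. r * (d_in + d_out)
# extra trainable weights, so the adapter size grows linearly with r.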
# Training arguments - H100 optimized
print("\nConfiguring training...")
training_args = TrainingArguments(
    output_dir=f"./{OUTPUT_MODEL}",
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION,
    learning_rate=LEARNING_RATE,
    lr_scheduler_type="cosine",
    warmup_ratio=0.03,
    weight_decay=0.01,
    optim="adamw_torch",
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": False},
    max_grad_norm=1.0,
    eval_strategy="steps",
    eval_steps=500,
    save_strategy="steps",
    save_steps=500,
    save_total_limit=3,
    push_to_hub=True,
    hub_model_id=hub_model_id,
    hub_strategy="checkpoint",
    logging_steps=10,
    report_to="trackio",
    run_name=f"synthia-tool-calling-lora-r{LORA_R}",
    bf16=True,
    dataloader_num_workers=0,  # Avoid multiprocessing issues with custom collator
    dataloader_pin_memory=True,
    seed=42,
    remove_unused_columns=False,
)
# Initialize trainer
print("\nInitializing trainer...")
data_collator = DataCollatorForPreTokenized(pad_token_id=tokenizer.pad_token_id)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    processing_class=tokenizer,  # `tokenizer=` is deprecated in transformers >= 4.46
    data_collator=data_collator,
)
# Train
print("\n" + "=" * 60)
print("Starting training...")
print("=" * 60 + "\n")
trainer.train()
# Save and push
print("\nSaving final model...")
trainer.save_model()
print(f"Pushing to Hub: {hub_model_id}")
trainer.push_to_hub()
print("\n" + "=" * 60)
print("Training complete!")
print(f"Model available at: https://huggingface.co/{hub_model_id}")
print("=" * 60)
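
# --- Inference sketch (illustrative only, not executed by this script) ---
# Assumes the adapter and tokenizer were pushed to `hub_model_id` above.
#
#   from peft import PeftModel
#   from transformers import AutoModelForCausalLM, AutoTokenizer
#
#   base = AutoModelForCausalLM.from_pretrained(
#       BASE_MODEL, torch_dtype=torch.bfloat16, device_map="auto", trust_remote_code=True
#   )
#   model = PeftModel.from_pretrained(base, hub_model_id)
#   tokenizer = AutoTokenizer.from_pretrained(hub_model_id)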