import torch
from utils import create_tokenizer
from dataset import load_saved_datasets
from transformers import AutoModelForCausalLM, DataCollatorForLanguageModeling, TrainingArguments, Trainer
# HYPERPARAMETERS
save_dir = "../data/tokenized_recipes"
NUM_EPOCHS = 5
# Set up device-agnostic code
if torch.cuda.is_available():
    device = 'cuda'
elif torch.backends.mps.is_available():
    device = 'mps'
else:
    device = 'cpu'
print("Using device:", device)
tokenizer = create_tokenizer()

# Load dataset
train_dataset, val_dataset = load_saved_datasets(save_dir)
# Set up model
model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")
# Resize the embedding matrix in case the tokenizer added tokens beyond GPT-2's base vocabulary
model.resize_token_embeddings(len(tokenizer))
model.to(device)
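# With mlm=False the collator pads each batch and copies input_ids into labels,
# so the Trainer computes the standard next-token (causal LM) loss.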
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False  # Causal LM, not BERT
)
# Set up training arguments
training_args = TrainingArguments(
    output_dir="../models/recipe-gpt",
    overwrite_output_dir=True,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="steps",
    logging_steps=50,
    save_total_limit=2,
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=4,  # adjust based on available memory
    per_device_eval_batch_size=4,
    warmup_steps=50,
    weight_decay=0.01,
    fp16=(device == "cuda"),  # FP16 mixed precision is only supported on CUDA GPUs
    report_to="none"
)
# Set up trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator
)
# Train the model
trainer.train()

# Save the final model and tokenizer so they can be reloaded for inference
final_model_path = "../models/recipe-gpt"
trainer.save_model(final_model_path)
tokenizer.save_pretrained(final_model_path)
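
# Optional sanity check (illustrative only, not part of the original script): reload the
# saved model in a text-generation pipeline and sample from it. The prompt below is just
# an example.
from transformers import pipeline

generator = pipeline("text-generation", model=final_model_path, tokenizer=final_model_path)
print(generator("Recipe for", max_new_tokens=64)[0]["generated_text"])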