"""Fine-tune TinyLlama as a causal language model on a plain-text corpus.

The script loads ``skin_disease_articles_clean.txt`` (one training example per
non-blank line), tokenizes every example to a fixed length of 128 tokens,
trains for one epoch with the Hugging Face ``Trainer``, and writes checkpoints
plus the final model and tokenizer to ``./tinyllama-finetuned-skin``.
"""
import os

# This must run before `transformers` is imported: the library checks USE_TF
# while it is being imported, so assigning it afterwards has no effect.
os.environ["USE_TF"] = "0"

import torch  # noqa: E402
from datasets import load_dataset  # noqa: E402
from transformers import (  # noqa: E402
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)

model_name = "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"

tokenizer = AutoTokenizer.from_pretrained(model_name)
if tokenizer.pad_token is None:
    # Padding needs a pad token; reuse EOS when the tokenizer defines none.
    tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(model_name)

# Load your text file as a dataset
dataset = load_dataset("text", data_files={"train": "skin_disease_articles_clean.txt"})

# Drop blank lines: they tokenize to rows without any target tokens, and a
# batch made up only of such rows has nothing to average the loss over.
dataset = dataset.filter(lambda example: bool(example["text"].strip()))


# Tokenize the dataset
def tokenize_function(examples):
    """Tokenize a batch of raw text lines.

    Args:
        examples: Batch mapping passed by ``Dataset.map(batched=True)``; its
            ``"text"`` entry holds the strings to tokenize.

    Returns:
        The tokenizer output for the batch, with every sequence truncated or
        padded to exactly 128 tokens.
    """
    return tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=128,
    )


tokenized_datasets = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=["text"],
)
train_dataset = tokenized_datasets["train"]

# mlm=False selects the causal-LM objective instead of masked-LM.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

training_args = TrainingArguments(
    output_dir="./tinyllama-finetuned-skin",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=2,
    save_steps=500,
    save_total_limit=2,
    prediction_loss_only=True,
    # Mixed precision is only usable on a GPU; fall back to full precision
    # on CPU-only machines instead of failing at start-up.
    fp16=torch.cuda.is_available(),
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
)

trainer.train()

# Checkpoints are only written every `save_steps` steps, so persist the final
# weights (and the tokenizer needed to reload them) explicitly.
trainer.save_model(training_args.output_dir)
tokenizer.save_pretrained(training_args.output_dir)