| !pip install transformers accelerate peft torch datasets | |
| from datasets import load_dataset | |
| dataset = load_dataset("Mahler60/yuuka_lore") # Cambia al nombre de tu dataset | |
| from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer | |
| from peft import get_peft_model, LoraConfig, TaskType | |
| import json | |
# 1. Load the base model and its tokenizer from the same checkpoint.
# NOTE(review): no dtype or device placement is requested, so library defaults
# apply; the checkpoint name suggests ~20B parameters -- confirm the hardware
# can hold it.
model_name = "EleutherAI/gpt-neox-20b"
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name)
model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=model_name)
# 2. Configure LoRA (parameter-efficient fine-tuning) and wrap the base model.
lora_options = {
    "task_type": TaskType.CAUSAL_LM,  # causal language model
    "r": 8,  # dimension (rank) of the LoRA matrices
    "lora_alpha": 16,  # scaling factor
    "lora_dropout": 0.1,  # dropout to reduce overfitting
}
peft_config = LoraConfig(**lora_options)
model = get_peft_model(model, peft_config)
# 3. Load the training examples from the JSON file.
# The file is read as JSON Lines: one JSON object per line, each providing a
# "prompt" and a "response" key.
data = []
# Explicit UTF-8 so decoding does not depend on the platform's default locale.
with open("Yuuka-Proto.JSON", "r", encoding="utf-8") as file:
    for line_number, line in enumerate(file, start=1):
        record_text = line.strip()
        if not record_text:
            # Blank lines (e.g. a trailing newline) hold no example, and
            # json.loads would raise on an empty string.
            continue
        try:
            example = json.loads(record_text)
        except json.JSONDecodeError as err:
            # Report the line of the file; the decoder only knows the position
            # inside the single-line string it was given.
            raise ValueError(
                f"Yuuka-Proto.JSON, line {line_number}: invalid JSON ({err.msg})"
            ) from err
        # Concatenate prompt + response into the single sequence used for training.
        data.append(f"{example['prompt']} {example['response']}")
# 4. Tokenize the dataset.
# padding=True raises a ValueError when the tokenizer has no padding token
# (some causal-LM checkpoints ship without one), so fall back to the
# end-of-sequence token before padding.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
# Pad every example to the longest one, truncate over-long examples, and
# return PyTorch tensors.
tokenized_data = tokenizer(data, padding=True, truncation=True, return_tensors="pt")
# 5. Configure the training arguments.
training_options = dict(
    output_dir="./results",  # folder where results are saved
    per_device_train_batch_size=1,  # batch size per device
    num_train_epochs=1,  # number of epochs
    logging_dir="./logs",  # folder for logs
    save_steps=10,  # save a checkpoint every N steps
)
training_args = TrainingArguments(**training_options)
# 6. Configure the Trainer.
# The tokenizer output is a single batched mapping of tensors, not a dataset:
# len() counts its keys and integer indexing does not yield a per-example
# feature dict. Split it into one feature dict per example so the Trainer can
# index and batch the examples.
# Causal-LM training also needs "labels" for the model to return a loss: reuse
# the input ids, with padded positions (attention_mask == 0) set to -100 so
# the loss ignores them.
train_examples = [
    {
        "input_ids": ids,
        "attention_mask": mask,
        "labels": ids.masked_fill(mask == 0, -100),
    }
    for ids, mask in zip(tokenized_data["input_ids"], tokenized_data["attention_mask"])
]
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_examples,  # one feature dict per tokenized example
)
# 7. Train the model.
trainer.train()

# 8. Save the fine-tuned model and then the tokenizer into the same folder.
for artifact in (model, tokenizer):
    artifact.save_pretrained("./ajustado")