# QuizCraftAi / valhalla / t5-base-qg-hl finetuned
# Vriti29's picture
# Create t5-base-qg-hl finetuned
# 393894b
# -*- coding: utf-8 -*-
"""Untitled6.ipynb
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/11megvyfcr49Oy4FGK7kteQ2iMdxZYp4L
"""
# Dependency installation is a notebook shell step, not a Python statement.
# Run it in a Colab cell (or a terminal, without the leading "!") before
# executing the rest of this script:
#   !pip install transformers datasets sentence-transformers evaluate scikit-learn
from google.colab import files

# Two separate upload prompts are kept as in the notebook. The script later
# reads "train.csv" and "eval.csv", so presumably one prompt is used for each
# file (NOTE(review): confirm). Only the mapping returned by the second prompt
# remains bound to `uploaded`.
uploaded = files.upload()
uploaded = files.upload()
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments, DataCollatorForSeq2Seq
from datasets import load_dataset, DatasetDict, load_metric
from sentence_transformers import SentenceTransformer, util
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import torch
import numpy as np
def load_csv_datasets(train_path, eval_path):
    """Load the training and evaluation CSV files into a ``DatasetDict``.

    Args:
        train_path: Path to the CSV file holding the training rows.
        eval_path: Path to the CSV file holding the evaluation rows.

    Returns:
        A ``DatasetDict`` with the splits ``'train'`` and ``'eval'``, each
        built from the corresponding pandas DataFrame.
    """
    # `Dataset` is not part of the file-level `from datasets import ...` line,
    # so it is imported here; without this the function raises NameError.
    from datasets import Dataset

    train_df = pd.read_csv(train_path)
    eval_df = pd.read_csv(eval_path)
    return DatasetDict({
        'train': Dataset.from_pandas(train_df),
        'eval': Dataset.from_pandas(eval_df),
    })
def preprocess(example):
    """Tokenize the ``'input'`` / ``'target'`` text of one example or one batch.

    Uses the module-level ``tokenizer``. Inputs are padded/truncated to 512
    tokens and targets to 64 tokens, exactly as before. The only change is
    that padding positions in the labels are replaced by ``-100`` so they are
    ignored by the loss instead of training the model to emit pad tokens
    (``compute_metrics`` in this file already maps ``-100`` back to the pad id
    before decoding).

    Args:
        example: Mapping with ``'input'`` and ``'target'`` entries; each is a
            single string or, when called with ``batched=True``, a list of
            strings.

    Returns:
        The tokenizer output for the input text with an added ``"labels"``
        entry holding the masked target token ids.
    """
    input_text = example['input']
    target_text = example['target']
    model_inputs = tokenizer(input_text, max_length=512, padding='max_length', truncation=True)
    labels = tokenizer(target_text, max_length=64, padding='max_length', truncation=True)

    pad_id = tokenizer.pad_token_id
    label_ids = labels["input_ids"]
    if label_ids and isinstance(label_ids[0], (list, tuple)):
        # Batched call: one id sequence per example.
        masked = [
            [tok if tok != pad_id else -100 for tok in seq]
            for seq in label_ids
        ]
    else:
        # Single example: a flat sequence of ids.
        masked = [tok if tok != pad_id else -100 for tok in label_ids]

    model_inputs["labels"] = masked
    return model_inputs
# Checkpoint that provides both the tokenizer and the starting weights.
model_name = "valhalla/t5-base-qg-hl"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Read the two CSV splits, then tokenize every row in batches.
dataset = load_csv_datasets(train_path="train.csv", eval_path="eval.csv")
tokenized_dataset = dataset.map(function=preprocess, batched=True)

# Collator that assembles seq2seq batches for this tokenizer/model pair.
data_collator = DataCollatorForSeq2Seq(tokenizer, model)
# Training configuration. Keyword order differs from the notebook export, but
# every argument and value is the same.
training_args = TrainingArguments(
    # Output locations.
    output_dir="./qg_finetuned",
    logging_dir='./logs',
    # Schedule and batch sizes.
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    logging_steps=10,
    # Evaluation and checkpointing both use the "epoch" strategy.
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
    # Best-model selection keyed on the "cosine" value returned by
    # compute_metrics, where larger is better.
    load_best_model_at_end=True,
    metric_for_best_model="cosine",
    greater_is_better=True,
)
def compute_metrics(eval_pred):
    """Score predictions by mean embedding cosine similarity to the references.

    Predictions and labels are decoded with the module-level ``tokenizer``,
    embedded with the ``all-MiniLM-L6-v2`` sentence-transformer, and compared
    pair by pair (prediction *i* against reference *i*).

    Args:
        eval_pred: ``(predictions, labels)`` pair. ``predictions`` may be a
            tuple (its first element is used), a 3-D array of per-token
            vocabulary scores, or a 2-D array of token ids. ``labels`` is a
            2-D array of token ids in which ``-100`` marks ignored positions.

    Returns:
        ``{"cosine": float}`` with the average pairwise cosine similarity, or
        ``0.0`` when there is nothing to score.
    """
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    predictions = np.asarray(predictions)
    labels = np.asarray(labels)

    pad_id = tokenizer.pad_token_id
    # -100 cannot be decoded; map it to the pad token, which
    # skip_special_tokens=True removes again.
    labels = np.where(labels != -100, labels, pad_id)

    if predictions.ndim == 3:
        # Per-token vocabulary scores (what a non-generating evaluation loop
        # is expected to hand over): pick the best-scoring id per position.
        # The previous code iterated over the score vector of the first
        # position as if it were a list of token ids.
        predictions = predictions.argmax(axis=-1)
        if predictions.shape == labels.shape:
            # Positions that are padding in the reference carry no meaningful
            # prediction; blank them so they do not pollute the decoded text.
            predictions = np.where(labels != pad_id, predictions, pad_id)

    # Ids outside the tokenizer vocabulary cannot be decoded (the model's
    # output layer may be wider than the vocabulary, and -100 padding may be
    # present); replace them with the pad token so they are skipped.
    in_vocab = (predictions >= 0) & (predictions < tokenizer.vocab_size)
    predictions = np.where(in_vocab, predictions, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    if not decoded_preds:
        # Mean of an empty tensor is NaN; report a neutral score instead.
        return {"cosine": 0.0}

    # Load the embedding model once and reuse it across evaluations.
    embedder = getattr(compute_metrics, "_embedder", None)
    if embedder is None:
        embedder = SentenceTransformer('all-MiniLM-L6-v2')
        compute_metrics._embedder = embedder

    embeddings_pred = embedder.encode(decoded_preds, convert_to_tensor=True)
    embeddings_label = embedder.encode(decoded_labels, convert_to_tensor=True)
    # Row-wise cosine similarity: same values as the diagonal of the full
    # N x N similarity matrix, without materialising that matrix.
    cosine_scores = torch.nn.functional.cosine_similarity(
        embeddings_pred, embeddings_label, dim=1
    )
    avg_cosine = cosine_scores.mean().item()
    return {"cosine": avg_cosine}
# Wire the model, data, and metric together. Keyword order differs from the
# notebook export; the arguments themselves are unchanged.
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["eval"],
)

# Fine-tune, then run one more evaluation pass and print its metrics.
trainer.train()

results = trainer.evaluate()
print("Evaluation Results:", results)