# -*- coding: utf-8 -*-
"""Untitled6.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/11megvyfcr49Oy4FGK7kteQ2iMdxZYp4L
"""
!pip install transformers datasets sentence-transformers evaluate scikit-learn
# Upload train.csv and eval.csv (both files can be selected in the same upload dialog)
from google.colab import files
uploaded = files.upload()
from transformers import T5Tokenizer, T5ForConditionalGeneration, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq
from datasets import Dataset, DatasetDict
from sentence_transformers import SentenceTransformer, util
import pandas as pd
import torch
import numpy as np
def load_csv_datasets(train_path, eval_path):
    """Load the train/eval CSV files into a DatasetDict."""
    train_df = pd.read_csv(train_path)
    eval_df = pd.read_csv(eval_path)
    dataset = DatasetDict({
        'train': Dataset.from_pandas(train_df),
        'eval': Dataset.from_pandas(eval_df)
    })
    return dataset
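# A quick sanity check of the expected CSV layout -- a sketch, assuming the column names
# 'input' and 'target' that preprocess() below relies on. The highlight-style prompt
# ("generate question: ... <hl> answer <hl> ...") is the format typically used with
# valhalla/t5-base-qg-hl; adapt the example to however your data was actually prepared.
sample_df = pd.DataFrame({
    'input': ["generate question: <hl> Paris <hl> is the capital of France."],
    'target': ["What is the capital of France?"],
})
print(sample_df.head())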
def preprocess(example):
    input_text = example['input']
    target_text = example['target']
    model_inputs = tokenizer(input_text, max_length=512, padding='max_length', truncation=True)
    labels = tokenizer(text_target=target_text, max_length=64, padding='max_length', truncation=True)
    # Mask padding tokens in the labels with -100 so they are ignored by the loss
    labels["input_ids"] = [
        [(tok if tok != tokenizer.pad_token_id else -100) for tok in seq]
        for seq in labels["input_ids"]
    ]
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs
model_name = "valhalla/t5-base-qg-hl"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

dataset = load_csv_datasets("train.csv", "eval.csv")
tokenized_dataset = dataset.map(preprocess, batched=True)

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)
training_args = Seq2SeqTrainingArguments(
    output_dir="./qg_finetuned",
    eval_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    logging_dir='./logs',
    logging_steps=10,
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model="cosine",
    greater_is_better=True,
    predict_with_generate=True,   # decode with generate() so compute_metrics receives token ids
    generation_max_length=64
)
# Sentence embedder for the cosine-similarity metric (loaded once, reused at every evaluation)
embedder = SentenceTransformer('all-MiniLM-L6-v2')

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    # Replace -100 padding in predictions and labels, since it cannot be decoded
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    # Average cosine similarity between each generated question and its reference
    embeddings_pred = embedder.encode(decoded_preds, convert_to_tensor=True)
    embeddings_label = embedder.encode(decoded_labels, convert_to_tensor=True)
    cosine_scores = util.cos_sim(embeddings_pred, embeddings_label).diagonal()
    avg_cosine = cosine_scores.mean().item()
    return {"cosine": avg_cosine}
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["eval"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
trainer.train()

results = trainer.evaluate()
print("Evaluation Results:", results)