# -*- coding: utf-8 -*-
"""Untitled4.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/19SAJcA_N4eQVyeNjT1iFdgpyLvvtSSEw
"""

!pip install transformers datasets accelerate -q

from google.colab import files
uploaded = files.upload()  # upload flan_t5_true_false_dataset.csv

from datasets import Dataset
from transformers import (
    T5Tokenizer,
    T5ForConditionalGeneration,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    DataCollatorForSeq2Seq,
)
import pandas as pd
import torch

# Load CSV file (adjust filename if needed)
df = pd.read_csv("flan_t5_true_false_dataset.csv")

# Convert to Hugging Face Dataset
dataset = Dataset.from_pandas(df)

# Load tokenizer and model
model_name = "google/flan-t5-base"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Preprocessing: `text_target=` replaces the deprecated `as_target_tokenizer()`
# context manager, and padded label positions are set to -100 so the loss ignores them.
def preprocess(examples):
    model_inputs = tokenizer(examples["input"], padding="max_length", truncation=True, max_length=256)
    labels = tokenizer(text_target=examples["output"], padding="max_length", truncation=True, max_length=64)
    model_inputs["labels"] = [
        [(tok if tok != tokenizer.pad_token_id else -100) for tok in seq]
        for seq in labels["input_ids"]
    ]
    return model_inputs

# Tokenize dataset
tokenized_dataset = dataset.map(preprocess, batched=True)
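
# Optional sanity check (a small sketch, not in the original notebook):
# decode the first preprocessed example to confirm inputs and labels line up.
sample = tokenized_dataset[0]
print("input :", tokenizer.decode(sample["input_ids"], skip_special_tokens=True))
print("label :", tokenizer.decode([t for t in sample["labels"] if t != -100], skip_special_tokens=True))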

# Define training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./flan_t5_finetuned_model",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    save_steps=500,
    logging_steps=100,
    save_total_limit=1,
    fp16=torch.cuda.is_available(),  # note: T5 variants can be unstable in fp16; bf16 is safer on supported GPUs
    report_to="none",  # skip the wandb login prompt in Colab
)

# Trainer setup
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=DataCollatorForSeq2Seq(tokenizer, model=model),
)

# Start training
trainer.train()

# Save the final model and tokenizer explicitly so the zipped folder contains
# usable weights even if training ended before a checkpoint step was reached
trainer.save_model("./flan_t5_finetuned_model")
tokenizer.save_pretrained("./flan_t5_finetuned_model")

!zip -r flan_t5_finetuned_model.zip flan_t5_finetuned_model
files.download("flan_t5_finetuned_model.zip")
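
# Quick smoke test (a hedged sketch, not in the original notebook; the prompt
# below is made up): run one input through the fine-tuned model.
model.eval()
prompt = "Convert this fact into a true/false question: Water boils at 100 degrees Celsius."
input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)
print(tokenizer.decode(model.generate(input_ids, max_new_tokens=64)[0], skip_special_tokens=True))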

import pandas as pd

# Build a small synthetic evaluation set (150 near-identical toy rows)
data = [
    {
        "input": f"Convert this fact into a true/false question: The moon is made of cheese {i}.",
        "output": f"The moon is made of cheese {i}. True or False?"
    }
    for i in range(150)
]

df = pd.DataFrame(data)
df.to_csv("flan_t5_eval.csv", index=False)

from google.colab import files
files.download('flan_t5_eval.csv')

!pip install transformers datasets bert-score sentence-transformers -q

from google.colab import files
uploaded = files.upload()  # upload flan_t5_eval.csv (e.g. when resuming in a fresh runtime)

EVAL_CSV = "/content/flan_t5_eval.csv"

# Confirm the saved model directory exists (output_dir used during training)
!ls -l ./flan_t5_finetuned_model
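
# Evaluation sketch (the notebook stops before this step, so everything below
# is an assumption-laden outline, not the original code): reload the fine-tuned
# model from "./flan_t5_finetuned_model" (the output_dir above), generate one
# prediction per eval row, and score against the references with BERTScore.
import torch
from bert_score import score

eval_df = pd.read_csv(EVAL_CSV)
device = "cuda" if torch.cuda.is_available() else "cpu"
eval_tokenizer = T5Tokenizer.from_pretrained("./flan_t5_finetuned_model")
eval_model = T5ForConditionalGeneration.from_pretrained("./flan_t5_finetuned_model").to(device)
eval_model.eval()

predictions = []
for text in eval_df["input"]:
    ids = eval_tokenizer(text, return_tensors="pt", truncation=True, max_length=256).input_ids.to(device)
    with torch.no_grad():
        out = eval_model.generate(ids, max_new_tokens=64)
    predictions.append(eval_tokenizer.decode(out[0], skip_special_tokens=True))

# BERTScore between generated questions and the reference outputs
P, R, F1 = score(predictions, eval_df["output"].tolist(), lang="en")
print(f"BERTScore F1: {F1.mean().item():.4f}")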