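"""Evaluate a causal language model's loss over a CSV of text.

Reads a CSV file with a 'Text' column, packs multiple rows into fixed-length
token sequences separated by EOS, and reports the per-batch and average
cross-entropy loss.

Example invocation (the script name here is illustrative):

    python evaluate_loss.py --model_path ./my-model --tokenizer_path ./my-model \
        --csv_path data.csv --seq_length 4096 --batch_size 2
"""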

import argparse
import math

import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig


class CSVDataset(Dataset):
    """Packs multiple CSV rows into fixed-length token sequences for evaluation."""

    def __init__(self, filepath, tokenizer, seq_length, rows_per_sample):
        self.data = pd.read_csv(filepath)
        self.text_data = self.data['Text'].tolist()
        self.tokenizer = tokenizer
        self.seq_length = seq_length
        self.rows_per_sample = rows_per_sample  # Number of rows to pack per sample
        # Maximum number of characters per row before tokenizing; this bounds
        # per-sample tokenization cost (15000 was used for the Phi-3 model).
        self.CAP_SAMPLE_LEN = 17500
        if self.tokenizer.eos_token is None:
            self.tokenizer.add_special_tokens({'eos_token': '<|endoftext|>'})
        if self.tokenizer.pad_token is None:
            # Reuse EOS as the padding token. Adding a brand-new pad token here
            # would require resizing the model's embeddings before the forward
            # pass, or the new token id would be out of range.
            self.tokenizer.pad_token = self.tokenizer.eos_token
        self.eos_token_id = self.tokenizer.eos_token_id
        self.pad_token_id = self.tokenizer.pad_token_id

    def __len__(self):
        # Ceiling division: the last sample may pack fewer than rows_per_sample rows.
        return (len(self.text_data) + self.rows_per_sample - 1) // self.rows_per_sample

    def __getitem__(self, idx):
        start_idx = idx * self.rows_per_sample
        end_idx = min(start_idx + self.rows_per_sample, len(self.text_data))
        lines = self.text_data[start_idx:end_idx]
        # Truncate each row at CAP_SAMPLE_LEN characters, preferably at a space boundary.
        truncated_lines = []
        for text in lines:
            if len(text) > self.CAP_SAMPLE_LEN:
                cut = text.rfind(' ', 0, self.CAP_SAMPLE_LEN)
                if cut < 0:
                    cut = self.CAP_SAMPLE_LEN
                text = text[:cut]
            truncated_lines.append(text)
        # Tokenize all rows at once; add_special_tokens=False keeps the tokenizer
        # from inserting its own BOS/EOS so the separators stay under our control.
        batch_encodings = self.tokenizer(
            truncated_lines,
            add_special_tokens=False,
            truncation=True,
            max_length=self.seq_length - 2,  # Reserve space for EOS tokens
            return_tensors=None,
        )
        # batch_encodings["input_ids"] is a list of lists, one sub-list per row.
        input_ids_list = []
        for tokens in batch_encodings["input_ids"]:
            tokens.append(self.eos_token_id)  # Separate rows with an EOS token
            input_ids_list.extend(tokens)
        # Ensure the packed sequence ends with EOS.
        if input_ids_list[-1] != self.eos_token_id:
            input_ids_list.append(self.eos_token_id)
        # Truncate to seq_length if needed, keeping a trailing EOS.
        if len(input_ids_list) > self.seq_length:
            input_ids_list = input_ids_list[:self.seq_length]
            if input_ids_list[-1] != self.eos_token_id:
                input_ids_list[-1] = self.eos_token_id
        n_real = len(input_ids_list)  # Number of non-padding tokens
        if n_real < self.seq_length:
            input_ids_list.extend([self.pad_token_id] * (self.seq_length - n_real))
        input_ids = torch.tensor(input_ids_list, dtype=torch.long)
        # Mask padding positions out of both attention and the loss; positional
        # masking is required because pad and EOS may share the same token id.
        attention_mask = torch.zeros(self.seq_length, dtype=torch.long)
        attention_mask[:n_real] = 1
        labels = input_ids.clone()
        labels[n_real:] = -100  # -100 is ignored by the cross-entropy loss
        return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels}
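
# A quick standalone check of the dataset (paths and tokenizer name below are
# illustrative, not part of this project):
#
#     tok = AutoTokenizer.from_pretrained("gpt2")
#     ds = CSVDataset("data.csv", tok, seq_length=512, rows_per_sample=4)
#     sample = ds[0]
#     print(sample["input_ids"].shape)  # torch.Size([512])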


def evaluate_model(model, dataloader, device):
    """
    Evaluate the model batch by batch and print the loss for each batch.
    """
    model.eval()
    total_loss = 0.0
    with torch.no_grad():
        for batch_idx, batch in enumerate(tqdm(dataloader, desc="Evaluating Model")):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)
            # The model shifts labels internally, so passing labels directly
            # yields the mean next-token cross-entropy over non-masked positions.
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss.item()
            total_loss += loss
            # Print loss for the current batch
            print(f"Batch {batch_idx + 1} Loss: {loss:.4f}")
    avg_loss = total_loss / len(dataloader)
    return avg_loss
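

# Mean cross-entropy converts to perplexity via exp(). A minimal helper sketch;
# note that averaging per-batch losses is only exactly token-weighted when every
# batch contains the same number of non-masked tokens.
def loss_to_perplexity(avg_loss):
    return math.exp(avg_loss)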


def evaluate_single_model(model_path, tokenizer_path, csv_path, seq_length, batch_size, device):
    """
    Evaluate a single model on the dataset and print losses for each batch.
    """
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
    dataset = CSVDataset(csv_path, tokenizer, seq_length, rows_per_sample=50)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False, pin_memory=True, num_workers=4)
    # Load the model in half precision. To evaluate a 4-bit quantized model
    # instead, pass quantization_config=BitsAndBytesConfig(load_in_4bit=True)
    # and device_map="auto", and drop the .to(device) call.
    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        torch_dtype=torch.float16,
    ).to(device)
    print(f"Model dtype: {model.dtype}")  # torch.float16 with the settings above
    print("Evaluating Model...")
    avg_loss = evaluate_model(model, dataloader, device)
    print(f"Average Loss: {avg_loss:.4f}")
    return avg_loss


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_path", type=str, required=True, help="Path to the model.")
    parser.add_argument("--tokenizer_path", type=str, required=True, help="Path to the tokenizer.")
    parser.add_argument("--csv_path", type=str, required=True, help="Path to the CSV file with a 'Text' column.")
    parser.add_argument("--seq_length", type=int, default=4096, help="Maximum sequence length.")
    parser.add_argument("--batch_size", type=int, default=2, help="Batch size for evaluation.")
    parser.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu", help="Device to use.")
    args = parser.parse_args()
    evaluate_single_model(
        args.model_path,
        args.tokenizer_path,
        args.csv_path,
        args.seq_length,
        args.batch_size,
        args.device,
    )