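"""
Evaluate a causal language model's average cross-entropy loss on a CSV corpus.

The CSV file must contain a 'Text' column. Rows are packed into fixed-length
sequences (rows_per_sample rows per sample, separated by EOS tokens, then
padded or truncated to seq_length) and scored with labels equal to the inputs.

Example invocation (the script name and paths below are illustrative):
    python evaluate_csv_loss.py --model_path ./model --tokenizer_path ./model \
        --csv_path data/eval.csv --seq_length 4096 --batch_size 2
"""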
import os
import argparse
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from tqdm import tqdm
import pandas as pd

class CSVDataset(Dataset):
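    """Dataset that packs groups of CSV rows into fixed-length, EOS-separated token sequences."""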
    def __init__(self, filepath, tokenizer, seq_length, rows_per_sample): 
        self.data = pd.read_csv(filepath)
        self.text_data = self.data['Text'].tolist()
        self.tokenizer = tokenizer
        self.seq_length = seq_length
        self.rows_per_sample = rows_per_sample  # Number of rows to pack per sample

        # Maximum number of characters kept per row before tokenization
        # (17,500 here; 15,000 was used for the Phi-3 model).
        self.CAP_SAMPLE_LEN = 17500

        # Make sure the tokenizer exposes EOS and PAD tokens. Note that if these
        # calls actually add new tokens, the model's embedding table must already
        # cover them (otherwise model.resize_token_embeddings would be required).
        if self.tokenizer.eos_token is None:
            self.tokenizer.add_special_tokens({'eos_token': '<|endoftext|>'})

        if self.tokenizer.pad_token is None:
            self.tokenizer.add_special_tokens({'pad_token': '<|pad|>'})

        self.eos_token_id = self.tokenizer.eos_token_id
        self.pad_token_id = self.tokenizer.pad_token_id

    def __len__(self): 
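        # Ceiling division: number of packed samples needed to cover all rows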
        return (len(self.text_data) + self.rows_per_sample - 1) // self.rows_per_sample

    def __getitem__(self, idx):
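        """Build one packed sample of exactly seq_length token ids from rows_per_sample CSV rows."""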
        start_idx = idx * self.rows_per_sample
        end_idx = min(start_idx + self.rows_per_sample, len(self.text_data))

        lines = self.text_data[start_idx:end_idx]

        # Truncate each line at CAP_SAMPLE_LEN (preferably at a space boundary)
        truncated_lines = []
        for text in lines:
            if len(text) > self.CAP_SAMPLE_LEN:
                l = text.rfind(' ', 0, self.CAP_SAMPLE_LEN)
                if l < 0:
                    l = self.CAP_SAMPLE_LEN
                text = text[:l]
            truncated_lines.append(text)

        # Tokenize all lines at once. Each line will be tokenized independently.
        # We use add_special_tokens=False to avoid introducing BOS/EOS tokens automatically.
        batch_encodings = self.tokenizer(
            truncated_lines,
            add_special_tokens=False,
            truncation=True,
            max_length=self.seq_length - 2,  # Reserve space for EOS tokens
            return_tensors=None
        )

        # batch_encodings["input_ids"] is a list of lists, each sub-list is token_ids for a line.
        input_ids_list = []
        for tokens in batch_encodings["input_ids"]:
            # Append an EOS token after each line
            tokens.append(self.eos_token_id)
            input_ids_list.extend(tokens)

        # Now we have a single list of input_ids for all rows.
        # Ensure final token is EOS
        if input_ids_list[-1] != self.eos_token_id:
            input_ids_list.append(self.eos_token_id)

        # Handle length adjustments
        if len(input_ids_list) > self.seq_length:
            # Truncate from the end
            tokens_to_remove = len(input_ids_list) - self.seq_length
            input_ids_list = input_ids_list[:-tokens_to_remove]
            # Ensure EOS at the end after truncation
            if input_ids_list[-1] != self.eos_token_id:
                input_ids_list[-1] = self.eos_token_id
        elif len(input_ids_list) < self.seq_length:
            # Pad until we reach seq_length
            padding_length = self.seq_length - len(input_ids_list)
            input_ids_list.extend([self.pad_token_id] * padding_length)
            # Ensure EOS at the end
            input_ids_list[-1] = self.eos_token_id

        input_ids = torch.tensor(input_ids_list, dtype=torch.long)
        return input_ids


def evaluate_model(model, dataloader, device):
    """
    Evaluate the model batch by batch and print the losses for each batch.
    """
    model.eval()
    total_loss = 0

    with torch.no_grad():
        for batch_idx, input_ids in enumerate(tqdm(dataloader, desc="Evaluating Model")):
            input_ids = input_ids.to(device)

            # Forward pass with labels equal to the inputs; the model shifts the
            # labels internally for next-token prediction. Padding positions are
            # not masked out here, so they also contribute to the reported loss.
            outputs = model(input_ids, labels=input_ids)
            loss = outputs.loss.item()
            total_loss += loss

            # Print loss for the current batch
            print(f"Batch {batch_idx + 1} Loss: {loss:.4f}")

    # Average of the per-batch mean losses (each batch loss is already token-averaged)
    avg_loss = total_loss / len(dataloader)
    return avg_loss
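
# A variant that excludes padding from the loss (a sketch only; pad_token_id
# would need to be passed in, e.g. from the tokenizer used to build the dataset):
#     labels = input_ids.clone()
#     labels[labels == pad_token_id] = -100   # -100 is ignored by the CE loss
#     outputs = model(input_ids, labels=labels)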


def evaluate_single_model(model_path, tokenizer_path, csv_path, seq_length, batch_size, device):
    """
    Evaluate a single model on the dataset and print losses for each batch.
    """
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
    dataset = CSVDataset(csv_path, tokenizer, seq_length, rows_per_sample=50)  # 50 CSV rows packed per sample
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False, pin_memory=True, num_workers=4)

    # Alternative: load the model directly in bfloat16
    # model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.bfloat16).to(device)

    # Alternative: load the model in 4-bit precision
    # bnb_config = BitsAndBytesConfig(load_in_4bit=True)

    # Load the model in half precision (float16)
    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        # quantization_config=bnb_config,  # enable for 4-bit quantized loading
        torch_dtype=torch.float16,
        # device_map="auto"
    ).to(device)

    # Optional post-processing steps (disabled):
    # model.to(torch.bfloat16)                          # convert weights to bfloat16
    # if hasattr(model.config, "quantization_config"):  # strip quantization metadata
    #     delattr(model.config, "quantization_config")
    #     print("Removed quantization_config from model configuration.")
    # model.save_pretrained("model_bfloat16")           # save the converted model

    # Check the model's dtype (torch.float16 with the settings above)
    print(model.dtype)

    print("Evaluating Model...")
    avg_loss = evaluate_model(model, dataloader, device)
    print(f"Average Loss: {avg_loss:.4f}")
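    # If perplexity is also of interest, it can be derived from the average loss:
    #     import math
    #     print(f"Perplexity: {math.exp(avg_loss):.4f}")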

    return avg_loss


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_path", type=str, required=True, help="Path to the model.")
    parser.add_argument("--tokenizer_path", type=str, required=True, help="Path to the tokenizer.")
    parser.add_argument("--csv_path", type=str, required=True, help="Path to the CSV file with 'Text' column.")
    parser.add_argument("--seq_length", type=int, default=4096, help="Maximum sequence length.")
    parser.add_argument("--batch_size", type=int, default=2, help="Batch size for evaluation.")
    parser.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu", help="Device to use.")

    args = parser.parse_args()

    evaluate_single_model(
        args.model_path,
        args.tokenizer_path,
        args.csv_path,
        args.seq_length,
        args.batch_size,
        args.device
    )