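"""
Evaluate a causal language model's average cross-entropy loss on a CSV corpus.

The CSV file must contain a 'Text' column. Rows are packed into fixed-length
sequences (rows_per_sample rows per sample, separated by EOS tokens, then
padded or truncated to seq_length) and scored with labels equal to the inputs.

Example invocation (the script name and paths below are illustrative):
    python evaluate_csv_loss.py --model_path ./model --tokenizer_path ./model \
        --csv_path data/eval.csv --seq_length 4096 --batch_size 2
"""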
import os
import argparse
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from tqdm import tqdm
import pandas as pd

class CSVDataset(Dataset):
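    """Dataset that packs groups of CSV rows into fixed-length, EOS-separated token sequences."""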
    def __init__(self, filepath, tokenizer, seq_length, rows_per_sample): 
        self.data = pd.read_csv(filepath)
        self.text_data = self.data['Text'].tolist()
        self.tokenizer = tokenizer
        self.seq_length = seq_length
        self.rows_per_sample = rows_per_sample  # Number of rows to pack per sample

        # Maximum number of characters kept per row before tokenization
        # (17,500 here; 15,000 was used for the Phi-3 model).
        self.CAP_SAMPLE_LEN = 17500

        # Make sure the tokenizer exposes EOS and PAD tokens. Note that if these
        # calls actually add new tokens, the model's embedding table must already
        # cover them (otherwise model.resize_token_embeddings would be required).
        if self.tokenizer.eos_token is None:
            self.tokenizer.add_special_tokens({'eos_token': '<|endoftext|>'})

        if self.tokenizer.pad_token is None:
            self.tokenizer.add_special_tokens({'pad_token': '<|pad|>'})

        self.eos_token_id = self.tokenizer.eos_token_id
        self.pad_token_id = self.tokenizer.pad_token_id

    def __len__(self): 
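        # Ceiling division: number of packed samples needed to cover all rows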
        return (len(self.text_data) + self.rows_per_sample - 1) // self.rows_per_sample

    def __getitem__(self, idx):
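        """Build one packed sample of exactly seq_length token ids from rows_per_sample CSV rows."""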
        start_idx = idx * self.rows_per_sample
        end_idx = min(start_idx + self.rows_per_sample, len(self.text_data))

        lines = self.text_data[start_idx:end_idx]

        # Truncate each line at CAP_SAMPLE_LEN (preferably at a space boundary)
        truncated_lines = []
        for text in lines:
            if len(text) > self.CAP_SAMPLE_LEN:
                l = text.rfind(' ', 0, self.CAP_SAMPLE_LEN)
                if l < 0:
                    l = self.CAP_SAMPLE_LEN
                text = text[:l]
            truncated_lines.append(text)

        # Tokenize all lines at once. Each line will be tokenized independently.
        # We use add_special_tokens=False to avoid introducing BOS/EOS tokens automatically.
        batch_encodings = self.tokenizer(
            truncated_lines,
            add_special_tokens=False,
            truncation=True,
            max_length=self.seq_length - 2,  # Reserve space for EOS tokens
            return_tensors=None
        )

        # batch_encodings["input_ids"] is a list of lists, each sub-list is token_ids for a line.
        input_ids_list = []
        for tokens in batch_encodings["input_ids"]:
            # Append an EOS token after each line
            tokens.append(self.eos_token_id)
            input_ids_list.extend(tokens)

        # Now we have a single list of input_ids for all rows.
        # Ensure final token is EOS
        if input_ids_list[-1] != self.eos_token_id:
            input_ids_list.append(self.eos_token_id)

        # Handle length adjustments
        if len(input_ids_list) > self.seq_length:
            # Truncate from the end
            tokens_to_remove = len(input_ids_list) - self.seq_length
            input_ids_list = input_ids_list[:-tokens_to_remove]
            # Ensure EOS at the end after truncation
            if input_ids_list[-1] != self.eos_token_id:
                input_ids_list[-1] = self.eos_token_id
        elif len(input_ids_list) < self.seq_length:
            # Pad until we reach seq_length
            padding_length = self.seq_length - len(input_ids_list)
            input_ids_list.extend([self.pad_token_id] * padding_length)
            # Ensure EOS at the end
            input_ids_list[-1] = self.eos_token_id

        input_ids = torch.tensor(input_ids_list, dtype=torch.long)
        return input_ids


def evaluate_model(model, dataloader, device):
    """
    Evaluate the model batch by batch and print the losses for each batch.
    """
    model.eval()
    total_loss = 0

    with torch.no_grad():
        for batch_idx, input_ids in enumerate(tqdm(dataloader, desc="Evaluating Model")):
            input_ids = input_ids.to(device)

            # Forward pass with labels equal to the inputs; the model shifts the
            # labels internally for next-token prediction. Padding positions are
            # not masked out here, so they also contribute to the reported loss.
            outputs = model(input_ids, labels=input_ids)
            loss = outputs.loss.item()
            total_loss += loss

            # Print loss for the current batch
            print(f"Batch {batch_idx + 1} Loss: {loss:.4f}")

    # Average of the per-batch mean losses (each batch loss is already token-averaged)
    avg_loss = total_loss / len(dataloader)
    return avg_loss
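
# A variant that excludes padding from the loss (a sketch only; pad_token_id
# would need to be passed in, e.g. from the tokenizer used to build the dataset):
#     labels = input_ids.clone()
#     labels[labels == pad_token_id] = -100   # -100 is ignored by the CE loss
#     outputs = model(input_ids, labels=labels)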


def evaluate_single_model(model_path, tokenizer_path, csv_path, seq_length, batch_size, device):
    """
    Evaluate a single model on the dataset and print losses for each batch.
    """
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
    dataset = CSVDataset(csv_path, tokenizer, seq_length, rows_per_sample=50)  # 50 CSV rows packed per sample
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False, pin_memory=True, num_workers=4)

    # Alternative: load the model directly in bfloat16
    # model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.bfloat16).to(device)

    # Alternative: load the model in 4-bit precision
    # bnb_config = BitsAndBytesConfig(load_in_4bit=True)

    # Load the model in half precision (float16)
    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        # quantization_config=bnb_config,  # enable for 4-bit quantized loading
        torch_dtype=torch.float16,
        # device_map="auto"
    ).to(device)

    # Optional post-processing steps (disabled):
    # model.to(torch.bfloat16)                          # convert weights to bfloat16
    # if hasattr(model.config, "quantization_config"):  # strip quantization metadata
    #     delattr(model.config, "quantization_config")
    #     print("Removed quantization_config from model configuration.")
    # model.save_pretrained("model_bfloat16")           # save the converted model

    # Check the model's dtype (torch.float16 with the settings above)
    print(model.dtype)

    print("Evaluating Model...")
    avg_loss = evaluate_model(model, dataloader, device)
    print(f"Average Loss: {avg_loss:.4f}")
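    # If perplexity is also of interest, it can be derived from the average loss:
    #     import math
    #     print(f"Perplexity: {math.exp(avg_loss):.4f}")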

    return avg_loss


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_path", type=str, required=True, help="Path to the model.")
    parser.add_argument("--tokenizer_path", type=str, required=True, help="Path to the tokenizer.")
    parser.add_argument("--csv_path", type=str, required=True, help="Path to the CSV file with 'Text' column.")
    parser.add_argument("--seq_length", type=int, default=4096, help="Maximum sequence length.")
    parser.add_argument("--batch_size", type=int, default=2, help="Batch size for evaluation.")
    parser.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu", help="Device to use.")

    args = parser.parse_args()

    evaluate_single_model(
        args.model_path,
        args.tokenizer_path,
        args.csv_path,
        args.seq_length,
        args.batch_size,
        args.device
    )