Denis202 / train_model.py
import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import json
import os
# --- Config ---
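# bert-base-multilingual-cased is a multilingual checkpoint, so the same
# tokenizer and encoder handle the Kiswahili training text.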
MODEL_NAME = "bert-base-multilingual-cased"
TRAINING_FILE = "./training_data/greetings.txt"
SAVE_PATH = "./trained_bert_model"
EPOCHS = 3
BATCH_SIZE = 8
MAX_LEN = 64
LEARNING_RATE = 2e-5
# --- Load training data ---
def load_training_data(file_path):
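    """Read alternating "User: ..." / "Assistant: ..." lines from the training
    file and return them as two parallel lists of stripped strings."""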
    inputs, responses = [], []
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"{file_path} not found!")
    with open(file_path, "r", encoding="utf-8") as f:
        lines = [line.strip() for line in f if line.strip()]
    # Walk the lines in pairs; ignore a trailing unpaired line instead of crashing.
    for i in range(0, len(lines) - 1, 2):
        user_input = lines[i].replace("User:", "").strip()
        assistant_response = lines[i + 1].replace("Assistant:", "").strip()
        inputs.append(user_input)
        responses.append(assistant_response)
    return inputs, responses
# --- Dataset ---
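# Each (input, response) pair is joined with [SEP] and labeled 1, i.e. every
# example is treated as a positive pair for the 2-class sequence classifier.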
class KiswahiliDataset(Dataset):
    def __init__(self, inputs, responses, tokenizer):
        self.inputs = inputs
        self.responses = responses
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        text = f"{self.inputs[idx]} [SEP] {self.responses[idx]}"
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=MAX_LEN,
            return_tensors='pt'
        )
        # Label 1 = positive example
        label = torch.tensor(1)
        return {
            'input_ids': encoding['input_ids'].squeeze(),
            'attention_mask': encoding['attention_mask'].squeeze(),
            'labels': label
        }
# --- Main training ---
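# When labels are passed to the sequence-classification model, it returns a
# cross-entropy loss in outputs.loss, which the loop below backpropagates.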
def main():
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    inputs, responses = load_training_data(TRAINING_FILE)
    dataset = KiswahiliDataset(inputs, responses, tokenizer)
    train_loader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)
    model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)
    optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
    model.train()
    for epoch in range(EPOCHS):
        total_loss = 0
        for batch in train_loader:
            optimizer.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            total_loss += loss.item()
            loss.backward()
            optimizer.step()
        print(f"Epoch {epoch+1}/{EPOCHS} - Loss: {total_loss/len(train_loader):.4f}")
    # Save model and tokenizer
    os.makedirs(SAVE_PATH, exist_ok=True)
    model.save_pretrained(SAVE_PATH)
    tokenizer.save_pretrained(SAVE_PATH)
    # Save responses for chatbot
    with open(os.path.join(SAVE_PATH, "responses.json"), "w", encoding="utf-8") as f:
        json.dump({"responses": responses}, f, ensure_ascii=False, indent=4)
    print(f"✅ Training complete. Model saved to {SAVE_PATH}")
if __name__ == "__main__":
    main()
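# --- Sketch: how a chatbot script might reload the saved artifacts ---
# (Illustrative only; the candidate-scoring strategy is an assumption and not
#  part of this file.)
#
#   tokenizer = AutoTokenizer.from_pretrained(SAVE_PATH)
#   model = AutoModelForSequenceClassification.from_pretrained(SAVE_PATH)
#   with open(os.path.join(SAVE_PATH, "responses.json"), encoding="utf-8") as f:
#       candidates = json.load(f)["responses"]
#   # Score "user_text [SEP] candidate" for each candidate and reply with the
#   # highest-scoring response.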