"""Gradio app that fine-tunes and serves a Vietnamese sentiment classifier.

The app has two tabs:

* a training tab that reads an uploaded CSV with ``comment`` and ``label``
  columns, fine-tunes ``model_name`` for binary classification and pushes
  the result to the Hugging Face Hub repository ``HUB_MODEL_ID``;
* a prediction tab that loads the model from that repository and labels a
  single sentence as positive or negative.
"""

import functools
import os
import traceback

import gradio as gr
import pandas as pd
import torch
from datasets import Dataset
from sklearn.metrics import accuracy_score, f1_score
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)

# Base checkpoint that is fine-tuned, and whose tokenizer is used everywhere.
model_name = "distilbert-base-multilingual-cased"
# Hub repository the fine-tuned model is pushed to and loaded back from.
HUB_MODEL_ID = "vnanhtuan/fine-tune-danh-gia-cam-xuc"

tokenizer = AutoTokenizer.from_pretrained(model_name)


def preprocess_function(examples):
    """Tokenize the ``comment`` field of a batch of dataset examples."""
    return tokenizer(examples["comment"], truncation=True, padding=True)


def compute_metrics(eval_pred):
    """Compute accuracy and weighted F1 from an ``(logits, labels)`` pair.

    Returns:
        A dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    logits, labels = eval_pred
    predictions = torch.argmax(torch.tensor(logits), dim=-1).numpy()
    return {
        "accuracy": accuracy_score(labels, predictions),
        "f1": f1_score(labels, predictions, average="weighted"),
    }


def _uploaded_path(file):
    """Return the filesystem path behind a ``gr.File`` value, or ``None``.

    The upload may arrive as a plain path or as an object exposing ``.name``
    (which one depends on the installed Gradio version), so both are handled.
    """
    if file is None:
        return None
    if isinstance(file, (str, os.PathLike)):
        return os.fspath(file)
    return file.name


@functools.lru_cache(maxsize=1)
def _load_sentiment_model():
    """Load the fine-tuned classifier from the Hub once and reuse it."""
    model = AutoModelForSequenceClassification.from_pretrained(HUB_MODEL_ID)
    model.eval()
    return model


def train_model(file):
    """Fine-tune the classifier on an uploaded CSV and push it to the Hub.

    Args:
        file: Value delivered by the ``gr.File`` component for the CSV upload.

    Returns:
        A status message for the UI: a success message, a validation
        message, or an error message that includes the traceback.
    """
    # The UI is the top-level boundary: any failure is reported as text in the
    # result box rather than raised.
    try:
        csv_path = _uploaded_path(file)
        if csv_path is None:
            return "Vui lòng tải lên file CSV."

        df = pd.read_csv(csv_path)
        if "comment" not in df.columns or "label" not in df.columns:
            return "File CSV phải có cột 'comment' và 'label'"

        # Rows with missing text or label cannot be tokenized or trained on.
        df = df.dropna(subset=["comment", "label"])
        # The model is built with num_labels=2, so only class ids 0 and 1 are
        # valid; anything else (including non-numeric labels) is rejected.
        labels = pd.to_numeric(df["label"], errors="coerce")
        if not labels.isin([0, 1]).all():
            return "Cột 'label' chỉ được chứa giá trị 0 hoặc 1"
        df = df.assign(
            comment=df["comment"].astype(str),
            label=labels.astype(int),
        )

        # preserve_index=False keeps the filtered frame's index from turning
        # into an extra dataset column.
        dataset = Dataset.from_pandas(df, preserve_index=False)
        tokenized_dataset = dataset.map(preprocess_function, batched=True)
        tokenized_dataset = tokenized_dataset.train_test_split(test_size=0.2)
        train_dataset = tokenized_dataset["train"]
        eval_dataset = tokenized_dataset["test"]

        model = AutoModelForSequenceClassification.from_pretrained(
            model_name, num_labels=2
        )

        # NOTE(review): recent transformers releases rename
        # `evaluation_strategy` to `eval_strategy` and Trainer's `tokenizer`
        # to `processing_class` - confirm against the pinned version.
        args = TrainingArguments(
            output_dir="results",
            evaluation_strategy="epoch",
            save_strategy="epoch",
            learning_rate=2e-5,
            per_device_train_batch_size=8,
            per_device_eval_batch_size=8,
            num_train_epochs=3,
            weight_decay=0.01,
            logging_dir="logs",
            logging_steps=10,
            push_to_hub=True,
            hub_model_id=HUB_MODEL_ID,
        )

        trainer = Trainer(
            model=model,
            args=args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            tokenizer=tokenizer,
            compute_metrics=compute_metrics,
        )

        trainer.train()
        trainer.push_to_hub()
        # Drop the cached inference model so the next prediction reloads the
        # freshly pushed weights.
        _load_sentiment_model.cache_clear()
        return "Huấn luyện hoàn tất và model đã được đẩy lên Hugging Face."
    except Exception as e:
        return f"❌ Error: {str(e)}\n\n{traceback.format_exc()}"


def predict_sentiment(text):
    """Classify one sentence with the fine-tuned model from the Hub.

    Args:
        text: Sentence entered in the prediction tab.

    Returns:
        ``"Tích cực"`` when the predicted class is 1, ``"Tiêu cực"``
        otherwise, or a prompt asking for input when ``text`` is blank.
    """
    if not text or not text.strip():
        return "Vui lòng nhập câu cần phân tích."

    # Cached: loading the model on every click is slow.
    model = _load_sentiment_model()
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
    prediction = torch.argmax(outputs.logits, dim=1).item()
    return "Tích cực" if prediction == 1 else "Tiêu cực"


with gr.Blocks() as demo:
    gr.Markdown("# Fine-tune mô hình phân loại cảm xúc tiếng Việt")

    with gr.Tab("Huấn luyện mô hình"):
        csv_file = gr.File(label="Tải file CSV gồm 2 cột: comment, label")
        train_button = gr.Button("Bắt đầu huấn luyện")
        train_output = gr.Textbox(label="Kết quả")
        train_button.click(fn=train_model, inputs=csv_file, outputs=train_output)

    with gr.Tab("Dự đoán cảm xúc"):
        input_text = gr.Textbox(label="Nhập câu cần phân tích")
        output_label = gr.Textbox(label="Kết quả dự đoán")
        predict_button = gr.Button("Dự đoán")
        predict_button.click(
            fn=predict_sentiment, inputs=input_text, outputs=output_label
        )

# Launched at module level, exactly as the original script does.
demo.launch()