# -*- coding: utf-8 -*- """Untitled8.ipynb Automatically generated by Colab. Original file is located at https://colab.research.google.com/drive/1JMKmuuP0equrOr6l6oQVQbpbBnGTGvcc """ !pip install sentence-transformers from google.colab import files import pandas as pd import random uploaded = files.upload() file_name = list(uploaded.keys())[0] df = pd.read_csv(file_name) # Preview print("šŸ“„ Preview of training data:") print(df.head()) print(f"\nāœ… Loaded {len(df)} training pairs.") from sentence_transformers import InputExample train_examples = [ InputExample(texts=[row["text1"], row["text2"]], label=float(row["score"])) for _, row in df.iterrows() ] from sentence_transformers import SentenceTransformer, losses from torch.utils.data import DataLoader model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2") train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16) train_loss = losses.CosineSimilarityLoss(model) model.fit( train_objectives=[(train_dataloader, train_loss)], epochs=1, # Increase to 3–5 for better results warmup_steps=10, # Usually 10% of steps per epoch output_path="fine-tuned-mpnet-model" ) from sentence_transformers import SentenceTransformer model = SentenceTransformer("fine-tuned-mpnet-model") sentence = "This is a test sentence." 
# Smoke test: encode one sentence with the reloaded model and show its shape.
embedding = model.encode(sentence)
print(embedding.shape)

import os

import numpy as np
import torch
from scipy.stats import spearmanr
from sentence_transformers import SentenceTransformer
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers.util import cos_sim
from sklearn.metrics import mean_squared_error
from torch.utils.data import DataLoader
from tqdm import tqdm

# Load the fine-tuned checkpoint from disk once; every later use in this
# script shares this instance instead of re-reading the same weights.
fine_tuned_model = SentenceTransformer("fine-tuned-mpnet-model")

# Example usage
embedding = fine_tuned_model.encode("This is a test sentence.")
print("šŸ”¢ Embedding shape:", embedding.shape)

# Show the files written to the output directory.
print(os.listdir("fine-tuned-mpnet-model"))

# Load base and fine-tuned models
base_model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")
ft_model = fine_tuned_model  # same checkpoint, no second load needed

sentence = "This is a test sentence."
embedding = fine_tuned_model.encode(sentence)
print("šŸ”¢ Embedding shape:", embedding.shape)

"""You can now use the `fine_tuned_model` to generate embeddings for any text data.
For example, you can use these embeddings for tasks like semantic search, clustering, or classification."""


def evaluate_model(model, name):
    """Compare a model's pairwise cosine similarities with the reference scores.

    Encodes the module-level ``sentences1`` and ``sentences2`` lists, computes
    the cosine similarity of each aligned pair (i-th sentence of one list with
    the i-th sentence of the other), and prints the mean squared error and the
    Spearman correlation against the module-level ``true_scores``.

    Args:
        model: Object whose ``encode(sentences, convert_to_tensor=True)``
            returns a 2-D tensor with one embedding row per sentence.
        name: Label printed in the report header.

    Returns:
        NumPy array holding one cosine similarity per sentence pair.
    """
    embeddings1 = model.encode(sentences1, convert_to_tensor=True)
    embeddings2 = model.encode(sentences2, convert_to_tensor=True)

    # Only the aligned pairs are needed, so compute them row by row rather
    # than building the full N x N similarity matrix and reading its diagonal.
    similarities = (
        torch.nn.functional.cosine_similarity(embeddings1, embeddings2, dim=1)
        .cpu()
        .numpy()
    )

    mse = mean_squared_error(true_scores, similarities)
    spearman_corr, _ = spearmanr(true_scores, similarities)

    print(f"\nšŸ“‹ Evaluation: {name}")
    print("šŸ“ CosineSim vs Human Scores: ")
    print(f" • MSE: {mse:.4f}")
    print(f" • Spearman R: {spearman_corr:.4f}")
    return similarities


# Extract sentences and scores from the DataFrame
# NOTE(review): `df` is the same CSV the model was fine-tuned on, so these
# numbers measure fit to the training data, not generalization; a held-out
# split would be needed for that.
sentences1 = df['text1'].tolist()
sentences2 = df['text2'].tolist()
true_scores = df['score'].tolist()

# Evaluate both models
_ = evaluate_model(base_model, "Base MPNET")
_ = evaluate_model(ft_model, "Fine-Tuned MPNET")