Update README.md

README.md CHANGED

```diff
@@ -53,7 +53,7 @@ This dataset has been filtered for the Spanish language containing **303,000 examples
 
 ## Fine-Tuning Details
 - The model was trained using **Contrastive Training**.
-
+- Evaluated with <code>pylate.evaluation.colbert_triplet.ColBERTTripletEvaluator</code>
 
 | Metric | Value |
 |:-------------|:-----------|
```
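
The evaluator added in this hunk reports, roughly, how often a query is scored closer to its positive document than to its negative one. Below is a minimal sketch of that idea, reusing the `colbert_similarity` MaxSim helper defined further down in this README; it illustrates what the metric measures, not the PyLate implementation, and `triplets` is a hypothetical list of embedding triples:

```python
# Illustrative sketch of what a triplet evaluation measures; not the PyLate evaluator.
# `triplets` is a hypothetical list of (query_emb, positive_emb, negative_emb) tensors,
# scored with the colbert_similarity MaxSim helper defined later in this README.
def triplet_accuracy(triplets):
    correct = 0
    for query_emb, positive_emb, negative_emb in triplets:
        if colbert_similarity(query_emb, positive_emb) > colbert_similarity(query_emb, negative_emb):
            correct += 1
    return correct / len(triplets)
```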

```diff
@@ -72,8 +72,7 @@ pip install -U pylate
 import torch
 from pylate import models
 
-# Load the ColBERT model
-# 'trust_remote_code=True' is required for custom models like ColBERT
+# Load the ColBERT model
 model = models.ColBERT("fjmgAI/col1-210M-EuroBERT", trust_remote_code=True)
 
 # Move the model to GPU if available, otherwise use CPU
```
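
The next hunk picks up after the device move and tokenization step, whose result is the `inputs` dict visible in its header line. A minimal sketch of what that elided step might look like follows; the placeholder texts, the use of `transformers.AutoTokenizer`, and the `max_length=32` padding are assumptions for illustration, not the README's actual code:

```python
from transformers import AutoTokenizer

# Illustrative only: move the model to the available device and build the `inputs`
# dict that the README later passes to model(inputs). The tokenizer choice, the
# placeholder Spanish texts, and the padding length are assumptions.
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

texts = [
    "¿Cuál es la capital de Francia?",   # query (placeholder)
    "París es la capital de Francia.",   # positive document (placeholder)
    "El gato duerme en el sofá.",        # negative document (placeholder)
]

tokenizer = AutoTokenizer.from_pretrained("fjmgAI/col1-210M-EuroBERT")
inputs = tokenizer(texts, padding="max_length", max_length=32, truncation=True, return_tensors="pt")
inputs = {key: value.to(device) for key, value in inputs.items()}
```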

```diff
@@ -96,9 +95,7 @@ inputs = {key: value.to(device) for key, value in inputs.items()}
 # Generate token embeddings (no gradients needed for inference)
 with torch.no_grad():
     # Forward pass through the model
-    embeddings_dict = model(inputs)
-
-    # Extract token-level embeddings (shape: [batch_size, seq_length, embedding_dim])
+    embeddings_dict = model(inputs)
 
 embeddings = embeddings_dict['token_embeddings']
 print(embeddings.shape)  # Expected: [3, 32, 128] (3 texts, 32 tokens max, 128-dim embeddings)
```
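
A side note on the scoring helper that the remaining hunks touch: its dot products behave like cosine similarities only when the token embeddings are unit length, which ColBERT-style heads usually guarantee already. A defensive sketch, assuming cosine-style scoring is intended (normalizing already-normalized vectors is a no-op):

```python
import torch.nn.functional as F

# MaxSim dot products act as cosine similarities only for unit-length token vectors.
# ColBERT-style heads usually L2-normalize already; normalizing again changes nothing,
# so this step is safe (assumption: cosine-style scoring is what the README intends).
embeddings = F.normalize(embeddings, p=2, dim=-1)
```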

```diff
@@ -116,7 +113,7 @@ def colbert_similarity(query_emb, doc_emb):
         Normalized similarity score
     """
     # Compute dot product between all token pairs
-    similarity_matrix = torch.matmul(query_emb, doc_emb.T)
+    similarity_matrix = torch.matmul(query_emb, doc_emb.T)
 
     # Get maximum similarity for each query token (MaxSim)
     max_similarities = similarity_matrix.max(dim=1)[0]
```

```diff
@@ -125,9 +122,9 @@ def colbert_similarity(query_emb, doc_emb):
     return max_similarities.sum() / query_emb.shape[0]
 
 # Extract embeddings for each text
-query_emb = embeddings[0]
-positive_emb = embeddings[1]
-negative_emb = embeddings[2]
+query_emb = embeddings[0]
+positive_emb = embeddings[1]
+negative_emb = embeddings[2]
 
 # Compute similarity scores
 positive_score = colbert_similarity(query_emb, positive_emb)  # Query vs positive doc
```
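
Read together, the last two hunks touch the middle and the tail of the README's MaxSim helper. Assembled from the fragments visible above into one self-contained sketch, with the negative-document comparison added for completeness (the `negative_score` line falls outside this diff, so it is illustrative rather than the README's exact code):

```python
def colbert_similarity(query_emb, doc_emb):
    """ColBERT MaxSim score between one query and one document.

    Returns:
        Normalized similarity score
    """
    # Compute dot product between all token pairs
    similarity_matrix = torch.matmul(query_emb, doc_emb.T)

    # Get maximum similarity for each query token (MaxSim)
    max_similarities = similarity_matrix.max(dim=1)[0]

    # Average the per-token maxima over the number of query tokens
    return max_similarities.sum() / query_emb.shape[0]

# Compute similarity scores
positive_score = colbert_similarity(query_emb, positive_emb)  # Query vs positive doc
negative_score = colbert_similarity(query_emb, negative_emb)  # Query vs negative doc (illustrative)

# A well-trained model should rank the positive document higher
print(positive_score.item() > negative_score.item())  # Expected: True
```

Dividing by `query_emb.shape[0]` keeps the score on the same scale regardless of query length, which is why the positive and negative scores can be compared directly.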