Update README.md

README.md CHANGED

```diff
@@ -53,7 +53,7 @@ This dataset has been filtered for the Spanish language containing **303,000 examples
 
 ## Fine-Tuning Details
 - The model was trained using **Contrastive Training**.
-
+- Evaluated with <code>pylate.evaluation.colbert_triplet.ColBERTTripletEvaluator</code>
 
 | Metric | Value |
 |:-------------|:-----------|
```
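
The evaluator added in this hunk reports, roughly, how often a query is scored closer to its positive document than to its negative one. Below is a minimal sketch of that idea, reusing the `colbert_similarity` MaxSim helper defined further down in this README; it illustrates what the metric measures, not the PyLate implementation, and `triplets` is a hypothetical list of embedding triples:

```python
# Illustrative sketch of what a triplet evaluation measures; not the PyLate evaluator.
# `triplets` is a hypothetical list of (query_emb, positive_emb, negative_emb) tensors,
# scored with the colbert_similarity MaxSim helper defined later in this README.
def triplet_accuracy(triplets):
    correct = 0
    for query_emb, positive_emb, negative_emb in triplets:
        if colbert_similarity(query_emb, positive_emb) > colbert_similarity(query_emb, negative_emb):
            correct += 1
    return correct / len(triplets)
```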

```diff
@@ -72,8 +72,7 @@ pip install -U pylate
 import torch
 from pylate import models
 
-# Load the ColBERT model
-# 'trust_remote_code=True' is required for custom models like ColBERT
+# Load the ColBERT model
 model = models.ColBERT("fjmgAI/col1-210M-EuroBERT", trust_remote_code=True)
 
 # Move the model to GPU if available, otherwise use CPU
```
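
The next hunk picks up after the device move and tokenization step, whose result is the `inputs` dict visible in its header line. A minimal sketch of what that elided step might look like follows; the placeholder texts, the use of `transformers.AutoTokenizer`, and the `max_length=32` padding are assumptions for illustration, not the README's actual code:

```python
from transformers import AutoTokenizer

# Illustrative only: move the model to the available device and build the `inputs`
# dict that the README later passes to model(inputs). The tokenizer choice, the
# placeholder Spanish texts, and the padding length are assumptions.
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

texts = [
    "¿Cuál es la capital de Francia?",   # query (placeholder)
    "París es la capital de Francia.",   # positive document (placeholder)
    "El gato duerme en el sofá.",        # negative document (placeholder)
]

tokenizer = AutoTokenizer.from_pretrained("fjmgAI/col1-210M-EuroBERT")
inputs = tokenizer(texts, padding="max_length", max_length=32, truncation=True, return_tensors="pt")
inputs = {key: value.to(device) for key, value in inputs.items()}
```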

```diff
@@ -96,9 +95,7 @@ inputs = {key: value.to(device) for key, value in inputs.items()}
 # Generate token embeddings (no gradients needed for inference)
 with torch.no_grad():
     # Forward pass through the model
-    embeddings_dict = model(inputs)
-
-    # Extract token-level embeddings (shape: [batch_size, seq_length, embedding_dim])
+    embeddings_dict = model(inputs)
 
 embeddings = embeddings_dict['token_embeddings']
 print(embeddings.shape)  # Expected: [3, 32, 128] (3 texts, 32 tokens max, 128-dim embeddings)
```
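
A side note on the scoring helper that the remaining hunks touch: its dot products behave like cosine similarities only when the token embeddings are unit length, which ColBERT-style heads usually guarantee already. A defensive sketch, assuming cosine-style scoring is intended (normalizing already-normalized vectors is a no-op):

```python
import torch.nn.functional as F

# MaxSim dot products act as cosine similarities only for unit-length token vectors.
# ColBERT-style heads usually L2-normalize already; normalizing again changes nothing,
# so this step is safe (assumption: cosine-style scoring is what the README intends).
embeddings = F.normalize(embeddings, p=2, dim=-1)
```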

```diff
@@ -116,7 +113,7 @@ def colbert_similarity(query_emb, doc_emb):
         Normalized similarity score
     """
     # Compute dot product between all token pairs
-    similarity_matrix = torch.matmul(query_emb, doc_emb.T)
+    similarity_matrix = torch.matmul(query_emb, doc_emb.T)
 
     # Get maximum similarity for each query token (MaxSim)
     max_similarities = similarity_matrix.max(dim=1)[0]
```

```diff
@@ -125,9 +122,9 @@ def colbert_similarity(query_emb, doc_emb):
     return max_similarities.sum() / query_emb.shape[0]
 
 # Extract embeddings for each text
-query_emb = embeddings[0]
-positive_emb = embeddings[1]
-negative_emb = embeddings[2]
+query_emb = embeddings[0]
+positive_emb = embeddings[1]
+negative_emb = embeddings[2]
 
 # Compute similarity scores
 positive_score = colbert_similarity(query_emb, positive_emb)  # Query vs positive doc
```
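
Read together, the last two hunks touch the middle and the tail of the README's MaxSim helper. Assembled from the fragments visible above into one self-contained sketch, with the negative-document comparison added for completeness (the `negative_score` line falls outside this diff, so it is illustrative rather than the README's exact code):

```python
def colbert_similarity(query_emb, doc_emb):
    """ColBERT MaxSim score between one query and one document.

    Returns:
        Normalized similarity score
    """
    # Compute dot product between all token pairs
    similarity_matrix = torch.matmul(query_emb, doc_emb.T)

    # Get maximum similarity for each query token (MaxSim)
    max_similarities = similarity_matrix.max(dim=1)[0]

    # Average the per-token maxima over the number of query tokens
    return max_similarities.sum() / query_emb.shape[0]

# Compute similarity scores
positive_score = colbert_similarity(query_emb, positive_emb)  # Query vs positive doc
negative_score = colbert_similarity(query_emb, negative_emb)  # Query vs negative doc (illustrative)

# A well-trained model should rank the positive document higher
print(positive_score.item() > negative_score.item())  # Expected: True
```

Dividing by `query_emb.shape[0]` keeps the score on the same scale regardless of query length, which is why the positive and negative scores can be compared directly.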