fjmgAI committed
Commit 37e4951 · verified · 1 Parent(s): 9c2036a

Update README.md

Files changed (1)
  1. README.md +7 -10
README.md CHANGED
@@ -53,7 +53,7 @@ This dataset has been filtered for the Spanish language containing **303,000 examples**
 
 ## Fine-Tuning Details
 - The model was trained using the **Contrastive Training**.
-- * Evaluated with <code>pylate.evaluation.colbert_triplet.ColBERTTripletEvaluator</code>
+- Evaluated with <code>pylate.evaluation.colbert_triplet.ColBERTTripletEvaluator</code>
 
 | Metric | Value |
 |:-------------|:-----------|
@@ -72,8 +72,7 @@ pip install -U pylate
 import torch
 from pylate import models
 
-# Load the ColBERT model from Hugging Face Hub
-# 'trust_remote_code=True' is required for custom models like ColBERT
+# Load the ColBERT model
 model = models.ColBERT("fjmgAI/col1-210M-EuroBERT", trust_remote_code=True)
 
 # Move the model to GPU if available, otherwise use CPU
@@ -96,9 +95,7 @@ inputs = {key: value.to(device) for key, value in inputs.items()}
 # Generate token embeddings (no gradients needed for inference)
 with torch.no_grad():
     # Forward pass through the model
-    embeddings_dict = model(inputs) # Returns dictionary with model outputs
-
-    # Extract token-level embeddings (shape: [batch_size, seq_length, embedding_dim])
+    embeddings_dict = model(inputs)
     embeddings = embeddings_dict['token_embeddings']
 print(embeddings.shape) # Expected: [3, 32, 128] (3 texts, 32 tokens max, 128-dim embeddings)
 
@@ -116,7 +113,7 @@ def colbert_similarity(query_emb, doc_emb):
     Normalized similarity score
     """
     # Compute dot product between all token pairs
-    similarity_matrix = torch.matmul(query_emb, doc_emb.T) # [query_tokens, doc_tokens]
+    similarity_matrix = torch.matmul(query_emb, doc_emb.T)
 
     # Get maximum similarity for each query token (MaxSim)
     max_similarities = similarity_matrix.max(dim=1)[0]
@@ -125,9 +122,9 @@ def colbert_similarity(query_emb, doc_emb):
     return max_similarities.sum() / query_emb.shape[0]
 
 # Extract embeddings for each text
-query_emb = embeddings[0] # [32, 128] - Query embeddings
-positive_emb = embeddings[1] # [32, 128] - Positive document embeddings
-negative_emb = embeddings[2] # [32, 128] - Negative document embeddings
+query_emb = embeddings[0]
+positive_emb = embeddings[1]
+negative_emb = embeddings[2]
 
 # Compute similarity scores
 positive_score = colbert_similarity(query_emb, positive_emb) # Query vs positive doc
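
The last context line of the diff stops at the positive-document score. For readers following the snippet, here is a short continuation under the same variable names; the negative-score line and the printout are a sketch of how the snippet typically concludes, not necessarily the README's exact wording.

```python
# Continuation of the snippet above (illustrative; variable names follow the diff)
negative_score = colbert_similarity(query_emb, negative_emb)  # Query vs negative doc

print(f"Positive score: {positive_score.item():.4f}")
print(f"Negative score: {negative_score.item():.4f}")
# A well-trained ColBERT model should score the positive document above the negative one
```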
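The README snippet goes through the low-level route (tokenize, then call `model(inputs)`). For comparison, here is a minimal sketch of the higher-level PyLate workflow following PyLate's upstream documentation; the example queries, documents, ids, and the `rank.rerank` call are illustrative assumptions, not content from this commit.

```python
from pylate import models, rank

# Same checkpoint as in the README snippet
model = models.ColBERT("fjmgAI/col1-210M-EuroBERT", trust_remote_code=True)

# Toy Spanish query/document pairs (placeholders, not from the dataset)
queries = ["¿Cuál es la capital de Francia?"]
documents = [["París es la capital de Francia.", "Berlín es la capital de Alemania."]]
documents_ids = [["doc_1", "doc_2"]]

# Queries and documents are encoded differently (query-side vs. document-side processing)
queries_embeddings = model.encode(queries, is_query=True)
documents_embeddings = model.encode(documents, is_query=False)

# MaxSim-based reranking of each query's candidate documents
reranked = rank.rerank(
    documents_ids=documents_ids,
    queries_embeddings=queries_embeddings,
    documents_embeddings=documents_embeddings,
)
print(reranked)  # Per query: documents sorted by ColBERT (MaxSim) score
```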
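The changed bullet names `pylate.evaluation.colbert_triplet.ColBERTTripletEvaluator`, but the diff shows no usage of it. Below is a minimal sketch of how that evaluator is typically constructed, following PyLate's documented pattern; the dataset path and the `query`/`positive`/`negative` column names are placeholders, not taken from this repository.

```python
from datasets import load_dataset
from pylate import evaluation, models

model = models.ColBERT("fjmgAI/col1-210M-EuroBERT", trust_remote_code=True)

# Placeholder triplet split and column names (assumptions, not from this commit)
eval_dataset = load_dataset("org/spanish-triplets", split="validation")

triplet_evaluator = evaluation.ColBERTTripletEvaluator(
    anchors=eval_dataset["query"],
    positives=eval_dataset["positive"],
    negatives=eval_dataset["negative"],
)

# Evaluators are callable on the model and report triplet accuracy
results = triplet_evaluator(model)
print(results)
```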