Upload folder using huggingface_hub
- 1_Dense/config.json +1 -0
- 1_Dense/model.safetensors +3 -0
- README.md +526 -0
- added_tokens.json +4 -0
- config.json +26 -0
- config_sentence_transformers.json +49 -0
- model.safetensors +3 -0
- modules.json +14 -0
- sentence_bert_config.json +4 -0
- special_tokens_map.json +31 -0
- tokenizer.json +0 -0
- tokenizer_config.json +72 -0
- vocab.txt +0 -0
1_Dense/config.json
ADDED
@@ -0,0 +1 @@
{"in_features": 768, "out_features": 128, "bias": false, "activation_function": "torch.nn.modules.linear.Identity"}
1_Dense/model.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:9823f35ba3d6089bab61ec7cd8e07456c1e53c6134741d3bf802bc5270b3021f
size 393304
README.md
ADDED
@@ -0,0 +1,526 @@
---
tags:
- ColBERT
- PyLate
- sentence-transformers
- sentence-similarity
- feature-extraction
- generated_from_trainer
- dataset_size:909188
- loss:Contrastive
base_model: colbert-ir/colbertv2.0
datasets:
- baconnier/rag-comprehensive-triplets
pipeline_tag: sentence-similarity
library_name: PyLate
metrics:
- accuracy
model-index:
- name: PyLate model based on colbert-ir/colbertv2.0
  results:
  - task:
      type: col-berttriplet
      name: Col BERTTriplet
    dataset:
      name: Unknown
      type: unknown
    metrics:
    - type: accuracy
      value: 0.9841766953468323
      name: Accuracy
---

# PyLate model based on colbert-ir/colbertv2.0

This is a [PyLate](https://github.com/lightonai/pylate) model finetuned from [colbert-ir/colbertv2.0](https://huggingface.co/colbert-ir/colbertv2.0) on the [rag-comprehensive-triplets](https://huggingface.co/datasets/baconnier/rag-comprehensive-triplets) dataset. It maps sentences & paragraphs to sequences of 128-dimensional dense vectors and can be used for semantic textual similarity using the MaxSim operator.

## Model Details

### Model Description
- **Model Type:** PyLate model
- **Base model:** [colbert-ir/colbertv2.0](https://huggingface.co/colbert-ir/colbertv2.0) <!-- at revision c1e84128e85ef755c096a95bdb06b47793b13acf -->
- **Document Length:** 180 tokens
- **Query Length:** 32 tokens
- **Output Dimensionality:** 128 dimensions per token
- **Similarity Function:** MaxSim
- **Training Dataset:**
    - [rag-comprehensive-triplets](https://huggingface.co/datasets/baconnier/rag-comprehensive-triplets)
<!-- - **Language:** Unknown -->
<!-- - **License:** Unknown -->

### Model Sources

- **Documentation:** [PyLate Documentation](https://lightonai.github.io/pylate/)
- **Repository:** [PyLate on GitHub](https://github.com/lightonai/pylate)
- **Hugging Face:** [PyLate models on Hugging Face](https://huggingface.co/models?library=PyLate)

### Full Model Architecture

```
ColBERT(
  (0): Transformer({'max_seq_length': 179, 'do_lower_case': False}) with Transformer model: BertModel
  (1): Dense({'in_features': 768, 'out_features': 128, 'bias': False, 'activation_function': 'torch.nn.modules.linear.Identity'})
)
```
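
The MaxSim operator scores a query against a document by matching every query token embedding to its best-matching document token embedding and summing the per-token maxima. A minimal illustrative sketch in plain PyTorch (not PyLate's internal implementation):

```python
import torch

def maxsim_score(query_emb: torch.Tensor, doc_emb: torch.Tensor) -> torch.Tensor:
    """query_emb: (num_query_tokens, 128), doc_emb: (num_doc_tokens, 128),
    with rows L2-normalized so dot products are cosine similarities."""
    sim = query_emb @ doc_emb.T            # (q_tokens, d_tokens) similarity matrix
    per_token_max = sim.max(dim=1).values  # best document match per query token
    return per_token_max.sum()             # late-interaction relevance score

# Toy example with random normalized embeddings
q = torch.nn.functional.normalize(torch.randn(32, 128), dim=1)
d = torch.nn.functional.normalize(torch.randn(180, 128), dim=1)
print(maxsim_score(q, d))
```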

## Usage
First install the PyLate library:

```bash
pip install -U pylate
```

### Retrieval

PyLate provides a streamlined interface to index and retrieve documents using ColBERT models. The index leverages the Voyager HNSW index to efficiently handle document embeddings and enable fast retrieval.

#### Indexing documents

First, load the ColBERT model and initialize the Voyager index, then encode and index your documents:

```python
from pylate import indexes, models, retrieve

# Step 1: Load the ColBERT model
model = models.ColBERT(
    model_name_or_path=pylate_model_id,
)

# Step 2: Initialize the Voyager index
index = indexes.Voyager(
    index_folder="pylate-index",
    index_name="index",
    override=True,  # This overwrites the existing index if any
)

# Step 3: Encode the documents
documents_ids = ["1", "2", "3"]
documents = ["document 1 text", "document 2 text", "document 3 text"]

documents_embeddings = model.encode(
    documents,
    batch_size=32,
    is_query=False,  # Ensure that it is set to False to indicate that these are documents, not queries
    show_progress_bar=True,
)

# Step 4: Add document embeddings to the index by providing embeddings and corresponding ids
index.add_documents(
    documents_ids=documents_ids,
    documents_embeddings=documents_embeddings,
)
```

Note that you do not have to recreate the index and encode the documents every time. Once you have created an index and added the documents, you can re-use the index later by loading it:

```python
# To load an index, simply instantiate it with the correct folder/name and without overriding it
index = indexes.Voyager(
    index_folder="pylate-index",
    index_name="index",
)
```

#### Retrieving top-k documents for queries

Once the documents are indexed, you can retrieve the top-k most relevant documents for a given set of queries.
To do so, initialize the ColBERT retriever with the index you want to search in, encode the queries and then retrieve the top-k documents to get the top matches ids and relevance scores:

```python
# Step 1: Initialize the ColBERT retriever
retriever = retrieve.ColBERT(index=index)

# Step 2: Encode the queries
queries_embeddings = model.encode(
    ["query for document 3", "query for document 1"],
    batch_size=32,
    is_query=True,  # Ensure that it is set to True to indicate that these are queries
    show_progress_bar=True,
)

# Step 3: Retrieve top-k documents
scores = retriever.retrieve(
    queries_embeddings=queries_embeddings,
    k=10,  # Retrieve the top 10 matches for each query
)
```

### Reranking
If you only want to use the ColBERT model to perform reranking on top of your first-stage retrieval pipeline without building an index, you can simply use the `rank` function and pass the queries and documents to rerank:

```python
from pylate import rank, models

queries = [
    "query A",
    "query B",
]

documents = [
    ["document A", "document B"],
    ["document 1", "document C", "document B"],
]

documents_ids = [
    [1, 2],
    [1, 3, 2],
]

model = models.ColBERT(
    model_name_or_path=pylate_model_id,
)

queries_embeddings = model.encode(
    queries,
    is_query=True,
)

documents_embeddings = model.encode(
    documents,
    is_query=False,
)

reranked_documents = rank.rerank(
    documents_ids=documents_ids,
    queries_embeddings=queries_embeddings,
    documents_embeddings=documents_embeddings,
)
```

<!--
### Direct Usage (Transformers)

<details><summary>Click to see the direct usage in Transformers</summary>

</details>
-->

<!--
### Downstream Usage (Sentence Transformers)

You can finetune this model on your own dataset.

<details><summary>Click to expand</summary>

</details>
-->

<!--
### Out-of-Scope Use

*List how the model may foreseeably be misused and address what users ought not to do with the model.*
-->

## Evaluation

### Metrics

#### Col BERTTriplet

* Evaluated with <code>pylate.evaluation.colbert_triplet.ColBERTTripletEvaluator</code>

| Metric       | Value      |
|:-------------|:-----------|
| **accuracy** | **0.9842** |
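
This accuracy is triplet accuracy: the fraction of (query, positive, negative) triplets for which the positive document receives the higher MaxSim score. An illustrative check (not the evaluator's exact code), reusing the `maxsim_score` sketch from above:

```python
def triplet_accuracy(triplets: list) -> float:
    """triplets: list of (query_emb, positive_emb, negative_emb) tensors.
    Counts how often the positive out-scores the negative under MaxSim."""
    hits = sum(
        float(maxsim_score(q, pos) > maxsim_score(q, neg))
        for q, pos, neg in triplets
    )
    return hits / len(triplets)
```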

<!--
## Bias, Risks and Limitations

*What are the known or foreseeable issues stemming from this model? You could also flag here known failure cases or weaknesses of the model.*
-->

<!--
### Recommendations

*What are recommendations with respect to the foreseeable issues? For example, filtering explicit content.*
-->

## Training Details

### Training Dataset

#### rag-comprehensive-triplets

* Dataset: [rag-comprehensive-triplets](https://huggingface.co/datasets/baconnier/rag-comprehensive-triplets) at [678e83e](https://huggingface.co/datasets/baconnier/rag-comprehensive-triplets/tree/678e83ed6a74d17c38b33344168abc7787e39754)
* Size: 909,188 training samples
* Columns: <code>query</code>, <code>positive</code>, <code>negative</code>, <code>original_id</code>, <code>dataset_source</code>, <code>category</code>, and <code>language</code>
* Approximate statistics based on the first 1000 samples:
  | | query | positive | negative | original_id | dataset_source | category | language |
  |:--------|:------|:---------|:---------|:------------|:---------------|:---------|:---------|
  | type | string | string | string | string | string | string | string |
  | details | <ul><li>min: 9 tokens</li><li>mean: 26.32 tokens</li><li>max: 32 tokens</li></ul> | <ul><li>min: 4 tokens</li><li>mean: 29.53 tokens</li><li>max: 32 tokens</li></ul> | <ul><li>min: 6 tokens</li><li>mean: 30.37 tokens</li><li>max: 32 tokens</li></ul> | <ul><li>min: 4 tokens</li><li>mean: 5.22 tokens</li><li>max: 6 tokens</li></ul> | <ul><li>min: 21 tokens</li><li>mean: 21.0 tokens</li><li>max: 21 tokens</li></ul> | <ul><li>min: 4 tokens</li><li>mean: 6.25 tokens</li><li>max: 7 tokens</li></ul> | <ul><li>min: 4 tokens</li><li>mean: 4.0 tokens</li><li>max: 4 tokens</li></ul> |
* Samples:
  | query | positive | negative | original_id | dataset_source | category | language |
  |:------|:---------|:---------|:------------|:---------------|:---------|:---------|
  | <code>Dime los nombres de cinco equipos deportivos profesionales que están ubicados en Nueva York</code> | <code>Los equipos deportivos profesionales del estado de Nueva York son los Yankees de Nueva York, los Mets de Nueva York, los Islanders de Nueva York, los Nets de Brooklyn y los Knicks de Nueva York</code> | <code>En el estado de Nueva York, los cinco equipos deportivos profesionales son los Yankees, los Mets, los Rangers, los Nets y los Knicks</code> | <code>12558</code> | <code>argilla/databricks-dolly-15k-curated-multilingual</code> | <code>open_qa</code> | <code>es</code> |
  | <code>¿Qué significa eso de MBB y eso que tiene que ver con las consultoras?</code> | <code>McKinsey & Company es una empresa global de consultoría de gestión fundada en 1926 por el profesor de la Universidad de Chicago James O. McKinsey, que ofrece servicios profesionales a empresas, gobiernos y otras organizaciones. McKinsey es la mayor y más antigua de las "Tres Grandes" consultoras de gestión (MBB), las tres mayores consultoras de estrategia del mundo por ingresos. La empresa se centra principalmente en las finanzas y operaciones de sus clientes.<br><br>Bajo la dirección de Marvin Bower, McKinsey se expandió por Europa durante las décadas de 1940 y 1950. En los años 60, Fred Gluck, de McKinsey, junto con Bruce Henderson, de Boston Consulting Group, Bill Bain, de Bain & Company, y Michael Porter, de Harvard Business School, transformaron la cultura empresarial Una publicación de 1975 de John L. Neuman, de McKinsey, introdujo la práctica empresarial del "análisis del valor de los gastos generales", que contribuyó a una tendencia a la reducción que eliminó muchos pue...</code> | <code>McKinsey & Company es una empresa global de servicios financieros fundada en 1926 por James O. McKinsey, que ofrece servicios de consultoría a empresas y gobiernos.</code> | <code>11131</code> | <code>argilla/databricks-dolly-15k-curated-multilingual</code> | <code>closed_qa</code> | <code>es</code> |
  | <code>Tour de france desde 1903 hasta ahora, ¿alguna vez no ha habido la carrera?</code> | <code>El tour de france se celebra desde 1903, y las únicas cancelaciones destacadas se produjeron con motivo de las dos guerras mundiales</code> | <code>El tour de france ha corrido todos los años desde 1903, excepto durante la Primera Guerra Mundial.</code> | <code>14640</code> | <code>argilla/databricks-dolly-15k-curated-multilingual</code> | <code>closed_qa</code> | <code>es</code> |
* Loss: <code>pylate.losses.contrastive.Contrastive</code>

### Evaluation Dataset

#### rag-comprehensive-triplets

* Dataset: [rag-comprehensive-triplets](https://huggingface.co/datasets/baconnier/rag-comprehensive-triplets) at [678e83e](https://huggingface.co/datasets/baconnier/rag-comprehensive-triplets/tree/678e83ed6a74d17c38b33344168abc7787e39754)
* Size: 909,188 evaluation samples
* Columns: <code>query</code>, <code>positive</code>, <code>negative</code>, <code>original_id</code>, <code>dataset_source</code>, <code>category</code>, and <code>language</code>
* Approximate statistics based on the first 1000 samples:
  | | query | positive | negative | original_id | dataset_source | category | language |
  |:--------|:------|:---------|:---------|:------------|:---------------|:---------|:---------|
  | type | string | string | string | string | string | string | string |
  | details | <ul><li>min: 9 tokens</li><li>mean: 26.28 tokens</li><li>max: 32 tokens</li></ul> | <ul><li>min: 4 tokens</li><li>mean: 28.98 tokens</li><li>max: 32 tokens</li></ul> | <ul><li>min: 6 tokens</li><li>mean: 29.95 tokens</li><li>max: 32 tokens</li></ul> | <ul><li>min: 4 tokens</li><li>mean: 5.18 tokens</li><li>max: 6 tokens</li></ul> | <ul><li>min: 21 tokens</li><li>mean: 21.0 tokens</li><li>max: 21 tokens</li></ul> | <ul><li>min: 4 tokens</li><li>mean: 6.23 tokens</li><li>max: 7 tokens</li></ul> | <ul><li>min: 4 tokens</li><li>mean: 4.0 tokens</li><li>max: 4 tokens</li></ul> |
* Samples:
  | query | positive | negative | original_id | dataset_source | category | language |
  |:------|:---------|:---------|:------------|:---------------|:---------|:---------|
  | <code>Dado el texto, ¿cuál es el tamaño de la población del país de Perú?</code> | <code>Perú tiene una población de más de 34 millones de habitantes.</code> | <code>La población del país de Perú es de más de 32 millones de habitantes.</code> | <code>7265</code> | <code>argilla/databricks-dolly-15k-curated-multilingual</code> | <code>closed_qa</code> | <code>es</code> |
  | <code>¿Cuál es el significado de 'acoplamiento de mareas'?</code> | <code>El acoplamiento de marea o rotación sincrónica es lo que hace que la cara de un objeto astronómico esté fija apuntando a otro, igual que la cara visible de la Luna está siempre apuntando a la Tierra. Un objeto acoplado tarda el mismo tiempo en girar sobre su eje que en trasladarse alrededor de su compañero. Esta rotación sincrónica, también conocida como rotación capturada o rotación sincrónica, hace que un hemisferio apunte continuamente hacia el objeto compañero. Normalmente, sólo el satélite se acopla alrededor de un planeta mayor, pero si la diferencia de masa entre los dos cuerpos y la distancia entre ellos es pequeña, ambos objetos pueden tener un acoplamiento de marea recíproco, como Plutón y Caronte.</code> | <code>El acoplamiento de mareas es un fenómeno astronómico en el que un objeto gira alrededor de otro objeto, manteniendo siempre la misma distancia hacia él.</code> | <code>2892</code> | <code>argilla/databricks-dolly-15k-curated-multilingual</code> | <code>closed_qa</code> | <code>es</code> |
  | <code>¿Trópicos a la nieve fue co-ganadora de la Medalla de Oro del Instituto Australiano de Cinematografía a la Mejor Película en 1965?</code> | <code>Del trópico a la nieve es un cortometraje documental australiano de 1964. Fue una de las películas australianas más conocidas de los años 60. Se produjo bajo los auspicios de la Unidad Cinematográfica de la Commonwealth (CFU), más tarde reincorporada como Film Australia. Fue codirigido por Jack Lee y Richard (Dick) Mason, y contó con Reg Livermore (en su primer papel cinematográfico) como uno de los "narradores".<br><br>Una de las tareas habituales de la UFC en aquella época era la producción de cortometrajes destinados a su distribución en el extranjero y cuyo objetivo era promocionar Australia como destino atractivo para emigrantes y turistas. Del trópico a la nieve supuso una ruptura significativa con el estilo tradicional de este tipo de largometrajes, y destaca por su enfoque subversivo y satírico del tema.<br><br>En lugar de utilizar la narración forzada y autoritaria a una sola voz, típica de los "documentales de promoción" de la época, Mason y Lee optaron por un enfoque so...</code> | <code>Del trópico a la nieve es un documental australiano de 1964 que ganó el premio al mejor director en 1965.</code> | <code>6611</code> | <code>argilla/databricks-dolly-15k-curated-multilingual</code> | <code>information_extraction</code> | <code>es</code> |
* Loss: <code>pylate.losses.contrastive.Contrastive</code>
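
For intuition, the Contrastive loss named above can be read as a softmax cross-entropy over MaxSim scores that pushes each query's positive document above its negatives. A minimal sketch in plain PyTorch, assuming the scores are already computed (illustrative, not PyLate's exact implementation):

```python
import torch
import torch.nn.functional as F

def contrastive_loss(scores: torch.Tensor) -> torch.Tensor:
    """scores: (batch, 1 + num_negatives) MaxSim scores where column 0
    holds each query's positive document and the rest are negatives."""
    # The positive should receive the highest score, so treat this as
    # classification with target class 0 for every query.
    labels = torch.zeros(scores.shape[0], dtype=torch.long)
    return F.cross_entropy(scores, labels)

# Toy example: 16 queries, each scored against 1 positive + 15 negatives
print(contrastive_loss(torch.randn(16, 16)))
```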

### Training Hyperparameters
#### Non-Default Hyperparameters

- `eval_strategy`: steps
- `per_device_train_batch_size`: 16
- `per_device_eval_batch_size`: 16
- `learning_rate`: 2e-05
- `num_train_epochs`: 1
- `fp16`: True
- `load_best_model_at_end`: True
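
For reproduction, these values plug into the standard sentence-transformers training loop that PyLate builds on. A minimal sketch following PyLate's documented trainer pattern; the collator name and exact signatures are assumptions to verify against the PyLate docs:

```python
from datasets import load_dataset
from sentence_transformers import (
    SentenceTransformerTrainer,
    SentenceTransformerTrainingArguments,
)
from pylate import losses, models, utils

model = models.ColBERT(model_name_or_path="colbert-ir/colbertv2.0")
train_dataset = load_dataset("baconnier/rag-comprehensive-triplets", split="train")

# Non-default hyperparameters from the list above (eval/checkpoint options omitted)
args = SentenceTransformerTrainingArguments(
    output_dir="output",
    num_train_epochs=1,
    per_device_train_batch_size=16,
    learning_rate=2e-5,
    fp16=True,
)

trainer = SentenceTransformerTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    loss=losses.Contrastive(model=model),
    # Assumption: PyLate's collator that tokenizes queries/documents separately
    data_collator=utils.ColBERTCollator(model.tokenize),
)
trainer.train()
```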

#### All Hyperparameters
<details><summary>Click to expand</summary>

- `overwrite_output_dir`: False
- `do_predict`: False
- `eval_strategy`: steps
- `prediction_loss_only`: True
- `per_device_train_batch_size`: 16
- `per_device_eval_batch_size`: 16
- `per_gpu_train_batch_size`: None
- `per_gpu_eval_batch_size`: None
- `gradient_accumulation_steps`: 1
- `eval_accumulation_steps`: None
- `torch_empty_cache_steps`: None
- `learning_rate`: 2e-05
- `weight_decay`: 0.0
- `adam_beta1`: 0.9
- `adam_beta2`: 0.999
- `adam_epsilon`: 1e-08
- `max_grad_norm`: 1.0
- `num_train_epochs`: 1
- `max_steps`: -1
- `lr_scheduler_type`: linear
- `lr_scheduler_kwargs`: {}
- `warmup_ratio`: 0.0
- `warmup_steps`: 0
- `log_level`: passive
- `log_level_replica`: warning
- `log_on_each_node`: True
- `logging_nan_inf_filter`: True
- `save_safetensors`: True
- `save_on_each_node`: False
- `save_only_model`: False
- `restore_callback_states_from_checkpoint`: False
- `no_cuda`: False
- `use_cpu`: False
- `use_mps_device`: False
- `seed`: 42
- `data_seed`: None
- `jit_mode_eval`: False
- `use_ipex`: False
- `bf16`: False
- `fp16`: True
- `fp16_opt_level`: O1
- `half_precision_backend`: auto
- `bf16_full_eval`: False
- `fp16_full_eval`: False
- `tf32`: None
- `local_rank`: 0
- `ddp_backend`: None
- `tpu_num_cores`: None
- `tpu_metrics_debug`: False
- `debug`: []
- `dataloader_drop_last`: False
- `dataloader_num_workers`: 0
- `dataloader_prefetch_factor`: None
- `past_index`: -1
- `disable_tqdm`: False
- `remove_unused_columns`: True
- `label_names`: None
- `load_best_model_at_end`: True
- `ignore_data_skip`: False
- `fsdp`: []
- `fsdp_min_num_params`: 0
- `fsdp_config`: {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}
- `fsdp_transformer_layer_cls_to_wrap`: None
- `accelerator_config`: {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}
- `deepspeed`: None
- `label_smoothing_factor`: 0.0
- `optim`: adamw_torch
- `optim_args`: None
- `adafactor`: False
- `group_by_length`: False
- `length_column_name`: length
- `ddp_find_unused_parameters`: None
- `ddp_bucket_cap_mb`: None
- `ddp_broadcast_buffers`: False
- `dataloader_pin_memory`: True
- `dataloader_persistent_workers`: False
- `skip_memory_metrics`: True
- `use_legacy_prediction_loop`: False
- `push_to_hub`: False
- `resume_from_checkpoint`: None
- `hub_model_id`: None
- `hub_strategy`: every_save
- `hub_private_repo`: None
- `hub_always_push`: False
- `gradient_checkpointing`: False
- `gradient_checkpointing_kwargs`: None
- `include_inputs_for_metrics`: False
- `include_for_metrics`: []
- `eval_do_concat_batches`: True
- `fp16_backend`: auto
- `push_to_hub_model_id`: None
- `push_to_hub_organization`: None
- `mp_parameters`:
- `auto_find_batch_size`: False
- `full_determinism`: False
- `torchdynamo`: None
- `ray_scope`: last
- `ddp_timeout`: 1800
- `torch_compile`: False
- `torch_compile_backend`: None
- `torch_compile_mode`: None
- `dispatch_batches`: None
- `split_batches`: None
- `include_tokens_per_second`: False
- `include_num_input_tokens_seen`: False
- `neftune_noise_alpha`: None
- `optim_target_modules`: None
- `batch_eval_metrics`: False
- `eval_on_start`: False
- `use_liger_kernel`: False
- `eval_use_gather_object`: False
- `average_tokens_across_devices`: False
- `prompts`: None
- `batch_sampler`: batch_sampler
- `multi_dataset_batch_sampler`: proportional

</details>

### Training Logs
| Epoch      | Step     | Training Loss | Validation Loss | accuracy |
|:----------:|:--------:|:-------------:|:---------------:|:--------:|
| 0.0538     | 500      | 1.1908        | -               | -        |
| 0          | 0        | -             | -               | 0.7445   |
| 0.0538     | 500      | -             | 0.8940          | -        |
| 0.1076     | 1000     | 0.879         | -               | -        |
| 0          | 0        | -             | -               | 0.7974   |
| 0.1076     | 1000     | -             | 0.7308          | -        |
| 0.1615     | 1500     | 0.7334        | -               | -        |
| 0          | 0        | -             | -               | 0.8312   |
| 0.1615     | 1500     | -             | 0.6040          | -        |
| 0.2153     | 2000     | 0.6319        | -               | -        |
| 0          | 0        | -             | -               | 0.8508   |
| 0.2153     | 2000     | -             | 0.5380          | -        |
| 0.2691     | 2500     | 0.5576        | -               | -        |
| 0          | 0        | -             | -               | 0.8831   |
| 0.2691     | 2500     | -             | 0.4493          | -        |
| 0.3229     | 3000     | 0.5005        | -               | -        |
| 0          | 0        | -             | -               | 0.9082   |
| 0.3229     | 3000     | -             | 0.3969          | -        |
| 0.3767     | 3500     | 0.4591        | -               | -        |
| 0          | 0        | -             | -               | 0.9267   |
| 0.3767     | 3500     | -             | 0.3430          | -        |
| 0.4306     | 4000     | 0.3944        | -               | -        |
| 0          | 0        | -             | -               | 0.9293   |
| 0.4306     | 4000     | -             | 0.2984          | -        |
| 0.4844     | 4500     | 0.3674        | -               | -        |
| 0          | 0        | -             | -               | 0.9438   |
| 0.4844     | 4500     | -             | 0.2657          | -        |
| 0.5382     | 5000     | 0.3351        | -               | -        |
| 0          | 0        | -             | -               | 0.9492   |
| 0.5382     | 5000     | -             | 0.2365          | -        |
| 0.5920     | 5500     | 0.3019        | -               | -        |
| 0          | 0        | -             | -               | 0.9614   |
| 0.5920     | 5500     | -             | 0.2020          | -        |
| 0.6459     | 6000     | 0.2769        | -               | -        |
| 0          | 0        | -             | -               | 0.9637   |
| 0.6459     | 6000     | -             | 0.1839          | -        |
| 0.6997     | 6500     | 0.2578        | -               | -        |
| 0          | 0        | -             | -               | 0.9738   |
| 0.6997     | 6500     | -             | 0.1623          | -        |
| 0.7535     | 7000     | 0.2362        | -               | -        |
| 0          | 0        | -             | -               | 0.9761   |
| 0.7535     | 7000     | -             | 0.1528          | -        |
| 0.8073     | 7500     | 0.2239        | -               | -        |
| 0          | 0        | -             | -               | 0.9791   |
| 0.8073     | 7500     | -             | 0.1407          | -        |
| 0.8611     | 8000     | 0.2069        | -               | -        |
| 0          | 0        | -             | -               | 0.9802   |
| 0.8611     | 8000     | -             | 0.1339          | -        |
| 0.9150     | 8500     | 0.2067        | -               | -        |
| 0          | 0        | -             | -               | 0.9830   |
| 0.9150     | 8500     | -             | 0.1290          | -        |
| **0.9688** | **9000** | **0.1935**    | **-**           | **-**    |
| 0          | 0        | -             | -               | 0.9842   |
| **0.9688** | **9000** | **-**         | **0.1221**      | **-**    |

* The bold row denotes the saved checkpoint.

### Framework Versions
- Python: 3.10.12
- Sentence Transformers: 3.4.1
- PyLate: 1.1.7
- Transformers: 4.48.2
- PyTorch: 2.5.1+cu121
- Accelerate: 1.2.1
- Datasets: 3.3.1
- Tokenizers: 0.21.0

## Citation

### BibTeX

#### Sentence Transformers
```bibtex
@inproceedings{reimers-2019-sentence-bert,
    title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
    author = "Reimers, Nils and Gurevych, Iryna",
    booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
    month = "11",
    year = "2019",
    publisher = "Association for Computational Linguistics",
    url = "https://arxiv.org/abs/1908.10084"
}
```

#### PyLate
```bibtex
@misc{PyLate,
    title={PyLate: Flexible Training and Retrieval for Late Interaction Models},
    author={Chaffin, Antoine and Sourty, Raphaël},
    url={https://github.com/lightonai/pylate},
    year={2024}
}
```

<!--
## Glossary

*Clearly define terms in order to be accessible across audiences.*
-->

<!--
## Model Card Authors

*Lists the people who create the model card, providing recognition and accountability for the detailed work that goes into its construction.*
-->

<!--
## Model Card Contact

*Provides a way for people who have updates to the Model Card, suggestions, or questions, to contact the Model Card authors.*
-->
added_tokens.json
ADDED
@@ -0,0 +1,4 @@
{
  "[D] ": 30523,
  "[Q] ": 30522
}
config.json
ADDED
@@ -0,0 +1,26 @@
{
  "_name_or_path": "colbert-ir/colbertv2.0",
  "architectures": [
    "BertModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.48.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30524
}
config_sentence_transformers.json
ADDED
@@ -0,0 +1,49 @@
{
  "__version__": {
    "sentence_transformers": "3.4.1",
    "transformers": "4.48.2",
    "pytorch": "2.5.1+cu121"
  },
  "prompts": {},
  "default_prompt_name": null,
  "similarity_fn_name": "MaxSim",
  "query_prefix": "[Q] ",
  "document_prefix": "[D] ",
  "query_length": 32,
  "document_length": 180,
  "attend_to_expansion_tokens": false,
  "skiplist_words": [
    "!",
    "\"",
    "#",
    "$",
    "%",
    "&",
    "'",
    "(",
    ")",
    "*",
    "+",
    ",",
    "-",
    ".",
    "/",
    ":",
    ";",
    "<",
    "=",
    ">",
    "?",
    "@",
    "[",
    "\\",
    "]",
    "^",
    "_",
    "`",
    "{",
    "|",
    "}",
    "~"
  ]
}
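
These settings drive ColBERT-style input marking and query expansion: queries get the "[Q] " prefix and are padded out to `query_length` (32) with [MASK] expansion tokens, while documents get the "[D] " prefix and are truncated at `document_length` (180). A rough pure-Python sketch of the idea (illustrative, not PyLate's exact preprocessing):

```python
def prepare_query(tokens: list[str], query_length: int = 32) -> list[str]:
    """Prefix with the [Q] marker, then pad with [MASK] expansion tokens."""
    tokens = (["[CLS]", "[Q] "] + tokens)[:query_length]
    return tokens + ["[MASK]"] * (query_length - len(tokens))

def prepare_document(tokens: list[str], document_length: int = 180) -> list[str]:
    """Prefix with the [D] marker; documents are truncated, not expanded."""
    return (["[CLS]", "[D] "] + tokens)[:document_length]

print(prepare_query(["what", "is", "max", "##sim", "?"]))
```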
model.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:9a5c738e7954167a2a260c80b010fe2e1634e9bf303f5b8591d01e2582a8900d
size 437957472
modules.json
ADDED
@@ -0,0 +1,14 @@
[
  {
    "idx": 0,
    "name": "0",
    "path": "",
    "type": "sentence_transformers.models.Transformer"
  },
  {
    "idx": 1,
    "name": "1",
    "path": "1_Dense",
    "type": "pylate.models.Dense.Dense"
  }
]
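
modules.json chains the BERT encoder to the 1_Dense projection declared in 1_Dense/config.json; functionally, that second module is a bias-free per-token linear map from 768 to 128 dimensions with no activation. An equivalent sketch in plain PyTorch:

```python
import torch

# Equivalent of 1_Dense/config.json: in_features=768, out_features=128,
# bias=False, identity activation (i.e. no nonlinearity).
projection = torch.nn.Linear(768, 128, bias=False)

token_embeddings = torch.randn(180, 768)           # one document's BERT outputs
colbert_embeddings = projection(token_embeddings)  # -> (180, 128) per-token vectors
print(colbert_embeddings.shape)
```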
sentence_bert_config.json
ADDED
@@ -0,0 +1,4 @@
{
  "max_seq_length": 179,
  "do_lower_case": false
}
special_tokens_map.json
ADDED
@@ -0,0 +1,31 @@
{
  "cls_token": {
    "content": "[CLS]",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "mask_token": {
    "content": "[MASK]",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "pad_token": "[MASK]",
  "sep_token": {
    "content": "[SEP]",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "unk_token": {
    "content": "[UNK]",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  }
}
tokenizer.json
ADDED
The diff for this file is too large to render. See raw diff.
tokenizer_config.json
ADDED
@@ -0,0 +1,72 @@
{
  "added_tokens_decoder": {
    "0": {
      "content": "[PAD]",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "100": {
      "content": "[UNK]",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "101": {
      "content": "[CLS]",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "102": {
      "content": "[SEP]",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "103": {
      "content": "[MASK]",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "30522": {
      "content": "[Q] ",
      "lstrip": false,
      "normalized": true,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "30523": {
      "content": "[D] ",
      "lstrip": false,
      "normalized": true,
      "rstrip": false,
      "single_word": false,
      "special": false
    }
  },
  "clean_up_tokenization_spaces": false,
  "cls_token": "[CLS]",
  "do_lower_case": true,
  "extra_special_tokens": {},
  "mask_token": "[MASK]",
  "model_max_length": 512,
  "pad_token": "[MASK]",
  "sep_token": "[SEP]",
  "strip_accents": null,
  "tokenize_chinese_chars": true,
  "tokenizer_class": "BertTokenizer",
  "unk_token": "[UNK]"
}
vocab.txt
ADDED
The diff for this file is too large to render. See raw diff.