Patch Sentence Transformers integration (#2)

- Patch Sentence Transformers implementation (6398ee4b03b8b7747bbed4483c6fff6e3504a320)

Files changed:
- {1_Pool → 1_Pooling}/config.json  +1 -1
- README.md  +5 -7
- config_sentence_transformers.json  +25 -1
- modules.json  +6 -0
- sentence_bert_config.json  +3 -0
{1_Pool → 1_Pooling}/config.json
RENAMED

@@ -6,5 +6,5 @@
   "pooling_mode_mean_sqrt_len_tokens": false,
   "pooling_mode_weightedmean_tokens": false,
   "pooling_mode_lasttoken": false,
-  "include_prompt":
+  "include_prompt": true
 }
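For reference, this key maps to the `include_prompt` argument of the Sentence Transformers `Pooling` module (available in recent versions), which controls whether the instruction tokens are averaged into the sentence embedding. A minimal sketch of the module this file configures; the `pooling_mode` and `word_embedding_dimension` values are assumptions for illustration and should be read from the full `1_Pooling/config.json`, which this hunk only shows part of:

```python
from sentence_transformers.models import Pooling

# Sketch of the module described by 1_Pooling/config.json.
# pooling_mode and word_embedding_dimension are placeholders here;
# the authoritative values live in the full config file.
pooling = Pooling(
    word_embedding_dimension=2304,  # placeholder; use the value from config.json
    pooling_mode="mean",            # assumption; the flags above only rule other modes out
    include_prompt=True,            # the value this patch sets
)
```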
README.md
CHANGED

@@ -262,6 +262,7 @@ model-index:
 pipeline_tag: feature-extraction
 tags:
 - mteb
+- sentence-transformers
 library_name: transformers
 ---
 ## MiniCPM-Embedding

@@ -401,21 +402,18 @@ import torch
 from sentence_transformers import SentenceTransformer
 
 model_name = "openbmb/MiniCPM-Embedding"
-model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={"attn_implementation":"flash_attention_2", "torch_dtype":torch.float16})
-model.max_seq_length = 512
-model.tokenizer.padding_side="right"
+model = SentenceTransformer(model_name, trust_remote_code=True, model_kwargs={"attn_implementation": "flash_attention_2", "torch_dtype": torch.float16})
 
 queries = ["中国的首都是哪里?"]
 passages = ["beijing", "shanghai"]
 
-
 INSTRUCTION = "Query: "
 
-embeddings_query = model.encode(queries, prompt=INSTRUCTION
-embeddings_doc = model.encode(passages
+embeddings_query = model.encode(queries, prompt=INSTRUCTION)
+embeddings_doc = model.encode(passages)
 
 scores = (embeddings_query @ embeddings_doc.T)
-print(scores.tolist()) # [[0.
+print(scores.tolist()) # [[0.35365450382232666, 0.18592746555805206]]
 ```
 
 ## 实验结果 Evaluation Results
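The patched snippet should run as-is on a GPU with FlashAttention-2 installed. As a hedged variant for hosts without it, the `attn_implementation` override can simply be dropped; everything else follows the patched README (a sketch, not a verified result):

```python
import torch
from sentence_transformers import SentenceTransformer

# Variant of the README example without the flash_attention_2 requirement.
model = SentenceTransformer(
    "openbmb/MiniCPM-Embedding",
    trust_remote_code=True,
    model_kwargs={"torch_dtype": torch.float16},
)

queries = ["中国的首都是哪里?"]   # "What is the capital of China?"
passages = ["beijing", "shanghai"]

embeddings_query = model.encode(queries, prompt="Query: ")
embeddings_doc = model.encode(passages)

print((embeddings_query @ embeddings_doc.T).tolist())
```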
config_sentence_transformers.json
CHANGED

@@ -4,6 +4,30 @@
     "transformers": "4.37.2",
     "pytorch": "2.0.1+cu121"
   },
-  "prompts": {
+  "prompts": {
+    "fiqa": "Instruction: Given a financial question, retrieve user replies that best answer the question. Query: ",
+    "dbpedia": "Instruction: Given a query, retrieve relevant entity descriptions from DBPedia. Query: ",
+    "CmedqaRetrieval": "Instruction: 为这个医疗问题检索相关回答。 Query: ",
+    "nfcorpus": "Instruction: Given a question, retrieve relevant documents that best answer the question. Query: ",
+    "touche2020": "Instruction: Given a question, retrieve detailed and persuasive arguments that answer the question. Query: ",
+    "CovidRetrieval": "Instruction: 为这个问题检索相关政策回答。 Query: ",
+    "scifact": "Instruction: Given a scientific claim, retrieve documents that support or refute the claim. Query: ",
+    "scidocs": "Instruction: Given a scientific paper title, retrieve paper abstracts that are cited by the given paper. Query: ",
+    "nq": "Instruction: Given a question, retrieve Wikipedia passages that answer the question. Query: ",
+    "T2Retrieval": "Instruction: 为这个问题检索相关段落。 Query: ",
+    "VideoRetrieval": "Instruction: 为这个电影标题检索相关段落。 Query: ",
+    "DuRetrieval": "Instruction: 为这个问题检索相关百度知道回答。 Query: ",
+    "MMarcoRetrieval": "Instruction: 为这个查询检索相关段落。 Query: ",
+    "hotpotqa": "Instruction: Given a multi-hop question, retrieve documents that can help answer the question. Query: ",
+    "quora": "Instruction: Given a question, retrieve questions that are semantically equivalent to the given question. Query: ",
+    "climate-fever": "Instruction: Given a claim about climate change, retrieve documents that support or refute the claim. Query: ",
+    "arguana": "Instruction: Given a claim, find documents that refute the claim. Query: ",
+    "fever": "Instruction: Given a claim, retrieve documents that support or refute the claim. Query: ",
+    "trec-covid": "Instruction: Given a query on COVID-19, retrieve documents that answer the query. Query: ",
+    "msmarco": "Instruction: Given a web search query, retrieve relevant passages that answer the query. Query: ",
+    "EcomRetrieval": "Instruction: 为这个查询检索相关商品标题。 Query: ",
+    "MedicalRetrieval": "Instruction: 为这个医学问题检索相关回答。 Query: ",
+    "CAQstack":"Instruction: Given a question, retrieve detailed question descriptions from Stackexchange that are duplicates to the given question. Query: "
+  },
   "default_prompt_name": null
 }
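With these named prompts in place, an instruction can be selected by key via the `prompt_name` argument of `encode` instead of passing the string manually. A short sketch using the `nq` key added above (the query and passage texts are only illustrative):

```python
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("openbmb/MiniCPM-Embedding", trust_remote_code=True)

# prompt_name looks the instruction up in config_sentence_transformers.json;
# "nq" is one of the keys introduced by this patch.
query_emb = model.encode(["who founded wikipedia"], prompt_name="nq")
doc_emb = model.encode(["Wikipedia was launched by Jimmy Wales and Larry Sanger."])

print(query_emb @ doc_emb.T)
```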
modules.json
CHANGED

@@ -10,5 +10,11 @@
     "name": "1",
     "path": "1_Pooling",
     "type": "sentence_transformers.models.Pooling"
-  }
+  },
+  {
+    "idx": 2,
+    "name": "2",
+    "path": "2_Normalize",
+    "type": "sentence_transformers.models.Normalize"
+  }
 ]
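Appending the `Normalize` module means `encode` now returns unit-length vectors, so the dot product in the README example acts as a cosine similarity. A quick sanity check, sketched under the assumption that the patched repo is what gets loaded:

```python
import numpy as np
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("openbmb/MiniCPM-Embedding", trust_remote_code=True)
emb = model.encode(["beijing", "shanghai"])

# With 2_Normalize in the module pipeline these norms should all be ~1.0.
print(np.linalg.norm(emb, axis=1))
```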
sentence_bert_config.json
ADDED

@@ -0,0 +1,3 @@
+{
+  "max_seq_length": 512
+}
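This new file is what lets the patched README drop the manual `model.max_seq_length = 512` line: Sentence Transformers reads `sentence_bert_config.json` at load time. A quick check (sketch):

```python
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("openbmb/MiniCPM-Embedding", trust_remote_code=True)

# max_seq_length is now picked up from sentence_bert_config.json,
# so the explicit assignment removed from the README is no longer needed.
print(model.max_seq_length)  # expected: 512
```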