| from argparse import ArgumentParser, Namespace |
| from typing import List, Optional |
|
|
| from model_api import SionicEmbeddingModel |
| from mteb import MTEB |
|
|
# Names of the MTEB tasks this script treats as retrieval tasks. The
# evaluation loop passes the query instruction to the model only for tasks
# listed here (and for any task whose name contains 'CQADupstack').
RETRIEVAL_TASKS: List[str] = [
    "ArguAna", "ClimateFEVER", "DBPedia", "FEVER", "FiQA2018",
    "HotpotQA", "MSMARCO", "NFCorpus", "NQ", "QuoraRetrieval",
    "SCIDOCS", "SciFact", "Touche2020", "TRECCOVID",
]
|
|
|
|
def get_arguments() -> Namespace:
    """Parse the command-line options of the evaluation script.

    Recognised options (all optional):
        --url          api server url
        --instruction  query instruction
        --batch_size   integer, defaults to 128
        --dimension    integer, defaults to 3072
        --output_dir   string, defaults to './result/v2'

    Returns:
        The ``Namespace`` produced by parsing ``sys.argv``.
    """
    # Each entry is (flag, keyword arguments for ``add_argument``); keeping
    # them in one table makes the full CLI surface visible at a glance.
    option_table = (
        ('--url', {'type': str, 'default': 'https://api.sionic.ai/v2/embedding', 'help': 'api server url'}),
        ('--instruction', {'type': str, 'default': 'query: ', 'help': 'query instruction'}),
        ('--batch_size', {'type': int, 'default': 128}),
        ('--dimension', {'type': int, 'default': 3072}),
        ('--output_dir', {'type': str, 'default': './result/v2'}),
    )

    cli = ArgumentParser()
    for flag, spec in option_table:
        cli.add_argument(flag, **spec)
    return cli.parse_args()
|
|
|
|
if __name__ == '__main__':
    # Script entry point: evaluate the Sionic embedding API on every English
    # MTEB task, one task at a time, writing results under --output_dir.
    cli_args = get_arguments()

    embedder = SionicEmbeddingModel(
        url=cli_args.url,
        instruction=cli_args.instruction,
        batch_size=cli_args.batch_size,
        dimension=cli_args.dimension,
    )

    # Collect the names of all tasks MTEB selects for English.
    english_tasks: List[str] = []
    for registered in MTEB(task_types=None, task_langs=['en']).tasks:
        english_tasks.append(registered.description['name'])

    for task_name in english_tasks:
        # MSMARCOv2 is deliberately left out of the run.
        if task_name == 'MSMARCOv2':
            continue

        # Only CQADupstack tasks and the listed retrieval tasks get the query
        # instruction; for every other task it is cleared on the model.
        wants_instruction = ('CQADupstack' in task_name) or (task_name in RETRIEVAL_TASKS)
        task_instruction: Optional[str] = cli_args.instruction if wants_instruction else None
        embedder.instruction = task_instruction

        # MSMARCO is scored on its 'dev' split, everything else on 'test'.
        split = 'dev' if task_name == 'MSMARCO' else 'test'
        runner = MTEB(tasks=[task_name], task_langs=['en'], eval_splits=[split])
        runner.run(embedder, output_folder=cli_args.output_dir)
|
|