# caption_retrieval / create_corpus.py
# anhdt-dsai-02's picture
# Create create_corpus.py
# 32a572c verified
# raw
# history blame contribute delete
# 360 Bytes
import datasets
import bm25s
from bm25s.hf import BM25HF
import json
# Load the caption dataset and use the "caption" column of its "train"
# split as the retrieval corpus.
dataset = datasets.load_dataset("anhdt-dsai-02/test_image_dataset_1_2_3_4")
train_split = dataset["train"]
corpus = train_split["caption"]

# Build the BM25 retriever: it is handed the raw corpus and indexed over the
# tokenized form of that same corpus.
retriever = BM25HF(corpus=corpus)
corpus_tokens = bm25s.tokenize(corpus)
retriever.index(corpus_tokens)

# Namespace under which the index is published.
# NOTE(review): no token is passed to save_to_hub here, so authentication
# presumably relies on credentials configured elsewhere — confirm.
user = "anhdt-dsai-02"
target_repo = f"{user}/caption_1_2_3_4"
retriever.save_to_hub(target_repo)