In [1]:
import datasets
from datasets import Dataset
import numpy as np
import json
import os
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F
from tqdm.notebook import tqdm as tqdm

# Location passed to Dataset.load_from_disk when the articles are read.
ARTICLES_PATH = '/kaggle/input/ysda-ml-02-05-process-json/articles.hf'
# Destination file that np.save writes the generated embeddings to.
OUTPUT_PATH = '/kaggle/working/embeddings.npy'

In [2]:
# Prefer the first GPU, but fall back to CPU so the notebook still runs
# on a machine without CUDA instead of failing at `.to(device)`.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
# Single place to name the checkpoint so tokenizer and model cannot drift apart.
MODEL_NAME = 'sentence-transformers/all-MiniLM-L6-v2'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME).to(device)

Downloading (…)okenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

In [3]:
def mean_pooling(model_output, attention_mask):
    """Average the token embeddings over the positions kept by the mask.

    Args:
        model_output: Indexable model result; element 0 is used as the
            per-token embeddings (presumably (batch, seq_len, hidden) --
            confirm against the model in use).
        attention_mask: Tensor with one entry per token; it gets a trailing
            axis added and is expanded to the token-embedding shape.

    Returns:
        Tensor holding, for every sequence, the mask-weighted sum along
        dimension 1 divided by the mask total along the same dimension.
    """
    hidden_states = model_output[0]
    # Stretch the mask so it can be multiplied element-wise with the embeddings.
    mask = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    masked_total = (hidden_states * mask).sum(dim=1)
    # A lower bound on the denominator prevents division by zero when a
    # sequence has no unmasked position.
    mask_total = mask.sum(dim=1).clamp(min=1e-9)
    return masked_total / mask_total


def get_embedding(query, model, tokenizer):
    """Turn text into L2-normalized, mean-pooled float16 embeddings.

    Args:
        query: Text (or batch of texts) handed to ``tokenizer`` as-is.
        model: Torch module called with the tokenized inputs; its output is
            reduced with ``mean_pooling``.
        tokenizer: Callable invoked with padding and truncation enabled and
            ``return_tensors='pt'``; the result must provide
            ``'attention_mask'`` and support ``.to(device)``.

    Returns:
        A NumPy ``float16`` array of the pooled embeddings, normalized to
        unit L2 norm along dimension 1.
    """
    # Send the inputs to wherever the model's weights live rather than to a
    # notebook-level global, so the function works for any passed-in model.
    model_device = next(model.parameters()).device
    encoded_input = tokenizer(
        query, padding=True, truncation=True, return_tensors='pt'
    ).to(model_device)
    # Inference only: skip gradient bookkeeping.
    with torch.no_grad():
        model_output = model(**encoded_input)
    pooled = mean_pooling(model_output, encoded_input['attention_mask']).cpu()
    normalized = F.normalize(pooled, p=2, dim=1)
    return normalized.numpy().astype(np.float16)


def generate_embeddings(articles, batch_size):
    """Embed every abstract in ``articles``, ``batch_size`` texts at a time.

    Uses the notebook-level ``model`` and ``tokenizer`` through
    ``get_embedding`` and shows a tqdm progress bar over the batches.

    Args:
        articles: Mapping whose ``'abstract'`` entry is a sliceable sequence
            of texts.
        batch_size: Number of abstracts passed to ``get_embedding`` per call.

    Returns:
        A list with one embedding row per abstract, in input order.
    """
    # Fetch the column once and size the loop from it: the batches are cut
    # from the abstracts, so their count should not depend on another column,
    # and the lookup need not be repeated for every batch.
    abstracts = articles['abstract']
    embeddings = []
    for start in tqdm(range(0, len(abstracts), batch_size)):
        batch_abstracts = abstracts[start:start + batch_size]
        batch_embeddings = get_embedding(batch_abstracts, model, tokenizer)
        # Iterating the 2-D result yields one row per abstract.
        embeddings.extend(batch_embeddings)
    return embeddings

In [4]:
# Read the stored dataset, then convert it with to_dict() so the columns
# can be indexed by name and sliced batch by batch.
articles_on_disk = Dataset.load_from_disk(ARTICLES_PATH)
articles = articles_on_disk.to_dict()

# Encode all abstracts in batches of 128.
embeddings = generate_embeddings(articles, batch_size=128)

# Persist the result to OUTPUT_PATH.
np.save(file=OUTPUT_PATH, arr=embeddings)

  0%|          | 0/17492 [00:00<?, ?it/s]