Spaces:
Runtime error
Runtime error
import torch | |
import logging | |
import numpy as np | |
from transformers import AutoTokenizer, AutoModel | |
class LanguageModel:
    """Turn free text into a single feature vector using a pre-trained model.

    The text is split into character-bounded chunks, every chunk is
    embedded with attention-masked mean pooling, and the chunk embeddings
    are averaged into one vector.
    """

    def __init__(self, pre_trained_model_path, max_len=1000):
        """Load the tokenizer and model for a pre-trained checkpoint.

        Args:
            pre_trained_model_path: Value forwarded to
                ``AutoTokenizer.from_pretrained`` and
                ``AutoModel.from_pretrained``.
            max_len: Maximum number of characters per chunk produced by
                ``get_chunks``.
        """
        self.max_len = max_len
        logging.info(f"Loading tokenizer for {pre_trained_model_path}")
        self.tokenizer = AutoTokenizer.from_pretrained(pre_trained_model_path)
        logging.info(f"Loading model for {pre_trained_model_path}")
        self.model = AutoModel.from_pretrained(
            pre_trained_model_path,
        )

    def mean_pooling(self, model_output, attention_mask):
        """Average token embeddings, ignoring positions masked out as padding.

        Args:
            model_output: Model result whose first element holds the
                per-token embeddings (assumed to be shaped
                batch x tokens x hidden -- confirm for non-standard models).
            attention_mask: Tensor with one entry per token; it is broadcast
                over the hidden dimension and used as the pooling weight.

        Returns:
            One pooled embedding per input sequence.
        """
        token_embeddings = model_output[0]
        input_mask_expanded = (
            attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        )
        summed = torch.sum(token_embeddings * input_mask_expanded, 1)
        # Clamp the divisor so a fully masked row cannot divide by zero.
        counts = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
        return summed / counts

    def get_chunks(self, s):
        """Yield consecutive pieces of ``s`` of at most ``self.max_len`` characters.

        Chunks are cut at the last space inside the current window, and that
        space is dropped. When the window contains no usable space (for
        example one very long word) the text is cut hard at ``max_len``
        characters, so no text is lost or repeated.

        Raises:
            ValueError: If ``self.max_len`` is smaller than 1.
        """
        max_len = self.max_len
        if max_len < 1:
            raise ValueError(f"max_len must be >= 1, got {max_len}")
        start = 0
        while start + max_len < len(s):
            end = s.rfind(" ", start, start + max_len + 1)
            if end <= start:
                # No space to break on (rfind returned -1) or only a leading
                # space: fall back to a hard cut so the loop always advances
                # and never re-emits text that was already yielded.
                end = start + max_len
                yield s[start:end]
                start = end
            else:
                yield s[start:end]
                start = end + 1  # skip the space used as the break point
        yield s[start:]

    def preprocess(self, text):
        """Return ``text`` as a lower-cased string without surrounding whitespace."""
        return str(text).lower().strip()

    def featurize(self, input_text):
        """Return the feature vector for the input text.

        Args:
            input_text: Text to embed; it is converted with ``str()`` first
                so non-string values do not fail during chunking.

        Returns:
            NumPy array holding the unweighted average of the mean-pooled
            embeddings of all chunks.
        """
        # Split long text into multiple strings and normalise each of them.
        chunks = [self.preprocess(chunk) for chunk in self.get_chunks(str(input_text))]
        # Empty chunks carry no content and would only dilute the average;
        # keep a single empty string when nothing else is left.
        chunks = [chunk for chunk in chunks if chunk] or [""]
        # ``max_length`` counts tokens, not characters, so it is left unset:
        # ``truncation=True`` alone truncates to the model's own limit and
        # never cuts short inputs once special tokens are added.
        encoded_input = self.tokenizer(chunks, padding=True, truncation=True,
                                       return_tensors='pt')
        with torch.no_grad():
            model_output = self.model(**encoded_input)
            pooled = self.mean_pooling(model_output, encoded_input['attention_mask'])
        # Average over the chunk axis to obtain one vector for the whole text.
        return np.average(pooled.cpu().numpy(), axis=0)
class Similarity:
    """Score how similar two texts are from the dot product of their features."""

    def __init__(self, featurize_fn, *, lower_bound=70.0, upper_bound=100.0):
        """Store the featurizer and the dot-product range used for scaling.

        Args:
            featurize_fn: Callable mapping a text to its feature vector.
            lower_bound: Dot-product value mapped to a score of 0.
            upper_bound: Dot-product value mapped to a score of 1.

        Raises:
            ValueError: If ``upper_bound`` is not greater than ``lower_bound``.
        """
        if upper_bound <= lower_bound:
            raise ValueError("upper_bound must be greater than lower_bound")
        self.featurize_fn = featurize_fn
        self.lower_bound = lower_bound
        self.upper_bound = upper_bound

    def get_score(self, text1, text2):
        """Return a similarity score in ``[0.0, 1.0]`` for the two texts.

        The dot product of the two feature vectors is rescaled linearly so
        that ``lower_bound`` maps to 0 and ``upper_bound`` maps to 1; values
        outside that range are clipped.

        Returns:
            The clipped score as a built-in ``float``.
        """
        text1_features = self.featurize_fn(text1)
        text2_features = self.featurize_fn(text2)
        # Convert to a plain float so the result type does not depend on the
        # dtype produced by the featurizer.
        similarity = float(np.dot(text1_features, text2_features))
        score = (similarity - self.lower_bound) / (self.upper_bound - self.lower_bound)
        # Clip: dot products below the lower bound give 0, above the upper give 1.
        return min(1.0, max(0.0, score))