Spaces:
Runtime error
Runtime error
File size: 2,835 Bytes
7ce6e7f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 |
import torch
import logging
import numpy as np
from transformers import AutoTokenizer, AutoModel
class LanguageModel:
    """Turn free text into a single feature vector with a pre-trained model.

    Long inputs are split into chunks of at most ``max_len`` characters, each
    chunk is tokenized and run through the model, token embeddings are
    mean-pooled per chunk, and the per-chunk vectors are averaged.
    """

    def __init__(self, pre_trained_model_path, max_len: int = 1000):
        """Load the tokenizer and model for ``pre_trained_model_path``.

        Args:
            pre_trained_model_path: Value passed unchanged to
                ``AutoTokenizer.from_pretrained`` and
                ``AutoModel.from_pretrained``.
            max_len: Maximum chunk size, in characters, used by
                ``get_chunks``.
        """
        self.max_len = max_len
        logging.info(f"Loading tokenizer for {pre_trained_model_path}")
        self.tokenizer = AutoTokenizer.from_pretrained(pre_trained_model_path)
        logging.info(f"Loading model for {pre_trained_model_path}")
        self.model = AutoModel.from_pretrained(
            pre_trained_model_path,
        )

    def mean_pooling(self, model_output, attention_mask):
        """Average token embeddings along dimension 1, skipping masked tokens.

        Args:
            model_output: Indexable model output; element 0 is used as the
                token embeddings (presumably shaped
                ``(batch, seq_len, hidden)`` -- confirm for the model used).
            attention_mask: Tensor whose entries weight each token; positions
                with value 0 contribute nothing to the average.

        Returns:
            Tensor holding one pooled vector per input sequence.
        """
        token_embeddings = model_output[0]
        # Broadcast the mask over the embedding dimension so it can be used
        # as a per-element weight.
        input_mask_expanded = (
            attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        )
        summed = torch.sum(token_embeddings * input_mask_expanded, 1)
        # Clamp the divisor so a fully masked row does not divide by zero.
        counts = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
        return summed / counts

    def get_chunks(self, s: str):
        """Yield consecutive pieces of ``s`` of at most ``max_len`` characters.

        Each piece ends at the last space found within the current window;
        that space is consumed and not included in any piece. When a window
        contains no space at all, the text is cut exactly at ``max_len``
        characters so every character is emitted once and only once.

        Args:
            s: Text to split.

        Yields:
            The chunks in order. At least one chunk is always yielded (an
            empty string for empty input).

        Raises:
            ValueError: If ``self.max_len`` is smaller than 1.
        """
        max_len = self.max_len
        if max_len < 1:
            raise ValueError(f"max_len must be at least 1, got {max_len}")
        start = 0
        while start + max_len < len(s):
            # The window is one character wider than max_len so that a space
            # sitting right after a full-length chunk can serve as the break.
            end = s.rfind(" ", start, start + max_len + 1)
            if end == -1:
                # No space to break on: fall back to a hard split. Using the
                # -1 as a slice bound would emit almost the whole remainder
                # and then the entire string a second time.
                end = start + max_len
                yield s[start:end]
                start = end
            else:
                yield s[start:end]
                start = end + 1  # skip the space used as the separator
        yield s[start:]

    def preprocess(self, text) -> str:
        """Return ``text`` as a lower-cased string without surrounding whitespace."""
        text = str(text)
        text = text.lower()
        return text.strip()

    def featurize(self, input_text: str) -> np.ndarray:
        """Return the feature vector for ``input_text``.

        Args:
            input_text: Text to embed; it is chunked with ``get_chunks`` and
                each chunk is normalised with ``preprocess``.

        Returns:
            NumPy array holding the average of the mean-pooled vectors of all
            chunks.
        """
        # Split long text into multiple strings and normalise each of them.
        processed_text_list = [
            self.preprocess(text) for text in self.get_chunks(input_text)
        ]
        # Empty chunks carry no content and would only dilute the average;
        # keep a single empty string when nothing else is left so the model
        # still receives one sequence.
        processed_text_list = [text for text in processed_text_list if text] or [""]
        # Tokenize. No character-derived ``max_length`` is passed: that
        # argument counts tokens (special tokens included), so a character
        # count truncates short inputs. With ``truncation=True`` alone the
        # tokenizer applies its own maximum length.
        encoded_input = self.tokenizer(
            processed_text_list,
            padding=True,
            truncation=True,
            return_tensors="pt",
        )
        # Run the model without tracking gradients.
        with torch.no_grad():
            model_output = self.model(**encoded_input)
        # Mean-pool the tokens of every chunk, then average across chunks.
        feature_list = self.mean_pooling(model_output, encoded_input["attention_mask"])
        feature_mean = np.average(feature_list.detach().cpu().numpy(), axis=0)
        return feature_mean
class Similarity:
    """Score how similar two texts are using the dot product of their features."""

    def __init__(self, featurize_fn, min_similarity=70.0, max_similarity=100.0):
        """Store the feature extractor and the rescaling bounds.

        Args:
            featurize_fn: Callable that maps a text to a 1-D feature vector.
            min_similarity: Raw dot product that maps to a score of 0.
            max_similarity: Raw dot product that maps to a score of 1.

        Raises:
            ValueError: If ``max_similarity`` is not greater than
                ``min_similarity``.
        """
        if max_similarity <= min_similarity:
            raise ValueError("max_similarity must be greater than min_similarity")
        self.featurize_fn = featurize_fn
        self.min_similarity = min_similarity
        self.max_similarity = max_similarity

    def get_score(self, text1, text2) -> float:
        """Return a similarity score in ``[0, 1]`` for the two texts.

        The dot product of the two feature vectors is rescaled linearly so
        that ``min_similarity`` maps to 0 and ``max_similarity`` maps to 1,
        and the result is clipped to that range.

        Args:
            text1: First text, passed to ``featurize_fn``.
            text2: Second text, passed to ``featurize_fn``.

        Returns:
            The clipped score as a plain Python ``float``.
        """
        text1_features = self.featurize_fn(text1)
        text2_features = self.featurize_fn(text2)
        # Convert the NumPy scalar so the return type does not depend on
        # which branch of the clipping is taken.
        similarity = float(np.dot(text1_features, text2_features))
        span = self.max_similarity - self.min_similarity
        score = (similarity - self.min_similarity) / span
        # Clip to [0, 1]; raw similarities outside the bounds saturate.
        return min(1.0, max(0.0, score))