import logging

import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel


class LanguageModel:
    def __init__(self, pre_trained_model_path, max_len=1000):
        """Load the tokenizer and model for a pre-trained checkpoint."""
        self.max_len = max_len
        logging.info(f"Loading tokenizer for {pre_trained_model_path}")
        self.tokenizer = AutoTokenizer.from_pretrained(pre_trained_model_path)
        logging.info(f"Loading model for {pre_trained_model_path}")
        self.model = AutoModel.from_pretrained(pre_trained_model_path)
        self.model.eval()  # disable dropout for deterministic inference

    def mean_pooling(self, model_output, attention_mask):
        """Average token embeddings, ignoring padded positions."""
        token_embeddings = model_output[0]  # last hidden state
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        # Sum embeddings over real tokens only, then divide by the number of
        # real tokens (clamped to avoid division by zero).
        return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(
            input_mask_expanded.sum(1), min=1e-9
        )

    def get_chunks(self, s):
        """Split a long string into chunks of at most max_len characters,
        breaking on the last space inside each window."""
        max_len = self.max_len
        start = 0
        while start + max_len < len(s):
            end = s.rfind(" ", start, start + max_len + 1)
            if end == -1:
                # No space in the window: hard-break at max_len.
                yield s[start:start + max_len]
                start += max_len
            else:
                yield s[start:end]
                start = end + 1  # skip the space itself
        yield s[start:]

    def preprocess(self, text):
        """Lowercase the text and strip surrounding whitespace."""
        return str(text).lower().strip()

    def featurize(self, input_text):
        """Return a single feature vector for the input text."""
        # Split long text into chunks the model can handle.
        text_list = list(self.get_chunks(input_text))
        # Apply text preprocessing to each chunk.
        processed_text_list = [self.preprocess(text) for text in text_list]
        # Tokenize; truncation falls back to the tokenizer's model_max_length
        # (a character count is not a valid token limit).
        encoded_input = self.tokenizer(
            processed_text_list, padding=True, truncation=True, return_tensors="pt"
        )
        # Get model output without tracking gradients.
        with torch.no_grad():
            model_output = self.model(**encoded_input)
        # Mean-pool token embeddings into one vector per chunk.
        feature_list = self.mean_pooling(model_output, encoded_input["attention_mask"])
        # Average the chunk vectors into a single document vector.
        return feature_list.mean(dim=0).numpy()


class Similarity:
    def __init__(self, featurize_fn):
        self.featurize_fn = featurize_fn

    def get_score(self, text1, text2):
        text1_features = self.featurize_fn(text1)
        text2_features = self.featurize_fn(text2)
        # Calculate the dot product of the two feature vectors.
        similarity = np.dot(text1_features, text2_features)
        # Normalize with heuristic calibration constants: raw similarities
        # below 70 map to 0, and 70-100 maps linearly onto 0-1. This assumes
        # embeddings whose dot products land in roughly that range.
        score = max(0.0, similarity - 70) / (100 - 70)
        # Clamp in case the raw similarity exceeds 100.
        return min(1.0, score)
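
# --- Usage sketch ---
# A minimal example of wiring the two classes together, not part of the
# original module. The checkpoint name "sentence-transformers/all-MiniLM-L6-v2"
# is an assumption for illustration; any AutoModel-compatible encoder works.
# Note that the 70-100 calibration in Similarity.get_score may map typical
# mean-pooled dot products for such a model to 0 without re-tuning.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    lm = LanguageModel("sentence-transformers/all-MiniLM-L6-v2")
    sim = Similarity(lm.featurize)
    score = sim.get_score(
        "The cat sat on the mat.",
        "A cat is sitting on a mat.",
    )
    print(f"similarity score: {score:.3f}")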