Spaces:

claree007
/

Doc-similarity

Runtime error

App Files Files Community

Doc-similarity / app /similarity.py

claree007

initial

7ce6e7f almost 2 years ago

raw

history blame contribute delete

2.84 kB

	import torch
	import logging
	import numpy as np
	from transformers import AutoTokenizer, AutoModel

	class LanguageModel:
	    """Turn text into a single feature vector with a pre-trained model.

	    The input is split into chunks of at most ``max_len`` characters, every
	    chunk is embedded by mean pooling the model output over non-padding
	    tokens, and the chunk embeddings are averaged into one vector.
	    """

	    def __init__(self, pre_trained_model_path, max_len=1000):
	        """Load the tokenizer and the model for a pre-trained checkpoint.

	        Args:
	            pre_trained_model_path: Identifier passed unchanged to
	                ``AutoTokenizer.from_pretrained`` and
	                ``AutoModel.from_pretrained``.
	            max_len: Maximum number of characters per chunk.

	        Raises:
	            ValueError: If ``max_len`` is smaller than 1.
	        """
	        # Fail before the (expensive) model load rather than while chunking.
	        if max_len < 1:
	            raise ValueError(f"max_len must be at least 1, got {max_len}")

	        self.max_len = max_len

	        logging.info(f"Loading tokenizer for {pre_trained_model_path}")
	        self.tokenizer = AutoTokenizer.from_pretrained(pre_trained_model_path)

	        logging.info(f"Loading model for {pre_trained_model_path}")
	        self.model = AutoModel.from_pretrained(
	            pre_trained_model_path,
	        )

	    def mean_pooling(self, model_output, attention_mask):
	        """Average the token embeddings of each sequence, ignoring padding.

	        Args:
	            model_output: Model output whose first element holds the token
	                embeddings (assumed shape ``(batch, tokens, hidden)`` --
	                the code sums over dimension 1).
	            attention_mask: Mask with one entry per token; entries equal to
	                zero are excluded from the average.

	        Returns:
	            Tensor with one pooled embedding per sequence.
	        """
	        token_embeddings = model_output[0]
	        # Broadcast the per-token mask across the embedding dimension.
	        input_mask_expanded = (
	            attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
	        )
	        summed = torch.sum(token_embeddings * input_mask_expanded, 1)
	        # The clamp avoids a division by zero for a fully masked sequence.
	        counts = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
	        return summed / counts

	    def get_chunks(self, s):
	        """Yield consecutive pieces of ``s`` of at most ``max_len`` characters.

	        Pieces are cut at the last space inside the current window and that
	        space is dropped. When the window contains no space at all, the
	        piece is cut hard at ``max_len`` characters so that no text is lost
	        or repeated. At least one piece is always yielded (an empty string
	        for empty input).

	        Raises:
	            ValueError: If ``self.max_len`` is smaller than 1.
	        """
	        max_len = self.max_len
	        if max_len < 1:
	            # A non-positive window could never advance through the text.
	            raise ValueError(f"max_len must be at least 1, got {max_len}")

	        start = 0
	        while start + max_len < len(s):
	            end = s.rfind(" ", start, start + max_len + 1)
	            if end == -1:
	                # No space to break on: hard split, keep every character.
	                end = start + max_len
	                yield s[start:end]
	                start = end
	            else:
	                yield s[start:end]
	                # Skip the space that was used as the break point.
	                start = end + 1
	        yield s[start:]

	    def preprocess(self, text):
	        """Return ``text`` as a lower-cased string without surrounding whitespace."""
	        return str(text).lower().strip()

	    def featurize(self, input_text):
	        """Return the feature vector for the input text.

	        Args:
	            input_text: Text to embed; it is converted with ``str`` first.

	        Returns:
	            NumPy array holding the average of the chunk embeddings.
	        """
	        # Coerce up front so chunking also works for non-string input,
	        # matching the ``str`` conversion done in ``preprocess``.
	        chunks = self.get_chunks(str(input_text))
	        processed_text_list = [self.preprocess(chunk) for chunk in chunks]

	        # ``max_length`` counts tokens (special tokens included), not
	        # characters, so a character count must not be used for it: short
	        # inputs would be truncated. ``truncation=True`` on its own
	        # truncates to the maximum input length the model accepts.
	        encoded_input = self.tokenizer(
	            processed_text_list,
	            padding=True,
	            truncation=True,
	            return_tensors='pt',
	        )

	        with torch.no_grad():
	            model_output = self.model(**encoded_input)

	        chunk_features = self.mean_pooling(
	            model_output, encoded_input['attention_mask']
	        )
	        # Convert explicitly instead of relying on NumPy's implicit tensor
	        # conversion, which does not work for tensors on another device.
	        return np.average(chunk_features.detach().cpu().numpy(), axis=0)

	class Similarity:
	    """Score two texts by the dot product of their feature vectors.

	    The raw dot product is mapped linearly from the range
	    ``[min_similarity, max_similarity]`` onto ``[0, 1]`` and clamped.
	    """

	    def __init__(self, featurize_fn, min_similarity=70.0, max_similarity=100.0):
	        """Store the featurizer and the normalization bounds.

	        Args:
	            featurize_fn: Callable that maps a text to its feature vector.
	            min_similarity: Dot product that maps to a score of 0.
	            max_similarity: Dot product that maps to a score of 1.

	        Raises:
	            ValueError: If ``max_similarity`` is not greater than
	                ``min_similarity``.
	        """
	        if max_similarity <= min_similarity:
	            raise ValueError(
	                "max_similarity must be greater than min_similarity"
	            )
	        self.featurize_fn = featurize_fn
	        self.min_similarity = min_similarity
	        self.max_similarity = max_similarity

	    def get_score(self, text1, text2):
	        """Return the similarity of ``text1`` and ``text2`` as a float in [0, 1]."""
	        text1_features = self.featurize_fn(text1)
	        text2_features = self.featurize_fn(text2)
	        # calculate dot product
	        similarity = np.dot(text1_features, text2_features)
	        # normalize against the configured bounds
	        span = self.max_similarity - self.min_similarity
	        score = (similarity - self.min_similarity) / span
	        # Clamp to [0, 1] and always hand back a plain Python float; the
	        # unclamped value would otherwise be a NumPy scalar.
	        return float(min(1.0, max(0.0, score)))