Spaces:
Runtime error
Runtime error
initial
Browse files- Dockerfile +6 -0
- app/main.py +33 -0
- app/similarity.py +77 -0
- requirements.txt +5 -0
Dockerfile
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Image for the text-similarity service (listens on port 7860).
FROM python:3.9

WORKDIR /code

# Copy only the requirements first so the dependency layer is cached
# independently of application code changes.
COPY ./requirements.txt /code/requirements.txt
RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt

COPY . .

# app/main.py builds a Flask (WSGI) application. uvicorn only serves ASGI
# applications, so "uvicorn app.main:app" cannot run it. Starting the script
# directly uses its own `app.run(host="0.0.0.0", port=7860)` entry point and
# puts app/ on sys.path, which `from similarity import ...` in main.py needs.
CMD ["python", "app/main.py"]
|
app/main.py
ADDED
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import sys
|
2 |
+
import logging
|
3 |
+
from flask import Flask, jsonify, request
|
4 |
+
from similarity import LanguageModel, Similarity
|
5 |
+
|
6 |
+
app = Flask(__name__)

# Directory holding the pre-trained tokenizer/model files, resolved relative
# to the process working directory.
PRE_TRAINED_MODEL_PATH = './model'


def init_logger():
    """Send INFO-and-above records from the root logger to stdout."""
    root_logger = logging.getLogger()
    root_logger.setLevel(logging.INFO)
    root_logger.addHandler(logging.StreamHandler(sys.stdout))
    logging.info("Logger initialized")


init_logger()

# Loaded once at import time so every request reuses the same model instance.
lm = LanguageModel(pre_trained_model_path=PRE_TRAINED_MODEL_PATH, max_len=1000)
similarity = Similarity(featurize_fn=lm.featurize)


@app.route('/getSimilarity/', methods=['GET', 'POST'])
def process_request():
    """Return the similarity score of the ``text1`` and ``text2`` parameters.

    Parameters are read from the query string or the form body. The JSON
    response is ``{"similarity score": <float>}``; when either parameter is
    missing, a 400 response with an ``error`` message is returned instead.
    """
    text1 = request.values.get('text1')
    text2 = request.values.get('text2')

    if text1 is None or text2 is None:
        # Reject early: passing None into the model pipeline would surface
        # as an opaque HTTP 500 rather than a client error.
        response = jsonify(
            {'error': "Both 'text1' and 'text2' parameters are required"}
        )
        response.status_code = 400
    else:
        score = similarity.get_score(text1, text2)
        # float() guards against NumPy scalar types (e.g. float32), which
        # the JSON encoder cannot serialize.
        response = jsonify({'similarity score': float(score)})

    response.headers.add("Access-Control-Allow-Origin", "*")
    return response


if __name__ == "__main__":
    app.run(host="0.0.0.0", port=7860)
|
app/similarity.py
ADDED
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
import logging
|
3 |
+
import numpy as np
|
4 |
+
from transformers import AutoTokenizer, AutoModel
|
5 |
+
|
6 |
+
class LanguageModel:
    """Turn text into a single feature vector using a pre-trained encoder."""

    def __init__(self, pre_trained_model_path, max_len=1000):
        """Load the tokenizer and model from ``pre_trained_model_path``.

        Args:
            pre_trained_model_path: Path or identifier passed to
                ``AutoTokenizer.from_pretrained`` / ``AutoModel.from_pretrained``.
            max_len: Maximum chunk size, in characters, used by ``get_chunks``.
        """
        self.max_len = max_len

        logging.info(f"Loading tokenizer for {pre_trained_model_path}")
        self.tokenizer = AutoTokenizer.from_pretrained(pre_trained_model_path)

        logging.info(f"Loading model for {pre_trained_model_path}")
        self.model = AutoModel.from_pretrained(
            pre_trained_model_path,
        )

    def mean_pooling(self, model_output, attention_mask):
        """Average token embeddings along dimension 1, ignoring masked tokens.

        ``model_output[0]`` is taken as the token embeddings; the attention
        mask is broadcast to the same size so padded positions contribute 0.
        # NOTE(review): assumes embeddings are (batch, tokens, hidden) - confirm.
        """
        token_embeddings = model_output[0]
        input_mask_expanded = (
            attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        )
        summed = torch.sum(token_embeddings * input_mask_expanded, 1)
        # Clamp so an all-zero mask cannot cause a division by zero.
        counts = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
        return summed / counts

    def get_chunks(self, s):
        """Yield pieces of ``s`` that are at most ``self.max_len`` characters.

        Pieces are cut at the last space inside each window (the space itself
        is dropped). When a window contains no space, the text is cut hard at
        ``max_len`` characters so that nothing is lost or emitted twice. The
        remaining tail is always yielded, so at least one piece (possibly the
        empty string) is produced.

        Raises:
            ValueError: if ``self.max_len`` is smaller than 1.
        """
        max_len = self.max_len
        if max_len < 1:
            raise ValueError("max_len must be at least 1")

        start = 0
        while start + max_len < len(s):
            end = s.rfind(" ", start, start + max_len + 1)
            if end == -1:
                # No space to break on: split mid-word and keep every char.
                end = start + max_len
                next_start = end
            else:
                # Resume after the space that was used as the break point.
                next_start = end + 1
            if end > start:
                yield s[start:end]
            start = next_start
        yield s[start:]

    def preprocess(self, text):
        """Return ``text`` as a lower-cased string without outer whitespace."""
        text = str(text)
        text = text.lower()
        return text.strip()

    def featurize(self, input_text):
        """Return the feature vector for ``input_text`` as a NumPy array.

        The text is split into chunks, each chunk is embedded with mean
        pooling, and the chunk embeddings are averaged. ``None`` is treated
        as an empty string.
        """
        text = "" if input_text is None else str(input_text)

        # split long text into multiple strings and apply text preprocessing
        processed_text_list = [
            self.preprocess(chunk) for chunk in self.get_chunks(text)
        ]

        # Tokenize. No explicit max_length: a character count is not a token
        # budget (special tokens alone can exceed it for short inputs), so
        # truncation falls back to the tokenizer's own model maximum.
        encoded_input = self.tokenizer(
            processed_text_list,
            padding=True,
            truncation=True,
            return_tensors='pt',
        )

        # get model output
        with torch.no_grad():
            model_output = self.model(**encoded_input)

        # get mean pooled output, one row per chunk
        feature_list = self.mean_pooling(
            model_output, encoded_input['attention_mask']
        )
        # Convert explicitly rather than relying on NumPy's implicit
        # tensor-to-array coercion.
        feature_mean = np.average(feature_list.cpu().numpy(), axis=0)

        return feature_mean
|
63 |
+
|
64 |
+
class Similarity:
    """Score how similar two texts are, on a 0.0 to 1.0 scale."""

    def __init__(self, featurize_fn, min_similarity=70.0, max_similarity=100.0):
        """Store the featurizer and the raw-score range used for scaling.

        Args:
            featurize_fn: Callable mapping a text to a 1-D feature vector.
            min_similarity: Raw dot product that maps to a score of 0.0.
            max_similarity: Raw dot product that maps to a score of 1.0.
                # NOTE(review): the 70/100 defaults are presumably tuned for
                # the deployed model's embedding scale - confirm.

        Raises:
            ValueError: if ``max_similarity`` is not above ``min_similarity``.
        """
        if max_similarity <= min_similarity:
            raise ValueError("max_similarity must be greater than min_similarity")
        self.featurize_fn = featurize_fn
        self.min_similarity = min_similarity
        self.max_similarity = max_similarity

    def get_score(self, text1, text2):
        """Return the scaled similarity of ``text1`` and ``text2``.

        The dot product of the two feature vectors is mapped linearly from
        ``[min_similarity, max_similarity]`` onto ``[0.0, 1.0]`` and clamped.
        The result is always a built-in ``float`` so it can be JSON-encoded.
        """
        text1_features = self.featurize_fn(text1)
        text2_features = self.featurize_fn(text2)
        # calculate dot product; float() strips NumPy scalar types such as
        # float32, which the standard JSON encoder rejects
        similarity = float(np.dot(text1_features, text2_features))
        # normalize
        span = self.max_similarity - self.min_similarity
        score = (similarity - self.min_similarity) / span
        # clamp to [0, 1]
        return min(1.0, max(0.0, score))
|
requirements.txt
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# The "+cpu" torch build is published on the PyTorch wheel index, not on PyPI.
--extra-index-url https://download.pytorch.org/whl/cpu
Flask==2.2.0
# Flask 2.2.x is not compatible with Werkzeug 3.x; without a pin, pip
# resolves to the newest Werkzeug and the Flask import fails at startup.
Werkzeug==2.2.3
torch==2.0.1+cpu
numpy==1.23.5
transformers==4.34.0
uvicorn==0.23.2
|