claree007 committed on
Commit
7ce6e7f
·
1 Parent(s): e2f3953
Files changed (4) hide show
  1. Dockerfile +6 -0
  2. app/main.py +33 -0
  3. app/similarity.py +77 -0
  4. requirements.txt +5 -0
Dockerfile ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
# Image for the text-similarity web service.
FROM python:3.9

WORKDIR /code

# Install dependencies first so this layer is cached until requirements.txt changes.
COPY ./requirements.txt /code/requirements.txt
RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt

# Copy the application code (and anything else in the build context) into /code.
COPY . .

# app/main.py defines a Flask (WSGI) application, which uvicorn (an ASGI
# server) cannot serve as-is. Running the script directly starts Flask on
# 0.0.0.0:7860 via its __main__ guard and puts app/ on sys.path, which the
# module's `from similarity import ...` relies on.
CMD ["python", "app/main.py"]
app/main.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ import logging
3
+ from flask import Flask, jsonify, request
4
+ from similarity import LanguageModel, Similarity
5
+
6
# Location of the pre-trained model files; a relative path, so it is resolved
# against the process working directory.
PRE_TRAINED_MODEL_PATH = './model'

# Flask application object that the route below is registered on.
app = Flask(__name__)
9
+
10
def init_logger():
    """Send INFO-and-above records from the root logger to stdout.

    Each call attaches one more stdout handler to the root logger, so it is
    meant to be called once at start-up.
    """
    stdout_handler = logging.StreamHandler(sys.stdout)

    root = logging.getLogger()
    root.setLevel(logging.INFO)
    root.addHandler(stdout_handler)

    logging.info("Logger initialized")
15
+
16
# Configure logging first so the INFO messages logged while the model loads
# are actually emitted.
init_logger()

# Load the tokenizer and model once, at import time; every request reuses them.
lm = LanguageModel(
    pre_trained_model_path=PRE_TRAINED_MODEL_PATH,
    max_len=1000,
)
similarity = Similarity(featurize_fn=lm.featurize)
20
+
21
@app.route('/getSimilarity/', methods=['GET', 'POST'])
def process_request():
    """Return the similarity score of the ``text1`` and ``text2`` parameters.

    Both parameters are read from ``request.values`` (query string or form
    data). The JSON response carries the score under ``'similarity score'``.
    If either parameter is absent, a 400 response with an ``'error'`` message
    is returned instead of failing inside the model code.

    Every response allows cross-origin access from any origin.
    """
    text1 = request.values.get('text1')
    text2 = request.values.get('text2')

    missing = [
        name
        for name, value in (('text1', text1), ('text2', text2))
        if value is None
    ]
    if missing:
        response = jsonify(
            {'error': f"Missing required parameter(s): {', '.join(missing)}"}
        )
        response.status_code = 400
    else:
        score = similarity.get_score(text1, text2)
        # The score may be a numpy scalar, which the JSON encoder rejects;
        # a builtin float always serializes.
        response = jsonify({'similarity score': float(score)})

    response.headers.add("Access-Control-Allow-Origin", "*")
    return response
31
+
32
if __name__ == "__main__":
    # Executed as a script: serve the app with Flask's built-in server,
    # listening on every interface at port 7860.
    app.run(port=7860, host="0.0.0.0")
app/similarity.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import logging
3
+ import numpy as np
4
+ from transformers import AutoTokenizer, AutoModel
5
+
6
class LanguageModel:
    """Turn free text into a single feature vector with a pre-trained model.

    Long inputs are split into chunks of at most ``max_len`` characters, each
    chunk is embedded with attention-masked mean pooling, and the per-chunk
    embeddings are averaged into one vector.
    """

    def __init__(self, pre_trained_model_path, max_len=1000):
        """Load the tokenizer and model for *pre_trained_model_path*.

        Args:
            pre_trained_model_path: Passed unchanged to
                ``AutoTokenizer.from_pretrained`` and
                ``AutoModel.from_pretrained``.
            max_len: Maximum chunk length, in characters, used by
                ``get_chunks``.
        """
        self.max_len = max_len

        logging.info(f"Loading tokenizer for {pre_trained_model_path}")
        self.tokenizer = AutoTokenizer.from_pretrained(pre_trained_model_path)

        logging.info(f"Loading model for {pre_trained_model_path}")
        self.model = AutoModel.from_pretrained(pre_trained_model_path)

    def mean_pooling(self, model_output, attention_mask):
        """Average the token embeddings over dimension 1, ignoring masked positions.

        Args:
            model_output: Model result whose first element holds the token
                embeddings (assumed (batch, tokens, hidden) -- TODO confirm).
            attention_mask: Mask with one entry per token; positions with 0
                do not contribute to the average.

        Returns:
            A tensor with one pooled embedding per input row.
        """
        token_embeddings = model_output[0]
        input_mask_expanded = (
            attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        )
        summed = torch.sum(token_embeddings * input_mask_expanded, 1)
        # Clamp so a row with an all-zero mask does not divide by zero.
        counts = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
        return summed / counts

    def get_chunks(self, s):
        """Yield consecutive pieces of *s*, each at most ``max_len`` characters.

        A piece ends at the last space inside the current window and that
        space is dropped. When the window contains no space at all, the text
        is cut exactly at ``max_len`` characters so that no piece exceeds the
        limit and no text is repeated.

        Raises:
            ValueError: If ``max_len`` is smaller than 1.
        """
        max_len = self.max_len
        if max_len < 1:
            raise ValueError("max_len must be at least 1")

        start = 0
        while len(s) - start > max_len:
            split = s.rfind(" ", start, start + max_len + 1)
            if split == -1:
                # No space in the window: hard split, keeping every character.
                split = start + max_len
                yield s[start:split]
                start = split
            else:
                yield s[start:split]
                # Skip the space the piece was split on.
                start = split + 1
        yield s[start:]

    def preprocess(self, text):
        """Return *text* as a lower-cased string without surrounding whitespace."""
        return str(text).lower().strip()

    def featurize(self, input_text):
        """Return the feature vector for *input_text* as a numpy array.

        Args:
            input_text: The string to embed.

        Returns:
            The mean, over all chunks, of the mean-pooled chunk embeddings.
        """
        # Split long text into multiple strings, then normalize each one.
        chunks = [self.preprocess(chunk) for chunk in self.get_chunks(input_text)]

        # NOTE(review): the cap handed to the tokenizer is the longest chunk's
        # *character* count, while the tokenizer counts tokens -- confirm this
        # stays within the model's supported sequence length.
        max_length = max(len(chunk) for chunk in chunks)
        encoded_input = self.tokenizer(
            chunks,
            padding=True,
            max_length=max_length,
            truncation=True,
            return_tensors='pt',
        )

        # Inference only: no gradients are needed.
        with torch.no_grad():
            model_output = self.model(**encoded_input)
            pooled = self.mean_pooling(
                model_output, encoded_input['attention_mask']
            )

        # Convert explicitly rather than relying on numpy's implicit handling
        # of torch tensors, then average the chunk embeddings.
        return pooled.detach().cpu().numpy().mean(axis=0)
63
+
64
class Similarity:
    """Score two texts by the dot product of their feature vectors.

    The raw dot product is mapped linearly from ``[min_dot, max_dot]`` onto
    ``[0, 1]`` and clipped to that range.
    """

    def __init__(self, featurize_fn, min_dot=70.0, max_dot=100.0):
        """Store the featurizer and the normalization bounds.

        Args:
            featurize_fn: Callable mapping a text to its feature vector.
            min_dot: Dot product that maps to a score of 0.
            max_dot: Dot product that maps to a score of 1.

        Raises:
            ValueError: If ``max_dot`` is not greater than ``min_dot``.
        """
        if max_dot <= min_dot:
            raise ValueError("max_dot must be greater than min_dot")
        self.featurize_fn = featurize_fn
        self.min_dot = min_dot
        self.max_dot = max_dot

    def get_score(self, text1, text2):
        """Return the similarity of *text1* and *text2* as a float in [0, 1]."""
        text1_features = self.featurize_fn(text1)
        text2_features = self.featurize_fn(text2)

        # Convert to a builtin float right away: numpy scalars such as
        # float32 are not JSON serializable and would leak to callers.
        raw = float(np.dot(text1_features, text2_features))

        # Normalize, then clip to [0, 1].
        score = (raw - self.min_dot) / (self.max_dot - self.min_dot)
        return min(1.0, max(0.0, score))
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
# CPU-only torch builds (the "+cpu" local version) are not published on PyPI;
# they are served from the PyTorch wheel index.
--extra-index-url https://download.pytorch.org/whl/cpu
Flask==2.2.0
# Flask 2.2.x is not compatible with Werkzeug 3.x, which pip would otherwise
# pick because Flask leaves the upper bound open.
Werkzeug==2.2.3
torch==2.0.1+cpu
numpy==1.23.5
transformers==4.34.0
uvicorn==0.23.2