Spaces:

claree007
/

Doc-similarity

Runtime error

App Files Files Community

claree007 committed on Oct 14, 2023

Commit

7ce6e7f

1 Parent(s): e2f3953

initial

Browse files

Files changed (4) hide show

Dockerfile +6 -0
app/main.py +33 -0
app/similarity.py +77 -0
requirements.txt +5 -0

Dockerfile ADDED Viewed

	@@ -0,0 +1,6 @@

# Container image for the document-similarity web service (listens on port 7860).
FROM python:3.9
WORKDIR /code
# Install dependencies before copying the sources so this layer is reused
# when only application code changes.
COPY ./requirements.txt /code/requirements.txt
RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
COPY . .
# app/main.py defines a Flask (WSGI) application, which uvicorn (an ASGI
# server) cannot call directly. Running the script starts Flask on
# 0.0.0.0:7860 and puts app/ on sys.path, so `from similarity import ...`
# inside main.py resolves.
# NOTE: this is Flask's built-in server; put a production WSGI server in
# front of it if the service needs to handle real traffic.
CMD ["python", "app/main.py"]

app/main.py ADDED Viewed

	@@ -0,0 +1,33 @@

+import sys
+import logging
+from flask import Flask, jsonify, request
+from similarity import LanguageModel, Similarity
app = Flask(__name__)

# Directory containing the pre-trained tokenizer/model files; a relative path,
# so it is resolved against the directory the server is started from.
PRE_TRAINED_MODEL_PATH = './model'


def init_logger():
    """Send INFO-and-above records from the root logger to stdout."""
    root_logger = logging.getLogger()
    root_logger.setLevel(logging.INFO)
    root_logger.addHandler(logging.StreamHandler(sys.stdout))
    logging.info("Logger initialized")


init_logger()

# Built once at import time so every request reuses the same loaded model.
lm = LanguageModel(pre_trained_model_path=PRE_TRAINED_MODEL_PATH, max_len=1000)
similarity = Similarity(featurize_fn=lm.featurize)


@app.route('/getSimilarity/', methods=['GET', 'POST'])
def process_request():
    """Return the similarity score of the ``text1`` and ``text2`` parameters.

    The parameters are read from ``request.values``, i.e. from the query
    string or the form body.

    Returns:
        A JSON response ``{"similarity score": <float>}`` on success, or a
        400 JSON response ``{"error": ...}`` when a parameter is missing.
        Both carry a permissive ``Access-Control-Allow-Origin`` header.
    """
    text1 = request.values.get('text1')
    text2 = request.values.get('text2')

    missing = [
        name for name, value in (('text1', text1), ('text2', text2))
        if value is None
    ]
    if missing:
        # Reject early: passing None on to the featurizer would surface as an
        # opaque 500 instead of telling the client what is wrong.
        response = jsonify(
            {'error': f"missing required parameter(s): {', '.join(missing)}"}
        )
        response.status_code = 400
    else:
        score = similarity.get_score(text1, text2)
        # The score may come back as a NumPy scalar, which jsonify cannot
        # serialize; convert it to a built-in float first.
        response = jsonify({'similarity score': float(score)})

    response.headers.add("Access-Control-Allow-Origin", "*")
    return response


if __name__ == "__main__":
    app.run(host="0.0.0.0", port=7860)

app/similarity.py ADDED Viewed

	@@ -0,0 +1,77 @@

+import torch
+import logging
+import numpy as np
+from transformers import AutoTokenizer, AutoModel
class LanguageModel:
    """Turn text into a single feature vector using a pre-trained model.

    Long inputs are split into chunks of at most ``max_len`` characters.
    Each chunk is embedded by mean pooling the model's token embeddings, and
    the chunk embeddings are averaged into one vector.
    """

    def __init__(self, pre_trained_model_path, max_len=1000):
        """Load the tokenizer and model found at ``pre_trained_model_path``.

        Args:
            pre_trained_model_path: Location handed to ``from_pretrained``.
            max_len: Maximum number of characters per chunk (see
                ``get_chunks``).
        """
        self.max_len = max_len
        logging.info(f"Loading tokenizer for {pre_trained_model_path}")
        self.tokenizer = AutoTokenizer.from_pretrained(pre_trained_model_path)
        logging.info(f"Loading model for {pre_trained_model_path}")
        self.model = AutoModel.from_pretrained(
            pre_trained_model_path,
        )

    def mean_pooling(self, model_output, attention_mask):
        """Average token embeddings along dim 1, ignoring masked positions.

        Args:
            model_output: Model result whose first element holds the token
                embeddings (assumed ``(batch, tokens, hidden)`` - confirm
                against the model in use).
            attention_mask: Mask with one entry per token; positions with 0
                do not contribute to the average.

        Returns:
            Tensor with one pooled embedding per batch row.
        """
        token_embeddings = model_output[0]
        input_mask_expanded = (
            attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        )
        summed = torch.sum(token_embeddings * input_mask_expanded, 1)
        # Clamp the token count so an all-zero mask row cannot divide by zero.
        counts = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
        return summed / counts

    def get_chunks(self, s):
        """Yield consecutive pieces of ``s`` of at most ``max_len`` characters.

        A piece ends at the last space inside the current window (the space
        itself is dropped). When the window contains no space at all, the
        text is cut at exactly ``max_len`` characters so that every character
        is emitted once and the loop always advances.

        Raises:
            ValueError: If ``max_len`` is smaller than 1.
        """
        max_len = self.max_len
        if max_len < 1:
            raise ValueError(f"max_len must be at least 1, got {max_len}")

        start = 0
        while len(s) - start > max_len:
            end = s.rfind(" ", start, start + max_len + 1)
            if end == -1:
                # No space to break on: hard split instead of slicing with
                # -1, which would drop/duplicate text.
                end = start + max_len
                yield s[start:end]
                start = end
            else:
                yield s[start:end]
                start = end + 1  # skip the space we split on
        yield s[start:]

    def preprocess(self, text):
        """Return ``text`` as a lower-cased string without outer whitespace."""
        return str(text).lower().strip()

    def featurize(self, input_text):
        """Return the feature vector for ``input_text``.

        Returns:
            NumPy array holding the mean of the per-chunk embeddings.
        """
        # Split long text into chunks, then normalize each chunk.
        processed_text_list = [
            self.preprocess(chunk) for chunk in self.get_chunks(input_text)
        ]
        # Chunk sizes are measured in characters, which is not a valid token
        # budget (special tokens alone can exceed it for short text). Let the
        # tokenizer truncate to the model's own maximum input length instead.
        encoded_input = self.tokenizer(processed_text_list, padding=True,
                                       truncation=True, return_tensors='pt')
        # Inference only: no gradients needed.
        with torch.no_grad():
            model_output = self.model(**encoded_input)
        feature_list = self.mean_pooling(model_output,
                                         encoded_input['attention_mask'])
        # One row per chunk -> average the rows into a single vector.
        return np.average(feature_list.numpy(), axis=0)
class Similarity:
    """Score how similar two texts are from the dot product of their features."""

    def __init__(self, featurize_fn, *, lower=70.0, upper=100.0):
        """Store the featurizer and the dot-product range used for scaling.

        Args:
            featurize_fn: Callable mapping a text to a 1-D feature vector.
            lower: Dot product that maps to a score of 0.
            upper: Dot product that maps to a score of 1.

        Raises:
            ValueError: If ``upper`` is not greater than ``lower``.
        """
        if upper <= lower:
            raise ValueError(
                f"upper ({upper}) must be greater than lower ({lower})"
            )
        self.featurize_fn = featurize_fn
        self.lower = lower
        self.upper = upper

    def get_score(self, text1, text2):
        """Return the similarity of ``text1`` and ``text2`` as a float in [0, 1].

        The dot product of the two feature vectors is rescaled linearly from
        ``[lower, upper]`` to ``[0, 1]`` and clamped at both ends.
        """
        text1_features = self.featurize_fn(text1)
        text2_features = self.featurize_fn(text2)
        # Convert to a built-in float so callers (e.g. JSON encoders) never
        # receive a NumPy scalar.
        similarity = float(np.dot(text1_features, text2_features))
        score = (similarity - self.lower) / (self.upper - self.lower)
        # Clamp to [0, 1].
        return min(1.0, max(0.0, score))

requirements.txt ADDED Viewed

	@@ -0,0 +1,5 @@

# The "+cpu" torch build is a local-version wheel published on the PyTorch
# wheel index rather than on PyPI, so pip must be told about that index too.
--extra-index-url https://download.pytorch.org/whl/cpu
Flask==2.2.0
torch==2.0.1+cpu
numpy==1.23.5
transformers==4.34.0
uvicorn==0.23.2