sam2ai commited on
Commit
0e4258a
·
1 Parent(s): 3acbfb4

Synced repo using 'sync_with_huggingface' GitHub Action

Browse files
app.py CHANGED
@@ -13,41 +13,11 @@ from fastapi import FastAPI, Body, Request
13
  from utils.utils import add_arguments, print_arguments
14
  from sentence_transformers import SentenceTransformer, models
15
 
 
 
 
16
 
17
 
18
- # def print_arguments(args):
19
- # print("----------- Configuration Arguments -----------")
20
- # for arg, value in vars(args).items():
21
- # print("%s: %s" % (arg, value))
22
- # print("------------------------------------------------")
23
-
24
-
25
- # def strtobool(val):
26
- # val = val.lower()
27
- # if val in ('y', 'yes', 't', 'true', 'on', '1'):
28
- # return True
29
- # elif val in ('n', 'no', 'f', 'false', 'off', '0'):
30
- # return False
31
- # else:
32
- # raise ValueError("invalid truth value %r" % (val,))
33
-
34
- # def str_none(val):
35
- # if val == 'None':
36
- # return None
37
- # else:
38
- # return val
39
-
40
- # def add_arguments(argname, type, default, help, argparser, **kwargs):
41
- # type = strtobool if type == bool else type
42
- # type = str_none if type == str else type
43
- # argparser.add_argument(
44
- # "--" + argname,
45
- # default=default,
46
- # type=type,
47
- # help=help + ' Default: %(default)s.',
48
- # **kwargs
49
- # )
50
-
51
 
52
  os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'
53
 
@@ -60,11 +30,8 @@ add_arg("host", type=str, default="0.0.0.0", help="")
60
  add_arg("port", type=int, default=5000, help="")
61
  add_arg("model_path", type=str, default="BAAI/bge-small-en-v1.5", help="")
62
  add_arg("use_gpu", type=bool, default=False, help="")
63
- # add_arg("use_int8", type=bool, default=True, help="")
64
- add_arg("beam_size", type=int, default=10, help="")
65
  add_arg("num_workers", type=int, default=2, help="")
66
- add_arg("vad_filter", type=bool, default=True, help="")
67
- add_arg("local_files_only", type=bool, default=True, help="")
68
 
69
 
70
  args = parser.parse_args()
@@ -72,13 +39,24 @@ print_arguments(args)
72
 
73
 
74
 
 
 
 
 
 
 
 
 
 
 
 
75
  if args.use_gpu:
76
  bge_model = SentenceTransformer(args.model_path, device="cuda", compute_type="float16", cache_folder=".")
77
  else:
78
  bge_model = SentenceTransformer(args.model_path, device='cpu', cache_folder=".")
79
 
80
 
81
-
82
  if args.use_gpu:
83
  model_name = 'sam2ai/sbert-tsdae'
84
  word_embedding_model = models.Transformer(model_name)
@@ -100,22 +78,34 @@ else:
100
  )
101
 
102
 
103
- app = FastAPI(title="embedding Inference")
 
 
 
 
 
104
 
 
 
 
 
105
 
106
- def similarity_score(model, textA, textB):
107
- em_test = model.encode(
108
- [textA, textB],
109
- normalize_embeddings=True
110
- )
111
- return em_test[0] @ em_test[1].T
 
112
 
113
 
 
 
 
114
  @app.get("/")
115
  async def index(request: Request):
116
  return {"detail": "API is Active !!"}
117
 
118
-
119
  @app.post("/bge_embed")
120
  async def api_bge_embed(
121
  text1: str = Body("text1", description="", embed=True),
@@ -142,6 +132,19 @@ async def api_tsdae_embed(
142
  ret = {"similarity score": scores, "status_code": 200}
143
  return ret
144
 
 
 
 
 
 
 
 
 
 
 
 
 
 
145
 
146
 
147
 
 
13
  from utils.utils import add_arguments, print_arguments
14
  from sentence_transformers import SentenceTransformer, models
15
 
16
+ from gensim.models import Word2Vec
17
+ from gensim.utils import simple_preprocess
18
+ import numpy as np
19
 
20
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
 
22
  os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'
23
 
 
30
  add_arg("port", type=int, default=5000, help="")
31
  add_arg("model_path", type=str, default="BAAI/bge-small-en-v1.5", help="")
32
  add_arg("use_gpu", type=bool, default=False, help="")
 
 
33
  add_arg("num_workers", type=int, default=2, help="")
34
+
 
35
 
36
 
37
  args = parser.parse_args()
 
39
 
40
 
41
 
42
+ # similarity score func
43
+ def similarity_score(model, textA, textB):
44
+ em_test = model.encode(
45
+ [textA, textB],
46
+ normalize_embeddings=True
47
+ )
48
+ return em_test[0] @ em_test[1].T
49
+
50
+
51
+ # BGE embedding
52
+
53
  if args.use_gpu:
54
  bge_model = SentenceTransformer(args.model_path, device="cuda", compute_type="float16", cache_folder=".")
55
  else:
56
  bge_model = SentenceTransformer(args.model_path, device='cpu', cache_folder=".")
57
 
58
 
59
+ # tsdae embedding
60
  if args.use_gpu:
61
  model_name = 'sam2ai/sbert-tsdae'
62
  word_embedding_model = models.Transformer(model_name)
 
78
  )
79
 
80
 
81
+ # word2vec embedding
82
+ # Define the calculate_similarity function
83
+ def calculate_similarity(sentence1, sentence2):
84
+ # Tokenize the sentences
85
+ tokens1 = simple_preprocess(sentence1)
86
+ tokens2 = simple_preprocess(sentence2)
87
 
88
+ # Load or train a Word2Vec model
89
+ # Here, we'll create a simple model for demonstration purposes
90
+ sentences = [tokens1, tokens2]
91
+ model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, sg=0)
92
 
93
+ # Calculate the vector representation for each sentence
94
+ vector1 = np.mean([model.wv[token] for token in tokens1], axis=0)
95
+ vector2 = np.mean([model.wv[token] for token in tokens2], axis=0)
96
+
97
+ # Calculate cosine similarity
98
+ similarity = np.dot(vector1, vector2) / (np.linalg.norm(vector1) * np.linalg.norm(vector2))
99
+ return similarity
100
 
101
 
102
+
103
+ app = FastAPI(title="embedding Inference")
104
+
105
  @app.get("/")
106
  async def index(request: Request):
107
  return {"detail": "API is Active !!"}
108
 
 
109
  @app.post("/bge_embed")
110
  async def api_bge_embed(
111
  text1: str = Body("text1", description="", embed=True),
 
132
  ret = {"similarity score": scores, "status_code": 200}
133
  return ret
134
 
135
+ @app.post("/w2v_embed")
136
+ async def api_w2v_embed(
137
+ text1: str = Body("text1", description="", embed=True),
138
+ text2: str = Body("text2", description="", embed=True),
139
+ ):
140
+
141
+ scores = calculate_similarity(text1, text2)
142
+ print(scores)
143
+ scores = scores.tolist()
144
+
145
+ ret = {"similarity score": scores, "status_code": 200}
146
+ return ret
147
+
148
 
149
 
150
 
notebooks/embeding_poc_test.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
notebooks/sentence_transformer_TSDAE.ipynb ADDED
The diff for this file is too large to render. See raw diff