Update app.py
app.py CHANGED
@@ -8,7 +8,7 @@ SAMPLE_SIZE = int(os.getenv("SAMPLE_SIZE", "5000"))
 RANDOM_STATE = 42
 DEFAULT_INPUT = "I am so happy with this product"
 
-# --------
+# -------- Helpers --------
 def clean_text(text: str) -> str:
     text = text.lower()
     text = re.sub(r"http\S+", "", text)
@@ -18,6 +18,9 @@ def clean_text(text: str) -> str:
     text = re.sub(r"\s+", " ", text).strip()
     return text
 
+def _l2norm(x: np.ndarray) -> np.ndarray:
+    return x / (np.linalg.norm(x, axis=1, keepdims=True) + 1e-12)
+
 # -------- Load sample data once --------
 @functools.lru_cache(maxsize=1)
 def load_sample_df():
@@ -57,9 +60,8 @@ def ensure_corpus_embeddings(model_name: str, texts: list[str]):
         return _CORPUS_CACHE[model_name]
     model_id = EMBEDDERS[model_name]
     model = load_sentence_model(model_id)
-    emb = model.encode(
-
-    )
+    emb = model.encode(texts, show_progress_bar=False, convert_to_numpy=True)
+    emb = _l2norm(emb)
     _CORPUS_CACHE[model_name] = emb
     return emb
 
@@ -71,7 +73,8 @@ def top3_for_each_model(user_input: str, selected_models: list[str]):
         try:
             model = load_sentence_model(EMBEDDERS[name])
             corpus_emb = ensure_corpus_embeddings(name, texts)
-            q = model.encode([clean_text(user_input)], show_progress_bar=False,
+            q = model.encode([clean_text(user_input)], show_progress_bar=False, convert_to_numpy=True)
+            q = _l2norm(q)
             sims = cosine_similarity(q, corpus_emb)[0]
             top_idx = sims.argsort()[-3:][::-1]
             for rank, i in enumerate(top_idx, start=1):
@@ -84,7 +87,7 @@ def top3_for_each_model(user_input: str, selected_models: list[str]):
                 })
         except Exception as e:
             rows.append({
-                "Model": name, "Rank": "-", "Similarity": "-",
+                "Model": name, "Rank": "-", "Similarity": "-",
                 "Tweet (clean)": f"[Error: {e}]", "Tweet (orig)": ""
             })
     out = pd.DataFrame(rows, columns=["Model","Rank","Similarity","Tweet (clean)","Tweet (orig)"])
@@ -104,8 +107,8 @@ def generate_and_pick_best(prompt: str, n_sequences: int, max_length: int, tempe
 
     scorer_id = EMBEDDERS[scorer_model_name]
    scorer = load_sentence_model(scorer_id)
-    q = scorer.encode([prompt], show_progress_bar=False,
-    cand_vecs = scorer.encode(candidates, show_progress_bar=False,
+    q = scorer.encode([prompt], show_progress_bar=False, convert_to_numpy=True); q = _l2norm(q)
+    cand_vecs = scorer.encode(candidates, show_progress_bar=False, convert_to_numpy=True); cand_vecs = _l2norm(cand_vecs)
     sims = cosine_similarity(q, cand_vecs)[0]
     best_idx = int(sims.argmax())
     table = pd.DataFrame({
@@ -135,7 +138,7 @@ Type a tweet, get similar tweets from Sentiment140, and generate a new one.
     )
 
     run_btn = gr.Button("🔎 Find Top‑3 Similar Tweets")
-    table_out = gr.Dataframe(interactive=False)
+    table_out = gr.Dataframe(interactive=False)
 
     run_btn.click(top3_for_each_model, inputs=[test_input, models], outputs=table_out)
 
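For reference, a minimal standalone sketch (not part of this commit) of the property the new _l2norm helper relies on: after row-wise unit normalization, cosine_similarity gives the same scores as a plain dot product, and the small epsilon avoids division by zero for an all-zero embedding. The random vectors and the 384-dimensional shape below are placeholders, not values taken from app.py.

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def _l2norm(x: np.ndarray) -> np.ndarray:
    # Same helper as in the diff: divide each row by its L2 norm (epsilon guards against /0).
    return x / (np.linalg.norm(x, axis=1, keepdims=True) + 1e-12)

rng = np.random.default_rng(0)
q = _l2norm(rng.normal(size=(1, 384)))         # stand-in for an encoded query
corpus = _l2norm(rng.normal(size=(100, 384)))  # stand-in for cached corpus embeddings

sims = cosine_similarity(q, corpus)[0]
dots = (q @ corpus.T)[0]
assert np.allclose(sims, dots)                 # cosine equals dot product after _l2norm
top3 = sims.argsort()[-3:][::-1]               # same top-3 selection used in app.py
print(top3, sims[top3])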