Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -1,447 +1,453 @@
|
|
1 |
-
|
2 |
-
|
3 |
-
|
4 |
-
|
5 |
-
|
6 |
-
|
7 |
-
|
8 |
|
9 |
-
#
|
10 |
-
|
11 |
-
|
12 |
|
13 |
-
#
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
|
18 |
-
#
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
|
27 |
-
#
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
|
35 |
# ARABIC_STOPWORDS = {
|
36 |
# 'ูู', 'ู
ู', 'ุฅูู', 'ุนู', 'ู
ุน', 'ูุฐุง', 'ูุฐู', 'ุฐูู', 'ุชูู',
|
37 |
# 'ุงูุชู', 'ุงูุฐู', 'ู
ุง', 'ูุง', 'ุฃู', 'ุฃู', 'ููู', 'ูุฏ', 'ุญูู
', 'ูุงู',
|
38 |
# 'ูุงู', 'ูุงูุช', 'ูููู', 'ุชููู', 'ูู', 'ููุง', 'ููู
', 'ู', 'ุฃู
', 'ุฅู'
|
39 |
# }
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
#
|
51 |
-
|
52 |
-
#
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
#
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
74 |
|
75 |
-
|
76 |
-
|
77 |
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
|
91 |
-
#
|
92 |
-
|
93 |
-
|
94 |
-
|
95 |
|
96 |
-
#
|
97 |
-
|
98 |
|
99 |
-
#
|
100 |
-
|
101 |
-
|
102 |
|
103 |
-
|
104 |
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
-
|
113 |
-
|
114 |
-
|
115 |
-
|
116 |
-
|
117 |
-
#
|
118 |
-
|
119 |
-
|
120 |
-
|
121 |
-
#
|
122 |
-
|
123 |
-
|
124 |
-
|
125 |
-
|
126 |
-
|
127 |
-
|
128 |
-
|
129 |
-
#
|
130 |
-
|
131 |
-
|
132 |
-
|
133 |
-
|
134 |
-
#
|
135 |
-
|
136 |
-
|
137 |
-
|
138 |
-
|
139 |
-
#
|
140 |
-
|
141 |
-
|
142 |
-
|
143 |
|
144 |
-
|
145 |
-
|
146 |
-
|
147 |
|
148 |
-
|
149 |
-
|
150 |
-
|
151 |
|
152 |
-
#
|
153 |
-
|
154 |
-
|
155 |
|
156 |
-
|
157 |
-
#
|
158 |
-
|
159 |
-
|
160 |
-
|
161 |
-
|
162 |
-
#
|
163 |
-
|
164 |
-
|
165 |
-
|
166 |
-
|
167 |
-
#
|
168 |
-
|
169 |
-
|
170 |
-
|
171 |
|
172 |
-
|
173 |
-
|
174 |
|
175 |
-
|
176 |
-
|
177 |
-
|
178 |
-
|
179 |
|
180 |
-
#
|
181 |
-
|
182 |
-
|
183 |
-
|
184 |
|
185 |
-
#
|
186 |
-
|
187 |
-
|
188 |
-
|
189 |
-
|
190 |
-
|
191 |
|
192 |
-
|
193 |
-
|
194 |
-
|
195 |
-
|
196 |
-
|
197 |
-
|
198 |
|
199 |
-
|
200 |
|
201 |
-
|
202 |
-
|
203 |
-
|
204 |
-
|
205 |
-
|
206 |
-
|
207 |
-
|
208 |
-
|
209 |
|
210 |
-
|
211 |
-
|
212 |
-
#
|
213 |
-
|
214 |
-
|
215 |
-
|
216 |
-
|
217 |
-
|
218 |
-
|
219 |
-
#
|
220 |
-
|
221 |
-
|
222 |
|
223 |
-
#
|
224 |
-
|
225 |
|
226 |
-
#
|
227 |
-
|
228 |
-
|
229 |
-
|
230 |
-
|
231 |
-
|
232 |
-
|
233 |
|
234 |
-
#
|
235 |
-
|
236 |
-
|
237 |
-
|
238 |
-
|
239 |
|
240 |
-
|
241 |
-
|
242 |
-
|
243 |
-
|
244 |
|
245 |
-
#
|
246 |
-
|
247 |
-
|
248 |
-
|
249 |
-
|
250 |
-
|
251 |
|
252 |
-
|
253 |
|
254 |
-
#
|
255 |
-
|
256 |
-
|
257 |
-
|
258 |
|
259 |
-
|
260 |
|
261 |
-
|
262 |
-
|
263 |
-
|
264 |
-
|
265 |
-
|
266 |
-
|
267 |
-
|
268 |
-
|
269 |
-
|
270 |
-
|
271 |
-
|
272 |
-
|
273 |
|
274 |
-
|
275 |
|
276 |
-
|
277 |
-
|
278 |
-
|
279 |
-
|
280 |
-
|
281 |
-
|
282 |
-
|
283 |
-
|
284 |
|
285 |
-
|
286 |
-
|
287 |
|
288 |
|
289 |
-
import torch
|
290 |
-
import pandas as pd
|
291 |
-
from sentence_transformers import SentenceTransformer, util
|
292 |
-
import gradio as gr
|
293 |
-
import re
|
294 |
-
import numpy as np
|
295 |
-
import math
|
296 |
-
from collections import Counter
|
297 |
|
298 |
-
# Load both models
|
299 |
-
model1 = SentenceTransformer("distilbert-base-multilingual-cased")
|
300 |
-
model2 = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
|
301 |
|
302 |
-
# Load data
|
303 |
-
print("Loading data and embeddings...")
|
304 |
-
df = pd.read_csv("cleaned1.csv")
|
305 |
-
df2 = pd.read_csv("cleaned2.csv")
|
306 |
-
df3 = pd.read_csv("cleaned3.csv")
|
307 |
|
308 |
-
embeddings1 = torch.load("embeddings1_1.pt")
|
309 |
-
embeddings2 = torch.load("embeddings2_1.pt")
|
310 |
-
embeddings3 = torch.load("embeddings3_1.pt")
|
311 |
|
312 |
-
embeddings1a = torch.load("embeddings1.pt")
|
313 |
-
embeddings2a = torch.load("embeddings2.pt")
|
314 |
-
embeddings3a = torch.load("embeddings3.pt")
|
315 |
|
316 |
-
# Arabic stopwords
|
317 |
-
ARABIC_STOPWORDS = {
|
318 |
-
|
319 |
-
|
320 |
-
|
321 |
-
|
322 |
-
|
323 |
-
}
|
324 |
|
325 |
-
def arabic_word_tokenize(text):
|
326 |
-
|
327 |
-
|
328 |
-
|
329 |
|
330 |
-
# Pre-tokenize questions and compute doc frequencies
|
331 |
-
def setup_tokenization_and_freqs(questions):
|
332 |
-
|
333 |
-
|
334 |
-
|
335 |
|
336 |
-
tokenized1, doc_freqs1 = setup_tokenization_and_freqs(df["question"].values)
|
337 |
-
tokenized2, doc_freqs2 = setup_tokenization_and_freqs(df2["question"].values)
|
338 |
-
tokenized3, doc_freqs3 = setup_tokenization_and_freqs(df3["question"].values)
|
339 |
|
340 |
-
def compute_word_overlap(query, questions):
|
341 |
-
|
342 |
-
|
343 |
-
|
344 |
-
|
345 |
-
|
346 |
-
|
347 |
-
|
348 |
-
|
349 |
-
|
350 |
-
|
351 |
-
|
352 |
-
|
353 |
-
|
354 |
|
355 |
-
def lightweight_bm25_score(query_tokens, doc_tokens, doc_freqs, total_docs, k1=1.2, b=0.75):
|
356 |
-
|
357 |
-
|
358 |
-
|
359 |
-
|
360 |
-
|
361 |
-
|
362 |
-
|
363 |
-
|
364 |
-
|
365 |
-
|
366 |
-
|
367 |
|
368 |
-
def normalize_scores(scores):
|
369 |
-
|
370 |
-
|
371 |
-
|
372 |
-
|
373 |
-
def combine_scores(query, questions, tokenized, doc_freqs, emb1, emb2):
|
374 |
-
|
375 |
-
|
376 |
-
|
377 |
|
378 |
-
|
379 |
-
|
380 |
-
|
381 |
-
|
382 |
-
|
383 |
-
|
384 |
-
|
385 |
-
|
386 |
-
|
387 |
-
|
388 |
-
|
389 |
-
|
390 |
-
|
391 |
-
|
392 |
-
|
393 |
-
|
394 |
-
|
395 |
-
|
396 |
-
|
397 |
-
|
398 |
-
|
399 |
-
|
400 |
-
|
401 |
-
|
402 |
-
|
403 |
-
|
404 |
-
|
405 |
-
|
406 |
-
|
407 |
-
|
408 |
-
|
409 |
-
|
410 |
-
|
411 |
|
412 |
-
def get_top_diverse(results, links, top_k=5):
|
413 |
-
|
414 |
-
|
415 |
-
|
416 |
-
|
417 |
-
|
418 |
-
|
419 |
-
|
420 |
-
|
421 |
-
|
422 |
-
def predict(query):
|
423 |
-
|
424 |
-
|
425 |
-
|
426 |
-
|
427 |
-
|
428 |
-
|
429 |
-
|
430 |
-
|
431 |
-
|
432 |
-
|
433 |
-
|
434 |
-
|
435 |
-
|
436 |
|
437 |
-
title = "Arabic Search: Dual-Model + BM25 + Overlap"
|
438 |
-
iface = gr.Interface(
|
439 |
-
|
440 |
-
|
441 |
-
|
442 |
-
|
443 |
-
|
444 |
-
)
|
445 |
|
446 |
-
if __name__ == "__main__":
|
447 |
-
|
|
|
1 |
+
import torch
|
2 |
+
import pandas as pd
|
3 |
+
from sentence_transformers import SentenceTransformer, util
|
4 |
+
import gradio as gr
|
5 |
+
import re
|
6 |
+
from rank_bm25 import BM25Okapi
|
7 |
+
import numpy as np
|
8 |
|
9 |
+
# Load models
# NOTE(review): two multilingual encoders are kept so their cosine scores can
# be averaged in predict(); each must match the model used to build the
# corresponding .pt embedding files — verify against the embedding build script.
model = SentenceTransformer("distilbert-base-multilingual-cased")
modela = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")

# Load data (one CSV per searchable dataset; each needs a "question" column)
df = pd.read_csv("cleaned1.csv")
df2 = pd.read_csv("cleaned2.csv")
df3 = pd.read_csv("cleaned3.csv")

# Load pre-computed embeddings.
# embeddings*_1.pt pair with `model` and embeddings*.pt with `modela` in
# predict() — presumably built offline with those models; TODO confirm.
embeddings = torch.load("embeddings1_1.pt")
embeddings2 = torch.load("embeddings2_1.pt")
embeddings3 = torch.load("embeddings3_1.pt")

embeddingsa = torch.load("embeddings1.pt")
embeddingsa2 = torch.load("embeddings2.pt")
embeddingsa3 = torch.load("embeddings3.pt")

# Extract questions and links as plain numpy arrays for fast indexed access
df_questions = df["question"].values
df_links = df["link"].values
df2_questions = df2["question"].values
df2_links = df2["link"].values
df3_questions = df3["question"].values
df3_links = df3["url"].values  # NOTE(review): third dataset uses "url", not "link"
|
34 |
|
35 |
# ARABIC_STOPWORDS = {
|
36 |
# 'ูู', 'ู
ู', 'ุฅูู', 'ุนู', 'ู
ุน', 'ูุฐุง', 'ูุฐู', 'ุฐูู', 'ุชูู',
|
37 |
# 'ุงูุชู', 'ุงูุฐู', 'ู
ุง', 'ูุง', 'ุฃู', 'ุฃู', 'ููู', 'ูุฏ', 'ุญูู
', 'ูุงู',
|
38 |
# 'ูุงู', 'ูุงูุช', 'ูููู', 'ุชููู', 'ูู', 'ููุง', 'ููู
', 'ู', 'ุฃู
', 'ุฅู'
|
39 |
# }
|
40 |
+
# Arabic stopwords: common function words plus honorific/religious formulae
# that carry no retrieval signal for this corpus.
# NOTE(review): the previous literal text of this set was mojibake (UTF-8
# Arabic rendered through a single-byte codec, breaking lines mid-token);
# restored here to the intended Arabic tokens.
ARABIC_STOPWORDS = {
    'في', 'من', 'إلى', 'عن', 'مع', 'هذا', 'هذه', 'ذلك', 'تلك',
    'التي', 'الذي', 'ما', 'لا', 'أن', 'أو', 'لكن', 'قد', 'حكم', 'قال',
    'كان', 'كانت', 'يكون', 'تكون', 'كل', 'لها', 'لهم', 'و', 'أم', 'إن',
    'رضي', 'عنها', 'عنهم', 'عنه', 'عليهم', 'صلى', 'وسلم',
    'سلام', 'عليه', 'الرسول', 'النبي', 'السلام', 'حديث', 'احاديث'
}
|
47 |
+
def arabic_word_tokenize(text):
    """Tokenize Arabic text into stopword-filtered word tokens.

    Strips diacritics (tashkeel), keeps only runs of two or more
    Arabic-script characters, and drops entries in ARABIC_STOPWORDS.
    Non-string input yields an empty list.
    """
    if not isinstance(text, str):
        return []
    # Strip Arabic diacritic marks so vocalized and bare forms match.
    undiacritized = re.sub(r'[\u064B-\u065F\u0670]', '', text)
    # Keep only Arabic-script words of length >= 2.
    words = re.findall(r'[\u0600-\u06FF]{2,}', undiacritized)
    return [w for w in words if w not in ARABIC_STOPWORDS]
|
55 |
+
|
56 |
+
def prepare_bm25_corpus(questions):
    """Tokenize every question into the list-of-token-lists shape BM25Okapi expects."""
    return [arabic_word_tokenize(question) for question in questions]
|
63 |
+
|
64 |
+
# Initialize BM25 models for each dataset.
# This runs once at import/startup; tokenizing all three corpora is the
# slow part, hence the progress prints.
print("Initializing BM25 models...")
bm25_corpus1 = prepare_bm25_corpus(df_questions)
bm25_corpus2 = prepare_bm25_corpus(df2_questions)
bm25_corpus3 = prepare_bm25_corpus(df3_questions)

bm25_model1 = BM25Okapi(bm25_corpus1)
bm25_model2 = BM25Okapi(bm25_corpus2)
bm25_model3 = BM25Okapi(bm25_corpus3)
print("BM25 models initialized!")
|
74 |
+
|
75 |
+
def compute_bm25_scores(query, bm25_model):
    """Compute BM25 relevance scores for *query* against every corpus document.

    Parameters:
        query: raw query string; tokenized with arabic_word_tokenize.
        bm25_model: a fitted rank_bm25 BM25Okapi instance.

    Returns:
        numpy array of length bm25_model.corpus_size; all zeros when the
        query has no usable tokens (empty or stopword-only input).
    """
    query_tokens = arabic_word_tokenize(query)
    if not query_tokens:
        # Bug fix: rank_bm25's BM25 objects expose `corpus_size`, not a
        # `corpus` attribute — `len(bm25_model.corpus)` raised
        # AttributeError for empty/stopword-only queries.
        return np.zeros(bm25_model.corpus_size)

    scores = bm25_model.get_scores(query_tokens)
    return scores
|
83 |
|
84 |
+
def compute_word_overlap(query, questions):
    """Score each question by lexical overlap with *query*.

    Each score blends query coverage (weight 0.7) with Jaccard similarity
    (weight 0.3). An empty token set on either side scores 0.0.
    """
    query_tokens = set(arabic_word_tokenize(query))
    if len(query_tokens) == 0:
        return [0.0] * len(questions)

    scores = []
    for question in questions:
        doc_tokens = set(arabic_word_tokenize(question))
        if len(doc_tokens) == 0:
            scores.append(0.0)
            continue

        shared = len(query_tokens & doc_tokens)
        combined = len(query_tokens | doc_tokens)

        # Jaccard: intersection over union.
        jaccard = shared / combined if combined > 0 else 0.0
        # Coverage: fraction of the query that the document matches.
        coverage = shared / len(query_tokens)

        # Prioritize coverage, but keep some similarity signal.
        scores.append(0.7 * coverage + 0.3 * jaccard)

    return scores
|
110 |
|
111 |
+
def normalize_scores(scores):
    """Min-max scale *scores* into [0, 1]; a constant input maps to all zeros."""
    arr = np.array(scores)
    lo = np.min(arr)
    hi = np.max(arr)
    if hi == lo:
        # Degenerate case: no spread to normalize over.
        return np.zeros_like(arr)
    return (arr - lo) / (hi - lo)
|
117 |
+
|
118 |
+
def predict(text):
    """Search all three datasets for *text* and return ranked, diverse results.

    Combines three signals per dataset — dual-model semantic cosine
    similarity, BM25, and lexical word overlap — with weights adapted to
    query length, then returns the top results plus the weighting used.

    Returns a dict with keys "top1"/"top2"/"top3" (result lists) and
    "query_info"; returns a plain error string for empty input.
    """
    print(f"Received query: {text}")
    if not text or text.strip() == "":
        return "No query provided"

    # Semantic similarity scores: encode the query once per model.
    query_embedding = model.encode(text, convert_to_tensor=True)
    query_embeddinga = modela.encode(text, convert_to_tensor=True)

    # Cosine similarities (averaged from two models).
    # NOTE(review): assumes embeddings*_1.pt were built with `model` and
    # embeddings*.pt with `modela` — confirm against the build script.
    sim_scores1 = (util.pytorch_cos_sim(query_embedding, embeddings)[0] +
                   util.pytorch_cos_sim(query_embeddinga, embeddingsa)[0]) / 2
    sim_scores2 = (util.pytorch_cos_sim(query_embedding, embeddings2)[0] +
                   util.pytorch_cos_sim(query_embeddinga, embeddingsa2)[0]) / 2
    sim_scores3 = (util.pytorch_cos_sim(query_embedding, embeddings3)[0] +
                   util.pytorch_cos_sim(query_embeddinga, embeddingsa3)[0]) / 2

    # BM25 scores (one array per dataset)
    bm25_scores1 = compute_bm25_scores(text, bm25_model1)
    bm25_scores2 = compute_bm25_scores(text, bm25_model2)
    bm25_scores3 = compute_bm25_scores(text, bm25_model3)

    # Word overlap scores (coverage + Jaccard blend)
    word_overlap1 = compute_word_overlap(text, df_questions)
    word_overlap2 = compute_word_overlap(text, df2_questions)
    word_overlap3 = compute_word_overlap(text, df3_questions)

    # Normalize all scores to [0, 1] so the three signals combine fairly.
    norm_sim1 = normalize_scores(sim_scores1.cpu().numpy())
    norm_sim2 = normalize_scores(sim_scores2.cpu().numpy())
    norm_sim3 = normalize_scores(sim_scores3.cpu().numpy())

    norm_bm25_1 = normalize_scores(bm25_scores1)
    norm_bm25_2 = normalize_scores(bm25_scores2)
    norm_bm25_3 = normalize_scores(bm25_scores3)

    norm_word1 = normalize_scores(word_overlap1)
    norm_word2 = normalize_scores(word_overlap2)
    norm_word3 = normalize_scores(word_overlap3)

    # Adaptive weighting based on query characteristics (token count after
    # stopword removal).
    query_words = arabic_word_tokenize(text)
    query_length = len(query_words)

    if query_length <= 2:
        # Short queries: prioritize exact matches (BM25 + word overlap)
        semantic_weight = 0.3
        bm25_weight = 0.4
        word_weight = 0.3
    elif query_length <= 5:
        # Medium queries: balanced approach
        semantic_weight = 0.4
        bm25_weight = 0.35
        word_weight = 0.25
    else:
        # Long queries: prioritize semantic understanding
        semantic_weight = 0.5
        bm25_weight = 0.3
        word_weight = 0.2

    def create_combined_results(questions, links, norm_semantic, norm_bm25, norm_word):
        # Build one result dict per document, scoring with the adaptive
        # weights captured from the enclosing scope.
        combined_results = []

        for i in range(len(questions)):
            semantic_score = float(norm_semantic[i])
            bm25_score = float(norm_bm25[i])
            word_score = float(norm_word[i])

            # Weighted linear blend of the three normalized signals.
            combined_score = (semantic_weight * semantic_score +
                              bm25_weight * bm25_score +
                              word_weight * word_score)

            # Boost results that perform well across multiple metrics.
            high_performance_count = sum([
                semantic_score > 0.7,
                bm25_score > 0.7,
                word_score > 0.5
            ])

            if high_performance_count >= 2:
                boost = 0.1
            elif high_performance_count >= 1:
                boost = 0.05
            else:
                boost = 0.0

            final_score = combined_score + boost

            combined_results.append({
                "question": questions[i],
                "link": links[i],
                "semantic_score": semantic_score,
                "bm25_score": bm25_score,
                "word_overlap_score": word_score,
                "combined_score": final_score
            })

        return combined_results

    # Create combined results for all datasets
    combined1 = create_combined_results(df_questions, df_links, norm_sim1, norm_bm25_1, norm_word1)
    combined2 = create_combined_results(df2_questions, df2_links, norm_sim2, norm_bm25_2, norm_word2)
    combined3 = create_combined_results(df3_questions, df3_links, norm_sim3, norm_bm25_3, norm_word3)

    def get_diverse_top_results(combined_results, top_k=5):
        """Get diverse top results using multiple ranking strategies:
        top-3 by combined score, then the best BM25 and best semantic
        results not already picked, capped at top_k."""
        # Sort by combined score and get top candidates
        by_combined = sorted(combined_results, key=lambda x: x["combined_score"], reverse=True)
        top_combined = by_combined[:3]

        # Get questions from top combined to avoid duplicates
        used_questions = {item["question"] for item in top_combined}

        # Add best BM25 result not already included
        by_bm25 = sorted(combined_results, key=lambda x: x["bm25_score"], reverse=True)
        bm25_pick = None
        for item in by_bm25:
            if item["question"] not in used_questions:
                bm25_pick = item
                break

        # Add best semantic result not already included
        by_semantic = sorted(combined_results, key=lambda x: x["semantic_score"], reverse=True)
        semantic_pick = None
        if bm25_pick:
            used_questions.add(bm25_pick["question"])

        for item in by_semantic:
            if item["question"] not in used_questions:
                semantic_pick = item
                break

        # Combine results
        final_results = top_combined.copy()
        if bm25_pick:
            final_results.append(bm25_pick)
        if semantic_pick:
            final_results.append(semantic_pick)

        return final_results[:top_k]

    # Get top results for each dataset
    top1 = get_diverse_top_results(combined1)
    top2 = get_diverse_top_results(combined2)
    top3 = get_diverse_top_results(combined3)

    results = {
        "top2": top2,
        "top3": top3,
        "top1": top1,
        "query_info": {
            "query_length": query_length,
            "weights": {
                "semantic": semantic_weight,
                "bm25": bm25_weight,
                "word_overlap": word_weight
            }
        }
    }

    return results
|
281 |
|
282 |
+
# Gradio UI: a single free-text query box; predict() returns a JSON payload
# with the ranked results for all three datasets.
title = "Enhanced Search with BM25"
iface = gr.Interface(
    fn=predict,
    inputs=[gr.Textbox(label="Search Query", lines=3)],
    outputs='json',
    title=title,
    description="Arabic text search using combined semantic similarity, BM25, and word overlap scoring"
)
|
290 |
|
291 |
+
# Launch the Gradio server only when run as a script (Spaces entry point).
if __name__ == "__main__":
    iface.launch()
|
293 |
|
294 |
|
295 |
+
# import torch
|
296 |
+
# import pandas as pd
|
297 |
+
# from sentence_transformers import SentenceTransformer, util
|
298 |
+
# import gradio as gr
|
299 |
+
# import re
|
300 |
+
# import numpy as np
|
301 |
+
# import math
|
302 |
+
# from collections import Counter
|
303 |
|
304 |
+
# # Load both models
|
305 |
+
# model1 = SentenceTransformer("distilbert-base-multilingual-cased")
|
306 |
+
# model2 = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
|
307 |
|
308 |
+
# # Load data
|
309 |
+
# print("Loading data and embeddings...")
|
310 |
+
# df = pd.read_csv("cleaned1.csv")
|
311 |
+
# df2 = pd.read_csv("cleaned2.csv")
|
312 |
+
# df3 = pd.read_csv("cleaned3.csv")
|
313 |
|
314 |
+
# embeddings1 = torch.load("embeddings1_1.pt")
|
315 |
+
# embeddings2 = torch.load("embeddings2_1.pt")
|
316 |
+
# embeddings3 = torch.load("embeddings3_1.pt")
|
317 |
|
318 |
+
# embeddings1a = torch.load("embeddings1.pt")
|
319 |
+
# embeddings2a = torch.load("embeddings2.pt")
|
320 |
+
# embeddings3a = torch.load("embeddings3.pt")
|
321 |
|
322 |
+
# # Arabic stopwords
|
323 |
+
# ARABIC_STOPWORDS = {
|
324 |
+
# 'ูู', 'ู
ู', 'ุฅูู', 'ุนู', 'ู
ุน', 'ูุฐุง', 'ูุฐู', 'ุฐูู', 'ุชูู',
|
325 |
+
# 'ุงูุชู', 'ุงูุฐู', 'ู
ุง', 'ูุง', 'ุฃู', 'ุฃู', 'ููู', 'ูุฏ', 'ุญูู
', 'ูุงู',
|
326 |
+
# 'ูุงู', 'ูุงูุช', 'ูููู', 'ุชููู', 'ูู', 'ููุง', 'ููู
', 'ู', 'ุฃู
', 'ุฅู',
|
327 |
+
# 'ุฑุถู', 'ุนูููุง', 'ุนููู
', 'ุนูู', 'ุนูููู
', 'ุตูู', 'ูุณูู
',
|
328 |
+
# 'ุณูุงู
', 'ุนููู', 'ุงูุฑุณูู', 'ุงููุจู', 'ุนููู', 'ุงูุณูุงู
', 'ุญุฏูุซ', 'ุงุญุงุฏูุซ'
|
329 |
+
# }
|
330 |
|
331 |
+
# def arabic_word_tokenize(text):
|
332 |
+
# if not isinstance(text, str): return []
|
333 |
+
# text = re.sub(r'[\u064B-\u065F\u0670]', '', text)
|
334 |
+
# return [t for t in re.findall(r'[\u0600-\u06FF]{2,}', text) if t not in ARABIC_STOPWORDS]
|
335 |
|
336 |
+
# # Pre-tokenize questions and compute doc frequencies
|
337 |
+
# def setup_tokenization_and_freqs(questions):
|
338 |
+
# tokenized = [arabic_word_tokenize(q) for q in questions]
|
339 |
+
# doc_freqs = Counter(word for doc in tokenized for word in set(doc))
|
340 |
+
# return tokenized, doc_freqs
|
341 |
|
342 |
+
# tokenized1, doc_freqs1 = setup_tokenization_and_freqs(df["question"].values)
|
343 |
+
# tokenized2, doc_freqs2 = setup_tokenization_and_freqs(df2["question"].values)
|
344 |
+
# tokenized3, doc_freqs3 = setup_tokenization_and_freqs(df3["question"].values)
|
345 |
|
346 |
+
# def compute_word_overlap(query, questions):
|
347 |
+
# q_words = set(arabic_word_tokenize(query))
|
348 |
+
# scores = []
|
349 |
+
# for doc in questions:
|
350 |
+
# d_words = set(arabic_word_tokenize(doc))
|
351 |
+
# if not d_words or not q_words:
|
352 |
+
# scores.append(0.0)
|
353 |
+
# continue
|
354 |
+
# inter = len(q_words & d_words)
|
355 |
+
# union = len(q_words | d_words)
|
356 |
+
# jaccard = inter / union if union else 0.0
|
357 |
+
# coverage = inter / len(q_words)
|
358 |
+
# scores.append(0.7 * coverage + 0.3 * jaccard)
|
359 |
+
# return scores
|
360 |
|
361 |
+
# def lightweight_bm25_score(query_tokens, doc_tokens, doc_freqs, total_docs, k1=1.2, b=0.75):
|
362 |
+
# score = 0.0
|
363 |
+
# doc_len = len(doc_tokens)
|
364 |
+
# avg_doc_len = 10
|
365 |
+
# for term in query_tokens:
|
366 |
+
# if term in doc_tokens:
|
367 |
+
# tf = doc_tokens.count(term)
|
368 |
+
# df = doc_freqs.get(term, 0)
|
369 |
+
# if df > 0:
|
370 |
+
# idf = math.log((total_docs - df + 0.5) / (df + 0.5))
|
371 |
+
# score += idf * (tf * (k1 + 1)) / (tf + k1 * (1 - b + b * (doc_len / avg_doc_len)))
|
372 |
+
# return score
|
373 |
|
374 |
+
# def normalize_scores(scores):
|
375 |
+
# arr = np.array(scores)
|
376 |
+
# if arr.max() == arr.min(): return np.zeros_like(arr)
|
377 |
+
# return (arr - arr.min()) / (arr.max() - arr.min())
|
378 |
+
|
379 |
+
# def combine_scores(query, questions, tokenized, doc_freqs, emb1, emb2):
|
380 |
+
# total_docs = len(questions)
|
381 |
+
# q_emb1 = model1.encode(query, convert_to_tensor=True)
|
382 |
+
# q_emb2 = model2.encode(query, convert_to_tensor=True)
|
383 |
|
384 |
+
# sim1 = util.pytorch_cos_sim(q_emb1, emb1)[0]
|
385 |
+
# sim2 = util.pytorch_cos_sim(q_emb2, emb2)[0]
|
386 |
+
# sim_scores = ((sim1 + sim2) / 2).cpu().numpy()
|
387 |
+
|
388 |
+
# bm25_scores = [lightweight_bm25_score(arabic_word_tokenize(query), doc_tokens, doc_freqs, total_docs)
|
389 |
+
# for doc_tokens in tokenized]
|
390 |
+
# word_scores = compute_word_overlap(query, questions)
|
391 |
+
|
392 |
+
# norm_bm25 = normalize_scores(bm25_scores)
|
393 |
+
# norm_word = normalize_scores(word_scores)
|
394 |
+
# norm_sim = normalize_scores(sim_scores)
|
395 |
+
|
396 |
+
# query_len = len(arabic_word_tokenize(query))
|
397 |
+
# if query_len <= 2:
|
398 |
+
# w_sem, w_bm, w_word = 0.3, 0.4, 0.3
|
399 |
+
# elif query_len <= 5:
|
400 |
+
# w_sem, w_bm, w_word = 0.4, 0.35, 0.25
|
401 |
+
# else:
|
402 |
+
# w_sem, w_bm, w_word = 0.5, 0.3, 0.2
|
403 |
+
|
404 |
+
# results = []
|
405 |
+
# for i, q in enumerate(questions):
|
406 |
+
# sem, bm, word = norm_sim[i], norm_bm25[i], norm_word[i]
|
407 |
+
# combined = w_sem*sem + w_bm*bm + w_word*word
|
408 |
+
# boost = 0.1 if sum([sem > 0.7, bm > 0.7, word > 0.5]) >= 2 else (0.05 if sum([sem > 0.7, bm > 0.7, word > 0.5]) == 1 else 0.0)
|
409 |
+
# results.append({
|
410 |
+
# "question": q,
|
411 |
+
# "semantic_score": sem,
|
412 |
+
# "bm25_score": bm,
|
413 |
+
# "word_overlap_score": word,
|
414 |
+
# "combined_score": combined + boost
|
415 |
+
# })
|
416 |
+
# return results
|
417 |
|
418 |
+
# def get_top_diverse(results, links, top_k=5):
|
419 |
+
# results = [dict(r, link=links[i]) for i, r in enumerate(results)]
|
420 |
+
# top_combined = sorted(results, key=lambda x: x['combined_score'], reverse=True)[:3]
|
421 |
+
# used_q = {r['question'] for r in top_combined}
|
422 |
+
# top_bm = next((r for r in sorted(results, key=lambda x: x['bm25_score'], reverse=True) if r['question'] not in used_q), None)
|
423 |
+
# if top_bm: used_q.add(top_bm['question'])
|
424 |
+
# top_sem = next((r for r in sorted(results, key=lambda x: x['semantic_score'], reverse=True) if r['question'] not in used_q), None)
|
425 |
+
# final = top_combined + ([top_bm] if top_bm else []) + ([top_sem] if top_sem else [])
|
426 |
+
# return final[:top_k]
|
427 |
+
|
428 |
+
# def predict(query):
|
429 |
+
# print(f"Query: {query}")
|
430 |
+
# results1 = combine_scores(query, df["question"].values, tokenized1, doc_freqs1, embeddings1, embeddings1a)
|
431 |
+
# results2 = combine_scores(query, df2["question"].values, tokenized2, doc_freqs2, embeddings2, embeddings2a)
|
432 |
+
# results3 = combine_scores(query, df3["question"].values, tokenized3, doc_freqs3, embeddings3, embeddings3a)
|
433 |
+
|
434 |
+
# return {
|
435 |
+
# "top2": get_top_diverse(results2, df2["link"].values),
|
436 |
+
# "top3": get_top_diverse(results3, df3["url"].values),
|
437 |
+
# "top1": get_top_diverse(results1, df["link"].values),
|
438 |
+
# "query_info": {
|
439 |
+
# "query_length": len(arabic_word_tokenize(query))
|
440 |
+
# }
|
441 |
+
# }
|
442 |
|
443 |
+
# title = "Arabic Search: Dual-Model + BM25 + Overlap"
|
444 |
+
# iface = gr.Interface(
|
445 |
+
# fn=predict,
|
446 |
+
# inputs=[gr.Textbox(label="Search Query", lines=3)],
|
447 |
+
# outputs="json",
|
448 |
+
# title=title,
|
449 |
+
# description="Accurate Arabic search using two semantic models, fast BM25, and word overlap."
|
450 |
+
# )
|
451 |
|
452 |
+
# if __name__ == "__main__":
|
453 |
+
# iface.launch()
|