mohbay committed on
Commit
00b3d0f
·
verified ·
1 Parent(s): 3dafe6c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +393 -387
app.py CHANGED
@@ -1,447 +1,453 @@
1
- # import torch
2
- # import pandas as pd
3
- # from sentence_transformers import SentenceTransformer, util
4
- # import gradio as gr
5
- # import re
6
- # from rank_bm25 import BM25Okapi
7
- # import numpy as np
8
 
9
- # # Load models
10
- # model = SentenceTransformer("distilbert-base-multilingual-cased")
11
- # modela = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
12
 
13
- # # Load data
14
- # df = pd.read_csv("cleaned1.csv")
15
- # df2 = pd.read_csv("cleaned2.csv")
16
- # df3 = pd.read_csv("cleaned3.csv")
17
 
18
- # # Load pre-computed embeddings
19
- # embeddings = torch.load("embeddings1_1.pt")
20
- # embeddings2 = torch.load("embeddings2_1.pt")
21
- # embeddings3 = torch.load("embeddings3_1.pt")
22
 
23
- # embeddingsa = torch.load("embeddings1.pt")
24
- # embeddingsa2 = torch.load("embeddings2.pt")
25
- # embeddingsa3 = torch.load("embeddings3.pt")
26
 
27
- # # Extract questions and links
28
- # df_questions = df["question"].values
29
- # df_links = df["link"].values
30
- # df2_questions = df2["question"].values
31
- # df2_links = df2["link"].values
32
- # df3_questions = df3["question"].values
33
- # df3_links = df3["url"].values
34
 
35
  # ARABIC_STOPWORDS = {
36
  # 'في', 'من', 'إلى', 'عن', 'مع', 'هذا', 'هذه', 'ذلك', 'تلك',
37
  # 'التي', 'الذي', 'ما', 'لا', 'أن', 'أو', 'لكن', 'قد', 'حكم', 'قال',
38
  # 'كان', 'كانت', 'يكون', 'تكون', 'له', 'لها', 'لهم', 'و', 'أم', 'إن'
39
  # }
40
-
41
- # def arabic_word_tokenize(text):
42
- # if not isinstance(text, str):
43
- # return []
44
- # # Remove diacritics
45
- # text = re.sub(r'[\u064B-\u065F\u0670]', '', text)
46
- # # Extract only Arabic words (length ≥ 2)
47
- # tokens = re.findall(r'[\u0600-\u06FF]{2,}', text)
48
- # return [t for t in tokens if t not in ARABIC_STOPWORDS]
49
-
50
- # def prepare_bm25_corpus(questions):
51
- # """Prepare tokenized corpus for BM25"""
52
- # tokenized_corpus = []
53
- # for question in questions:
54
- # tokens = arabic_word_tokenize(question)
55
- # tokenized_corpus.append(tokens)
56
- # return tokenized_corpus
57
-
58
- # # Initialize BM25 models for each dataset
59
- # print("Initializing BM25 models...")
60
- # bm25_corpus1 = prepare_bm25_corpus(df_questions)
61
- # bm25_corpus2 = prepare_bm25_corpus(df2_questions)
62
- # bm25_corpus3 = prepare_bm25_corpus(df3_questions)
63
-
64
- # bm25_model1 = BM25Okapi(bm25_corpus1)
65
- # bm25_model2 = BM25Okapi(bm25_corpus2)
66
- # bm25_model3 = BM25Okapi(bm25_corpus3)
67
- # print("BM25 models initialized!")
68
-
69
- # def compute_bm25_scores(query, bm25_model):
70
- # """Compute BM25 scores for a query"""
71
- # query_tokens = arabic_word_tokenize(query)
72
- # if not query_tokens:
73
- # return np.zeros(len(bm25_model.corpus))
 
 
 
 
 
 
74
 
75
- # scores = bm25_model.get_scores(query_tokens)
76
- # return scores
77
 
78
- # def compute_word_overlap(query, questions):
79
- # """Enhanced word overlap computation"""
80
- # query_words = set(arabic_word_tokenize(query))
81
- # if len(query_words) == 0:
82
- # return [0.0] * len(questions)
83
 
84
- # overlaps = []
85
- # for q in questions:
86
- # q_words = set(arabic_word_tokenize(q))
87
- # if len(q_words) == 0:
88
- # overlaps.append(0.0)
89
- # continue
90
 
91
- # # Use Jaccard similarity (intersection over union)
92
- # intersection = len(query_words & q_words)
93
- # union = len(query_words | q_words)
94
- # jaccard = intersection / union if union > 0 else 0.0
95
 
96
- # # Also compute coverage (how much of query is matched)
97
- # coverage = intersection / len(query_words)
98
 
99
- # # Combine both: prioritize coverage but consider similarity
100
- # overlap_score = 0.7 * coverage + 0.3 * jaccard
101
- # overlaps.append(overlap_score)
102
 
103
- # return overlaps
104
 
105
- # def normalize_scores(scores):
106
- # """Normalize scores to 0-1 range"""
107
- # scores = np.array(scores)
108
- # if np.max(scores) == np.min(scores):
109
- # return np.zeros_like(scores)
110
- # return (scores - np.min(scores)) / (np.max(scores) - np.min(scores))
111
-
112
- # def predict(text):
113
- # print(f"Received query: {text}")
114
- # if not text or text.strip() == "":
115
- # return "No query provided"
116
-
117
- # # Semantic similarity scores
118
- # query_embedding = model.encode(text, convert_to_tensor=True)
119
- # query_embeddinga = modela.encode(text, convert_to_tensor=True)
120
-
121
- # # Cosine similarities (averaged from two models)
122
- # sim_scores1 = (util.pytorch_cos_sim(query_embedding, embeddings)[0] +
123
- # util.pytorch_cos_sim(query_embeddinga, embeddingsa)[0]) / 2
124
- # sim_scores2 = (util.pytorch_cos_sim(query_embedding, embeddings2)[0] +
125
- # util.pytorch_cos_sim(query_embeddinga, embeddingsa2)[0]) / 2
126
- # sim_scores3 = (util.pytorch_cos_sim(query_embedding, embeddings3)[0] +
127
- # util.pytorch_cos_sim(query_embeddinga, embeddingsa3)[0]) / 2
128
-
129
- # # BM25 scores
130
- # bm25_scores1 = compute_bm25_scores(text, bm25_model1)
131
- # bm25_scores2 = compute_bm25_scores(text, bm25_model2)
132
- # bm25_scores3 = compute_bm25_scores(text, bm25_model3)
133
-
134
- # # Word overlap scores
135
- # word_overlap1 = compute_word_overlap(text, df_questions)
136
- # word_overlap2 = compute_word_overlap(text, df2_questions)
137
- # word_overlap3 = compute_word_overlap(text, df3_questions)
138
-
139
- # # Normalize all scores for fair combination
140
- # norm_sim1 = normalize_scores(sim_scores1.cpu().numpy())
141
- # norm_sim2 = normalize_scores(sim_scores2.cpu().numpy())
142
- # norm_sim3 = normalize_scores(sim_scores3.cpu().numpy())
143
 
144
- # norm_bm25_1 = normalize_scores(bm25_scores1)
145
- # norm_bm25_2 = normalize_scores(bm25_scores2)
146
- # norm_bm25_3 = normalize_scores(bm25_scores3)
147
 
148
- # norm_word1 = normalize_scores(word_overlap1)
149
- # norm_word2 = normalize_scores(word_overlap2)
150
- # norm_word3 = normalize_scores(word_overlap3)
151
 
152
- # # Adaptive weighting based on query characteristics
153
- # query_words = arabic_word_tokenize(text)
154
- # query_length = len(query_words)
155
 
156
- # if query_length <= 2:
157
- # # Short queries: prioritize exact matches (BM25 + word overlap)
158
- # semantic_weight = 0.3
159
- # bm25_weight = 0.4
160
- # word_weight = 0.3
161
- # elif query_length <= 5:
162
- # # Medium queries: balanced approach
163
- # semantic_weight = 0.4
164
- # bm25_weight = 0.35
165
- # word_weight = 0.25
166
- # else:
167
- # # Long queries: prioritize semantic understanding
168
- # semantic_weight = 0.5
169
- # bm25_weight = 0.3
170
- # word_weight = 0.2
171
 
172
- # def create_combined_results(questions, links, norm_semantic, norm_bm25, norm_word):
173
- # combined_results = []
174
 
175
- # for i in range(len(questions)):
176
- # semantic_score = float(norm_semantic[i])
177
- # bm25_score = float(norm_bm25[i])
178
- # word_score = float(norm_word[i])
179
 
180
- # # Enhanced scoring with BM25
181
- # combined_score = (semantic_weight * semantic_score +
182
- # bm25_weight * bm25_score +
183
- # word_weight * word_score)
184
 
185
- # # Boost results that perform well across multiple metrics
186
- # high_performance_count = sum([
187
- # semantic_score > 0.7,
188
- # bm25_score > 0.7,
189
- # word_score > 0.5
190
- # ])
191
 
192
- # if high_performance_count >= 2:
193
- # boost = 0.1
194
- # elif high_performance_count >= 1:
195
- # boost = 0.05
196
- # else:
197
- # boost = 0.0
198
 
199
- # final_score = combined_score + boost
200
 
201
- # combined_results.append({
202
- # "question": questions[i],
203
- # "link": links[i],
204
- # "semantic_score": semantic_score,
205
- # "bm25_score": bm25_score,
206
- # "word_overlap_score": word_score,
207
- # "combined_score": final_score
208
- # })
209
 
210
- # return combined_results
211
-
212
- # # Create combined results for all datasets
213
- # combined1 = create_combined_results(df_questions, df_links, norm_sim1, norm_bm25_1, norm_word1)
214
- # combined2 = create_combined_results(df2_questions, df2_links, norm_sim2, norm_bm25_2, norm_word2)
215
- # combined3 = create_combined_results(df3_questions, df3_links, norm_sim3, norm_bm25_3, norm_word3)
216
-
217
- # def get_diverse_top_results(combined_results, top_k=5):
218
- # """Get diverse top results using multiple ranking strategies"""
219
- # # Sort by combined score and get top candidates
220
- # by_combined = sorted(combined_results, key=lambda x: x["combined_score"], reverse=True)
221
- # top_combined = by_combined[:3]
222
 
223
- # # Get questions from top combined to avoid duplicates
224
- # used_questions = {item["question"] for item in top_combined}
225
 
226
- # # Add best BM25 result not already included
227
- # by_bm25 = sorted(combined_results, key=lambda x: x["bm25_score"], reverse=True)
228
- # bm25_pick = None
229
- # for item in by_bm25:
230
- # if item["question"] not in used_questions:
231
- # bm25_pick = item
232
- # break
233
 
234
- # # Add best semantic result not already included
235
- # by_semantic = sorted(combined_results, key=lambda x: x["semantic_score"], reverse=True)
236
- # semantic_pick = None
237
- # if bm25_pick:
238
- # used_questions.add(bm25_pick["question"])
239
 
240
- # for item in by_semantic:
241
- # if item["question"] not in used_questions:
242
- # semantic_pick = item
243
- # break
244
 
245
- # # Combine results
246
- # final_results = top_combined.copy()
247
- # if bm25_pick:
248
- # final_results.append(bm25_pick)
249
- # if semantic_pick:
250
- # final_results.append(semantic_pick)
251
 
252
- # return final_results[:top_k]
253
 
254
- # # Get top results for each dataset
255
- # top1 = get_diverse_top_results(combined1)
256
- # top2 = get_diverse_top_results(combined2)
257
- # top3 = get_diverse_top_results(combined3)
258
 
259
- # results = {
260
 
261
- # "top2": top2,
262
- # "top3": top3,
263
- # "top1": top1,
264
- # "query_info": {
265
- # "query_length": query_length,
266
- # "weights": {
267
- # "semantic": semantic_weight,
268
- # "bm25": bm25_weight,
269
- # "word_overlap": word_weight
270
- # }
271
- # }
272
- # }
273
 
274
- # return results
275
 
276
- # title = "Enhanced Search with BM25"
277
- # iface = gr.Interface(
278
- # fn=predict,
279
- # inputs=[gr.Textbox(label="Search Query", lines=3)],
280
- # outputs='json',
281
- # title=title,
282
- # description="Arabic text search using combined semantic similarity, BM25, and word overlap scoring"
283
- # )
284
 
285
- # if __name__ == "__main__":
286
- # iface.launch()
287
 
288
 
289
- import torch
290
- import pandas as pd
291
- from sentence_transformers import SentenceTransformer, util
292
- import gradio as gr
293
- import re
294
- import numpy as np
295
- import math
296
- from collections import Counter
297
 
298
- # Load both models
299
- model1 = SentenceTransformer("distilbert-base-multilingual-cased")
300
- model2 = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
301
 
302
- # Load data
303
- print("Loading data and embeddings...")
304
- df = pd.read_csv("cleaned1.csv")
305
- df2 = pd.read_csv("cleaned2.csv")
306
- df3 = pd.read_csv("cleaned3.csv")
307
 
308
- embeddings1 = torch.load("embeddings1_1.pt")
309
- embeddings2 = torch.load("embeddings2_1.pt")
310
- embeddings3 = torch.load("embeddings3_1.pt")
311
 
312
- embeddings1a = torch.load("embeddings1.pt")
313
- embeddings2a = torch.load("embeddings2.pt")
314
- embeddings3a = torch.load("embeddings3.pt")
315
 
316
- # Arabic stopwords
317
- ARABIC_STOPWORDS = {
318
- 'في', 'من', 'إلى', 'عن', 'مع', 'هذا', 'هذه', 'ذلك', 'تلك',
319
- 'التي', 'الذي', 'ما', 'لا', 'أن', 'أو', 'لكن', 'قد', 'حكم', 'قال',
320
- 'كان', 'كانت', 'يكون', 'تكون', 'له', 'لها', 'لهم', 'و', 'أم', 'إن',
321
- 'رضي', 'عليها', 'عنهم', 'عنه', 'عليهم', 'صلى', 'وسلم',
322
- 'سلام', 'عليه', 'الرسول', 'النبي', 'عليه', 'السلام', 'حديث', 'احاديث'
323
- }
324
 
325
- def arabic_word_tokenize(text):
326
- if not isinstance(text, str): return []
327
- text = re.sub(r'[\u064B-\u065F\u0670]', '', text)
328
- return [t for t in re.findall(r'[\u0600-\u06FF]{2,}', text) if t not in ARABIC_STOPWORDS]
329
 
330
- # Pre-tokenize questions and compute doc frequencies
331
- def setup_tokenization_and_freqs(questions):
332
- tokenized = [arabic_word_tokenize(q) for q in questions]
333
- doc_freqs = Counter(word for doc in tokenized for word in set(doc))
334
- return tokenized, doc_freqs
335
 
336
- tokenized1, doc_freqs1 = setup_tokenization_and_freqs(df["question"].values)
337
- tokenized2, doc_freqs2 = setup_tokenization_and_freqs(df2["question"].values)
338
- tokenized3, doc_freqs3 = setup_tokenization_and_freqs(df3["question"].values)
339
 
340
- def compute_word_overlap(query, questions):
341
- q_words = set(arabic_word_tokenize(query))
342
- scores = []
343
- for doc in questions:
344
- d_words = set(arabic_word_tokenize(doc))
345
- if not d_words or not q_words:
346
- scores.append(0.0)
347
- continue
348
- inter = len(q_words & d_words)
349
- union = len(q_words | d_words)
350
- jaccard = inter / union if union else 0.0
351
- coverage = inter / len(q_words)
352
- scores.append(0.7 * coverage + 0.3 * jaccard)
353
- return scores
354
 
355
- def lightweight_bm25_score(query_tokens, doc_tokens, doc_freqs, total_docs, k1=1.2, b=0.75):
356
- score = 0.0
357
- doc_len = len(doc_tokens)
358
- avg_doc_len = 10
359
- for term in query_tokens:
360
- if term in doc_tokens:
361
- tf = doc_tokens.count(term)
362
- df = doc_freqs.get(term, 0)
363
- if df > 0:
364
- idf = math.log((total_docs - df + 0.5) / (df + 0.5))
365
- score += idf * (tf * (k1 + 1)) / (tf + k1 * (1 - b + b * (doc_len / avg_doc_len)))
366
- return score
367
 
368
- def normalize_scores(scores):
369
- arr = np.array(scores)
370
- if arr.max() == arr.min(): return np.zeros_like(arr)
371
- return (arr - arr.min()) / (arr.max() - arr.min())
372
-
373
- def combine_scores(query, questions, tokenized, doc_freqs, emb1, emb2):
374
- total_docs = len(questions)
375
- q_emb1 = model1.encode(query, convert_to_tensor=True)
376
- q_emb2 = model2.encode(query, convert_to_tensor=True)
377
 
378
- sim1 = util.pytorch_cos_sim(q_emb1, emb1)[0]
379
- sim2 = util.pytorch_cos_sim(q_emb2, emb2)[0]
380
- sim_scores = ((sim1 + sim2) / 2).cpu().numpy()
381
-
382
- bm25_scores = [lightweight_bm25_score(arabic_word_tokenize(query), doc_tokens, doc_freqs, total_docs)
383
- for doc_tokens in tokenized]
384
- word_scores = compute_word_overlap(query, questions)
385
-
386
- norm_bm25 = normalize_scores(bm25_scores)
387
- norm_word = normalize_scores(word_scores)
388
- norm_sim = normalize_scores(sim_scores)
389
-
390
- query_len = len(arabic_word_tokenize(query))
391
- if query_len <= 2:
392
- w_sem, w_bm, w_word = 0.3, 0.4, 0.3
393
- elif query_len <= 5:
394
- w_sem, w_bm, w_word = 0.4, 0.35, 0.25
395
- else:
396
- w_sem, w_bm, w_word = 0.5, 0.3, 0.2
397
-
398
- results = []
399
- for i, q in enumerate(questions):
400
- sem, bm, word = norm_sim[i], norm_bm25[i], norm_word[i]
401
- combined = w_sem*sem + w_bm*bm + w_word*word
402
- boost = 0.1 if sum([sem > 0.7, bm > 0.7, word > 0.5]) >= 2 else (0.05 if sum([sem > 0.7, bm > 0.7, word > 0.5]) == 1 else 0.0)
403
- results.append({
404
- "question": q,
405
- "semantic_score": sem,
406
- "bm25_score": bm,
407
- "word_overlap_score": word,
408
- "combined_score": combined + boost
409
- })
410
- return results
411
 
412
- def get_top_diverse(results, links, top_k=5):
413
- results = [dict(r, link=links[i]) for i, r in enumerate(results)]
414
- top_combined = sorted(results, key=lambda x: x['combined_score'], reverse=True)[:3]
415
- used_q = {r['question'] for r in top_combined}
416
- top_bm = next((r for r in sorted(results, key=lambda x: x['bm25_score'], reverse=True) if r['question'] not in used_q), None)
417
- if top_bm: used_q.add(top_bm['question'])
418
- top_sem = next((r for r in sorted(results, key=lambda x: x['semantic_score'], reverse=True) if r['question'] not in used_q), None)
419
- final = top_combined + ([top_bm] if top_bm else []) + ([top_sem] if top_sem else [])
420
- return final[:top_k]
421
-
422
- def predict(query):
423
- print(f"Query: {query}")
424
- results1 = combine_scores(query, df["question"].values, tokenized1, doc_freqs1, embeddings1, embeddings1a)
425
- results2 = combine_scores(query, df2["question"].values, tokenized2, doc_freqs2, embeddings2, embeddings2a)
426
- results3 = combine_scores(query, df3["question"].values, tokenized3, doc_freqs3, embeddings3, embeddings3a)
427
-
428
- return {
429
- "top2": get_top_diverse(results2, df2["link"].values),
430
- "top3": get_top_diverse(results3, df3["url"].values),
431
- "top1": get_top_diverse(results1, df["link"].values),
432
- "query_info": {
433
- "query_length": len(arabic_word_tokenize(query))
434
- }
435
- }
436
 
437
- title = "Arabic Search: Dual-Model + BM25 + Overlap"
438
- iface = gr.Interface(
439
- fn=predict,
440
- inputs=[gr.Textbox(label="Search Query", lines=3)],
441
- outputs="json",
442
- title=title,
443
- description="Accurate Arabic search using two semantic models, fast BM25, and word overlap."
444
- )
445
 
446
- if __name__ == "__main__":
447
- iface.launch()
 
1
+ import torch
2
+ import pandas as pd
3
+ from sentence_transformers import SentenceTransformer, util
4
+ import gradio as gr
5
+ import re
6
+ from rank_bm25 import BM25Okapi
7
+ import numpy as np
8
 
9
+ # Load models
10
+ model = SentenceTransformer("distilbert-base-multilingual-cased")
11
+ modela = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
12
 
13
+ # Load data
14
+ df = pd.read_csv("cleaned1.csv")
15
+ df2 = pd.read_csv("cleaned2.csv")
16
+ df3 = pd.read_csv("cleaned3.csv")
17
 
18
+ # Load pre-computed embeddings
19
+ embeddings = torch.load("embeddings1_1.pt")
20
+ embeddings2 = torch.load("embeddings2_1.pt")
21
+ embeddings3 = torch.load("embeddings3_1.pt")
22
 
23
+ embeddingsa = torch.load("embeddings1.pt")
24
+ embeddingsa2 = torch.load("embeddings2.pt")
25
+ embeddingsa3 = torch.load("embeddings3.pt")
26
 
27
+ # Extract questions and links
28
+ df_questions = df["question"].values
29
+ df_links = df["link"].values
30
+ df2_questions = df2["question"].values
31
+ df2_links = df2["link"].values
32
+ df3_questions = df3["question"].values
33
+ df3_links = df3["url"].values
34
 
35
  # ARABIC_STOPWORDS = {
36
  # 'في', 'من', 'إلى', 'عن', 'مع', 'هذا', 'هذه', 'ذلك', 'تلك',
37
  # 'التي', 'الذي', 'ما', 'لا', 'أن', 'أو', 'لكن', 'قد', 'حكم', 'قال',
38
  # 'كان', 'كانت', 'يكون', 'تكون', 'له', 'لها', 'لهم', 'و', 'أم', 'إن'
39
  # }
40
+ ARABIC_STOPWORDS = {
41
+ 'في', 'من', 'إلى', 'عن', 'مع', 'هذا', 'هذه', 'ذلك', 'تلك',
42
+ 'التي', 'الذي', 'ما', 'لا', 'أن', 'أو', 'لكن', 'قد', 'حكم', 'قال',
43
+ 'كان', 'كانت', 'يكون', 'تكون', 'له', 'لها', 'لهم', 'و', 'أم', 'إن',
44
+ 'رضي', 'عليها', 'عنهم', 'عنه', 'عليهم', 'صلى', 'وسلم',
45
+ 'سلام', 'عليه', 'الرسول', 'النبي', 'عليه', 'السلام', 'حديث', 'احاديث'
46
+ }
47
+ def arabic_word_tokenize(text):
48
+ if not isinstance(text, str):
49
+ return []
50
+ # Remove diacritics
51
+ text = re.sub(r'[\u064B-\u065F\u0670]', '', text)
52
+ # Extract only Arabic words (length ≥ 2)
53
+ tokens = re.findall(r'[\u0600-\u06FF]{2,}', text)
54
+ return [t for t in tokens if t not in ARABIC_STOPWORDS]
55
+
56
+ def prepare_bm25_corpus(questions):
57
+ """Prepare tokenized corpus for BM25"""
58
+ tokenized_corpus = []
59
+ for question in questions:
60
+ tokens = arabic_word_tokenize(question)
61
+ tokenized_corpus.append(tokens)
62
+ return tokenized_corpus
63
+
64
+ # Initialize BM25 models for each dataset
65
+ print("Initializing BM25 models...")
66
+ bm25_corpus1 = prepare_bm25_corpus(df_questions)
67
+ bm25_corpus2 = prepare_bm25_corpus(df2_questions)
68
+ bm25_corpus3 = prepare_bm25_corpus(df3_questions)
69
+
70
+ bm25_model1 = BM25Okapi(bm25_corpus1)
71
+ bm25_model2 = BM25Okapi(bm25_corpus2)
72
+ bm25_model3 = BM25Okapi(bm25_corpus3)
73
+ print("BM25 models initialized!")
74
+
75
+ def compute_bm25_scores(query, bm25_model):
76
+ """Compute BM25 scores for a query"""
77
+ query_tokens = arabic_word_tokenize(query)
78
+ if not query_tokens:
79
+ return np.zeros(len(bm25_model.corpus))
80
 
81
+ scores = bm25_model.get_scores(query_tokens)
82
+ return scores
83
 
84
+ def compute_word_overlap(query, questions):
85
+ """Enhanced word overlap computation"""
86
+ query_words = set(arabic_word_tokenize(query))
87
+ if len(query_words) == 0:
88
+ return [0.0] * len(questions)
89
 
90
+ overlaps = []
91
+ for q in questions:
92
+ q_words = set(arabic_word_tokenize(q))
93
+ if len(q_words) == 0:
94
+ overlaps.append(0.0)
95
+ continue
96
 
97
+ # Use Jaccard similarity (intersection over union)
98
+ intersection = len(query_words & q_words)
99
+ union = len(query_words | q_words)
100
+ jaccard = intersection / union if union > 0 else 0.0
101
 
102
+ # Also compute coverage (how much of query is matched)
103
+ coverage = intersection / len(query_words)
104
 
105
+ # Combine both: prioritize coverage but consider similarity
106
+ overlap_score = 0.7 * coverage + 0.3 * jaccard
107
+ overlaps.append(overlap_score)
108
 
109
+ return overlaps
110
 
111
+ def normalize_scores(scores):
112
+ """Normalize scores to 0-1 range"""
113
+ scores = np.array(scores)
114
+ if np.max(scores) == np.min(scores):
115
+ return np.zeros_like(scores)
116
+ return (scores - np.min(scores)) / (np.max(scores) - np.min(scores))
117
+
118
+ def predict(text):
119
+ print(f"Received query: {text}")
120
+ if not text or text.strip() == "":
121
+ return "No query provided"
122
+
123
+ # Semantic similarity scores
124
+ query_embedding = model.encode(text, convert_to_tensor=True)
125
+ query_embeddinga = modela.encode(text, convert_to_tensor=True)
126
+
127
+ # Cosine similarities (averaged from two models)
128
+ sim_scores1 = (util.pytorch_cos_sim(query_embedding, embeddings)[0] +
129
+ util.pytorch_cos_sim(query_embeddinga, embeddingsa)[0]) / 2
130
+ sim_scores2 = (util.pytorch_cos_sim(query_embedding, embeddings2)[0] +
131
+ util.pytorch_cos_sim(query_embeddinga, embeddingsa2)[0]) / 2
132
+ sim_scores3 = (util.pytorch_cos_sim(query_embedding, embeddings3)[0] +
133
+ util.pytorch_cos_sim(query_embeddinga, embeddingsa3)[0]) / 2
134
+
135
+ # BM25 scores
136
+ bm25_scores1 = compute_bm25_scores(text, bm25_model1)
137
+ bm25_scores2 = compute_bm25_scores(text, bm25_model2)
138
+ bm25_scores3 = compute_bm25_scores(text, bm25_model3)
139
+
140
+ # Word overlap scores
141
+ word_overlap1 = compute_word_overlap(text, df_questions)
142
+ word_overlap2 = compute_word_overlap(text, df2_questions)
143
+ word_overlap3 = compute_word_overlap(text, df3_questions)
144
+
145
+ # Normalize all scores for fair combination
146
+ norm_sim1 = normalize_scores(sim_scores1.cpu().numpy())
147
+ norm_sim2 = normalize_scores(sim_scores2.cpu().numpy())
148
+ norm_sim3 = normalize_scores(sim_scores3.cpu().numpy())
149
 
150
+ norm_bm25_1 = normalize_scores(bm25_scores1)
151
+ norm_bm25_2 = normalize_scores(bm25_scores2)
152
+ norm_bm25_3 = normalize_scores(bm25_scores3)
153
 
154
+ norm_word1 = normalize_scores(word_overlap1)
155
+ norm_word2 = normalize_scores(word_overlap2)
156
+ norm_word3 = normalize_scores(word_overlap3)
157
 
158
+ # Adaptive weighting based on query characteristics
159
+ query_words = arabic_word_tokenize(text)
160
+ query_length = len(query_words)
161
 
162
+ if query_length <= 2:
163
+ # Short queries: prioritize exact matches (BM25 + word overlap)
164
+ semantic_weight = 0.3
165
+ bm25_weight = 0.4
166
+ word_weight = 0.3
167
+ elif query_length <= 5:
168
+ # Medium queries: balanced approach
169
+ semantic_weight = 0.4
170
+ bm25_weight = 0.35
171
+ word_weight = 0.25
172
+ else:
173
+ # Long queries: prioritize semantic understanding
174
+ semantic_weight = 0.5
175
+ bm25_weight = 0.3
176
+ word_weight = 0.2
177
 
178
+ def create_combined_results(questions, links, norm_semantic, norm_bm25, norm_word):
179
+ combined_results = []
180
 
181
+ for i in range(len(questions)):
182
+ semantic_score = float(norm_semantic[i])
183
+ bm25_score = float(norm_bm25[i])
184
+ word_score = float(norm_word[i])
185
 
186
+ # Enhanced scoring with BM25
187
+ combined_score = (semantic_weight * semantic_score +
188
+ bm25_weight * bm25_score +
189
+ word_weight * word_score)
190
 
191
+ # Boost results that perform well across multiple metrics
192
+ high_performance_count = sum([
193
+ semantic_score > 0.7,
194
+ bm25_score > 0.7,
195
+ word_score > 0.5
196
+ ])
197
 
198
+ if high_performance_count >= 2:
199
+ boost = 0.1
200
+ elif high_performance_count >= 1:
201
+ boost = 0.05
202
+ else:
203
+ boost = 0.0
204
 
205
+ final_score = combined_score + boost
206
 
207
+ combined_results.append({
208
+ "question": questions[i],
209
+ "link": links[i],
210
+ "semantic_score": semantic_score,
211
+ "bm25_score": bm25_score,
212
+ "word_overlap_score": word_score,
213
+ "combined_score": final_score
214
+ })
215
 
216
+ return combined_results
217
+
218
+ # Create combined results for all datasets
219
+ combined1 = create_combined_results(df_questions, df_links, norm_sim1, norm_bm25_1, norm_word1)
220
+ combined2 = create_combined_results(df2_questions, df2_links, norm_sim2, norm_bm25_2, norm_word2)
221
+ combined3 = create_combined_results(df3_questions, df3_links, norm_sim3, norm_bm25_3, norm_word3)
222
+
223
+ def get_diverse_top_results(combined_results, top_k=5):
224
+ """Get diverse top results using multiple ranking strategies"""
225
+ # Sort by combined score and get top candidates
226
+ by_combined = sorted(combined_results, key=lambda x: x["combined_score"], reverse=True)
227
+ top_combined = by_combined[:3]
228
 
229
+ # Get questions from top combined to avoid duplicates
230
+ used_questions = {item["question"] for item in top_combined}
231
 
232
+ # Add best BM25 result not already included
233
+ by_bm25 = sorted(combined_results, key=lambda x: x["bm25_score"], reverse=True)
234
+ bm25_pick = None
235
+ for item in by_bm25:
236
+ if item["question"] not in used_questions:
237
+ bm25_pick = item
238
+ break
239
 
240
+ # Add best semantic result not already included
241
+ by_semantic = sorted(combined_results, key=lambda x: x["semantic_score"], reverse=True)
242
+ semantic_pick = None
243
+ if bm25_pick:
244
+ used_questions.add(bm25_pick["question"])
245
 
246
+ for item in by_semantic:
247
+ if item["question"] not in used_questions:
248
+ semantic_pick = item
249
+ break
250
 
251
+ # Combine results
252
+ final_results = top_combined.copy()
253
+ if bm25_pick:
254
+ final_results.append(bm25_pick)
255
+ if semantic_pick:
256
+ final_results.append(semantic_pick)
257
 
258
+ return final_results[:top_k]
259
 
260
+ # Get top results for each dataset
261
+ top1 = get_diverse_top_results(combined1)
262
+ top2 = get_diverse_top_results(combined2)
263
+ top3 = get_diverse_top_results(combined3)
264
 
265
+ results = {
266
 
267
+ "top2": top2,
268
+ "top3": top3,
269
+ "top1": top1,
270
+ "query_info": {
271
+ "query_length": query_length,
272
+ "weights": {
273
+ "semantic": semantic_weight,
274
+ "bm25": bm25_weight,
275
+ "word_overlap": word_weight
276
+ }
277
+ }
278
+ }
279
 
280
+ return results
281
 
282
+ title = "Enhanced Search with BM25"
283
+ iface = gr.Interface(
284
+ fn=predict,
285
+ inputs=[gr.Textbox(label="Search Query", lines=3)],
286
+ outputs='json',
287
+ title=title,
288
+ description="Arabic text search using combined semantic similarity, BM25, and word overlap scoring"
289
+ )
290
 
291
+ if __name__ == "__main__":
292
+ iface.launch()
293
 
294
 
295
+ # import torch
296
+ # import pandas as pd
297
+ # from sentence_transformers import SentenceTransformer, util
298
+ # import gradio as gr
299
+ # import re
300
+ # import numpy as np
301
+ # import math
302
+ # from collections import Counter
303
 
304
+ # # Load both models
305
+ # model1 = SentenceTransformer("distilbert-base-multilingual-cased")
306
+ # model2 = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
307
 
308
+ # # Load data
309
+ # print("Loading data and embeddings...")
310
+ # df = pd.read_csv("cleaned1.csv")
311
+ # df2 = pd.read_csv("cleaned2.csv")
312
+ # df3 = pd.read_csv("cleaned3.csv")
313
 
314
+ # embeddings1 = torch.load("embeddings1_1.pt")
315
+ # embeddings2 = torch.load("embeddings2_1.pt")
316
+ # embeddings3 = torch.load("embeddings3_1.pt")
317
 
318
+ # embeddings1a = torch.load("embeddings1.pt")
319
+ # embeddings2a = torch.load("embeddings2.pt")
320
+ # embeddings3a = torch.load("embeddings3.pt")
321
 
322
+ # # Arabic stopwords
323
+ # ARABIC_STOPWORDS = {
324
+ # 'في', 'من', 'إلى', 'عن', 'مع', 'هذا', 'هذه', 'ذلك', 'تلك',
325
+ # 'التي', 'الذي', 'ما', 'لا', 'أن', 'أو', 'لكن', 'قد', 'حكم', 'قال',
326
+ # 'كان', 'كانت', 'يكون', 'تكون', 'له', 'لها', 'لهم', 'و', 'أم', 'إن',
327
+ # 'رضي', 'عليها', 'عنهم', 'عنه', 'عليهم', 'صلى', 'وسلم',
328
+ # 'سلام', 'عليه', 'الرسول', 'النبي', 'عليه', 'السلام', 'حديث', 'احاديث'
329
+ # }
330
 
331
+ # def arabic_word_tokenize(text):
332
+ # if not isinstance(text, str): return []
333
+ # text = re.sub(r'[\u064B-\u065F\u0670]', '', text)
334
+ # return [t for t in re.findall(r'[\u0600-\u06FF]{2,}', text) if t not in ARABIC_STOPWORDS]
335
 
336
+ # # Pre-tokenize questions and compute doc frequencies
337
+ # def setup_tokenization_and_freqs(questions):
338
+ # tokenized = [arabic_word_tokenize(q) for q in questions]
339
+ # doc_freqs = Counter(word for doc in tokenized for word in set(doc))
340
+ # return tokenized, doc_freqs
341
 
342
+ # tokenized1, doc_freqs1 = setup_tokenization_and_freqs(df["question"].values)
343
+ # tokenized2, doc_freqs2 = setup_tokenization_and_freqs(df2["question"].values)
344
+ # tokenized3, doc_freqs3 = setup_tokenization_and_freqs(df3["question"].values)
345
 
346
+ # def compute_word_overlap(query, questions):
347
+ # q_words = set(arabic_word_tokenize(query))
348
+ # scores = []
349
+ # for doc in questions:
350
+ # d_words = set(arabic_word_tokenize(doc))
351
+ # if not d_words or not q_words:
352
+ # scores.append(0.0)
353
+ # continue
354
+ # inter = len(q_words & d_words)
355
+ # union = len(q_words | d_words)
356
+ # jaccard = inter / union if union else 0.0
357
+ # coverage = inter / len(q_words)
358
+ # scores.append(0.7 * coverage + 0.3 * jaccard)
359
+ # return scores
360
 
361
+ # def lightweight_bm25_score(query_tokens, doc_tokens, doc_freqs, total_docs, k1=1.2, b=0.75):
362
+ # score = 0.0
363
+ # doc_len = len(doc_tokens)
364
+ # avg_doc_len = 10
365
+ # for term in query_tokens:
366
+ # if term in doc_tokens:
367
+ # tf = doc_tokens.count(term)
368
+ # df = doc_freqs.get(term, 0)
369
+ # if df > 0:
370
+ # idf = math.log((total_docs - df + 0.5) / (df + 0.5))
371
+ # score += idf * (tf * (k1 + 1)) / (tf + k1 * (1 - b + b * (doc_len / avg_doc_len)))
372
+ # return score
373
 
374
+ # def normalize_scores(scores):
375
+ # arr = np.array(scores)
376
+ # if arr.max() == arr.min(): return np.zeros_like(arr)
377
+ # return (arr - arr.min()) / (arr.max() - arr.min())
378
+
379
+ # def combine_scores(query, questions, tokenized, doc_freqs, emb1, emb2):
380
+ # total_docs = len(questions)
381
+ # q_emb1 = model1.encode(query, convert_to_tensor=True)
382
+ # q_emb2 = model2.encode(query, convert_to_tensor=True)
383
 
384
+ # sim1 = util.pytorch_cos_sim(q_emb1, emb1)[0]
385
+ # sim2 = util.pytorch_cos_sim(q_emb2, emb2)[0]
386
+ # sim_scores = ((sim1 + sim2) / 2).cpu().numpy()
387
+
388
+ # bm25_scores = [lightweight_bm25_score(arabic_word_tokenize(query), doc_tokens, doc_freqs, total_docs)
389
+ # for doc_tokens in tokenized]
390
+ # word_scores = compute_word_overlap(query, questions)
391
+
392
+ # norm_bm25 = normalize_scores(bm25_scores)
393
+ # norm_word = normalize_scores(word_scores)
394
+ # norm_sim = normalize_scores(sim_scores)
395
+
396
+ # query_len = len(arabic_word_tokenize(query))
397
+ # if query_len <= 2:
398
+ # w_sem, w_bm, w_word = 0.3, 0.4, 0.3
399
+ # elif query_len <= 5:
400
+ # w_sem, w_bm, w_word = 0.4, 0.35, 0.25
401
+ # else:
402
+ # w_sem, w_bm, w_word = 0.5, 0.3, 0.2
403
+
404
+ # results = []
405
+ # for i, q in enumerate(questions):
406
+ # sem, bm, word = norm_sim[i], norm_bm25[i], norm_word[i]
407
+ # combined = w_sem*sem + w_bm*bm + w_word*word
408
+ # boost = 0.1 if sum([sem > 0.7, bm > 0.7, word > 0.5]) >= 2 else (0.05 if sum([sem > 0.7, bm > 0.7, word > 0.5]) == 1 else 0.0)
409
+ # results.append({
410
+ # "question": q,
411
+ # "semantic_score": sem,
412
+ # "bm25_score": bm,
413
+ # "word_overlap_score": word,
414
+ # "combined_score": combined + boost
415
+ # })
416
+ # return results
417
 
418
+ # def get_top_diverse(results, links, top_k=5):
419
+ # results = [dict(r, link=links[i]) for i, r in enumerate(results)]
420
+ # top_combined = sorted(results, key=lambda x: x['combined_score'], reverse=True)[:3]
421
+ # used_q = {r['question'] for r in top_combined}
422
+ # top_bm = next((r for r in sorted(results, key=lambda x: x['bm25_score'], reverse=True) if r['question'] not in used_q), None)
423
+ # if top_bm: used_q.add(top_bm['question'])
424
+ # top_sem = next((r for r in sorted(results, key=lambda x: x['semantic_score'], reverse=True) if r['question'] not in used_q), None)
425
+ # final = top_combined + ([top_bm] if top_bm else []) + ([top_sem] if top_sem else [])
426
+ # return final[:top_k]
427
+
428
+ # def predict(query):
429
+ # print(f"Query: {query}")
430
+ # results1 = combine_scores(query, df["question"].values, tokenized1, doc_freqs1, embeddings1, embeddings1a)
431
+ # results2 = combine_scores(query, df2["question"].values, tokenized2, doc_freqs2, embeddings2, embeddings2a)
432
+ # results3 = combine_scores(query, df3["question"].values, tokenized3, doc_freqs3, embeddings3, embeddings3a)
433
+
434
+ # return {
435
+ # "top2": get_top_diverse(results2, df2["link"].values),
436
+ # "top3": get_top_diverse(results3, df3["url"].values),
437
+ # "top1": get_top_diverse(results1, df["link"].values),
438
+ # "query_info": {
439
+ # "query_length": len(arabic_word_tokenize(query))
440
+ # }
441
+ # }
442
 
443
+ # title = "Arabic Search: Dual-Model + BM25 + Overlap"
444
+ # iface = gr.Interface(
445
+ # fn=predict,
446
+ # inputs=[gr.Textbox(label="Search Query", lines=3)],
447
+ # outputs="json",
448
+ # title=title,
449
+ # description="Accurate Arabic search using two semantic models, fast BM25, and word overlap."
450
+ # )
451
 
452
+ # if __name__ == "__main__":
453
+ # iface.launch()