import torch
import pandas as pd
from sentence_transformers import SentenceTransformer, util
import gradio as gr
import re
from rank_bm25 import BM25Okapi
import numpy as np

# Load models
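# Two multilingual sentence encoders are loaded side by side; predict() below
# averages their cosine similarities, which is intended to smooth out
# model-specific quirks on Arabic queries.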
model = SentenceTransformer("distilbert-base-multilingual-cased")
modela = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")

# Load data
df = pd.read_csv("cleaned1.csv")
df2 = pd.read_csv("cleaned2.csv")
df3 = pd.read_csv("cleaned3.csv")

# Load pre-computed embeddings
embeddings = torch.load("embeddings1_1.pt")
embeddings2 = torch.load("embeddings2_1.pt")
embeddings3 = torch.load("embeddings3_1.pt")

embeddingsa = torch.load("embeddings1.pt")
embeddingsa2 = torch.load("embeddings2.pt")
embeddingsa3 = torch.load("embeddings3.pt")
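# Presumably each embeddings*_1.pt file was encoded with `model` and each
# embeddings*.pt file with `modela` (that is how they are paired in predict()),
# and the rows of every tensor align one-to-one with the rows of its CSV.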

# Extract questions and links
df_questions = df["question"].values
df_links = df["link"].values
df2_questions = df2["question"].values
df2_links = df2["link"].values
df3_questions = df3["question"].values
df3_links = df3["url"].values

ARABIC_STOPWORDS = {
    'ููŠ', 'ู…ู†', 'ุฅู„ู‰', 'ุนู†', 'ู…ุน', 'ู‡ุฐุง', 'ู‡ุฐู‡', 'ุฐู„ูƒ', 'ุชู„ูƒ',
    'ุงู„ุชูŠ', 'ุงู„ุฐูŠ', 'ู…ุง', 'ู„ุง', 'ุฃู†', 'ุฃูˆ', 'ู„ูƒู†', 'ู‚ุฏ', 'ุญูƒู…', 'ู‚ุงู„',
    'ูƒุงู†', 'ูƒุงู†ุช', 'ูŠูƒูˆู†', 'ุชูƒูˆู†', 'ู„ู‡', 'ู„ู‡ุง', 'ู„ู‡ู…', 'ูˆ', 'ุฃู…', 'ุฅู†',
    'ุฑุถูŠ', 'ุนู„ูŠู‡ุง', 'ุนู†ู‡ู…', 'ุนู†ู‡', 'ุนู„ูŠู‡ู…', 'ุตู„ู‰', 'ูˆุณู„ู…',
    'ุณู„ุงู…', 'ุนู„ูŠู‡', 'ุงู„ุฑุณูˆู„', 'ุงู„ู†ุจูŠ', 'ุงู„ุณู„ุงู…', 'ุญุฏูŠุซ', 'ุงุญุงุฏูŠุซ'
}

def arabic_word_tokenize(text):
    """Tokenize Arabic text: strip diacritics, keep Arabic tokens of length >= 2, drop stopwords."""
    if not isinstance(text, str):
        return []
    # Remove diacritics (harakat and superscript alef)
    text = re.sub(r'[\u064B-\u065F\u0670]', '', text)
    # Extract only Arabic words (length >= 2)
    tokens = re.findall(r'[\u0600-\u06FF]{2,}', text)

    return [t for t in tokens if t not in ARABIC_STOPWORDS]
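
# Illustrative example: arabic_word_tokenize("ู…ุง ุญูƒู… ุงู„ุตู„ุงุฉ ููŠ ุงู„ุณูุฑ")
# drops the stopwords "ู…ุง", "ุญูƒู…", and "ููŠ" and returns ["ุงู„ุตู„ุงุฉ", "ุงู„ุณูุฑ"].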

def prepare_bm25_corpus(questions):
    """Prepare tokenized corpus for BM25"""
    tokenized_corpus = []
    for question in questions:
        tokens = arabic_word_tokenize(question)
        tokenized_corpus.append(tokens)
    return tokenized_corpus

# Initialize BM25 models for each dataset
print("Initializing BM25 models...")
bm25_corpus1 = prepare_bm25_corpus(df_questions)
bm25_corpus2 = prepare_bm25_corpus(df2_questions)
bm25_corpus3 = prepare_bm25_corpus(df3_questions)

bm25_model1 = BM25Okapi(bm25_corpus1)
bm25_model2 = BM25Okapi(bm25_corpus2)
bm25_model3 = BM25Okapi(bm25_corpus3)
print("BM25 models initialized!")
corpus_length1 = len(df_questions)
corpus_length2 = len(df2_questions)
corpus_length3 = len(df3_questions)


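# BM25Okapi ranks a document D against query Q roughly as
#   score(D, Q) = sum_i IDF(q_i) * tf(q_i, D) * (k1 + 1)
#                 / (tf(q_i, D) + k1 * (1 - b + b * |D| / avgdl))
# using rank_bm25's default parameters (k1=1.5, b=0.75).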
def compute_bm25_scores(query, bm25_model, corpus_length):
    """Compute BM25 scores for a query"""
    query_tokens = arabic_word_tokenize(query)
    if not query_tokens:
        return np.zeros(corpus_length)
    
    scores = bm25_model.get_scores(query_tokens)
    return scores

def compute_word_overlap(query, questions):
    """Enhanced word overlap computation"""
    query_words = set(arabic_word_tokenize(query))
    if len(query_words) == 0:
        return [0.0] * len(questions)
    
    overlaps = []
    for q in questions:
        q_words = set(arabic_word_tokenize(q))
        if len(q_words) == 0:
            overlaps.append(0.0)
            continue
            
        # Use Jaccard similarity (intersection over union)
        intersection = len(query_words & q_words)
        union = len(query_words | q_words)
        jaccard = intersection / union if union > 0 else 0.0
        
        # Also compute coverage (how much of query is matched)
        coverage = intersection / len(query_words)
        
        # Combine both: prioritize coverage but consider similarity
        overlap_score = 0.7 * coverage + 0.3 * jaccard
        overlaps.append(overlap_score)
    
    return overlaps
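
# Worked example: query words {a, b} vs. candidate words {a, b, c, d} give
# coverage = 2/2 = 1.0, jaccard = 2/4 = 0.5, score = 0.7*1.0 + 0.3*0.5 = 0.85.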

def normalize_scores(scores):
    """Normalize scores to 0-1 range"""
    scores = np.array(scores)
    if np.max(scores) == np.min(scores):
        return np.zeros_like(scores)
    return (scores - np.min(scores)) / (np.max(scores) - np.min(scores))
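
# Note: min-max normalization is applied per corpus, so normalized scores are
# comparable within one dataset but not directly across the three datasets.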

def predict(text):
    print(f"Received query: {text}")
    if not text or text.strip() == "":
        return "No query provided"

    # Semantic similarity scores
    query_embedding = model.encode(text, convert_to_tensor=True)
    query_embeddinga = modela.encode(text, convert_to_tensor=True)

    # Cosine similarities (averaged from two models)
    sim_scores1 = (util.pytorch_cos_sim(query_embedding, embeddings)[0] + 
                   util.pytorch_cos_sim(query_embeddinga, embeddingsa)[0]) / 2
    sim_scores2 = (util.pytorch_cos_sim(query_embedding, embeddings2)[0] + 
                   util.pytorch_cos_sim(query_embeddinga, embeddingsa2)[0]) / 2
    sim_scores3 = (util.pytorch_cos_sim(query_embedding, embeddings3)[0] + 
                   util.pytorch_cos_sim(query_embeddinga, embeddingsa3)[0]) / 2

    # BM25 scores
    bm25_scores1 = compute_bm25_scores(text, bm25_model1, corpus_length1)
    bm25_scores2 = compute_bm25_scores(text, bm25_model2, corpus_length2)
    bm25_scores3 = compute_bm25_scores(text, bm25_model3, corpus_length3)

    # Word overlap scores
    word_overlap1 = compute_word_overlap(text, df_questions)
    word_overlap2 = compute_word_overlap(text, df2_questions)
    word_overlap3 = compute_word_overlap(text, df3_questions)

    # Normalize all scores for fair combination
    norm_sim1 = normalize_scores(sim_scores1.cpu().numpy())
    norm_sim2 = normalize_scores(sim_scores2.cpu().numpy())
    norm_sim3 = normalize_scores(sim_scores3.cpu().numpy())
    
    norm_bm25_1 = normalize_scores(bm25_scores1)
    norm_bm25_2 = normalize_scores(bm25_scores2)
    norm_bm25_3 = normalize_scores(bm25_scores3)
    
    norm_word1 = normalize_scores(word_overlap1)
    norm_word2 = normalize_scores(word_overlap2)
    norm_word3 = normalize_scores(word_overlap3)

    # Adaptive weighting based on query characteristics
    query_words = arabic_word_tokenize(text)
    query_length = len(query_words)
    
    if query_length <= 4:
        # Short queries: prioritize exact matches (BM25 + word overlap)
        semantic_weight = 0.3
        bm25_weight = 0.4
        word_weight = 0.3
    elif query_length <= 6:
        # Medium queries: balanced approach
        semantic_weight = 0.4
        bm25_weight = 0.35
        word_weight = 0.25
    else:
        # Long queries: prioritize semantic understanding
        semantic_weight = 0.5
        bm25_weight = 0.3
        word_weight = 0.2
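
    # Example: a 3-token query (after stopword removal) gets weights
    # 0.3/0.4/0.3, while a 9-token query gets 0.5/0.3/0.2.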

    def create_combined_results(questions, links, norm_semantic, norm_bm25, norm_word):
        combined_results = []
        
        for i in range(len(questions)):
            semantic_score = float(norm_semantic[i])
            bm25_score = float(norm_bm25[i])
            word_score = float(norm_word[i])
            
            # Enhanced scoring with BM25
            combined_score = (semantic_weight * semantic_score + 
                            bm25_weight * bm25_score + 
                            word_weight * word_score)
            
            # Boost results that perform well across multiple metrics
            high_performance_count = sum([
                semantic_score > 0.7,
                bm25_score > 0.7,
                word_score > 0.5
            ])
            
            if high_performance_count >= 2:
                boost = 0.1
            elif high_performance_count >= 1:
                boost = 0.05
            else:
                boost = 0.0
                
            final_score = combined_score + boost
            
            combined_results.append({
                "question": questions[i],
                "link": links[i],
                "semantic_score": semantic_score,
                "bm25_score": bm25_score,
                "word_overlap_score": word_score,
                "combined_score": final_score
            })
        
        return combined_results
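
    # Boost example: normalized scores semantic=0.8, bm25=0.75, word=0.4 clear
    # two of the three thresholds, so the combined score is boosted by 0.1.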

    # Create combined results for all datasets
    combined1 = create_combined_results(df_questions, df_links, norm_sim1, norm_bm25_1, norm_word1)
    combined2 = create_combined_results(df2_questions, df2_links, norm_sim2, norm_bm25_2, norm_word2)
    combined3 = create_combined_results(df3_questions, df3_links, norm_sim3, norm_bm25_3, norm_word3)

    def get_diverse_top_results(combined_results, top_k=15):
        """Get diverse top results using multiple ranking strategies with BM25 threshold"""
        
        # First, check if any results have BM25 score > 0.1
        has_good_bm25 = any(item["bm25_score"] > 0.1 for item in combined_results)
        
        if has_good_bm25:
            # Filter results to only include those with BM25 > 0.1
            filtered_results = [item for item in combined_results if item["bm25_score"] > 0.1]
        else:
            # If all BM25 scores are <= 0.1, use all results
            filtered_results = combined_results
        
        # Sort by combined score and take the first top_k - 5 candidates,
        # leaving the remaining slots for the diversity picks below
        by_combined = sorted(filtered_results, key=lambda x: x["combined_score"], reverse=True)
        top_combined = by_combined[:top_k - 5]
        
        # Get questions from top combined to avoid duplicates
        used_questions = {item["question"] for item in top_combined}
        
        # Add best BM25 result not already included (from filtered results)
        by_bm25 = sorted(filtered_results, key=lambda x: x["bm25_score"], reverse=True)
        bm25_pick = None
        for item in by_bm25:
            if item["question"] not in used_questions:
                bm25_pick = item
                break
        
        # Add best semantic result not already included (from filtered results)
        by_semantic = sorted(filtered_results, key=lambda x: x["semantic_score"], reverse=True)
        semantic_pick = None
        if bm25_pick:
            used_questions.add(bm25_pick["question"])
        
        for item in by_semantic:
            if item["question"] not in used_questions:
                semantic_pick = item
                break
        
        # Combine results
        final_results = top_combined.copy()
        if bm25_pick:
            final_results.append(bm25_pick)
        if semantic_pick:
            final_results.append(semantic_pick)
        
        return final_results[:top_k]
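
    # Note: with the default top_k=15 this returns at most (top_k - 5) + 2 = 12
    # results, since only one extra BM25 pick and one semantic pick are appended.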
    
    # Get top results for each dataset
    top1 = get_diverse_top_results(combined1)
    top2 = get_diverse_top_results(combined2)
    top3 = get_diverse_top_results(combined3)

    results = {
        "top1": top1,
        "top2": top2,
        "top3": top3,
        "query_info": {
            "query_length": query_length,
            "weights": {
                "semantic": semantic_weight,
                "bm25": bm25_weight,
                "word_overlap": word_weight
            }
        }
    }

    return results

title = "Enhanced Search with BM25"
iface = gr.Interface(
    fn=predict,
    inputs=[gr.Textbox(label="Search Query", lines=3)],
    outputs='json',
    title=title,
    description="Arabic text search using combined semantic similarity, BM25, and word overlap scoring"
)

if __name__ == "__main__":
    iface.launch()
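
# Quick smoke test without the UI (assuming this file is saved as app.py; the
# module name is illustrative):
#   >>> from app import predict
#   >>> out = predict("ู…ุง ุญูƒู… ุงู„ุตู„ุงุฉ ููŠ ุงู„ุณูุฑ")
#   >>> out["query_info"]["weights"]
#   {'semantic': 0.3, 'bm25': 0.4, 'word_overlap': 0.3}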