Spaces:

mohbay
/

searchcsv2

Running

App Files Files Community

mohbay committed on Jul 8

Commit

00b3d0f

verified ·

1 Parent(s): 3dafe6c

Update app.py

Browse files

Files changed (1) hide show

app.py +393 -387

app.py CHANGED Viewed

@@ -1,447 +1,453 @@
-# import torch
-# import pandas as pd
-# from sentence_transformers import SentenceTransformer, util
-# import gradio as gr
-# import re
-# from rank_bm25 import BM25Okapi
-# import numpy as np
-# # Load models
-# model = SentenceTransformer("distilbert-base-multilingual-cased")
-# modela = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
-# # Load data
-# df = pd.read_csv("cleaned1.csv")
-# df2 = pd.read_csv("cleaned2.csv")
-# df3 = pd.read_csv("cleaned3.csv")
-# # Load pre-computed embeddings
-# embeddings = torch.load("embeddings1_1.pt")
-# embeddings2 = torch.load("embeddings2_1.pt")
-# embeddings3 = torch.load("embeddings3_1.pt")
-# embeddingsa = torch.load("embeddings1.pt")
-# embeddingsa2 = torch.load("embeddings2.pt")
-# embeddingsa3 = torch.load("embeddings3.pt")
-# # Extract questions and links
-# df_questions = df["question"].values
-# df_links = df["link"].values
-# df2_questions = df2["question"].values
-# df2_links = df2["link"].values
-# df3_questions = df3["question"].values
-# df3_links = df3["url"].values
 # ARABIC_STOPWORDS = {
 #     'في', 'من', 'إلى', 'عن', 'مع', 'هذا', 'هذه', 'ذلك', 'تلك',
 #     'التي', 'الذي', 'ما', 'لا', 'أن', 'أو', 'لكن', 'قد', 'حكم', 'قال',
 #     'كان', 'كانت', 'يكون', 'تكون', 'له', 'لها', 'لهم', 'و', 'أم', 'إن'
 # }
-# def arabic_word_tokenize(text):
-#     if not isinstance(text, str):
-#         return []
-#     # Remove diacritics
-#     text = re.sub(r'[\u064B-\u065F\u0670]', '', text)
-#     # Extract only Arabic words (length ≥ 2)
-#     tokens = re.findall(r'[\u0600-\u06FF]{2,}', text)
-#     return [t for t in tokens if t not in ARABIC_STOPWORDS]
-# def prepare_bm25_corpus(questions):
-#     """Prepare tokenized corpus for BM25"""
-#     tokenized_corpus = []
-#     for question in questions:
-#         tokens = arabic_word_tokenize(question)
-#         tokenized_corpus.append(tokens)
-#     return tokenized_corpus
-# # Initialize BM25 models for each dataset
-# print("Initializing BM25 models...")
-# bm25_corpus1 = prepare_bm25_corpus(df_questions)
-# bm25_corpus2 = prepare_bm25_corpus(df2_questions)
-# bm25_corpus3 = prepare_bm25_corpus(df3_questions)
-# bm25_model1 = BM25Okapi(bm25_corpus1)
-# bm25_model2 = BM25Okapi(bm25_corpus2)
-# bm25_model3 = BM25Okapi(bm25_corpus3)
-# print("BM25 models initialized!")
-# def compute_bm25_scores(query, bm25_model):
-#     """Compute BM25 scores for a query"""
-#     query_tokens = arabic_word_tokenize(query)
-#     if not query_tokens:
-#         return np.zeros(len(bm25_model.corpus))
-#     scores = bm25_model.get_scores(query_tokens)
-#     return scores
-# def compute_word_overlap(query, questions):
-#     """Enhanced word overlap computation"""
-#     query_words = set(arabic_word_tokenize(query))
-#     if len(query_words) == 0:
-#         return [0.0] * len(questions)
-#     overlaps = []
-#     for q in questions:
-#         q_words = set(arabic_word_tokenize(q))
-#         if len(q_words) == 0:
-#             overlaps.append(0.0)
-#             continue
-#         # Use Jaccard similarity (intersection over union)
-#         intersection = len(query_words & q_words)
-#         union = len(query_words | q_words)
-#         jaccard = intersection / union if union > 0 else 0.0
-#         # Also compute coverage (how much of query is matched)
-#         coverage = intersection / len(query_words)
-#         # Combine both: prioritize coverage but consider similarity
-#         overlap_score = 0.7 * coverage + 0.3 * jaccard
-#         overlaps.append(overlap_score)
-#     return overlaps
-# def normalize_scores(scores):
-#     """Normalize scores to 0-1 range"""
-#     scores = np.array(scores)
-#     if np.max(scores) == np.min(scores):
-#         return np.zeros_like(scores)
-#     return (scores - np.min(scores)) / (np.max(scores) - np.min(scores))
-# def predict(text):
-#     print(f"Received query: {text}")
-#     if not text or text.strip() == "":
-#         return "No query provided"
-#     # Semantic similarity scores
-#     query_embedding = model.encode(text, convert_to_tensor=True)
-#     query_embeddinga = modela.encode(text, convert_to_tensor=True)
-#     # Cosine similarities (averaged from two models)
-#     sim_scores1 = (util.pytorch_cos_sim(query_embedding, embeddings)[0] +
-#                    util.pytorch_cos_sim(query_embeddinga, embeddingsa)[0]) / 2
-#     sim_scores2 = (util.pytorch_cos_sim(query_embedding, embeddings2)[0] +
-#                    util.pytorch_cos_sim(query_embeddinga, embeddingsa2)[0]) / 2
-#     sim_scores3 = (util.pytorch_cos_sim(query_embedding, embeddings3)[0] +
-#                    util.pytorch_cos_sim(query_embeddinga, embeddingsa3)[0]) / 2
-#     # BM25 scores
-#     bm25_scores1 = compute_bm25_scores(text, bm25_model1)
-#     bm25_scores2 = compute_bm25_scores(text, bm25_model2)
-#     bm25_scores3 = compute_bm25_scores(text, bm25_model3)
-#     # Word overlap scores
-#     word_overlap1 = compute_word_overlap(text, df_questions)
-#     word_overlap2 = compute_word_overlap(text, df2_questions)
-#     word_overlap3 = compute_word_overlap(text, df3_questions)
-#     # Normalize all scores for fair combination
-#     norm_sim1 = normalize_scores(sim_scores1.cpu().numpy())
-#     norm_sim2 = normalize_scores(sim_scores2.cpu().numpy())
-#     norm_sim3 = normalize_scores(sim_scores3.cpu().numpy())
-#     norm_bm25_1 = normalize_scores(bm25_scores1)
-#     norm_bm25_2 = normalize_scores(bm25_scores2)
-#     norm_bm25_3 = normalize_scores(bm25_scores3)
-#     norm_word1 = normalize_scores(word_overlap1)
-#     norm_word2 = normalize_scores(word_overlap2)
-#     norm_word3 = normalize_scores(word_overlap3)
-#     # Adaptive weighting based on query characteristics
-#     query_words = arabic_word_tokenize(text)
-#     query_length = len(query_words)
-#     if query_length <= 2:
-#         # Short queries: prioritize exact matches (BM25 + word overlap)
-#         semantic_weight = 0.3
-#         bm25_weight = 0.4
-#         word_weight = 0.3
-#     elif query_length <= 5:
-#         # Medium queries: balanced approach
-#         semantic_weight = 0.4
-#         bm25_weight = 0.35
-#         word_weight = 0.25
-#     else:
-#         # Long queries: prioritize semantic understanding
-#         semantic_weight = 0.5
-#         bm25_weight = 0.3
-#         word_weight = 0.2
-#     def create_combined_results(questions, links, norm_semantic, norm_bm25, norm_word):
-#         combined_results = []
-#         for i in range(len(questions)):
-#             semantic_score = float(norm_semantic[i])
-#             bm25_score = float(norm_bm25[i])
-#             word_score = float(norm_word[i])
-#             # Enhanced scoring with BM25
-#             combined_score = (semantic_weight * semantic_score +
-#                             bm25_weight * bm25_score +
-#                             word_weight * word_score)
-#             # Boost results that perform well across multiple metrics
-#             high_performance_count = sum([
-#                 semantic_score > 0.7,
-#                 bm25_score > 0.7,
-#                 word_score > 0.5
-#             ])
-#             if high_performance_count >= 2:
-#                 boost = 0.1
-#             elif high_performance_count >= 1:
-#                 boost = 0.05
-#             else:
-#                 boost = 0.0
-#             final_score = combined_score + boost
-#             combined_results.append({
-#                 "question": questions[i],
-#                 "link": links[i],
-#                 "semantic_score": semantic_score,
-#                 "bm25_score": bm25_score,
-#                 "word_overlap_score": word_score,
-#                 "combined_score": final_score
-#             })
-#         return combined_results
-#     # Create combined results for all datasets
-#     combined1 = create_combined_results(df_questions, df_links, norm_sim1, norm_bm25_1, norm_word1)
-#     combined2 = create_combined_results(df2_questions, df2_links, norm_sim2, norm_bm25_2, norm_word2)
-#     combined3 = create_combined_results(df3_questions, df3_links, norm_sim3, norm_bm25_3, norm_word3)
-#     def get_diverse_top_results(combined_results, top_k=5):
-#         """Get diverse top results using multiple ranking strategies"""
-#         # Sort by combined score and get top candidates
-#         by_combined = sorted(combined_results, key=lambda x: x["combined_score"], reverse=True)
-#         top_combined = by_combined[:3]
-#         # Get questions from top combined to avoid duplicates
-#         used_questions = {item["question"] for item in top_combined}
-#         # Add best BM25 result not already included
-#         by_bm25 = sorted(combined_results, key=lambda x: x["bm25_score"], reverse=True)
-#         bm25_pick = None
-#         for item in by_bm25:
-#             if item["question"] not in used_questions:
-#                 bm25_pick = item
-#                 break
-#         # Add best semantic result not already included
-#         by_semantic = sorted(combined_results, key=lambda x: x["semantic_score"], reverse=True)
-#         semantic_pick = None
-#         if bm25_pick:
-#             used_questions.add(bm25_pick["question"])
-#         for item in by_semantic:
-#             if item["question"] not in used_questions:
-#                 semantic_pick = item
-#                 break
-#         # Combine results
-#         final_results = top_combined.copy()
-#         if bm25_pick:
-#             final_results.append(bm25_pick)
-#         if semantic_pick:
-#             final_results.append(semantic_pick)
-#         return final_results[:top_k]
-#     # Get top results for each dataset
-#     top1 = get_diverse_top_results(combined1)
-#     top2 = get_diverse_top_results(combined2)
-#     top3 = get_diverse_top_results(combined3)
-#     results = {
-#         "top2": top2,
-#         "top3": top3,
-#         "top1": top1,
-#         "query_info": {
-#             "query_length": query_length,
-#             "weights": {
-#                 "semantic": semantic_weight,
-#                 "bm25": bm25_weight,
-#                 "word_overlap": word_weight
-#             }
-#         }
-#     }
-#     return results
-# title = "Enhanced Search with BM25"
-# iface = gr.Interface(
-#     fn=predict,
-#     inputs=[gr.Textbox(label="Search Query", lines=3)],
-#     outputs='json',
-#     title=title,
-#     description="Arabic text search using combined semantic similarity, BM25, and word overlap scoring"
-# )
-# if __name__ == "__main__":
-#     iface.launch()
-import torch
-import pandas as pd
-from sentence_transformers import SentenceTransformer, util
-import gradio as gr
-import re
-import numpy as np
-import math
-from collections import Counter
-# Load both models
-model1 = SentenceTransformer("distilbert-base-multilingual-cased")
-model2 = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
-# Load data
-print("Loading data and embeddings...")
-df = pd.read_csv("cleaned1.csv")
-df2 = pd.read_csv("cleaned2.csv")
-df3 = pd.read_csv("cleaned3.csv")
-embeddings1 = torch.load("embeddings1_1.pt")
-embeddings2 = torch.load("embeddings2_1.pt")
-embeddings3 = torch.load("embeddings3_1.pt")
-embeddings1a = torch.load("embeddings1.pt")
-embeddings2a = torch.load("embeddings2.pt")
-embeddings3a = torch.load("embeddings3.pt")
-# Arabic stopwords
-ARABIC_STOPWORDS = {
-    'في', 'من', 'إلى', 'عن', 'مع', 'هذا', 'هذه', 'ذلك', 'تلك',
-    'التي', 'الذي', 'ما', 'لا', 'أن', 'أو', 'لكن', 'قد', 'حكم', 'قال',
-    'كان', 'كانت', 'يكون', 'تكون', 'له', 'لها', 'لهم', 'و', 'أم', 'إن',
-    'رضي', 'عليها', 'عنهم', 'عنه', 'عليهم', 'صلى', 'وسلم',
-    'سلام', 'عليه', 'الرسول', 'النبي', 'عليه', 'السلام', 'حديث', 'احاديث'
-}
-def arabic_word_tokenize(text):
-    if not isinstance(text, str): return []
-    text = re.sub(r'[\u064B-\u065F\u0670]', '', text)
-    return [t for t in re.findall(r'[\u0600-\u06FF]{2,}', text) if t not in ARABIC_STOPWORDS]
-# Pre-tokenize questions and compute doc frequencies
-def setup_tokenization_and_freqs(questions):
-    tokenized = [arabic_word_tokenize(q) for q in questions]
-    doc_freqs = Counter(word for doc in tokenized for word in set(doc))
-    return tokenized, doc_freqs
-tokenized1, doc_freqs1 = setup_tokenization_and_freqs(df["question"].values)
-tokenized2, doc_freqs2 = setup_tokenization_and_freqs(df2["question"].values)
-tokenized3, doc_freqs3 = setup_tokenization_and_freqs(df3["question"].values)
-def compute_word_overlap(query, questions):
-    q_words = set(arabic_word_tokenize(query))
-    scores = []
-    for doc in questions:
-        d_words = set(arabic_word_tokenize(doc))
-        if not d_words or not q_words:
-            scores.append(0.0)
-            continue
-        inter = len(q_words & d_words)
-        union = len(q_words | d_words)
-        jaccard = inter / union if union else 0.0
-        coverage = inter / len(q_words)
-        scores.append(0.7 * coverage + 0.3 * jaccard)
-    return scores
-def lightweight_bm25_score(query_tokens, doc_tokens, doc_freqs, total_docs, k1=1.2, b=0.75):
-    score = 0.0
-    doc_len = len(doc_tokens)
-    avg_doc_len = 10
-    for term in query_tokens:
-        if term in doc_tokens:
-            tf = doc_tokens.count(term)
-            df = doc_freqs.get(term, 0)
-            if df > 0:
-                idf = math.log((total_docs - df + 0.5) / (df + 0.5))
-                score += idf * (tf * (k1 + 1)) / (tf + k1 * (1 - b + b * (doc_len / avg_doc_len)))
-    return score
-def normalize_scores(scores):
-    arr = np.array(scores)
-    if arr.max() == arr.min(): return np.zeros_like(arr)
-    return (arr - arr.min()) / (arr.max() - arr.min())
-def combine_scores(query, questions, tokenized, doc_freqs, emb1, emb2):
-    total_docs = len(questions)
-    q_emb1 = model1.encode(query, convert_to_tensor=True)
-    q_emb2 = model2.encode(query, convert_to_tensor=True)
-    sim1 = util.pytorch_cos_sim(q_emb1, emb1)[0]
-    sim2 = util.pytorch_cos_sim(q_emb2, emb2)[0]
-    sim_scores = ((sim1 + sim2) / 2).cpu().numpy()
-    bm25_scores = [lightweight_bm25_score(arabic_word_tokenize(query), doc_tokens, doc_freqs, total_docs)
-                   for doc_tokens in tokenized]
-    word_scores = compute_word_overlap(query, questions)
-    norm_bm25 = normalize_scores(bm25_scores)
-    norm_word = normalize_scores(word_scores)
-    norm_sim = normalize_scores(sim_scores)
-    query_len = len(arabic_word_tokenize(query))
-    if query_len <= 2:
-        w_sem, w_bm, w_word = 0.3, 0.4, 0.3
-    elif query_len <= 5:
-        w_sem, w_bm, w_word = 0.4, 0.35, 0.25
-    else:
-        w_sem, w_bm, w_word = 0.5, 0.3, 0.2
-    results = []
-    for i, q in enumerate(questions):
-        sem, bm, word = norm_sim[i], norm_bm25[i], norm_word[i]
-        combined = w_sem*sem + w_bm*bm + w_word*word
-        boost = 0.1 if sum([sem > 0.7, bm > 0.7, word > 0.5]) >= 2 else (0.05 if sum([sem > 0.7, bm > 0.7, word > 0.5]) == 1 else 0.0)
-        results.append({
-            "question": q,
-            "semantic_score": sem,
-            "bm25_score": bm,
-            "word_overlap_score": word,
-            "combined_score": combined + boost
-        })
-    return results
-def get_top_diverse(results, links, top_k=5):
-    results = [dict(r, link=links[i]) for i, r in enumerate(results)]
-    top_combined = sorted(results, key=lambda x: x['combined_score'], reverse=True)[:3]
-    used_q = {r['question'] for r in top_combined}
-    top_bm = next((r for r in sorted(results, key=lambda x: x['bm25_score'], reverse=True) if r['question'] not in used_q), None)
-    if top_bm: used_q.add(top_bm['question'])
-    top_sem = next((r for r in sorted(results, key=lambda x: x['semantic_score'], reverse=True) if r['question'] not in used_q), None)
-    final = top_combined + ([top_bm] if top_bm else []) + ([top_sem] if top_sem else [])
-    return final[:top_k]
-def predict(query):
-    print(f"Query: {query}")
-    results1 = combine_scores(query, df["question"].values, tokenized1, doc_freqs1, embeddings1, embeddings1a)
-    results2 = combine_scores(query, df2["question"].values, tokenized2, doc_freqs2, embeddings2, embeddings2a)
-    results3 = combine_scores(query, df3["question"].values, tokenized3, doc_freqs3, embeddings3, embeddings3a)
-    return {
-        "top2": get_top_diverse(results2, df2["link"].values),
-        "top3": get_top_diverse(results3, df3["url"].values),
-        "top1": get_top_diverse(results1, df["link"].values),
-        "query_info": {
-            "query_length": len(arabic_word_tokenize(query))
-        }
-    }
-title = "Arabic Search: Dual-Model + BM25 + Overlap"
-iface = gr.Interface(
-    fn=predict,
-    inputs=[gr.Textbox(label="Search Query", lines=3)],
-    outputs="json",
-    title=title,
-    description="Accurate Arabic search using two semantic models, fast BM25, and word overlap."
-)
-if __name__ == "__main__":
-    iface.launch()

+import torch
+import pandas as pd
+from sentence_transformers import SentenceTransformer, util
+import gradio as gr
+import re
+from rank_bm25 import BM25Okapi
+import numpy as np
+# Load models
+# Two multilingual sentence encoders; predict() averages the cosine similarity from both.
+model = SentenceTransformer("distilbert-base-multilingual-cased")
+modela = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
+# Load data
+# One CSV per dataset; each must provide a "question" column (read below).
+df = pd.read_csv("cleaned1.csv")
+df2 = pd.read_csv("cleaned2.csv")
+df3 = pd.read_csv("cleaned3.csv")
+# Load pre-computed embeddings
+# predict() compares the `*_1.pt` tensors with queries encoded by `model` and the
+# unsuffixed tensors with queries encoded by `modela`.
+# NOTE(review): presumably each file was produced by that same model and is row-aligned
+# with the matching CSV - confirm. If the tensors were saved from a GPU, torch.load may
+# need map_location="cpu" on a CPU-only host - TODO confirm.
+embeddings = torch.load("embeddings1_1.pt")
+embeddings2 = torch.load("embeddings2_1.pt")
+embeddings3 = torch.load("embeddings3_1.pt")
+embeddingsa = torch.load("embeddings1.pt")
+embeddingsa2 = torch.load("embeddings2.pt")
+embeddingsa3 = torch.load("embeddings3.pt")
+# Extract questions and links
+# Plain arrays for positional indexing in predict(); note that the third dataset keeps
+# its links in a "url" column rather than "link".
+df_questions = df["question"].values
+df_links = df["link"].values
+df2_questions = df2["question"].values
+df2_links = df2["link"].values
+df3_questions = df3["question"].values
+df3_links = df3["url"].values
 # ARABIC_STOPWORDS = {
 #     'في', 'من', 'إلى', 'عن', 'مع', 'هذا', 'هذه', 'ذلك', 'تلك',
 #     'التي', 'الذي', 'ما', 'لا', 'أن', 'أو', 'لكن', 'قد', 'حكم', 'قال',
 #     'كان', 'كانت', 'يكون', 'تكون', 'له', 'لها', 'لهم', 'و', 'أم', 'إن'
 # }
+# Words dropped by arabic_word_tokenize(): the basic list from the commented-out set
+# above plus honorific and hadith-related terms.
+# NOTE: 'و' can never match because the tokenizer only yields tokens of two or more
+# characters, and 'عليه' is listed twice (harmless in a set literal).
+ARABIC_STOPWORDS = {
+    'في', 'من', 'إلى', 'عن', 'مع', 'هذا', 'هذه', 'ذلك', 'تلك',
+    'التي', 'الذي', 'ما', 'لا', 'أن', 'أو', 'لكن', 'قد', 'حكم', 'قال',
+    'كان', 'كانت', 'يكون', 'تكون', 'له', 'لها', 'لهم', 'و', 'أم', 'إن',
+    'رضي', 'عليها', 'عنهم', 'عنه', 'عليهم', 'صلى', 'وسلم',
+    'سلام', 'عليه', 'الرسول', 'النبي', 'عليه', 'السلام', 'حديث', 'احاديث'
+}
+def arabic_word_tokenize(text):
+    """Split text into Arabic word tokens with stopwords removed.
+    Non-string input (e.g. a NaN cell from a CSV) yields an empty list.
+    Returns the tokens of two or more characters, in order of appearance,
+    that are not in ARABIC_STOPWORDS.
+    """
+    if not isinstance(text, str):
+        return []
+    # Delete marks that sit inside a word without changing it: diacritics,
+    # Quranic/honorific signs and the tatweel (kashida) elongation character.
+    text = re.sub(r'[\u0610-\u061A\u0640\u064B-\u065F\u0670]', '', text)
+    # Arabic punctuation (comma, semicolon, question mark, full stop, ...) lives
+    # inside the \u0600-\u06FF block, so without this step "word?" would become
+    # a token distinct from "word" and would also slip past the stopword filter.
+    text = re.sub(r'[\u0600-\u060F\u061B-\u061F\u066A-\u066D\u06D4]', ' ', text)
+    # Extract only Arabic words (length >= 2)
+    tokens = re.findall(r'[\u0600-\u06FF]{2,}', text)
+    return [t for t in tokens if t not in ARABIC_STOPWORDS]
+def prepare_bm25_corpus(questions):
+    """Tokenize every question, giving the list of token lists that BM25 indexes."""
+    return [arabic_word_tokenize(entry) for entry in questions]
+# Initialize BM25 models for each dataset
+# Runs once at import time: every question is tokenized and one BM25Okapi index is
+# built per dataset, so predict() only has to score the query against them.
+print("Initializing BM25 models...")
+bm25_corpus1 = prepare_bm25_corpus(df_questions)
+bm25_corpus2 = prepare_bm25_corpus(df2_questions)
+bm25_corpus3 = prepare_bm25_corpus(df3_questions)
+bm25_model1 = BM25Okapi(bm25_corpus1)
+bm25_model2 = BM25Okapi(bm25_corpus2)
+bm25_model3 = BM25Okapi(bm25_corpus3)
+print("BM25 models initialized!")
+def compute_bm25_scores(query, bm25_model):
+    """Return one BM25 score per indexed document for a query.
+    The query is tokenized with arabic_word_tokenize(). When no token survives
+    (empty, non-Arabic or stopword-only query) every document scores zero.
+    """
+    query_tokens = arabic_word_tokenize(query)
+    if not query_tokens:
+        # rank_bm25 models store the document count in `corpus_size`; they have no
+        # `corpus` attribute, so len(bm25_model.corpus) raised AttributeError here.
+        return np.zeros(bm25_model.corpus_size)
+    return bm25_model.get_scores(query_tokens)
+def compute_word_overlap(query, questions):
+    """Score each question by the words it shares with the query.
+    The score is 0.7 * coverage + 0.3 * Jaccard, where coverage is the fraction of
+    distinct query words found in the question. Returns a list of floats, one per
+    question; all zeros when the query has no usable words.
+    """
+    wanted = set(arabic_word_tokenize(query))
+    if not wanted:
+        return [0.0] * len(questions)
+    def score(candidate):
+        words = set(arabic_word_tokenize(candidate))
+        if not words:
+            return 0.0
+        shared = len(wanted & words)
+        combined = len(wanted | words)
+        # Jaccard similarity: intersection over union
+        jaccard = shared / combined if combined > 0 else 0.0
+        # Coverage: how much of the query is matched
+        coverage = shared / len(wanted)
+        # Coverage dominates, Jaccard penalizes long unrelated questions
+        return 0.7 * coverage + 0.3 * jaccard
+    return [score(candidate) for candidate in questions]
+def normalize_scores(scores):
+    """Min-max normalize scores to the 0-1 range.
+    Accepts any array-like of numbers and returns a NumPy array of the same length.
+    An empty input is returned as an empty array instead of raising, and an input
+    whose values are all equal maps to all zeros.
+    """
+    scores = np.asarray(scores)
+    if scores.size == 0:
+        # np.min/np.max raise ValueError on empty arrays
+        return scores
+    low = np.min(scores)
+    high = np.max(scores)
+    if high == low:
+        return np.zeros_like(scores)
+    return (scores - low) / (high - low)
+def predict(text):
+    """Search the three question datasets for a query.
+    Each question gets a normalized semantic score (average cosine similarity of the
+    two encoders), a normalized BM25 score and a normalized word-overlap score. These
+    are mixed with weights chosen from the query's token count, plus a small boost
+    for questions that score high on several metrics.
+    Returns a dict with the keys "top2", "top3", "top1" (up to five result dicts
+    each) and "query_info", or the string "No query provided" for a blank query.
+    """
+    print(f"Received query: {text}")
+    if not text or not text.strip():
+        return "No query provided"
+    # Encode the query once per model; both encodings are reused for every dataset
+    encoded = model.encode(text, convert_to_tensor=True)
+    encoded_a = modela.encode(text, convert_to_tensor=True)
+    # Adaptive weighting: short queries lean on exact matches, long ones on semantics
+    query_length = len(arabic_word_tokenize(text))
+    if query_length <= 2:
+        semantic_weight, bm25_weight, word_weight = 0.3, 0.4, 0.3
+    elif query_length <= 5:
+        semantic_weight, bm25_weight, word_weight = 0.4, 0.35, 0.25
+    else:
+        semantic_weight, bm25_weight, word_weight = 0.5, 0.3, 0.2
+    def score_dataset(questions, links, emb, emb_a, bm25_model):
+        """Build one result dict per question of a single dataset."""
+        similarity = (util.pytorch_cos_sim(encoded, emb)[0] +
+                      util.pytorch_cos_sim(encoded_a, emb_a)[0]) / 2
+        # Normalize every metric so the weights combine comparable ranges
+        semantic = normalize_scores(similarity.cpu().numpy())
+        lexical = normalize_scores(compute_bm25_scores(text, bm25_model))
+        overlap = normalize_scores(compute_word_overlap(text, questions))
+        rows = []
+        for i, question in enumerate(questions):
+            semantic_score = float(semantic[i])
+            bm25_score = float(lexical[i])
+            word_score = float(overlap[i])
+            combined_score = (semantic_weight * semantic_score +
+                              bm25_weight * bm25_score +
+                              word_weight * word_score)
+            # Number of metrics on which this question performs strongly
+            strong = (semantic_score > 0.7) + (bm25_score > 0.7) + (word_score > 0.5)
+            if strong >= 2:
+                boost = 0.1
+            elif strong >= 1:
+                boost = 0.05
+            else:
+                boost = 0.0
+            rows.append({
+                "question": question,
+                "link": links[i],
+                "semantic_score": semantic_score,
+                "bm25_score": bm25_score,
+                "word_overlap_score": word_score,
+                "combined_score": combined_score + boost
+            })
+        return rows
+    def pick_diverse(rows, top_k=5):
+        """Top three by combined score, then the best unused BM25 and semantic hits."""
+        ranked = sorted(rows, key=lambda row: row["combined_score"], reverse=True)
+        chosen = ranked[:3]
+        seen = {row["question"] for row in chosen}
+        # Order matters: the BM25 pick is reserved before the semantic pick is searched
+        for metric in ("bm25_score", "semantic_score"):
+            candidates = sorted(rows, key=lambda row: row[metric], reverse=True)
+            extra = next((row for row in candidates if row["question"] not in seen), None)
+            if extra:
+                seen.add(extra["question"])
+                chosen.append(extra)
+        return chosen[:top_k]
+    top1 = pick_diverse(score_dataset(df_questions, df_links, embeddings, embeddingsa, bm25_model1))
+    top2 = pick_diverse(score_dataset(df2_questions, df2_links, embeddings2, embeddingsa2, bm25_model2))
+    top3 = pick_diverse(score_dataset(df3_questions, df3_links, embeddings3, embeddingsa3, bm25_model3))
+    return {
+        "top2": top2,
+        "top3": top3,
+        "top1": top1,
+        "query_info": {
+            "query_length": query_length,
+            "weights": {
+                "semantic": semantic_weight,
+                "bm25": bm25_weight,
+                "word_overlap": word_weight
+            }
+        }
+    }
+# Gradio UI: a single multi-line text box whose content is passed to predict();
+# the value predict() returns is shown through the JSON output component.
+title = "Enhanced Search with BM25"
+iface = gr.Interface(
+    fn=predict,
+    inputs=[gr.Textbox(label="Search Query", lines=3)],
+    outputs='json',
+    title=title,
+    description="Arabic text search using combined semantic similarity, BM25, and word overlap scoring"
+)
+# Start the web server only when the file is executed as a script
+if __name__ == "__main__":
+    iface.launch()
+# import torch
+# import pandas as pd
+# from sentence_transformers import SentenceTransformer, util
+# import gradio as gr
+# import re
+# import numpy as np
+# import math
+# from collections import Counter
+# # Load both models
+# model1 = SentenceTransformer("distilbert-base-multilingual-cased")
+# model2 = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
+# # Load data
+# print("Loading data and embeddings...")
+# df = pd.read_csv("cleaned1.csv")
+# df2 = pd.read_csv("cleaned2.csv")
+# df3 = pd.read_csv("cleaned3.csv")
+# embeddings1 = torch.load("embeddings1_1.pt")
+# embeddings2 = torch.load("embeddings2_1.pt")
+# embeddings3 = torch.load("embeddings3_1.pt")
+# embeddings1a = torch.load("embeddings1.pt")
+# embeddings2a = torch.load("embeddings2.pt")
+# embeddings3a = torch.load("embeddings3.pt")
+# # Arabic stopwords
+# ARABIC_STOPWORDS = {
+#     'في', 'من', 'إلى', 'عن', 'مع', 'هذا', 'هذه', 'ذلك', 'تلك',
+#     'التي', 'الذي', 'ما', 'لا', 'أن', 'أو', 'لكن', 'قد', 'حكم', 'قال',
+#     'كان', 'كانت', 'يكون', 'تكون', 'له', 'لها', 'لهم', 'و', 'أم', 'إن',
+#     'رضي', 'عليها', 'عنهم', 'عنه', 'عليهم', 'صلى', 'وسلم',
+#     'سلام', 'عليه', 'الرسول', 'النبي', 'عليه', 'السلام', 'حديث', 'احاديث'
+# }
+# def arabic_word_tokenize(text):
+#     if not isinstance(text, str): return []
+#     text = re.sub(r'[\u064B-\u065F\u0670]', '', text)
+#     return [t for t in re.findall(r'[\u0600-\u06FF]{2,}', text) if t not in ARABIC_STOPWORDS]
+# # Pre-tokenize questions and compute doc frequencies
+# def setup_tokenization_and_freqs(questions):
+#     tokenized = [arabic_word_tokenize(q) for q in questions]
+#     doc_freqs = Counter(word for doc in tokenized for word in set(doc))
+#     return tokenized, doc_freqs
+# tokenized1, doc_freqs1 = setup_tokenization_and_freqs(df["question"].values)
+# tokenized2, doc_freqs2 = setup_tokenization_and_freqs(df2["question"].values)
+# tokenized3, doc_freqs3 = setup_tokenization_and_freqs(df3["question"].values)
+# def compute_word_overlap(query, questions):
+#     q_words = set(arabic_word_tokenize(query))
+#     scores = []
+#     for doc in questions:
+#         d_words = set(arabic_word_tokenize(doc))
+#         if not d_words or not q_words:
+#             scores.append(0.0)
+#             continue
+#         inter = len(q_words & d_words)
+#         union = len(q_words | d_words)
+#         jaccard = inter / union if union else 0.0
+#         coverage = inter / len(q_words)
+#         scores.append(0.7 * coverage + 0.3 * jaccard)
+#     return scores
+# def lightweight_bm25_score(query_tokens, doc_tokens, doc_freqs, total_docs, k1=1.2, b=0.75):
+#     score = 0.0
+#     doc_len = len(doc_tokens)
+#     avg_doc_len = 10
+#     for term in query_tokens:
+#         if term in doc_tokens:
+#             tf = doc_tokens.count(term)
+#             df = doc_freqs.get(term, 0)
+#             if df > 0:
+#                 idf = math.log((total_docs - df + 0.5) / (df + 0.5))
+#                 score += idf * (tf * (k1 + 1)) / (tf + k1 * (1 - b + b * (doc_len / avg_doc_len)))
+#     return score
+# def normalize_scores(scores):
+#     arr = np.array(scores)
+#     if arr.max() == arr.min(): return np.zeros_like(arr)
+#     return (arr - arr.min()) / (arr.max() - arr.min())
+# def combine_scores(query, questions, tokenized, doc_freqs, emb1, emb2):
+#     total_docs = len(questions)
+#     q_emb1 = model1.encode(query, convert_to_tensor=True)
+#     q_emb2 = model2.encode(query, convert_to_tensor=True)
+#     sim1 = util.pytorch_cos_sim(q_emb1, emb1)[0]
+#     sim2 = util.pytorch_cos_sim(q_emb2, emb2)[0]
+#     sim_scores = ((sim1 + sim2) / 2).cpu().numpy()
+#     bm25_scores = [lightweight_bm25_score(arabic_word_tokenize(query), doc_tokens, doc_freqs, total_docs)
+#                    for doc_tokens in tokenized]
+#     word_scores = compute_word_overlap(query, questions)
+#     norm_bm25 = normalize_scores(bm25_scores)
+#     norm_word = normalize_scores(word_scores)
+#     norm_sim = normalize_scores(sim_scores)
+#     query_len = len(arabic_word_tokenize(query))
+#     if query_len <= 2:
+#         w_sem, w_bm, w_word = 0.3, 0.4, 0.3
+#     elif query_len <= 5:
+#         w_sem, w_bm, w_word = 0.4, 0.35, 0.25
+#     else:
+#         w_sem, w_bm, w_word = 0.5, 0.3, 0.2
+#     results = []
+#     for i, q in enumerate(questions):
+#         sem, bm, word = norm_sim[i], norm_bm25[i], norm_word[i]
+#         combined = w_sem*sem + w_bm*bm + w_word*word
+#         boost = 0.1 if sum([sem > 0.7, bm > 0.7, word > 0.5]) >= 2 else (0.05 if sum([sem > 0.7, bm > 0.7, word > 0.5]) == 1 else 0.0)
+#         results.append({
+#             "question": q,
+#             "semantic_score": sem,
+#             "bm25_score": bm,
+#             "word_overlap_score": word,
+#             "combined_score": combined + boost
+#         })
+#     return results
+# def get_top_diverse(results, links, top_k=5):
+#     results = [dict(r, link=links[i]) for i, r in enumerate(results)]
+#     top_combined = sorted(results, key=lambda x: x['combined_score'], reverse=True)[:3]
+#     used_q = {r['question'] for r in top_combined}
+#     top_bm = next((r for r in sorted(results, key=lambda x: x['bm25_score'], reverse=True) if r['question'] not in used_q), None)
+#     if top_bm: used_q.add(top_bm['question'])
+#     top_sem = next((r for r in sorted(results, key=lambda x: x['semantic_score'], reverse=True) if r['question'] not in used_q), None)
+#     final = top_combined + ([top_bm] if top_bm else []) + ([top_sem] if top_sem else [])
+#     return final[:top_k]
+# def predict(query):
+#     print(f"Query: {query}")
+#     results1 = combine_scores(query, df["question"].values, tokenized1, doc_freqs1, embeddings1, embeddings1a)
+#     results2 = combine_scores(query, df2["question"].values, tokenized2, doc_freqs2, embeddings2, embeddings2a)
+#     results3 = combine_scores(query, df3["question"].values, tokenized3, doc_freqs3, embeddings3, embeddings3a)
+#     return {
+#         "top2": get_top_diverse(results2, df2["link"].values),
+#         "top3": get_top_diverse(results3, df3["url"].values),
+#         "top1": get_top_diverse(results1, df["link"].values),
+#         "query_info": {
+#             "query_length": len(arabic_word_tokenize(query))
+#         }
+#     }
+# title = "Arabic Search: Dual-Model + BM25 + Overlap"
+# iface = gr.Interface(
+#     fn=predict,
+#     inputs=[gr.Textbox(label="Search Query", lines=3)],
+#     outputs="json",
+#     title=title,
+#     description="Accurate Arabic search using two semantic models, fast BM25, and word overlap."
+# )
+# if __name__ == "__main__":
+#     iface.launch()