import torch
import pandas as pd
from sentence_transformers import SentenceTransformer, util
import gradio as gr
import re
from rank_bm25 import BM25Okapi
import numpy as np

# Load models
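# Two multilingual sentence encoders are loaded side by side; predict() below
# averages their cosine similarities, which is intended to smooth out
# model-specific quirks on Arabic queries.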
model = SentenceTransformer("distilbert-base-multilingual-cased")
modela = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")

# Load data
df = pd.read_csv("cleaned1.csv")
df2 = pd.read_csv("cleaned2.csv")
df3 = pd.read_csv("cleaned3.csv")

# Load pre-computed embeddings
embeddings = torch.load("embeddings1_1.pt")
embeddings2 = torch.load("embeddings2_1.pt")
embeddings3 = torch.load("embeddings3_1.pt")

embeddingsa = torch.load("embeddings1.pt")
embeddingsa2 = torch.load("embeddings2.pt")
embeddingsa3 = torch.load("embeddings3.pt")
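# Presumably each embeddings*_1.pt file was encoded with `model` and each
# embeddings*.pt file with `modela` (that is how they are paired in predict()),
# and the rows of every tensor align one-to-one with the rows of its CSV.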

# Extract questions and links
df_questions = df["question"].values
df_links = df["link"].values
df2_questions = df2["question"].values
df2_links = df2["link"].values
df3_questions = df3["question"].values
df3_links = df3["url"].values

ARABIC_STOPWORDS = {
    'ููŠ', 'ู…ู†', 'ุฅู„ู‰', 'ุนู†', 'ู…ุน', 'ู‡ุฐุง', 'ู‡ุฐู‡', 'ุฐู„ูƒ', 'ุชู„ูƒ',
    'ุงู„ุชูŠ', 'ุงู„ุฐูŠ', 'ู…ุง', 'ู„ุง', 'ุฃู†', 'ุฃูˆ', 'ู„ูƒู†', 'ู‚ุฏ', 'ุญูƒู…', 'ู‚ุงู„',
    'ูƒุงู†', 'ูƒุงู†ุช', 'ูŠูƒูˆู†', 'ุชูƒูˆู†', 'ู„ู‡', 'ู„ู‡ุง', 'ู„ู‡ู…', 'ูˆ', 'ุฃู…', 'ุฅู†',
    'ุฑุถูŠ', 'ุนู„ูŠู‡ุง', 'ุนู†ู‡ู…', 'ุนู†ู‡', 'ุนู„ูŠู‡ู…', 'ุตู„ู‰', 'ูˆุณู„ู…',
    'ุณู„ุงู…', 'ุนู„ูŠู‡', 'ุงู„ุฑุณูˆู„', 'ุงู„ู†ุจูŠ', 'ุงู„ุณู„ุงู…', 'ุญุฏูŠุซ', 'ุงุญุงุฏูŠุซ'
}

def arabic_word_tokenize(text):
    """Tokenize Arabic text: strip diacritics, keep Arabic tokens of length >= 2, drop stopwords."""
    if not isinstance(text, str):
        return []
    # Remove diacritics (harakat and superscript alef)
    text = re.sub(r'[\u064B-\u065F\u0670]', '', text)
    # Extract only Arabic words (length >= 2)
    tokens = re.findall(r'[\u0600-\u06FF]{2,}', text)

    return [t for t in tokens if t not in ARABIC_STOPWORDS]
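
# Illustrative example: arabic_word_tokenize("ู…ุง ุญูƒู… ุงู„ุตู„ุงุฉ ููŠ ุงู„ุณูุฑ")
# drops the stopwords "ู…ุง", "ุญูƒู…", and "ููŠ" and returns ["ุงู„ุตู„ุงุฉ", "ุงู„ุณูุฑ"].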

def prepare_bm25_corpus(questions):
    """Prepare tokenized corpus for BM25"""
    tokenized_corpus = []
    for question in questions:
        tokens = arabic_word_tokenize(question)
        tokenized_corpus.append(tokens)
    return tokenized_corpus

# Initialize BM25 models for each dataset
print("Initializing BM25 models...")
bm25_corpus1 = prepare_bm25_corpus(df_questions)
bm25_corpus2 = prepare_bm25_corpus(df2_questions)
bm25_corpus3 = prepare_bm25_corpus(df3_questions)

bm25_model1 = BM25Okapi(bm25_corpus1)
bm25_model2 = BM25Okapi(bm25_corpus2)
bm25_model3 = BM25Okapi(bm25_corpus3)
print("BM25 models initialized!")
corpus_length1 = len(df_questions)
corpus_length2 = len(df2_questions)
corpus_length3 = len(df3_questions)


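# BM25Okapi ranks a document D against query Q roughly as
#   score(D, Q) = sum_i IDF(q_i) * tf(q_i, D) * (k1 + 1)
#                 / (tf(q_i, D) + k1 * (1 - b + b * |D| / avgdl))
# using rank_bm25's default parameters (k1=1.5, b=0.75).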
def compute_bm25_scores(query, bm25_model, corpus_length):
    """Compute BM25 scores for a query"""
    query_tokens = arabic_word_tokenize(query)
    if not query_tokens:
        return np.zeros(corpus_length)
    
    scores = bm25_model.get_scores(query_tokens)
    return scores

def compute_word_overlap(query, questions):
    """Enhanced word overlap computation"""
    query_words = set(arabic_word_tokenize(query))
    if len(query_words) == 0:
        return [0.0] * len(questions)
    
    overlaps = []
    for q in questions:
        q_words = set(arabic_word_tokenize(q))
        if len(q_words) == 0:
            overlaps.append(0.0)
            continue
            
        # Use Jaccard similarity (intersection over union)
        intersection = len(query_words & q_words)
        union = len(query_words | q_words)
        jaccard = intersection / union if union > 0 else 0.0
        
        # Also compute coverage (how much of query is matched)
        coverage = intersection / len(query_words)
        
        # Combine both: prioritize coverage but consider similarity
        overlap_score = 0.7 * coverage + 0.3 * jaccard
        overlaps.append(overlap_score)
    
    return overlaps
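
# Worked example: query words {a, b} vs. candidate words {a, b, c, d} give
# coverage = 2/2 = 1.0, jaccard = 2/4 = 0.5, score = 0.7*1.0 + 0.3*0.5 = 0.85.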

def normalize_scores(scores):
    """Normalize scores to 0-1 range"""
    scores = np.array(scores)
    if np.max(scores) == np.min(scores):
        return np.zeros_like(scores)
    return (scores - np.min(scores)) / (np.max(scores) - np.min(scores))
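
# Note: min-max normalization is applied per corpus, so normalized scores are
# comparable within one dataset but not directly across the three datasets.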

def predict(text):
    print(f"Received query: {text}")
    if not text or text.strip() == "":
        return "No query provided"

    # Semantic similarity scores
    query_embedding = model.encode(text, convert_to_tensor=True)
    query_embeddinga = modela.encode(text, convert_to_tensor=True)

    # Cosine similarities (averaged from two models)
    sim_scores1 = (util.pytorch_cos_sim(query_embedding, embeddings)[0] + 
                   util.pytorch_cos_sim(query_embeddinga, embeddingsa)[0]) / 2
    sim_scores2 = (util.pytorch_cos_sim(query_embedding, embeddings2)[0] + 
                   util.pytorch_cos_sim(query_embeddinga, embeddingsa2)[0]) / 2
    sim_scores3 = (util.pytorch_cos_sim(query_embedding, embeddings3)[0] + 
                   util.pytorch_cos_sim(query_embeddinga, embeddingsa3)[0]) / 2

    # BM25 scores
    bm25_scores1 = compute_bm25_scores(text, bm25_model1, corpus_length1)
    bm25_scores2 = compute_bm25_scores(text, bm25_model2, corpus_length2)
    bm25_scores3 = compute_bm25_scores(text, bm25_model3, corpus_length3)

    # Word overlap scores
    word_overlap1 = compute_word_overlap(text, df_questions)
    word_overlap2 = compute_word_overlap(text, df2_questions)
    word_overlap3 = compute_word_overlap(text, df3_questions)

    # Normalize all scores for fair combination
    norm_sim1 = normalize_scores(sim_scores1.cpu().numpy())
    norm_sim2 = normalize_scores(sim_scores2.cpu().numpy())
    norm_sim3 = normalize_scores(sim_scores3.cpu().numpy())
    
    norm_bm25_1 = normalize_scores(bm25_scores1)
    norm_bm25_2 = normalize_scores(bm25_scores2)
    norm_bm25_3 = normalize_scores(bm25_scores3)
    
    norm_word1 = normalize_scores(word_overlap1)
    norm_word2 = normalize_scores(word_overlap2)
    norm_word3 = normalize_scores(word_overlap3)

    # Adaptive weighting based on query characteristics
    query_words = arabic_word_tokenize(text)
    query_length = len(query_words)
    
    if query_length <= 4:
        # Short queries: prioritize exact matches (BM25 + word overlap)
        semantic_weight = 0.3
        bm25_weight = 0.4
        word_weight = 0.3
    elif query_length <= 6:
        # Medium queries: balanced approach
        semantic_weight = 0.4
        bm25_weight = 0.35
        word_weight = 0.25
    else:
        # Long queries: prioritize semantic understanding
        semantic_weight = 0.5
        bm25_weight = 0.3
        word_weight = 0.2
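
    # Example: a 3-token query (after stopword removal) gets weights
    # 0.3/0.4/0.3, while a 9-token query gets 0.5/0.3/0.2.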

    def create_combined_results(questions, links, norm_semantic, norm_bm25, norm_word):
        combined_results = []
        
        for i in range(len(questions)):
            semantic_score = float(norm_semantic[i])
            bm25_score = float(norm_bm25[i])
            word_score = float(norm_word[i])
            
            # Enhanced scoring with BM25
            combined_score = (semantic_weight * semantic_score + 
                            bm25_weight * bm25_score + 
                            word_weight * word_score)
            
            # Boost results that perform well across multiple metrics
            high_performance_count = sum([
                semantic_score > 0.7,
                bm25_score > 0.7,
                word_score > 0.5
            ])
            
            if high_performance_count >= 2:
                boost = 0.1
            elif high_performance_count >= 1:
                boost = 0.05
            else:
                boost = 0.0
                
            final_score = combined_score + boost
            
            combined_results.append({
                "question": questions[i],
                "link": links[i],
                "semantic_score": semantic_score,
                "bm25_score": bm25_score,
                "word_overlap_score": word_score,
                "combined_score": final_score
            })
        
        return combined_results
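
    # Boost example: normalized scores semantic=0.8, bm25=0.75, word=0.4 clear
    # two of the three thresholds, so the combined score is boosted by 0.1.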

    # Create combined results for all datasets
    combined1 = create_combined_results(df_questions, df_links, norm_sim1, norm_bm25_1, norm_word1)
    combined2 = create_combined_results(df2_questions, df2_links, norm_sim2, norm_bm25_2, norm_word2)
    combined3 = create_combined_results(df3_questions, df3_links, norm_sim3, norm_bm25_3, norm_word3)

    def get_diverse_top_results(combined_results, top_k=15):
        """Get diverse top results using multiple ranking strategies with BM25 threshold"""
        
        # First, check if any results have BM25 score > 0.1
        has_good_bm25 = any(item["bm25_score"] > 0.1 for item in combined_results)
        
        if has_good_bm25:
            # Filter results to only include those with BM25 > 0.1
            filtered_results = [item for item in combined_results if item["bm25_score"] > 0.1]
        else:
            # If all BM25 scores are <= 0.1, use all results
            filtered_results = combined_results
        
        # Sort by combined score and take the first top_k - 5 candidates,
        # leaving the remaining slots for the diversity picks below
        by_combined = sorted(filtered_results, key=lambda x: x["combined_score"], reverse=True)
        top_combined = by_combined[:top_k - 5]
        
        # Get questions from top combined to avoid duplicates
        used_questions = {item["question"] for item in top_combined}
        
        # Add best BM25 result not already included (from filtered results)
        by_bm25 = sorted(filtered_results, key=lambda x: x["bm25_score"], reverse=True)
        bm25_pick = None
        for item in by_bm25:
            if item["question"] not in used_questions:
                bm25_pick = item
                break
        
        # Add best semantic result not already included (from filtered results)
        by_semantic = sorted(filtered_results, key=lambda x: x["semantic_score"], reverse=True)
        semantic_pick = None
        if bm25_pick:
            used_questions.add(bm25_pick["question"])
        
        for item in by_semantic:
            if item["question"] not in used_questions:
                semantic_pick = item
                break
        
        # Combine results
        final_results = top_combined.copy()
        if bm25_pick:
            final_results.append(bm25_pick)
        if semantic_pick:
            final_results.append(semantic_pick)
        
        return final_results[:top_k]
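
    # Note: with the default top_k=15 this returns at most (top_k - 5) + 2 = 12
    # results, since only one extra BM25 pick and one semantic pick are appended.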
    
    # Get top results for each dataset
    top1 = get_diverse_top_results(combined1)
    top2 = get_diverse_top_results(combined2)
    top3 = get_diverse_top_results(combined3)

    results = {
        "top1": top1,
        "top2": top2,
        "top3": top3,
        "query_info": {
            "query_length": query_length,
            "weights": {
                "semantic": semantic_weight,
                "bm25": bm25_weight,
                "word_overlap": word_weight
            }
        }
    }

    return results

title = "Enhanced Search with BM25"
iface = gr.Interface(
    fn=predict,
    inputs=[gr.Textbox(label="Search Query", lines=3)],
    outputs='json',
    title=title,
    description="Arabic text search using combined semantic similarity, BM25, and word overlap scoring"
)

if __name__ == "__main__":
    iface.launch()
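
# Quick smoke test without the UI (assuming this file is saved as app.py; the
# module name is illustrative):
#   >>> from app import predict
#   >>> out = predict("ู…ุง ุญูƒู… ุงู„ุตู„ุงุฉ ููŠ ุงู„ุณูุฑ")
#   >>> out["query_info"]["weights"]
#   {'semantic': 0.3, 'bm25': 0.4, 'word_overlap': 0.3}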