import gradio as gr
import os
import logging
import chromadb
from sentence_transformers import SentenceTransformer
import google.generativeai as genai
from typing import List, Dict, Optional

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'  # Added format for cleaner logs
)
logger = logging.getLogger(__name__)

# Configure Gemini API
GEMINI_API_KEY = os.getenv('G_API_KEY')
if not GEMINI_API_KEY:
    # Fail loudly at startup instead of silently configuring with None, which
    # would only surface later as an opaque generate_content() error.
    logger.warning("G_API_KEY environment variable is not set - Gemini requests will fail")
genai.configure(api_key=GEMINI_API_KEY)

# Global RAG instance (lazily created by get_rag)
rag_instance = None


class SimpleRAG:
    """Dual-model RAG using two embedding models for better bilingual coverage."""

    def __init__(self, arabert_db_path="./mub_chroma_db",
                 distilbert_db_path="./mub_chroma_db_distilbert"):
        """Load both embedding models and their matching ChromaDB collections.

        Args:
            arabert_db_path: Path of the ChromaDB persisted with Arabert embeddings.
            distilbert_db_path: Path of the ChromaDB persisted with DistilBERT embeddings.
        """
        # Load Arabert model (Arabic-optimized)
        logger.info("Loading Arabert embedding model (Arabic-optimized)...")
        self.arabert_model = SentenceTransformer(
            "Omartificial-Intelligence-Space/Arabert-all-nli-triplet-Matryoshka"
        )
        # Embeddings are truncated to this many dimensions (Matryoshka-style)
        # to match the dimensionality of the persisted collection.
        self.arabert_dim = 512

        # Load DistilBERT model (Multilingual)
        logger.info("Loading DistilBERT embedding model (Multilingual)...")
        self.distilbert_model = SentenceTransformer("distilbert-base-multilingual-cased")
        self.distilbert_dim = 512  # Truncated to match ChromaDB collection

        # Load Arabert ChromaDB
        logger.info(f"Loading Arabert ChromaDB from {arabert_db_path}...")
        self.arabert_client = chromadb.PersistentClient(path=arabert_db_path)
        self.arabert_collection = self.arabert_client.get_collection(name="mub_info")
        logger.info(f"Arabert DB: Loaded {self.arabert_collection.count()} documents")

        # Load DistilBERT ChromaDB
        logger.info(f"Loading DistilBERT ChromaDB from {distilbert_db_path}...")
        self.distilbert_client = chromadb.PersistentClient(path=distilbert_db_path)
        self.distilbert_collection = self.distilbert_client.get_collection(name="mub_info")
        logger.info(f"DistilBERT DB: Loaded {self.distilbert_collection.count()} documents")

    def expand_query(self, query: str) -> List[str]:
        """Expand query with synonyms and related terms for better retrieval.

        Only expands for specific keywords - otherwise returns original query only.
        Returns at most two queries: the original plus one keyword-substituted
        alternative.
        """
        query_lower = query.lower().strip()
        expanded_queries = [query]  # Always include original first

        # Only expand if query contains specific keywords.
        # Keep it simple - just add one alternative query per keyword.
        expansions = {
            # English - Admission/Application related
            'apply': 'admission requirements',
            'application': 'admission',
            'how to enroll': 'admission requirements',
            'how to join': 'admission requirements',
            # English - Programs
            'nursing program': 'nursing',
            'nursing faculty': 'faculty of nursing',
            # English - Fees
            'tuition': 'fees',
            'cost': 'fees',
            # Arabic - Admission/Application
            'كيف أقدم': 'شروط القبول',
            'التقديم': 'القبول',
            'كيف التسجيل': 'شروط القبول',
            # Arabic - Programs
            'برنامج التمريض': 'التمريض',
            # Arabic - Fees
            'تكلفة': 'رسوم',
            'الأقساط': 'رسوم',
        }

        # Check for expansion matches (only add ONE alternative)
        for key, alternative in expansions.items():
            if key in query_lower:
                alt_query = query_lower.replace(key, alternative)
                if alt_query != query_lower and alt_query not in expanded_queries:
                    expanded_queries.append(alt_query)
                    break  # Only add one expansion

        # Limit to 2 queries total (original + 1 expansion max)
        return expanded_queries[:2]

    def translate_query(self, query: str, target_lang: str) -> Optional[str]:
        """Translate query using keyword matching only (no API calls).

        Fast and efficient for common queries.

        Args:
            query: User query text.
            target_lang: 'ar' translates EN->AR; anything else translates AR->EN.

        Returns:
            The matched translation keyword, or None when no keyword applies
            (caller should then search with the query as-is).
        """
        # Keyword translations for common queries
        keyword_translations = {
            'en_to_ar': {
                'president': 'رئيس',
                'dean': 'عميد',
                'tuition': 'رسوم',
                'fees': 'رسوم',
                'cost': 'تكلفة',
                'nursing': 'التمريض',
                'admission': 'القبول',
                'apply': 'التقديم',
                'application': 'طلب',
                'enroll': 'التسجيل',
                'enrollment': 'التسجيل',
                'requirements': 'شروط',
                'faculty': 'كلية',
                'program': 'برنامج',
                'university': 'جامعة',
                'course': 'مقرر',
                'department': 'قسم',
                'student': 'طالب',
                'registration': 'تسجيل',
                'scholarship': 'منحة',
                'contact': 'اتصال',
                'location': 'موقع',
                'address': 'عنوان',
            },
            'ar_to_en': {
                'رئيس': 'president',
                'عميد': 'dean',
                'رسوم': 'fees',
                'تكلفة': 'cost',
                'التمريض': 'nursing',
                'القبول': 'admission',
                'التقديم': 'apply',
                'طلب': 'application',
                'التسجيل': 'registration',
                'شروط': 'requirements',
                'كلية': 'faculty',
                'برنامج': 'program',
                'جامعة': 'university',
                'مقرر': 'course',
                'قسم': 'department',
                'طالب': 'student',
                'منحة': 'scholarship',
                'اتصال': 'contact',
                'موقع': 'location',
                'عنوان': 'address',
            }
        }

        query_lower = query.lower().strip()
        translation_dict = keyword_translations.get(
            'en_to_ar' if target_lang == 'ar' else 'ar_to_en', {}
        )

        # Try exact match first
        for keyword, translation in translation_dict.items():
            if keyword in query_lower or query_lower in keyword:
                logger.info(f"✅ Keyword match: '{query}' -> '{translation}'")
                return translation

        # Extract key words (remove question words and common words)
        words_to_remove = ['who', 'what', 'where', 'when', 'why', 'how', 'is', 'are',
                           'was', 'were', 'the', 'a', 'an', 'of', 'in', 'on', 'at',
                           'to', 'for', 'with', 'about']
        query_words = query_lower.split()
        key_words = [w for w in query_words if w not in words_to_remove and len(w) > 2]

        # Try matching individual key words
        for word in key_words:
            for keyword, translation in translation_dict.items():
                if word == keyword or keyword in word or word in keyword:
                    logger.info(f"✅ Key word match ('{word}'): '{query}' -> '{translation}'")
                    return translation

        # No translation found
        logger.info(f"ℹ️ No keyword match for: '{query}' (will search as-is)")
        return None  # Return None to indicate no translation available

    def embed_text_arabert(self, text: str) -> List[float]:
        """Generate embedding using Arabert model, truncated to the collection dim."""
        embedding = self.arabert_model.encode(text, convert_to_numpy=True).tolist()
        return embedding[:self.arabert_dim]  # Truncate to 512

    def embed_text_distilbert(self, text: str) -> List[float]:
        """Generate embedding using DistilBERT model, truncated to the collection dim."""
        embedding = self.distilbert_model.encode(text, convert_to_numpy=True).tolist()
        # CONSISTENCY FIX: use the configured dimension instead of a hard-coded 512,
        # matching embed_text_arabert.
        return embedding[:self.distilbert_dim]

    def search(self, query: str, n_results: int = 10,
               distance_threshold: Optional[float] = 1.5) -> List[Dict]:
        """Enhanced search with query expansion and relevance filtering.

        Language-based model selection strategy:
        - English queries → DistilBERT (better for English)
        - Arabic queries → Arabert (optimized for Arabic)

        Args:
            query: Search query
            n_results: Number of results to retrieve per query expansion
            distance_threshold: Maximum distance to consider a result relevant
                (lower = more similar). Default 1.5 works well for most queries;
                increase to 2.0 for broader results. None disables filtering.

        Returns:
            Up to n_results dicts with keys 'text', 'section', 'type', 'distance',
            sorted by ascending distance (most relevant first).
        """
        # Detect query language
        query_lang = detect_language(query)

        def format_results(results):
            """Helper to flatten a ChromaDB query result into a list of dicts."""
            formatted = []
            if results['metadatas'] and len(results['metadatas']) > 0:
                # First, log all distances to understand the scale
                if results['distances']:
                    all_distances = results['distances'][0]
                    logger.info(f"   Distance range: {min(all_distances):.3f} - {max(all_distances):.3f}")
                for idx, metadata in enumerate(results['metadatas'][0]):
                    # 999 is a sentinel "worst possible" distance when ChromaDB
                    # returned no distances.
                    distance = results['distances'][0][idx] if results['distances'] else 999
                    # Handle both old and new metadata formats
                    section = metadata.get('section')
                    if not section:
                        # New format: build from page_title and section_title
                        page_title = metadata.get('page_title', '')
                        section_title = metadata.get('section_title', '')
                        if section_title:
                            section = f"{page_title} - {section_title}" if page_title else section_title
                        else:
                            section = page_title or 'Unknown'
                    doc_type = metadata.get('type') or metadata.get('language', 'content')
                    formatted.append({
                        'text': results['documents'][0][idx],
                        'section': section,
                        'type': doc_type,
                        'distance': distance
                    })
            return formatted

        # Expand query with synonyms - but keep original query prominent
        expanded_queries = self.expand_query(query)
        logger.info(f"🔍 Expanded '{query}' into {len(expanded_queries)} queries")

        all_results = []
        seen_texts = set()  # Avoid duplicates

        # Search with each expanded query
        for exp_idx, exp_query in enumerate(expanded_queries):
            logger.info(f"   Searching with query {exp_idx+1}/{len(expanded_queries)}: '{exp_query}'")

            # Choose model based on detected language
            if query_lang == 'ar':
                query_embedding = self.embed_text_arabert(exp_query)
                search_results = self.arabert_collection.query(
                    query_embeddings=[query_embedding],
                    n_results=n_results,
                    include=['documents', 'metadatas', 'distances']
                )
            else:
                query_embedding = self.embed_text_distilbert(exp_query)
                search_results = self.distilbert_collection.query(
                    query_embeddings=[query_embedding],
                    n_results=n_results,
                    include=['documents', 'metadatas', 'distances']
                )
            formatted_results = format_results(search_results)

            # Add unique results, deduplicated by a cheap text-prefix key
            for result in formatted_results:
                text_hash = result['text'][:100]  # Use first 100 chars as hash
                if text_hash not in seen_texts:
                    seen_texts.add(text_hash)
                    all_results.append(result)

        # Sort by distance (most relevant first)
        all_results.sort(key=lambda x: x['distance'])

        # Apply distance threshold AFTER seeing all results
        if all_results and distance_threshold is not None:
            filtered_results = [r for r in all_results if r['distance'] <= distance_threshold]
            logger.info(
                f"📊 Before filtering: {len(all_results)} results, "
                f"After filtering (threshold {distance_threshold}): {len(filtered_results)} results"
            )
            all_results = filtered_results

        # Log results
        if all_results:
            logger.info(f"✅ Found {len(all_results)} unique results")
            logger.info(f"   Best match distance: {all_results[0]['distance']:.3f}")
            logger.info(f"   Top match: '{all_results[0]['section']}'")
        else:
            logger.warning(f"⚠️ No results found for query: {query}")

        # Return top results
        return all_results[:n_results]


def get_rag() -> SimpleRAG:
    """Get or lazily create the singleton RAG instance."""
    global rag_instance
    if rag_instance is None:
        logger.info("Initializing RAG system...")
        rag_instance = SimpleRAG()
        logger.info("RAG system ready!")
    return rag_instance


def detect_language(text: str) -> str:
    """Detect if text is Arabic ('ar') or English ('en').

    Counts characters in the Arabic Unicode block; if they make up more than
    30% of the alphabetic characters, the text is treated as Arabic.
    """
    arabic_chars = sum(1 for char in text if '\u0600' <= char <= '\u06FF')
    total_chars = sum(1 for char in text if char.isalpha())
    if total_chars == 0:
        return 'en'
    if arabic_chars / total_chars > 0.3:
        return 'ar'
    return 'en'


def format_sources(search_results) -> str:
    """Format the top search results as a markdown "Sources" section."""
    if not search_results:
        return ""
    sources_md = "\n\n---\n### 📚 Sources:\n\n"
    for idx, result in enumerate(search_results[:5], 1):
        # BUG FIX: the previous formula (100 - distance / 3) showed ~100% for
        # essentially any distance. Use the same distance->percentage mapping
        # as the context formatting in chat() (100 - distance * 30).
        relevance = round(100 - result['distance'] * 30, 1)
        sources_md += f"**{idx}. {result['section']}** (Relevance: {relevance}%)\n"
        sources_md += f"- Type: {result['type']}\n"
        preview = result['text'][:150] + "..." if len(result['text']) > 150 else result['text']
        sources_md += f"- Preview: {preview}\n\n"
    return sources_md


def generate_response(user_message: str, context: str, language: str = 'en') -> str:
    """Generate a response with Gemini, using a language-specific system prompt.

    Args:
        user_message: The user's question.
        context: Concatenated retrieved sources (may mix Arabic and English).
        language: 'ar' for an Arabic answer, anything else for English.

    Returns:
        The model's answer, or a localized error message on failure.
    """
    if language == 'ar':
        system_prompt = """أنت مساعد ذكي متخصص لجامعة المقاصد في بيروت (MUB).

مهمتك: الإجابة على أسئلة الطلاب والزوار حول الجامعة بناءً على المعلومات المتوفرة في السياق.

قواعد مهمة:
1. اقرأ السياق بعناية وابحث عن المعلومات ذات الصلة بالسؤال
2. استخدم المعلومات من السياق بغض النظر عن اللغة (عربي أو إنجليزي)
3. إذا كانت المعلومات بالإنجليزية، قم بترجمتها واستخدمها في إجابتك بالعربية
4. للأسئلة حول التقديم أو القبول:
   - ابحث عن شروط القبول والمتطلبات
   - اذكر الرسوم إن وُجدت
   - اذكر معلومات التواصل إن وُجدت
5. إذا وجدت معلومات جزئية فقط، اذكر ما تعرفه واشرح أنك قد تحتاج لمزيد من التفاصيل
6. إذا لم تجد أي معلومات ذات صلة في السياق، قل ذلك بوضوح واقترح على المستخدم التواصل مباشرة مع الجامعة
7. كن مهذباً ومفيداً وواضحاً
8. أجب باللغة العربية فقط

⚠️ مهم جداً: السياق قد يحتوي على معلومات بالعربية والإنجليزية معاً - اقرأ كل المصادر وليس فقط الأولى!
"""
    else:
        system_prompt = """You are an expert intelligent assistant for Makassed University of Beirut (MUB).

Your task: Answer questions from students and visitors about the university based on the available information in the context.

Important Rules:
1. Carefully read the context and look for information relevant to the question
2. Use information from the context REGARDLESS of the language (Arabic or English)
3. If the information is in Arabic, translate and use it in your English response
4. For questions about applying or admission:
   - Look for admission requirements and prerequisites
   - Mention fees if found
   - Include contact information if available
5. If you find only partial information, mention what you know and explain that more details may be needed
6. If you don't find ANY relevant information in the context, say so clearly and suggest the user contact the university directly
7. Be polite, helpful, and clear
8. Answer in English only

⚠️ Very Important: The context may contain information in both Arabic and English - read ALL sources, not just the first one!
"""

    full_prompt = f"""{system_prompt}

CONTEXT (Multiple Sources):
{context}

USER QUESTION: {user_message}

Instructions:
1. Read through ALL the context sources above carefully
2. Extract relevant information from ANY source (Arabic or English)
3. Provide a comprehensive answer based on the information found
4. If asking about procedures (like "how to apply"), provide step-by-step guidance if available
5. If the information is incomplete, be helpful with what you have and guide them to get more details

Your Answer:"""

    try:
        model = genai.GenerativeModel('gemini-2.5-flash-lite')
        response = model.generate_content(full_prompt)
        if response and response.text:
            return response.text.strip()
        else:
            return "عذراً، لم أتمكن من إنشاء إجابة." if language == 'ar' else "Sorry, couldn't generate a response."
    except Exception as e:
        logger.error(f"❌ GEMINI ERROR: {e}")
        return "عذراً، حدث خطأ." if language == 'ar' else "Sorry, an error occurred."
def chat(message, history):
    """Main chat function with auto language detection and improved retrieval.

    Args:
        message: The raw user message.
        history: Chat history (currently unused; retrieval is per-message).

    Returns:
        The assistant's answer as markdown with a sources section appended,
        or a localized fallback/error message.
    """
    try:
        logger.info(f"📝 USER QUERY: {message}")

        # Auto-detect language from user's message
        lang_code = detect_language(message)
        logger.info(f"🌍 Language detected: {lang_code}")

        rag = get_rag()

        # Search without strict distance threshold - let the model decide relevance.
        # This returns results and we'll let the LLM determine if they're relevant.
        search_results = rag.search(message, n_results=10, distance_threshold=None)

        # If truly no results (which shouldn't happen with ChromaDB), return helpful message
        if len(search_results) == 0:
            logger.warning("⚠️ No results found in database")
            if lang_code == 'ar':
                return """عذراً، لم أتمكن من العثور على معلومات في قاعدة البيانات.

للحصول على معلومات دقيقة، يرجى التواصل مباشرة مع جامعة المقاصد:
- الهاتف: +961 03 14 10 44
- البريد الإلكتروني: info@mub.edu.lb
- الموقع: https://mub.edu.lb

يمكنني المساعدة في أسئلة أخرى حول الجامعة!"""
            else:
                return """I apologize, but I couldn't find information in the database.

For accurate information, please contact Makassed University directly:
- Phone: +961 03 14 10 44
- Email: info@mub.edu.lb
- Website: https://mub.edu.lb

I'm happy to help with other questions about the university!"""

        # Format context with language indicators
        context_parts = []
        if lang_code == 'ar':
            context_parts.append("المعلومات من جامعة المقاصد (قد تكون بالعربية أو الإنجليزية):")
        else:
            context_parts.append("Information from Makassed University (may be in Arabic or English):")

        # Track language distribution in results
        languages_found = {'ar': 0, 'en': 0}
        for idx, result in enumerate(search_results, 1):
            # Detect source language so the LLM is told which sources need translating
            source_lang = detect_language(result['text'])
            lang_label = "Arabic" if source_lang == 'ar' else "English"
            languages_found[source_lang] += 1
            context_parts.append(
                f"\n--- Source {idx} [{lang_label}] (Relevance: {100 - (result['distance'] * 30):.1f}%) ---"
            )
            context_parts.append(result['text'])

        # Log language distribution
        logger.info(
            f"📊 Sources: {languages_found['ar']} Arabic, {languages_found['en']} English | "
            f"Query language: {lang_code}"
        )

        context = "\n".join(context_parts)

        # Generate response
        bot_response = generate_response(message, context, lang_code)

        # Add sources
        sources = format_sources(search_results)
        full_response = bot_response + sources

        # Log the first 100 chars of response to keep logs clean
        logger.info(f"🤖 BOT RESPONSE: {bot_response[:100].replace(chr(10), ' ')}...")

        return full_response
    except Exception as e:
        # logger.exception keeps the traceback in the logs (logger.error dropped it)
        logger.exception(f"❌ CHAT FATAL ERROR: {e}")
        error_lang = detect_language(message)
        return "عذراً، حدث خطأ." if error_lang == 'ar' else "Sorry, an error occurred."


# Create interface
with gr.Blocks(title="MUB Chatbot") as demo:
    gr.Markdown("""
# 🎓 Makassed University of Beirut (MUB) Chatbot

Ask me anything about Makassed University! أسألني عن جامعة المقاصد!

**The chatbot will automatically respond in the language you use (English or Arabic)**
""")

    # BUG FIX: respond() stores history as {"role", "content"} dicts, which
    # requires the Chatbot's "messages" format. Without type="messages", recent
    # Gradio versions default to the tuples format and reject/mis-render the
    # dict-shaped history.
    chatbot = gr.Chatbot(
        height=500,
        show_label=False,
        type="messages"
    )

    with gr.Row():
        msg = gr.Textbox(
            placeholder="Type your question... / اكتب سؤالك...",
            show_label=False,
            scale=9
        )
        submit = gr.Button("Send", scale=1, variant="primary")

    gr.Examples(
        examples=[
            ["What is Makassed University?"],
            ["What are the tuition fees?"],
            ["Tell me about nursing programs"],
            ["ما هي جامعة المقاصد؟"],
            ["ما هي رسوم الدراسة؟"],
        ],
        inputs=msg,
    )

    gr.Markdown("""
---
**About:** AI chatbot powered by RAG + Google Gemini
**Data:** Official MUB documentation
**Note:** For official information, contact the university directly.
""")

    def respond(message, chat_history):
        """Gradio callback: run chat() and append the exchange to the history."""
        if not message.strip():
            return "", chat_history
        bot_response = chat(message, chat_history)
        if chat_history is None:
            chat_history = []
        chat_history.append({"role": "user", "content": message})
        chat_history.append({"role": "assistant", "content": bot_response})
        # Return "" to clear the textbox
        return "", chat_history

    msg.submit(respond, [msg, chatbot], [msg, chatbot], api_name="predict")
    submit.click(respond, [msg, chatbot], [msg, chatbot])

# Launch
if __name__ == "__main__":
    demo.queue()
    demo.launch()