#!/usr/bin/env python3
"""
Gradio App to showcase Sanskrit-English tokenizer vs Qwen tokenizer
Interactive comparison with examples and custom input testing.
"""

import gradio as gr
import pandas as pd
from transformers import AutoTokenizer
import time
from collections import Counter
import json

# Load tokenizers
print("Loading tokenizers...")
try:
    our_tokenizer = AutoTokenizer.from_pretrained("diabolic6045/Sanskrit-English-qwen2-tokenizer")
    print("✅ Loaded our Sanskrit-English tokenizer")
except Exception as e:
    print(f"❌ Failed to load our tokenizer: {e}")
    our_tokenizer = None

try:
    qwen_tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-1.5B")
    print("✅ Loaded Qwen tokenizer")
except Exception as e:
    print(f"❌ Failed to load Qwen tokenizer: {e}")
    qwen_tokenizer = None

# Predefined examples for Gradio
EXAMPLES = [
    # Sanskrit examples
    "हरे कृष्ण हरे कृष्ण कृष्ण कृष्ण हरे हरे। हरे राम हरे राम राम राम हरे हरे॥",
    "कर्मण्येवाधिकारस्ते मा फलेषु कदाचन। मा कर्मफलहेतुर्भूर्मा ते सङ्गोऽस्त्वकर्मणि॥",
    "ॐ भूर्भुवः स्वः तत्सवितुर्वरेण्यं भर्गो देवस्य धीमहि धियो यो नः प्रचोदयात्॥",
    "जटाटवीगलज्जलप्रवाहपावितस्थले गलेऽवलम्ब्य लम्बितां भुजङ्गतुङ्गमालिकाम्।",
    "धर्मक्षेत्रे कुरुक्षेत्रे समवेता युयुत्सवः। मामकाः पाण्डवाश्चैव किमकुर्वत सञ्जय॥",
    "सर्वे भवन्तु सुखिनः सर्वे सन्तु निरामयाः। सर्वे भद्राणि पश्यन्तु मा कश्चिद्दुःखभाग्भवेत्॥",
    
    # English examples
    "To be, or not to be, that is the question: Whether 'tis nobler in the mind to suffer",
    "The theory of relativity fundamentally changed our understanding of space and time",
    "Machine learning algorithms require substantial computational resources and large datasets",
    "The Bhagavad Gita is a 700-verse Hindu scripture that is part of the epic Mahabharata",
    "Sanskrit is considered the mother of all Indo-European languages and is still used today",
    "Yoga, meditation, and mindfulness practices have gained global popularity in recent years",
    
    # Mixed examples
    "The word 'dharma' (धर्म) in Sanskrit means duty, righteousness, or moral law",
    "Yoga (योग) is a spiritual practice that originated in ancient India",
    "The mantra 'Om Namah Shivaya' (ॐ नमः शिवाय) is a powerful invocation to Lord Shiva",
    "Karma (कर्म) is the universal law of cause and effect in Hindu philosophy",
    "The chakra (चक्र) system describes energy centers in the human body",
    "Moksha (मोक्ष) represents liberation from the cycle of birth and death"
]

def analyze_tokenizer(text, tokenizer, tokenizer_name):
    """Analyze text with a specific tokenizer."""
    if tokenizer is None:
        return {
            "tokens": ["Tokenizer not available"],
            "count": 0,
            "token_ids": [],
            "decoded": "Tokenizer not available",
            "compression_ratio": 0,
            "reconstruction_accurate": False,
            "time_ms": 0
        }
    
    start_time = time.time()
    
    # Tokenization
    tokens = tokenizer.tokenize(text)
    token_ids = tokenizer.encode(text)
    decoded = tokenizer.decode(token_ids)
    
    end_time = time.time()
    time_ms = (end_time - start_time) * 1000
    
    # Analysis
    num_tokens = len(tokens)
    num_chars = len(text)
    compression_ratio = num_chars / num_tokens if num_tokens > 0 else 0
    
    # Check reconstruction (remove special tokens)
    decoded_clean = decoded.replace("<|endoftext|>", "").strip()
    reconstruction_accurate = decoded_clean == text
    
    return {
        "tokens": tokens,
        "count": num_tokens,
        "token_ids": token_ids,
        "decoded": decoded,
        "compression_ratio": compression_ratio,
        "reconstruction_accurate": reconstruction_accurate,
        "time_ms": time_ms
    }

def compare_tokenizers(text):
    """Compare both tokenizers on the same text."""
    if not text.strip():
        return "Please enter some text to analyze.", "", "", ""
    
    # Analyze with our tokenizer
    our_analysis = analyze_tokenizer(text, our_tokenizer, "Our Tokenizer")
    qwen_analysis = analyze_tokenizer(text, qwen_tokenizer, "Qwen Tokenizer")
    
    # Create comparison table
    comparison_data = []
    
    # Our tokenizer results
    our_tokens_str = " ".join(our_analysis["tokens"][:20])  # Show first 20 tokens
    if len(our_analysis["tokens"]) > 20:
        our_tokens_str += f" ... (+{len(our_analysis['tokens']) - 20} more)"
    
    comparison_data.append([
        "Our Sanskrit-English Tokenizer",
        our_analysis["count"],
        f"{our_analysis['compression_ratio']:.2f}",
        "✅" if our_analysis["reconstruction_accurate"] else "❌",
        f"{our_analysis['time_ms']:.2f}ms",
        our_tokens_str
    ])
    
    # Qwen tokenizer results
    qwen_tokens_str = " ".join(qwen_analysis["tokens"][:20])  # Show first 20 tokens
    if len(qwen_analysis["tokens"]) > 20:
        qwen_tokens_str += f" ... (+{len(qwen_analysis['tokens']) - 20} more)"
    
    comparison_data.append([
        "Original Qwen Tokenizer",
        qwen_analysis["count"],
        f"{qwen_analysis['compression_ratio']:.2f}",
        "✅" if qwen_analysis["reconstruction_accurate"] else "❌",
        f"{qwen_analysis['time_ms']:.2f}ms",
        qwen_tokens_str
    ])
    
    # Calculate improvement
    if our_analysis["count"] > 0 and qwen_analysis["count"] > 0:
        improvement = qwen_analysis["count"] / our_analysis["count"]
        improvement_text = f"**Improvement: {improvement:.1f}x fewer tokens with our tokenizer**"
    else:
        improvement_text = "Cannot calculate improvement"
    
    # Create DataFrame for display
    df = pd.DataFrame(comparison_data, columns=[
        "Tokenizer", "Token Count", "Compression Ratio", "Accurate", "Time", "Sample Tokens"
    ])
    
    return df, improvement_text, our_analysis["decoded"], qwen_analysis["decoded"]

def create_statistics_tab():
    """Create statistics and analysis tab."""
    def generate_stats():
        stats_data = []
        total_our_tokens = 0
        total_qwen_tokens = 0
        total_chars = 0
        
        for i, text in enumerate(EXAMPLES):
            our_analysis = analyze_tokenizer(text, our_tokenizer, "Our")
            qwen_analysis = analyze_tokenizer(text, qwen_tokenizer, "Qwen")
            
            total_our_tokens += our_analysis["count"]
            total_qwen_tokens += qwen_analysis["count"]
            total_chars += len(text)
            
            improvement = qwen_analysis["count"] / our_analysis["count"] if our_analysis["count"] > 0 else 0
            
            # Determine category based on index
            if i < 6:
                category = "Sanskrit"
            elif i < 12:
                category = "English"
            else:
                category = "Mixed"
            
            stats_data.append([
                category,
                text[:50] + "..." if len(text) > 50 else text,
                our_analysis["count"],
                qwen_analysis["count"],
                f"{improvement:.1f}x",
                f"{our_analysis['compression_ratio']:.2f}",
                f"{qwen_analysis['compression_ratio']:.2f}"
            ])
        
        overall_improvement = total_qwen_tokens / total_our_tokens if total_our_tokens > 0 else 0
        overall_compression = total_chars / total_our_tokens if total_our_tokens > 0 else 0
        
        stats_df = pd.DataFrame(stats_data, columns=[
            "Category", "Text Sample", "Our Tokens", "Qwen Tokens", "Improvement", "Our Compression", "Qwen Compression"
        ])
        
        summary = f"""
        ## Overall Statistics
        
        - **Total Tests**: {len(EXAMPLES)}
        - **Total Characters**: {total_chars:,}
        - **Our Total Tokens**: {total_our_tokens:,}
        - **Qwen Total Tokens**: {total_qwen_tokens:,}
        - **Overall Improvement**: {overall_improvement:.1f}x fewer tokens
        - **Our Average Compression**: {overall_compression:.2f} chars/token
        - **Token Reduction**: {total_qwen_tokens - total_our_tokens:,} tokens saved
        """
        
        return stats_df, summary
    
    return generate_stats

# Create Gradio interface
def create_interface():
    with gr.Blocks(title="Sanskrit-English Tokenizer Showcase", theme=gr.themes.Soft()) as app:
        gr.Markdown("""
        # 🔥 Sanskrit-English Tokenizer Showcase
        
        Compare our custom Sanskrit-English tokenizer with the original Qwen tokenizer.
        See the dramatic improvements in tokenization efficiency for Sanskrit text!
        """)
        
        with gr.Tabs():
            # Tab 1: Interactive Comparison with Examples
            with gr.Tab("🔍 Interactive Comparison"):
                gr.Markdown("Enter any text to compare tokenization between our tokenizer and Qwen's original tokenizer.")
                
                with gr.Row():
                    with gr.Column():
                        input_text = gr.Textbox(
                            label="Enter text to analyze",
                            placeholder="Type Sanskrit, English, or mixed text here...",
                            lines=3
                        )
                        analyze_btn = gr.Button("🔍 Analyze", variant="primary")
                    
                    with gr.Column():
                        improvement_text = gr.Markdown("")
                
                # Examples section
                gr.Markdown("### 📚 Try these examples:")
                examples = gr.Examples(
                    examples=EXAMPLES,
                    inputs=[input_text],
                    label="Click any example to test it"
                )
                
                with gr.Row():
                    with gr.Column():
                        comparison_table = gr.Dataframe(
                            label="Tokenization Comparison",
                            headers=["Tokenizer", "Token Count", "Compression Ratio", "Accurate", "Time", "Sample Tokens"],
                            datatype=["str", "number", "str", "str", "str", "str"]
                        )
                
                with gr.Row():
                    with gr.Column():
                        our_decoded = gr.Textbox(label="Our Tokenizer Decoded Output", lines=2)
                    with gr.Column():
                        qwen_decoded = gr.Textbox(label="Qwen Tokenizer Decoded Output", lines=2)
                
                analyze_btn.click(
                    fn=compare_tokenizers,
                    inputs=[input_text],
                    outputs=[comparison_table, improvement_text, our_decoded, qwen_decoded]
                )
            
            # Tab 2: Statistics
            with gr.Tab("📊 Statistics & Analysis"):
                gr.Markdown("View comprehensive statistics across all predefined examples.")
                
                stats_btn = gr.Button("📈 Generate Statistics", variant="primary")
                
                with gr.Row():
                    with gr.Column():
                        stats_summary = gr.Markdown("")
                    with gr.Column():
                        stats_table = gr.Dataframe(
                            label="Detailed Statistics",
                            headers=["Category", "Text Sample", "Our Tokens", "Qwen Tokens", "Improvement", "Our Compression", "Qwen Compression"],
                            datatype=["str", "str", "number", "number", "str", "str", "str"]
                        )
                
                stats_btn.click(
                    fn=create_statistics_tab(),
                    outputs=[stats_table, stats_summary]
                )
            
            # Tab 3: About
            with gr.Tab("ℹ️ About"):
                gr.Markdown("""
                ## About This Tokenizer
                
                ### Problem Solved
                The original Qwen2.5 tokenizer produces inefficient byte-level tokens for Sanskrit text:
                - **Qwen's output**: `['à¤¹', 'à¤°', 'à¥ĩ', 'Ġà¤ķ', 'à¥', 'ĥ', 'à¤·', 'à¥įà¤', '£']` (36 tokens)
                - **Our output**: `['▁हरे', '▁कृष्ण', '▁हरे', '▁कृष्ण', '▁कृष्ण', '▁कृष्ण', '▁हरे', '▁हरे']` (8 tokens)
                
                ### Key Features
                - **4.5x better efficiency** for Sanskrit text
                - **120,000 vocabulary** trained on English+Sanskrit corpus
                - **Clean, readable tokens** - no more byte-level artifacts
                - **Native Hugging Face format** - no custom code needed
                - **100% reconstruction accuracy** - perfect encode/decode
                - **Chat template support** - ready for inference
                
                ### Technical Details
                - **Architecture**: Native Hugging Face BPE with Metaspace pre-tokenizer
                - **Training Data**: 764K texts (100K English + 664K Sanskrit)
                - **Special Tokens**: `<|endoftext|>`, `<pad>`, `<unk>` (Qwen2 compatible)
                - **Model Size**: 3.5MB
                
                ### Use Cases
                - Sanskrit language models
                - English ↔ Sanskrit translation
                - Educational tools
                - Research applications
                
                **Created by**: Divax Shah (diabolic6045)  
                **Repository**: [diabolic6045/Sanskrit-English-qwen2-tokenizer](https://huggingface.co/diabolic6045/Sanskrit-English-qwen2-tokenizer)
                """)
    
    return app

if __name__ == "__main__":
    print("🚀 Starting Sanskrit-English Tokenizer Showcase App...")
    app = create_interface()
    app.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=True,
        show_error=True
    )