#!/usr/bin/env python3 """ Gradio App to showcase Sanskrit-English tokenizer vs Qwen tokenizer Interactive comparison with examples and custom input testing. """ import gradio as gr import pandas as pd from transformers import AutoTokenizer import time from collections import Counter import json # Load tokenizers print("Loading tokenizers...") try: our_tokenizer = AutoTokenizer.from_pretrained("diabolic6045/Sanskrit-English-qwen2-tokenizer") print("✅ Loaded our Sanskrit-English tokenizer") except Exception as e: print(f"❌ Failed to load our tokenizer: {e}") our_tokenizer = None try: qwen_tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-1.5B") print("✅ Loaded Qwen tokenizer") except Exception as e: print(f"❌ Failed to load Qwen tokenizer: {e}") qwen_tokenizer = None # Predefined examples for Gradio EXAMPLES = [ # Sanskrit examples "हरे कृष्ण हरे कृष्ण कृष्ण कृष्ण हरे हरे। हरे राम हरे राम राम राम हरे हरे॥", "कर्मण्येवाधिकारस्ते मा फलेषु कदाचन। मा कर्मफलहेतुर्भूर्मा ते सङ्गोऽस्त्वकर्मणि॥", "ॐ भूर्भुवः स्वः तत्सवितुर्वरेण्यं भर्गो देवस्य धीमहि धियो यो नः प्रचोदयात्॥", "जटाटवीगलज्जलप्रवाहपावितस्थले गलेऽवलम्ब्य लम्बितां भुजङ्गतुङ्गमालिकाम्।", "धर्मक्षेत्रे कुरुक्षेत्रे समवेता युयुत्सवः। मामकाः पाण्डवाश्चैव किमकुर्वत सञ्जय॥", "सर्वे भवन्तु सुखिनः सर्वे सन्तु निरामयाः। सर्वे भद्राणि पश्यन्तु मा कश्चिद्दुःखभाग्भवेत्॥", # English examples "To be, or not to be, that is the question: Whether 'tis nobler in the mind to suffer", "The theory of relativity fundamentally changed our understanding of space and time", "Machine learning algorithms require substantial computational resources and large datasets", "The Bhagavad Gita is a 700-verse Hindu scripture that is part of the epic Mahabharata", "Sanskrit is considered the mother of all Indo-European languages and is still used today", "Yoga, meditation, and mindfulness practices have gained global popularity in recent years", # Mixed examples "The word 'dharma' (धर्म) in Sanskrit means duty, righteousness, or moral law", "Yoga (योग) is a spiritual practice that originated in ancient India", "The mantra 'Om Namah Shivaya' (ॐ नमः शिवाय) is a powerful invocation to Lord Shiva", "Karma (कर्म) is the universal law of cause and effect in Hindu philosophy", "The chakra (चक्र) system describes energy centers in the human body", "Moksha (मोक्ष) represents liberation from the cycle of birth and death" ] def analyze_tokenizer(text, tokenizer, tokenizer_name): """Analyze text with a specific tokenizer.""" if tokenizer is None: return { "tokens": ["Tokenizer not available"], "count": 0, "token_ids": [], "decoded": "Tokenizer not available", "compression_ratio": 0, "reconstruction_accurate": False, "time_ms": 0 } start_time = time.time() # Tokenization tokens = tokenizer.tokenize(text) token_ids = tokenizer.encode(text) decoded = tokenizer.decode(token_ids) end_time = time.time() time_ms = (end_time - start_time) * 1000 # Analysis num_tokens = len(tokens) num_chars = len(text) compression_ratio = num_chars / num_tokens if num_tokens > 0 else 0 # Check reconstruction (remove special tokens) decoded_clean = decoded.replace("<|endoftext|>", "").strip() reconstruction_accurate = decoded_clean == text return { "tokens": tokens, "count": num_tokens, "token_ids": token_ids, "decoded": decoded, "compression_ratio": compression_ratio, "reconstruction_accurate": reconstruction_accurate, "time_ms": time_ms } def compare_tokenizers(text): """Compare both tokenizers on the same text.""" if not text.strip(): return "Please enter some text to analyze.", "", "", "" # Analyze 

def compare_tokenizers(text):
    """Compare both tokenizers on the same text."""
    if not text.strip():
        # Return an empty table so the Dataframe output stays valid
        return pd.DataFrame(), "Please enter some text to analyze.", "", ""

    # Analyze with both tokenizers
    our_analysis = analyze_tokenizer(text, our_tokenizer)
    qwen_analysis = analyze_tokenizer(text, qwen_tokenizer)

    # Build the comparison table
    comparison_data = []

    # Our tokenizer results (show first 20 tokens)
    our_tokens_str = " ".join(our_analysis["tokens"][:20])
    if len(our_analysis["tokens"]) > 20:
        our_tokens_str += f" ... (+{len(our_analysis['tokens']) - 20} more)"
    comparison_data.append([
        "Our Sanskrit-English Tokenizer",
        our_analysis["count"],
        f"{our_analysis['compression_ratio']:.2f}",
        "✅" if our_analysis["reconstruction_accurate"] else "❌",
        f"{our_analysis['time_ms']:.2f}ms",
        our_tokens_str,
    ])

    # Qwen tokenizer results (show first 20 tokens)
    qwen_tokens_str = " ".join(qwen_analysis["tokens"][:20])
    if len(qwen_analysis["tokens"]) > 20:
        qwen_tokens_str += f" ... (+{len(qwen_analysis['tokens']) - 20} more)"
    comparison_data.append([
        "Original Qwen Tokenizer",
        qwen_analysis["count"],
        f"{qwen_analysis['compression_ratio']:.2f}",
        "✅" if qwen_analysis["reconstruction_accurate"] else "❌",
        f"{qwen_analysis['time_ms']:.2f}ms",
        qwen_tokens_str,
    ])

    # Improvement factor: how many times more tokens Qwen needs than ours
    if our_analysis["count"] > 0 and qwen_analysis["count"] > 0:
        improvement = qwen_analysis["count"] / our_analysis["count"]
        improvement_text = f"**Improvement: {improvement:.1f}x fewer tokens with our tokenizer**"
    else:
        improvement_text = "Cannot calculate improvement"

    # DataFrame for display
    df = pd.DataFrame(comparison_data, columns=[
        "Tokenizer", "Token Count", "Compression Ratio", "Accurate", "Time", "Sample Tokens"
    ])

    return df, improvement_text, our_analysis["decoded"], qwen_analysis["decoded"]
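
# Worked example of the improvement factor, using the figures quoted in the
# About tab (indicative, not re-measured here): Qwen emits 36 byte-level
# tokens for the Hare Krishna mantra while our tokenizer emits 8 metaspace
# tokens, so improvement = 36 / 8 = 4.5x fewer tokens.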

def create_statistics_tab():
    """Build and return the callback that generates corpus-wide statistics."""

    def generate_stats():
        stats_data = []
        total_our_tokens = 0
        total_qwen_tokens = 0
        total_chars = 0

        for i, text in enumerate(EXAMPLES):
            our_analysis = analyze_tokenizer(text, our_tokenizer)
            qwen_analysis = analyze_tokenizer(text, qwen_tokenizer)

            total_our_tokens += our_analysis["count"]
            total_qwen_tokens += qwen_analysis["count"]
            total_chars += len(text)

            improvement = (qwen_analysis["count"] / our_analysis["count"]
                           if our_analysis["count"] > 0 else 0)

            # Determine category from position in EXAMPLES (6 per category)
            if i < 6:
                category = "Sanskrit"
            elif i < 12:
                category = "English"
            else:
                category = "Mixed"

            stats_data.append([
                category,
                text[:50] + "..." if len(text) > 50 else text,
                our_analysis["count"],
                qwen_analysis["count"],
                f"{improvement:.1f}x",
                f"{our_analysis['compression_ratio']:.2f}",
                f"{qwen_analysis['compression_ratio']:.2f}",
            ])

        overall_improvement = total_qwen_tokens / total_our_tokens if total_our_tokens > 0 else 0
        overall_compression = total_chars / total_our_tokens if total_our_tokens > 0 else 0

        stats_df = pd.DataFrame(stats_data, columns=[
            "Category", "Text Sample", "Our Tokens", "Qwen Tokens",
            "Improvement", "Our Compression", "Qwen Compression"
        ])

        summary = f"""
## Overall Statistics
- **Total Tests**: {len(EXAMPLES)}
- **Total Characters**: {total_chars:,}
- **Our Total Tokens**: {total_our_tokens:,}
- **Qwen Total Tokens**: {total_qwen_tokens:,}
- **Overall Improvement**: {overall_improvement:.1f}x fewer tokens
- **Our Average Compression**: {overall_compression:.2f} chars/token
- **Token Reduction**: {total_qwen_tokens - total_our_tokens:,} tokens saved
"""
        return stats_df, summary

    return generate_stats


# Create Gradio interface
def create_interface():
    with gr.Blocks(title="Sanskrit-English Tokenizer Showcase", theme=gr.themes.Soft()) as app:
        gr.Markdown("""
        # 🔥 Sanskrit-English Tokenizer Showcase

        Compare our custom Sanskrit-English tokenizer with the original Qwen tokenizer.
        See the dramatic improvements in tokenization efficiency for Sanskrit text!
        """)

        with gr.Tabs():
            # Tab 1: Interactive comparison with examples
            with gr.Tab("🔍 Interactive Comparison"):
                gr.Markdown("Enter any text to compare tokenization between our tokenizer and Qwen's original tokenizer.")

                with gr.Row():
                    with gr.Column():
                        input_text = gr.Textbox(
                            label="Enter text to analyze",
                            placeholder="Type Sanskrit, English, or mixed text here...",
                            lines=3
                        )
                        analyze_btn = gr.Button("🔍 Analyze", variant="primary")
                    with gr.Column():
                        improvement_text = gr.Markdown("")

                # Examples section
                gr.Markdown("### 📚 Try these examples:")
                gr.Examples(
                    examples=EXAMPLES,
                    inputs=[input_text],
                    label="Click any example to test it"
                )

                with gr.Row():
                    with gr.Column():
                        comparison_table = gr.Dataframe(
                            label="Tokenization Comparison",
                            headers=["Tokenizer", "Token Count", "Compression Ratio",
                                     "Accurate", "Time", "Sample Tokens"],
                            datatype=["str", "number", "str", "str", "str", "str"]
                        )

                with gr.Row():
                    with gr.Column():
                        our_decoded = gr.Textbox(label="Our Tokenizer Decoded Output", lines=2)
                    with gr.Column():
                        qwen_decoded = gr.Textbox(label="Qwen Tokenizer Decoded Output", lines=2)

                analyze_btn.click(
                    fn=compare_tokenizers,
                    inputs=[input_text],
                    outputs=[comparison_table, improvement_text, our_decoded, qwen_decoded]
                )

            # Tab 2: Statistics
            with gr.Tab("📊 Statistics & Analysis"):
                gr.Markdown("View comprehensive statistics across all predefined examples.")

                stats_btn = gr.Button("📈 Generate Statistics", variant="primary")

                with gr.Row():
                    with gr.Column():
                        stats_summary = gr.Markdown("")
                    with gr.Column():
                        stats_table = gr.Dataframe(
                            label="Detailed Statistics",
                            headers=["Category", "Text Sample", "Our Tokens", "Qwen Tokens",
                                     "Improvement", "Our Compression", "Qwen Compression"],
                            datatype=["str", "str", "number", "number", "str", "str", "str"]
                        )

                # create_statistics_tab() returns the generate_stats closure,
                # which takes no inputs and fills both outputs on click
                stats_btn.click(
                    fn=create_statistics_tab(),
                    outputs=[stats_table, stats_summary]
                )
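
            # The About tab below quotes a token-level comparison; assuming
            # both tokenizers loaded, it can be reproduced in a Python shell
            # via qwen_tokenizer.tokenize(...)  (byte-level fragments) vs.
            # our_tokenizer.tokenize(...)  (metaspace word pieces).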
            # Tab 3: About
            with gr.Tab("ℹ️ About"):
                gr.Markdown("""
                ## About This Tokenizer

                ### Problem Solved
                The original Qwen2.5 tokenizer produces inefficient byte-level tokens for Sanskrit text:
                - **Qwen's output**: `['ह', 'र', 'à¥ĩ', 'Ġà¤ķ', 'à¥', 'ĥ', 'ष', 'à¥įà¤', '£']` (36 tokens)
                - **Our output**: `['▁हरे', '▁कृष्ण', '▁हरे', '▁कृष्ण', '▁कृष्ण', '▁कृष्ण', '▁हरे', '▁हरे']` (8 tokens)

                ### Key Features
                - **4.5x better efficiency** for Sanskrit text
                - **120,000-token vocabulary** trained on an English+Sanskrit corpus
                - **Clean, readable tokens** - no more byte-level artifacts
                - **Native Hugging Face format** - no custom code needed
                - **100% reconstruction accuracy** - perfect encode/decode round trip
                - **Chat template support** - ready for inference

                ### Technical Details
                - **Architecture**: Native Hugging Face BPE with Metaspace pre-tokenizer
                - **Training Data**: 764K texts (100K English + 664K Sanskrit)
                - **Special Tokens**: `<|endoftext|>`, `<|im_start|>`, `<|im_end|>` (Qwen2 compatible)
                - **Model Size**: 3.5MB

                ### Use Cases
                - Sanskrit language models
                - English ↔ Sanskrit translation
                - Educational tools
                - Research applications

                **Created by**: Divax Shah (diabolic6045)

                **Repository**: [diabolic6045/Sanskrit-English-qwen2-tokenizer](https://huggingface.co/diabolic6045/Sanskrit-English-qwen2-tokenizer)
                """)

    return app


if __name__ == "__main__":
    print("🚀 Starting Sanskrit-English Tokenizer Showcase App...")
    app = create_interface()
    app.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=True,
        show_error=True
    )
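
# To run locally (assumed dependencies): pip install gradio pandas transformers,
# then run this script and open http://localhost:7860. launch() blocks until
# the server is stopped; share=True additionally requests a temporary public
# gradio.live link, so set it to False for a local-only app.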