AvtnshM committed on
Commit d0422d9 · verified · 1 Parent(s): 4e42095
Files changed (1)
  1. app.py +93 -624
app.py CHANGED
@@ -1,653 +1,122 @@
  import gradio as gr
  import torch
  import torchaudio
- import numpy as np
- from transformers import (
-     AutoProcessor,
-     AutoModelForSpeechSeq2Seq,
-     AutoModelForCTC,
-     Wav2Vec2Processor,
-     Wav2Vec2ForCTC
- )
  import librosa
  import time
- import os
- from typing import Dict, Tuple, Optional
- import jiwer
- import warnings
- warnings.filterwarnings("ignore")
 
  # Model configurations
- MODELS_CONFIG = {
-     "IndicConformer-600M": {
-         "repo_id": "ai4bharat/indic-conformer-600m-multilingual",
-         "type": "conformer",
-         "params": "600M",
-         "languages": "22 Indian languages (Hindi, Bengali, Gujarati, Marathi, Tamil, Telugu, Kannada, Malayalam, etc.)",
-         "architecture": "Multilingual Conformer-based Hybrid CTC + RNNT",
-         "license": "MIT",
-         "description": "AI4Bharat's comprehensive ASR model for all 22 official Indian languages"
-     },
-     "AudioX-North": {
-         "repo_id": "placeholder/audiox-north",  # Replace with actual repo when available
-         "type": "audiox",
-         "params": "Unknown",
-         "languages": "Hindi, Gujarati, Marathi",
-         "architecture": "Fine-tuned ASR with domain adaptation",
-         "license": "Unknown",
-         "description": "Jivi AI's specialized model for North Indian languages"
      },
-     "AudioX-South": {
-         "repo_id": "placeholder/audiox-south",  # Replace with actual repo when available
-         "type": "audiox",
-         "params": "Unknown",
-         "languages": "Tamil, Telugu, Kannada, Malayalam",
-         "architecture": "Fine-tuned ASR with domain adaptation",
-         "license": "Unknown",
-         "description": "Jivi AI's specialized model for South Indian languages"
      },
-     "Facebook-MMS": {
-         "repo_id": "facebook/mms-1b-all",
-         "type": "mms",
-         "params": "1B",
-         "languages": "1400+ languages worldwide",
-         "architecture": "Wav2Vec2 self-supervised pretraining",
-         "license": "CC-BY-NC 4.0",
-         "description": "Facebook's massive multilingual speech model"
      }
  }
 
- # Benchmark data from AudioX (Vistaar Benchmark)
- VISTAAR_BENCHMARK = {
-     "Hindi": {"AudioX": 12.14, "ElevenLabs": 13.64, "Sarvam": 14.28, "IndicWhisper": 13.59, "Azure": 20.03, "GPT-4": 18.65, "Google": 23.89, "Whisper-v3": 32.00},
-     "Gujarati": {"AudioX": 18.66, "ElevenLabs": 17.96, "Sarvam": 19.47, "IndicWhisper": 22.84, "Azure": 31.62, "GPT-4": 31.32, "Google": 36.48, "Whisper-v3": 53.75},
-     "Marathi": {"AudioX": 18.68, "ElevenLabs": 16.51, "Sarvam": 18.34, "IndicWhisper": 18.25, "Azure": 27.36, "GPT-4": 25.21, "Google": 26.48, "Whisper-v3": 78.28},
-     "Tamil": {"AudioX": 21.79, "ElevenLabs": 24.84, "Sarvam": 25.73, "IndicWhisper": 25.27, "Azure": 31.53, "GPT-4": 39.10, "Google": 33.62, "Whisper-v3": 52.44},
-     "Telugu": {"AudioX": 24.63, "ElevenLabs": 24.89, "Sarvam": 26.80, "IndicWhisper": 28.82, "Azure": 31.38, "GPT-4": 33.94, "Google": 42.42, "Whisper-v3": 179.58},
-     "Kannada": {"AudioX": 17.61, "ElevenLabs": 17.65, "Sarvam": 18.95, "IndicWhisper": 18.33, "Azure": 26.45, "GPT-4": 32.88, "Google": 31.48, "Whisper-v3": 67.02},
-     "Malayalam": {"AudioX": 26.92, "ElevenLabs": 28.88, "Sarvam": 32.64, "IndicWhisper": 32.34, "Azure": 41.84, "GPT-4": 46.11, "Google": 47.90, "Whisper-v3": 142.98}
- }
-
- class ASRModelManager:
-     def __init__(self):
-         self.loaded_models = {}
-         self.processors = {}
-         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-
-     def load_model(self, model_name: str) -> Tuple[object, object]:
-         """Load model and processor with error handling"""
-         if model_name in self.loaded_models:
-             return self.loaded_models[model_name], self.processors[model_name]
-
-         try:
-             config = MODELS_CONFIG[model_name]
-             repo_id = config["repo_id"]
-             model_type = config["type"]
-
-             if model_type == "conformer":
-                 # Load IndicConformer model
-                 processor = AutoProcessor.from_pretrained(repo_id)
-                 model = AutoModelForSpeechSeq2Seq.from_pretrained(
-                     repo_id,
-                     torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
-                     device_map="auto" if torch.cuda.is_available() else None
-                 )
-
-             elif model_type == "mms":
-                 # Load Facebook MMS model
-                 processor = Wav2Vec2Processor.from_pretrained(repo_id)
-                 model = Wav2Vec2ForCTC.from_pretrained(repo_id)
-                 model = model.to(self.device)
-
-             elif model_type == "audiox":
-                 # Placeholder for AudioX models - replace with actual implementation
-                 # For now, using a fallback model for demonstration
-                 processor = AutoProcessor.from_pretrained("openai/whisper-small")
-                 model = AutoModelForSpeechSeq2Seq.from_pretrained("openai/whisper-small")
-                 model = model.to(self.device)
-
-             self.loaded_models[model_name] = model
-             self.processors[model_name] = processor
-
-             return model, processor
-
-         except Exception as e:
-             raise Exception(f"Failed to load {model_name}: {str(e)}")
-
- def preprocess_audio(audio_path: str, target_sr: int = 16000) -> Tuple[np.ndarray, int]:
-     """Preprocess audio file for ASR inference"""
      try:
-         # Load and resample audio
-         audio, sr = librosa.load(audio_path, sr=target_sr)
-
-         # Normalize audio to prevent clipping
-         if np.max(np.abs(audio)) > 0:
-             audio = audio / np.max(np.abs(audio)) * 0.95
-
-         return audio, sr
-
      except Exception as e:
-         raise Exception(f"Audio preprocessing failed: {str(e)}")
 
- def calculate_wer_cer(reference: str, hypothesis: str) -> Tuple[float, float]:
-     """Calculate Word Error Rate and Character Error Rate"""
      try:
-         # Calculate WER using jiwer
-         wer = jiwer.wer(reference, hypothesis) * 100
-
-         # Calculate CER
-         cer = jiwer.cer(reference, hypothesis) * 100
-
-         return wer, cer
-
-     except Exception:
-         return 0.0, 0.0
 
- def transcribe_audio(
-     audio_file: str,
-     model_name: str,
-     reference_text: str = "",
-     language: str = "auto"
- ) -> Tuple[str, str, float, float, float]:
-     """Perform ASR transcription and calculate metrics"""
-
-     if audio_file is None:
-         return "❌ Please upload an audio file", "", 0.0, 0.0, 0.0
-
      try:
-         # Start timing for RTF calculation
-         start_time = time.time()
-
-         # Preprocess audio
-         audio, sr = preprocess_audio(audio_file)
          audio_duration = len(audio) / sr
 
-         # Load model and processor
-         model, processor = model_manager.load_model(model_name)
-
-         # Perform transcription based on model type
-         config = MODELS_CONFIG[model_name]
-
-         if config["type"] == "conformer":
-             # IndicConformer inference
-             inputs = processor(
-                 audio,
-                 sampling_rate=sr,
-                 return_tensors="pt",
-                 padding=True
-             )
-
-             if torch.cuda.is_available():
-                 inputs = {k: v.to("cuda") for k, v in inputs.items()}
-
-             with torch.no_grad():
-                 predicted_ids = model.generate(**inputs, max_length=448)
-
-             transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
-
-         elif config["type"] == "mms":
-             # Facebook MMS inference
-             inputs = processor(
-                 audio,
-                 sampling_rate=sr,
-                 return_tensors="pt",
-                 padding=True
-             )
-
-             if torch.cuda.is_available():
-                 inputs = {k: v.to("cuda") for k, v in inputs.items()}
-
-             with torch.no_grad():
-                 logits = model(**inputs).logits
-
-             predicted_ids = torch.argmax(logits, dim=-1)
-             transcription = processor.decode(predicted_ids[0])
-
-         elif config["type"] == "audiox":
-             # AudioX placeholder implementation
-             inputs = processor(
-                 audio,
-                 sampling_rate=sr,
-                 return_tensors="pt"
-             )
-
-             if torch.cuda.is_available():
-                 inputs = {k: v.to("cuda") for k, v in inputs.items()}
-
-             with torch.no_grad():
-                 predicted_ids = model.generate(**inputs, max_length=448)
-
-             transcription = processor.decode(predicted_ids[0], skip_special_tokens=True)
-
-         # Calculate processing time and RTF
-         end_time = time.time()
-         processing_time = end_time - start_time
-         rtf = processing_time / audio_duration
-
-         # Calculate WER and CER if reference provided
-         wer, cer = 0.0, 0.0
-         if reference_text.strip():
-             wer, cer = calculate_wer_cer(reference_text.strip(), transcription.strip())
-
-         # Format model info
-         model_info = f"""
- 🤖 Model: {model_name}
- 📊 Parameters: {config['params']}
- 🗣️ Languages: {config['languages']}
- ⚙️ Architecture: {config['architecture']}
- ⏱️ Processing Time: {processing_time:.2f}s
- 🎵 Audio Duration: {audio_duration:.2f}s
- """
-
-         return transcription.strip(), model_info, wer, cer, rtf
 
      except Exception as e:
-         return f"❌ Error: {str(e)}", "", 0.0, 0.0, 0.0
-
- def create_benchmark_table():
-     """Create the Vistaar benchmark comparison table"""
-     # Headers
-     headers = ["Language", "AudioX", "ElevenLabs", "Sarvam", "IndicWhisper", "Azure STT", "GPT-4", "Google STT", "Whisper-v3"]
-
-     # Data rows
-     rows = []
-     for lang, scores in VISTAAR_BENCHMARK.items():
-         row = [lang] + [f"{score:.2f}%" for score in scores.values()]
-         rows.append(row)
-
-     # Calculate and add average row
-     avg_row = ["🏆 Average"]
-     for provider in VISTAAR_BENCHMARK["Hindi"].keys():
-         avg_score = np.mean([VISTAAR_BENCHMARK[lang][provider] for lang in VISTAAR_BENCHMARK.keys()])
-         avg_row.append(f"{avg_score:.2f}%")
-     rows.append(avg_row)
-
-     return [headers] + rows
 
- def create_model_specs_table():
-     """Create model specifications comparison table"""
-     headers = ["Model", "Parameters", "Languages", "Architecture", "License", "Specialty"]
-
-     rows = [
-         ["IndicConformer-600M", "600M", "22 Indian", "Conformer CTC+RNNT", "MIT", "Comprehensive coverage"],
-         ["AudioX-North", "Unknown", "Hindi, Gujarati, Marathi", "Fine-tuned ASR", "Unknown", "North Indian optimization"],
-         ["AudioX-South", "Unknown", "Tamil, Telugu, Kannada, Malayalam", "Fine-tuned ASR", "Unknown", "South Indian optimization"],
-         ["Facebook MMS", "1B", "1400+ Global", "Wav2Vec2", "CC-BY-NC 4.0", "Massive multilingual"]
-     ]
-
-     return [headers] + rows
-
- # Initialize model manager
- model_manager = ASRModelManager()
-
- # Create Gradio interface
- with gr.Blocks(
-     title="🎯 ASR Model Comparison: IndicConformer vs AudioX vs MMS",
-     theme=gr.themes.Soft(),
-     css="""
-     .performance-card {
-         background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
-         padding: 1rem;
-         border-radius: 10px;
-         color: white;
-         margin: 0.5rem 0;
-     }
-     .metric-highlight {
-         background: #f0f9ff;
-         padding: 0.5rem;
-         border-left: 4px solid #3b82f6;
-         margin: 0.5rem 0;
-     }
-     """
- ) as demo:
-
-     gr.Markdown("""
-     # 🎯 Comprehensive ASR Model Comparison Dashboard
-
-     Compare three cutting-edge Automatic Speech Recognition models for Indian languages:
-
-     - 🇮🇳 **AI4Bharat IndicConformer-600M**: Complete 22 Indian language coverage
-     - 🎯 **Jivi AI AudioX**: Specialized North/South variants with industry-leading accuracy
-     - 🌍 **Facebook MMS**: Massive 1B parameter multilingual model
-
-     ## 🏆 Key Highlight: AudioX achieves **20.1% average WER** - Best in class performance!
-     """)
-
-     with gr.Tabs():
-
-         # Live Testing Tab
-         with gr.TabItem("🎤 Live ASR Testing"):
-             gr.Markdown("### Upload audio and test model performance in real-time")
-
-             with gr.Row():
-                 with gr.Column(scale=1):
-                     audio_input = gr.Audio(
-                         label="📁 Upload Audio File",
-                         type="filepath",
-                         format="wav"
-                     )
-
-                     model_selector = gr.Dropdown(
-                         choices=list(MODELS_CONFIG.keys()),
-                         label="🤖 Select ASR Model",
-                         value="IndicConformer-600M",
-                         info="Choose the model for transcription"
-                     )
-
-                     reference_input = gr.Textbox(
-                         label="📝 Reference Text (Optional)",
-                         placeholder="Enter the correct transcription for accuracy calculation...",
-                         lines=3,
-                         info="Provide ground truth text to calculate WER and CER"
-                     )
-
-                     transcribe_button = gr.Button(
-                         "🚀 Transcribe Audio",
-                         variant="primary",
-                         size="lg"
-                     )
-
-                 with gr.Column(scale=1):
-                     transcription_output = gr.Textbox(
-                         label="📄 Transcription Result",
-                         lines=5,
-                         max_lines=8
-                     )
-
-                     model_info_output = gr.Textbox(
-                         label="ℹ️ Model Information",
-                         lines=7
-                     )
-
-             with gr.Row():
-                 with gr.Column():
-                     wer_output = gr.Number(
-                         label="📊 Word Error Rate (WER %)",
-                         precision=2,
-                         info="Lower is better"
-                     )
-                 with gr.Column():
-                     cer_output = gr.Number(
-                         label="📊 Character Error Rate (CER %)",
-                         precision=2,
-                         info="Lower is better"
-                     )
-                 with gr.Column():
-                     rtf_output = gr.Number(
-                         label="⚡ Real-Time Factor (RTF)",
-                         precision=3,
-                         info="< 1.0 = faster than real-time"
-                     )
-
-         # Benchmark Results Tab
-         with gr.TabItem("📊 Vistaar Benchmark Results"):
-             gr.Markdown("""
-             ## 🏆 Official Vistaar Benchmark Comparison (WER %)
-
-             Performance evaluation on AI4Bharat's standardized Vistaar benchmark across 7 Indian languages.
-             **Lower WER indicates better accuracy** ⬇️
-             """)
-
-             benchmark_df = gr.Dataframe(
-                 value=create_benchmark_table(),
-                 label="📈 Word Error Rate Comparison",
-                 interactive=False,
-                 wrap=True
-             )
-
-             gr.Markdown("""
-             ### 🎯 Key Performance Insights:
-
-             | 🏅 Rank | Model | Avg WER | Strength |
-             |---------|-------|---------|----------|
-             | 🥇 1st | **AudioX** | **20.1%** | Consistently best across languages |
-             | 🥈 2nd | ElevenLabs Scribe-v1 | 20.6% | Strong competitor, especially in Gujarati |
-             | 🥉 3rd | Sarvam saarika:v2 | 22.3% | Solid performance across the board |
-             | 4th | AI4Bharat IndicWhisper | 22.8% | Good baseline for comparison |
-             | 5th | Microsoft Azure STT | 30.0% | Commercial solution performance |
-
-             ### 💡 Analysis:
-             - **AudioX dominates** in 5 out of 7 languages
-             - **Specialized models outperform** general commercial solutions
-             - **Malayalam and Telugu** are the most challenging languages across all models
-             - **Hindi** shows the best performance across all models
-             """)
-
-         # Model Architecture Tab
-         with gr.TabItem("⚙️ Model Architecture & Specs"):
-             gr.Markdown("## 🔧 Technical Specifications Comparison")
-
-             specs_df = gr.Dataframe(
-                 value=create_model_specs_table(),
-                 label="📋 Model Architecture Details",
-                 interactive=False
-             )
-
-             with gr.Row():
-                 with gr.Column():
-                     gr.Markdown("""
-                     ### 🎯 IndicConformer-600M
-
-                     **🏗️ Architecture**: Hybrid CTC + RNNT Conformer
-                     **🎯 Focus**: Comprehensive Indian language coverage
-                     **📊 Training**: Large-scale multilingual approach
-                     **⚡ Inference**: Dual decoding strategies
-                     **🎭 Use Cases**:
-                     - General-purpose Indian ASR
-                     - Research and development
-                     - Educational applications
-
-                     **✅ Strengths**:
-                     - Open-source MIT license
-                     - Covers all 22 official languages
-                     - Well-documented and accessible
-                     """)
-
-                 with gr.Column():
-                     gr.Markdown("""
-                     ### 🏆 AudioX Series
-
-                     **🏗️ Architecture**: Specialized fine-tuned models
-                     **🎯 Focus**: Language-specific optimization
-                     **📊 Training**: Open-source + proprietary medical data
-                     **⚡ Inference**: Optimized for production
-                     **🎭 Use Cases**:
-                     - Production voice assistants
-                     - Healthcare transcription
-                     - Customer service automation
-                     - Content creation platforms
-
-                     **✅ Strengths**:
-                     - Industry-leading accuracy
-                     - Regional accent handling
-                     - Robust to noise and variations
-                     """)
-
-                 with gr.Column():
-                     gr.Markdown("""
-                     ### 🌍 Facebook MMS
-
-                     **🏗️ Architecture**: Wav2Vec2 self-supervised
-                     **🎯 Focus**: Massive multilingual coverage
-                     **📊 Training**: 500K hours, 1400+ languages
-                     **⚡ Inference**: Requires task-specific fine-tuning
-                     **🎭 Use Cases**:
-                     - Research in multilingual ASR
-                     - Low-resource language support
-                     - Cross-lingual applications
-                     - Base model for fine-tuning
-
-                     **✅ Strengths**:
-                     - Unprecedented language coverage
-                     - Strong foundation model
-                     - Excellent for rare languages
-                     """)
-
-         # Performance Analysis Tab
-         with gr.TabItem("📈 Performance Deep Dive"):
-             gr.Markdown("""
-             # 🔍 Detailed Performance Analysis
-
-             ## 📊 Understanding ASR Metrics
-             """)
-
-             with gr.Row():
-                 with gr.Column():
-                     gr.Markdown("""
-                     ### 📉 Word Error Rate (WER)
-
-                     **Formula**: `(S + D + I) / N × 100%`
-                     - **S**: Substitutions
-                     - **D**: Deletions
-                     - **I**: Insertions
-                     - **N**: Total words in reference
-
-                     **Interpretation**:
-                     - **< 5%**: Excellent
-                     - **5-15%**: Good
-                     - **15-30%**: Fair
-                     - **> 30%**: Poor
-                     """)
-
-                 with gr.Column():
-                     gr.Markdown("""
-                     ### 🔤 Character Error Rate (CER)
-
-                     **Formula**: Same as WER but at character level
-
-                     **Why CER matters**:
-                     - Better for morphologically rich languages
-                     - Captures partial word recognition
-                     - Useful for downstream NLP tasks
-                     - More granular error analysis
-
-                     **Typical Range**: Usually lower than WER
-                     """)
-
-                 with gr.Column():
-                     gr.Markdown("""
-                     ### ⚡ Real-Time Factor (RTF)
-
-                     **Formula**: `Processing Time / Audio Duration`
-
-                     **Interpretation**:
-                     - **RTF < 1.0**: ⚡ Faster than real-time
-                     - **RTF = 1.0**: 🎯 Real-time processing
-                     - **RTF > 1.0**: 🐌 Slower than real-time
-
-                     **Production Requirements**:
-                     - Live applications: RTF < 0.3
-                     - Batch processing: RTF < 1.0 acceptable
-                     """)
-
-             gr.Markdown("""
-             ## 🏆 Language-Specific Performance Champions
-
-             | Language | 🥇 Best Model | WER Score | 🎯 Insights |
-             |----------|-------------|-----------|-----------|
-             | **Hindi** | AudioX | 12.14% | Strongest performance, most data available |
-             | **Gujarati** | ElevenLabs | 17.96% | Close race with AudioX (18.66%) |
-             | **Marathi** | ElevenLabs | 16.51% | Competitive performance across models |
-             | **Tamil** | AudioX | 21.79% | Dravidian language complexity handled well |
-             | **Telugu** | AudioX | 24.63% | Challenging agglutinative morphology |
-             | **Kannada** | AudioX | 17.61% | Consistent South Indian performance |
-             | **Malayalam** | AudioX | 26.92% | Most challenging across all models |
-
-             ### 🔍 Key Observations:
-
-             1. **AudioX Dominance**: Wins in 5 out of 7 languages
-             2. **Language Difficulty**: Malayalam > Telugu > Tamil (Dravidian complexity)
-             3. **Commercial Gap**: 10-15% WER difference vs specialized models
-             4. **Regional Patterns**: North Indian languages generally perform better
-             5. **Model Specialization**: Purpose-built models significantly outperform generic ones
-             """)
-
-         # Usage Guidelines Tab
-         with gr.TabItem("📖 Usage Guidelines"):
-             gr.Markdown("""
-             # 🚀 Model Selection Guide
-
-             ## 🎯 Which Model Should You Choose?
-             """)
-
-             with gr.Row():
-                 with gr.Column():
-                     gr.Markdown("""
-                     ### 🏆 Choose AudioX When:
-
-                     ✅ **Production Applications**
-                     ✅ **Highest Accuracy Requirements**
-                     ✅ **North/South Indian Languages**
-                     ✅ **Real-time Processing**
-                     ✅ **Commercial Deployment**
-                     ✅ **Healthcare/Medical Domain**
-
-                     **Best For**: Voice assistants, transcription services, customer support
-                     """)
-
-                 with gr.Column():
-                     gr.Markdown("""
-                     ### 🎓 Choose IndicConformer When:
-
-                     ✅ **Research & Development**
-                     ✅ **Open Source Requirements**
-                     ✅ **All 22 Indian Languages**
-                     ✅ **Educational Projects**
-                     ✅ **Custom Fine-tuning**
-                     ✅ **Experimental Work**
-
-                     **Best For**: Academic research, prototyping, learning
-                     """)
-
-                 with gr.Column():
-                     gr.Markdown("""
-                     ### 🌍 Choose Facebook MMS When:
-
-                     ✅ **Rare/Low-resource Languages**
-                     ✅ **Multilingual Applications**
-                     ✅ **Transfer Learning Base**
-                     ✅ **Research in Multilingual ASR**
-                     ✅ **Cross-lingual Studies**
-                     ✅ **Foundation Model Needs**
-
-                     **Best For**: Research, rare languages, base model
-                     """)
-
-             gr.Markdown("""
-             ## 🛠️ Implementation Tips
-
-             ### 📋 Pre-processing Recommendations:
-             - **Sample Rate**: Ensure 16kHz for all models
-             - **Audio Format**: WAV preferred over compressed formats
-             - **Noise Reduction**: Apply basic denoising for better results
-             - **Normalization**: Audio amplitude normalization recommended
-
-             ### ⚡ Performance Optimization:
-             - **GPU Usage**: Significant speedup with CUDA-enabled devices
-             - **Batch Processing**: Process multiple files together when possible
-             - **Model Caching**: Keep models loaded in memory for repeated use
-             - **Quantization**: Consider model quantization for deployment
-
-             ### 🎯 Accuracy Improvement:
-             - **Domain Adaptation**: Fine-tune on domain-specific data when possible
-             - **Language Models**: Integrate external LMs for better word-level accuracy
-             - **Post-processing**: Apply spelling correction and grammar checking
-             - **Ensemble Methods**: Combine multiple models for critical applications
-             """)
-
-     # Event handlers
-     transcribe_button.click(
          fn=transcribe_audio,
-         inputs=[audio_input, model_selector, reference_input],
-         outputs=[transcription_output, model_info_output, wer_output, cer_output, rtf_output],
-         show_progress=True
      )
 
- # Launch configuration
  if __name__ == "__main__":
-     demo.launch(
-         share=True,
-         server_name="0.0.0.0",
-         server_port=7860,
-         show_error=True
-     )
 
  import gradio as gr
  import torch
  import torchaudio
+ from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, AutoModelForCTC
  import librosa
+ import numpy as np
+ from jiwer import wer, cer
  import time
 
  # Model configurations
+ MODEL_CONFIGS = {
+     "AudioX-North (Jivi AI)": {
+         "repo": "jiviai/audioX-north-v1",
+         "model_type": "seq2seq",
+         "description": "Supports Hindi, Gujarati, Marathi"
      },
+     "IndicConformer (AI4Bharat)": {
+         "repo": "ai4bharat/indic-conformer-600m-multilingual",
+         "model_type": "ctc_rnnt",
+         "description": "Supports 22 Indian languages"
      },
+     "MMS (Facebook)": {
+         "repo": "facebook/mms-1b",
+         "model_type": "ctc",
+         "description": "Supports over 1,400 languages (fine-tuning recommended)"
      }
  }
 
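+ # Note: facebook/mms-1b is the base pretrained checkpoint; for out-of-the-box
+ # transcription, a fine-tuned variant such as facebook/mms-1b-all (the repo used
+ # by the previous version of this app) is likely the safer choice.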
+ # Load model and processor
+ def load_model_and_processor(model_name):
+     config = MODEL_CONFIGS[model_name]
+     repo = config["repo"]
+     model_type = config["model_type"]
+
      try:
+         processor = AutoProcessor.from_pretrained(repo)
+         if model_type == "seq2seq":
+             model = AutoModelForSpeechSeq2Seq.from_pretrained(repo)
+         else:  # ctc or ctc_rnnt
+             model = AutoModelForCTC.from_pretrained(repo)
+         return model, processor, model_type
      except Exception as e:
+         return None, None, f"Error loading model: {str(e)}"
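+ # Caveat (assumption, not verified here): ai4bharat/indic-conformer-600m-multilingual
+ # ships a custom architecture and may require loading with trust_remote_code=True
+ # rather than the plain AutoModelForCTC path above.
+ # Usage sketch: load errors surface through the third return slot, e.g.
+ #     model, processor, model_type = load_model_and_processor("MMS (Facebook)")
+ #     if model is None: print(model_type)  # model_type then holds the error message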
 
+ # Compute metrics (WER and CER); RTF is measured by the caller from processing time
+ def compute_metrics(reference, hypothesis, audio_duration):
+     if not reference or not hypothesis:
+         return None, None, None
      try:
+         wer_score = wer(reference, hypothesis)
+         cer_score = cer(reference, hypothesis)
+         return wer_score, cer_score, None  # third slot carries an error message on failure
+     except Exception as e:
+         return None, None, f"Error computing metrics: {str(e)}"
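+ # Worked example (hypothetical strings): wer("hello world", "hello word") == 0.5,
+ # i.e. one substituted word out of two reference words. cer applies the same
+ # edit-distance ratio at the character level (here 1/11), so it is usually lower.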
 
+ def transcribe_audio(audio_file, model_name, reference_text=""):
+     if not audio_file:
+         return "Please upload an audio file.", None, None, None
 
+     # Load model and processor
+     model, processor, model_type = load_model_and_processor(model_name)
+     if isinstance(model_type, str) and model_type.startswith("Error"):
+         return model_type, None, None, None
 
      try:
+         # Load and preprocess audio
+         audio, sr = librosa.load(audio_file, sr=16000)
          audio_duration = len(audio) / sr
 
+         # Process audio (Whisper-style processors return "input_features";
+         # CTC processors such as Wav2Vec2 return "input_values")
+         inputs = processor(audio, sampling_rate=16000, return_tensors="pt")
+         input_features = inputs.get("input_features", inputs.get("input_values"))
 
+         # Measure processing time for RTF
+         start_time = time.time()
+         with torch.no_grad():
+             if model_type == "seq2seq":
+                 outputs = model.generate(input_features)
+             else:  # ctc or ctc_rnnt
+                 outputs = model(input_features).logits
+                 outputs = torch.argmax(outputs, dim=-1)
+
+         # Decode transcription
+         transcription = processor.batch_decode(outputs, skip_special_tokens=True)[0]
+
+         # RTF does not depend on the reference text, so compute it unconditionally
+         rtf = (time.time() - start_time) / audio_duration
+
+         # Compute accuracy metrics only if reference text is provided
+         wer_score, cer_score = None, None
+         if reference_text:
+             wer_score, cer_score, metric_error = compute_metrics(reference_text, transcription, audio_duration)
+             if isinstance(metric_error, str):
+                 return transcription, wer_score, cer_score, metric_error
+
+         return transcription, wer_score, cer_score, rtf
      except Exception as e:
+         return f"Error during transcription: {str(e)}", None, None, None
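+ # RTF example: 2 s of processing for an 8 s clip gives RTF = 0.25;
+ # RTF < 1.0 means faster than real-time.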
 
+ # Gradio interface
+ def create_interface():
+     model_choices = list(MODEL_CONFIGS.keys())
+     return gr.Interface(
          fn=transcribe_audio,
+         inputs=[
+             gr.Audio(type="filepath", label="Upload Audio File (16kHz recommended)"),
+             gr.Dropdown(choices=model_choices, label="Select Model", value=model_choices[0]),
+             gr.Textbox(label="Reference Text (Optional for WER/CER)", placeholder="Enter ground truth text here")
+         ],
+         outputs=[
+             gr.Textbox(label="Transcription"),
+             gr.Textbox(label="WER"),
+             gr.Textbox(label="CER"),
+             gr.Textbox(label="RTF")
+         ],
+         title="Multilingual Speech-to-Text with Metrics",
+         description="Upload an audio file, select a model, and optionally provide reference text to compute WER, CER, and RTF.",
+         allow_flagging="never"
      )
 
  if __name__ == "__main__":
+     iface = create_interface()
+     iface.launch()
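+     # Note: the previous version launched with share=True, server_name="0.0.0.0",
+     # server_port=7860; launch() without arguments falls back to Gradio defaults
+     # (a local-only server on port 7860).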