Indic_ASR_Comparison_Multi

Running

App Files Files Community

AvtnshM commited on Sep 2

Commit

eaef30e

verified ·

1 Parent(s): 71fe323

V22

Browse files

Files changed (1) hide show

app.py +202 -89

app.py CHANGED Viewed

@@ -6,29 +6,74 @@ from transformers import (
     AutoProcessor,
     AutoModelForCTC,
     AutoModel,
 )
 import librosa
 import numpy as np
 from jiwer import wer, cer
 import time
 # Model configurations
 MODEL_CONFIGS = {
-    "AudioX-North (Jivi AI)": {
         "repo": "jiviai/audioX-north-v1",
-        "model_type": "seq2seq",
         "description": "Supports Hindi, Gujarati, Marathi",
     },
-    "IndicConformer (AI4Bharat)": {
         "repo": "ai4bharat/indic-conformer-600m-multilingual",
         "model_type": "ctc_rnnt",
         "description": "Supports 22 Indian languages",
         "trust_remote_code": True,
     },
-    "MMS (Facebook)": {
         "repo": "facebook/mms-1b-all",
-        "model_type": "ctc",
-        "description": "Supports over 1,400 languages (fine-tuning recommended)",
     },
 }
@@ -40,8 +85,7 @@ def load_model_and_processor(model_name):
     trust_remote_code = config.get("trust_remote_code", False)
     try:
-        if model_name == "IndicConformer (AI4Bharat)":
-            # Use the working method for AI4Bharat model
             print(f"Loading {model_name}...")
             try:
                 model = AutoModel.from_pretrained(
@@ -53,21 +97,21 @@ def load_model_and_processor(model_name):
             except Exception as e1:
                 print(f"Primary loading failed, trying fallback: {e1}")
                 model = AutoModel.from_pretrained(repo, trust_remote_code=True)
-            # AI4Bharat doesn't use a traditional processor
             processor = None
             return model, processor, model_type
-        elif model_name == "MMS (Facebook)":
             model = AutoModelForCTC.from_pretrained(repo)
             processor = AutoProcessor.from_pretrained(repo)
-        else:  # AudioX-North
-            processor = AutoProcessor.from_pretrained(repo, trust_remote_code=trust_remote_code)
-            if model_type == "seq2seq":
-                model = AutoModelForSpeechSeq2Seq.from_pretrained(repo, trust_remote_code=trust_remote_code)
-            else:
-                model = AutoModelForCTC.from_pretrained(repo, trust_remote_code=trust_remote_code)
-        return model, processor, model_type
     except Exception as e:
         return None, None, f"Error loading model: {str(e)}"
@@ -86,13 +130,20 @@ def compute_metrics(reference, hypothesis, audio_duration, total_time):
         return None, None, None, None
 # Main transcription function
-def transcribe_audio(audio_file, selected_models, reference_text=""):
     if not audio_file:
         return "Please upload an audio file.", [], ""
     if not selected_models:
         return "Please select at least one model.", [], ""
     table_data = []
     try:
         # Load and preprocess audio once
@@ -100,48 +151,63 @@ def transcribe_audio(audio_file, selected_models, reference_text=""):
         audio_duration = len(audio) / sr
         for model_name in selected_models:
             model, processor, model_type = load_model_and_processor(model_name)
             if isinstance(model_type, str) and model_type.startswith("Error"):
                 table_data.append([
                     model_name,
                     f"Error: {model_type}",
-                    "-",
-                    "-",
-                    "-",
-                    "-"
                 ])
                 continue
             start_time = time.time()
-            # Handle different model types
             try:
-                if model_name == "IndicConformer (AI4Bharat)":
-                    # Use AI4Bharat specific processing
-                    wav = torch.from_numpy(audio).unsqueeze(0)  # Add batch dimension
                     if torch.max(torch.abs(wav)) > 0:
-                        wav = wav / torch.max(torch.abs(wav))  # Normalize
                     with torch.no_grad():
-                        # Default to Hindi and RNNT for AI4Bharat
-                        transcription = model(wav, "hi", "rnnt")
                         if isinstance(transcription, list):
                             transcription = transcription[0] if transcription else ""
                         transcription = str(transcription).strip()
-                else:
-                    # Standard processing for other models
                     inputs = processor(audio, sampling_rate=16000, return_tensors="pt")
                     with torch.no_grad():
-                        if model_type == "seq2seq":
-                            input_features = inputs["input_features"]
-                            outputs = model.generate(input_features)
-                            transcription = processor.batch_decode(outputs, skip_special_tokens=True)[0]
-                        else:  # CTC or RNNT
-                            input_values = inputs["input_values"]
-                            logits = model(input_values).logits
-                            predicted_ids = torch.argmax(logits, dim=-1)
-                            transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
             except Exception as e:
                 transcription = f"Processing error: {str(e)}"
@@ -169,18 +235,21 @@ def transcribe_audio(audio_file, selected_models, reference_text=""):
             ])
         # Create summary text
-        summary = f"**Audio Duration:** {audio_duration:.2f}s\n"
         summary += f"**Models Tested:** {len(selected_models)}\n"
         if reference_text:
             summary += f"**Reference Text:** {reference_text[:100]}{'...' if len(reference_text) > 100 else ''}\n"
         # Create copyable text output
-        copyable_text = "SPEECH-TO-TEXT BENCHMARK RESULTS\n" + "="*50 + "\n\n"
         copyable_text += f"Audio Duration: {audio_duration:.2f}s\n"
         copyable_text += f"Models Tested: {len(selected_models)}\n"
         if reference_text:
             copyable_text += f"Reference Text: {reference_text}\n"
-        copyable_text += "\n" + "-"*50 + "\n\n"
         for i, row in enumerate(table_data):
             copyable_text += f"MODEL {i+1}: {row[0]}\n"
@@ -189,105 +258,149 @@ def transcribe_audio(audio_file, selected_models, reference_text=""):
             copyable_text += f"CER: {row[3]}\n"
             copyable_text += f"RTF: {row[4]}\n"
             copyable_text += f"Time Taken: {row[5]}\n"
-            copyable_text += "\n" + "-"*30 + "\n\n"
         return summary, table_data, copyable_text
     except Exception as e:
         error_msg = f"Error during transcription: {str(e)}"
         return error_msg, [], error_msg
-# Create Gradio interface with blocks for better control
 def create_interface():
-    model_choices = list(MODEL_CONFIGS.keys())
     with gr.Blocks(title="Multilingual Speech-to-Text Benchmark", css="""
-        .paste-button { margin: 5px 0; }
         .copy-area { font-family: monospace; font-size: 12px; }
     """) as iface:
         gr.Markdown("""
-        # Multilingual Speech-to-Text Benchmark
-        Upload an audio file, select one or more models, and optionally provide reference text.
-        The app benchmarks WER, CER, RTF, and Time Taken for each model.
         """)
         with gr.Row():
             with gr.Column(scale=1):
                 audio_input = gr.Audio(
-                    label="Upload Audio File (16kHz recommended)",
                     type="filepath"
                 )
                 model_selection = gr.CheckboxGroup(
-                    choices=model_choices,
-                    label="Select Models",
-                    value=[model_choices[0]],  # Default to first model
                     interactive=True
                 )
-                # Enhanced reference text input with paste functionality
-                with gr.Group():
-                    gr.Markdown("### Reference Text (Optional for WER/CER)")
-                    reference_input = gr.Textbox(
-                        label="Reference Text (optional, paste supported)",
-                        placeholder="Paste reference transcription here...",
-                        lines=4,
-                        interactive=True
-                    )
-                submit_btn = gr.Button("🚀 Transcribe", variant="primary", size="lg")
             with gr.Column(scale=2):
-                summary_output = gr.Markdown(label="Summary", value="Upload an audio file and select models to begin...")
                 results_table = gr.Dataframe(
-                    headers=["Model", "Transcription", "WER", "CER", "RTF", "Time Taken"],
                     datatype=["str", "str", "str", "str", "str", "str"],
-                    label="Results Comparison",
                     interactive=False,
                     wrap=True,
-                    column_widths=[150, 400, 80, 80, 80, 100]
                 )
                 # Copyable results section
                 with gr.Group():
-                    gr.Markdown("### 📋 Copy Results")
                     copyable_output = gr.Textbox(
                         label="Copy-Paste Friendly Results",
-                        lines=15,
-                        max_lines=30,
                         show_copy_button=True,
                         interactive=False,
                         elem_classes="copy-area",
-                        placeholder="Results will appear here in copy-paste friendly format..."
                     )
-        # Connect the function
         submit_btn.click(
             fn=transcribe_audio,
-            inputs=[audio_input, model_selection, reference_input],
             outputs=[summary_output, results_table, copyable_output]
         )
-        # Also allow triggering on Enter in reference text
         reference_input.submit(
             fn=transcribe_audio,
-            inputs=[audio_input, model_selection, reference_input],
             outputs=[summary_output, results_table, copyable_output]
         )
-        # Add example and instructions
         gr.Markdown("""
         ---
         ### 💡 Tips:
-        - **Reference Text**: Paste your ground truth text to calculate WER/CER metrics
-        - **Copy Results**: Use the copy button in the results section to copy formatted results
-        - **AI4Bharat Model**: Automatically uses Hindi language with RNNT decoding
-        - **Supported Formats**: WAV, MP3, FLAC, M4A (16kHz recommended for best results)
         """)
     return iface

     AutoProcessor,
     AutoModelForCTC,
     AutoModel,
+    WhisperProcessor,
+    WhisperForConditionalGeneration,
 )
 import librosa
 import numpy as np
 from jiwer import wer, cer
 import time
+# Language configurations
+LANGUAGE_CONFIGS = {
+    "Hindi (हिंदी)": {
+        "code": "hi",
+        "script": "Devanagari",
+        "models": ["AudioX-North", "IndicConformer", "MMS"]
+    },
+    "Gujarati (ગુજરાતી)": {
+        "code": "gu",
+        "script": "Gujarati",
+        "models": ["AudioX-North", "IndicConformer", "MMS"]
+    },
+    "Marathi (मराठी)": {
+        "code": "mr",
+        "script": "Devanagari",
+        "models": ["AudioX-North", "IndicConformer", "MMS"]
+    },
+    "Tamil (தமிழ்)": {
+        "code": "ta",
+        "script": "Tamil",
+        "models": ["AudioX-South", "IndicConformer", "MMS"]
+    },
+    "Telugu (తెలుగు)": {
+        "code": "te",
+        "script": "Telugu",
+        "models": ["AudioX-South", "IndicConformer", "MMS"]
+    },
+    "Kannada (ಕನ್ನಡ)": {
+        "code": "kn",
+        "script": "Kannada",
+        "models": ["AudioX-South", "IndicConformer", "MMS"]
+    }
+}
 # Model configurations
 MODEL_CONFIGS = {
+    "AudioX-North": {
         "repo": "jiviai/audioX-north-v1",
+        "model_type": "whisper",
         "description": "Supports Hindi, Gujarati, Marathi",
+        "languages": ["hi", "gu", "mr"]
     },
+    "AudioX-South": {
+        "repo": "jiviai/audioX-south-v1",
+        "model_type": "whisper",
+        "description": "Supports Tamil, Telugu, Kannada, Malayalam",
+        "languages": ["ta", "te", "kn", "ml"]
+    },
+    "IndicConformer": {
         "repo": "ai4bharat/indic-conformer-600m-multilingual",
         "model_type": "ctc_rnnt",
         "description": "Supports 22 Indian languages",
         "trust_remote_code": True,
+        "languages": ["hi", "gu", "mr", "ta", "te", "kn", "ml", "bn", "pa", "or", "as", "ur"]
     },
+    "MMS": {
         "repo": "facebook/mms-1b-all",
+        "model_type": "ctc",
+        "description": "Supports 1,400+ languages",
+        "languages": ["hi", "gu", "mr", "ta", "te", "kn", "ml"]
     },
 }
     trust_remote_code = config.get("trust_remote_code", False)
     try:
+        if model_name == "IndicConformer":
             print(f"Loading {model_name}...")
             try:
                 model = AutoModel.from_pretrained(
             except Exception as e1:
                 print(f"Primary loading failed, trying fallback: {e1}")
                 model = AutoModel.from_pretrained(repo, trust_remote_code=True)
             processor = None
             return model, processor, model_type
+        elif model_name in ["AudioX-North", "AudioX-South"]:
+            # Use Whisper processor and model for AudioX variants
+            processor = WhisperProcessor.from_pretrained(repo)
+            model = WhisperForConditionalGeneration.from_pretrained(repo)
+            model.config.forced_decoder_ids = None
+            return model, processor, model_type
+        elif model_name == "MMS":
             model = AutoModelForCTC.from_pretrained(repo)
             processor = AutoProcessor.from_pretrained(repo)
+            return model, processor, model_type
     except Exception as e:
         return None, None, f"Error loading model: {str(e)}"
         return None, None, None, None
 # Main transcription function
+def transcribe_audio(audio_file, selected_language, selected_models, reference_text=""):
     if not audio_file:
         return "Please upload an audio file.", [], ""
     if not selected_models:
         return "Please select at least one model.", [], ""
+    if not selected_language:
+        return "Please select a language.", [], ""
+    # Get language info
+    lang_info = LANGUAGE_CONFIGS[selected_language]
+    lang_code = lang_info["code"]
     table_data = []
     try:
         # Load and preprocess audio once
         audio_duration = len(audio) / sr
         for model_name in selected_models:
+            # Check if model supports the selected language
+            if model_name.replace("AudioX-", "AudioX-") not in lang_info["models"]:
+                table_data.append([
+                    model_name,
+                    f"Language {selected_language} not supported by this model",
+                    "-", "-", "-", "-"
+                ])
+                continue
             model, processor, model_type = load_model_and_processor(model_name)
             if isinstance(model_type, str) and model_type.startswith("Error"):
                 table_data.append([
                     model_name,
                     f"Error: {model_type}",
+                    "-", "-", "-", "-"
                 ])
                 continue
             start_time = time.time()
             try:
+                if model_name == "IndicConformer":
+                    # AI4Bharat specific processing
+                    wav = torch.from_numpy(audio).unsqueeze(0)
                     if torch.max(torch.abs(wav)) > 0:
+                        wav = wav / torch.max(torch.abs(wav))
                     with torch.no_grad():
+                        transcription = model(wav, lang_code, "rnnt")
                         if isinstance(transcription, list):
                             transcription = transcription[0] if transcription else ""
                         transcription = str(transcription).strip()
+                elif model_name in ["AudioX-North", "AudioX-South"]:
+                    # AudioX Whisper-based processing
+                    if sr != 16000:
+                        audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)
+                    input_features = processor(audio, sampling_rate=16000, return_tensors="pt").input_features
+                    with torch.no_grad():
+                        predicted_ids = model.generate(
+                            input_features,
+                            task="transcribe",
+                            language=lang_code
+                        )
+                        transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
+                else:  # MMS
+                    # Standard CTC processing for MMS
                     inputs = processor(audio, sampling_rate=16000, return_tensors="pt")
                     with torch.no_grad():
+                        input_values = inputs["input_values"]
+                        logits = model(input_values).logits
+                        predicted_ids = torch.argmax(logits, dim=-1)
+                        transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
             except Exception as e:
                 transcription = f"Processing error: {str(e)}"
             ])
         # Create summary text
+        summary = f"**Language:** {selected_language} ({lang_code})\n"
+        summary += f"**Audio Duration:** {audio_duration:.2f}s\n"
         summary += f"**Models Tested:** {len(selected_models)}\n"
         if reference_text:
             summary += f"**Reference Text:** {reference_text[:100]}{'...' if len(reference_text) > 100 else ''}\n"
         # Create copyable text output
+        copyable_text = "MULTILINGUAL SPEECH-TO-TEXT BENCHMARK RESULTS\n" + "="*55 + "\n\n"
+        copyable_text += f"Language: {selected_language} ({lang_code})\n"
+        copyable_text += f"Script: {lang_info['script']}\n"
         copyable_text += f"Audio Duration: {audio_duration:.2f}s\n"
         copyable_text += f"Models Tested: {len(selected_models)}\n"
         if reference_text:
             copyable_text += f"Reference Text: {reference_text}\n"
+        copyable_text += "\n" + "-"*55 + "\n\n"
         for i, row in enumerate(table_data):
             copyable_text += f"MODEL {i+1}: {row[0]}\n"
             copyable_text += f"CER: {row[3]}\n"
             copyable_text += f"RTF: {row[4]}\n"
             copyable_text += f"Time Taken: {row[5]}\n"
+            copyable_text += "\n" + "-"*35 + "\n\n"
         return summary, table_data, copyable_text
     except Exception as e:
         error_msg = f"Error during transcription: {str(e)}"
         return error_msg, [], error_msg
+# Create Gradio interface
 def create_interface():
+    language_choices = list(LANGUAGE_CONFIGS.keys())
     with gr.Blocks(title="Multilingual Speech-to-Text Benchmark", css="""
+        .language-info { background: #f0f8ff; padding: 10px; border-radius: 5px; margin: 10px 0; }
         .copy-area { font-family: monospace; font-size: 12px; }
     """) as iface:
         gr.Markdown("""
+        # 🌏 Multilingual Speech-to-Text Benchmark
+        Compare ASR models across **6 Indian Languages** with comprehensive metrics.
+        **Supported Languages:** Hindi, Gujarati, Marathi, Tamil, Telugu, Kannada
         """)
         with gr.Row():
             with gr.Column(scale=1):
+                # Language selection
+                language_selection = gr.Dropdown(
+                    choices=language_choices,
+                    label="🗣️ Select Language",
+                    value=language_choices[0],
+                    interactive=True
+                )
                 audio_input = gr.Audio(
+                    label="📹 Upload Audio File (16kHz recommended)",
                     type="filepath"
                 )
+                # Dynamic model selection based on language
                 model_selection = gr.CheckboxGroup(
+                    choices=["AudioX-North", "IndicConformer", "MMS"],
+                    label="🤖 Select Models",
+                    value=["AudioX-North", "IndicConformer"],
                     interactive=True
                 )
+                reference_input = gr.Textbox(
+                    label="📝 Reference Text (optional, paste supported)",
+                    placeholder="Paste reference transcription here...",
+                    lines=4,
+                    interactive=True
+                )
+                submit_btn = gr.Button("🚀 Run Multilingual Benchmark", variant="primary", size="lg")
             with gr.Column(scale=2):
+                summary_output = gr.Markdown(
+                    label="📊 Summary",
+                    value="Select language, upload audio file and choose models to begin..."
+                )
                 results_table = gr.Dataframe(
+                    headers=["Model", "Transcription", "WER", "CER", "RTF", "Time"],
                     datatype=["str", "str", "str", "str", "str", "str"],
+                    label="🏆 Results Comparison",
                     interactive=False,
                     wrap=True,
+                    column_widths=[120, 350, 60, 60, 60, 80]
                 )
                 # Copyable results section
                 with gr.Group():
+                    gr.Markdown("### 📋 Export Results")
                     copyable_output = gr.Textbox(
                         label="Copy-Paste Friendly Results",
+                        lines=12,
+                        max_lines=25,
                         show_copy_button=True,
                         interactive=False,
                         elem_classes="copy-area",
+                        placeholder="Benchmark results will appear here..."
                     )
+        # Update model choices based on language selection
+        def update_model_choices(selected_language):
+            if not selected_language:
+                return gr.CheckboxGroup(choices=[], value=[])
+            lang_info = LANGUAGE_CONFIGS[selected_language]
+            available_models = lang_info["models"]
+            # Map display names
+            model_map = {
+                "AudioX-North": "AudioX-North",
+                "AudioX-South": "AudioX-South",
+                "IndicConformer": "IndicConformer",
+                "MMS": "MMS"
+            }
+            available_choices = [model_map[model] for model in available_models if model in model_map]
+            default_selection = available_choices[:2] if len(available_choices) >= 2 else available_choices
+            return gr.CheckboxGroup(choices=available_choices, value=default_selection)
+        # Connect language selection to model updates
+        language_selection.change(
+            fn=update_model_choices,
+            inputs=[language_selection],
+            outputs=[model_selection]
+        )
+        # Connect the main function
         submit_btn.click(
             fn=transcribe_audio,
+            inputs=[audio_input, language_selection, model_selection, reference_input],
             outputs=[summary_output, results_table, copyable_output]
         )
         reference_input.submit(
             fn=transcribe_audio,
+            inputs=[audio_input, language_selection, model_selection, reference_input],
             outputs=[summary_output, results_table, copyable_output]
         )
+        # Language information display
         gr.Markdown("""
         ---
+        ### 🔤 Language & Model Support Matrix
+        | Language | Script | AudioX-North | AudioX-South | IndicConformer | MMS |
+        |----------|---------|-------------|-------------|---------------|-----|
+        | Hindi | Devanagari | ✅ | ❌ | ✅ | ✅ |
+        | Gujarati | Gujarati | ✅ | ❌ | ✅ | ✅ |
+        | Marathi | Devanagari | ✅ | ❌ | ✅ | ✅ |
+        | Tamil | Tamil | ❌ | ✅ | ✅ | ✅ |
+        | Telugu | Telugu | ❌ | ✅ | ✅ | ✅ |
+        | Kannada | Kannada | ❌ | ✅ | ✅ | ✅ |
         ### 💡 Tips:
+        - **Models auto-filter** based on selected language
+        - **Reference Text**: Enable WER/CER calculation by providing ground truth
+        - **Copy Results**: Export formatted results using the copy button
+        - **Best Performance**: Use AudioX models for their specialized languages
         """)
     return iface