import time

import gradio as gr
import librosa
import torch
from jiwer import cer, wer
from transformers import (
    AutoModel,
    AutoModelForCTC,
    AutoProcessor,
    WhisperForConditionalGeneration,
    WhisperProcessor,
)

# Language configurations: display name -> ISO code, script, supported models
LANGUAGE_CONFIGS = {
    "Hindi (हिंदी)": {
        "code": "hi",
        "script": "Devanagari",
        "models": ["AudioX-North", "IndicConformer", "MMS"],
    },
    "Gujarati (ગુજરાતી)": {
        "code": "gu",
        "script": "Gujarati",
        "models": ["AudioX-North", "IndicConformer", "MMS"],
    },
    "Marathi (मराठी)": {
        "code": "mr",
        "script": "Devanagari",
        "models": ["AudioX-North", "IndicConformer", "MMS"],
    },
    "Tamil (தமிழ்)": {
        "code": "ta",
        "script": "Tamil",
        "models": ["AudioX-South", "IndicConformer", "MMS"],
    },
    "Telugu (తెలుగు)": {
        "code": "te",
        "script": "Telugu",
        "models": ["AudioX-South", "IndicConformer", "MMS"],
    },
    "Kannada (ಕನ್ನಡ)": {
        "code": "kn",
        "script": "Kannada",
        "models": ["AudioX-South", "IndicConformer", "MMS"],
    },
    "Malayalam (മലയാളം)": {
        "code": "ml",
        "script": "Malayalam",
        "models": ["AudioX-South", "IndicConformer", "MMS"],
    },
}

# Model configurations
MODEL_CONFIGS = {
    "AudioX-North": {
        "repo": "jiviai/audioX-north-v1",
        "model_type": "whisper",
        "description": "Supports Hindi, Gujarati, Marathi",
        "languages": ["hi", "gu", "mr"],
    },
    "AudioX-South": {
        "repo": "jiviai/audioX-south-v1",
        "model_type": "whisper",
        "description": "Supports Tamil, Telugu, Kannada, Malayalam",
        "languages": ["ta", "te", "kn", "ml"],
    },
    "IndicConformer": {
        "repo": "ai4bharat/indic-conformer-600m-multilingual",
        "model_type": "ctc_rnnt",
        "description": "Supports 22 Indian languages",
        "trust_remote_code": True,
        "languages": ["hi", "gu", "mr", "ta", "te", "kn", "ml", "bn", "pa", "or", "as", "ur"],
    },
    "MMS": {
        "repo": "facebook/mms-1b-all",
        "model_type": "ctc",
        "description": "Supports 1,400+ languages",
        "languages": ["hi", "gu", "mr", "ta", "te", "kn", "ml"],
    },
}

# MMS language adapters are keyed by ISO 639-3 codes, not the two-letter
# codes used elsewhere in this app
MMS_LANG_CODES = {
    "hi": "hin", "gu": "guj", "mr": "mar",
    "ta": "tam", "te": "tel", "kn": "kan", "ml": "mal",
}


# Load model and processor
def load_model_and_processor(model_name, lang_code=None):
    """Return (model, processor, model_type), or (None, None, error_message)."""
    config = MODEL_CONFIGS[model_name]
    repo = config["repo"]
    model_type = config["model_type"]

    try:
        if model_name == "IndicConformer":
            print(f"Loading {model_name}...")
            try:
                model = AutoModel.from_pretrained(
                    repo,
                    trust_remote_code=True,
                    torch_dtype=torch.float32,
                    low_cpu_mem_usage=True,
                )
            except Exception as e1:
                print(f"Primary loading failed, trying fallback: {e1}")
                model = AutoModel.from_pretrained(repo, trust_remote_code=True)
            # IndicConformer bundles its own preprocessing via remote code
            processor = None
            return model, processor, model_type

        elif model_name in ("AudioX-North", "AudioX-South"):
            # AudioX variants are Whisper fine-tunes
            processor = WhisperProcessor.from_pretrained(repo)
            model = WhisperForConditionalGeneration.from_pretrained(repo)
            model.config.forced_decoder_ids = None
            return model, processor, model_type

        elif model_name == "MMS":
            processor = AutoProcessor.from_pretrained(repo)
            model = AutoModelForCTC.from_pretrained(repo)
            # MMS ships per-language adapters; without loading one, the
            # default (English) adapter is used and Indic output degrades
            if lang_code in MMS_LANG_CODES:
                iso3 = MMS_LANG_CODES[lang_code]
                processor.tokenizer.set_target_lang(iso3)
                model.load_adapter(iso3)
            return model, processor, model_type

        return None, None, f"Error loading model: unknown model {model_name}"
    except Exception as e:
        return None, None, f"Error loading model: {str(e)}"


# Compute metrics (WER, CER, RTF)
def compute_metrics(reference, hypothesis, audio_duration, total_time):
    """Compute WER, CER, and real-time factor (RTF = processing time / audio duration)."""
    if not reference or not hypothesis:
        return None, None, None, None
    try:
        reference = reference.strip().lower()
        hypothesis = hypothesis.strip().lower()
        wer_score = wer(reference, hypothesis)
        cer_score = cer(reference, hypothesis)
        rtf = total_time / audio_duration if audio_duration > 0 else None
        return wer_score, cer_score, rtf, total_time
    except Exception:
        return None, None, None, None
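
# Illustrative check of the metric helpers (made-up strings, not app data):
# compute_metrics("hello world", "hello word", audio_duration=2.0, total_time=0.5)
# yields WER 0.5 (1 of 2 words substituted), CER ~0.091 (edit distance 1 over
# 11 reference characters), and RTF 0.25 (0.5 s of compute for 2.0 s of
# audio; RTF < 1 means faster than real time).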

# Main transcription function
def transcribe_audio(audio_file, selected_language, selected_models, reference_text=""):
    if not audio_file:
        return "Please upload an audio file.", [], ""
    if not selected_models:
        return "Please select at least one model.", [], ""
    if not selected_language:
        return "Please select a language.", [], ""

    # Get language info
    lang_info = LANGUAGE_CONFIGS[selected_language]
    lang_code = lang_info["code"]

    table_data = []

    try:
        # Load and preprocess audio once; librosa resamples to 16 kHz here,
        # so no per-model resampling is needed below
        audio, sr = librosa.load(audio_file, sr=16000)
        audio_duration = len(audio) / sr

        for model_name in selected_models:
            # Skip models that do not support the selected language
            if model_name not in lang_info["models"]:
                table_data.append([
                    model_name,
                    f"Language {selected_language} not supported by this model",
                    "-", "-", "-", "-",
                ])
                continue

            model, processor, model_type = load_model_and_processor(model_name, lang_code)
            if isinstance(model_type, str) and model_type.startswith("Error"):
                table_data.append([model_name, model_type, "-", "-", "-", "-"])
                continue

            start_time = time.time()
            try:
                if model_name == "IndicConformer":
                    # AI4Bharat-specific processing: peak-normalize, then pass
                    # the language code and decoder ("rnnt") to the model
                    wav = torch.from_numpy(audio).unsqueeze(0)
                    if torch.max(torch.abs(wav)) > 0:
                        wav = wav / torch.max(torch.abs(wav))
                    with torch.no_grad():
                        transcription = model(wav, lang_code, "rnnt")
                    if isinstance(transcription, list):
                        transcription = transcription[0] if transcription else ""
                    transcription = str(transcription).strip()

                elif model_name in ("AudioX-North", "AudioX-South"):
                    # AudioX Whisper-based processing
                    input_features = processor(
                        audio, sampling_rate=16000, return_tensors="pt"
                    ).input_features
                    with torch.no_grad():
                        predicted_ids = model.generate(
                            input_features,
                            task="transcribe",
                            language=lang_code,
                        )
                    transcription = processor.batch_decode(
                        predicted_ids, skip_special_tokens=True
                    )[0]

                else:  # MMS
                    # Standard CTC processing: argmax over logits, then decode
                    inputs = processor(audio, sampling_rate=16000, return_tensors="pt")
                    with torch.no_grad():
                        logits = model(inputs["input_values"]).logits
                    predicted_ids = torch.argmax(logits, dim=-1)
                    transcription = processor.batch_decode(
                        predicted_ids, skip_special_tokens=True
                    )[0]
            except Exception as e:
                transcription = f"Processing error: {str(e)}"

            total_time = time.time() - start_time

            # Compute metrics when a reference is available
            wer_score, cer_score, rtf = "-", "-", "-"
            if reference_text and transcription and not transcription.startswith("Processing error"):
                wer_val, cer_val, rtf_val, _ = compute_metrics(
                    reference_text, transcription, audio_duration, total_time
                )
                wer_score = f"{wer_val:.3f}" if wer_val is not None else "-"
                cer_score = f"{cer_val:.3f}" if cer_val is not None else "-"
                rtf = f"{rtf_val:.3f}" if rtf_val is not None else "-"

            # Add row to table
            table_data.append([
                model_name,
                transcription,
                wer_score,
                cer_score,
                rtf,
                f"{total_time:.2f}s",
            ])

        # Create summary text
        summary = f"**Language:** {selected_language} ({lang_code})\n"
        summary += f"**Audio Duration:** {audio_duration:.2f}s\n"
        summary += f"**Models Tested:** {len(selected_models)}\n"
        if reference_text:
            preview = reference_text[:100] + ("..." if len(reference_text) > 100 else "")
            summary += f"**Reference Text:** {preview}\n"

        # Create copyable text output
        copyable_text = "MULTILINGUAL SPEECH-TO-TEXT BENCHMARK RESULTS\n" + "=" * 55 + "\n\n"
        copyable_text += f"Language: {selected_language} ({lang_code})\n"
        copyable_text += f"Script: {lang_info['script']}\n"
        copyable_text += f"Audio Duration: {audio_duration:.2f}s\n"
        copyable_text += f"Models Tested: {len(selected_models)}\n"
        if reference_text:
            copyable_text += f"Reference Text: {reference_text}\n"
        copyable_text += "\n" + "-" * 55 + "\n\n"

        for i, row in enumerate(table_data):
            copyable_text += f"MODEL {i + 1}: {row[0]}\n"
            copyable_text += f"Transcription: {row[1]}\n"
            copyable_text += f"WER: {row[2]}\n"
            copyable_text += f"CER: {row[3]}\n"
            copyable_text += f"RTF: {row[4]}\n"
            copyable_text += f"Time Taken: {row[5]}\n"
            copyable_text += "\n" + "-" * 35 + "\n\n"

        return summary, table_data, copyable_text

    except Exception as e:
        error_msg = f"Error during transcription: {str(e)}"
        return error_msg, [], error_msg
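
# Optional batch wrapper (a sketch, not part of the UI wiring): reuses
# transcribe_audio to benchmark several clips in one call. The file paths
# and reference texts are caller-supplied assumptions.
def benchmark_files(files, language, models, references=None):
    """Run the benchmark over multiple audio files and pool the result rows."""
    references = references or [""] * len(files)
    all_rows = []
    for path, ref in zip(files, references):
        _, rows, _ = transcribe_audio(path, language, models, ref)
        all_rows.extend(rows)
    return all_rows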

# Create Gradio interface
def create_interface():
    language_choices = list(LANGUAGE_CONFIGS.keys())

    with gr.Blocks(
        title="Multilingual Speech-to-Text Benchmark",
        css="""
        .language-info { background: #f0f8ff; padding: 10px; border-radius: 5px; margin: 10px 0; }
        .copy-area { font-family: monospace; font-size: 12px; }
        """,
    ) as iface:
        gr.Markdown("""
        # 🌐 Multilingual Speech-to-Text Benchmark

        Compare ASR models across **7 Indian languages** with comprehensive metrics.

        **Supported Languages:** Hindi, Gujarati, Marathi, Tamil, Telugu, Kannada, Malayalam
        """)

        with gr.Row():
            with gr.Column(scale=1):
                # Language selection
                language_selection = gr.Dropdown(
                    choices=language_choices,
                    label="🗣️ Select Language",
                    value=language_choices[0],
                    interactive=True,
                )

                audio_input = gr.Audio(
                    label="🎤 Upload Audio File (16 kHz recommended)",
                    type="filepath",
                )

                # Model choices are re-filtered whenever the language changes
                model_selection = gr.CheckboxGroup(
                    choices=["AudioX-North", "IndicConformer", "MMS"],
                    label="🤖 Select Models",
                    value=["AudioX-North", "IndicConformer"],
                    interactive=True,
                )

                reference_input = gr.Textbox(
                    label="📄 Reference Text (optional, paste supported)",
                    placeholder="Paste reference transcription here...",
                    lines=4,
                    interactive=True,
                )

                submit_btn = gr.Button(
                    "🚀 Run Multilingual Benchmark", variant="primary", size="lg"
                )

            with gr.Column(scale=2):
                summary_output = gr.Markdown(
                    label="📊 Summary",
                    value="Select a language, upload an audio file, and choose models to begin...",
                )

                results_table = gr.Dataframe(
                    headers=["Model", "Transcription", "WER", "CER", "RTF", "Time"],
                    datatype=["str", "str", "str", "str", "str", "str"],
                    label="🏆 Results Comparison",
                    interactive=False,
                    wrap=True,
                    column_widths=[120, 350, 60, 60, 60, 80],
                )

                # Copyable results section
                with gr.Group():
                    gr.Markdown("### 📋 Export Results")
                    copyable_output = gr.Textbox(
                        label="Copy-Paste Friendly Results",
                        lines=12,
                        max_lines=25,
                        show_copy_button=True,
                        interactive=False,
                        elem_classes="copy-area",
                        placeholder="Benchmark results will appear here...",
                    )

        # Update model choices based on language selection
        def update_model_choices(selected_language):
            if not selected_language:
                return gr.CheckboxGroup(choices=[], value=[])
            available_choices = LANGUAGE_CONFIGS[selected_language]["models"]
            # Pre-select up to the first two available models by default
            default_selection = available_choices[:2]
            return gr.CheckboxGroup(choices=available_choices, value=default_selection)

        # Connect language selection to model updates
        language_selection.change(
            fn=update_model_choices,
            inputs=[language_selection],
            outputs=[model_selection],
        )

        # Connect the main function
        submit_btn.click(
            fn=transcribe_audio,
            inputs=[audio_input, language_selection, model_selection, reference_input],
            outputs=[summary_output, results_table, copyable_output],
        )
        reference_input.submit(
            fn=transcribe_audio,
            inputs=[audio_input, language_selection, model_selection, reference_input],
            outputs=[summary_output, results_table, copyable_output],
        )

        # Language information display
        gr.Markdown("""
        ---
        ### 📤 Language & Model Support Matrix

        | Language  | Script     | AudioX-North | AudioX-South | IndicConformer | MMS |
        |-----------|------------|--------------|--------------|----------------|-----|
        | Hindi     | Devanagari | ✅ | ❌ | ✅ | ✅ |
        | Gujarati  | Gujarati   | ✅ | ❌ | ✅ | ✅ |
        | Marathi   | Devanagari | ✅ | ❌ | ✅ | ✅ |
        | Tamil     | Tamil      | ❌ | ✅ | ✅ | ✅ |
        | Telugu    | Telugu     | ❌ | ✅ | ✅ | ✅ |
        | Kannada   | Kannada    | ❌ | ✅ | ✅ | ✅ |
        | Malayalam | Malayalam  | ❌ | ✅ | ✅ | ✅ |

        ### 💡 Tips
        - **Models auto-filter** based on the selected language
        - **Reference Text**: provide ground truth to enable WER/CER calculation
        - **Copy Results**: export formatted results using the copy button
        - **Best Performance**: use AudioX models for their specialized languages
        """)

    return iface


if __name__ == "__main__":
    iface = create_interface()
    iface.launch(
        share=False,
        debug=True,
        server_name="0.0.0.0",
        server_port=7860,
        show_error=True,
    )
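
# Programmatic smoke test (illustrative; "sample_hi.wav" and the reference
# string are placeholder assumptions, not files shipped with this app).
# Run it in a separate session rather than after iface.launch(), which blocks:
#
#   summary, rows, export_text = transcribe_audio(
#       "sample_hi.wav",
#       "Hindi (हिंदी)",
#       ["MMS"],
#       reference_text="नमस्ते दुनिया",
#   )
#   print(export_text)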