AvtnshM committed
Commit eaef30e · verified · 1 Parent(s): 71fe323
Files changed (1)
  1. app.py +202 -89
app.py CHANGED
@@ -6,29 +6,74 @@ from transformers import (
     AutoProcessor,
     AutoModelForCTC,
     AutoModel,
+    WhisperProcessor,
+    WhisperForConditionalGeneration,
 )
 import librosa
 import numpy as np
 from jiwer import wer, cer
 import time
 
+# Language configurations
+LANGUAGE_CONFIGS = {
+    "Hindi (हिंदी)": {
+        "code": "hi",
+        "script": "Devanagari",
+        "models": ["AudioX-North", "IndicConformer", "MMS"]
+    },
+    "Gujarati (ગુજરાતી)": {
+        "code": "gu",
+        "script": "Gujarati",
+        "models": ["AudioX-North", "IndicConformer", "MMS"]
+    },
+    "Marathi (मराठी)": {
+        "code": "mr",
+        "script": "Devanagari",
+        "models": ["AudioX-North", "IndicConformer", "MMS"]
+    },
+    "Tamil (தமிழ்)": {
+        "code": "ta",
+        "script": "Tamil",
+        "models": ["AudioX-South", "IndicConformer", "MMS"]
+    },
+    "Telugu (తెలుగు)": {
+        "code": "te",
+        "script": "Telugu",
+        "models": ["AudioX-South", "IndicConformer", "MMS"]
+    },
+    "Kannada (ಕನ್ನಡ)": {
+        "code": "kn",
+        "script": "Kannada",
+        "models": ["AudioX-South", "IndicConformer", "MMS"]
+    }
+}
+
 # Model configurations
 MODEL_CONFIGS = {
-    "AudioX-North (Jivi AI)": {
+    "AudioX-North": {
         "repo": "jiviai/audioX-north-v1",
-        "model_type": "seq2seq",
+        "model_type": "whisper",
         "description": "Supports Hindi, Gujarati, Marathi",
+        "languages": ["hi", "gu", "mr"]
     },
-    "IndicConformer (AI4Bharat)": {
+    "AudioX-South": {
+        "repo": "jiviai/audioX-south-v1",
+        "model_type": "whisper",
+        "description": "Supports Tamil, Telugu, Kannada, Malayalam",
+        "languages": ["ta", "te", "kn", "ml"]
+    },
+    "IndicConformer": {
         "repo": "ai4bharat/indic-conformer-600m-multilingual",
         "model_type": "ctc_rnnt",
         "description": "Supports 22 Indian languages",
         "trust_remote_code": True,
+        "languages": ["hi", "gu", "mr", "ta", "te", "kn", "ml", "bn", "pa", "or", "as", "ur"]
     },
-    "MMS (Facebook)": {
+    "MMS": {
         "repo": "facebook/mms-1b-all",
-        "model_type": "ctc",
-        "description": "Supports over 1,400 languages (fine-tuning recommended)",
+        "model_type": "ctc",
+        "description": "Supports 1,400+ languages",
+        "languages": ["hi", "gu", "mr", "ta", "te", "kn", "ml"]
     },
 }
 
@@ -40,8 +85,7 @@ def load_model_and_processor(model_name):
     trust_remote_code = config.get("trust_remote_code", False)
 
     try:
-        if model_name == "IndicConformer (AI4Bharat)":
-            # Use the working method for AI4Bharat model
+        if model_name == "IndicConformer":
             print(f"Loading {model_name}...")
             try:
                 model = AutoModel.from_pretrained(
@@ -53,21 +97,21 @@ def load_model_and_processor(model_name):
             except Exception as e1:
                 print(f"Primary loading failed, trying fallback: {e1}")
                 model = AutoModel.from_pretrained(repo, trust_remote_code=True)
-
-            # AI4Bharat doesn't use a traditional processor
             processor = None
             return model, processor, model_type
-        elif model_name == "MMS (Facebook)":
+
+        elif model_name in ["AudioX-North", "AudioX-South"]:
+            # Use Whisper processor and model for AudioX variants
+            processor = WhisperProcessor.from_pretrained(repo)
+            model = WhisperForConditionalGeneration.from_pretrained(repo)
+            model.config.forced_decoder_ids = None
+            return model, processor, model_type
+
+        elif model_name == "MMS":
             model = AutoModelForCTC.from_pretrained(repo)
             processor = AutoProcessor.from_pretrained(repo)
-        else:  # AudioX-North
-            processor = AutoProcessor.from_pretrained(repo, trust_remote_code=trust_remote_code)
-            if model_type == "seq2seq":
-                model = AutoModelForSpeechSeq2Seq.from_pretrained(repo, trust_remote_code=trust_remote_code)
-            else:
-                model = AutoModelForCTC.from_pretrained(repo, trust_remote_code=trust_remote_code)
+            return model, processor, model_type
 
-        return model, processor, model_type
     except Exception as e:
         return None, None, f"Error loading model: {str(e)}"
 
@@ -86,13 +130,20 @@ def compute_metrics(reference, hypothesis, audio_duration, total_time):
         return None, None, None, None
 
 # Main transcription function
-def transcribe_audio(audio_file, selected_models, reference_text=""):
+def transcribe_audio(audio_file, selected_language, selected_models, reference_text=""):
     if not audio_file:
         return "Please upload an audio file.", [], ""
 
     if not selected_models:
         return "Please select at least one model.", [], ""
 
+    if not selected_language:
+        return "Please select a language.", [], ""
+
+    # Get language info
+    lang_info = LANGUAGE_CONFIGS[selected_language]
+    lang_code = lang_info["code"]
+
     table_data = []
     try:
         # Load and preprocess audio once
@@ -100,48 +151,63 @@ def transcribe_audio(audio_file, selected_models, reference_text=""):
         audio_duration = len(audio) / sr
 
         for model_name in selected_models:
+            # Check if model supports the selected language
+            if model_name not in lang_info["models"]:
+                table_data.append([
+                    model_name,
+                    f"Language {selected_language} not supported by this model",
+                    "-", "-", "-", "-"
+                ])
+                continue
+
             model, processor, model_type = load_model_and_processor(model_name)
             if isinstance(model_type, str) and model_type.startswith("Error"):
                 table_data.append([
                     model_name,
                     f"Error: {model_type}",
-                    "-",
-                    "-",
-                    "-",
-                    "-"
+                    "-", "-", "-", "-"
                 ])
                 continue
 
             start_time = time.time()
 
-            # Handle different model types
             try:
-                if model_name == "IndicConformer (AI4Bharat)":
-                    # Use AI4Bharat specific processing
-                    wav = torch.from_numpy(audio).unsqueeze(0)  # Add batch dimension
+                if model_name == "IndicConformer":
+                    # AI4Bharat specific processing
+                    wav = torch.from_numpy(audio).unsqueeze(0)
                     if torch.max(torch.abs(wav)) > 0:
-                        wav = wav / torch.max(torch.abs(wav))  # Normalize
+                        wav = wav / torch.max(torch.abs(wav))
 
                     with torch.no_grad():
-                        # Default to Hindi and RNNT for AI4Bharat
-                        transcription = model(wav, "hi", "rnnt")
+                        transcription = model(wav, lang_code, "rnnt")
                     if isinstance(transcription, list):
                         transcription = transcription[0] if transcription else ""
                     transcription = str(transcription).strip()
-                else:
-                    # Standard processing for other models
+
+                elif model_name in ["AudioX-North", "AudioX-South"]:
+                    # AudioX Whisper-based processing
+                    if sr != 16000:
+                        audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)
+
+                    input_features = processor(audio, sampling_rate=16000, return_tensors="pt").input_features
+
+                    with torch.no_grad():
+                        predicted_ids = model.generate(
+                            input_features,
+                            task="transcribe",
+                            language=lang_code
+                        )
+                    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
+
+                else:  # MMS
+                    # Standard CTC processing for MMS
                    inputs = processor(audio, sampling_rate=16000, return_tensors="pt")
 
                    with torch.no_grad():
-                        if model_type == "seq2seq":
-                            input_features = inputs["input_features"]
-                            outputs = model.generate(input_features)
-                            transcription = processor.batch_decode(outputs, skip_special_tokens=True)[0]
-                        else:  # CTC or RNNT
-                            input_values = inputs["input_values"]
-                            logits = model(input_values).logits
-                            predicted_ids = torch.argmax(logits, dim=-1)
-                            transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
+                        input_values = inputs["input_values"]
+                        logits = model(input_values).logits
+                        predicted_ids = torch.argmax(logits, dim=-1)
+                        transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
 
             except Exception as e:
                 transcription = f"Processing error: {str(e)}"
@@ -169,18 +235,21 @@ def transcribe_audio(audio_file, selected_models, reference_text=""):
                 ])
 
         # Create summary text
-        summary = f"**Audio Duration:** {audio_duration:.2f}s\n"
+        summary = f"**Language:** {selected_language} ({lang_code})\n"
+        summary += f"**Audio Duration:** {audio_duration:.2f}s\n"
         summary += f"**Models Tested:** {len(selected_models)}\n"
         if reference_text:
             summary += f"**Reference Text:** {reference_text[:100]}{'...' if len(reference_text) > 100 else ''}\n"
 
         # Create copyable text output
-        copyable_text = "SPEECH-TO-TEXT BENCHMARK RESULTS\n" + "="*50 + "\n\n"
+        copyable_text = "MULTILINGUAL SPEECH-TO-TEXT BENCHMARK RESULTS\n" + "="*55 + "\n\n"
+        copyable_text += f"Language: {selected_language} ({lang_code})\n"
+        copyable_text += f"Script: {lang_info['script']}\n"
         copyable_text += f"Audio Duration: {audio_duration:.2f}s\n"
         copyable_text += f"Models Tested: {len(selected_models)}\n"
         if reference_text:
             copyable_text += f"Reference Text: {reference_text}\n"
-        copyable_text += "\n" + "-"*50 + "\n\n"
+        copyable_text += "\n" + "-"*55 + "\n\n"
 
         for i, row in enumerate(table_data):
             copyable_text += f"MODEL {i+1}: {row[0]}\n"
@@ -189,105 +258,149 @@ def transcribe_audio(audio_file, selected_models, reference_text=""):
             copyable_text += f"CER: {row[3]}\n"
             copyable_text += f"RTF: {row[4]}\n"
             copyable_text += f"Time Taken: {row[5]}\n"
-            copyable_text += "\n" + "-"*30 + "\n\n"
+            copyable_text += "\n" + "-"*35 + "\n\n"
 
         return summary, table_data, copyable_text
     except Exception as e:
         error_msg = f"Error during transcription: {str(e)}"
         return error_msg, [], error_msg
 
-# Create Gradio interface with blocks for better control
+# Create Gradio interface
 def create_interface():
-    model_choices = list(MODEL_CONFIGS.keys())
+    language_choices = list(LANGUAGE_CONFIGS.keys())
 
     with gr.Blocks(title="Multilingual Speech-to-Text Benchmark", css="""
-        .paste-button { margin: 5px 0; }
+        .language-info { background: #f0f8ff; padding: 10px; border-radius: 5px; margin: 10px 0; }
         .copy-area { font-family: monospace; font-size: 12px; }
     """) as iface:
         gr.Markdown("""
-        # Multilingual Speech-to-Text Benchmark
-        Upload an audio file, select one or more models, and optionally provide reference text.
-        The app benchmarks WER, CER, RTF, and Time Taken for each model.
+        # 🌏 Multilingual Speech-to-Text Benchmark
+
+        Compare ASR models across **6 Indian Languages** with comprehensive metrics.
+
+        **Supported Languages:** Hindi, Gujarati, Marathi, Tamil, Telugu, Kannada
        """)
 
        with gr.Row():
            with gr.Column(scale=1):
+                # Language selection
+                language_selection = gr.Dropdown(
+                    choices=language_choices,
+                    label="🗣️ Select Language",
+                    value=language_choices[0],
+                    interactive=True
+                )
+
                audio_input = gr.Audio(
-                    label="Upload Audio File (16kHz recommended)",
+                    label="📹 Upload Audio File (16kHz recommended)",
                    type="filepath"
                )
+
+                # Dynamic model selection based on language
                model_selection = gr.CheckboxGroup(
-                    choices=model_choices,
-                    label="Select Models",
-                    value=[model_choices[0]],  # Default to first model
+                    choices=["AudioX-North", "IndicConformer", "MMS"],
+                    label="🤖 Select Models",
+                    value=["AudioX-North", "IndicConformer"],
                    interactive=True
                )
 
-                # Enhanced reference text input with paste functionality
-                with gr.Group():
-                    gr.Markdown("### Reference Text (Optional for WER/CER)")
-                    reference_input = gr.Textbox(
-                        label="Reference Text (optional, paste supported)",
-                        placeholder="Paste reference transcription here...",
-                        lines=4,
-                        interactive=True
-                    )
-
-                submit_btn = gr.Button("🚀 Transcribe", variant="primary", size="lg")
+                reference_input = gr.Textbox(
+                    label="📝 Reference Text (optional, paste supported)",
+                    placeholder="Paste reference transcription here...",
+                    lines=4,
+                    interactive=True
+                )
+
+                submit_btn = gr.Button("🚀 Run Multilingual Benchmark", variant="primary", size="lg")
 
            with gr.Column(scale=2):
-                summary_output = gr.Markdown(label="Summary", value="Upload an audio file and select models to begin...")
+                summary_output = gr.Markdown(
+                    label="📊 Summary",
+                    value="Select language, upload audio file and choose models to begin..."
+                )
 
                results_table = gr.Dataframe(
-                    headers=["Model", "Transcription", "WER", "CER", "RTF", "Time Taken"],
+                    headers=["Model", "Transcription", "WER", "CER", "RTF", "Time"],
                    datatype=["str", "str", "str", "str", "str", "str"],
-                    label="Results Comparison",
+                    label="🏆 Results Comparison",
                    interactive=False,
                    wrap=True,
-                    column_widths=[150, 400, 80, 80, 80, 100]
+                    column_widths=[120, 350, 60, 60, 60, 80]
                )
 
                # Copyable results section
                with gr.Group():
-                    gr.Markdown("### 📋 Copy Results")
+                    gr.Markdown("### 📋 Export Results")
                    copyable_output = gr.Textbox(
                        label="Copy-Paste Friendly Results",
-                        lines=15,
-                        max_lines=30,
+                        lines=12,
+                        max_lines=25,
                        show_copy_button=True,
                        interactive=False,
                        elem_classes="copy-area",
-                        placeholder="Results will appear here in copy-paste friendly format..."
+                        placeholder="Benchmark results will appear here..."
                    )
 
-        # Connect the function
+        # Update model choices based on language selection
+        def update_model_choices(selected_language):
+            if not selected_language:
+                return gr.CheckboxGroup(choices=[], value=[])
+
+            lang_info = LANGUAGE_CONFIGS[selected_language]
+            available_models = lang_info["models"]
+
+            # Map display names
+            model_map = {
+                "AudioX-North": "AudioX-North",
+                "AudioX-South": "AudioX-South",
+                "IndicConformer": "IndicConformer",
+                "MMS": "MMS"
+            }
+
+            available_choices = [model_map[model] for model in available_models if model in model_map]
+            default_selection = available_choices[:2] if len(available_choices) >= 2 else available_choices
+
+            return gr.CheckboxGroup(choices=available_choices, value=default_selection)
+
+        # Connect language selection to model updates
+        language_selection.change(
+            fn=update_model_choices,
+            inputs=[language_selection],
+            outputs=[model_selection]
+        )
+
+        # Connect the main function
        submit_btn.click(
            fn=transcribe_audio,
-            inputs=[audio_input, model_selection, reference_input],
+            inputs=[audio_input, language_selection, model_selection, reference_input],
            outputs=[summary_output, results_table, copyable_output]
        )
 
-        # Also allow triggering on Enter in reference text
        reference_input.submit(
            fn=transcribe_audio,
-            inputs=[audio_input, model_selection, reference_input],
+            inputs=[audio_input, language_selection, model_selection, reference_input],
            outputs=[summary_output, results_table, copyable_output]
        )
 
-        # Add example and instructions
+        # Language information display
        gr.Markdown("""
        ---
+        ### 🔤 Language & Model Support Matrix
+
+        | Language | Script | AudioX-North | AudioX-South | IndicConformer | MMS |
+        |----------|--------|--------------|--------------|----------------|-----|
+        | Hindi    | Devanagari | ✅ | ❌ | ✅ | ✅ |
+        | Gujarati | Gujarati   | ✅ | ❌ | ✅ | ✅ |
+        | Marathi  | Devanagari | ✅ | ❌ | ✅ | ✅ |
+        | Tamil    | Tamil      | ❌ | ✅ | ✅ | ✅ |
+        | Telugu   | Telugu     | ❌ | ✅ | ✅ | ✅ |
+        | Kannada  | Kannada    | ❌ | ✅ | ✅ | ✅ |
+
        ### 💡 Tips:
-        - **Reference Text**: Paste your ground truth text to calculate WER/CER metrics
-        - **Copy Results**: Use the copy button in the results section to copy formatted results
-        - **AI4Bharat Model**: Automatically uses Hindi language with RNNT decoding
-        - **Supported Formats**: WAV, MP3, FLAC, M4A (16kHz recommended for best results)
+        - **Models auto-filter** based on selected language
+        - **Reference Text**: Enable WER/CER calculation by providing ground truth
+        - **Copy Results**: Export formatted results using the copy button
+        - **Best Performance**: Use AudioX models for their specialized languages
        """)
 
        return iface
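Note: the body of `compute_metrics` falls outside the changed hunks, so only its `return None, None, None, None` guard appears above. For context, a minimal sketch that matches the app's result columns (WER, CER, RTF, Time Taken) and the standard jiwer API could look like the following; the actual body in app.py may differ:

```python
from jiwer import wer, cer

def compute_metrics(reference, hypothesis, audio_duration, total_time):
    # Sketch only: the real implementation is not part of this diff.
    if not reference or not hypothesis:
        return None, None, None, None
    word_error = wer(reference, hypothesis)   # word error rate against the reference text
    char_error = cer(reference, hypothesis)   # character error rate
    rtf = total_time / audio_duration if audio_duration else None  # real-time factor
    return word_error, char_error, rtf, total_time
```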
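The hunks end at `create_interface()`'s return, so the module's entry point is not part of this commit view. A typical Gradio Space would launch the interface roughly as follows (a hypothetical sketch, not taken from this commit):

```python
# Hypothetical entry point; not shown in this commit's hunks.
if __name__ == "__main__":
    demo = create_interface()
    demo.launch()
```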