import gradio as gr
import spaces
import torch
from transformers import AutoProcessor, VoxtralForConditionalGeneration

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Hugging Face repositories holding the `transformers`-format Voxtral weights.
MINI_REPO_ID = "MohamedRashad/Voxtral-Mini-3B-2507-transformers"
SMALL_REPO_ID = "MohamedRashad/Voxtral-Small-24B-2507-transformers"

# Load models and processors once at import time so requests reuse them.
voxtral_mini_processor = AutoProcessor.from_pretrained(MINI_REPO_ID)
voxtral_mini_model = VoxtralForConditionalGeneration.from_pretrained(
    MINI_REPO_ID, torch_dtype=torch.bfloat16, device_map=device
)

voxtral_small_processor = AutoProcessor.from_pretrained(SMALL_REPO_ID)
voxtral_small_model = VoxtralForConditionalGeneration.from_pretrained(
    SMALL_REPO_ID, torch_dtype=torch.bfloat16, device_map=device
)

# UI display name -> (model, processor, repo id). Insertion order is the
# order shown in the model dropdown.
MODELS = {
    "Voxtral Mini (3B)": (voxtral_mini_model, voxtral_mini_processor, MINI_REPO_ID),
    "Voxtral Small (24B)": (voxtral_small_model, voxtral_small_processor, SMALL_REPO_ID),
}

# UI display name -> language code passed to the transcription request.
LANGUAGES = {
    "English": "en",
    "French": "fr",
    "German": "de",
    "Spanish": "es",
    "Italian": "it",
    "Portuguese": "pt",
    "Dutch": "nl",
    "Russian": "ru",
    "Chinese": "zh",
    "Japanese": "ja",
    "Arabic": "ar",
}


def _build_transcription_inputs(processor, **kwargs):
    """Call the processor's transcription-request builder under either spelling.

    The original code called the misspelled ``apply_transcrition_request``.
    NOTE(review): newer transformers releases are understood to expose the
    corrected ``apply_transcription_request`` and deprecate the misspelled
    alias -- confirm against the pinned transformers version. Preferring the
    corrected name and falling back keeps the app working on both.
    """
    build = getattr(processor, "apply_transcription_request", None)
    if build is None:
        build = processor.apply_transcrition_request
    return build(**kwargs)


@spaces.GPU()
def process_audio(audio_path, model_name, lang_name, max_tokens=500):
    """Process audio with selected Voxtral model and return the generated response.

    This function takes an audio file and processes it using the selected Voxtral model
    to generate a transcription in the specified language.

    Args:
        audio_path: Path to the audio file to be transcribed.
        model_name: Name of the Voxtral model to use ("Voxtral Mini (3B)" or "Voxtral Small (24B)").
        lang_name: Name of the language for transcription (e.g., "English", "French", etc.).
        max_tokens: Maximum number of tokens to generate in the output (default: 500).

    Returns:
        String containing the transcribed text from the audio file, or an error message
        if the audio file is missing or an invalid model or language is selected.
    """
    if not audio_path:
        return "Please upload an audio file."

    selection = MODELS.get(model_name)
    if selection is None:
        return "Invalid model selected."
    model, processor, repo_id = selection

    # Report an unknown language the same way as an unknown model instead of
    # letting a KeyError escape to API/MCP callers.
    language = LANGUAGES.get(lang_name)
    if language is None:
        return "Invalid language selected."

    inputs = _build_transcription_inputs(
        processor, language=language, audio=audio_path, model_id=repo_id
    )
    inputs = inputs.to(device, dtype=torch.bfloat16)

    # The value may arrive as a float (e.g. from API clients); generation
    # expects an integer token budget.
    outputs = model.generate(**inputs, max_new_tokens=int(max_tokens))

    # Drop the prompt tokens so only the newly generated text is decoded.
    prompt_length = inputs.input_ids.shape[1]
    decoded_outputs = processor.batch_decode(
        outputs[:, prompt_length:], skip_special_tokens=True
    )
    return decoded_outputs[0]


# Define Gradio interface
with gr.Blocks(title="Voxtral Demo") as demo:
    gr.Markdown("# Voxtral Transcription Demo")
    gr.Markdown("Upload an audio file and get a transcription from Voxtral.")
    gr.Markdown("You can find the `transformers` version of Voxtral here: [3B](https://huggingface.co/MohamedRashad/Voxtral-Mini-3B-2507-transformers), [24B](https://huggingface.co/MohamedRashad/Voxtral-Small-24B-2507-transformers)")

    with gr.Row():
        with gr.Column():
            audio_input = gr.Audio(type="filepath", label="Upload Audio")
            model_selector = gr.Dropdown(
                choices=list(MODELS),
                value="Voxtral Mini (3B)",
                label="Select Model",
            )
            language = gr.Dropdown(
                choices=list(LANGUAGES),
                value="English",
                label="Language",
            )
            max_tokens = gr.Slider(minimum=50, maximum=1000, value=500, step=50, label="Max Output Tokens")
            submit_btn = gr.Button("Extract Transcription", variant="primary")

        with gr.Column():
            output_text = gr.Textbox(label="Generated Response", lines=10)

    submit_btn.click(
        fn=process_audio,
        inputs=[audio_input, model_selector, language, max_tokens],
        outputs=output_text,
    )

    gr.Examples(
        examples=[
            ["examples/english_armstrong_small_step.mp3", "Voxtral Mini (3B)", "English", 500],
            ["examples/french_mathis_voice_intro.mp3", "Voxtral Mini (3B)", "French", 500],
            ["examples/german_spehr_voice_intro.mp3", "Voxtral Mini (3B)", "German", 500],
            ["examples/japanese_ann01_announcement.mp3", "Voxtral Mini (3B)", "Japanese", 500],
            ["examples/arabic_news_report.mp3", "Voxtral Mini (3B)", "Arabic", 500],
            ["examples/arabic_yousif_saif_football.mp3", "Voxtral Small (24B)", "Arabic", 500],
        ],
        inputs=[audio_input, model_selector, language, max_tokens],
        example_labels=[
            "Neil Armstrong's 'small step' (English, 24s)",
            "Rémi Mathis voice intro (French, 16s)",
            "Christoph Spehr voice intro (German, 28s)",
            "Ann01 announcement (Japanese, 22s)",
            "News Report (Arabic, 10s)",
            "Football Commentry (Arabic, 11s)",
        ],
    )

# Launch the app
if __name__ == "__main__":
    demo.queue().launch(share=False, ssr_mode=False, mcp_server=True)