# Hugging Face Spaces: Loren / Voxtral_Mini_Evaluation
# Running on Zero
# File size: 26,374 Bytes

import gradio as gr
import torch
from transformers import AutoProcessor, VoxtralForConditionalGeneration
from pydub import AudioSegment
from pydub.silence import detect_silence
import yt_dlp
import requests
import validators
from urllib.parse import urlparse
import subprocess
import os
import re
import glob
import spaces

### Initializations

# Upper bound passed as `max_new_tokens` to `model.generate` by the
# transcription and translation handlers.
MAX_TOKENS = 32000

# Use CUDA when torch reports it as available, otherwise fall back to CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"*** Device: {device}")
model_name = 'mistralai/Voxtral-Mini-3B-2507'

# The processor and the model are loaded once, at import time, and are shared
# by every request handler defined in this file.
processor = AutoProcessor.from_pretrained(model_name)
model = VoxtralForConditionalGeneration.from_pretrained(model_name,
                                                        torch_dtype=torch.bfloat16,
                                                        device_map=device)
# Supported languages: display name shown in the UI -> short language code
# handed to the processor when building a transcription request.
dict_languages = dict(
    (
        ("English", "en"),
        ("French", "fr"),
        ("German", "de"),
        ("Spanish", "es"),
        ("Italian", "it"),
        ("Portuguese", "pt"),
        ("Dutch", "nl"),
        ("Hindi", "hi"),
    )
)

# Whitelist of MIME types accepted by the URL downloader, built from the
# sub-types allowed under each top-level media family.
ALLOWED_MIME_TYPES = {
    f"{family}/{subtype}"
    for family, subtypes in (
        ("audio", ("mpeg", "wav", "wave", "x-wav", "x-pn-wav", "ogg", "vorbis",
                   "aac", "mp4", "flac", "x-flac", "opus", "webm")),
        ("video", ("mp4", "mpeg", "ogg", "webm", "quicktime", "x-msvideo",
                   "x-matroska")),
    )
    for subtype in subtypes
}

# Maximum allowed size, in bytes, of a file fetched from a URL (1 GB).
MAX_FILE_SIZE = 1 * 1024 * 1024 * 1024  # 1 GB

# Directory where downloaded files are stored before being converted to WAV.
DOWNLOAD_DIR = "downloaded_files"
# `exist_ok=True` replaces the check-then-create sequence, which could fail if
# the directory appeared between the check and the creation.
os.makedirs(DOWNLOAD_DIR, exist_ok=True)

# Longest audio handled in a single pass; chunks_creation compares it against
# len(audio) (presumably milliseconds, i.e. 30 minutes — confirm against pydub).
MAX_LEN = 1800000 # 30 mn
# One-second silent segment (duration is given as 1000) referenced by
# chunks_creation when it pads chunk boundaries.
one_second_silence = AudioSegment.silent(duration=1000)

#### Functions

@spaces.GPU
def chunks_creation(audio_path):
    """Split an audio file into chunks that each fit within ``MAX_LEN``.

    Audio whose length does not exceed ``MAX_LEN`` is returned untouched.
    Longer audio is cut at detected silences: consecutive speech spans are
    merged greedily while the merged span (plus tail padding) still fits, and
    each resulting chunk is exported as ``chunk_<i>.wav`` in the working
    directory.

    Args:
        audio_path: Path of the audio file to inspect.

    Returns:
        A ``(list_audio_path, status)`` tuple. ``list_audio_path`` holds the
        original path (short audio) or the chunk paths (long audio); it is
        empty when chunking failed. ``status`` is a ``gr.Markdown`` message.

    Raises:
        Whatever ``AudioSegment.from_file`` raises when the file cannot be
        decoded (this call is intentionally outside the ``try`` block, as in
        the original implementation).
    """
    audio = AudioSegment.from_file(audio_path)
    total_len = len(audio)

    # Short enough: nothing to split.
    if total_len <= MAX_LEN:
        return [audio_path], gr.Markdown("👍 Audio duration less than max")

    list_audio_path = []
    try:
        # `one_second_silence` is an AudioSegment; only its *length* can take
        # part in the integer arithmetic on positions below.
        pad = len(one_second_silence)
        max_span = max(MAX_LEN - pad, 1)

        list_silent = detect_silence(audio, min_silence_len=300,
                                     # silent if 14 dB quieter than the average level
                                     silence_thresh=audio.dBFS - 14, seek_step=100)

        list_speech = _speech_intervals(list_silent, total_len, max_span)
        list_chunks = _group_intervals(list_speech, max_span)

        # Save chunks
        for i, (start, stop) in enumerate(list_chunks):
            # Pad the tail of the chunk, but never run into the next chunk
            # (that would duplicate speech) nor past the end of the audio.
            limit = list_chunks[i + 1][0] if i + 1 < len(list_chunks) else total_len
            padded_stop = min(stop + pad, limit)
            chunk_path = f"chunk_{i}.wav"
            # export() returns the open output file; close it to avoid leaking it.
            audio[start:padded_stop].export(chunk_path, format="wav").close()
            list_audio_path.append(chunk_path)

        status = gr.Markdown(f"✅ **Success!** {len(list_audio_path)} chunks saved.")
    except Exception as e:
        # Report an empty list so callers can tell that chunking failed.
        list_audio_path = []
        status = gr.Markdown(f"❌ **Unexpected error during chunks creation:** {e}")

    return list_audio_path, status


def _speech_intervals(list_silent, total_len, max_span):
    """Return the non-silent (start, stop) spans, each at most ``max_span`` long."""
    spans = []
    cursor = 0
    for silence_start, silence_stop in list_silent:
        if cursor < silence_start:
            spans.append((cursor, silence_start))
        cursor = silence_stop
    # Add the last span if the audio does not end on a silence.
    if cursor < total_len:
        spans.append((cursor, total_len))
    # No speech span at all: treat the whole audio as one span.
    if not spans:
        spans = [(0, total_len)]

    # A span with no usable silence inside may exceed the limit: cut it hard.
    bounded = []
    for start, stop in spans:
        while stop - start > max_span:
            bounded.append((start, start + max_span))
            start += max_span
        bounded.append((start, stop))
    return bounded


def _group_intervals(list_speech, max_span):
    """Greedily merge consecutive spans into chunks no longer than ``max_span``."""
    chunks = []
    chunk_start, chunk_stop = list_speech[0]
    for start, stop in list_speech[1:]:
        if stop - chunk_start <= max_span:
            chunk_stop = stop
        else:
            chunks.append((chunk_start, chunk_stop))
            chunk_start, chunk_stop = start, stop
    chunks.append((chunk_start, chunk_stop))
    return chunks
###

@spaces.GPU
def process_transcript(language: str, audio_path: str) -> tuple:
    """Transcribe an audio file, splitting it into chunks when it is too long.

    Args:
        language: Display name of the audio language (a key of
            ``dict_languages``).
        audio_path: Path to the audio file, or ``None`` when nothing was
            provided.

    Returns:
        A ``(text, status)`` tuple: the transcription (possibly partial if an
        error occurred midway) and a ``gr.Markdown`` status message.
    """
    if audio_path is None:
        return "", gr.Markdown("Please provide some input audio: either upload an audio file or use the microphone.")

    id_language = dict_languages.get(language)
    if id_language is None:
        return "", gr.Markdown(f"❌ **Error:** Unsupported language: {language}")

    # Verification of the duration, for possible division into chunks
    try:
        list_audio_path, chunk_status = chunks_creation(audio_path)
    except Exception as e:
        return "", gr.Markdown(f"❌ **Unexpected error while reading the audio:** {e}")
    if not list_audio_path:
        # Chunking failed; its status already carries the error message.
        return "", chunk_status

    # The processor method was first published under a misspelled name; use
    # the correctly spelled one when the installed release provides it and
    # fall back to the original spelling otherwise.
    build_request = getattr(processor, "apply_transcription_request", None)
    if build_request is None:
        build_request = processor.apply_transcrition_request

    # Transcription process
    parts = []
    try:
        for path in list_audio_path:
            inputs = build_request(language=id_language, audio=path, model_id=model_name)
            inputs = inputs.to(device, dtype=torch.bfloat16)
            outputs = model.generate(**inputs, max_new_tokens=MAX_TOKENS)
            # Keep only the tokens generated after the prompt.
            decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:],
                                                     skip_special_tokens=True)
            parts.append(decoded_outputs[0].strip())
        if len(list_audio_path) > 1:
            status = gr.Markdown(f"✅ **Success!** Transcription done ({len(list_audio_path)} chunks).")
        else:
            status = gr.Markdown("✅ **Success!** Transcription done.")
    except Exception as e:
        status = gr.Markdown(f"❌ **Unexpected error during transcription:** {e}")

    # Join with a space so words at chunk boundaries are not glued together.
    return " ".join(parts), status
###

@spaces.GPU
def process_translate(language: str, audio_path: str) -> tuple:
    """Ask the model to translate the content of an audio file.

    Args:
        language: Target language, inserted verbatim into the instruction
            sent to the model.
        audio_path: Path to the audio file, or ``None`` when nothing was
            provided.

    Returns:
        A ``(text, status)`` tuple: the generated translation (empty on
        failure) and a ``gr.Markdown`` status message.
    """
    if audio_path is None:
        return "", gr.Markdown("Please provide some input audio: either upload an audio file or use the microphone.")

    try:
        conversation = [
            {
                "role": "user",
                "content": [
                    {
                        "type": "audio",
                        "path": audio_path,
                    },
                    {"type": "text", "text": "Translate this in "+language},
                ],
            }
        ]

        inputs = processor.apply_chat_template(conversation)
        inputs = inputs.to(device, dtype=torch.bfloat16)

        outputs = model.generate(**inputs, max_new_tokens=MAX_TOKENS)
        # Keep only the tokens generated after the prompt.
        decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
        result = decoded_outputs[0]
    except Exception as e:
        return "", gr.Markdown(f"❌ **Unexpected error during translation:** {e}")

    return result, gr.Markdown("✅ **Success!** Translation done.")
###

@spaces.GPU
def process_chat(question: str, audio_path: str) -> tuple:
    """Answer a free-form question about an audio file.

    Args:
        question: The user's question, sent to the model together with the
            audio.
        audio_path: Path to the audio file, or ``None`` when nothing was
            provided.

    Returns:
        A ``(text, status)`` tuple: the model's answer (empty on failure) and
        a ``gr.Markdown`` status message.
    """
    if audio_path is None:
        return "", gr.Markdown("Please provide some input audio: either upload an audio file or use the microphone.")
    if not question or not question.strip():
        return "", gr.Markdown("Please enter a question about the audio file.")

    try:
        conversation = [
            {
                "role": "user",
                "content": [
                    {
                        "type": "audio",
                        "path": audio_path,
                    },
                    {"type": "text", "text": question},
                ],
            }
        ]

        inputs = processor.apply_chat_template(conversation)
        inputs = inputs.to(device, dtype=torch.bfloat16)

        outputs = model.generate(**inputs, max_new_tokens=500)
        # Keep only the tokens generated after the prompt.
        decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
        result = decoded_outputs[0]
    except Exception as e:
        # The original messages said "translation" here (copy-paste slip).
        return "", gr.Markdown(f"❌ **Unexpected error during question answering:** {e}")

    return result, gr.Markdown("✅ **Success!** Answer generated.")
###

def disable_buttons():
    """Return three updates that make the three task buttons non-interactive."""
    locked = [gr.update(interactive=False) for _ in range(3)]
    return tuple(locked)

def enable_buttons():
    """Return three updates that make the three task buttons interactive again."""
    unlocked = [gr.update(interactive=True) for _ in range(3)]
    return tuple(unlocked)
###

def clear_audio():
    """Return four ``None`` values, one per output component to reset."""
    return (None,) * 4
###

@spaces.GPU
def voice_extract_demucs():
    """Run demucs on ``audio_file.wav`` and return the extracted vocals.

    Returns:
        ``(path, path, status)`` on success, where ``path`` is the vocals file
        under the ``demucs`` output folder, or ``(None, None, status)`` with
        an error message if anything raised.
    """
    # NOTE(review): this file defines `voice_extract_demucs` a second time
    # further down (without the decorator); that later definition rebinds the
    # name, so this copy is not the one the UI ends up calling.
    demucs_command = ["demucs", "--two-stems=vocals", "--out", "demucs", "audio_file.wav"]
    try:
        subprocess.run(demucs_command, check=True)
        vocals_file = os.path.join("demucs", "htdemucs", "audio_file", "vocals.wav")
        done_message = "✅ **Success!** Voice extracted."
        outcome = (vocals_file, vocals_file, gr.Markdown(done_message))
    except Exception as e:
        outcome = (None, None, gr.Markdown(f"❌ **Error:** An unexpected ERROR occurred: {e}"))
    return outcome

def secure_download_from_url(url: str):
    """Validate a URL, download the media it points to and convert it to WAV.

    The URL is checked syntactically, then a HEAD request is used to verify
    the advertised content type and size before the body is streamed to
    ``DOWNLOAD_DIR``. The download is converted to ``audio_file.wav`` and the
    intermediate file is always removed afterwards.

    Args:
        url: Address of an audio or video file.

    Returns:
        ``("audio_file.wav", "audio_file.wav", status)`` on success, or
        ``(None, None, status)`` on failure; ``status`` is a ``gr.Markdown``
        message.
    """
    # Step 1: Validate the URL format
    # NOTE(review): this is a syntax check only; it does not appear to reject
    # URLs that resolve to internal hosts — confirm whether that matters here.
    if not validators.url(url):
        return None, None, gr.Markdown("❌ **Error:** The provided URL is invalid.")

    filepath = None
    try:
        # Step 2: Send a HEAD request to check the headers without downloading the content
        # allow_redirects=True to follow redirects to the final file location.
        # timeout to avoid blocking requests.
        response = requests.head(url, allow_redirects=True, timeout=10)

        # Check if the request was successful (status code 2xx)
        response.raise_for_status()

        # Step 3: Validate the content type (MIME type)
        content_type = response.headers.get('Content-Type', '').split(';')[0].strip()
        if content_type not in ALLOWED_MIME_TYPES:
            error_message = (
                 f"❌ **Error:** The file type is not allowed.\n"
                 f" - **Type detected:** `{content_type}`\n"
                 f" - **Allowed types:** Audio and Video only."
            )
            return None, None, gr.Markdown(error_message)

        # Step 4: Validate the file size
        content_length = response.headers.get('Content-Length')
        if content_length and int(content_length) > MAX_FILE_SIZE:
            error_message = (
                f"❌ **Error:** The file is too large.\n"
                f" - **File size:** {int(content_length) / 1024 / 1024:.2f} MB\n"
                f" - **Maximum allowed size:** {MAX_FILE_SIZE / 1024 / 1024:.2f} MB"
            )
            return None, None, gr.Markdown(error_message)

        # Extract the file name from the URL
        filename = os.path.basename(urlparse(url).path)
        if filename in ("", ".", ".."):  # e.g. the URL ends with a '/'
            filename = "downloaded_media_file"
        filepath = os.path.join(DOWNLOAD_DIR, filename)

        # Steps 5-6: Secure streaming download, written in chunks to avoid
        # loading the whole file in memory.
        too_large = False
        with requests.get(url, stream=True, timeout=20) as r:
            r.raise_for_status()
            with open(filepath, 'wb') as f:
                downloaded_size = 0
                for chunk in r.iter_content(chunk_size=8192):
                    downloaded_size += len(chunk)
                    if downloaded_size > MAX_FILE_SIZE:
                        # Content-Length may be missing or wrong: enforce the
                        # limit on the bytes actually received.
                        too_large = True
                        break
                    f.write(chunk)
        if too_large:
            # The partial file is removed by the `finally` clause, once closed.
            return None, None, gr.Markdown("❌ **Error:** The file exceeds the maximum allowed size during download.")

        # --- Step 7: Convert to WAV using Pydub ---
        audio_file = AudioSegment.from_file(filepath)
        # export() returns the open output file; close it to avoid leaking it.
        audio_file.export("audio_file.wav", format="wav").close()

        success_message = (
            f"✅ **Success!** File downloaded and saved."
        )

        # Returns the file path and a success message.
        return "audio_file.wav", "audio_file.wav", gr.Markdown(success_message)

    except requests.exceptions.RequestException as e:
        # Handle network errors (timeout, DNS, connection refused, etc.)
        return None, None, gr.Markdown(f"❌ **Network error:** Unable to reach URL. Details: {e}")
    except Exception as e:
        # Handle other potential errors
        return None, None, gr.Markdown(f"❌ **Unexpected error:** {e}")
    finally:
        # --- Step 8: Clean up ---
        # Remove the intermediate download (complete or partial). The original
        # cleanup globbed the directory itself and therefore never deleted it.
        if filepath is not None:
            try:
                os.remove(filepath)
            except OSError:
                # Best effort: the file may never have been created.
                pass
###

def secure_download_youtube_audio(url: str):
    """Download the audio track of a YouTube video as ``audio_file.wav``.

    The URL is matched against a YouTube pattern, the video metadata is
    fetched without downloading, the best audio format is selected and its
    announced size checked, then yt-dlp downloads it and converts it to WAV.

    Args:
        url: Address of the YouTube video.

    Returns:
        ``("audio_file.wav", "audio_file.wav", status)`` on success, or
        ``(None, None, status)`` on failure; ``status`` is a ``gr.Markdown``
        message.
    """
    # --- Step 1: Validate URL format with Regex ---
    youtube_regex = re.compile(
        r'^(https?://)?(www\.)?(youtube|youtu|youtube-nocookie)\.(com|be)/'
        r'(watch\?v=|embed/|v/|.+\?v=)?([^&=%\?]{11})')
    if not youtube_regex.match(url or ""):
        # The original string lacked the `f` prefix and displayed "{url}" literally.
        return None, None, gr.Markdown(f"❌ **Error:** The URL '{url}' does not appear to be a valid YouTube URL.")

    try:
        # --- Step 2: Check video availability ---
        # noplaylist: a watch URL carrying a playlist id must resolve to the
        # single video, not to the whole playlist.
        ydl_info_opts = {'quiet': True, 'skip_download': True, 'noplaylist': True}
        try:
            with yt_dlp.YoutubeDL(ydl_info_opts) as ydl:
                info = ydl.extract_info(url, download=False)
        except yt_dlp.utils.DownloadError as e:
            return None, None, gr.Markdown(f"❌ **Error:** The video at URL '{url}' is unavailable ({str(e)})")

        # --- Step 3: Select best audio format ---
        # `.get` because some results (e.g. playlists) carry no 'formats' key.
        formats = [f for f in (info.get('formats') or []) if f.get('acodec') != 'none']
        if not formats:
            return None, None, gr.Markdown("❌ **Error:** No audio-only stream was found for this video.")

        # Prefer streams without a video track (smaller download); fall back
        # to any stream that carries audio.
        audio_only = [f for f in formats if f.get('vcodec') == 'none']
        candidates = audio_only or formats
        best_audio_format = max(candidates, key=lambda f: f.get('abr') or 0)

        # --- Step 4: Check file size BEFORE downloading ---
        filesize = best_audio_format.get('filesize') or best_audio_format.get('filesize_approx')
        if filesize is None:
            print("Could not determine file size before downloading.")
        elif filesize > MAX_FILE_SIZE:
            return None, None, gr.Markdown(
                f"❌ **Error:** The file is too large.\n"
                f" - **File size:** {filesize / 1024 / 1024:.2f} MB\n"
                f" - **Maximum allowed size:** {MAX_FILE_SIZE / 1024 / 1024:.2f} MB"
            )

        # --- Step 5: Download & convert directly to WAV ---
        ydl_opts = {
            'quiet': True,
            'noplaylist': True,
            'format': f"{best_audio_format['format_id']}",
            'outtmpl': "audio_file",  # the post-processor writes audio_file.wav
            'postprocessors': [{
                'key': 'FFmpegExtractAudio',
                'preferredcodec': 'wav',
                'preferredquality': '192',
            }],
        }
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            ydl.download([url])

        success_message = "✅ **Success!** Audio extracted and saved."
        return "audio_file.wav", "audio_file.wav", gr.Markdown(success_message)

    except FileNotFoundError:
        return None, None, gr.Markdown("❌ **Error:** FFmpeg not found. Please ensure it is installed and in your system's PATH.")
    except Exception as e:
        return None, None, gr.Markdown(f"❌ **Error:** An unexpected ERROR occurred: {e}")
###

def voice_extract_demucs(audio_path="audio_file.wav"):
    """Isolate the vocals of an audio file with the ``demucs`` command.

    Args:
        audio_path: File to process. Defaults to ``audio_file.wav``, the file
            written by the download functions, so existing zero-argument
            callers keep working.

    Returns:
        ``(path, path, status)`` on success, where ``path`` is the vocals file
        written under the ``demucs`` output folder, or ``(None, None, status)``
        on failure; ``status`` is a ``gr.Markdown`` message.
    """
    # NOTE(review): an earlier definition of this function in this file is
    # decorated with @spaces.GPU; this later one replaces it and is not
    # decorated — confirm which behavior is intended.
    if not audio_path or not os.path.isfile(audio_path):
        return None, None, gr.Markdown("❌ **Error:** No audio file to process. Please load an audio first.")

    try:
        cmd = [
            "demucs",
            "--two-stems=vocals",
            "--out", "demucs",
            audio_path
        ]
        subprocess.run(cmd, check=True)

        # demucs writes its result under <out>/<model>/<track name>/.
        track_name = os.path.splitext(os.path.basename(audio_path))[0]
        voice_path = os.path.join("demucs", "htdemucs", track_name, "vocals.wav")
        if not os.path.isfile(voice_path):
            return None, None, gr.Markdown(f"❌ **Error:** Expected output file not found: {voice_path}")

        success_message = "✅ **Success!** Voice extracted."
        return voice_path, voice_path, gr.Markdown(success_message)
    except FileNotFoundError:
        # Raised by subprocess when the executable itself cannot be found.
        return None, None, gr.Markdown("❌ **Error:** demucs not found. Please ensure it is installed and in your system's PATH.")
    except subprocess.CalledProcessError as e:
        return None, None, gr.Markdown(f"❌ **Error:** demucs failed with exit code {e.returncode}.")
    except Exception as e:
        return None, None, gr.Markdown(f"❌ **Error:** An unexpected ERROR occurred: {e}")
###

def clear_audio():
    """Return a 4-tuple of ``None``, one value per output component to reset."""
    # Same contract as the identically named function defined earlier in this file.
    return tuple(None for _ in range(4))
###



#### Gradio interface
# Root of the Gradio application; every component and event below is
# declared inside this context.
with gr.Blocks(title="Voxtral") as voxtral:
    # Header row: page title plus a collapsed accordion holding the feature list.
    with gr.Row():
        gr.Markdown("# **Voxtral Mini Evaluation**")

        with gr.Accordion("🔎 More on Voxtral", open=False):
            gr.Markdown("""## **Key Features:**



#### Voxtral builds upon Ministral-3B with powerful audio understanding capabilities.

##### - **Dedicated transcription mode**: Voxtral can operate in a pure speech transcription mode to maximize performance. By default, Voxtral automatically predicts the source audio language and transcribes the text accordingly

##### - **Long-form context**: With a 32k token context length, Voxtral handles audios up to 30 minutes for transcription, or 40 minutes for understanding

##### - **Built-in Q&A and summarization**: Supports asking questions directly through audio. Analyze audio and generate structured summaries without the need for separate ASR and language models

##### - **Natively multilingual**: Automatic language detection and state-of-the-art performance in the world’s most widely used languages (English, Spanish, French, Portuguese, Hindi, German, Dutch, Italian)

##### - **Function-calling straight from voice**: Enables direct triggering of backend functions, workflows, or API calls based on spoken user intents

##### - **Highly capable at text**: Retains the text understanding capabilities of its language model backbone, Ministral-3B""")

    # Short introduction displayed under the header.
    gr.Markdown("""#### Voxtral Mini is an enhancement of **Ministral 3B**, incorporating state-of-the-art audio input \

    capabilities while retaining best-in-class text performance. It excels at speech transcription, translation and \

    audio understanding. Available languages: English, Spanish, French, Portuguese, Hindi, German, Dutch, Italian.""")

    gr.Markdown("### **1.Choose the audio:**")
    # Path of the audio currently selected for processing; this state is the
    # audio input of the three task handlers (transcription, translation, chat).
    sel_audio = gr.State()

    def _extract_voice_from_upload(uploaded_path):
        """Run the voice extraction on the audio uploaded or recorded in the first tab."""
        if uploaded_path is None:
            return None, None, gr.Markdown("Please provide some input audio: either upload an audio file or use the microphone.")
        try:
            # voice_extract_demucs() works on "audio_file.wav": stage the
            # uploaded audio there first, otherwise a stale or missing file
            # would be processed. export() returns the open file; close it.
            AudioSegment.from_file(uploaded_path).export("audio_file.wav", format="wav").close()
        except Exception as e:
            return uploaded_path, uploaded_path, gr.Markdown(f"❌ **Unexpected error:** {e}")
        return voice_extract_demucs()

    with gr.Row():
        with gr.Tabs():
            with gr.Tab("From record or file upload"):
                gr.Markdown("### **Upload an audio file, record via microphone, or select a demo file:**")
                gr.Markdown("### *(Voxtral handles audios up to 30 minutes for transcription; if longer, it will be cut into chunks)*")
                sel_audio1 = gr.Audio(sources=["upload", "microphone"], type="filepath",
                                    label="Set an audio file to process it:")
                # The task handlers read `sel_audio`, not this component: keep
                # the state in sync whenever the audio is set, replaced or cleared.
                sel_audio1.change(
                    fn=lambda path: path,
                    inputs=sel_audio1,
                    outputs=sel_audio)
                example1 = [["mapo_tofu.mp3"]]
                gr.Examples(
                    examples=example1,
                    inputs=sel_audio1,
                    outputs=None,
                    fn=None,
                    cache_examples=False,
                    run_on_click=False
                )
                status_output1 = gr.Markdown()
                with gr.Row():
                    voice_button1 = gr.Button("Extract voice (if noisy environment)")
                    voice_button1.click(
                        fn=_extract_voice_from_upload,
                        inputs=sel_audio1,
                        outputs=[sel_audio, sel_audio1, status_output1])
                    clear_audio1 = gr.Button("Clear audio")
                    # Three distinct outputs (the original listed `sel_audio` twice).
                    clear_audio1.click(
                        fn=lambda: (None, None, None),
                        outputs=[sel_audio, sel_audio1, status_output1])

            with gr.Tab("From file url (audio or video file)"):
                gr.Markdown("### **Enter the url of the file (mp3, wav, mp4, ...):**")
                url_input2 = gr.Textbox(label="URL (MP3 or MP4 file)",
                                       placeholder="https://huggingface.co/datasets/merve/vlm_test_images/resolve/main/mapo_tofu.mp4")
                example2 = [["https://huggingface.co/datasets/merve/vlm_test_images/resolve/main/mapo_tofu.mp4"]]
                gr.Examples(
                    examples=example2,
                    inputs=url_input2,
                    outputs=None,
                    fn=None,
                    cache_examples=False,
                    run_on_click=False
                )
                download_button2 = gr.Button("Check and upload", variant="primary")
                input_audio2 = gr.Audio()
                status_output2 = gr.Markdown()
                download_button2.click(
                    fn=secure_download_from_url,
                    inputs=url_input2,
                    outputs=[input_audio2, sel_audio, status_output2]
                )
                with gr.Row():
                    voice_button2 = gr.Button("Extract voice (if noisy environment)")
                    voice_button2.click(
                        fn=voice_extract_demucs,
                        outputs=[input_audio2, sel_audio, status_output2])
                    clear_audio2 = gr.Button("Clear audio")
                    clear_audio2.click(
                        fn=clear_audio,
                        outputs=[sel_audio, url_input2, input_audio2, status_output2])

            with gr.Tab("From Youtube url:"):
                gr.Markdown("### **Enter the url of the Youtube video:**")
                url_input3 = gr.Textbox(label="Youtube url",
                                       placeholder="https://www.youtube.com/...")
                download_button3 = gr.Button("Check and upload", variant="primary")
                input_audio3 = gr.Audio()
                status_output3 = gr.Markdown()
                download_button3.click(
                    fn=secure_download_youtube_audio,
                    inputs=url_input3,
                    outputs=[input_audio3, sel_audio, status_output3]
                )
                with gr.Row():
                    voice_button3 = gr.Button("Extract voice (if noisy environment)")
                    voice_button3.click(
                        fn=voice_extract_demucs,
                        outputs=[input_audio3, sel_audio, status_output3])
                    clear_audio3 = gr.Button("Clear audio")
                    clear_audio3.click(
                        fn=clear_audio,
                        outputs=[sel_audio, url_input3, input_audio3, status_output3])

    with gr.Row():
        gr.Markdown("### **2. Choose one of these tasks:**")

    # One column per task; each holds its inputs, its trigger button, the
    # generated text and a status line.
    with gr.Row():
        with gr.Column():
            with gr.Accordion("📝 Transcription", open=True):
                sel_language = gr.Dropdown(
                    choices=list(dict_languages.keys()),
                    value="English",
                    label="Select the language of the audio file:"
                )
                submit_transcript = gr.Button("Extract transcription", variant="primary")
                text_transcript = gr.Textbox(label="💬 Generated transcription", lines=10)
                status_transcript = gr.Markdown()

        with gr.Column():
            with gr.Accordion("🔁 Translation", open=True):
                # The original built a language list without the transcription
                # language here but never used it; every supported language
                # stays selectable as a translation target.
                sel_translate_language = gr.Dropdown(
                    choices=list(dict_languages.keys()),
                    value="English",
                    label="Select the language for translation:"
                )
                submit_translate = gr.Button("Translate audio file", variant="primary")
                text_translate = gr.Textbox(label="💬 Generated translation", lines=10)
                status_translate = gr.Markdown()

        with gr.Column():
            with gr.Accordion("🤖 Ask audio file", open=True):
                question_chat = gr.Textbox(label="Enter your question about audio file:", placeholder="Enter your question about audio file")
                submit_chat = gr.Button("Ask audio file", variant="primary")
                example_chat = [["What is the subject of this audio file?"], ["Quels sont les ingrédients ?"]]
                gr.Examples(
                    examples=example_chat,
                    inputs=question_chat,
                    outputs=None,
                    fn=None,
                    cache_examples=False,
                    run_on_click=False
                )
                text_chat = gr.Textbox(label="💬 Model answer", lines=10)
                status_chat = gr.Markdown()

### Processing

    # Every task follows the same three-step chain: lock the three task
    # buttons, run the handler, then unlock the buttons.
    task_buttons = (submit_transcript, submit_translate, submit_chat)
    task_wiring = (
        # (trigger button, handler, handler inputs, handler outputs)
        # Transcription
        (submit_transcript, process_transcript,
         [sel_language, sel_audio], [text_transcript, status_transcript]),
        # Translation
        (submit_translate, process_translate,
         [sel_translate_language, sel_audio], [text_translate, status_translate]),
        # Chat
        (submit_chat, process_chat,
         [question_chat, sel_audio], [text_chat, status_chat]),
    )
    for trigger, handler, handler_inputs, handler_outputs in task_wiring:
        locked = trigger.click(
            disable_buttons,
            outputs=list(task_buttons),
            trigger_mode="once",
        )
        handled = locked.then(
            fn=handler,
            inputs=handler_inputs,
            outputs=handler_outputs
        )
        handled.then(
            enable_buttons,
            outputs=list(task_buttons),
        )

### Launch the app

if __name__ == "__main__":
    # Enable the request queue, then start the server in debug mode.
    queued_app = voxtral.queue()
    queued_app.launch(debug=True)