# modified from https://github.com/XiaomiMiMo/MiMo-VL/tree/main/app.py
import gradio as gr
import spaces

from infer import MiMoVLInfer

# infer = MiMoVLInfer(checkpoint_path="XiaomiMiMo/MiMo-VL-7B-RL")
infer = MiMoVLInfer(checkpoint_path="XiaomiMiMo/MiMo-VL-7B-RL-2508")

# UI strings keyed by component and language (only English is wired up).
label_translations = {
    "gr_chatinterface_ofl": {
        "English": "Chatbot",
    },
    "gr_chatinterface_ol": {
        "English": "Chatbot",
    },
    "gr_tab_ol": {
        "English": "Online",
    },
    "gr_tab_ofl": {
        "English": "Offline",
    },
    "gr_temperature": {
        "English": "Temperature",
    },
    "gr_webcam_image": {
        "English": "🤳 Open Webcam",
    },
    "gr_webcam_images": {
        "English": "📹 Recorded Frames",
    },
    "gr_chatinterface_ofl.textbox.placeholder": {
        "English": "Ask me anything. You can also drop in images and .mp4 videos.",
    },
    "gr_chatinterface_ol.textbox.placeholder": {
        "English": "Ask me anything...",
    },
}


@spaces.GPU(duration=180)
def offline_chat(gr_inputs: dict, gr_history: list, infer_history: list,
                 temperature: float):
    infer.to_device("cuda")
    try:
        # Send an immediate chunk so the user sees feedback while the GPU
        # is being reserved.
        yield [{
            "role": "assistant",
            "content": "⏳ Reserving GPU & preparing inference…"
        }], infer_history
        for response_text, infer_history in infer(inputs=gr_inputs,
                                                  history=infer_history,
                                                  temperature=temperature):
            # While the model is still inside a <think>...</think> block,
            # stream the reasoning under a collapsible "Thinking" title.
            if response_text.startswith('<think>') and '</think>' not in response_text:
                reasoning_text = response_text.removeprefix('<think>')
                response_message = [{
                    "role": "assistant",
                    "content": reasoning_text,
                    'metadata': {'title': '🤔 Thinking'}
                }]
                yield response_message, infer_history
            elif '<think>' in response_text and '</think>' in response_text:
                # Split the finished reasoning from the final answer.
                reasoning_text, response_text2 = response_text.split('</think>', 1)
                reasoning_text = reasoning_text.removeprefix('<think>')
                response_message = [{
                    "role": "assistant",
                    "content": reasoning_text,
                    'metadata': {'title': '🤔 Thinking'}
                }, {
                    "role": "assistant",
                    "content": response_text2
                }]
                yield response_message, infer_history
            else:
                yield [{"role": "assistant", "content": response_text}], infer_history
    finally:
        infer.to_device("cpu")


@spaces.GPU(duration=120)
def online_record_chat(text: str, gr_history: list, gr_webcam_images: list,
                       gr_counter: int, infer_history: list,
                       temperature: float):
    infer.to_device("cuda")
    try:
        if not gr_webcam_images:
            gr_webcam_images = []
        # Only feed the frames captured since the previous turn.
        gr_webcam_images = gr_webcam_images[gr_counter:]
        inputs = {'text': text, 'files': [webp for webp, _ in gr_webcam_images]}
        # Send an immediate chunk, then stream the actual response.
        yield (f'received {len(gr_webcam_images)} new frames, processing…',
               gr_counter + len(gr_webcam_images), infer_history)
        for response_message, infer_history in offline_chat(
                inputs, gr_history, infer_history, temperature):
            yield response_message, gr.skip(), infer_history
    finally:
        infer.to_device("cpu")


with gr.Blocks() as demo:
    gr.Markdown("""
MiMo-7b-VL
""") with gr.Column(): # gr_title = gr.Markdown('# MiMo-VL') with gr.Row(): gr_lang_selector = gr.Dropdown(choices=["English"], value="English", label="🌐 Interface", interactive=True, min_width=250, scale=0) with gr.Tabs(): with gr.Tab("Offline") as gr_tab_ofl: gr_infer_history = gr.State([]) gr_temperature_hidden = gr.Slider(minimum=0.0, maximum=2.0, step=0.1, value=1.0, interactive=True, visible=False) gr_chatinterface_ofl = gr.ChatInterface( fn=offline_chat, type="messages", multimodal=True, chatbot=gr.Chatbot(height=800), textbox=gr.MultimodalTextbox( file_count="multiple", file_types=["image", ".mp4"], sources=["upload"], stop_btn=True, placeholder=label_translations[ 'gr_chatinterface_ofl.textbox.placeholder']['English'], ), additional_inputs=[ gr_infer_history, gr_temperature_hidden ], additional_outputs=[gr_infer_history], ) gr.on(triggers=[gr_chatinterface_ofl.chatbot.clear], fn=lambda: [], outputs=[gr_infer_history]) with gr.Row(): with gr.Column(scale=1, min_width=200): gr_temperature_ofl = gr.Slider( minimum=0.0, maximum=2.0, step=0.1, value=0.4, label=label_translations['gr_temperature']['English'], interactive=True) gr_temperature_ofl.change(lambda x: x, inputs=gr_temperature_ofl, outputs=gr_temperature_hidden) with gr.Column(scale=8): with gr.Column(visible=True) as gr_examples_en: gr.Examples( examples=[ { "text": "Who are you?", "files": [] }, ], inputs=[gr_chatinterface_ofl.textbox], ) with gr.Tab("Online") as gr_tab_ol: with gr.Row(): with gr.Column(scale=1): gr_infer_history = gr.State([]) gr_temperature_hidden = gr.Slider(minimum=0.0, maximum=2.0, step=0.1, value=1.0, interactive=True, visible=False) with gr.Row(): with gr.Column(scale=1): gr_webcam_image = gr.Image( label=label_translations['gr_webcam_image'] ['English'], sources="webcam", height=250, type='filepath') gr_webcam_images = gr.Gallery( label=label_translations['gr_webcam_images'] ['English'], show_label=True, format='webp', columns=1, height=250, preview=True, interactive=False) gr_counter = gr.Number(value=0, visible=False) with gr.Column(scale=3): gr_chatinterface_ol = gr.ChatInterface( fn=online_record_chat, type="messages", multimodal=False, chatbot=gr.Chatbot(height=800), textbox=gr. 
                                        placeholder=label_translations[
                                            'gr_chatinterface_ol.textbox.placeholder']
                                        ['English'],
                                        submit_btn=True,
                                        stop_btn=True),
                                    additional_inputs=[
                                        gr_webcam_images, gr_counter,
                                        gr_infer_history, gr_temperature_hidden
                                    ],
                                    additional_outputs=[
                                        gr_counter, gr_infer_history
                                    ],
                                )

                                def cache_webcam(recorded_image: str,
                                                 recorded_images: list):
                                    if not recorded_images:
                                        recorded_images = []
                                    return recorded_images + [recorded_image]

                                # Append one webcam frame per second to the
                                # gallery while the webcam is open.
                                gr_webcam_image.stream(
                                    fn=cache_webcam,
                                    inputs=[gr_webcam_image, gr_webcam_images],
                                    outputs=[gr_webcam_images],
                                    stream_every=1,
                                    concurrency_limit=30,
                                )
                        with gr.Row():
                            gr_temperature_ol = gr.Slider(
                                minimum=0.0,
                                maximum=2.0,
                                step=0.1,
                                value=0.4,
                                label=label_translations['gr_temperature']
                                ['English'],
                                interactive=True)
                            gr_temperature_ol.change(
                                lambda x: x,
                                inputs=gr_temperature_ol,
                                outputs=gr_temperature_hidden)

    def update_lang(lang: str):
        # Exactly one update per output component wired below; only the
        # English examples column exists in this build, so it gets a single
        # visibility update.
        return (
            gr.update(label=label_translations['gr_chatinterface_ofl'][lang]),
            gr.update(label=label_translations['gr_chatinterface_ol'][lang]),
            gr.update(placeholder=label_translations[
                'gr_chatinterface_ofl.textbox.placeholder'][lang]),
            gr.update(placeholder=label_translations[
                'gr_chatinterface_ol.textbox.placeholder'][lang]),
            gr.update(label=label_translations['gr_tab_ofl'][lang]),
            gr.update(label=label_translations['gr_tab_ol'][lang]),
            gr.update(label=label_translations['gr_temperature'][lang]),
            gr.update(label=label_translations['gr_temperature'][lang]),
            gr.update(visible=lang == 'English'),
            gr.update(label=label_translations['gr_webcam_image'][lang]),
            gr.update(label=label_translations['gr_webcam_images'][lang]),
        )

    gr_lang_selector.change(fn=update_lang,
                            inputs=[gr_lang_selector],
                            outputs=[
                                gr_chatinterface_ofl.chatbot,
                                gr_chatinterface_ol.chatbot,
                                gr_chatinterface_ofl.textbox,
                                gr_chatinterface_ol.textbox,
                                gr_tab_ofl,
                                gr_tab_ol,
                                gr_temperature_ofl,
                                gr_temperature_ol,
                                gr_examples_en,
                                gr_webcam_image,
                                gr_webcam_images,
                            ])

demo.queue(default_concurrency_limit=2, max_size=50)

if __name__ == "__main__":
    demo.launch()
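# ---------------------------------------------------------------------------
# Note: `infer.py` is not included here. From the call sites above, this app
# assumes `MiMoVLInfer` exposes roughly the interface sketched below. This is
# an illustrative stub inferred from usage, not the actual implementation:
#
#   class MiMoVLInfer:
#       def __init__(self, checkpoint_path: str) -> None: ...
#
#       def to_device(self, device: str) -> None:
#           """Move the model between 'cuda' (per turn) and 'cpu' (idle)."""
#
#       def __call__(self, inputs: dict, history: list, temperature: float):
#           """Streaming generator yielding (partial_text, updated_history);
#           `inputs` is {'text': str, 'files': list[str]}, and partial_text
#           may contain <think>...</think> reasoning segments."""
# ---------------------------------------------------------------------------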