jake committed on
Commit e7b4b89 · 1 Parent(s): d3b2d63

change app

Files changed (1)
  1. app.py +745 -34
app.py CHANGED
@@ -1,7 +1,198 @@
1
  """
2
- Gradio Space entrypoint mirroring `MMaDA/inference/gradio_multimodal_demo_inst.py`.
3
- It downloads the published checkpoint once via huggingface_hub, wires it into
4
- OmadaDemo, and launches the existing Blocks UI.
5
 
6
  Environment overrides:
7
  MODEL_REPO_ID (default: jaeikkim/AIDAS-Omni-Modal-Diffusion)
@@ -12,33 +203,42 @@ Environment overrides:
12
  STYLE_REVISION (default: main)
13
  HF_TOKEN (optional, for private model/dataset)
14
  TRAIN_CONFIG_PATH (default: MMaDA/inference/demo/demo.yaml)
15
- DEVICE (default: auto cuda/cpu)
16
- PORT (default: 7860; Space sets this)
17
  """
18
 
19
  import os
20
  import sys
21
  import subprocess
22
  import importlib
23
- import spaces
24
  from pathlib import Path
25
 
26
  from packaging.version import parse as parse_version
27
 
28
- # Ensure local project is importable
29
  PROJECT_ROOT = Path(__file__).resolve().parent
30
  MMADA_ROOT = PROJECT_ROOT / "MMaDA"
31
  if str(MMADA_ROOT) not in sys.path:
32
  sys.path.insert(0, str(MMADA_ROOT))
 
33
  EMOVA_ROOT = PROJECT_ROOT / "EMOVA_speech_tokenizer"
34
  if str(EMOVA_ROOT) not in sys.path:
35
  sys.path.insert(0, str(EMOVA_ROOT))
36
 
37
 
38
  def ensure_hf_hub(target: str = "0.36.0"):
39
  """
40
  Make sure huggingface_hub stays <1.0 to satisfy transformers/tokenizers.
41
- The Space base image installs gradio which may upgrade it to 1.x; we downgrade here.
 
42
  """
43
  try:
44
  import huggingface_hub as hub
@@ -53,6 +253,7 @@ def ensure_hf_hub(target: str = "0.36.0"):
53
  [sys.executable, "-m", "pip", "install", f"huggingface-hub=={target}", "--no-cache-dir"]
54
  )
55
  hub = importlib.reload(hub)
 
56
  # Backfill missing constants in older hub versions to avoid AttributeError.
57
  try:
58
  import huggingface_hub.constants as hub_consts # type: ignore
@@ -65,9 +266,22 @@ def ensure_hf_hub(target: str = "0.36.0"):
65
 
66
  snapshot_download = ensure_hf_hub().snapshot_download
67
 
68
- from inference.gradio_multimodal_demo_inst import OmadaDemo, build_demo # noqa: E402
69
 
70
 
71
  def download_assets() -> Path:
72
  """Download demo assets (logo + sample prompts/media) and return the root path."""
73
  repo_id = os.getenv("ASSET_REPO_ID", "jaeikkim/AIDAS-Omni-Modal-Diffusion-assets")
@@ -127,25 +341,39 @@ def download_checkpoint() -> Path:
127
  )
128
  )
129
 
130
- # If snapshot itself is unwrapped_model, return it; otherwise point a symlink to it.
 
131
  if snapshot_path.name == "unwrapped_model":
132
  return snapshot_path
 
133
  nested = snapshot_path / "unwrapped_model"
134
  if nested.is_dir():
135
  return nested
 
136
  aliased = snapshot_path.parent / "unwrapped_model"
137
  if not aliased.exists():
138
  aliased.symlink_to(snapshot_path, target_is_directory=True)
139
  return aliased
140
 
141
 
142
- @spaces.GPU
143
- def main():
144
- checkpoint_dir = download_checkpoint()
145
  asset_root = download_assets()
146
  style_root = download_style()
147
 
148
- # Symlink style centroid npy files to expected locations
149
  style_targets = [
150
  MMADA_ROOT / "models" / "speech_tokenization" / "condition_style_centroid",
151
  PROJECT_ROOT
@@ -155,33 +383,516 @@ def main():
155
  / "condition_style_centroid",
156
  ]
157
  for starget in style_targets:
158
- if starget.exists():
159
- continue
160
- starget.parent.mkdir(parents=True, exist_ok=True)
161
- starget.symlink_to(style_root, target_is_directory=True)
162
-
163
- # Point demo assets (logo, sample prompts/media) to the downloaded dataset
164
- from inference import gradio_multimodal_demo_inst as demo_mod # noqa: WPS433
165
-
166
- demo_root = asset_root / "demo"
167
- demo_mod.DEMO_ROOT = demo_root
168
- demo_mod.LOGO_PATH = demo_root / "logo.png"
169
- demo_mod.T2S_TEXT_PATH = demo_root / "t2s" / "text.txt"
170
- demo_mod.CHAT_TEXT_PATH = demo_root / "chat" / "text.txt"
171
- demo_mod.T2I_TEXT_PATH = demo_root / "t2i" / "text.txt"
172
 
 
173
  default_cfg = PROJECT_ROOT / "MMaDA" / "inference" / "demo" / "demo.yaml"
174
  legacy_cfg = PROJECT_ROOT / "MMaDA" / "configs" / "mmada_demo.yaml"
175
  train_config = os.getenv("TRAIN_CONFIG_PATH")
176
  if not train_config:
177
- # Prefer configs/mmada_demo.yaml (in repo), fallback to legacy path if restored.
178
  train_config = str(default_cfg if default_cfg.exists() else legacy_cfg)
179
- device = os.getenv("DEVICE")
180
- port = int(os.getenv("PORT", "7860"))
181
 
182
- app = OmadaDemo(train_config=train_config, checkpoint=str(checkpoint_dir), device=device)
183
- build_demo(app, share=False, server_name="0.0.0.0", server_port=port)
184
 
185
 
186
  if __name__ == "__main__":
187
- main()
 
 
1
+ # """
2
+ # Gradio Space entrypoint mirroring `MMaDA/inference/gradio_multimodal_demo_inst.py`.
3
+ # It downloads the published checkpoint once via huggingface_hub, wires it into
4
+ # OmadaDemo, and launches the existing Blocks UI.
5
+
6
+ # Environment overrides:
7
+ # MODEL_REPO_ID (default: jaeikkim/AIDAS-Omni-Modal-Diffusion)
8
+ # MODEL_REVISION (default: main)
9
+ # ASSET_REPO_ID (default: jaeikkim/AIDAS-Omni-Modal-Diffusion-assets)
10
+ # ASSET_REVISION (default: main)
11
+ # STYLE_REPO_ID (default: jaeikkim/aidas-style-centroid)
12
+ # STYLE_REVISION (default: main)
13
+ # HF_TOKEN (optional, for private model/dataset)
14
+ # TRAIN_CONFIG_PATH (default: MMaDA/inference/demo/demo.yaml)
15
+ # DEVICE (default: auto cuda/cpu)
16
+ # PORT (default: 7860; Space sets this)
17
+ # """
18
+
19
+ # import os
20
+ # import sys
21
+ # import subprocess
22
+ # import importlib
23
+ # import spaces
24
+ # from pathlib import Path
25
+
26
+ # from packaging.version import parse as parse_version
27
+
28
+ # # Ensure local project is importable
29
+ # PROJECT_ROOT = Path(__file__).resolve().parent
30
+ # MMADA_ROOT = PROJECT_ROOT / "MMaDA"
31
+ # if str(MMADA_ROOT) not in sys.path:
32
+ # sys.path.insert(0, str(MMADA_ROOT))
33
+ # EMOVA_ROOT = PROJECT_ROOT / "EMOVA_speech_tokenizer"
34
+ # if str(EMOVA_ROOT) not in sys.path:
35
+ # sys.path.insert(0, str(EMOVA_ROOT))
36
+
37
+
38
+ # def ensure_hf_hub(target: str = "0.36.0"):
39
+ # """
40
+ # Make sure huggingface_hub stays <1.0 to satisfy transformers/tokenizers.
41
+ # The Space base image installs gradio which may upgrade it to 1.x; we downgrade here.
42
+ # """
43
+ # try:
44
+ # import huggingface_hub as hub
45
+ # except ImportError:
46
+ # subprocess.check_call(
47
+ # [sys.executable, "-m", "pip", "install", f"huggingface-hub=={target}", "--no-cache-dir"]
48
+ # )
49
+ # import huggingface_hub as hub
50
+
51
+ # if parse_version(hub.__version__) >= parse_version("1.0.0"):
52
+ # subprocess.check_call(
53
+ # [sys.executable, "-m", "pip", "install", f"huggingface-hub=={target}", "--no-cache-dir"]
54
+ # )
55
+ # hub = importlib.reload(hub)
56
+ # # Backfill missing constants in older hub versions to avoid AttributeError.
57
+ # try:
58
+ # import huggingface_hub.constants as hub_consts # type: ignore
59
+ # except Exception:
60
+ # hub_consts = None
61
+ # if hub_consts and not hasattr(hub_consts, "HF_HUB_ENABLE_HF_TRANSFER"):
62
+ # setattr(hub_consts, "HF_HUB_ENABLE_HF_TRANSFER", False)
63
+ # return hub
64
+
65
+
66
+ # snapshot_download = ensure_hf_hub().snapshot_download
67
+
68
+ # from inference.gradio_multimodal_demo_inst import OmadaDemo, build_demo # noqa: E402
69
+
70
+
71
+ # def download_assets() -> Path:
72
+ # """Download demo assets (logo + sample prompts/media) and return the root path."""
73
+ # repo_id = os.getenv("ASSET_REPO_ID", "jaeikkim/AIDAS-Omni-Modal-Diffusion-assets")
74
+ # revision = os.getenv("ASSET_REVISION", "main")
75
+ # token = os.getenv("HF_TOKEN")
76
+ # cache_dir = PROJECT_ROOT / "_asset_cache"
77
+ # cache_dir.mkdir(parents=True, exist_ok=True)
78
+
79
+ # return Path(
80
+ # snapshot_download(
81
+ # repo_id=repo_id,
82
+ # revision=revision,
83
+ # repo_type="dataset",
84
+ # local_dir=cache_dir,
85
+ # local_dir_use_symlinks=False,
86
+ # token=token,
87
+ # )
88
+ # )
89
+
90
+
91
+ # def download_style() -> Path:
92
+ # """Download style centroid dataset and return the root path."""
93
+ # repo_id = os.getenv("STYLE_REPO_ID", "jaeikkim/aidas-style-centroid")
94
+ # revision = os.getenv("STYLE_REVISION", "main")
95
+ # token = os.getenv("HF_TOKEN")
96
+ # cache_dir = PROJECT_ROOT / "_style_cache"
97
+ # cache_dir.mkdir(parents=True, exist_ok=True)
98
+
99
+ # return Path(
100
+ # snapshot_download(
101
+ # repo_id=repo_id,
102
+ # revision=revision,
103
+ # repo_type="dataset",
104
+ # local_dir=cache_dir,
105
+ # local_dir_use_symlinks=False,
106
+ # token=token,
107
+ # )
108
+ # )
109
+
110
+
111
+ # def download_checkpoint() -> Path:
112
+ # """Download checkpoint snapshot and return an `unwrapped_model` directory."""
113
+ # repo_id = os.getenv("MODEL_REPO_ID", "jaeikkim/AIDAS-Omni-Modal-Diffusion")
114
+ # revision = os.getenv("MODEL_REVISION", "main")
115
+ # token = os.getenv("HF_TOKEN")
116
+ # cache_dir = PROJECT_ROOT / "_ckpt_cache"
117
+ # cache_dir.mkdir(parents=True, exist_ok=True)
118
+
119
+ # snapshot_path = Path(
120
+ # snapshot_download(
121
+ # repo_id=repo_id,
122
+ # revision=revision,
123
+ # repo_type="model",
124
+ # local_dir=cache_dir,
125
+ # local_dir_use_symlinks=False,
126
+ # token=token,
127
+ # )
128
+ # )
129
+
130
+ # # If snapshot itself is unwrapped_model, return it; otherwise point a symlink to it.
131
+ # if snapshot_path.name == "unwrapped_model":
132
+ # return snapshot_path
133
+ # nested = snapshot_path / "unwrapped_model"
134
+ # if nested.is_dir():
135
+ # return nested
136
+ # aliased = snapshot_path.parent / "unwrapped_model"
137
+ # if not aliased.exists():
138
+ # aliased.symlink_to(snapshot_path, target_is_directory=True)
139
+ # return aliased
140
+
141
+
142
+ # @spaces.GPU
143
+ # def main():
144
+ # checkpoint_dir = download_checkpoint()
145
+ # asset_root = download_assets()
146
+ # style_root = download_style()
147
+
148
+ # # Symlink style centroid npy files to expected locations
149
+ # style_targets = [
150
+ # MMADA_ROOT / "models" / "speech_tokenization" / "condition_style_centroid",
151
+ # PROJECT_ROOT
152
+ # / "EMOVA_speech_tokenizer"
153
+ # / "emova_speech_tokenizer"
154
+ # / "speech_tokenization"
155
+ # / "condition_style_centroid",
156
+ # ]
157
+ # for starget in style_targets:
158
+ # if starget.exists():
159
+ # continue
160
+ # starget.parent.mkdir(parents=True, exist_ok=True)
161
+ # starget.symlink_to(style_root, target_is_directory=True)
162
+
163
+ # # Point demo assets (logo, sample prompts/media) to the downloaded dataset
164
+ # from inference import gradio_multimodal_demo_inst as demo_mod # noqa: WPS433
165
+
166
+ # demo_root = asset_root / "demo"
167
+ # demo_mod.DEMO_ROOT = demo_root
168
+ # demo_mod.LOGO_PATH = demo_root / "logo.png"
169
+ # demo_mod.T2S_TEXT_PATH = demo_root / "t2s" / "text.txt"
170
+ # demo_mod.CHAT_TEXT_PATH = demo_root / "chat" / "text.txt"
171
+ # demo_mod.T2I_TEXT_PATH = demo_root / "t2i" / "text.txt"
172
+
173
+ # default_cfg = PROJECT_ROOT / "MMaDA" / "inference" / "demo" / "demo.yaml"
174
+ # legacy_cfg = PROJECT_ROOT / "MMaDA" / "configs" / "mmada_demo.yaml"
175
+ # train_config = os.getenv("TRAIN_CONFIG_PATH")
176
+ # if not train_config:
177
+ # # Prefer configs/mmada_demo.yaml (in repo), fallback to legacy path if restored.
178
+ # train_config = str(default_cfg if default_cfg.exists() else legacy_cfg)
179
+ # device = os.getenv("DEVICE")
180
+ # port = int(os.getenv("PORT", "7860"))
181
+
182
+ # app = OmadaDemo(train_config=train_config, checkpoint=str(checkpoint_dir), device=device)
183
+ # build_demo(app, share=False, server_name="0.0.0.0", server_port=port)
184
+
185
+
186
+ # if __name__ == "__main__":
187
+ # main()
188
+
189
  """
190
+ ZeroGPU-friendly Gradio entrypoint for OMada demo.
191
+
192
+ - Downloads checkpoint + assets + style centroids from Hugging Face Hub
193
+ - Instantiates OmadaDemo once (global)
194
+ - Exposes 10 modalities via Gradio tabs
195
+ - Uses @spaces.GPU only on inference handlers so GPU is allocated per request
196
 
197
  Environment overrides:
198
  MODEL_REPO_ID (default: jaeikkim/AIDAS-Omni-Modal-Diffusion)
 
203
  STYLE_REVISION (default: main)
204
  HF_TOKEN (optional, for private model/dataset)
205
  TRAIN_CONFIG_PATH (default: MMaDA/inference/demo/demo.yaml)
206
+ DEVICE (default: cuda)
 
207
  """
208
 
209
  import os
210
  import sys
211
  import subprocess
212
  import importlib
 
213
  from pathlib import Path
214
 
215
+ import gradio as gr
216
+ import spaces
217
  from packaging.version import parse as parse_version
218
 
219
+ # ---------------------------
220
+ # Project roots & sys.path
221
+ # ---------------------------
222
+
223
  PROJECT_ROOT = Path(__file__).resolve().parent
224
  MMADA_ROOT = PROJECT_ROOT / "MMaDA"
225
  if str(MMADA_ROOT) not in sys.path:
226
  sys.path.insert(0, str(MMADA_ROOT))
227
+
228
  EMOVA_ROOT = PROJECT_ROOT / "EMOVA_speech_tokenizer"
229
  if str(EMOVA_ROOT) not in sys.path:
230
  sys.path.insert(0, str(EMOVA_ROOT))
231
 
232
 
233
+ # ---------------------------
234
+ # HuggingFace Hub helper
235
+ # ---------------------------
236
+
237
  def ensure_hf_hub(target: str = "0.36.0"):
238
  """
239
  Make sure huggingface_hub stays <1.0 to satisfy transformers/tokenizers.
240
+
241
+ The Spaces base image may pull in a newer version via gradio, so we pin it.
242
  """
243
  try:
244
  import huggingface_hub as hub
 
253
  [sys.executable, "-m", "pip", "install", f"huggingface-hub=={target}", "--no-cache-dir"]
254
  )
255
  hub = importlib.reload(hub)
256
+
257
  # Backfill missing constants in older hub versions to avoid AttributeError.
258
  try:
259
  import huggingface_hub.constants as hub_consts # type: ignore
 
266
 
267
  snapshot_download = ensure_hf_hub().snapshot_download
268
 
269
+
270
+ # ---------------------------
271
+ # Imports from OMada demo
272
+ # ---------------------------
273
+
274
+ from inference.gradio_multimodal_demo_inst import ( # noqa: E402
275
+ OmadaDemo,
276
+ CUSTOM_CSS,
277
+ FORCE_LIGHT_MODE_JS,
278
+ )
279
 
280
 
281
+ # ---------------------------
282
+ # HF download helpers
283
+ # ---------------------------
284
+
285
  def download_assets() -> Path:
286
  """Download demo assets (logo + sample prompts/media) and return the root path."""
287
  repo_id = os.getenv("ASSET_REPO_ID", "jaeikkim/AIDAS-Omni-Modal-Diffusion-assets")
 
341
  )
342
  )
343
 
344
+ # If snapshot itself is unwrapped_model, return it; otherwise look for nested dir,
345
+ # and finally alias via symlink.
346
  if snapshot_path.name == "unwrapped_model":
347
  return snapshot_path
348
+
349
  nested = snapshot_path / "unwrapped_model"
350
  if nested.is_dir():
351
  return nested
352
+
353
  aliased = snapshot_path.parent / "unwrapped_model"
354
  if not aliased.exists():
355
  aliased.symlink_to(snapshot_path, target_is_directory=True)
356
  return aliased
357
 
358
 
359
+ # ---------------------------
360
+ # Global OmadaDemo instance
361
+ # ---------------------------
362
+
363
+ APP = None # type: ignore
364
+
365
+
366
+ def get_app() -> OmadaDemo:
367
+ global APP
368
+ if APP is not None:
369
+ return APP
370
+
371
+ # Download everything once
372
+ ckpt_dir = download_checkpoint()
373
  asset_root = download_assets()
374
  style_root = download_style()
375
 
376
+ # Wire style centroids to expected locations
377
  style_targets = [
378
  MMADA_ROOT / "models" / "speech_tokenization" / "condition_style_centroid",
379
  PROJECT_ROOT
 
383
  / "condition_style_centroid",
384
  ]
385
  for starget in style_targets:
386
+ if not starget.exists():
387
+ starget.parent.mkdir(parents=True, exist_ok=True)
388
+ starget.symlink_to(style_root, target_is_directory=True)
389
 
390
+ # Choose train config
391
  default_cfg = PROJECT_ROOT / "MMaDA" / "inference" / "demo" / "demo.yaml"
392
  legacy_cfg = PROJECT_ROOT / "MMaDA" / "configs" / "mmada_demo.yaml"
393
  train_config = os.getenv("TRAIN_CONFIG_PATH")
394
  if not train_config:
 
395
  train_config = str(default_cfg if default_cfg.exists() else legacy_cfg)
 
 
396
 
397
+ # Device: in ZeroGPU environment, "cuda" is virtualized and only actually
398
+ # attached inside @spaces.GPU handlers.
399
+ device = os.getenv("DEVICE", "cuda")
400
+
401
+ APP = OmadaDemo(train_config=train_config, checkpoint=str(ckpt_dir), device=device)
402
+ return APP
403
+
404
+
405
+ # ---------------------------
406
+ # ZeroGPU-wrapped handlers
407
+ # ---------------------------
408
+
409
+ @spaces.GPU
410
+ def t2s_handler(
411
+ text,
412
+ max_tokens,
413
+ steps,
414
+ block_len,
415
+ temperature,
416
+ cfg_scale,
417
+ gender,
418
+ emotion,
419
+ speed,
420
+ pitch,
421
+ ):
422
+ app = get_app()
423
+ audio, status = app.run_t2s(
424
+ text=text,
425
+ max_new_tokens=int(max_tokens),
426
+ steps=int(steps),
427
+ block_length=int(block_len),
428
+ temperature=float(temperature),
429
+ cfg_scale=float(cfg_scale),
430
+ gender_choice=gender,
431
+ emotion_choice=emotion,
432
+ speed_choice=speed,
433
+ pitch_choice=pitch,
434
+ )
435
+ return audio, status
436
+
437
+
438
+ @spaces.GPU
439
+ def s2s_handler(
440
+ audio_path,
441
+ max_tokens,
442
+ steps,
443
+ block_len,
444
+ temperature,
445
+ cfg_scale,
446
+ ):
447
+ app = get_app()
448
+ audio, status = app.run_s2s(
449
+ audio_path=audio_path,
450
+ max_new_tokens=int(max_tokens),
451
+ steps=int(steps),
452
+ block_length=int(block_len),
453
+ temperature=float(temperature),
454
+ cfg_scale=float(cfg_scale),
455
+ )
456
+ return audio, status
457
+
458
+
459
+ @spaces.GPU
460
+ def s2t_handler(
461
+ audio_path,
462
+ steps,
463
+ block_len,
464
+ max_tokens,
465
+ remasking,
466
+ ):
467
+ app = get_app()
468
+ text, status = app.run_s2t(
469
+ audio_path=audio_path,
470
+ steps=int(steps),
471
+ block_length=int(block_len),
472
+ max_new_tokens=int(max_tokens),
473
+ remasking=str(remasking),
474
+ )
475
+ return text, status
476
+
477
+
478
+ @spaces.GPU
479
+ def v2t_handler(
480
+ video,
481
+ steps,
482
+ block_len,
483
+ max_tokens,
484
+ ):
485
+ app = get_app()
486
+ text, status = app.run_v2t(
487
+ video_path=video,
488
+ steps=int(steps),
489
+ block_length=int(block_len),
490
+ max_new_tokens=int(max_tokens),
491
+ )
492
+ return text, status
493
+
494
+
495
+ @spaces.GPU
496
+ def v2s_handler(
497
+ video,
498
+ message,
499
+ max_tokens,
500
+ steps,
501
+ block_len,
502
+ temperature,
503
+ cfg_scale,
504
+ ):
505
+ app = get_app()
506
+ audio, status = app.run_v2s(
507
+ video_path=video,
508
+ message=message,
509
+ max_new_tokens=int(max_tokens),
510
+ steps=int(steps),
511
+ block_length=int(block_len),
512
+ temperature=float(temperature),
513
+ cfg_scale=float(cfg_scale),
514
+ )
515
+ return audio, status
516
+
517
+
518
+ @spaces.GPU
519
+ def i2s_handler(
520
+ image,
521
+ message,
522
+ max_tokens,
523
+ steps,
524
+ block_len,
525
+ temperature,
526
+ cfg_scale,
527
+ ):
528
+ app = get_app()
529
+ audio, status = app.run_i2s(
530
+ image=image,
531
+ message=message,
532
+ max_new_tokens=int(max_tokens),
533
+ steps=int(steps),
534
+ block_length=int(block_len),
535
+ temperature=float(temperature),
536
+ cfg_scale=float(cfg_scale),
537
+ )
538
+ return audio, status
539
+
540
+
541
+ @spaces.GPU
542
+ def chat_handler(
543
+ message,
544
+ max_tokens,
545
+ steps,
546
+ block_len,
547
+ temperature,
548
+ ):
549
+ app = get_app()
550
+ text, status = app.run_chat(
551
+ message=message,
552
+ max_new_tokens=int(max_tokens),
553
+ steps=int(steps),
554
+ block_length=int(block_len),
555
+ temperature=float(temperature),
556
+ )
557
+ return text, status
558
+
559
+
560
+ @spaces.GPU
561
+ def mmu_handler(
562
+ image_a,
563
+ image_b,
564
+ question,
565
+ max_tokens,
566
+ steps,
567
+ block_len,
568
+ temperature,
569
+ ):
570
+ app = get_app()
571
+ text, status = app.run_mmu_dual(
572
+ image_a=image_a,
573
+ image_b=image_b,
574
+ message=question,
575
+ max_new_tokens=int(max_tokens),
576
+ steps=int(steps),
577
+ block_length=int(block_len),
578
+ temperature=float(temperature),
579
+ )
580
+ return text, status
581
+
582
+
583
+ @spaces.GPU
584
+ def t2i_handler(
585
+ prompt,
586
+ timesteps,
587
+ temperature,
588
+ guidance,
589
+ ):
590
+ app = get_app()
591
+ image, status = app.run_t2i(
592
+ prompt=prompt,
593
+ timesteps=int(timesteps),
594
+ temperature=float(temperature),
595
+ guidance_scale=float(guidance),
596
+ )
597
+ return image, status
598
+
599
+
600
+ @spaces.GPU
601
+ def i2i_handler(
602
+ instruction,
603
+ image,
604
+ timesteps,
605
+ temperature,
606
+ guidance,
607
+ ):
608
+ app = get_app()
609
+ image_out, status = app.run_i2i(
610
+ instruction=instruction,
611
+ source_image=image,
612
+ timesteps=int(timesteps),
613
+ temperature=float(temperature),
614
+ guidance_scale=float(guidance),
615
+ )
616
+ return image_out, status
617
+
618
+
619
+ # ---------------------------
620
+ # Gradio UI (10 tabs)
621
+ # ---------------------------
622
+
623
+ theme = gr.themes.Soft(primary_hue="blue", neutral_hue="gray")
624
+
625
+ with gr.Blocks(
626
+ title="AIDAS Lab @ SNU - OMni-modal Diffusion (ZeroGPU)",
627
+ css=CUSTOM_CSS,
628
+ theme=theme,
629
+ js=FORCE_LIGHT_MODE_JS,
630
+ ) as demo:
631
+ gr.Markdown(
632
+ "## Omni-modal Diffusion Foundation Model\n"
633
+ "### ZeroGPU-compatible demo (AIDAS Lab @ SNU)"
634
+ )
635
+
636
+ with gr.Tab("Text β†’ Speech (T2S)"):
637
+ with gr.Row():
638
+ t2s_text = gr.Textbox(
639
+ label="Input text",
640
+ lines=4,
641
+ placeholder="Type the speech you want to synthesize...",
642
+ )
643
+ t2s_audio = gr.Audio(label="Generated speech", type="numpy")
644
+ t2s_status = gr.Textbox(label="Status", interactive=False)
645
+ with gr.Accordion("Advanced settings", open=False):
646
+ t2s_max_tokens = gr.Slider(2, 512, value=384, step=2, label="Speech token length")
647
+ t2s_steps = gr.Slider(2, 512, value=128, step=2, label="Total refinement steps")
648
+ t2s_block = gr.Slider(2, 512, value=128, step=2, label="Block length")
649
+ t2s_temperature = gr.Slider(0.0, 2.0, value=1.0, step=0.05, label="Sampling temperature")
650
+ t2s_cfg = gr.Slider(0.0, 6.0, value=3.5, step=0.1, label="CFG scale")
651
+ with gr.Row():
652
+ t2s_gender = gr.Dropdown(["random", "female", "male"], value="random", label="Gender")
653
+ t2s_emotion = gr.Dropdown(["random", "angry", "happy", "neutral", "sad"], value="random", label="Emotion")
654
+ with gr.Row():
655
+ t2s_speed = gr.Dropdown(["random", "normal", "fast", "slow"], value="random", label="Speed")
656
+ t2s_pitch = gr.Dropdown(["random", "normal", "high", "low"], value="random", label="Pitch")
657
+ t2s_btn = gr.Button("Generate speech", variant="primary")
658
+ t2s_btn.click(
659
+ t2s_handler,
660
+ inputs=[
661
+ t2s_text,
662
+ t2s_max_tokens,
663
+ t2s_steps,
664
+ t2s_block,
665
+ t2s_temperature,
666
+ t2s_cfg,
667
+ t2s_gender,
668
+ t2s_emotion,
669
+ t2s_speed,
670
+ t2s_pitch,
671
+ ],
672
+ outputs=[t2s_audio, t2s_status],
673
+ )
674
+
675
+ with gr.Tab("Speech β†’ Speech (S2S)"):
676
+ s2s_audio_in = gr.Audio(type="filepath", label="Source speech", sources=["microphone", "upload"])
677
+ s2s_audio_out = gr.Audio(type="numpy", label="Reply speech")
678
+ s2s_status = gr.Textbox(label="Status", interactive=False)
679
+ with gr.Accordion("Advanced settings", open=False):
680
+ s2s_max_tokens = gr.Slider(2, 512, value=256, step=2, label="Reply token length")
681
+ s2s_steps = gr.Slider(2, 512, value=128, step=2, label="Refinement steps")
682
+ s2s_block = gr.Slider(2, 512, value=128, step=2, label="Block length")
683
+ s2s_temperature = gr.Slider(0.0, 2.0, value=0.0, step=0.05, label="Sampling temperature")
684
+ s2s_cfg = gr.Slider(0.0, 6.0, value=4.0, step=0.1, label="CFG scale")
685
+ s2s_btn = gr.Button("Generate reply speech", variant="primary")
686
+ s2s_btn.click(
687
+ s2s_handler,
688
+ inputs=[
689
+ s2s_audio_in,
690
+ s2s_max_tokens,
691
+ s2s_steps,
692
+ s2s_block,
693
+ s2s_temperature,
694
+ s2s_cfg,
695
+ ],
696
+ outputs=[s2s_audio_out, s2s_status],
697
+ )
698
+
699
+ with gr.Tab("Speech β†’ Text (S2T)"):
700
+ s2t_audio_in = gr.Audio(type="filepath", label="Speech input", sources=["microphone", "upload"])
701
+ s2t_text_out = gr.Textbox(label="Transcription", lines=4)
702
+ s2t_status = gr.Textbox(label="Status", interactive=False)
703
+ with gr.Accordion("Advanced settings", open=False):
704
+ s2t_steps = gr.Slider(2, 512, value=128, step=2, label="Denoising steps")
705
+ s2t_block = gr.Slider(2, 512, value=128, step=2, label="Block length")
706
+ s2t_max_tokens = gr.Slider(2, 512, value=128, step=2, label="Max new tokens")
707
+ s2t_remasking = gr.Dropdown(
708
+ ["low_confidence", "random"],
709
+ value="low_confidence",
710
+ label="Remasking strategy",
711
+ )
712
+ s2t_btn = gr.Button("Transcribe", variant="primary")
713
+ s2t_btn.click(
714
+ s2t_handler,
715
+ inputs=[s2t_audio_in, s2t_steps, s2t_block, s2t_max_tokens, s2t_remasking],
716
+ outputs=[s2t_text_out, s2t_status],
717
+ )
718
+
719
+ with gr.Tab("Video β†’ Text (V2T)"):
720
+ v2t_video_in = gr.Video(
721
+ label="Upload or record video",
722
+ height=256,
723
+ sources=["upload", "webcam"],
724
+ )
725
+ v2t_text_out = gr.Textbox(label="Caption / answer", lines=4)
726
+ v2t_status = gr.Textbox(label="Status", interactive=False)
727
+ with gr.Accordion("Advanced settings", open=False):
728
+ v2t_steps = gr.Slider(2, 512, value=64, step=2, label="Denoising steps")
729
+ v2t_block = gr.Slider(2, 512, value=64, step=2, label="Block length")
730
+ v2t_max_tokens = gr.Slider(2, 512, value=64, step=2, label="Max new tokens")
731
+ v2t_btn = gr.Button("Generate caption", variant="primary")
732
+ v2t_btn.click(
733
+ v2t_handler,
734
+ inputs=[v2t_video_in, v2t_steps, v2t_block, v2t_max_tokens],
735
+ outputs=[v2t_text_out, v2t_status],
736
+ )
737
+
738
+ with gr.Tab("Video β†’ Speech (V2S)"):
739
+ v2s_video_in = gr.Video(
740
+ label="Upload or record video",
741
+ height=256,
742
+ sources=["upload", "webcam"],
743
+ )
744
+ v2s_prompt = gr.Textbox(
745
+ label="Optional instruction",
746
+ placeholder="(Optional) e.g., 'Describe this scene in spoken form.'",
747
+ )
748
+ v2s_audio_out = gr.Audio(type="numpy", label="Generated speech")
749
+ v2s_status = gr.Textbox(label="Status", interactive=False)
750
+ with gr.Accordion("Advanced settings", open=False):
751
+ v2s_max_tokens = gr.Slider(2, 512, value=256, step=2, label="Reply token length")
752
+ v2s_steps = gr.Slider(2, 512, value=128, step=2, label="Refinement steps")
753
+ v2s_block = gr.Slider(2, 512, value=128, step=2, label="Block length")
754
+ v2s_temperature = gr.Slider(0.0, 2.0, value=1.0, step=0.05, label="Sampling temperature")
755
+ v2s_cfg = gr.Slider(0.0, 6.0, value=3.0, step=0.1, label="CFG scale")
756
+ v2s_btn = gr.Button("Generate speech from video", variant="primary")
757
+ v2s_btn.click(
758
+ v2s_handler,
759
+ inputs=[
760
+ v2s_video_in,
761
+ v2s_prompt,
762
+ v2s_max_tokens,
763
+ v2s_steps,
764
+ v2s_block,
765
+ v2s_temperature,
766
+ v2s_cfg,
767
+ ],
768
+ outputs=[v2s_audio_out, v2s_status],
769
+ )
770
+
771
+ with gr.Tab("Image β†’ Speech (I2S)"):
772
+ i2s_image_in = gr.Image(type="pil", label="Image input", sources=["upload"])
773
+ i2s_prompt = gr.Textbox(
774
+ label="Optional question",
775
+ placeholder="(Optional) e.g., 'Describe this image aloud.'",
776
+ )
777
+ i2s_audio_out = gr.Audio(type="numpy", label="Spoken description")
778
+ i2s_status = gr.Textbox(label="Status", interactive=False)
779
+ with gr.Accordion("Advanced settings", open=False):
780
+ i2s_max_tokens = gr.Slider(2, 512, value=256, step=2, label="Reply token length")
781
+ i2s_steps = gr.Slider(2, 512, value=256, step=2, label="Refinement steps")
782
+ i2s_block = gr.Slider(2, 512, value=256, step=2, label="Block length")
783
+ i2s_temperature = gr.Slider(0.0, 2.0, value=1.0, step=0.05, label="Sampling temperature")
784
+ i2s_cfg = gr.Slider(0.0, 6.0, value=3.0, step=0.1, label="CFG scale")
785
+ i2s_btn = gr.Button("Generate spoken description", variant="primary")
786
+ i2s_btn.click(
787
+ i2s_handler,
788
+ inputs=[
789
+ i2s_image_in,
790
+ i2s_prompt,
791
+ i2s_max_tokens,
792
+ i2s_steps,
793
+ i2s_block,
794
+ i2s_temperature,
795
+ i2s_cfg,
796
+ ],
797
+ outputs=[i2s_audio_out, i2s_status],
798
+ )
799
+
800
+ with gr.Tab("Text Chat"):
801
+ chat_in = gr.Textbox(
802
+ label="Message",
803
+ lines=4,
804
+ placeholder="Ask anything. The model will reply in text.",
805
+ )
806
+ chat_out = gr.Textbox(label="Assistant reply", lines=6)
807
+ chat_status = gr.Textbox(label="Status", interactive=False)
808
+ with gr.Accordion("Advanced settings", open=False):
809
+ chat_max_tokens = gr.Slider(2, 512, value=64, step=2, label="Reply max tokens")
810
+ chat_steps = gr.Slider(2, 512, value=64, step=2, label="Refinement steps")
811
+ chat_block = gr.Slider(2, 512, value=64, step=2, label="Block length")
812
+ chat_temperature_slider = gr.Slider(0.0, 2.0, value=0.8, step=0.05, label="Sampling temperature")
813
+ chat_btn = gr.Button("Send", variant="primary")
814
+ chat_btn.click(
815
+ chat_handler,
816
+ inputs=[
817
+ chat_in,
818
+ chat_max_tokens,
819
+ chat_steps,
820
+ chat_block,
821
+ chat_temperature_slider,
822
+ ],
823
+ outputs=[chat_out, chat_status],
824
+ )
825
+
826
+ with gr.Tab("MMU (2 images β†’ text)"):
827
+ mmu_img_a = gr.Image(type="pil", label="Image A", sources=["upload"])
828
+ mmu_img_b = gr.Image(type="pil", label="Image B", sources=["upload"])
829
+ mmu_question = gr.Textbox(
830
+ label="Question",
831
+ lines=3,
832
+ placeholder="Ask about the relationship or differences between the two images.",
833
+ )
834
+ mmu_answer = gr.Textbox(label="Answer", lines=6)
835
+ mmu_status = gr.Textbox(label="Status", interactive=False)
836
+ with gr.Accordion("Advanced settings", open=False):
837
+ mmu_max_tokens = gr.Slider(2, 512, value=256, step=2, label="Answer max tokens")
838
+ mmu_steps = gr.Slider(2, 512, value=256, step=2, label="Refinement steps")
839
+ mmu_block = gr.Slider(2, 512, value=128, step=2, label="Block length")
840
+ mmu_temperature = gr.Slider(0.0, 2.0, value=0.7, step=0.05, label="Sampling temperature")
841
+ mmu_btn = gr.Button("Answer about the two images", variant="primary")
842
+ mmu_btn.click(
843
+ mmu_handler,
844
+ inputs=[
845
+ mmu_img_a,
846
+ mmu_img_b,
847
+ mmu_question,
848
+ mmu_max_tokens,
849
+ mmu_steps,
850
+ mmu_block,
851
+ mmu_temperature,
852
+ ],
853
+ outputs=[mmu_answer, mmu_status],
854
+ )
855
+
856
+ with gr.Tab("Text β†’ Image (T2I)"):
857
+ t2i_prompt = gr.Textbox(
858
+ label="Prompt",
859
+ lines=4,
860
+ placeholder="Describe the image you want to generate...",
861
+ )
862
+ t2i_image_out = gr.Image(label="Generated image")
863
+ t2i_status = gr.Textbox(label="Status", interactive=False)
864
+ with gr.Accordion("Advanced settings", open=False):
865
+ t2i_timesteps = gr.Slider(4, 128, value=32, step=2, label="Timesteps")
866
+ t2i_temperature = gr.Slider(0.0, 2.0, value=1.0, step=0.05, label="Sampling temperature")
867
+ t2i_guidance = gr.Slider(0.0, 8.0, value=3.5, step=0.1, label="CFG scale")
868
+ t2i_btn = gr.Button("Generate image", variant="primary")
869
+ t2i_btn.click(
870
+ t2i_handler,
871
+ inputs=[t2i_prompt, t2i_timesteps, t2i_temperature, t2i_guidance],
872
+ outputs=[t2i_image_out, t2i_status],
873
+ )
874
+
875
+ with gr.Tab("Image Editing (I2I)"):
876
+ i2i_image_in = gr.Image(type="pil", label="Reference image", sources=["upload"])
877
+ i2i_instr = gr.Textbox(
878
+ label="Editing instruction",
879
+ lines=4,
880
+ placeholder="Describe how you want to edit the image...",
881
+ )
882
+ i2i_image_out = gr.Image(label="Edited image")
883
+ i2i_status = gr.Textbox(label="Status", interactive=False)
884
+ with gr.Accordion("Advanced settings", open=False):
885
+ i2i_timesteps = gr.Slider(4, 128, value=18, step=2, label="Timesteps")
886
+ i2i_temperature = gr.Slider(0.0, 2.0, value=1.0, step=0.05, label="Sampling temperature")
887
+ i2i_guidance = gr.Slider(0.0, 8.0, value=3.5, step=0.1, label="CFG scale")
888
+ i2i_btn = gr.Button("Apply edit", variant="primary")
889
+ i2i_btn.click(
890
+ i2i_handler,
891
+ inputs=[i2i_instr, i2i_image_in, i2i_timesteps, i2i_temperature, i2i_guidance],
892
+ outputs=[i2i_image_out, i2i_status],
893
+ )
894
 
895
 
896
  if __name__ == "__main__":
897
+ demo.launch()
898
+
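
The rewritten app.py caches a single OmadaDemo in a module-level global (get_app) and applies @spaces.GPU only to the per-request inference handlers, so, as its docstring notes, the GPU is allocated per request rather than for the whole process. Below is a minimal sketch of that pattern under the same assumptions (gradio plus the spaces package available on ZeroGPU Spaces); DummyModel is a hypothetical stand-in for OmadaDemo, not part of the actual commit.

import gradio as gr
import spaces  # provided on Hugging Face ZeroGPU Spaces


class DummyModel:
    """Hypothetical stand-in for OmadaDemo; only illustrates the caching pattern."""

    def __init__(self, device: str = "cuda"):
        self.device = device

    def run(self, prompt: str) -> str:
        return f"[{self.device}] {prompt}"


_MODEL = None  # built once on first request, reused afterwards


def get_model() -> DummyModel:
    global _MODEL
    if _MODEL is None:
        # Heavy setup (snapshot downloads, weight loading) would happen here and is
        # cached so it only runs on the first request, mirroring get_app() in app.py.
        _MODEL = DummyModel(device="cuda")
    return _MODEL


@spaces.GPU  # ZeroGPU allocates a GPU only while this handler runs
def generate(prompt: str) -> str:
    return get_model().run(prompt)


with gr.Blocks() as demo:
    inp = gr.Textbox(label="Prompt")
    out = gr.Textbox(label="Output")
    gr.Button("Run").click(generate, inputs=inp, outputs=out)

if __name__ == "__main__":
    demo.launch()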