jake committed on
Commit 06b0a1f · 1 Parent(s): df89a6a
Files changed (1)
  1. app.py +192 -337
app.py CHANGED
@@ -5,17 +5,6 @@ ZeroGPU-friendly Gradio entrypoint for OMada demo.
5
  - Instantiates OmadaDemo once (global)
6
  - Exposes 10 modalities via Gradio tabs
7
  - Uses @spaces.GPU only on inference handlers so GPU is allocated per request
8
-
9
- Environment overrides:
10
- MODEL_REPO_ID (default: jaeikkim/AIDAS-Omni-Modal-Diffusion)
11
- MODEL_REVISION (default: main)
12
- ASSET_REPO_ID (default: jaeikkim/AIDAS-Omni-Modal-Diffusion-assets)
13
- ASSET_REVISION (default: main)
14
- STYLE_REPO_ID (default: jaeikkim/aidas-style-centroid)
15
- STYLE_REVISION (default: main)
16
- HF_TOKEN (optional, for private model/dataset)
17
- TRAIN_CONFIG_PATH (default: MMaDA/inference/demo/demo.yaml)
18
- DEVICE (default: cuda)
19
  """
20
 
21
  import os
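
For context, the environment overrides listed in the removed docstring lines above are plain environment variables with defaults. A minimal sketch of how such overrides are typically consumed (assumed wiring, not the exact code inside download_checkpoint()/download_assets()):

    import os
    from huggingface_hub import snapshot_download

    MODEL_REPO_ID = os.getenv("MODEL_REPO_ID", "jaeikkim/AIDAS-Omni-Modal-Diffusion")
    MODEL_REVISION = os.getenv("MODEL_REVISION", "main")
    HF_TOKEN = os.getenv("HF_TOKEN")  # optional, only needed for private repos

    # Resolve the checkpoint snapshot once at startup; ZeroGPU attaches the GPU
    # later, inside the @spaces.GPU handlers, so this part runs on CPU.
    ckpt_path = snapshot_download(
        repo_id=MODEL_REPO_ID,
        revision=MODEL_REVISION,
        token=HF_TOKEN,
    )
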
@@ -49,8 +38,6 @@ if str(EMOVA_ROOT) not in sys.path:
49
  def ensure_hf_hub(target: str = "0.36.0"):
50
  """
51
  Make sure huggingface_hub stays <1.0 to satisfy transformers/tokenizers.
52
-
53
- The Spaces base image may pull in a newer version via gradio, so we pin it.
54
  """
55
  try:
56
  import huggingface_hub as hub
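
ensure_hf_hub() exists because the Spaces base image can pull huggingface_hub>=1.0 through gradio, which breaks transformers/tokenizers. A minimal sketch of such a guard, assuming pip is available in the runtime; the actual function in this commit may reinstall differently:

    import importlib
    import subprocess
    import sys

    def ensure_hf_hub(target: str = "0.36.0"):
        """Keep huggingface_hub below 1.0 so transformers/tokenizers keep importing."""
        try:
            import huggingface_hub as hub
            if hub.__version__.split(".")[0] == "0":
                return hub  # already a 0.x release, nothing to do
        except ImportError:
            pass
        # Reinstall the pinned version, then import/reload it in this process.
        subprocess.check_call(
            [sys.executable, "-m", "pip", "install", f"huggingface_hub=={target}"]
        )
        import huggingface_hub as hub
        return importlib.reload(hub)
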
@@ -80,7 +67,7 @@ snapshot_download = ensure_hf_hub().snapshot_download
80
 
81
 
82
  # ---------------------------
83
- # Imports from OMada demo
84
  # ---------------------------
85
 
86
  from inference.gradio_multimodal_demo_inst import ( # noqa: E402
@@ -153,8 +140,6 @@ def download_checkpoint() -> Path:
153
  )
154
  )
155
 
156
- # If snapshot itself is unwrapped_model, return it; otherwise look for nested dir,
157
- # and finally alias via symlink.
158
  if snapshot_path.name == "unwrapped_model":
159
  return snapshot_path
160
 
@@ -169,82 +154,57 @@ def download_checkpoint() -> Path:
169
 
170
 
171
  # ---------------------------
172
- # Assets & examples from HF dataset
173
  # ---------------------------
174
 
175
  ASSET_ROOT = download_assets()
176
- DEMO_ROOT = ASSET_ROOT / "demo"
177
-
178
- LOGO_PATH = DEMO_ROOT / "logo.png"
179
- T2S_TEXT_PATH = DEMO_ROOT / "t2s" / "text.txt"
180
- CHAT_TEXT_PATH = DEMO_ROOT / "chat" / "text.txt"
181
- T2I_TEXT_PATH = DEMO_ROOT / "t2i" / "text.txt"
182
-
183
 
184
  def _load_text_examples(path: Path):
185
  if not path.exists():
186
  return []
187
- try:
188
- lines = [
189
- line.strip()
190
- for line in path.read_text(encoding="utf-8").splitlines()
191
- if line.strip()
192
- ]
193
- except Exception:
194
- return []
195
- return [[line] for line in lines]
196
 
197
 
198
  def _load_media_examples(subdir: str, suffixes):
199
- d = DEMO_ROOT / subdir
200
  if not d.exists():
201
  return []
202
- examples = []
203
  for p in sorted(d.iterdir()):
204
  if p.is_file() and p.suffix.lower() in suffixes:
205
- examples.append([str(p)])
206
- return examples
207
-
208
-
209
- # Text-based examples
210
- T2S_EXAMPLES = _load_text_examples(T2S_TEXT_PATH)
211
- CHAT_EXAMPLES = _load_text_examples(CHAT_TEXT_PATH)
212
- T2I_EXAMPLES = _load_text_examples(T2I_TEXT_PATH)
213
-
214
- # Audio / video / image examples
215
- _AUDIO_SUFFIXES = {".wav", ".mp3", ".flac", ".ogg"}
216
- _VIDEO_SUFFIXES = {".mp4", ".mov", ".avi", ".webm"}
217
- _IMAGE_SUFFIXES = {".png", ".jpg", ".jpeg", ".webp"}
218
-
219
- S2T_EXAMPLES = _load_media_examples("s2t", _AUDIO_SUFFIXES)
220
- V2T_EXAMPLES = _load_media_examples("v2t", _VIDEO_SUFFIXES)
221
- S2S_EXAMPLES = _load_media_examples("s2s", _AUDIO_SUFFIXES)
222
- if not S2S_EXAMPLES and S2T_EXAMPLES:
223
- S2S_EXAMPLES = S2T_EXAMPLES[: min(4, len(S2T_EXAMPLES))]
224
-
225
- V2S_EXAMPLES = _load_media_examples("v2s", _VIDEO_SUFFIXES)
226
- if not V2S_EXAMPLES and V2T_EXAMPLES:
227
- V2S_EXAMPLES = V2T_EXAMPLES[: min(4, len(V2T_EXAMPLES))]
228
-
229
- I2S_EXAMPLES = _load_media_examples("i2s", _IMAGE_SUFFIXES)
230
-
231
- # MMU: 2 images + question
232
- MMU_DIR = DEMO_ROOT / "mmu"
233
- MMU_EXAMPLES = []
234
- if MMU_DIR.exists():
235
- mmu_imgs = [
236
- p for p in sorted(MMU_DIR.iterdir())
237
- if p.is_file() and p.suffix.lower() in _IMAGE_SUFFIXES
238
- ]
239
- if len(mmu_imgs) >= 2:
240
- MMU_EXAMPLES = [[
241
- str(mmu_imgs[0]),
242
- str(mmu_imgs[1]),
243
- "What are the differences between the two images?"
244
- ]]
245
-
246
- # If there are no i2s examples but MMU examples exist, reuse the first image as an image example
247
  if not I2S_EXAMPLES and MMU_EXAMPLES:
 
248
  I2S_EXAMPLES = [[MMU_EXAMPLES[0][0]]]
249
 
250
 
@@ -260,9 +220,7 @@ def get_app() -> OmadaDemo:
260
  if APP is not None:
261
  return APP
262
 
263
- # Download ckpt + style centroids once
264
  ckpt_dir = download_checkpoint()
265
- style_root = download_style()
266
 
267
  # Wire style centroids to expected locations
268
  style_targets = [
@@ -276,19 +234,15 @@ def get_app() -> OmadaDemo:
276
  for starget in style_targets:
277
  if not starget.exists():
278
  starget.parent.mkdir(parents=True, exist_ok=True)
279
- starget.symlink_to(style_root, target_is_directory=True)
280
 
281
- # Choose train config
282
  default_cfg = PROJECT_ROOT / "MMaDA" / "inference" / "demo" / "demo.yaml"
283
  legacy_cfg = PROJECT_ROOT / "MMaDA" / "configs" / "mmada_demo.yaml"
284
  train_config = os.getenv("TRAIN_CONFIG_PATH")
285
  if not train_config:
286
  train_config = str(default_cfg if default_cfg.exists() else legacy_cfg)
287
 
288
- # Device: in ZeroGPU environment, "cuda" is virtualized and only actually
289
- # attached inside @spaces.GPU handlers.
290
  device = os.getenv("DEVICE", "cuda")
291
-
292
  APP = OmadaDemo(train_config=train_config, checkpoint=str(ckpt_dir), device=device)
293
  return APP
294
 
@@ -296,20 +250,9 @@ def get_app() -> OmadaDemo:
296
  # ---------------------------
297
  # ZeroGPU-wrapped handlers
298
  # ---------------------------
299
-
300
  @spaces.GPU
301
- def t2s_handler(
302
- text,
303
- max_tokens,
304
- steps,
305
- block_len,
306
- temperature,
307
- cfg_scale,
308
- gender,
309
- emotion,
310
- speed,
311
- pitch,
312
- ):
313
  app = get_app()
314
  audio, status = app.run_t2s(
315
  text=text,
@@ -325,16 +268,8 @@ def t2s_handler(
325
  )
326
  return audio, status
327
 
328
-
329
  @spaces.GPU
330
- def s2s_handler(
331
- audio_path,
332
- max_tokens,
333
- steps,
334
- block_len,
335
- temperature,
336
- cfg_scale,
337
- ):
338
  app = get_app()
339
  audio, status = app.run_s2s(
340
  audio_path=audio_path,
@@ -346,15 +281,8 @@ def s2s_handler(
346
  )
347
  return audio, status
348
 
349
-
350
  @spaces.GPU
351
- def s2t_handler(
352
- audio_path,
353
- steps,
354
- block_len,
355
- max_tokens,
356
- remasking,
357
- ):
358
  app = get_app()
359
  text, status = app.run_s2t(
360
  audio_path=audio_path,
@@ -365,14 +293,8 @@ def s2t_handler(
365
  )
366
  return text, status
367
 
368
-
369
  @spaces.GPU
370
- def v2t_handler(
371
- video,
372
- steps,
373
- block_len,
374
- max_tokens,
375
- ):
376
  app = get_app()
377
  text, status = app.run_v2t(
378
  video_path=video,
@@ -382,17 +304,8 @@ def v2t_handler(
382
  )
383
  return text, status
384
 
385
-
386
  @spaces.GPU
387
- def v2s_handler(
388
- video,
389
- message,
390
- max_tokens,
391
- steps,
392
- block_len,
393
- temperature,
394
- cfg_scale,
395
- ):
396
  app = get_app()
397
  audio, status = app.run_v2s(
398
  video_path=video,
@@ -405,17 +318,8 @@ def v2s_handler(
405
  )
406
  return audio, status
407
 
408
-
409
  @spaces.GPU
410
- def i2s_handler(
411
- image,
412
- message,
413
- max_tokens,
414
- steps,
415
- block_len,
416
- temperature,
417
- cfg_scale,
418
- ):
419
  app = get_app()
420
  audio, status = app.run_i2s(
421
  image=image,
@@ -428,15 +332,8 @@ def i2s_handler(
428
  )
429
  return audio, status
430
 
431
-
432
  @spaces.GPU
433
- def chat_handler(
434
- message,
435
- max_tokens,
436
- steps,
437
- block_len,
438
- temperature,
439
- ):
440
  app = get_app()
441
  text, status = app.run_chat(
442
  message=message,
@@ -447,17 +344,8 @@ def chat_handler(
447
  )
448
  return text, status
449
 
450
-
451
  @spaces.GPU
452
- def mmu_handler(
453
- image_a,
454
- image_b,
455
- question,
456
- max_tokens,
457
- steps,
458
- block_len,
459
- temperature,
460
- ):
461
  app = get_app()
462
  text, status = app.run_mmu_dual(
463
  image_a=image_a,
@@ -470,14 +358,8 @@ def mmu_handler(
470
  )
471
  return text, status
472
 
473
-
474
  @spaces.GPU
475
- def t2i_handler(
476
- prompt,
477
- timesteps,
478
- temperature,
479
- guidance,
480
- ):
481
  app = get_app()
482
  image, status = app.run_t2i(
483
  prompt=prompt,
@@ -487,15 +369,8 @@ def t2i_handler(
487
  )
488
  return image, status
489
 
490
-
491
  @spaces.GPU
492
- def i2i_handler(
493
- instruction,
494
- image,
495
- timesteps,
496
- temperature,
497
- guidance,
498
- ):
499
  app = get_app()
500
  image_out, status = app.run_i2i(
501
  instruction=instruction,
@@ -508,32 +383,31 @@ def i2i_handler(
508
 
509
 
510
  # ---------------------------
511
- # Gradio UI (10 tabs)
512
  # ---------------------------
513
 
514
  theme = gr.themes.Soft(primary_hue="blue", neutral_hue="gray")
515
 
516
  with gr.Blocks(
517
- title="AIDAS Lab @ SNU - OMni-modal Diffusion",
518
  css=CUSTOM_CSS,
519
  theme=theme,
520
  js=FORCE_LIGHT_MODE_JS,
521
  ) as demo:
522
- # Logo (if present)
523
- if LOGO_PATH.exists():
524
- gr.Image(
525
- value=str(LOGO_PATH),
526
- show_label=False,
527
- height=140,
528
- interactive=False,
529
  )
530
 
531
- gr.Markdown(
532
- "## Omni-modal Diffusion Foundation Model\n"
533
- "### AIDAS Lab @ SNU"
534
- )
535
-
536
- # ---------- T2S ----------
537
  with gr.Tab("Text β†’ Speech (T2S)"):
538
  with gr.Row():
539
  t2s_text = gr.Textbox(
@@ -555,6 +429,13 @@ with gr.Blocks(
555
  with gr.Row():
556
  t2s_speed = gr.Dropdown(["random", "normal", "fast", "slow"], value="random", label="Speed")
557
  t2s_pitch = gr.Dropdown(["random", "normal", "high", "low"], value="random", label="Pitch")
558
  t2s_btn = gr.Button("Generate speech", variant="primary")
559
  t2s_btn.click(
560
  t2s_handler,
@@ -573,15 +454,7 @@ with gr.Blocks(
573
  outputs=[t2s_audio, t2s_status],
574
  )
575
 
576
- if T2S_EXAMPLES:
577
- gr.Markdown("**Sample prompts**")
578
- gr.Examples(
579
- examples=T2S_EXAMPLES,
580
- inputs=[t2s_text],
581
- examples_per_page=4,
582
- )
583
-
584
- # ---------- S2S ----------
585
  with gr.Tab("Speech β†’ Speech (S2S)"):
586
  s2s_audio_in = gr.Audio(type="filepath", label="Source speech", sources=["microphone", "upload"])
587
  s2s_audio_out = gr.Audio(type="numpy", label="Reply speech")
@@ -592,6 +465,13 @@ with gr.Blocks(
592
  s2s_block = gr.Slider(2, 512, value=128, step=2, label="Block length")
593
  s2s_temperature = gr.Slider(0.0, 2.0, value=0.0, step=0.05, label="Sampling temperature")
594
  s2s_cfg = gr.Slider(0.0, 6.0, value=4.0, step=0.1, label="CFG scale")
595
  s2s_btn = gr.Button("Generate reply speech", variant="primary")
596
  s2s_btn.click(
597
  s2s_handler,
@@ -606,15 +486,7 @@ with gr.Blocks(
606
  outputs=[s2s_audio_out, s2s_status],
607
  )
608
 
609
- if S2S_EXAMPLES:
610
- gr.Markdown("**Sample S2S clips**")
611
- gr.Examples(
612
- examples=S2S_EXAMPLES,
613
- inputs=[s2s_audio_in],
614
- examples_per_page=4,
615
- )
616
-
617
- # ---------- S2T ----------
618
  with gr.Tab("Speech β†’ Text (S2T)"):
619
  s2t_audio_in = gr.Audio(type="filepath", label="Speech input", sources=["microphone", "upload"])
620
  s2t_text_out = gr.Textbox(label="Transcription", lines=4)
@@ -628,6 +500,13 @@ with gr.Blocks(
628
  value="low_confidence",
629
  label="Remasking strategy",
630
  )
631
  s2t_btn = gr.Button("Transcribe", variant="primary")
632
  s2t_btn.click(
633
  s2t_handler,
@@ -635,15 +514,7 @@ with gr.Blocks(
635
  outputs=[s2t_text_out, s2t_status],
636
  )
637
 
638
- if S2T_EXAMPLES:
639
- gr.Markdown("**Sample S2T clips**")
640
- gr.Examples(
641
- examples=S2T_EXAMPLES,
642
- inputs=[s2t_audio_in],
643
- examples_per_page=4,
644
- )
645
-
646
- # ---------- V2T ----------
647
  with gr.Tab("Video β†’ Text (V2T)"):
648
  v2t_video_in = gr.Video(
649
  label="Upload or record video",
@@ -656,6 +527,13 @@ with gr.Blocks(
656
  v2t_steps = gr.Slider(2, 512, value=64, step=2, label="Denoising steps")
657
  v2t_block = gr.Slider(2, 512, value=64, step=2, label="Block length")
658
  v2t_max_tokens = gr.Slider(2, 512, value=64, step=2, label="Max new tokens")
659
  v2t_btn = gr.Button("Generate caption", variant="primary")
660
  v2t_btn.click(
661
  v2t_handler,
@@ -663,15 +541,7 @@ with gr.Blocks(
663
  outputs=[v2t_text_out, v2t_status],
664
  )
665
 
666
- if V2T_EXAMPLES:
667
- gr.Markdown("**Sample videos**")
668
- gr.Examples(
669
- examples=V2T_EXAMPLES,
670
- inputs=[v2t_video_in],
671
- examples_per_page=4,
672
- )
673
-
674
- # ---------- V2S ----------
675
  with gr.Tab("Video β†’ Speech (V2S)"):
676
  v2s_video_in = gr.Video(
677
  label="Upload or record video",
@@ -690,6 +560,7 @@ with gr.Blocks(
690
  v2s_block = gr.Slider(2, 512, value=128, step=2, label="Block length")
691
  v2s_temperature = gr.Slider(0.0, 2.0, value=1.0, step=0.05, label="Sampling temperature")
692
  v2s_cfg = gr.Slider(0.0, 6.0, value=3.0, step=0.1, label="CFG scale")
 
693
  v2s_btn = gr.Button("Generate speech from video", variant="primary")
694
  v2s_btn.click(
695
  v2s_handler,
@@ -705,100 +576,7 @@ with gr.Blocks(
705
  outputs=[v2s_audio_out, v2s_status],
706
  )
707
 
708
- if V2S_EXAMPLES:
709
- gr.Markdown("**Sample videos**")
710
- gr.Examples(
711
- examples=V2S_EXAMPLES,
712
- inputs=[v2s_video_in],
713
- examples_per_page=4,
714
- )
715
-
716
- # ---------- T2I ----------
717
- with gr.Tab("Text β†’ Image (T2I)"):
718
- t2i_prompt = gr.Textbox(
719
- label="Prompt",
720
- lines=4,
721
- placeholder="Describe the image you want to generate...",
722
- )
723
- t2i_image_out = gr.Image(label="Generated image")
724
- t2i_status = gr.Textbox(label="Status", interactive=False)
725
- with gr.Accordion("Advanced settings", open=False):
726
- t2i_timesteps = gr.Slider(4, 128, value=32, step=2, label="Timesteps")
727
- t2i_temperature = gr.Slider(0.0, 2.0, value=1.0, step=0.05, label="Sampling temperature")
728
- t2i_guidance = gr.Slider(0.0, 8.0, value=3.5, step=0.1, label="CFG scale")
729
- t2i_btn = gr.Button("Generate image", variant="primary")
730
- t2i_btn.click(
731
- t2i_handler,
732
- inputs=[t2i_prompt, t2i_timesteps, t2i_temperature, t2i_guidance],
733
- outputs=[t2i_image_out, t2i_status],
734
- )
735
-
736
- if T2I_EXAMPLES:
737
- gr.Markdown("**Sample prompts**")
738
- gr.Examples(
739
- examples=T2I_EXAMPLES,
740
- inputs=[t2i_prompt],
741
- examples_per_page=4,
742
- )
743
-
744
- # ---------- I2I ----------
745
- with gr.Tab("Image Editing (I2I)"):
746
- i2i_image_in = gr.Image(type="pil", label="Reference image", sources=["upload"])
747
- i2i_instr = gr.Textbox(
748
- label="Editing instruction",
749
- lines=4,
750
- placeholder="Describe how you want to edit the image...",
751
- )
752
- i2i_image_out = gr.Image(label="Edited image")
753
- i2i_status = gr.Textbox(label="Status", interactive=False)
754
- with gr.Accordion("Advanced settings", open=False):
755
- i2i_timesteps = gr.Slider(4, 128, value=18, step=2, label="Timesteps")
756
- i2i_temperature = gr.Slider(0.0, 2.0, value=1.0, step=0.05, label="Sampling temperature")
757
- i2i_guidance = gr.Slider(0.0, 8.0, value=3.5, step=0.1, label="CFG scale")
758
- i2i_btn = gr.Button("Apply edit", variant="primary")
759
- i2i_btn.click(
760
- i2i_handler,
761
- inputs=[i2i_instr, i2i_image_in, i2i_timesteps, i2i_temperature, i2i_guidance],
762
- outputs=[i2i_image_out, i2i_status],
763
- )
764
-
765
- # ---------- Chat ----------
766
- with gr.Tab("Text Chat"):
767
- chat_in = gr.Textbox(
768
- label="Message",
769
- lines=4,
770
- placeholder="Ask anything. The model will reply in text.",
771
- )
772
- chat_out = gr.Textbox(label="Assistant reply", lines=6)
773
- chat_status = gr.Textbox(label="Status", interactive=False)
774
- with gr.Accordion("Advanced settings", open=False):
775
- chat_max_tokens = gr.Slider(2, 512, value=64, step=2, label="Reply max tokens")
776
- chat_steps = gr.Slider(2, 512, value=64, step=2, label="Refinement steps")
777
- chat_block = gr.Slider(2, 512, value=64, step=2, label="Block length")
778
- chat_temperature_slider = gr.Slider(0.0, 2.0, value=0.8, step=0.05, label="Sampling temperature")
779
- chat_btn = gr.Button("Send", variant="primary")
780
- chat_btn.click(
781
- chat_handler,
782
- inputs=[
783
- chat_in,
784
- chat_max_tokens,
785
- chat_steps,
786
- chat_block,
787
- chat_temperature_slider,
788
- ],
789
- outputs=[chat_out, chat_status],
790
- )
791
-
792
- if CHAT_EXAMPLES:
793
- gr.Markdown("**Sample prompts**")
794
- gr.Examples(
795
- examples=CHAT_EXAMPLES,
796
- inputs=[chat_in],
797
- examples_per_page=4,
798
- )
799
-
800
-
801
- # ---------- I2S ----------
802
  with gr.Tab("Image β†’ Speech (I2S)"):
803
  i2s_image_in = gr.Image(type="pil", label="Image input", sources=["upload"])
804
  i2s_prompt = gr.Textbox(
@@ -813,6 +591,13 @@ with gr.Blocks(
813
  i2s_block = gr.Slider(2, 512, value=256, step=2, label="Block length")
814
  i2s_temperature = gr.Slider(0.0, 2.0, value=1.0, step=0.05, label="Sampling temperature")
815
  i2s_cfg = gr.Slider(0.0, 6.0, value=3.0, step=0.1, label="CFG scale")
816
  i2s_btn = gr.Button("Generate spoken description", variant="primary")
817
  i2s_btn.click(
818
  i2s_handler,
@@ -828,16 +613,41 @@ with gr.Blocks(
828
  outputs=[i2s_audio_out, i2s_status],
829
  )
830
 
831
- if I2S_EXAMPLES:
832
- gr.Markdown("**Sample images**")
833
- gr.Examples(
834
- examples=I2S_EXAMPLES,
835
- inputs=[i2s_image_in],
836
- examples_per_page=4,
837
- )
838
-
839
 
840
- # ---------- MMU ----------
841
  with gr.Tab("MMU (2 images β†’ text)"):
842
  mmu_img_a = gr.Image(type="pil", label="Image A", sources=["upload"])
843
  mmu_img_b = gr.Image(type="pil", label="Image B", sources=["upload"])
@@ -853,6 +663,13 @@ with gr.Blocks(
853
  mmu_steps = gr.Slider(2, 512, value=256, step=2, label="Refinement steps")
854
  mmu_block = gr.Slider(2, 512, value=128, step=2, label="Block length")
855
  mmu_temperature = gr.Slider(0.0, 2.0, value=0.7, step=0.05, label="Sampling temperature")
856
  mmu_btn = gr.Button("Answer about the two images", variant="primary")
857
  mmu_btn.click(
858
  mmu_handler,
@@ -868,16 +685,54 @@ with gr.Blocks(
868
  outputs=[mmu_answer, mmu_status],
869
  )
870
 
871
- if MMU_EXAMPLES:
872
- gr.Markdown("**Sample MMU example**")
873
- gr.Examples(
874
- examples=MMU_EXAMPLES,
875
- inputs=[mmu_img_a, mmu_img_b, mmu_question],
876
- examples_per_page=1,
877
- )
878
 
879
- # I2I examples are skipped for now; the separate example text/image layout for them is unclear
880
- # (if needed, split into demo/i2i_prompt.txt + demo/i2i_images/ and wire them up)
881
 
882
  if __name__ == "__main__":
883
  demo.launch()
 
5
  - Instantiates OmadaDemo once (global)
6
  - Exposes 10 modalities via Gradio tabs
7
  - Uses @spaces.GPU only on inference handlers so GPU is allocated per request
8
  """
9
 
10
  import os
 
38
  def ensure_hf_hub(target: str = "0.36.0"):
39
  """
40
  Make sure huggingface_hub stays <1.0 to satisfy transformers/tokenizers.
 
 
41
  """
42
  try:
43
  import huggingface_hub as hub
 
67
 
68
 
69
  # ---------------------------
70
+ # OMada demo imports
71
  # ---------------------------
72
 
73
  from inference.gradio_multimodal_demo_inst import ( # noqa: E402
 
140
  )
141
  )
142
 
 
 
143
  if snapshot_path.name == "unwrapped_model":
144
  return snapshot_path
145
 
 
154
 
155
 
156
  # ---------------------------
157
+ # Assets (for examples + logo)
158
  # ---------------------------
159
 
160
  ASSET_ROOT = download_assets()
161
+ STYLE_ROOT = download_style()
162
+ LOGO_PATH = ASSET_ROOT / "logo.png" # optional
163
 
164
  def _load_text_examples(path: Path):
165
  if not path.exists():
166
  return []
167
+ lines = [
168
+ ln.strip()
169
+ for ln in path.read_text(encoding="utf-8").splitlines()
170
+ if ln.strip()
171
+ ]
172
+ return [[ln] for ln in lines]
 
 
 
173
 
174
 
175
  def _load_media_examples(subdir: str, suffixes):
176
+ d = ASSET_ROOT / subdir
177
  if not d.exists():
178
  return []
179
+ ex = []
180
  for p in sorted(d.iterdir()):
181
  if p.is_file() and p.suffix.lower() in suffixes:
182
+ ex.append([str(p)])
183
+ return ex
184
+
185
+
186
+ # text-based examples
187
+ T2S_EXAMPLES = _load_text_examples(ASSET_ROOT / "t2s" / "text.txt")
188
+ CHAT_EXAMPLES = _load_text_examples(ASSET_ROOT / "chat" / "text.txt")
189
+ T2I_EXAMPLES = _load_text_examples(ASSET_ROOT / "t2i" / "text.txt")
190
+
191
+ # audio / video / image examples
192
+ S2T_EXAMPLES = _load_media_examples("s2t", {".wav", ".mp3", ".flac", ".ogg"})
193
+ S2S_EXAMPLES = _load_media_examples("s2s", {".wav", ".mp3", ".flac", ".ogg"})
194
+ V2T_EXAMPLES = _load_media_examples("v2t", {".mp4", ".mov", ".avi", ".webm"})
195
+
196
+ # MMU images (and fallback for I2S)
197
+ MMU_IMAGE_A = ASSET_ROOT / "mmu" / "1.jpg"
198
+ MMU_IMAGE_B = ASSET_ROOT / "mmu" / "2.jpg"
199
+ if MMU_IMAGE_A.exists() and MMU_IMAGE_B.exists():
200
+ MMU_EXAMPLES = [[str(MMU_IMAGE_A), str(MMU_IMAGE_B),
201
+ "What are the differences in coloring and physical features between animal1 and animal2 in the bird images?"]]
202
+ else:
203
+ MMU_EXAMPLES = []
204
+
205
+ I2S_EXAMPLES = _load_media_examples("i2s", {".png", ".jpg", ".jpeg", ".webp"})
206
  if not I2S_EXAMPLES and MMU_EXAMPLES:
207
+ # use image A from MMU as sample I2S input
208
  I2S_EXAMPLES = [[MMU_EXAMPLES[0][0]]]
209
 
210
 
 
220
  if APP is not None:
221
  return APP
222
 
 
223
  ckpt_dir = download_checkpoint()
 
224
 
225
  # Wire style centroids to expected locations
226
  style_targets = [
 
234
  for starget in style_targets:
235
  if not starget.exists():
236
  starget.parent.mkdir(parents=True, exist_ok=True)
237
+ starget.symlink_to(STYLE_ROOT, target_is_directory=True)
238
 
 
239
  default_cfg = PROJECT_ROOT / "MMaDA" / "inference" / "demo" / "demo.yaml"
240
  legacy_cfg = PROJECT_ROOT / "MMaDA" / "configs" / "mmada_demo.yaml"
241
  train_config = os.getenv("TRAIN_CONFIG_PATH")
242
  if not train_config:
243
  train_config = str(default_cfg if default_cfg.exists() else legacy_cfg)
244
 
 
 
245
  device = os.getenv("DEVICE", "cuda")
 
246
  APP = OmadaDemo(train_config=train_config, checkpoint=str(ckpt_dir), device=device)
247
  return APP
248
 
 
250
  # ---------------------------
251
  # ZeroGPU-wrapped handlers
252
  # ---------------------------
253
+ # (== unchanged section, kept in full ==)
254
  @spaces.GPU
255
+ def t2s_handler(text, max_tokens, steps, block_len, temperature, cfg_scale, gender, emotion, speed, pitch):
256
  app = get_app()
257
  audio, status = app.run_t2s(
258
  text=text,
 
268
  )
269
  return audio, status
270
 
 
271
  @spaces.GPU
272
+ def s2s_handler(audio_path, max_tokens, steps, block_len, temperature, cfg_scale):
273
  app = get_app()
274
  audio, status = app.run_s2s(
275
  audio_path=audio_path,
 
281
  )
282
  return audio, status
283
 
 
284
  @spaces.GPU
285
+ def s2t_handler(audio_path, steps, block_len, max_tokens, remasking):
286
  app = get_app()
287
  text, status = app.run_s2t(
288
  audio_path=audio_path,
 
293
  )
294
  return text, status
295
 
 
296
  @spaces.GPU
297
+ def v2t_handler(video, steps, block_len, max_tokens):
298
  app = get_app()
299
  text, status = app.run_v2t(
300
  video_path=video,
 
304
  )
305
  return text, status
306
 
 
307
  @spaces.GPU
308
+ def v2s_handler(video, message, max_tokens, steps, block_len, temperature, cfg_scale):
309
  app = get_app()
310
  audio, status = app.run_v2s(
311
  video_path=video,
 
318
  )
319
  return audio, status
320
 
 
321
  @spaces.GPU
322
+ def i2s_handler(image, message, max_tokens, steps, block_len, temperature, cfg_scale):
323
  app = get_app()
324
  audio, status = app.run_i2s(
325
  image=image,
 
332
  )
333
  return audio, status
334
 
 
335
  @spaces.GPU
336
+ def chat_handler(message, max_tokens, steps, block_len, temperature):
337
  app = get_app()
338
  text, status = app.run_chat(
339
  message=message,
 
344
  )
345
  return text, status
346
 
 
347
  @spaces.GPU
348
+ def mmu_handler(image_a, image_b, question, max_tokens, steps, block_len, temperature):
349
  app = get_app()
350
  text, status = app.run_mmu_dual(
351
  image_a=image_a,
 
358
  )
359
  return text, status
360
 
 
361
  @spaces.GPU
362
+ def t2i_handler(prompt, timesteps, temperature, guidance):
363
  app = get_app()
364
  image, status = app.run_t2i(
365
  prompt=prompt,
 
369
  )
370
  return image, status
371
 
 
372
  @spaces.GPU
373
+ def i2i_handler(instruction, image, timesteps, temperature, guidance):
374
  app = get_app()
375
  image_out, status = app.run_i2i(
376
  instruction=instruction,
 
383
 
384
 
385
  # ---------------------------
386
+ # Gradio UI (10 tabs + examples)
387
  # ---------------------------
388
 
389
  theme = gr.themes.Soft(primary_hue="blue", neutral_hue="gray")
390
 
391
  with gr.Blocks(
392
+ title="AIDAS Lab @ SNU - Omni-modal Diffusion",
393
  css=CUSTOM_CSS,
394
  theme=theme,
395
  js=FORCE_LIGHT_MODE_JS,
396
  ) as demo:
397
+ with gr.Row():
398
+ if LOGO_PATH.exists():
399
+ gr.Image(
400
+ value=str(LOGO_PATH),
401
+ show_label=False,
402
+ height=80,
403
+ interactive=False,
404
+ )
405
+ gr.Markdown(
406
+ "## Omni-modal Diffusion Foundation Model\n"
407
+ "### AIDAS Lab @ SNU"
408
  )
409
 
410
+ # ---- T2S ----
411
  with gr.Tab("Text β†’ Speech (T2S)"):
412
  with gr.Row():
413
  t2s_text = gr.Textbox(
 
429
  with gr.Row():
430
  t2s_speed = gr.Dropdown(["random", "normal", "fast", "slow"], value="random", label="Speed")
431
  t2s_pitch = gr.Dropdown(["random", "normal", "high", "low"], value="random", label="Pitch")
432
+ if T2S_EXAMPLES:
433
+ with gr.Accordion("Sample prompts", open=False):
434
+ gr.Examples(
435
+ examples=T2S_EXAMPLES,
436
+ inputs=[t2s_text],
437
+ examples_per_page=6,
438
+ )
439
  t2s_btn = gr.Button("Generate speech", variant="primary")
440
  t2s_btn.click(
441
  t2s_handler,
 
454
  outputs=[t2s_audio, t2s_status],
455
  )
456
 
457
+ # ---- S2S ----
458
  with gr.Tab("Speech β†’ Speech (S2S)"):
459
  s2s_audio_in = gr.Audio(type="filepath", label="Source speech", sources=["microphone", "upload"])
460
  s2s_audio_out = gr.Audio(type="numpy", label="Reply speech")
 
465
  s2s_block = gr.Slider(2, 512, value=128, step=2, label="Block length")
466
  s2s_temperature = gr.Slider(0.0, 2.0, value=0.0, step=0.05, label="Sampling temperature")
467
  s2s_cfg = gr.Slider(0.0, 6.0, value=4.0, step=0.1, label="CFG scale")
468
+ if S2S_EXAMPLES:
469
+ with gr.Accordion("Sample clips", open=False):
470
+ gr.Examples(
471
+ examples=S2S_EXAMPLES,
472
+ inputs=[s2s_audio_in],
473
+ examples_per_page=4,
474
+ )
475
  s2s_btn = gr.Button("Generate reply speech", variant="primary")
476
  s2s_btn.click(
477
  s2s_handler,
 
486
  outputs=[s2s_audio_out, s2s_status],
487
  )
488
 
489
+ # ---- S2T ----
490
  with gr.Tab("Speech β†’ Text (S2T)"):
491
  s2t_audio_in = gr.Audio(type="filepath", label="Speech input", sources=["microphone", "upload"])
492
  s2t_text_out = gr.Textbox(label="Transcription", lines=4)
 
500
  value="low_confidence",
501
  label="Remasking strategy",
502
  )
503
+ if S2T_EXAMPLES:
504
+ with gr.Accordion("Sample clips", open=False):
505
+ gr.Examples(
506
+ examples=S2T_EXAMPLES,
507
+ inputs=[s2t_audio_in],
508
+ examples_per_page=4,
509
+ )
510
  s2t_btn = gr.Button("Transcribe", variant="primary")
511
  s2t_btn.click(
512
  s2t_handler,
 
514
  outputs=[s2t_text_out, s2t_status],
515
  )
516
 
517
+ # ---- V2T ----
518
  with gr.Tab("Video β†’ Text (V2T)"):
519
  v2t_video_in = gr.Video(
520
  label="Upload or record video",
 
527
  v2t_steps = gr.Slider(2, 512, value=64, step=2, label="Denoising steps")
528
  v2t_block = gr.Slider(2, 512, value=64, step=2, label="Block length")
529
  v2t_max_tokens = gr.Slider(2, 512, value=64, step=2, label="Max new tokens")
530
+ if V2T_EXAMPLES:
531
+ with gr.Accordion("Sample videos", open=False):
532
+ gr.Examples(
533
+ examples=V2T_EXAMPLES,
534
+ inputs=[v2t_video_in],
535
+ examples_per_page=4,
536
+ )
537
  v2t_btn = gr.Button("Generate caption", variant="primary")
538
  v2t_btn.click(
539
  v2t_handler,
 
541
  outputs=[v2t_text_out, v2t_status],
542
  )
543
 
544
545
  with gr.Tab("Video β†’ Speech (V2S)"):
546
  v2s_video_in = gr.Video(
547
  label="Upload or record video",
 
560
  v2s_block = gr.Slider(2, 512, value=128, step=2, label="Block length")
561
  v2s_temperature = gr.Slider(0.0, 2.0, value=1.0, step=0.05, label="Sampling temperature")
562
  v2s_cfg = gr.Slider(0.0, 6.0, value=3.0, step=0.1, label="CFG scale")
563
+ # (optional v2s examples: if a 'v2s' folder is added later, wire it up with the same pattern)
564
  v2s_btn = gr.Button("Generate speech from video", variant="primary")
565
  v2s_btn.click(
566
  v2s_handler,
 
576
  outputs=[v2s_audio_out, v2s_status],
577
  )
578
 
579
+ # ---- I2S ----
580
  with gr.Tab("Image β†’ Speech (I2S)"):
581
  i2s_image_in = gr.Image(type="pil", label="Image input", sources=["upload"])
582
  i2s_prompt = gr.Textbox(
 
591
  i2s_block = gr.Slider(2, 512, value=256, step=2, label="Block length")
592
  i2s_temperature = gr.Slider(0.0, 2.0, value=1.0, step=0.05, label="Sampling temperature")
593
  i2s_cfg = gr.Slider(0.0, 6.0, value=3.0, step=0.1, label="CFG scale")
594
+ if I2S_EXAMPLES:
595
+ with gr.Accordion("Sample images", open=False):
596
+ gr.Examples(
597
+ examples=I2S_EXAMPLES,
598
+ inputs=[i2s_image_in],
599
+ examples_per_page=4,
600
+ )
601
  i2s_btn = gr.Button("Generate spoken description", variant="primary")
602
  i2s_btn.click(
603
  i2s_handler,
 
613
  outputs=[i2s_audio_out, i2s_status],
614
  )
615
 
616
+ # ---- Chat ----
617
+ with gr.Tab("Text Chat"):
618
+ chat_in = gr.Textbox(
619
+ label="Message",
620
+ lines=4,
621
+ placeholder="Ask anything. The model will reply in text.",
622
+ )
623
+ chat_out = gr.Textbox(label="Assistant reply", lines=6)
624
+ chat_status = gr.Textbox(label="Status", interactive=False)
625
+ with gr.Accordion("Advanced settings", open=False):
626
+ chat_max_tokens = gr.Slider(2, 512, value=64, step=2, label="Reply max tokens")
627
+ chat_steps = gr.Slider(2, 512, value=64, step=2, label="Refinement steps")
628
+ chat_block = gr.Slider(2, 512, value=64, step=2, label="Block length")
629
+ chat_temperature_slider = gr.Slider(0.0, 2.0, value=0.8, step=0.05, label="Sampling temperature")
630
+ if CHAT_EXAMPLES:
631
+ with gr.Accordion("Sample prompts", open=False):
632
+ gr.Examples(
633
+ examples=CHAT_EXAMPLES,
634
+ inputs=[chat_in],
635
+ examples_per_page=6,
636
+ )
637
+ chat_btn = gr.Button("Send", variant="primary")
638
+ chat_btn.click(
639
+ chat_handler,
640
+ inputs=[
641
+ chat_in,
642
+ chat_max_tokens,
643
+ chat_steps,
644
+ chat_block,
645
+ chat_temperature_slider,
646
+ ],
647
+ outputs=[chat_out, chat_status],
648
+ )
649
 
650
+ # ---- MMU ----
651
  with gr.Tab("MMU (2 images β†’ text)"):
652
  mmu_img_a = gr.Image(type="pil", label="Image A", sources=["upload"])
653
  mmu_img_b = gr.Image(type="pil", label="Image B", sources=["upload"])
 
663
  mmu_steps = gr.Slider(2, 512, value=256, step=2, label="Refinement steps")
664
  mmu_block = gr.Slider(2, 512, value=128, step=2, label="Block length")
665
  mmu_temperature = gr.Slider(0.0, 2.0, value=0.7, step=0.05, label="Sampling temperature")
666
+ if MMU_EXAMPLES:
667
+ with gr.Accordion("Sample MMU pair", open=False):
668
+ gr.Examples(
669
+ examples=MMU_EXAMPLES,
670
+ inputs=[mmu_img_a, mmu_img_b, mmu_question],
671
+ examples_per_page=1,
672
+ )
673
  mmu_btn = gr.Button("Answer about the two images", variant="primary")
674
  mmu_btn.click(
675
  mmu_handler,
 
685
  outputs=[mmu_answer, mmu_status],
686
  )
687
 
688
+ # ---- T2I ----
689
+ with gr.Tab("Text β†’ Image (T2I)"):
690
+ t2i_prompt = gr.Textbox(
691
+ label="Prompt",
692
+ lines=4,
693
+ placeholder="Describe the image you want to generate...",
694
+ )
695
+ t2i_image_out = gr.Image(label="Generated image")
696
+ t2i_status = gr.Textbox(label="Status", interactive=False)
697
+ with gr.Accordion("Advanced settings", open=False):
698
+ t2i_timesteps = gr.Slider(4, 128, value=32, step=2, label="Timesteps")
699
+ t2i_temperature = gr.Slider(0.0, 2.0, value=1.0, step=0.05, label="Sampling temperature")
700
+ t2i_guidance = gr.Slider(0.0, 8.0, value=3.5, step=0.1, label="CFG scale")
701
+ if T2I_EXAMPLES:
702
+ with gr.Accordion("Sample prompts", open=False):
703
+ gr.Examples(
704
+ examples=T2I_EXAMPLES,
705
+ inputs=[t2i_prompt],
706
+ examples_per_page=6,
707
+ )
708
+ t2i_btn = gr.Button("Generate image", variant="primary")
709
+ t2i_btn.click(
710
+ t2i_handler,
711
+ inputs=[t2i_prompt, t2i_timesteps, t2i_temperature, t2i_guidance],
712
+ outputs=[t2i_image_out, t2i_status],
713
+ )
714
+
715
+ # ---- I2I ----
716
+ with gr.Tab("Image Editing (I2I)"):
717
+ i2i_image_in = gr.Image(type="pil", label="Reference image", sources=["upload"])
718
+ i2i_instr = gr.Textbox(
719
+ label="Editing instruction",
720
+ lines=4,
721
+ placeholder="Describe how you want to edit the image...",
722
+ )
723
+ i2i_image_out = gr.Image(label="Edited image")
724
+ i2i_status = gr.Textbox(label="Status", interactive=False)
725
+ with gr.Accordion("Advanced settings", open=False):
726
+ i2i_timesteps = gr.Slider(4, 128, value=18, step=2, label="Timesteps")
727
+ i2i_temperature = gr.Slider(0.0, 2.0, value=1.0, step=0.05, label="Sampling temperature")
728
+ i2i_guidance = gr.Slider(0.0, 8.0, value=3.5, step=0.1, label="CFG scale")
729
+ i2i_btn = gr.Button("Apply edit", variant="primary")
730
+ i2i_btn.click(
731
+ i2i_handler,
732
+ inputs=[i2i_instr, i2i_image_in, i2i_timesteps, i2i_temperature, i2i_guidance],
733
+ outputs=[i2i_image_out, i2i_status],
734
+ )
735
 
 
 
736
 
737
  if __name__ == "__main__":
738
  demo.launch()
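
The handlers above follow the ZeroGPU pattern the module docstring describes: everything heavy is set up once globally, and only the inference entry points are decorated so a GPU is attached per request. A minimal standalone sketch of that pattern, with illustrative names rather than code from this commit:

    import gradio as gr
    import spaces  # Hugging Face Spaces helper that provides the ZeroGPU decorator

    @spaces.GPU  # GPU is allocated only while this handler runs
    def generate(prompt: str) -> str:
        # The real app would call the globally instantiated model here.
        return prompt.upper()

    with gr.Blocks() as sketch_demo:
        box_in = gr.Textbox(label="Prompt")
        box_out = gr.Textbox(label="Output")
        gr.Button("Run").click(generate, inputs=[box_in], outputs=[box_out])

    if __name__ == "__main__":
        sketch_demo.launch()
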