3v324v23 committed on
Commit e7c040d · 1 Parent(s): 3170149
Files changed (3)
  1. MMaDA/app.py +11 -14
  2. MMaDA/inference/gradio_multimodal_demo_inst.py +25 -71
  3. app.py +26 -21
MMaDA/app.py CHANGED
@@ -275,12 +275,11 @@ def build_zero_gpu_demo(app: OmadaDemo):
     )
 
     # ============================================================
-    # 8) MMU (2 images → text)
+    # 8) MMU (single image → text)
     # ============================================================
-    with gr.Tab("MMU (Dual-Image Reasoning)"):
+    with gr.Tab("MMU (Image → Text)"):
 
-        mmu_img1 = gr.Image(type="filepath", label="Image 1")
-        mmu_img2 = gr.Image(type="filepath", label="Image 2")
+        mmu_img = gr.Image(type="filepath", label="Input Image")
         mmu_prompt = gr.Textbox(label="Prompt")
         mmu_btn = gr.Button("Run MMU")
         mmu_out = gr.Textbox(label="Output")
@@ -289,25 +288,23 @@ def build_zero_gpu_demo(app: OmadaDemo):
         mmu_examples = []
         mmu_dir = DEMO_ROOT / "mmu"
         if mmu_dir.exists():
-            imgs = list(mmu_dir.glob("*.png"))
-            if len(imgs) >= 2:
+            for f in mmu_dir.glob("*.png"):
                 mmu_examples.append([
-                    str(imgs[0]),
-                    str(imgs[1]),
-                    "Describe the relation between two objects."
+                    str(f),
+                    "Describe the main subject of this image."
                 ])
 
         if len(mmu_examples) > 0:
             gr.Examples(
                 examples=mmu_examples,
-                inputs=[mmu_img1, mmu_img2, mmu_prompt],
+                inputs=[mmu_img, mmu_prompt],
                 outputs=[mmu_out, mmu_status],
-                fn=gpu_handler(app.run_mmu_dual),
+                fn=gpu_handler(app.run_mmu),
             )
 
         mmu_btn.click(
-            gpu_handler(app.run_mmu_dual),
-            inputs=[mmu_img1, mmu_img2, mmu_prompt],
+            gpu_handler(app.run_mmu),
+            inputs=[mmu_img, mmu_prompt],
             outputs=[mmu_out, mmu_status]
         )
 
@@ -395,4 +392,4 @@ def main():
 
 
 if __name__ == "__main__":
-    main()
+    main()
MMaDA/inference/gradio_multimodal_demo_inst.py CHANGED
@@ -521,21 +521,13 @@ if not V2S_EXAMPLES:
 I2S_EXAMPLES = _load_media_examples("i2s", {".png", ".jpg", ".jpeg", ".webp"})
 LOGO_DATA_URI = _load_logo_data()
 
-MMU_IMAGE_A = DEMO_ROOT / "mmu" / "1.jpg"
-MMU_IMAGE_B = DEMO_ROOT / "mmu" / "2.jpg"
-# MMU_IMAGE_C = DEMO_ROOT / "mmu" / "SD_IMG_00235_1.png"
-# MMU_IMAGE_D = DEMO_ROOT / "mmu" / "SD_IMG_00235_2.png"
-if MMU_IMAGE_A.exists() and MMU_IMAGE_B.exists():
+MMU_IMAGE = DEMO_ROOT / "mmu" / "1.jpg"
+# MMU_IMAGE_ALT = DEMO_ROOT / "mmu" / "SD_IMG_00235_1.png"
+if MMU_IMAGE.exists():
     MMU_EXAMPLES = [
-        # [
-        #     str(MMU_IMAGE_C),
-        #     str(MMU_IMAGE_D),
-        #     "What are the differences between the two images?"
-        # ],
         [
-            str(MMU_IMAGE_A),
-            str(MMU_IMAGE_B),
-            "What are the differences in coloring and physical features between animal1 and animal2 in the bird images?",
+            str(MMU_IMAGE),
+            "Describe the scene in this image in detail.",
         ]
     ]
 else:
@@ -1550,18 +1542,23 @@ class OmadaDemo:
         block_length: int,
         temperature: float,
     ) -> Tuple[str, str]:
+        """
+        MMU demo now consumes exactly one image. If callers pass a list (for
+        backwards compatibility), we keep only the first valid image.
+        """
         if isinstance(images, Image.Image):
-            normalized = [images]
+            normalized: List[Image.Image] = [images]
         elif images is None:
             normalized = []
         else:
             normalized = [img for img in images if img is not None]
 
         if not normalized:
-            return "", "Please provide at least one image for MMU reasoning."
+            return "", "Please provide an image for MMU reasoning."
 
+        primary_image = normalized[0]
         reply, status = self._mmu_answer(
-            normalized,
+            [primary_image],
             message,
             max_new_tokens=max_new_tokens,
             steps=steps,
@@ -1570,36 +1567,6 @@ class OmadaDemo:
         )
         return reply, status
 
-    # ------------------------------------------------------------------
-    # Multi-image MMU (2 Images → Text)
-    # ------------------------------------------------------------------
-    def run_mmu_dual(
-        self,
-        image_a: Optional[Image.Image],
-        image_b: Optional[Image.Image],
-        message: str,
-        max_new_tokens: int,
-        steps: int,
-        block_length: int,
-        temperature: float,
-    ) -> Tuple[str, str]:
-        images: List[Image.Image] = []
-        if image_a is not None:
-            images.append(image_a)
-        if image_b is not None:
-            images.append(image_b)
-        if len(images) < 2:
-            return "", "Please provide two images for MMU reasoning."
-
-        return self.run_mmu(
-            images,
-            message=message,
-            max_new_tokens=max_new_tokens,
-            steps=steps,
-            block_length=block_length,
-            temperature=temperature,
-        )
-
     # ------------------------------------------------------------------
     # Helpers
     # ------------------------------------------------------------------
@@ -2046,7 +2013,7 @@ def build_demo(app: OmadaDemo, share: bool, server_name: str, server_port: Optio
         "Speech": ["Text → Speech", "Speech → Speech", "Speech → Text"],
         "Video": ["Video → Text", "Video → Speech"],
         "Image": ["Text → Image", "Image Editing"],
-        "Multi-Modal": [],
+        "Multi-Modal": ["MMU (Image → Text)"],
         "Text": ["Text"],
     }
     default_group = "Speech"
@@ -2058,7 +2025,7 @@ def build_demo(app: OmadaDemo, share: bool, server_name: str, server_port: Optio
         "Video → Speech": "Upload video on the right. Optionally provide guidance here.",
         "Video → Text": "Upload video on the right, then leave notes here if needed.",
         "Text": "Ask anything and the assistant will reply with text.",
-        "MMU (2 Images → Text)": "Ask a question about the two uploaded images.",
+        "MMU (Image → Text)": "Ask a question about the uploaded image.",
         "Text → Image": "Describe the image you want to generate...",
         "Image Editing": "Describe how you want to edit the uploaded image...",
     }
@@ -2275,9 +2242,8 @@ def build_demo(app: OmadaDemo, share: bool, server_name: str, server_port: Optio
             )
             with gr.Column(visible=False, elem_classes=["omada-mode-panel"]) as mmu_panel:
                 with gr.Group(elem_classes=["omada-card"]):
-                    gr.Markdown("### Multi-image Reasoning")
-                    mmu_image_a = gr.Image(type="pil", label="Image A", sources=["upload"])
-                    mmu_image_b = gr.Image(type="pil", label="Image B", sources=["upload"])
+                    gr.Markdown("### Image Reasoning")
+                    mmu_image = gr.Image(type="pil", label="Image", sources=["upload"])
                 with gr.Accordion("Generation settings", open=True, elem_classes=["omada-advanced"]):
                     mmu_max_tokens = gr.Slider(2, 512, value=256, label="Answer max tokens", step=2)
                     with gr.Row():
@@ -2290,7 +2256,7 @@ def build_demo(app: OmadaDemo, share: bool, server_name: str, server_port: Optio
             with gr.Column(elem_classes=["omada-examples"]):
                 gr.Examples(
                     examples=MMU_EXAMPLES,
-                    inputs=[mmu_image_a, mmu_image_b, chat_input],
+                    inputs=[mmu_image, chat_input],
                     examples_per_page=1,
                 )
 
@@ -2302,7 +2268,7 @@ def build_demo(app: OmadaDemo, share: bool, server_name: str, server_port: Optio
         show_s2t = group == "Speech" and mode == "Speech → Text"
         show_v2t = group == "Video" and mode == "Video → Text"
         show_chat = group == "Text" and mode == "Text"
-        show_mmu = group == "Multi-Modal" and mode == "MMU (2 Images → Text)"
+        show_mmu = group == "Multi-Modal" and mode == "MMU (Image → Text)"
         show_image = group == "Image" and mode in ("Text → Image", "Image Editing")
         placeholder = placeholder_map.get(mode, chat_input.placeholder)
         image_mode_value = "Generation" if mode == "Text → Image" else "Editing"
@@ -2430,12 +2396,6 @@ def build_demo(app: OmadaDemo, share: bool, server_name: str, server_port: Optio
             i2i_timesteps,
             i2i_temperature,
             i2i_guidance,
-            ti2ti_image,
-            ti2ti_text_tokens,
-            ti2ti_img_timesteps,
-            ti2ti_text_timesteps,
-            ti2ti_temperature,
-            ti2ti_guidance,
             v2s_video_path,
             v2s_max_tokens,
             v2s_steps,
@@ -2446,8 +2406,7 @@ def build_demo(app: OmadaDemo, share: bool, server_name: str, server_port: Optio
             chat_steps,
             chat_block,
             chat_temperature,
-            mmu_image_a,
-            mmu_image_b,
+            mmu_image,
             mmu_max_tokens,
             mmu_steps,
             mmu_block,
@@ -2536,10 +2495,9 @@ def build_demo(app: OmadaDemo, share: bool, server_name: str, server_port: Optio
             )
             response = _render_text_message(status, reply)
             display_user_raw = message or "[Text request]"
-        elif mode == "MMU (2 Images → Text)":
-            reply, status = app.run_mmu_dual(
-                mmu_image_a,
-                mmu_image_b,
+        elif mode == "MMU (Image → Text)":
+            reply, status = app.run_mmu(
+                [mmu_image] if mmu_image is not None else [],
                 message,
                 mmu_max_tokens,
                 mmu_steps,
@@ -2633,8 +2591,7 @@ def build_demo(app: OmadaDemo, share: bool, server_name: str, server_port: Optio
             chat_steps,
             chat_block,
             chat_temperature,
-            mmu_image_a,
-            mmu_image_b,
+            mmu_image,
             mmu_max_tokens,
             mmu_steps,
             mmu_block,
@@ -2656,8 +2613,6 @@ def build_demo(app: OmadaDemo, share: bool, server_name: str, server_port: Optio
             gr.update(value=None),
             gr.update(value=None),
             gr.update(value=None),
-            gr.update(value=None),
-            gr.update(value=None),
         )
 
         clear_button.click(
@@ -2672,8 +2627,7 @@ def build_demo(app: OmadaDemo, share: bool, server_name: str, server_port: Optio
             i2s_image,
             v2s_video,
             i2i_image,
-            mmu_image_a,
-            mmu_image_b,
+            mmu_image,
         ],
     )
 
app.py CHANGED
@@ -12,6 +12,7 @@ import sys
 import subprocess
 import importlib
 from pathlib import Path
+from typing import List
 
 import gradio as gr
 import spaces
@@ -256,17 +257,24 @@ V2T_EXAMPLES = _load_media_examples("v2t", {".mp4", ".mov", ".avi", ".webm"})
 V2S_EXAMPLES = _load_media_examples("v2t", {".mp4", ".mov", ".avi", ".webm"})
 
 # MMU images (and fallback for I2S)
-MMU_IMAGE_A = ASSET_ROOT / "mmu" / "1.jpg"
-MMU_IMAGE_B = ASSET_ROOT / "mmu" / "2.jpg"
-if MMU_IMAGE_A.exists() and MMU_IMAGE_B.exists():
-    MMU_EXAMPLES = [[str(MMU_IMAGE_A), str(MMU_IMAGE_B),
-                     "What are the differences in coloring and physical features between animal1 and animal2 in the bird images?"]]
-else:
-    MMU_EXAMPLES = []
+MMU_DIR = ASSET_ROOT / "mmu"
+MMU_EXAMPLES: List[List[str]] = []
+if MMU_DIR.exists():
+    for path in sorted(
+        [
+            p
+            for p in MMU_DIR.iterdir()
+            if p.suffix.lower() in {".png", ".jpg", ".jpeg", ".webp"}
+        ]
+    ):
+        MMU_EXAMPLES.append([
+            str(path),
+            "Describe the important objects and their relationships in this image.",
+        ])
 
 I2S_EXAMPLES = _load_media_examples("i2s", {".png", ".jpg", ".jpeg", ".webp"})
 if not I2S_EXAMPLES and MMU_EXAMPLES:
-    # use image A from MMU as sample I2S input
+    # use the first MMU sample image if no dedicated I2S example exists
     I2S_EXAMPLES = [[MMU_EXAMPLES[0][0]]]
 
 
@@ -407,11 +415,10 @@ def chat_handler(message, max_tokens, steps, block_len, temperature):
     return text, status
 
 @spaces.GPU
-def mmu_handler(image_a, image_b, question, max_tokens, steps, block_len, temperature):
+def mmu_handler(image, question, max_tokens, steps, block_len, temperature):
     app = get_app()
-    text, status = app.run_mmu_dual(
-        image_a=image_a,
-        image_b=image_b,
+    text, status = app.run_mmu(
+        images=image,
         message=question,
         max_new_tokens=int(max_tokens),
         steps=int(steps),
@@ -827,13 +834,12 @@ with gr.Blocks(
     )
 
     # ---- MMU ----
-    with gr.Tab("MMU (2 images → text)"):
-        mmu_img_a = gr.Image(type="pil", label="Image A", sources=["upload"])
-        mmu_img_b = gr.Image(type="pil", label="Image B", sources=["upload"])
+    with gr.Tab("MMU (Image → Text)"):
+        mmu_img = gr.Image(type="pil", label="Input image", sources=["upload"])
         mmu_question = gr.Textbox(
             label="Question",
             lines=3,
-            placeholder="Ask about the relationship or differences between the two images.",
+            placeholder="Ask about the scene, objects, or context of the image.",
         )
         mmu_answer = gr.Textbox(label="Answer", lines=6)
         mmu_status = gr.Textbox(label="Status", interactive=False)
@@ -843,18 +849,17 @@ with gr.Blocks(
         mmu_block = gr.Slider(2, 512, value=128, step=2, label="Block length")
        mmu_temperature = gr.Slider(0.0, 2.0, value=0.7, step=0.05, label="Sampling temperature")
         if MMU_EXAMPLES:
-            with gr.Accordion("Sample MMU pair", open=False):
+            with gr.Accordion("Sample MMU prompts", open=False):
                 gr.Examples(
                     examples=MMU_EXAMPLES,
-                    inputs=[mmu_img_a, mmu_img_b, mmu_question],
+                    inputs=[mmu_img, mmu_question],
                     examples_per_page=1,
                 )
-        mmu_btn = gr.Button("Answer about the two images", variant="primary")
+        mmu_btn = gr.Button("Answer about the image", variant="primary")
         mmu_btn.click(
             mmu_handler,
             inputs=[
-                mmu_img_a,
-                mmu_img_b,
+                mmu_img,
                 mmu_question,
                 mmu_max_tokens,
                 mmu_steps,
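
For reference, a minimal sketch of how the reworked single-image MMU entry point could be exercised outside the Gradio UI. `run_mmu` and the generation defaults (256 answer tokens, 128 steps, block length 128, temperature 0.7) are taken from the diff above; the import path, the no-argument `OmadaDemo()` constructor, and the image path are illustrative assumptions, not part of this commit.

# Hypothetical smoke test for the single-image MMU path (assumptions noted above).
from PIL import Image
from MMaDA.inference.gradio_multimodal_demo_inst import OmadaDemo  # import path assumed

demo = OmadaDemo()  # constructor arguments, if any, omitted here
image = Image.open("demo/mmu/1.jpg")  # any RGB image; this path is illustrative

# run_mmu accepts a single PIL image or a list; lists are normalized to the first valid image.
reply, status = demo.run_mmu(
    image,
    message="Describe the scene in this image in detail.",
    max_new_tokens=256,
    steps=128,
    block_length=128,
    temperature=0.7,
)
print(status)
print(reply)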