3v324v23 committed on
Commit e7c040d · 1 Parent(s): 3170149
Files changed (3)
  1. MMaDA/app.py +11 -14
  2. MMaDA/inference/gradio_multimodal_demo_inst.py +25 -71
  3. app.py +26 -21
MMaDA/app.py CHANGED
@@ -275,12 +275,11 @@ def build_zero_gpu_demo(app: OmadaDemo):
     )
 
     # ============================================================
-    # 8) MMU (2 images → text)
+    # 8) MMU (single image → text)
     # ============================================================
-    with gr.Tab("MMU (Dual-Image Reasoning)"):
+    with gr.Tab("MMU (Image → Text)"):
 
-        mmu_img1 = gr.Image(type="filepath", label="Image 1")
-        mmu_img2 = gr.Image(type="filepath", label="Image 2")
+        mmu_img = gr.Image(type="filepath", label="Input Image")
         mmu_prompt = gr.Textbox(label="Prompt")
         mmu_btn = gr.Button("Run MMU")
         mmu_out = gr.Textbox(label="Output")
@@ -289,25 +288,23 @@ def build_zero_gpu_demo(app: OmadaDemo):
         mmu_examples = []
         mmu_dir = DEMO_ROOT / "mmu"
         if mmu_dir.exists():
-            imgs = list(mmu_dir.glob("*.png"))
-            if len(imgs) >= 2:
+            for f in mmu_dir.glob("*.png"):
                 mmu_examples.append([
-                    str(imgs[0]),
-                    str(imgs[1]),
-                    "Describe the relation between two objects."
+                    str(f),
+                    "Describe the main subject of this image."
                 ])
 
         if len(mmu_examples) > 0:
             gr.Examples(
                 examples=mmu_examples,
-                inputs=[mmu_img1, mmu_img2, mmu_prompt],
+                inputs=[mmu_img, mmu_prompt],
                 outputs=[mmu_out, mmu_status],
-                fn=gpu_handler(app.run_mmu_dual),
+                fn=gpu_handler(app.run_mmu),
             )
 
         mmu_btn.click(
-            gpu_handler(app.run_mmu_dual),
-            inputs=[mmu_img1, mmu_img2, mmu_prompt],
+            gpu_handler(app.run_mmu),
+            inputs=[mmu_img, mmu_prompt],
             outputs=[mmu_out, mmu_status]
         )
 
@@ -395,4 +392,4 @@ def main():
 
 
 if __name__ == "__main__":
-    main()
+    main()
MMaDA/inference/gradio_multimodal_demo_inst.py CHANGED
@@ -521,21 +521,13 @@ if not V2S_EXAMPLES:
 I2S_EXAMPLES = _load_media_examples("i2s", {".png", ".jpg", ".jpeg", ".webp"})
 LOGO_DATA_URI = _load_logo_data()
 
-MMU_IMAGE_A = DEMO_ROOT / "mmu" / "1.jpg"
-MMU_IMAGE_B = DEMO_ROOT / "mmu" / "2.jpg"
-# MMU_IMAGE_C = DEMO_ROOT / "mmu" / "SD_IMG_00235_1.png"
-# MMU_IMAGE_D = DEMO_ROOT / "mmu" / "SD_IMG_00235_2.png"
-if MMU_IMAGE_A.exists() and MMU_IMAGE_B.exists():
+MMU_IMAGE = DEMO_ROOT / "mmu" / "1.jpg"
+# MMU_IMAGE_ALT = DEMO_ROOT / "mmu" / "SD_IMG_00235_1.png"
+if MMU_IMAGE.exists():
     MMU_EXAMPLES = [
-        # [
-        #     str(MMU_IMAGE_C),
-        #     str(MMU_IMAGE_D),
-        #     "What are the differences between the two images?"
-        # ],
         [
-            str(MMU_IMAGE_A),
-            str(MMU_IMAGE_B),
-            "What are the differences in coloring and physical features between animal1 and animal2 in the bird images?",
+            str(MMU_IMAGE),
+            "Describe the scene in this image in detail.",
         ]
     ]
 else:
@@ -1550,18 +1542,23 @@ class OmadaDemo:
         block_length: int,
         temperature: float,
     ) -> Tuple[str, str]:
+        """
+        MMU demo now consumes exactly one image. If callers pass a list (for
+        backwards compatibility), we keep only the first valid image.
+        """
         if isinstance(images, Image.Image):
-            normalized = [images]
+            normalized: List[Image.Image] = [images]
         elif images is None:
             normalized = []
         else:
             normalized = [img for img in images if img is not None]
 
         if not normalized:
-            return "", "Please provide at least one image for MMU reasoning."
+            return "", "Please provide an image for MMU reasoning."
 
+        primary_image = normalized[0]
         reply, status = self._mmu_answer(
-            normalized,
+            [primary_image],
             message,
             max_new_tokens=max_new_tokens,
             steps=steps,
@@ -1570,36 +1567,6 @@ class OmadaDemo:
         )
         return reply, status
 
-    # ------------------------------------------------------------------
-    # Multi-image MMU (2 Images → Text)
-    # ------------------------------------------------------------------
-    def run_mmu_dual(
-        self,
-        image_a: Optional[Image.Image],
-        image_b: Optional[Image.Image],
-        message: str,
-        max_new_tokens: int,
-        steps: int,
-        block_length: int,
-        temperature: float,
-    ) -> Tuple[str, str]:
-        images: List[Image.Image] = []
-        if image_a is not None:
-            images.append(image_a)
-        if image_b is not None:
-            images.append(image_b)
-        if len(images) < 2:
-            return "", "Please provide two images for MMU reasoning."
-
-        return self.run_mmu(
-            images,
-            message=message,
-            max_new_tokens=max_new_tokens,
-            steps=steps,
-            block_length=block_length,
-            temperature=temperature,
-        )
-
     # ------------------------------------------------------------------
     # Helpers
     # ------------------------------------------------------------------
@@ -2046,7 +2013,7 @@ def build_demo(app: OmadaDemo, share: bool, server_name: str, server_port: Optio
         "Speech": ["Text → Speech", "Speech → Speech", "Speech → Text"],
         "Video": ["Video → Text", "Video → Speech"],
         "Image": ["Text → Image", "Image Editing"],
-        "Multi-Modal": [],
+        "Multi-Modal": ["MMU (Image → Text)"],
         "Text": ["Text"],
     }
     default_group = "Speech"
@@ -2058,7 +2025,7 @@ def build_demo(app: OmadaDemo, share: bool, server_name: str, server_port: Optio
         "Video → Speech": "Upload video on the right. Optionally provide guidance here.",
         "Video → Text": "Upload video on the right, then leave notes here if needed.",
         "Text": "Ask anything and the assistant will reply with text.",
-        "MMU (2 Images → Text)": "Ask a question about the two uploaded images.",
+        "MMU (Image → Text)": "Ask a question about the uploaded image.",
         "Text → Image": "Describe the image you want to generate...",
         "Image Editing": "Describe how you want to edit the uploaded image...",
     }
@@ -2275,9 +2242,8 @@ def build_demo(app: OmadaDemo, share: bool, server_name: str, server_port: Optio
             )
             with gr.Column(visible=False, elem_classes=["omada-mode-panel"]) as mmu_panel:
                 with gr.Group(elem_classes=["omada-card"]):
-                    gr.Markdown("### Multi-image Reasoning")
-                    mmu_image_a = gr.Image(type="pil", label="Image A", sources=["upload"])
-                    mmu_image_b = gr.Image(type="pil", label="Image B", sources=["upload"])
+                    gr.Markdown("### Image Reasoning")
+                    mmu_image = gr.Image(type="pil", label="Image", sources=["upload"])
                 with gr.Accordion("Generation settings", open=True, elem_classes=["omada-advanced"]):
                     mmu_max_tokens = gr.Slider(2, 512, value=256, label="Answer max tokens", step=2)
                     with gr.Row():
@@ -2290,7 +2256,7 @@ def build_demo(app: OmadaDemo, share: bool, server_name: str, server_port: Optio
             with gr.Column(elem_classes=["omada-examples"]):
                 gr.Examples(
                     examples=MMU_EXAMPLES,
-                    inputs=[mmu_image_a, mmu_image_b, chat_input],
+                    inputs=[mmu_image, chat_input],
                     examples_per_page=1,
                 )
 
@@ -2302,7 +2268,7 @@ def build_demo(app: OmadaDemo, share: bool, server_name: str, server_port: Optio
         show_s2t = group == "Speech" and mode == "Speech → Text"
         show_v2t = group == "Video" and mode == "Video → Text"
         show_chat = group == "Text" and mode == "Text"
-        show_mmu = group == "Multi-Modal" and mode == "MMU (2 Images → Text)"
+        show_mmu = group == "Multi-Modal" and mode == "MMU (Image → Text)"
         show_image = group == "Image" and mode in ("Text → Image", "Image Editing")
         placeholder = placeholder_map.get(mode, chat_input.placeholder)
         image_mode_value = "Generation" if mode == "Text → Image" else "Editing"
@@ -2430,12 +2396,6 @@ def build_demo(app: OmadaDemo, share: bool, server_name: str, server_port: Optio
             i2i_timesteps,
             i2i_temperature,
             i2i_guidance,
-            ti2ti_image,
-            ti2ti_text_tokens,
-            ti2ti_img_timesteps,
-            ti2ti_text_timesteps,
-            ti2ti_temperature,
-            ti2ti_guidance,
             v2s_video_path,
             v2s_max_tokens,
             v2s_steps,
@@ -2446,8 +2406,7 @@ def build_demo(app: OmadaDemo, share: bool, server_name: str, server_port: Optio
             chat_steps,
             chat_block,
             chat_temperature,
-            mmu_image_a,
-            mmu_image_b,
+            mmu_image,
             mmu_max_tokens,
             mmu_steps,
             mmu_block,
@@ -2536,10 +2495,9 @@ def build_demo(app: OmadaDemo, share: bool, server_name: str, server_port: Optio
             )
             response = _render_text_message(status, reply)
             display_user_raw = message or "[Text request]"
-        elif mode == "MMU (2 Images → Text)":
-            reply, status = app.run_mmu_dual(
-                mmu_image_a,
-                mmu_image_b,
+        elif mode == "MMU (Image → Text)":
+            reply, status = app.run_mmu(
+                [mmu_image] if mmu_image is not None else [],
                 message,
                 mmu_max_tokens,
                 mmu_steps,
@@ -2633,8 +2591,7 @@ def build_demo(app: OmadaDemo, share: bool, server_name: str, server_port: Optio
             chat_steps,
             chat_block,
             chat_temperature,
-            mmu_image_a,
-            mmu_image_b,
+            mmu_image,
             mmu_max_tokens,
             mmu_steps,
             mmu_block,
@@ -2656,8 +2613,6 @@ def build_demo(app: OmadaDemo, share: bool, server_name: str, server_port: Optio
             gr.update(value=None),
             gr.update(value=None),
             gr.update(value=None),
-            gr.update(value=None),
-            gr.update(value=None),
         )
 
         clear_button.click(
@@ -2672,8 +2627,7 @@ def build_demo(app: OmadaDemo, share: bool, server_name: str, server_port: Optio
             i2s_image,
             v2s_video,
             i2i_image,
-            mmu_image_a,
-            mmu_image_b,
+            mmu_image,
         ],
     )
 
app.py CHANGED
@@ -12,6 +12,7 @@ import sys
 import subprocess
 import importlib
 from pathlib import Path
+from typing import List
 
 import gradio as gr
 import spaces
@@ -256,17 +257,24 @@ V2T_EXAMPLES = _load_media_examples("v2t", {".mp4", ".mov", ".avi", ".webm"})
 V2S_EXAMPLES = _load_media_examples("v2t", {".mp4", ".mov", ".avi", ".webm"})
 
 # MMU images (and fallback for I2S)
-MMU_IMAGE_A = ASSET_ROOT / "mmu" / "1.jpg"
-MMU_IMAGE_B = ASSET_ROOT / "mmu" / "2.jpg"
-if MMU_IMAGE_A.exists() and MMU_IMAGE_B.exists():
-    MMU_EXAMPLES = [[str(MMU_IMAGE_A), str(MMU_IMAGE_B),
-                     "What are the differences in coloring and physical features between animal1 and animal2 in the bird images?"]]
-else:
-    MMU_EXAMPLES = []
+MMU_DIR = ASSET_ROOT / "mmu"
+MMU_EXAMPLES: List[List[str]] = []
+if MMU_DIR.exists():
+    for path in sorted(
+        [
+            p
+            for p in MMU_DIR.iterdir()
+            if p.suffix.lower() in {".png", ".jpg", ".jpeg", ".webp"}
+        ]
+    ):
+        MMU_EXAMPLES.append([
+            str(path),
+            "Describe the important objects and their relationships in this image.",
+        ])
 
 I2S_EXAMPLES = _load_media_examples("i2s", {".png", ".jpg", ".jpeg", ".webp"})
 if not I2S_EXAMPLES and MMU_EXAMPLES:
-    # use image A from MMU as sample I2S input
+    # use the first MMU sample image if no dedicated I2S example exists
     I2S_EXAMPLES = [[MMU_EXAMPLES[0][0]]]
 
 
@@ -407,11 +415,10 @@ def chat_handler(message, max_tokens, steps, block_len, temperature):
     return text, status
 
 @spaces.GPU
-def mmu_handler(image_a, image_b, question, max_tokens, steps, block_len, temperature):
+def mmu_handler(image, question, max_tokens, steps, block_len, temperature):
     app = get_app()
-    text, status = app.run_mmu_dual(
-        image_a=image_a,
-        image_b=image_b,
+    text, status = app.run_mmu(
+        images=image,
         message=question,
         max_new_tokens=int(max_tokens),
         steps=int(steps),
@@ -827,13 +834,12 @@ with gr.Blocks(
     )
 
     # ---- MMU ----
-    with gr.Tab("MMU (2 images → text)"):
-        mmu_img_a = gr.Image(type="pil", label="Image A", sources=["upload"])
-        mmu_img_b = gr.Image(type="pil", label="Image B", sources=["upload"])
+    with gr.Tab("MMU (Image → Text)"):
+        mmu_img = gr.Image(type="pil", label="Input image", sources=["upload"])
         mmu_question = gr.Textbox(
             label="Question",
             lines=3,
-            placeholder="Ask about the relationship or differences between the two images.",
+            placeholder="Ask about the scene, objects, or context of the image.",
         )
         mmu_answer = gr.Textbox(label="Answer", lines=6)
         mmu_status = gr.Textbox(label="Status", interactive=False)
@@ -843,18 +849,17 @@ with gr.Blocks(
         mmu_block = gr.Slider(2, 512, value=128, step=2, label="Block length")
        mmu_temperature = gr.Slider(0.0, 2.0, value=0.7, step=0.05, label="Sampling temperature")
         if MMU_EXAMPLES:
-            with gr.Accordion("Sample MMU pair", open=False):
+            with gr.Accordion("Sample MMU prompts", open=False):
                 gr.Examples(
                     examples=MMU_EXAMPLES,
-                    inputs=[mmu_img_a, mmu_img_b, mmu_question],
+                    inputs=[mmu_img, mmu_question],
                     examples_per_page=1,
                 )
-        mmu_btn = gr.Button("Answer about the two images", variant="primary")
+        mmu_btn = gr.Button("Answer about the image", variant="primary")
         mmu_btn.click(
             mmu_handler,
             inputs=[
-                mmu_img_a,
-                mmu_img_b,
+                mmu_img,
                 mmu_question,
                 mmu_max_tokens,
                 mmu_steps,
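
For reference, a minimal sketch of how the reworked single-image MMU entry point could be exercised outside the Gradio UI. `run_mmu` and the generation defaults (256 answer tokens, 128 steps, block length 128, temperature 0.7) are taken from the diff above; the import path, the no-argument `OmadaDemo()` constructor, and the image path are illustrative assumptions, not part of this commit.

# Hypothetical smoke test for the single-image MMU path (assumptions noted above).
from PIL import Image
from MMaDA.inference.gradio_multimodal_demo_inst import OmadaDemo  # import path assumed

demo = OmadaDemo()  # constructor arguments, if any, omitted here
image = Image.open("demo/mmu/1.jpg")  # any RGB image; this path is illustrative

# run_mmu accepts a single PIL image or a list; lists are normalized to the first valid image.
reply, status = demo.run_mmu(
    image,
    message="Describe the scene in this image in detail.",
    max_new_tokens=256,
    steps=128,
    block_length=128,
    temperature=0.7,
)
print(status)
print(reply)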