Spaces: Running on Zero

Files changed:
- MMaDA/app.py (+11 / -14)
- MMaDA/inference/gradio_multimodal_demo_inst.py (+25 / -71)
- app.py (+26 / -21)
MMaDA/app.py

@@ -275,12 +275,11 @@ def build_zero_gpu_demo(app: OmadaDemo):
         )

         # ============================================================
-        # 8) MMU (two images → text)
+        # 8) MMU (single image → text)
         # ============================================================
-        with gr.Tab("MMU (2 Images → Text)"):
+        with gr.Tab("MMU (Image → Text)"):

-            mmu_img1 = gr.Image(type="filepath", label="Image 1")
-            mmu_img2 = gr.Image(type="filepath", label="Image 2")
+            mmu_img = gr.Image(type="filepath", label="Input Image")
             mmu_prompt = gr.Textbox(label="Prompt")
             mmu_btn = gr.Button("Run MMU")
             mmu_out = gr.Textbox(label="Output")
@@ -289,25 +288,23 @@ def build_zero_gpu_demo(app: OmadaDemo):
             mmu_examples = []
             mmu_dir = DEMO_ROOT / "mmu"
             if mmu_dir.exists():
-                imgs = sorted(mmu_dir.glob("*.png"))
-                if len(imgs) >= 2:
+                for f in mmu_dir.glob("*.png"):
                     mmu_examples.append([
-                        str(imgs[0]),
-                        str(imgs[1]),
-                        "Describe the relation between two objects."
+                        str(f),
+                        "Describe the main subject of this image."
                     ])

             if len(mmu_examples) > 0:
                 gr.Examples(
                     examples=mmu_examples,
-                    inputs=[mmu_img1, mmu_img2, mmu_prompt],
+                    inputs=[mmu_img, mmu_prompt],
                     outputs=[mmu_out, mmu_status],
-                    fn=gpu_handler(app.run_mmu_dual),
+                    fn=gpu_handler(app.run_mmu),
                 )

             mmu_btn.click(
-                gpu_handler(app.run_mmu_dual),
-                inputs=[mmu_img1, mmu_img2, mmu_prompt],
+                gpu_handler(app.run_mmu),
+                inputs=[mmu_img, mmu_prompt],
                 outputs=[mmu_out, mmu_status]
             )

@@ -395,4 +392,4 @@ def main():


 if __name__ == "__main__":
-    main()
+    main()
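In this ZeroGPU build, both `gr.Examples` and the button bind `gpu_handler(app.run_mmu)` rather than the method itself. The `gpu_handler` helper is not shown in this commit; on a ZeroGPU Space it is typically a thin wrapper that applies `spaces.GPU` so each call is scheduled onto an on-demand GPU worker. A minimal sketch of such a wrapper, purely illustrative and not taken from the repository:

import functools

import spaces


def gpu_handler(fn):
    # Decorate a bound demo method so Hugging Face Spaces allocates a GPU
    # only for the duration of each call (ZeroGPU scheduling).
    @spaces.GPU
    @functools.wraps(fn)
    def _wrapped(*args, **kwargs):
        # Forward the Gradio inputs unchanged; run_mmu returns (answer, status),
        # which maps onto outputs=[mmu_out, mmu_status].
        return fn(*args, **kwargs)

    return _wrapped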
MMaDA/inference/gradio_multimodal_demo_inst.py

@@ -521,21 +521,13 @@ if not V2S_EXAMPLES:
 I2S_EXAMPLES = _load_media_examples("i2s", {".png", ".jpg", ".jpeg", ".webp"})
 LOGO_DATA_URI = _load_logo_data()

-MMU_IMAGE_A = DEMO_ROOT / "mmu" / …
-MMU_IMAGE_B = DEMO_ROOT / "mmu" / …
-# MMU_IMAGE_C = DEMO_ROOT / "mmu" / "SD_IMG_00235_1.png"
-# MMU_IMAGE_D = DEMO_ROOT / "mmu" / "SD_IMG_00235_2.png"
-if MMU_IMAGE_A.exists() and MMU_IMAGE_B.exists():
+MMU_IMAGE = DEMO_ROOT / "mmu" / "1.jpg"
+# MMU_IMAGE_ALT = DEMO_ROOT / "mmu" / "SD_IMG_00235_1.png"
+if MMU_IMAGE.exists():
     MMU_EXAMPLES = [
-        # [
-        #     str(MMU_IMAGE_C),
-        #     str(MMU_IMAGE_D),
-        #     "What are the differences between the two images?"
-        # ],
         [
-            str(MMU_IMAGE_A),
-            str(MMU_IMAGE_B),
-            "What are the differences in coloring and physical features between animal1 and animal2 in the bird images?",
+            str(MMU_IMAGE),
+            "Describe the scene in this image in detail.",
         ]
     ]
 else:
@@ -1550,18 +1542,23 @@ class OmadaDemo:
         block_length: int,
         temperature: float,
     ) -> Tuple[str, str]:
+        """
+        MMU demo now consumes exactly one image. If callers pass a list (for
+        backwards compatibility), we keep only the first valid image.
+        """
         if isinstance(images, Image.Image):
-            normalized = [images]
+            normalized: List[Image.Image] = [images]
         elif images is None:
             normalized = []
         else:
             normalized = [img for img in images if img is not None]

         if not normalized:
-            return "", "Please provide …"
+            return "", "Please provide an image for MMU reasoning."

+        primary_image = normalized[0]
         reply, status = self._mmu_answer(
-            normalized,
+            [primary_image],
             message,
             max_new_tokens=max_new_tokens,
             steps=steps,
@@ -1570,36 +1567,6 @@ class OmadaDemo:
         )
         return reply, status

-    # ------------------------------------------------------------------
-    # Multi-image MMU (2 Images → Text)
-    # ------------------------------------------------------------------
-    def run_mmu_dual(
-        self,
-        image_a: Optional[Image.Image],
-        image_b: Optional[Image.Image],
-        message: str,
-        max_new_tokens: int,
-        steps: int,
-        block_length: int,
-        temperature: float,
-    ) -> Tuple[str, str]:
-        images: List[Image.Image] = []
-        if image_a is not None:
-            images.append(image_a)
-        if image_b is not None:
-            images.append(image_b)
-        if len(images) < 2:
-            return "", "Please provide two images for MMU reasoning."
-
-        return self.run_mmu(
-            images,
-            message=message,
-            max_new_tokens=max_new_tokens,
-            steps=steps,
-            block_length=block_length,
-            temperature=temperature,
-        )
-
     # ------------------------------------------------------------------
     # Helpers
     # ------------------------------------------------------------------
@@ -2046,7 +2013,7 @@ def build_demo(app: OmadaDemo, share: bool, server_name: str, server_port: Optional[int]):
        "Speech": ["Text → Speech", "Speech → Speech", "Speech → Text"],
        "Video": ["Video → Text", "Video → Speech"],
        "Image": ["Text → Image", "Image Editing"],
-       "Multi-Modal": [],
+       "Multi-Modal": ["MMU (Image → Text)"],
        "Text": ["Text"],
    }
    default_group = "Speech"
@@ -2058,7 +2025,7 @@ def build_demo(app: OmadaDemo, share: bool, server_name: str, server_port: Optional[int]):
        "Video → Speech": "Upload video on the right. Optionally provide guidance here.",
        "Video → Text": "Upload video on the right, then leave notes here if needed.",
        "Text": "Ask anything and the assistant will reply with text.",
-       "MMU (2 Images → Text)": "…",
+       "MMU (Image → Text)": "Ask a question about the uploaded image.",
        "Text → Image": "Describe the image you want to generate...",
        "Image Editing": "Describe how you want to edit the uploaded image...",
    }
@@ -2275,9 +2242,8 @@ def build_demo(app: OmadaDemo, share: bool, server_name: str, server_port: Optional[int]):
            )
            with gr.Column(visible=False, elem_classes=["omada-mode-panel"]) as mmu_panel:
                with gr.Group(elem_classes=["omada-card"]):
-                   gr.Markdown("### …")
-                   mmu_image_a = gr.Image(type="pil", label="Image A", sources=["upload"])
-                   mmu_image_b = gr.Image(type="pil", label="Image B", sources=["upload"])
+                   gr.Markdown("### Image Reasoning")
+                   mmu_image = gr.Image(type="pil", label="Image", sources=["upload"])
                with gr.Accordion("Generation settings", open=True, elem_classes=["omada-advanced"]):
                    mmu_max_tokens = gr.Slider(2, 512, value=256, label="Answer max tokens", step=2)
                    with gr.Row():
@@ -2290,7 +2256,7 @@ def build_demo(app: OmadaDemo, share: bool, server_name: str, server_port: Optional[int]):
            with gr.Column(elem_classes=["omada-examples"]):
                gr.Examples(
                    examples=MMU_EXAMPLES,
-                   inputs=[mmu_image_a, mmu_image_b, chat_input],
+                   inputs=[mmu_image, chat_input],
                    examples_per_page=1,
                )

@@ -2302,7 +2268,7 @@ def build_demo(app: OmadaDemo, share: bool, server_name: str, server_port: Optional[int]):
        show_s2t = group == "Speech" and mode == "Speech → Text"
        show_v2t = group == "Video" and mode == "Video → Text"
        show_chat = group == "Text" and mode == "Text"
-       show_mmu = group == "Multi-Modal" and mode == "MMU (2 Images → Text)"
+       show_mmu = group == "Multi-Modal" and mode == "MMU (Image → Text)"
        show_image = group == "Image" and mode in ("Text → Image", "Image Editing")
        placeholder = placeholder_map.get(mode, chat_input.placeholder)
        image_mode_value = "Generation" if mode == "Text → Image" else "Editing"
@@ -2430,12 +2396,6 @@ def build_demo(app: OmadaDemo, share: bool, server_name: str, server_port: Optional[int]):
            i2i_timesteps,
            i2i_temperature,
            i2i_guidance,
-           ti2ti_image,
-           ti2ti_text_tokens,
-           ti2ti_img_timesteps,
-           ti2ti_text_timesteps,
-           ti2ti_temperature,
-           ti2ti_guidance,
            v2s_video_path,
            v2s_max_tokens,
            v2s_steps,
@@ -2446,8 +2406,7 @@ def build_demo(app: OmadaDemo, share: bool, server_name: str, server_port: Optional[int]):
            chat_steps,
            chat_block,
            chat_temperature,
-           mmu_image_a,
-           mmu_image_b,
+           mmu_image,
            mmu_max_tokens,
            mmu_steps,
            mmu_block,
@@ -2536,10 +2495,9 @@ def build_demo(app: OmadaDemo, share: bool, server_name: str, server_port: Optional[int]):
            )
            response = _render_text_message(status, reply)
            display_user_raw = message or "[Text request]"
-       elif mode == "MMU (2 Images → Text)":
-           reply, status = app.run_mmu_dual(
-               mmu_image_a,
-               mmu_image_b,
+       elif mode == "MMU (Image → Text)":
+           reply, status = app.run_mmu(
+               [mmu_image] if mmu_image is not None else [],
                message,
                mmu_max_tokens,
                mmu_steps,
@@ -2633,8 +2591,7 @@ def build_demo(app: OmadaDemo, share: bool, server_name: str, server_port: Optional[int]):
            chat_steps,
            chat_block,
            chat_temperature,
-           mmu_image_a,
-           mmu_image_b,
+           mmu_image,
            mmu_max_tokens,
            mmu_steps,
            mmu_block,
@@ -2656,8 +2613,6 @@ def build_demo(app: OmadaDemo, share: bool, server_name: str, server_port: Optional[int]):
            gr.update(value=None),
            gr.update(value=None),
            gr.update(value=None),
-           gr.update(value=None),
-           gr.update(value=None),
        )

        clear_button.click(
@@ -2672,8 +2627,7 @@ def build_demo(app: OmadaDemo, share: bool, server_name: str, server_port: Optional[int]):
            i2s_image,
            v2s_video,
            i2i_image,
-           mmu_image_a,
-           mmu_image_b,
+           mmu_image,
        ],
    )

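The docstring added to run_mmu states the new contract: the MMU path consumes exactly one image, and a legacy list input is reduced to its first valid entry. A standalone sketch of just that normalization step, with a helper name and return convention of my own choosing rather than the repo's:

from typing import List, Optional, Sequence, Tuple, Union

from PIL import Image


def normalize_mmu_images(
    images: Union[Image.Image, Sequence[Optional[Image.Image]], None],
) -> Tuple[List[Image.Image], str]:
    """Reduce the MMU input to at most one image, mirroring the new run_mmu path."""
    if isinstance(images, Image.Image):
        normalized: List[Image.Image] = [images]
    elif images is None:
        normalized = []
    else:
        # Legacy callers may still pass a list; drop empty slots.
        normalized = [img for img in images if img is not None]

    if not normalized:
        return [], "Please provide an image for MMU reasoning."
    # Keep only the first valid image for the single-image MMU demo.
    return [normalized[0]], ""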
app.py

@@ -12,6 +12,7 @@ import sys
 import subprocess
 import importlib
 from pathlib import Path
+from typing import List

 import gradio as gr
 import spaces
@@ -256,17 +257,24 @@ V2T_EXAMPLES = _load_media_examples("v2t", {".mp4", ".mov", ".avi", ".webm"})
 V2S_EXAMPLES = _load_media_examples("v2t", {".mp4", ".mov", ".avi", ".webm"})

 # MMU images (and fallback for I2S)
-…
-…
-if …
-    …
-    …
-    …
-    …
+MMU_DIR = ASSET_ROOT / "mmu"
+MMU_EXAMPLES: List[List[str]] = []
+if MMU_DIR.exists():
+    for path in sorted(
+        [
+            p
+            for p in MMU_DIR.iterdir()
+            if p.suffix.lower() in {".png", ".jpg", ".jpeg", ".webp"}
+        ]
+    ):
+        MMU_EXAMPLES.append([
+            str(path),
+            "Describe the important objects and their relationships in this image.",
+        ])

 I2S_EXAMPLES = _load_media_examples("i2s", {".png", ".jpg", ".jpeg", ".webp"})
 if not I2S_EXAMPLES and MMU_EXAMPLES:
-    # use …
+    # use the first MMU sample image if no dedicated I2S example exists
     I2S_EXAMPLES = [[MMU_EXAMPLES[0][0]]]


@@ -407,11 +415,10 @@ def chat_handler(message, max_tokens, steps, block_len, temperature):
     return text, status

 @spaces.GPU
-def mmu_handler(image_a, image_b, question, max_tokens, steps, block_len, temperature):
+def mmu_handler(image, question, max_tokens, steps, block_len, temperature):
     app = get_app()
-    text, status = app.run_mmu_dual(
-        image_a=image_a,
-        image_b=image_b,
+    text, status = app.run_mmu(
+        images=image,
         message=question,
         max_new_tokens=int(max_tokens),
         steps=int(steps),
@@ -827,13 +834,12 @@ with gr.Blocks(
        )

        # ---- MMU ----
-       with gr.Tab("MMU (2 Images → Text)"):
-           mmu_img_a = gr.Image(type="pil", label="Image A", sources=["upload"])
-           mmu_img_b = gr.Image(type="pil", label="Image B", sources=["upload"])
+       with gr.Tab("MMU (Image → Text)"):
+           mmu_img = gr.Image(type="pil", label="Input image", sources=["upload"])
            mmu_question = gr.Textbox(
                label="Question",
                lines=3,
-               placeholder="Ask about the …",
+               placeholder="Ask about the scene, objects, or context of the image.",
            )
            mmu_answer = gr.Textbox(label="Answer", lines=6)
            mmu_status = gr.Textbox(label="Status", interactive=False)
@@ -843,18 +849,17 @@ with gr.Blocks(
                mmu_block = gr.Slider(2, 512, value=128, step=2, label="Block length")
                mmu_temperature = gr.Slider(0.0, 2.0, value=0.7, step=0.05, label="Sampling temperature")
            if MMU_EXAMPLES:
-               with gr.Accordion("Sample MMU …", open=False):
+               with gr.Accordion("Sample MMU prompts", open=False):
                    gr.Examples(
                        examples=MMU_EXAMPLES,
-                       inputs=[mmu_img_a, mmu_img_b, mmu_question],
+                       inputs=[mmu_img, mmu_question],
                        examples_per_page=1,
                    )
-           mmu_btn = gr.Button("Answer about the images", variant="primary")
+           mmu_btn = gr.Button("Answer about the image", variant="primary")
            mmu_btn.click(
                mmu_handler,
                inputs=[
-                   mmu_img_a,
-                   mmu_img_b,
+                   mmu_img,
                    mmu_question,
                    mmu_max_tokens,
                    mmu_steps,