Spaces: Running on Zero

Add TI2TI UI without binaries

- .gitignore +3 -0
- MMaDA/inference/__pycache__/common.cpython-310.pyc +0 -0
- MMaDA/inference/gradio_multimodal_demo_inst.py +249 -3
- MMaDA/models/__pycache__/__init__.cpython-310.pyc +0 -0
- MMaDA/models/__pycache__/common_modules.cpython-310.pyc +0 -0
- MMaDA/models/__pycache__/configuration_emova_speech_tokenizer.cpython-310.pyc +0 -0
- MMaDA/models/__pycache__/configuration_llada.cpython-310.pyc +0 -0
- MMaDA/models/__pycache__/misc.cpython-310.pyc +0 -0
- MMaDA/models/__pycache__/modeling_emova_speech_tokenizer.cpython-310.pyc +0 -0
- MMaDA/models/__pycache__/modeling_llada.cpython-310.pyc +0 -0
- MMaDA/models/__pycache__/modeling_magvitv2.cpython-310.pyc +0 -0
- MMaDA/models/__pycache__/modeling_mmada.cpython-310.pyc +0 -0
- MMaDA/models/__pycache__/modeling_omada.cpython-310.pyc +0 -0
- MMaDA/models/__pycache__/modeling_utils.cpython-310.pyc +0 -0
- MMaDA/models/__pycache__/modeling_video_encoder.cpython-310.pyc +0 -0
- MMaDA/models/__pycache__/sampling.cpython-310.pyc +0 -0
- MMaDA/training/__pycache__/__init__.cpython-310.pyc +0 -0
- MMaDA/training/__pycache__/data.cpython-310.pyc +0 -0
- MMaDA/training/__pycache__/prompting_utils.cpython-310.pyc +0 -0
- MMaDA/training/__pycache__/utils.cpython-310.pyc +0 -0
- app.py +77 -0
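
In short, this commit adds a Text+Image → Text+Image (TI2TI) mode to both the full multimodal demo (gradio_multimodal_demo_inst.py) and the Spaces entry point (app.py), stops tracking the committed __pycache__ binaries, and ignores them plus the local TI2TI demo assets going forward.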
.gitignore ADDED

@@ -0,0 +1,3 @@
+__pycache__/
+*.pyc
+MMaDA/inference/demo/ti2ti/
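
The last ignore rule keeps the local TI2TI demo assets out of the repository (the commit ships the UI "without binaries"); the example loaders added below expect paired files in that directory. A hypothetical layout, where only the _src.png / _instr.txt suffix convention comes from this commit:

    MMaDA/inference/demo/ti2ti/sample01_src.png    (source image shown in the example)
    MMaDA/inference/demo/ti2ti/sample01_instr.txt  (matching editing instruction)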
MMaDA/inference/__pycache__/common.cpython-310.pyc DELETED
Binary file (5.69 kB)
MMaDA/inference/gradio_multimodal_demo_inst.py CHANGED

@@ -495,6 +495,32 @@ def _load_i2i_examples():
     return examples


+def _load_ti2ti_examples():
+    """Pair demo/ti2ti sample##_src.png + sample##_instr.txt files into Examples."""
+    d = DEMO_ROOT / "ti2ti"
+    if not d.exists():
+        return []
+
+    src_files = sorted([p for p in d.iterdir() if p.is_file() and p.name.endswith("_src.png")])
+    txt_files = {
+        p.name.replace("_instr.txt", ""): p
+        for p in d.iterdir()
+        if p.is_file() and p.name.endswith("_instr.txt")
+    }
+
+    examples = []
+    for src in src_files:
+        stem = src.name.replace("_src.png", "")
+        txt = txt_files.get(stem)
+        if not txt:
+            continue
+        instruction = txt.read_text(encoding="utf-8").strip()
+        if not instruction:
+            continue
+        examples.append([str(src), instruction])
+    return examples
+
+
 def _load_media_examples(subdir: str, suffixes):
     target_dir = DEMO_ROOT / subdir
     if not target_dir.exists():

@@ -510,6 +536,7 @@ T2S_EXAMPLES = _load_t2s_examples()
 CHAT_EXAMPLES = _load_chat_examples()
 T2I_EXAMPLES = _load_t2i_examples()
 I2I_EXAMPLES = _load_i2i_examples()
+TI2TI_EXAMPLES = _load_ti2ti_examples()
 S2T_EXAMPLES = _load_media_examples("s2t", {".wav", ".mp3", ".flac", ".ogg"})
 V2T_EXAMPLES = _load_media_examples("v2t", {".mp4", ".mov", ".avi", ".webm"})
 S2S_EXAMPLES = _load_media_examples("s2s", {".wav", ".mp3", ".flac", ".ogg"})

@@ -629,6 +656,33 @@ def _render_image_message(status: str, image: Optional[Image.Image]) -> str:
     return _render_response(status, image_html)


+def _render_image_text_message(status: str, image: Optional[Image.Image], text: str) -> str:
+    """Render combined text + image output for TI2TI."""
+    blocks = []
+    text_clean = (text or "").strip()
+    if text_clean:
+        safe_text = html.escape(text_clean).replace("\n", "<br>")
+        blocks.append(f"<div class='omada-response-block'>{safe_text}</div>")
+
+    if image is not None:
+        buffer = io.BytesIO()
+        try:
+            image.save(buffer, format="PNG")
+            encoded = base64.b64encode(buffer.getvalue()).decode("ascii")
+            blocks.append(
+                "<div class='omada-response-block'>"
+                "<img src='data:image/png;base64,"
+                f"{encoded}"
+                "' alt='Generated image' style='max-width:100%;border-radius:12px;' />"
+                "</div>"
+            )
+        except Exception:
+            pass
+
+    body = "".join(blocks)
+    return _render_response(status, body if body else None)
+
+
 def _format_user_message(message: str) -> str:
     clean = html.escape(message or "")
     return clean.replace("\n", "<br>")

@@ -1180,6 +1234,146 @@ class OmadaDemo:
         image = self._decode_image_tokens(gen_tokens[0])
         return image, "Edited image generated."

+    # ------------------------------------------------------------------
+    # Text+Image → Text+Image (TI2TI)
+    # ------------------------------------------------------------------
+    def run_ti2ti(
+        self,
+        instruction: str,
+        source_image: Optional[Image.Image],
+        text_tokens: int,
+        timesteps_image: int,
+        timesteps_text: int,
+        temperature: float,
+        guidance_scale: float,
+    ) -> Tuple[Optional[Image.Image], str, str]:
+        instruction_clean = (instruction or "").strip()
+        if source_image is None:
+            return None, "", "Please upload a source image."
+        if not instruction_clean:
+            return None, "", "Provide an editing instruction for TI2TI."
+
+        try:
+            src_tokens = self._prepare_image_tokens(source_image)
+        except Exception as exc:
+            return None, "", f"Failed to encode source image: {exc}"
+
+        text_tokens = max(4, min(int(text_tokens), self.max_text_len))
+        prompt_ids = self.uni_prompting.text_tokenizer(instruction_clean)['input_ids']
+        if isinstance(prompt_ids, list) and prompt_ids and isinstance(prompt_ids[0], list):
+            prompt_ids = prompt_ids[0]
+        if len(prompt_ids) == 0 or prompt_ids[0] != self.uni_prompting.text_tokenizer.bos_token_id:
+            prompt_ids = [self.uni_prompting.text_tokenizer.bos_token_id] + prompt_ids
+        prompt_ids = prompt_ids + [self.uni_prompting.text_tokenizer.eos_token_id]
+        prompt_tensor = torch.tensor(prompt_ids, device=self.device, dtype=torch.long)
+
+        ti2ti_id = int(self.uni_prompting.sptids_dict['<|ti2ti|>'][0].item())
+        soi_id = int(self.uni_prompting.sptids_dict['<|soi|>'][0].item())
+        eoi_id = int(self.uni_prompting.sptids_dict['<|eoi|>'][0].item())
+        pad_raw = getattr(self.uni_prompting, "pad_id", 0)
+        pad_id = int(pad_raw if pad_raw is not None else 0)
+
+        img_placeholder = torch.full(
+            (self.image_seq_len,),
+            self.mask_token_id,
+            dtype=torch.long,
+            device=self.device,
+        )
+        text_placeholder = torch.full(
+            (text_tokens,),
+            self.mask_token_id,
+            dtype=torch.long,
+            device=self.device,
+        )
+
+        src_flat = src_tokens.view(-1)
+        prompt_len = prompt_tensor.numel()
+        img_len = img_placeholder.numel()
+        text_len = text_placeholder.numel()
+
+        prompt_start = 2 + src_flat.numel() + 1
+        prompt_end = prompt_start + prompt_len
+        img_start = prompt_end + 1
+        img_end = img_start + img_len
+        text_start = img_end + 1
+        text_end = text_start + text_len
+
+        seq_parts = [
+            torch.tensor([ti2ti_id, soi_id], device=self.device, dtype=torch.long),
+            src_flat,
+            torch.tensor([eoi_id], device=self.device, dtype=torch.long),
+            prompt_tensor,
+            torch.tensor([soi_id], device=self.device, dtype=torch.long),
+            img_placeholder,
+            torch.tensor([eoi_id], device=self.device, dtype=torch.long),
+            text_placeholder,
+        ]
+        seq = torch.cat(seq_parts, dim=0).unsqueeze(0)
+        attn = torch.ones_like(seq, dtype=torch.long, device=self.device)
+
+        uncond_seq = seq.clone()
+        uncond_attn = attn.clone()
+        uncond_seq[:, prompt_start:prompt_end] = pad_id
+        uncond_attn[:, prompt_start:prompt_end] = 0
+
+        with torch.no_grad():
+            filled_tokens, _ = self.model.ti2ti_generate(
+                input_ids=seq.to(self.device),
+                uncond_input_ids=uncond_seq.to(self.device),
+                attention_mask=attn.to(self.device),
+                uncond_attention_mask=uncond_attn.to(self.device),
+                temperature=float(temperature),
+                timesteps=int(timesteps_image),
+                timesteps_text=int(timesteps_text),
+                timesteps_image=int(timesteps_image),
+                guidance_scale=float(guidance_scale),
+                noise_schedule=self.image_noise_schedule,
+                seq_len=self.image_seq_len,
+                mask_token_id=self.mask_token_id,
+                codebook_size=self.codebook_size,
+                uni_prompting=self.uni_prompting,
+                config=self.train_cfg,
+            )
+
+        if filled_tokens is None:
+            return None, "", "TI2TI generation failed."
+
+        filled_tokens = torch.clamp(
+            filled_tokens,
+            min=0,
+            max=self.codebook_size + self.text_vocab_size - 1,
+        )
+        pred_img_tokens = filled_tokens[:, img_start:img_end] - self.text_vocab_size
+        pred_img_tokens = torch.clamp(pred_img_tokens, min=0, max=self.codebook_size - 1)
+        try:
+            image_out = self._decode_image_tokens(pred_img_tokens[0])
+        except Exception as exc:
+            return None, "", f"Failed to decode generated image: {exc}"
+
+        text_slice = slice(text_start, min(text_end, filled_tokens.shape[1]))
+        text_block = filled_tokens[:, text_slice]
+        text_vocab = self.text_vocab_size
+        mask_id = int(self.mask_token_id)
+        eos_id = int(self.uni_prompting.text_tokenizer.eos_token_id)
+        eot_id = int(self.uni_prompting.sptids_dict.get("<|eot_id|>", torch.tensor([eos_id], device=self.device))[0].item())
+        pad_token_id = int(pad_id)
+
+        pred_texts = []
+        for row in text_block:
+            seq_list = []
+            for t in row.tolist():
+                if t in (pad_token_id, mask_id):
+                    continue
+                if t == eos_id or t == eot_id:
+                    break
+                if 0 <= t < text_vocab:
+                    seq_list.append(int(t))
+            pred_texts.append(self.uni_prompting.text_tokenizer.decode(seq_list, skip_special_tokens=True))
+        pred_text = pred_texts[0] if pred_texts else ""
+
+        status = "TI2TI generated image and text."
+        return image_out, pred_text, status
+
     # ------------------------------------------------------------------
     # Video-to-Speech
     # ------------------------------------------------------------------

@@ -1866,7 +2060,7 @@ def build_demo(app: OmadaDemo, share: bool, server_name: str, server_port: Optio
     group_to_modes = {
         "Any → Speech": ["Text → Speech", "Speech → Speech", "Video → Speech", "Image → Speech"],
         "Any → Text": ["Speech → Text", "Video → Text", "Chat", "MMU (2 Images → Text)"],
-        "Image Generation": ["Text → Image", "Image Editing"],
+        "Image Generation": ["Text → Image", "Image Editing", "Text+Image → Text+Image (TI2TI)"],
     }
     default_group = "Any → Speech"
     default_mode = group_to_modes[default_group][0]

@@ -1881,6 +2075,7 @@ def build_demo(app: OmadaDemo, share: bool, server_name: str, server_port: Optio
         "MMU (2 Images → Text)": "Ask a question about the two uploaded images.",
         "Text → Image": "Describe the image you want to generate...",
         "Image Editing": "Describe how you want to edit the uploaded image...",
+        "Text+Image → Text+Image (TI2TI)": "Upload an image and describe how you want it edited and captioned.",
     }
     with gr.Row(elem_classes=["omada-layout"], equal_height=False):
         with gr.Column(scale=3, min_width=480, elem_classes=["omada-chat-column"]):

@@ -2075,6 +2270,27 @@ def build_demo(app: OmadaDemo, share: bool, server_name: str, server_port: Optio
                                 inputs=[chat_input],
                                 examples_per_page=4,
                             )
+            with gr.Column(visible=False, elem_classes=["omada-mode-panel"]) as ti2ti_panel:
+                with gr.Group(elem_classes=["omada-card"]):
+                    gr.Markdown("### Text+Image → Text+Image (TI2TI)")
+                    ti2ti_image = gr.Image(type="pil", label="Source image", sources=["upload"])
+                    with gr.Accordion("Generation settings", open=True, elem_classes=["omada-advanced"]):
+                        ti2ti_text_tokens = gr.Slider(8, 256, value=64, label="Text placeholder tokens", step=4)
+                        with gr.Row():
+                            ti2ti_img_timesteps = gr.Slider(4, 128, value=64, label="Image timesteps", step=2)
+                            ti2ti_text_timesteps = gr.Slider(4, 128, value=64, label="Text timesteps", step=2)
+                        with gr.Row():
+                            ti2ti_temperature = gr.Slider(0.0, 2.0, value=1.0, label="Sampling temperature", step=0.05)
+                            ti2ti_guidance = gr.Slider(0.0, 8.0, value=3.5, label="CFG scale", step=0.1)
+                if TI2TI_EXAMPLES:
+                    with gr.Group(elem_classes=["omada-card", "omada-examples-card"]):
+                        gr.Markdown("**Sample edits**")
+                        with gr.Column(elem_classes=["omada-examples"]):
+                            gr.Examples(
+                                examples=TI2TI_EXAMPLES,
+                                inputs=[ti2ti_image, chat_input],
+                                examples_per_page=4,
+                            )
             with gr.Column(visible=False, elem_classes=["omada-mode-panel"]) as chat_panel:
                 with gr.Group(elem_classes=["omada-card"]):
                     gr.Markdown("### Chat Controls")

@@ -2123,7 +2339,8 @@ def build_demo(app: OmadaDemo, share: bool, server_name: str, server_port: Optio
         show_v2t = group == "Any → Text" and mode == "Video → Text"
         show_chat = group == "Any → Text" and mode == "Chat"
         show_mmu = group == "Any → Text" and mode == "MMU (2 Images → Text)"
-        show_image = group == "Image Generation"
+        show_image = group == "Image Generation" and mode in ("Text → Image", "Image Editing")
+        show_ti2ti = group == "Image Generation" and mode == "Text+Image → Text+Image (TI2TI)"
         placeholder = placeholder_map.get(mode, chat_input.placeholder)
         image_mode_value = "Generation" if mode == "Text → Image" else "Editing"
         t2i_visible = show_image and mode == "Text → Image"

@@ -2140,6 +2357,7 @@ def build_demo(app: OmadaDemo, share: bool, server_name: str, server_port: Optio
             gr.update(visible=show_chat),
             gr.update(visible=show_mmu),
             gr.update(visible=show_image),
+            gr.update(visible=show_ti2ti),
             image_mode_update,
             gr.update(visible=t2i_visible),
             gr.update(visible=i2i_visible),

@@ -2169,6 +2387,7 @@ def build_demo(app: OmadaDemo, share: bool, server_name: str, server_port: Optio
             chat_panel,
             mmu_panel,
             image_panel,
+            ti2ti_panel,
             image_mode_selector,
             t2i_settings,
             i2i_settings,

@@ -2189,6 +2408,7 @@ def build_demo(app: OmadaDemo, share: bool, server_name: str, server_port: Optio
             chat_panel,
             mmu_panel,
             image_panel,
+            ti2ti_panel,
             image_mode_selector,
             t2i_settings,
             i2i_settings,

@@ -2250,6 +2470,12 @@ def build_demo(app: OmadaDemo, share: bool, server_name: str, server_port: Optio
             i2i_timesteps,
             i2i_temperature,
             i2i_guidance,
+            ti2ti_image,
+            ti2ti_text_tokens,
+            ti2ti_img_timesteps,
+            ti2ti_text_timesteps,
+            ti2ti_temperature,
+            ti2ti_guidance,
             v2s_video_path,
             v2s_max_tokens,
             v2s_steps,

@@ -2390,7 +2616,7 @@ def build_demo(app: OmadaDemo, share: bool, server_name: str, server_port: Optio
             )
             response = _render_image_message(status, image_result)
             display_user_raw = message or "[Image generation request]"
-
+        elif mode == "Image Editing":
             image_result, status = app.run_i2i(
                 message,
                 i2i_image,

@@ -2400,6 +2626,18 @@ def build_demo(app: OmadaDemo, share: bool, server_name: str, server_port: Optio
             )
             response = _render_image_message(status, image_result)
             display_user_raw = message or "[Image editing request]"
+        else:  # TI2TI
+            image_result, text_result, status = app.run_ti2ti(
+                message,
+                ti2ti_image,
+                ti2ti_text_tokens,
+                ti2ti_img_timesteps,
+                ti2ti_text_timesteps,
+                ti2ti_temperature,
+                ti2ti_guidance,
+            )
+            response = _render_image_text_message(status, image_result, text_result)
+            display_user_raw = message or "[TI2TI request]"

         if not response:
             status = f"Mode '{mode}' is not supported."

@@ -2453,6 +2691,12 @@ def build_demo(app: OmadaDemo, share: bool, server_name: str, server_port: Optio
             i2i_timesteps,
             i2i_temperature,
             i2i_guidance,
+            ti2ti_image,
+            ti2ti_text_tokens,
+            ti2ti_img_timesteps,
+            ti2ti_text_timesteps,
+            ti2ti_temperature,
+            ti2ti_guidance,
             v2s_video,
             v2s_max_tokens,
             v2s_steps,

@@ -2487,6 +2731,7 @@ def build_demo(app: OmadaDemo, share: bool, server_name: str, server_port: Optio
             gr.update(value=None),
             gr.update(value=None),
             gr.update(value=None),
+            gr.update(value=None),
         )

         clear_button.click(

@@ -2501,6 +2746,7 @@ def build_demo(app: OmadaDemo, share: bool, server_name: str, server_port: Optio
             i2s_image,
             v2s_video,
             i2i_image,
+            ti2ti_image,
             mmu_image_a,
             mmu_image_b,
         ],
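
For orientation, here is a minimal self-contained sketch of the sequence layout that run_ti2ti assembles before calling model.ti2ti_generate. The token ids below are made up for illustration; only the ordering and the index arithmetic mirror the diff above:

    # Hypothetical ids standing in for the real special tokens / vocab.
    TI2TI, SOI, EOI, MASK, PAD = 9001, 9002, 9003, 9999, 0

    src = [11, 12, 13, 14]          # flattened source-image tokens
    prompt = [1, 55, 56, 2]         # BOS + instruction ids + EOS
    image_seq_len, text_tokens = 6, 3

    seq = (
        [TI2TI, SOI] + src + [EOI]               # conditioning: task tag + source image
        + prompt                                  # conditioning: editing instruction
        + [SOI] + [MASK] * image_seq_len + [EOI]  # masked slots for the edited image
        + [MASK] * text_tokens                    # masked slots for the generated text
    )

    # Same offsets as the diff: the instruction starts after <|ti2ti|>, <|soi|>,
    # the source tokens and <|eoi|>.
    prompt_start = 2 + len(src) + 1
    prompt_end = prompt_start + len(prompt)
    img_start, img_end = prompt_end + 1, prompt_end + 1 + image_seq_len
    text_start, text_end = img_end + 1, img_end + 1 + text_tokens

    # Unconditional branch for classifier-free guidance: blank out the instruction span.
    uncond = list(seq)
    uncond[prompt_start:prompt_end] = [PAD] * len(prompt)

    assert seq[img_start:img_end] == [MASK] * image_seq_len
    assert seq[text_start:text_end] == [MASK] * text_tokens

After generation, the image slice is shifted back into the visual codebook range (subtracting text_vocab_size) and decoded, while the text slice is detokenized up to the first EOS/EOT token.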
MMaDA/models/__pycache__/__init__.cpython-310.pyc DELETED
Binary file (469 Bytes)

MMaDA/models/__pycache__/common_modules.cpython-310.pyc DELETED
Binary file (10.2 kB)

MMaDA/models/__pycache__/configuration_emova_speech_tokenizer.cpython-310.pyc DELETED
Binary file (9.62 kB)

MMaDA/models/__pycache__/configuration_llada.cpython-310.pyc DELETED
Binary file (6.19 kB)

MMaDA/models/__pycache__/misc.cpython-310.pyc DELETED
Binary file (1.49 kB)

MMaDA/models/__pycache__/modeling_emova_speech_tokenizer.cpython-310.pyc DELETED
Binary file (3.34 kB)

MMaDA/models/__pycache__/modeling_llada.cpython-310.pyc DELETED
Binary file (40.3 kB)

MMaDA/models/__pycache__/modeling_magvitv2.cpython-310.pyc DELETED
Binary file (11.1 kB)

MMaDA/models/__pycache__/modeling_mmada.cpython-310.pyc DELETED
Binary file (20.2 kB)

MMaDA/models/__pycache__/modeling_omada.cpython-310.pyc DELETED
Binary file (31.9 kB)

MMaDA/models/__pycache__/modeling_utils.cpython-310.pyc DELETED
Binary file (39.7 kB)

MMaDA/models/__pycache__/modeling_video_encoder.cpython-310.pyc DELETED
Binary file (1.15 kB)

MMaDA/models/__pycache__/sampling.cpython-310.pyc DELETED
Binary file (4.19 kB)

MMaDA/training/__pycache__/__init__.cpython-310.pyc DELETED
Binary file (182 Bytes)

MMaDA/training/__pycache__/data.cpython-310.pyc DELETED
Binary file (73 kB)

MMaDA/training/__pycache__/prompting_utils.cpython-310.pyc DELETED
Binary file (35.3 kB)

MMaDA/training/__pycache__/utils.cpython-310.pyc DELETED
Binary file (5.97 kB)
app.py CHANGED

@@ -219,11 +219,35 @@ def _load_i2i_examples():
         examples.append([str(img_path), instruction])
     return examples

+def _load_ti2ti_examples():
+    """Load TI2TI examples: pairs of source image + instruction text."""
+    d = ASSET_ROOT / "ti2ti"
+    if not d.exists():
+        return []
+
+    src_files = sorted(
+        [p for p in d.iterdir() if p.is_file() and p.name.endswith("_src.png")],
+    )
+    txt_files = {p.name.replace("_instr.txt", ""): p for p in d.iterdir() if p.is_file() and p.name.endswith("_instr.txt")}
+
+    examples = []
+    for src in src_files:
+        stem = src.name.replace("_src.png", "")
+        txt = txt_files.get(stem)
+        if not txt:
+            continue
+        instruction = txt.read_text(encoding="utf-8").strip()
+        if not instruction:
+            continue
+        examples.append([str(src), instruction])
+    return examples
+
 # text-based examples
 T2S_EXAMPLES = _load_text_examples(ASSET_ROOT / "t2s" / "text.txt")
 CHAT_EXAMPLES = _load_text_examples(ASSET_ROOT / "chat" / "text.txt")
 T2I_EXAMPLES = _load_text_examples(ASSET_ROOT / "t2i" / "text.txt")
 I2I_EXAMPLES = _load_i2i_examples()
+TI2TI_EXAMPLES = _load_ti2ti_examples()

 # audio / video / image examples
 S2T_EXAMPLES = _load_media_examples("s2t", {".wav", ".mp3", ".flac", ".ogg"})

@@ -419,6 +443,20 @@ def i2i_handler(instruction, image, timesteps, temperature, guidance):
     )
     return image_out, status

+@spaces.GPU
+def ti2ti_handler(instruction, image, text_tokens, timesteps_image, timesteps_text, temperature, guidance):
+    app = get_app()
+    image_out, text_out, status = app.run_ti2ti(
+        instruction=instruction,
+        source_image=image,
+        text_tokens=int(text_tokens),
+        timesteps_image=int(timesteps_image),
+        timesteps_text=int(timesteps_text),
+        temperature=float(temperature),
+        guidance_scale=float(guidance),
+    )
+    return image_out, text_out, status
+

 # ---------------------------
 # Gradio UI (10 tabs + examples)

@@ -678,6 +716,45 @@ with gr.Blocks(
             outputs=[i2i_image_out, i2i_status],
         )

+    # ---- TI2TI ----
+    with gr.Tab("Text+Image → Text+Image (TI2TI)"):
+        ti2ti_image_in = gr.Image(type="pil", label="Source image", sources=["upload"])
+        ti2ti_instr = gr.Textbox(
+            label="Editing instruction",
+            lines=4,
+            placeholder="Describe how you want the image edited and what to say about it...",
+        )
+        ti2ti_image_out = gr.Image(label="Edited image")
+        ti2ti_text_out = gr.Textbox(label="Generated text", lines=4)
+        ti2ti_status = gr.Textbox(label="Status", interactive=False)
+        with gr.Accordion("Advanced settings", open=False):
+            ti2ti_text_tokens = gr.Slider(8, 256, value=64, step=4, label="Text placeholder tokens")
+            ti2ti_img_steps = gr.Slider(4, 128, value=64, step=2, label="Image timesteps")
+            ti2ti_text_steps = gr.Slider(4, 128, value=64, step=2, label="Text timesteps")
+            ti2ti_temperature = gr.Slider(0.0, 2.0, value=1.0, step=0.05, label="Sampling temperature")
+            ti2ti_guidance = gr.Slider(0.0, 8.0, value=3.5, step=0.1, label="CFG scale")
+        if TI2TI_EXAMPLES:
+            with gr.Accordion("Sample edits", open=False):
+                gr.Examples(
+                    examples=TI2TI_EXAMPLES,
+                    inputs=[ti2ti_image_in, ti2ti_instr],
+                    examples_per_page=4,
+                )
+        ti2ti_btn = gr.Button("Generate edited image + text", variant="primary")
+        ti2ti_btn.click(
+            ti2ti_handler,
+            inputs=[
+                ti2ti_instr,
+                ti2ti_image_in,
+                ti2ti_text_tokens,
+                ti2ti_img_steps,
+                ti2ti_text_steps,
+                ti2ti_temperature,
+                ti2ti_guidance,
+            ],
+            outputs=[ti2ti_image_out, ti2ti_text_out, ti2ti_status],
+        )
+
     # ---- I2S ----
     with gr.Tab("Image → Speech (I2S)"):
         i2s_image_in = gr.Image(type="pil", label="Image input", sources=["upload"])