Add diffusers support

#1
by dn6 HF Staff - opened
README.md CHANGED
@@ -79,6 +79,48 @@ This checkpoint is intended to be used with Overworld’s interactive runtime st
79
  - Play on our official desktop client, [Biome](https://over.world/install)
80
  - Use our [world_engine](https://github.com/Wayfarer-Labs/world_engine) inference library to build your own applications
81
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
82
 
83
  ### Recommended setup
84
 
@@ -129,4 +171,4 @@ Please see our blog post, ["Engineering Safety for Interactive World Models"](ht
129
 
130
  - [Website](http://over.world/)
131
  - [Discord](https://discord.gg/MEmQa7Wux4)
132
- - [X/Twitter](https://x.com/overworld_ai)
 
79
  - Play on our official desktop client, [Biome](https://over.world/install)
80
  - Use our [world_engine](https://github.com/Wayfarer-Labs/world_engine) inference library to build your own applications
81
 
82
+ ### Diffusers (Modular Pipeline)
83
+
84
+ This model can also be used with [Modular Diffusers](https://huggingface.co/docs/diffusers/main/en/modular_diffusers):
85
+
86
+ ```python
87
+ import torch
88
+ from diffusers.modular_pipelines import ModularPipeline
89
+ from diffusers.utils import load_image, export_to_video
90
+
91
+ pipe = ModularPipeline.from_pretrained(
92
+ "Overworld/Waypoint-1.5-1B", trust_remote_code=True
93
+ )
94
+ pipe.load_components(
95
+ device_map="cuda", torch_dtype=torch.bfloat16, trust_remote_code=True
96
+ )
97
+ pipe.transformer.apply_inference_patches()
98
+ pipe.transformer.compile(fullgraph=True, mode="max-autotune", dynamic=False)
99
+
100
+ # Seed the world with an image
101
+ image = load_image("https://huggingface.co/spaces/Overworld/waypoint-1-small/resolve/main/starter_18.png").resize((1024, 512))
102
+ state = pipe(
103
+ image=image,
104
+ prompt="An explorable world",
105
+ button=set(),
106
+ mouse=(0.0, 0.0),
107
+ output_type="pil",
108
+ )
109
+
110
+ # Generate subsequent frames with controller inputs
111
+ state.values["image"] = None
112
+ frames = []
113
+ for _ in range(150):
114
+ state = pipe(
115
+ state,
116
+ button={87}, # W key (walk forward)
117
+ mouse=(0.0, 0.0),
118
+ output_type="pil",
119
+ )
120
+ frames.extend(state.values["images"]) # list of PIL images
121
+
122
+ export_to_video(frames, "waypoint-v1-5.mp4", fps=60)
123
+ ```
124
 
125
  ### Recommended setup
126
 
 
171
 
172
  - [Website](http://over.world/)
173
  - [Discord](https://discord.gg/MEmQa7Wux4)
174
+ - [X/Twitter](https://x.com/overworld_ai)
__init__.py ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (C) 2025 Hugging Face Team and Overworld
2
+ #
3
+ # This program is free software: you can redistribute it and/or modify
4
+ # it under the terms of the GNU General Public License as published by
5
+ # the Free Software Foundation, either version 3 of the License, or
6
+ # (at your option) any later version.
7
+ #
8
+ # This program is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
+ # GNU General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU General Public License
14
+ # along with this program. If not, see <https://www.gnu.org/licenses/>.
15
+
16
+ """
17
+ WorldEngine Modular Pipeline
18
+
19
+ A Diffusers-compatible modular pipeline for frame-by-frame world model generation.
20
+ Supports text and controller (mouse + button + scroll) conditioning.
21
+ """
22
+
23
+ from .modular_blocks import (
24
+ WorldEngineBlocks,
25
+ AUTO_BLOCKS,
26
+ WorldEngineTextEncoderStep,
27
+ WorldEngineControllerEncoderStep,
28
+ WorldEngineBeforeDenoiseStep,
29
+ WorldEngineSetTimestepsStep,
30
+ WorldEnginePrepareLatentsStep,
31
+ WorldEngineSetupKVCacheStep,
32
+ WorldEngineDenoiseLoop,
33
+ WorldEngineDecodeStep,
34
+ StaticKVCache,
35
+ LayerKVCache,
36
+ )
37
+ from .transformer import WorldModel
38
+ from .vae import ChunkedStreamingTAEHV
39
+
40
+ __version__ = "0.1.0"
41
+
42
+ __all__ = [
43
+ "WorldEngineBlocks",
44
+ "AUTO_BLOCKS",
45
+ "WorldEngineTextEncoderStep",
46
+ "WorldEngineControllerEncoderStep",
47
+ "WorldEngineBeforeDenoiseStep",
48
+ "WorldEngineSetTimestepsStep",
49
+ "WorldEnginePrepareLatentsStep",
50
+ "WorldEngineSetupKVCacheStep",
51
+ "WorldEngineDenoiseLoop",
52
+ "WorldEngineDecodeStep",
53
+ "WorldModel",
54
+ "ChunkedStreamingTAEHV",
55
+ "StaticKVCache",
56
+ "LayerKVCache",
57
+ ]
modular_blocks.py ADDED
@@ -0,0 +1,1188 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (C) 2025 Hugging Face Team and Overworld
2
+ #
3
+ # This program is free software: you can redistribute it and/or modify
4
+ # it under the terms of the GNU General Public License as published by
5
+ # the Free Software Foundation, either version 3 of the License, or
6
+ # (at your option) any later version.
7
+ #
8
+ # This program is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
+ # GNU General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU General Public License
14
+ # along with this program. If not, see <https://www.gnu.org/licenses/>.
15
+
16
+ """WorldEngine modular pipeline blocks.
17
+
18
+ All pipeline step classes for text encoding, controller encoding,
19
+ KV cache setup, latent preparation, denoising, and decoding.
20
+ """
21
+
22
+ import html
23
+
24
+ import numpy as np
25
+ import PIL.Image
26
+ import regex as re
27
+ import torch
28
+ from torch import nn, Tensor
29
+ from tensordict import TensorDict
30
+ from torch.nn.attention.flex_attention import _DEFAULT_SPARSE_BLOCK_SIZE, BlockMask
31
+ from transformers import AutoTokenizer, UMT5EncoderModel
32
+
33
+ from diffusers import AutoModel
34
+ from diffusers.configuration_utils import FrozenDict
35
+ from diffusers.image_processor import VaeImageProcessor
36
+ from diffusers.utils import is_ftfy_available, logging
37
+ from diffusers.modular_pipelines import (
38
+ ModularPipelineBlocks,
39
+ ModularPipeline,
40
+ PipelineState,
41
+ SequentialPipelineBlocks,
42
+ )
43
+ from diffusers.modular_pipelines.modular_pipeline_utils import (
44
+ ComponentSpec,
45
+ ConfigSpec,
46
+ InputParam,
47
+ InsertableDict,
48
+ OutputParam,
49
+ )
50
+
51
+ if is_ftfy_available():
52
+ import ftfy
53
+
54
+
55
+ logger = logging.get_logger(__name__)
56
+
57
+
58
+ # ---------------------------------------------------------------------------
59
+ # Text encoding helpers
60
+ # ---------------------------------------------------------------------------
61
+
62
+ def basic_clean(text):
63
+ text = ftfy.fix_text(text)
64
+ text = html.unescape(html.unescape(text))
65
+ return text.strip()
66
+
67
+
68
+ def whitespace_clean(text):
69
+ text = re.sub(r"\s+", " ", text)
70
+ text = text.strip()
71
+ return text
72
+
73
+
74
+ def prompt_clean(text):
75
+ text = whitespace_clean(basic_clean(text))
76
+ return text
77
+
78
+
79
+ # ---------------------------------------------------------------------------
80
+ # Block mask construction
81
+ # ---------------------------------------------------------------------------
82
+
83
+ def make_block_mask(T: int, L: int, written: torch.Tensor) -> BlockMask:
84
+ """
85
+ Create a block mask for flex_attention.
86
+
87
+ T and L must be exact multiples of the sparse block size; written must be
88
+ block-aligned (each block is either all True or all False).
89
+
90
+ Args:
91
+ T: Q length for this frame
92
+ L: KV capacity == written.numel()
93
+ written: [L] bool, True where there is valid KV data
94
+ """
95
+ BS = _DEFAULT_SPARSE_BLOCK_SIZE
96
+
97
+ if not torch.compiler.is_compiling():
98
+ torch._check(T % BS == 0, f"T ({T}) must be a multiple of block size ({BS})")
99
+ torch._check(L % BS == 0, f"L ({L}) must be a multiple of block size ({BS})")
100
+
101
+ Q_blocks = T // BS
102
+ KV_blocks = L // BS
103
+
104
+ written_blocks = written.view(KV_blocks, BS)
105
+ block_any = written_blocks.any(-1)
106
+
107
+ if not torch.compiler.is_compiling():
108
+ assert torch.equal(block_any, written_blocks.all(-1)), "written must be block-aligned"
109
+
110
+ # Every KV block is a full block (no partial blocks)
111
+ full_bm = block_any[None, :].expand(Q_blocks, KV_blocks)
112
+ full_kv_num_blocks = full_bm.sum(dim=-1, dtype=torch.int32)[None, None].contiguous()
113
+ full_kv_indices = full_bm.argsort(dim=-1, descending=True, stable=True).to(torch.int32)[None, None].contiguous()
114
+
115
+ # No partial blocks
116
+ kv_num_blocks = torch.zeros((1, 1, Q_blocks), dtype=torch.int32, device=written.device)
117
+ kv_indices = torch.zeros((1, 1, Q_blocks, KV_blocks), dtype=torch.int32, device=written.device)
118
+
119
+ return BlockMask.from_kv_blocks(
120
+ kv_num_blocks,
121
+ kv_indices,
122
+ full_kv_num_blocks,
123
+ full_kv_indices,
124
+ BLOCK_SIZE=BS,
125
+ mask_mod=None,
126
+ seq_lengths=(T, L),
127
+ compute_q_blocks=False,
128
+ )
129
+
130
+
131
+ # ---------------------------------------------------------------------------
132
+ # KV cache
133
+ # ---------------------------------------------------------------------------
134
+
135
+ class LayerKVCache(nn.Module):
136
+ """
137
+ Ring-buffer KV cache with fixed capacity L (tokens) for history plus
138
+ one extra frame (tokens_per_frame) at the tail holding the current frame.
139
+ """
140
+
141
+ def __init__(
142
+ self, B, H, L, Dh, dtype, tokens_per_frame: int, pinned_dilation: int = 1
143
+ ):
144
+ super().__init__()
145
+ self.tpf = tokens_per_frame
146
+ self.L = L
147
+ # total KV capacity: ring (L) + tail frame (tpf)
148
+ self.capacity = L + self.tpf
149
+ self.pinned_dilation = pinned_dilation
150
+ self.num_buckets = (L // self.tpf) // self.pinned_dilation
151
+ assert (L // self.tpf) % pinned_dilation == 0 and L % self.tpf == 0
152
+
153
+ # KV buffer: [2, B, H, capacity, Dh]
154
+ self.kv = nn.Buffer(
155
+ torch.zeros(2, B, H, self.capacity, Dh, dtype=dtype),
156
+ persistent=False,
157
+ )
158
+
159
+ # which slots have ever been written
160
+ # tail slice [L, L+tpf) always holds the current frame and is considered written
161
+ written = torch.zeros(self.capacity, dtype=torch.bool)
162
+ written[L:] = True
163
+ self.written = nn.Buffer(written, persistent=False)
164
+
165
+ # _mask_written is a scratch buffer for computing block masks without cloning
166
+ self._mask_written = nn.Buffer(torch.zeros_like(written), persistent=False)
167
+
168
+ # Precompute indices:
169
+ # frame_offsets: [0, 1, ..., tpf-1] (for ring indexing)
170
+ # current_idx: [L, L+1, ..., L+tpf-1] (tail slice)
171
+ self.frame_offsets = nn.Buffer(
172
+ torch.arange(self.tpf, dtype=torch.long), persistent=False
173
+ )
174
+ self.current_idx = nn.Buffer(self.frame_offsets + L, persistent=False)
175
+
176
+ def reset(self):
177
+ self.kv.zero_()
178
+ self.written.zero_()
179
+ self.written[self.L :].fill_(True)
180
+
181
+ def upsert(self, kv: Tensor, pos_ids: TensorDict, is_frozen: bool):
182
+ """
183
+ Args:
184
+ kv: [2, B, H, T, Dh] for a single frame (T = tokens_per_frame)
185
+ pos_ids: TensorDict with f_pos [B, T] for cache slot indexing
186
+ """
187
+ T = self.tpf
188
+ f_pos = pos_ids["f_pos"]
189
+
190
+ if not torch.compiler.is_compiling():
191
+ torch._check(
192
+ kv.size(3) == self.tpf, "KV cache expects exactly one frame per upsert"
193
+ )
194
+ torch._check(f_pos.shape == (kv.size(1), T), "f_pos must be [B, T]")
195
+ torch._check(self.tpf <= self.L, "frame longer than KV ring capacity")
196
+ torch._check(
197
+ self.L % self.tpf == 0,
198
+ f"L ({self.L}) must be a multiple of tokens_per_frame ({self.tpf})",
199
+ )
200
+ torch._check(
201
+ self.kv.size(3) == self.capacity,
202
+ "KV buffer too long (expected L + tokens_per_frame)",
203
+ )
204
+ torch._check(
205
+ (f_pos >= 0).all().item(),
206
+ "f_pos must be non-negative during inference",
207
+ )
208
+ torch._check(
209
+ ((f_pos == f_pos[:, :1]).all()).item(),
210
+ "f_pos must be constant within frame",
211
+ )
212
+
213
+ frame_idx = f_pos[0, 0]
214
+
215
+ # map frame_idx to a bucket, each bucket owns T contiguous slots
216
+ bucket = (frame_idx + (self.pinned_dilation - 1)) // self.pinned_dilation
217
+ slot = bucket % self.num_buckets
218
+ base = slot * T
219
+
220
+ # indices in the ring for this frame: [T] in [0, L)
221
+ ring_idx = self.frame_offsets + base
222
+
223
+ # Always write current frame into the tail slice [L, L+T):
224
+ # this is the "self-attention component" for the current frame.
225
+ self.kv.index_copy_(3, self.current_idx, kv)
226
+
227
+ write_step = (frame_idx.remainder(self.pinned_dilation) == 0)
228
+ mask_written = self._mask_written
229
+ mask_written.copy_(self.written)
230
+ mask_written[ring_idx] = mask_written[ring_idx] & ~write_step
231
+ bm = make_block_mask(T, self.capacity, mask_written)
232
+
233
+ # Persist current frame into the ring for future queries when unfrozen.
234
+ if not is_frozen:
235
+ dst = torch.where(write_step, ring_idx, self.current_idx)
236
+ self.kv.index_copy_(3, dst, kv)
237
+ self.written[dst] = True
238
+
239
+ k, v = self.kv.unbind(0)
240
+ return k, v, bm
241
+
242
+
243
+ class StaticKVCache(nn.Module):
244
+ """Static KV cache with per-layer configuration for local/global attention."""
245
+
246
+ def __init__(self, config, batch_size, dtype):
247
+ super().__init__()
248
+
249
+ self.tpf = config.height * config.width
250
+
251
+ local_L = config.local_window * self.tpf
252
+ global_L = config.global_window * self.tpf
253
+
254
+ period = config.global_attn_period
255
+ off = getattr(config, "global_attn_offset", 0) % period
256
+ self.layers = nn.ModuleList(
257
+ [
258
+ LayerKVCache(
259
+ batch_size,
260
+ getattr(config, "n_kv_heads", None) or config.n_heads,
261
+ global_L if ((layer_idx - off) % period == 0) else local_L,
262
+ config.d_model // config.n_heads,
263
+ dtype,
264
+ self.tpf,
265
+ (
266
+ config.global_pinned_dilation
267
+ if ((layer_idx - off) % period == 0)
268
+ else 1
269
+ ),
270
+ )
271
+ for layer_idx in range(config.n_layers)
272
+ ]
273
+ )
274
+
275
+ self._is_frozen = True
276
+
277
+ def reset(self):
278
+ for layer in self.layers:
279
+ layer.reset()
280
+ self._is_frozen = True
281
+
282
+ @torch.inference_mode()
283
+ def get_state(self):
284
+ """Captures a world state to continue via load_state."""
285
+ layers = [(layer.kv.detach().clone(), layer.written.detach().clone()) for layer in self.layers]
286
+ return {"_is_frozen": self._is_frozen, "layers": layers}
287
+
288
+ @torch.inference_mode()
289
+ def load_state(self, state):
290
+ """Loads a world state object saved via get_state."""
291
+ self._is_frozen = bool(state.get("_is_frozen", True))
292
+ for layer, (kv, written) in zip(self.layers, state["layers"]):
293
+ layer.kv.copy_(kv)
294
+ layer.written.copy_(written)
295
+
296
+ def set_frozen(self, is_frozen: bool):
297
+ self._is_frozen = is_frozen
298
+
299
+ def upsert(self, k: Tensor, v: Tensor, pos_ids: TensorDict, layer: int):
300
+ kv = torch.stack([k, v], dim=0)
301
+ return self.layers[layer].upsert(kv, pos_ids, self._is_frozen)
302
+
303
+
304
+ # ---------------------------------------------------------------------------
305
+ # Pipeline step: Text Encoder
306
+ # ---------------------------------------------------------------------------
307
+
308
+ class WorldEngineTextEncoderStep(ModularPipelineBlocks):
309
+ """Encodes text prompts using UMT5-XL for conditioning."""
310
+
311
+ model_name = "world_engine"
312
+
313
+ @property
314
+ def description(self) -> str:
315
+ return (
316
+ "Text Encoder step that generates text embeddings to guide frame generation"
317
+ )
318
+
319
+ @property
320
+ def expected_components(self) -> list[ComponentSpec]:
321
+ return [
322
+ ComponentSpec("text_encoder", UMT5EncoderModel),
323
+ ComponentSpec("tokenizer", AutoTokenizer),
324
+ ]
325
+
326
+ @property
327
+ def inputs(self) -> list[InputParam]:
328
+ return [
329
+ InputParam(
330
+ "prompt",
331
+ description="The prompt or prompts to guide the frame generation",
332
+ ),
333
+ InputParam(
334
+ "prompt_embeds",
335
+ type_hint=torch.Tensor,
336
+ description="Pre-computed text embeddings",
337
+ ),
338
+ InputParam(
339
+ "prompt_pad_mask",
340
+ type_hint=torch.Tensor,
341
+ description="Padding mask for prompt embeddings",
342
+ ),
343
+ ]
344
+
345
+ @property
346
+ def intermediate_outputs(self) -> list[OutputParam]:
347
+ return [
348
+ OutputParam(
349
+ "prompt_embeds",
350
+ type_hint=torch.Tensor,
351
+ kwargs_type="denoiser_input_fields",
352
+ description="Text embeddings used to guide frame generation",
353
+ ),
354
+ OutputParam(
355
+ "prompt_pad_mask",
356
+ type_hint=torch.Tensor,
357
+ kwargs_type="denoiser_input_fields",
358
+ description="Padding mask for prompt embeddings",
359
+ ),
360
+ ]
361
+
362
+ @staticmethod
363
+ def check_inputs(block_state):
364
+ if block_state.prompt is not None and (
365
+ not isinstance(block_state.prompt, str)
366
+ and not isinstance(block_state.prompt, list)
367
+ ):
368
+ raise ValueError(
369
+ f"`prompt` has to be of type `str` or `list` but is {type(block_state.prompt)}"
370
+ )
371
+
372
+ @staticmethod
373
+ def encode_prompt(
374
+ components,
375
+ prompt: str | list[str],
376
+ device: torch.device,
377
+ max_sequence_length: int = 512,
378
+ ):
379
+ dtype = components.text_encoder.dtype
380
+
381
+ prompt = [prompt] if isinstance(prompt, str) else prompt
382
+ prompt = [prompt_clean(p) for p in prompt]
383
+
384
+ text_inputs = components.tokenizer(
385
+ prompt,
386
+ padding="max_length",
387
+ max_length=max_sequence_length,
388
+ truncation=True,
389
+ return_attention_mask=True,
390
+ return_tensors="pt",
391
+ )
392
+
393
+ text_input_ids = text_inputs.input_ids.to(device)
394
+ attention_mask = text_inputs.attention_mask.to(device)
395
+
396
+ prompt_embeds = components.text_encoder(
397
+ text_input_ids, attention_mask
398
+ ).last_hidden_state
399
+ prompt_embeds = prompt_embeds.to(dtype=dtype)
400
+
401
+ # Zero out padding
402
+ prompt_embeds = prompt_embeds * attention_mask.unsqueeze(-1).type_as(
403
+ prompt_embeds
404
+ )
405
+
406
+ # Create padding mask (True where padded)
407
+ prompt_pad_mask = attention_mask.eq(0)
408
+
409
+ return prompt_embeds, prompt_pad_mask
410
+
411
+ @torch.no_grad()
412
+ def __call__(
413
+ self, components: ModularPipeline, state: PipelineState
414
+ ) -> PipelineState:
415
+ block_state = self.get_block_state(state)
416
+ self.check_inputs(block_state)
417
+
418
+ device = components._execution_device
419
+ if block_state.prompt_embeds is None:
420
+ block_state.prompt = block_state.prompt or "An explorable world"
421
+ (
422
+ block_state.prompt_embeds,
423
+ block_state.prompt_pad_mask,
424
+ ) = self.encode_prompt(components, block_state.prompt, device)
425
+ block_state.prompt_embeds = block_state.prompt_embeds.contiguous()
426
+
427
+ if block_state.prompt_pad_mask is None:
428
+ block_state.prompt_pad_mask = torch.zeros(
429
+ block_state.prompt_embeds.shape[:2],
430
+ dtype=torch.bool,
431
+ device=device,
432
+ )
433
+
434
+ self.set_block_state(state, block_state)
435
+ return components, state
436
+
437
+
438
+ # ---------------------------------------------------------------------------
439
+ # Pipeline step: Controller Encoder
440
+ # ---------------------------------------------------------------------------
441
+
442
+ class WorldEngineControllerEncoderStep(ModularPipelineBlocks):
443
+ """Encodes controller inputs (mouse + buttons + scroll) for conditioning."""
444
+
445
+ model_name = "world_engine"
446
+
447
+ @property
448
+ def description(self) -> str:
449
+ return "Controller Encoder step that encodes mouse, button, and scroll inputs for conditioning"
450
+
451
+ @property
452
+ def expected_components(self) -> list[ComponentSpec]:
453
+ return [] # Controller embedding is part of transformer
454
+
455
+ @property
456
+ def expected_configs(self) -> list[ConfigSpec]:
457
+ return [ConfigSpec("n_buttons", 256)]
458
+
459
+ @property
460
+ def inputs(self) -> list[InputParam]:
461
+ return [
462
+ InputParam(
463
+ "button",
464
+ type_hint=set[int],
465
+ default=set(),
466
+ description="Set of pressed button IDs",
467
+ ),
468
+ InputParam(
469
+ "mouse",
470
+ type_hint=tuple[float, float],
471
+ default=(0.0, 0.0),
472
+ description="Mouse velocity (x, y)",
473
+ ),
474
+ InputParam(
475
+ "scroll",
476
+ type_hint=int,
477
+ default=0,
478
+ description="Scroll wheel direction (-1, 0, 1)",
479
+ ),
480
+ InputParam(
481
+ "button_tensor",
482
+ type_hint=torch.Tensor,
483
+ kwargs_type="denoiser_input_fields",
484
+ description="One-hot encoded button tensor",
485
+ ),
486
+ InputParam(
487
+ "mouse_tensor",
488
+ type_hint=torch.Tensor,
489
+ kwargs_type="denoiser_input_fields",
490
+ description="Mouse velocity tensor",
491
+ ),
492
+ InputParam(
493
+ "scroll_tensor",
494
+ type_hint=torch.Tensor,
495
+ kwargs_type="denoiser_input_fields",
496
+ description="Scroll wheel sign tensor",
497
+ ),
498
+ ]
499
+
500
+ @property
501
+ def intermediate_outputs(self) -> list[OutputParam]:
502
+ return [
503
+ OutputParam(
504
+ "button_tensor",
505
+ type_hint=torch.Tensor,
506
+ kwargs_type="denoiser_input_fields",
507
+ description="One-hot encoded button tensor",
508
+ ),
509
+ OutputParam(
510
+ "mouse_tensor",
511
+ type_hint=torch.Tensor,
512
+ kwargs_type="denoiser_input_fields",
513
+ description="Mouse velocity tensor",
514
+ ),
515
+ OutputParam(
516
+ "scroll_tensor",
517
+ type_hint=torch.Tensor,
518
+ kwargs_type="denoiser_input_fields",
519
+ description="Scroll wheel sign tensor",
520
+ ),
521
+ ]
522
+
523
+ @torch.no_grad()
524
+ def __call__(
525
+ self, components: ModularPipeline, state: PipelineState
526
+ ) -> PipelineState:
527
+ block_state = self.get_block_state(state)
528
+ device = components._execution_device
529
+ dtype = components.transformer.dtype
530
+
531
+ n_buttons = components.config.n_buttons
532
+
533
+ # Create or reuse button tensor [1, 1, n_buttons]
534
+ if block_state.button_tensor is None:
535
+ block_state.button_tensor = torch.zeros(
536
+ (1, 1, n_buttons), device=device, dtype=dtype
537
+ )
538
+
539
+ # Update button tensor in-place (avoid dynamic shapes for torch.compile)
540
+ block_state.button_tensor.zero_()
541
+ if block_state.button:
542
+ for btn_id in block_state.button:
543
+ if 0 <= btn_id < n_buttons:
544
+ block_state.button_tensor[0, 0, btn_id] = 1.0
545
+
546
+ # Create or reuse mouse tensor [1, 1, 2]
547
+ if block_state.mouse_tensor is None:
548
+ block_state.mouse_tensor = torch.zeros(
549
+ (1, 1, 2), device=device, dtype=dtype
550
+ )
551
+
552
+ # Update mouse tensor in-place
553
+ mouse = block_state.mouse if block_state.mouse is not None else (0.0, 0.0)
554
+ block_state.mouse_tensor[0, 0, 0] = mouse[0]
555
+ block_state.mouse_tensor[0, 0, 1] = mouse[1]
556
+
557
+ # Create or reuse scroll tensor [1, 1, 1]
558
+ if block_state.scroll_tensor is None:
559
+ block_state.scroll_tensor = torch.zeros(
560
+ (1, 1, 1), device=device, dtype=dtype
561
+ )
562
+
563
+ # Update scroll tensor in-place (sign of scroll value: -1, 0, or 1)
564
+ scroll = block_state.scroll if block_state.scroll is not None else 0
565
+ block_state.scroll_tensor[0, 0, 0] = float(scroll > 0) - float(scroll < 0)
566
+
567
+ self.set_block_state(state, block_state)
568
+ return components, state
569
+
570
+
571
+ # ---------------------------------------------------------------------------
572
+ # Pipeline step: Set Timesteps
573
+ # ---------------------------------------------------------------------------
574
+
575
+ class WorldEngineSetTimestepsStep(ModularPipelineBlocks):
576
+ """Sets up the scheduler sigmas for rectified flow denoising."""
577
+
578
+ model_name = "world_engine"
579
+
580
+ @property
581
+ def description(self) -> str:
582
+ return "Sets up scheduler sigmas for rectified flow denoising"
583
+
584
+ @property
585
+ def expected_components(self) -> list[ComponentSpec]:
586
+ return []
587
+
588
+ @property
589
+ def expected_configs(self) -> list[ConfigSpec]:
590
+ return [ConfigSpec("scheduler_sigmas", [1.0, 0.94921875, 0.83984375, 0.0])]
591
+
592
+ @property
593
+ def inputs(self) -> list[InputParam]:
594
+ return [
595
+ InputParam(
596
+ "scheduler_sigmas",
597
+ type_hint=list[float],
598
+ description="Custom scheduler sigmas (overrides config)",
599
+ ),
600
+ InputParam(
601
+ "frame_timestamp",
602
+ type_hint=torch.Tensor,
603
+ description="Current frame timestamp",
604
+ ),
605
+ ]
606
+
607
+ @property
608
+ def intermediate_outputs(self) -> list[OutputParam]:
609
+ return [
610
+ OutputParam(
611
+ "scheduler_sigmas",
612
+ type_hint=torch.Tensor,
613
+ description="Tensor of scheduler sigmas for denoising",
614
+ ),
615
+ OutputParam(
616
+ "frame_timestamp",
617
+ type_hint=torch.Tensor,
618
+ description="Current frame timestamp (unscaled counter)",
619
+ ),
620
+ OutputParam(
621
+ "ts_mult",
622
+ type_hint=int,
623
+ description="Timestamp multiplier (base_fps // latent_fps)",
624
+ ),
625
+ ]
626
+
627
+ @torch.no_grad()
628
+ def __call__(
629
+ self, components: ModularPipeline, state: PipelineState
630
+ ) -> PipelineState:
631
+ block_state = self.get_block_state(state)
632
+ device = components._execution_device
633
+ dtype = components.transformer.dtype
634
+
635
+ # Use provided sigmas or get from config
636
+ sigmas = block_state.scheduler_sigmas
637
+ if sigmas is None:
638
+ sigmas = components.config.scheduler_sigmas
639
+ block_state.scheduler_sigmas = torch.tensor(
640
+ sigmas, device=device, dtype=dtype
641
+ )
642
+
643
+ frame_ts = block_state.frame_timestamp
644
+ if frame_ts is None:
645
+ frame_ts = torch.tensor([[0]], dtype=torch.long, device=device)
646
+ elif isinstance(frame_ts, int):
647
+ frame_ts = torch.tensor([[frame_ts]], dtype=torch.long, device=device)
648
+
649
+ # Compute ts_mult: ratio of base_fps to latent_fps
650
+ t_cfg = components.transformer.config
651
+ base_fps = getattr(t_cfg, "base_fps", 60)
652
+ inference_fps = getattr(t_cfg, "inference_fps", base_fps)
653
+ temporal_compression = getattr(t_cfg, "temporal_compression", 1)
654
+ latent_fps = inference_fps / temporal_compression
655
+ ts_mult = int(base_fps) // int(latent_fps)
656
+ block_state.ts_mult = ts_mult
657
+ block_state.frame_timestamp = frame_ts
658
+
659
+ self.set_block_state(state, block_state)
660
+ return components, state
661
+
662
+
663
+ # ---------------------------------------------------------------------------
664
+ # Pipeline step: Setup KV Cache
665
+ # ---------------------------------------------------------------------------
666
+
667
+ class WorldEngineSetupKVCacheStep(ModularPipelineBlocks):
668
+ """Initializes or reuses the KV cache for autoregressive generation."""
669
+
670
+ model_name = "world_engine"
671
+
672
+ @property
673
+ def description(self) -> str:
674
+ return "Initializes or reuses KV cache for autoregressive frame generation"
675
+
676
+ @property
677
+ def expected_components(self) -> list[ComponentSpec]:
678
+ return []
679
+
680
+ @property
681
+ def inputs(self) -> list[InputParam]:
682
+ return [
683
+ InputParam(
684
+ "kv_cache",
685
+ type_hint=StaticKVCache | None,
686
+ description="Existing KV cache (will be reused if provided)",
687
+ ),
688
+ InputParam(
689
+ "reset_cache",
690
+ type_hint=bool,
691
+ default=False,
692
+ description="If True, reset the KV cache even if one exists",
693
+ ),
694
+ ]
695
+
696
+ @property
697
+ def intermediate_outputs(self) -> list[OutputParam]:
698
+ return [
699
+ OutputParam(
700
+ "kv_cache",
701
+ type_hint=StaticKVCache,
702
+ description="KV cache for transformer attention",
703
+ ),
704
+ ]
705
+
706
+ @torch.no_grad()
707
+ def __call__(
708
+ self, components: ModularPipeline, state: PipelineState
709
+ ) -> PipelineState:
710
+ block_state = self.get_block_state(state)
711
+ device = components._execution_device
712
+ dtype = components.transformer.dtype
713
+
714
+ # Create or reuse KV cache
715
+ if block_state.kv_cache is None:
716
+ block_state.kv_cache = StaticKVCache(
717
+ components.transformer.config,
718
+ batch_size=1,
719
+ dtype=dtype,
720
+ ).to(device)
721
+ elif block_state.reset_cache:
722
+ block_state.kv_cache.reset()
723
+
724
+ self.set_block_state(state, block_state)
725
+ return components, state
726
+
727
+
728
+ # ---------------------------------------------------------------------------
729
+ # Pipeline step: Prepare Latents
730
+ # ---------------------------------------------------------------------------
731
+
732
class WorldEnginePrepareLatentsStep(ModularPipelineBlocks):
    """Prepares latents for frame generation, optionally encoding an input image.

    When an image is provided (first frame), it is preprocessed, VAE-encoded,
    and written into the KV cache as context via a cache pass. Fresh noise is
    then created for the actual denoising, unless ``use_random_latents`` is
    False and latents were supplied by the caller.
    """

    model_name = "world_engine"

    @property
    def description(self) -> str:
        return (
            "Prepares latents for frame generation. If an image is provided on the "
            "first frame, encodes it and caches it as context. Always creates fresh "
            "random noise for the actual denoising."
        )

    @property
    def expected_components(self) -> list[ComponentSpec]:
        return [
            ComponentSpec(
                "image_processor",
                VaeImageProcessor,
                config=FrozenDict(
                    {
                        "vae_scale_factor": 16,
                        "do_normalize": False,
                        "do_convert_rgb": False,
                    }
                ),
                default_creation_method="from_config",
            ),
        ]

    @property
    def expected_configs(self) -> list[ConfigSpec]:
        # Defaults only — the repository's modular_model_index.json may
        # override these (e.g. channels/width differ there).
        return [
            ConfigSpec("channels", 16),
            ConfigSpec("height", 16),
            ConfigSpec("width", 16),
            ConfigSpec("patch", [2, 2]),
            ConfigSpec("vae_scale_factor", 16),
        ]

    @property
    def inputs(self) -> list[InputParam]:
        return [
            InputParam(
                "image",
                type_hint=PIL.Image.Image | torch.Tensor,
                description="Input image (PIL Image or [H, W, 3] uint8 tensor), only used on first frame",
            ),
            InputParam(
                "latents",
                type_hint=torch.Tensor,
                description="Latent tensor for denoising [1, 1, C, H, W]. Only used if use_random_latents=False.",
            ),
            InputParam(
                "use_random_latents",
                type_hint=bool,
                default=True,
                description="If True, always generate fresh random latents. If False, use provided latents.",
            ),
            InputParam(
                "kv_cache",
                description="KV cache to update",
            ),
            InputParam(
                "frame_timestamp",
                type_hint=torch.Tensor,
                description="Current frame timestamp",
            ),
            InputParam(
                "prompt_embeds",
                type_hint=torch.Tensor,
                description="Prompt embeddings for cache pass",
            ),
            InputParam(
                "prompt_pad_mask",
                type_hint=torch.Tensor,
                description="Prompt padding mask",
            ),
            InputParam(
                "button_tensor",
                type_hint=torch.Tensor,
                description="Button tensor for cache pass",
            ),
            InputParam(
                "mouse_tensor",
                type_hint=torch.Tensor,
                description="Mouse tensor for cache pass",
            ),
            InputParam(
                "scroll_tensor",
                type_hint=torch.Tensor,
                description="Scroll tensor for cache pass",
            ),
            InputParam(
                "generator",
                type_hint=torch.Generator,
                default=None,
                description="torch Generator for deterministic output",
            ),
            InputParam(
                "ts_mult",
                required=True,
                type_hint=int,
                description="Timestamp multiplier (base_fps // latent_fps)",
            ),
        ]

    @property
    def intermediate_outputs(self) -> list[OutputParam]:
        return [
            OutputParam(
                "latents",
                type_hint=torch.Tensor,
                description="Latent tensor for denoising [1, 1, C, H, W]",
            ),
        ]

    @staticmethod
    def _cache_pass(
        transformer,
        x,
        frame_timestamp,
        frame_idx,
        prompt_emb,
        prompt_pad_mask,
        mouse,
        button,
        scroll,
        kv_cache,
    ):
        """Cache pass to persist frame in KV cache.

        Runs the transformer at sigma=0 (clean frame) with the cache unfrozen
        so the frame's k/v get written as context for later frames.
        """
        kv_cache.set_frozen(False)
        transformer(
            x=x,
            sigma=x.new_zeros((x.size(0), x.size(1))),
            frame_timestamp=frame_timestamp,
            frame_idx=frame_idx,
            prompt_emb=prompt_emb,
            prompt_pad_mask=prompt_pad_mask,
            mouse=mouse,
            button=button,
            scroll=scroll,
            kv_cache=kv_cache,
        )

    @torch.inference_mode()
    def __call__(
        self, components: ModularPipeline, state: PipelineState
    ) -> PipelineState:
        """Encode the optional seed image into the cache and prepare noise."""
        block_state = self.get_block_state(state)
        device = components._execution_device

        # Latent geometry: (height, width) is the patch grid; multiplying by
        # the patch size yields the latent resolution.
        channels = components.config.channels
        height = components.config.height  # patch grid height
        width = components.config.width  # patch grid width
        patch = components.config.patch
        vae_scale_factor = components.config.vae_scale_factor

        pH, pW = patch if isinstance(patch, (list, tuple)) else (patch, patch)
        latent_H = height * pH
        latent_W = width * pW
        shape = (1, 1, channels, latent_H, latent_W)

        # Pixel dimensions for image preprocessing
        pixel_H = latent_H * vae_scale_factor
        pixel_W = latent_W * vae_scale_factor

        if block_state.image is not None:
            image = block_state.image
            # Preprocess: PIL/tensor -> [B, C, H, W] float32 in [0, 1]
            image = components.image_processor.preprocess(
                image,
                height=pixel_H,
                width=pixel_W,
            )
            # Convert to [H, W, 3] uint8 for VAE encoder
            image = (image[0].permute(1, 2, 0) * 255).to(torch.uint8)

            assert image.dtype == torch.uint8, (
                f"Expected uint8 image, got {image.dtype}"
            )

            # Temporal VAE expects [T, H, W, C]; repeat the single frame
            t_down = getattr(components.vae, "t_downscale", 1)
            if t_down > 1:
                image = image.unsqueeze(0).expand(t_down, -1, -1, -1)

            # NOTE(review): `image` is not explicitly moved to `device` here;
            # presumably components.vae.encode handles placement — confirm.
            latents = components.vae.encode(image)
            latents = latents.unsqueeze(1)

            # Run cache pass to persist encoded frame
            ts_mult = block_state.ts_mult
            self._cache_pass(
                components.transformer,
                latents,
                block_state.frame_timestamp * ts_mult,
                block_state.frame_timestamp,
                block_state.prompt_embeds,
                block_state.prompt_pad_mask,
                block_state.mouse_tensor,
                block_state.button_tensor,
                block_state.scroll_tensor,
                block_state.kv_cache,
            )
            # Advance the timestamp in place (it is a tensor shared via state).
            block_state.frame_timestamp.add_(1)

        # Generate latents based on use_random_latents flag
        if block_state.use_random_latents or block_state.latents is None:
            # BUGFIX: the declared `generator` input was previously ignored,
            # so seeding had no effect on the sampled noise.
            generator = block_state.generator
            if (
                generator is not None
                and generator.device.type != torch.device(device).type
            ):
                # Sample on the generator's own device (e.g. a CPU generator
                # with a CUDA pipeline), then move — mirrors diffusers'
                # randn_tensor semantics and keeps runs reproducible.
                block_state.latents = torch.randn(
                    shape,
                    generator=generator,
                    device=generator.device,
                    dtype=torch.bfloat16,
                ).to(device)
            else:
                block_state.latents = torch.randn(
                    shape, generator=generator, device=device, dtype=torch.bfloat16
                )

        self.set_block_state(state, block_state)
        return components, state
948
+
949
+
950
+ # ---------------------------------------------------------------------------
951
+ # Pipeline step: Before Denoise (sequential wrapper)
952
+ # ---------------------------------------------------------------------------
953
+
954
class WorldEngineBeforeDenoiseStep(SequentialPipelineBlocks):
    """Sequential pipeline that prepares all inputs for denoising."""

    # Sub-blocks run in this order; `block_names` must stay index-aligned
    # with `block_classes`.
    block_classes = [
        WorldEngineSetTimestepsStep,
        WorldEngineSetupKVCacheStep,
        WorldEnginePrepareLatentsStep,
    ]
    block_names = ["set_timesteps", "setup_kv_cache", "prepare_latents"]

    @property
    def description(self) -> str:
        # Human-readable summary surfaced by ModularPipeline introspection.
        return (
            "Before denoise step that prepares inputs for denoising:\n"
            " - WorldEngineSetTimestepsStep: Set up scheduler sigmas\n"
            " - WorldEngineSetupKVCacheStep: Initialize or reuse KV cache\n"
            " - WorldEnginePrepareLatentsStep: Encode image (if first frame) and create noise"
        )
972
+
973
+
974
+ # ---------------------------------------------------------------------------
975
+ # Pipeline step: Denoise Loop
976
+ # ---------------------------------------------------------------------------
977
+
978
class WorldEngineDenoiseLoop(ModularPipelineBlocks):
    """Denoises latents using rectified flow and updates KV cache."""

    model_name = "world_engine"

    @property
    def expected_components(self) -> list[ComponentSpec]:
        return [ComponentSpec("transformer", AutoModel)]

    @property
    def description(self) -> str:
        return (
            "Denoises latents using rectified flow (x = x + dsigma * v) "
            "and updates KV cache for autoregressive generation."
        )

    @property
    def inputs(self) -> list[InputParam]:
        return [
            InputParam("scheduler_sigmas", required=True, type_hint=torch.Tensor, description="Scheduler sigmas for denoising"),
            InputParam("latents", required=True, type_hint=torch.Tensor, description="Initial noisy latents [1, 1, C, H, W]"),
            InputParam("kv_cache", required=True, description="KV cache for transformer attention"),
            InputParam("frame_timestamp", required=True, type_hint=torch.Tensor, description="Current frame timestamp"),
            InputParam("prompt_embeds", required=True, type_hint=torch.Tensor, description="Text embeddings for conditioning"),
            InputParam("prompt_pad_mask", type_hint=torch.Tensor, description="Padding mask for prompt embeddings"),
            InputParam("button_tensor", required=True, type_hint=torch.Tensor, description="One-hot encoded button tensor"),
            InputParam("mouse_tensor", required=True, type_hint=torch.Tensor, description="Mouse velocity tensor"),
            InputParam("scroll_tensor", required=True, type_hint=torch.Tensor, description="Scroll wheel sign tensor"),
            InputParam("ts_mult", required=True, type_hint=int, description="Timestamp multiplier (base_fps // latent_fps)"),
        ]

    @property
    def intermediate_outputs(self) -> list[OutputParam]:
        return [
            OutputParam("latents", type_hint=torch.Tensor, description="Denoised latents"),
        ]

    @staticmethod
    def _denoise_pass(
        transformer, x, sigmas, frame_timestamp, frame_idx,
        prompt_emb, prompt_pad_mask, mouse, button, scroll, kv_cache,
    ):
        """Denoising loop using rectified flow.

        The cache is frozen so denoising steps read context without writing
        it; `zip(sigmas, sigmas.diff())` pairs each sigma with its step size
        (one fewer step than sigmas, as intended for an Euler walk).
        """
        kv_cache.set_frozen(True)
        # Per-(batch, frame) sigma buffer, refilled in place each step.
        sigma = x.new_empty((x.size(0), x.size(1)))
        for step_sig, step_dsig in zip(sigmas, sigmas.diff()):
            v = transformer(
                x=x, sigma=sigma.fill_(step_sig),
                frame_timestamp=frame_timestamp, frame_idx=frame_idx,
                prompt_emb=prompt_emb, prompt_pad_mask=prompt_pad_mask,
                mouse=mouse, button=button, scroll=scroll,
                kv_cache=kv_cache,
            )
            # Rectified-flow Euler update; dsigma is negative on a
            # decreasing schedule, stepping x toward the data.
            x = x + step_dsig * v
        return x

    @staticmethod
    def _cache_pass(
        transformer, x, frame_timestamp, frame_idx,
        prompt_emb, prompt_pad_mask, mouse, button, scroll, kv_cache,
    ):
        """Cache pass to persist frame for next generation."""
        # Unfreeze so this sigma=0 (clean-frame) forward writes k/v context.
        kv_cache.set_frozen(False)
        transformer(
            x=x, sigma=x.new_zeros((x.size(0), x.size(1))),
            frame_timestamp=frame_timestamp, frame_idx=frame_idx,
            prompt_emb=prompt_emb, prompt_pad_mask=prompt_pad_mask,
            mouse=mouse, button=button, scroll=scroll,
            kv_cache=kv_cache,
        )

    @torch.inference_mode()
    def __call__(
        self, components: ModularPipeline, state: PipelineState
    ) -> PipelineState:
        """Denoise the current frame, persist it in the cache, advance time.

        NOTE(review): like the other blocks here, this returns
        ``(components, state)`` despite the ``-> PipelineState`` annotation.
        """
        block_state = self.get_block_state(state)
        ts_mult = block_state.ts_mult
        # NOTE(review): .clone() presumably decouples the result from
        # compiled/static buffers reused by the transformer — confirm.
        block_state.latents = self._denoise_pass(
            components.transformer,
            block_state.latents,
            block_state.scheduler_sigmas,
            block_state.frame_timestamp * ts_mult,
            block_state.frame_timestamp,
            block_state.prompt_embeds,
            block_state.prompt_pad_mask,
            block_state.mouse_tensor,
            block_state.button_tensor,
            block_state.scroll_tensor,
            block_state.kv_cache,
        ).clone()

        # Persist the denoised frame so subsequent frames attend to it.
        self._cache_pass(
            components.transformer,
            block_state.latents,
            block_state.frame_timestamp * ts_mult,
            block_state.frame_timestamp,
            block_state.prompt_embeds,
            block_state.prompt_pad_mask,
            block_state.mouse_tensor,
            block_state.button_tensor,
            block_state.scroll_tensor,
            block_state.kv_cache,
        )
        # Advance the shared frame counter in place.
        block_state.frame_timestamp.add_(1)

        self.set_block_state(state, block_state)
        return components, state
1085
+
1086
+
1087
+ # ---------------------------------------------------------------------------
1088
+ # Pipeline step: Decode
1089
+ # ---------------------------------------------------------------------------
1090
+
1091
class WorldEngineDecodeStep(ModularPipelineBlocks):
    """Decodes denoised latents back to RGB image using VAE."""

    model_name = "world_engine"

    @property
    def expected_components(self) -> list[ComponentSpec]:
        # VAE for decoding plus a config-instantiated image processor.
        processor_config = FrozenDict(
            {
                "vae_scale_factor": 16,
                "do_normalize": False,
                "do_convert_rgb": True,
            }
        )
        return [
            ComponentSpec("vae", AutoModel),
            ComponentSpec(
                "image_processor",
                VaeImageProcessor,
                config=processor_config,
                default_creation_method="from_config",
            ),
        ]

    @property
    def description(self) -> str:
        return "Decodes denoised latents to RGB image using the VAE decoder"

    @property
    def inputs(self) -> list[InputParam]:
        return [
            InputParam("latents", required=True, type_hint=torch.Tensor, description="Denoised latent tensor [1, 1, C, H, W]"),
            InputParam("output_type", default="pil", description="The output format for the generated images (pil, latent, pt, or np)"),
        ]

    @property
    def intermediate_outputs(self) -> list[OutputParam]:
        return [
            OutputParam(
                "images",
                type_hint=PIL.Image.Image | torch.Tensor | np.ndarray,
                description="Decoded RGB image in requested output format",
            ),
        ]

    @torch.no_grad()
    def __call__(
        self, components: ModularPipeline, state: PipelineState
    ) -> PipelineState:
        """Decode the current latents into the requested output format."""
        block_state = self.get_block_state(state)
        current_latents = block_state.latents
        requested = block_state.output_type or "pil"

        if requested == "latent":
            # Caller wants raw latents; skip VAE decoding entirely.
            block_state.images = current_latents
        else:
            decoded = components.vae.decode(current_latents.squeeze(1))

            if decoded.dim() == 3:
                # A lone [H, W, C] frame is lifted to [1, H, W, C] so every
                # branch below can treat `decoded` as a batch of frames.
                decoded = decoded.unsqueeze(0)

            if requested == "pt":
                block_state.images = decoded
            elif requested == "np":
                block_state.images = decoded.cpu().numpy()
            else:  # "pil"
                block_state.images = [
                    PIL.Image.fromarray(frame.cpu().numpy()) for frame in decoded
                ]

        # Drop the consumed latents so the next frame samples fresh noise.
        block_state.latents = None
        self.set_block_state(state, block_state)
        return components, state
1167
+
1168
+
1169
+ # ---------------------------------------------------------------------------
1170
+ # Top-level block registry
1171
+ # ---------------------------------------------------------------------------
1172
+
1173
# Ordered registry of the pipeline's steps; insertion order defines the
# execution order used by WorldEngineBlocks.
AUTO_BLOCKS = InsertableDict(
    [
        ("text_encoder", WorldEngineTextEncoderStep),
        ("controller_encoder", WorldEngineControllerEncoderStep),
        ("before_denoise", WorldEngineBeforeDenoiseStep),
        ("denoise", WorldEngineDenoiseLoop),
        ("decode", WorldEngineDecodeStep),
    ]
)
1182
+
1183
+
1184
class WorldEngineBlocks(SequentialPipelineBlocks):
    """Sequential pipeline blocks for WorldEngine frame generation."""

    # Both lists are derived from the single AUTO_BLOCKS registry so names
    # and classes cannot drift out of sync.
    block_classes = list(AUTO_BLOCKS.values())
    block_names = list(AUTO_BLOCKS.keys())
modular_config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "_class_name": "WorldEngineBlocks",
3
+ "_diffusers_version": "0.36.0.dev0",
4
+ "auto_map": {
5
+ "ModularPipelineBlocks": "modular_blocks.WorldEngineBlocks"
6
+ }
7
+ }
modular_model_index.json ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_blocks_class_name": "WorldEngineBlocks",
3
+ "_class_name": "ModularPipeline",
4
+ "_diffusers_version": "0.36.0.dev0",
5
+ "channels": 32,
6
+ "height": 16,
7
+ "width": 32,
8
+ "patch": [
9
+ 2,
10
+ 2
11
+ ],
12
+ "vae_scale_factor": 16,
13
+ "n_buttons": 256,
14
+ "tokens_per_frame": 512,
15
+ "scheduler_sigmas": [
16
+ 1.0,
17
+ 0.9,
18
+ 0.75,
19
+ 0.3,
20
+ 0.0
21
+ ],
22
+ "transformer": [
23
+ null,
24
+ null,
25
+ {
26
+ "pretrained_model_name_or_path": "Overworld/Waypoint-1.5-1B",
27
+ "subfolder": "transformer",
28
+ "type_hint": [
29
+ "diffusers",
30
+ "AutoModel"
31
+ ],
32
+ "revision": null,
33
+ "variant": null
34
+ }
35
+ ],
36
+ "vae": [
37
+ null,
38
+ null,
39
+ {
40
+ "pretrained_model_name_or_path": "Overworld/Waypoint-1.5-1B",
41
+ "subfolder": "vae",
42
+ "type_hint": [
43
+ "diffusers",
44
+ "AutoModel"
45
+ ],
46
+ "revision": null,
47
+ "variant": null
48
+ }
49
+ ],
50
+ "text_encoder": [
51
+ null,
52
+ null,
53
+ {
54
+ "pretrained_model_name_or_path": "google/umt5-xl",
55
+ "type_hint": [
56
+ "transformers",
57
+ "UMT5EncoderModel"
58
+ ],
59
+ "revision": null,
60
+ "variant": null
61
+ }
62
+ ],
63
+ "tokenizer": [
64
+ null,
65
+ null,
66
+ {
67
+ "pretrained_model_name_or_path": "google/umt5-xl",
68
+ "type_hint": [
69
+ "transformers",
70
+ "AutoTokenizer"
71
+ ],
72
+ "revision": null,
73
+ "variant": null
74
+ }
75
+ ]
76
+ }
transformer/__init__.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (C) 2025 Hugging Face Team and Overworld
2
+ #
3
+ # This program is free software: you can redistribute it and/or modify
4
+ # it under the terms of the GNU General Public License as published by
5
+ # the Free Software Foundation, either version 3 of the License, or
6
+ # (at your option) any later version.
7
+ #
8
+ # This program is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
+ # GNU General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU General Public License
14
+ # along with this program. If not, see <https://www.gnu.org/licenses/>.
15
+
16
+ from .model import WorldModel
transformer/config.json ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_class_name": "WorldModel",
3
+ "_diffusers_version": "0.36.0.dev0",
4
+ "auto_map": {
5
+ "AutoModel": "model.WorldModel"
6
+ },
7
+ "d_model": 2048,
8
+ "n_heads": 32,
9
+ "n_kv_heads": 16,
10
+ "n_layers": 24,
11
+ "mlp_ratio": 4,
12
+ "channels": 32,
13
+ "height": 16,
14
+ "width": 32,
15
+ "patch": [
16
+ 2,
17
+ 2
18
+ ],
19
+ "tokens_per_frame": 512,
20
+ "n_frames": 512,
21
+ "local_window": 16,
22
+ "global_window": 128,
23
+ "global_attn_period": 4,
24
+ "global_pinned_dilation": 8,
25
+ "global_attn_offset": -1,
26
+ "value_residual": true,
27
+ "gated_attn": false,
28
+ "n_buttons": 256,
29
+ "ctrl_conditioning": true,
30
+ "ctrl_conditioning_period": 3,
31
+ "ctrl_cond_dropout": 0.0,
32
+ "prompt_conditioning": null,
33
+ "prompt_conditioning_period": 3,
34
+ "prompt_embedding_dim": 2048,
35
+ "prompt_cond_dropout": 0.0,
36
+ "noise_conditioning": "wan",
37
+ "base_fps": 15,
38
+ "causal": true,
39
+ "mlp_gradient_checkpointing": true,
40
+ "block_gradient_checkpointing": true,
41
+ "rope_impl": "ortho",
42
+ "moe": false,
43
+ "moe_top_k": 2,
44
+ "moe_n_experts": 8,
45
+ "moe_mlp_ratio": null,
46
+ "gated_linear": false,
47
+ "temporal_compression": 4,
48
+ "inference_fps": 60,
49
+ "taehv_ae": true,
50
+ "rope_nyquist_frac": 0.8,
51
+ "rope_theta": 10000.0,
52
+ "scheduler_sigmas": [
53
+ 1.0,
54
+ 0.9,
55
+ 0.75,
56
+ 0.3,
57
+ 0.0
58
+ ]
59
+ }
transformer/diffusion_pytorch_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:467ea80d76f63a82a2f5ba80d1170c27dc1060e60c3ede341105bff89b468700
3
+ size 7443340184
transformer/model.py ADDED
@@ -0,0 +1,1100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (C) 2025 Hugging Face Team and Overworld
2
+ #
3
+ # This program is free software: you can redistribute it and/or modify
4
+ # it under the terms of the GNU General Public License as published by
5
+ # the Free Software Foundation, either version 3 of the License, or
6
+ # (at your option) any later version.
7
+ #
8
+ # This program is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
+ # GNU General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU General Public License
14
+ # along with this program. If not, see <https://www.gnu.org/licenses/>.
15
+
16
+ """WorldModel transformer for frame generation.
17
+
18
+ Single-file model containing all building blocks: nn primitives, attention,
19
+ RoPE, quantization, inference caching, and the top-level WorldModel.
20
+ """
21
+
22
+ import warnings
23
+
24
+ import einops as eo
25
+ import torch
26
+ from torch import nn, Tensor
27
+ import torch.nn.functional as F
28
+ from tensordict import TensorDict
29
+ from diffusers.configuration_utils import ConfigMixin, register_to_config
30
+ from diffusers.models.modeling_utils import ModelMixin
31
+
32
+ try:
33
+ from fbgemm_gpu.experimental.gen_ai.moe import index_shuffling
34
+ import fbgemm_gpu.experimental.gen_ai.moe.gather_scatter # noqa
35
+ HAS_FBGEMM = True
36
+ except ImportError:
37
+ HAS_FBGEMM = False
38
+
39
+
40
+ # ---------------------------------------------------------------------------
41
+ # NN primitives
42
+ # ---------------------------------------------------------------------------
43
+
44
class NoCastModule(torch.nn.Module):
    """Module that prevents dtype casting during .to() calls.

    Device moves are honored; any attempted dtype change is dropped with a
    warning, so submodules with deliberately fixed precision (e.g. fp32
    buffers) keep their dtypes under `.to(...)` / `.half()` / autocast-style
    conversions routed through `_apply`.
    """

    def _apply(self, fn):
        # Wrap `fn` (from .to/.cuda/.float/...) so it may move devices but
        # never change a parameter's or buffer's dtype.
        def keep_dtype(t):
            old_dtype = t.dtype
            out = fn(t)
            if out.dtype is not old_dtype:
                warnings.warn(
                    f"{self.__class__.__name__}: requested dtype cast ignored; "
                    f"keeping {old_dtype}.",
                    stacklevel=3,
                )
                out = out.to(dtype=old_dtype)
            return out

        return super()._apply(keep_dtype)

    def to(self, *args, **kwargs):
        """Like ``nn.Module.to`` but with all dtype arguments stripped."""
        warn_cast = False

        if args and isinstance(args[0], torch.Tensor):
            # `.to(tensor)` means "match this tensor's device and dtype";
            # keep only the device part.
            ref, *rest = args
            args = (ref.device, *rest)
            # BUGFIX: this used `next(...) or next(...)`, which evaluates the
            # truth value of a Parameter and raises a RuntimeError for any
            # multi-element tensor. Fall back to the first buffer explicitly.
            base = next(self.parameters(), None)
            if base is None:
                base = next(self.buffers(), None)
            if base is not None and ref.dtype is not base.dtype:
                warn_cast = True

        # Pop the keyword dtype (if any) so it never reaches nn.Module.to.
        if kwargs.pop("dtype", None) is not None:
            warn_cast = True

        # Drop positional dtype arguments (e.g. `.to(torch.float16)`).
        args = tuple(a for a in args if not isinstance(a, torch.dtype))

        if warn_cast:
            warnings.warn(
                f"{self.__class__.__name__}.to: requested dtype cast ignored; "
                "keeping existing dtypes.",
                stacklevel=2,
            )

        return super().to(*args, **kwargs)
85
+
86
+
87
+ def rms_norm(x: torch.Tensor) -> torch.Tensor:
88
+ """Root mean square layer normalization."""
89
+ return F.rms_norm(x, (x.size(-1),))
90
+
91
+
92
class MLP(nn.Module):
    """Two-layer feed-forward network with a SiLU nonlinearity in between."""

    def __init__(self, dim_in, dim_middle, dim_out):
        super().__init__()
        # Attribute names fc1/fc2 are part of the state-dict layout; keep them.
        self.fc1 = nn.Linear(dim_in, dim_middle, bias=False)
        self.fc2 = nn.Linear(dim_middle, dim_out, bias=False)

    def forward(self, x):
        hidden = F.silu(self.fc1(x))
        return self.fc2(hidden)
102
+
103
+
104
class AdaLN(nn.Module):
    """Adaptive layer norm: per-frame scale/shift derived from a condition.

    ``cond`` carries one vector per frame; each frame's (scale, shift) pair
    is broadcast across that frame's tokens in ``x``.
    """

    def __init__(self, dim):
        super().__init__()
        # Projects the conditioning vector to concatenated (scale, shift).
        self.fc = nn.Linear(dim, 2 * dim, bias=False)

    def forward(self, x, cond):
        batch, n_frames, dim = cond.shape
        tokens_per_frame = x.size(1) // n_frames

        mod = self.fc(F.silu(cond))                 # [b, n, 2d]
        mod = mod.view(batch, n_frames, 1, 2 * dim) # add a token axis
        mod = mod.expand(-1, -1, tokens_per_frame, -1)
        mod = mod.reshape(batch, n_frames * tokens_per_frame, 2 * dim)

        scale, shift = mod.chunk(2, dim=-1)         # [b, nm, d] each
        return rms_norm(x) * (1 + scale) + shift
125
+
126
+
127
def ada_rmsnorm(x, scale, bias):
    """Adaptive RMS normalization: per-frame (scale, bias) over flat tokens.

    ``x`` is [b, n*m, d]; ``scale``/``bias`` are [b, n, d] with one row per
    frame, broadcast over each frame's m tokens.
    """
    b, nm, d = x.shape
    n = scale.size(1)
    frames = x.reshape(b, n, nm // n, d)
    modulated = rms_norm(frames) * (1 + scale.unsqueeze(2)) + bias.unsqueeze(2)
    return modulated.reshape(b, nm, d)
132
+
133
+
134
def ada_gate(x, gate):
    """Scale each frame's tokens by that frame's gate vector.

    ``x`` is [b, n*m, d]; ``gate`` is [b, n, d], broadcast over the m tokens
    belonging to each of the n frames.
    """
    b, nm, d = x.shape
    n = gate.size(1)
    gated = x.reshape(b, n, nm // n, d) * gate.unsqueeze(2)
    return gated.reshape(b, nm, d)
138
+
139
+
140
class NoiseConditioner(NoCastModule):
    """Sigma -> logSNR -> Fourier Features -> Dense embedding."""

    def __init__(self, dim, fourier_dim=512, base=10_000.0):
        super().__init__()
        assert fourier_dim % 2 == 0
        half = fourier_dim // 2
        # Log-spaced frequencies from base^0 down to base^-1, kept in fp32
        # and excluded from the state dict (persistent=False).
        self.freq = nn.Buffer(
            torch.logspace(0, -1, steps=half, base=base, dtype=torch.float32),
            persistent=False,
        )
        self.mlp = MLP(fourier_dim, dim * 4, dim)

    def forward(self, s, eps=torch.finfo(torch.float32).eps):
        # NOTE(review): `eps` is accepted but unused in this body — confirm
        # whether it was meant to guard a log somewhere.
        assert self.freq.dtype == torch.float32
        orig_dtype, shape = s.dtype, s.shape

        # Compute embeddings in fp32 regardless of autocast to preserve
        # sin/cos precision at high frequencies.
        with torch.autocast("cuda", enabled=False):
            s = s.reshape(-1).float()
            s = s * 1000  # scale sigma into the Fourier features' working range

            phase = s[:, None] * self.freq[None, :]
            emb = torch.cat((torch.sin(phase), torch.cos(phase)), dim=-1)
            emb = emb * 2**0.5  # unit-variance correction for sin/cos features
            emb = self.mlp(emb)

        # Restore the caller's dtype and leading shape, adding a feature dim.
        return emb.to(orig_dtype).view(*shape, -1)
167
+
168
+
169
+ # ---------------------------------------------------------------------------
170
+ # Attention
171
+ # ---------------------------------------------------------------------------
172
+
173
class OrthoRoPEAngles(NoCastModule):
    """Computes RoPE angles on the fly each forward pass."""

    def __init__(self, config):
        super().__init__()
        self.config = config

        d_head = config.d_model // config.n_heads
        torch._assert(d_head % 8 == 0, "d_head must be divisible by 8")
        # Split the head dim between spatial (x and y, d_head/8 each) and
        # temporal (d_head/4) rotation frequencies.
        d_xy, d_t = d_head // 8, d_head // 4

        # Spatial frequencies capped at a fraction of the grid Nyquist rate.
        nyq = float(getattr(config, "rope_nyquist_frac", 0.8))
        max_freq = min(self.config.height, self.config.width) * nyq
        n = (d_xy + 1) // 2
        # repeat_interleave(2) pairs each frequency for (cos, sin) channels.
        xy = (torch.linspace(1.0, max_freq / 2, n, dtype=torch.float32) * torch.pi).repeat_interleave(2)[:d_xy]

        # Classic RoPE inverse-frequency schedule for the time axis.
        theta = float(getattr(config, "rope_theta", 10000.0))
        inv_t = 1.0 / (theta ** (torch.arange(0, d_t, 2, dtype=torch.float32) / d_t))
        inv_t = inv_t.repeat_interleave(2)

        # Non-persistent: recomputed from config, never serialized.
        self.register_buffer("xy", xy, persistent=False)
        self.register_buffer("inv_t", inv_t, persistent=False)

    @torch.autocast("cuda", enabled=False)
    def forward(self, pos_ids):
        """Return (cos, sin) angle tensors for the given position ids.

        ``pos_ids`` is a mapping providing integer tensors under the keys
        "x_pos", "y_pos" and "t_pos".
        """
        # Data-dependent bounds check is skipped under torch.compile.
        if not torch.compiler.is_compiling():
            torch._assert(
                (pos_ids["y_pos"].max() < self.config.height) & (pos_ids["x_pos"].max() < self.config.width),
                f"pos_ids out of bounds, {self.config.height}, {self.config.width}"
            )

        # Map grid coordinates to cell centers in [-1, 1].
        x = (2.0 * pos_ids["x_pos"].float() + 1.0) / self.config.width - 1.0
        y = (2.0 * pos_ids["y_pos"].float() + 1.0) / self.config.height - 1.0
        t = pos_ids["t_pos"].float()

        freqs = torch.cat(
            (x.unsqueeze(-1) * self.xy, y.unsqueeze(-1) * self.xy, t.unsqueeze(-1) * self.inv_t),
            dim=-1,
        )
        # Insert a singleton head axis so angles broadcast across heads.
        return freqs.cos()[:, None], freqs.sin()[:, None]
213
+
214
+
215
class OrthoRoPE(NoCastModule):
    """Applies precomputed RoPE angles to input tensors."""

    def __init__(self, config):
        super().__init__()
        self.config = config
        # Audio streams are not supported by this rotation layout.
        assert not getattr(self.config, "has_audio", False)

    @torch.autocast("cuda", enabled=False)
    def forward(self, x, rope_angles):
        """Rotate interleaved channel pairs of ``x`` by (cos, sin) angles.

        Pairs are read as interleaved (even, odd) channels via unfold, and
        the rotated halves are concatenated — so the output channel order
        differs from the input interleaving, but is applied consistently to
        both q and k, which is all dot-product attention requires.
        """
        cos, sin = rope_angles
        # Rotate in fp32, then cast back to the input dtype.
        x0, x1 = x.float().unfold(-1, 2, 2).unbind(-1)
        y0 = x0 * cos - x1 * sin
        y1 = x1 * cos + x0 * sin
        return torch.cat((y0, y1), dim=-1).type_as(x)
230
+
231
+
232
class Attn(nn.Module):
    """Self-attention with RoPE and optional GQA, value residual, and gated attention."""

    def __init__(self, config, layer_idx):
        super().__init__()
        self.config = config
        # Used by the KV cache to address this layer's slot.
        self.layer_idx = layer_idx

        # Optional value residual: blend this layer's values with the first
        # layer's values (v1) using a learned coefficient.
        self.value_residual = getattr(config, "value_residual", False)
        if self.value_residual:
            self.v_lamb = nn.Parameter(torch.tensor(0.5))

        self.n_heads = config.n_heads
        # Fewer KV heads than query heads => grouped-query attention.
        self.n_kv_heads = getattr(config, "n_kv_heads", None) or config.n_heads
        self.d_head = config.d_model // self.n_heads
        assert config.d_model % self.n_heads == 0

        self.enable_gqa = self.n_heads != self.n_kv_heads

        self.q_proj = nn.Linear(config.d_model, self.n_heads * self.d_head, bias=False)
        self.k_proj = nn.Linear(
            config.d_model, self.n_kv_heads * self.d_head, bias=False
        )
        self.v_proj = nn.Linear(
            config.d_model, self.n_kv_heads * self.d_head, bias=False
        )
        self.out_proj = nn.Linear(config.d_model, config.d_model, bias=False)

        self.rope = OrthoRoPE(config)

        # Optional per-head output gating, zero-initialized (gates start at
        # sigmoid(0) = 0.5 uniformly).
        self.gated_attn = getattr(config, "gated_attn", False)
        if self.gated_attn:
            self.gate_proj = nn.Linear(
                self.n_heads, self.n_heads, bias=False
            )
            nn.init.zeros_(self.gate_proj.weight)

    def forward(self, x, pos_ids, rope_angles, v1, kv_cache):
        """Attend over cached context plus the current tokens.

        Args:
            x: [batch, tokens, d_model] hidden states.
            pos_ids: position-id mapping consumed by the cache and RoPE.
            rope_angles: (cos, sin) tensors from OrthoRoPEAngles.
            v1: first-layer value tensor for the value residual, or None.
            kv_cache: object exposing ``upsert(k, v, pos_ids, layer_idx)``
                returning the full (k, v) plus a flex-attention block mask.

        Returns:
            (output, v1): attention output and the captured first-layer values.
        """
        # Imported lazily so importing this module never requires flex
        # attention support.
        from torch.nn.attention.flex_attention import flex_attention

        q = eo.rearrange(
            self.q_proj(x), "b t (h d) -> b h t d", h=self.n_heads, d=self.d_head
        )
        k = eo.rearrange(
            self.k_proj(x), "b t (h d) -> b h t d", h=self.n_kv_heads, d=self.d_head
        )
        v = eo.rearrange(
            self.v_proj(x), "b t (h d) -> b h t d", h=self.n_kv_heads, d=self.d_head
        )

        if self.value_residual:
            # First call captures v1; later calls blend toward it by v_lamb.
            v1 = v if v1 is None else v1
            v = torch.lerp(v, v1.view_as(v), self.v_lamb)

        # QK norm before rotation.
        q, k = rms_norm(q), rms_norm(k)
        q, k = self.rope(q, rope_angles), self.rope(k, rope_angles)

        # Cache upsert returns context + current k/v and the block mask.
        k, v, bm = kv_cache.upsert(k, v, pos_ids, self.layer_idx)
        y = flex_attention(q, k, v, block_mask=bm, enable_gqa=self.enable_gqa)

        if self.gated_attn:
            # Per-head sigmoid gates computed from the first n_heads feature
            # channels of x.
            gates = torch.sigmoid(self.gate_proj(x[..., : self.n_heads]))
            y = y * gates.permute(0, 2, 1).unsqueeze(-1)
        y = eo.rearrange(y, "b h t d -> b t (h d)")
        y = self.out_proj(y)
        return y, v1
298
+
299
+
300
class MergedQKVAttn(Attn):
    """Attn variant with the q/k/v projections fused into a single matmul.

    Built from an existing ``Attn`` (inference-time optimization): copies its
    weights, concatenates q_proj/k_proj/v_proj into one ``qkv_proj``, then
    deletes the separate projections. Forward math matches ``Attn``.
    """

    def __init__(self, src: Attn, config):
        super().__init__(config, src.layer_idx)
        # Match the source module's placement before copying weights.
        self.to(device=src.q_proj.weight.device, dtype=src.q_proj.weight.dtype)
        # strict=False: the fused qkv_proj has no counterpart in `src`.
        self.load_state_dict(
            src.state_dict(), strict=False
        )
        self.train(src.training)

        self.q_out = self.n_heads * self.d_head
        self.kv_out = self.n_kv_heads * self.d_head

        self.qkv_proj = nn.Linear(
            self.q_proj.in_features,
            self.q_out + 2 * self.kv_out,
            bias=False,
            device=self.q_proj.weight.device,
            dtype=self.q_proj.weight.dtype,
        )
        with torch.no_grad():
            # Row-concatenate in (q, k, v) order; forward splits in the same
            # order below.
            self.qkv_proj.weight.copy_(
                torch.cat(
                    [self.q_proj.weight, self.k_proj.weight, self.v_proj.weight], dim=0
                )
            )

        # The separate projections are redundant once fused.
        del self.q_proj, self.k_proj, self.v_proj

    def forward(self, x, pos_ids, rope_angles, v1, kv_cache):
        """Same contract as ``Attn.forward``, using the fused projection."""
        # Lazy import keeps flex attention optional at module import time.
        from torch.nn.attention.flex_attention import flex_attention

        q, k, v = self.qkv_proj(x).split((self.q_out, self.kv_out, self.kv_out), dim=-1)

        B, T = x.shape[:2]
        q = q.reshape(B, T, self.n_heads, self.d_head).transpose(1, 2)
        k = k.reshape(B, T, self.n_kv_heads, self.d_head).transpose(1, 2)
        v = v.reshape(B, T, self.n_kv_heads, self.d_head).transpose(1, 2)

        if self.value_residual:
            # First call captures v1; later calls blend toward it by v_lamb.
            v1 = v if v1 is None else v1
            v = torch.lerp(v, v1.view_as(v), self.v_lamb)

        # QK norm before rotation, mirroring Attn.forward.
        q, k = rms_norm(q), rms_norm(k)
        q, k = self.rope(q, rope_angles), self.rope(k, rope_angles)

        # Cache upsert returns context + current k/v and the block mask.
        k, v, bm = kv_cache.upsert(k, v, pos_ids, self.layer_idx)
        y = flex_attention(q, k, v, block_mask=bm, enable_gqa=self.enable_gqa)

        if self.gated_attn:
            # Per-head sigmoid gates from the first n_heads feature channels
            # of x (matches Attn.forward).
            gates = torch.sigmoid(self.gate_proj(x[..., : self.n_heads]))
            y = y * gates.permute(0, 2, 1).unsqueeze(-1)

        y = y.transpose(1, 2).reshape(B, T, -1)
        y = self.out_proj(y)
        return y, v1
355
+
356
+
357
class CrossAttention(nn.Module):
    """Cross-attention for prompt conditioning.

    Queries come from the token stream; keys/values come from ``context``
    (e.g. a prompt embedding of dimension ``context_dim``). The output
    projection is zero-initialized so the layer starts as an identity
    contribution inside a residual branch.
    """

    def __init__(self, config, context_dim=None):
        super().__init__()
        assert config.d_model % config.n_heads == 0

        self.d_head = config.d_model // config.n_heads
        # Inner width follows the context dimension when provided.
        self.inner_dim = context_dim or config.d_model
        assert self.inner_dim % self.d_head == 0
        self.n_heads = self.inner_dim // self.d_head
        self.q_proj = nn.Linear(config.d_model, self.inner_dim, bias=False)
        self.k_proj = nn.Linear(
            context_dim or config.d_model, self.inner_dim, bias=False
        )
        self.v_proj = nn.Linear(
            context_dim or config.d_model, self.inner_dim, bias=False
        )

        self.out_proj = nn.Linear(self.inner_dim, config.d_model, bias=False)
        # Zero-init: this block contributes nothing until trained.
        self.out_proj.weight.detach().zero_()

    def forward(self, x, context, context_pad_mask=None):
        # NOTE(review): `context_pad_mask` is accepted but never applied —
        # padded context tokens are attended to like real ones. Confirm this
        # is intentional (e.g. prompts are always unpadded at inference).
        from torch.nn.attention.flex_attention import flex_attention

        q = eo.rearrange(self.q_proj(x), "b t (h d) -> b h t d", h=self.n_heads)
        k = eo.rearrange(self.k_proj(context), "b t (h d) -> b h t d", h=self.n_heads)
        v = eo.rearrange(self.v_proj(context), "b t (h d) -> b h t d", h=self.n_heads)
        # QK-norm before attention.
        q, k = rms_norm(q), rms_norm(k)
        out = flex_attention(q, k, v)
        out = out.transpose(1, 2).contiguous().reshape(x.size(0), x.size(1), -1)
        return self.out_proj(out)
389
+
390
+
391
+ # ---------------------------------------------------------------------------
392
+ # Inference caching
393
+ # ---------------------------------------------------------------------------
394
+
395
+ def _bf16_u16(x: Tensor) -> Tensor:
396
+ return x.contiguous().view(torch.int16).to(torch.int32) & 0xFFFF
397
+
398
+
399
class CachedDenoiseStepEmb(nn.Module):
    """bf16 sigma -> bf16 embedding via 64k LUT.

    Precomputes ``base``'s embedding for every scheduler sigma once, then
    serves lookups keyed on the sigma's raw bf16 bit pattern. An unknown
    sigma maps to an out-of-range row and fails loudly on indexing.
    """

    def __init__(self, base: nn.Module, sigmas: list[float]):
        super().__init__()
        device = next(base.parameters()).device

        levels = torch.tensor(sigmas, dtype=torch.bfloat16, device=device)
        codes = _bf16_u16(levels)
        n_levels = codes.numel()
        # Two sigmas rounding to the same bf16 pattern would alias in the LUT.
        if torch.unique(codes).numel() != n_levels:
            raise ValueError(
                "scheduler_sigmas collide in bf16; caching would be ambiguous"
            )

        # Run the real embedding module once per sigma level, up front.
        with torch.no_grad():
            embedded = base(levels[:, None]).squeeze(1)
            table = embedded.to(torch.bfloat16).contiguous()

        # Dense 2^16 map: bf16 bit pattern -> row of `table` (-1 = unknown).
        lut = torch.full((65536,), -1, dtype=torch.int32, device=device)
        lut[codes] = torch.arange(n_levels, dtype=torch.int32, device=device)

        self.register_buffer("table", table, persistent=False)
        self.register_buffer("lut", lut, persistent=False)
        # Sentinel one past the last row: indexing with it raises, surfacing
        # any sigma that was never cached.
        self.register_buffer(
            "oob",
            torch.tensor(n_levels, dtype=torch.int32, device=device),
            persistent=False,
        )

    def forward(self, sigma: Tensor) -> Tensor:
        if sigma.dtype is not torch.bfloat16:
            raise RuntimeError("CachedDenoiseStepEmb expects sigma bf16")
        row = self.lut[_bf16_u16(sigma)]
        row = torch.where(row >= 0, row, self.oob)
        return self.table[row.to(torch.int64)]
435
+
436
+
437
class CachedCondHead(nn.Module):
    """bf16 cond -> cached conditioning; invalid cond => OOB index error.

    Precomputes ``base``'s outputs for every cached sigma embedding. At
    inference, the incoming ``cond`` is mapped back to its sigma row by
    matching the bf16 bit pattern of a single "key" dimension that is unique
    across all cached embeddings.
    """

    def __init__(
        self, base, cached_denoise_step_emb: CachedDenoiseStepEmb, max_key_dims: int = 8
    ):
        super().__init__()
        table = cached_denoise_step_emb.table
        S, D = table.shape  # S sigma levels, D embedding channels

        # Evaluate the real head once for each cached embedding; `base`
        # returns a tuple of tensors, stacked here along a new leading dim.
        with torch.no_grad():
            emb = table[:, None, :]
            cache = (
                torch.stack([t.squeeze(1) for t in base(emb)], 0)
                .to(torch.bfloat16)
                .contiguous()
            )

        # Find the first embedding channel whose bf16 bit patterns are
        # distinct for all S sigma levels — it becomes the lookup key.
        key_dim = None
        for d in range(min(D, max_key_dims)):
            b = _bf16_u16(table[:, d])
            if torch.unique(b).numel() == S:
                key_dim = d
                key_bits = b
                break
        if key_dim is None:
            raise ValueError(
                "Could not find a unique bf16 key dim for cond->sigma mapping"
            )

        # Dense 2^16 map: key-channel bit pattern -> sigma row (-1 = unknown).
        lut = torch.full((65536,), -1, device=table.device, dtype=torch.int32)
        lut[key_bits] = torch.arange(S, device=table.device, dtype=torch.int32)

        self.key_dim = int(key_dim)
        self.register_buffer("cache", cache, persistent=False)
        self.register_buffer("lut", lut, persistent=False)
        # One-past-the-end sentinel: unknown cond values raise on indexing.
        self.register_buffer(
            "oob",
            torch.tensor(S, device=table.device, dtype=torch.int32),
            persistent=False,
        )

    def forward(self, cond: Tensor):
        if cond.dtype is not torch.bfloat16:
            raise RuntimeError("CachedCondHead expects cond bf16")
        idx = self.lut[_bf16_u16(cond[..., self.key_dim])]
        idx = torch.where(idx >= 0, idx, self.oob)
        g = self.cache[:, idx.to(torch.int64)]
        # Unbind back into the tuple shape the original head produced.
        return tuple(g.unbind(0))
486
+
487
+
488
+ # ---------------------------------------------------------------------------
489
+ # Quantization
490
+ # ---------------------------------------------------------------------------
491
+
492
# Quantization modes available in this environment; None = no quantization.
QUANTS = [None]

try:
    # Optional dependency: FlashInfer provides the NVFP4 quantize/GEMM kernels.
    from flashinfer import nvfp4_quantize, mm_fp4, SfLayout
    QUANTS.append("nvfp4")
except ImportError:
    pass
499
+
500
+
501
@torch.library.custom_op("world_engine::fp4_linear", mutates_args=())
def fp4_linear(
    a_bf16: torch.Tensor,
    b_fp4_T: torch.Tensor,
    a_global_sf: torch.Tensor,
    b_sf_T: torch.Tensor,
    alpha: torch.Tensor,
) -> torch.Tensor:
    """NVFP4 linear: quantize bf16 activations on the fly, then FP4 GEMM.

    ``b_fp4_T``/``b_sf_T`` are the pre-quantized (transposed) weight and its
    scale factors; ``alpha`` rescales the accumulator back to real units.
    Registered as a custom op so torch.compile treats it as opaque.
    """
    a_fp4, a_sf = nvfp4_quantize(
        a_bf16, a_global_sf, sfLayout=SfLayout.layout_128x4, do_shuffle=False,
    )
    return mm_fp4(
        a_fp4, b_fp4_T, a_sf, b_sf_T, alpha, out_dtype=torch.bfloat16, backend="cutlass"
    )
515
+
516
+
517
@fp4_linear.register_fake
def _fp4_linear_fake(
    a_bf16: torch.Tensor, b_fp4_T: torch.Tensor,
    a_global_sf: torch.Tensor, b_sf_T: torch.Tensor, alpha: torch.Tensor,
) -> torch.Tensor:
    # Fake (meta) kernel: gives torch.compile the output shape/dtype of
    # fp4_linear without executing the GEMM.
    return torch.empty(
        (a_bf16.shape[0], b_fp4_T.shape[1]), device=a_bf16.device, dtype=torch.bfloat16
    )
525
+
526
+
527
class FP4Linear(nn.Module):
    """FP4 Linear layer using FlashInfer's NVFP4 quantization.

    The weight is quantized once at construction; activations are quantized
    per call inside the ``fp4_linear`` custom op. CUDA-only (asserted below).
    """

    def __init__(self, lin: nn.Linear):
        super().__init__()
        self.in_features = lin.in_features
        self.out_features = lin.out_features
        # Kernel requirement: both dims must be multiples of 32.
        assert self.in_features % 32 == 0 and self.out_features % 32 == 0

        self.weight = nn.Parameter(lin.weight.detach().clone())
        # Plain attributes (not buffers/params) — quantized tensors are
        # derived state, recomputed on construction.
        self._weight_fp4_T = None
        self._weight_scales_T = None
        self._alpha = None
        self._dummy_scale = None
        self._weight_global_sf = None

        with torch.no_grad():
            # Activations use a fixed unit global scale.
            self._dummy_scale = torch.full((1,), 1.0, device=self.weight.device, dtype=torch.float32)
            weight_bf16 = self.weight.to(torch.bfloat16).to(self.weight.device).contiguous()
            weight_amax = weight_bf16.float().abs().nan_to_num().max()
            # NOTE(review): global scale is 1/amax here — confirm this matches
            # the convention flashinfer's nvfp4_quantize expects.
            self._weight_global_sf = (1.0) / weight_amax
            # alpha undoes the global scales after the quantized GEMM.
            self._alpha = 1.0 / (self._weight_global_sf * self._dummy_scale)
            w_fp4, w_sf = nvfp4_quantize(
                weight_bf16, self._weight_global_sf, sfLayout=SfLayout.layout_128x4, do_shuffle=False,
            )
            self._weight_fp4_T = w_fp4.t()
            self._weight_scales_T = w_sf.t()

        assert self.weight.is_cuda
        # Warm-up call so kernel selection/compilation happens at init,
        # not on the first real forward.
        lazy_x = torch.zeros((1, lin.in_features), device=self.weight.device, dtype=torch.bfloat16)
        fp4_linear(lazy_x, self._weight_fp4_T, self._dummy_scale, self._weight_scales_T, self._alpha)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Flatten leading dims into a 2-D GEMM, restore the shape afterwards.
        x_flat = x.reshape(-1, x.shape[-1])
        y = fp4_linear(
            x_flat.to(torch.bfloat16).contiguous(),
            self._weight_fp4_T, self._dummy_scale, self._weight_scales_T, self._alpha,
        )
        return y.reshape(x.shape[:-1] + (-1,))
566
+
567
+
568
class FP8W8A8Linear(nn.Module):
    """FP8 linear with static per-tensor weight scale and dynamic per-call
    activation scale (W8A8), backed by ``torch._scaled_mm``."""

    __constants__ = ("in_features", "out_features")

    def __init__(self, lin: nn.Linear):
        super().__init__()
        self.in_features, self.out_features = lin.in_features, lin.out_features
        f8 = torch.float8_e4m3fn
        # 1/max representable e4m3 value; used to map amax -> scale.
        inv = 1.0 / float(torch.finfo(f8).max)
        self._inv = inv
        w = lin.weight.detach()
        # Per-tensor weight scale, floored to avoid divide-by-zero.
        ws = (w.abs().amax() * inv).clamp_min(1e-8).float()
        wf8 = (w / ws.to(w.dtype)).to(f8).contiguous()
        self.register_buffer("wT", wf8.t())
        self.register_buffer("ws", ws)
        if lin.bias is None:
            self.bias = None
        else:
            # Bias kept in fp16, matching the kernel's out_dtype below.
            self.register_buffer("bias", lin.bias.detach().to(torch.float16))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        s = x.shape
        x2 = x.reshape(-1, s[-1])
        # Dynamic per-tensor activation scale for this call.
        xs = (x2.abs().amax() * self._inv).clamp_min(1e-8).float()
        xf8 = (x2 / xs.to(x2.dtype)).to(torch.float8_e4m3fn).contiguous()
        y = torch._scaled_mm(
            xf8, self.wT, xs, self.ws,
            bias=self.bias, out_dtype=torch.float16, use_fast_accum=True,
        )
        # Cast back to the caller's dtype.
        return y.reshape(*s[:-1], self.out_features).to(x.dtype)
597
+
598
+
599
class FP8Linear(nn.Module):
    """FP8 linear with static weight scale and *unscaled* fp8 activations.

    Activations are cast directly to e4m3 (values outside the fp8 range will
    saturate per the cast semantics — assumes inputs are already roughly
    unit-scale; confirm against the calling model).
    """

    def __init__(self, lin: nn.Linear):
        super().__init__()
        self.in_features, self.out_features = lin.in_features, lin.out_features
        # NOTE(review): bias is stored in float8_e4m3fn and passed to
        # torch._scaled_mm below; the documented bias dtypes for _scaled_mm
        # are fp16/bf16 — confirm this path is ever exercised with a bias.
        self.bias = (
            nn.Parameter(lin.bias.data.clone().to(torch.float8_e4m3fn))
            if lin.bias is not None else None
        )
        w_amax = lin.weight.data.abs().amax()
        # Weight normalized to [-1, 1] before the fp8 cast; w_amax is the
        # per-tensor scale restored by the kernel.
        w = lin.weight.data.clone().div(w_amax).to(torch.float8_e4m3fn)
        self.register_buffer("w_amax", w_amax)
        self.register_buffer("weightT", w.t())
        # Plain attribute (not a buffer): unit activation scale.
        self.dummy_scale = torch.ones((), device=lin.weight.device, dtype=torch.float32)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x_fp8 = x.to(torch.float8_e4m3fn).reshape(-1, x.size(-1)).contiguous()
        result = torch._scaled_mm(
            x_fp8, self.weightT,
            bias=self.bias, scale_a=self.dummy_scale, scale_b=self.w_amax,
            out_dtype=torch.bfloat16, use_fast_accum=True,
        )
        return result.reshape(x.shape[:-1] + (-1,))
621
+
622
+
623
def quantize_model(model: nn.Module, quant: str):
    """Recursively replace eligible ``nn.Linear`` children with quantized layers.

    Args:
        model: module tree to rewrite in place.
        quant: ``"w8a8"``, ``"nvfp4"``, ``"fp8"``, or ``None`` for a no-op.

    Returns:
        The same ``model`` instance, modified in place.

    Raises:
        KeyError: if ``quant`` is not a recognized mode.
    """
    if quant is None:
        return model

    def eligible(m: nn.Module) -> bool:
        # Only plain bf16 Linear layers whose dims fit the quantized kernels
        # (both must be multiples of 32).
        if not isinstance(m, nn.Linear):
            return False
        w = getattr(m, "weight", None)
        if getattr(w, "dtype", None) != torch.bfloat16:
            return False
        o, k = w.shape
        return (o % 32 == 0) and (k % 32 == 0)

    new_linear = {"w8a8": FP8W8A8Linear, "nvfp4": FP4Linear, "fp8": FP8Linear}[quant]

    for name, child in model.named_children():
        if eligible(child):
            # Swap the leaf in place; do not recurse into the replacement.
            setattr(model, name, new_linear(child))
        else:
            quantize_model(child, quant)
    return model
641
+
642
+
643
+ # ---------------------------------------------------------------------------
644
+ # Inference patches
645
+ # ---------------------------------------------------------------------------
646
+
647
def patch_cached_noise_conditioning(model) -> None:
    """Swap the noise-conditioning modules for LUT-backed cached versions.

    Replaces ``model.denoise_step_emb`` with a ``CachedDenoiseStepEmb`` and
    every block's conditioning heads with ``CachedCondHead`` wrappers built
    from it, so all sigma-dependent work becomes table lookups.
    """
    cached = CachedDenoiseStepEmb(
        model.denoise_step_emb, model.config.scheduler_sigmas
    )
    model.denoise_step_emb = cached
    for block in model.transformer.blocks:
        block.attn_cond_head = CachedCondHead(block.attn_cond_head, cached)
        block.mlp_cond_head = CachedCondHead(block.mlp_cond_head, cached)
655
+
656
+
657
def patch_Attn_merge_qkv(model) -> None:
    """Replace every plain ``Attn`` submodule with its merged-QKV equivalent."""
    # Snapshot targets first so replacements don't perturb the traversal.
    targets = [
        (name, mod)
        for name, mod in model.named_modules()
        if isinstance(mod, Attn) and not isinstance(mod, MergedQKVAttn)
    ]
    for name, mod in targets:
        model.set_submodule(name, MergedQKVAttn(mod, model.config))
661
+
662
+
663
def _apply_inference_patches(model) -> None:
    # Apply all inference-time structural optimizations in place:
    # cached sigma-conditioning LUTs, then fused QKV attention projections.
    patch_cached_noise_conditioning(model)
    patch_Attn_merge_qkv(model)
666
+
667
+
668
+ # ---------------------------------------------------------------------------
669
+ # Model components
670
+ # ---------------------------------------------------------------------------
671
+
672
+ class CFG(nn.Module):
673
+ def __init__(self, d_model: int, dropout: float):
674
+ super().__init__()
675
+ self.dropout = dropout
676
+ self.null_emb = nn.Parameter(torch.zeros(1, 1, d_model))
677
+
678
+ def forward(
679
+ self, x: torch.Tensor, is_conditioned: bool | None = None
680
+ ) -> torch.Tensor:
681
+ B, L, _ = x.shape
682
+ null = self.null_emb.expand(B, L, -1)
683
+
684
+ if self.training or is_conditioned is None:
685
+ if self.dropout == 0.0:
686
+ return x
687
+ drop = torch.rand(B, 1, 1, device=x.device) < self.dropout
688
+ return torch.where(drop, null, x)
689
+
690
+ return x if is_conditioned else null
691
+
692
+
693
class ControllerInputEmbedding(nn.Module):
    """Embeds controller inputs (mouse + buttons) into model dimension."""

    def __init__(self, n_buttons: int, d_model: int, mlp_ratio: int = 4):
        super().__init__()
        # Input width is n_buttons plus 3 extra features for mouse and
        # scroll (presumably 2 + 1 — confirm against the caller).
        self.mlp = MLP(n_buttons + 3, d_model * mlp_ratio, d_model)

    def forward(self, mouse: Tensor, button: Tensor, scroll: Tensor):
        # Expect (batch, frames, features) for all three inputs.
        assert len(mouse.shape) == 3
        features = torch.cat((mouse, button, scroll), dim=-1)
        return self.mlp(features)
704
+
705
+
706
class MLPFusion(nn.Module):
    """Fuses per-group conditioning into tokens via split linear projections."""

    def __init__(self, d_model: int):
        super().__init__()
        # Separate first-layer projections for tokens and conditioning.
        self.fc1_x = nn.Linear(d_model, d_model, bias=False)
        self.fc1_c = nn.Linear(d_model, d_model, bias=False)
        self.fc2 = nn.Linear(d_model, d_model, bias=False)

    def forward(self, x: torch.Tensor, cond: torch.Tensor) -> torch.Tensor:
        batch, _, dim = x.shape
        groups = cond.shape[1]
        # Regroup tokens so each conditioning vector spans tokens/groups tokens.
        tokens = x.reshape(batch, groups, -1, dim)
        shift = self.fc1_c(cond).unsqueeze(2)  # broadcast within each group
        fused = F.silu(self.fc1_x(tokens) + shift)
        return self.fc2(fused).flatten(1, 2)
722
+
723
+
724
class MoEWithoutFBGEMM(nn.Module):
    """MoE implementation using torch grouped_mm (no fbgemm dependency).

    Inference-only: forward raises under training/grad. Tokens are routed to
    the top-k experts, sorted by expert so each expert's tokens are
    contiguous, run through grouped GEMMs, then scattered back and combined
    with softmax routing weights.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.top_k = config.moe_top_k
        # Default expert width keeps total active compute equal to the dense MLP.
        moe_mlp_ratio = getattr(config, "moe_mlp_ratio", None) or config.mlp_ratio / config.moe_top_k
        d_intermediate = int(config.d_model * moe_mlp_ratio)
        self.router = nn.Linear(config.d_model, config.moe_n_experts, bias=False)
        # Stacked per-expert weights: (E, d_int[*2 if gated], d_model).
        self.expert_in_proj = nn.Parameter(
            torch.empty(config.moe_n_experts, d_intermediate * (2 if config.gated_linear else 1), config.d_model)
        )
        self.expert_out_proj = nn.Parameter(torch.empty(config.moe_n_experts, config.d_model, d_intermediate))

    def forward(self, x: torch.Tensor, gate: torch.Tensor | None = None) -> torch.Tensor:
        if self.training or torch.is_grad_enabled():
            raise NotImplementedError("inference only")

        orig_shape = x.shape
        x = x.reshape(-1, orig_shape[-1])
        # Router logits may be precomputed and passed in via `gate`.
        logits = self.router(x) if gate is None else gate.reshape(-1, gate.size(-1))

        logits_fp32 = logits.float()
        scores, expert = logits.topk(self.top_k, dim=-1, sorted=False)
        # Softmax over ALL experts, evaluated only at the selected ones:
        # exp(score - logsumexp(logits)).
        weights = (scores.float() - logits_fp32.logsumexp(dim=-1, keepdim=True)).exp().to(x.dtype)

        # Sort (token, expert) assignments by expert so each expert's inputs
        # are a contiguous group for grouped_mm.
        expert = expert.flatten()
        expert_sorted, sort_idx = expert.sort()
        expert_ids = torch.arange(self.expert_in_proj.size(0), device=expert.device, dtype=expert_sorted.dtype)
        # Per-expert end offsets into the sorted assignment list.
        offsets = torch.searchsorted(expert_sorted, expert_ids, right=True).to(torch.int32)

        # Map each sorted assignment back to its source token row.
        src = sort_idx // self.top_k
        # One extra (duplicated) row is appended as padding for grouped_mm;
        # it is zeroed after the first GEMM and dropped after the second.
        x_grouped = x.index_select(0, torch.cat((src, src[:1]), dim=0))
        h = F.grouped_mm(x_grouped, self.expert_in_proj.transpose(-2, -1), offs=offsets)
        h[-1].zero_()

        if self.config.gated_linear:
            # SwiGLU-style gating: first half gates, second half carries.
            gate_act, up = h.chunk(2, dim=-1)
            h = F.silu(gate_act) * up
        else:
            h = F.silu(h)

        y_grouped = F.grouped_mm(h, self.expert_out_proj.transpose(-2, -1), offs=offsets)[:-1]
        # Unsort back to (token, top_k, d_model) and combine with weights.
        y = torch.empty_like(y_grouped).index_copy_(0, sort_idx, y_grouped).view(x.size(0), self.top_k, -1)
        return (y * weights.unsqueeze(-1)).sum(dim=1).reshape(orig_shape)
770
+
771
+
772
class MoE(nn.Module):
    """MoE implementation using fbgemm optimized kernels.

    Same routing/compute scheme as ``MoEWithoutFBGEMM`` but uses fbgemm's
    ``index_shuffling`` for top-k routing + sorting and
    ``scatter_add_dense_tokens`` for the weighted scatter back.
    Inference-only.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.top_k = config.moe_top_k
        # Default expert width keeps active compute equal to the dense MLP.
        moe_mlp_ratio = getattr(config, "moe_mlp_ratio", None) or (config.mlp_ratio / config.moe_top_k)
        d_int = int(config.d_model * moe_mlp_ratio)

        self.router = nn.Linear(config.d_model, config.moe_n_experts, bias=False)
        # Stacked per-expert weights: (E, d_int[*2 if gated], d_model).
        self.expert_in_proj = nn.Parameter(
            torch.empty(config.moe_n_experts, d_int * (2 if config.gated_linear else 1), config.d_model)
        )
        self.expert_out_proj = nn.Parameter(torch.empty(config.moe_n_experts, config.d_model, d_int))

    def forward(self, x: torch.Tensor, gate: torch.Tensor | None = None) -> torch.Tensor:
        if self.training or torch.is_grad_enabled():
            raise NotImplementedError("inference only")

        orig = x.shape
        x = x.reshape(-1, orig[-1])
        # Router logits may be precomputed and passed in via `gate`.
        logits = self.router(x) if gate is None else gate.reshape(-1, gate.size(-1))

        logits32 = logits.float()
        # fbgemm kernel: per-expert token counts, expert ids sorted by expert,
        # and the source token row of each sorted assignment.
        token_counts, expert_sorted, src = index_shuffling(logits32, top_k=self.top_k)

        E = self.expert_in_proj.size(0)
        # Per-expert end offsets for grouped_mm.
        offs = token_counts[:E].cumsum(0).to(torch.int32)

        src = src.to(torch.long)
        expert_sorted = expert_sorted.to(torch.long)
        # Routing weight = softmax over all experts, evaluated at selections.
        logZ = logits32.logsumexp(-1)
        w = (logits32[src, expert_sorted] - logZ[src]).exp().to(x.dtype)

        # Extra duplicated row appended as grouped_mm padding; dropped below.
        xg = x.index_select(0, torch.cat((src, src[:1]), 0))
        h = F.grouped_mm(xg, self.expert_in_proj.transpose(-2, -1), offs=offs)
        if self.config.gated_linear:
            # SwiGLU-style gating.
            ga, up = h.chunk(2, -1)
            h = F.silu(ga) * up
        else:
            h = F.silu(h)

        yg = F.grouped_mm(h, self.expert_out_proj.transpose(-2, -1), offs=offs)[:-1]
        # Weighted scatter-add back to token order.
        out = torch.zeros_like(x)
        torch.ops.fbgemm.scatter_add_dense_tokens(out, (yg * w.unsqueeze(-1)).contiguous(), src)
        return out.reshape(orig)
819
+
820
+
821
class CondHead(nn.Module):
    """Per-layer conditioning head: bias_in -> SiLU -> Linear -> chunk(n_cond)."""

    def __init__(self, d_model: int, noise_conditioning: str = "wan", n_cond: int = 3):
        super().__init__()
        # The "wan" scheme adds a learned bias before the nonlinearity.
        if noise_conditioning == "wan":
            self.bias_in = nn.Parameter(torch.zeros(d_model))
        else:
            self.bias_in = None
        layers = [nn.Linear(d_model, d_model, bias=False) for _ in range(n_cond)]
        self.cond_proj = nn.ModuleList(layers)

    def forward(self, cond):
        if self.bias_in is not None:
            cond = cond + self.bias_in
        hidden = F.silu(cond)
        # One independent projection per conditioning output.
        return tuple(proj(hidden) for proj in self.cond_proj)
837
+
838
+
839
+ # ---------------------------------------------------------------------------
840
+ # Transformer blocks
841
+ # ---------------------------------------------------------------------------
842
+
843
class WorldDiTBlock(nn.Module):
    """Single transformer block with self-attention, optional cross-attention, and MLP.

    Uses adaptive RMSNorm (scale/bias/gate from the sigma conditioning) around
    both the attention and MLP sub-blocks. Prompt cross-attention and
    controller fusion are inserted only on layers matching their periods.
    """

    def __init__(
        self, d_model, n_heads, mlp_ratio, layer_idx,
        prompt_conditioning, prompt_conditioning_period, prompt_embedding_dim,
        ctrl_conditioning_period, noise_conditioning, config,
    ):
        super().__init__()
        self.config = config
        self.attn = Attn(config, layer_idx)
        if getattr(config, "moe", False):
            # Prefer fbgemm kernels when available.
            self.dit_mlp = MoE(config) if HAS_FBGEMM else MoEWithoutFBGEMM(config)
        else:
            self.dit_mlp = MLP(d_model, d_model * mlp_ratio, d_model)
        # Separate (scale, bias, gate) heads for attention and MLP branches.
        self.attn_cond_head = CondHead(d_model, noise_conditioning, n_cond=3)
        self.mlp_cond_head = CondHead(d_model, noise_conditioning, n_cond=3)

        # Prompt cross-attention only every `prompt_conditioning_period` layers.
        do_prompt_cond = (
            prompt_conditioning is not None
            and layer_idx % prompt_conditioning_period == 0
        )
        self.prompt_cross_attn = (
            CrossAttention(config, prompt_embedding_dim) if do_prompt_cond else None
        )
        # Controller fusion only every `ctrl_conditioning_period` layers.
        do_ctrl_cond = ctrl_conditioning_period is not None and layer_idx % ctrl_conditioning_period == 0
        self.ctrl_mlpfusion = MLPFusion(d_model) if do_ctrl_cond else None

    def forward(self, x, pos_ids, rope_angles, cond, ctx, v, kv_cache=None):
        # (scale, bias, gate) for the attention and MLP branches.
        s0, b0, g0 = self.attn_cond_head(cond)
        s1, b1, g1 = self.mlp_cond_head(cond)

        # Self-attention with adaptive norm + gated residual.
        residual = x
        x = ada_rmsnorm(x, s0, b0)
        x, v = self.attn(x, pos_ids, rope_angles, v, kv_cache=kv_cache)
        x = ada_gate(x, g0) + residual

        if self.prompt_cross_attn is not None:
            x = (
                self.prompt_cross_attn(
                    rms_norm(x),
                    context=rms_norm(ctx["prompt_emb"]),
                    context_pad_mask=ctx["prompt_pad_mask"],
                )
                + x
            )

        if self.ctrl_mlpfusion is not None:
            x = self.ctrl_mlpfusion(rms_norm(x), rms_norm(ctx["ctrl_emb"])) + x

        # MLP with adaptive norm + gated residual.
        x = ada_gate(self.dit_mlp(ada_rmsnorm(x, s1, b1)), g1) + x

        return x, v
896
+
897
+
898
class WorldDiT(nn.Module):
    """Stack of WorldDiTBlocks with shared parameters.

    Computes RoPE angles once per forward and threads a value-residual tensor
    (the first block's attention values) through every block.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.blocks = nn.ModuleList(
            [
                WorldDiTBlock(
                    d_model=config.d_model,
                    n_heads=config.n_heads,
                    mlp_ratio=config.mlp_ratio,
                    layer_idx=idx,
                    prompt_conditioning=config.prompt_conditioning,
                    prompt_conditioning_period=config.prompt_conditioning_period,
                    prompt_embedding_dim=config.prompt_embedding_dim,
                    ctrl_conditioning_period=config.ctrl_conditioning_period,
                    noise_conditioning=config.noise_conditioning,
                    config=config,
                )
                for idx in range(config.n_layers)
            ]
        )
        self.rope_angles = OrthoRoPEAngles(config)

    def forward(self, x, pos_ids, cond, ctx, kv_cache=None):
        # RoPE angles are shared by all blocks; compute once.
        rope_angles = self.rope_angles(pos_ids)
        v = None  # value-residual tensor, filled by the first block
        # was: `for i, block in enumerate(self.blocks)` — index was unused
        for block in self.blocks:
            x, v = block(x, pos_ids, rope_angles, cond, ctx, v, kv_cache=kv_cache)
        return x
929
+
930
+
931
+ # ---------------------------------------------------------------------------
932
+ # Top-level model
933
+ # ---------------------------------------------------------------------------
934
+
935
class WorldModel(ModelMixin, ConfigMixin):
    """
    WORLD: Wayfarer Operator-driven Rectified-flow Long-context Diffuser.

    Denoises a frame given:
    - All previous frames (via KV cache)
    - The prompt embedding
    - The controller input embedding
    - The current noise level
    """

    _supports_gradient_checkpointing = False
    # Noise embedding and RoPE angle tables stay fp32 for numerical accuracy.
    _keep_in_fp32_modules = ["denoise_step_emb", "rope_angles"]

    @register_to_config
    def __init__(
        self,
        # --- transformer geometry ---
        d_model: int = 2048,
        n_heads: int = 32,
        n_kv_heads: int | None = None,
        n_layers: int = 24,
        mlp_ratio: int = 4,
        # --- latent frame geometry ---
        channels: int = 32,
        height: int = 16,
        width: int = 16,
        patch: tuple = (2, 2),
        tokens_per_frame: int = 256,
        n_frames: int = 4096,
        # --- attention windowing ---
        local_window: int = 16,
        global_window: int = 128,
        global_attn_period: int = 4,
        global_pinned_dilation: int = 8,
        global_attn_offset: int = 0,
        value_residual: bool = True,
        gated_attn: bool = False,
        # --- controller conditioning ---
        n_buttons: int = 256,
        ctrl_conditioning: str | None = "mlp_fusion",
        ctrl_conditioning_period: int | None = 3,
        ctrl_cond_dropout: float = 0.0,
        # --- prompt conditioning ---
        prompt_conditioning: str | None = None,
        prompt_conditioning_period: int = 3,
        prompt_embedding_dim: int = 2048,
        prompt_cond_dropout: float = 0.0,
        # --- noise schedule ---
        noise_conditioning: str = "wan",
        scheduler_sigmas: list[float] | None = [
            1.0, 0.8609585762023926, 0.729332447052002, 0.3205108940601349, 0.0,
        ],
        base_fps: int = 60,
        causal: bool = True,
        mlp_gradient_checkpointing: bool = True,
        block_gradient_checkpointing: bool = True,
        rope_impl: str = "ortho",
        # --- mixture of experts ---
        moe: bool = False,
        moe_top_k: int = 2,
        moe_n_experts: int = 8,
        moe_mlp_ratio: float | None = None,
        gated_linear: bool = False,
        # --- misc ---
        temporal_compression: int = 1,
        inference_fps: int | None = None,
        taehv_ae: bool = False,
        rope_nyquist_frac: float = 0.8,
        rope_theta: float = 10000.0,
    ):
        super().__init__()

        self.denoise_step_emb = NoiseConditioner(d_model)
        self.ctrl_emb = ControllerInputEmbedding(n_buttons, d_model, mlp_ratio)

        # CFG dropout modules exist only when the matching conditioning is on.
        if self.config.ctrl_conditioning is not None:
            self.ctrl_cfg = CFG(self.config.d_model, self.config.ctrl_cond_dropout)
        if self.config.prompt_conditioning is not None:
            self.prompt_cfg = CFG(
                self.config.prompt_embedding_dim, self.config.prompt_cond_dropout
            )

        self.transformer = WorldDiT(self.config)
        self.patch = tuple(patch)

        # Patchify latent frames into tokens and back.
        C, D = channels, d_model
        self.patchify = nn.Conv2d(
            C, D, kernel_size=self.patch, stride=self.patch, bias=False
        )
        self.unpatchify = nn.ConvTranspose2d(
            D, C, kernel_size=self.patch, stride=self.patch, bias=True
        )
        self.out_norm = AdaLN(d_model)

        # Precomputed single-frame position ids. `_t_pos_1f` is filled with
        # the current frame timestamp on each forward; y/x are the fixed
        # spatial grid coordinates for one frame's tokens.
        T = tokens_per_frame
        idx = torch.arange(T, dtype=torch.long)
        self.register_buffer(
            "_t_pos_1f", torch.empty(T, dtype=torch.long), persistent=False
        )
        self.register_buffer(
            "_y_pos_1f", idx.div(width, rounding_mode="floor"), persistent=False
        )
        self.register_buffer("_x_pos_1f", idx.remainder(width), persistent=False)

    def forward(
        self,
        x: Tensor,
        sigma: Tensor,
        frame_timestamp: Tensor,
        frame_idx: Tensor | None = None,
        prompt_emb: Tensor | None = None,
        prompt_pad_mask: Tensor | None = None,
        mouse: Tensor | None = None,
        button: Tensor | None = None,
        scroll: Tensor | None = None,
        kv_cache=None,
    ):
        """Denoise one latent frame ``x`` of shape (B, N, C, H, W).

        Currently restricted to a single frame of a single sample
        (B == 1, N == 1); earlier frames are supplied via ``kv_cache``.
        """
        B, N, C, H, W = x.shape
        ph, pw = self.patch
        assert (H % ph == 0) and (W % pw == 0), "H, W must be divisible by patch"
        Hp, Wp = H // ph, W // pw
        # torch._assert stays active under torch.compile.
        torch._assert(
            Hp * Wp == self.config.tokens_per_frame,
            f"{Hp} * {Wp} != {self.config.tokens_per_frame}",
        )

        torch._assert(
            B == 1 and N == 1, "WorldModel.forward currently supports B==1, N==1"
        )
        # Broadcast the scalar frame timestamp over all tokens of the frame.
        self._t_pos_1f.copy_(frame_timestamp[0, 0].expand_as(self._t_pos_1f))
        pos_ids = TensorDict(
            {
                # Frame position uses the explicit index when given, else the timestamp.
                "f_pos": (frame_timestamp if frame_idx is None else frame_idx)[0, 0].expand_as(self._t_pos_1f)[None],
                "t_pos": self._t_pos_1f[None],
                "y_pos": self._y_pos_1f[None],
                "x_pos": self._x_pos_1f[None],
            },
            batch_size=[1, self._t_pos_1f.numel()],
        )
        # Sigma -> conditioning embedding (possibly the cached LUT version).
        cond = self.denoise_step_emb(sigma)

        assert button is not None
        ctx = {
            "ctrl_emb": self.ctrl_emb(mouse, button, scroll),
            "prompt_emb": prompt_emb,
            "prompt_pad_mask": prompt_pad_mask,
        }

        # Patchify -> token sequence -> transformer -> unpatchify.
        D = self.config.d_model
        x = self.patchify(x.reshape(B * N, C, H, W))
        x = eo.rearrange(x.view(B, N, D, Hp, Wp), "b n d hp wp -> b (n hp wp) d")
        x = self.transformer(x, pos_ids, cond, ctx, kv_cache)
        x = F.silu(self.out_norm(x, cond))
        x = eo.rearrange(x, "b (n hp wp) d -> (b n) d hp wp", n=N, hp=Hp, wp=Wp)
        x = self.unpatchify(x)
        x = x.view(B, N, C, H, W)

        return x

    def get_active_parameters(self) -> int:
        """Total parameters minus the inactive (unselected) MoE expert weights."""
        total = sum(p.numel() for p in self.parameters())
        c = self.config
        if getattr(c, "moe", False):
            moe_mlp_ratio = getattr(c, "moe_mlp_ratio", None) or c.mlp_ratio / c.moe_top_k
            hidden, top_k = int(c.d_model * moe_mlp_ratio), min(c.moe_top_k, c.moe_n_experts)
            # Each unused expert contributes in+out (and gate, if gated) matrices per layer.
            total -= (c.moe_n_experts - top_k) * c.n_layers * c.d_model * hidden * (3 if c.gated_linear else 2)
        return total

    def quantize(self, quant_type: str):
        # In-place quantization of eligible Linear layers (see quantize_model).
        quantize_model(self, quant_type)

    def apply_inference_patches(self):
        # In-place inference optimizations: cached conditioning + merged QKV.
        _apply_inference_patches(self)
vae/__init__.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (C) 2025 Hugging Face Team and Overworld
2
+ #
3
+ # This program is free software: you can redistribute it and/or modify
4
+ # it under the terms of the GNU General Public License as published by
5
+ # the Free Software Foundation, either version 3 of the License, or
6
+ # (at your option) any later version.
7
+ #
8
+ # This program is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
+ # GNU General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU General Public License
14
+ # along with this program. If not, see <https://www.gnu.org/licenses/>.
15
+
16
+ from .ae_model import ChunkedStreamingTAEHV
17
+
18
+ __all__ = ["ChunkedStreamingTAEHV"]
vae/ae_model.py ADDED
@@ -0,0 +1,368 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (C) 2025 Hugging Face Team and Overworld
2
+ #
3
+ # This program is free software: you can redistribute it and/or modify
4
+ # it under the terms of the GNU General Public License as published by
5
+ # the Free Software Foundation, either version 3 of the License, or
6
+ # (at your option) any later version.
7
+ #
8
+ # This program is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
+ # GNU General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU General Public License
14
+ # along with this program. If not, see <https://www.gnu.org/licenses/>.
15
+
16
+ """Streaming TAEHV autoencoder for WorldEngine wp-1.5 temporal-compressed latent decoding."""
17
+
18
+ from collections import namedtuple
19
+ import torch
20
+ import torch.nn as nn
21
+ import torch.nn.functional as F
22
+ from torch import Tensor
23
+
24
+ from diffusers.configuration_utils import ConfigMixin, register_to_config
25
+ from diffusers.models.modeling_utils import ModelMixin
26
+
27
+
28
+ # ---------------------------------------------------------------------------
29
+ # Building blocks (mirror the taehv library)
30
+ # ---------------------------------------------------------------------------
31
+
32
# A unit of streaming work: a tensor waiting to enter pipeline stage `block_index`.
TWorkItem = namedtuple("TWorkItem", ["input_tensor", "block_index"])
33
+
34
+
35
+ def _conv(n_in, n_out, **kwargs):
36
+ return nn.Conv2d(n_in, n_out, 3, padding=1, **kwargs)
37
+
38
+
39
class Clamp(nn.Module):
    """Soft clamp to roughly [-3, 3] using a scaled tanh (smooth, differentiable)."""

    def forward(self, x):
        scaled = x / 3
        return 3 * torch.tanh(scaled)
42
+
43
+
44
class MemBlock(nn.Module):
    """Residual conv block conditioned on the previous frame's features.

    Current features `x` and memory features `past` are concatenated on the
    channel axis, passed through a three-conv stack, added to a (possibly
    1x1-projected) skip of `x`, and ReLU-activated.
    """

    def __init__(self, n_in, n_out):
        super().__init__()
        stack = [
            _conv(n_in * 2, n_out),
            nn.ReLU(inplace=True),
            _conv(n_out, n_out),
            nn.ReLU(inplace=True),
            _conv(n_out, n_out),
        ]
        self.conv = nn.Sequential(*stack)
        # Project the skip path only when the channel count changes.
        if n_in == n_out:
            self.skip = nn.Identity()
        else:
            self.skip = nn.Conv2d(n_in, n_out, 1, bias=False)
        self.act = nn.ReLU(inplace=True)

    def forward(self, x, past):
        fused = self.conv(torch.cat([x, past], 1))
        return self.act(fused + self.skip(x))
61
+
62
+
63
class TPool(nn.Module):
    """Temporal pooling: merge `stride` consecutive frames into one.

    Input is (N*T, C, H, W); each group of `stride` frames is stacked on the
    channel axis and mixed down to C channels by a 1x1 convolution.
    """

    def __init__(self, n_f, stride):
        super().__init__()
        self.stride = stride
        self.conv = nn.Conv2d(n_f * stride, n_f, 1, bias=False)

    def forward(self, x):
        channels = x.shape[1]
        grouped = x.reshape(-1, self.stride * channels, *x.shape[-2:])
        return self.conv(grouped)
72
+
73
+
74
class TGrow(nn.Module):
    """Temporal growth: expand each frame into `stride` consecutive frames.

    A 1x1 convolution produces `stride * C` channels, which are then split
    back out along the batch/time axis.
    """

    def __init__(self, n_f, stride):
        super().__init__()
        self.stride = stride
        self.conv = nn.Conv2d(n_f, n_f * stride, 1, bias=False)

    def forward(self, x):
        channels = x.shape[1]
        expanded = self.conv(x)
        return expanded.reshape(-1, channels, *x.shape[-2:])
84
+
85
+
86
+ # ---------------------------------------------------------------------------
87
+ # Sequential streaming helpers
88
+ # ---------------------------------------------------------------------------
89
+
90
def _sequential_single_step(model, memory, work_queue):
    """Process the work queue until an output frame is produced or the queue is empty.

    Args:
        model: Sequential pipeline of blocks (MemBlock / TPool / TGrow / plain modules).
        memory: Per-block state list (mutated in place); entry `i` holds the
            previous input for a MemBlock or the pending-frame buffer for a TPool.
        work_queue: FIFO of TWorkItem(tensor, next_block_index); mutated in place.

    Returns:
        The next fully processed frame as an N1CHW tensor, or None if more
        input is needed before another frame can be emitted.
    """
    while work_queue:
        xt, i = work_queue.pop(0)
        if i == len(model):
            # Tensor has passed every block: emit it with a singleton time axis.
            return xt.unsqueeze(1)
        b = model[i]
        if isinstance(b, MemBlock):
            # First frame sees a zero memory; afterwards the *previous input*
            # (pre-activation) is used as the memory for the next frame.
            if memory[i] is None:
                xt_new = b(xt, xt * 0)
            else:
                xt_new = b(xt, memory[i])
            memory[i] = xt
            work_queue.insert(0, TWorkItem(xt_new, i + 1))
        elif isinstance(b, TPool):
            # Buffer frames until `stride` of them are available, then pool.
            if memory[i] is None:
                memory[i] = []
            memory[i].append(xt)
            if len(memory[i]) == b.stride:
                N, C, H, W = xt.shape
                # Channel-concat then view produces the (N*stride, C, H, W)
                # frame-major layout that TPool.forward expects.
                xt = b(torch.cat(memory[i], 1).view(N * b.stride, C, H, W))
                memory[i] = []
                work_queue.insert(0, TWorkItem(xt, i + 1))
        elif isinstance(b, TGrow):
            # One frame in, `stride` frames out; re-queue them front-first so
            # temporal order is preserved when inserting at queue position 0.
            xt = b(xt)
            NT, C, H, W = xt.shape
            for xt_next in reversed(
                xt.view(NT // b.stride, b.stride * C, H, W).chunk(b.stride, 1)
            ):
                work_queue.insert(0, TWorkItem(xt_next, i + 1))
        else:
            # Stateless block (conv / activation / upsample): pass straight through.
            xt = b(xt)
            work_queue.insert(0, TWorkItem(xt, i + 1))
    return None
124
+
125
+
126
def _apply_parallel(model, x):
    """Apply model with parallelization over the time axis. x: NTCHW.

    All frames are folded into the batch dimension so stateless blocks run
    in one shot; MemBlock layers receive the previous frame's features
    (zeros for the first frame) as their memory input.
    """
    N, T, C, H, W = x.shape
    x = x.reshape(N * T, C, H, W)
    for block in model:
        if isinstance(block, MemBlock):
            NT, C, H, W = x.shape
            frames = x.reshape(N, NT // N, C, H, W)
            # Shift by one along time: memory for frame t is frame t-1,
            # with a zero frame prepended for t == 0.
            shifted = F.pad(frames, (0, 0, 0, 0, 0, 0, 1, 0), value=0)
            past = shifted[:, : NT // N].reshape(x.shape)
            x = block(x, past)
        else:
            x = block(x)
    NT, C, H, W = x.shape
    return x.view(N, NT // N, C, H, W)
144
+
145
+
146
+ # ---------------------------------------------------------------------------
147
+ # ChunkedStreamingTAEHV
148
+ # ---------------------------------------------------------------------------
149
+
150
class ChunkedStreamingTAEHV(ModelMixin, ConfigMixin):
    """Streaming TAEHV autoencoder for temporal-compressed latent decoding.

    Owns the encoder/decoder weights directly so diffusers can load them
    from safetensors. Provides a streaming interface that processes one
    temporal chunk at a time, maintaining internal state across calls.
    """

    # Inference-oriented model; streaming state is incompatible with
    # activation checkpointing.
    _supports_gradient_checkpointing = False

    @register_to_config
    def __init__(
        self,
        latent_channels: int = 32,
        patch_size: int = 2,
        image_channels: int = 3,
        encoder_time_downscale: tuple[bool, ...] = (True, True, False),
        decoder_time_upscale: tuple[bool, ...] = (False, True, True),
        decoder_space_upscale: tuple[bool, ...] = (True, True, True),
    ):
        """Build the encoder/decoder stacks and initialise streaming state.

        Args:
            latent_channels: Channel count of the latent space.
            patch_size: Pixel-(un)shuffle factor applied before the encoder
                and after the decoder.
            image_channels: Channel count of the RGB frames.
            encoder_time_downscale: Per-stage flags; True gives that TPool a
                temporal stride of 2 (1 otherwise).
            decoder_time_upscale: Per-stage flags; True gives that TGrow a
                temporal stride of 2 (1 otherwise).
            decoder_space_upscale: Per-stage flags; True gives that Upsample
                a spatial factor of 2 (1 otherwise).
        """
        super().__init__()

        # Pixel-unshuffle packs a patch_size x patch_size neighbourhood into channels.
        in_ch = image_channels * patch_size ** 2

        self.encoder = nn.Sequential(
            _conv(in_ch, 64), nn.ReLU(inplace=True),
            TPool(64, 2 if encoder_time_downscale[0] else 1),
            _conv(64, 64, stride=2, bias=False),
            MemBlock(64, 64), MemBlock(64, 64), MemBlock(64, 64),
            TPool(64, 2 if encoder_time_downscale[1] else 1),
            _conv(64, 64, stride=2, bias=False),
            MemBlock(64, 64), MemBlock(64, 64), MemBlock(64, 64),
            TPool(64, 2 if encoder_time_downscale[2] else 1),
            _conv(64, 64, stride=2, bias=False),
            MemBlock(64, 64), MemBlock(64, 64), MemBlock(64, 64),
            _conv(64, latent_channels),
        )

        # Decoder channel widths per stage (mirrors the taehv reference).
        n_f = [256, 128, 64, 64]
        self.decoder = nn.Sequential(
            Clamp(),
            _conv(latent_channels, n_f[0]), nn.ReLU(inplace=True),
            MemBlock(n_f[0], n_f[0]), MemBlock(n_f[0], n_f[0]), MemBlock(n_f[0], n_f[0]),
            nn.Upsample(scale_factor=2 if decoder_space_upscale[0] else 1),
            TGrow(n_f[0], 2 if decoder_time_upscale[0] else 1),
            _conv(n_f[0], n_f[1], bias=False),
            MemBlock(n_f[1], n_f[1]), MemBlock(n_f[1], n_f[1]), MemBlock(n_f[1], n_f[1]),
            nn.Upsample(scale_factor=2 if decoder_space_upscale[1] else 1),
            TGrow(n_f[1], 2 if decoder_time_upscale[1] else 1),
            _conv(n_f[1], n_f[2], bias=False),
            MemBlock(n_f[2], n_f[2]), MemBlock(n_f[2], n_f[2]), MemBlock(n_f[2], n_f[2]),
            nn.Upsample(scale_factor=2 if decoder_space_upscale[2] else 1),
            TGrow(n_f[2], 2 if decoder_time_upscale[2] else 1),
            _conv(n_f[2], n_f[3], bias=False),
            nn.ReLU(inplace=True),
            _conv(n_f[3], image_channels * patch_size ** 2),
        )

        # Computed properties: total temporal compression/expansion factors,
        # derived from how many TPool/TGrow stages actually use stride 2.
        self.t_downscale = 2 ** sum(
            t.stride == 2 for t in self.encoder if isinstance(t, TPool)
        )
        self.t_upscale = 2 ** sum(
            t.stride == 2 for t in self.decoder if isinstance(t, TGrow)
        )
        # Leading decoder outputs discarded while the temporal memory warms up.
        self.frames_to_trim = self.t_upscale - 1
        self.patch_size = patch_size

        # Streaming state (initialised on first use / reset)
        self._encoder_work_queue: list[TWorkItem] = []
        self._encoder_memory: list = [None] * len(self.encoder)
        self._decoder_work_queue: list[TWorkItem] = []
        self._decoder_memory: list = [None] * len(self.decoder)
        self._n_frames_encoded: int = 0
        self._n_frames_decoded: int = 0
        self._last_encoder_input_frame: Tensor | None = None

    # ------------------------------------------------------------------
    # Streaming state management
    # ------------------------------------------------------------------

    def reset(self):
        """Reset streaming state for a new sequence."""
        self._encoder_work_queue = []
        self._encoder_memory = [None] * len(self.encoder)
        self._decoder_work_queue = []
        self._decoder_memory = [None] * len(self.decoder)
        self._n_frames_encoded = 0
        self._n_frames_decoded = 0
        self._last_encoder_input_frame = None

    # ------------------------------------------------------------------
    # Pre/post processing
    # ------------------------------------------------------------------

    def _preprocess_input_frames(self, x: Tensor) -> Tensor:
        """Pixel-unshuffle frames so patches become channels (no-op for patch_size 1)."""
        if self.patch_size > 1:
            x = F.pixel_unshuffle(x, self.patch_size)
        return x

    def _postprocess_output_frames(self, x: Tensor) -> Tensor:
        """Pixel-shuffle channels back to pixels and clamp in place to [0, 1]."""
        if self.patch_size > 1:
            x = F.pixel_shuffle(x, self.patch_size)
        return x.clamp_(0, 1)

    # ------------------------------------------------------------------
    # Streaming encode / decode (one chunk at a time)
    # ------------------------------------------------------------------

    def _streaming_encode_step(self, x: Tensor | None = None) -> Tensor | None:
        """Feed an input frame and try to produce an encoder output.

        Args:
            x: N1CHW RGB frame tensor with values in [0, 1], or None.
                (A multi-frame NTCHW tensor is also accepted; frames are
                enqueued individually via unbind.)
        Returns:
            N1CHW latent tensor, or None if not enough input accumulated.
        """
        if x is not None:
            # Keep the most recent raw frame around for callers that need it.
            self._last_encoder_input_frame = x[:, -1:]
            x = self._preprocess_input_frames(x)
            self._encoder_work_queue.extend(
                TWorkItem(xt, 0) for xt in x.unbind(1)
            )
            self._n_frames_encoded += x.shape[1]
        return _sequential_single_step(
            self.encoder, self._encoder_memory, self._encoder_work_queue
        )

    def _streaming_decode_step(self, x: Tensor | None = None) -> Tensor | None:
        """Feed a latent and try to produce a decoded frame.

        Args:
            x: N1CHW latent tensor, or None to retrieve the next pending frame.
        Returns:
            N1CHW decoded RGB frame tensor, or None.
        """
        if x is not None:
            self._decoder_work_queue.extend(
                TWorkItem(xt, 0) for xt in x.unbind(1)
            )
        while True:
            xt = _sequential_single_step(
                self.decoder, self._decoder_memory, self._decoder_work_queue
            )
            if xt is None:
                return None
            self._n_frames_decoded += 1
            # Silently drop the first `frames_to_trim` warm-up frames.
            if self._n_frames_decoded <= self.frames_to_trim:
                continue
            return self._postprocess_output_frames(xt)

    def _flush_decoder(self) -> list[Tensor]:
        """Drain all remaining decoded frames from the decoder."""
        frames = []
        while (frame := self._streaming_decode_step()) is not None:
            frames.append(frame)
        return frames

    # ------------------------------------------------------------------
    # Pipeline-facing encode / decode
    # ------------------------------------------------------------------

    @torch.inference_mode()
    def encode(self, img: Tensor) -> Tensor:
        """Encode a chunk of frames to a single latent.

        Args:
            img: [T, H, W, C] uint8 where T == t_downscale

        Returns:
            latent: [B, C, h, w]

        Raises:
            ValueError: If the chunk does not contain exactly t_downscale frames.
            RuntimeError: If the encoder fails to emit a latent for a full chunk.
        """
        assert img.dim() == 4 and img.shape[-1] == 3, "Expected [T, H, W, C] RGB uint8"

        if img.shape[0] != self.t_downscale:
            raise ValueError(
                f"Expected {self.t_downscale} frames, got {img.shape[0]}"
            )

        # uint8 THWC -> float NTCHW in [0, 1], on this module's device/dtype.
        rgb = (
            img.unsqueeze(0)
            .to(device=self.device, dtype=self.dtype)
            .permute(0, 1, 4, 2, 3)
            .contiguous()
            .div(255)
        )

        latent = self._streaming_encode_step(rgb)
        if latent is None:
            raise RuntimeError("Expected a latent after a full chunk")

        return latent.squeeze(1)

    @torch.inference_mode()
    def decode(self, latent: Tensor) -> Tensor:
        """Decode a latent to frames.

        Args:
            latent: [B, C, h, w]

        Returns:
            frames: [T, H, W, C] uint8
        """
        assert latent.dim() == 4, "Expected [B, C, h, w] latent tensor"

        z = latent.unsqueeze(1).to(device=self.device, dtype=self.dtype)

        if self._n_frames_decoded == 0:
            # NOTE(review): on the first call the same latent is fed
            # frames_to_trim extra times to prime the MemBlock memories; the
            # resulting warm-up frames are discarded (trimmed by the step
            # counter / flushed here). Confirm against the runtime client.
            for _ in range(self.frames_to_trim):
                self._streaming_decode_step(z)
            self._flush_decoder()

        first = self._streaming_decode_step(z)
        assert first is not None, "Expected decoded output after a latent"
        frames = [first, *self._flush_decoder()]

        # NTCHW float in [0, 1] -> THWC uint8 (batch dim of 1 squeezed away).
        decoded = torch.cat(frames, dim=1)
        decoded = (decoded.clamp(0, 1) * 255).round().to(torch.uint8)
        return decoded.squeeze(0).permute(0, 2, 3, 1)[..., :3]
vae/config.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_class_name": "ChunkedStreamingTAEHV",
3
+ "_diffusers_version": "0.36.0.dev0",
4
+ "auto_map": {
5
+ "AutoModel": "ae_model.ChunkedStreamingTAEHV"
6
+ },
7
+ "latent_channels": 32,
8
+ "patch_size": 2,
9
+ "image_channels": 3,
10
+ "encoder_time_downscale": [true, true, false],
11
+ "decoder_time_upscale": [false, true, true],
12
+ "decoder_space_upscale": [true, true, true]
13
+ }
vae/diffusion_pytorch_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b52e245bb86c62e159f50338e2e8f422d4b6f98b467164939c1c031c7d61352e
3
+ size 22755856