"""Shared MoE Audio Projector.

A simplified MoE projector combining the best ideas:
- Shared expert: Always-on baseline processing (from GLM4)
- Zero-initialized router: Learns specialization naturally (from Qwen3)
- Simple top-k softmax: No grouping complexity (from Mixtral)
- Renormalized weights: Top-k weights sum to 1

Architecture:
    Output = SharedExpert(x) + TopKRoutedExperts(x)

The shared expert ensures every audio token gets consistent baseline
processing, while routed experts can specialize for different patterns
(e.g., vowels vs consonants, silence vs speech).
"""

import torch
import torch.nn as nn
import torch.nn.functional as F  # noqa: N812


class SharedExpert(nn.Module):
    """Shared expert MLP that processes all tokens."""

    def __init__(self, input_dim: int, hidden_dim: int, output_dim: int):
        super().__init__()
        self.gate_proj = nn.Linear(input_dim, hidden_dim, bias=False)
        self.up_proj = nn.Linear(input_dim, hidden_dim, bias=False)
        self.down_proj = nn.Linear(hidden_dim, output_dim, bias=False)
        self.act = nn.SiLU()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.down_proj(self.act(self.gate_proj(x)) * self.up_proj(x))


class SwiGLUExpert(nn.Module):
    """Single SwiGLU expert MLP."""

    def __init__(self, input_dim: int, hidden_dim: int, output_dim: int):
        super().__init__()
        self.gate_proj = nn.Linear(input_dim, hidden_dim, bias=False)
        self.up_proj = nn.Linear(input_dim, hidden_dim, bias=False)
        self.down_proj = nn.Linear(hidden_dim, output_dim, bias=False)
        self.act = nn.SiLU()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.down_proj(self.act(self.gate_proj(x)) * self.up_proj(x))


class RoutedExperts(nn.Module):
    """
    Sparse routed experts using token dispatch.

    For each expert, gathers assigned tokens, processes them, then scatters back.
    Memory-efficient: activation memory scales as O(num_tokens * top_k * hidden_dim)
    rather than the O(num_tokens * num_experts * hidden_dim) of a dense pass
    through every expert.
    """

    def __init__(
        self, num_experts: int, top_k: int, input_dim: int, hidden_dim: int, output_dim: int
    ):
        super().__init__()
        self.num_experts = num_experts
        self.top_k = top_k
        self.output_dim = output_dim

        # ModuleList of expert MLPs
        self.experts = nn.ModuleList([
            SwiGLUExpert(input_dim, hidden_dim, output_dim)
            for _ in range(num_experts)
        ])

    def forward(
        self,
        hidden_states: torch.Tensor,
        top_k_indices: torch.Tensor,
        top_k_weights: torch.Tensor,
    ) -> torch.Tensor:
        """
        Token dispatch approach - memory efficient.

        Args:
            hidden_states: [num_tokens, input_dim]
            top_k_indices: [num_tokens, top_k]
            top_k_weights: [num_tokens, top_k]

        Returns:
            output: [num_tokens, output_dim]
        """
        num_tokens = hidden_states.shape[0]
        device = hidden_states.device
        dtype = hidden_states.dtype

        # Output accumulator
        output = torch.zeros(num_tokens, self.output_dim, device=device, dtype=dtype)

        # Process each expert
        for expert_idx, expert in enumerate(self.experts):
            # Find which (token, slot) pairs use this expert
            # top_k_indices: [N, K], we want all positions where value == expert_idx
            expert_mask = top_k_indices == expert_idx  # [N, K]

            if not expert_mask.any():
                continue

            # Get token indices and slot indices where this expert is selected
            token_indices, slot_indices = torch.where(expert_mask)

            # Gather the tokens for this expert
            expert_input = hidden_states[token_indices]  # [num_selected, input_dim]

            # Process through expert
            expert_output = expert(expert_input)  # [num_selected, output_dim]

            # Get weights for these tokens at these slots
            weights = top_k_weights[token_indices, slot_indices]  # [num_selected]

            # Weighted output
            weighted_output = expert_output * weights.unsqueeze(-1)

            # Scatter-add back to output
            output.index_add_(0, token_indices, weighted_output)

        return output


class SharedMoEBlock(nn.Module):
    """MoE block with shared expert + sparse routed experts."""

    def __init__(
        self,
        input_dim: int,
        hidden_dim: int,
        output_dim: int,
        num_experts: int = 4,
        top_k: int = 2,
    ):
        super().__init__()
        self.num_experts = num_experts
        self.top_k = top_k

        # Router: zero-initialized for natural learning
        self.router = nn.Linear(input_dim, num_experts, bias=False)
        nn.init.zeros_(self.router.weight)
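        # Zero weights give every expert the same initial probability
        # (1 / num_experts), so routing differentiates gradually during training.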

        # Shared expert (always active)
        self.shared_expert = SharedExpert(input_dim, hidden_dim, output_dim)

        # Routed experts (sparse)
        self.routed_experts = RoutedExperts(
            num_experts, self.top_k, input_dim, hidden_dim, output_dim
        )


        # For auxiliary loss
        self.last_router_logits = None

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        batch_size, seq_len, dim = hidden_states.shape

        # Shared expert output (all tokens)
        shared_out = self.shared_expert(hidden_states)

        # Routing
        flat_hidden = hidden_states.view(-1, dim)
        router_logits = self.router(flat_hidden)
        self.last_router_logits = router_logits

        # Softmax -> top-k -> renormalize
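        # e.g. probs [0.40, 0.30, 0.20, 0.10] with top_k=2 -> weights
        # [0.40, 0.30] -> renormalized to [0.57, 0.43] (sums to 1).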
        router_probs = F.softmax(router_logits.float(), dim=-1)
        top_k_weights, top_k_indices = torch.topk(router_probs, self.top_k, dim=-1)
        top_k_weights = top_k_weights / top_k_weights.sum(dim=-1, keepdim=True)
        top_k_weights = top_k_weights.to(hidden_states.dtype)

        # Routed expert output
        routed_out = self.routed_experts(flat_hidden, top_k_indices, top_k_weights)
        routed_out = routed_out.view(batch_size, seq_len, -1)

        # Combine: shared expert baseline + routed experts (grow in via zero-init down_proj)
        return shared_out + routed_out


def load_balancing_loss(
    router_logits: torch.Tensor | None, num_experts: int, top_k: int
) -> torch.Tensor:
    """Auxiliary loss to encourage balanced expert usage."""
    if router_logits is None:
        return torch.tensor(0.0)

    probs = F.softmax(router_logits.float(), dim=-1)
    _, selected = torch.topk(probs, top_k, dim=-1)

    # Fraction of tokens per expert
    expert_mask = F.one_hot(selected, num_experts).float()
    tokens_per_expert = expert_mask.mean(dim=(0, 1))

    # Average probability per expert
    prob_per_expert = probs.mean(dim=0)

    # Balance loss
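    # With perfectly balanced routing (each expert receives 1/num_experts of
    # the assignments and of the probability mass) this evaluates to 1.0.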
    return (tokens_per_expert * prob_per_expert).sum() * num_experts


def z_loss(router_logits: torch.Tensor | None) -> torch.Tensor:
    """Z-loss to prevent router logits from growing too large.

    Following the ST-MoE-style router z-loss (also used in DeepSeek models):
    penalizes large logits to keep the softmax in its "soft" regime where
    gradients flow properly.
    """
    if router_logits is None:
        return torch.tensor(0.0)

    # logsumexp ≈ max(logits), squaring penalizes large values
    return torch.logsumexp(router_logits.float(), dim=-1).square().mean()


class SharedMoEAudioProjector(nn.Module):
    """Shared MoE Audio Projector.

    Combines a shared expert (always-on) with sparse routed experts.
    Uses zero-initialized router for natural specialization learning.

    Config options:
        - num_experts: Number of routed experts (default: 4)
        - num_experts_per_tok: Top-k routing (default: 2)
        - router_aux_loss_coef: Load balancing loss weight (default: 0.01)
        - router_z_loss_coef: Z-loss weight to prevent large logits (default: 0.001)
    """

    def __init__(self, config):
        super().__init__()

        # Temporal downsampling
        self.k = getattr(config, "projector_pool_stride", 4)

        # Dimensions
        self.encoder_dim = config.encoder_dim
        in_dim = self.encoder_dim * self.k
        out_dim = config.llm_dim
        # No expansion - keep hidden dim same as input dim
        hidden_dim = getattr(config, "projector_hidden_dim", None) or in_dim

        # MoE config
        self.num_experts = getattr(config, "num_experts", 4)
        self.top_k = getattr(config, "num_experts_per_tok", 2)
        self.aux_loss_coef = getattr(config, "router_aux_loss_coef", 0.01)
        self.z_loss_coef = getattr(config, "router_z_loss_coef", 0.001)

        # Layers
        self.moe = SharedMoEBlock(in_dim, hidden_dim, out_dim, self.num_experts, self.top_k)

        # Init
        self._init_weights()

    def _init_weights(self):
        with torch.no_grad():
            # LeCun-style scaled init: std = 1/sqrt(fan_in)
            in_dim = self.encoder_dim * self.k
            std = 1.0 / (in_dim ** 0.5)

            # Use a smaller std for the final projection in the shared expert's residual path
            down_proj_std = std / 2.0

            # Shared expert
            nn.init.normal_(self.moe.shared_expert.gate_proj.weight, std=std)
            nn.init.normal_(self.moe.shared_expert.up_proj.weight, std=std)
            nn.init.normal_(self.moe.shared_expert.down_proj.weight, std=down_proj_std)

            # Routed experts - zero init down_proj so they "grow in" from zero
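            # (a zero down_proj makes each routed expert output exactly zero at
            # init, so the block starts out as the shared expert alone)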
            for expert in self.moe.routed_experts.experts:
                nn.init.normal_(expert.gate_proj.weight, std=std)
                nn.init.normal_(expert.up_proj.weight, std=std)
                nn.init.zeros_(expert.down_proj.weight)

            # Router stays zero-initialized

    def forward(self, x: torch.Tensor, attention_mask: torch.Tensor | None = None) -> torch.Tensor:
        batch_size, seq_len, dim = x.size()

        # Dtype
        target_dtype = self.moe.shared_expert.gate_proj.weight.dtype
        if x.dtype != target_dtype:
            x = x.to(target_dtype)

        # Pad for pooling
        if seq_len % self.k:
            x = F.pad(x, (0, 0, 0, self.k - seq_len % self.k))
            if attention_mask is not None:
                attention_mask = F.pad(attention_mask, (0, self.k - seq_len % self.k), value=0)

        # Store pooled attention mask for aux loss
        if attention_mask is not None:
            # Max-pool the attention mask
            pooled_mask = F.max_pool1d(attention_mask.float().unsqueeze(1), self.k, self.k)
            self.last_attention_mask = pooled_mask.squeeze(1).bool()
        else:
            self.last_attention_mask = None

        # Temporal pooling
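        # Reshape [B, T, D] -> [B, T // k, D * k]: each pooled token is the
        # concatenation of k consecutive encoder frames.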
        x = x.view(batch_size, -1, dim * self.k)

        # Forward
        x = self.moe(x)

        return x

    def get_aux_loss(self) -> torch.Tensor:
        """Get auxiliary losses (call after forward).

        Combines:
        - Load balancing loss: encourages balanced expert usage
        - Z-loss: prevents router logits from growing too large
        """
        router_logits = self.moe.last_router_logits
        if router_logits is None:
            return torch.tensor(0.0, device=self.moe.router.weight.device)

        # Retrieve the attention mask stored during the forward pass
        attention_mask = getattr(self, "last_attention_mask", None)

        # If a mask exists, filter the logits to only include un-padded tokens
        if attention_mask is not None:
            flat_mask = attention_mask.view(-1)
            # Ensure the mask is not all False, which would create an empty tensor
            if flat_mask.any():
                active_logits = router_logits[flat_mask]
            else:
                # If the mask is all False, there are no tokens to compute loss on
                return torch.tensor(0.0, device=router_logits.device)
        else:
            active_logits = router_logits

        balance_loss = load_balancing_loss(active_logits, self.num_experts, self.top_k)
        z = z_loss(active_logits)

        return self.aux_loss_coef * balance_loss + self.z_loss_coef * z
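

# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative only). The dimensions and the
# SimpleNamespace config below are assumptions for a smoke test; the real
# config object comes from the surrounding training code.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    from types import SimpleNamespace

    # Assumed dims: encoder_dim=512 audio encoder, llm_dim=1024 LLM hidden size.
    config = SimpleNamespace(encoder_dim=512, llm_dim=1024)
    projector = SharedMoEAudioProjector(config)

    # Fake encoder output: batch of 2, 100 frames, encoder_dim features.
    audio_features = torch.randn(2, 100, config.encoder_dim)
    attention_mask = torch.ones(2, 100)

    projected = projector(audio_features, attention_mask)
    aux = projector.get_aux_loss()

    # 100 frames pooled with stride 4 -> 25 tokens of width llm_dim.
    print(projected.shape)  # torch.Size([2, 25, 1024])
    print(aux.item())       # scalar auxiliary loss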