foggyforest committed
Commit e576ca4 · verified · 1 Parent(s): 3ec670f

Upload 14 files
DCMoE.py ADDED
@@ -0,0 +1,561 @@
1
+ import copy
2
+ import os
3
+ from typing import Optional
4
+ import torch
5
+ import torch.nn as nn
6
+ from torch import Tensor
7
+ import deepspeed
8
+ from deepspeed import comm as dist
9
+ from deepspeed.utils import groups, log_dist
10
+ from deepspeed.utils.timer import SynchronizedWallClockTimer
11
+ from deepspeed.moe.sharded_moe import FIRST_ALLTOALL_TIMER, MOE_TIMER, SECOND_ALLTOALL_TIMER, _AllToAll, einsum, gumbel_rsample
12
+ from transformers.activations import ACT2FN
13
+
14
+ def compress_matrix(A: torch.Tensor, mask: torch.Tensor, force_dim: Optional[int] = None, allow_larger_dim: Optional[bool] = None) -> torch.Tensor:
15
+ if A.shape[:2] != mask.shape:
16
+ raise ValueError("First two dimensions of A and mask must match.")
17
+ if mask.ndim != 2:
18
+ raise ValueError("mask must be a 2D tensor.")
19
+ if not ((mask == 0) | (mask == 1)).all():
20
+ raise ValueError(
21
+ f"mask must only contain 0s and 1s. dtype: {mask.dtype}. "
22
+ f"Invalid elements found at indices: {((mask != 0) & (mask != 1)).nonzero().tolist()} " # Get indices of elements not 0 AND not 1
23
+ f"with corresponding values: {mask[((mask != 0) & (mask != 1))].tolist()}. " # Get the values at those indices
24
+ f"\nOriginal mask (showing up to first 20 elements if large):\n{mask.flatten()[:20]}{'...' if mask.numel() > 20 else ''}"
25
+ )
26
+
27
+ S, E = mask.shape
28
+ trailing_dims_shape = A.shape[2:]
29
+ num_trailing_dims = len(trailing_dims_shape)
30
+ device = A.device
31
+
32
+ ones_per_column = mask.sum(dim=0)
33
+ X = ones_per_column.max().item() if force_dim is None else force_dim
34
+
35
+ if X == 0:
36
+ return torch.empty((0, E, *trailing_dims_shape), dtype=A.dtype, device=device)
37
+
38
+ sorted_row_indices_2d = torch.argsort(mask.float(), dim=0, descending=True)
39
+ view_shape_for_indices = (S, E, *((1,) * num_trailing_dims))
40
+ expanded_indices = sorted_row_indices_2d.view(view_shape_for_indices).expand_as(A)
41
+
42
+ A_gathered = torch.gather(A, 0, expanded_indices)
43
+
44
+ if X <= A_gathered.shape[0]:
45
+ B_candidate = A_gathered[:X, ...]
46
+ elif allow_larger_dim or allow_larger_dim is None:
47
+ if allow_larger_dim is None:
48
+ print(f"[Warning compress_matrix] Target dimension X ({X}) is larger than "
49
+ f"A's original row count S ({S}). Padding B_candidate with zeros.")
50
+ B_candidate = A_gathered
51
+ zeros_shape = [X - A_gathered.shape[0]] + list(B_candidate.shape[1:])
52
+ B_candidate = torch.cat((B_candidate, torch.zeros(zeros_shape, dtype=B_candidate.dtype, device=B_candidate.device)), dim=0) # Shape (X_target_dim, E, ...)
53
+ else:
54
+ raise AssertionError(
55
+ f"Target dimension X ({X}) is larger than A's original row count S ({S}) "
56
+ f"and allow_larger_dim is False. Padding is disallowed."
57
+ )
58
+ row_indices_for_B = torch.arange(X, device=device).unsqueeze(1)
59
+ b_mask_2d = row_indices_for_B < ones_per_column.unsqueeze(0)
60
+ view_shape_for_b_mask = (X, E, *((1,) * num_trailing_dims))
61
+ B = B_candidate * b_mask_2d.view(view_shape_for_b_mask).to(A.dtype)
62
+
63
+ return B
64
+
65
+
66
+ def decompress_matrix(B: torch.Tensor, mask: torch.Tensor, allow_larger_dim: Optional[bool] = None) -> torch.Tensor:
67
+ if B.shape[1] != mask.shape[1]:
68
+ raise ValueError("B's second dimension and mask's second dimension (E) must match.")
69
+ if mask.ndim != 2:
70
+ raise ValueError("mask must be a 2D tensor.")
71
+ if not ((mask == 0) | (mask == 1)).all():
72
+ raise ValueError("mask must only contain 0s and 1s.")
73
+
74
+ S, E = mask.shape
75
+ X = B.shape[0]
76
+ trailing_dims_shape = B.shape[2:]
77
+ num_trailing_dims = len(trailing_dims_shape)
78
+ device = B.device
79
+
80
+ if X == 0: return torch.zeros((S, E, *trailing_dims_shape), dtype=B.dtype, device=device)
81
+ if X <= S: pass
82
+ elif allow_larger_dim or allow_larger_dim is None:
83
+ if allow_larger_dim is None:
84
+ print(f"[Warning decompress_matrix] Input B.shape[0] ({X}) is larger than "
85
+ f"target A's row count S ({S}). Truncating B to its first {S} rows.")
86
+ B = B[:S, ...]
87
+ X = S
88
+ else:
89
+ raise AssertionError(
90
+ f"Input B.shape[0] ({X}) is larger than target A's row count S ({S}) "
91
+ f"and allow_larger_dim is False. Truncation is disallowed."
92
+ )
93
+
94
+ sorted_row_indices_2d = torch.argsort(mask.float(), dim=0, descending=True)
95
+ target_A_row_indices_2d = sorted_row_indices_2d[:X, :]
96
+ A_reconstructed = torch.zeros((S, E, *trailing_dims_shape), dtype=B.dtype, device=device)
97
+ view_shape_for_target_indices = (X, E, *((1,) * num_trailing_dims))
98
+ expanded_target_indices = target_A_row_indices_2d.view(view_shape_for_target_indices).expand_as(B)
99
+ A_reconstructed.scatter_(dim=0, index=expanded_target_indices, src=B)
100
+
101
+ return A_reconstructed
102
+
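+ # Usage sketch (illustrative comment, not part of the original API): for a 0/1
+ # token-to-expert mask, a compress/decompress round trip reproduces the masked input,
+ # because both functions sort rows by the same mask:
+ #   A = torch.randn(6, 4, 8)              # [S, E, d_model]
+ #   mask = (torch.rand(6, 4) > 0.5).int() # which tokens go to which expert
+ #   B = compress_matrix(A, mask)          # [max tokens per expert, E, d_model]
+ #   A_back = decompress_matrix(B, mask)   # equals A * mask.unsqueeze(-1)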
103
+
104
+
105
+ class AudioSharedExpertMLP(nn.Module):
106
+ """
107
+ Shared expert MLP for UniMoE-Audio model.
108
+ Handles common audio feature transformations across all tokens.
109
+ """
110
+ def __init__(self, config):
111
+ super().__init__()
112
+ self.hidden_size = config.hidden_size
113
+ self.intermediate_size = config.shared_intermediate_size
114
+ self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
115
+ self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
116
+ self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
117
+ self.act_fn = ACT2FN[config.hidden_act]
118
+
119
+ def forward(self, hidden_state):
120
+ return self.down_proj(self.act_fn(self.gate_proj(hidden_state)) * self.up_proj(hidden_state))
121
+
122
+
123
+ class AudioDynamicExpertMLP(nn.Module):
124
+ """
125
+ Dynamic expert MLP for UniMoE-Audio model.
126
+ Specialized for adaptive audio feature processing based on content.
127
+ """
128
+ def __init__(self, config):
129
+ super().__init__()
130
+ self.hidden_size = config.hidden_size
131
+ self.intermediate_size = config.dynamic_intermediate_size
132
+ self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
133
+ self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
134
+ self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
135
+ self.act_fn = ACT2FN[config.hidden_act]
136
+
137
+ def forward(self, hidden_state):
138
+ return self.down_proj(self.act_fn(self.gate_proj(hidden_state)) * self.up_proj(hidden_state))
139
+
140
+
141
+ class AudioNullExpertMLP(nn.Module):
142
+ """
143
+ Null expert MLP for UniMoE-Audio model.
144
+ Returns zero output for tokens that don't require expert processing.
145
+ """
146
+ def __init__(self, config):
147
+ super().__init__()
148
+
149
+ def forward(self, hidden_state):
150
+ return torch.zeros_like(hidden_state, dtype=hidden_state.dtype, device=hidden_state.device)
151
+
152
+
153
+ def audio_sparse_expert_mixer(scores, top_k, jitter_eps, training):
154
+ """
155
+ Sparse expert mixing function for UniMoE-Audio.
156
+ Implements adaptive expert selection with noise injection for training.
157
+ """
158
+ masked_scores = scores
159
+ multiplier_list = []
160
+ selected_experts_list = []
161
+
162
+ for _ in range(top_k):
163
+ with torch.no_grad():
164
+ mask_logits_threshold, max_ind = masked_scores.max(dim=-1, keepdim=True)
165
+ factor = scores.abs().clamp(min=mask_logits_threshold.abs())
166
+ mask_logits_threshold = ((mask_logits_threshold - scores) / factor) > (2 * jitter_eps)
167
+
168
+ masked_gates = masked_scores.masked_fill(mask_logits_threshold, float("-inf"))
169
+
170
+ selected_experts = max_ind
171
+
172
+ masked_gates = torch.softmax(masked_gates, dim=-1)
173
+ multiplier_o = masked_gates.gather(dim=-1, index=selected_experts)
174
+
175
+ multiplier = multiplier_o
176
+
177
+ masked_scores = torch.scatter(
178
+ masked_scores,
179
+ -1,
180
+ selected_experts,
181
+ float("-inf"),
182
+ )
183
+
184
+ multiplier_list.append(multiplier)
185
+ selected_experts_list.append(selected_experts)
186
+
187
+ multiplier = torch.concat(multiplier_list, dim=-1)
188
+ selected_experts = torch.concat(selected_experts_list, dim=-1)
189
+ return (
190
+ multiplier,
191
+ selected_experts,
192
+ )
193
+
194
+
195
+ def audio_dynamic_expert_selection(logits, top_p):
196
+ """
197
+ Dynamic expert selection for UniMoE-Audio based on cumulative probability threshold.
198
+ Adapts the number of experts based on audio content complexity.
199
+ """
200
+ dynamic_scores = torch.softmax(logits, dim=-1)
201
+ dynamic_scores_sorted, _ = torch.sort(dynamic_scores, dim=-1, descending=True)
202
+ dynamic_scores_cumsum = dynamic_scores_sorted.cumsum(dim=-1)
203
+ dynamic_top_k = (~(dynamic_scores_cumsum >= top_p)).sum(dim=-1)
204
+ dynamic_top_k = dynamic_top_k + 1
205
+ return dynamic_top_k
206
+
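+ # Worked example (illustrative): with routing probabilities [0.5, 0.3, 0.15, 0.05] and
+ # top_p = 0.7, the sorted cumulative sums are [0.5, 0.8, 0.95, 1.0]; only one entry is
+ # still below the threshold, so the token receives 1 + 1 = 2 experts, while a
+ # near-uniform distribution would cross the threshold later and receive more experts.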
207
+
208
+ def _audio_expert_capacity(num_tokens, num_experts, capacity_factor: Tensor, min_capacity: Tensor) -> Tensor:
209
+ """Calculate expert capacity for UniMoE-Audio based on token distribution and capacity factor."""
210
+ capacity = torch.ceil((num_tokens / num_experts) * capacity_factor).to(torch.int64)
211
+ if capacity < min_capacity:
212
+ capacity = min_capacity.to(torch.int64)
213
+ return capacity
214
+
215
+
216
+ def calculate_audio_global_routing_weight(
217
+ expert_mask: torch.Tensor,
218
+ full_router_logits: torch.Tensor,
219
+ mlp_dynamic_expert_num: int,
220
+ routing_weights: torch.Tensor,
221
+ ):
222
+ """
223
+ Calculate global routing weights for UniMoE-Audio combining dynamic and fixed expert weights.
224
+ Optimized for audio generation tasks.
225
+ """
226
+ global_weight = torch.softmax(full_router_logits.masked_fill(expert_mask == 0, float("-inf")), dim=-1)
227
+ global_dynamic_weight = global_weight[:, :mlp_dynamic_expert_num]
228
+ global_fixed_weight = global_weight[:, mlp_dynamic_expert_num:]
229
+ global_dynamic_weight = routing_weights * global_dynamic_weight.sum(-1).unsqueeze(-1).expand(-1, routing_weights.shape[-1])
230
+ global_weight = torch.cat((global_dynamic_weight, global_fixed_weight), dim=-1)
231
+ return global_weight
232
+
233
+
234
+ class UniMoEAudioSparseMoeBlock(nn.Module):
235
+ """
236
+ UniMoE-Audio Sparse Mixture of Experts block with dynamic routing and expert selection.
237
+ Optimized for audio generation tasks with efficient sparse operations and capacity management.
238
+ """
239
+
240
+ def __init__(self, config):
241
+ super().__init__()
242
+ self.hidden_dim = config.hidden_size
243
+ self.mlp_dynamic_expert_num = config.mlp_dynamic_expert_num + config.mlp_dynamic_null_expert_num
244
+ self.mlp_dynamic_real_expert_num = config.mlp_dynamic_expert_num
245
+ self.mlp_dynamic_null_expert_num = config.mlp_dynamic_null_expert_num
246
+ self.mlp_dynamic_top_p = config.mlp_dynamic_top_p
247
+ self.mlp_dynamic_top_k = config.mlp_dynamic_top_k
248
+ self.mlp_fixed_expert_num = config.mlp_fixed_expert_num
249
+ self.num_experts = self.mlp_dynamic_expert_num + self.mlp_fixed_expert_num
250
+
251
+ if self.mlp_dynamic_top_p == 0:
252
+ print(f"mlp_dynamic_top_p is 0, will use mlp_dynamic_top_k={self.mlp_dynamic_top_k} instead !!!")
253
+
254
+ self.ignore_differentiable_router = config.ignore_differentiable_router
255
+ if self.ignore_differentiable_router:
256
+ print("ignore_differentiable_router is True, will not use router_logits !!!")
257
+
258
+ self.gate = nn.Linear(self.hidden_dim, self.num_experts, bias=False)
259
+ self.fixed_real_moe = nn.ModuleList([AudioSharedExpertMLP(config) for _ in range(self.mlp_fixed_expert_num)])
260
+ self.dynamic_real_moe = UniMoEAudioMoE(config, AudioDynamicExpertMLP(config), self.mlp_dynamic_real_expert_num, config.ep_size)
261
+
262
+ self.router_jitter_noise = config.router_jitter_noise
263
+ self.input_jitter_noise = config.input_jitter_noise
264
+
265
+ self.min_capacity = config.min_capacity
266
+ self.capacity_factor = config.capacity_factor
267
+ self.token_drop = config.token_drop
268
+ self.drop_policy = config.drop_policy
269
+
270
+ self.avg_hidden_states_last = config.avg_hidden_states_last
271
+ self.drop_token_num_print = config.drop_token_num_print
272
+ self.fp32_gate = config.fp32_gate
273
+
274
+ def forward(self, hidden_states: torch.Tensor, attention_mask: torch.Tensor, aux_balance_weight: torch.Tensor=None):
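+ # Routing pipeline (descriptive comment): score tokens with the gate, pick a per-token
+ # expert count via top-p (or the fixed top-k fallback), build the expert mask, compute
+ # the balance loss, optionally drop tokens beyond expert capacity, then combine the
+ # dynamic-expert output with the always-on shared experts using the global weights.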
275
+ batch_size, sequence_length, hidden_dim = hidden_states.shape
276
+ original_hidden_states = hidden_states
277
+
278
+ if self.training and self.fp32_gate:
279
+ hidden_states = hidden_states.float()
280
+
281
+ if self.training and self.input_jitter_noise > 0:
282
+ hidden_states *= torch.empty_like(hidden_states).uniform_(1.0 - self.input_jitter_noise, 1.0 + self.input_jitter_noise)
283
+
284
+ hidden_states = hidden_states.view(-1, hidden_dim)
285
+
286
+ if self.training and self.fp32_gate:
287
+ full_router_logits = torch.nn.functional.linear(hidden_states, weight=self.gate.weight.float(), bias=None)
288
+ else:
289
+ full_router_logits = self.gate(hidden_states)
290
+ dynamic_router_logits = full_router_logits[:, : self.mlp_dynamic_expert_num]
291
+
292
+ if self.mlp_dynamic_top_p != 0:
293
+ dynamic_top_k = audio_dynamic_expert_selection(dynamic_router_logits, self.mlp_dynamic_top_p)
294
+ else:
295
+ dynamic_top_k = torch.full((dynamic_router_logits.shape[0],), self.mlp_dynamic_top_k, dtype=torch.int, device=dynamic_router_logits.device)
296
+
297
+ expert_mask = torch.zeros((batch_size * sequence_length, self.num_experts), dtype=torch.int, device=hidden_states.device)
298
+
299
+ routing_weights = torch.zeros((batch_size * sequence_length, self.mlp_dynamic_expert_num), dtype=hidden_states.dtype, device=hidden_states.device)
300
+ for top_k in range(1, self.mlp_dynamic_expert_num + 1):
301
+ group_idx = torch.nonzero(dynamic_top_k == top_k, as_tuple=True)[0]
302
+ if len(group_idx) == 0:
303
+ continue
304
+
305
+ dynamic_group_logits = dynamic_router_logits[group_idx]
306
+ group_routing_weights, group_selected_experts = audio_sparse_expert_mixer(
307
+ dynamic_group_logits,
308
+ top_k=top_k,
309
+ jitter_eps=self.router_jitter_noise,
310
+ training=self.training and not self.ignore_differentiable_router,
311
+ )
312
+
313
+ group_expert_mask = torch.nn.functional.one_hot(group_selected_experts, num_classes=self.num_experts)
314
+ group_expert_mask = group_expert_mask.sum(dim=1)
315
+
316
+ group_weight = torch.zeros((len(group_idx), self.mlp_dynamic_expert_num), dtype=hidden_states.dtype, device=hidden_states.device)
317
+ group_weight.scatter_(dim=-1, index=group_selected_experts, src=group_routing_weights)
318
+ routing_weights.index_add_(0, group_idx, group_weight)
319
+
320
+ expert_mask.index_add_(0, group_idx, group_expert_mask.to(expert_mask.dtype))
321
+
322
+ routing_weights = routing_weights / (routing_weights.sum(dim=-1).unsqueeze(-1).expand(-1, routing_weights.shape[-1]) + 1e-6)
323
+
324
+ if attention_mask is not None:
325
+ attention_mask = attention_mask.to(expert_mask.dtype).view(-1).unsqueeze(-1).expand(-1, self.num_experts)
326
+ expert_mask = expert_mask * attention_mask
327
+
328
+ if self.mlp_dynamic_expert_num < self.num_experts:
329
+ expert_mask[:, self.mlp_dynamic_expert_num :] = 1
330
+
331
+ aux_loss = audio_load_balancing_loss_func(
332
+ expert_mask=expert_mask,
333
+ mlp_dynamic_expert_num=self.mlp_dynamic_expert_num,
334
+ global_weight=None,
335
+ full_router_logits=full_router_logits,
336
+ routing_weights=routing_weights,
337
+ aux_balance_weight=aux_balance_weight,
338
+ )
339
+
340
+ if self.token_drop:
341
+ expert_mask_dtype = expert_mask.dtype
342
+ capacity = _audio_expert_capacity(batch_size * sequence_length, self.mlp_dynamic_expert_num, torch.tensor(self.capacity_factor), torch.tensor(self.min_capacity))
343
+ if self.drop_policy == "probs":
344
+ if capacity > dynamic_router_logits.shape[0]:
345
+ print(f"[warning] token capacity({capacity}) > token num({dynamic_router_logits.shape[0]}), setting capacity=token num")
346
+ capacity = dynamic_router_logits.shape[0]
347
+ dynamic_expert_mask = expert_mask[:, : self.mlp_dynamic_expert_num].bool()
348
+ token_drop_router_logits = torch.masked_fill(dynamic_router_logits, ~dynamic_expert_mask, torch.finfo(dynamic_router_logits.dtype).min)
349
+ capacity_probs, capacity_indices = torch.topk(token_drop_router_logits, k=capacity, dim=0, sorted=False)
350
+ capacity_mask = torch.zeros_like(expert_mask).scatter(0, capacity_indices, 1)
351
+ capacity_mask[:, self.mlp_dynamic_expert_num :] = 1
352
+ expert_mask = torch.logical_and(expert_mask, capacity_mask)
353
+
354
+ ori_token_num = dynamic_expert_mask.sum().item()
355
+ cur_token_num = expert_mask[:, : self.mlp_dynamic_expert_num].sum().item()
356
+ if self.drop_token_num_print and ("RANK" not in os.environ or int(os.environ["RANK"]) == 0):
357
+ print(f"drop {ori_token_num - cur_token_num} tokens from total {ori_token_num} tokens")
358
+
359
+ elif self.drop_policy == "position":
360
+ locations = torch.cumsum(expert_mask, dim=0) - 1
361
+ expert_mask *= torch.lt(locations, capacity)
362
+ else:
363
+ raise ValueError(f"Invalid drop_policy: {self.drop_policy}")
364
+ expert_mask = expert_mask.to(expert_mask_dtype)
365
+
366
+ routing_weights = routing_weights.masked_fill(~(expert_mask[:, : self.mlp_dynamic_expert_num].bool()), 0.0)
367
+ routing_weights = routing_weights / (routing_weights.sum(dim=-1).unsqueeze(-1).expand(-1, routing_weights.shape[-1]) + 1e-6)
368
+
369
+ if self.mlp_dynamic_expert_num < self.num_experts:
370
+ global_weight = calculate_audio_global_routing_weight(expert_mask, full_router_logits, self.mlp_dynamic_expert_num, routing_weights)
371
+ else:
372
+ global_weight = routing_weights
373
+
374
+ hidden_states = original_hidden_states.view(-1, hidden_dim)
375
+
376
+ final_hidden_states = torch.zeros((batch_size * sequence_length, hidden_dim), dtype=hidden_states.dtype, device=hidden_states.device)
377
+ global_weight = global_weight.to(hidden_states.dtype)
378
+
379
+ current_hidden_states = self.dynamic_real_moe(hidden_states, expert_mask=expert_mask[:, : self.mlp_dynamic_real_expert_num], router_weight=global_weight[:, : self.mlp_dynamic_real_expert_num])
380
+ final_hidden_states = final_hidden_states + current_hidden_states
381
+
382
+ for expert_idx in range(self.mlp_fixed_expert_num):
383
+ expert_layer = self.fixed_real_moe[expert_idx]
384
+
385
+ current_state = hidden_states
386
+ current_global_weight = global_weight[:, self.mlp_dynamic_expert_num + expert_idx].unsqueeze(-1)
387
+ current_hidden_states = expert_layer(current_state) * current_global_weight
388
+
389
+ final_hidden_states = final_hidden_states + current_hidden_states
390
+
391
+ final_hidden_states = final_hidden_states.reshape(batch_size, sequence_length, hidden_dim)
392
+
393
+ if not self.training and self.avg_hidden_states_last:
394
+ dist.all_reduce(final_hidden_states, op=dist.ReduceOp.AVG, group=self.dynamic_real_moe.deepspeed_moe.ep_group)
395
+
396
+ return final_hidden_states, full_router_logits, dynamic_top_k, expert_mask, global_weight, aux_loss
397
+
398
+
399
+ def audio_load_balancing_loss_func(
400
+ expert_mask: torch.Tensor,
401
+ mlp_dynamic_expert_num: int,
402
+ global_weight: Optional[torch.Tensor] = None,
403
+ full_router_logits: Optional[torch.Tensor] = None,
404
+ routing_weights: Optional[torch.Tensor] = None,
405
+ aux_balance_weight: Optional[torch.Tensor] = None,
406
+ ) -> float:
407
+ """Calculate load balancing loss for UniMoE-Audio expert routing to encourage balanced usage."""
408
+ min_dtype = torch.finfo(full_router_logits.dtype).min
409
+ global_weight = full_router_logits.masked_fill(expert_mask == 0, min_dtype)
410
+ global_weight = global_weight[:, :mlp_dynamic_expert_num]
411
+ global_weight = torch.softmax(global_weight, dim=-1)
412
+ expert_mask = expert_mask[:, :mlp_dynamic_expert_num]
413
+
414
+ num_experts = expert_mask.shape[-1]
415
+ if aux_balance_weight is None:
416
+ tokens_per_expert = torch.mean(expert_mask.float(), dim=0)
417
+ router_prob_per_expert = torch.mean(global_weight, dim=0)
418
+ else:
419
+ batch_size, sequence_length = aux_balance_weight.shape
420
+ num_hidden_layers = global_weight.shape[0] // (batch_size * sequence_length)
421
+ expert_attention_mask = aux_balance_weight[None, :, :, None].expand((num_hidden_layers, batch_size, sequence_length, num_experts)).reshape(-1, num_experts).to(global_weight.device)
422
+ tokens_per_expert = torch.sum(expert_mask.float() * expert_attention_mask, dim=0) / torch.sum(expert_attention_mask, dim=0)
423
+ router_prob_per_expert = torch.sum(global_weight * expert_attention_mask, dim=0) / torch.sum(expert_attention_mask, dim=0)
424
+
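+ # Descriptive comment: tokens_per_expert is the (optionally weighted) fraction of tokens
+ # assigned to each dynamic expert and router_prob_per_expert is the mean router
+ # probability it receives; their dot product, scaled by num_experts below, is the
+ # Switch-Transformer-style auxiliary loss that is minimized when routing is balanced.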
425
+ overall_loss = torch.sum(tokens_per_expert * router_prob_per_expert)
426
+
427
+ return overall_loss * num_experts
428
+
429
+
430
+ class AudioExperts(deepspeed.moe.experts.Experts):
431
+ """Custom Audio experts class extending DeepSpeed MoE experts with additional functionality."""
432
+
433
+ def __init__(self, expert, num_local_experts=1, expert_group_name=None):
434
+ super(deepspeed.moe.experts.Experts, self).__init__()
435
+
436
+ self.deepspeed_experts = torch.nn.ModuleList([copy.deepcopy(expert) for i in range(num_local_experts)])
437
+ self.num_local_experts = num_local_experts
438
+
439
+ for expert in self.deepspeed_experts:
440
+ for name, param in expert.named_parameters():
441
+ param.allreduce = False
442
+ param.group_name = expert_group_name
443
+
444
+ def forward(self, inputs):
445
+ chunks = inputs.chunk(self.num_local_experts, dim=1)
446
+ expert_outputs = []
447
+ for chunk, expert in zip(chunks, self.deepspeed_experts):
448
+ out = expert(chunk)
449
+ if type(out) is tuple:
450
+ out = out[0]
451
+ expert_outputs += [out]
452
+
453
+ expert_output = torch.cat(expert_outputs, dim=1)
454
+ return expert_output
455
+
456
+
457
+ class AudioMOELayer(deepspeed.moe.sharded_moe.MOELayer):
458
+ """Custom Audio MoE layer extending DeepSpeed MOELayer with matrix compression optimization."""
459
+
460
+ def __init__(
461
+ self,
462
+ experts: nn.Module,
463
+ ep_group_name,
464
+ ep_size,
465
+ num_local_experts: int,
466
+ use_tutel: bool = False,
467
+ ) -> None:
468
+ super(deepspeed.moe.sharded_moe.MOELayer, self).__init__()
469
+
470
+ self.experts = experts
471
+ self.ep_group = None
472
+ self.ep_size = ep_size
473
+ self.ep_group_name = ep_group_name
474
+ self.num_local_experts = num_local_experts
475
+ self.time_falltoall = 0.0
476
+ self.time_salltoall = 0.0
477
+ self.time_moe = 0.0
478
+ self.timers = SynchronizedWallClockTimer()
479
+ self.wall_clock_breakdown = False
480
+
481
+ def _set_ep_group(self, ep_group):
482
+ self.ep_group = ep_group
483
+
484
+ def forward(self, hidden_states: Tensor, expert_mask: Tensor, router_weight: Tensor) -> Tensor:
485
+ router_weight = router_weight * expert_mask
486
+
487
+ if self.wall_clock_breakdown:
488
+ self.timers(MOE_TIMER).start()
489
+
490
+ d_model = hidden_states.shape[-1]
491
+ seq_len = hidden_states.shape[0]
492
+ expert_num = expert_mask.shape[-1]
493
+ capacity = expert_mask.sum(dim=0).max()
494
+ if self.ep_group is not None:
495
+ dist.all_reduce(capacity, op=dist.ReduceOp.MAX, group=self.ep_group)
496
+
497
+ compres_hidden_states = hidden_states.unsqueeze(1).expand(seq_len, expert_num, d_model)
498
+ compres_hidden_states = compress_matrix(compres_hidden_states, expert_mask, force_dim=capacity, allow_larger_dim=True) # [C, expert_num, d_model]
499
+ compres_expert_mask = compress_matrix(expert_mask, expert_mask, force_dim=capacity, allow_larger_dim=True)
500
+ dispatched_input = einsum("ce,cem->ecm", compres_expert_mask, compres_hidden_states)
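+ # Shape note (descriptive comment): compres_expert_mask is [capacity, E] and
+ # compres_hidden_states is [capacity, E, d_model], so this einsum packs each expert's
+ # slots densely into dispatched_input of shape [E, capacity, d_model] before the
+ # all-to-all exchange.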
501
+
502
+ if self.wall_clock_breakdown:
503
+ self.timers(FIRST_ALLTOALL_TIMER).start()
504
+
505
+ dispatched_input = _AllToAll.apply(self.ep_group, dispatched_input)
506
+
507
+ if self.wall_clock_breakdown:
508
+ self.timers(FIRST_ALLTOALL_TIMER).stop()
509
+ self.time_falltoall = self.timers(FIRST_ALLTOALL_TIMER).elapsed(reset=False)
510
+
511
+ dispatched_input = dispatched_input.reshape(self.ep_size, self.num_local_experts, -1, d_model)
512
+
513
+ expert_output = self.experts(dispatched_input)
514
+
515
+ if self.wall_clock_breakdown:
516
+ self.timers(SECOND_ALLTOALL_TIMER).start()
517
+
518
+ expert_output = _AllToAll.apply(self.ep_group, expert_output)
519
+
520
+ if self.wall_clock_breakdown:
521
+ self.timers(SECOND_ALLTOALL_TIMER).stop()
522
+ self.time_salltoall = self.timers(SECOND_ALLTOALL_TIMER).elapsed(reset=False)
523
+
524
+ expert_output = expert_output.reshape(self.ep_size * self.num_local_experts, -1, d_model)
525
+ expert_output = decompress_matrix(expert_output.transpose(0, 1), expert_mask, allow_larger_dim=True)
526
+ combined_output = einsum("se,sem->sm", router_weight, expert_output)
527
+ if self.wall_clock_breakdown:
528
+ self.timers(MOE_TIMER).stop()
529
+ self.time_moe = self.timers(MOE_TIMER).elapsed(reset=False)
530
+
531
+ return combined_output
532
+
533
+
534
+ class UniMoEAudioMoE(deepspeed.moe.layer.MoE):
535
+ """Custom Audio MoE class extending DeepSpeed MoE with configuration and parallelism setup."""
536
+
537
+ def __init__(self, config, expert, num_experts, ep_size, moe_name_prefix="ep_size"):
538
+ super(deepspeed.moe.layer.MoE, self).__init__()
539
+ self.enable_expert_tensor_parallelism = config.enable_expert_tensor_parallelism
540
+ self.ep_size = ep_size
541
+ self.num_experts = num_experts
542
+ self.expert_group_name = f"{moe_name_prefix}_{self.ep_size}"
543
+ self.num_local_experts = self.num_experts // self.ep_size
544
+ log_dist(f"Creating MoE layer with num_experts: {self.num_experts} | num_local_experts: {self.num_local_experts} | expert_parallel_size: {self.ep_size}", [0])
545
+ experts = AudioExperts(expert, self.num_local_experts, self.expert_group_name)
546
+ self.deepspeed_moe = AudioMOELayer(experts, self.expert_group_name, self.ep_size, self.num_local_experts)
547
+
548
+ def set_deepspeed_parallelism(self, use_data_before_expert_parallel_=False):
549
+ self._create_process_groups(use_data_before_expert_parallel_=use_data_before_expert_parallel_)
550
+
551
+ def _create_process_groups(self, use_data_before_expert_parallel_=False):
552
+ if self.expert_group_name not in groups._get_expert_parallel_group_dict():
553
+ print(f"No existing process group found, creating a new group named: {self.expert_group_name}")
554
+ if (groups.mpu is None) or (not self.enable_expert_tensor_parallelism):
555
+ groups._create_expert_and_data_parallel(self.ep_size, use_data_before_expert_parallel_=use_data_before_expert_parallel_)
556
+ else:
557
+ groups._create_expert_data_and_model_parallel(self.ep_size, mpu=groups.mpu, use_data_before_expert_parallel_=use_data_before_expert_parallel_)
558
+ self.deepspeed_moe._set_ep_group(groups._get_expert_parallel_group(self.expert_group_name))
559
+
560
+ def forward(self, *input_args, **input_kwargs):
561
+ return self.deepspeed_moe(*input_args, **input_kwargs)
README (1).md ADDED
@@ -0,0 +1,216 @@
1
+ ---
2
+ license: mit
3
+ language:
4
+ - en
5
+ - zh
6
+ base_model:
7
+ - Qwen/Qwen2-0.5B
8
+ pipeline_tag: feature-extraction
9
+ library_name: sentence-transformers
10
+ tags:
11
+ - MoE
12
+ - Unified Generation
13
+ - Speech and Music
14
+ - Multi-modal
15
+ datasets:
16
+ ---
17
+
18
+ <h1 align="center">UniMoE-Audio</h1>
19
+
20
+ **UniMoE-Audio** is a unified framework that seamlessly combines speech and music generation. Powered by a novel dynamic-capacity Mixture-of-Experts design, it adapts intelligently to input complexity, enabling high-fidelity voice and expressive music within a single model.
21
+
22
+ ## Key Innovations
23
+
24
+ #### **Top-P Dynamic Routing Strategy**
25
+ We introduce a **Top-P routing strategy** that overcomes the limitations of conventional static Top-K routing:
26
+
27
+ - **Dynamic Expert Allocation**: Instead of assigning a fixed number of experts to every token, our approach dynamically determines the number of experts based on token complexity
28
+ - **Resource Efficiency**: Simple tokens don't consume unnecessary resources, while complex tokens receive sufficient processing power
29
+ - **Performance Optimization**: Results in improved overall efficiency and performance (see the sketch below)
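+
+ A minimal sketch of how such a Top-P rule can pick the per-token expert count, mirroring `audio_dynamic_expert_selection` in `DCMoE.py` (the function name and example values here are illustrative, not from the released code):
+
+ ```python
+ import torch
+
+ def dynamic_expert_count(router_logits: torch.Tensor, top_p: float) -> torch.Tensor:
+     """Number of experts needed per token to cover `top_p` of the routing probability mass."""
+     probs = torch.softmax(router_logits, dim=-1)                  # [tokens, experts]
+     sorted_probs, _ = torch.sort(probs, dim=-1, descending=True)
+     cumulative = sorted_probs.cumsum(dim=-1)
+     # Count experts whose cumulative mass is still below top_p, then add one so the
+     # threshold is actually crossed: confident tokens get 1 expert, ambiguous ones more.
+     return (~(cumulative >= top_p)).sum(dim=-1) + 1
+
+ logits = torch.tensor([[4.0, 0.1, 0.1, 0.1],   # confident token  -> 1 expert
+                        [1.0, 0.9, 0.8, 0.7]])  # ambiguous token  -> 3 experts
+ print(dynamic_expert_count(logits, top_p=0.7))  # tensor([1, 3])
+ ```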
30
+
31
+ #### **Three-Stage Training Curriculum**
32
+ We employ a comprehensive training approach to enable effective joint learning from imbalanced data:
33
+
34
+ 1. **Independent Specialist Training** - Initial expert specialization
35
+ 2. **Integration with Warm-up** - Gradual system integration
36
+ 3. **Synergistic Joint Training** - Collaborative optimization
37
+
38
+ ## Model Information
39
+ - **Base Model**: Qwen2.5-VL with MoE extensions
40
+ - **Audio Codec**: DAC (Descript Audio Codec) with 12 channels
41
+ - **Expert Configuration**: 8 dynamic experts + 2 shared experts
42
+ - **Audio Sampling Rate**: 16kHz
43
+ - Usage:
44
+ - Text-to-Speech (TTS)
45
+ - Speech-to-Text (STT)
46
+ - Music Generation
47
+ - GPU Requirements:
48
+ - Memory: 16GB+
49
+ - CUDA-enabled GPU
50
+
51
+ ## Open-source Plan
52
+ - [☑️] Model Checkpoint
53
+ - [☑️] [UniMoE-Audio-preview](https://huggingface.co/foggyforest/UniMoE-Audio-preview)
54
+ - [☑️] Inference Code: [HITsz-TMG/UniMoE-Audio](https://github.com/HITsz-TMG/UMOE-Scaling-Unified-Multimodal-LLMs/tree/master/UniMoE-Audio)
55
+ - [☑️] Technical Report: [UniMoE-Audio: Unified Speech and Music Generation with Dynamic-Capacity MoE]()
56
+
57
+ ## Evaluation
58
+ ### Speech Synthesis
59
+ ![Speech Synthesis](./imgs/Speech_Generation.png)
60
+ ### Text to Music Generation
61
+ ![Text to Music Generation](./imgs/T2M.png)
62
+ ### Video-Text to Music Generation
63
+ ![Video-Text to Music Generation](./imgs/VT2M.png)
64
+
65
+ ## Requirements
66
+ We recommend using conda to install the environment.
67
+ ```bash
68
+ conda env create -f configs/enviroment.yml  # add -n <env-name> to choose a custom environment name
69
+ conda activate unimoe-audio # default name
70
+ ```
71
+ Then install the PyTorch packages:
72
+ ```bash
73
+ # Use the official index
74
+ pip install torch==2.1.1 torchvision==0.16.1 torchaudio==2.1.1 --index-url https://download.pytorch.org/whl/cu121
75
+
76
+ # Use Tsinghua mirror source
77
+ pip install torch==2.1.1 torchvision==0.16.1 torchaudio==2.1.1 -i https://pypi.tuna.tsinghua.edu.cn/simple/ --extra-index-url https://download.pytorch.org/whl/cu121
78
+
79
+ # Use Alibaba Cloud mirror source
80
+ pip install torch==2.1.1 torchvision==0.16.1 torchaudio==2.1.1 -i https://mirrors.aliyun.com/pypi/simple/ --extra-index-url https://download.pytorch.org/whl/cu121
81
+ ```
82
+ A DAC model is also required under `/path/to/UniMoE-Audio/utils/dac_model`.
83
+ It will be downloaded automatically on the first run.
84
+
85
+
86
+ ## Usage
87
+ Please move the `utils` folder into your working directory.
88
+ Then you can use the model like this:
89
+
90
+ ```python
91
+ from modeling import UniMoEAudio
92
+
93
+ MODEL_NAME = "HIT-TMG/UniMoE-Audio-Preview"
94
+
95
+ # Load model
96
+ unimoe_audio = UniMoEAudio.from_pretrained(
97
+ MODEL_NAME,
98
+ cache_dir='./cache',
99
+ torch_dtype='bfloat16',
100
+ device_id=0
101
+ )
102
+
103
+ ```
104
+
105
+ ### TTS Example:
106
+ ```python
107
+ # TTS/Voice Cloning
108
+ target_text = "Target Text"
109
+ prompt_audio = "/path/to/your/prompt_audio.wav"
110
+ prompt_text = "Prompt Text"
111
+
112
+ # Encode prompt audio
113
+ prompt_codec = unimoe_audio.dac.encode(prompt_audio)
114
+
115
+ prompt_codec_input_ids = unimoe_audio._preprocess_codec(
116
+ codec=prompt_codec,
117
+ codec_delay_pattern=unimoe_audio.model.config.codec_delay_pattern,
118
+ codec_channels=unimoe_audio.model.num_channels,
119
+ codec_bos_value=unimoe_audio.model.config.codec_bos_value,
120
+ codec_eos_value=unimoe_audio.model.config.codec_eos_value,
121
+ codec_pad_value=unimoe_audio.model.config.codec_pad_value
122
+ )
123
+
124
+ # Construct prompt text
125
+ text_input, _, _ = unimoe_audio._prepare_prompt(task="speech", caption=target_text, prompt_text=prompt_text, prompt_codec_input_ids=prompt_codec_input_ids)
126
+
127
+ # Tokenize input text
128
+ source_input = unimoe_audio.tokenizer(text_input, add_special_tokens=False, return_tensors="pt", padding=True)
129
+ prompt_codec_input_ids = prompt_codec_input_ids.unsqueeze(0).expand(len(text_input), -1, -1).reshape(-1, prompt_codec_input_ids.shape[1])
130
+
131
+ # Speech Generation
132
+ unimoe_audio._generate_core(
133
+ source_input,
134
+ prompt_codec_input_ids,
135
+ save_name = "speech",
136
+ output_dir = "./",
137
+ cfg_scale = 1.0,
138
+ temperature = 1.0,
139
+ top_p = 1.0,
140
+ cfg_filter_top_k = 45,
141
+ eos_prob_mul_factor = 1.0,
142
+ do_sample = True,
143
+ debug_guidance_step = -1,
144
+ use_cache = True
145
+ )
146
+ ```
147
+ ### T2M Example:
148
+ ```python
149
+ caption = "music deccription"
150
+
151
+ # Construct prompt text
152
+ text_input, _, _ = unimoe_audio._prepare_prompt(task="music", caption=caption)
153
+
154
+ # Tokenize input text
155
+ source_input = unimoe_audio.tokenizer(text_input, add_special_tokens=False, return_tensors="pt", padding=True)
156
+
157
+ # Music generation with prompt text
158
+ unimoe_audio._generate_core(
159
+ source_input,
160
+ None,
161
+ save_name = "music",
162
+ output_dir = "./",
163
+ cfg_scale = 10.0,
164
+ temperature = 1.0,
165
+ top_p = 1.0,
166
+ cfg_filter_top_k = 45,
167
+ eos_prob_mul_factor = 0.6,
168
+ do_sample = True,
169
+ debug_guidance_step = -1,
170
+ use_cache = True
171
+ )
172
+ ```
173
+
174
+ ### VT2M Example:
175
+ ```python
176
+ # VT2M
177
+ caption = "music deccription"
178
+ prompt_video = "/path/to/your/video.mp4"
179
+
180
+ # Prepare prompt
181
+ text_input, video_inputs, fps_inputs = unimoe_audio._prepare_prompt(task="music", caption=caption, video=prompt_video, fps=1, sampling_fps=1, max_frames=1)
182
+
183
+ # Input processor
184
+ source_input = unimoe_audio.processor(
185
+ text=text_input,
186
+ images=None,
187
+ videos=video_inputs,
188
+ fps=fps_inputs,
189
+ padding=True,
190
+ return_tensors="pt",
191
+ do_resize=False
192
+ )
193
+
194
+ # Music generation with prompt video
195
+ unimoe_audio._generate_core(
196
+ source_input,
197
+ None,
198
+ save_name = "video_music",
199
+ output_dir = "./",
200
+ rebuild_codec=None,
201
+ cfg_scale = 10.0,
202
+ temperature = 1.0,
203
+ top_p = 1.0,
204
+ cfg_filter_top_k = 45,
205
+ eos_prob_mul_factor = 0.6,
206
+ do_sample = True,
207
+ debug_guidance_step = -1,
208
+ use_cache = True
209
+ )
210
+ ```
211
+
212
+
213
+
214
+
215
+
216
+
config.json CHANGED
@@ -2,6 +2,10 @@
   "architectures": [
     "UniAudioRVQQwen2_5VLMoEForConditionalGeneration"
   ],
+  "auto_map": {
+    "AutoConfig": "modeling.UniMoEAudioConfig",
+    "AutoModelForCausalLM": "modeling.UniMoEAudio"
+  },
   "attention_dropout": 0.0,
   "bos_token_id": 151643,
   "codec_bos_value": 1026,
deepspeed_utils.py ADDED
@@ -0,0 +1,124 @@
1
+ from typing import TYPE_CHECKING, Any, Callable, Dict, Optional, Tuple, Union
2
+
3
+ import deepspeed
4
+ import torch
5
+ import torch.nn.functional as F
6
+ from deepspeed import comm as dist
7
+ from deepspeed.moe.sharded_moe import _capacity, _one_hot_to_float, einsum, gumbel_rsample
8
+ from torch import Tensor
9
+
10
+ try:
11
+ # To enable Tutel MoE optimizations:
12
+ # python3 -m pip install --user --upgrade git+https://github.com/microsoft/[email protected]
13
+ from tutel import moe as tutel_moe
14
+
15
+ TUTEL_INSTALLED = True
16
+ except Exception:
17
+ # Fail silently so we don't spam logs unnecessarily if user isn't using tutel
18
+ TUTEL_INSTALLED = False
19
+ pass
20
+
21
+
22
+ # =============================================================================
23
+ # DeepSpeed MoE Inference Utilities
24
+ # =============================================================================
25
+
26
+ def _AllToAll_forward(ctx: Any, group: dist.ProcessGroup, input: Tensor) -> Tensor: # type: ignore
27
+ ctx.group = group
28
+ input = input.contiguous()
29
+ return input
30
+
31
+
32
+ def gate_forward(self, *input: Tensor, **kwargs: Any) -> Tensor:
33
+ d_model = input[0].shape[-1]
34
+ reshaped_input = input[0].reshape(-1, d_model)
35
+
36
+ if self.use_tutel:
37
+ self.l_aux, C, E, indices_, locations_, gates_, self.exp_counts = self.gate(reshaped_input, input[1], True)
38
+ S, M = reshaped_input.size(0), reshaped_input.size(1)
39
+
40
+ if not hasattr(self, "_tutel_dispatcher"):
41
+ self._tutel_dispatcher = tutel_moe.fast_dispatcher(E, C, M, dispatch_dtype=reshaped_input.dtype)
42
+ self._tutel_dispatcher.update(indices_, locations_, gates_, capacity=C)
43
+ dispatched_input = self._tutel_dispatcher.encode(reshaped_input)
44
+ else:
45
+ self.l_aux, combine_weights, dispatch_mask, self.exp_counts = self.gate(reshaped_input, input[1])
46
+ dispatched_input = einsum("sec,sm->ecm", dispatch_mask.type_as(input[0]), reshaped_input)
47
+
48
+ dispatched_input = dispatched_input.reshape(self.ep_size, self.num_local_experts, -1, d_model)
49
+ expert_output = self.experts(dispatched_input)
50
+ expert_output = expert_output.reshape(self.ep_size * self.num_local_experts, dispatched_input.shape[2], -1)
51
+
52
+ if self.use_tutel:
53
+ combined_output = self._tutel_dispatcher.decode(expert_output.view(E * C, M))
54
+ else:
55
+ combined_output = einsum("sec,ecm->sm", combine_weights.type_as(input[0]), expert_output)
56
+
57
+ a = combined_output.reshape(input[0].size()[:-1] + (-1,))
58
+
59
+ return a
60
+
61
+
62
+ def top2gating(
63
+ logits: Tensor, capacity_factor: float, min_capacity: int, drop_tokens: bool = True, ep_group: Union[torch.distributed.ProcessGroup, None] = None, top2_2nd_expert_sampling: bool = True
64
+ ) -> Tuple[Tensor, Tensor, Tensor, Tensor]:
65
+ """Implements Top2Gating on logits."""
66
+ gates = F.softmax(logits, dim=1)
67
+ indices1_s = torch.argmax(gates, dim=1)
68
+ num_experts = int(gates.shape[1])
69
+ mask1 = F.one_hot(indices1_s, num_classes=num_experts)
70
+
71
+ if top2_2nd_expert_sampling:
72
+ logits += gumbel_rsample(logits.shape, device=logits.device)
73
+
74
+ logits_except1 = logits.masked_fill(mask1.bool(), float("-inf"))
75
+ indices2_s = torch.argmax(logits_except1, dim=1)
76
+ mask2 = F.one_hot(indices2_s, num_classes=num_experts)
77
+
78
+ locations1 = torch.cumsum(mask1, dim=0) - 1
79
+ locations2 = torch.cumsum(mask2, dim=0) - 1
80
+ locations2 += torch.sum(mask1, dim=0, keepdim=True)
81
+
82
+ me = torch.mean(gates, dim=0)
83
+ ce = torch.mean(mask1.float(), dim=0)
84
+ l_aux = torch.mean(me * ce) * num_experts * num_experts
85
+ exp_counts = torch.sum(mask1 + mask2, dim=0).detach().to(logits.device)
86
+
87
+ if drop_tokens:
88
+ capacity = _capacity(gates, torch.tensor(capacity_factor * 2), torch.tensor(min_capacity))
89
+ mask1 *= torch.lt(locations1, capacity)
90
+ mask2 *= torch.lt(locations2, capacity)
91
+ else:
92
+ new_capacity = torch.max(exp_counts)
93
+ capacity = new_capacity
94
+
95
+ locations1_s = torch.sum(locations1 * mask1, dim=1)
96
+ locations2_s = torch.sum(locations2 * mask2, dim=1)
97
+ mask1_float = mask1.float()
98
+ mask2_float = mask2.float()
99
+
100
+ gates1_s = einsum("se,se->s", gates, mask1_float)
101
+ gates2_s = einsum("se,se->s", gates, mask2_float)
102
+ denom_s = gates1_s + gates2_s
103
+
104
+ denom_s = torch.clamp(denom_s, min=torch.finfo(denom_s.dtype).eps)
105
+ gates1_s /= denom_s
106
+ gates2_s /= denom_s
107
+
108
+ gates1 = einsum("s,se->se", gates1_s, mask1_float)
109
+ gates2 = einsum("s,se->se", gates2_s, mask2_float)
110
+ locations1_sc = _one_hot_to_float(locations1_s, capacity)
111
+ locations2_sc = _one_hot_to_float(locations2_s, capacity)
112
+ combine1_sec = einsum("se,sc->sec", gates1, locations1_sc)
113
+ combine2_sec = einsum("se,sc->sec", gates2, locations2_sc)
114
+ combine_weights = combine1_sec + combine2_sec
115
+ dispatch_mask = combine_weights.bool()
116
+
117
+ return l_aux, combine_weights, dispatch_mask, exp_counts
118
+
119
+
120
+ # Apply the modifications to deepspeed
121
+ deepspeed.moe.sharded_moe.MOELayer.forward = gate_forward
122
+ deepspeed.moe.sharded_moe.top2gating = top2gating
123
+ deepspeed.moe.sharded_moe._AllToAll.forward = _AllToAll_forward
124
+
model-00001-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:d853dc5fdece11379a9ef43710c18f6f7fd55aaa7cf6257c183738edb6882100
- size 4999916992
+ oid sha256:254260c822c07d95dcd11f897c656eda8d08e5849832d4fd4f67c074c449b2fb
+ size 4999916960
modeling.py ADDED
@@ -0,0 +1,1182 @@
1
+ # coding=utf-8
2
+ # Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
5
+ # and OPT implementations in this library. It has been modified from its
6
+ # original forms to accommodate minor architectural differences compared
7
+ # to GPT-NeoX and OPT used by the Meta AI team that trained the model.
8
+ #
9
+ # Licensed under the Apache License, Version 2.0 (the "License");
10
+ # you may not use this file except in compliance with the License.
11
+ # You may obtain a copy of the License at
12
+ #
13
+ # http://www.apache.org/licenses/LICENSE-2.0
14
+ #
15
+ # Unless required by applicable law or agreed to in writing, software
16
+ # distributed under the License is distributed on an "AS IS" BASIS,
17
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18
+ # See the License for the specific language governing permissions and
19
+ # limitations under the License.
20
+ """PyTorch Qwen2-VL model."""
21
+
22
+ from dataclasses import dataclass
23
+ from typing import Any, Dict, List, Optional, Tuple, Union
24
+
25
+ import torch
26
+ import torch.nn as nn
27
+ import torch.nn.functional as F
28
+ from torch.nn import CrossEntropyLoss
29
+
30
+ from transformers.activations import ACT2FN
31
+ from transformers.cache_utils import Cache, DynamicCache
32
+ from transformers.generation import GenerationMixin
33
+ from transformers.masking_utils import create_causal_mask, create_sliding_window_causal_mask
34
+ from transformers.modeling_flash_attention_utils import FlashAttentionKwargs
35
+ from transformers.modeling_layers import GradientCheckpointingLayer
36
+ from transformers.modeling_outputs import BaseModelOutputWithPast, ModelOutput
37
+ from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
38
+ from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
39
+ from transformers.processing_utils import Unpack
40
+ from transformers.utils import auto_docstring, can_return_tuple, is_torchdynamo_compiling, logging
41
+ from transformers.configuration_utils import PretrainedConfig, layer_type_validation
42
+
43
+ from transformers import AutoConfig, AutoModelForCausalLM
44
+ from transformers.modeling_outputs import (
45
+ ModelOutput,
46
+ )
47
+ from transformers.models.qwen2_5_vl.configuration_qwen2_5_vl import (
48
+ Qwen2_5_VLVisionConfig,
49
+ Qwen2_5_VLTextConfig,
50
+ Qwen2_5_VLConfig,
51
+ )
52
+ from transformers.models.qwen2_5_vl.modeling_qwen2_5_vl import (
53
+ Qwen2_5_VLAttention,
54
+ Qwen2RMSNorm,
55
+ Qwen2_5_VLRotaryEmbedding,
56
+ )
57
+ from DCMoE import UniMoEAudioSparseMoeBlock
58
+ from transformers.models.qwen2_5_vl.modeling_qwen2_5_vl import Qwen2_5_VisionTransformerPretrainedModel
59
+
60
+ logger = logging.get_logger(__name__)
61
+
62
+ FAST_INIT = True
63
+ if FAST_INIT:
64
+ logger.warning(f"using FAST initial for Grin Qwen2_vl !!!")
65
+
66
+ class Qwen2_5_VLMoETextConfig(Qwen2_5_VLTextConfig):
67
+ model_type = "qwen2_5_vl_moe_text"
68
+
69
+ def __init__(
70
+ self,
71
+ mlp_dynamic_expert_num=4,
72
+ mlp_dynamic_null_expert_num=0,
73
+ mlp_dynamic_top_p=0.7,
74
+ mlp_dynamic_top_k=2,
75
+ mlp_fixed_expert_num=2,
76
+ dynamic_intermediate_size=8960,
77
+ shared_intermediate_size=8960,
78
+ ignore_differentiable_router=False,
79
+ enable_expert_tensor_parallelism: bool = False,
80
+ ep_size=1,
81
+ fixed_ep_size=1,
82
+ router_jitter_noise=0.01,
83
+ input_jitter_noise=0.01,
84
+ token_drop=False,
85
+ drop_policy: str = "probs",
86
+ min_capacity: int = 8,
87
+ capacity_factor: float = 1.0,
88
+ fp32_gate=True,
89
+ avg_hidden_states_last=False,
90
+ drop_token_num_print=True,
91
+ **kwargs,
92
+ ):
93
+
94
+ super().__init__(**kwargs)
95
+ self.mlp_dynamic_expert_num = mlp_dynamic_expert_num
96
+ self.mlp_dynamic_top_p = mlp_dynamic_top_p
97
+ self.mlp_dynamic_top_k = mlp_dynamic_top_k
98
+ self.mlp_fixed_expert_num = mlp_fixed_expert_num
99
+ self.mlp_dynamic_null_expert_num = mlp_dynamic_null_expert_num
100
+ self.dynamic_intermediate_size = dynamic_intermediate_size
101
+ self.shared_intermediate_size = shared_intermediate_size
102
+ self.ignore_differentiable_router = ignore_differentiable_router
103
+ self.enable_expert_tensor_parallelism = enable_expert_tensor_parallelism
104
+ self.ep_size = ep_size
105
+ self.fixed_ep_size = fixed_ep_size
106
+ self.input_jitter_noise = input_jitter_noise
107
+ self.router_jitter_noise = router_jitter_noise
108
+ self.token_drop = token_drop
109
+ self.drop_policy = drop_policy
110
+ self.min_capacity = min_capacity
111
+ self.capacity_factor = capacity_factor
112
+ self.fp32_gate = fp32_gate
113
+ self.avg_hidden_states_last = avg_hidden_states_last
114
+ self.drop_token_num_print = drop_token_num_print
115
+
116
+ class UniMoEAudioConfig(PretrainedConfig):
117
+ model_type = "uni_audio_rvq_qwen2_5vl_moe"
118
+ sub_configs = {"vision_config": Qwen2_5_VLVisionConfig, "text_config": Qwen2_5_VLMoETextConfig}
119
+ keys_to_ignore_at_inference = ["past_key_values"]
120
+
121
+ def __init__(
122
+ self,
123
+ text_config=None,
124
+ vision_config=None,
125
+ image_token_id=151655,
126
+ video_token_id=151656,
127
+ codec_vocab_size=1028,
128
+ codec_delay_pattern=[0, 8, 9, 10, 11, 12, 13, 14, 15],
129
+ codec_channels=9,
130
+ codec_eos_value=1024,
131
+ codec_pad_value=1025,
132
+ codec_bos_value=1026,
133
+ codec_placeholder_value=None,
134
+ **kwargs,
135
+ ):
136
+ if isinstance(vision_config, dict):
137
+ self.vision_config = self.sub_configs["vision_config"](**vision_config)
138
+ elif vision_config is None:
139
+ self.vision_config = self.sub_configs["vision_config"]()
140
+
141
+ if isinstance(text_config, dict):
142
+ self.text_config = self.sub_configs["text_config"](**text_config)
143
+ elif text_config is None:
144
+ self.text_config = self.sub_configs["text_config"](**kwargs)
145
+
146
+ self.image_token_id = image_token_id
147
+ self.video_token_id = video_token_id
148
+ self.codec_vocab_size = codec_vocab_size
149
+ self.codec_delay_pattern = codec_delay_pattern
150
+ self.codec_channels = codec_channels
151
+ self.codec_eos_value = codec_eos_value
152
+ self.codec_pad_value = codec_pad_value
153
+ self.codec_bos_value = codec_bos_value
154
+ self.codec_placeholder_value = codec_placeholder_value
155
+
156
+ super().__init__(**kwargs)
157
+
158
+ @dataclass
159
+ class MoEQwen2_5VLCausalLMOutputWithPast(ModelOutput):
160
+ loss: Optional[torch.FloatTensor] = None
161
+ logits: torch.FloatTensor = None
162
+ past_key_values: Optional[List[torch.FloatTensor]] = None
163
+ hidden_states: Optional[Tuple[torch.FloatTensor]] = None
164
+ attentions: Optional[Tuple[torch.FloatTensor]] = None
165
+ rope_deltas: Optional[torch.LongTensor] = None
166
+ all_router_logits: Tuple = None
167
+ all_router_top_k: Tuple = None
168
+ all_router_expert_mask: Tuple = None
169
+ all_router_weight: Tuple = None
170
+ aux_balance_loss: torch.FloatTensor = None
171
+
172
+
173
+ @dataclass
174
+ class BaseModelOutputWithPast(ModelOutput):
175
+ last_hidden_state: torch.FloatTensor = None
176
+ past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
177
+ hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
178
+ attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
179
+ all_router_logits: Tuple = None
180
+ all_router_top_k: Tuple = None
181
+ all_router_weight: Tuple = None
182
+ all_router_expert_mask: Tuple = None
183
+ all_aux_loss: Tuple = None
184
+
185
+
186
+ class Qwen2_5_VLMoEDecoderLayer(GradientCheckpointingLayer):
187
+ def __init__(self, config: Qwen2_5_VLMoETextConfig, layer_idx: int):
188
+ super().__init__()
189
+ self.hidden_size = config.hidden_size
190
+
191
+ if config.use_sliding_window and config._attn_implementation != "flash_attention_2":
192
+ logger.warning_once(
193
+ f"Sliding Window Attention is enabled but not implemented for `{config._attn_implementation}`; "
194
+ "unexpected results may be encountered."
195
+ )
196
+
197
+ self.self_attn = Qwen2_5_VLAttention(config, layer_idx)
198
+ self.mlp = UniMoEAudioSparseMoeBlock(config)
199
+ self.input_layernorm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
200
+ self.post_attention_layernorm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
201
+ self.attention_type = config.layer_types[layer_idx]
202
+
203
+ def forward(
204
+ self,
205
+ hidden_states: torch.Tensor,
206
+ attention_mask: Optional[torch.Tensor] = None,
207
+ padding_token_mask: Optional[torch.Tensor] = None,
208
+ position_ids: Optional[torch.LongTensor] = None,
209
+ past_key_value: Optional[tuple[torch.Tensor]] = None,
210
+ output_attentions: Optional[bool] = False,
211
+ output_router_logits_and_topk: Optional[bool] = False,
212
+ use_cache: Optional[bool] = False,
213
+ cache_position: Optional[torch.LongTensor] = None,
214
+ position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
215
+ **kwargs: Unpack[FlashAttentionKwargs],
216
+ ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]:
217
+
218
+ residual = hidden_states
219
+ hidden_states = self.input_layernorm(hidden_states)
220
+ hidden_states, self_attn_weights, present_key_value = self.self_attn(
221
+ hidden_states=hidden_states,
222
+ attention_mask=attention_mask,
223
+ position_ids=position_ids,
224
+ past_key_value=past_key_value,
225
+ output_attentions=output_attentions,
226
+ use_cache=use_cache,
227
+ cache_position=cache_position,
228
+ position_embeddings=position_embeddings,
229
+ )
230
+ hidden_states = residual + hidden_states
231
+ residual = hidden_states
232
+ hidden_states = self.post_attention_layernorm(hidden_states)
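+ # Descriptive comment: the MoE block below replaces the dense MLP; besides the hidden
+ # states it returns router logits, per-token expert counts, the expert mask, the global
+ # routing weights and the auxiliary balance loss, which are exposed when
+ # output_router_logits_and_topk is set.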
233
+ hidden_states, router_logits, router_top_k, router_expert_mask, router_weight, aux_loss = self.mlp(hidden_states, padding_token_mask)
234
+ hidden_states = residual + hidden_states
235
+
236
+ outputs = (hidden_states,)
237
+
238
+ if output_attentions:
239
+ outputs += (self_attn_weights,)
240
+
241
+ if output_router_logits_and_topk:
242
+ outputs += (router_logits,)
243
+ outputs += (router_top_k,)
244
+ outputs += (router_expert_mask,)
245
+ outputs += (router_weight,)
246
+ outputs += (aux_loss,)
247
+
248
+ return outputs
249
+
250
+
251
+ class Qwen2_5_VLMoEPreTrainedModel(PreTrainedModel):
252
+ config_class = UniMoEAudioConfig
253
+ base_model_prefix = "model"
254
+ supports_gradient_checkpointing = True
255
+ _no_split_modules = ["Qwen2_5_VLMoEDecoderLayer", "Qwen2_5_VLVisionBlock"]
256
+ _skip_keys_device_placement = "past_key_values"
257
+ _supports_flash_attn_2 = True
258
+ _supports_flash_attn_3 = True
259
+ _supports_sdpa = True
260
+ _supports_cache_class = True
261
+ _supports_static_cache = True
262
+ _supports_attention_backend = True
263
+
264
+ def _init_weights(self, module):
265
+ std = self.config.initializer_range
266
+ if FAST_INIT:
267
+ if isinstance(module, UniMoEAudioSparseMoeBlock):
268
+ module.gate.weight.data.normal_(mean=0.0, std=std)
269
+ if module.gate.bias is not None:
270
+ module.gate.bias.data.zero_()
271
+ elif isinstance(module, nn.Embedding):
272
+ module.weight.data.normal_(mean=0.0, std=std)
273
+ if module.padding_idx is not None:
274
+ module.weight.data[module.padding_idx].zero_()
275
+ else:
276
+ if isinstance(module, (nn.Linear, nn.Conv3d)):
277
+ module.weight.data.normal_(mean=0.0, std=std)
278
+ if module.bias is not None:
279
+ module.bias.data.zero_()
280
+ elif isinstance(module, nn.Embedding):
281
+ module.weight.data.normal_(mean=0.0, std=std)
282
+ if module.padding_idx is not None:
283
+ module.weight.data[module.padding_idx].zero_()
284
+ elif isinstance(module, Qwen2RMSNorm):
285
+ module.weight.data.fill_(1.0)
286
+
287
+
288
+ class Qwen2_5_VLMoETextModel(Qwen2_5_VLMoEPreTrainedModel):
289
+ config_class = Qwen2_5_VLMoETextConfig
290
+ def __init__(self, config: Qwen2_5_VLMoETextConfig):
291
+ super().__init__(config)
292
+ self.padding_idx = config.pad_token_id
293
+ self.vocab_size = config.vocab_size
294
+ self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
295
+ self.layers = nn.ModuleList(
296
+ [Qwen2_5_VLMoEDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
297
+ )
298
+ self._attn_implementation = config._attn_implementation
299
+ self.norm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
300
+ self.rotary_emb = Qwen2_5_VLRotaryEmbedding(config=config)
301
+ self.has_sliding_layers = "sliding_attention" in self.config.layer_types
302
+ self.gradient_checkpointing = False
303
+ self.post_init()
304
+
305
+ def get_input_embeddings(self):
306
+ return self.embed_tokens
307
+
308
+ def set_input_embeddings(self, value):
309
+ self.embed_tokens = value
310
+
311
+ def forward(
312
+ self,
313
+ input_ids: Optional[torch.LongTensor] = None,
314
+ attention_mask: Optional[torch.Tensor] = None,
315
+ padding_token_mask: Optional[torch.Tensor] = None,
316
+ position_ids: Optional[torch.LongTensor] = None,
317
+ past_key_values: Optional[Cache] = None,
318
+ inputs_embeds: Optional[torch.FloatTensor] = None,
319
+ use_cache: Optional[bool] = None,
320
+ output_attentions: Optional[bool] = None,
321
+ output_hidden_states: Optional[bool] = None,
322
+ output_router_logits_and_topk: Optional[bool] = None,
323
+ return_dict: Optional[bool] = None,
324
+ cache_position: Optional[torch.LongTensor] = None,
325
+ **kwargs: Unpack[FlashAttentionKwargs],
326
+ ) -> Union[Tuple, BaseModelOutputWithPast]:
327
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
328
+ output_hidden_states = (
329
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
330
+ )
331
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
332
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
333
+
334
+ if (input_ids is None) ^ (inputs_embeds is not None):
335
+ raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
336
+
337
+ if self.gradient_checkpointing and self.training:
338
+ if use_cache:
339
+ logger.warning_once(
340
+ "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
341
+ )
342
+ use_cache = False
343
+
344
+ if use_cache and past_key_values is None and not torch.jit.is_tracing():
345
+ past_key_values = DynamicCache()
346
+
347
+ if inputs_embeds is None:
348
+ inputs_embeds = self.embed_tokens(input_ids)
349
+
350
+ if cache_position is None:
351
+ past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
352
+ cache_position = torch.arange(
353
+ past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
354
+ )
355
+
356
+ if position_ids is None:
357
+ position_ids = cache_position.view(1, 1, -1).expand(3, inputs_embeds.shape[0], -1)
358
+ elif position_ids.dim() == 2:
359
+ position_ids = position_ids[None, ...].expand(3, position_ids.shape[0], -1)
360
+
361
+ if not isinstance(causal_mask_mapping := attention_mask, dict):
362
+ mask_kwargs = {
363
+ "config": self.config,
364
+ "input_embeds": inputs_embeds,
365
+ "attention_mask": attention_mask,
366
+ "cache_position": cache_position,
367
+ "past_key_values": past_key_values,
368
+ "position_ids": position_ids,
369
+ }
370
+ causal_mask_mapping = {
371
+ "full_attention": create_causal_mask(**mask_kwargs),
372
+ }
373
+ if self.has_sliding_layers:
374
+ causal_mask_mapping["sliding_attention"] = create_sliding_window_causal_mask(**mask_kwargs)
375
+
376
+ hidden_states = inputs_embeds
377
+ position_embeddings = self.rotary_emb(hidden_states, position_ids)
378
+
379
+ all_hidden_states = () if output_hidden_states else None
380
+ all_self_attns = () if output_attentions else None
381
+ all_router_logits = () if output_router_logits_and_topk else None
382
+ all_router_top_k = () if output_router_logits_and_topk else None
383
+ all_router_expert_mask = ()
384
+ all_router_weight = ()
385
+ all_aux_loss = ()
386
+ next_decoder_cache = None
387
+
388
+ for decoder_layer in self.layers:
389
+ if output_hidden_states:
390
+ all_hidden_states += (hidden_states,)
391
+
392
+ layer_outputs = decoder_layer(
393
+ hidden_states,
394
+ attention_mask=causal_mask_mapping[decoder_layer.attention_type],
395
+ padding_token_mask=padding_token_mask,
396
+ position_ids=position_ids,
397
+ past_key_value=past_key_values,
398
+ output_attentions=output_attentions,
399
+ output_router_logits_and_topk=output_router_logits_and_topk,
400
+ use_cache=use_cache,
401
+ cache_position=cache_position,
402
+ position_embeddings=position_embeddings,
403
+ **kwargs,
404
+ )
405
+
406
+ hidden_states = layer_outputs[0]
407
+
408
+ if output_attentions:
409
+ all_self_attns += (layer_outputs[1],)
410
+
411
+ if output_router_logits_and_topk:
412
+ all_router_logits += (layer_outputs[-5],)
413
+ all_router_top_k += (layer_outputs[-4],)
414
+ all_router_expert_mask += (layer_outputs[-3],)
415
+ all_router_weight += (layer_outputs[-2],)
416
+ all_aux_loss += (layer_outputs[-1],)
417
+
418
+ hidden_states = self.norm(hidden_states)
419
+
420
+ if output_hidden_states:
421
+ all_hidden_states += (hidden_states,)
422
+
423
+ if not return_dict:
424
+ return tuple(
425
+ v for v in [
426
+ hidden_states,
427
+ past_key_values,
428
+ all_hidden_states,
429
+ all_self_attns,
430
+ all_router_logits,
431
+ all_router_top_k,
432
+ all_router_expert_mask,
433
+ all_router_weight,
434
+ all_aux_loss]
435
+ if v is not None
436
+ )
437
+ return BaseModelOutputWithPast(
438
+ last_hidden_state=hidden_states,
439
+ past_key_values=past_key_values,
440
+ hidden_states=all_hidden_states,
441
+ attentions=all_self_attns,
442
+ all_router_logits=all_router_logits,
443
+ all_router_top_k=all_router_top_k,
444
+ all_router_expert_mask=all_router_expert_mask,
445
+ all_router_weight=all_router_weight,
446
+ all_aux_loss=all_aux_loss,
447
+ )
448
+
449
+
450
+ class UniMoEAudio(Qwen2_5_VLMoEPreTrainedModel):
451
+ base_model_prefix = ""
452
+ _no_split_modules = ["Qwen2_5_VLDecoderLayer", "Qwen2_5_VLVisionBlock"]
453
+ config_class = UniMoEAudioConfig
454
+ _checkpoint_conversion_mapping = {
455
+ "^visual": "visual",
456
+ r"^model(?!\.(language_model|visual))": "language_model",
457
+ }
458
+ _tied_weights_keys = ["lm_head.weight"]
459
+
460
+ def __init__(self, config):
461
+ super().__init__(config)
462
+ self.visual = Qwen2_5_VisionTransformerPretrainedModel._from_config(config.vision_config, attn_implementation=config._attn_implementation)
463
+ self.language_model = Qwen2_5_VLMoETextModel._from_config(config.text_config)
464
+ self.rope_deltas = None
465
+ self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False)
466
+ self.num_channels = config.codec_channels
467
+ self.codec_vocab_size = config.codec_vocab_size
468
+ self.codec_embed_tokens = nn.ModuleList(
469
+ [nn.Embedding(self.codec_vocab_size, config.text_config.hidden_size) for embed_idx in range(self.num_channels)])
470
+ self.codec_placeholder_value = config.codec_placeholder_value
471
+ self.codec_head = nn.Linear(config.text_config.hidden_size, self.num_channels * self.codec_vocab_size, bias=False)
472
+ self.post_init()
473
+
474
+ @property
475
+ def cur_aux_weight(self):
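+ # linearly anneal the auxiliary-loss weight from l_aux_weight down to min_l_aux_weight over l_aux_weight_decay_steps training steps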
476
+ if self.training_steps >= self.l_aux_weight_decay_steps:
477
+ return self.min_l_aux_weight
478
+ return self.l_aux_weight - (self.l_aux_weight - self.min_l_aux_weight) / self.l_aux_weight_decay_steps * self.training_steps
479
+
480
+ def get_input_embeddings(self):
481
+ return self.language_model.get_input_embeddings()
482
+
483
+ def set_input_embeddings(self, value):
484
+ self.language_model.set_input_embeddings(value)
485
+
486
+ def get_output_embeddings(self):
487
+ return self.lm_head
488
+
489
+ def set_output_embeddings(self, new_embeddings):
490
+ self.lm_head = new_embeddings
491
+
492
+ def set_decoder(self, decoder):
493
+ self.language_model = decoder
494
+
495
+ def get_decoder(self):
496
+ return self.language_model
497
+
498
+ def get_rope_index(
499
+ self,
500
+ input_ids: Optional[torch.LongTensor] = None,
501
+ image_grid_thw: Optional[torch.LongTensor] = None,
502
+ video_grid_thw: Optional[torch.LongTensor] = None,
503
+ second_per_grid_ts: Optional[torch.Tensor] = None,
504
+ attention_mask: Optional[torch.Tensor] = None,
505
+ ) -> tuple[torch.Tensor, torch.Tensor]:
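+ # compute 3D M-RoPE position ids: text tokens advance the temporal/height/width axes together, while image and video patches get separate t/h/w indices derived from their grid sizes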
506
+ spatial_merge_size = self.config.vision_config.spatial_merge_size
507
+ image_token_id = self.config.image_token_id
508
+ video_token_id = self.config.video_token_id
509
+ vision_start_token_id = self.config.vision_start_token_id
510
+ mrope_position_deltas = []
511
+ if input_ids is not None and (image_grid_thw is not None or video_grid_thw is not None):
512
+ total_input_ids = input_ids
513
+ if attention_mask is None:
514
+ attention_mask = torch.ones_like(total_input_ids)
515
+ position_ids = torch.ones(
516
+ 3,
517
+ input_ids.shape[0],
518
+ input_ids.shape[1],
519
+ dtype=input_ids.dtype,
520
+ device=input_ids.device,
521
+ )
522
+ image_index, video_index = 0, 0
523
+ attention_mask = attention_mask.to(total_input_ids.device)
524
+ for i, input_ids in enumerate(total_input_ids):
525
+ input_ids = input_ids[attention_mask[i] == 1]
526
+ image_nums, video_nums = 0, 0
527
+ vision_start_indices = torch.argwhere(input_ids == vision_start_token_id).squeeze(1)
528
+ vision_tokens = input_ids[vision_start_indices + 1]
529
+ image_nums = (vision_tokens == image_token_id).sum()
530
+ video_nums = (vision_tokens == video_token_id).sum()
531
+ input_tokens = input_ids.tolist()
532
+ llm_pos_ids_list: list = []
533
+ st = 0
534
+ remain_images, remain_videos = image_nums, video_nums
535
+ for _ in range(image_nums + video_nums):
536
+ if image_token_id in input_tokens and remain_images > 0:
537
+ ed_image = input_tokens.index(image_token_id, st)
538
+ else:
539
+ ed_image = len(input_tokens) + 1
540
+ if video_token_id in input_tokens and remain_videos > 0:
541
+ ed_video = input_tokens.index(video_token_id, st)
542
+ else:
543
+ ed_video = len(input_tokens) + 1
544
+ if ed_image < ed_video:
545
+ t, h, w = (
546
+ image_grid_thw[image_index][0],
547
+ image_grid_thw[image_index][1],
548
+ image_grid_thw[image_index][2],
549
+ )
550
+ second_per_grid_t = 0
551
+ image_index += 1
552
+ remain_images -= 1
553
+ ed = ed_image
554
+
555
+ else:
556
+ t, h, w = (
557
+ video_grid_thw[video_index][0],
558
+ video_grid_thw[video_index][1],
559
+ video_grid_thw[video_index][2],
560
+ )
561
+ if second_per_grid_ts is not None:
562
+ second_per_grid_t = second_per_grid_ts[video_index]
563
+ else:
564
+ second_per_grid_t = 1.0
565
+ video_index += 1
566
+ remain_videos -= 1
567
+ ed = ed_video
568
+ llm_grid_t, llm_grid_h, llm_grid_w = (
569
+ t.item(),
570
+ h.item() // spatial_merge_size,
571
+ w.item() // spatial_merge_size,
572
+ )
573
+ text_len = ed - st
574
+
575
+ st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
576
+ llm_pos_ids_list.append(torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx)
577
+
578
+ range_tensor = torch.arange(llm_grid_t).view(-1, 1)
579
+ expanded_range = range_tensor.expand(-1, llm_grid_h * llm_grid_w)
580
+ second_per_grid_t = torch.as_tensor(
581
+ second_per_grid_t, dtype=range_tensor.dtype, device=range_tensor.device
582
+ )
583
+
584
+ time_tensor = expanded_range * second_per_grid_t * self.config.vision_config.tokens_per_second
585
+
586
+ time_tensor_long = time_tensor.long()
587
+ t_index = time_tensor_long.flatten()
588
+
589
+ h_index = torch.arange(llm_grid_h).view(1, -1, 1).expand(llm_grid_t, -1, llm_grid_w).flatten()
590
+ w_index = torch.arange(llm_grid_w).view(1, 1, -1).expand(llm_grid_t, llm_grid_h, -1).flatten()
591
+ llm_pos_ids_list.append(torch.stack([t_index, h_index, w_index]) + text_len + st_idx)
592
+ st = ed + llm_grid_t * llm_grid_h * llm_grid_w
593
+
594
+ if st < len(input_tokens):
595
+ st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
596
+ text_len = len(input_tokens) - st
597
+ llm_pos_ids_list.append(torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx)
598
+
599
+ llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1)
600
+ position_ids[..., i, attention_mask[i] == 1] = llm_positions.to(position_ids.device)
601
+ mrope_position_deltas.append(llm_positions.max() + 1 - len(total_input_ids[i]))
602
+ mrope_position_deltas = torch.tensor(mrope_position_deltas, device=input_ids.device).unsqueeze(1)
603
+ return position_ids, mrope_position_deltas
604
+ else:
605
+ if attention_mask is not None:
606
+ position_ids = attention_mask.long().cumsum(-1) - 1
607
+ position_ids.masked_fill_(attention_mask == 0, 1)
608
+ position_ids = position_ids.unsqueeze(0).expand(3, -1, -1).to(attention_mask.device)
609
+ max_position_ids = position_ids.max(0, keepdim=False)[0].max(-1, keepdim=True)[0]
610
+ mrope_position_deltas = max_position_ids + 1 - attention_mask.shape[-1]
611
+ else:
612
+ position_ids = (
613
+ torch.arange(input_ids.shape[1], device=input_ids.device)
614
+ .view(1, 1, -1)
615
+ .expand(3, input_ids.shape[0], -1)
616
+ )
617
+ mrope_position_deltas = torch.zeros(
618
+ [input_ids.shape[0], 1],
619
+ device=input_ids.device,
620
+ dtype=input_ids.dtype,
621
+ )
622
+
623
+ return position_ids, mrope_position_deltas
624
+
625
+ def get_video_features(self, pixel_values_videos: torch.FloatTensor, video_grid_thw: Optional[torch.LongTensor] = None):
626
+ pixel_values_videos = pixel_values_videos.type(self.visual.dtype)
627
+ video_embeds = self.visual(pixel_values_videos, grid_thw=video_grid_thw)
628
+ split_sizes = (video_grid_thw.prod(-1) // self.visual.spatial_merge_size**2).tolist()
629
+ video_embeds = torch.split(video_embeds, split_sizes)
630
+ return video_embeds
631
+
632
+ def get_image_features(self, pixel_values: torch.FloatTensor, image_grid_thw: Optional[torch.LongTensor] = None):
633
+ pixel_values = pixel_values.type(self.visual.dtype)
634
+ image_embeds = self.visual(pixel_values, grid_thw=image_grid_thw)
635
+ split_sizes = (image_grid_thw.prod(-1) // self.visual.spatial_merge_size**2).tolist()
636
+ image_embeds = torch.split(image_embeds, split_sizes)
637
+ return image_embeds
638
+
639
+
640
+ def codec_embedding(self, codec_input_ids):
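+ # each codec channel has its own embedding table; the per-channel embeddings are summed into a single vector per audio token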
641
+ x = None
642
+ for i in range(self.num_channels):
643
+ channel_tokens = codec_input_ids[..., i]
644
+ channel_embed = self.codec_embed_tokens[i](channel_tokens)
645
+ x = channel_embed if x is None else x + channel_embed
646
+ return x
647
+
648
+ def calculate_input_embedding(self, input_ids, codec_input_ids):
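+ # embed the text tokens, then overwrite positions holding the codec placeholder id with the summed codec embeddings via masked_scatter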
649
+ inputs_embeds = self.language_model.embed_tokens(input_ids)
650
+ if codec_input_ids is not None:
651
+ codec_input_embeds = self.codec_embedding(codec_input_ids)
652
+
653
+ codec_mask = (input_ids == self.codec_placeholder_value).unsqueeze(-1).expand_as(inputs_embeds)
654
+ inputs_embeds = inputs_embeds.masked_scatter(codec_mask, codec_input_embeds)
655
+ return inputs_embeds
656
+
657
+ @can_return_tuple
658
+ def forward(
659
+ self,
660
+ input_ids: torch.LongTensor = None,
661
+ codec_input_ids: torch.LongTensor = None,
662
+ attention_mask: Optional[torch.Tensor] = None,
663
+ position_ids: Optional[torch.LongTensor] = None,
664
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
665
+ inputs_embeds: Optional[torch.FloatTensor] = None,
666
+ labels: Optional[torch.LongTensor] = None,
667
+ codec_labels: Optional[torch.LongTensor] = None,
668
+ padding_token_mask: Optional[torch.Tensor] = None,
669
+ use_cache: Optional[bool] = None,
670
+ output_attentions: Optional[bool] = None,
671
+ output_hidden_states: Optional[bool] = None,
672
+ output_router_logits_and_topk: Optional[bool] = None,
673
+ pixel_values: Optional[torch.Tensor] = None,
674
+ pixel_values_videos: Optional[torch.FloatTensor] = None,
675
+ image_grid_thw: Optional[torch.LongTensor] = None,
676
+ video_grid_thw: Optional[torch.LongTensor] = None,
677
+ rope_deltas: Optional[torch.LongTensor] = None,
678
+ cache_position: Optional[torch.LongTensor] = None,
679
+ second_per_grid_ts: Optional[torch.Tensor] = None,
680
+ **kwargs,
681
+
682
+ ) -> Union[Tuple, MoEQwen2_5VLCausalLMOutputWithPast]:
683
+ return_dict = True
684
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
685
+ output_hidden_states = (
686
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
687
+ )
688
+
689
+ if inputs_embeds is None:
690
+ inputs_embeds = self.calculate_input_embedding(input_ids, codec_input_ids)
691
+
692
+ if pixel_values is not None:
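+ # replace image placeholder-token embeddings with features from the vision tower; the number of placeholder tokens must equal the number of image features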
693
+ image_embeds = self.get_image_features(pixel_values, image_grid_thw)
694
+ image_embeds = torch.cat(image_embeds, dim=0)
695
+
696
+ if input_ids is None:
697
+ image_mask = inputs_embeds == self.get_input_embeddings()(
698
+ torch.tensor(self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device)
699
+ )
700
+ image_mask = image_mask.all(-1)
701
+ else:
702
+ image_mask = input_ids == self.config.image_token_id
703
+
704
+ n_image_tokens = (image_mask).sum()
705
+ image_mask = image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
706
+ n_image_features = image_embeds.shape[0]
707
+ if not is_torchdynamo_compiling() and n_image_tokens != n_image_features:
708
+ raise ValueError(
709
+ f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
710
+ )
711
+ image_embeds = image_embeds.to(inputs_embeds.device, inputs_embeds.dtype)
712
+ inputs_embeds = inputs_embeds.masked_scatter(image_mask, image_embeds)
713
+
714
+ if pixel_values_videos is not None:
715
+ video_embeds = self.get_video_features(pixel_values_videos, video_grid_thw)
716
+ video_embeds = torch.cat(video_embeds, dim=0)
717
+
718
+ if input_ids is None:
719
+ video_mask = inputs_embeds == self.get_input_embeddings()(
720
+ torch.tensor(self.config.video_token_id, dtype=torch.long, device=inputs_embeds.device)
721
+ )
722
+ video_mask = video_mask.all(-1)
723
+ else:
724
+ video_mask = input_ids == self.config.video_token_id
725
+
726
+ n_video_tokens = (video_mask).sum()
727
+ n_video_features = video_embeds.shape[0]
728
+ video_mask = video_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
729
+ if not is_torchdynamo_compiling() and n_video_tokens != n_video_features:
730
+ raise ValueError(
731
+ f"Video features and video tokens do not match: tokens: {n_video_tokens}, features {n_video_features}"
732
+ )
733
+
734
+ video_embeds = video_embeds.to(inputs_embeds.device, inputs_embeds.dtype)
735
+ inputs_embeds = inputs_embeds.masked_scatter(video_mask, video_embeds)
736
+
737
+ if position_ids is None:
738
+ attention_mask_tensor = (
739
+ attention_mask if not isinstance(attention_mask, dict) else attention_mask["full_attention"]
740
+ )
741
+ if attention_mask_tensor is not None and attention_mask_tensor.ndim == 4:
742
+ attention_mask_tensor = torch.diagonal(attention_mask_tensor[:, 0], dim1=1, dim2=2)
743
+ attention_mask_tensor = attention_mask_tensor / torch.finfo(attention_mask_tensor.dtype).min
744
+ attention_mask_tensor = (1.0 - attention_mask_tensor).int()
745
+ prefill_compiled_stage = is_torchdynamo_compiling() and (
746
+ (input_ids is not None and input_ids.shape[1] != 1)
747
+ or (inputs_embeds is not None and inputs_embeds.shape[1] != 1)
748
+ )
749
+ prefill_noncompiled_stage = not is_torchdynamo_compiling() and (
750
+ (cache_position is not None and cache_position[0] == 0)
751
+ or (past_key_values is None or past_key_values.get_seq_length() == 0)
752
+ )
753
+ if (prefill_compiled_stage or prefill_noncompiled_stage) or self.rope_deltas is None:
754
+ position_ids, rope_deltas = self.get_rope_index(
755
+ input_ids,
756
+ image_grid_thw,
757
+ video_grid_thw,
758
+ second_per_grid_ts=second_per_grid_ts,
759
+ attention_mask=attention_mask_tensor,
760
+ )
761
+ self.rope_deltas = rope_deltas
762
+
763
+ else:
764
+ batch_size, seq_length, _ = inputs_embeds.shape
765
+ delta = (
766
+ (cache_position[0] + self.rope_deltas).to(inputs_embeds.device)
767
+ if cache_position is not None
768
+ else 0
769
+ )
770
+ position_ids = torch.arange(seq_length, device=inputs_embeds.device)
771
+ position_ids = position_ids.view(1, -1).expand(batch_size, -1)
772
+ if cache_position is not None:
773
+ delta = delta.repeat_interleave(batch_size // delta.shape[0], dim=0)
774
+ position_ids = position_ids.add(delta)
775
+ position_ids = position_ids.unsqueeze(0).expand(3, -1, -1)
776
+
777
+ if padding_token_mask is None:
778
+ padding_token_mask = attention_mask.bool()
779
+
780
+ outputs = self.language_model(
781
+ input_ids=None,
782
+ position_ids=position_ids,
783
+ attention_mask=attention_mask,
784
+ padding_token_mask=padding_token_mask,
785
+ past_key_values=past_key_values,
786
+ inputs_embeds=inputs_embeds,
787
+ use_cache=use_cache,
788
+ output_attentions=output_attentions,
789
+ output_hidden_states=output_hidden_states,
790
+ output_router_logits_and_topk=output_router_logits_and_topk,
791
+ return_dict=return_dict,
792
+ cache_position=cache_position,
793
+ **kwargs,
794
+ )
795
+
796
+ hidden_states = outputs[0]
797
+ logits = self.lm_head(hidden_states).float()
798
+ codec_logits = self.codec_head(hidden_states).float()
799
+ codec_logits = codec_logits.view((logits.shape[0], logits.shape[1], self.num_channels, self.codec_vocab_size))
800
+
801
+ loss = aux_loss = codec_loss = all_aux_loss = None  # ensure these names exist even when labels is None (e.g. at inference)
802
+ if labels is not None:
803
+
804
+ all_aux_loss = outputs.all_aux_loss if return_dict else outputs[-1]
805
+ all_aux_loss = torch.mean(torch.cat([l.unsqueeze(0) for l in all_aux_loss], dim=0))
806
+ aux_loss = self.cur_aux_weight * all_aux_loss
807
+ self.training_steps += 1
808
+ codec_loss = None
809
+
810
+ if codec_labels is not None:
811
+ for i in range(self.num_channels):
812
+ channel_logits = codec_logits[:, :, i].float()
813
+ channel_labels = codec_labels[:, :, i]
814
+ shift_channel_logits = channel_logits[..., :-1, :].contiguous()
815
+ shift_channel_labels = channel_labels[..., 1:].contiguous()
816
+
817
+ if i != 0 and (shift_channel_labels != -100).sum() == 0:
818
+ continue
819
+
820
+ loss_fct = CrossEntropyLoss()
821
+ shift_channel_logits = shift_channel_logits.view(-1, self.codec_vocab_size)
822
+ shift_channel_labels = shift_channel_labels.view(-1)
823
+ shift_channel_labels = shift_channel_labels.to(shift_channel_logits.device)
824
+ channel_loss = loss_fct(shift_channel_logits, shift_channel_labels)
825
+ codec_loss = channel_loss if codec_loss is None else codec_loss + channel_loss
826
+
827
+ loss = aux_loss if codec_loss is None else codec_loss + aux_loss
828
+
829
+
830
+ if not return_dict:
831
+ output = (logits,) + outputs[1:]
832
+ return (loss,) + output if loss is not None else output
833
+
834
+ return MoEQwen2_5VLCausalLMOutputWithPast(
835
+ loss=loss,
836
+ logits=logits,
837
+ past_key_values=outputs.past_key_values,
838
+ hidden_states=outputs.hidden_states,
839
+ attentions=outputs.attentions,
840
+ all_router_logits=outputs.all_router_logits,
841
+ all_router_top_k=outputs.all_router_top_k,
842
+ all_router_expert_mask=outputs.all_router_expert_mask,
843
+ all_router_weight=outputs.all_router_weight,
844
+ aux_balance_loss=all_aux_loss,
845
+ )
846
+
847
+ @staticmethod
848
+ def _sample_next_token(
849
+ logits_BCxV: torch.Tensor,
850
+ temperature: float,
851
+ top_p: float,
852
+ top_k: int,
853
+ audio_eos_value: int,
854
+ ) -> torch.Tensor:
855
+ if temperature == 0.0:
856
+ return torch.argmax(logits_BCxV, dim=-1)
857
+
858
+ logits_BCxV = logits_BCxV / temperature
859
+
860
+ if audio_eos_value is not None and audio_eos_value >= 0:
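+ # mask out the EOS logit wherever EOS is not already the argmax, so EOS can only be sampled when it is the most likely token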
861
+ top_logit_indices_BC = torch.argmax(logits_BCxV, dim=-1)
862
+ eos_not_highest_mask_BC = top_logit_indices_BC != audio_eos_value
863
+ mask_eos_unless_highest_BCxV = torch.zeros_like(logits_BCxV, dtype=torch.bool)
864
+ mask_eos_unless_highest_BCxV[eos_not_highest_mask_BC, audio_eos_value] = True
865
+ logits_BCxV = logits_BCxV.masked_fill(mask_eos_unless_highest_BCxV, -torch.inf)
866
+
867
+ if top_k is not None:
868
+ _, top_k_indices_BCxV = torch.topk(logits_BCxV, k=top_k, dim=-1)
869
+ mask = torch.ones_like(logits_BCxV, dtype=torch.bool)
870
+ mask = mask.scatter(dim=-1, index=top_k_indices_BCxV, value=False)
871
+ logits_BCxV = logits_BCxV.masked_fill(mask, -torch.inf)
872
+
873
+ if top_p < 1.0:
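+ # nucleus (top-p) filtering: keep the smallest set of tokens whose cumulative probability reaches top_p, always retaining the single most likely token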
874
+ probs_BCxV = torch.softmax(logits_BCxV, dim=-1)
875
+ sorted_probs_BCxV, sorted_indices_BCxV = torch.sort(probs_BCxV, dim=-1, descending=True)
876
+ cumulative_probs_BCxV = torch.cumsum(sorted_probs_BCxV, dim=-1)
877
+
878
+ sorted_indices_to_remove_BCxV = cumulative_probs_BCxV > top_p
879
+ sorted_indices_to_remove_BCxV = torch.roll(sorted_indices_to_remove_BCxV, shifts=1, dims=-1)
880
+ sorted_indices_to_remove_BCxV[..., 0] = torch.zeros_like(sorted_indices_to_remove_BCxV[..., 0])
881
+
882
+ indices_to_remove_BCxV = torch.zeros_like(sorted_indices_to_remove_BCxV)
883
+ indices_to_remove_BCxV = indices_to_remove_BCxV.scatter(dim=-1, index=sorted_indices_BCxV, src=sorted_indices_to_remove_BCxV)
884
+ logits_BCxV = logits_BCxV.masked_fill(indices_to_remove_BCxV, -torch.inf)
885
+
886
+ final_probs_BCxV = torch.softmax(logits_BCxV, dim=-1)
887
+
888
+ sampled_indices_BC = torch.multinomial(final_probs_BCxV, num_samples=1)
889
+ sampled_indices_C = sampled_indices_BC.squeeze(-1)
890
+ return sampled_indices_C
891
+
892
+ def _decoder_step(
893
+ self,
894
+ tokens_Bx1xC: torch.Tensor,
895
+ model_kwargs,
896
+ cfg_scale: float,
897
+ neg_input_size: int,
898
+ temperature: float,
899
+ top_p: float,
900
+ top_k: int,
901
+ do_sample=True,
902
+ eos_prob_mul_factor=1.0,
903
+ labels_Bx1xC=None,
904
+ use_cache=True,
905
+ enable_eos=True,
906
+ ) -> torch.Tensor:
907
+ B = tokens_Bx1xC.shape[0]
908
+ audio_eos_value = self.config.codec_eos_value
909
+ attention_mask = model_kwargs["attention_mask"]
910
+ cache_position = model_kwargs["cache_position"]
911
+ past_key_values = model_kwargs["past_key_values"]
912
+ input_ids = model_kwargs["input_ids"]
913
+ codec_input_ids = model_kwargs["codec_input_ids"]
914
+ position_ids = attention_mask.long().cumsum(-1) - 1
915
+ position_ids.masked_fill_(attention_mask == 0, 1)
916
+ if past_key_values:
917
+ position_ids = position_ids[:, -tokens_Bx1xC.shape[1] :]
918
+ position_ids = position_ids.clone(memory_format=torch.contiguous_format)
919
+
920
+ tokens_Bx1xC = tokens_Bx1xC.repeat_interleave(neg_input_size, dim=0)
921
+ codec_input_ids = torch.cat((codec_input_ids, tokens_Bx1xC), dim=1) if codec_input_ids is not None else tokens_Bx1xC.clone()
922
+ input_ids = torch.cat((input_ids, torch.ones(input_ids.shape[0], 1).to(input_ids) * self.codec_placeholder_value), dim=-1)
923
+
924
+ if use_cache:
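+ # with a KV cache only the newly appended codec tokens are embedded and fed; the no-cache branch below re-embeds the full prompt plus all codec tokens each step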
925
+ codec_input_embeds = self.codec_embedding(tokens_Bx1xC)
926
+ outputs = self.language_model(
927
+ input_ids=None,
928
+ attention_mask=attention_mask,
929
+ position_ids=position_ids,
930
+ past_key_values=past_key_values,
931
+ inputs_embeds=codec_input_embeds,
932
+ use_cache=True,
933
+ output_attentions=False,
934
+ output_hidden_states=False,
935
+ return_dict=True,
936
+ cache_position=cache_position,
937
+ )
938
+
939
+ else:
940
+ batch_codec_input_ids = codec_input_ids.contiguous().view(-1, self.num_channels)
941
+
942
+ inputs_embeds = self.calculate_input_embedding(input_ids, batch_codec_input_ids)
943
+ outputs = self.language_model(
944
+ input_ids=None,
945
+ attention_mask=attention_mask,
946
+ position_ids=attention_mask.long().cumsum(-1) - 1,
947
+ past_key_values=None,
948
+ inputs_embeds=inputs_embeds,
949
+ use_cache=True,
950
+ output_attentions=False,
951
+ output_hidden_states=False,
952
+ return_dict=True,
953
+ cache_position=None,
954
+ )
955
+
956
+ last_hidden_state = outputs.last_hidden_state
957
+ codec_logits = self.codec_head(last_hidden_state).float()
958
+ codec_logits = codec_logits.view((codec_logits.shape[0], codec_logits.shape[1], self.num_channels, self.codec_vocab_size))
959
+ model_kwargs["past_key_values"] = outputs.past_key_values
960
+ attention_mask = model_kwargs["attention_mask"]
961
+ model_kwargs["attention_mask"] = torch.cat([attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))], dim=-1)
962
+ model_kwargs["cache_position"] = model_kwargs["cache_position"][-1:] + 1
963
+ model_kwargs["input_ids"] = input_ids
964
+ model_kwargs["codec_input_ids"] = codec_input_ids
965
+
966
+ logits_Bx1xCxV = codec_logits[: , -1:].clone()
967
+ logits_last_2BxCxV = logits_Bx1xCxV[:, -1]
968
+ logits_last_Bx2xCxV = logits_last_2BxCxV.view(B, neg_input_size, *logits_last_2BxCxV.shape[1:])
969
+ if cfg_scale is not None:
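+ # classifier-free guidance: the last entry of each group holds the conditional logits, the remaining entries are negative/unconditional branches pushed away with weight cfg_scale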
970
+ cond_logits_BxCxV = logits_last_Bx2xCxV[:, -1, :, :] # Shape [B, C, V]
971
+ logits_BxCxV = cond_logits_BxCxV
972
+ for ni in range(neg_input_size - 1):
973
+ uncond_logits_BxCxV = logits_last_Bx2xCxV[:, ni, :, :] # Shape [B, C, V]
974
+ cfg_weight = cfg_scale[ni] if isinstance(cfg_scale, List) else cfg_scale
975
+ logits_BxCxV = logits_BxCxV + cfg_weight * (cond_logits_BxCxV - uncond_logits_BxCxV)
976
+ else:
977
+ logits_BxCxV = logits_last_Bx2xCxV[:, -1, :, :] # Shape [B, C, V]
978
+
979
+ if enable_eos:
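+ # restrict the vocabulary: ids beyond EOS are banned everywhere, channels other than 0 may never emit EOS, and the EOS logit of channel 0 is scaled by eos_prob_mul_factor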
980
+ logits_BxCxV[:, :, audio_eos_value + 1 :] = torch.full_like(
981
+ logits_BxCxV[:, :, audio_eos_value + 1 :],
982
+ fill_value=-torch.inf,
983
+ )
984
+ logits_BxCxV[:, 1:, audio_eos_value:] = torch.full_like(
985
+ logits_BxCxV[:, 1:, audio_eos_value:],
986
+ fill_value=-torch.inf,
987
+ )
988
+ logits_BxCxV[:, 0, audio_eos_value] *= torch.tensor(eos_prob_mul_factor, device=self.device)
989
+
990
+ else:
991
+ logits_BxCxV[:, :, audio_eos_value:] = torch.full_like(
992
+ logits_BxCxV[:, :, audio_eos_value:],
993
+ fill_value=-torch.inf,
994
+ )
995
+
996
+
997
+ flat_logits_BCxV = logits_BxCxV.reshape(B * self.num_channels, -1)
998
+ if do_sample:
999
+ pred_BC = self._sample_next_token(
1000
+ flat_logits_BCxV.float(),
1001
+ temperature=temperature,
1002
+ top_p=top_p,
1003
+ top_k=top_k,
1004
+ audio_eos_value=audio_eos_value,
1005
+ )
1006
+ else:
1007
+ pred_BC = torch.argmax(flat_logits_BCxV, dim=1)
1008
+
1009
+ pred_BxC = pred_BC.view(B, self.num_channels)
1010
+
1011
+ return pred_BxC, model_kwargs
1012
+
1013
+ def generate(
1014
+ self,
1015
+ input_ids,
1016
+ attention_mask,
1017
+ dec_output,
1018
+ max_tokens,
1019
+ min_tokens=None,
1020
+ codec_input_ids: Optional[torch.Tensor] = None,
1021
+ pixel_values: Optional[torch.Tensor] = None,
1022
+ pixel_values_videos: Optional[torch.FloatTensor] = None,
1023
+ image_grid_thw: Optional[torch.LongTensor] = None,
1024
+ video_grid_thw: Optional[torch.LongTensor] = None,
1025
+ second_per_grid_ts: Optional[torch.Tensor] = None,
1026
+ neg_input_size = 2,
1027
+ cfg_scale = 3.0,
1028
+ temperature: float = 1.2,
1029
+ top_p: float = 0.95,
1030
+ cfg_filter_top_k: int = 45,
1031
+ eos_prob_mul_factor: float = 0.8,
1032
+ do_sample: bool = True,
1033
+ debug_guidance_step: int = 0,
1034
+ use_cache=True,
1035
+ ):
1036
+ if codec_input_ids is not None:
1037
+ assert use_cache
1038
+ batch_size = input_ids.shape[0] // neg_input_size
1039
+ audio_eos_value = self.config.codec_eos_value
1040
+ audio_pad_value = self.config.codec_pad_value
1041
+ delay_pattern = self.config.codec_delay_pattern
1042
+ max_delay_pattern = max(delay_pattern)
1043
+ delay_pattern_Cx = torch.tensor(delay_pattern, device=self.device, dtype=torch.long)
1044
+
1045
+ dec_step = min(dec_output.prefill_steps) - 1
1046
+
1047
+ eos_detected_Bx = torch.zeros((batch_size,), dtype=torch.bool, device=self.device)
1048
+ eos_countdown_Bx = torch.full((batch_size,), -1, dtype=torch.long, device=self.device)
1049
+ finished_step_Bx = torch.full((batch_size,), -1, dtype=torch.long, device=self.device)
1050
+
1051
+ bos_over = False
1052
+ model_kwargs = dict(attention_mask=attention_mask, use_cache=True)
1053
+ model_kwargs["past_key_values"] = DynamicCache()
1054
+ model_kwargs["cache_position"] = torch.ones_like(input_ids[0, :], dtype=torch.int64).cumsum(0) - 1
1055
+ attention_mask = model_kwargs["attention_mask"]
1056
+ past_key_values = model_kwargs["past_key_values"]
1057
+ position_ids = attention_mask.long().cumsum(-1) - 1
1058
+ position_ids.masked_fill_(attention_mask == 0, 1)
1059
+ cache_position = torch.arange(0, input_ids.shape[-1], device=input_ids.device)
1060
+ inputs_embeds = self.calculate_input_embedding(input_ids, codec_input_ids)
1061
+ outputs = self.language_model(
1062
+ input_ids=None,
1063
+ attention_mask=attention_mask,
1064
+ position_ids=position_ids,
1065
+ past_key_values=past_key_values,
1066
+ inputs_embeds=inputs_embeds,
1067
+ pixel_values=pixel_values,
1068
+ pixel_values_videos=pixel_values_videos,
1069
+ image_grid_thw=image_grid_thw,
1070
+ video_grid_thw=video_grid_thw,
1071
+ second_per_grid_ts=second_per_grid_ts,
1072
+ use_cache=True,
1073
+ output_attentions=False,
1074
+ output_hidden_states=False,
1075
+ return_dict=True,
1076
+ cache_position=cache_position,
1077
+ )
1078
+
1079
+ model_kwargs["input_ids"] = input_ids
1080
+ model_kwargs["codec_input_ids"] = None
1081
+ model_kwargs["labels"] = torch.ones_like(input_ids[neg_input_size-1::neg_input_size]) * -100
1082
+ labels_Bx1xC = dec_output.get_labels_at(0)
1083
+ if labels_Bx1xC is not None:
1084
+ model_kwargs["codec_labels"] = (torch.ones_like(input_ids[neg_input_size-1::neg_input_size]) * -100).unsqueeze(-1).expand(-1, -1, self.num_channels)
1085
+ assert (labels_Bx1xC != self.config.codec_bos_value).sum() == 0
1086
+ labels_Bx1xC = torch.full_like(labels_Bx1xC, -100)
1087
+ model_kwargs["codec_labels"] = torch.cat((model_kwargs["codec_labels"], labels_Bx1xC), dim=1)
1088
+ model_kwargs["past_key_values"] = outputs.past_key_values
1089
+ attention_mask = model_kwargs["attention_mask"]
1090
+ model_kwargs["attention_mask"] = torch.cat([attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))], dim=-1)
1091
+ model_kwargs["cache_position"] = model_kwargs["cache_position"][-1:] + 1
1092
+
1093
+ while dec_step < max_tokens:
1094
+ if (eos_countdown_Bx == 0).all():
1095
+ break
1096
+
1097
+ current_step_idx = dec_step + 1
1098
+ tokens_Bx1xC = dec_output.get_tokens_at(dec_step)
1099
+ labels_Bx1xC = dec_output.get_labels_at(dec_step + 1)
1100
+
1101
+ pred_BxC, model_kwargs = self._decoder_step(
1102
+ tokens_Bx1xC=tokens_Bx1xC,
1103
+ model_kwargs=model_kwargs,
1104
+ cfg_scale=cfg_scale,
1105
+ neg_input_size=neg_input_size,
1106
+ temperature=temperature,
1107
+ top_p=top_p,
1108
+ top_k=cfg_filter_top_k,
1109
+ do_sample=do_sample,
1110
+ eos_prob_mul_factor=eos_prob_mul_factor,
1111
+ labels_Bx1xC=labels_Bx1xC,
1112
+ use_cache=use_cache,
1113
+ enable_eos=(min_tokens is None or dec_step >= min_tokens),
1114
+ )
1115
+ if labels_Bx1xC is not None and (dec_step < debug_guidance_step or debug_guidance_step==-1):
1116
+ pred_BxC = labels_Bx1xC[:, 0]
1117
+
1118
+ active_mask_Bx = eos_countdown_Bx != 0
1119
+ eos_trigger_Bx = torch.zeros_like(active_mask_Bx)
1120
+ if active_mask_Bx.any():
1121
+ is_eos_token = (~eos_detected_Bx[active_mask_Bx]) & (pred_BxC[active_mask_Bx, 0] == audio_eos_value)
1122
+ is_max_len = current_step_idx >= max_tokens - max_delay_pattern
1123
+ eos_trigger_Bx[active_mask_Bx] = is_eos_token | is_max_len
1124
+ eos_detected_Bx |= eos_trigger_Bx
1125
+ start_countdown_mask_Bx = eos_trigger_Bx & (eos_countdown_Bx < 0)
1126
+ if start_countdown_mask_Bx.any():
1127
+ eos_countdown_Bx[start_countdown_mask_Bx] = max_delay_pattern
1128
+ finished_step_Bx[start_countdown_mask_Bx] = current_step_idx
1129
+
1130
+ padding_mask_Bx = eos_countdown_Bx > 0
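+ # once EOS has been triggered for a sample, channel c is forced to emit EOS exactly delay_pattern[c] steps later and PAD on every step after that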
1131
+ if padding_mask_Bx.any():
1132
+ pred_active_BxC = pred_BxC[padding_mask_Bx].clone()
1133
+ countdown_active_Bx = eos_countdown_Bx[padding_mask_Bx]
1134
+ step_after_eos_Bx = max_delay_pattern - countdown_active_Bx
1135
+ step_after_eos_Bx_ = step_after_eos_Bx.unsqueeze(1)
1136
+ delay_pattern_Cx_ = delay_pattern_Cx.unsqueeze(0)
1137
+ eos_mask_NxC = step_after_eos_Bx_ == delay_pattern_Cx_
1138
+ pad_mask_NxC = step_after_eos_Bx_ > delay_pattern_Cx_
1139
+ pred_active_BxC[eos_mask_NxC] = audio_eos_value
1140
+ pred_active_BxC[pad_mask_NxC] = audio_pad_value
1141
+ pred_BxC[padding_mask_Bx] = pred_active_BxC
1142
+ eos_countdown_Bx[padding_mask_Bx] -= 1
1143
+
1144
+ if not bos_over:
1145
+ bos_over = all(current_step_idx - prefill_step >= max_delay_pattern for prefill_step in dec_output.prefill_steps)
1146
+
1147
+ dec_output.update_one(pred_BxC, current_step_idx, not bos_over)
1148
+ dec_step += 1
1149
+
1150
+ final_step = dec_step + 1
1151
+ finished_step_Bx[finished_step_Bx == -1] = final_step - max_delay_pattern
1152
+ prefill_steps_tensor = torch.tensor(dec_output.prefill_steps, device=self.device)
1153
+ lengths_Bx = finished_step_Bx - prefill_steps_tensor
1154
+ lengths_Bx = torch.clamp(lengths_Bx, min=0)
1155
+ max_len = lengths_Bx.max().item() + max_delay_pattern
1156
+
1157
+ if max_len > 0:
1158
+ num_channels = self.num_channels
1159
+ generated_codes = torch.full(
1160
+ (batch_size, max_len, num_channels),
1161
+ fill_value=audio_pad_value,
1162
+ dtype=torch.long,
1163
+ device=self.device,
1164
+ )
1165
+
1166
+ for i in range(batch_size):
1167
+ start_step = dec_output.prefill_steps[i]
1168
+ actual_len = lengths_Bx[i].item() + max_delay_pattern
1169
+ if actual_len > 0:
1170
+ tokens_to_copy = dec_output.generated_tokens[i, start_step : start_step + actual_len, :]
1171
+ generated_codes[i, :actual_len, :] = tokens_to_copy
1172
+
1173
+ return generated_codes, lengths_Bx
1174
+ else:
1175
+ print("Warning: Nothing generated for any sequence in the batch.")
1176
+ return None, None
1177
+
1178
+ # AutoConfig.register("qwen2_5_vl_moe_text", Qwen2_5_VLMoETextConfig)
1179
+ # AutoModelForCausalLM.register(Qwen2_5_VLMoETextConfig, Qwen2_5_VLMoETextModel)
1180
+
1181
+ # AutoConfig.register("uni_audio_rvq_qwen2_5vl_moe", UniMoEAudioConfig)
1182
+ # AutoModelForCausalLM.register(UniMoEAudioConfig, UniMoEAudio)
special_tokens_map.json CHANGED
@@ -12,7 +12,84 @@
12
  "<|vision_end|>",
13
  "<|vision_pad|>",
14
  "<|image_pad|>",
15
- "<|video_pad|>"
16
  ],
17
  "eos_token": {
18
  "content": "<|im_end|>",
 
12
  "<|vision_end|>",
13
  "<|vision_pad|>",
14
  "<|image_pad|>",
15
+ "<|video_pad|>",
16
+ {
17
+ "content": "<|AUDIO_PLACEHOLDER|>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ {
24
+ "content": "<|AUDIO_START|>",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ },
30
+ {
31
+ "content": "<|AUDIO_END|>",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false
36
+ },
37
+ {
38
+ "content": "<|SPEECH_START|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false
43
+ },
44
+ {
45
+ "content": "<|SPEECH_END|>",
46
+ "lstrip": false,
47
+ "normalized": false,
48
+ "rstrip": false,
49
+ "single_word": false
50
+ },
51
+ {
52
+ "content": "<|VOICE_PROMPT_START|>",
53
+ "lstrip": false,
54
+ "normalized": false,
55
+ "rstrip": false,
56
+ "single_word": false
57
+ },
58
+ {
59
+ "content": "<|VOICE_PROMPT_END|>",
60
+ "lstrip": false,
61
+ "normalized": false,
62
+ "rstrip": false,
63
+ "single_word": false
64
+ },
65
+ {
66
+ "content": "<|SPEECH_PROMPT_START|>",
67
+ "lstrip": false,
68
+ "normalized": false,
69
+ "rstrip": false,
70
+ "single_word": false
71
+ },
72
+ {
73
+ "content": "<|SPEECH_PROMPT_END|>",
74
+ "lstrip": false,
75
+ "normalized": false,
76
+ "rstrip": false,
77
+ "single_word": false
78
+ },
79
+ {
80
+ "content": "<|MUSIC_START|>",
81
+ "lstrip": false,
82
+ "normalized": false,
83
+ "rstrip": false,
84
+ "single_word": false
85
+ },
86
+ {
87
+ "content": "<|MUSIC_END|>",
88
+ "lstrip": false,
89
+ "normalized": false,
90
+ "rstrip": false,
91
+ "single_word": false
92
+ }
93
  ],
94
  "eos_token": {
95
  "content": "<|im_end|>",
tokenizer_config.json CHANGED
@@ -177,6 +177,94 @@
177
  "rstrip": false,
178
  "single_word": false,
179
  "special": false
180
  }
181
  },
182
  "additional_special_tokens": [
@@ -192,15 +280,27 @@
192
  "<|vision_end|>",
193
  "<|vision_pad|>",
194
  "<|image_pad|>",
195
- "<|video_pad|>"
  ],
197
  "bos_token": null,
198
  "clean_up_tokenization_spaces": false,
199
  "eos_token": "<|im_end|>",
200
  "errors": "replace",
201
  "extra_special_tokens": {},
202
- "model_max_length": 131072,
203
  "pad_token": "<|endoftext|>",
 
204
  "processor_class": "Qwen2_5_VLProcessor",
205
  "split_special_tokens": false,
206
  "tokenizer_class": "Qwen2Tokenizer",
 
177
  "rstrip": false,
178
  "single_word": false,
179
  "special": false
180
+ },
181
+ "151665": {
182
+ "content": "<|AUDIO_PLACEHOLDER|>",
183
+ "lstrip": false,
184
+ "normalized": false,
185
+ "rstrip": false,
186
+ "single_word": false,
187
+ "special": true
188
+ },
189
+ "151666": {
190
+ "content": "<|AUDIO_START|>",
191
+ "lstrip": false,
192
+ "normalized": false,
193
+ "rstrip": false,
194
+ "single_word": false,
195
+ "special": true
196
+ },
197
+ "151667": {
198
+ "content": "<|AUDIO_END|>",
199
+ "lstrip": false,
200
+ "normalized": false,
201
+ "rstrip": false,
202
+ "single_word": false,
203
+ "special": true
204
+ },
205
+ "151668": {
206
+ "content": "<|SPEECH_START|>",
207
+ "lstrip": false,
208
+ "normalized": false,
209
+ "rstrip": false,
210
+ "single_word": false,
211
+ "special": true
212
+ },
213
+ "151669": {
214
+ "content": "<|SPEECH_END|>",
215
+ "lstrip": false,
216
+ "normalized": false,
217
+ "rstrip": false,
218
+ "single_word": false,
219
+ "special": true
220
+ },
221
+ "151670": {
222
+ "content": "<|VOICE_PROMPT_START|>",
223
+ "lstrip": false,
224
+ "normalized": false,
225
+ "rstrip": false,
226
+ "single_word": false,
227
+ "special": true
228
+ },
229
+ "151671": {
230
+ "content": "<|VOICE_PROMPT_END|>",
231
+ "lstrip": false,
232
+ "normalized": false,
233
+ "rstrip": false,
234
+ "single_word": false,
235
+ "special": true
236
+ },
237
+ "151672": {
238
+ "content": "<|SPEECH_PROMPT_START|>",
239
+ "lstrip": false,
240
+ "normalized": false,
241
+ "rstrip": false,
242
+ "single_word": false,
243
+ "special": true
244
+ },
245
+ "151673": {
246
+ "content": "<|SPEECH_PROMPT_END|>",
247
+ "lstrip": false,
248
+ "normalized": false,
249
+ "rstrip": false,
250
+ "single_word": false,
251
+ "special": true
252
+ },
253
+ "151674": {
254
+ "content": "<|MUSIC_START|>",
255
+ "lstrip": false,
256
+ "normalized": false,
257
+ "rstrip": false,
258
+ "single_word": false,
259
+ "special": true
260
+ },
261
+ "151675": {
262
+ "content": "<|MUSIC_END|>",
263
+ "lstrip": false,
264
+ "normalized": false,
265
+ "rstrip": false,
266
+ "single_word": false,
267
+ "special": true
268
  }
269
  },
270
  "additional_special_tokens": [
 
280
  "<|vision_end|>",
281
  "<|vision_pad|>",
282
  "<|image_pad|>",
283
+ "<|video_pad|>",
284
+ "<|AUDIO_PLACEHOLDER|>",
285
+ "<|AUDIO_START|>",
286
+ "<|AUDIO_END|>",
287
+ "<|SPEECH_START|>",
288
+ "<|SPEECH_END|>",
289
+ "<|VOICE_PROMPT_START|>",
290
+ "<|VOICE_PROMPT_END|>",
291
+ "<|SPEECH_PROMPT_START|>",
292
+ "<|SPEECH_PROMPT_END|>",
293
+ "<|MUSIC_START|>",
294
+ "<|MUSIC_END|>"
295
  ],
296
  "bos_token": null,
297
  "clean_up_tokenization_spaces": false,
298
  "eos_token": "<|im_end|>",
299
  "errors": "replace",
300
  "extra_special_tokens": {},
301
+ "model_max_length": 4096,
302
  "pad_token": "<|endoftext|>",
303
+ "padding_side": "right",
304
  "processor_class": "Qwen2_5_VLProcessor",
305
  "split_special_tokens": false,
306
  "tokenizer_class": "Qwen2Tokenizer",
utils.py ADDED
@@ -0,0 +1,491 @@
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ UniMoE Audio Utilities Module
4
+ Author: UniMoE Audio Team
5
+ """
6
+
7
+ import copy
8
+ import glob
9
+ import json
10
+ import math
11
+ import os
12
+ import re
13
+ import shutil
14
+ import sys
15
+ import time
16
+ from typing import Any, Dict, List, Mapping, Optional, Sequence, Tuple, Union, TYPE_CHECKING, Callable
17
+
18
+ import dac
19
+ import datasets
20
+ import numpy as np
21
+ import torch
22
+ import torch.nn.functional as F
23
+ import torchaudio
24
+ import transformers
25
+ from audiotools import AudioSignal
26
+ from safetensors import safe_open
27
+ from tqdm import tqdm
28
+ from transformers import AutoProcessor, AutoTokenizer, LogitsProcessor, LogitsProcessorList
29
+ from moviepy.video.io.VideoFileClip import VideoFileClip
30
+ from PIL import Image
31
+ from torchvision import io, transforms
32
+ from torchvision.transforms import InterpolationMode
33
+ import torchvision
34
+
35
+ from qwen_vl_utils import smart_resize, process_vision_info
36
+
37
+ import deepspeed
38
+ from deepspeed import comm as dist
39
+ from deepspeed.moe.sharded_moe import _capacity, _one_hot_to_float, einsum, gumbel_rsample
40
+ from torch import Tensor
41
+
42
+ try:
43
+ import torch_npu
44
+ IS_CUDA = False
45
+ except ImportError:
46
+ IS_CUDA = True
47
+
48
+ try:
49
+ # To enable Tutel MoE optimizations:
50
+ # python3 -m pip install --user --upgrade git+https://github.com/microsoft/[email protected]
51
+ from tutel import moe as tutel_moe
52
+ TUTEL_INSTALLED = True
53
+ except ImportError:
54
+ # Fail silently so we don't spam logs unnecessarily if user isn't using tutel
55
+ TUTEL_INSTALLED = False
56
+ pass
57
+
58
+
59
+ SYSTEM_MESSAGE = """<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"""
60
+ INPUT_FORMAT = """<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n"""
61
+ AUDIO_START = "<|AUDIO_START|>"
62
+
63
+ DEFAULT_VIDEO_PROMPT = "<|vision_start|><|video_pad|><|vision_end|>{}"
64
+ IMAGE_FACTOR = 28
65
+ MIN_PIXELS = 4 * 28 * 28
66
+ MAX_PIXELS = 16384 * 28 * 28
67
+ MAX_RATIO = 200
68
+ VIDEO_TOTAL_PIXELS = 16 * 28 * 28
69
+ VIDEO_MIN_PIXELS = 16 * 28 * 28
70
+ VIDEO_MAX_PIXELS = 64 * 28 * 28
71
+ FRAME_FACTOR = 2
72
+
73
+ IMAGENET_MEAN = (0.485, 0.456, 0.406)
74
+ IMAGENET_STD = (0.229, 0.224, 0.225)
75
+
76
+ IMG_START_TOKEN='<img>'
77
+ IMG_END_TOKEN='</img>'
78
+ IMG_CONTEXT_TOKEN='<IMG_CONTEXT>'
79
+ IMG_PREFIX_FORMAT = "<|IMAGE_PLACE_HOLDER|>"
80
+
81
+ # =============================================================================
82
+ # DAC Utilities
83
+ # =============================================================================
84
+
85
+ class Dac:
86
+ def __init__(self):
87
+ base_dir = os.path.dirname(__file__)
88
+ dac_model_dir = os.path.join(base_dir, "dac_model")
89
+ model_path = os.path.join(dac_model_dir, "weights_16khz.pth")
90
+
91
+ if not os.path.isfile(model_path):
92
+ print(f"DAC model not found at {model_path}, downloading...")
93
+ os.makedirs(dac_model_dir, exist_ok=True)
94
+ downloaded_path = dac.utils.download(model_type="16khz")
95
+ shutil.move(downloaded_path, model_path)
96
+ print(f"DAC model downloaded and saved to {model_path}")
97
+
98
+ env_path = os.environ.get("DAC_WEIGHTS")
99
+ candidates = []
100
+ if env_path:
101
+ candidates.append(env_path)
102
+
103
+ candidates.extend([
104
+ model_path,
105
+ os.path.join(base_dir, "weights_16khz.pth"),
106
+ os.path.join(os.getcwd(), "utils", "dac_model", "weights_16khz.pth"),
107
+ os.path.join(os.getcwd(), "dac_model", "weights_16khz.pth"),
108
+ ])
109
+
110
+ final_model_path = next((p for p in candidates if p and os.path.isfile(p)), None)
111
+ if not final_model_path:
112
+ searched = "\n - " + "\n - ".join(candidates)
113
+ raise FileNotFoundError(
114
+ "DAC weights not found. Please place weights_16khz.pth in one of the following locations or set DAC_WEIGHTS to an absolute path:" + searched
115
+ )
116
+
117
+ self.model = dac.DAC.load(final_model_path)
118
+ self.resampler = dict()
119
+ if IS_CUDA:
120
+ self.model = self.model.to("cuda")
121
+ else:
122
+ self.model = self.model.to("npu")
123
+
124
+ def encode(self, audio_path):
125
+ signal = AudioSignal(audio_path)
126
+ if signal.audio_data.shape[1] == 2:
127
+ signal.audio_data = 0.5 * (signal.audio_data[:, :1, :] + signal.audio_data[:, 1:, :])
128
+ signal.to(self.model.device)
129
+
130
+ if signal.sample_rate != 16000:
131
+ if not str(signal.sample_rate) in self.resampler:
132
+ self.resampler[str(signal.sample_rate)] = torchaudio.transforms.Resample(signal.sample_rate, 16000)
133
+ if IS_CUDA:
134
+ self.resampler[str(signal.sample_rate)] = self.resampler[str(signal.sample_rate)].cuda()
135
+ else:
136
+ self.resampler[str(signal.sample_rate)] = self.resampler[str(signal.sample_rate)].npu()
137
+
138
+ signal.audio_data = self.resampler[str(signal.sample_rate)](signal.audio_data)
139
+ signal.sample_rate = 16000
140
+
141
+ x = self.model.preprocess(signal.audio_data.to(self.model.device), signal.sample_rate)
142
+ z, codes, latents, _, _ = self.model.encode(x)
143
+
144
+ codes = codes[0].clone().detach().transpose(0, 1)
145
+ assert codes.shape[1] == 12 and len(codes.shape) == 2
146
+ codes = codes.tolist()
147
+
148
+ return codes
149
+
150
+ def decode(self, codes, save_path, min_duration=None):
151
+ assert codes.shape[0] == 1 and codes.shape[1] == 12
152
+ z, _, _ = self.model.quantizer.from_codes(codes.to(self.model.device))
153
+ audio_out = self.model.decode(z)[0].detach().cpu()
154
+
155
+ sample_rate = 16000
156
+ duration = audio_out.size(1) / sample_rate
157
+ if min_duration is not None and duration < min_duration:
158
+ padding_duration = min_duration - duration
159
+ padding_samples = int(padding_duration * sample_rate)
160
+ padding = torch.zeros((audio_out.size(0), padding_samples), dtype=audio_out.dtype, device=audio_out.device)
161
+ audio_out = torch.cat((audio_out, padding), dim=1)
162
+
163
+ torchaudio.save(save_path, audio_out.detach().cpu(), sample_rate=16000, encoding="PCM_S", bits_per_sample=16)
164
+
165
+
166
+ def build_delay_indices(B: int, T: int, C: int, delay_pattern: List[int]) -> Tuple[torch.Tensor, torch.Tensor]:
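+ # precompute gather indices that shift channel c back by delay_pattern[c] steps; apply_audio_delay later maps negative times to BOS and times past the end to PAD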
167
+ delay_arr = torch.tensor(delay_pattern, dtype=torch.int32)
168
+
169
+ t_idx_BxT = torch.broadcast_to(
170
+ torch.arange(T, dtype=torch.int32)[None, :],
171
+ [B, T],
172
+ )
173
+ t_idx_BxTx1 = t_idx_BxT[..., None]
174
+ t_idx_BxTxC = t_idx_BxTx1 - delay_arr.view(1, 1, C)
175
+
176
+ b_idx_BxTxC = torch.broadcast_to(
177
+ torch.arange(B, dtype=torch.int32).view(B, 1, 1),
178
+ [B, T, C],
179
+ )
180
+ c_idx_BxTxC = torch.broadcast_to(
181
+ torch.arange(C, dtype=torch.int32).view(1, 1, C),
182
+ [B, T, C],
183
+ )
184
+ t_clamped_BxTxC = torch.clamp(t_idx_BxTxC, 0, T - 1)
185
+ indices_BTCx3 = torch.stack(
186
+ [
187
+ b_idx_BxTxC.reshape(-1),
188
+ t_clamped_BxTxC.reshape(-1),
189
+ c_idx_BxTxC.reshape(-1),
190
+ ],
191
+ dim=1,
192
+ ).long()
193
+
194
+ return t_idx_BxTxC, indices_BTCx3
195
+
196
+
197
+ def apply_audio_delay(audio_BxTxC: torch.Tensor, pad_value: int, bos_value: int, precomp: Tuple[torch.Tensor, torch.Tensor]) -> torch.Tensor:
198
+ device = audio_BxTxC.device
199
+ t_idx_BxTxC, indices_BTCx3 = precomp
200
+ t_idx_BxTxC = t_idx_BxTxC.to(device)
201
+ indices_BTCx3 = indices_BTCx3.to(device)
202
+ gathered_flat = audio_BxTxC[indices_BTCx3[:, 0], indices_BTCx3[:, 1], indices_BTCx3[:, 2]]
203
+ gathered_BxTxC = gathered_flat.view(audio_BxTxC.shape)
204
+ mask_bos = t_idx_BxTxC < 0
205
+ mask_pad = t_idx_BxTxC >= audio_BxTxC.shape[1]
206
+
207
+ bos_tensor = torch.tensor(bos_value, dtype=audio_BxTxC.dtype, device=device)
208
+ pad_tensor = torch.tensor(pad_value, dtype=audio_BxTxC.dtype, device=device)
209
+
210
+ result_BxTxC = torch.where(mask_bos, bos_tensor, torch.where(mask_pad, pad_tensor, gathered_BxTxC))
211
+
212
+ return result_BxTxC
213
+
214
+
215
+ def build_revert_indices(B: int, T: int, C: int, delay_pattern: List[int]) -> Tuple[torch.Tensor, torch.Tensor]:
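+ # inverse of the delay: gather channel c from time t + delay_pattern[c] (clamped to the last frame) so the delayed codes are re-aligned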
216
+ device = None
217
+ delay_arr = torch.tensor(delay_pattern, dtype=torch.int32, device=device)
218
+ t_idx_BT1 = torch.broadcast_to(torch.arange(T, device=device).unsqueeze(0), [B, T])
219
+ t_idx_BT1 = t_idx_BT1.unsqueeze(-1)
220
+ t_idx_BxTxC = torch.minimum(
221
+ t_idx_BT1 + delay_arr.view(1, 1, C),
222
+ torch.tensor(T - 1, device=device),
223
+ )
224
+ b_idx_BxTxC = torch.broadcast_to(torch.arange(B, device=device).view(B, 1, 1), [B, T, C])
225
+ c_idx_BxTxC = torch.broadcast_to(torch.arange(C, device=device).view(1, 1, C), [B, T, C])
226
+ indices_BTCx3 = torch.stack(
227
+ [
228
+ b_idx_BxTxC.reshape(-1),
229
+ t_idx_BxTxC.reshape(-1),
230
+ c_idx_BxTxC.reshape(-1),
231
+ ],
232
+ axis=1,
233
+ ).long()
234
+
235
+ return t_idx_BxTxC, indices_BTCx3
236
+
237
+
238
+ def revert_audio_delay(
239
+ audio_BxTxC: torch.Tensor,
240
+ pad_value: int,
241
+ precomp: Tuple[torch.Tensor, torch.Tensor],
242
+ T: int,
243
+ ) -> torch.Tensor:
244
+ t_idx_BxTxC, indices_BTCx3 = precomp
245
+ device = audio_BxTxC.device
246
+ t_idx_BxTxC = t_idx_BxTxC.to(device)
247
+ indices_BTCx3 = indices_BTCx3.to(device)
248
+ gathered_flat = audio_BxTxC[indices_BTCx3[:, 0], indices_BTCx3[:, 1], indices_BTCx3[:, 2]]
249
+ gathered_BxTxC = gathered_flat.view(audio_BxTxC.size())
250
+
251
+ pad_tensor = torch.tensor(pad_value, dtype=audio_BxTxC.dtype, device=device)
252
+ T_tensor = torch.tensor(T, device=device)
253
+
254
+ result_BxTxC = torch.where(t_idx_BxTxC >= T_tensor, pad_tensor, gathered_BxTxC)
255
+
256
+ return result_BxTxC
257
+
258
+
259
+ def prepare_audio_prompt(model, audio_prompts: list[torch.Tensor]):
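+ # build the generation prefill: BOS at step 0, optional prompt codes afterwards, padded with -1, then the delay pattern is applied; also returns each sample's prefill length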
260
+ num_channels = model.config.codec_channels
261
+ audio_bos_value = model.config.codec_bos_value
262
+ delay_pattern = model.config.codec_delay_pattern
263
+ max_delay_pattern = max(delay_pattern)
264
+ batch_size = len(audio_prompts)
265
+ max_len = max(p.shape[0] if p is not None else 0 for p in audio_prompts) + max_delay_pattern + 1
266
+ prefill_steps = []
267
+ prefill = torch.full(
268
+ (batch_size, max_len, num_channels),
269
+ fill_value=-1,
270
+ dtype=torch.int,
271
+ device=model.device,
272
+ )
273
+ prefill[:, 0, :] = audio_bos_value
274
+ for i in range(batch_size):
275
+ prompt = audio_prompts[i]
276
+ if prompt is not None:
277
+ prompt = prompt.to(device=model.device, dtype=torch.int)
278
+ prefill[i, 1 : prompt.shape[0] + 1, :] = prompt
279
+ prefill_steps.append(prompt.shape[0] + 1)
280
+ else:
281
+ prefill_steps.append(1)
282
+
283
+ delay_precomp = build_delay_indices(
284
+ B=batch_size,
285
+ T=max_len,
286
+ C=num_channels,
287
+ delay_pattern=delay_pattern,
288
+ )
289
+
290
+ delayed_batch = apply_audio_delay(
291
+ audio_BxTxC=prefill,
292
+ pad_value=-1,
293
+ bos_value=audio_bos_value,
294
+ precomp=delay_precomp,
295
+ )
296
+
297
+ return delayed_batch, prefill_steps
298
+
299
+
300
+ class DecoderOutput:
+     def __init__(self, prefill, prefill_steps, device: torch.device, labels_prefill=None):
+         self.generated_tokens = prefill
+         self.prefill_steps = prefill_steps
+         self.labels_prefill = labels_prefill
+         self.device = device
+
+     def get_tokens_at(self, step_from: int, step_to: int = None) -> torch.Tensor:
+         if step_to is None:
+             step_to = step_from + 1
+         return self.generated_tokens[:, step_from:step_to, :].to(self.device)
+
+     def get_labels_at(self, step_from: int, step_to: int = None) -> torch.Tensor:
+         if step_to is None:
+             step_to = step_from + 1
+         if self.labels_prefill is None:
+             return None
+         return self.labels_prefill[:, step_from:step_to, :].to(self.device)
+
+     def update_one(self, dec_out: torch.Tensor, step: int, apply_mask: bool = False):
+         dec_out = dec_out.to(self.generated_tokens.dtype).to(self.generated_tokens.device)
+         if apply_mask:
+             # during prefill, only overwrite positions still marked -1; keep prompt tokens
+             assert step < self.generated_tokens.shape[1]
+             mask = self.generated_tokens[:, step, :] == -1
+             self.generated_tokens[:, step, :] = torch.where(mask, dec_out, self.generated_tokens[:, step, :])
+         else:
+             # past the prefill, append a new step
+             assert step == self.generated_tokens.shape[1]
+             self.generated_tokens = torch.cat((self.generated_tokens, dec_out[:, None, :]), dim=1)
+
+
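# Sketch of the masked update in DecoderOutput.update_one (values are illustrative): during
# the prefill steps only positions still marked -1 are overwritten, so prompt tokens survive.
import torch

existing = torch.tensor([[5, -1, -1]])   # one step of generated_tokens, shape [B, C]
dec_out = torch.tensor([[9, 9, 9]])      # tokens decoded for this step
mask = existing == -1
updated = torch.where(mask, dec_out, existing)
# updated == tensor([[5, 9, 9]])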
+ def generate_output(model, generated_codes: torch.Tensor, lengths_Bx: torch.Tensor) -> list[torch.Tensor]:
+     num_channels = model.config.codec_channels
+     batch_size = generated_codes.shape[0]
+     seq_length = generated_codes.shape[1]
+     delay_pattern = model.config.codec_delay_pattern
+     audio_pad_value = model.config.codec_pad_value
+     max_delay_pattern = max(delay_pattern)
+     revert_precomp = build_revert_indices(
+         B=batch_size,
+         T=seq_length,
+         C=num_channels,
+         delay_pattern=delay_pattern,
+     )
+     codebook = revert_audio_delay(
+         audio_BxTxC=generated_codes,
+         pad_value=audio_pad_value,
+         precomp=revert_precomp,
+         T=seq_length,
+     )[:, :-max_delay_pattern, :]
+
+     audios = []
+     for i in range(batch_size):
+         audios.append(codebook[i, : lengths_Bx[i], :].cpu())
+
+     return audios
+
+ def frame_process(images, **ele):
+     images = [torchvision.transforms.functional.pil_to_tensor(img) for img in images]
+     video = torch.stack(images, dim=0)
+
+     # copy from fetch_video
+     nframes, _, height, width = video.shape
+     min_pixels = ele.get("min_pixels", VIDEO_MIN_PIXELS)
+     total_pixels = ele.get("total_pixels", VIDEO_TOTAL_PIXELS)
+     max_pixels = max(min(VIDEO_MAX_PIXELS, total_pixels / nframes * FRAME_FACTOR), int(min_pixels * 1.05))
+     max_pixels_supposed = ele.get("max_pixels", max_pixels)
+     if max_pixels_supposed > max_pixels:
+         print(f"The given max_pixels[{max_pixels_supposed}] exceeds limit[{max_pixels}].")
+     max_pixels = min(max_pixels_supposed, max_pixels)
+     if "resized_height" in ele and "resized_width" in ele:
+         resized_height, resized_width = smart_resize(
+             ele["resized_height"],
+             ele["resized_width"],
+             factor=IMAGE_FACTOR,
+         )
+     else:
+         resized_height, resized_width = smart_resize(
+             height,
+             width,
+             factor=IMAGE_FACTOR,
+             min_pixels=min_pixels,
+             max_pixels=max_pixels,
+         )
+     video = transforms.functional.resize(
+         video,
+         [resized_height, resized_width],
+         interpolation=InterpolationMode.BICUBIC,
+         antialias=True,
+     ).float()
+     return video
+
+ def preprocess_codec(model, codec):
+     """Apply the per-channel delay pattern to a codec prompt: channel c starts with
+     delay_pattern[c] + 1 BOS tokens, then the codec tokens, then one EOS (when it fits)
+     followed by PAD."""
+     codec_token = torch.tensor(codec, dtype=torch.long)
+     codec_token_len = codec_token.shape[0]
+     max_delay_pattern = max(model.config.codec_delay_pattern)
+     codec_input_ids = torch.zeros((codec_token_len + max_delay_pattern + 1, model.num_channels), dtype=torch.long)
+
+     for c in range(model.num_channels):
+         start = model.config.codec_delay_pattern[c] + 1
+         codec_input_ids[:start, c] = model.config.codec_bos_value
+         codec_input_ids[start : start + codec_token_len, c] = codec_token[:, c]
+         codec_input_ids[start + codec_token_len :, c] = model.config.codec_pad_value
+         if start + codec_token_len < codec_input_ids.shape[0]:
+             codec_input_ids[start + codec_token_len, c] = model.config.codec_eos_value
+
+     return codec_input_ids
+
+
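# Worked example of the layout preprocess_codec produces, assuming 3 channels,
# delay_pattern = [0, 1, 2] and a 4-frame codec prompt (these numbers are illustrative):
#
#   step:       0    1    2    3    4    5    6
#   channel 0:  BOS  c0   c1   c2   c3   EOS  PAD
#   channel 1:  BOS  BOS  c0   c1   c2   c3   EOS
#   channel 2:  BOS  BOS  BOS  c0   c1   c2   c3
#
# A quick check with a mock model namespace (the token ids below are placeholders):
from types import SimpleNamespace
import torch

mock_model = SimpleNamespace(
    num_channels=3,
    config=SimpleNamespace(
        codec_delay_pattern=[0, 1, 2],
        codec_bos_value=1025,
        codec_eos_value=1024,
        codec_pad_value=1026,
    ),
)
codes = [[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]]   # 4 frames x 3 channels
print(preprocess_codec(mock_model, codes))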
+ def tts_preprocess(batch_caption, prompt_codec, prompt_text, device):
+     text_input = []
+     codec_input_ids = []
+     for caption in batch_caption:
+         prompt_caption = "<|SPEECH_PROMPT_START|>" + prompt_text + "<|SPEECH_PROMPT_END|>"
+         prompt_caption += "<|VOICE_PROMPT_START|>" + "<|AUDIO_PLACEHOLDER|>" * prompt_codec.shape[0] + "<|VOICE_PROMPT_END|>"
+         prompt_caption_fn = lambda x: prompt_caption + "<|SPEECH_START|>" + x + "<|SPEECH_END|>"
+
+         # three inputs per caption: one without the voice-prompt audio, one without the target
+         # text, and the full conditional input; only the last two contain <|AUDIO_PLACEHOLDER|>
+         # tokens, so the codec prompt is appended twice
+         text_input.append(SYSTEM_MESSAGE + INPUT_FORMAT.format(f"<|SPEECH_PROMPT_START|>{prompt_text}<|SPEECH_PROMPT_END|><|VOICE_PROMPT_START|><|VOICE_PROMPT_END|><|SPEECH_START|>{caption}<|SPEECH_END|>") + AUDIO_START)
+         text_input.append(SYSTEM_MESSAGE + INPUT_FORMAT.format(prompt_caption_fn("")) + AUDIO_START)
+         text_input.append(SYSTEM_MESSAGE + INPUT_FORMAT.format(prompt_caption_fn(caption)) + AUDIO_START)
+         codec_input_ids.append(prompt_codec.clone())
+         codec_input_ids.append(prompt_codec.clone())
+
+     codec_input_ids = torch.cat(codec_input_ids, dim=0).to(device)
+
+     tts_generation_kwargs = {
+         "codec_input_ids": codec_input_ids,
+         "cfg_scale": [2, 3],
+         "neg_input_size": 3,
+     }
+
+     return text_input, tts_generation_kwargs
+
+
+ def t2m_preprocess(batch_caption):
+     text_input = []
+     for caption in batch_caption:
+         # per caption: a "Low quality." negative prompt and the real caption, used as a CFG pair
+         text_input.append(SYSTEM_MESSAGE + INPUT_FORMAT.format("<|MUSIC_START|>" + "Low quality." + "<|MUSIC_END|>") + AUDIO_START)
+         text_input.append(SYSTEM_MESSAGE + INPUT_FORMAT.format("<|MUSIC_START|>" + caption + "<|MUSIC_END|>") + AUDIO_START)
+
+     t2m_generation_kwargs = {
+         "cfg_scale": 10,
+         "neg_input_size": 2,
+     }
+
+     return text_input, t2m_generation_kwargs
+
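# The "Low quality." / caption pair above is a standard classifier-free-guidance setup. The
# actual combination happens inside the model's generation loop (not shown in this file);
# a common formulation, shown here only as a sketch, is:
import torch

cfg_scale = 10
neg_logits = torch.randn(1, 1027)   # logits from the "Low quality." branch (dummy vocab size)
pos_logits = torch.randn(1, 1027)   # logits from the caption branch
guided_logits = neg_logits + cfg_scale * (pos_logits - neg_logits)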
+ def v2m_preprocess(batch_caption, batch_video, fps=1):
+
+     def extract_images_from_video(video_path, fps=1, max_frames=1):
+         video = VideoFileClip(video_path)
+         duration = video.duration
+
+         # extract frames at the requested fps, stopping after max_frames
+         images = []
+         for i, t in enumerate(range(0, math.ceil(duration * fps))):
+             time_in_video = t / fps
+             frame = video.get_frame(time_in_video)
+             img = Image.fromarray(frame)
+             images.append(img)
+
+             if max_frames is not None and i >= max_frames - 1:
+                 break
+
+         return images
+
+     text_input = []
+     video_inputs = []
+     fps_inputs = []
+
+     for caption, video in zip(batch_caption, batch_video):
+         text_input.append(SYSTEM_MESSAGE + INPUT_FORMAT.format("<|MUSIC_START|>" + "Low quality." + "<|MUSIC_END|>") + AUDIO_START)
+         text_input.append(SYSTEM_MESSAGE + INPUT_FORMAT.format("<|MUSIC_START|>" + caption + "<|MUSIC_END|>") + AUDIO_START)
+
+         video_input = frame_process(
+             extract_images_from_video(video, fps),
+             fps=fps,
+         )
+
+         # the negative and positive branches share the same video frames and fps
+         video_inputs.append(video_input)
+         video_inputs.append(video_input)
+
+         fps_inputs.append(fps)
+         fps_inputs.append(fps)
+
+     t2m_generation_kwargs = {
+         "cfg_scale": 10,
+         "neg_input_size": 2,
+     }
+
+     return text_input, video_inputs, fps_inputs, t2m_generation_kwargs
video_preprocessor_config (1).json ADDED
@@ -0,0 +1,43 @@
+ {
+   "crop_size": null,
+   "data_format": "channels_first",
+   "default_to_square": true,
+   "device": null,
+   "do_center_crop": null,
+   "do_convert_rgb": true,
+   "do_normalize": true,
+   "do_pad": null,
+   "do_rescale": true,
+   "do_resize": true,
+   "do_sample_frames": false,
+   "fps": null,
+   "image_mean": [
+     0.48145466,
+     0.4578275,
+     0.40821073
+   ],
+   "image_std": [
+     0.26862954,
+     0.26130258,
+     0.27577711
+   ],
+   "input_data_format": null,
+   "max_frames": 768,
+   "max_pixels": 12845056,
+   "merge_size": 2,
+   "min_frames": 4,
+   "min_pixels": 3136,
+   "num_frames": null,
+   "patch_size": 14,
+   "processor_class": "Qwen2_5_VLProcessor",
+   "resample": 3,
+   "rescale_factor": 0.00392156862745098,
+   "size": {
+     "longest_edge": 12845056,
+     "shortest_edge": 3136
+   },
+   "size_divisor": null,
+   "temporal_patch_size": 2,
+   "video_metadata": null,
+   "video_processor_type": "Qwen2VLVideoProcessor"
+ }
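# How the patch_size / merge_size / temporal_patch_size fields above relate to visual token
# counts, following the usual Qwen2-VL accounting (this mapping is an assumption; the config
# itself only stores the raw values):
patch_size = 14           # "patch_size"
merge_size = 2            # "merge_size"
temporal_patch_size = 2   # frames are grouped in pairs along time

resized_height, resized_width = 448, 448         # example frame size, a multiple of patch_size * merge_size
grid_h = resized_height // patch_size            # 32 patches vertically
grid_w = resized_width // patch_size             # 32 patches horizontally
tokens_per_frame_pair = (grid_h * grid_w) // (merge_size ** 2)   # 256 tokens per temporal patch
print(tokens_per_frame_pair)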
vocab.json CHANGED
The diff for this file is too large to render. See raw diff