zhouzaida committed · Commit 9e6c322 · Parent(s): a98910a

can set attn_implementation

Browse files
- configuration_kimi_vl.py +33 -21
- modeling_kimi_vl.py +9 -10
configuration_kimi_vl.py
CHANGED
@@ -6,6 +6,7 @@ logger = logging.get_logger(__name__)
 
 DEEPSEEK_PRETRAINED_CONFIG_ARCHIVE_MAP = {}
 
+
 class DeepseekV3Config(PretrainedConfig):
     r"""
     This is the configuration class to store the configuration of a [`DeepseekV3Model`]. It is used to instantiate an DeepSeek
@@ -122,30 +123,30 @@ class DeepseekV3Config(PretrainedConfig):
         vocab_size=129280,
         hidden_size=7168,
         intermediate_size=18432,
-        moe_intermediate_size
+        moe_intermediate_size=2048,
         num_hidden_layers=61,
         num_nextn_predict_layers=1,
         num_attention_heads=128,
         num_key_value_heads=128,
-        n_shared_experts
-        n_routed_experts
-        ep_size
-        routed_scaling_factor
-        kv_lora_rank
-        q_lora_rank
-        qk_rope_head_dim
-        v_head_dim
-        qk_nope_head_dim
-        topk_method
-        n_group
-        topk_group
-        num_experts_per_tok
-        moe_layer_freq
-        first_k_dense_replace
-        norm_topk_prob
-        scoring_func
-        aux_loss_alpha
-        seq_aux
+        n_shared_experts=1,
+        n_routed_experts=256,
+        ep_size=1,
+        routed_scaling_factor=2.5,
+        kv_lora_rank=512,
+        q_lora_rank=1536,
+        qk_rope_head_dim=64,
+        v_head_dim=128,
+        qk_nope_head_dim=128,
+        topk_method="noaux_tc",
+        n_group=8,
+        topk_group=4,
+        num_experts_per_tok=8,
+        moe_layer_freq=1,
+        first_k_dense_replace=3,
+        norm_topk_prob=True,
+        scoring_func="sigmoid",
+        aux_loss_alpha=0.001,
+        seq_aux=True,
         hidden_act="silu",
         max_position_embeddings=4096,
         initializer_range=0.02,
@@ -252,7 +253,7 @@ class KimiVLConfig(PretrainedConfig):
         ignore_index: int = -100,
         media_placeholder_token_id: int = 163605,
         pad_token_id: int = 0,
-        **kwargs
+        **kwargs,
     ):
         if vision_config is None:
             vision_config = MoonViTConfig()
@@ -269,4 +270,15 @@
         self.ignore_index = ignore_index
         self.media_placeholder_token_id = media_placeholder_token_id
 
+        attn_implementation = kwargs.get("attn_implementation")
+        if attn_implementation is not None:
+            if attn_implementation in ["eager", "flash_attention_2"]:
+                self._attn_implementation = attn_implementation
+                self.vision_config._attn_implementation = attn_implementation
+                self.text_config._attn_implementation = attn_implementation
+            else:
+                raise ValueError(
+                    f"Invalid attention implementation: {attn_implementation}"
+                )
+
         super().__init__(pad_token_id=pad_token_id, **kwargs)
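A minimal usage sketch for the new attn_implementation handling in KimiVLConfig.__init__ (not part of this commit). It assumes configuration_kimi_vl.py is importable, e.g. from a local checkout of this repo, and relies on the defaults shown above to build the vision and text sub-configs.

# Sketch: exercising the new attn_implementation kwarg (assumed local import path).
from configuration_kimi_vl import KimiVLConfig

# "eager" and "flash_attention_2" are accepted and mirrored onto both sub-configs.
config = KimiVLConfig(attn_implementation="flash_attention_2")
assert config._attn_implementation == "flash_attention_2"
assert config.vision_config._attn_implementation == "flash_attention_2"
assert config.text_config._attn_implementation == "flash_attention_2"

# Any other value is rejected by the new validation branch.
try:
    KimiVLConfig(attn_implementation="sdpa")
except ValueError as err:
    print(err)  # Invalid attention implementation: sdpa

Note that "sdpa" is deliberately absent from the accepted values, which matches the removal of the SDPA path from modeling_kimi_vl.py below.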
modeling_kimi_vl.py
CHANGED
@@ -145,19 +145,13 @@ def multihead_attention(
     return attn_out
 
 
-def sdpa_attention(
+def eager_attention(
     q: torch.Tensor,
     k: torch.Tensor,
     v: torch.Tensor,
     q_cu_seqlens: Optional[torch.Tensor] = None,
     k_cu_seqlens: Optional[torch.Tensor] = None,
 ) -> torch.Tensor:
-    """SDPA attention.
-
-    Args:
-        q, k, v: tensor of shape (batch_size, seqlen, num_heads, head_dim),
-            or (tot_seqlens, num_heads, head_dim) if packing.
-    """
     seq_length = q.shape[0]
     attention_mask = torch.zeros(
         [1, seq_length, seq_length], device=q.device, dtype=torch.bool
@@ -171,7 +165,12 @@ def sdpa_attention(
     q = q.transpose(0, 1)
     k = k.transpose(0, 1)
     v = v.transpose(0, 1)
-
+
+    attn_weight = q @ k.transpose(-2, -1) / math.sqrt(q.shape[-1])
+    attn_weight += attention_mask
+    attn_weight = torch.softmax(attn_weight, dim=-1, dtype=torch.float32).to(q.dtype)
+
+    attn_output = attn_weight @ v
     attn_output = attn_output.transpose(0, 1)
     attn_output = attn_output.reshape(seq_length, -1)
     return attn_output
@@ -179,7 +178,7 @@ sdpa_attention(
 
 VL_VISION_ATTENTION_FUNCTIONS = {
     "flash_attention_2": multihead_attention,
-    "sdpa": sdpa_attention,
+    "eager": eager_attention,
 }
 
 
@@ -412,7 +411,7 @@ class MoonVitEncoderLayer(nn.Module):
         hidden_dim: int,
         mlp_dim: int,
         *,
-        attn_implementation: str = "sdpa",
+        attn_implementation: str = "eager",
         activation=F.gelu,
         attn_bias: bool = False,
     ):
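A standalone numerical sketch of the eager attention math introduced above (not part of the commit): scores q·kᵀ/√d, an additive block-diagonal mask for packed sequences, a float32 softmax, and a final weighting of v. The shapes, mask construction, and the comparison against PyTorch's scaled_dot_product_attention are illustrative assumptions; it only checks that the eager formula reproduces the SDPA result the removed sdpa_attention relied on.

import math
import torch
import torch.nn.functional as F

torch.manual_seed(0)
num_heads, head_dim = 4, 32
seqlens = [3, 5]          # two sequences packed along one dimension
total = sum(seqlens)

# (num_heads, total_seq, head_dim): the layout reached after transpose(0, 1) in the diff.
q = torch.randn(num_heads, total, head_dim)
k = torch.randn(num_heads, total, head_dim)
v = torch.randn(num_heads, total, head_dim)

# Additive block-diagonal mask: 0 inside each sequence, -inf across sequences.
mask = torch.full((total, total), float("-inf"))
start = 0
for n in seqlens:
    mask[start:start + n, start:start + n] = 0.0
    start += n

# Eager path, mirroring the added lines: matmul, mask, float32 softmax, matmul.
attn_weight = q @ k.transpose(-2, -1) / math.sqrt(head_dim)
attn_weight = attn_weight + mask
attn_weight = torch.softmax(attn_weight, dim=-1, dtype=torch.float32).to(q.dtype)
eager_out = attn_weight @ v

# Reference: PyTorch SDPA with the same additive mask.
sdpa_out = F.scaled_dot_product_attention(q, k, v, attn_mask=mask)

print(torch.allclose(eager_out, sdpa_out, atol=1e-5))  # expected: True

The eager_attention added by the commit additionally derives its mask from q_cu_seqlens and reshapes the output back to (seq_len, num_heads * head_dim); this sketch keeps only the core computation.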