can set attn_implementation (#8)

- can set attn_implementation (9e6c3226b877a6be05e385279d56fcf26a0f9fab)
- add sdpa back (7718375747c38ef6a6e957a615edd4b3df495282)
- add blank (d869dc5ea79e62a0697c986a0fdeab12860c65bf)

Files changed (2) hide show

configuration_kimi_vl.py +33 -21
modeling_kimi_vl.py +33 -1

configuration_kimi_vl.py CHANGED Viewed

@@ -6,6 +6,7 @@ logger = logging.get_logger(__name__)
 DEEPSEEK_PRETRAINED_CONFIG_ARCHIVE_MAP = {}
 class DeepseekV3Config(PretrainedConfig):
     r"""
     This is the configuration class to store the configuration of a [`DeepseekV3Model`]. It is used to instantiate an DeepSeek
@@ -122,30 +123,30 @@ class DeepseekV3Config(PretrainedConfig):
         vocab_size=129280,
         hidden_size=7168,
         intermediate_size=18432,
-        moe_intermediate_size = 2048,
         num_hidden_layers=61,
         num_nextn_predict_layers=1,
         num_attention_heads=128,
         num_key_value_heads=128,
-        n_shared_experts = 1,
-        n_routed_experts = 256,
-        ep_size = 1,
-        routed_scaling_factor = 2.5,
-        kv_lora_rank = 512,
-        q_lora_rank = 1536,
-        qk_rope_head_dim = 64,
-        v_head_dim = 128,
-        qk_nope_head_dim = 128,
-        topk_method = 'noaux_tc',
-        n_group = 8,
-        topk_group = 4,
-        num_experts_per_tok = 8,
-        moe_layer_freq = 1,
-        first_k_dense_replace = 3,
-        norm_topk_prob = True,
-        scoring_func = 'sigmoid',
-        aux_loss_alpha = 0.001,
-        seq_aux = True,
         hidden_act="silu",
         max_position_embeddings=4096,
         initializer_range=0.02,
@@ -252,7 +253,7 @@ class KimiVLConfig(PretrainedConfig):
         ignore_index: int = -100,
         media_placeholder_token_id: int = 163605,
         pad_token_id: int = 0,
-        **kwargs
     ):
         if vision_config is None:
             vision_config = MoonViTConfig()
@@ -269,4 +270,15 @@ class KimiVLConfig(PretrainedConfig):
         self.ignore_index = ignore_index
         self.media_placeholder_token_id = media_placeholder_token_id
         super().__init__(pad_token_id=pad_token_id, **kwargs)

 DEEPSEEK_PRETRAINED_CONFIG_ARCHIVE_MAP = {}
 class DeepseekV3Config(PretrainedConfig):
     r"""
     This is the configuration class to store the configuration of a [`DeepseekV3Model`]. It is used to instantiate an DeepSeek
         vocab_size=129280,
         hidden_size=7168,
         intermediate_size=18432,
+        moe_intermediate_size=2048,
         num_hidden_layers=61,
         num_nextn_predict_layers=1,
         num_attention_heads=128,
         num_key_value_heads=128,
+        n_shared_experts=1,
+        n_routed_experts=256,
+        ep_size=1,
+        routed_scaling_factor=2.5,
+        kv_lora_rank=512,
+        q_lora_rank=1536,
+        qk_rope_head_dim=64,
+        v_head_dim=128,
+        qk_nope_head_dim=128,
+        topk_method="noaux_tc",
+        n_group=8,
+        topk_group=4,
+        num_experts_per_tok=8,
+        moe_layer_freq=1,
+        first_k_dense_replace=3,
+        norm_topk_prob=True,
+        scoring_func="sigmoid",
+        aux_loss_alpha=0.001,
+        seq_aux=True,
         hidden_act="silu",
         max_position_embeddings=4096,
         initializer_range=0.02,
         ignore_index: int = -100,
         media_placeholder_token_id: int = 163605,
         pad_token_id: int = 0,
+        **kwargs,
     ):
         if vision_config is None:
             vision_config = MoonViTConfig()
         self.ignore_index = ignore_index
         self.media_placeholder_token_id = media_placeholder_token_id
+        attn_implementation = kwargs.get("attn_implementation")
+        if attn_implementation is not None:
+            if attn_implementation in ["eager", "flash_attention_2"]:
+                self._attn_implementation = attn_implementation
+                self.vision_config._attn_implementation = attn_implementation
+                self.text_config._attn_implementation = attn_implementation
+            else:
+                raise ValueError(
+                    f"Invalid attention implementation: {attn_implementation}"
+                )
         super().__init__(pad_token_id=pad_token_id, **kwargs)

modeling_kimi_vl.py CHANGED Viewed

@@ -177,9 +177,41 @@ def sdpa_attention(
     return attn_output
 VL_VISION_ATTENTION_FUNCTIONS = {
     "flash_attention_2": multihead_attention,
     "sdpa": sdpa_attention,
 }
@@ -412,7 +444,7 @@ class MoonVitEncoderLayer(nn.Module):
         hidden_dim: int,
         mlp_dim: int,
         *,
-        attn_implementation: str = "sdpa",
         activation=F.gelu,
         attn_bias: bool = False,
     ):

     return attn_output
def eager_attention(
    q: torch.Tensor,
    k: torch.Tensor,
    v: torch.Tensor,
    q_cu_seqlens: Optional[torch.Tensor] = None,
    k_cu_seqlens: Optional[torch.Tensor] = None,
) -> torch.Tensor:
    """Compute attention over packed sequences with plain tensor ops.

    Several sequences are concatenated along the first axis; each token may
    only attend to tokens of its own sequence (block-diagonal mask).

    Args:
        q, k, v: Packed tensors indexed as (total_tokens, num_heads, head_dim).
        q_cu_seqlens: Cumulative sequence boundaries, e.g. ``[0, 2, 4]`` for
            two sequences of length 2. When ``None`` the whole input is
            treated as a single sequence (every token attends to every token).
        k_cu_seqlens: Accepted for signature compatibility with the other
            attention functions; the mask is built from ``q_cu_seqlens`` only.

    Returns:
        Tensor of shape (total_tokens, num_heads * head_dim).
    """
    seq_length = q.shape[0]

    # Boolean mask: True means "query row may attend to key column".
    if q_cu_seqlens is None:
        attention_mask = torch.ones(
            [1, seq_length, seq_length], device=q.device, dtype=torch.bool
        )
    else:
        attention_mask = torch.zeros(
            [1, seq_length, seq_length], device=q.device, dtype=torch.bool
        )
        # Pull the boundaries to Python ints once instead of indexing the
        # tensor twice per segment.
        bounds = torch.as_tensor(q_cu_seqlens).tolist()
        for start, end in zip(bounds, bounds[1:]):
            attention_mask[..., start:end, start:end] = True

    # (tokens, heads, dim) -> (heads, tokens, dim) so matmul runs per head.
    q = q.transpose(0, 1)
    k = k.transpose(0, 1)
    v = v.transpose(0, 1)

    attn_weight = q @ k.transpose(-2, -1) / math.sqrt(q.shape[-1])
    # Adding a bool mask would only shift allowed scores by +1 and leave
    # cross-sequence positions unmasked. Fill the disallowed positions with
    # the most negative finite value so softmax drives them to zero; a finite
    # fill (rather than -inf) keeps rows that are fully masked free of NaNs.
    attn_weight = attn_weight.masked_fill(
        ~attention_mask, torch.finfo(attn_weight.dtype).min
    )
    # Softmax in float32 for numerical stability, then back to input dtype.
    attn_weight = torch.softmax(attn_weight, dim=-1, dtype=torch.float32).to(q.dtype)

    attn_output = attn_weight @ v
    attn_output = attn_output.transpose(0, 1)
    attn_output = attn_output.reshape(seq_length, -1)
    return attn_output
 VL_VISION_ATTENTION_FUNCTIONS = {
     "flash_attention_2": multihead_attention,
     "sdpa": sdpa_attention,
+    "eager": eager_attention,
 }
         hidden_dim: int,
         mlp_dim: int,
         *,
+        attn_implementation: str = "eager",
         activation=F.gelu,
         attn_bias: bool = False,
     ):