zhouzaida committed · Commit 9e6c322 · Parent(s): a98910a

can set attn_implementation

Browse files
- configuration_kimi_vl.py +33 -21
- modeling_kimi_vl.py +9 -10
configuration_kimi_vl.py
CHANGED
@@ -6,6 +6,7 @@ logger = logging.get_logger(__name__)
 
 DEEPSEEK_PRETRAINED_CONFIG_ARCHIVE_MAP = {}
 
+
 class DeepseekV3Config(PretrainedConfig):
     r"""
     This is the configuration class to store the configuration of a [`DeepseekV3Model`]. It is used to instantiate an DeepSeek
@@ -122,30 +123,30 @@ class DeepseekV3Config(PretrainedConfig):
         vocab_size=129280,
         hidden_size=7168,
         intermediate_size=18432,
-        moe_intermediate_size
+        moe_intermediate_size=2048,
         num_hidden_layers=61,
         num_nextn_predict_layers=1,
         num_attention_heads=128,
         num_key_value_heads=128,
-        n_shared_experts
-        n_routed_experts
-        ep_size
-        routed_scaling_factor
-        kv_lora_rank
-        q_lora_rank
-        qk_rope_head_dim
-        v_head_dim
-        qk_nope_head_dim
-        topk_method
-        n_group
-        topk_group
-        num_experts_per_tok
-        moe_layer_freq
-        first_k_dense_replace
-        norm_topk_prob
-        scoring_func
-        aux_loss_alpha
-        seq_aux
+        n_shared_experts=1,
+        n_routed_experts=256,
+        ep_size=1,
+        routed_scaling_factor=2.5,
+        kv_lora_rank=512,
+        q_lora_rank=1536,
+        qk_rope_head_dim=64,
+        v_head_dim=128,
+        qk_nope_head_dim=128,
+        topk_method="noaux_tc",
+        n_group=8,
+        topk_group=4,
+        num_experts_per_tok=8,
+        moe_layer_freq=1,
+        first_k_dense_replace=3,
+        norm_topk_prob=True,
+        scoring_func="sigmoid",
+        aux_loss_alpha=0.001,
+        seq_aux=True,
         hidden_act="silu",
         max_position_embeddings=4096,
         initializer_range=0.02,
@@ -252,7 +253,7 @@ class KimiVLConfig(PretrainedConfig):
         ignore_index: int = -100,
         media_placeholder_token_id: int = 163605,
         pad_token_id: int = 0,
-        **kwargs
+        **kwargs,
     ):
         if vision_config is None:
             vision_config = MoonViTConfig()
@@ -269,4 +270,15 @@
         self.ignore_index = ignore_index
         self.media_placeholder_token_id = media_placeholder_token_id
 
+        attn_implementation = kwargs.get("attn_implementation")
+        if attn_implementation is not None:
+            if attn_implementation in ["eager", "flash_attention_2"]:
+                self._attn_implementation = attn_implementation
+                self.vision_config._attn_implementation = attn_implementation
+                self.text_config._attn_implementation = attn_implementation
+            else:
+                raise ValueError(
+                    f"Invalid attention implementation: {attn_implementation}"
+                )
+
         super().__init__(pad_token_id=pad_token_id, **kwargs)
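A minimal usage sketch for the new attn_implementation handling in KimiVLConfig.__init__ (not part of this commit). It assumes configuration_kimi_vl.py is importable, e.g. from a local checkout of this repo, and relies on the defaults shown above to build the vision and text sub-configs.

# Sketch: exercising the new attn_implementation kwarg (assumed local import path).
from configuration_kimi_vl import KimiVLConfig

# "eager" and "flash_attention_2" are accepted and mirrored onto both sub-configs.
config = KimiVLConfig(attn_implementation="flash_attention_2")
assert config._attn_implementation == "flash_attention_2"
assert config.vision_config._attn_implementation == "flash_attention_2"
assert config.text_config._attn_implementation == "flash_attention_2"

# Any other value is rejected by the new validation branch.
try:
    KimiVLConfig(attn_implementation="sdpa")
except ValueError as err:
    print(err)  # Invalid attention implementation: sdpa

Note that "sdpa" is deliberately absent from the accepted values, which matches the removal of the SDPA path from modeling_kimi_vl.py below.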
modeling_kimi_vl.py
CHANGED
@@ -145,19 +145,13 @@ def multihead_attention(
     return attn_out
 
 
-def sdpa_attention(
+def eager_attention(
     q: torch.Tensor,
     k: torch.Tensor,
     v: torch.Tensor,
     q_cu_seqlens: Optional[torch.Tensor] = None,
     k_cu_seqlens: Optional[torch.Tensor] = None,
 ) -> torch.Tensor:
-    """SDPA attention.
-
-    Args:
-        q, k, v: tensor of shape (batch_size, seqlen, num_heads, head_dim),
-            or (tot_seqlens, num_heads, head_dim) if packing.
-    """
     seq_length = q.shape[0]
     attention_mask = torch.zeros(
         [1, seq_length, seq_length], device=q.device, dtype=torch.bool
@@ -171,7 +165,12 @@ def sdpa_attention(
     q = q.transpose(0, 1)
     k = k.transpose(0, 1)
     v = v.transpose(0, 1)
-
+
+    attn_weight = q @ k.transpose(-2, -1) / math.sqrt(q.shape[-1])
+    attn_weight += attention_mask
+    attn_weight = torch.softmax(attn_weight, dim=-1, dtype=torch.float32).to(q.dtype)
+
+    attn_output = attn_weight @ v
     attn_output = attn_output.transpose(0, 1)
     attn_output = attn_output.reshape(seq_length, -1)
     return attn_output
@@ -179,7 +178,7 @@ sdpa_attention(
 
 VL_VISION_ATTENTION_FUNCTIONS = {
     "flash_attention_2": multihead_attention,
-    "sdpa": sdpa_attention,
+    "eager": eager_attention,
 }
 
 
@@ -412,7 +411,7 @@ class MoonVitEncoderLayer(nn.Module):
         hidden_dim: int,
         mlp_dim: int,
         *,
-        attn_implementation: str = "sdpa",
+        attn_implementation: str = "eager",
         activation=F.gelu,
         attn_bias: bool = False,
     ):
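A standalone numerical sketch of the eager attention math introduced above (not part of the commit): scores q·kᵀ/√d, an additive block-diagonal mask for packed sequences, a float32 softmax, and a final weighting of v. The shapes, mask construction, and the comparison against PyTorch's scaled_dot_product_attention are illustrative assumptions; it only checks that the eager formula reproduces the SDPA result the removed sdpa_attention relied on.

import math
import torch
import torch.nn.functional as F

torch.manual_seed(0)
num_heads, head_dim = 4, 32
seqlens = [3, 5]          # two sequences packed along one dimension
total = sum(seqlens)

# (num_heads, total_seq, head_dim): the layout reached after transpose(0, 1) in the diff.
q = torch.randn(num_heads, total, head_dim)
k = torch.randn(num_heads, total, head_dim)
v = torch.randn(num_heads, total, head_dim)

# Additive block-diagonal mask: 0 inside each sequence, -inf across sequences.
mask = torch.full((total, total), float("-inf"))
start = 0
for n in seqlens:
    mask[start:start + n, start:start + n] = 0.0
    start += n

# Eager path, mirroring the added lines: matmul, mask, float32 softmax, matmul.
attn_weight = q @ k.transpose(-2, -1) / math.sqrt(head_dim)
attn_weight = attn_weight + mask
attn_weight = torch.softmax(attn_weight, dim=-1, dtype=torch.float32).to(q.dtype)
eager_out = attn_weight @ v

# Reference: PyTorch SDPA with the same additive mask.
sdpa_out = F.scaled_dot_product_attention(q, k, v, attn_mask=mask)

print(torch.allclose(eager_out, sdpa_out, atol=1e-5))  # expected: True

The eager_attention added by the commit additionally derives its mask from q_cu_seqlens and reshapes the output back to (seq_len, num_heads * head_dim); this sketch keeps only the core computation.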