Upload DogeForCausalLM
- config.json +47 -44
- configuration_doge.py +13 -13
- generation_config.json +7 -7
- modeling_doge.py +19 -18
config.json
CHANGED
@@ -1,44 +1,47 @@
-{
-  "_name_or_path": "./results/Doge-60M",
-  "architectures": [
-    "DogeForCausalLM"
-  ],
-  "attention_dropout": 0.0,
-  "auto_map": {
-    "AutoConfig": "configuration_doge.DogeConfig",
-    "AutoModelForCausalLM": "modeling_doge.DogeForCausalLM"
-  },
-  "bos_token_id": 0,
-  "dynamic_mask_ratio": 0.0,
-  "eos_token_id": 1,
-  "expert_retrieval_size": 256,
-  "hidden_act": "silu",
-  "hidden_bias": false,
-  "hidden_dropout": 0.0,
-  "hidden_size": 512,
-  "initializer_range": 0.02,
-  "intermediate_size": 1024,
-  "is_moe": false,
-  "max_position_embeddings": 2048,
-  "model_type": "doge",
-  "num_attention_heads": 4,
-  "num_cdmmoe_experts": 2048,
-  "num_cdmmoe_experts_per_head": 8,
-  "num_cdmmoe_heads": 4,
[removed lines 28-44 are truncated in the rendered diff]
+{
+  "_name_or_path": "./results/Doge-60M",
+  "architectures": [
+    "DogeForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_doge.DogeConfig",
+    "AutoModelForCausalLM": "modeling_doge.DogeForCausalLM"
+  },
+  "bos_token_id": 0,
+  "dynamic_mask_ratio": 0.0,
+  "eos_token_id": 1,
+  "expert_retrieval_size": 256,
+  "hidden_act": "silu",
+  "hidden_bias": false,
+  "hidden_dropout": 0.0,
+  "hidden_size": 512,
+  "initializer_range": 0.02,
+  "intermediate_size": 1024,
+  "is_moe": false,
+  "max_position_embeddings": 2048,
+  "model_type": "doge",
+  "num_attention_heads": 4,
+  "num_cdmmoe_experts": 2048,
+  "num_cdmmoe_experts_per_head": 8,
+  "num_cdmmoe_heads": 4,
+  "num_cdmoe_experts": 16348,
+  "num_cdmoe_experts_per_head": 8,
+  "num_cdmoe_heads": 4,
+  "num_channels": 3,
+  "num_hidden_layers": 16,
+  "num_key_value_heads": 2,
+  "pad_token_id": 2,
+  "patch_size": 16,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": {
+    "factor": 4.0,
+    "original_max_position_embeddings": 2048,
+    "rope_type": "dynamic"
+  },
+  "rope_theta": 10000.0,
+  "torch_dtype": "float32",
+  "transformers_version": "4.47.1",
+  "use_cache": true,
+  "vocab_size": 32768
+}
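Because `auto_map` routes `AutoConfig` and `AutoModelForCausalLM` to the bundled `configuration_doge.py` and `modeling_doge.py`, the checkpoint must be loaded with `trust_remote_code=True`. A minimal loading sketch, assuming the files in this commit sit together in one checkpoint directory; the `./results/Doge-60M` path is taken from `_name_or_path` above and stands in for wherever the checkpoint actually lives:

```python
from transformers import AutoConfig, AutoModelForCausalLM

checkpoint = "./results/Doge-60M"  # placeholder; substitute the actual checkpoint location

# auto_map sends these calls to configuration_doge.DogeConfig / modeling_doge.DogeForCausalLM
config = AutoConfig.from_pretrained(checkpoint, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(checkpoint, trust_remote_code=True)

print(config.hidden_size, config.num_hidden_layers, config.vocab_size)  # 512 16 32768
print(config.rope_scaling)  # {'factor': 4.0, 'original_max_position_embeddings': 2048, 'rope_type': 'dynamic'}
```

With `rope_type` set to `dynamic` and a factor of 4.0, the rotary embedding frequencies are rescaled on the fly once a sequence grows beyond the original 2048 positions, so the checkpoint is intended to be usable past `max_position_embeddings` without retraining.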
configuration_doge.py
CHANGED
@@ -40,7 +40,7 @@ class DogeConfig(PretrainedConfig):
         hidden_size (`int`, *optional*, defaults to 1024):
             Dimension of the hidden representations.
         intermediate_size (`int`, *optional*, defaults to 2048):
-            Dimension of the […]
+            Dimension of the MLP representations.
         num_hidden_layers (`int`, *optional*, defaults to 32):
             Number of hidden layers in the Transformer decoder.
         hidden_bias (`bool`, *optional*, defaults to `False`):
@@ -115,13 +115,13 @@ class DogeConfig(PretrainedConfig):
             The ratio to control the proportion of the dynamic mask filled with the minimum value.
         is_moe (`bool`, *optional*, defaults to `False`):
             Whether to use the Cross Domain Mixture of Experts, if `True`, the MoE will inherit the MLP to initialize
-        […]
-            Number of Private Experts for the Cross Domain Mixture of Experts.
-        […]
+        num_cdmoe_experts (`int`, *optional*, defaults to 16348):
+            Number of Private Experts for the Cross Domain Mixture of Experts. calculation formula: :math:`\text{num_cdmoe_experts} = (32 \times \text{num_cdmoe_heads})^2`
+        num_cdmoe_heads (`int`, *optional*, defaults to 4):
             Number of heads of Private Experts for the Cross Domain Mixture of Experts.
-        […]
+        num_cdmoe_experts_per_head (`int`, *optional*, defaults to 8):
             Number of Private Experts per head for the Cross Domain Mixture of Experts.
-        expert_retrieval_size (`int`, *optional*, defaults to […]
+        expert_retrieval_size (`int`, *optional*, defaults to 64):
             Dimension of the Expert retrieval states for the Cross Domain Mixture of Experts.
         """
 
@@ -158,10 +158,10 @@ class DogeConfig(PretrainedConfig):
         attention_dropout=0.0,
         dynamic_mask_ratio=0.0,
         is_moe=False,
-        […]
-        […]
-        […]
-        expert_retrieval_size=[…]
+        num_cdmoe_experts=16348,
+        num_cdmoe_heads=4,
+        num_cdmoe_experts_per_head=8,
+        expert_retrieval_size=64,
         **kwargs,
     ):
         self.vocab_size = vocab_size
@@ -188,9 +188,9 @@ class DogeConfig(PretrainedConfig):
         self.attention_dropout = attention_dropout
         self.dynamic_mask_ratio = dynamic_mask_ratio
         self.is_moe = is_moe
-        self.[…]
-        self.[…]
-        self.[…]
+        self.num_cdmoe_experts = num_cdmoe_experts
+        self.num_cdmoe_heads = num_cdmoe_heads
+        self.num_cdmoe_experts_per_head = num_cdmoe_experts_per_head
         self.expert_retrieval_size = expert_retrieval_size
 
         # Validate the correctness of rotary position embeddings parameters
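The new docstring ties the expert count to the head count via `num_cdmoe_experts = (32 * num_cdmoe_heads) ** 2`, and the modeling code below derives `num_keys = int(math.sqrt(num_cdmoe_experts))`. A quick arithmetic check of the new defaults (plain Python, nothing model-specific):

```python
import math

num_cdmoe_heads = 4  # new default

# value implied by the docstring formula
formula_experts = (32 * num_cdmoe_heads) ** 2
print(formula_experts)                  # 16384
print(int(math.sqrt(formula_experts)))  # 128 keys per half of the product-key grid

# default actually written into the diff
default_experts = 16348
num_keys = int(math.sqrt(default_experts))
print(num_keys, num_keys * num_keys)    # 127 16129
```

The shipped default of 16348 does not satisfy the stated formula: it truncates to 127 keys per half, so only 127 * 127 = 16129 of the 16348 expert embeddings are addressable, which suggests 16384 was intended; the diff itself keeps 16348.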
generation_config.json
CHANGED
@@ -1,7 +1,7 @@
-{
-  "_from_model_config": true,
-  "bos_token_id": 0,
-  "eos_token_id": 1,
-  "pad_token_id": 2,
-  "transformers_version": "4.[…]
-}
+{
+  "_from_model_config": true,
+  "bos_token_id": 0,
+  "eos_token_id": 1,
+  "pad_token_id": 2,
+  "transformers_version": "4.47.1"
+}
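`generation_config.json` only pins the special token ids (`bos_token_id=0`, `eos_token_id=1`, `pad_token_id=2`), which `generate()` picks up automatically. A short usage sketch, assuming the checkpoint also ships a tokenizer (the tokenizer is not part of this commit):

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

checkpoint = "./results/Doge-60M"  # placeholder path, as above

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(checkpoint, trust_remote_code=True)

inputs = tokenizer("Hey, how are you doing today?", return_tensors="pt")
# bos/eos/pad ids come from generation_config.json; no need to pass them explicitly
output_ids = model.generate(**inputs, max_new_tokens=32)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))
```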
modeling_doge.py
CHANGED
@@ -22,7 +22,6 @@ import math
 from typing import List, Optional, Tuple, Union
 
 import torch
-from torch.nn.attention.flex_attention import flex_attention
 import torch.nn.functional as F
 import torch.utils.checkpoint
 from torch import nn
@@ -40,6 +39,7 @@ from transformers.modeling_utils import PreTrainedModel
 from transformers.utils import (
     add_start_docstrings,
     add_start_docstrings_to_model_forward,
+    is_torch_greater_or_equal,
     logging,
     replace_return_docstrings,
 )
@@ -50,6 +50,9 @@ try:
 except ImportError:
     einx_add = None
 
+if is_torch_greater_or_equal("2.5"):
+    from torch.nn.attention.flex_attention import flex_attention
+
 
 logger = logging.get_logger(__name__)
 
@@ -308,12 +311,10 @@ class DogeDynamicMaskAttention(nn.Module):
         min_type = torch.finfo(hidden_states.dtype).min
         attn_mask = dynamic_mask[:, :, None, :]
         if 0.0 < dynamic_mask_ratio < 1.0:
-            […]
-            […]
-            […]
-            […]
-            ).values
-            attn_mask = attn_mask.masked_fill(attn_mask < rate_value, min_type)
+            num_dynamic_mask = int(attn_mask.shape[-1] * dynamic_mask_ratio)
+            if num_dynamic_mask > 0:
+                rate_value = torch.kthvalue(attn_mask, num_dynamic_mask, dim=-1, keepdim=True).values
+                attn_mask = attn_mask.masked_fill(attn_mask < rate_value, min_type)
         if attention_mask is not None:
             attn_mask = attn_mask.masked_fill(attention_mask[:, :, :, : hidden_states.shape[-2]] == min_type, min_type)
         return attn_mask
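The rewritten dynamic-mask branch replaces the old multi-line call (truncated in this rendering) with a `torch.kthvalue` cutoff: it computes how many of the smallest mask values the ratio allows to drop, takes the k-th smallest value along the last dimension as the threshold, and fills everything strictly below it with the dtype minimum. A self-contained sketch of the same thresholding on a toy tensor (the values are made up for illustration):

```python
import torch

dynamic_mask_ratio = 0.5
attn_mask = torch.tensor([[0.9, 0.1, 0.5, 0.3, 0.7, 0.2]])  # toy mask scores, shape (1, 6)
min_type = torch.finfo(attn_mask.dtype).min

num_dynamic_mask = int(attn_mask.shape[-1] * dynamic_mask_ratio)  # 3
if num_dynamic_mask > 0:
    # k-th smallest value along the last dim is the cutoff
    rate_value = torch.kthvalue(attn_mask, num_dynamic_mask, dim=-1, keepdim=True).values  # 0.3
    attn_mask = attn_mask.masked_fill(attn_mask < rate_value, min_type)

print(attn_mask)  # 0.1 and 0.2 become the dtype minimum; 0.3 and everything above it survive
```

Because the comparison is a strict less-than, entries equal to the cutoff survive, so with distinct values exactly `num_dynamic_mask - 1` positions are dropped.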
@@ -479,18 +480,18 @@ class DogeCDMoE(DogeMLP):
         self.act_fn = ACT2FN[config.hidden_act]
 
         self.expert_retrieval_dim = config.expert_retrieval_size
-        self.[…]
-        self.[…]
-        self.[…]
-        self.num_keys = int(math.sqrt(self.[…]
+        self.num_cdmoe_experts = config.num_cdmoe_experts
+        self.num_cdmoe_heads = config.num_cdmoe_heads
+        self.num_cdmoe_experts_per_head = config.num_cdmoe_experts_per_head
+        self.num_keys = int(math.sqrt(self.num_cdmoe_experts))
 
         # queries and keys for retrieval experts
-        self.queries = nn.Linear(self.hidden_dim, self.[…]
-        self.keys = nn.Parameter(torch.zeros(self.[…]
+        self.queries = nn.Linear(self.hidden_dim, self.num_cdmoe_heads * self.expert_retrieval_dim, bias=False)
+        self.keys = nn.Parameter(torch.zeros(self.num_cdmoe_heads, self.num_keys, 2, self.expert_retrieval_dim // 2))
 
         # experts
-        self.down_embed = nn.Embedding(self.[…]
-        self.up_embed = nn.Embedding(self.[…]
+        self.down_embed = nn.Embedding(self.num_cdmoe_experts, self.hidden_dim)
+        self.up_embed = nn.Embedding(self.num_cdmoe_experts, self.hidden_dim)
 
     def forward(
         self,
@@ -501,11 +502,11 @@ class DogeCDMoE(DogeMLP):
 
         # get similarity with queries and keys
         queries = self.queries(hidden_states)
-        queries = queries.view(bsz, seq_len, 2, self.[…]
+        queries = queries.view(bsz, seq_len, 2, self.num_cdmoe_heads, -1).permute(2, 0, 1, 3, 4)
         sim = torch.einsum("p b t h n, h k p n -> p b t h k", queries, self.keys)
 
         # get experts with the highest similarity
-        (scores_x, scores_y), (indices_x, indices_y) = sim.topk(self.[…]
+        (scores_x, scores_y), (indices_x, indices_y) = sim.topk(self.num_cdmoe_experts_per_head, dim=-1)
         if einx_add is not None:
             all_scores = einx_add("... i, ... j -> ... (i j)", scores_x, scores_y)
             all_indices = einx_add("... i, ... j -> ... (i j)", indices_x * self.num_keys, indices_y)
@@ -514,7 +515,7 @@ class DogeCDMoE(DogeMLP):
             all_scores = all_scores.view(*scores_x.shape[:-1], -1)
             all_indices = (indices_x.unsqueeze(-1) * self.num_keys) + indices_y.unsqueeze(-2)
             all_indices = all_indices.view(*indices_x.shape[:-1], -1)
-        scores, pk_indices = all_scores.topk(self.[…]
+        scores, pk_indices = all_scores.topk(self.num_cdmoe_experts_per_head, dim=-1)
         indices = all_indices.gather(-1, pk_indices)
         down_embed = self.down_embed(indices)
         up_embed = self.up_embed(indices)