prasadsachin commited on
Commit
7e1f7bd
·
verified ·
1 Parent(s): f5414e8

Upload folder using huggingface_hub

Browse files
assets/tokenizer/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
assets/tokenizer/vocabulary.json ADDED
The diff for this file is too large to render. See raw diff
 
config.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "module": "keras_hub.src.models.qwen_moe.qwen_moe_backbone",
3
+ "class_name": "QwenMoeBackbone",
4
+ "config": {
5
+ "name": "qwen_moe_backbone",
6
+ "trainable": true,
7
+ "vocabulary_size": 151936,
8
+ "num_layers": 24,
9
+ "num_query_heads": 16,
10
+ "hidden_dim": 2048,
11
+ "intermediate_dim": 5632,
12
+ "moe_intermediate_dim": 1408,
13
+ "shared_expert_intermediate_dim": 5632,
14
+ "rope_max_wavelength": 1000000.0,
15
+ "num_key_value_heads": 16,
16
+ "rope_scaling_factor": 1.0,
17
+ "layer_norm_epsilon": 1e-06,
18
+ "dropout": 0,
19
+ "tie_word_embeddings": false,
20
+ "use_sliding_window_attention": false,
21
+ "sliding_window_size": 32768,
22
+ "num_experts": 60,
23
+ "top_k": 4,
24
+ "norm_top_k_prob": false,
25
+ "decoder_sparse_step": 1,
26
+ "mlp_only_layers": [],
27
+ "output_router_logits": false
28
+ },
29
+ "registered_name": "keras_hub>QwenMoeBackbone"
30
+ }
metadata.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "keras_version": "3.10.0.dev2025061703",
3
+ "keras_hub_version": "0.22.0.dev202506170415",
4
+ "parameter_count": 14315784192,
5
+ "date_saved": "2025-06-17@15:00:41",
6
+ "tasks": [
7
+ "CausalLM"
8
+ ]
9
+ }
model.weights.json ADDED
@@ -0,0 +1,491 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "total_size": 28631568384.0
4
+ },
5
+ "weight_map": {
6
+ "/layers/reversible_embedding/vars": [
7
+ "model_00000.weights.h5"
8
+ ],
9
+ "/layers/qwen_moe_transformer_decoder/mlp/expert_bank/vars": [
10
+ "model_00000.weights.h5"
11
+ ],
12
+ "/layers/qwen_moe_transformer_decoder/mlp/shared_expert_dense/_feedforward_gate_dense/vars": "model_00000.weights.h5",
13
+ "/layers/qwen_moe_transformer_decoder/mlp/shared_expert_dense/_feedforward_intermediate_dense/vars": "model_00000.weights.h5",
14
+ "/layers/qwen_moe_transformer_decoder/mlp/shared_expert_dense/_feedforward_output_dense/vars": "model_00000.weights.h5",
15
+ "/layers/qwen_moe_transformer_decoder/mlp/shared_expert_gate_dense/vars": "model_00000.weights.h5",
16
+ "/layers/qwen_moe_transformer_decoder/mlp/_sparse_feedforward_gate_dense/vars": "model_00000.weights.h5",
17
+ "/layers/qwen_moe_transformer_decoder/_feedforward_layernorm/vars": "model_00000.weights.h5",
18
+ "/layers/qwen_moe_transformer_decoder/_self_attention_layer/key_dense/vars": [
19
+ "model_00000.weights.h5"
20
+ ],
21
+ "/layers/qwen_moe_transformer_decoder/_self_attention_layer/query_dense/vars": [
22
+ "model_00000.weights.h5"
23
+ ],
24
+ "/layers/qwen_moe_transformer_decoder/_self_attention_layer/value_dense/vars": [
25
+ "model_00000.weights.h5"
26
+ ],
27
+ "/layers/qwen_moe_transformer_decoder/_self_attention_layer/_output_dense/vars": "model_00000.weights.h5",
28
+ "/layers/qwen_moe_transformer_decoder/_self_attention_layernorm/vars": "model_00000.weights.h5",
29
+ "/layers/qwen_moe_transformer_decoder_1/mlp/expert_bank/vars": [
30
+ "model_00000.weights.h5"
31
+ ],
32
+ "/layers/qwen_moe_transformer_decoder_1/mlp/shared_expert_dense/_feedforward_gate_dense/vars": "model_00000.weights.h5",
33
+ "/layers/qwen_moe_transformer_decoder_1/mlp/shared_expert_dense/_feedforward_intermediate_dense/vars": "model_00000.weights.h5",
34
+ "/layers/qwen_moe_transformer_decoder_1/mlp/shared_expert_dense/_feedforward_output_dense/vars": "model_00000.weights.h5",
35
+ "/layers/qwen_moe_transformer_decoder_1/mlp/shared_expert_gate_dense/vars": "model_00000.weights.h5",
36
+ "/layers/qwen_moe_transformer_decoder_1/mlp/_sparse_feedforward_gate_dense/vars": "model_00000.weights.h5",
37
+ "/layers/qwen_moe_transformer_decoder_1/_feedforward_layernorm/vars": "model_00000.weights.h5",
38
+ "/layers/qwen_moe_transformer_decoder_1/_self_attention_layer/key_dense/vars": [
39
+ "model_00000.weights.h5"
40
+ ],
41
+ "/layers/qwen_moe_transformer_decoder_1/_self_attention_layer/query_dense/vars": [
42
+ "model_00000.weights.h5"
43
+ ],
44
+ "/layers/qwen_moe_transformer_decoder_1/_self_attention_layer/value_dense/vars": [
45
+ "model_00000.weights.h5"
46
+ ],
47
+ "/layers/qwen_moe_transformer_decoder_1/_self_attention_layer/_output_dense/vars": "model_00000.weights.h5",
48
+ "/layers/qwen_moe_transformer_decoder_1/_self_attention_layernorm/vars": "model_00000.weights.h5",
49
+ "/layers/qwen_moe_transformer_decoder_2/mlp/expert_bank/vars": [
50
+ "model_00000.weights.h5"
51
+ ],
52
+ "/layers/qwen_moe_transformer_decoder_2/mlp/shared_expert_dense/_feedforward_gate_dense/vars": "model_00000.weights.h5",
53
+ "/layers/qwen_moe_transformer_decoder_2/mlp/shared_expert_dense/_feedforward_intermediate_dense/vars": "model_00000.weights.h5",
54
+ "/layers/qwen_moe_transformer_decoder_2/mlp/shared_expert_dense/_feedforward_output_dense/vars": "model_00000.weights.h5",
55
+ "/layers/qwen_moe_transformer_decoder_2/mlp/shared_expert_gate_dense/vars": "model_00000.weights.h5",
56
+ "/layers/qwen_moe_transformer_decoder_2/mlp/_sparse_feedforward_gate_dense/vars": "model_00000.weights.h5",
57
+ "/layers/qwen_moe_transformer_decoder_2/_feedforward_layernorm/vars": "model_00000.weights.h5",
58
+ "/layers/qwen_moe_transformer_decoder_2/_self_attention_layer/key_dense/vars": [
59
+ "model_00000.weights.h5"
60
+ ],
61
+ "/layers/qwen_moe_transformer_decoder_2/_self_attention_layer/query_dense/vars": [
62
+ "model_00000.weights.h5"
63
+ ],
64
+ "/layers/qwen_moe_transformer_decoder_2/_self_attention_layer/value_dense/vars": [
65
+ "model_00000.weights.h5"
66
+ ],
67
+ "/layers/qwen_moe_transformer_decoder_2/_self_attention_layer/_output_dense/vars": "model_00000.weights.h5",
68
+ "/layers/qwen_moe_transformer_decoder_2/_self_attention_layernorm/vars": "model_00000.weights.h5",
69
+ "/layers/qwen_moe_transformer_decoder_3/mlp/expert_bank/vars": [
70
+ "model_00000.weights.h5"
71
+ ],
72
+ "/layers/qwen_moe_transformer_decoder_3/mlp/shared_expert_dense/_feedforward_gate_dense/vars": "model_00000.weights.h5",
73
+ "/layers/qwen_moe_transformer_decoder_3/mlp/shared_expert_dense/_feedforward_intermediate_dense/vars": "model_00000.weights.h5",
74
+ "/layers/qwen_moe_transformer_decoder_3/mlp/shared_expert_dense/_feedforward_output_dense/vars": "model_00000.weights.h5",
75
+ "/layers/qwen_moe_transformer_decoder_3/mlp/shared_expert_gate_dense/vars": "model_00000.weights.h5",
76
+ "/layers/qwen_moe_transformer_decoder_3/mlp/_sparse_feedforward_gate_dense/vars": "model_00000.weights.h5",
77
+ "/layers/qwen_moe_transformer_decoder_3/_feedforward_layernorm/vars": "model_00000.weights.h5",
78
+ "/layers/qwen_moe_transformer_decoder_3/_self_attention_layer/key_dense/vars": [
79
+ "model_00000.weights.h5"
80
+ ],
81
+ "/layers/qwen_moe_transformer_decoder_3/_self_attention_layer/query_dense/vars": [
82
+ "model_00000.weights.h5"
83
+ ],
84
+ "/layers/qwen_moe_transformer_decoder_3/_self_attention_layer/value_dense/vars": [
85
+ "model_00000.weights.h5"
86
+ ],
87
+ "/layers/qwen_moe_transformer_decoder_3/_self_attention_layer/_output_dense/vars": "model_00000.weights.h5",
88
+ "/layers/qwen_moe_transformer_decoder_3/_self_attention_layernorm/vars": "model_00000.weights.h5",
89
+ "/layers/qwen_moe_transformer_decoder_4/mlp/expert_bank/vars": [
90
+ "model_00000.weights.h5"
91
+ ],
92
+ "/layers/qwen_moe_transformer_decoder_4/mlp/shared_expert_dense/_feedforward_gate_dense/vars": "model_00000.weights.h5",
93
+ "/layers/qwen_moe_transformer_decoder_4/mlp/shared_expert_dense/_feedforward_intermediate_dense/vars": "model_00000.weights.h5",
94
+ "/layers/qwen_moe_transformer_decoder_4/mlp/shared_expert_dense/_feedforward_output_dense/vars": "model_00000.weights.h5",
95
+ "/layers/qwen_moe_transformer_decoder_4/mlp/shared_expert_gate_dense/vars": "model_00000.weights.h5",
96
+ "/layers/qwen_moe_transformer_decoder_4/mlp/_sparse_feedforward_gate_dense/vars": "model_00000.weights.h5",
97
+ "/layers/qwen_moe_transformer_decoder_4/_feedforward_layernorm/vars": "model_00000.weights.h5",
98
+ "/layers/qwen_moe_transformer_decoder_4/_self_attention_layer/key_dense/vars": [
99
+ "model_00000.weights.h5"
100
+ ],
101
+ "/layers/qwen_moe_transformer_decoder_4/_self_attention_layer/query_dense/vars": [
102
+ "model_00000.weights.h5"
103
+ ],
104
+ "/layers/qwen_moe_transformer_decoder_4/_self_attention_layer/value_dense/vars": [
105
+ "model_00000.weights.h5"
106
+ ],
107
+ "/layers/qwen_moe_transformer_decoder_4/_self_attention_layer/_output_dense/vars": "model_00000.weights.h5",
108
+ "/layers/qwen_moe_transformer_decoder_4/_self_attention_layernorm/vars": "model_00000.weights.h5",
109
+ "/layers/qwen_moe_transformer_decoder_5/mlp/expert_bank/vars": [
110
+ "model_00000.weights.h5"
111
+ ],
112
+ "/layers/qwen_moe_transformer_decoder_5/mlp/shared_expert_dense/_feedforward_gate_dense/vars": "model_00000.weights.h5",
113
+ "/layers/qwen_moe_transformer_decoder_5/mlp/shared_expert_dense/_feedforward_intermediate_dense/vars": "model_00000.weights.h5",
114
+ "/layers/qwen_moe_transformer_decoder_5/mlp/shared_expert_dense/_feedforward_output_dense/vars": "model_00000.weights.h5",
115
+ "/layers/qwen_moe_transformer_decoder_5/mlp/shared_expert_gate_dense/vars": "model_00000.weights.h5",
116
+ "/layers/qwen_moe_transformer_decoder_5/mlp/_sparse_feedforward_gate_dense/vars": "model_00000.weights.h5",
117
+ "/layers/qwen_moe_transformer_decoder_5/_feedforward_layernorm/vars": "model_00000.weights.h5",
118
+ "/layers/qwen_moe_transformer_decoder_5/_self_attention_layer/key_dense/vars": [
119
+ "model_00000.weights.h5"
120
+ ],
121
+ "/layers/qwen_moe_transformer_decoder_5/_self_attention_layer/query_dense/vars": [
122
+ "model_00000.weights.h5"
123
+ ],
124
+ "/layers/qwen_moe_transformer_decoder_5/_self_attention_layer/value_dense/vars": [
125
+ "model_00000.weights.h5"
126
+ ],
127
+ "/layers/qwen_moe_transformer_decoder_5/_self_attention_layer/_output_dense/vars": "model_00000.weights.h5",
128
+ "/layers/qwen_moe_transformer_decoder_5/_self_attention_layernorm/vars": "model_00000.weights.h5",
129
+ "/layers/qwen_moe_transformer_decoder_6/mlp/expert_bank/vars": [
130
+ "model_00000.weights.h5"
131
+ ],
132
+ "/layers/qwen_moe_transformer_decoder_6/mlp/shared_expert_dense/_feedforward_gate_dense/vars": "model_00000.weights.h5",
133
+ "/layers/qwen_moe_transformer_decoder_6/mlp/shared_expert_dense/_feedforward_intermediate_dense/vars": "model_00000.weights.h5",
134
+ "/layers/qwen_moe_transformer_decoder_6/mlp/shared_expert_dense/_feedforward_output_dense/vars": "model_00000.weights.h5",
135
+ "/layers/qwen_moe_transformer_decoder_6/mlp/shared_expert_gate_dense/vars": "model_00000.weights.h5",
136
+ "/layers/qwen_moe_transformer_decoder_6/mlp/_sparse_feedforward_gate_dense/vars": "model_00000.weights.h5",
137
+ "/layers/qwen_moe_transformer_decoder_6/_feedforward_layernorm/vars": "model_00000.weights.h5",
138
+ "/layers/qwen_moe_transformer_decoder_6/_self_attention_layer/key_dense/vars": [
139
+ "model_00000.weights.h5"
140
+ ],
141
+ "/layers/qwen_moe_transformer_decoder_6/_self_attention_layer/query_dense/vars": [
142
+ "model_00000.weights.h5"
143
+ ],
144
+ "/layers/qwen_moe_transformer_decoder_6/_self_attention_layer/value_dense/vars": [
145
+ "model_00000.weights.h5"
146
+ ],
147
+ "/layers/qwen_moe_transformer_decoder_6/_self_attention_layer/_output_dense/vars": "model_00000.weights.h5",
148
+ "/layers/qwen_moe_transformer_decoder_6/_self_attention_layernorm/vars": "model_00000.weights.h5",
149
+ "/layers/qwen_moe_transformer_decoder_7/mlp/expert_bank/vars": [
150
+ "model_00000.weights.h5"
151
+ ],
152
+ "/layers/qwen_moe_transformer_decoder_7/mlp/shared_expert_dense/_feedforward_gate_dense/vars": "model_00000.weights.h5",
153
+ "/layers/qwen_moe_transformer_decoder_7/mlp/shared_expert_dense/_feedforward_intermediate_dense/vars": "model_00000.weights.h5",
154
+ "/layers/qwen_moe_transformer_decoder_7/mlp/shared_expert_dense/_feedforward_output_dense/vars": "model_00000.weights.h5",
155
+ "/layers/qwen_moe_transformer_decoder_7/mlp/shared_expert_gate_dense/vars": "model_00000.weights.h5",
156
+ "/layers/qwen_moe_transformer_decoder_7/mlp/_sparse_feedforward_gate_dense/vars": "model_00000.weights.h5",
157
+ "/layers/qwen_moe_transformer_decoder_7/_feedforward_layernorm/vars": "model_00000.weights.h5",
158
+ "/layers/qwen_moe_transformer_decoder_7/_self_attention_layer/key_dense/vars": [
159
+ "model_00000.weights.h5"
160
+ ],
161
+ "/layers/qwen_moe_transformer_decoder_7/_self_attention_layer/query_dense/vars": [
162
+ "model_00000.weights.h5"
163
+ ],
164
+ "/layers/qwen_moe_transformer_decoder_7/_self_attention_layer/value_dense/vars": [
165
+ "model_00000.weights.h5"
166
+ ],
167
+ "/layers/qwen_moe_transformer_decoder_7/_self_attention_layer/_output_dense/vars": "model_00000.weights.h5",
168
+ "/layers/qwen_moe_transformer_decoder_7/_self_attention_layernorm/vars": "model_00000.weights.h5",
169
+ "/layers/qwen_moe_transformer_decoder_8/mlp/expert_bank/vars": [
170
+ "model_00001.weights.h5"
171
+ ],
172
+ "/layers/qwen_moe_transformer_decoder_8/mlp/shared_expert_dense/_feedforward_gate_dense/vars": "model_00001.weights.h5",
173
+ "/layers/qwen_moe_transformer_decoder_8/mlp/shared_expert_dense/_feedforward_intermediate_dense/vars": "model_00001.weights.h5",
174
+ "/layers/qwen_moe_transformer_decoder_8/mlp/shared_expert_dense/_feedforward_output_dense/vars": "model_00001.weights.h5",
175
+ "/layers/qwen_moe_transformer_decoder_8/mlp/shared_expert_gate_dense/vars": "model_00001.weights.h5",
176
+ "/layers/qwen_moe_transformer_decoder_8/mlp/_sparse_feedforward_gate_dense/vars": "model_00001.weights.h5",
177
+ "/layers/qwen_moe_transformer_decoder_8/_feedforward_layernorm/vars": "model_00001.weights.h5",
178
+ "/layers/qwen_moe_transformer_decoder_8/_self_attention_layer/key_dense/vars": [
179
+ "model_00001.weights.h5"
180
+ ],
181
+ "/layers/qwen_moe_transformer_decoder_8/_self_attention_layer/query_dense/vars": [
182
+ "model_00001.weights.h5"
183
+ ],
184
+ "/layers/qwen_moe_transformer_decoder_8/_self_attention_layer/value_dense/vars": [
185
+ "model_00001.weights.h5"
186
+ ],
187
+ "/layers/qwen_moe_transformer_decoder_8/_self_attention_layer/_output_dense/vars": "model_00001.weights.h5",
188
+ "/layers/qwen_moe_transformer_decoder_8/_self_attention_layernorm/vars": "model_00001.weights.h5",
189
+ "/layers/qwen_moe_transformer_decoder_9/mlp/expert_bank/vars": [
190
+ "model_00001.weights.h5"
191
+ ],
192
+ "/layers/qwen_moe_transformer_decoder_9/mlp/shared_expert_dense/_feedforward_gate_dense/vars": "model_00001.weights.h5",
193
+ "/layers/qwen_moe_transformer_decoder_9/mlp/shared_expert_dense/_feedforward_intermediate_dense/vars": "model_00001.weights.h5",
194
+ "/layers/qwen_moe_transformer_decoder_9/mlp/shared_expert_dense/_feedforward_output_dense/vars": "model_00001.weights.h5",
195
+ "/layers/qwen_moe_transformer_decoder_9/mlp/shared_expert_gate_dense/vars": "model_00001.weights.h5",
196
+ "/layers/qwen_moe_transformer_decoder_9/mlp/_sparse_feedforward_gate_dense/vars": "model_00001.weights.h5",
197
+ "/layers/qwen_moe_transformer_decoder_9/_feedforward_layernorm/vars": "model_00001.weights.h5",
198
+ "/layers/qwen_moe_transformer_decoder_9/_self_attention_layer/key_dense/vars": [
199
+ "model_00001.weights.h5"
200
+ ],
201
+ "/layers/qwen_moe_transformer_decoder_9/_self_attention_layer/query_dense/vars": [
202
+ "model_00001.weights.h5"
203
+ ],
204
+ "/layers/qwen_moe_transformer_decoder_9/_self_attention_layer/value_dense/vars": [
205
+ "model_00001.weights.h5"
206
+ ],
207
+ "/layers/qwen_moe_transformer_decoder_9/_self_attention_layer/_output_dense/vars": "model_00001.weights.h5",
208
+ "/layers/qwen_moe_transformer_decoder_9/_self_attention_layernorm/vars": "model_00001.weights.h5",
209
+ "/layers/qwen_moe_transformer_decoder_10/mlp/expert_bank/vars": [
210
+ "model_00001.weights.h5"
211
+ ],
212
+ "/layers/qwen_moe_transformer_decoder_10/mlp/shared_expert_dense/_feedforward_gate_dense/vars": "model_00001.weights.h5",
213
+ "/layers/qwen_moe_transformer_decoder_10/mlp/shared_expert_dense/_feedforward_intermediate_dense/vars": "model_00001.weights.h5",
214
+ "/layers/qwen_moe_transformer_decoder_10/mlp/shared_expert_dense/_feedforward_output_dense/vars": "model_00001.weights.h5",
215
+ "/layers/qwen_moe_transformer_decoder_10/mlp/shared_expert_gate_dense/vars": "model_00001.weights.h5",
216
+ "/layers/qwen_moe_transformer_decoder_10/mlp/_sparse_feedforward_gate_dense/vars": "model_00001.weights.h5",
217
+ "/layers/qwen_moe_transformer_decoder_10/_feedforward_layernorm/vars": "model_00001.weights.h5",
218
+ "/layers/qwen_moe_transformer_decoder_10/_self_attention_layer/key_dense/vars": [
219
+ "model_00001.weights.h5"
220
+ ],
221
+ "/layers/qwen_moe_transformer_decoder_10/_self_attention_layer/query_dense/vars": [
222
+ "model_00001.weights.h5"
223
+ ],
224
+ "/layers/qwen_moe_transformer_decoder_10/_self_attention_layer/value_dense/vars": [
225
+ "model_00001.weights.h5"
226
+ ],
227
+ "/layers/qwen_moe_transformer_decoder_10/_self_attention_layer/_output_dense/vars": "model_00001.weights.h5",
228
+ "/layers/qwen_moe_transformer_decoder_10/_self_attention_layernorm/vars": "model_00001.weights.h5",
229
+ "/layers/qwen_moe_transformer_decoder_11/mlp/expert_bank/vars": [
230
+ "model_00001.weights.h5"
231
+ ],
232
+ "/layers/qwen_moe_transformer_decoder_11/mlp/shared_expert_dense/_feedforward_gate_dense/vars": "model_00001.weights.h5",
233
+ "/layers/qwen_moe_transformer_decoder_11/mlp/shared_expert_dense/_feedforward_intermediate_dense/vars": "model_00001.weights.h5",
234
+ "/layers/qwen_moe_transformer_decoder_11/mlp/shared_expert_dense/_feedforward_output_dense/vars": "model_00001.weights.h5",
235
+ "/layers/qwen_moe_transformer_decoder_11/mlp/shared_expert_gate_dense/vars": "model_00001.weights.h5",
236
+ "/layers/qwen_moe_transformer_decoder_11/mlp/_sparse_feedforward_gate_dense/vars": "model_00001.weights.h5",
237
+ "/layers/qwen_moe_transformer_decoder_11/_feedforward_layernorm/vars": "model_00001.weights.h5",
238
+ "/layers/qwen_moe_transformer_decoder_11/_self_attention_layer/key_dense/vars": [
239
+ "model_00001.weights.h5"
240
+ ],
241
+ "/layers/qwen_moe_transformer_decoder_11/_self_attention_layer/query_dense/vars": [
242
+ "model_00001.weights.h5"
243
+ ],
244
+ "/layers/qwen_moe_transformer_decoder_11/_self_attention_layer/value_dense/vars": [
245
+ "model_00001.weights.h5"
246
+ ],
247
+ "/layers/qwen_moe_transformer_decoder_11/_self_attention_layer/_output_dense/vars": "model_00001.weights.h5",
248
+ "/layers/qwen_moe_transformer_decoder_11/_self_attention_layernorm/vars": "model_00001.weights.h5",
249
+ "/layers/qwen_moe_transformer_decoder_12/mlp/expert_bank/vars": [
250
+ "model_00001.weights.h5"
251
+ ],
252
+ "/layers/qwen_moe_transformer_decoder_12/mlp/shared_expert_dense/_feedforward_gate_dense/vars": "model_00001.weights.h5",
253
+ "/layers/qwen_moe_transformer_decoder_12/mlp/shared_expert_dense/_feedforward_intermediate_dense/vars": "model_00001.weights.h5",
254
+ "/layers/qwen_moe_transformer_decoder_12/mlp/shared_expert_dense/_feedforward_output_dense/vars": "model_00001.weights.h5",
255
+ "/layers/qwen_moe_transformer_decoder_12/mlp/shared_expert_gate_dense/vars": "model_00001.weights.h5",
256
+ "/layers/qwen_moe_transformer_decoder_12/mlp/_sparse_feedforward_gate_dense/vars": "model_00001.weights.h5",
257
+ "/layers/qwen_moe_transformer_decoder_12/_feedforward_layernorm/vars": "model_00001.weights.h5",
258
+ "/layers/qwen_moe_transformer_decoder_12/_self_attention_layer/key_dense/vars": [
259
+ "model_00001.weights.h5"
260
+ ],
261
+ "/layers/qwen_moe_transformer_decoder_12/_self_attention_layer/query_dense/vars": [
262
+ "model_00001.weights.h5"
263
+ ],
264
+ "/layers/qwen_moe_transformer_decoder_12/_self_attention_layer/value_dense/vars": [
265
+ "model_00001.weights.h5"
266
+ ],
267
+ "/layers/qwen_moe_transformer_decoder_12/_self_attention_layer/_output_dense/vars": "model_00001.weights.h5",
268
+ "/layers/qwen_moe_transformer_decoder_12/_self_attention_layernorm/vars": "model_00001.weights.h5",
269
+ "/layers/qwen_moe_transformer_decoder_13/mlp/expert_bank/vars": [
270
+ "model_00001.weights.h5"
271
+ ],
272
+ "/layers/qwen_moe_transformer_decoder_13/mlp/shared_expert_dense/_feedforward_gate_dense/vars": "model_00001.weights.h5",
273
+ "/layers/qwen_moe_transformer_decoder_13/mlp/shared_expert_dense/_feedforward_intermediate_dense/vars": "model_00001.weights.h5",
274
+ "/layers/qwen_moe_transformer_decoder_13/mlp/shared_expert_dense/_feedforward_output_dense/vars": "model_00001.weights.h5",
275
+ "/layers/qwen_moe_transformer_decoder_13/mlp/shared_expert_gate_dense/vars": "model_00001.weights.h5",
276
+ "/layers/qwen_moe_transformer_decoder_13/mlp/_sparse_feedforward_gate_dense/vars": "model_00001.weights.h5",
277
+ "/layers/qwen_moe_transformer_decoder_13/_feedforward_layernorm/vars": "model_00001.weights.h5",
278
+ "/layers/qwen_moe_transformer_decoder_13/_self_attention_layer/key_dense/vars": [
279
+ "model_00001.weights.h5"
280
+ ],
281
+ "/layers/qwen_moe_transformer_decoder_13/_self_attention_layer/query_dense/vars": [
282
+ "model_00001.weights.h5"
283
+ ],
284
+ "/layers/qwen_moe_transformer_decoder_13/_self_attention_layer/value_dense/vars": [
285
+ "model_00001.weights.h5"
286
+ ],
287
+ "/layers/qwen_moe_transformer_decoder_13/_self_attention_layer/_output_dense/vars": "model_00001.weights.h5",
288
+ "/layers/qwen_moe_transformer_decoder_13/_self_attention_layernorm/vars": "model_00001.weights.h5",
289
+ "/layers/qwen_moe_transformer_decoder_14/mlp/expert_bank/vars": [
290
+ "model_00001.weights.h5"
291
+ ],
292
+ "/layers/qwen_moe_transformer_decoder_14/mlp/shared_expert_dense/_feedforward_gate_dense/vars": "model_00001.weights.h5",
293
+ "/layers/qwen_moe_transformer_decoder_14/mlp/shared_expert_dense/_feedforward_intermediate_dense/vars": "model_00001.weights.h5",
294
+ "/layers/qwen_moe_transformer_decoder_14/mlp/shared_expert_dense/_feedforward_output_dense/vars": "model_00001.weights.h5",
295
+ "/layers/qwen_moe_transformer_decoder_14/mlp/shared_expert_gate_dense/vars": "model_00001.weights.h5",
296
+ "/layers/qwen_moe_transformer_decoder_14/mlp/_sparse_feedforward_gate_dense/vars": "model_00001.weights.h5",
297
+ "/layers/qwen_moe_transformer_decoder_14/_feedforward_layernorm/vars": "model_00001.weights.h5",
298
+ "/layers/qwen_moe_transformer_decoder_14/_self_attention_layer/key_dense/vars": [
299
+ "model_00001.weights.h5"
300
+ ],
301
+ "/layers/qwen_moe_transformer_decoder_14/_self_attention_layer/query_dense/vars": [
302
+ "model_00001.weights.h5"
303
+ ],
304
+ "/layers/qwen_moe_transformer_decoder_14/_self_attention_layer/value_dense/vars": [
305
+ "model_00001.weights.h5"
306
+ ],
307
+ "/layers/qwen_moe_transformer_decoder_14/_self_attention_layer/_output_dense/vars": "model_00001.weights.h5",
308
+ "/layers/qwen_moe_transformer_decoder_14/_self_attention_layernorm/vars": "model_00001.weights.h5",
309
+ "/layers/qwen_moe_transformer_decoder_15/mlp/expert_bank/vars": [
310
+ "model_00001.weights.h5"
311
+ ],
312
+ "/layers/qwen_moe_transformer_decoder_15/mlp/shared_expert_dense/_feedforward_gate_dense/vars": "model_00001.weights.h5",
313
+ "/layers/qwen_moe_transformer_decoder_15/mlp/shared_expert_dense/_feedforward_intermediate_dense/vars": "model_00001.weights.h5",
314
+ "/layers/qwen_moe_transformer_decoder_15/mlp/shared_expert_dense/_feedforward_output_dense/vars": "model_00001.weights.h5",
315
+ "/layers/qwen_moe_transformer_decoder_15/mlp/shared_expert_gate_dense/vars": "model_00001.weights.h5",
316
+ "/layers/qwen_moe_transformer_decoder_15/mlp/_sparse_feedforward_gate_dense/vars": "model_00001.weights.h5",
317
+ "/layers/qwen_moe_transformer_decoder_15/_feedforward_layernorm/vars": "model_00001.weights.h5",
318
+ "/layers/qwen_moe_transformer_decoder_15/_self_attention_layer/key_dense/vars": [
319
+ "model_00001.weights.h5"
320
+ ],
321
+ "/layers/qwen_moe_transformer_decoder_15/_self_attention_layer/query_dense/vars": [
322
+ "model_00001.weights.h5"
323
+ ],
324
+ "/layers/qwen_moe_transformer_decoder_15/_self_attention_layer/value_dense/vars": [
325
+ "model_00001.weights.h5"
326
+ ],
327
+ "/layers/qwen_moe_transformer_decoder_15/_self_attention_layer/_output_dense/vars": "model_00001.weights.h5",
328
+ "/layers/qwen_moe_transformer_decoder_15/_self_attention_layernorm/vars": "model_00001.weights.h5",
329
+ "/layers/qwen_moe_transformer_decoder_16/mlp/expert_bank/vars": [
330
+ "model_00001.weights.h5"
331
+ ],
332
+ "/layers/qwen_moe_transformer_decoder_16/mlp/shared_expert_dense/_feedforward_gate_dense/vars": "model_00001.weights.h5",
333
+ "/layers/qwen_moe_transformer_decoder_16/mlp/shared_expert_dense/_feedforward_intermediate_dense/vars": "model_00001.weights.h5",
334
+ "/layers/qwen_moe_transformer_decoder_16/mlp/shared_expert_dense/_feedforward_output_dense/vars": "model_00001.weights.h5",
335
+ "/layers/qwen_moe_transformer_decoder_16/mlp/shared_expert_gate_dense/vars": "model_00001.weights.h5",
336
+ "/layers/qwen_moe_transformer_decoder_16/mlp/_sparse_feedforward_gate_dense/vars": "model_00001.weights.h5",
337
+ "/layers/qwen_moe_transformer_decoder_16/_feedforward_layernorm/vars": "model_00001.weights.h5",
338
+ "/layers/qwen_moe_transformer_decoder_16/_self_attention_layer/key_dense/vars": [
339
+ "model_00001.weights.h5"
340
+ ],
341
+ "/layers/qwen_moe_transformer_decoder_16/_self_attention_layer/query_dense/vars": [
342
+ "model_00001.weights.h5"
343
+ ],
344
+ "/layers/qwen_moe_transformer_decoder_16/_self_attention_layer/value_dense/vars": [
345
+ "model_00001.weights.h5"
346
+ ],
347
+ "/layers/qwen_moe_transformer_decoder_16/_self_attention_layer/_output_dense/vars": "model_00001.weights.h5",
348
+ "/layers/qwen_moe_transformer_decoder_16/_self_attention_layernorm/vars": "model_00001.weights.h5",
349
+ "/layers/qwen_moe_transformer_decoder_17/mlp/expert_bank/vars": [
350
+ "model_00002.weights.h5"
351
+ ],
352
+ "/layers/qwen_moe_transformer_decoder_17/mlp/shared_expert_dense/_feedforward_gate_dense/vars": "model_00002.weights.h5",
353
+ "/layers/qwen_moe_transformer_decoder_17/mlp/shared_expert_dense/_feedforward_intermediate_dense/vars": "model_00002.weights.h5",
354
+ "/layers/qwen_moe_transformer_decoder_17/mlp/shared_expert_dense/_feedforward_output_dense/vars": "model_00002.weights.h5",
355
+ "/layers/qwen_moe_transformer_decoder_17/mlp/shared_expert_gate_dense/vars": "model_00002.weights.h5",
356
+ "/layers/qwen_moe_transformer_decoder_17/mlp/_sparse_feedforward_gate_dense/vars": "model_00002.weights.h5",
357
+ "/layers/qwen_moe_transformer_decoder_17/_feedforward_layernorm/vars": "model_00002.weights.h5",
358
+ "/layers/qwen_moe_transformer_decoder_17/_self_attention_layer/key_dense/vars": [
359
+ "model_00002.weights.h5"
360
+ ],
361
+ "/layers/qwen_moe_transformer_decoder_17/_self_attention_layer/query_dense/vars": [
362
+ "model_00002.weights.h5"
363
+ ],
364
+ "/layers/qwen_moe_transformer_decoder_17/_self_attention_layer/value_dense/vars": [
365
+ "model_00002.weights.h5"
366
+ ],
367
+ "/layers/qwen_moe_transformer_decoder_17/_self_attention_layer/_output_dense/vars": "model_00002.weights.h5",
368
+ "/layers/qwen_moe_transformer_decoder_17/_self_attention_layernorm/vars": "model_00002.weights.h5",
369
+ "/layers/qwen_moe_transformer_decoder_18/mlp/expert_bank/vars": [
370
+ "model_00002.weights.h5"
371
+ ],
372
+ "/layers/qwen_moe_transformer_decoder_18/mlp/shared_expert_dense/_feedforward_gate_dense/vars": "model_00002.weights.h5",
373
+ "/layers/qwen_moe_transformer_decoder_18/mlp/shared_expert_dense/_feedforward_intermediate_dense/vars": "model_00002.weights.h5",
374
+ "/layers/qwen_moe_transformer_decoder_18/mlp/shared_expert_dense/_feedforward_output_dense/vars": "model_00002.weights.h5",
375
+ "/layers/qwen_moe_transformer_decoder_18/mlp/shared_expert_gate_dense/vars": "model_00002.weights.h5",
376
+ "/layers/qwen_moe_transformer_decoder_18/mlp/_sparse_feedforward_gate_dense/vars": "model_00002.weights.h5",
377
+ "/layers/qwen_moe_transformer_decoder_18/_feedforward_layernorm/vars": "model_00002.weights.h5",
378
+ "/layers/qwen_moe_transformer_decoder_18/_self_attention_layer/key_dense/vars": [
379
+ "model_00002.weights.h5"
380
+ ],
381
+ "/layers/qwen_moe_transformer_decoder_18/_self_attention_layer/query_dense/vars": [
382
+ "model_00002.weights.h5"
383
+ ],
384
+ "/layers/qwen_moe_transformer_decoder_18/_self_attention_layer/value_dense/vars": [
385
+ "model_00002.weights.h5"
386
+ ],
387
+ "/layers/qwen_moe_transformer_decoder_18/_self_attention_layer/_output_dense/vars": "model_00002.weights.h5",
388
+ "/layers/qwen_moe_transformer_decoder_18/_self_attention_layernorm/vars": "model_00002.weights.h5",
389
+ "/layers/qwen_moe_transformer_decoder_19/mlp/expert_bank/vars": [
390
+ "model_00002.weights.h5"
391
+ ],
392
+ "/layers/qwen_moe_transformer_decoder_19/mlp/shared_expert_dense/_feedforward_gate_dense/vars": "model_00002.weights.h5",
393
+ "/layers/qwen_moe_transformer_decoder_19/mlp/shared_expert_dense/_feedforward_intermediate_dense/vars": "model_00002.weights.h5",
394
+ "/layers/qwen_moe_transformer_decoder_19/mlp/shared_expert_dense/_feedforward_output_dense/vars": "model_00002.weights.h5",
395
+ "/layers/qwen_moe_transformer_decoder_19/mlp/shared_expert_gate_dense/vars": "model_00002.weights.h5",
396
+ "/layers/qwen_moe_transformer_decoder_19/mlp/_sparse_feedforward_gate_dense/vars": "model_00002.weights.h5",
397
+ "/layers/qwen_moe_transformer_decoder_19/_feedforward_layernorm/vars": "model_00002.weights.h5",
398
+ "/layers/qwen_moe_transformer_decoder_19/_self_attention_layer/key_dense/vars": [
399
+ "model_00002.weights.h5"
400
+ ],
401
+ "/layers/qwen_moe_transformer_decoder_19/_self_attention_layer/query_dense/vars": [
402
+ "model_00002.weights.h5"
403
+ ],
404
+ "/layers/qwen_moe_transformer_decoder_19/_self_attention_layer/value_dense/vars": [
405
+ "model_00002.weights.h5"
406
+ ],
407
+ "/layers/qwen_moe_transformer_decoder_19/_self_attention_layer/_output_dense/vars": "model_00002.weights.h5",
408
+ "/layers/qwen_moe_transformer_decoder_19/_self_attention_layernorm/vars": "model_00002.weights.h5",
409
+ "/layers/qwen_moe_transformer_decoder_20/mlp/expert_bank/vars": [
410
+ "model_00002.weights.h5"
411
+ ],
412
+ "/layers/qwen_moe_transformer_decoder_20/mlp/shared_expert_dense/_feedforward_gate_dense/vars": "model_00002.weights.h5",
413
+ "/layers/qwen_moe_transformer_decoder_20/mlp/shared_expert_dense/_feedforward_intermediate_dense/vars": "model_00002.weights.h5",
414
+ "/layers/qwen_moe_transformer_decoder_20/mlp/shared_expert_dense/_feedforward_output_dense/vars": "model_00002.weights.h5",
415
+ "/layers/qwen_moe_transformer_decoder_20/mlp/shared_expert_gate_dense/vars": "model_00002.weights.h5",
416
+ "/layers/qwen_moe_transformer_decoder_20/mlp/_sparse_feedforward_gate_dense/vars": "model_00002.weights.h5",
417
+ "/layers/qwen_moe_transformer_decoder_20/_feedforward_layernorm/vars": "model_00002.weights.h5",
418
+ "/layers/qwen_moe_transformer_decoder_20/_self_attention_layer/key_dense/vars": [
419
+ "model_00002.weights.h5"
420
+ ],
421
+ "/layers/qwen_moe_transformer_decoder_20/_self_attention_layer/query_dense/vars": [
422
+ "model_00002.weights.h5"
423
+ ],
424
+ "/layers/qwen_moe_transformer_decoder_20/_self_attention_layer/value_dense/vars": [
425
+ "model_00002.weights.h5"
426
+ ],
427
+ "/layers/qwen_moe_transformer_decoder_20/_self_attention_layer/_output_dense/vars": "model_00002.weights.h5",
428
+ "/layers/qwen_moe_transformer_decoder_20/_self_attention_layernorm/vars": "model_00002.weights.h5",
429
+ "/layers/qwen_moe_transformer_decoder_21/mlp/expert_bank/vars": [
430
+ "model_00002.weights.h5"
431
+ ],
432
+ "/layers/qwen_moe_transformer_decoder_21/mlp/shared_expert_dense/_feedforward_gate_dense/vars": "model_00002.weights.h5",
433
+ "/layers/qwen_moe_transformer_decoder_21/mlp/shared_expert_dense/_feedforward_intermediate_dense/vars": "model_00002.weights.h5",
434
+ "/layers/qwen_moe_transformer_decoder_21/mlp/shared_expert_dense/_feedforward_output_dense/vars": "model_00002.weights.h5",
435
+ "/layers/qwen_moe_transformer_decoder_21/mlp/shared_expert_gate_dense/vars": "model_00002.weights.h5",
436
+ "/layers/qwen_moe_transformer_decoder_21/mlp/_sparse_feedforward_gate_dense/vars": "model_00002.weights.h5",
437
+ "/layers/qwen_moe_transformer_decoder_21/_feedforward_layernorm/vars": "model_00002.weights.h5",
438
+ "/layers/qwen_moe_transformer_decoder_21/_self_attention_layer/key_dense/vars": [
439
+ "model_00002.weights.h5"
440
+ ],
441
+ "/layers/qwen_moe_transformer_decoder_21/_self_attention_layer/query_dense/vars": [
442
+ "model_00002.weights.h5"
443
+ ],
444
+ "/layers/qwen_moe_transformer_decoder_21/_self_attention_layer/value_dense/vars": [
445
+ "model_00002.weights.h5"
446
+ ],
447
+ "/layers/qwen_moe_transformer_decoder_21/_self_attention_layer/_output_dense/vars": "model_00002.weights.h5",
448
+ "/layers/qwen_moe_transformer_decoder_21/_self_attention_layernorm/vars": "model_00002.weights.h5",
449
+ "/layers/qwen_moe_transformer_decoder_22/mlp/expert_bank/vars": [
450
+ "model_00002.weights.h5"
451
+ ],
452
+ "/layers/qwen_moe_transformer_decoder_22/mlp/shared_expert_dense/_feedforward_gate_dense/vars": "model_00002.weights.h5",
453
+ "/layers/qwen_moe_transformer_decoder_22/mlp/shared_expert_dense/_feedforward_intermediate_dense/vars": "model_00002.weights.h5",
454
+ "/layers/qwen_moe_transformer_decoder_22/mlp/shared_expert_dense/_feedforward_output_dense/vars": "model_00002.weights.h5",
455
+ "/layers/qwen_moe_transformer_decoder_22/mlp/shared_expert_gate_dense/vars": "model_00002.weights.h5",
456
+ "/layers/qwen_moe_transformer_decoder_22/mlp/_sparse_feedforward_gate_dense/vars": "model_00002.weights.h5",
457
+ "/layers/qwen_moe_transformer_decoder_22/_feedforward_layernorm/vars": "model_00002.weights.h5",
458
+ "/layers/qwen_moe_transformer_decoder_22/_self_attention_layer/key_dense/vars": [
459
+ "model_00002.weights.h5"
460
+ ],
461
+ "/layers/qwen_moe_transformer_decoder_22/_self_attention_layer/query_dense/vars": [
462
+ "model_00002.weights.h5"
463
+ ],
464
+ "/layers/qwen_moe_transformer_decoder_22/_self_attention_layer/value_dense/vars": [
465
+ "model_00002.weights.h5"
466
+ ],
467
+ "/layers/qwen_moe_transformer_decoder_22/_self_attention_layer/_output_dense/vars": "model_00002.weights.h5",
468
+ "/layers/qwen_moe_transformer_decoder_22/_self_attention_layernorm/vars": "model_00002.weights.h5",
469
+ "/layers/qwen_moe_transformer_decoder_23/mlp/expert_bank/vars": [
470
+ "model_00002.weights.h5"
471
+ ],
472
+ "/layers/qwen_moe_transformer_decoder_23/mlp/shared_expert_dense/_feedforward_gate_dense/vars": "model_00002.weights.h5",
473
+ "/layers/qwen_moe_transformer_decoder_23/mlp/shared_expert_dense/_feedforward_intermediate_dense/vars": "model_00002.weights.h5",
474
+ "/layers/qwen_moe_transformer_decoder_23/mlp/shared_expert_dense/_feedforward_output_dense/vars": "model_00002.weights.h5",
475
+ "/layers/qwen_moe_transformer_decoder_23/mlp/shared_expert_gate_dense/vars": "model_00002.weights.h5",
476
+ "/layers/qwen_moe_transformer_decoder_23/mlp/_sparse_feedforward_gate_dense/vars": "model_00002.weights.h5",
477
+ "/layers/qwen_moe_transformer_decoder_23/_feedforward_layernorm/vars": "model_00002.weights.h5",
478
+ "/layers/qwen_moe_transformer_decoder_23/_self_attention_layer/key_dense/vars": [
479
+ "model_00002.weights.h5"
480
+ ],
481
+ "/layers/qwen_moe_transformer_decoder_23/_self_attention_layer/query_dense/vars": [
482
+ "model_00002.weights.h5"
483
+ ],
484
+ "/layers/qwen_moe_transformer_decoder_23/_self_attention_layer/value_dense/vars": [
485
+ "model_00002.weights.h5"
486
+ ],
487
+ "/layers/qwen_moe_transformer_decoder_23/_self_attention_layer/_output_dense/vars": "model_00002.weights.h5",
488
+ "/layers/qwen_moe_transformer_decoder_23/_self_attention_layernorm/vars": "model_00002.weights.h5",
489
+ "/layers/qwen_layer_norm/vars": "model_00002.weights.h5"
490
+ }
491
+ }
model_00000.weights.h5 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4d0cfefc0799613dbbec7c8713afaa68c84e657d7f855be037b673abb8a650db
3
+ size 10374018336
model_00001.weights.h5 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:149ff8eaafa5d3f24515332b82446d119ad15159722566a7279b055b53fcbff8
3
+ size 10270516040
model_00002.weights.h5 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:523451da6c7a24ca64243c13d559ef222d0db1a70768a4d5278be3dd79159370
3
+ size 7988179920
preprocessor.json ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "module": "keras_hub.src.models.qwen_moe.qwen_moe_causal_lm_preprocessor",
3
+ "class_name": "QwenMoeCausalLMPreprocessor",
4
+ "config": {
5
+ "name": "qwen_moe_causal_lm_preprocessor_1",
6
+ "trainable": true,
7
+ "dtype": {
8
+ "module": "keras",
9
+ "class_name": "DTypePolicy",
10
+ "config": {
11
+ "name": "float32"
12
+ },
13
+ "registered_name": null
14
+ },
15
+ "tokenizer": {
16
+ "module": "keras_hub.src.models.qwen_moe.qwen_moe_tokenizer",
17
+ "class_name": "QwenMoeTokenizer",
18
+ "config": {
19
+ "name": "qwen_moe_tokenizer",
20
+ "trainable": true,
21
+ "dtype": {
22
+ "module": "keras",
23
+ "class_name": "DTypePolicy",
24
+ "config": {
25
+ "name": "int32"
26
+ },
27
+ "registered_name": null
28
+ },
29
+ "config_file": "tokenizer.json",
30
+ "sequence_length": null,
31
+ "add_prefix_space": false,
32
+ "unsplittable_tokens": [
33
+ "<|im_end|>",
34
+ "<|endoftext|>",
35
+ "<|im_start|>"
36
+ ]
37
+ },
38
+ "registered_name": "keras_hub>QwenMoeTokenizer"
39
+ },
40
+ "config_file": "preprocessor.json",
41
+ "sequence_length": 1024,
42
+ "add_start_token": true,
43
+ "add_end_token": true
44
+ },
45
+ "registered_name": "keras_hub>QwenMoeCausalLMPreprocessor"
46
+ }
task.json ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "module": "keras_hub.src.models.qwen_moe.qwen_moe_causal_lm",
3
+ "class_name": "QwenMoeCausalLM",
4
+ "config": {
5
+ "backbone": {
6
+ "module": "keras_hub.src.models.qwen_moe.qwen_moe_backbone",
7
+ "class_name": "QwenMoeBackbone",
8
+ "config": {
9
+ "name": "qwen_moe_backbone",
10
+ "trainable": true,
11
+ "vocabulary_size": 151936,
12
+ "num_layers": 24,
13
+ "num_query_heads": 16,
14
+ "hidden_dim": 2048,
15
+ "intermediate_dim": 5632,
16
+ "moe_intermediate_dim": 1408,
17
+ "shared_expert_intermediate_dim": 5632,
18
+ "rope_max_wavelength": 1000000.0,
19
+ "num_key_value_heads": 16,
20
+ "rope_scaling_factor": 1.0,
21
+ "layer_norm_epsilon": 1e-06,
22
+ "dropout": 0,
23
+ "tie_word_embeddings": false,
24
+ "use_sliding_window_attention": false,
25
+ "sliding_window_size": 32768,
26
+ "num_experts": 60,
27
+ "top_k": 4,
28
+ "norm_top_k_prob": false,
29
+ "decoder_sparse_step": 1,
30
+ "mlp_only_layers": [],
31
+ "output_router_logits": false
32
+ },
33
+ "registered_name": "keras_hub>QwenMoeBackbone"
34
+ },
35
+ "preprocessor": {
36
+ "module": "keras_hub.src.models.qwen_moe.qwen_moe_causal_lm_preprocessor",
37
+ "class_name": "QwenMoeCausalLMPreprocessor",
38
+ "config": {
39
+ "name": "qwen_moe_causal_lm_preprocessor_1",
40
+ "trainable": true,
41
+ "dtype": {
42
+ "module": "keras",
43
+ "class_name": "DTypePolicy",
44
+ "config": {
45
+ "name": "float32"
46
+ },
47
+ "registered_name": null
48
+ },
49
+ "tokenizer": {
50
+ "module": "keras_hub.src.models.qwen_moe.qwen_moe_tokenizer",
51
+ "class_name": "QwenMoeTokenizer",
52
+ "config": {
53
+ "name": "qwen_moe_tokenizer",
54
+ "trainable": true,
55
+ "dtype": {
56
+ "module": "keras",
57
+ "class_name": "DTypePolicy",
58
+ "config": {
59
+ "name": "int32"
60
+ },
61
+ "registered_name": null
62
+ },
63
+ "config_file": "tokenizer.json",
64
+ "sequence_length": null,
65
+ "add_prefix_space": false,
66
+ "unsplittable_tokens": [
67
+ "<|im_end|>",
68
+ "<|endoftext|>",
69
+ "<|im_start|>"
70
+ ]
71
+ },
72
+ "registered_name": "keras_hub>QwenMoeTokenizer"
73
+ },
74
+ "config_file": "preprocessor.json",
75
+ "sequence_length": 1024,
76
+ "add_start_token": true,
77
+ "add_end_token": true
78
+ },
79
+ "registered_name": "keras_hub>QwenMoeCausalLMPreprocessor"
80
+ },
81
+ "name": "qwen_moe_causal_lm_1"
82
+ },
83
+ "registered_name": "keras_hub>QwenMoeCausalLM"
84
+ }
tokenizer.json ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "module": "keras_hub.src.models.qwen_moe.qwen_moe_tokenizer",
3
+ "class_name": "QwenMoeTokenizer",
4
+ "config": {
5
+ "name": "qwen_moe_tokenizer",
6
+ "trainable": true,
7
+ "dtype": {
8
+ "module": "keras",
9
+ "class_name": "DTypePolicy",
10
+ "config": {
11
+ "name": "int32"
12
+ },
13
+ "registered_name": null
14
+ },
15
+ "config_file": "tokenizer.json",
16
+ "sequence_length": null,
17
+ "add_prefix_space": false,
18
+ "unsplittable_tokens": [
19
+ "<|im_end|>",
20
+ "<|endoftext|>",
21
+ "<|im_start|>"
22
+ ]
23
+ },
24
+ "registered_name": "keras_hub>QwenMoeTokenizer"
25
+ }