kernelpool commited on
Commit
40d83c1
·
verified ·
1 Parent(s): 46ef93e

Add files using upload-large-folder tool

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: mit
3
+ language:
4
+ - en
5
+ base_model: inclusionAI/Ring-flash-linear-2.0
6
+ pipeline_tag: text-generation
7
+ library_name: mlx
8
+ tags:
9
+ - moe
10
+ - mlx
11
+ ---
12
+
13
+ # mlx-community/Ring-flash-linear-2.0-4bit
14
+
15
+ This model [mlx-community/Ring-flash-linear-2.0-4bit](https://huggingface.co/mlx-community/Ring-flash-linear-2.0-4bit) was
16
+ converted to MLX format from [inclusionAI/Ring-flash-linear-2.0](https://huggingface.co/inclusionAI/Ring-flash-linear-2.0)
17
+ using mlx-lm version **0.28.2**.
18
+
19
+ ## Use with mlx
20
+
21
+ ```bash
22
+ pip install mlx-lm
23
+ ```
24
+
25
+ ```python
26
+ from mlx_lm import load, generate
27
+
28
+ model, tokenizer = load("mlx-community/Ring-flash-linear-2.0-4bit")
29
+
30
+ prompt = "hello"
31
+
32
+ if tokenizer.chat_template is not None:
33
+ messages = [{"role": "user", "content": prompt}]
34
+ prompt = tokenizer.apply_chat_template(
35
+ messages, add_generation_prompt=True
36
+ )
37
+
38
+ response = generate(model, tokenizer, prompt=prompt, verbose=True)
39
+ ```
chat_template.jinja ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- if messages[0].role == 'system' %}
2
+ {{- '<role>SYSTEM</role>' + messages[0].content + '\n' }}
3
+ {%- elif tools %}
4
+ {{- '<role>SYSTEM</role>' }}
5
+ {%- else %}
6
+ {{- '<role>SYSTEM</role>你是一个智能助手(AI Assistant),是由蚂蚁集团的百灵团队(Bailing team)开发。You are an AI Assistant, developed by the Bailing team at Ant Group.\n' }}
7
+ {%- endif %}
8
+ {%- if tools %}
9
+ {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
10
+ {%- for tool in tools %}
11
+ {{- "\n" }}
12
+ {{- tool | tojson }}
13
+ {%- endfor %}
14
+ {{- "\n</tools>\n\nIf none of the functions can be used, point it out. If the given question lacks the parameters required by the function, also point it out. \nIf you need to use a function, for each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call>\n" }}
15
+ {%- endif %}
16
+ {%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
17
+ {%- for message in messages[::-1] %}
18
+ {%- set index = (messages|length - 1) - loop.index0 %}
19
+ {%- if ns.multi_step_tool and message.role == "user" and message.content is string and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
20
+ {%- set ns.multi_step_tool = false %}
21
+ {%- set ns.last_query_index = index %}
22
+ {%- endif %}
23
+ {%- endfor %}
24
+ {%- for message in messages %}
25
+ {%- if message.content is string %}
26
+ {%- set content = message.content %}
27
+ {%- else %}
28
+ {%- set content = '' %}
29
+ {%- endif %}
30
+ {%- if message.role == "user" %}
31
+ {{- '<role>HUMAN</role>' + message.content }}
32
+ {%- elif message.role == "system" and not loop.first %}
33
+ {{- '<role>SYSTEM</role>' + message.content }}
34
+ {%- elif message.role == "assistant" %}
35
+ {%- set reasoning_content = '' %}
36
+ {%- if message.reasoning_content is string %}
37
+ {%- set reasoning_content = message.reasoning_content %}
38
+ {%- else %}
39
+ {%- if '</think>' in content %}
40
+ {%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
41
+ {%- set content = content.split('</think>')[-1].lstrip('\n') %}
42
+ {%- endif %}
43
+ {%- endif %}
44
+ {%- if loop.index0 > ns.last_query_index %}
45
+ {%- if loop.last or (not loop.last and reasoning_content) %}
46
+ {{- '<role>ASSISTANT</role>' + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
47
+ {%- else %}
48
+ {{- '<role>ASSISTANT</role>' + content }}
49
+ {%- endif %}
50
+ {%- else %}
51
+ {{- '<role>ASSISTANT</role>' + content }}
52
+ {%- endif %}
53
+ {%- if message.tool_calls %}
54
+ {%- for tool_call in message.tool_calls %}
55
+ {%- if (loop.first and content) or (not loop.first) %}
56
+ {{- '\n' }}
57
+ {%- endif %}
58
+ {%- if tool_call.function %}
59
+ {%- set tool_call = tool_call.function %}
60
+ {%- endif %}
61
+ {{- '<tool_call>\n{"name": "' }}
62
+ {{- tool_call.name }}
63
+ {{- '", "arguments": ' }}
64
+ {%- if tool_call.arguments is string %}
65
+ {{- tool_call.arguments }}
66
+ {%- else %}
67
+ {{- tool_call.arguments | tojson }}
68
+ {%- endif %}
69
+ {{- '}\n</tool_call>' }}
70
+ {%- endfor %}
71
+ {%- endif %}
72
+ {%- elif message.role == "tool" %}
73
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
74
+ {{- '<role>OBSERVATION</role>' }}
75
+ {%- endif %}
76
+ {{- '\n<tool_response>\n' }}
77
+ {{- content }}
78
+ {{- '\n</tool_response>' }}
79
+ {%- endif %}
80
+ {%- endfor %}
81
+ {%- if add_generation_prompt %}
82
+ {{- '<role>ASSISTANT</role><think>\n' }}
83
+ {%- endif %}
config.json ADDED
@@ -0,0 +1,312 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "BailingMoeLinearV2ForCausalLM"
4
+ ],
5
+ "attention_dropout": 0.0,
6
+ "auto_map": {
7
+ "AutoConfig": "configuration_bailing_moe_linear_v2.BailingMoeLinearV2Config",
8
+ "AutoModel": "modeling_bailing_moe_linear_v2.BailingMoeLinearV2Model",
9
+ "AutoModelForCausalLM": "modeling_bailing_moe_linear_v2.BailingMoeLinearV2ForCausalLM"
10
+ },
11
+ "embedding_dropout": 0.0,
12
+ "eos_token_id": 156892,
13
+ "first_k_dense_replace": 1,
14
+ "group_norm_size": 4,
15
+ "head_dim": 128,
16
+ "hidden_act": "silu",
17
+ "hidden_size": 4096,
18
+ "intermediate_size": 9216,
19
+ "layer_group_size": 8,
20
+ "linear_silu": false,
21
+ "max_position_embeddings": 131072,
22
+ "model_type": "bailing_moe_linear",
23
+ "moe_intermediate_size": 1024,
24
+ "moe_router_enable_expert_bias": true,
25
+ "moe_shared_expert_intermediate_size": 1024,
26
+ "n_group": 8,
27
+ "norm_topk_prob": true,
28
+ "num_attention_heads": 32,
29
+ "num_experts": 256,
30
+ "num_experts_per_tok": 8,
31
+ "num_hidden_layers": 32,
32
+ "num_key_value_heads": 4,
33
+ "num_nextn_predict_layers": 0,
34
+ "num_shared_experts": 1,
35
+ "output_dropout": 0.0,
36
+ "pad_token_id": 156892,
37
+ "partial_rotary_factor": 0.5,
38
+ "quantization": {
39
+ "group_size": 64,
40
+ "bits": 4,
41
+ "mode": "affine",
42
+ "model.layers.1.mlp.gate.gate_proj": {
43
+ "group_size": 64,
44
+ "bits": 8
45
+ },
46
+ "model.layers.2.mlp.gate.gate_proj": {
47
+ "group_size": 64,
48
+ "bits": 8
49
+ },
50
+ "model.layers.3.mlp.gate.gate_proj": {
51
+ "group_size": 64,
52
+ "bits": 8
53
+ },
54
+ "model.layers.4.mlp.gate.gate_proj": {
55
+ "group_size": 64,
56
+ "bits": 8
57
+ },
58
+ "model.layers.5.mlp.gate.gate_proj": {
59
+ "group_size": 64,
60
+ "bits": 8
61
+ },
62
+ "model.layers.6.mlp.gate.gate_proj": {
63
+ "group_size": 64,
64
+ "bits": 8
65
+ },
66
+ "model.layers.7.mlp.gate.gate_proj": {
67
+ "group_size": 64,
68
+ "bits": 8
69
+ },
70
+ "model.layers.8.mlp.gate.gate_proj": {
71
+ "group_size": 64,
72
+ "bits": 8
73
+ },
74
+ "model.layers.9.mlp.gate.gate_proj": {
75
+ "group_size": 64,
76
+ "bits": 8
77
+ },
78
+ "model.layers.10.mlp.gate.gate_proj": {
79
+ "group_size": 64,
80
+ "bits": 8
81
+ },
82
+ "model.layers.11.mlp.gate.gate_proj": {
83
+ "group_size": 64,
84
+ "bits": 8
85
+ },
86
+ "model.layers.12.mlp.gate.gate_proj": {
87
+ "group_size": 64,
88
+ "bits": 8
89
+ },
90
+ "model.layers.13.mlp.gate.gate_proj": {
91
+ "group_size": 64,
92
+ "bits": 8
93
+ },
94
+ "model.layers.14.mlp.gate.gate_proj": {
95
+ "group_size": 64,
96
+ "bits": 8
97
+ },
98
+ "model.layers.15.mlp.gate.gate_proj": {
99
+ "group_size": 64,
100
+ "bits": 8
101
+ },
102
+ "model.layers.16.mlp.gate.gate_proj": {
103
+ "group_size": 64,
104
+ "bits": 8
105
+ },
106
+ "model.layers.17.mlp.gate.gate_proj": {
107
+ "group_size": 64,
108
+ "bits": 8
109
+ },
110
+ "model.layers.18.mlp.gate.gate_proj": {
111
+ "group_size": 64,
112
+ "bits": 8
113
+ },
114
+ "model.layers.19.mlp.gate.gate_proj": {
115
+ "group_size": 64,
116
+ "bits": 8
117
+ },
118
+ "model.layers.20.mlp.gate.gate_proj": {
119
+ "group_size": 64,
120
+ "bits": 8
121
+ },
122
+ "model.layers.21.mlp.gate.gate_proj": {
123
+ "group_size": 64,
124
+ "bits": 8
125
+ },
126
+ "model.layers.22.mlp.gate.gate_proj": {
127
+ "group_size": 64,
128
+ "bits": 8
129
+ },
130
+ "model.layers.23.mlp.gate.gate_proj": {
131
+ "group_size": 64,
132
+ "bits": 8
133
+ },
134
+ "model.layers.24.mlp.gate.gate_proj": {
135
+ "group_size": 64,
136
+ "bits": 8
137
+ },
138
+ "model.layers.25.mlp.gate.gate_proj": {
139
+ "group_size": 64,
140
+ "bits": 8
141
+ },
142
+ "model.layers.26.mlp.gate.gate_proj": {
143
+ "group_size": 64,
144
+ "bits": 8
145
+ },
146
+ "model.layers.27.mlp.gate.gate_proj": {
147
+ "group_size": 64,
148
+ "bits": 8
149
+ },
150
+ "model.layers.28.mlp.gate.gate_proj": {
151
+ "group_size": 64,
152
+ "bits": 8
153
+ },
154
+ "model.layers.29.mlp.gate.gate_proj": {
155
+ "group_size": 64,
156
+ "bits": 8
157
+ },
158
+ "model.layers.30.mlp.gate.gate_proj": {
159
+ "group_size": 64,
160
+ "bits": 8
161
+ },
162
+ "model.layers.31.mlp.gate.gate_proj": {
163
+ "group_size": 64,
164
+ "bits": 8
165
+ }
166
+ },
167
+ "quantization_config": {
168
+ "group_size": 64,
169
+ "bits": 4,
170
+ "mode": "affine",
171
+ "model.layers.1.mlp.gate.gate_proj": {
172
+ "group_size": 64,
173
+ "bits": 8
174
+ },
175
+ "model.layers.2.mlp.gate.gate_proj": {
176
+ "group_size": 64,
177
+ "bits": 8
178
+ },
179
+ "model.layers.3.mlp.gate.gate_proj": {
180
+ "group_size": 64,
181
+ "bits": 8
182
+ },
183
+ "model.layers.4.mlp.gate.gate_proj": {
184
+ "group_size": 64,
185
+ "bits": 8
186
+ },
187
+ "model.layers.5.mlp.gate.gate_proj": {
188
+ "group_size": 64,
189
+ "bits": 8
190
+ },
191
+ "model.layers.6.mlp.gate.gate_proj": {
192
+ "group_size": 64,
193
+ "bits": 8
194
+ },
195
+ "model.layers.7.mlp.gate.gate_proj": {
196
+ "group_size": 64,
197
+ "bits": 8
198
+ },
199
+ "model.layers.8.mlp.gate.gate_proj": {
200
+ "group_size": 64,
201
+ "bits": 8
202
+ },
203
+ "model.layers.9.mlp.gate.gate_proj": {
204
+ "group_size": 64,
205
+ "bits": 8
206
+ },
207
+ "model.layers.10.mlp.gate.gate_proj": {
208
+ "group_size": 64,
209
+ "bits": 8
210
+ },
211
+ "model.layers.11.mlp.gate.gate_proj": {
212
+ "group_size": 64,
213
+ "bits": 8
214
+ },
215
+ "model.layers.12.mlp.gate.gate_proj": {
216
+ "group_size": 64,
217
+ "bits": 8
218
+ },
219
+ "model.layers.13.mlp.gate.gate_proj": {
220
+ "group_size": 64,
221
+ "bits": 8
222
+ },
223
+ "model.layers.14.mlp.gate.gate_proj": {
224
+ "group_size": 64,
225
+ "bits": 8
226
+ },
227
+ "model.layers.15.mlp.gate.gate_proj": {
228
+ "group_size": 64,
229
+ "bits": 8
230
+ },
231
+ "model.layers.16.mlp.gate.gate_proj": {
232
+ "group_size": 64,
233
+ "bits": 8
234
+ },
235
+ "model.layers.17.mlp.gate.gate_proj": {
236
+ "group_size": 64,
237
+ "bits": 8
238
+ },
239
+ "model.layers.18.mlp.gate.gate_proj": {
240
+ "group_size": 64,
241
+ "bits": 8
242
+ },
243
+ "model.layers.19.mlp.gate.gate_proj": {
244
+ "group_size": 64,
245
+ "bits": 8
246
+ },
247
+ "model.layers.20.mlp.gate.gate_proj": {
248
+ "group_size": 64,
249
+ "bits": 8
250
+ },
251
+ "model.layers.21.mlp.gate.gate_proj": {
252
+ "group_size": 64,
253
+ "bits": 8
254
+ },
255
+ "model.layers.22.mlp.gate.gate_proj": {
256
+ "group_size": 64,
257
+ "bits": 8
258
+ },
259
+ "model.layers.23.mlp.gate.gate_proj": {
260
+ "group_size": 64,
261
+ "bits": 8
262
+ },
263
+ "model.layers.24.mlp.gate.gate_proj": {
264
+ "group_size": 64,
265
+ "bits": 8
266
+ },
267
+ "model.layers.25.mlp.gate.gate_proj": {
268
+ "group_size": 64,
269
+ "bits": 8
270
+ },
271
+ "model.layers.26.mlp.gate.gate_proj": {
272
+ "group_size": 64,
273
+ "bits": 8
274
+ },
275
+ "model.layers.27.mlp.gate.gate_proj": {
276
+ "group_size": 64,
277
+ "bits": 8
278
+ },
279
+ "model.layers.28.mlp.gate.gate_proj": {
280
+ "group_size": 64,
281
+ "bits": 8
282
+ },
283
+ "model.layers.29.mlp.gate.gate_proj": {
284
+ "group_size": 64,
285
+ "bits": 8
286
+ },
287
+ "model.layers.30.mlp.gate.gate_proj": {
288
+ "group_size": 64,
289
+ "bits": 8
290
+ },
291
+ "model.layers.31.mlp.gate.gate_proj": {
292
+ "group_size": 64,
293
+ "bits": 8
294
+ }
295
+ },
296
+ "rms_norm_eps": 1e-06,
297
+ "rope_scaling": null,
298
+ "rope_theta": 600000,
299
+ "routed_scaling_factor": 2.5,
300
+ "router_dtype": "fp32",
301
+ "score_function": "sigmoid",
302
+ "tie_word_embeddings": false,
303
+ "topk_group": 4,
304
+ "torch_dtype": "bfloat16",
305
+ "transformers_version": "4.56.1",
306
+ "use_bias": false,
307
+ "use_cache": true,
308
+ "use_qk_norm": true,
309
+ "use_qkv_bias": false,
310
+ "use_rmsnorm": true,
311
+ "vocab_size": 157184
312
+ }
configuration_bailing_moe_linear_v2.py ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Bailing MoE V2 model configuration"""
2
+
3
+ from transformers.configuration_utils import PretrainedConfig
4
+
5
+
6
+ class BailingMoeLinearV2Config(PretrainedConfig):
7
+
8
+ def __init__(
9
+ self,
10
+ vocab_size=157184,
11
+ hidden_size=2048,
12
+ intermediate_size=5120,
13
+ num_hidden_layers=20,
14
+ num_attention_heads=16,
15
+ num_key_value_heads=4,
16
+ hidden_act="silu",
17
+ use_qkv_bias=False, # bailing only
18
+ use_bias=False, # bailing only
19
+ rms_norm_eps=1e-06,
20
+ tie_word_embeddings=False, # PretrainedConfig key, here change default value.
21
+ embedding_dropout=0.0,
22
+ attention_dropout=0.0,
23
+ output_dropout=0.0,
24
+ initializer_range=0.02,
25
+ max_position_embeddings=32768,
26
+ rope_theta=600000.0,
27
+ use_cache=True,
28
+ max_window_layers=20,
29
+ rope_scaling=None,
30
+ pad_token_id=156892,
31
+ eos_token_id=156892,
32
+ num_experts=256,
33
+ num_shared_experts=1,
34
+ num_experts_per_tok=8,
35
+ n_group=8,
36
+ topk_group=4,
37
+ moe_intermediate_size=512,
38
+ first_k_dense_replace=1,
39
+ head_dim=128,
40
+ output_router_logits=False,
41
+ use_qk_norm=True,
42
+ num_nextn_predict_layers=0,
43
+ mtp_loss_scaling_factor=0,
44
+ moe_router_enable_expert_bias=True,
45
+ routed_scaling_factor=1.0,
46
+ layer_group_size=1,
47
+ group_norm_size=1,
48
+ linear_silu=False,
49
+ **kwargs,
50
+ ):
51
+ self.num_hidden_layers = num_hidden_layers
52
+ self.vocab_size = vocab_size
53
+ self.hidden_size = hidden_size
54
+ self.intermediate_size = intermediate_size
55
+ self.num_attention_heads = num_attention_heads
56
+ self.num_key_value_heads = num_key_value_heads
57
+ self.hidden_act = hidden_act
58
+ self.use_qkv_bias = use_qkv_bias
59
+ self.use_bias = use_bias
60
+ self.rms_norm_eps = rms_norm_eps
61
+ self.embedding_dropout = embedding_dropout
62
+ self.attention_dropout = attention_dropout
63
+ self.output_dropout = output_dropout
64
+ self.num_nextn_predict_layers = num_nextn_predict_layers
65
+ self.mtp_loss_scaling_factor = mtp_loss_scaling_factor
66
+ self.initializer_range = initializer_range
67
+ self.max_position_embeddings = max_position_embeddings
68
+ self.rope_theta = rope_theta
69
+ self.use_cache = use_cache
70
+ self.max_window_layers = max_window_layers
71
+ self.head_dim = head_dim or self.hidden_size // self.num_attention_heads
72
+ self.rope_scaling = rope_scaling
73
+ self.use_qk_norm = use_qk_norm
74
+ self.moe_router_enable_expert_bias = moe_router_enable_expert_bias
75
+ self.routed_scaling_factor = routed_scaling_factor
76
+
77
+ # MoE configs
78
+ self.num_experts = num_experts
79
+ self.num_shared_experts = num_shared_experts
80
+ self.num_experts_per_tok = num_experts_per_tok
81
+ self.n_group = n_group
82
+ self.topk_group = topk_group
83
+ self.moe_intermediate_size = moe_intermediate_size
84
+ self.first_k_dense_replace = first_k_dense_replace
85
+ self.output_router_logits = output_router_logits
86
+
87
+ # Linear configs
88
+ self.layer_group_size = layer_group_size
89
+ self.group_norm_size = group_norm_size
90
+ self.linear_silu = linear_silu
91
+
92
+ super().__init__(pad_token_id=pad_token_id, eos_token_id=eos_token_id, tie_word_embeddings=tie_word_embeddings, **kwargs)
generation_config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 156891,
3
+ "eos_token_id": [
4
+ 156892,
5
+ 156895
6
+ ],
7
+ "pad_token_id": 156892,
8
+ "transformers_version": "4.56.1"
9
+ }
model-00001-of-00012.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7be5d31f092df63867c10cccd6a2f70a60edfe22c87853c4ac3de9ad15f81a43
3
+ size 4858938458
model-00002-of-00012.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1253ea7c208d0039d76ebbc215cf828a22b7471f3be40ace65a5187cda13f198
3
+ size 4998062930
model-00003-of-00012.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9ed4213a57a665100e548ed1004050ca9d5c1cd91cbb6a941153f641871bf835
3
+ size 4916694848
model-00004-of-00012.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:92c424b4b2016fd7ed225d795aa0ba5b579c3a20b0603d1be77ea819b52339e3
3
+ size 4998063073
model-00005-of-00012.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2ff401251181c38555edcc9d4b6e1807f39101e9adc64a270b9d2125ca8fdd01
3
+ size 4998063007
model-00006-of-00012.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:829c8c2b7e6f47dd7443a335cda2da6ec763227fa158fdf7b8ff82c8db7a9ba2
3
+ size 4916694926
model-00007-of-00012.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e3a63fa3acb4217387681462e8e224034ff8c7235408e7dcfc590673963c0177
3
+ size 4998063017
model-00008-of-00012.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:88e47ee7ffc46cb388538b611cdb8da1f2d8624945ee59b12a3425deb96db8eb
3
+ size 4998063111
model-00009-of-00012.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:66df737483696122fb66be1a22bd2c35aa334932379ec3bf03ba3114f2ba8120
3
+ size 4916694902
model-00010-of-00012.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:206b6c30eb47c3cc31c91d859663aacaf965398ce81bcc6c28dcfa5799c39aad
3
+ size 4998063129
model-00011-of-00012.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2140ccd4dad8ae192ebb8191fd71155960ae8de521d54341fc30bd2e8598cfed
3
+ size 4998063019
model-00012-of-00012.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c8bf9dcb9be6ce319e15021ed624982b41e7134ced333eee0325585b27f2cb34
3
+ size 4023699062
model.safetensors.index.json ADDED
The diff for this file is too large to render. See raw diff
 
modeling_bailing_moe_linear_v2.py ADDED
@@ -0,0 +1,1758 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2025 Antgroup and The HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
5
+ # and OPT implementations in this library. It has been modified from its
6
+ # original forms to accommodate minor architectural differences compared
7
+ # to GPT-NeoX and OPT used by the Meta AI team that trained the model.
8
+ #
9
+ # Licensed under the Apache License, Version 2.0 (the "License");
10
+ # you may not use this file except in compliance with the License.
11
+ # You may obtain a copy of the License at
12
+ #
13
+ # http://www.apache.org/licenses/LICENSE-2.0
14
+ #
15
+ # Unless required by applicable law or agreed to in writing, software
16
+ # distributed under the License is distributed on an "AS IS" BASIS,
17
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18
+ # See the License for the specific language governing permissions and
19
+ # limitations under the License.
20
+ """PyTorch BailingMoE model."""
21
+
22
+ import math
23
+ import warnings
24
+ from typing import List, Optional, Tuple, Union
25
+
26
+ import torch
27
+ import torch.nn.functional as F
28
+ from torch import nn
29
+
30
+ from transformers.activations import ACT2FN
31
+ from transformers.cache_utils import Cache, DynamicCache
32
+ from transformers.modeling_attn_mask_utils import (
33
+ AttentionMaskConverter,
34
+ _prepare_4d_attention_mask,
35
+ _prepare_4d_causal_attention_mask,
36
+ _prepare_4d_causal_attention_mask_for_sdpa,
37
+ )
38
+ from transformers.modeling_outputs import MoeModelOutputWithPast
39
+ from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
40
+ from transformers.modeling_utils import PreTrainedModel
41
+ from transformers.pytorch_utils import ALL_LAYERNORM_LAYERS, is_torch_greater_or_equal_than_1_13
42
+ from transformers.utils import (
43
+ add_start_docstrings,
44
+ add_start_docstrings_to_model_forward,
45
+ is_flash_attn_2_available,
46
+ is_flash_attn_greater_or_equal_2_10,
47
+ logging,
48
+ replace_return_docstrings,
49
+ )
50
+ from transformers.utils.import_utils import is_torch_fx_available
51
+ from .configuration_bailing_moe_linear_v2 import BailingMoeLinearV2Config
52
+ from transformers.generation.utils import GenerationMixin
53
+ from dataclasses import dataclass
54
+ from transformers.utils import ModelOutput
55
+
56
+
57
+ if is_flash_attn_2_available():
58
+ from flash_attn import flash_attn_func, flash_attn_varlen_func
59
+ from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa
60
+
61
+ from fla.ops.simple_gla.fused_recurrent import fused_recurrent_simple_gla
62
+ from fla.ops.simple_gla.chunk import chunk_simple_gla
63
+
64
+
65
+ # This makes `_prepare_4d_causal_attention_mask` a leaf function in the FX graph.
66
+ # It means that the function will not be traced through and simply appear as a node in the graph.
67
+ if is_torch_fx_available():
68
+ if not is_torch_greater_or_equal_than_1_13:
69
+ import torch.fx
70
+
71
+ _prepare_4d_causal_attention_mask = torch.fx.wrap(_prepare_4d_causal_attention_mask)
72
+
73
+
74
+ logger = logging.get_logger(__name__)
75
+
76
+ _CONFIG_FOR_DOC = "BailingMoeLinearV2Config"
77
+
78
+
79
+ def roll_tensor(tensor, shifts=-1, dims=-1, fill_value=0):
80
+ """Roll the tensor input along the given dimension(s).
81
+ Inserted elements are set to be 0.0.
82
+ """
83
+ rolled_tensor = torch.roll(tensor, shifts=shifts, dims=dims)
84
+ rolled_tensor.select(dims, shifts).fill_(fill_value)
85
+ return rolled_tensor, rolled_tensor.sum()
86
+
87
+
88
+ @dataclass
89
+ class MoEV2CausalLMOutputWithPast(ModelOutput):
90
+ """
91
+ Base class for causal language model (or autoregressive) outputs as well as Mixture of Expert's router hidden
92
+ states terms, to train a MoE model.
93
+ Args:
94
+ loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
95
+ Language modeling loss (for next-token prediction).
96
+ logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
97
+ Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
98
+ past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
99
+ It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).
100
+ Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
101
+ `past_key_values` input) to speed up sequential decoding.
102
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
103
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
104
+ one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
105
+ Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
106
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
107
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
108
+ sequence_length)`.
109
+ Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
110
+ heads.
111
+ z_loss (`torch.FloatTensor`, *optional*, returned when `labels` is provided):
112
+ z_loss for the sparse modules.
113
+ aux_loss (`torch.FloatTensor`, *optional*, returned when `labels` is provided):
114
+ aux_loss for the sparse modules.
115
+ router_logits (`tuple(torch.FloatTensor)`, *optional*, returned when `output_router_logits=True` is passed or when `config.add_router_probs=True`):
116
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, sequence_length, num_experts)`.
117
+ Router logits of the encoder model, useful to compute the auxiliary loss and the z_loss for the sparse
118
+ modules.
119
+ """
120
+
121
+ loss: Optional[torch.FloatTensor] = None
122
+ logits: Optional[torch.FloatTensor] = None
123
+ past_key_values: Optional[Cache] = None
124
+ hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
125
+ attentions: Optional[tuple[torch.FloatTensor, ...]] = None
126
+ z_loss: Optional[torch.FloatTensor] = None
127
+ aux_loss: Optional[torch.FloatTensor] = None
128
+ router_logits: Optional[tuple[torch.FloatTensor]] = None
129
+ mtp_loss: Optional[torch.FloatTensor] = None
130
+ mtp_logits: Optional[tuple[torch.FloatTensor, ...]] = None
131
+
132
+
133
+ class MoeV2ModelOutputWithPast(MoeModelOutputWithPast):
134
+
135
+ def __init__(self, mtp_hidden_states=None, **kwargs):
136
+ super().__init__(**kwargs)
137
+ self.mtp_hidden_states = mtp_hidden_states
138
+
139
+
140
+ def _get_unpad_data(attention_mask):
141
+ seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
142
+ indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
143
+ max_seqlen_in_batch = seqlens_in_batch.max().item()
144
+ cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.torch.int32), (1, 0))
145
+ return (
146
+ indices,
147
+ cu_seqlens,
148
+ max_seqlen_in_batch,
149
+ )
150
+
151
+
152
+ def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
153
+ warnings.warn(
154
+ "Calling `transformers.models.BailingMoeV2.modeling_BailingMoeV2._prepare_4d_attention_mask` is deprecated and will be removed in v4.37. Use `transformers.modeling_attn_mask_utils._prepare_4d_attention_mask"
155
+ )
156
+ return _prepare_4d_attention_mask(mask=mask, dtype=dtype, tgt_len=tgt_len)
157
+
158
+
159
+ def _make_causal_mask(
160
+ input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0
161
+ ):
162
+ warnings.warn(
163
+ "Calling `transformers.models.BailingMoeV2.modeling_BailingMoeV2._make_causal_mask` is deprecated and will be removed in v4.37. Use `transformers.models.BailingMoeV2.modeling_BailingMoeV2.AttentionMaskConverter._make_causal_mask"
164
+ )
165
+ return AttentionMaskConverter._make_causal_mask(
166
+ input_ids_shape=input_ids_shape, dtype=dtype, device=device, past_key_values_length=past_key_values_length
167
+ )
168
+
169
+
170
+ class BailingMoeV2RMSNorm(nn.Module):
171
+ def __init__(self, hidden_size, eps=1e-6):
172
+ """
173
+ BailingMoeV2RMSNorm is equivalent to T5LayerNorm
174
+ """
175
+ super().__init__()
176
+ self.weight = nn.Parameter(torch.ones(hidden_size))
177
+ self.variance_epsilon = eps
178
+
179
+ def forward(self, hidden_states):
180
+ input_dtype = hidden_states.dtype
181
+ hidden_states = hidden_states.to(torch.float32)
182
+ variance = hidden_states.pow(2).mean(-1, keepdim=True)
183
+ hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
184
+ return self.weight * hidden_states.to(input_dtype)
185
+
186
+
187
+ class BailingMoeV2GroupRMSNorm(nn.Module):
188
+ def __init__(self, hidden_size, group_norm_size, eps=1e-6):
189
+ """
190
+ BailingMoeV2RMSNorm is equivalent to T5LayerNorm
191
+ """
192
+ super().__init__()
193
+ self.weight = nn.Parameter(torch.ones(hidden_size))
194
+ self.group_norm_size = group_norm_size
195
+ assert hidden_size % group_norm_size == 0, "hidden_size must be divisible by group_norm_size"
196
+ self.variance_epsilon = eps
197
+
198
+ def forward(self, hidden_states):
199
+ input_dtype = hidden_states.dtype
200
+ input_shape = hidden_states.size()
201
+ group_input_shape = input_shape[:-1] + (self.group_norm_size, input_shape[-1] // self.group_norm_size)
202
+ hidden_states = hidden_states.view(group_input_shape)
203
+ hidden_states = hidden_states.to(torch.float32)
204
+ variance = hidden_states.pow(2).mean(-1, keepdim=True)
205
+ hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
206
+ return self.weight * hidden_states.to(input_dtype).view(input_shape)
207
+
208
+
209
+ ALL_LAYERNORM_LAYERS.append(BailingMoeV2RMSNorm)
210
+
211
+
212
+ class BailingMoeV2RotaryEmbedding(nn.Module):
213
+ def __init__(self, config: BailingMoeLinearV2Config, device=None):
214
+ super().__init__()
215
+ # BC: "rope_type" was originally "type"
216
+ if hasattr(config, "rope_scaling") and config.rope_scaling is not None:
217
+ self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
218
+ else:
219
+ self.rope_type = "default"
220
+ self.max_seq_len_cached = config.max_position_embeddings
221
+ self.original_max_seq_len = config.max_position_embeddings
222
+
223
+ self.config = config
224
+ self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
225
+
226
+ inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
227
+ self.register_buffer("inv_freq", inv_freq, persistent=False)
228
+ self.original_inv_freq = self.inv_freq
229
+
230
+ @torch.no_grad()
231
+ @dynamic_rope_update # power user: used with advanced RoPE types (e.g. dynamic rope)
232
+ def forward(self, x, position_ids):
233
+ inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
234
+ position_ids_expanded = position_ids[:, None, :].float()
235
+
236
+ device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
237
+ with torch.autocast(device_type=device_type, enabled=False): # Force float32
238
+ freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
239
+ emb = torch.cat((freqs, freqs), dim=-1)
240
+ cos = emb.cos() * self.attention_scaling
241
+ sin = emb.sin() * self.attention_scaling
242
+
243
+ return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
244
+
245
+
246
+ # Copied from transformers.models.llama.modeling_llama.rotate_half
247
+ def rotate_half(x):
248
+ """Rotates half the hidden dims of the input."""
249
+ x1 = x[..., : x.shape[-1] // 2]
250
+ x2 = x[..., x.shape[-1] // 2 :]
251
+ return torch.cat((-x2, x1), dim=-1)
252
+
253
+
254
+ # Copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb
255
+ def apply_rotary_pos_emb(q, k, cos, sin, unsqueeze_dim=1):
256
+ """Applies Rotary Position Embedding to the query and key tensors.
257
+ Args:
258
+ q (`torch.Tensor`): The query tensor.
259
+ k (`torch.Tensor`): The key tensor.
260
+ cos (`torch.Tensor`): The cosine part of the rotary embedding.
261
+ sin (`torch.Tensor`): The sine part of the rotary embedding.
262
+ unsqueeze_dim (`int`, *optional*, defaults to 1):
263
+ The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
264
+ sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
265
+ that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
266
+ k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
267
+ cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
268
+ the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
269
+ Returns:
270
+ `tuple(torch.Tensor)` comprising the query and key tensors rotated using the Rotary Position Embedding.
271
+ """
272
+ cos = cos.unsqueeze(unsqueeze_dim)
273
+ sin = sin.unsqueeze(unsqueeze_dim)
274
+
275
+ # Keep half or full tensor for later concatenation
276
+ rotary_dim = cos.shape[-1]
277
+ q_rot, q_pass = q[..., :rotary_dim], q[..., rotary_dim:]
278
+ k_rot, k_pass = k[..., :rotary_dim], k[..., rotary_dim:]
279
+
280
+ # Apply rotary embeddings on the first half or full tensor
281
+ q_embed = (q_rot * cos) + (rotate_half(q_rot) * sin)
282
+ k_embed = (k_rot * cos) + (rotate_half(k_rot) * sin)
283
+
284
+ # Concatenate back to full shape
285
+ q_embed = torch.cat([q_embed, q_pass], dim=-1)
286
+ k_embed = torch.cat([k_embed, k_pass], dim=-1)
287
+ return q_embed, k_embed
288
+
289
+
290
+ class BailingMoeV2MLP(nn.Module):
291
+ def __init__(self, config: BailingMoeLinearV2Config, intermediate_size: int):
292
+ super().__init__()
293
+ self.config = config
294
+ self.hidden_size = config.hidden_size
295
+ self.intermediate_size = intermediate_size
296
+
297
+ self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
298
+ self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
299
+ self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
300
+ self.act_fn = ACT2FN[config.hidden_act]
301
+
302
+ def forward(self, x):
303
+ return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
304
+
305
+
306
+ class BailingMoeV2Gate(nn.Module):
307
+ def __init__(self, config):
308
+ super().__init__()
309
+ self.config = config
310
+ self.top_k = config.num_experts_per_tok
311
+ self.num_experts = config.num_experts
312
+
313
+ self.n_group = config.n_group
314
+ self.topk_group = config.topk_group
315
+
316
+ # topk selection algorithm
317
+ self.gating_dim = config.hidden_size
318
+ self.weight = nn.Parameter(torch.empty((self.num_experts, self.gating_dim)))
319
+ self.routed_scaling_factor = config.routed_scaling_factor
320
+
321
+ self.register_buffer("expert_bias", torch.zeros((self.num_experts)))
322
+ self.reset_parameters()
323
+
324
+ def reset_parameters(self) -> None:
325
+ import torch.nn.init as init
326
+
327
+ init.kaiming_uniform_(self.weight, a=math.sqrt(5))
328
+
329
+ def group_limited_topk(
330
+ self,
331
+ scores: torch.Tensor,
332
+ ):
333
+ num_tokens, _ = scores.size()
334
+ # Organize the experts into groups
335
+ group_scores = scores.view(num_tokens, self.n_group, -1).topk(2, dim=-1)[0].sum(dim=-1)
336
+ group_idx = torch.topk(group_scores, k=self.topk_group, dim=-1, sorted=False)[1]
337
+ group_mask = torch.zeros_like(group_scores)
338
+ group_mask.scatter_(1, group_idx, 1)
339
+
340
+ # Mask the experts based on selection groups
341
+ score_mask = (
342
+ group_mask.unsqueeze(-1)
343
+ .expand(num_tokens, self.n_group, self.num_experts // self.n_group)
344
+ .reshape(num_tokens, -1)
345
+ )
346
+
347
+ masked_scores = scores.masked_fill(~score_mask.bool(), float('-inf'))
348
+ probs, top_indices = torch.topk(masked_scores, k=self.top_k, dim=-1)
349
+
350
+ return probs, top_indices
351
+
352
+ def forward(self, hidden_states):
353
+ # compute gating score
354
+ hidden_states = hidden_states.view(-1, hidden_states.shape[-1])
355
+ logits = F.linear(hidden_states.type(torch.float32), self.weight.type(torch.float32))
356
+
357
+ scores = torch.sigmoid(logits.float()).type_as(logits)
358
+
359
+ scores_for_routing = scores + self.expert_bias
360
+ _, topk_idx = self.group_limited_topk(scores_for_routing)
361
+
362
+ scores = torch.gather(scores, dim=1, index=topk_idx).type_as(logits)
363
+
364
+ topk_weight = scores / (scores.sum(dim=-1, keepdim=True) + 1e-20) if self.top_k > 1 else scores
365
+ topk_weight = topk_weight * self.routed_scaling_factor
366
+
367
+ return topk_idx, topk_weight, logits
368
+
369
+
370
+ class BailingMoeV2SparseMoeBlock(nn.Module):
371
+ """
372
+ A mixed expert module containing shared experts.
373
+ """
374
+
375
+ def __init__(self, config: BailingMoeLinearV2Config):
376
+ super().__init__()
377
+ self.config = config
378
+ self.num_experts_per_tok = config.num_experts_per_tok
379
+ self._setup_experts()
380
+ self.gate = BailingMoeV2Gate(config)
381
+ if config.num_shared_experts is not None:
382
+ self.shared_experts = BailingMoeV2MLP(
383
+ config=config, intermediate_size=config.moe_intermediate_size * config.num_shared_experts
384
+ )
385
+
386
+ def _setup_experts(self):
387
+ self.experts = nn.ModuleList(
388
+ [
389
+ BailingMoeV2MLP(config=self.config, intermediate_size=self.config.moe_intermediate_size)
390
+ for _ in range(self.config.num_experts)
391
+ ]
392
+ )
393
+
394
+ def forward(self, hidden_states):
395
+ identity = hidden_states
396
+ bsz, seq_len, h = hidden_states.shape
397
+ topk_idx, topk_weight, router_logits = self.gate(hidden_states)
398
+ hidden_states = hidden_states.view(-1, hidden_states.shape[-1])
399
+ flat_topk_idx = topk_idx.view(-1)
400
+ if self.training:
401
+ hidden_states = hidden_states.repeat_interleave(self.num_experts_per_tok, dim=0)
402
+ y = torch.empty_like(hidden_states)
403
+ for i, expert in enumerate(self.experts):
404
+ y[flat_topk_idx == i] = expert(hidden_states[flat_topk_idx == i])
405
+ y = (y.view(*topk_weight.shape, -1) * topk_weight.unsqueeze(-1)).sum(dim=1)
406
+ y = y.to(hidden_states.dtype).view(bsz, seq_len, h)
407
+ else:
408
+ y = self.moe_infer(hidden_states, topk_idx, topk_weight).view(bsz, seq_len, h)
409
+ if self.config.num_shared_experts is not None:
410
+ y = y + self.shared_experts(identity)
411
+ return y, (router_logits.view(bsz, seq_len, -1), topk_idx.view(bsz, seq_len, -1))
412
+
413
+ @torch.no_grad()
414
+ def moe_infer(self, x, topk_ids, topk_weight):
415
+ cnts = topk_ids.new_zeros((topk_ids.shape[0], len(self.experts)))
416
+ cnts.scatter_(1, topk_ids, 1)
417
+ tokens_per_expert = cnts.sum(dim=0)
418
+ idxs = topk_ids.view(-1).argsort()
419
+ sorted_tokens = x[idxs // topk_ids.shape[1]]
420
+ tokens_per_expert = tokens_per_expert.cpu().numpy()
421
+ outputs = []
422
+ start_idx = 0
423
+ for i, num_tokens in enumerate(tokens_per_expert):
424
+ end_idx = start_idx + num_tokens
425
+ if num_tokens == 0:
426
+ continue
427
+ expert = self.experts[i]
428
+ tokens_for_this_expert = sorted_tokens[start_idx:end_idx]
429
+ expert_out = expert(tokens_for_this_expert)
430
+ outputs.append(expert_out.to(x.device))
431
+ start_idx = end_idx
432
+
433
+ outs = torch.cat(outputs, dim=0) if len(outputs) else sorted_tokens.new_empty(0)
434
+ new_x = torch.empty_like(outs)
435
+ new_x[idxs] = outs
436
+ final_out = (
437
+ new_x.view(*topk_ids.shape, -1)
438
+ .type(topk_weight.dtype)
439
+ .mul_(topk_weight.unsqueeze(dim=-1))
440
+ .sum(dim=1)
441
+ .type(new_x.dtype)
442
+ )
443
+ return final_out
444
+
445
+
446
+ # Copied from transformers.models.llama.modeling_llama.repeat_kv
447
+ def repeat_kv(hidden_states: torch.Tensor, n_rep: int, head_first: bool = True) -> torch.Tensor:
448
+ """
449
+ This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). If head_first is True, the hidden states go from (batch,
450
+ num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
451
+ """
452
+ if n_rep == 1:
453
+ return hidden_states
454
+ if head_first:
455
+ batch, num_key_value_heads, slen, head_dim = hidden_states.shape
456
+ hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
457
+ return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
458
+ else:
459
+ batch, slen, num_key_value_heads, head_dim = hidden_states.shape
460
+ hidden_states = hidden_states[:, :, :, None, :].expand(batch, slen, num_key_value_heads, n_rep, head_dim)
461
+ return hidden_states.reshape(batch, slen, num_key_value_heads * n_rep, head_dim)
462
+
463
+
464
+ # Copied from transformers.models.llama.modeling_llama.LlamaAttention with Llama->BailingMoeV2
465
+ class BailingMoeV2Attention(nn.Module):
466
+ """Multi-headed attention from 'Attention Is All You Need' paper"""
467
+
468
+ def __init__(self, config: BailingMoeLinearV2Config, layer_idx: Optional[int] = None):
469
+ super().__init__()
470
+ self.config = config
471
+ self.layer_idx = layer_idx
472
+ if layer_idx is None:
473
+ logger.warning_once(
474
+ f"Instantiating {self.__class__.__name__} without passing `layer_idx` is not recommended and will "
475
+ "to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` "
476
+ "when creating this class."
477
+ )
478
+
479
+ self.attention_dropout = config.attention_dropout
480
+ self.hidden_size = config.hidden_size
481
+ self.num_heads = config.num_attention_heads
482
+ self.head_dim = config.head_dim or self.hidden_size // self.num_heads
483
+ partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0
484
+ self.rope_dim = int(self.head_dim * partial_rotary_factor)
485
+ self.num_key_value_heads = config.num_key_value_heads
486
+ self.num_key_value_groups = self.num_heads // self.num_key_value_heads
487
+ self.max_position_embeddings = config.max_position_embeddings
488
+ self.rope_theta = config.rope_theta
489
+ self.is_causal = True
490
+
491
+ self.query_key_value = nn.Linear(
492
+ self.hidden_size,
493
+ (self.num_heads + 2 * self.num_key_value_heads) * self.head_dim,
494
+ bias=config.use_qkv_bias,
495
+ )
496
+
497
+ if self.config.use_qk_norm:
498
+ self.query_layernorm = BailingMoeV2RMSNorm(self.head_dim, eps=config.rms_norm_eps)
499
+ self.key_layernorm = BailingMoeV2RMSNorm(self.head_dim, eps=config.rms_norm_eps)
500
+ self.dense = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=config.use_bias)
501
+
502
+ def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
503
+ return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
504
+
505
+ def forward(
506
+ self,
507
+ hidden_states: torch.Tensor,
508
+ attention_mask: Optional[torch.Tensor] = None,
509
+ position_ids: Optional[torch.LongTensor] = None,
510
+ past_key_value: Optional[Cache] = None,
511
+ output_attentions: bool = False,
512
+ use_cache: bool = False,
513
+ position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC
514
+ **kwargs,
515
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
516
+
517
+ bsz, q_len, _ = hidden_states.size()
518
+
519
+ qkv = self.query_key_value(hidden_states)
520
+ qkv = qkv.view(bsz, q_len, self.num_heads + 2 * self.num_key_value_heads, self.head_dim)
521
+
522
+ query_states, key_states, value_states = qkv.split(
523
+ [self.num_heads, self.num_key_value_heads, self.num_key_value_heads], dim=-2
524
+ )
525
+ query_states = query_states.transpose(1, 2)
526
+ key_states = key_states.transpose(1, 2)
527
+ value_states = value_states.transpose(1, 2)
528
+
529
+ if self.config.use_qk_norm:
530
+ query_states = self.query_layernorm(query_states)
531
+ key_states = self.key_layernorm(key_states)
532
+
533
+ cos, sin = position_embeddings
534
+ query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
535
+
536
+ if past_key_value is not None:
537
+ if self.layer_idx is None:
538
+ raise ValueError(
539
+ f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
540
+ "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
541
+ "with a layer index."
542
+ )
543
+ cache_kwargs = {"sin": sin, "cos": cos}
544
+ key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
545
+
546
+ key_states = repeat_kv(key_states, self.num_key_value_groups)
547
+ value_states = repeat_kv(value_states, self.num_key_value_groups)
548
+
549
+ attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
550
+
551
+ kv_seq_len = key_states.shape[-2]
552
+ if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
553
+ raise ValueError(
554
+ f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is"
555
+ f" {attn_weights.size()}"
556
+ )
557
+
558
+ if attention_mask is not None:
559
+ if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
560
+ raise ValueError(
561
+ f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
562
+ )
563
+ attn_weights = attn_weights + attention_mask
564
+
565
+ # upcast attention to fp32
566
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
567
+ attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training)
568
+ attn_output = torch.matmul(attn_weights, value_states)
569
+
570
+ if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
571
+ raise ValueError(
572
+ f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
573
+ f" {attn_output.size()}"
574
+ )
575
+
576
+ attn_output = attn_output.transpose(1, 2).contiguous()
577
+
578
+ attn_output = attn_output.reshape(bsz, q_len, -1)
579
+
580
+ attn_output = self.dense(attn_output)
581
+
582
+ if not output_attentions:
583
+ attn_weights = None
584
+
585
+ return attn_output, attn_weights, past_key_value
586
+
587
+
588
+ # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2 with Llama->BailingMoeV2
589
+ class BailingMoeV2FlashAttention2(BailingMoeV2Attention):
590
+ """
591
+ BailingMoeV2 flash attention module. This module inherits from `BailingMoeV2Attention` as the weights of the module stays
592
+ untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
593
+ flash attention and deal with padding tokens in case the input contains any of them.
594
+ """
595
+
596
+ def __init__(self, *args, **kwargs):
597
+ super().__init__(*args, **kwargs)
598
+
599
+ # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
600
+ # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
601
+ # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
602
+ self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
603
+
604
+ def forward(
605
+ self,
606
+ hidden_states: torch.Tensor,
607
+ attention_mask: Optional[torch.LongTensor] = None,
608
+ position_ids: Optional[torch.LongTensor] = None,
609
+ past_key_value: Optional[Cache] = None,
610
+ output_attentions: bool = False,
611
+ use_cache: bool = False,
612
+ position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC
613
+ **kwargs,
614
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
615
+ # BailingMoeV2FlashAttention2 attention does not support output_attentions
616
+ output_attentions = False
617
+
618
+ bsz, q_len, _ = hidden_states.size()
619
+
620
+ # Flash attention requires the input to have the shape
621
+ # batch_size x seq_length x head_dim x hidden_dim
622
+ # therefore we just need to keep the original shape
623
+
624
+ qkv = self.query_key_value(hidden_states)
625
+ qkv = qkv.view(bsz, q_len, self.num_heads + 2 * self.num_key_value_heads, self.head_dim)
626
+
627
+ query_states, key_states, value_states = qkv.split(
628
+ [self.num_heads, self.num_key_value_heads, self.num_key_value_heads], dim=-2
629
+ )
630
+ query_states = query_states.transpose(1, 2)
631
+ key_states = key_states.transpose(1, 2)
632
+ value_states = value_states.transpose(1, 2)
633
+
634
+ if self.config.use_qk_norm:
635
+ query_states = self.query_layernorm(query_states)
636
+ key_states = self.key_layernorm(key_states)
637
+
638
+ cos, sin = position_embeddings
639
+ query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
640
+
641
+ if past_key_value is not None:
642
+ cache_kwargs = {"sin": sin, "cos": cos}
643
+ key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
644
+
645
+ # TODO: These transpose are quite inefficient but Flash Attention requires the layout [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache
646
+ # to be able to avoid many of these transpose/reshape/view.
647
+ query_states = query_states.transpose(1, 2)
648
+ key_states = key_states.transpose(1, 2)
649
+ value_states = value_states.transpose(1, 2)
650
+
651
+ dropout_rate = self.attention_dropout if self.training else 0.0
652
+
653
+ # In PEFT, usually we cast the layer norms in float32 for training stability reasons
654
+ # therefore the input hidden states gets silently cast in float32. Hence, we need
655
+ # cast them back in the correct dtype just to be sure everything works as expected.
656
+ # This might slow down training & inference so it is recommended to not cast the LayerNorms
657
+ # in fp32. (BailingMoeV2RMSNorm handles it correctly)
658
+
659
+ input_dtype = query_states.dtype
660
+ if input_dtype == torch.float32:
661
+ # Handle the case where the model is quantized
662
+ if hasattr(self.config, "_pre_quantization_dtype"):
663
+ target_dtype = self.config._pre_quantization_dtype
664
+ elif torch.is_autocast_enabled():
665
+ target_dtype = torch.get_autocast_gpu_dtype()
666
+ else:
667
+ target_dtype = self.query_key_value.weight.dtype
668
+
669
+ logger.warning_once(
670
+ f"The input hidden states seems to be silently casted in float32, this might be related to"
671
+ f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
672
+ f" {target_dtype}."
673
+ )
674
+
675
+ query_states = query_states.to(target_dtype)
676
+ key_states = key_states.to(target_dtype)
677
+ value_states = value_states.to(target_dtype)
678
+
679
+ attn_output = self._flash_attention_forward(
680
+ query_states, key_states, value_states, attention_mask, q_len, dropout=dropout_rate
681
+ )
682
+
683
+ attn_output = attn_output.reshape(bsz, q_len, -1).contiguous()
684
+ attn_output = self.dense(attn_output)
685
+
686
+ if not output_attentions:
687
+ attn_weights = None
688
+
689
+ return attn_output, attn_weights, past_key_value
690
+
691
+ def _flash_attention_forward(
692
+ self, query_states, key_states, value_states, attention_mask, query_length, dropout=0.0, softmax_scale=None
693
+ ):
694
+ """
695
+ Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token
696
+ first unpad the input, then computes the attention scores and pad the final attention scores.
697
+ Args:
698
+ query_states (`torch.Tensor`):
699
+ Input query states to be passed to Flash Attention API
700
+ key_states (`torch.Tensor`):
701
+ Input key states to be passed to Flash Attention API
702
+ value_states (`torch.Tensor`):
703
+ Input value states to be passed to Flash Attention API
704
+ attention_mask (`torch.Tensor`):
705
+ The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the
706
+ position of padding tokens and 1 for the position of non-padding tokens.
707
+ dropout (`float`, *optional*):
708
+ Attention dropout
709
+ softmax_scale (`float`, *optional*):
710
+ The scaling of QK^T before applying softmax. Defaults to 1 / sqrt(head_dim).
711
+ query_length (`int`):
712
+ The length of the query sequence in terms of tokens. This represents the number of tokens in the
713
+ `query_states` tensor along the sequence dimension. It is used to determine the effective sequence
714
+ length for attention computations.
715
+ """
716
+ if not self._flash_attn_uses_top_left_mask:
717
+ causal = self.is_causal
718
+ else:
719
+ # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in BailingMoeV2FlashAttention2 __init__.
720
+ causal = self.is_causal and query_length != 1
721
+
722
+ # Contains at least one padding token in the sequence
723
+ if attention_mask is not None:
724
+ batch_size = query_states.shape[0]
725
+ query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input(
726
+ query_states, key_states, value_states, attention_mask, query_length
727
+ )
728
+
729
+ cu_seqlens_q, cu_seqlens_k = cu_seq_lens
730
+ max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens
731
+
732
+ attn_output_unpad = flash_attn_varlen_func(
733
+ query_states,
734
+ key_states,
735
+ value_states,
736
+ cu_seqlens_q=cu_seqlens_q,
737
+ cu_seqlens_k=cu_seqlens_k,
738
+ max_seqlen_q=max_seqlen_in_batch_q,
739
+ max_seqlen_k=max_seqlen_in_batch_k,
740
+ dropout_p=dropout,
741
+ softmax_scale=softmax_scale,
742
+ causal=causal,
743
+ )
744
+
745
+ attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length)
746
+ else:
747
+ attn_output = flash_attn_func(
748
+ query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=causal
749
+ )
750
+
751
+ return attn_output
752
+
753
+ def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length):
754
+ indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask)
755
+ batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape
756
+
757
+ key_layer = index_first_axis(
758
+ key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
759
+ )
760
+ value_layer = index_first_axis(
761
+ value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
762
+ )
763
+ if query_length == kv_seq_len:
764
+ query_layer = index_first_axis(
765
+ query_layer.reshape(batch_size * kv_seq_len, self.num_heads, head_dim), indices_k
766
+ )
767
+ cu_seqlens_q = cu_seqlens_k
768
+ max_seqlen_in_batch_q = max_seqlen_in_batch_k
769
+ indices_q = indices_k
770
+ elif query_length == 1:
771
+ max_seqlen_in_batch_q = 1
772
+ cu_seqlens_q = torch.arange(
773
+ batch_size + 1, dtype=torch.int32, device=query_layer.device
774
+ ) # There is a memcpy here, that is very bad.
775
+ indices_q = cu_seqlens_q[:-1]
776
+ query_layer = query_layer.squeeze(1)
777
+ else:
778
+ # The -q_len: slice assumes left padding.
779
+ attention_mask = attention_mask[:, -query_length:]
780
+ query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask)
781
+
782
+ return (
783
+ query_layer,
784
+ key_layer,
785
+ value_layer,
786
+ indices_q,
787
+ (cu_seqlens_q, cu_seqlens_k),
788
+ (max_seqlen_in_batch_q, max_seqlen_in_batch_k),
789
+ )
790
+
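For readers unfamiliar with the varlen Flash Attention path above: `_upad_input` relies on `_get_unpad_data` (defined earlier in this file, following the upstream `transformers` convention) to turn a 0/1 padding mask into flattened token indices and cumulative sequence lengths. A minimal sketch of that bookkeeping, assuming the standard `transformers`-style helper semantics rather than quoting this file's definition:

```python
import torch
import torch.nn.functional as F

def get_unpad_data(attention_mask: torch.Tensor):
    # attention_mask: [batch_size, seq_len], 1 for real tokens, 0 for padding
    seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)               # tokens per sample
    indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()    # flat positions of real tokens
    max_seqlen_in_batch = int(seqlens_in_batch.max())
    # cu_seqlens: prefix sums with a leading 0, i.e. the varlen offsets flash_attn_varlen_func expects
    cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
    return indices, cu_seqlens, max_seqlen_in_batch

mask = torch.tensor([[1, 1, 1, 0], [1, 1, 0, 0]])
print(get_unpad_data(mask))
# indices = [0, 1, 2, 4, 5], cu_seqlens = [0, 3, 5], max_seqlen_in_batch = 3
```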
791
+
792
+ # Copied from transformers.models.llama.modeling_llama.LlamaSdpaAttention with Llama->BailingMoeV2
793
+ class BailingMoeV2SdpaAttention(BailingMoeV2Attention):
794
+ """
795
+ BailingMoeV2 attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
796
+ `BailingMoeV2Attention`, as the weights of the module stay untouched. The only changes are on the forward pass to adapt to
797
+ the SDPA API.
798
+ """
799
+
800
+ # Adapted from BailingMoeV2Attention.forward
801
+ def forward(
802
+ self,
803
+ hidden_states: torch.Tensor,
804
+ attention_mask: Optional[torch.Tensor] = None,
805
+ position_ids: Optional[torch.LongTensor] = None,
806
+ past_key_value: Optional[Cache] = None,
807
+ output_attentions: bool = False,
808
+ use_cache: bool = False,
809
+ position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC
810
+ **kwargs,
811
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
812
+ if output_attentions:
813
+ # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
814
+ logger.warning_once(
815
+ "BailingMoeV2Model is using BailingMoeV2SdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, "
816
+ 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
817
+ )
818
+ return super().forward(
819
+ hidden_states=hidden_states,
820
+ attention_mask=attention_mask,
821
+ position_ids=position_ids,
822
+ past_key_value=past_key_value,
823
+ output_attentions=output_attentions,
824
+ use_cache=use_cache,
825
+ )
826
+
827
+ bsz, q_len, _ = hidden_states.size()
828
+
829
+ qkv = self.query_key_value(hidden_states)
830
+ qkv = qkv.view(bsz, q_len, self.num_heads + 2 * self.num_key_value_heads, self.head_dim)
831
+
832
+ query_states, key_states, value_states = qkv.split(
833
+ [self.num_heads, self.num_key_value_heads, self.num_key_value_heads], dim=-2
834
+ )
835
+ query_states = query_states.transpose(1, 2)
836
+ key_states = key_states.transpose(1, 2)
837
+ value_states = value_states.transpose(1, 2)
838
+
839
+ if self.config.use_qk_norm:
840
+ query_states = self.query_layernorm(query_states)
841
+ key_states = self.key_layernorm(key_states)
842
+
843
+ cos, sin = position_embeddings
844
+ query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
845
+
846
+ if past_key_value is not None:
847
+ cache_kwargs = {"sin": sin, "cos": cos}
848
+ key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
849
+
850
+ key_states = repeat_kv(key_states, self.num_key_value_groups)
851
+ value_states = repeat_kv(value_states, self.num_key_value_groups)
852
+
853
+ if attention_mask is not None:
854
+ kv_seq_len = key_states.shape[-2]
855
+ if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
856
+ raise ValueError(
857
+ f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
858
+ )
859
+
860
+ # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask,
861
+ # Reference: https://github.com/pytorch/pytorch/issues/112577.
862
+ if query_states.device.type == "cuda" and attention_mask is not None:
863
+ query_states = query_states.contiguous()
864
+ key_states = key_states.contiguous()
865
+ value_states = value_states.contiguous()
866
+
867
+ attn_output = torch.nn.functional.scaled_dot_product_attention(
868
+ query_states,
869
+ key_states,
870
+ value_states,
871
+ attn_mask=attention_mask,
872
+ dropout_p=self.attention_dropout if self.training else 0.0,
873
+ # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1.
874
+ is_causal=self.is_causal and attention_mask is None and q_len > 1,
875
+ )
876
+
877
+ attn_output = attn_output.transpose(1, 2).contiguous()
878
+ attn_output = attn_output.reshape(bsz, q_len, -1)
879
+
880
+ attn_output = self.dense(attn_output)
881
+
882
+ return attn_output, None, past_key_value
883
+
884
+
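`repeat_kv` above follows the usual grouped-query-attention convention: each key/value head is duplicated `num_key_value_groups` times so the tensors match the query head count before SDPA. A minimal sketch of that semantics (an assumption based on the standard `transformers` helper for the head-first layout; this file's own `repeat_kv`, defined earlier, additionally accepts a `head_first` flag):

```python
import torch

def repeat_kv_reference(x: torch.Tensor, n_rep: int) -> torch.Tensor:
    # x: [batch, num_kv_heads, seq_len, head_dim] -> [batch, num_kv_heads * n_rep, seq_len, head_dim]
    if n_rep == 1:
        return x
    b, h_kv, t, d = x.shape
    return x[:, :, None, :, :].expand(b, h_kv, n_rep, t, d).reshape(b, h_kv * n_rep, t, d)

kv = torch.randn(1, 2, 5, 8)             # 2 KV heads
print(repeat_kv_reference(kv, 4).shape)  # torch.Size([1, 8, 5, 8]) -> matches 8 query heads
```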
885
+ ATTENTION_CLASSES = {
886
+ "eager": BailingMoeV2Attention,
887
+ "flash_attention_2": BailingMoeV2FlashAttention2,
888
+ "sdpa": BailingMoeV2SdpaAttention,
889
+ }
890
+
891
+
892
+ class BailingMoeV2LinearAttention(nn.Module):
893
+ """
894
+ BailingMoeV2LinearAttention implements a linear attention mechanism based on Lightning Attention-2
895
+ (https://arxiv.org/abs/2401.04658) with efficient computation using flash-linear-attention operators.
896
+
897
+ The implementation leverages optimized kernels from the flash-linear-attention library
898
+ (https://github.com/fla-org/flash-linear-attention) for maximum performance.
899
+ """
900
+ def __init__(self, config: BailingMoeLinearV2Config, layer_idx: Optional[int] = None):
901
+ super().__init__()
902
+ self.config = config
903
+ self.layer_idx = layer_idx
904
+ if layer_idx is None:
905
+ logger.warning_once(
906
+ "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` "
907
+ "to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` "
908
+ "when creating this class."
909
+ )
910
+ self.hidden_size = config.hidden_size
911
+ self.num_heads = config.num_attention_heads
912
+ self.head_dim = config.head_dim or self.hidden_size // self.num_heads
913
+ self.num_key_value_heads = config.num_attention_heads
914
+ self.num_key_value_groups = self.num_heads // self.num_key_value_heads
915
+ partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0
916
+ self.rope_dim = int(self.head_dim * partial_rotary_factor)
917
+
918
+ self.use_qk_norm = getattr(config, "use_qk_norm", False)
919
+ self.rms_norm_eps = getattr(config, "rms_norm_eps", 1e-5)
920
+ self.mode = 'chunk'
921
+
922
+ self.query_key_value = nn.Linear(
923
+ self.hidden_size,
924
+ (self.num_heads + 2 * self.num_key_value_heads) * self.head_dim,
925
+ bias=config.use_qkv_bias,
926
+ )
927
+
928
+ if self.config.use_qk_norm:
929
+ self.query_layernorm = BailingMoeV2RMSNorm(self.head_dim, eps=config.rms_norm_eps)
930
+ self.key_layernorm = BailingMoeV2RMSNorm(self.head_dim, eps=config.rms_norm_eps)
931
+
932
+ self.rotary_emb = BailingMoeV2RotaryEmbedding(config=config)
933
+
934
+ self.dense = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=config.use_bias)
935
+
936
+ self.g_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
937
+ self.g_norm = BailingMoeV2GroupRMSNorm(self.num_heads * self.head_dim, group_norm_size=config.group_norm_size, eps=self.rms_norm_eps)
938
+ slope = - BailingMoeV2LinearAttention.build_slope_tensor(self.num_heads) * (1 - (self.layer_idx - 1) / (self.config.num_hidden_layers - 1) + 1e-5)
939
+ self.register_buffer('slope', slope, persistent=False)
940
+
941
+ self.lightning_attn_ops = {
942
+ 'chunk': chunk_simple_gla,
943
+ 'fused_recurrent': fused_recurrent_simple_gla
944
+ }
945
+
946
+ @staticmethod
947
+ def build_slope_tensor(n_attention_heads: int):
948
+ """
949
+ Build a tensor of slopes for Lightning Attention-2 as described in the paper:
950
+ "Lightning Attention-2: A Free Lunch for Handling Unlimited Sequence Lengths in Large Language Models"
951
+ (https://arxiv.org/abs/2401.04658)
952
+
953
+ This function computes the slope values that control the decay rate of attention scores
954
+ based on the number of attention heads. The slopes are designed to have specific
955
+ mathematical properties that work optimally when the number of heads is a power of 2.
956
+
957
+ For non-power-of-2 head counts, a workaround is implemented to maintain similar properties.
958
+
959
+ Args:
960
+ n_attention_heads (int): Number of attention heads in the model
961
+
962
+ Returns:
963
+ torch.Tensor: A tensor of shape [n_attention_heads] containing the computed slopes
964
+
965
+ Note:
966
+ Code copied from: https://github.com/OpenNLPLab/lightning-attention/blob/d15c38529bbd5c2c82b44ddda3cac885825aa873/lightning_attn/utils/utils.py#L6
967
+ """
968
+ def get_slopes(n):
969
+ def get_slopes_power_of_2(n):
970
+ start = 2 ** (-(2 ** -(math.log2(n) - 3)))
971
+ ratio = start
972
+ return [start * ratio ** i for i in range(n)]
973
+
974
+ if math.log2(n).is_integer():
975
+ # In the paper, only models with 2^a heads are trained; the slopes have good properties that only occur when n is a power of 2.
976
+ return get_slopes_power_of_2(n)
977
+ else:
978
+ # To maintain those properties when the number of heads is not a power of 2, use the closest power of 2 as a workaround.
979
+ closest_power_of_2 = 2 ** math.floor(math.log2(n))
980
+ return (get_slopes_power_of_2(closest_power_of_2)
981
+ + get_slopes(2 * closest_power_of_2)[0::2][:n - closest_power_of_2])
982
+
983
+ slopes = torch.tensor(get_slopes(n_attention_heads), dtype=torch.float)
984
+ return slopes
985
+
986
+
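As a quick sanity check of the decay slopes above (a minimal sketch, not part of the repository): for a power-of-2 head count the recursion reduces to powers of 1/2, so `build_slope_tensor(8)` should equal 2^-1 … 2^-8. In `__init__` each slope is then negated and scaled by layer depth to form the per-head log-decay `g` passed to the kernels.

```python
import math
import torch

def slopes_power_of_2(n_heads: int) -> torch.Tensor:
    # Closed form of build_slope_tensor for power-of-2 head counts
    start = 2 ** (-(2 ** -(math.log2(n_heads) - 3)))
    return torch.tensor([start ** (i + 1) for i in range(n_heads)])

print(slopes_power_of_2(8))
# tensor([0.5000, 0.2500, 0.1250, 0.0625, 0.0312, 0.0156, 0.0078, 0.0039])
```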
987
+ def forward(
988
+ self,
989
+ hidden_states: torch.Tensor,
990
+ attention_mask: Optional[torch.Tensor] = None,
991
+ position_ids: Optional[torch.LongTensor] = None,
992
+ past_key_value: Optional[Cache] = None,
993
+ output_attentions: bool = False,
994
+ use_cache: bool = False,
995
+ position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
996
+ **kwargs,
997
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
998
+ if attention_mask is not None:
999
+ assert len(attention_mask.shape) == 2, (
1000
+ "Expected attention_mask as a 0-1 matrix with shape [batch_size, seq_len] "
1001
+ "for padding purposes (0 indicating padding). "
1002
+ "Arbitrary attention masks of shape [batch_size, seq_len, seq_len] are not allowed."
1003
+ )
1004
+
1005
+ # launching the triton kernel for just one token will actually be slower
1006
+ mode = 'fused_recurrent' if hidden_states.shape[1] <= 64 else self.mode
1007
+
1008
+ # Currently output_attentions can only be False, returning attention weights is not supported
1009
+ assert not output_attentions, "output_attentions can only be False, returning attention weights is not supported"
1010
+
1011
+ bsz, q_len, _ = hidden_states.size()
1012
+ device = hidden_states.device
1013
+
1014
+ qkv = self.query_key_value(hidden_states)
1015
+ qkv = qkv.view(bsz, q_len, self.num_heads + 2 * self.num_key_value_heads, self.head_dim)
1016
+ query_states, key_states, value_states = qkv.split(
1017
+ [self.num_heads, self.num_key_value_heads, self.num_key_value_heads], dim=-2
1018
+ )
1019
+ if self.config.use_qk_norm:
1020
+ query_states = self.query_layernorm(query_states)
1021
+ key_states = self.key_layernorm(key_states)
1022
+
1023
+ cos, sin = position_embeddings
1024
+ query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, unsqueeze_dim=2)
1025
+
1026
+ if self.num_key_value_groups > 1:
1027
+ # [bsz, q_len, n_kv_heads, head_dim] -> [bsz, q_len, n_heads, head_dim]
1028
+ key_states = repeat_kv(key_states, self.num_key_value_groups, head_first=False)
1029
+ value_states = repeat_kv(value_states, self.num_key_value_groups, head_first=False)
1030
+
1031
+ recurrent_state = None
1032
+ if past_key_value is not None and isinstance(past_key_value, Cache):
1033
+ # ensure the cache list is long enough
1034
+ while len(past_key_value.layers) <= self.layer_idx:
1035
+ past_key_value.layers.append(DynamicLayer())
1036
+
1037
+ if past_key_value.layers[self.layer_idx].keys is not None:
1038
+ recurrent_state = past_key_value.layers[self.layer_idx].keys
1039
+ # ensure recurrent_state is on the same device as hidden_states
1040
+ if recurrent_state.device != hidden_states.device:
1041
+ recurrent_state = recurrent_state.to(device).contiguous()
1042
+
1043
+ if recurrent_state is None:
1044
+ # dealing with left-padding
1045
+ if attention_mask is not None and use_cache:
1046
+ value_states = value_states.mul_(attention_mask[:, -q_len:, None, None])
1047
+
1048
+ o, recurrent_state = self.lightning_attn_ops[mode](
1049
+ q=query_states,
1050
+ k=key_states,
1051
+ v=value_states,
1052
+ g=self.slope[None, None, :].expand(bsz, q_len, self.num_heads),
1053
+ initial_state=recurrent_state,
1054
+ output_final_state=use_cache,
1055
+ )
1056
+
1057
+ o = o.reshape(bsz, q_len, -1)
1058
+ o = self.g_norm(o)
1059
+ g_proj = self.g_proj(hidden_states)
1060
+ o = o * torch.sigmoid_(g_proj)
1061
+ o = self.dense(o)
1062
+
1063
+ if use_cache and past_key_value is not None and isinstance(past_key_value, Cache):
1064
+ target_device = None
1065
+ for cache in past_key_value.layers:
1066
+ if cache.keys is not None:
1067
+ target_device = cache.keys.device
1068
+ break
1069
+ if target_device is None:
1070
+ target_device = recurrent_state.device
1071
+
1072
+ # move to target device
1073
+ if recurrent_state.device != target_device:
1074
+ recurrent_state = recurrent_state.to(target_device)
1075
+
1076
+ past_key_value.layers[self.layer_idx].keys = recurrent_state
1077
+
1078
+ return o, None, past_key_value
1079
+
1080
+
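The `chunk_simple_gla` and `fused_recurrent_simple_gla` kernels imported from flash-linear-attention compute much faster equivalents of the simple gated recurrence sketched below. This is only an illustrative reference, assuming the `[batch, seq, heads, dim]` layout used in the forward above and ignoring the kernels' internal scaling conventions:

```python
import torch

def simple_gla_reference(q, k, v, g, initial_state=None):
    # q, k, v: [B, T, H, D]; g: [B, T, H] holds the per-head log-decay (here: the negative slopes)
    B, T, H, D = q.shape
    Dv = v.shape[-1]
    S = initial_state if initial_state is not None else q.new_zeros(B, H, D, Dv)
    outputs = []
    for t in range(T):
        decay = g[:, t].exp()[..., None, None]                          # [B, H, 1, 1]
        S = decay * S + k[:, t].unsqueeze(-1) * v[:, t].unsqueeze(-2)   # rank-1 state update
        outputs.append(torch.einsum("bhd,bhdv->bhv", q[:, t], S))       # read-out with the query
    return torch.stack(outputs, dim=1), S                               # [B, T, H, Dv], final state

q = k = torch.randn(1, 6, 4, 16)
v = torch.randn(1, 6, 4, 16)
g = -0.05 * torch.ones(1, 6, 4)         # constant log-decay per head, just for the demo
out, state = simple_gla_reference(q, k, v, g)
print(out.shape, state.shape)           # torch.Size([1, 6, 4, 16]) torch.Size([1, 4, 16, 16])
```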
1081
+ class BailingMoeV2MTPLayer(nn.Module):
1082
+ def __init__(self, config: BailingMoeLinearV2Config, layer_idx: int):
1083
+ super().__init__()
1084
+ self.layer_idx = layer_idx
1085
+ self.input_layernorm = BailingMoeV2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
1086
+ self.enorm = BailingMoeV2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
1087
+
1088
+ self.eh_proj = nn.Linear(config.hidden_size * 2, config.hidden_size, bias=False)
1089
+ self.post_attention_layernorm = BailingMoeV2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
1090
+ self.attention = ATTENTION_CLASSES[config._attn_implementation](config=config, layer_idx=layer_idx)
1091
+ self.mlp = BailingMoeV2SparseMoeBlock(config)
1092
+
1093
+ self.hnorm = BailingMoeV2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
1094
+ self.final_layernorm = BailingMoeV2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
1095
+
1096
+ def forward(
1097
+ self,
1098
+ input_embeds,
1099
+ hidden_states: torch.Tensor,
1100
+ attention_mask: Optional[torch.Tensor] = None,
1101
+ position_ids: Optional[torch.LongTensor] = None,
1102
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
1103
+ output_attentions: Optional[bool] = False,
1104
+ output_router_logits: Optional[bool] = False,
1105
+ use_cache: Optional[bool] = False,
1106
+ position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC
1107
+ **kwargs,
1108
+ ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
1109
+ input_embeds = self.enorm(input_embeds)
1110
+ hidden_states = self.hnorm(hidden_states)
1111
+ hidden_states = self.eh_proj(torch.cat([input_embeds, hidden_states], dim=-1))
1112
+ residual = hidden_states
1113
+
1114
+ hidden_states = self.input_layernorm(hidden_states)
1115
+
1116
+ # Self Attention
1117
+ hidden_states, self_attn_weights, present_key_value = self.attention(
1118
+ hidden_states=hidden_states,
1119
+ attention_mask=attention_mask,
1120
+ position_ids=position_ids,
1121
+ past_key_value=past_key_value,
1122
+ output_attentions=output_attentions,
1123
+ position_embeddings=position_embeddings,
1124
+ use_cache=use_cache,
1125
+ )
1126
+ hidden_states = residual + hidden_states
1127
+
1128
+ # Fully Connected
1129
+ residual = hidden_states
1130
+ hidden_states = self.post_attention_layernorm(hidden_states)
1131
+ hidden_states = self.mlp(hidden_states)
1132
+ if isinstance(hidden_states, tuple):
1133
+ hidden_states, router_logits = hidden_states
1134
+ else:
1135
+ router_logits = None
1136
+ hidden_states = residual + hidden_states.to(residual.device)
1137
+ hidden_states = self.final_layernorm(hidden_states)
1138
+
1139
+ outputs = (hidden_states,)
1140
+
1141
+ if output_attentions:
1142
+ outputs += (self_attn_weights,)
1143
+
1144
+ if use_cache:
1145
+ outputs += (present_key_value,)
1146
+
1147
+ if output_router_logits:
1148
+ outputs += (router_logits,)
1149
+
1150
+ return outputs
1151
+
1152
+
1153
+ class BailingMoeLinearV2DecoderLayer(nn.Module):
1154
+ def __init__(self, config: BailingMoeLinearV2Config, layer_idx: int):
1155
+ super().__init__()
1156
+ self.hidden_size = config.hidden_size
1157
+ self.attention_layer_type = "attention" if (layer_idx + 1) % config.layer_group_size == 0 or \
1158
+ layer_idx >= config.num_hidden_layers // config.layer_group_size * config.layer_group_size else "linear_attention"
1159
+
1160
+ if self.attention_layer_type == "attention":
1161
+ self.attention = ATTENTION_CLASSES[config._attn_implementation](config=config, layer_idx=layer_idx)
1162
+ else:
1163
+ self.attention = BailingMoeV2LinearAttention(
1164
+ config=config,
1165
+ layer_idx=layer_idx
1166
+ )
1167
+
1168
+ self.mlp = (
1169
+ BailingMoeV2SparseMoeBlock(config)
1170
+ if (config.num_experts is not None and layer_idx >= config.first_k_dense_replace)
1171
+ else BailingMoeV2MLP(config=config, intermediate_size=config.intermediate_size)
1172
+ )
1173
+ self.input_layernorm = BailingMoeV2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
1174
+ self.post_attention_layernorm = BailingMoeV2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
1175
+
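To make the hybrid schedule encoded in `attention_layer_type` concrete, here is a small standalone sketch (the numbers are illustrative, not this checkpoint's actual configuration): every `layer_group_size`-th layer, plus any trailing layers that do not fill a complete group, gets full softmax attention, and all other layers use the linear attention module.

```python
def layer_type(layer_idx: int, num_hidden_layers: int, layer_group_size: int) -> str:
    full_groups_end = num_hidden_layers // layer_group_size * layer_group_size
    if (layer_idx + 1) % layer_group_size == 0 or layer_idx >= full_groups_end:
        return "attention"
    return "linear_attention"

# e.g. 10 layers with layer_group_size=4 -> softmax attention at layers 3, 7, 8, 9
print([i for i in range(10) if layer_type(i, 10, 4) == "attention"])  # [3, 7, 8, 9]
```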
1176
+ def forward(
1177
+ self,
1178
+ hidden_states: torch.Tensor,
1179
+ attention_mask: Optional[torch.Tensor] = None,
1180
+ position_ids: Optional[torch.LongTensor] = None,
1181
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
1182
+ output_attentions: Optional[bool] = False,
1183
+ output_router_logits: Optional[bool] = False,
1184
+ use_cache: Optional[bool] = False,
1185
+ position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC
1186
+ **kwargs,
1187
+ ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
1188
+ """
1189
+ Args:
1190
+ hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
1191
+ attention_mask (`torch.FloatTensor`, *optional*):
1192
+ attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
1193
+ query_sequence_length, key_sequence_length)` if default attention is used.
1194
+ position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
1195
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
1196
+ config.n_positions - 1]`.
1197
+ past_key_value (`Tuple(torch.FloatTensor)`, *optional*):
1198
+ cached past key and value projection states
1199
+ output_attentions (`bool`, *optional*):
1200
+ Whether to return the attentions tensors of all attention layers. See `attentions` under
1201
+ returned tensors for more detail.
1202
+ output_router_logits (`bool`, *optional*):
1203
+ Whether or not to return the logits of all the routers. They are useful for computing the router loss,
1204
+ and should not be returned during inference.
1205
+ use_cache (`bool`, *optional*):
1206
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
1207
+ (see `past_key_values`).
1208
+ """
1209
+ residual = hidden_states
1210
+
1211
+ hidden_states = self.input_layernorm(hidden_states)
1212
+
1213
+ # Self Attention
1214
+ if self.attention_layer_type == "attention":
1215
+ hidden_states, self_attn_weights, present_key_value = self.attention(
1216
+ hidden_states=hidden_states,
1217
+ attention_mask=attention_mask,
1218
+ position_ids=position_ids,
1219
+ past_key_value=past_key_value,
1220
+ output_attentions=output_attentions,
1221
+ position_embeddings=position_embeddings,
1222
+ use_cache=use_cache,
1223
+ )
1224
+ else:
1225
+ batch_size, seq_len = hidden_states.shape[0], hidden_states.shape[1]
1226
+ device = hidden_states.device
1227
+
1228
+ if attention_mask is None:
1229
+ # if attention_mask is None, create a full mask
1230
+ attention_mask = torch.ones((batch_size, seq_len), dtype=torch.int32, device=device)
1231
+ elif attention_mask.dim() == 4 and attention_mask.shape[1] == 1:
1232
+ attention_mask = attention_mask[:, 0, -1, :].to(torch.int32)
1233
+ attention_mask = (attention_mask > -1e4).to(torch.int32)
1234
+ elif attention_mask.dim() == 2:
1235
+ attention_mask = attention_mask.to(torch.int32)
1236
+ else:
1237
+ raise ValueError(f"Unsupported mask dimension: {attention_mask.shape}")
1238
+
1239
+ hidden_states, self_attn_weights, present_key_value = self.attention(
1240
+ hidden_states=hidden_states,
1241
+ attention_mask=attention_mask,
1242
+ past_key_value=past_key_value,
1243
+ position_ids=position_ids,
1244
+ use_cache=use_cache,
1245
+ output_attentions=output_attentions,
1246
+ position_embeddings=position_embeddings,
1247
+ )
1248
+
1249
+ hidden_states = residual + hidden_states
1250
+
1251
+ # Fully Connected
1252
+ residual = hidden_states
1253
+ hidden_states = self.post_attention_layernorm(hidden_states)
1254
+ hidden_states = self.mlp(hidden_states)
1255
+ if isinstance(hidden_states, tuple):
1256
+ hidden_states, router_logits = hidden_states
1257
+ else:
1258
+ router_logits = None
1259
+ hidden_states = residual + hidden_states.to(residual.device)
1260
+
1261
+ outputs = (hidden_states,)
1262
+
1263
+ if output_attentions:
1264
+ outputs += (self_attn_weights,)
1265
+
1266
+ if use_cache:
1267
+ outputs += (present_key_value,)
1268
+
1269
+ if output_router_logits:
1270
+ outputs += (router_logits,)
1271
+
1272
+ return outputs
1273
+
1274
+
1275
+ BAILINGMOEV2_START_DOCSTRING = r"""
1276
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
1277
+ library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
1278
+ etc.)
1279
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
1280
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage
1281
+ and behavior.
1282
+ Parameters:
1283
+ config ([`BailingMoeLinearV2Config`]):
1284
+ Model configuration class with all the parameters of the model. Initializing with a config file does not
1285
+ load the weights associated with the model, only the configuration. Check out the
1286
+ [`~PreTrainedModel.from_pretrained`] method to load the model weights.
1287
+ """
1288
+
1289
+
1290
+ @add_start_docstrings(
1291
+ "The bare BailingMoeV2 Model outputting raw hidden-states without any specific head on top.",
1292
+ BAILINGMOEV2_START_DOCSTRING,
1293
+ )
1294
+ class BailingMoeV2PreTrainedModel(PreTrainedModel):
1295
+ config_class = BailingMoeLinearV2Config
1296
+ base_model_prefix = "model"
1297
+ supports_gradient_checkpointing = True
1298
+ _no_split_modules = ["BailingMoeLinearV2DecoderLayer"]
1299
+ _skip_keys_device_placement = "past_key_values"
1300
+ _supports_flash_attn_2 = True
1301
+ _supports_sdpa = True
1302
+ _supports_cache_class = True
1303
+
1304
+ def _init_weights(self, module):
1305
+ std = self.config.initializer_range
1306
+ if isinstance(module, nn.Linear):
1307
+ module.weight.data.normal_(mean=0.0, std=std)
1308
+ if module.bias is not None:
1309
+ module.bias.data.zero_()
1310
+ elif isinstance(module, nn.Embedding):
1311
+ module.weight.data.normal_(mean=0.0, std=std)
1312
+ if module.padding_idx is not None:
1313
+ module.weight.data[module.padding_idx].zero_()
1314
+
1315
+
1316
+ BAILINGMOEV2_INPUTS_DOCSTRING = r"""
1317
+ Args:
1318
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
1319
+ Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
1320
+ it.
1321
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
1322
+ [`PreTrainedTokenizer.__call__`] for details.
1323
+ [What are input IDs?](../glossary#input-ids)
1324
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
1325
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
1326
+ - 1 for tokens that are **not masked**,
1327
+ - 0 for tokens that are **masked**.
1328
+ [What are attention masks?](../glossary#attention-mask)
1329
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
1330
+ [`PreTrainedTokenizer.__call__`] for details.
1331
+ If `past_key_values` is used, optionally only the last `input_ids` have to be input (see
1332
+ `past_key_values`).
1333
+ If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
1334
+ and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
1335
+ information on the default strategy.
1336
+ - 1 indicates the head is **not masked**,
1337
+ - 0 indicates the head is **masked**.
1338
+ position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
1339
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
1340
+ config.n_positions - 1]`.
1341
+ [What are position IDs?](../glossary#position-ids)
1342
+ past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*):
1343
+ Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
1344
+ blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values`
1345
+ returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
1346
+ Two formats are allowed:
1347
+ - a [`~cache_utils.Cache`] instance;
1348
+ - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
1349
+ shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy
1350
+ cache format.
1351
+ The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the
1352
+ legacy cache format will be returned.
1353
+ If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
1354
+ have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
1355
+ of shape `(batch_size, sequence_length)`.
1356
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
1357
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
1358
+ is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
1359
+ model's internal embedding lookup matrix.
1360
+ use_cache (`bool`, *optional*):
1361
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
1362
+ `past_key_values`).
1363
+ output_attentions (`bool`, *optional*):
1364
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
1365
+ tensors for more detail.
1366
+ output_hidden_states (`bool`, *optional*):
1367
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
1368
+ more detail.
1369
+ return_dict (`bool`, *optional*):
1370
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
1371
+ """
1372
+
1373
+
1374
+ @add_start_docstrings(
1375
+ "The bare BailingMoeV2 Model outputting raw hidden-states without any specific head on top.",
1376
+ BAILINGMOEV2_START_DOCSTRING,
1377
+ )
1378
+ class BailingMoeLinearV2Model(BailingMoeV2PreTrainedModel):
1379
+ """
1380
+ Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`BailingMoeLinearV2DecoderLayer`]
1381
+ Args:
1382
+ config: BailingMoeLinearV2Config
1383
+ """
1384
+
1385
+ def __init__(self, config: BailingMoeLinearV2Config):
1386
+ super().__init__(config)
1387
+ self.padding_idx = config.pad_token_id
1388
+ self.vocab_size = config.vocab_size
1389
+ self.num_nextn_predict_layers = config.num_nextn_predict_layers
1390
+
1391
+ self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
1392
+ self.layers = []
1393
+ for layer_idx in range(config.num_hidden_layers + config.num_nextn_predict_layers):
1394
+ layer_cls = BailingMoeLinearV2DecoderLayer if layer_idx < config.num_hidden_layers else BailingMoeV2MTPLayer
1395
+ self.layers.append(layer_cls(config, layer_idx))
1396
+
1397
+ self.layers = nn.ModuleList(self.layers)
1398
+
1399
+ self._use_sdpa = config._attn_implementation == "sdpa"
1400
+ self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
1401
+ self.norm = BailingMoeV2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
1402
+ self.rotary_emb = BailingMoeV2RotaryEmbedding(config=config)
1403
+ self.gradient_checkpointing = False
1404
+ # Initialize weights and apply final processing
1405
+ self.post_init()
1406
+
1407
+ def get_input_embeddings(self):
1408
+ return self.word_embeddings
1409
+
1410
+ def set_input_embeddings(self, value):
1411
+ self.word_embeddings = value
1412
+
1413
+ @add_start_docstrings_to_model_forward(BAILINGMOEV2_INPUTS_DOCSTRING)
1414
+ def forward(
1415
+ self,
1416
+ input_ids: torch.LongTensor = None,
1417
+ attention_mask: Optional[torch.Tensor] = None,
1418
+ position_ids: Optional[torch.LongTensor] = None,
1419
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
1420
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1421
+ use_cache: Optional[bool] = None,
1422
+ output_attentions: Optional[bool] = None,
1423
+ output_hidden_states: Optional[bool] = None,
1424
+ output_router_logits: Optional[bool] = None,
1425
+ return_dict: Optional[bool] = None,
1426
+ **kwargs,
1427
+ ) -> Union[Tuple, MoeV2ModelOutputWithPast]:
1428
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
1429
+ output_hidden_states = (
1430
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
1431
+ )
1432
+ output_router_logits = (
1433
+ output_router_logits if output_router_logits is not None else self.config.output_router_logits
1434
+ )
1435
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
1436
+
1437
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1438
+
1439
+ # retrieve input_ids and inputs_embeds
1440
+ if input_ids is not None and inputs_embeds is not None:
1441
+ raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
1442
+ elif input_ids is not None:
1443
+ batch_size, seq_length = input_ids.shape[:2]
1444
+ elif inputs_embeds is not None:
1445
+ batch_size, seq_length = inputs_embeds.shape[:2]
1446
+ else:
1447
+ raise ValueError("You have to specify either input_ids or inputs_embeds")
1448
+
1449
+ if self.gradient_checkpointing and self.training:
1450
+ if use_cache:
1451
+ logger.warning_once(
1452
+ "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
1453
+ )
1454
+ use_cache = False
1455
+
1456
+ if use_cache and past_key_values is None:
1457
+ past_key_values = DynamicCache()
1458
+
1459
+ if inputs_embeds is None:
1460
+ inputs_embeds = self.word_embeddings(input_ids)
1461
+
1462
+ softmax_attention_layer_id = self.config.layer_group_size - 1
1463
+ past_seen_tokens = past_key_values.get_seq_length(layer_idx=softmax_attention_layer_id) if past_key_values is not None else 0
1464
+
1465
+ if position_ids is None:
1466
+ position_ids = torch.arange(
1467
+ past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
1468
+ )
1469
+ position_ids = position_ids.unsqueeze(0)
1470
+
1471
+ if self._use_flash_attention_2:
1472
+ # 2d mask is passed through the layers
1473
+ attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None
1474
+ elif self._use_sdpa and not output_attentions:
1475
+ # output_attentions=True can not be supported when using SDPA, and we fall back on
1476
+ # the manual implementation that requires a 4D causal mask in all cases.
1477
+ attention_mask = _prepare_4d_causal_attention_mask_for_sdpa(
1478
+ attention_mask,
1479
+ (batch_size, seq_length),
1480
+ inputs_embeds,
1481
+ past_seen_tokens,
1482
+ )
1483
+ else:
1484
+ # 4d mask is passed through the layers
1485
+ attention_mask = _prepare_4d_causal_attention_mask(
1486
+ attention_mask, (batch_size, seq_length), inputs_embeds, past_seen_tokens
1487
+ )
1488
+
1489
+ # embed positions
1490
+ hidden_states = inputs_embeds
1491
+
1492
+ # create position embeddings to be shared across the decoder layers
1493
+ position_embeddings = self.rotary_emb(hidden_states, position_ids)
1494
+
1495
+ # decoder layers
1496
+ all_hidden_states = () if output_hidden_states else None
1497
+ all_self_attns = () if output_attentions else None
1498
+ all_router_logits = () if output_router_logits else None
1499
+ next_decoder_cache = None
1500
+ layers = self.layers[: -self.num_nextn_predict_layers] if self.num_nextn_predict_layers > 0 else self.layers
1501
+ mtp_layers = self.layers[-self.num_nextn_predict_layers :] if self.num_nextn_predict_layers > 0 else None
1502
+
1503
+ for decoder_layer in layers:
1504
+ if output_hidden_states:
1505
+ all_hidden_states += (hidden_states,)
1506
+
1507
+ if self.gradient_checkpointing and self.training:
1508
+ layer_outputs = self._gradient_checkpointing_func(
1509
+ decoder_layer.__call__,
1510
+ hidden_states,
1511
+ attention_mask,
1512
+ position_ids,
1513
+ past_key_values,
1514
+ output_attentions,
1515
+ output_router_logits,
1516
+ use_cache,
1517
+ position_embeddings,
1518
+ )
1519
+ else:
1520
+ layer_outputs = decoder_layer(
1521
+ hidden_states,
1522
+ attention_mask=attention_mask,
1523
+ position_ids=position_ids,
1524
+ past_key_value=past_key_values,
1525
+ output_attentions=output_attentions,
1526
+ output_router_logits=output_router_logits,
1527
+ use_cache=use_cache,
1528
+ position_embeddings=position_embeddings,
1529
+ )
1530
+ hidden_states = layer_outputs[0]
1531
+
1532
+ if use_cache:
1533
+ next_decoder_cache = layer_outputs[2 if output_attentions else 1]
1534
+
1535
+ if output_attentions:
1536
+ all_self_attns += (layer_outputs[1],)
1537
+
1538
+ if output_router_logits and layer_outputs[-1] is not None:
1539
+ all_router_logits += (layer_outputs[-1],)
1540
+
1541
+ hidden_states = self.norm(hidden_states)
1542
+ main_hidden_states = hidden_states
1543
+
1544
+ # add hidden states from the last decoder layer
1545
+ if output_hidden_states:
1546
+ all_hidden_states += (main_hidden_states,)
1547
+
1548
+ mtp_hidden_states = None
1549
+
1550
+ if mtp_layers:
1551
+ for decoder_layer in mtp_layers:
1552
+ input_ids, _ = roll_tensor(input_ids, shifts=-1, dims=-1)
1553
+ inputs_embeds = self.word_embeddings(input_ids)
1554
+
1555
+ if self.gradient_checkpointing and self.training:
1556
+ layer_outputs = self._gradient_checkpointing_func(
1557
+ decoder_layer.__call__,
1558
+ inputs_embeds,
1559
+ hidden_states,
1560
+ attention_mask,
1561
+ position_ids,
1562
+ past_key_values,
1563
+ output_attentions,
1564
+ output_router_logits,
1565
+ use_cache,
1566
+ position_embeddings,
1567
+ )
1568
+ else:
1569
+ layer_outputs = decoder_layer(
1570
+ inputs_embeds,
1571
+ hidden_states,
1572
+ attention_mask=attention_mask,
1573
+ position_ids=position_ids,
1574
+ past_key_value=past_key_values,
1575
+ output_attentions=output_attentions,
1576
+ output_router_logits=output_router_logits,
1577
+ use_cache=use_cache,
1578
+ position_embeddings=position_embeddings,
1579
+ )
1580
+ if mtp_hidden_states is None:
1581
+ mtp_hidden_states = []
1582
+ hidden_states = layer_outputs[0]
1583
+ mtp_hidden_states.append(hidden_states)
1584
+
1585
+ if output_hidden_states:
1586
+ all_hidden_states += (hidden_states,)
1587
+
1588
+ if use_cache:
1589
+ next_decoder_cache = layer_outputs[2 if output_attentions else 1]
1590
+
1591
+ if output_attentions:
1592
+ all_self_attns += (layer_outputs[1],)
1593
+
1594
+ if output_router_logits and layer_outputs[-1] is not None:
1595
+ all_router_logits += (layer_outputs[-1],)
1596
+
1597
+ next_cache = None
1598
+ if use_cache:
1599
+ next_cache = next_decoder_cache
1600
+ if not return_dict:
1601
+ return tuple(
1602
+ v
1603
+ for v in [main_hidden_states, next_cache, all_hidden_states, all_self_attns, all_router_logits]
1604
+ if v is not None
1605
+ )
1606
+ return MoeV2ModelOutputWithPast(
1607
+ last_hidden_state=main_hidden_states,
1608
+ past_key_values=next_cache,
1609
+ hidden_states=all_hidden_states,
1610
+ mtp_hidden_states=mtp_hidden_states,
1611
+ attentions=all_self_attns,
1612
+ router_logits=all_router_logits,
1613
+ )
1614
+
1615
+
1616
+ class BailingMoeLinearV2ForCausalLM(BailingMoeV2PreTrainedModel, GenerationMixin):
1617
+ _tied_weights_keys = ["lm_head.weight"]
1618
+
1619
+ def __init__(self, config: BailingMoeLinearV2Config):
1620
+ super().__init__(config)
1621
+ self.model = BailingMoeLinearV2Model(config)
1622
+ self.vocab_size = config.vocab_size
1623
+ self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
1624
+ self.num_nextn_predict_layers = config.num_nextn_predict_layers
1625
+ self.mtp_loss_scaling_factor = config.mtp_loss_scaling_factor
1626
+
1627
+ # Initialize weights and apply final processing
1628
+ self.post_init()
1629
+
1630
+ def get_input_embeddings(self):
1631
+ return self.model.word_embeddings
1632
+
1633
+ def set_input_embeddings(self, value):
1634
+ self.model.word_embeddings = value
1635
+
1636
+ def get_output_embeddings(self):
1637
+ return self.lm_head
1638
+
1639
+ def set_output_embeddings(self, new_embeddings):
1640
+ self.lm_head = new_embeddings
1641
+
1642
+ def set_decoder(self, decoder):
1643
+ self.model = decoder
1644
+
1645
+ def get_decoder(self):
1646
+ return self.model
1647
+
1648
+ @add_start_docstrings_to_model_forward(BAILINGMOEV2_INPUTS_DOCSTRING)
1649
+ @replace_return_docstrings(output_type=MoEV2CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
1650
+ def forward(
1651
+ self,
1652
+ input_ids: torch.LongTensor = None,
1653
+ attention_mask: Optional[torch.Tensor] = None,
1654
+ position_ids: Optional[torch.LongTensor] = None,
1655
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
1656
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1657
+ labels: Optional[torch.LongTensor] = None,
1658
+ use_cache: Optional[bool] = None,
1659
+ output_attentions: Optional[bool] = None,
1660
+ output_hidden_states: Optional[bool] = None,
1661
+ output_router_logits: Optional[bool] = None,
1662
+ return_dict: Optional[bool] = None,
1663
+ **kwargs,
1664
+ ) -> Union[Tuple, MoEV2CausalLMOutputWithPast]:
1665
+ r"""
1666
+ Args:
1667
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
1668
+ Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
1669
+ config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
1670
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
1671
+ Returns:
1672
+ Example:
1673
+ ```python
1674
+ >>> from transformers import AutoTokenizer
1675
+ >>> model = BailingMoeLinearV2ForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
1676
+ >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)
1677
+ >>> prompt = "Hey, are you conscious? Can you talk to me?"
1678
+ >>> inputs = tokenizer(prompt, return_tensors="pt")
1679
+ >>> # Generate
1680
+ >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
1681
+ >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
1682
+ "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
1683
+ ```"""
1684
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
1685
+ output_hidden_states = (
1686
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
1687
+ )
1688
+ output_router_logits = (
1689
+ output_router_logits if output_router_logits is not None else self.config.output_router_logits
1690
+ )
1691
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1692
+ # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
1693
+ outputs = self.model(
1694
+ input_ids=input_ids,
1695
+ attention_mask=attention_mask,
1696
+ position_ids=position_ids,
1697
+ past_key_values=past_key_values,
1698
+ inputs_embeds=inputs_embeds,
1699
+ use_cache=use_cache,
1700
+ output_attentions=output_attentions,
1701
+ output_hidden_states=output_hidden_states,
1702
+ output_router_logits=output_router_logits,
1703
+ return_dict=return_dict,
1704
+ **kwargs,
1705
+ )
1706
+
1707
+ loss = None
1708
+ all_mtp_loss = None
1709
+ aux_loss = None
1710
+ hidden_states = outputs[0]
1711
+ logits = self.lm_head(hidden_states)
1712
+ logits = logits.float()
1713
+
1714
+ if labels is not None:
1715
+ loss = self.loss_function(logits, labels, self.config.vocab_size, **kwargs)
1716
+
1717
+ all_mtp_logits = None
1718
+ if self.num_nextn_predict_layers > 0:
1719
+ mtp_hidden_states = outputs.mtp_hidden_states
1720
+ shift_labels_mtp = None
1721
+ for i in range(self.num_nextn_predict_layers):
1722
+ mtp_hidden_state = mtp_hidden_states[i]  # use a separate name so the list is not clobbered when num_nextn_predict_layers > 1
1723
+ mtp_logits = self.lm_head(mtp_hidden_state).float()
1724
+ if all_mtp_logits is None:
1725
+ all_mtp_logits = []
1726
+ all_mtp_logits.append(mtp_logits)
1727
+ if labels is not None:
1728
+ if shift_labels_mtp is None:
1729
+ shift_labels_mtp = labels.clone()
1730
+ shift_labels_mtp, _ = roll_tensor(shift_labels_mtp, shifts=-1, dims=-1, fill_value=-100)
1731
+ mtp_logits_ = mtp_logits.view(-1, self.config.vocab_size)
1732
+ mtp_loss = self.loss_function(mtp_logits_, shift_labels_mtp.to(mtp_logits_.device).view(-1), self.config.vocab_size, **kwargs)
1733
+ if loss is not None:
1734
+ loss += self.mtp_loss_scaling_factor * mtp_loss
1735
+ else:
1736
+ loss = self.mtp_loss_scaling_factor * mtp_loss
1737
+
1738
+ if all_mtp_loss is None:
1739
+ all_mtp_loss = []
1740
+ all_mtp_loss.append(mtp_loss)
1741
+
1742
+ if not return_dict:
1743
+ output = (logits,) + outputs[1:]
1744
+ if output_router_logits:
1745
+ output = (aux_loss,) + output
1746
+ return (loss,) + output if loss is not None else output
1747
+
1748
+ return MoEV2CausalLMOutputWithPast(
1749
+ loss=loss,
1750
+ mtp_loss=all_mtp_loss,
1751
+ aux_loss=aux_loss,
1752
+ logits=logits,
1753
+ mtp_logits=all_mtp_logits,
1754
+ past_key_values=outputs.past_key_values,
1755
+ hidden_states=outputs.hidden_states,
1756
+ attentions=outputs.attentions,
1757
+ router_logits=outputs.router_logits,
1758
+ )
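One detail of the multi-token-prediction (MTP) loss above that is easy to miss: for each extra prediction head the labels are shifted one more position to the left, so head `i` is supervised to predict the token `i + 1` steps ahead. A minimal sketch of that shifting, assuming `roll_tensor` behaves like `torch.roll` followed by filling the vacated slot with `fill_value`:

```python
import torch

def shift_left(labels: torch.Tensor, fill_value: int = -100) -> torch.Tensor:
    shifted = torch.roll(labels, shifts=-1, dims=-1)
    shifted[..., -1] = fill_value          # -100 is ignored by the cross-entropy loss
    return shifted

labels = torch.tensor([[11, 12, 13, 14, -100]])
print(shift_left(labels))                  # tensor([[  12,   13,   14, -100, -100]])
```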
special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
1
+ {
2
+ "bos_token": {
3
+ "content": "<|startoftext|>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "cls_token": {
10
+ "content": "[CLS]",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "eos_token": {
17
+ "content": "<|endoftext|>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "pad_token": {
24
+ "content": "<|endoftext|>",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ }
30
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fdcadf59ad1db38dde175f2a82d3ec2dde15986ac1f81aef69c5cdd03afc6e1b
3
+ size 12205847
tokenizer_config.json ADDED
@@ -0,0 +1,2114 @@
1
+ {
2
+ "add_bos_token": false,
3
+ "add_eos_token": false,
4
+ "added_tokens_decoder": {
5
+ "156891": {
6
+ "content": "<|startoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "156892": {
14
+ "content": "<|endoftext|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "156893": {
22
+ "content": "[CLS]",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "156894": {
30
+ "content": "[gMASK]",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "156895": {
38
+ "content": "<|reserved_token_0|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "156896": {
46
+ "content": "<|reserved_token_1|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "156897": {
54
+ "content": "<|reserved_token_2|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "156898": {
62
+ "content": "<|reserved_token_3|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "156899": {
70
+ "content": "<|reserved_token_4|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "156900": {
78
+ "content": "<|reserved_token_5|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "156901": {
86
+ "content": "<|reserved_token_6|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "156902": {
94
+ "content": "<|reserved_token_7|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "156903": {
102
+ "content": "<|reserved_token_8|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "156904": {
110
+ "content": "<|reserved_token_9|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "156905": {
118
+ "content": "<|reserved_token_10|>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": true
124
+ },
125
+ "156906": {
126
+ "content": "<|reserved_token_11|>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": true
132
+ },
133
+ "156907": {
134
+ "content": "<|reserved_token_12|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": true
140
+ },
141
+ "156908": {
142
+ "content": "<|reserved_token_13|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": true
148
+ },
149
+ "156909": {
150
+ "content": "<|reserved_token_14|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": true
156
+ },
157
+ "156910": {
158
+ "content": "<|reserved_token_15|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": true
164
+ },
165
+ "156911": {
166
+ "content": "<|reserved_token_16|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": true
172
+ },
173
+ "156912": {
174
+ "content": "<|reserved_token_17|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": true
180
+ },
181
+ "156913": {
182
+ "content": "<|reserved_token_18|>",
183
+ "lstrip": false,
184
+ "normalized": false,
185
+ "rstrip": false,
186
+ "single_word": false,
187
+ "special": true
188
+ },
189
+ "156914": {
190
+ "content": "<|reserved_token_19|>",
191
+ "lstrip": false,
192
+ "normalized": false,
193
+ "rstrip": false,
194
+ "single_word": false,
195
+ "special": true
196
+ },
197
+ "156915": {
198
+ "content": "<|reserved_token_20|>",
199
+ "lstrip": false,
200
+ "normalized": false,
201
+ "rstrip": false,
202
+ "single_word": false,
203
+ "special": true
204
+ },
205
+ "156916": {
206
+ "content": "<|reserved_token_21|>",
207
+ "lstrip": false,
208
+ "normalized": false,
209
+ "rstrip": false,
210
+ "single_word": false,
211
+ "special": true
212
+ },
213
+ "156917": {
214
+ "content": "<|reserved_token_22|>",
215
+ "lstrip": false,
216
+ "normalized": false,
217
+ "rstrip": false,
218
+ "single_word": false,
219
+ "special": true
220
+ },
221
+ "156918": {
222
+ "content": "<|reserved_token_23|>",
223
+ "lstrip": false,
224
+ "normalized": false,
225
+ "rstrip": false,
226
+ "single_word": false,
227
+ "special": true
228
+ },
229
+ "156919": {
230
+ "content": "<|reserved_token_24|>",
231
+ "lstrip": false,
232
+ "normalized": false,
233
+ "rstrip": false,
234
+ "single_word": false,
235
+ "special": true
236
+ },
237
+ "156920": {
238
+ "content": "<|reserved_token_25|>",
239
+ "lstrip": false,
240
+ "normalized": false,
241
+ "rstrip": false,
242
+ "single_word": false,
243
+ "special": true
244
+ },
245
+ "156921": {
246
+ "content": "<|reserved_token_26|>",
247
+ "lstrip": false,
248
+ "normalized": false,
249
+ "rstrip": false,
250
+ "single_word": false,
251
+ "special": true
252
+ },
253
+ "156922": {
254
+ "content": "<|reserved_token_27|>",
255
+ "lstrip": false,
256
+ "normalized": false,
257
+ "rstrip": false,
258
+ "single_word": false,
259
+ "special": true
260
+ },
261
+ "156923": {
262
+ "content": "<|reserved_token_28|>",
263
+ "lstrip": false,
264
+ "normalized": false,
265
+ "rstrip": false,
266
+ "single_word": false,
267
+ "special": true
268
+ },
269
+ "156924": {
270
+ "content": "<|reserved_token_29|>",
271
+ "lstrip": false,
272
+ "normalized": false,
273
+ "rstrip": false,
274
+ "single_word": false,
275
+ "special": true
276
+ },
277
+ "156925": {
278
+ "content": "<|reserved_token_30|>",
279
+ "lstrip": false,
280
+ "normalized": false,
281
+ "rstrip": false,
282
+ "single_word": false,
283
+ "special": true
284
+ },
285
+ "156926": {
286
+ "content": "<|reserved_token_31|>",
287
+ "lstrip": false,
288
+ "normalized": false,
289
+ "rstrip": false,
290
+ "single_word": false,
291
+ "special": true
292
+ },
293
+ "156927": {
294
+ "content": "<|reserved_token_32|>",
295
+ "lstrip": false,
296
+ "normalized": false,
297
+ "rstrip": false,
298
+ "single_word": false,
299
+ "special": true
300
+ },
301
+ "156928": {
302
+ "content": "<|reserved_token_33|>",
303
+ "lstrip": false,
304
+ "normalized": false,
305
+ "rstrip": false,
306
+ "single_word": false,
307
+ "special": true
308
+ },
309
+ "156929": {
310
+ "content": "<|reserved_token_34|>",
311
+ "lstrip": false,
312
+ "normalized": false,
313
+ "rstrip": false,
314
+ "single_word": false,
315
+ "special": true
316
+ },
317
+ "156930": {
318
+ "content": "<|reserved_token_35|>",
319
+ "lstrip": false,
320
+ "normalized": false,
321
+ "rstrip": false,
322
+ "single_word": false,
323
+ "special": true
324
+ },
325
+ "156931": {
326
+ "content": "<|reserved_token_36|>",
327
+ "lstrip": false,
328
+ "normalized": false,
329
+ "rstrip": false,
330
+ "single_word": false,
331
+ "special": true
332
+ },
333
+ "156932": {
334
+ "content": "<|reserved_token_37|>",
335
+ "lstrip": false,
336
+ "normalized": false,
337
+ "rstrip": false,
338
+ "single_word": false,
339
+ "special": true
340
+ },
341
+ "156933": {
342
+ "content": "<|reserved_token_38|>",
343
+ "lstrip": false,
344
+ "normalized": false,
345
+ "rstrip": false,
346
+ "single_word": false,
347
+ "special": true
348
+ },
349
+ "156934": {
350
+ "content": "<|reserved_token_39|>",
351
+ "lstrip": false,
352
+ "normalized": false,
353
+ "rstrip": false,
354
+ "single_word": false,
355
+ "special": true
356
+ },
357
+ "156935": {
358
+ "content": "<|reserved_token_40|>",
359
+ "lstrip": false,
360
+ "normalized": false,
361
+ "rstrip": false,
362
+ "single_word": false,
363
+ "special": true
364
+ },
365
+ "156936": {
366
+ "content": "<|reserved_token_41|>",
367
+ "lstrip": false,
368
+ "normalized": false,
369
+ "rstrip": false,
370
+ "single_word": false,
371
+ "special": true
372
+ },
373
+ "156937": {
374
+ "content": "<|reserved_token_42|>",
375
+ "lstrip": false,
376
+ "normalized": false,
377
+ "rstrip": false,
378
+ "single_word": false,
379
+ "special": true
380
+ },
381
+ "156938": {
382
+ "content": "<|reserved_token_43|>",
383
+ "lstrip": false,
384
+ "normalized": false,
385
+ "rstrip": false,
386
+ "single_word": false,
387
+ "special": true
388
+ },
389
+ "156939": {
390
+ "content": "<|reserved_token_44|>",
391
+ "lstrip": false,
392
+ "normalized": false,
393
+ "rstrip": false,
394
+ "single_word": false,
395
+ "special": true
396
+ },
397
+ "156940": {
398
+ "content": "<|reserved_token_45|>",
399
+ "lstrip": false,
400
+ "normalized": false,
401
+ "rstrip": false,
402
+ "single_word": false,
403
+ "special": true
404
+ },
405
+ "156941": {
406
+ "content": "<|reserved_token_46|>",
407
+ "lstrip": false,
408
+ "normalized": false,
409
+ "rstrip": false,
410
+ "single_word": false,
411
+ "special": true
412
+ },
413
+ "156942": {
414
+ "content": "<|reserved_token_47|>",
415
+ "lstrip": false,
416
+ "normalized": false,
417
+ "rstrip": false,
418
+ "single_word": false,
419
+ "special": true
420
+ },
421
+ "156943": {
422
+ "content": "<|reserved_token_48|>",
423
+ "lstrip": false,
424
+ "normalized": false,
425
+ "rstrip": false,
426
+ "single_word": false,
427
+ "special": true
428
+ },
429
+ "156944": {
430
+ "content": "<|reserved_token_49|>",
431
+ "lstrip": false,
432
+ "normalized": false,
433
+ "rstrip": false,
434
+ "single_word": false,
435
+ "special": true
436
+ },
437
+ "156945": {
438
+ "content": "<|reserved_token_50|>",
439
+ "lstrip": false,
440
+ "normalized": false,
441
+ "rstrip": false,
442
+ "single_word": false,
443
+ "special": true
444
+ },
445
+ "156946": {
446
+ "content": "<|reserved_token_51|>",
447
+ "lstrip": false,
448
+ "normalized": false,
449
+ "rstrip": false,
450
+ "single_word": false,
451
+ "special": true
452
+ },
453
+ "156947": {
454
+ "content": "<|reserved_token_52|>",
455
+ "lstrip": false,
456
+ "normalized": false,
457
+ "rstrip": false,
458
+ "single_word": false,
459
+ "special": true
460
+ },
461
+ "156948": {
462
+ "content": "<|reserved_token_53|>",
463
+ "lstrip": false,
464
+ "normalized": false,
465
+ "rstrip": false,
466
+ "single_word": false,
467
+ "special": true
468
+ },
469
+ "156949": {
470
+ "content": "<|reserved_token_54|>",
471
+ "lstrip": false,
472
+ "normalized": false,
473
+ "rstrip": false,
474
+ "single_word": false,
475
+ "special": true
476
+ },
477
+ "156950": {
478
+ "content": "<|reserved_token_55|>",
479
+ "lstrip": false,
480
+ "normalized": false,
481
+ "rstrip": false,
482
+ "single_word": false,
483
+ "special": true
484
+ },
485
+ "156951": {
486
+ "content": "<|reserved_token_56|>",
487
+ "lstrip": false,
488
+ "normalized": false,
489
+ "rstrip": false,
490
+ "single_word": false,
491
+ "special": true
492
+ },
493
+ "156952": {
494
+ "content": "<|reserved_token_57|>",
495
+ "lstrip": false,
496
+ "normalized": false,
497
+ "rstrip": false,
498
+ "single_word": false,
499
+ "special": true
500
+ },
501
+ "156953": {
502
+ "content": "<|reserved_token_58|>",
503
+ "lstrip": false,
504
+ "normalized": false,
505
+ "rstrip": false,
506
+ "single_word": false,
507
+ "special": true
508
+ },
509
+ "156954": {
510
+ "content": "<|reserved_token_59|>",
511
+ "lstrip": false,
512
+ "normalized": false,
513
+ "rstrip": false,
514
+ "single_word": false,
515
+ "special": true
516
+ },
517
+ "156955": {
518
+ "content": "<|reserved_token_60|>",
519
+ "lstrip": false,
520
+ "normalized": false,
521
+ "rstrip": false,
522
+ "single_word": false,
523
+ "special": true
524
+ },
525
+ "156956": {
526
+ "content": "<|reserved_token_61|>",
527
+ "lstrip": false,
528
+ "normalized": false,
529
+ "rstrip": false,
530
+ "single_word": false,
531
+ "special": true
532
+ },
533
+ "156957": {
534
+ "content": "<|reserved_token_62|>",
535
+ "lstrip": false,
536
+ "normalized": false,
537
+ "rstrip": false,
538
+ "single_word": false,
539
+ "special": true
540
+ },
541
+ "156958": {
542
+ "content": "<|reserved_token_63|>",
543
+ "lstrip": false,
544
+ "normalized": false,
545
+ "rstrip": false,
546
+ "single_word": false,
547
+ "special": true
548
+ },
549
+ "156959": {
550
+ "content": "<|reserved_token_64|>",
551
+ "lstrip": false,
552
+ "normalized": false,
553
+ "rstrip": false,
554
+ "single_word": false,
555
+ "special": true
556
+ },
557
+ "156960": {
558
+ "content": "<|reserved_token_65|>",
559
+ "lstrip": false,
560
+ "normalized": false,
561
+ "rstrip": false,
562
+ "single_word": false,
563
+ "special": true
564
+ },
565
+ "156961": {
566
+ "content": "<|reserved_token_66|>",
567
+ "lstrip": false,
568
+ "normalized": false,
569
+ "rstrip": false,
570
+ "single_word": false,
571
+ "special": true
572
+ },
573
+ "156962": {
574
+ "content": "<|reserved_token_67|>",
575
+ "lstrip": false,
576
+ "normalized": false,
577
+ "rstrip": false,
578
+ "single_word": false,
579
+ "special": true
580
+ },
581
+ "156963": {
582
+ "content": "<|reserved_token_68|>",
583
+ "lstrip": false,
584
+ "normalized": false,
585
+ "rstrip": false,
586
+ "single_word": false,
587
+ "special": true
588
+ },
589
+ "156964": {
590
+ "content": "<|reserved_token_69|>",
591
+ "lstrip": false,
592
+ "normalized": false,
593
+ "rstrip": false,
594
+ "single_word": false,
595
+ "special": true
596
+ },
597
+ "156965": {
598
+ "content": "<|reserved_token_70|>",
599
+ "lstrip": false,
600
+ "normalized": false,
601
+ "rstrip": false,
602
+ "single_word": false,
603
+ "special": true
604
+ },
605
+ "156966": {
606
+ "content": "<|reserved_token_71|>",
607
+ "lstrip": false,
608
+ "normalized": false,
609
+ "rstrip": false,
610
+ "single_word": false,
611
+ "special": true
612
+ },
613
+ "156967": {
614
+ "content": "<|reserved_token_72|>",
615
+ "lstrip": false,
616
+ "normalized": false,
617
+ "rstrip": false,
618
+ "single_word": false,
619
+ "special": true
620
+ },
621
+ "156968": {
622
+ "content": "<|reserved_token_73|>",
623
+ "lstrip": false,
624
+ "normalized": false,
625
+ "rstrip": false,
626
+ "single_word": false,
627
+ "special": true
628
+ },
629
+ "156969": {
630
+ "content": "<|reserved_token_74|>",
631
+ "lstrip": false,
632
+ "normalized": false,
633
+ "rstrip": false,
634
+ "single_word": false,
635
+ "special": true
636
+ },
637
+ "156970": {
638
+ "content": "<|reserved_token_75|>",
639
+ "lstrip": false,
640
+ "normalized": false,
641
+ "rstrip": false,
642
+ "single_word": false,
643
+ "special": true
644
+ },
645
+ "156971": {
646
+ "content": "<|reserved_token_76|>",
647
+ "lstrip": false,
648
+ "normalized": false,
649
+ "rstrip": false,
650
+ "single_word": false,
651
+ "special": true
652
+ },
653
+ "156972": {
654
+ "content": "<|reserved_token_77|>",
655
+ "lstrip": false,
656
+ "normalized": false,
657
+ "rstrip": false,
658
+ "single_word": false,
659
+ "special": true
660
+ },
661
+ "156973": {
662
+ "content": "<|reserved_token_78|>",
663
+ "lstrip": false,
664
+ "normalized": false,
665
+ "rstrip": false,
666
+ "single_word": false,
667
+ "special": true
668
+ },
669
+ "156974": {
670
+ "content": "<|reserved_token_79|>",
671
+ "lstrip": false,
672
+ "normalized": false,
673
+ "rstrip": false,
674
+ "single_word": false,
675
+ "special": true
676
+ },
677
+ "156975": {
678
+ "content": "<|reserved_token_80|>",
679
+ "lstrip": false,
680
+ "normalized": false,
681
+ "rstrip": false,
682
+ "single_word": false,
683
+ "special": true
684
+ },
685
+ "156976": {
686
+ "content": "<|reserved_token_81|>",
687
+ "lstrip": false,
688
+ "normalized": false,
689
+ "rstrip": false,
690
+ "single_word": false,
691
+ "special": true
692
+ },
693
+ "156977": {
694
+ "content": "<|reserved_token_82|>",
695
+ "lstrip": false,
696
+ "normalized": false,
697
+ "rstrip": false,
698
+ "single_word": false,
699
+ "special": true
700
+ },
701
+ "156978": {
702
+ "content": "<|reserved_token_83|>",
703
+ "lstrip": false,
704
+ "normalized": false,
705
+ "rstrip": false,
706
+ "single_word": false,
707
+ "special": true
708
+ },
709
+ "156979": {
710
+ "content": "<|reserved_token_84|>",
711
+ "lstrip": false,
712
+ "normalized": false,
713
+ "rstrip": false,
714
+ "single_word": false,
715
+ "special": true
716
+ },
717
+ "156980": {
718
+ "content": "<|reserved_token_85|>",
719
+ "lstrip": false,
720
+ "normalized": false,
721
+ "rstrip": false,
722
+ "single_word": false,
723
+ "special": true
724
+ },
725
+ "156981": {
726
+ "content": "<|reserved_token_86|>",
727
+ "lstrip": false,
728
+ "normalized": false,
729
+ "rstrip": false,
730
+ "single_word": false,
731
+ "special": true
732
+ },
733
+ "156982": {
734
+ "content": "<|reserved_token_87|>",
735
+ "lstrip": false,
736
+ "normalized": false,
737
+ "rstrip": false,
738
+ "single_word": false,
739
+ "special": true
740
+ },
741
+ "156983": {
742
+ "content": "<|reserved_token_88|>",
743
+ "lstrip": false,
744
+ "normalized": false,
745
+ "rstrip": false,
746
+ "single_word": false,
747
+ "special": true
748
+ },
749
+ "156984": {
750
+ "content": "<|reserved_token_89|>",
751
+ "lstrip": false,
752
+ "normalized": false,
753
+ "rstrip": false,
754
+ "single_word": false,
755
+ "special": true
756
+ },
757
+ "156985": {
758
+ "content": "<|reserved_token_90|>",
759
+ "lstrip": false,
760
+ "normalized": false,
761
+ "rstrip": false,
762
+ "single_word": false,
763
+ "special": true
764
+ },
765
+ "156986": {
766
+ "content": "<|reserved_token_91|>",
767
+ "lstrip": false,
768
+ "normalized": false,
769
+ "rstrip": false,
770
+ "single_word": false,
771
+ "special": true
772
+ },
773
+ "156987": {
774
+ "content": "<|reserved_token_92|>",
775
+ "lstrip": false,
776
+ "normalized": false,
777
+ "rstrip": false,
778
+ "single_word": false,
779
+ "special": true
780
+ },
781
+ "156988": {
782
+ "content": "<|reserved_token_93|>",
783
+ "lstrip": false,
784
+ "normalized": false,
785
+ "rstrip": false,
786
+ "single_word": false,
787
+ "special": true
788
+ },
789
+ "156989": {
790
+ "content": "<|reserved_token_94|>",
791
+ "lstrip": false,
792
+ "normalized": false,
793
+ "rstrip": false,
794
+ "single_word": false,
795
+ "special": true
796
+ },
797
+ "156990": {
798
+ "content": "<|reserved_token_95|>",
799
+ "lstrip": false,
800
+ "normalized": false,
801
+ "rstrip": false,
802
+ "single_word": false,
803
+ "special": true
804
+ },
805
+ "156991": {
806
+ "content": "<|reserved_token_96|>",
807
+ "lstrip": false,
808
+ "normalized": false,
809
+ "rstrip": false,
810
+ "single_word": false,
811
+ "special": true
812
+ },
813
+ "156992": {
814
+ "content": "<|reserved_token_97|>",
815
+ "lstrip": false,
816
+ "normalized": false,
817
+ "rstrip": false,
818
+ "single_word": false,
819
+ "special": true
820
+ },
821
+ "156993": {
822
+ "content": "<|reserved_token_98|>",
823
+ "lstrip": false,
824
+ "normalized": false,
825
+ "rstrip": false,
826
+ "single_word": false,
827
+ "special": true
828
+ },
829
+ "156994": {
830
+ "content": "<|reserved_token_99|>",
831
+ "lstrip": false,
832
+ "normalized": false,
833
+ "rstrip": false,
834
+ "single_word": false,
835
+ "special": true
836
+ },
837
+ "156995": {
838
+ "content": "<|reserved_token_100|>",
839
+ "lstrip": false,
840
+ "normalized": false,
841
+ "rstrip": false,
842
+ "single_word": false,
843
+ "special": true
844
+ },
845
+ "156996": {
846
+ "content": "<|reserved_token_101|>",
847
+ "lstrip": false,
848
+ "normalized": false,
849
+ "rstrip": false,
850
+ "single_word": false,
851
+ "special": true
852
+ },
853
+ "156997": {
854
+ "content": "<|reserved_token_102|>",
855
+ "lstrip": false,
856
+ "normalized": false,
857
+ "rstrip": false,
858
+ "single_word": false,
859
+ "special": true
860
+ },
861
+ "156998": {
862
+ "content": "<|reserved_token_103|>",
863
+ "lstrip": false,
864
+ "normalized": false,
865
+ "rstrip": false,
866
+ "single_word": false,
867
+ "special": true
868
+ },
869
+ "156999": {
870
+ "content": "<|reserved_token_104|>",
871
+ "lstrip": false,
872
+ "normalized": false,
873
+ "rstrip": false,
874
+ "single_word": false,
875
+ "special": true
876
+ },
877
+ "157000": {
878
+ "content": "<|reserved_token_105|>",
879
+ "lstrip": false,
880
+ "normalized": false,
881
+ "rstrip": false,
882
+ "single_word": false,
883
+ "special": true
884
+ },
885
+ "157001": {
886
+ "content": "<|reserved_token_106|>",
887
+ "lstrip": false,
888
+ "normalized": false,
889
+ "rstrip": false,
890
+ "single_word": false,
891
+ "special": true
892
+ },
893
+ "157002": {
894
+ "content": "<|reserved_token_107|>",
895
+ "lstrip": false,
896
+ "normalized": false,
897
+ "rstrip": false,
898
+ "single_word": false,
899
+ "special": true
900
+ },
901
+ "157003": {
902
+ "content": "<|reserved_token_108|>",
903
+ "lstrip": false,
904
+ "normalized": false,
905
+ "rstrip": false,
906
+ "single_word": false,
907
+ "special": true
908
+ },
909
+ "157004": {
910
+ "content": "<|reserved_token_109|>",
911
+ "lstrip": false,
912
+ "normalized": false,
913
+ "rstrip": false,
914
+ "single_word": false,
915
+ "special": true
916
+ },
917
+ "157005": {
918
+ "content": "<|reserved_token_110|>",
919
+ "lstrip": false,
920
+ "normalized": false,
921
+ "rstrip": false,
922
+ "single_word": false,
923
+ "special": true
924
+ },
925
+ "157006": {
926
+ "content": "<|reserved_token_111|>",
927
+ "lstrip": false,
928
+ "normalized": false,
929
+ "rstrip": false,
930
+ "single_word": false,
931
+ "special": true
932
+ },
933
+ "157007": {
934
+ "content": "<|reserved_token_112|>",
935
+ "lstrip": false,
936
+ "normalized": false,
937
+ "rstrip": false,
938
+ "single_word": false,
939
+ "special": true
940
+ },
941
+ "157008": {
942
+ "content": "<|reserved_token_113|>",
943
+ "lstrip": false,
944
+ "normalized": false,
945
+ "rstrip": false,
946
+ "single_word": false,
947
+ "special": true
948
+ },
949
+ "157009": {
950
+ "content": "<|reserved_token_114|>",
951
+ "lstrip": false,
952
+ "normalized": false,
953
+ "rstrip": false,
954
+ "single_word": false,
955
+ "special": true
956
+ },
957
+ "157010": {
958
+ "content": "<|reserved_token_115|>",
959
+ "lstrip": false,
960
+ "normalized": false,
961
+ "rstrip": false,
962
+ "single_word": false,
963
+ "special": true
964
+ },
965
+ "157011": {
966
+ "content": "<|reserved_token_116|>",
967
+ "lstrip": false,
968
+ "normalized": false,
969
+ "rstrip": false,
970
+ "single_word": false,
971
+ "special": true
972
+ },
973
+ "157012": {
974
+ "content": "<|reserved_token_117|>",
975
+ "lstrip": false,
976
+ "normalized": false,
977
+ "rstrip": false,
978
+ "single_word": false,
979
+ "special": true
980
+ },
981
+ "157013": {
982
+ "content": "<|reserved_token_118|>",
983
+ "lstrip": false,
984
+ "normalized": false,
985
+ "rstrip": false,
986
+ "single_word": false,
987
+ "special": true
988
+ },
989
+ "157014": {
990
+ "content": "<|reserved_token_119|>",
991
+ "lstrip": false,
992
+ "normalized": false,
993
+ "rstrip": false,
994
+ "single_word": false,
995
+ "special": true
996
+ },
997
+ "157015": {
998
+ "content": "<|reserved_token_120|>",
999
+ "lstrip": false,
1000
+ "normalized": false,
1001
+ "rstrip": false,
1002
+ "single_word": false,
1003
+ "special": true
1004
+ },
1005
+ "157016": {
1006
+ "content": "<|reserved_token_121|>",
1007
+ "lstrip": false,
1008
+ "normalized": false,
1009
+ "rstrip": false,
1010
+ "single_word": false,
1011
+ "special": true
1012
+ },
1013
+ "157017": {
1014
+ "content": "<|reserved_token_122|>",
1015
+ "lstrip": false,
1016
+ "normalized": false,
1017
+ "rstrip": false,
1018
+ "single_word": false,
1019
+ "special": true
1020
+ },
1021
+ "157018": {
1022
+ "content": "<|reserved_token_123|>",
1023
+ "lstrip": false,
1024
+ "normalized": false,
1025
+ "rstrip": false,
1026
+ "single_word": false,
1027
+ "special": true
1028
+ },
1029
+ "157019": {
1030
+ "content": "<|reserved_token_124|>",
1031
+ "lstrip": false,
1032
+ "normalized": false,
1033
+ "rstrip": false,
1034
+ "single_word": false,
1035
+ "special": true
1036
+ },
1037
+ "157020": {
1038
+ "content": "<|reserved_token_125|>",
1039
+ "lstrip": false,
1040
+ "normalized": false,
1041
+ "rstrip": false,
1042
+ "single_word": false,
1043
+ "special": true
1044
+ },
1045
+ "157021": {
1046
+ "content": "<|reserved_token_126|>",
1047
+ "lstrip": false,
1048
+ "normalized": false,
1049
+ "rstrip": false,
1050
+ "single_word": false,
1051
+ "special": true
1052
+ },
1053
+ "157022": {
1054
+ "content": "<|reserved_token_127|>",
1055
+ "lstrip": false,
1056
+ "normalized": false,
1057
+ "rstrip": false,
1058
+ "single_word": false,
1059
+ "special": true
1060
+ },
1061
+ "157023": {
1062
+ "content": "<|reserved_token_128|>",
1063
+ "lstrip": false,
1064
+ "normalized": false,
1065
+ "rstrip": false,
1066
+ "single_word": false,
1067
+ "special": true
1068
+ },
1069
+ "157024": {
1070
+ "content": "<|reserved_token_129|>",
1071
+ "lstrip": false,
1072
+ "normalized": false,
1073
+ "rstrip": false,
1074
+ "single_word": false,
1075
+ "special": true
1076
+ },
1077
+ "157025": {
1078
+ "content": "<|reserved_token_130|>",
1079
+ "lstrip": false,
1080
+ "normalized": false,
1081
+ "rstrip": false,
1082
+ "single_word": false,
1083
+ "special": true
1084
+ },
1085
+ "157026": {
1086
+ "content": "<|reserved_token_131|>",
1087
+ "lstrip": false,
1088
+ "normalized": false,
1089
+ "rstrip": false,
1090
+ "single_word": false,
1091
+ "special": true
1092
+ },
1093
+ "157027": {
1094
+ "content": "<|reserved_token_132|>",
1095
+ "lstrip": false,
1096
+ "normalized": false,
1097
+ "rstrip": false,
1098
+ "single_word": false,
1099
+ "special": true
1100
+ },
1101
+ "157028": {
1102
+ "content": "<|reserved_token_133|>",
1103
+ "lstrip": false,
1104
+ "normalized": false,
1105
+ "rstrip": false,
1106
+ "single_word": false,
1107
+ "special": true
1108
+ },
1109
+ "157029": {
1110
+ "content": "<|reserved_token_134|>",
1111
+ "lstrip": false,
1112
+ "normalized": false,
1113
+ "rstrip": false,
1114
+ "single_word": false,
1115
+ "special": true
1116
+ },
1117
+ "157030": {
1118
+ "content": "<|reserved_token_135|>",
1119
+ "lstrip": false,
1120
+ "normalized": false,
1121
+ "rstrip": false,
1122
+ "single_word": false,
1123
+ "special": true
1124
+ },
1125
+ "157031": {
1126
+ "content": "<|reserved_token_136|>",
1127
+ "lstrip": false,
1128
+ "normalized": false,
1129
+ "rstrip": false,
1130
+ "single_word": false,
1131
+ "special": true
1132
+ },
1133
+ "157032": {
1134
+ "content": "<|reserved_token_137|>",
1135
+ "lstrip": false,
1136
+ "normalized": false,
1137
+ "rstrip": false,
1138
+ "single_word": false,
1139
+ "special": true
1140
+ },
1141
+ "157033": {
1142
+ "content": "<|reserved_token_138|>",
1143
+ "lstrip": false,
1144
+ "normalized": false,
1145
+ "rstrip": false,
1146
+ "single_word": false,
1147
+ "special": true
1148
+ },
1149
+ "157034": {
1150
+ "content": "<|reserved_token_139|>",
1151
+ "lstrip": false,
1152
+ "normalized": false,
1153
+ "rstrip": false,
1154
+ "single_word": false,
1155
+ "special": true
1156
+ },
1157
+ "157035": {
1158
+ "content": "<|reserved_token_140|>",
1159
+ "lstrip": false,
1160
+ "normalized": false,
1161
+ "rstrip": false,
1162
+ "single_word": false,
1163
+ "special": true
1164
+ },
1165
+ "157036": {
1166
+ "content": "<|reserved_token_141|>",
1167
+ "lstrip": false,
1168
+ "normalized": false,
1169
+ "rstrip": false,
1170
+ "single_word": false,
1171
+ "special": true
1172
+ },
1173
+ "157037": {
1174
+ "content": "<|reserved_token_142|>",
1175
+ "lstrip": false,
1176
+ "normalized": false,
1177
+ "rstrip": false,
1178
+ "single_word": false,
1179
+ "special": true
1180
+ },
1181
+ "157038": {
1182
+ "content": "<|reserved_token_143|>",
1183
+ "lstrip": false,
1184
+ "normalized": false,
1185
+ "rstrip": false,
1186
+ "single_word": false,
1187
+ "special": true
1188
+ },
1189
+ "157039": {
1190
+ "content": "<|reserved_token_144|>",
1191
+ "lstrip": false,
1192
+ "normalized": false,
1193
+ "rstrip": false,
1194
+ "single_word": false,
1195
+ "special": true
1196
+ },
1197
+ "157040": {
1198
+ "content": "<|reserved_token_145|>",
1199
+ "lstrip": false,
1200
+ "normalized": false,
1201
+ "rstrip": false,
1202
+ "single_word": false,
1203
+ "special": true
1204
+ },
1205
+ "157041": {
1206
+ "content": "<|reserved_token_146|>",
1207
+ "lstrip": false,
1208
+ "normalized": false,
1209
+ "rstrip": false,
1210
+ "single_word": false,
1211
+ "special": true
1212
+ },
1213
+ "157042": {
1214
+ "content": "<|reserved_token_147|>",
1215
+ "lstrip": false,
1216
+ "normalized": false,
1217
+ "rstrip": false,
1218
+ "single_word": false,
1219
+ "special": true
1220
+ },
1221
+ "157043": {
1222
+ "content": "<|reserved_token_148|>",
1223
+ "lstrip": false,
1224
+ "normalized": false,
1225
+ "rstrip": false,
1226
+ "single_word": false,
1227
+ "special": true
1228
+ },
1229
+ "157044": {
1230
+ "content": "<|reserved_token_149|>",
1231
+ "lstrip": false,
1232
+ "normalized": false,
1233
+ "rstrip": false,
1234
+ "single_word": false,
1235
+ "special": true
1236
+ },
1237
+ "157045": {
1238
+ "content": "<|reserved_token_150|>",
1239
+ "lstrip": false,
1240
+ "normalized": false,
1241
+ "rstrip": false,
1242
+ "single_word": false,
1243
+ "special": true
1244
+ },
1245
+ "157046": {
1246
+ "content": "<|reserved_token_151|>",
1247
+ "lstrip": false,
1248
+ "normalized": false,
1249
+ "rstrip": false,
1250
+ "single_word": false,
1251
+ "special": true
1252
+ },
1253
+ "157047": {
1254
+ "content": "<|reserved_token_152|>",
1255
+ "lstrip": false,
1256
+ "normalized": false,
1257
+ "rstrip": false,
1258
+ "single_word": false,
1259
+ "special": true
1260
+ },
1261
+ "157048": {
1262
+ "content": "<|reserved_token_153|>",
1263
+ "lstrip": false,
1264
+ "normalized": false,
1265
+ "rstrip": false,
1266
+ "single_word": false,
1267
+ "special": true
1268
+ },
1269
+ "157049": {
1270
+ "content": "<|reserved_token_154|>",
1271
+ "lstrip": false,
1272
+ "normalized": false,
1273
+ "rstrip": false,
1274
+ "single_word": false,
1275
+ "special": true
1276
+ },
1277
+ "157050": {
1278
+ "content": "<|reserved_token_155|>",
1279
+ "lstrip": false,
1280
+ "normalized": false,
1281
+ "rstrip": false,
1282
+ "single_word": false,
1283
+ "special": true
1284
+ },
1285
+ "157051": {
1286
+ "content": "<|reserved_token_156|>",
1287
+ "lstrip": false,
1288
+ "normalized": false,
1289
+ "rstrip": false,
1290
+ "single_word": false,
1291
+ "special": true
1292
+ },
1293
+ "157052": {
1294
+ "content": "<|reserved_token_157|>",
1295
+ "lstrip": false,
1296
+ "normalized": false,
1297
+ "rstrip": false,
1298
+ "single_word": false,
1299
+ "special": true
1300
+ },
1301
+ "157053": {
1302
+ "content": "<|reserved_token_158|>",
1303
+ "lstrip": false,
1304
+ "normalized": false,
1305
+ "rstrip": false,
1306
+ "single_word": false,
1307
+ "special": true
1308
+ },
1309
+ "157054": {
1310
+ "content": "<|reserved_token_159|>",
1311
+ "lstrip": false,
1312
+ "normalized": false,
1313
+ "rstrip": false,
1314
+ "single_word": false,
1315
+ "special": true
1316
+ },
1317
+ "157055": {
1318
+ "content": "<|reserved_token_160|>",
1319
+ "lstrip": false,
1320
+ "normalized": false,
1321
+ "rstrip": false,
1322
+ "single_word": false,
1323
+ "special": true
1324
+ },
1325
+ "157056": {
1326
+ "content": "<|reserved_token_161|>",
1327
+ "lstrip": false,
1328
+ "normalized": false,
1329
+ "rstrip": false,
1330
+ "single_word": false,
1331
+ "special": true
1332
+ },
1333
+ "157057": {
1334
+ "content": "<|reserved_token_162|>",
1335
+ "lstrip": false,
1336
+ "normalized": false,
1337
+ "rstrip": false,
1338
+ "single_word": false,
1339
+ "special": true
1340
+ },
1341
+ "157058": {
1342
+ "content": "<|reserved_token_163|>",
1343
+ "lstrip": false,
1344
+ "normalized": false,
1345
+ "rstrip": false,
1346
+ "single_word": false,
1347
+ "special": true
1348
+ },
1349
+ "157059": {
1350
+ "content": "<|reserved_token_164|>",
1351
+ "lstrip": false,
1352
+ "normalized": false,
1353
+ "rstrip": false,
1354
+ "single_word": false,
1355
+ "special": true
1356
+ },
1357
+ "157060": {
1358
+ "content": "<|reserved_token_165|>",
1359
+ "lstrip": false,
1360
+ "normalized": false,
1361
+ "rstrip": false,
1362
+ "single_word": false,
1363
+ "special": true
1364
+ },
1365
+ "157061": {
1366
+ "content": "<|reserved_token_166|>",
1367
+ "lstrip": false,
1368
+ "normalized": false,
1369
+ "rstrip": false,
1370
+ "single_word": false,
1371
+ "special": true
1372
+ },
1373
+ "157062": {
1374
+ "content": "<|reserved_token_167|>",
1375
+ "lstrip": false,
1376
+ "normalized": false,
1377
+ "rstrip": false,
1378
+ "single_word": false,
1379
+ "special": true
1380
+ },
1381
+ "157063": {
1382
+ "content": "<|reserved_token_168|>",
1383
+ "lstrip": false,
1384
+ "normalized": false,
1385
+ "rstrip": false,
1386
+ "single_word": false,
1387
+ "special": true
1388
+ },
1389
+ "157064": {
1390
+ "content": "<|reserved_token_169|>",
1391
+ "lstrip": false,
1392
+ "normalized": false,
1393
+ "rstrip": false,
1394
+ "single_word": false,
1395
+ "special": true
1396
+ },
1397
+ "157065": {
1398
+ "content": "<|reserved_token_170|>",
1399
+ "lstrip": false,
1400
+ "normalized": false,
1401
+ "rstrip": false,
1402
+ "single_word": false,
1403
+ "special": true
1404
+ },
1405
+ "157066": {
1406
+ "content": "<|reserved_token_171|>",
1407
+ "lstrip": false,
1408
+ "normalized": false,
1409
+ "rstrip": false,
1410
+ "single_word": false,
1411
+ "special": true
1412
+ },
1413
+ "157067": {
1414
+ "content": "<|reserved_token_172|>",
1415
+ "lstrip": false,
1416
+ "normalized": false,
1417
+ "rstrip": false,
1418
+ "single_word": false,
1419
+ "special": true
1420
+ },
1421
+ "157068": {
1422
+ "content": "<|reserved_token_173|>",
1423
+ "lstrip": false,
1424
+ "normalized": false,
1425
+ "rstrip": false,
1426
+ "single_word": false,
1427
+ "special": true
1428
+ },
1429
+ "157069": {
1430
+ "content": "<|reserved_token_174|>",
1431
+ "lstrip": false,
1432
+ "normalized": false,
1433
+ "rstrip": false,
1434
+ "single_word": false,
1435
+ "special": true
1436
+ },
1437
+ "157070": {
1438
+ "content": "<|reserved_token_175|>",
1439
+ "lstrip": false,
1440
+ "normalized": false,
1441
+ "rstrip": false,
1442
+ "single_word": false,
1443
+ "special": true
1444
+ },
1445
+ "157071": {
1446
+ "content": "<|reserved_token_176|>",
1447
+ "lstrip": false,
1448
+ "normalized": false,
1449
+ "rstrip": false,
1450
+ "single_word": false,
1451
+ "special": true
1452
+ },
1453
+ "157072": {
1454
+ "content": "<|reserved_token_177|>",
1455
+ "lstrip": false,
1456
+ "normalized": false,
1457
+ "rstrip": false,
1458
+ "single_word": false,
1459
+ "special": true
1460
+ },
1461
+ "157073": {
1462
+ "content": "<|reserved_token_178|>",
1463
+ "lstrip": false,
1464
+ "normalized": false,
1465
+ "rstrip": false,
1466
+ "single_word": false,
1467
+ "special": true
1468
+ },
1469
+ "157074": {
1470
+ "content": "<|reserved_token_179|>",
1471
+ "lstrip": false,
1472
+ "normalized": false,
1473
+ "rstrip": false,
1474
+ "single_word": false,
1475
+ "special": true
1476
+ },
1477
+ "157075": {
1478
+ "content": "<|reserved_token_180|>",
1479
+ "lstrip": false,
1480
+ "normalized": false,
1481
+ "rstrip": false,
1482
+ "single_word": false,
1483
+ "special": true
1484
+ },
1485
+ "157076": {
1486
+ "content": "<|reserved_token_181|>",
1487
+ "lstrip": false,
1488
+ "normalized": false,
1489
+ "rstrip": false,
1490
+ "single_word": false,
1491
+ "special": true
1492
+ },
1493
+ "157077": {
1494
+ "content": "<|reserved_token_182|>",
1495
+ "lstrip": false,
1496
+ "normalized": false,
1497
+ "rstrip": false,
1498
+ "single_word": false,
1499
+ "special": true
1500
+ },
1501
+ "157078": {
1502
+ "content": "<|reserved_token_183|>",
1503
+ "lstrip": false,
1504
+ "normalized": false,
1505
+ "rstrip": false,
1506
+ "single_word": false,
1507
+ "special": true
1508
+ },
1509
+ "157079": {
1510
+ "content": "<|reserved_token_184|>",
1511
+ "lstrip": false,
1512
+ "normalized": false,
1513
+ "rstrip": false,
1514
+ "single_word": false,
1515
+ "special": true
1516
+ },
1517
+ "157080": {
1518
+ "content": "<|reserved_token_185|>",
1519
+ "lstrip": false,
1520
+ "normalized": false,
1521
+ "rstrip": false,
1522
+ "single_word": false,
1523
+ "special": true
1524
+ },
1525
+ "157081": {
1526
+ "content": "<|reserved_token_186|>",
1527
+ "lstrip": false,
1528
+ "normalized": false,
1529
+ "rstrip": false,
1530
+ "single_word": false,
1531
+ "special": true
1532
+ },
1533
+ "157082": {
1534
+ "content": "<|reserved_token_187|>",
1535
+ "lstrip": false,
1536
+ "normalized": false,
1537
+ "rstrip": false,
1538
+ "single_word": false,
1539
+ "special": true
1540
+ },
1541
+ "157083": {
1542
+ "content": "<|reserved_token_188|>",
1543
+ "lstrip": false,
1544
+ "normalized": false,
1545
+ "rstrip": false,
1546
+ "single_word": false,
1547
+ "special": true
1548
+ },
1549
+ "157084": {
1550
+ "content": "<|reserved_token_189|>",
1551
+ "lstrip": false,
1552
+ "normalized": false,
1553
+ "rstrip": false,
1554
+ "single_word": false,
1555
+ "special": true
1556
+ },
1557
+ "157085": {
1558
+ "content": "<|reserved_token_190|>",
1559
+ "lstrip": false,
1560
+ "normalized": false,
1561
+ "rstrip": false,
1562
+ "single_word": false,
1563
+ "special": true
1564
+ },
1565
+ "157086": {
1566
+ "content": "<|reserved_token_191|>",
1567
+ "lstrip": false,
1568
+ "normalized": false,
1569
+ "rstrip": false,
1570
+ "single_word": false,
1571
+ "special": true
1572
+ },
1573
+ "157087": {
1574
+ "content": "<|reserved_token_192|>",
1575
+ "lstrip": false,
1576
+ "normalized": false,
1577
+ "rstrip": false,
1578
+ "single_word": false,
1579
+ "special": true
1580
+ },
1581
+ "157088": {
1582
+ "content": "<|reserved_token_193|>",
1583
+ "lstrip": false,
1584
+ "normalized": false,
1585
+ "rstrip": false,
1586
+ "single_word": false,
1587
+ "special": true
1588
+ },
1589
+ "157089": {
1590
+ "content": "<|reserved_token_194|>",
1591
+ "lstrip": false,
1592
+ "normalized": false,
1593
+ "rstrip": false,
1594
+ "single_word": false,
1595
+ "special": true
1596
+ },
1597
+ "157090": {
1598
+ "content": "<|reserved_token_195|>",
1599
+ "lstrip": false,
1600
+ "normalized": false,
1601
+ "rstrip": false,
1602
+ "single_word": false,
1603
+ "special": true
1604
+ },
1605
+ "157091": {
1606
+ "content": "<|reserved_token_196|>",
1607
+ "lstrip": false,
1608
+ "normalized": false,
1609
+ "rstrip": false,
1610
+ "single_word": false,
1611
+ "special": true
1612
+ },
1613
+ "157092": {
1614
+ "content": "<|reserved_token_197|>",
1615
+ "lstrip": false,
1616
+ "normalized": false,
1617
+ "rstrip": false,
1618
+ "single_word": false,
1619
+ "special": true
1620
+ },
1621
+ "157093": {
1622
+ "content": "<|reserved_token_198|>",
1623
+ "lstrip": false,
1624
+ "normalized": false,
1625
+ "rstrip": false,
1626
+ "single_word": false,
1627
+ "special": true
1628
+ },
1629
+ "157094": {
1630
+ "content": "<|reserved_token_199|>",
1631
+ "lstrip": false,
1632
+ "normalized": false,
1633
+ "rstrip": false,
1634
+ "single_word": false,
1635
+ "special": true
1636
+ },
1637
+ "157095": {
1638
+ "content": "<|reserved_token_200|>",
1639
+ "lstrip": false,
1640
+ "normalized": false,
1641
+ "rstrip": false,
1642
+ "single_word": false,
1643
+ "special": true
1644
+ },
1645
+ "157096": {
1646
+ "content": "<|reserved_token_201|>",
1647
+ "lstrip": false,
1648
+ "normalized": false,
1649
+ "rstrip": false,
1650
+ "single_word": false,
1651
+ "special": true
1652
+ },
1653
+ "157097": {
1654
+ "content": "<|reserved_token_202|>",
1655
+ "lstrip": false,
1656
+ "normalized": false,
1657
+ "rstrip": false,
1658
+ "single_word": false,
1659
+ "special": true
1660
+ },
1661
+ "157098": {
1662
+ "content": "<|reserved_token_203|>",
1663
+ "lstrip": false,
1664
+ "normalized": false,
1665
+ "rstrip": false,
1666
+ "single_word": false,
1667
+ "special": true
1668
+ },
1669
+ "157099": {
1670
+ "content": "<|reserved_token_204|>",
1671
+ "lstrip": false,
1672
+ "normalized": false,
1673
+ "rstrip": false,
1674
+ "single_word": false,
1675
+ "special": true
1676
+ },
1677
+ "157100": {
1678
+ "content": "<|reserved_token_205|>",
1679
+ "lstrip": false,
1680
+ "normalized": false,
1681
+ "rstrip": false,
1682
+ "single_word": false,
1683
+ "special": true
1684
+ },
1685
+ "157101": {
1686
+ "content": "<|reserved_token_206|>",
1687
+ "lstrip": false,
1688
+ "normalized": false,
1689
+ "rstrip": false,
1690
+ "single_word": false,
1691
+ "special": true
1692
+ },
1693
+ "157102": {
1694
+ "content": "<|reserved_token_207|>",
1695
+ "lstrip": false,
1696
+ "normalized": false,
1697
+ "rstrip": false,
1698
+ "single_word": false,
1699
+ "special": true
1700
+ },
1701
+ "157103": {
1702
+ "content": "<|reserved_token_208|>",
1703
+ "lstrip": false,
1704
+ "normalized": false,
1705
+ "rstrip": false,
1706
+ "single_word": false,
1707
+ "special": true
1708
+ },
1709
+ "157104": {
1710
+ "content": "<|reserved_token_209|>",
1711
+ "lstrip": false,
1712
+ "normalized": false,
1713
+ "rstrip": false,
1714
+ "single_word": false,
1715
+ "special": true
1716
+ },
1717
+ "157105": {
1718
+ "content": "<|reserved_token_210|>",
1719
+ "lstrip": false,
1720
+ "normalized": false,
1721
+ "rstrip": false,
1722
+ "single_word": false,
1723
+ "special": true
1724
+ },
1725
+ "157106": {
1726
+ "content": "<|reserved_token_211|>",
1727
+ "lstrip": false,
1728
+ "normalized": false,
1729
+ "rstrip": false,
1730
+ "single_word": false,
1731
+ "special": true
1732
+ },
1733
+ "157107": {
1734
+ "content": "<|reserved_token_212|>",
1735
+ "lstrip": false,
1736
+ "normalized": false,
1737
+ "rstrip": false,
1738
+ "single_word": false,
1739
+ "special": true
1740
+ },
1741
+ "157108": {
1742
+ "content": "<|reserved_token_213|>",
1743
+ "lstrip": false,
1744
+ "normalized": false,
1745
+ "rstrip": false,
1746
+ "single_word": false,
1747
+ "special": true
1748
+ },
1749
+ "157109": {
1750
+ "content": "<|reserved_token_214|>",
1751
+ "lstrip": false,
1752
+ "normalized": false,
1753
+ "rstrip": false,
1754
+ "single_word": false,
1755
+ "special": true
1756
+ },
1757
+ "157110": {
1758
+ "content": "<|reserved_token_215|>",
1759
+ "lstrip": false,
1760
+ "normalized": false,
1761
+ "rstrip": false,
1762
+ "single_word": false,
1763
+ "special": true
1764
+ },
1765
+ "157111": {
1766
+ "content": "<|reserved_token_216|>",
1767
+ "lstrip": false,
1768
+ "normalized": false,
1769
+ "rstrip": false,
1770
+ "single_word": false,
1771
+ "special": true
1772
+ },
1773
+ "157112": {
1774
+ "content": "<|reserved_token_217|>",
1775
+ "lstrip": false,
1776
+ "normalized": false,
1777
+ "rstrip": false,
1778
+ "single_word": false,
1779
+ "special": true
1780
+ },
1781
+ "157113": {
1782
+ "content": "<|reserved_token_218|>",
1783
+ "lstrip": false,
1784
+ "normalized": false,
1785
+ "rstrip": false,
1786
+ "single_word": false,
1787
+ "special": true
1788
+ },
1789
+ "157114": {
1790
+ "content": "<|reserved_token_219|>",
1791
+ "lstrip": false,
1792
+ "normalized": false,
1793
+ "rstrip": false,
1794
+ "single_word": false,
1795
+ "special": true
1796
+ },
1797
+ "157115": {
1798
+ "content": "<|reserved_token_220|>",
1799
+ "lstrip": false,
1800
+ "normalized": false,
1801
+ "rstrip": false,
1802
+ "single_word": false,
1803
+ "special": true
1804
+ },
1805
+ "157116": {
1806
+ "content": "<|reserved_token_221|>",
1807
+ "lstrip": false,
1808
+ "normalized": false,
1809
+ "rstrip": false,
1810
+ "single_word": false,
1811
+ "special": true
1812
+ },
1813
+ "157117": {
1814
+ "content": "<|reserved_token_222|>",
1815
+ "lstrip": false,
1816
+ "normalized": false,
1817
+ "rstrip": false,
1818
+ "single_word": false,
1819
+ "special": true
1820
+ },
1821
+ "157118": {
1822
+ "content": "<|reserved_token_223|>",
1823
+ "lstrip": false,
1824
+ "normalized": false,
1825
+ "rstrip": false,
1826
+ "single_word": false,
1827
+ "special": true
1828
+ },
1829
+ "157119": {
1830
+ "content": "<|reserved_token_224|>",
1831
+ "lstrip": false,
1832
+ "normalized": false,
1833
+ "rstrip": false,
1834
+ "single_word": false,
1835
+ "special": true
1836
+ },
1837
+ "157120": {
1838
+ "content": "<|reserved_token_225|>",
1839
+ "lstrip": false,
1840
+ "normalized": false,
1841
+ "rstrip": false,
1842
+ "single_word": false,
1843
+ "special": true
1844
+ },
1845
+ "157121": {
1846
+ "content": "<|reserved_token_226|>",
1847
+ "lstrip": false,
1848
+ "normalized": false,
1849
+ "rstrip": false,
1850
+ "single_word": false,
1851
+ "special": true
1852
+ },
1853
+ "157122": {
1854
+ "content": "<|reserved_token_227|>",
1855
+ "lstrip": false,
1856
+ "normalized": false,
1857
+ "rstrip": false,
1858
+ "single_word": false,
1859
+ "special": true
1860
+ },
1861
+ "157123": {
1862
+ "content": "<|reserved_token_228|>",
1863
+ "lstrip": false,
1864
+ "normalized": false,
1865
+ "rstrip": false,
1866
+ "single_word": false,
1867
+ "special": true
1868
+ },
1869
+ "157124": {
1870
+ "content": "<|reserved_token_229|>",
1871
+ "lstrip": false,
1872
+ "normalized": false,
1873
+ "rstrip": false,
1874
+ "single_word": false,
1875
+ "special": true
1876
+ },
1877
+ "157125": {
1878
+ "content": "<|reserved_token_230|>",
1879
+ "lstrip": false,
1880
+ "normalized": false,
1881
+ "rstrip": false,
1882
+ "single_word": false,
1883
+ "special": true
1884
+ },
1885
+ "157126": {
1886
+ "content": "<|reserved_token_231|>",
1887
+ "lstrip": false,
1888
+ "normalized": false,
1889
+ "rstrip": false,
1890
+ "single_word": false,
1891
+ "special": true
1892
+ },
1893
+ "157127": {
1894
+ "content": "<|reserved_token_232|>",
1895
+ "lstrip": false,
1896
+ "normalized": false,
1897
+ "rstrip": false,
1898
+ "single_word": false,
1899
+ "special": true
1900
+ },
1901
+ "157128": {
1902
+ "content": "<|reserved_token_233|>",
1903
+ "lstrip": false,
1904
+ "normalized": false,
1905
+ "rstrip": false,
1906
+ "single_word": false,
1907
+ "special": true
1908
+ },
1909
+ "157129": {
1910
+ "content": "<|reserved_token_234|>",
1911
+ "lstrip": false,
1912
+ "normalized": false,
1913
+ "rstrip": false,
1914
+ "single_word": false,
1915
+ "special": true
1916
+ },
1917
+ "157130": {
1918
+ "content": "<|reserved_token_235|>",
1919
+ "lstrip": false,
1920
+ "normalized": false,
1921
+ "rstrip": false,
1922
+ "single_word": false,
1923
+ "special": true
1924
+ },
1925
+ "157131": {
1926
+ "content": "<|reserved_token_236|>",
1927
+ "lstrip": false,
1928
+ "normalized": false,
1929
+ "rstrip": false,
1930
+ "single_word": false,
1931
+ "special": true
1932
+ },
1933
+ "157132": {
1934
+ "content": "<|reserved_token_237|>",
1935
+ "lstrip": false,
1936
+ "normalized": false,
1937
+ "rstrip": false,
1938
+ "single_word": false,
1939
+ "special": true
1940
+ },
1941
+ "157133": {
1942
+ "content": "<|reserved_token_238|>",
1943
+ "lstrip": false,
1944
+ "normalized": false,
1945
+ "rstrip": false,
1946
+ "single_word": false,
1947
+ "special": true
1948
+ },
1949
+ "157134": {
1950
+ "content": "<|reserved_token_239|>",
1951
+ "lstrip": false,
1952
+ "normalized": false,
1953
+ "rstrip": false,
1954
+ "single_word": false,
1955
+ "special": true
1956
+ },
1957
+ "157135": {
1958
+ "content": "<|reserved_token_240|>",
1959
+ "lstrip": false,
1960
+ "normalized": false,
1961
+ "rstrip": false,
1962
+ "single_word": false,
1963
+ "special": true
1964
+ },
1965
+ "157136": {
1966
+ "content": "<|reserved_token_241|>",
1967
+ "lstrip": false,
1968
+ "normalized": false,
1969
+ "rstrip": false,
1970
+ "single_word": false,
1971
+ "special": true
1972
+ },
1973
+ "157137": {
1974
+ "content": "<|reserved_token_242|>",
1975
+ "lstrip": false,
1976
+ "normalized": false,
1977
+ "rstrip": false,
1978
+ "single_word": false,
1979
+ "special": true
1980
+ },
1981
+ "157138": {
1982
+ "content": "<|reserved_token_243|>",
1983
+ "lstrip": false,
1984
+ "normalized": false,
1985
+ "rstrip": false,
1986
+ "single_word": false,
1987
+ "special": true
1988
+ },
1989
+ "157139": {
1990
+ "content": "<|reserved_token_244|>",
1991
+ "lstrip": false,
1992
+ "normalized": false,
1993
+ "rstrip": false,
1994
+ "single_word": false,
1995
+ "special": true
1996
+ },
1997
+ "157140": {
1998
+ "content": "<|reserved_token_245|>",
1999
+ "lstrip": false,
2000
+ "normalized": false,
2001
+ "rstrip": false,
2002
+ "single_word": false,
2003
+ "special": true
2004
+ },
2005
+ "157141": {
2006
+ "content": "<|reserved_token_246|>",
2007
+ "lstrip": false,
2008
+ "normalized": false,
2009
+ "rstrip": false,
2010
+ "single_word": false,
2011
+ "special": true
2012
+ },
2013
+ "157142": {
2014
+ "content": "<|reserved_token_247|>",
2015
+ "lstrip": false,
2016
+ "normalized": false,
2017
+ "rstrip": false,
2018
+ "single_word": false,
2019
+ "special": true
2020
+ },
2021
+ "157143": {
2022
+ "content": "<|reserved_token_248|>",
2023
+ "lstrip": false,
2024
+ "normalized": false,
2025
+ "rstrip": false,
2026
+ "single_word": false,
2027
+ "special": true
2028
+ },
2029
+ "157144": {
2030
+ "content": "<|reserved_token_249|>",
2031
+ "lstrip": false,
2032
+ "normalized": false,
2033
+ "rstrip": false,
2034
+ "single_word": false,
2035
+ "special": true
2036
+ },
2037
+ "157145": {
2038
+ "content": "<|reserved_token_250|>",
2039
+ "lstrip": false,
2040
+ "normalized": false,
2041
+ "rstrip": false,
2042
+ "single_word": false,
2043
+ "special": true
2044
+ },
2045
+ "157146": {
2046
+ "content": "<|reserved_token_251|>",
2047
+ "lstrip": false,
2048
+ "normalized": false,
2049
+ "rstrip": false,
2050
+ "single_word": false,
2051
+ "special": true
2052
+ },
2053
+ "157147": {
2054
+ "content": "<|reserved_token_252|>",
2055
+ "lstrip": false,
2056
+ "normalized": false,
2057
+ "rstrip": false,
2058
+ "single_word": false,
2059
+ "special": true
2060
+ },
2061
+ "157148": {
2062
+ "content": "<|reserved_token_253|>",
2063
+ "lstrip": false,
2064
+ "normalized": false,
2065
+ "rstrip": false,
2066
+ "single_word": false,
2067
+ "special": true
2068
+ },
2069
+ "157149": {
2070
+ "content": "<|reserved_token_254|>",
2071
+ "lstrip": false,
2072
+ "normalized": false,
2073
+ "rstrip": false,
2074
+ "single_word": false,
2075
+ "special": true
2076
+ },
2077
+ "157150": {
2078
+ "content": "<|reserved_token_255|>",
2079
+ "lstrip": false,
2080
+ "normalized": false,
2081
+ "rstrip": false,
2082
+ "single_word": false,
2083
+ "special": true
2084
+ },
2085
+ "157151": {
2086
+ "content": "<role>",
2087
+ "lstrip": false,
2088
+ "normalized": false,
2089
+ "rstrip": false,
2090
+ "single_word": false,
2091
+ "special": true
2092
+ },
2093
+ "157152": {
2094
+ "content": "</role>",
2095
+ "lstrip": false,
2096
+ "normalized": false,
2097
+ "rstrip": false,
2098
+ "single_word": false,
2099
+ "special": true
2100
+ }
2101
+ },
2102
+ "bos_token": "<|startoftext|>",
2103
+ "clean_up_tokenization_spaces": false,
2104
+ "cls_token": "[CLS]",
2105
+ "eos_token": "<|endoftext|>",
2106
+ "extra_special_tokens": {},
2107
+ "fast_tokenizer": true,
2108
+ "gmask_token": "[gMASK]",
2109
+ "merges_file": null,
2110
+ "model_max_length": 1000000000000000019884624838656,
2111
+ "pad_token": "<|endoftext|>",
2112
+ "tokenizer_class": "PreTrainedTokenizerFast",
2113
+ "trust_remote_code": true
2114
+ }
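
The tail of this tokenizer_config.json diff registers IDs 156899–157150 as reserved placeholder tokens (`<|reserved_token_4|>` through `<|reserved_token_255|>`), adds the `<role>` / `</role>` markers at 157151–157152, and sets `<|startoftext|>` as the BOS token with `<|endoftext|>` serving as both EOS and pad token. As a quick sanity check after conversion, a minimal sketch along the lines below can confirm that the loaded tokenizer exposes the same mapping. It uses the Hugging Face `transformers` loader, which is an assumption on my part and not part of this commit.

```python
# Minimal sketch (assumption: the `transformers` package is installed and the
# repo id below is reachable); spot-checks the special-token IDs declared in
# the tokenizer_config.json shown above.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("mlx-community/Ring-flash-linear-2.0-4bit")

# Role markers registered at the very end of added_tokens_decoder.
assert tok.convert_tokens_to_ids("<role>") == 157151
assert tok.convert_tokens_to_ids("</role>") == 157152

# Reserved placeholder tokens occupy the contiguous range 156899..157150.
assert tok.convert_ids_to_tokens(156899) == "<|reserved_token_4|>"
assert tok.convert_ids_to_tokens(157150) == "<|reserved_token_255|>"

# Top-level special-token settings from the same file.
print(tok.bos_token, tok.eos_token, tok.pad_token)
# expected: <|startoftext|> <|endoftext|> <|endoftext|>
```

Keeping a contiguous block of reserved IDs like this is a common way to leave room for future control tokens without changing the vocabulary size or the embedding table.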