ZYXue commited on
Commit
802532c
·
verified ·
1 Parent(s): 30aafe9

Upload folder using huggingface_hub

Browse files
Files changed (44) hide show
  1. .gitattributes +3 -0
  2. README.md +202 -0
  3. adapter_config.json +29 -0
  4. adapter_model.safetensors +3 -0
  5. added_tokens.json +24 -0
  6. chat_template.jinja +7 -0
  7. checkpoint-200/README.md +202 -0
  8. checkpoint-200/adapter_config.json +29 -0
  9. checkpoint-200/adapter_model.safetensors +3 -0
  10. checkpoint-200/added_tokens.json +24 -0
  11. checkpoint-200/chat_template.jinja +7 -0
  12. checkpoint-200/merges.txt +0 -0
  13. checkpoint-200/optimizer.pt +3 -0
  14. checkpoint-200/rng_state.pth +3 -0
  15. checkpoint-200/scheduler.pt +3 -0
  16. checkpoint-200/special_tokens_map.json +31 -0
  17. checkpoint-200/tokenizer.json +3 -0
  18. checkpoint-200/tokenizer_config.json +207 -0
  19. checkpoint-200/trainer_state.json +1843 -0
  20. checkpoint-200/training_args.bin +3 -0
  21. checkpoint-200/vocab.json +0 -0
  22. checkpoint-320/README.md +202 -0
  23. checkpoint-320/adapter_config.json +29 -0
  24. checkpoint-320/adapter_model.safetensors +3 -0
  25. checkpoint-320/added_tokens.json +24 -0
  26. checkpoint-320/chat_template.jinja +7 -0
  27. checkpoint-320/merges.txt +0 -0
  28. checkpoint-320/optimizer.pt +3 -0
  29. checkpoint-320/rng_state.pth +3 -0
  30. checkpoint-320/scheduler.pt +3 -0
  31. checkpoint-320/special_tokens_map.json +31 -0
  32. checkpoint-320/tokenizer.json +3 -0
  33. checkpoint-320/tokenizer_config.json +207 -0
  34. checkpoint-320/trainer_state.json +2923 -0
  35. checkpoint-320/training_args.bin +3 -0
  36. checkpoint-320/vocab.json +0 -0
  37. merges.txt +0 -0
  38. special_tokens_map.json +31 -0
  39. tokenizer.json +3 -0
  40. tokenizer_config.json +207 -0
  41. training_args.bin +3 -0
  42. training_args.json +28 -0
  43. training_log.json +1318 -0
  44. vocab.json +0 -0
.gitattributes CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ checkpoint-200/tokenizer.json filter=lfs diff=lfs merge=lfs -text
37
+ checkpoint-320/tokenizer.json filter=lfs diff=lfs merge=lfs -text
38
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: Qwen/Qwen2.5-VL-7B-Instruct
3
+ library_name: peft
4
+ ---
5
+
6
+ # Model Card for Model ID
7
+
8
+ <!-- Provide a quick summary of what the model is/does. -->
9
+
10
+
11
+
12
+ ## Model Details
13
+
14
+ ### Model Description
15
+
16
+ <!-- Provide a longer summary of what this model is. -->
17
+
18
+
19
+
20
+ - **Developed by:** [More Information Needed]
21
+ - **Funded by [optional]:** [More Information Needed]
22
+ - **Shared by [optional]:** [More Information Needed]
23
+ - **Model type:** [More Information Needed]
24
+ - **Language(s) (NLP):** [More Information Needed]
25
+ - **License:** [More Information Needed]
26
+ - **Finetuned from model [optional]:** [More Information Needed]
27
+
28
+ ### Model Sources [optional]
29
+
30
+ <!-- Provide the basic links for the model. -->
31
+
32
+ - **Repository:** [More Information Needed]
33
+ - **Paper [optional]:** [More Information Needed]
34
+ - **Demo [optional]:** [More Information Needed]
35
+
36
+ ## Uses
37
+
38
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
39
+
40
+ ### Direct Use
41
+
42
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
43
+
44
+ [More Information Needed]
45
+
46
+ ### Downstream Use [optional]
47
+
48
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
49
+
50
+ [More Information Needed]
51
+
52
+ ### Out-of-Scope Use
53
+
54
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
55
+
56
+ [More Information Needed]
57
+
58
+ ## Bias, Risks, and Limitations
59
+
60
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
61
+
62
+ [More Information Needed]
63
+
64
+ ### Recommendations
65
+
66
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
67
+
68
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
69
+
70
+ ## How to Get Started with the Model
71
+
72
+ Use the code below to get started with the model.
73
+
74
+ [More Information Needed]
75
+
76
+ ## Training Details
77
+
78
+ ### Training Data
79
+
80
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
+
82
+ [More Information Needed]
83
+
84
+ ### Training Procedure
85
+
86
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
87
+
88
+ #### Preprocessing [optional]
89
+
90
+ [More Information Needed]
91
+
92
+
93
+ #### Training Hyperparameters
94
+
95
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
96
+
97
+ #### Speeds, Sizes, Times [optional]
98
+
99
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
100
+
101
+ [More Information Needed]
102
+
103
+ ## Evaluation
104
+
105
+ <!-- This section describes the evaluation protocols and provides the results. -->
106
+
107
+ ### Testing Data, Factors & Metrics
108
+
109
+ #### Testing Data
110
+
111
+ <!-- This should link to a Dataset Card if possible. -->
112
+
113
+ [More Information Needed]
114
+
115
+ #### Factors
116
+
117
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
118
+
119
+ [More Information Needed]
120
+
121
+ #### Metrics
122
+
123
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
124
+
125
+ [More Information Needed]
126
+
127
+ ### Results
128
+
129
+ [More Information Needed]
130
+
131
+ #### Summary
132
+
133
+
134
+
135
+ ## Model Examination [optional]
136
+
137
+ <!-- Relevant interpretability work for the model goes here -->
138
+
139
+ [More Information Needed]
140
+
141
+ ## Environmental Impact
142
+
143
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
144
+
145
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
146
+
147
+ - **Hardware Type:** [More Information Needed]
148
+ - **Hours used:** [More Information Needed]
149
+ - **Cloud Provider:** [More Information Needed]
150
+ - **Compute Region:** [More Information Needed]
151
+ - **Carbon Emitted:** [More Information Needed]
152
+
153
+ ## Technical Specifications [optional]
154
+
155
+ ### Model Architecture and Objective
156
+
157
+ [More Information Needed]
158
+
159
+ ### Compute Infrastructure
160
+
161
+ [More Information Needed]
162
+
163
+ #### Hardware
164
+
165
+ [More Information Needed]
166
+
167
+ #### Software
168
+
169
+ [More Information Needed]
170
+
171
+ ## Citation [optional]
172
+
173
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
174
+
175
+ **BibTeX:**
176
+
177
+ [More Information Needed]
178
+
179
+ **APA:**
180
+
181
+ [More Information Needed]
182
+
183
+ ## Glossary [optional]
184
+
185
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
186
+
187
+ [More Information Needed]
188
+
189
+ ## More Information [optional]
190
+
191
+ [More Information Needed]
192
+
193
+ ## Model Card Authors [optional]
194
+
195
+ [More Information Needed]
196
+
197
+ ## Model Card Contact
198
+
199
+ [More Information Needed]
200
+ ### Framework versions
201
+
202
+ - PEFT 0.13.2
adapter_config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "Qwen/Qwen2.5-VL-7B-Instruct",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": true,
8
+ "init_lora_weights": true,
9
+ "layer_replication": null,
10
+ "layers_pattern": null,
11
+ "layers_to_transform": null,
12
+ "loftq_config": {},
13
+ "lora_alpha": 16,
14
+ "lora_dropout": 0.05,
15
+ "megatron_config": null,
16
+ "megatron_core": "megatron.core",
17
+ "modules_to_save": null,
18
+ "peft_type": "LORA",
19
+ "r": 16,
20
+ "rank_pattern": {},
21
+ "revision": null,
22
+ "target_modules": [
23
+ "q_proj",
24
+ "v_proj"
25
+ ],
26
+ "task_type": "CAUSAL_LM",
27
+ "use_dora": false,
28
+ "use_rslora": false
29
+ }
adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f7f841b4bcf9a79ca58b01986f5212b5ae4951dfcec18a880ebf0db9e257d602
3
+ size 20200056
added_tokens.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</tool_call>": 151658,
3
+ "<tool_call>": 151657,
4
+ "<|box_end|>": 151649,
5
+ "<|box_start|>": 151648,
6
+ "<|endoftext|>": 151643,
7
+ "<|file_sep|>": 151664,
8
+ "<|fim_middle|>": 151660,
9
+ "<|fim_pad|>": 151662,
10
+ "<|fim_prefix|>": 151659,
11
+ "<|fim_suffix|>": 151661,
12
+ "<|im_end|>": 151645,
13
+ "<|im_start|>": 151644,
14
+ "<|image_pad|>": 151655,
15
+ "<|object_ref_end|>": 151647,
16
+ "<|object_ref_start|>": 151646,
17
+ "<|quad_end|>": 151651,
18
+ "<|quad_start|>": 151650,
19
+ "<|repo_name|>": 151663,
20
+ "<|video_pad|>": 151656,
21
+ "<|vision_end|>": 151653,
22
+ "<|vision_pad|>": 151654,
23
+ "<|vision_start|>": 151652
24
+ }
chat_template.jinja ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system
2
+ You are a helpful assistant.<|im_end|>
3
+ {% endif %}<|im_start|>{{ message['role'] }}
4
+ {% if message['content'] is string %}{{ message['content'] }}<|im_end|>
5
+ {% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>
6
+ {% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant
7
+ {% endif %}
checkpoint-200/README.md ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: Qwen/Qwen2.5-VL-7B-Instruct
3
+ library_name: peft
4
+ ---
5
+
6
+ # Model Card for Model ID
7
+
8
+ <!-- Provide a quick summary of what the model is/does. -->
9
+
10
+
11
+
12
+ ## Model Details
13
+
14
+ ### Model Description
15
+
16
+ <!-- Provide a longer summary of what this model is. -->
17
+
18
+
19
+
20
+ - **Developed by:** [More Information Needed]
21
+ - **Funded by [optional]:** [More Information Needed]
22
+ - **Shared by [optional]:** [More Information Needed]
23
+ - **Model type:** [More Information Needed]
24
+ - **Language(s) (NLP):** [More Information Needed]
25
+ - **License:** [More Information Needed]
26
+ - **Finetuned from model [optional]:** [More Information Needed]
27
+
28
+ ### Model Sources [optional]
29
+
30
+ <!-- Provide the basic links for the model. -->
31
+
32
+ - **Repository:** [More Information Needed]
33
+ - **Paper [optional]:** [More Information Needed]
34
+ - **Demo [optional]:** [More Information Needed]
35
+
36
+ ## Uses
37
+
38
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
39
+
40
+ ### Direct Use
41
+
42
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
43
+
44
+ [More Information Needed]
45
+
46
+ ### Downstream Use [optional]
47
+
48
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
49
+
50
+ [More Information Needed]
51
+
52
+ ### Out-of-Scope Use
53
+
54
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
55
+
56
+ [More Information Needed]
57
+
58
+ ## Bias, Risks, and Limitations
59
+
60
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
61
+
62
+ [More Information Needed]
63
+
64
+ ### Recommendations
65
+
66
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
67
+
68
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
69
+
70
+ ## How to Get Started with the Model
71
+
72
+ Use the code below to get started with the model.
73
+
74
+ [More Information Needed]
75
+
76
+ ## Training Details
77
+
78
+ ### Training Data
79
+
80
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
+
82
+ [More Information Needed]
83
+
84
+ ### Training Procedure
85
+
86
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
87
+
88
+ #### Preprocessing [optional]
89
+
90
+ [More Information Needed]
91
+
92
+
93
+ #### Training Hyperparameters
94
+
95
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
96
+
97
+ #### Speeds, Sizes, Times [optional]
98
+
99
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
100
+
101
+ [More Information Needed]
102
+
103
+ ## Evaluation
104
+
105
+ <!-- This section describes the evaluation protocols and provides the results. -->
106
+
107
+ ### Testing Data, Factors & Metrics
108
+
109
+ #### Testing Data
110
+
111
+ <!-- This should link to a Dataset Card if possible. -->
112
+
113
+ [More Information Needed]
114
+
115
+ #### Factors
116
+
117
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
118
+
119
+ [More Information Needed]
120
+
121
+ #### Metrics
122
+
123
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
124
+
125
+ [More Information Needed]
126
+
127
+ ### Results
128
+
129
+ [More Information Needed]
130
+
131
+ #### Summary
132
+
133
+
134
+
135
+ ## Model Examination [optional]
136
+
137
+ <!-- Relevant interpretability work for the model goes here -->
138
+
139
+ [More Information Needed]
140
+
141
+ ## Environmental Impact
142
+
143
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
144
+
145
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
146
+
147
+ - **Hardware Type:** [More Information Needed]
148
+ - **Hours used:** [More Information Needed]
149
+ - **Cloud Provider:** [More Information Needed]
150
+ - **Compute Region:** [More Information Needed]
151
+ - **Carbon Emitted:** [More Information Needed]
152
+
153
+ ## Technical Specifications [optional]
154
+
155
+ ### Model Architecture and Objective
156
+
157
+ [More Information Needed]
158
+
159
+ ### Compute Infrastructure
160
+
161
+ [More Information Needed]
162
+
163
+ #### Hardware
164
+
165
+ [More Information Needed]
166
+
167
+ #### Software
168
+
169
+ [More Information Needed]
170
+
171
+ ## Citation [optional]
172
+
173
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
174
+
175
+ **BibTeX:**
176
+
177
+ [More Information Needed]
178
+
179
+ **APA:**
180
+
181
+ [More Information Needed]
182
+
183
+ ## Glossary [optional]
184
+
185
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
186
+
187
+ [More Information Needed]
188
+
189
+ ## More Information [optional]
190
+
191
+ [More Information Needed]
192
+
193
+ ## Model Card Authors [optional]
194
+
195
+ [More Information Needed]
196
+
197
+ ## Model Card Contact
198
+
199
+ [More Information Needed]
200
+ ### Framework versions
201
+
202
+ - PEFT 0.13.2
checkpoint-200/adapter_config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "Qwen/Qwen2.5-VL-7B-Instruct",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": true,
8
+ "init_lora_weights": true,
9
+ "layer_replication": null,
10
+ "layers_pattern": null,
11
+ "layers_to_transform": null,
12
+ "loftq_config": {},
13
+ "lora_alpha": 16,
14
+ "lora_dropout": 0.05,
15
+ "megatron_config": null,
16
+ "megatron_core": "megatron.core",
17
+ "modules_to_save": null,
18
+ "peft_type": "LORA",
19
+ "r": 16,
20
+ "rank_pattern": {},
21
+ "revision": null,
22
+ "target_modules": [
23
+ "q_proj",
24
+ "v_proj"
25
+ ],
26
+ "task_type": "CAUSAL_LM",
27
+ "use_dora": false,
28
+ "use_rslora": false
29
+ }
checkpoint-200/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f7f841b4bcf9a79ca58b01986f5212b5ae4951dfcec18a880ebf0db9e257d602
3
+ size 20200056
checkpoint-200/added_tokens.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</tool_call>": 151658,
3
+ "<tool_call>": 151657,
4
+ "<|box_end|>": 151649,
5
+ "<|box_start|>": 151648,
6
+ "<|endoftext|>": 151643,
7
+ "<|file_sep|>": 151664,
8
+ "<|fim_middle|>": 151660,
9
+ "<|fim_pad|>": 151662,
10
+ "<|fim_prefix|>": 151659,
11
+ "<|fim_suffix|>": 151661,
12
+ "<|im_end|>": 151645,
13
+ "<|im_start|>": 151644,
14
+ "<|image_pad|>": 151655,
15
+ "<|object_ref_end|>": 151647,
16
+ "<|object_ref_start|>": 151646,
17
+ "<|quad_end|>": 151651,
18
+ "<|quad_start|>": 151650,
19
+ "<|repo_name|>": 151663,
20
+ "<|video_pad|>": 151656,
21
+ "<|vision_end|>": 151653,
22
+ "<|vision_pad|>": 151654,
23
+ "<|vision_start|>": 151652
24
+ }
checkpoint-200/chat_template.jinja ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system
2
+ You are a helpful assistant.<|im_end|>
3
+ {% endif %}<|im_start|>{{ message['role'] }}
4
+ {% if message['content'] is string %}{{ message['content'] }}<|im_end|>
5
+ {% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>
6
+ {% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant
7
+ {% endif %}
checkpoint-200/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-200/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ca2ac737939e8bca052e3213e1cb57c622467d7e1c0b35fef0e57e32cb6c8ffe
3
+ size 40466443
checkpoint-200/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1ad5f4dac8c8f22dd51d6ff589c953bfcbee9552cc67d00c221a20372a23c5ec
3
+ size 14645
checkpoint-200/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:97c9fe9cd09f40f700384cf87478f8d3de41c2a75de82450af7b385d04eebbd6
3
+ size 1401
checkpoint-200/special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
16
+ ],
17
+ "eos_token": {
18
+ "content": "<|im_end|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "pad_token": {
25
+ "content": "<|endoftext|>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ }
31
+ }
checkpoint-200/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9c5ae00e602b8860cbd784ba82a8aa14e8feecec692e7076590d014d7b7fdafa
3
+ size 11421896
checkpoint-200/tokenizer_config.json ADDED
@@ -0,0 +1,207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ }
181
+ },
182
+ "additional_special_tokens": [
183
+ "<|im_start|>",
184
+ "<|im_end|>",
185
+ "<|object_ref_start|>",
186
+ "<|object_ref_end|>",
187
+ "<|box_start|>",
188
+ "<|box_end|>",
189
+ "<|quad_start|>",
190
+ "<|quad_end|>",
191
+ "<|vision_start|>",
192
+ "<|vision_end|>",
193
+ "<|vision_pad|>",
194
+ "<|image_pad|>",
195
+ "<|video_pad|>"
196
+ ],
197
+ "bos_token": null,
198
+ "clean_up_tokenization_spaces": false,
199
+ "eos_token": "<|im_end|>",
200
+ "errors": "replace",
201
+ "extra_special_tokens": {},
202
+ "model_max_length": 131072,
203
+ "pad_token": "<|endoftext|>",
204
+ "split_special_tokens": false,
205
+ "tokenizer_class": "Qwen2Tokenizer",
206
+ "unk_token": null
207
+ }
checkpoint-200/trainer_state.json ADDED
@@ -0,0 +1,1843 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": 200,
3
+ "best_metric": 0.07848864793777466,
4
+ "best_model_checkpoint": "sft_prefill/prompt_id_3/qwen2-VL-7B-Instruct-syn-count-lora/checkpoint-200",
5
+ "epoch": 1.25,
6
+ "eval_steps": 200,
7
+ "global_step": 200,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.00625,
14
+ "grad_norm": 0.9298801422119141,
15
+ "learning_rate": 0.0002,
16
+ "loss": 0.4917,
17
+ "mean_token_accuracy": 0.6875,
18
+ "num_tokens": 125296.0,
19
+ "step": 1
20
+ },
21
+ {
22
+ "epoch": 0.0125,
23
+ "grad_norm": 0.8382226824760437,
24
+ "learning_rate": 0.0002,
25
+ "loss": 0.6049,
26
+ "mean_token_accuracy": 0.75,
27
+ "num_tokens": 128352.0,
28
+ "step": 2
29
+ },
30
+ {
31
+ "epoch": 0.01875,
32
+ "grad_norm": 0.9454999566078186,
33
+ "learning_rate": 0.0002,
34
+ "loss": 0.6407,
35
+ "mean_token_accuracy": 0.65625,
36
+ "num_tokens": 131408.0,
37
+ "step": 3
38
+ },
39
+ {
40
+ "epoch": 0.025,
41
+ "grad_norm": 0.9727649092674255,
42
+ "learning_rate": 0.0002,
43
+ "loss": 0.3972,
44
+ "mean_token_accuracy": 0.90625,
45
+ "num_tokens": 134464.0,
46
+ "step": 4
47
+ },
48
+ {
49
+ "epoch": 0.03125,
50
+ "grad_norm": 0.785910964012146,
51
+ "learning_rate": 0.0002,
52
+ "loss": 0.453,
53
+ "mean_token_accuracy": 0.84375,
54
+ "num_tokens": 137520.0,
55
+ "step": 5
56
+ },
57
+ {
58
+ "epoch": 0.0375,
59
+ "grad_norm": 0.9556392431259155,
60
+ "learning_rate": 0.0002,
61
+ "loss": 0.418,
62
+ "mean_token_accuracy": 0.8125,
63
+ "num_tokens": 140576.0,
64
+ "step": 6
65
+ },
66
+ {
67
+ "epoch": 0.04375,
68
+ "grad_norm": 1.160456657409668,
69
+ "learning_rate": 0.0002,
70
+ "loss": 0.303,
71
+ "mean_token_accuracy": 0.875,
72
+ "num_tokens": 143632.0,
73
+ "step": 7
74
+ },
75
+ {
76
+ "epoch": 0.05,
77
+ "grad_norm": 1.127308964729309,
78
+ "learning_rate": 0.0002,
79
+ "loss": 0.1847,
80
+ "mean_token_accuracy": 0.9375,
81
+ "num_tokens": 146688.0,
82
+ "step": 8
83
+ },
84
+ {
85
+ "epoch": 0.05625,
86
+ "grad_norm": 1.7690690755844116,
87
+ "learning_rate": 0.0002,
88
+ "loss": 0.3514,
89
+ "mean_token_accuracy": 0.84375,
90
+ "num_tokens": 149744.0,
91
+ "step": 9
92
+ },
93
+ {
94
+ "epoch": 0.0625,
95
+ "grad_norm": 0.8747241497039795,
96
+ "learning_rate": 0.0002,
97
+ "loss": 0.2434,
98
+ "mean_token_accuracy": 0.84375,
99
+ "num_tokens": 152800.0,
100
+ "step": 10
101
+ },
102
+ {
103
+ "epoch": 0.06875,
104
+ "grad_norm": 0.7658981084823608,
105
+ "learning_rate": 0.0002,
106
+ "loss": 0.1577,
107
+ "mean_token_accuracy": 0.9375,
108
+ "num_tokens": 155856.0,
109
+ "step": 11
110
+ },
111
+ {
112
+ "epoch": 0.075,
113
+ "grad_norm": 0.6385300755500793,
114
+ "learning_rate": 0.0002,
115
+ "loss": 0.209,
116
+ "mean_token_accuracy": 0.875,
117
+ "num_tokens": 158912.0,
118
+ "step": 12
119
+ },
120
+ {
121
+ "epoch": 0.08125,
122
+ "grad_norm": 1.2857609987258911,
123
+ "learning_rate": 0.0002,
124
+ "loss": 0.2123,
125
+ "mean_token_accuracy": 0.875,
126
+ "num_tokens": 161968.0,
127
+ "step": 13
128
+ },
129
+ {
130
+ "epoch": 0.0875,
131
+ "grad_norm": 0.5603316426277161,
132
+ "learning_rate": 0.0002,
133
+ "loss": 0.2712,
134
+ "mean_token_accuracy": 0.96875,
135
+ "num_tokens": 165024.0,
136
+ "step": 14
137
+ },
138
+ {
139
+ "epoch": 0.09375,
140
+ "grad_norm": 1.0855859518051147,
141
+ "learning_rate": 0.0002,
142
+ "loss": 0.2975,
143
+ "mean_token_accuracy": 0.875,
144
+ "num_tokens": 168080.0,
145
+ "step": 15
146
+ },
147
+ {
148
+ "epoch": 0.1,
149
+ "grad_norm": 2.589662790298462,
150
+ "learning_rate": 0.0002,
151
+ "loss": 0.3977,
152
+ "mean_token_accuracy": 0.78125,
153
+ "num_tokens": 171136.0,
154
+ "step": 16
155
+ },
156
+ {
157
+ "epoch": 0.10625,
158
+ "grad_norm": 1.242841124534607,
159
+ "learning_rate": 0.0002,
160
+ "loss": 0.3933,
161
+ "mean_token_accuracy": 0.90625,
162
+ "num_tokens": 174192.0,
163
+ "step": 17
164
+ },
165
+ {
166
+ "epoch": 0.1125,
167
+ "grad_norm": 1.0371448993682861,
168
+ "learning_rate": 0.0002,
169
+ "loss": 0.2866,
170
+ "mean_token_accuracy": 0.875,
171
+ "num_tokens": 177248.0,
172
+ "step": 18
173
+ },
174
+ {
175
+ "epoch": 0.11875,
176
+ "grad_norm": 2.609177827835083,
177
+ "learning_rate": 0.0002,
178
+ "loss": 0.4174,
179
+ "mean_token_accuracy": 0.75,
180
+ "num_tokens": 180304.0,
181
+ "step": 19
182
+ },
183
+ {
184
+ "epoch": 0.125,
185
+ "grad_norm": 1.229350209236145,
186
+ "learning_rate": 0.0002,
187
+ "loss": 0.3588,
188
+ "mean_token_accuracy": 0.84375,
189
+ "num_tokens": 183360.0,
190
+ "step": 20
191
+ },
192
+ {
193
+ "epoch": 0.13125,
194
+ "grad_norm": 1.2149115800857544,
195
+ "learning_rate": 0.0002,
196
+ "loss": 0.1919,
197
+ "mean_token_accuracy": 0.9375,
198
+ "num_tokens": 186416.0,
199
+ "step": 21
200
+ },
201
+ {
202
+ "epoch": 0.1375,
203
+ "grad_norm": 3.0612738132476807,
204
+ "learning_rate": 0.0002,
205
+ "loss": 0.475,
206
+ "mean_token_accuracy": 0.84375,
207
+ "num_tokens": 189472.0,
208
+ "step": 22
209
+ },
210
+ {
211
+ "epoch": 0.14375,
212
+ "grad_norm": 1.1256693601608276,
213
+ "learning_rate": 0.0002,
214
+ "loss": 0.223,
215
+ "mean_token_accuracy": 0.9375,
216
+ "num_tokens": 192528.0,
217
+ "step": 23
218
+ },
219
+ {
220
+ "epoch": 0.15,
221
+ "grad_norm": 1.5483283996582031,
222
+ "learning_rate": 0.0002,
223
+ "loss": 0.3825,
224
+ "mean_token_accuracy": 0.78125,
225
+ "num_tokens": 195584.0,
226
+ "step": 24
227
+ },
228
+ {
229
+ "epoch": 0.15625,
230
+ "grad_norm": 0.8419892191886902,
231
+ "learning_rate": 0.0002,
232
+ "loss": 0.1645,
233
+ "mean_token_accuracy": 0.875,
234
+ "num_tokens": 198640.0,
235
+ "step": 25
236
+ },
237
+ {
238
+ "epoch": 0.1625,
239
+ "grad_norm": 1.5078842639923096,
240
+ "learning_rate": 0.0002,
241
+ "loss": 0.2517,
242
+ "mean_token_accuracy": 0.9375,
243
+ "num_tokens": 201696.0,
244
+ "step": 26
245
+ },
246
+ {
247
+ "epoch": 0.16875,
248
+ "grad_norm": 0.7717946171760559,
249
+ "learning_rate": 0.0002,
250
+ "loss": 0.1865,
251
+ "mean_token_accuracy": 0.9375,
252
+ "num_tokens": 204752.0,
253
+ "step": 27
254
+ },
255
+ {
256
+ "epoch": 0.175,
257
+ "grad_norm": 0.7887358069419861,
258
+ "learning_rate": 0.0002,
259
+ "loss": 0.2887,
260
+ "mean_token_accuracy": 0.90625,
261
+ "num_tokens": 207808.0,
262
+ "step": 28
263
+ },
264
+ {
265
+ "epoch": 0.18125,
266
+ "grad_norm": 1.0774492025375366,
267
+ "learning_rate": 0.0002,
268
+ "loss": 0.2154,
269
+ "mean_token_accuracy": 0.90625,
270
+ "num_tokens": 210864.0,
271
+ "step": 29
272
+ },
273
+ {
274
+ "epoch": 0.1875,
275
+ "grad_norm": 1.2108778953552246,
276
+ "learning_rate": 0.0002,
277
+ "loss": 0.3605,
278
+ "mean_token_accuracy": 0.84375,
279
+ "num_tokens": 213920.0,
280
+ "step": 30
281
+ },
282
+ {
283
+ "epoch": 0.19375,
284
+ "grad_norm": 0.4569832980632782,
285
+ "learning_rate": 0.0002,
286
+ "loss": 0.1299,
287
+ "mean_token_accuracy": 1.0,
288
+ "num_tokens": 216976.0,
289
+ "step": 31
290
+ },
291
+ {
292
+ "epoch": 0.2,
293
+ "grad_norm": 0.789750337600708,
294
+ "learning_rate": 0.0002,
295
+ "loss": 0.1772,
296
+ "mean_token_accuracy": 0.96875,
297
+ "num_tokens": 220032.0,
298
+ "step": 32
299
+ },
300
+ {
301
+ "epoch": 0.20625,
302
+ "grad_norm": 1.5008147954940796,
303
+ "learning_rate": 0.0002,
304
+ "loss": 0.3281,
305
+ "mean_token_accuracy": 0.84375,
306
+ "num_tokens": 223088.0,
307
+ "step": 33
308
+ },
309
+ {
310
+ "epoch": 0.2125,
311
+ "grad_norm": 1.082711935043335,
312
+ "learning_rate": 0.0002,
313
+ "loss": 0.1973,
314
+ "mean_token_accuracy": 0.9375,
315
+ "num_tokens": 226144.0,
316
+ "step": 34
317
+ },
318
+ {
319
+ "epoch": 0.21875,
320
+ "grad_norm": 0.6501540541648865,
321
+ "learning_rate": 0.0002,
322
+ "loss": 0.2645,
323
+ "mean_token_accuracy": 0.9375,
324
+ "num_tokens": 229200.0,
325
+ "step": 35
326
+ },
327
+ {
328
+ "epoch": 0.225,
329
+ "grad_norm": 0.6396588087081909,
330
+ "learning_rate": 0.0002,
331
+ "loss": 0.1938,
332
+ "mean_token_accuracy": 0.90625,
333
+ "num_tokens": 232256.0,
334
+ "step": 36
335
+ },
336
+ {
337
+ "epoch": 0.23125,
338
+ "grad_norm": 0.7765557169914246,
339
+ "learning_rate": 0.0002,
340
+ "loss": 0.1577,
341
+ "mean_token_accuracy": 0.90625,
342
+ "num_tokens": 235312.0,
343
+ "step": 37
344
+ },
345
+ {
346
+ "epoch": 0.2375,
347
+ "grad_norm": 1.0910521745681763,
348
+ "learning_rate": 0.0002,
349
+ "loss": 0.1917,
350
+ "mean_token_accuracy": 0.875,
351
+ "num_tokens": 238368.0,
352
+ "step": 38
353
+ },
354
+ {
355
+ "epoch": 0.24375,
356
+ "grad_norm": 0.7262992262840271,
357
+ "learning_rate": 0.0002,
358
+ "loss": 0.0703,
359
+ "mean_token_accuracy": 1.0,
360
+ "num_tokens": 241424.0,
361
+ "step": 39
362
+ },
363
+ {
364
+ "epoch": 0.25,
365
+ "grad_norm": 0.598296582698822,
366
+ "learning_rate": 0.0002,
367
+ "loss": 0.1402,
368
+ "mean_token_accuracy": 0.96875,
369
+ "num_tokens": 244480.0,
370
+ "step": 40
371
+ },
372
+ {
373
+ "epoch": 0.25625,
374
+ "grad_norm": 1.0417550802230835,
375
+ "learning_rate": 0.0002,
376
+ "loss": 0.1274,
377
+ "mean_token_accuracy": 0.96875,
378
+ "num_tokens": 247536.0,
379
+ "step": 41
380
+ },
381
+ {
382
+ "epoch": 0.2625,
383
+ "grad_norm": 1.5009475946426392,
384
+ "learning_rate": 0.0002,
385
+ "loss": 0.1871,
386
+ "mean_token_accuracy": 0.9375,
387
+ "num_tokens": 250592.0,
388
+ "step": 42
389
+ },
390
+ {
391
+ "epoch": 0.26875,
392
+ "grad_norm": 2.052367925643921,
393
+ "learning_rate": 0.0002,
394
+ "loss": 0.2981,
395
+ "mean_token_accuracy": 0.875,
396
+ "num_tokens": 253648.0,
397
+ "step": 43
398
+ },
399
+ {
400
+ "epoch": 0.275,
401
+ "grad_norm": 0.6979355216026306,
402
+ "learning_rate": 0.0002,
403
+ "loss": 0.0962,
404
+ "mean_token_accuracy": 0.96875,
405
+ "num_tokens": 256704.0,
406
+ "step": 44
407
+ },
408
+ {
409
+ "epoch": 0.28125,
410
+ "grad_norm": 1.2591170072555542,
411
+ "learning_rate": 0.0002,
412
+ "loss": 0.1616,
413
+ "mean_token_accuracy": 0.9375,
414
+ "num_tokens": 259760.0,
415
+ "step": 45
416
+ },
417
+ {
418
+ "epoch": 0.2875,
419
+ "grad_norm": 0.5454805493354797,
420
+ "learning_rate": 0.0002,
421
+ "loss": 0.0614,
422
+ "mean_token_accuracy": 0.96875,
423
+ "num_tokens": 262816.0,
424
+ "step": 46
425
+ },
426
+ {
427
+ "epoch": 0.29375,
428
+ "grad_norm": 1.3832871913909912,
429
+ "learning_rate": 0.0002,
430
+ "loss": 0.1468,
431
+ "mean_token_accuracy": 0.96875,
432
+ "num_tokens": 265872.0,
433
+ "step": 47
434
+ },
435
+ {
436
+ "epoch": 0.3,
437
+ "grad_norm": 0.5263135433197021,
438
+ "learning_rate": 0.0002,
439
+ "loss": 0.0482,
440
+ "mean_token_accuracy": 1.0,
441
+ "num_tokens": 268928.0,
442
+ "step": 48
443
+ },
444
+ {
445
+ "epoch": 0.30625,
446
+ "grad_norm": 1.7050193548202515,
447
+ "learning_rate": 0.0002,
448
+ "loss": 0.1177,
449
+ "mean_token_accuracy": 0.9375,
450
+ "num_tokens": 271984.0,
451
+ "step": 49
452
+ },
453
+ {
454
+ "epoch": 0.3125,
455
+ "grad_norm": 1.4185047149658203,
456
+ "learning_rate": 0.0002,
457
+ "loss": 0.0788,
458
+ "mean_token_accuracy": 0.96875,
459
+ "num_tokens": 275040.0,
460
+ "step": 50
461
+ },
462
+ {
463
+ "epoch": 0.31875,
464
+ "grad_norm": 1.192116379737854,
465
+ "learning_rate": 0.0002,
466
+ "loss": 0.1786,
467
+ "mean_token_accuracy": 0.90625,
468
+ "num_tokens": 278096.0,
469
+ "step": 51
470
+ },
471
+ {
472
+ "epoch": 0.325,
473
+ "grad_norm": 3.244797945022583,
474
+ "learning_rate": 0.0002,
475
+ "loss": 0.1685,
476
+ "mean_token_accuracy": 0.9375,
477
+ "num_tokens": 281152.0,
478
+ "step": 52
479
+ },
480
+ {
481
+ "epoch": 0.33125,
482
+ "grad_norm": 2.192440986633301,
483
+ "learning_rate": 0.0002,
484
+ "loss": 0.1178,
485
+ "mean_token_accuracy": 0.9375,
486
+ "num_tokens": 284208.0,
487
+ "step": 53
488
+ },
489
+ {
490
+ "epoch": 0.3375,
491
+ "grad_norm": 2.3711836338043213,
492
+ "learning_rate": 0.0002,
493
+ "loss": 0.1016,
494
+ "mean_token_accuracy": 0.9375,
495
+ "num_tokens": 287264.0,
496
+ "step": 54
497
+ },
498
+ {
499
+ "epoch": 0.34375,
500
+ "grad_norm": 2.2536869049072266,
501
+ "learning_rate": 0.0002,
502
+ "loss": 0.1223,
503
+ "mean_token_accuracy": 0.9375,
504
+ "num_tokens": 290320.0,
505
+ "step": 55
506
+ },
507
+ {
508
+ "epoch": 0.35,
509
+ "grad_norm": 1.4965548515319824,
510
+ "learning_rate": 0.0002,
511
+ "loss": 0.0331,
512
+ "mean_token_accuracy": 0.96875,
513
+ "num_tokens": 293376.0,
514
+ "step": 56
515
+ },
516
+ {
517
+ "epoch": 0.35625,
518
+ "grad_norm": 2.7128384113311768,
519
+ "learning_rate": 0.0002,
520
+ "loss": 0.1314,
521
+ "mean_token_accuracy": 0.9375,
522
+ "num_tokens": 296432.0,
523
+ "step": 57
524
+ },
525
+ {
526
+ "epoch": 0.3625,
527
+ "grad_norm": 1.1565158367156982,
528
+ "learning_rate": 0.0002,
529
+ "loss": 0.0504,
530
+ "mean_token_accuracy": 0.96875,
531
+ "num_tokens": 299488.0,
532
+ "step": 58
533
+ },
534
+ {
535
+ "epoch": 0.36875,
536
+ "grad_norm": 1.363576054573059,
537
+ "learning_rate": 0.0002,
538
+ "loss": 0.0327,
539
+ "mean_token_accuracy": 1.0,
540
+ "num_tokens": 302544.0,
541
+ "step": 59
542
+ },
543
+ {
544
+ "epoch": 0.375,
545
+ "grad_norm": 1.7460052967071533,
546
+ "learning_rate": 0.0002,
547
+ "loss": 0.0766,
548
+ "mean_token_accuracy": 0.96875,
549
+ "num_tokens": 305600.0,
550
+ "step": 60
551
+ },
552
+ {
553
+ "epoch": 0.38125,
554
+ "grad_norm": 0.4773832857608795,
555
+ "learning_rate": 0.0002,
556
+ "loss": 0.0167,
557
+ "mean_token_accuracy": 1.0,
558
+ "num_tokens": 308656.0,
559
+ "step": 61
560
+ },
561
+ {
562
+ "epoch": 0.3875,
563
+ "grad_norm": 0.5730915665626526,
564
+ "learning_rate": 0.0002,
565
+ "loss": 0.0176,
566
+ "mean_token_accuracy": 1.0,
567
+ "num_tokens": 311712.0,
568
+ "step": 62
569
+ },
570
+ {
571
+ "epoch": 0.39375,
572
+ "grad_norm": 3.6676278114318848,
573
+ "learning_rate": 0.0002,
574
+ "loss": 0.1523,
575
+ "mean_token_accuracy": 0.9375,
576
+ "num_tokens": 314768.0,
577
+ "step": 63
578
+ },
579
+ {
580
+ "epoch": 0.4,
581
+ "grad_norm": 1.425026535987854,
582
+ "learning_rate": 0.0002,
583
+ "loss": 0.0172,
584
+ "mean_token_accuracy": 1.0,
585
+ "num_tokens": 317824.0,
586
+ "step": 64
587
+ },
588
+ {
589
+ "epoch": 0.40625,
590
+ "grad_norm": 2.4328842163085938,
591
+ "learning_rate": 0.0002,
592
+ "loss": 0.1337,
593
+ "mean_token_accuracy": 0.96875,
594
+ "num_tokens": 320880.0,
595
+ "step": 65
596
+ },
597
+ {
598
+ "epoch": 0.4125,
599
+ "grad_norm": 2.682482957839966,
600
+ "learning_rate": 0.0002,
601
+ "loss": 0.18,
602
+ "mean_token_accuracy": 0.96875,
603
+ "num_tokens": 323936.0,
604
+ "step": 66
605
+ },
606
+ {
607
+ "epoch": 0.41875,
608
+ "grad_norm": 0.41104015707969666,
609
+ "learning_rate": 0.0002,
610
+ "loss": 0.0076,
611
+ "mean_token_accuracy": 1.0,
612
+ "num_tokens": 326992.0,
613
+ "step": 67
614
+ },
615
+ {
616
+ "epoch": 0.425,
617
+ "grad_norm": 0.22206450998783112,
618
+ "learning_rate": 0.0002,
619
+ "loss": 0.0051,
620
+ "mean_token_accuracy": 1.0,
621
+ "num_tokens": 330048.0,
622
+ "step": 68
623
+ },
624
+ {
625
+ "epoch": 0.43125,
626
+ "grad_norm": 1.8949733972549438,
627
+ "learning_rate": 0.0002,
628
+ "loss": 0.0408,
629
+ "mean_token_accuracy": 0.96875,
630
+ "num_tokens": 333104.0,
631
+ "step": 69
632
+ },
633
+ {
634
+ "epoch": 0.4375,
635
+ "grad_norm": 3.621080160140991,
636
+ "learning_rate": 0.0002,
637
+ "loss": 0.0744,
638
+ "mean_token_accuracy": 0.9375,
639
+ "num_tokens": 336160.0,
640
+ "step": 70
641
+ },
642
+ {
643
+ "epoch": 0.44375,
644
+ "grad_norm": 0.6198443174362183,
645
+ "learning_rate": 0.0002,
646
+ "loss": 0.0123,
647
+ "mean_token_accuracy": 1.0,
648
+ "num_tokens": 339216.0,
649
+ "step": 71
650
+ },
651
+ {
652
+ "epoch": 0.45,
653
+ "grad_norm": 0.46570342779159546,
654
+ "learning_rate": 0.0002,
655
+ "loss": 0.006,
656
+ "mean_token_accuracy": 1.0,
657
+ "num_tokens": 342272.0,
658
+ "step": 72
659
+ },
660
+ {
661
+ "epoch": 0.45625,
662
+ "grad_norm": 4.823789596557617,
663
+ "learning_rate": 0.0002,
664
+ "loss": 0.3016,
665
+ "mean_token_accuracy": 0.9375,
666
+ "num_tokens": 345328.0,
667
+ "step": 73
668
+ },
669
+ {
670
+ "epoch": 0.4625,
671
+ "grad_norm": 4.304416179656982,
672
+ "learning_rate": 0.0002,
673
+ "loss": 0.1201,
674
+ "mean_token_accuracy": 0.96875,
675
+ "num_tokens": 348384.0,
676
+ "step": 74
677
+ },
678
+ {
679
+ "epoch": 0.46875,
680
+ "grad_norm": 1.780882716178894,
681
+ "learning_rate": 0.0002,
682
+ "loss": 0.037,
683
+ "mean_token_accuracy": 0.96875,
684
+ "num_tokens": 351440.0,
685
+ "step": 75
686
+ },
687
+ {
688
+ "epoch": 0.475,
689
+ "grad_norm": 1.6134687662124634,
690
+ "learning_rate": 0.0002,
691
+ "loss": 0.0413,
692
+ "mean_token_accuracy": 0.96875,
693
+ "num_tokens": 354496.0,
694
+ "step": 76
695
+ },
696
+ {
697
+ "epoch": 0.48125,
698
+ "grad_norm": 3.6872904300689697,
699
+ "learning_rate": 0.0002,
700
+ "loss": 0.0697,
701
+ "mean_token_accuracy": 0.96875,
702
+ "num_tokens": 357552.0,
703
+ "step": 77
704
+ },
705
+ {
706
+ "epoch": 0.4875,
707
+ "grad_norm": 3.585052728652954,
708
+ "learning_rate": 0.0002,
709
+ "loss": 0.0485,
710
+ "mean_token_accuracy": 0.96875,
711
+ "num_tokens": 360608.0,
712
+ "step": 78
713
+ },
714
+ {
715
+ "epoch": 0.49375,
716
+ "grad_norm": 3.138762950897217,
717
+ "learning_rate": 0.0002,
718
+ "loss": 0.0686,
719
+ "mean_token_accuracy": 0.96875,
720
+ "num_tokens": 363664.0,
721
+ "step": 79
722
+ },
723
+ {
724
+ "epoch": 0.5,
725
+ "grad_norm": 3.895049571990967,
726
+ "learning_rate": 0.0002,
727
+ "loss": 0.0649,
728
+ "mean_token_accuracy": 0.96875,
729
+ "num_tokens": 366720.0,
730
+ "step": 80
731
+ },
732
+ {
733
+ "epoch": 0.50625,
734
+ "grad_norm": 3.5866286754608154,
735
+ "learning_rate": 0.0002,
736
+ "loss": 0.1002,
737
+ "mean_token_accuracy": 0.96875,
738
+ "num_tokens": 369776.0,
739
+ "step": 81
740
+ },
741
+ {
742
+ "epoch": 0.5125,
743
+ "grad_norm": 2.039658546447754,
744
+ "learning_rate": 0.0002,
745
+ "loss": 0.0235,
746
+ "mean_token_accuracy": 1.0,
747
+ "num_tokens": 372832.0,
748
+ "step": 82
749
+ },
750
+ {
751
+ "epoch": 0.51875,
752
+ "grad_norm": 5.831839084625244,
753
+ "learning_rate": 0.0002,
754
+ "loss": 0.1955,
755
+ "mean_token_accuracy": 0.9375,
756
+ "num_tokens": 375888.0,
757
+ "step": 83
758
+ },
759
+ {
760
+ "epoch": 0.525,
761
+ "grad_norm": 3.6485228538513184,
762
+ "learning_rate": 0.0002,
763
+ "loss": 0.1692,
764
+ "mean_token_accuracy": 0.96875,
765
+ "num_tokens": 378944.0,
766
+ "step": 84
767
+ },
768
+ {
769
+ "epoch": 0.53125,
770
+ "grad_norm": 1.0485862493515015,
771
+ "learning_rate": 0.0002,
772
+ "loss": 0.0178,
773
+ "mean_token_accuracy": 1.0,
774
+ "num_tokens": 382000.0,
775
+ "step": 85
776
+ },
777
+ {
778
+ "epoch": 0.5375,
779
+ "grad_norm": 2.2051405906677246,
780
+ "learning_rate": 0.0002,
781
+ "loss": 0.0282,
782
+ "mean_token_accuracy": 1.0,
783
+ "num_tokens": 385056.0,
784
+ "step": 86
785
+ },
786
+ {
787
+ "epoch": 0.54375,
788
+ "grad_norm": 1.2796984910964966,
789
+ "learning_rate": 0.0002,
790
+ "loss": 0.0191,
791
+ "mean_token_accuracy": 1.0,
792
+ "num_tokens": 388112.0,
793
+ "step": 87
794
+ },
795
+ {
796
+ "epoch": 0.55,
797
+ "grad_norm": 1.5338424444198608,
798
+ "learning_rate": 0.0002,
799
+ "loss": 0.0177,
800
+ "mean_token_accuracy": 1.0,
801
+ "num_tokens": 391168.0,
802
+ "step": 88
803
+ },
804
+ {
805
+ "epoch": 0.55625,
806
+ "grad_norm": 1.2561614513397217,
807
+ "learning_rate": 0.0002,
808
+ "loss": 0.0207,
809
+ "mean_token_accuracy": 1.0,
810
+ "num_tokens": 394224.0,
811
+ "step": 89
812
+ },
813
+ {
814
+ "epoch": 0.5625,
815
+ "grad_norm": 0.8224933743476868,
816
+ "learning_rate": 0.0002,
817
+ "loss": 0.0082,
818
+ "mean_token_accuracy": 1.0,
819
+ "num_tokens": 397280.0,
820
+ "step": 90
821
+ },
822
+ {
823
+ "epoch": 0.56875,
824
+ "grad_norm": 3.2320916652679443,
825
+ "learning_rate": 0.0002,
826
+ "loss": 0.0583,
827
+ "mean_token_accuracy": 0.96875,
828
+ "num_tokens": 400336.0,
829
+ "step": 91
830
+ },
831
+ {
832
+ "epoch": 0.575,
833
+ "grad_norm": 7.057133197784424,
834
+ "learning_rate": 0.0002,
835
+ "loss": 0.259,
836
+ "mean_token_accuracy": 0.9375,
837
+ "num_tokens": 403392.0,
838
+ "step": 92
839
+ },
840
+ {
841
+ "epoch": 0.58125,
842
+ "grad_norm": 2.365320920944214,
843
+ "learning_rate": 0.0002,
844
+ "loss": 0.0295,
845
+ "mean_token_accuracy": 1.0,
846
+ "num_tokens": 406448.0,
847
+ "step": 93
848
+ },
849
+ {
850
+ "epoch": 0.5875,
851
+ "grad_norm": 0.06329375505447388,
852
+ "learning_rate": 0.0002,
853
+ "loss": 0.0014,
854
+ "mean_token_accuracy": 1.0,
855
+ "num_tokens": 409504.0,
856
+ "step": 94
857
+ },
858
+ {
859
+ "epoch": 0.59375,
860
+ "grad_norm": 1.554870367050171,
861
+ "learning_rate": 0.0002,
862
+ "loss": 0.024,
863
+ "mean_token_accuracy": 0.96875,
864
+ "num_tokens": 412560.0,
865
+ "step": 95
866
+ },
867
+ {
868
+ "epoch": 0.6,
869
+ "grad_norm": 2.3296000957489014,
870
+ "learning_rate": 0.0002,
871
+ "loss": 0.0795,
872
+ "mean_token_accuracy": 0.96875,
873
+ "num_tokens": 415616.0,
874
+ "step": 96
875
+ },
876
+ {
877
+ "epoch": 0.60625,
878
+ "grad_norm": 6.749006271362305,
879
+ "learning_rate": 0.0002,
880
+ "loss": 0.1903,
881
+ "mean_token_accuracy": 0.90625,
882
+ "num_tokens": 418672.0,
883
+ "step": 97
884
+ },
885
+ {
886
+ "epoch": 0.6125,
887
+ "grad_norm": 10.803217887878418,
888
+ "learning_rate": 0.0002,
889
+ "loss": 0.2885,
890
+ "mean_token_accuracy": 0.90625,
891
+ "num_tokens": 421728.0,
892
+ "step": 98
893
+ },
894
+ {
895
+ "epoch": 0.61875,
896
+ "grad_norm": 2.8866775035858154,
897
+ "learning_rate": 0.0002,
898
+ "loss": 0.2661,
899
+ "mean_token_accuracy": 0.96875,
900
+ "num_tokens": 424784.0,
901
+ "step": 99
902
+ },
903
+ {
904
+ "epoch": 0.625,
905
+ "grad_norm": 3.800149917602539,
906
+ "learning_rate": 0.0002,
907
+ "loss": 0.0502,
908
+ "mean_token_accuracy": 1.0,
909
+ "num_tokens": 427840.0,
910
+ "step": 100
911
+ },
912
+ {
913
+ "epoch": 0.63125,
914
+ "grad_norm": 0.4669257402420044,
915
+ "learning_rate": 0.0002,
916
+ "loss": 0.0041,
917
+ "mean_token_accuracy": 1.0,
918
+ "num_tokens": 430896.0,
919
+ "step": 101
920
+ },
921
+ {
922
+ "epoch": 0.6375,
923
+ "grad_norm": 3.8480825424194336,
924
+ "learning_rate": 0.0002,
925
+ "loss": 0.0947,
926
+ "mean_token_accuracy": 0.9375,
927
+ "num_tokens": 433952.0,
928
+ "step": 102
929
+ },
930
+ {
931
+ "epoch": 0.64375,
932
+ "grad_norm": 4.139023303985596,
933
+ "learning_rate": 0.0002,
934
+ "loss": 0.1131,
935
+ "mean_token_accuracy": 0.96875,
936
+ "num_tokens": 437008.0,
937
+ "step": 103
938
+ },
939
+ {
940
+ "epoch": 0.65,
941
+ "grad_norm": 3.4455084800720215,
942
+ "learning_rate": 0.0002,
943
+ "loss": 0.1105,
944
+ "mean_token_accuracy": 0.9375,
945
+ "num_tokens": 440064.0,
946
+ "step": 104
947
+ },
948
+ {
949
+ "epoch": 0.65625,
950
+ "grad_norm": 2.1717028617858887,
951
+ "learning_rate": 0.0002,
952
+ "loss": 0.0794,
953
+ "mean_token_accuracy": 0.9375,
954
+ "num_tokens": 443120.0,
955
+ "step": 105
956
+ },
957
+ {
958
+ "epoch": 0.6625,
959
+ "grad_norm": 0.43985098600387573,
960
+ "learning_rate": 0.0002,
961
+ "loss": 0.0057,
962
+ "mean_token_accuracy": 1.0,
963
+ "num_tokens": 446176.0,
964
+ "step": 106
965
+ },
966
+ {
967
+ "epoch": 0.66875,
968
+ "grad_norm": 5.351183891296387,
969
+ "learning_rate": 0.0002,
970
+ "loss": 0.3953,
971
+ "mean_token_accuracy": 0.8125,
972
+ "num_tokens": 449232.0,
973
+ "step": 107
974
+ },
975
+ {
976
+ "epoch": 0.675,
977
+ "grad_norm": 4.6742048263549805,
978
+ "learning_rate": 0.0002,
979
+ "loss": 0.1099,
980
+ "mean_token_accuracy": 0.96875,
981
+ "num_tokens": 452288.0,
982
+ "step": 108
983
+ },
984
+ {
985
+ "epoch": 0.68125,
986
+ "grad_norm": 4.851898670196533,
987
+ "learning_rate": 0.0002,
988
+ "loss": 0.2198,
989
+ "mean_token_accuracy": 0.96875,
990
+ "num_tokens": 455344.0,
991
+ "step": 109
992
+ },
993
+ {
994
+ "epoch": 0.6875,
995
+ "grad_norm": 2.46817946434021,
996
+ "learning_rate": 0.0002,
997
+ "loss": 0.0586,
998
+ "mean_token_accuracy": 0.96875,
999
+ "num_tokens": 458400.0,
1000
+ "step": 110
1001
+ },
1002
+ {
1003
+ "epoch": 0.69375,
1004
+ "grad_norm": 5.507438659667969,
1005
+ "learning_rate": 0.0002,
1006
+ "loss": 0.3492,
1007
+ "mean_token_accuracy": 0.9375,
1008
+ "num_tokens": 461456.0,
1009
+ "step": 111
1010
+ },
1011
+ {
1012
+ "epoch": 0.7,
1013
+ "grad_norm": 0.21149688959121704,
1014
+ "learning_rate": 0.0002,
1015
+ "loss": 0.003,
1016
+ "mean_token_accuracy": 1.0,
1017
+ "num_tokens": 464512.0,
1018
+ "step": 112
1019
+ },
1020
+ {
1021
+ "epoch": 0.70625,
1022
+ "grad_norm": 2.1346304416656494,
1023
+ "learning_rate": 0.0002,
1024
+ "loss": 0.0365,
1025
+ "mean_token_accuracy": 1.0,
1026
+ "num_tokens": 467568.0,
1027
+ "step": 113
1028
+ },
1029
+ {
1030
+ "epoch": 0.7125,
1031
+ "grad_norm": 2.06246280670166,
1032
+ "learning_rate": 0.0002,
1033
+ "loss": 0.1228,
1034
+ "mean_token_accuracy": 0.96875,
1035
+ "num_tokens": 470624.0,
1036
+ "step": 114
1037
+ },
1038
+ {
1039
+ "epoch": 0.71875,
1040
+ "grad_norm": 1.1220771074295044,
1041
+ "learning_rate": 0.0002,
1042
+ "loss": 0.0174,
1043
+ "mean_token_accuracy": 1.0,
1044
+ "num_tokens": 473680.0,
1045
+ "step": 115
1046
+ },
1047
+ {
1048
+ "epoch": 0.725,
1049
+ "grad_norm": 1.283275842666626,
1050
+ "learning_rate": 0.0002,
1051
+ "loss": 0.0473,
1052
+ "mean_token_accuracy": 1.0,
1053
+ "num_tokens": 476736.0,
1054
+ "step": 116
1055
+ },
1056
+ {
1057
+ "epoch": 0.73125,
1058
+ "grad_norm": 3.0137429237365723,
1059
+ "learning_rate": 0.0002,
1060
+ "loss": 0.073,
1061
+ "mean_token_accuracy": 0.9375,
1062
+ "num_tokens": 479792.0,
1063
+ "step": 117
1064
+ },
1065
+ {
1066
+ "epoch": 0.7375,
1067
+ "grad_norm": 3.159397840499878,
1068
+ "learning_rate": 0.0002,
1069
+ "loss": 0.0736,
1070
+ "mean_token_accuracy": 0.96875,
1071
+ "num_tokens": 482848.0,
1072
+ "step": 118
1073
+ },
1074
+ {
1075
+ "epoch": 0.74375,
1076
+ "grad_norm": 4.265467643737793,
1077
+ "learning_rate": 0.0002,
1078
+ "loss": 0.3511,
1079
+ "mean_token_accuracy": 0.90625,
1080
+ "num_tokens": 485904.0,
1081
+ "step": 119
1082
+ },
1083
+ {
1084
+ "epoch": 0.75,
1085
+ "grad_norm": 4.885653495788574,
1086
+ "learning_rate": 0.0002,
1087
+ "loss": 0.1691,
1088
+ "mean_token_accuracy": 0.875,
1089
+ "num_tokens": 488960.0,
1090
+ "step": 120
1091
+ },
1092
+ {
1093
+ "epoch": 0.75625,
1094
+ "grad_norm": 5.490376949310303,
1095
+ "learning_rate": 0.0002,
1096
+ "loss": 0.2357,
1097
+ "mean_token_accuracy": 0.9375,
1098
+ "num_tokens": 492016.0,
1099
+ "step": 121
1100
+ },
1101
+ {
1102
+ "epoch": 0.7625,
1103
+ "grad_norm": 3.725234031677246,
1104
+ "learning_rate": 0.0002,
1105
+ "loss": 0.1983,
1106
+ "mean_token_accuracy": 0.9375,
1107
+ "num_tokens": 495072.0,
1108
+ "step": 122
1109
+ },
1110
+ {
1111
+ "epoch": 0.76875,
1112
+ "grad_norm": 1.6382085084915161,
1113
+ "learning_rate": 0.0002,
1114
+ "loss": 0.043,
1115
+ "mean_token_accuracy": 0.96875,
1116
+ "num_tokens": 498128.0,
1117
+ "step": 123
1118
+ },
1119
+ {
1120
+ "epoch": 0.775,
1121
+ "grad_norm": 0.3425021171569824,
1122
+ "learning_rate": 0.0002,
1123
+ "loss": 0.0069,
1124
+ "mean_token_accuracy": 1.0,
1125
+ "num_tokens": 501184.0,
1126
+ "step": 124
1127
+ },
1128
+ {
1129
+ "epoch": 0.78125,
1130
+ "grad_norm": 1.504239559173584,
1131
+ "learning_rate": 0.0002,
1132
+ "loss": 0.0348,
1133
+ "mean_token_accuracy": 1.0,
1134
+ "num_tokens": 504240.0,
1135
+ "step": 125
1136
+ },
1137
+ {
1138
+ "epoch": 0.7875,
1139
+ "grad_norm": 0.16000418365001678,
1140
+ "learning_rate": 0.0002,
1141
+ "loss": 0.0071,
1142
+ "mean_token_accuracy": 1.0,
1143
+ "num_tokens": 507296.0,
1144
+ "step": 126
1145
+ },
1146
+ {
1147
+ "epoch": 0.79375,
1148
+ "grad_norm": 0.48438990116119385,
1149
+ "learning_rate": 0.0002,
1150
+ "loss": 0.0153,
1151
+ "mean_token_accuracy": 1.0,
1152
+ "num_tokens": 510352.0,
1153
+ "step": 127
1154
+ },
1155
+ {
1156
+ "epoch": 0.8,
1157
+ "grad_norm": 0.3561486601829529,
1158
+ "learning_rate": 0.0002,
1159
+ "loss": 0.0071,
1160
+ "mean_token_accuracy": 1.0,
1161
+ "num_tokens": 513408.0,
1162
+ "step": 128
1163
+ },
1164
+ {
1165
+ "epoch": 0.80625,
1166
+ "grad_norm": 3.513791084289551,
1167
+ "learning_rate": 0.0002,
1168
+ "loss": 0.114,
1169
+ "mean_token_accuracy": 0.9375,
1170
+ "num_tokens": 516464.0,
1171
+ "step": 129
1172
+ },
1173
+ {
1174
+ "epoch": 0.8125,
1175
+ "grad_norm": 0.7476550340652466,
1176
+ "learning_rate": 0.0002,
1177
+ "loss": 0.027,
1178
+ "mean_token_accuracy": 1.0,
1179
+ "num_tokens": 519520.0,
1180
+ "step": 130
1181
+ },
1182
+ {
1183
+ "epoch": 0.81875,
1184
+ "grad_norm": 4.814530372619629,
1185
+ "learning_rate": 0.0002,
1186
+ "loss": 0.2155,
1187
+ "mean_token_accuracy": 0.9375,
1188
+ "num_tokens": 522576.0,
1189
+ "step": 131
1190
+ },
1191
+ {
1192
+ "epoch": 0.825,
1193
+ "grad_norm": 4.203996658325195,
1194
+ "learning_rate": 0.0002,
1195
+ "loss": 0.0732,
1196
+ "mean_token_accuracy": 0.9375,
1197
+ "num_tokens": 525632.0,
1198
+ "step": 132
1199
+ },
1200
+ {
1201
+ "epoch": 0.83125,
1202
+ "grad_norm": 3.171036958694458,
1203
+ "learning_rate": 0.0002,
1204
+ "loss": 0.0479,
1205
+ "mean_token_accuracy": 1.0,
1206
+ "num_tokens": 528688.0,
1207
+ "step": 133
1208
+ },
1209
+ {
1210
+ "epoch": 0.8375,
1211
+ "grad_norm": 3.515819549560547,
1212
+ "learning_rate": 0.0002,
1213
+ "loss": 0.1001,
1214
+ "mean_token_accuracy": 0.96875,
1215
+ "num_tokens": 531744.0,
1216
+ "step": 134
1217
+ },
1218
+ {
1219
+ "epoch": 0.84375,
1220
+ "grad_norm": 1.6061947345733643,
1221
+ "learning_rate": 0.0002,
1222
+ "loss": 0.0795,
1223
+ "mean_token_accuracy": 0.96875,
1224
+ "num_tokens": 534800.0,
1225
+ "step": 135
1226
+ },
1227
+ {
1228
+ "epoch": 0.85,
1229
+ "grad_norm": 0.5217976570129395,
1230
+ "learning_rate": 0.0002,
1231
+ "loss": 0.0143,
1232
+ "mean_token_accuracy": 1.0,
1233
+ "num_tokens": 537856.0,
1234
+ "step": 136
1235
+ },
1236
+ {
1237
+ "epoch": 0.85625,
1238
+ "grad_norm": 1.6569266319274902,
1239
+ "learning_rate": 0.0002,
1240
+ "loss": 0.025,
1241
+ "mean_token_accuracy": 1.0,
1242
+ "num_tokens": 540912.0,
1243
+ "step": 137
1244
+ },
1245
+ {
1246
+ "epoch": 0.8625,
1247
+ "grad_norm": 2.0939252376556396,
1248
+ "learning_rate": 0.0002,
1249
+ "loss": 0.0375,
1250
+ "mean_token_accuracy": 0.96875,
1251
+ "num_tokens": 543968.0,
1252
+ "step": 138
1253
+ },
1254
+ {
1255
+ "epoch": 0.86875,
1256
+ "grad_norm": 1.8286008834838867,
1257
+ "learning_rate": 0.0002,
1258
+ "loss": 0.0464,
1259
+ "mean_token_accuracy": 1.0,
1260
+ "num_tokens": 547024.0,
1261
+ "step": 139
1262
+ },
1263
+ {
1264
+ "epoch": 0.875,
1265
+ "grad_norm": 6.246391296386719,
1266
+ "learning_rate": 0.0002,
1267
+ "loss": 0.135,
1268
+ "mean_token_accuracy": 0.9375,
1269
+ "num_tokens": 550080.0,
1270
+ "step": 140
1271
+ },
1272
+ {
1273
+ "epoch": 0.88125,
1274
+ "grad_norm": 0.9313927292823792,
1275
+ "learning_rate": 0.0002,
1276
+ "loss": 0.0437,
1277
+ "mean_token_accuracy": 1.0,
1278
+ "num_tokens": 553136.0,
1279
+ "step": 141
1280
+ },
1281
+ {
1282
+ "epoch": 0.8875,
1283
+ "grad_norm": 3.962883472442627,
1284
+ "learning_rate": 0.0002,
1285
+ "loss": 0.1026,
1286
+ "mean_token_accuracy": 0.9375,
1287
+ "num_tokens": 556192.0,
1288
+ "step": 142
1289
+ },
1290
+ {
1291
+ "epoch": 0.89375,
1292
+ "grad_norm": 4.730908393859863,
1293
+ "learning_rate": 0.0002,
1294
+ "loss": 0.104,
1295
+ "mean_token_accuracy": 0.90625,
1296
+ "num_tokens": 559248.0,
1297
+ "step": 143
1298
+ },
1299
+ {
1300
+ "epoch": 0.9,
1301
+ "grad_norm": 4.035704135894775,
1302
+ "learning_rate": 0.0002,
1303
+ "loss": 0.16,
1304
+ "mean_token_accuracy": 0.96875,
1305
+ "num_tokens": 562304.0,
1306
+ "step": 144
1307
+ },
1308
+ {
1309
+ "epoch": 0.90625,
1310
+ "grad_norm": 5.092385768890381,
1311
+ "learning_rate": 0.0002,
1312
+ "loss": 0.1841,
1313
+ "mean_token_accuracy": 0.9375,
1314
+ "num_tokens": 565360.0,
1315
+ "step": 145
1316
+ },
1317
+ {
1318
+ "epoch": 0.9125,
1319
+ "grad_norm": 6.0376386642456055,
1320
+ "learning_rate": 0.0002,
1321
+ "loss": 0.2612,
1322
+ "mean_token_accuracy": 0.875,
1323
+ "num_tokens": 568416.0,
1324
+ "step": 146
1325
+ },
1326
+ {
1327
+ "epoch": 0.91875,
1328
+ "grad_norm": 3.7592973709106445,
1329
+ "learning_rate": 0.0002,
1330
+ "loss": 0.047,
1331
+ "mean_token_accuracy": 1.0,
1332
+ "num_tokens": 571472.0,
1333
+ "step": 147
1334
+ },
1335
+ {
1336
+ "epoch": 0.925,
1337
+ "grad_norm": 0.9640989899635315,
1338
+ "learning_rate": 0.0002,
1339
+ "loss": 0.0236,
1340
+ "mean_token_accuracy": 0.96875,
1341
+ "num_tokens": 574528.0,
1342
+ "step": 148
1343
+ },
1344
+ {
1345
+ "epoch": 0.93125,
1346
+ "grad_norm": 1.7307640314102173,
1347
+ "learning_rate": 0.0002,
1348
+ "loss": 0.061,
1349
+ "mean_token_accuracy": 0.96875,
1350
+ "num_tokens": 577584.0,
1351
+ "step": 149
1352
+ },
1353
+ {
1354
+ "epoch": 0.9375,
1355
+ "grad_norm": 2.1097357273101807,
1356
+ "learning_rate": 0.0002,
1357
+ "loss": 0.3053,
1358
+ "mean_token_accuracy": 0.96875,
1359
+ "num_tokens": 580640.0,
1360
+ "step": 150
1361
+ },
1362
+ {
1363
+ "epoch": 0.94375,
1364
+ "grad_norm": 3.9413912296295166,
1365
+ "learning_rate": 0.0002,
1366
+ "loss": 0.1405,
1367
+ "mean_token_accuracy": 0.96875,
1368
+ "num_tokens": 583696.0,
1369
+ "step": 151
1370
+ },
1371
+ {
1372
+ "epoch": 0.95,
1373
+ "grad_norm": 3.341071128845215,
1374
+ "learning_rate": 0.0002,
1375
+ "loss": 0.1131,
1376
+ "mean_token_accuracy": 0.9375,
1377
+ "num_tokens": 586752.0,
1378
+ "step": 152
1379
+ },
1380
+ {
1381
+ "epoch": 0.95625,
1382
+ "grad_norm": 1.0671991109848022,
1383
+ "learning_rate": 0.0002,
1384
+ "loss": 0.0147,
1385
+ "mean_token_accuracy": 1.0,
1386
+ "num_tokens": 589808.0,
1387
+ "step": 153
1388
+ },
1389
+ {
1390
+ "epoch": 0.9625,
1391
+ "grad_norm": 6.068633556365967,
1392
+ "learning_rate": 0.0002,
1393
+ "loss": 0.0888,
1394
+ "mean_token_accuracy": 0.9375,
1395
+ "num_tokens": 592864.0,
1396
+ "step": 154
1397
+ },
1398
+ {
1399
+ "epoch": 0.96875,
1400
+ "grad_norm": 1.0805017948150635,
1401
+ "learning_rate": 0.0002,
1402
+ "loss": 0.0136,
1403
+ "mean_token_accuracy": 1.0,
1404
+ "num_tokens": 595920.0,
1405
+ "step": 155
1406
+ },
1407
+ {
1408
+ "epoch": 0.975,
1409
+ "grad_norm": 0.40108004212379456,
1410
+ "learning_rate": 0.0002,
1411
+ "loss": 0.0091,
1412
+ "mean_token_accuracy": 1.0,
1413
+ "num_tokens": 598976.0,
1414
+ "step": 156
1415
+ },
1416
+ {
1417
+ "epoch": 0.98125,
1418
+ "grad_norm": 0.7008132934570312,
1419
+ "learning_rate": 0.0002,
1420
+ "loss": 0.011,
1421
+ "mean_token_accuracy": 1.0,
1422
+ "num_tokens": 602032.0,
1423
+ "step": 157
1424
+ },
1425
+ {
1426
+ "epoch": 0.9875,
1427
+ "grad_norm": 0.42773208022117615,
1428
+ "learning_rate": 0.0002,
1429
+ "loss": 0.0063,
1430
+ "mean_token_accuracy": 1.0,
1431
+ "num_tokens": 605088.0,
1432
+ "step": 158
1433
+ },
1434
+ {
1435
+ "epoch": 0.99375,
1436
+ "grad_norm": 0.9735352396965027,
1437
+ "learning_rate": 0.0002,
1438
+ "loss": 0.014,
1439
+ "mean_token_accuracy": 1.0,
1440
+ "num_tokens": 608144.0,
1441
+ "step": 159
1442
+ },
1443
+ {
1444
+ "epoch": 1.0,
1445
+ "grad_norm": 2.4267780780792236,
1446
+ "learning_rate": 0.0002,
1447
+ "loss": 0.0344,
1448
+ "mean_token_accuracy": 0.96875,
1449
+ "num_tokens": 611200.0,
1450
+ "step": 160
1451
+ },
1452
+ {
1453
+ "epoch": 1.00625,
1454
+ "grad_norm": 4.375260829925537,
1455
+ "learning_rate": 0.0002,
1456
+ "loss": 0.0981,
1457
+ "mean_token_accuracy": 0.9375,
1458
+ "num_tokens": 614256.0,
1459
+ "step": 161
1460
+ },
1461
+ {
1462
+ "epoch": 1.0125,
1463
+ "grad_norm": 2.6353538036346436,
1464
+ "learning_rate": 0.0002,
1465
+ "loss": 0.1965,
1466
+ "mean_token_accuracy": 0.96875,
1467
+ "num_tokens": 617312.0,
1468
+ "step": 162
1469
+ },
1470
+ {
1471
+ "epoch": 1.01875,
1472
+ "grad_norm": 2.906750440597534,
1473
+ "learning_rate": 0.0002,
1474
+ "loss": 0.0887,
1475
+ "mean_token_accuracy": 0.96875,
1476
+ "num_tokens": 620368.0,
1477
+ "step": 163
1478
+ },
1479
+ {
1480
+ "epoch": 1.025,
1481
+ "grad_norm": 2.4843482971191406,
1482
+ "learning_rate": 0.0002,
1483
+ "loss": 0.0409,
1484
+ "mean_token_accuracy": 0.96875,
1485
+ "num_tokens": 623424.0,
1486
+ "step": 164
1487
+ },
1488
+ {
1489
+ "epoch": 1.03125,
1490
+ "grad_norm": 2.4637041091918945,
1491
+ "learning_rate": 0.0002,
1492
+ "loss": 0.0588,
1493
+ "mean_token_accuracy": 0.96875,
1494
+ "num_tokens": 626480.0,
1495
+ "step": 165
1496
+ },
1497
+ {
1498
+ "epoch": 1.0375,
1499
+ "grad_norm": 3.256136655807495,
1500
+ "learning_rate": 0.0002,
1501
+ "loss": 0.0632,
1502
+ "mean_token_accuracy": 0.96875,
1503
+ "num_tokens": 629536.0,
1504
+ "step": 166
1505
+ },
1506
+ {
1507
+ "epoch": 1.04375,
1508
+ "grad_norm": 3.7600295543670654,
1509
+ "learning_rate": 0.0002,
1510
+ "loss": 0.0903,
1511
+ "mean_token_accuracy": 0.9375,
1512
+ "num_tokens": 632592.0,
1513
+ "step": 167
1514
+ },
1515
+ {
1516
+ "epoch": 1.05,
1517
+ "grad_norm": 4.013559341430664,
1518
+ "learning_rate": 0.0002,
1519
+ "loss": 0.0796,
1520
+ "mean_token_accuracy": 0.9375,
1521
+ "num_tokens": 635648.0,
1522
+ "step": 168
1523
+ },
1524
+ {
1525
+ "epoch": 1.05625,
1526
+ "grad_norm": 1.3275935649871826,
1527
+ "learning_rate": 0.0002,
1528
+ "loss": 0.0415,
1529
+ "mean_token_accuracy": 0.96875,
1530
+ "num_tokens": 638704.0,
1531
+ "step": 169
1532
+ },
1533
+ {
1534
+ "epoch": 1.0625,
1535
+ "grad_norm": 3.0830256938934326,
1536
+ "learning_rate": 0.0002,
1537
+ "loss": 0.0989,
1538
+ "mean_token_accuracy": 0.96875,
1539
+ "num_tokens": 641760.0,
1540
+ "step": 170
1541
+ },
1542
+ {
1543
+ "epoch": 1.06875,
1544
+ "grad_norm": 5.780794620513916,
1545
+ "learning_rate": 0.0002,
1546
+ "loss": 0.0974,
1547
+ "mean_token_accuracy": 0.9375,
1548
+ "num_tokens": 644816.0,
1549
+ "step": 171
1550
+ },
1551
+ {
1552
+ "epoch": 1.075,
1553
+ "grad_norm": 1.1954656839370728,
1554
+ "learning_rate": 0.0002,
1555
+ "loss": 0.0363,
1556
+ "mean_token_accuracy": 1.0,
1557
+ "num_tokens": 647872.0,
1558
+ "step": 172
1559
+ },
1560
+ {
1561
+ "epoch": 1.08125,
1562
+ "grad_norm": 0.6503368616104126,
1563
+ "learning_rate": 0.0002,
1564
+ "loss": 0.0086,
1565
+ "mean_token_accuracy": 1.0,
1566
+ "num_tokens": 650928.0,
1567
+ "step": 173
1568
+ },
1569
+ {
1570
+ "epoch": 1.0875,
1571
+ "grad_norm": 0.2741992771625519,
1572
+ "learning_rate": 0.0002,
1573
+ "loss": 0.005,
1574
+ "mean_token_accuracy": 1.0,
1575
+ "num_tokens": 653984.0,
1576
+ "step": 174
1577
+ },
1578
+ {
1579
+ "epoch": 1.09375,
1580
+ "grad_norm": 4.483480930328369,
1581
+ "learning_rate": 0.0002,
1582
+ "loss": 0.1765,
1583
+ "mean_token_accuracy": 0.9375,
1584
+ "num_tokens": 657040.0,
1585
+ "step": 175
1586
+ },
1587
+ {
1588
+ "epoch": 1.1,
1589
+ "grad_norm": 2.0801796913146973,
1590
+ "learning_rate": 0.0002,
1591
+ "loss": 0.2432,
1592
+ "mean_token_accuracy": 0.96875,
1593
+ "num_tokens": 660096.0,
1594
+ "step": 176
1595
+ },
1596
+ {
1597
+ "epoch": 1.10625,
1598
+ "grad_norm": 4.300808906555176,
1599
+ "learning_rate": 0.0002,
1600
+ "loss": 0.0805,
1601
+ "mean_token_accuracy": 0.96875,
1602
+ "num_tokens": 663152.0,
1603
+ "step": 177
1604
+ },
1605
+ {
1606
+ "epoch": 1.1125,
1607
+ "grad_norm": 1.7215920686721802,
1608
+ "learning_rate": 0.0002,
1609
+ "loss": 0.0227,
1610
+ "mean_token_accuracy": 0.96875,
1611
+ "num_tokens": 666208.0,
1612
+ "step": 178
1613
+ },
1614
+ {
1615
+ "epoch": 1.11875,
1616
+ "grad_norm": 3.1990017890930176,
1617
+ "learning_rate": 0.0002,
1618
+ "loss": 0.3566,
1619
+ "mean_token_accuracy": 0.9375,
1620
+ "num_tokens": 669264.0,
1621
+ "step": 179
1622
+ },
1623
+ {
1624
+ "epoch": 1.125,
1625
+ "grad_norm": 1.593938946723938,
1626
+ "learning_rate": 0.0002,
1627
+ "loss": 0.0232,
1628
+ "mean_token_accuracy": 1.0,
1629
+ "num_tokens": 672320.0,
1630
+ "step": 180
1631
+ },
1632
+ {
1633
+ "epoch": 1.13125,
1634
+ "grad_norm": 5.725439071655273,
1635
+ "learning_rate": 0.0002,
1636
+ "loss": 0.1591,
1637
+ "mean_token_accuracy": 0.9375,
1638
+ "num_tokens": 675376.0,
1639
+ "step": 181
1640
+ },
1641
+ {
1642
+ "epoch": 1.1375,
1643
+ "grad_norm": 4.107408046722412,
1644
+ "learning_rate": 0.0002,
1645
+ "loss": 0.0734,
1646
+ "mean_token_accuracy": 0.96875,
1647
+ "num_tokens": 678432.0,
1648
+ "step": 182
1649
+ },
1650
+ {
1651
+ "epoch": 1.14375,
1652
+ "grad_norm": 0.5303417444229126,
1653
+ "learning_rate": 0.0002,
1654
+ "loss": 0.0099,
1655
+ "mean_token_accuracy": 1.0,
1656
+ "num_tokens": 681488.0,
1657
+ "step": 183
1658
+ },
1659
+ {
1660
+ "epoch": 1.15,
1661
+ "grad_norm": 0.7143228650093079,
1662
+ "learning_rate": 0.0002,
1663
+ "loss": 0.0127,
1664
+ "mean_token_accuracy": 1.0,
1665
+ "num_tokens": 684544.0,
1666
+ "step": 184
1667
+ },
1668
+ {
1669
+ "epoch": 1.15625,
1670
+ "grad_norm": 0.10430416464805603,
1671
+ "learning_rate": 0.0002,
1672
+ "loss": 0.0028,
1673
+ "mean_token_accuracy": 1.0,
1674
+ "num_tokens": 687600.0,
1675
+ "step": 185
1676
+ },
1677
+ {
1678
+ "epoch": 1.1625,
1679
+ "grad_norm": 0.10262572765350342,
1680
+ "learning_rate": 0.0002,
1681
+ "loss": 0.0022,
1682
+ "mean_token_accuracy": 1.0,
1683
+ "num_tokens": 690656.0,
1684
+ "step": 186
1685
+ },
1686
+ {
1687
+ "epoch": 1.16875,
1688
+ "grad_norm": 0.5091086626052856,
1689
+ "learning_rate": 0.0002,
1690
+ "loss": 0.0082,
1691
+ "mean_token_accuracy": 1.0,
1692
+ "num_tokens": 693712.0,
1693
+ "step": 187
1694
+ },
1695
+ {
1696
+ "epoch": 1.175,
1697
+ "grad_norm": 0.32739120721817017,
1698
+ "learning_rate": 0.0002,
1699
+ "loss": 0.006,
1700
+ "mean_token_accuracy": 1.0,
1701
+ "num_tokens": 696768.0,
1702
+ "step": 188
1703
+ },
1704
+ {
1705
+ "epoch": 1.18125,
1706
+ "grad_norm": 2.958453416824341,
1707
+ "learning_rate": 0.0002,
1708
+ "loss": 0.0506,
1709
+ "mean_token_accuracy": 0.96875,
1710
+ "num_tokens": 699824.0,
1711
+ "step": 189
1712
+ },
1713
+ {
1714
+ "epoch": 1.1875,
1715
+ "grad_norm": 0.6987707018852234,
1716
+ "learning_rate": 0.0002,
1717
+ "loss": 0.0107,
1718
+ "mean_token_accuracy": 1.0,
1719
+ "num_tokens": 702880.0,
1720
+ "step": 190
1721
+ },
1722
+ {
1723
+ "epoch": 1.19375,
1724
+ "grad_norm": 1.3608295917510986,
1725
+ "learning_rate": 0.0002,
1726
+ "loss": 0.0287,
1727
+ "mean_token_accuracy": 1.0,
1728
+ "num_tokens": 705936.0,
1729
+ "step": 191
1730
+ },
1731
+ {
1732
+ "epoch": 1.2,
1733
+ "grad_norm": 0.14391912519931793,
1734
+ "learning_rate": 0.0002,
1735
+ "loss": 0.0021,
1736
+ "mean_token_accuracy": 1.0,
1737
+ "num_tokens": 708992.0,
1738
+ "step": 192
1739
+ },
1740
+ {
1741
+ "epoch": 1.20625,
1742
+ "grad_norm": 2.5519983768463135,
1743
+ "learning_rate": 0.0002,
1744
+ "loss": 0.1491,
1745
+ "mean_token_accuracy": 0.96875,
1746
+ "num_tokens": 712048.0,
1747
+ "step": 193
1748
+ },
1749
+ {
1750
+ "epoch": 1.2125,
1751
+ "grad_norm": 6.5541253089904785,
1752
+ "learning_rate": 0.0002,
1753
+ "loss": 0.1414,
1754
+ "mean_token_accuracy": 0.9375,
1755
+ "num_tokens": 715104.0,
1756
+ "step": 194
1757
+ },
1758
+ {
1759
+ "epoch": 1.21875,
1760
+ "grad_norm": 0.2820110619068146,
1761
+ "learning_rate": 0.0002,
1762
+ "loss": 0.003,
1763
+ "mean_token_accuracy": 1.0,
1764
+ "num_tokens": 718160.0,
1765
+ "step": 195
1766
+ },
1767
+ {
1768
+ "epoch": 1.225,
1769
+ "grad_norm": 2.9799859523773193,
1770
+ "learning_rate": 0.0002,
1771
+ "loss": 0.0506,
1772
+ "mean_token_accuracy": 0.96875,
1773
+ "num_tokens": 721216.0,
1774
+ "step": 196
1775
+ },
1776
+ {
1777
+ "epoch": 1.23125,
1778
+ "grad_norm": 1.1640214920043945,
1779
+ "learning_rate": 0.0002,
1780
+ "loss": 0.0182,
1781
+ "mean_token_accuracy": 1.0,
1782
+ "num_tokens": 724272.0,
1783
+ "step": 197
1784
+ },
1785
+ {
1786
+ "epoch": 1.2375,
1787
+ "grad_norm": 2.498863935470581,
1788
+ "learning_rate": 0.0002,
1789
+ "loss": 0.073,
1790
+ "mean_token_accuracy": 0.9375,
1791
+ "num_tokens": 727328.0,
1792
+ "step": 198
1793
+ },
1794
+ {
1795
+ "epoch": 1.24375,
1796
+ "grad_norm": 1.2676233053207397,
1797
+ "learning_rate": 0.0002,
1798
+ "loss": 0.0265,
1799
+ "mean_token_accuracy": 1.0,
1800
+ "num_tokens": 730384.0,
1801
+ "step": 199
1802
+ },
1803
+ {
1804
+ "epoch": 1.25,
1805
+ "grad_norm": 0.3039199113845825,
1806
+ "learning_rate": 0.0002,
1807
+ "loss": 0.0079,
1808
+ "step": 200
1809
+ },
1810
+ {
1811
+ "epoch": 1.25,
1812
+ "eval_loss": 0.07848864793777466,
1813
+ "eval_mean_token_accuracy": 0.9703125,
1814
+ "eval_model_preparation_time": 0.0042,
1815
+ "eval_num_tokens": 733440.0,
1816
+ "eval_runtime": 71.4875,
1817
+ "eval_samples_per_second": 8.953,
1818
+ "eval_steps_per_second": 0.56,
1819
+ "step": 200
1820
+ }
1821
+ ],
1822
+ "logging_steps": 1,
1823
+ "max_steps": 320,
1824
+ "num_input_tokens_seen": 0,
1825
+ "num_train_epochs": 2,
1826
+ "save_steps": 200,
1827
+ "stateful_callbacks": {
1828
+ "TrainerControl": {
1829
+ "args": {
1830
+ "should_epoch_stop": false,
1831
+ "should_evaluate": false,
1832
+ "should_log": false,
1833
+ "should_save": true,
1834
+ "should_training_stop": false
1835
+ },
1836
+ "attributes": {}
1837
+ }
1838
+ },
1839
+ "total_flos": 2.84289248722944e+16,
1840
+ "train_batch_size": 16,
1841
+ "trial_name": null,
1842
+ "trial_params": null
1843
+ }
checkpoint-200/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a1e17f907302d781eecf6b58e9d4136a88ad4089312e1b26d13d23c8ff5af5cb
3
+ size 6161
checkpoint-200/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-320/README.md ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: Qwen/Qwen2.5-VL-7B-Instruct
3
+ library_name: peft
4
+ ---
5
+
6
+ # Model Card for Model ID
7
+
8
+ <!-- Provide a quick summary of what the model is/does. -->
9
+
10
+
11
+
12
+ ## Model Details
13
+
14
+ ### Model Description
15
+
16
+ <!-- Provide a longer summary of what this model is. -->
17
+
18
+
19
+
20
+ - **Developed by:** [More Information Needed]
21
+ - **Funded by [optional]:** [More Information Needed]
22
+ - **Shared by [optional]:** [More Information Needed]
23
+ - **Model type:** [More Information Needed]
24
+ - **Language(s) (NLP):** [More Information Needed]
25
+ - **License:** [More Information Needed]
26
+ - **Finetuned from model [optional]:** [More Information Needed]
27
+
28
+ ### Model Sources [optional]
29
+
30
+ <!-- Provide the basic links for the model. -->
31
+
32
+ - **Repository:** [More Information Needed]
33
+ - **Paper [optional]:** [More Information Needed]
34
+ - **Demo [optional]:** [More Information Needed]
35
+
36
+ ## Uses
37
+
38
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
39
+
40
+ ### Direct Use
41
+
42
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
43
+
44
+ [More Information Needed]
45
+
46
+ ### Downstream Use [optional]
47
+
48
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
49
+
50
+ [More Information Needed]
51
+
52
+ ### Out-of-Scope Use
53
+
54
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
55
+
56
+ [More Information Needed]
57
+
58
+ ## Bias, Risks, and Limitations
59
+
60
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
61
+
62
+ [More Information Needed]
63
+
64
+ ### Recommendations
65
+
66
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
67
+
68
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
69
+
70
+ ## How to Get Started with the Model
71
+
72
+ Use the code below to get started with the model.
73
+
74
+ [More Information Needed]
75
+
76
+ ## Training Details
77
+
78
+ ### Training Data
79
+
80
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
+
82
+ [More Information Needed]
83
+
84
+ ### Training Procedure
85
+
86
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
87
+
88
+ #### Preprocessing [optional]
89
+
90
+ [More Information Needed]
91
+
92
+
93
+ #### Training Hyperparameters
94
+
95
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
96
+
97
+ #### Speeds, Sizes, Times [optional]
98
+
99
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
100
+
101
+ [More Information Needed]
102
+
103
+ ## Evaluation
104
+
105
+ <!-- This section describes the evaluation protocols and provides the results. -->
106
+
107
+ ### Testing Data, Factors & Metrics
108
+
109
+ #### Testing Data
110
+
111
+ <!-- This should link to a Dataset Card if possible. -->
112
+
113
+ [More Information Needed]
114
+
115
+ #### Factors
116
+
117
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
118
+
119
+ [More Information Needed]
120
+
121
+ #### Metrics
122
+
123
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
124
+
125
+ [More Information Needed]
126
+
127
+ ### Results
128
+
129
+ [More Information Needed]
130
+
131
+ #### Summary
132
+
133
+
134
+
135
+ ## Model Examination [optional]
136
+
137
+ <!-- Relevant interpretability work for the model goes here -->
138
+
139
+ [More Information Needed]
140
+
141
+ ## Environmental Impact
142
+
143
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
144
+
145
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
146
+
147
+ - **Hardware Type:** [More Information Needed]
148
+ - **Hours used:** [More Information Needed]
149
+ - **Cloud Provider:** [More Information Needed]
150
+ - **Compute Region:** [More Information Needed]
151
+ - **Carbon Emitted:** [More Information Needed]
152
+
153
+ ## Technical Specifications [optional]
154
+
155
+ ### Model Architecture and Objective
156
+
157
+ [More Information Needed]
158
+
159
+ ### Compute Infrastructure
160
+
161
+ [More Information Needed]
162
+
163
+ #### Hardware
164
+
165
+ [More Information Needed]
166
+
167
+ #### Software
168
+
169
+ [More Information Needed]
170
+
171
+ ## Citation [optional]
172
+
173
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
174
+
175
+ **BibTeX:**
176
+
177
+ [More Information Needed]
178
+
179
+ **APA:**
180
+
181
+ [More Information Needed]
182
+
183
+ ## Glossary [optional]
184
+
185
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
186
+
187
+ [More Information Needed]
188
+
189
+ ## More Information [optional]
190
+
191
+ [More Information Needed]
192
+
193
+ ## Model Card Authors [optional]
194
+
195
+ [More Information Needed]
196
+
197
+ ## Model Card Contact
198
+
199
+ [More Information Needed]
200
+ ### Framework versions
201
+
202
+ - PEFT 0.13.2
checkpoint-320/adapter_config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "Qwen/Qwen2.5-VL-7B-Instruct",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": true,
8
+ "init_lora_weights": true,
9
+ "layer_replication": null,
10
+ "layers_pattern": null,
11
+ "layers_to_transform": null,
12
+ "loftq_config": {},
13
+ "lora_alpha": 16,
14
+ "lora_dropout": 0.05,
15
+ "megatron_config": null,
16
+ "megatron_core": "megatron.core",
17
+ "modules_to_save": null,
18
+ "peft_type": "LORA",
19
+ "r": 16,
20
+ "rank_pattern": {},
21
+ "revision": null,
22
+ "target_modules": [
23
+ "q_proj",
24
+ "v_proj"
25
+ ],
26
+ "task_type": "CAUSAL_LM",
27
+ "use_dora": false,
28
+ "use_rslora": false
29
+ }
checkpoint-320/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:530fabf0be943088618405f86cd5303d922e0e5f96f279fad4af24d343946040
3
+ size 20200056
checkpoint-320/added_tokens.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</tool_call>": 151658,
3
+ "<tool_call>": 151657,
4
+ "<|box_end|>": 151649,
5
+ "<|box_start|>": 151648,
6
+ "<|endoftext|>": 151643,
7
+ "<|file_sep|>": 151664,
8
+ "<|fim_middle|>": 151660,
9
+ "<|fim_pad|>": 151662,
10
+ "<|fim_prefix|>": 151659,
11
+ "<|fim_suffix|>": 151661,
12
+ "<|im_end|>": 151645,
13
+ "<|im_start|>": 151644,
14
+ "<|image_pad|>": 151655,
15
+ "<|object_ref_end|>": 151647,
16
+ "<|object_ref_start|>": 151646,
17
+ "<|quad_end|>": 151651,
18
+ "<|quad_start|>": 151650,
19
+ "<|repo_name|>": 151663,
20
+ "<|video_pad|>": 151656,
21
+ "<|vision_end|>": 151653,
22
+ "<|vision_pad|>": 151654,
23
+ "<|vision_start|>": 151652
24
+ }
checkpoint-320/chat_template.jinja ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system
2
+ You are a helpful assistant.<|im_end|>
3
+ {% endif %}<|im_start|>{{ message['role'] }}
4
+ {% if message['content'] is string %}{{ message['content'] }}<|im_end|>
5
+ {% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>
6
+ {% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant
7
+ {% endif %}
checkpoint-320/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-320/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:75dcacc1adf3aa4ab6c7c8b31873a37f7e0cc4f47b760b246fdd98985afe218e
3
+ size 40466443
checkpoint-320/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:819b28a03a83ccada450e3d740a066905c699b36484adb52e93a4911c6ad9bc4
3
+ size 14645
checkpoint-320/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1a2a804a95a4754b7694d7b5887b5c0f89e1280617b9486e1f9fd471f0f33b52
3
+ size 1465
checkpoint-320/special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
16
+ ],
17
+ "eos_token": {
18
+ "content": "<|im_end|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "pad_token": {
25
+ "content": "<|endoftext|>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ }
31
+ }
checkpoint-320/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9c5ae00e602b8860cbd784ba82a8aa14e8feecec692e7076590d014d7b7fdafa
3
+ size 11421896
checkpoint-320/tokenizer_config.json ADDED
@@ -0,0 +1,207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ }
181
+ },
182
+ "additional_special_tokens": [
183
+ "<|im_start|>",
184
+ "<|im_end|>",
185
+ "<|object_ref_start|>",
186
+ "<|object_ref_end|>",
187
+ "<|box_start|>",
188
+ "<|box_end|>",
189
+ "<|quad_start|>",
190
+ "<|quad_end|>",
191
+ "<|vision_start|>",
192
+ "<|vision_end|>",
193
+ "<|vision_pad|>",
194
+ "<|image_pad|>",
195
+ "<|video_pad|>"
196
+ ],
197
+ "bos_token": null,
198
+ "clean_up_tokenization_spaces": false,
199
+ "eos_token": "<|im_end|>",
200
+ "errors": "replace",
201
+ "extra_special_tokens": {},
202
+ "model_max_length": 131072,
203
+ "pad_token": "<|endoftext|>",
204
+ "split_special_tokens": false,
205
+ "tokenizer_class": "Qwen2Tokenizer",
206
+ "unk_token": null
207
+ }
checkpoint-320/trainer_state.json ADDED
@@ -0,0 +1,2923 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": 200,
3
+ "best_metric": 0.07848864793777466,
4
+ "best_model_checkpoint": "sft_prefill/prompt_id_3/qwen2-VL-7B-Instruct-syn-count-lora/checkpoint-200",
5
+ "epoch": 2.0,
6
+ "eval_steps": 200,
7
+ "global_step": 320,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.00625,
14
+ "grad_norm": 0.9298801422119141,
15
+ "learning_rate": 0.0002,
16
+ "loss": 0.4917,
17
+ "mean_token_accuracy": 0.6875,
18
+ "num_tokens": 125296.0,
19
+ "step": 1
20
+ },
21
+ {
22
+ "epoch": 0.0125,
23
+ "grad_norm": 0.8382226824760437,
24
+ "learning_rate": 0.0002,
25
+ "loss": 0.6049,
26
+ "mean_token_accuracy": 0.75,
27
+ "num_tokens": 128352.0,
28
+ "step": 2
29
+ },
30
+ {
31
+ "epoch": 0.01875,
32
+ "grad_norm": 0.9454999566078186,
33
+ "learning_rate": 0.0002,
34
+ "loss": 0.6407,
35
+ "mean_token_accuracy": 0.65625,
36
+ "num_tokens": 131408.0,
37
+ "step": 3
38
+ },
39
+ {
40
+ "epoch": 0.025,
41
+ "grad_norm": 0.9727649092674255,
42
+ "learning_rate": 0.0002,
43
+ "loss": 0.3972,
44
+ "mean_token_accuracy": 0.90625,
45
+ "num_tokens": 134464.0,
46
+ "step": 4
47
+ },
48
+ {
49
+ "epoch": 0.03125,
50
+ "grad_norm": 0.785910964012146,
51
+ "learning_rate": 0.0002,
52
+ "loss": 0.453,
53
+ "mean_token_accuracy": 0.84375,
54
+ "num_tokens": 137520.0,
55
+ "step": 5
56
+ },
57
+ {
58
+ "epoch": 0.0375,
59
+ "grad_norm": 0.9556392431259155,
60
+ "learning_rate": 0.0002,
61
+ "loss": 0.418,
62
+ "mean_token_accuracy": 0.8125,
63
+ "num_tokens": 140576.0,
64
+ "step": 6
65
+ },
66
+ {
67
+ "epoch": 0.04375,
68
+ "grad_norm": 1.160456657409668,
69
+ "learning_rate": 0.0002,
70
+ "loss": 0.303,
71
+ "mean_token_accuracy": 0.875,
72
+ "num_tokens": 143632.0,
73
+ "step": 7
74
+ },
75
+ {
76
+ "epoch": 0.05,
77
+ "grad_norm": 1.127308964729309,
78
+ "learning_rate": 0.0002,
79
+ "loss": 0.1847,
80
+ "mean_token_accuracy": 0.9375,
81
+ "num_tokens": 146688.0,
82
+ "step": 8
83
+ },
84
+ {
85
+ "epoch": 0.05625,
86
+ "grad_norm": 1.7690690755844116,
87
+ "learning_rate": 0.0002,
88
+ "loss": 0.3514,
89
+ "mean_token_accuracy": 0.84375,
90
+ "num_tokens": 149744.0,
91
+ "step": 9
92
+ },
93
+ {
94
+ "epoch": 0.0625,
95
+ "grad_norm": 0.8747241497039795,
96
+ "learning_rate": 0.0002,
97
+ "loss": 0.2434,
98
+ "mean_token_accuracy": 0.84375,
99
+ "num_tokens": 152800.0,
100
+ "step": 10
101
+ },
102
+ {
103
+ "epoch": 0.06875,
104
+ "grad_norm": 0.7658981084823608,
105
+ "learning_rate": 0.0002,
106
+ "loss": 0.1577,
107
+ "mean_token_accuracy": 0.9375,
108
+ "num_tokens": 155856.0,
109
+ "step": 11
110
+ },
111
+ {
112
+ "epoch": 0.075,
113
+ "grad_norm": 0.6385300755500793,
114
+ "learning_rate": 0.0002,
115
+ "loss": 0.209,
116
+ "mean_token_accuracy": 0.875,
117
+ "num_tokens": 158912.0,
118
+ "step": 12
119
+ },
120
+ {
121
+ "epoch": 0.08125,
122
+ "grad_norm": 1.2857609987258911,
123
+ "learning_rate": 0.0002,
124
+ "loss": 0.2123,
125
+ "mean_token_accuracy": 0.875,
126
+ "num_tokens": 161968.0,
127
+ "step": 13
128
+ },
129
+ {
130
+ "epoch": 0.0875,
131
+ "grad_norm": 0.5603316426277161,
132
+ "learning_rate": 0.0002,
133
+ "loss": 0.2712,
134
+ "mean_token_accuracy": 0.96875,
135
+ "num_tokens": 165024.0,
136
+ "step": 14
137
+ },
138
+ {
139
+ "epoch": 0.09375,
140
+ "grad_norm": 1.0855859518051147,
141
+ "learning_rate": 0.0002,
142
+ "loss": 0.2975,
143
+ "mean_token_accuracy": 0.875,
144
+ "num_tokens": 168080.0,
145
+ "step": 15
146
+ },
147
+ {
148
+ "epoch": 0.1,
149
+ "grad_norm": 2.589662790298462,
150
+ "learning_rate": 0.0002,
151
+ "loss": 0.3977,
152
+ "mean_token_accuracy": 0.78125,
153
+ "num_tokens": 171136.0,
154
+ "step": 16
155
+ },
156
+ {
157
+ "epoch": 0.10625,
158
+ "grad_norm": 1.242841124534607,
159
+ "learning_rate": 0.0002,
160
+ "loss": 0.3933,
161
+ "mean_token_accuracy": 0.90625,
162
+ "num_tokens": 174192.0,
163
+ "step": 17
164
+ },
165
+ {
166
+ "epoch": 0.1125,
167
+ "grad_norm": 1.0371448993682861,
168
+ "learning_rate": 0.0002,
169
+ "loss": 0.2866,
170
+ "mean_token_accuracy": 0.875,
171
+ "num_tokens": 177248.0,
172
+ "step": 18
173
+ },
174
+ {
175
+ "epoch": 0.11875,
176
+ "grad_norm": 2.609177827835083,
177
+ "learning_rate": 0.0002,
178
+ "loss": 0.4174,
179
+ "mean_token_accuracy": 0.75,
180
+ "num_tokens": 180304.0,
181
+ "step": 19
182
+ },
183
+ {
184
+ "epoch": 0.125,
185
+ "grad_norm": 1.229350209236145,
186
+ "learning_rate": 0.0002,
187
+ "loss": 0.3588,
188
+ "mean_token_accuracy": 0.84375,
189
+ "num_tokens": 183360.0,
190
+ "step": 20
191
+ },
192
+ {
193
+ "epoch": 0.13125,
194
+ "grad_norm": 1.2149115800857544,
195
+ "learning_rate": 0.0002,
196
+ "loss": 0.1919,
197
+ "mean_token_accuracy": 0.9375,
198
+ "num_tokens": 186416.0,
199
+ "step": 21
200
+ },
201
+ {
202
+ "epoch": 0.1375,
203
+ "grad_norm": 3.0612738132476807,
204
+ "learning_rate": 0.0002,
205
+ "loss": 0.475,
206
+ "mean_token_accuracy": 0.84375,
207
+ "num_tokens": 189472.0,
208
+ "step": 22
209
+ },
210
+ {
211
+ "epoch": 0.14375,
212
+ "grad_norm": 1.1256693601608276,
213
+ "learning_rate": 0.0002,
214
+ "loss": 0.223,
215
+ "mean_token_accuracy": 0.9375,
216
+ "num_tokens": 192528.0,
217
+ "step": 23
218
+ },
219
+ {
220
+ "epoch": 0.15,
221
+ "grad_norm": 1.5483283996582031,
222
+ "learning_rate": 0.0002,
223
+ "loss": 0.3825,
224
+ "mean_token_accuracy": 0.78125,
225
+ "num_tokens": 195584.0,
226
+ "step": 24
227
+ },
228
+ {
229
+ "epoch": 0.15625,
230
+ "grad_norm": 0.8419892191886902,
231
+ "learning_rate": 0.0002,
232
+ "loss": 0.1645,
233
+ "mean_token_accuracy": 0.875,
234
+ "num_tokens": 198640.0,
235
+ "step": 25
236
+ },
237
+ {
238
+ "epoch": 0.1625,
239
+ "grad_norm": 1.5078842639923096,
240
+ "learning_rate": 0.0002,
241
+ "loss": 0.2517,
242
+ "mean_token_accuracy": 0.9375,
243
+ "num_tokens": 201696.0,
244
+ "step": 26
245
+ },
246
+ {
247
+ "epoch": 0.16875,
248
+ "grad_norm": 0.7717946171760559,
249
+ "learning_rate": 0.0002,
250
+ "loss": 0.1865,
251
+ "mean_token_accuracy": 0.9375,
252
+ "num_tokens": 204752.0,
253
+ "step": 27
254
+ },
255
+ {
256
+ "epoch": 0.175,
257
+ "grad_norm": 0.7887358069419861,
258
+ "learning_rate": 0.0002,
259
+ "loss": 0.2887,
260
+ "mean_token_accuracy": 0.90625,
261
+ "num_tokens": 207808.0,
262
+ "step": 28
263
+ },
264
+ {
265
+ "epoch": 0.18125,
266
+ "grad_norm": 1.0774492025375366,
267
+ "learning_rate": 0.0002,
268
+ "loss": 0.2154,
269
+ "mean_token_accuracy": 0.90625,
270
+ "num_tokens": 210864.0,
271
+ "step": 29
272
+ },
273
+ {
274
+ "epoch": 0.1875,
275
+ "grad_norm": 1.2108778953552246,
276
+ "learning_rate": 0.0002,
277
+ "loss": 0.3605,
278
+ "mean_token_accuracy": 0.84375,
279
+ "num_tokens": 213920.0,
280
+ "step": 30
281
+ },
282
+ {
283
+ "epoch": 0.19375,
284
+ "grad_norm": 0.4569832980632782,
285
+ "learning_rate": 0.0002,
286
+ "loss": 0.1299,
287
+ "mean_token_accuracy": 1.0,
288
+ "num_tokens": 216976.0,
289
+ "step": 31
290
+ },
291
+ {
292
+ "epoch": 0.2,
293
+ "grad_norm": 0.789750337600708,
294
+ "learning_rate": 0.0002,
295
+ "loss": 0.1772,
296
+ "mean_token_accuracy": 0.96875,
297
+ "num_tokens": 220032.0,
298
+ "step": 32
299
+ },
300
+ {
301
+ "epoch": 0.20625,
302
+ "grad_norm": 1.5008147954940796,
303
+ "learning_rate": 0.0002,
304
+ "loss": 0.3281,
305
+ "mean_token_accuracy": 0.84375,
306
+ "num_tokens": 223088.0,
307
+ "step": 33
308
+ },
309
+ {
310
+ "epoch": 0.2125,
311
+ "grad_norm": 1.082711935043335,
312
+ "learning_rate": 0.0002,
313
+ "loss": 0.1973,
314
+ "mean_token_accuracy": 0.9375,
315
+ "num_tokens": 226144.0,
316
+ "step": 34
317
+ },
318
+ {
319
+ "epoch": 0.21875,
320
+ "grad_norm": 0.6501540541648865,
321
+ "learning_rate": 0.0002,
322
+ "loss": 0.2645,
323
+ "mean_token_accuracy": 0.9375,
324
+ "num_tokens": 229200.0,
325
+ "step": 35
326
+ },
327
+ {
328
+ "epoch": 0.225,
329
+ "grad_norm": 0.6396588087081909,
330
+ "learning_rate": 0.0002,
331
+ "loss": 0.1938,
332
+ "mean_token_accuracy": 0.90625,
333
+ "num_tokens": 232256.0,
334
+ "step": 36
335
+ },
336
+ {
337
+ "epoch": 0.23125,
338
+ "grad_norm": 0.7765557169914246,
339
+ "learning_rate": 0.0002,
340
+ "loss": 0.1577,
341
+ "mean_token_accuracy": 0.90625,
342
+ "num_tokens": 235312.0,
343
+ "step": 37
344
+ },
345
+ {
346
+ "epoch": 0.2375,
347
+ "grad_norm": 1.0910521745681763,
348
+ "learning_rate": 0.0002,
349
+ "loss": 0.1917,
350
+ "mean_token_accuracy": 0.875,
351
+ "num_tokens": 238368.0,
352
+ "step": 38
353
+ },
354
+ {
355
+ "epoch": 0.24375,
356
+ "grad_norm": 0.7262992262840271,
357
+ "learning_rate": 0.0002,
358
+ "loss": 0.0703,
359
+ "mean_token_accuracy": 1.0,
360
+ "num_tokens": 241424.0,
361
+ "step": 39
362
+ },
363
+ {
364
+ "epoch": 0.25,
365
+ "grad_norm": 0.598296582698822,
366
+ "learning_rate": 0.0002,
367
+ "loss": 0.1402,
368
+ "mean_token_accuracy": 0.96875,
369
+ "num_tokens": 244480.0,
370
+ "step": 40
371
+ },
372
+ {
373
+ "epoch": 0.25625,
374
+ "grad_norm": 1.0417550802230835,
375
+ "learning_rate": 0.0002,
376
+ "loss": 0.1274,
377
+ "mean_token_accuracy": 0.96875,
378
+ "num_tokens": 247536.0,
379
+ "step": 41
380
+ },
381
+ {
382
+ "epoch": 0.2625,
383
+ "grad_norm": 1.5009475946426392,
384
+ "learning_rate": 0.0002,
385
+ "loss": 0.1871,
386
+ "mean_token_accuracy": 0.9375,
387
+ "num_tokens": 250592.0,
388
+ "step": 42
389
+ },
390
+ {
391
+ "epoch": 0.26875,
392
+ "grad_norm": 2.052367925643921,
393
+ "learning_rate": 0.0002,
394
+ "loss": 0.2981,
395
+ "mean_token_accuracy": 0.875,
396
+ "num_tokens": 253648.0,
397
+ "step": 43
398
+ },
399
+ {
400
+ "epoch": 0.275,
401
+ "grad_norm": 0.6979355216026306,
402
+ "learning_rate": 0.0002,
403
+ "loss": 0.0962,
404
+ "mean_token_accuracy": 0.96875,
405
+ "num_tokens": 256704.0,
406
+ "step": 44
407
+ },
408
+ {
409
+ "epoch": 0.28125,
410
+ "grad_norm": 1.2591170072555542,
411
+ "learning_rate": 0.0002,
412
+ "loss": 0.1616,
413
+ "mean_token_accuracy": 0.9375,
414
+ "num_tokens": 259760.0,
415
+ "step": 45
416
+ },
417
+ {
418
+ "epoch": 0.2875,
419
+ "grad_norm": 0.5454805493354797,
420
+ "learning_rate": 0.0002,
421
+ "loss": 0.0614,
422
+ "mean_token_accuracy": 0.96875,
423
+ "num_tokens": 262816.0,
424
+ "step": 46
425
+ },
426
+ {
427
+ "epoch": 0.29375,
428
+ "grad_norm": 1.3832871913909912,
429
+ "learning_rate": 0.0002,
430
+ "loss": 0.1468,
431
+ "mean_token_accuracy": 0.96875,
432
+ "num_tokens": 265872.0,
433
+ "step": 47
434
+ },
435
+ {
436
+ "epoch": 0.3,
437
+ "grad_norm": 0.5263135433197021,
438
+ "learning_rate": 0.0002,
439
+ "loss": 0.0482,
440
+ "mean_token_accuracy": 1.0,
441
+ "num_tokens": 268928.0,
442
+ "step": 48
443
+ },
444
+ {
445
+ "epoch": 0.30625,
446
+ "grad_norm": 1.7050193548202515,
447
+ "learning_rate": 0.0002,
448
+ "loss": 0.1177,
449
+ "mean_token_accuracy": 0.9375,
450
+ "num_tokens": 271984.0,
451
+ "step": 49
452
+ },
453
+ {
454
+ "epoch": 0.3125,
455
+ "grad_norm": 1.4185047149658203,
456
+ "learning_rate": 0.0002,
457
+ "loss": 0.0788,
458
+ "mean_token_accuracy": 0.96875,
459
+ "num_tokens": 275040.0,
460
+ "step": 50
461
+ },
462
+ {
463
+ "epoch": 0.31875,
464
+ "grad_norm": 1.192116379737854,
465
+ "learning_rate": 0.0002,
466
+ "loss": 0.1786,
467
+ "mean_token_accuracy": 0.90625,
468
+ "num_tokens": 278096.0,
469
+ "step": 51
470
+ },
471
+ {
472
+ "epoch": 0.325,
473
+ "grad_norm": 3.244797945022583,
474
+ "learning_rate": 0.0002,
475
+ "loss": 0.1685,
476
+ "mean_token_accuracy": 0.9375,
477
+ "num_tokens": 281152.0,
478
+ "step": 52
479
+ },
480
+ {
481
+ "epoch": 0.33125,
482
+ "grad_norm": 2.192440986633301,
483
+ "learning_rate": 0.0002,
484
+ "loss": 0.1178,
485
+ "mean_token_accuracy": 0.9375,
486
+ "num_tokens": 284208.0,
487
+ "step": 53
488
+ },
489
+ {
490
+ "epoch": 0.3375,
491
+ "grad_norm": 2.3711836338043213,
492
+ "learning_rate": 0.0002,
493
+ "loss": 0.1016,
494
+ "mean_token_accuracy": 0.9375,
495
+ "num_tokens": 287264.0,
496
+ "step": 54
497
+ },
498
+ {
499
+ "epoch": 0.34375,
500
+ "grad_norm": 2.2536869049072266,
501
+ "learning_rate": 0.0002,
502
+ "loss": 0.1223,
503
+ "mean_token_accuracy": 0.9375,
504
+ "num_tokens": 290320.0,
505
+ "step": 55
506
+ },
507
+ {
508
+ "epoch": 0.35,
509
+ "grad_norm": 1.4965548515319824,
510
+ "learning_rate": 0.0002,
511
+ "loss": 0.0331,
512
+ "mean_token_accuracy": 0.96875,
513
+ "num_tokens": 293376.0,
514
+ "step": 56
515
+ },
516
+ {
517
+ "epoch": 0.35625,
518
+ "grad_norm": 2.7128384113311768,
519
+ "learning_rate": 0.0002,
520
+ "loss": 0.1314,
521
+ "mean_token_accuracy": 0.9375,
522
+ "num_tokens": 296432.0,
523
+ "step": 57
524
+ },
525
+ {
526
+ "epoch": 0.3625,
527
+ "grad_norm": 1.1565158367156982,
528
+ "learning_rate": 0.0002,
529
+ "loss": 0.0504,
530
+ "mean_token_accuracy": 0.96875,
531
+ "num_tokens": 299488.0,
532
+ "step": 58
533
+ },
534
+ {
535
+ "epoch": 0.36875,
536
+ "grad_norm": 1.363576054573059,
537
+ "learning_rate": 0.0002,
538
+ "loss": 0.0327,
539
+ "mean_token_accuracy": 1.0,
540
+ "num_tokens": 302544.0,
541
+ "step": 59
542
+ },
543
+ {
544
+ "epoch": 0.375,
545
+ "grad_norm": 1.7460052967071533,
546
+ "learning_rate": 0.0002,
547
+ "loss": 0.0766,
548
+ "mean_token_accuracy": 0.96875,
549
+ "num_tokens": 305600.0,
550
+ "step": 60
551
+ },
552
+ {
553
+ "epoch": 0.38125,
554
+ "grad_norm": 0.4773832857608795,
555
+ "learning_rate": 0.0002,
556
+ "loss": 0.0167,
557
+ "mean_token_accuracy": 1.0,
558
+ "num_tokens": 308656.0,
559
+ "step": 61
560
+ },
561
+ {
562
+ "epoch": 0.3875,
563
+ "grad_norm": 0.5730915665626526,
564
+ "learning_rate": 0.0002,
565
+ "loss": 0.0176,
566
+ "mean_token_accuracy": 1.0,
567
+ "num_tokens": 311712.0,
568
+ "step": 62
569
+ },
570
+ {
571
+ "epoch": 0.39375,
572
+ "grad_norm": 3.6676278114318848,
573
+ "learning_rate": 0.0002,
574
+ "loss": 0.1523,
575
+ "mean_token_accuracy": 0.9375,
576
+ "num_tokens": 314768.0,
577
+ "step": 63
578
+ },
579
+ {
580
+ "epoch": 0.4,
581
+ "grad_norm": 1.425026535987854,
582
+ "learning_rate": 0.0002,
583
+ "loss": 0.0172,
584
+ "mean_token_accuracy": 1.0,
585
+ "num_tokens": 317824.0,
586
+ "step": 64
587
+ },
588
+ {
589
+ "epoch": 0.40625,
590
+ "grad_norm": 2.4328842163085938,
591
+ "learning_rate": 0.0002,
592
+ "loss": 0.1337,
593
+ "mean_token_accuracy": 0.96875,
594
+ "num_tokens": 320880.0,
595
+ "step": 65
596
+ },
597
+ {
598
+ "epoch": 0.4125,
599
+ "grad_norm": 2.682482957839966,
600
+ "learning_rate": 0.0002,
601
+ "loss": 0.18,
602
+ "mean_token_accuracy": 0.96875,
603
+ "num_tokens": 323936.0,
604
+ "step": 66
605
+ },
606
+ {
607
+ "epoch": 0.41875,
608
+ "grad_norm": 0.41104015707969666,
609
+ "learning_rate": 0.0002,
610
+ "loss": 0.0076,
611
+ "mean_token_accuracy": 1.0,
612
+ "num_tokens": 326992.0,
613
+ "step": 67
614
+ },
615
+ {
616
+ "epoch": 0.425,
617
+ "grad_norm": 0.22206450998783112,
618
+ "learning_rate": 0.0002,
619
+ "loss": 0.0051,
620
+ "mean_token_accuracy": 1.0,
621
+ "num_tokens": 330048.0,
622
+ "step": 68
623
+ },
624
+ {
625
+ "epoch": 0.43125,
626
+ "grad_norm": 1.8949733972549438,
627
+ "learning_rate": 0.0002,
628
+ "loss": 0.0408,
629
+ "mean_token_accuracy": 0.96875,
630
+ "num_tokens": 333104.0,
631
+ "step": 69
632
+ },
633
+ {
634
+ "epoch": 0.4375,
635
+ "grad_norm": 3.621080160140991,
636
+ "learning_rate": 0.0002,
637
+ "loss": 0.0744,
638
+ "mean_token_accuracy": 0.9375,
639
+ "num_tokens": 336160.0,
640
+ "step": 70
641
+ },
642
+ {
643
+ "epoch": 0.44375,
644
+ "grad_norm": 0.6198443174362183,
645
+ "learning_rate": 0.0002,
646
+ "loss": 0.0123,
647
+ "mean_token_accuracy": 1.0,
648
+ "num_tokens": 339216.0,
649
+ "step": 71
650
+ },
651
+ {
652
+ "epoch": 0.45,
653
+ "grad_norm": 0.46570342779159546,
654
+ "learning_rate": 0.0002,
655
+ "loss": 0.006,
656
+ "mean_token_accuracy": 1.0,
657
+ "num_tokens": 342272.0,
658
+ "step": 72
659
+ },
660
+ {
661
+ "epoch": 0.45625,
662
+ "grad_norm": 4.823789596557617,
663
+ "learning_rate": 0.0002,
664
+ "loss": 0.3016,
665
+ "mean_token_accuracy": 0.9375,
666
+ "num_tokens": 345328.0,
667
+ "step": 73
668
+ },
669
+ {
670
+ "epoch": 0.4625,
671
+ "grad_norm": 4.304416179656982,
672
+ "learning_rate": 0.0002,
673
+ "loss": 0.1201,
674
+ "mean_token_accuracy": 0.96875,
675
+ "num_tokens": 348384.0,
676
+ "step": 74
677
+ },
678
+ {
679
+ "epoch": 0.46875,
680
+ "grad_norm": 1.780882716178894,
681
+ "learning_rate": 0.0002,
682
+ "loss": 0.037,
683
+ "mean_token_accuracy": 0.96875,
684
+ "num_tokens": 351440.0,
685
+ "step": 75
686
+ },
687
+ {
688
+ "epoch": 0.475,
689
+ "grad_norm": 1.6134687662124634,
690
+ "learning_rate": 0.0002,
691
+ "loss": 0.0413,
692
+ "mean_token_accuracy": 0.96875,
693
+ "num_tokens": 354496.0,
694
+ "step": 76
695
+ },
696
+ {
697
+ "epoch": 0.48125,
698
+ "grad_norm": 3.6872904300689697,
699
+ "learning_rate": 0.0002,
700
+ "loss": 0.0697,
701
+ "mean_token_accuracy": 0.96875,
702
+ "num_tokens": 357552.0,
703
+ "step": 77
704
+ },
705
+ {
706
+ "epoch": 0.4875,
707
+ "grad_norm": 3.585052728652954,
708
+ "learning_rate": 0.0002,
709
+ "loss": 0.0485,
710
+ "mean_token_accuracy": 0.96875,
711
+ "num_tokens": 360608.0,
712
+ "step": 78
713
+ },
714
+ {
715
+ "epoch": 0.49375,
716
+ "grad_norm": 3.138762950897217,
717
+ "learning_rate": 0.0002,
718
+ "loss": 0.0686,
719
+ "mean_token_accuracy": 0.96875,
720
+ "num_tokens": 363664.0,
721
+ "step": 79
722
+ },
723
+ {
724
+ "epoch": 0.5,
725
+ "grad_norm": 3.895049571990967,
726
+ "learning_rate": 0.0002,
727
+ "loss": 0.0649,
728
+ "mean_token_accuracy": 0.96875,
729
+ "num_tokens": 366720.0,
730
+ "step": 80
731
+ },
732
+ {
733
+ "epoch": 0.50625,
734
+ "grad_norm": 3.5866286754608154,
735
+ "learning_rate": 0.0002,
736
+ "loss": 0.1002,
737
+ "mean_token_accuracy": 0.96875,
738
+ "num_tokens": 369776.0,
739
+ "step": 81
740
+ },
741
+ {
742
+ "epoch": 0.5125,
743
+ "grad_norm": 2.039658546447754,
744
+ "learning_rate": 0.0002,
745
+ "loss": 0.0235,
746
+ "mean_token_accuracy": 1.0,
747
+ "num_tokens": 372832.0,
748
+ "step": 82
749
+ },
750
+ {
751
+ "epoch": 0.51875,
752
+ "grad_norm": 5.831839084625244,
753
+ "learning_rate": 0.0002,
754
+ "loss": 0.1955,
755
+ "mean_token_accuracy": 0.9375,
756
+ "num_tokens": 375888.0,
757
+ "step": 83
758
+ },
759
+ {
760
+ "epoch": 0.525,
761
+ "grad_norm": 3.6485228538513184,
762
+ "learning_rate": 0.0002,
763
+ "loss": 0.1692,
764
+ "mean_token_accuracy": 0.96875,
765
+ "num_tokens": 378944.0,
766
+ "step": 84
767
+ },
768
+ {
769
+ "epoch": 0.53125,
770
+ "grad_norm": 1.0485862493515015,
771
+ "learning_rate": 0.0002,
772
+ "loss": 0.0178,
773
+ "mean_token_accuracy": 1.0,
774
+ "num_tokens": 382000.0,
775
+ "step": 85
776
+ },
777
+ {
778
+ "epoch": 0.5375,
779
+ "grad_norm": 2.2051405906677246,
780
+ "learning_rate": 0.0002,
781
+ "loss": 0.0282,
782
+ "mean_token_accuracy": 1.0,
783
+ "num_tokens": 385056.0,
784
+ "step": 86
785
+ },
786
+ {
787
+ "epoch": 0.54375,
788
+ "grad_norm": 1.2796984910964966,
789
+ "learning_rate": 0.0002,
790
+ "loss": 0.0191,
791
+ "mean_token_accuracy": 1.0,
792
+ "num_tokens": 388112.0,
793
+ "step": 87
794
+ },
795
+ {
796
+ "epoch": 0.55,
797
+ "grad_norm": 1.5338424444198608,
798
+ "learning_rate": 0.0002,
799
+ "loss": 0.0177,
800
+ "mean_token_accuracy": 1.0,
801
+ "num_tokens": 391168.0,
802
+ "step": 88
803
+ },
804
+ {
805
+ "epoch": 0.55625,
806
+ "grad_norm": 1.2561614513397217,
807
+ "learning_rate": 0.0002,
808
+ "loss": 0.0207,
809
+ "mean_token_accuracy": 1.0,
810
+ "num_tokens": 394224.0,
811
+ "step": 89
812
+ },
813
+ {
814
+ "epoch": 0.5625,
815
+ "grad_norm": 0.8224933743476868,
816
+ "learning_rate": 0.0002,
817
+ "loss": 0.0082,
818
+ "mean_token_accuracy": 1.0,
819
+ "num_tokens": 397280.0,
820
+ "step": 90
821
+ },
822
+ {
823
+ "epoch": 0.56875,
824
+ "grad_norm": 3.2320916652679443,
825
+ "learning_rate": 0.0002,
826
+ "loss": 0.0583,
827
+ "mean_token_accuracy": 0.96875,
828
+ "num_tokens": 400336.0,
829
+ "step": 91
830
+ },
831
+ {
832
+ "epoch": 0.575,
833
+ "grad_norm": 7.057133197784424,
834
+ "learning_rate": 0.0002,
835
+ "loss": 0.259,
836
+ "mean_token_accuracy": 0.9375,
837
+ "num_tokens": 403392.0,
838
+ "step": 92
839
+ },
840
+ {
841
+ "epoch": 0.58125,
842
+ "grad_norm": 2.365320920944214,
843
+ "learning_rate": 0.0002,
844
+ "loss": 0.0295,
845
+ "mean_token_accuracy": 1.0,
846
+ "num_tokens": 406448.0,
847
+ "step": 93
848
+ },
849
+ {
850
+ "epoch": 0.5875,
851
+ "grad_norm": 0.06329375505447388,
852
+ "learning_rate": 0.0002,
853
+ "loss": 0.0014,
854
+ "mean_token_accuracy": 1.0,
855
+ "num_tokens": 409504.0,
856
+ "step": 94
857
+ },
858
+ {
859
+ "epoch": 0.59375,
860
+ "grad_norm": 1.554870367050171,
861
+ "learning_rate": 0.0002,
862
+ "loss": 0.024,
863
+ "mean_token_accuracy": 0.96875,
864
+ "num_tokens": 412560.0,
865
+ "step": 95
866
+ },
867
+ {
868
+ "epoch": 0.6,
869
+ "grad_norm": 2.3296000957489014,
870
+ "learning_rate": 0.0002,
871
+ "loss": 0.0795,
872
+ "mean_token_accuracy": 0.96875,
873
+ "num_tokens": 415616.0,
874
+ "step": 96
875
+ },
876
+ {
877
+ "epoch": 0.60625,
878
+ "grad_norm": 6.749006271362305,
879
+ "learning_rate": 0.0002,
880
+ "loss": 0.1903,
881
+ "mean_token_accuracy": 0.90625,
882
+ "num_tokens": 418672.0,
883
+ "step": 97
884
+ },
885
+ {
886
+ "epoch": 0.6125,
887
+ "grad_norm": 10.803217887878418,
888
+ "learning_rate": 0.0002,
889
+ "loss": 0.2885,
890
+ "mean_token_accuracy": 0.90625,
891
+ "num_tokens": 421728.0,
892
+ "step": 98
893
+ },
894
+ {
895
+ "epoch": 0.61875,
896
+ "grad_norm": 2.8866775035858154,
897
+ "learning_rate": 0.0002,
898
+ "loss": 0.2661,
899
+ "mean_token_accuracy": 0.96875,
900
+ "num_tokens": 424784.0,
901
+ "step": 99
902
+ },
903
+ {
904
+ "epoch": 0.625,
905
+ "grad_norm": 3.800149917602539,
906
+ "learning_rate": 0.0002,
907
+ "loss": 0.0502,
908
+ "mean_token_accuracy": 1.0,
909
+ "num_tokens": 427840.0,
910
+ "step": 100
911
+ },
912
+ {
913
+ "epoch": 0.63125,
914
+ "grad_norm": 0.4669257402420044,
915
+ "learning_rate": 0.0002,
916
+ "loss": 0.0041,
917
+ "mean_token_accuracy": 1.0,
918
+ "num_tokens": 430896.0,
919
+ "step": 101
920
+ },
921
+ {
922
+ "epoch": 0.6375,
923
+ "grad_norm": 3.8480825424194336,
924
+ "learning_rate": 0.0002,
925
+ "loss": 0.0947,
926
+ "mean_token_accuracy": 0.9375,
927
+ "num_tokens": 433952.0,
928
+ "step": 102
929
+ },
930
+ {
931
+ "epoch": 0.64375,
932
+ "grad_norm": 4.139023303985596,
933
+ "learning_rate": 0.0002,
934
+ "loss": 0.1131,
935
+ "mean_token_accuracy": 0.96875,
936
+ "num_tokens": 437008.0,
937
+ "step": 103
938
+ },
939
+ {
940
+ "epoch": 0.65,
941
+ "grad_norm": 3.4455084800720215,
942
+ "learning_rate": 0.0002,
943
+ "loss": 0.1105,
944
+ "mean_token_accuracy": 0.9375,
945
+ "num_tokens": 440064.0,
946
+ "step": 104
947
+ },
948
+ {
949
+ "epoch": 0.65625,
950
+ "grad_norm": 2.1717028617858887,
951
+ "learning_rate": 0.0002,
952
+ "loss": 0.0794,
953
+ "mean_token_accuracy": 0.9375,
954
+ "num_tokens": 443120.0,
955
+ "step": 105
956
+ },
957
+ {
958
+ "epoch": 0.6625,
959
+ "grad_norm": 0.43985098600387573,
960
+ "learning_rate": 0.0002,
961
+ "loss": 0.0057,
962
+ "mean_token_accuracy": 1.0,
963
+ "num_tokens": 446176.0,
964
+ "step": 106
965
+ },
966
+ {
967
+ "epoch": 0.66875,
968
+ "grad_norm": 5.351183891296387,
969
+ "learning_rate": 0.0002,
970
+ "loss": 0.3953,
971
+ "mean_token_accuracy": 0.8125,
972
+ "num_tokens": 449232.0,
973
+ "step": 107
974
+ },
975
+ {
976
+ "epoch": 0.675,
977
+ "grad_norm": 4.6742048263549805,
978
+ "learning_rate": 0.0002,
979
+ "loss": 0.1099,
980
+ "mean_token_accuracy": 0.96875,
981
+ "num_tokens": 452288.0,
982
+ "step": 108
983
+ },
984
+ {
985
+ "epoch": 0.68125,
986
+ "grad_norm": 4.851898670196533,
987
+ "learning_rate": 0.0002,
988
+ "loss": 0.2198,
989
+ "mean_token_accuracy": 0.96875,
990
+ "num_tokens": 455344.0,
991
+ "step": 109
992
+ },
993
+ {
994
+ "epoch": 0.6875,
995
+ "grad_norm": 2.46817946434021,
996
+ "learning_rate": 0.0002,
997
+ "loss": 0.0586,
998
+ "mean_token_accuracy": 0.96875,
999
+ "num_tokens": 458400.0,
1000
+ "step": 110
1001
+ },
1002
+ {
1003
+ "epoch": 0.69375,
1004
+ "grad_norm": 5.507438659667969,
1005
+ "learning_rate": 0.0002,
1006
+ "loss": 0.3492,
1007
+ "mean_token_accuracy": 0.9375,
1008
+ "num_tokens": 461456.0,
1009
+ "step": 111
1010
+ },
1011
+ {
1012
+ "epoch": 0.7,
1013
+ "grad_norm": 0.21149688959121704,
1014
+ "learning_rate": 0.0002,
1015
+ "loss": 0.003,
1016
+ "mean_token_accuracy": 1.0,
1017
+ "num_tokens": 464512.0,
1018
+ "step": 112
1019
+ },
1020
+ {
1021
+ "epoch": 0.70625,
1022
+ "grad_norm": 2.1346304416656494,
1023
+ "learning_rate": 0.0002,
1024
+ "loss": 0.0365,
1025
+ "mean_token_accuracy": 1.0,
1026
+ "num_tokens": 467568.0,
1027
+ "step": 113
1028
+ },
1029
+ {
1030
+ "epoch": 0.7125,
1031
+ "grad_norm": 2.06246280670166,
1032
+ "learning_rate": 0.0002,
1033
+ "loss": 0.1228,
1034
+ "mean_token_accuracy": 0.96875,
1035
+ "num_tokens": 470624.0,
1036
+ "step": 114
1037
+ },
1038
+ {
1039
+ "epoch": 0.71875,
1040
+ "grad_norm": 1.1220771074295044,
1041
+ "learning_rate": 0.0002,
1042
+ "loss": 0.0174,
1043
+ "mean_token_accuracy": 1.0,
1044
+ "num_tokens": 473680.0,
1045
+ "step": 115
1046
+ },
1047
+ {
1048
+ "epoch": 0.725,
1049
+ "grad_norm": 1.283275842666626,
1050
+ "learning_rate": 0.0002,
1051
+ "loss": 0.0473,
1052
+ "mean_token_accuracy": 1.0,
1053
+ "num_tokens": 476736.0,
1054
+ "step": 116
1055
+ },
1056
+ {
1057
+ "epoch": 0.73125,
1058
+ "grad_norm": 3.0137429237365723,
1059
+ "learning_rate": 0.0002,
1060
+ "loss": 0.073,
1061
+ "mean_token_accuracy": 0.9375,
1062
+ "num_tokens": 479792.0,
1063
+ "step": 117
1064
+ },
1065
+ {
1066
+ "epoch": 0.7375,
1067
+ "grad_norm": 3.159397840499878,
1068
+ "learning_rate": 0.0002,
1069
+ "loss": 0.0736,
1070
+ "mean_token_accuracy": 0.96875,
1071
+ "num_tokens": 482848.0,
1072
+ "step": 118
1073
+ },
1074
+ {
1075
+ "epoch": 0.74375,
1076
+ "grad_norm": 4.265467643737793,
1077
+ "learning_rate": 0.0002,
1078
+ "loss": 0.3511,
1079
+ "mean_token_accuracy": 0.90625,
1080
+ "num_tokens": 485904.0,
1081
+ "step": 119
1082
+ },
1083
+ {
1084
+ "epoch": 0.75,
1085
+ "grad_norm": 4.885653495788574,
1086
+ "learning_rate": 0.0002,
1087
+ "loss": 0.1691,
1088
+ "mean_token_accuracy": 0.875,
1089
+ "num_tokens": 488960.0,
1090
+ "step": 120
1091
+ },
1092
+ {
1093
+ "epoch": 0.75625,
1094
+ "grad_norm": 5.490376949310303,
1095
+ "learning_rate": 0.0002,
1096
+ "loss": 0.2357,
1097
+ "mean_token_accuracy": 0.9375,
1098
+ "num_tokens": 492016.0,
1099
+ "step": 121
1100
+ },
1101
+ {
1102
+ "epoch": 0.7625,
1103
+ "grad_norm": 3.725234031677246,
1104
+ "learning_rate": 0.0002,
1105
+ "loss": 0.1983,
1106
+ "mean_token_accuracy": 0.9375,
1107
+ "num_tokens": 495072.0,
1108
+ "step": 122
1109
+ },
1110
+ {
1111
+ "epoch": 0.76875,
1112
+ "grad_norm": 1.6382085084915161,
1113
+ "learning_rate": 0.0002,
1114
+ "loss": 0.043,
1115
+ "mean_token_accuracy": 0.96875,
1116
+ "num_tokens": 498128.0,
1117
+ "step": 123
1118
+ },
1119
+ {
1120
+ "epoch": 0.775,
1121
+ "grad_norm": 0.3425021171569824,
1122
+ "learning_rate": 0.0002,
1123
+ "loss": 0.0069,
1124
+ "mean_token_accuracy": 1.0,
1125
+ "num_tokens": 501184.0,
1126
+ "step": 124
1127
+ },
1128
+ {
1129
+ "epoch": 0.78125,
1130
+ "grad_norm": 1.504239559173584,
1131
+ "learning_rate": 0.0002,
1132
+ "loss": 0.0348,
1133
+ "mean_token_accuracy": 1.0,
1134
+ "num_tokens": 504240.0,
1135
+ "step": 125
1136
+ },
1137
+ {
1138
+ "epoch": 0.7875,
1139
+ "grad_norm": 0.16000418365001678,
1140
+ "learning_rate": 0.0002,
1141
+ "loss": 0.0071,
1142
+ "mean_token_accuracy": 1.0,
1143
+ "num_tokens": 507296.0,
1144
+ "step": 126
1145
+ },
1146
+ {
1147
+ "epoch": 0.79375,
1148
+ "grad_norm": 0.48438990116119385,
1149
+ "learning_rate": 0.0002,
1150
+ "loss": 0.0153,
1151
+ "mean_token_accuracy": 1.0,
1152
+ "num_tokens": 510352.0,
1153
+ "step": 127
1154
+ },
1155
+ {
1156
+ "epoch": 0.8,
1157
+ "grad_norm": 0.3561486601829529,
1158
+ "learning_rate": 0.0002,
1159
+ "loss": 0.0071,
1160
+ "mean_token_accuracy": 1.0,
1161
+ "num_tokens": 513408.0,
1162
+ "step": 128
1163
+ },
1164
+ {
1165
+ "epoch": 0.80625,
1166
+ "grad_norm": 3.513791084289551,
1167
+ "learning_rate": 0.0002,
1168
+ "loss": 0.114,
1169
+ "mean_token_accuracy": 0.9375,
1170
+ "num_tokens": 516464.0,
1171
+ "step": 129
1172
+ },
1173
+ {
1174
+ "epoch": 0.8125,
1175
+ "grad_norm": 0.7476550340652466,
1176
+ "learning_rate": 0.0002,
1177
+ "loss": 0.027,
1178
+ "mean_token_accuracy": 1.0,
1179
+ "num_tokens": 519520.0,
1180
+ "step": 130
1181
+ },
1182
+ {
1183
+ "epoch": 0.81875,
1184
+ "grad_norm": 4.814530372619629,
1185
+ "learning_rate": 0.0002,
1186
+ "loss": 0.2155,
1187
+ "mean_token_accuracy": 0.9375,
1188
+ "num_tokens": 522576.0,
1189
+ "step": 131
1190
+ },
1191
+ {
1192
+ "epoch": 0.825,
1193
+ "grad_norm": 4.203996658325195,
1194
+ "learning_rate": 0.0002,
1195
+ "loss": 0.0732,
1196
+ "mean_token_accuracy": 0.9375,
1197
+ "num_tokens": 525632.0,
1198
+ "step": 132
1199
+ },
1200
+ {
1201
+ "epoch": 0.83125,
1202
+ "grad_norm": 3.171036958694458,
1203
+ "learning_rate": 0.0002,
1204
+ "loss": 0.0479,
1205
+ "mean_token_accuracy": 1.0,
1206
+ "num_tokens": 528688.0,
1207
+ "step": 133
1208
+ },
1209
+ {
1210
+ "epoch": 0.8375,
1211
+ "grad_norm": 3.515819549560547,
1212
+ "learning_rate": 0.0002,
1213
+ "loss": 0.1001,
1214
+ "mean_token_accuracy": 0.96875,
1215
+ "num_tokens": 531744.0,
1216
+ "step": 134
1217
+ },
1218
+ {
1219
+ "epoch": 0.84375,
1220
+ "grad_norm": 1.6061947345733643,
1221
+ "learning_rate": 0.0002,
1222
+ "loss": 0.0795,
1223
+ "mean_token_accuracy": 0.96875,
1224
+ "num_tokens": 534800.0,
1225
+ "step": 135
1226
+ },
1227
+ {
1228
+ "epoch": 0.85,
1229
+ "grad_norm": 0.5217976570129395,
1230
+ "learning_rate": 0.0002,
1231
+ "loss": 0.0143,
1232
+ "mean_token_accuracy": 1.0,
1233
+ "num_tokens": 537856.0,
1234
+ "step": 136
1235
+ },
1236
+ {
1237
+ "epoch": 0.85625,
1238
+ "grad_norm": 1.6569266319274902,
1239
+ "learning_rate": 0.0002,
1240
+ "loss": 0.025,
1241
+ "mean_token_accuracy": 1.0,
1242
+ "num_tokens": 540912.0,
1243
+ "step": 137
1244
+ },
1245
+ {
1246
+ "epoch": 0.8625,
1247
+ "grad_norm": 2.0939252376556396,
1248
+ "learning_rate": 0.0002,
1249
+ "loss": 0.0375,
1250
+ "mean_token_accuracy": 0.96875,
1251
+ "num_tokens": 543968.0,
1252
+ "step": 138
1253
+ },
1254
+ {
1255
+ "epoch": 0.86875,
1256
+ "grad_norm": 1.8286008834838867,
1257
+ "learning_rate": 0.0002,
1258
+ "loss": 0.0464,
1259
+ "mean_token_accuracy": 1.0,
1260
+ "num_tokens": 547024.0,
1261
+ "step": 139
1262
+ },
1263
+ {
1264
+ "epoch": 0.875,
1265
+ "grad_norm": 6.246391296386719,
1266
+ "learning_rate": 0.0002,
1267
+ "loss": 0.135,
1268
+ "mean_token_accuracy": 0.9375,
1269
+ "num_tokens": 550080.0,
1270
+ "step": 140
1271
+ },
1272
+ {
1273
+ "epoch": 0.88125,
1274
+ "grad_norm": 0.9313927292823792,
1275
+ "learning_rate": 0.0002,
1276
+ "loss": 0.0437,
1277
+ "mean_token_accuracy": 1.0,
1278
+ "num_tokens": 553136.0,
1279
+ "step": 141
1280
+ },
1281
+ {
1282
+ "epoch": 0.8875,
1283
+ "grad_norm": 3.962883472442627,
1284
+ "learning_rate": 0.0002,
1285
+ "loss": 0.1026,
1286
+ "mean_token_accuracy": 0.9375,
1287
+ "num_tokens": 556192.0,
1288
+ "step": 142
1289
+ },
1290
+ {
1291
+ "epoch": 0.89375,
1292
+ "grad_norm": 4.730908393859863,
1293
+ "learning_rate": 0.0002,
1294
+ "loss": 0.104,
1295
+ "mean_token_accuracy": 0.90625,
1296
+ "num_tokens": 559248.0,
1297
+ "step": 143
1298
+ },
1299
+ {
1300
+ "epoch": 0.9,
1301
+ "grad_norm": 4.035704135894775,
1302
+ "learning_rate": 0.0002,
1303
+ "loss": 0.16,
1304
+ "mean_token_accuracy": 0.96875,
1305
+ "num_tokens": 562304.0,
1306
+ "step": 144
1307
+ },
1308
+ {
1309
+ "epoch": 0.90625,
1310
+ "grad_norm": 5.092385768890381,
1311
+ "learning_rate": 0.0002,
1312
+ "loss": 0.1841,
1313
+ "mean_token_accuracy": 0.9375,
1314
+ "num_tokens": 565360.0,
1315
+ "step": 145
1316
+ },
1317
+ {
1318
+ "epoch": 0.9125,
1319
+ "grad_norm": 6.0376386642456055,
1320
+ "learning_rate": 0.0002,
1321
+ "loss": 0.2612,
1322
+ "mean_token_accuracy": 0.875,
1323
+ "num_tokens": 568416.0,
1324
+ "step": 146
1325
+ },
1326
+ {
1327
+ "epoch": 0.91875,
1328
+ "grad_norm": 3.7592973709106445,
1329
+ "learning_rate": 0.0002,
1330
+ "loss": 0.047,
1331
+ "mean_token_accuracy": 1.0,
1332
+ "num_tokens": 571472.0,
1333
+ "step": 147
1334
+ },
1335
+ {
1336
+ "epoch": 0.925,
1337
+ "grad_norm": 0.9640989899635315,
1338
+ "learning_rate": 0.0002,
1339
+ "loss": 0.0236,
1340
+ "mean_token_accuracy": 0.96875,
1341
+ "num_tokens": 574528.0,
1342
+ "step": 148
1343
+ },
1344
+ {
1345
+ "epoch": 0.93125,
1346
+ "grad_norm": 1.7307640314102173,
1347
+ "learning_rate": 0.0002,
1348
+ "loss": 0.061,
1349
+ "mean_token_accuracy": 0.96875,
1350
+ "num_tokens": 577584.0,
1351
+ "step": 149
1352
+ },
1353
+ {
1354
+ "epoch": 0.9375,
1355
+ "grad_norm": 2.1097357273101807,
1356
+ "learning_rate": 0.0002,
1357
+ "loss": 0.3053,
1358
+ "mean_token_accuracy": 0.96875,
1359
+ "num_tokens": 580640.0,
1360
+ "step": 150
1361
+ },
1362
+ {
1363
+ "epoch": 0.94375,
1364
+ "grad_norm": 3.9413912296295166,
1365
+ "learning_rate": 0.0002,
1366
+ "loss": 0.1405,
1367
+ "mean_token_accuracy": 0.96875,
1368
+ "num_tokens": 583696.0,
1369
+ "step": 151
1370
+ },
1371
+ {
1372
+ "epoch": 0.95,
1373
+ "grad_norm": 3.341071128845215,
1374
+ "learning_rate": 0.0002,
1375
+ "loss": 0.1131,
1376
+ "mean_token_accuracy": 0.9375,
1377
+ "num_tokens": 586752.0,
1378
+ "step": 152
1379
+ },
1380
+ {
1381
+ "epoch": 0.95625,
1382
+ "grad_norm": 1.0671991109848022,
1383
+ "learning_rate": 0.0002,
1384
+ "loss": 0.0147,
1385
+ "mean_token_accuracy": 1.0,
1386
+ "num_tokens": 589808.0,
1387
+ "step": 153
1388
+ },
1389
+ {
1390
+ "epoch": 0.9625,
1391
+ "grad_norm": 6.068633556365967,
1392
+ "learning_rate": 0.0002,
1393
+ "loss": 0.0888,
1394
+ "mean_token_accuracy": 0.9375,
1395
+ "num_tokens": 592864.0,
1396
+ "step": 154
1397
+ },
1398
+ {
1399
+ "epoch": 0.96875,
1400
+ "grad_norm": 1.0805017948150635,
1401
+ "learning_rate": 0.0002,
1402
+ "loss": 0.0136,
1403
+ "mean_token_accuracy": 1.0,
1404
+ "num_tokens": 595920.0,
1405
+ "step": 155
1406
+ },
1407
+ {
1408
+ "epoch": 0.975,
1409
+ "grad_norm": 0.40108004212379456,
1410
+ "learning_rate": 0.0002,
1411
+ "loss": 0.0091,
1412
+ "mean_token_accuracy": 1.0,
1413
+ "num_tokens": 598976.0,
1414
+ "step": 156
1415
+ },
1416
+ {
1417
+ "epoch": 0.98125,
1418
+ "grad_norm": 0.7008132934570312,
1419
+ "learning_rate": 0.0002,
1420
+ "loss": 0.011,
1421
+ "mean_token_accuracy": 1.0,
1422
+ "num_tokens": 602032.0,
1423
+ "step": 157
1424
+ },
1425
+ {
1426
+ "epoch": 0.9875,
1427
+ "grad_norm": 0.42773208022117615,
1428
+ "learning_rate": 0.0002,
1429
+ "loss": 0.0063,
1430
+ "mean_token_accuracy": 1.0,
1431
+ "num_tokens": 605088.0,
1432
+ "step": 158
1433
+ },
1434
+ {
1435
+ "epoch": 0.99375,
1436
+ "grad_norm": 0.9735352396965027,
1437
+ "learning_rate": 0.0002,
1438
+ "loss": 0.014,
1439
+ "mean_token_accuracy": 1.0,
1440
+ "num_tokens": 608144.0,
1441
+ "step": 159
1442
+ },
1443
+ {
1444
+ "epoch": 1.0,
1445
+ "grad_norm": 2.4267780780792236,
1446
+ "learning_rate": 0.0002,
1447
+ "loss": 0.0344,
1448
+ "mean_token_accuracy": 0.96875,
1449
+ "num_tokens": 611200.0,
1450
+ "step": 160
1451
+ },
1452
+ {
1453
+ "epoch": 1.00625,
1454
+ "grad_norm": 4.375260829925537,
1455
+ "learning_rate": 0.0002,
1456
+ "loss": 0.0981,
1457
+ "mean_token_accuracy": 0.9375,
1458
+ "num_tokens": 614256.0,
1459
+ "step": 161
1460
+ },
1461
+ {
1462
+ "epoch": 1.0125,
1463
+ "grad_norm": 2.6353538036346436,
1464
+ "learning_rate": 0.0002,
1465
+ "loss": 0.1965,
1466
+ "mean_token_accuracy": 0.96875,
1467
+ "num_tokens": 617312.0,
1468
+ "step": 162
1469
+ },
1470
+ {
1471
+ "epoch": 1.01875,
1472
+ "grad_norm": 2.906750440597534,
1473
+ "learning_rate": 0.0002,
1474
+ "loss": 0.0887,
1475
+ "mean_token_accuracy": 0.96875,
1476
+ "num_tokens": 620368.0,
1477
+ "step": 163
1478
+ },
1479
+ {
1480
+ "epoch": 1.025,
1481
+ "grad_norm": 2.4843482971191406,
1482
+ "learning_rate": 0.0002,
1483
+ "loss": 0.0409,
1484
+ "mean_token_accuracy": 0.96875,
1485
+ "num_tokens": 623424.0,
1486
+ "step": 164
1487
+ },
1488
+ {
1489
+ "epoch": 1.03125,
1490
+ "grad_norm": 2.4637041091918945,
1491
+ "learning_rate": 0.0002,
1492
+ "loss": 0.0588,
1493
+ "mean_token_accuracy": 0.96875,
1494
+ "num_tokens": 626480.0,
1495
+ "step": 165
1496
+ },
1497
+ {
1498
+ "epoch": 1.0375,
1499
+ "grad_norm": 3.256136655807495,
1500
+ "learning_rate": 0.0002,
1501
+ "loss": 0.0632,
1502
+ "mean_token_accuracy": 0.96875,
1503
+ "num_tokens": 629536.0,
1504
+ "step": 166
1505
+ },
1506
+ {
1507
+ "epoch": 1.04375,
1508
+ "grad_norm": 3.7600295543670654,
1509
+ "learning_rate": 0.0002,
1510
+ "loss": 0.0903,
1511
+ "mean_token_accuracy": 0.9375,
1512
+ "num_tokens": 632592.0,
1513
+ "step": 167
1514
+ },
1515
+ {
1516
+ "epoch": 1.05,
1517
+ "grad_norm": 4.013559341430664,
1518
+ "learning_rate": 0.0002,
1519
+ "loss": 0.0796,
1520
+ "mean_token_accuracy": 0.9375,
1521
+ "num_tokens": 635648.0,
1522
+ "step": 168
1523
+ },
1524
+ {
1525
+ "epoch": 1.05625,
1526
+ "grad_norm": 1.3275935649871826,
1527
+ "learning_rate": 0.0002,
1528
+ "loss": 0.0415,
1529
+ "mean_token_accuracy": 0.96875,
1530
+ "num_tokens": 638704.0,
1531
+ "step": 169
1532
+ },
1533
+ {
1534
+ "epoch": 1.0625,
1535
+ "grad_norm": 3.0830256938934326,
1536
+ "learning_rate": 0.0002,
1537
+ "loss": 0.0989,
1538
+ "mean_token_accuracy": 0.96875,
1539
+ "num_tokens": 641760.0,
1540
+ "step": 170
1541
+ },
1542
+ {
1543
+ "epoch": 1.06875,
1544
+ "grad_norm": 5.780794620513916,
1545
+ "learning_rate": 0.0002,
1546
+ "loss": 0.0974,
1547
+ "mean_token_accuracy": 0.9375,
1548
+ "num_tokens": 644816.0,
1549
+ "step": 171
1550
+ },
1551
+ {
1552
+ "epoch": 1.075,
1553
+ "grad_norm": 1.1954656839370728,
1554
+ "learning_rate": 0.0002,
1555
+ "loss": 0.0363,
1556
+ "mean_token_accuracy": 1.0,
1557
+ "num_tokens": 647872.0,
1558
+ "step": 172
1559
+ },
1560
+ {
1561
+ "epoch": 1.08125,
1562
+ "grad_norm": 0.6503368616104126,
1563
+ "learning_rate": 0.0002,
1564
+ "loss": 0.0086,
1565
+ "mean_token_accuracy": 1.0,
1566
+ "num_tokens": 650928.0,
1567
+ "step": 173
1568
+ },
1569
+ {
1570
+ "epoch": 1.0875,
1571
+ "grad_norm": 0.2741992771625519,
1572
+ "learning_rate": 0.0002,
1573
+ "loss": 0.005,
1574
+ "mean_token_accuracy": 1.0,
1575
+ "num_tokens": 653984.0,
1576
+ "step": 174
1577
+ },
1578
+ {
1579
+ "epoch": 1.09375,
1580
+ "grad_norm": 4.483480930328369,
1581
+ "learning_rate": 0.0002,
1582
+ "loss": 0.1765,
1583
+ "mean_token_accuracy": 0.9375,
1584
+ "num_tokens": 657040.0,
1585
+ "step": 175
1586
+ },
1587
+ {
1588
+ "epoch": 1.1,
1589
+ "grad_norm": 2.0801796913146973,
1590
+ "learning_rate": 0.0002,
1591
+ "loss": 0.2432,
1592
+ "mean_token_accuracy": 0.96875,
1593
+ "num_tokens": 660096.0,
1594
+ "step": 176
1595
+ },
1596
+ {
1597
+ "epoch": 1.10625,
1598
+ "grad_norm": 4.300808906555176,
1599
+ "learning_rate": 0.0002,
1600
+ "loss": 0.0805,
1601
+ "mean_token_accuracy": 0.96875,
1602
+ "num_tokens": 663152.0,
1603
+ "step": 177
1604
+ },
1605
+ {
1606
+ "epoch": 1.1125,
1607
+ "grad_norm": 1.7215920686721802,
1608
+ "learning_rate": 0.0002,
1609
+ "loss": 0.0227,
1610
+ "mean_token_accuracy": 0.96875,
1611
+ "num_tokens": 666208.0,
1612
+ "step": 178
1613
+ },
1614
+ {
1615
+ "epoch": 1.11875,
1616
+ "grad_norm": 3.1990017890930176,
1617
+ "learning_rate": 0.0002,
1618
+ "loss": 0.3566,
1619
+ "mean_token_accuracy": 0.9375,
1620
+ "num_tokens": 669264.0,
1621
+ "step": 179
1622
+ },
1623
+ {
1624
+ "epoch": 1.125,
1625
+ "grad_norm": 1.593938946723938,
1626
+ "learning_rate": 0.0002,
1627
+ "loss": 0.0232,
1628
+ "mean_token_accuracy": 1.0,
1629
+ "num_tokens": 672320.0,
1630
+ "step": 180
1631
+ },
1632
+ {
1633
+ "epoch": 1.13125,
1634
+ "grad_norm": 5.725439071655273,
1635
+ "learning_rate": 0.0002,
1636
+ "loss": 0.1591,
1637
+ "mean_token_accuracy": 0.9375,
1638
+ "num_tokens": 675376.0,
1639
+ "step": 181
1640
+ },
1641
+ {
1642
+ "epoch": 1.1375,
1643
+ "grad_norm": 4.107408046722412,
1644
+ "learning_rate": 0.0002,
1645
+ "loss": 0.0734,
1646
+ "mean_token_accuracy": 0.96875,
1647
+ "num_tokens": 678432.0,
1648
+ "step": 182
1649
+ },
1650
+ {
1651
+ "epoch": 1.14375,
1652
+ "grad_norm": 0.5303417444229126,
1653
+ "learning_rate": 0.0002,
1654
+ "loss": 0.0099,
1655
+ "mean_token_accuracy": 1.0,
1656
+ "num_tokens": 681488.0,
1657
+ "step": 183
1658
+ },
1659
+ {
1660
+ "epoch": 1.15,
1661
+ "grad_norm": 0.7143228650093079,
1662
+ "learning_rate": 0.0002,
1663
+ "loss": 0.0127,
1664
+ "mean_token_accuracy": 1.0,
1665
+ "num_tokens": 684544.0,
1666
+ "step": 184
1667
+ },
1668
+ {
1669
+ "epoch": 1.15625,
1670
+ "grad_norm": 0.10430416464805603,
1671
+ "learning_rate": 0.0002,
1672
+ "loss": 0.0028,
1673
+ "mean_token_accuracy": 1.0,
1674
+ "num_tokens": 687600.0,
1675
+ "step": 185
1676
+ },
1677
+ {
1678
+ "epoch": 1.1625,
1679
+ "grad_norm": 0.10262572765350342,
1680
+ "learning_rate": 0.0002,
1681
+ "loss": 0.0022,
1682
+ "mean_token_accuracy": 1.0,
1683
+ "num_tokens": 690656.0,
1684
+ "step": 186
1685
+ },
1686
+ {
1687
+ "epoch": 1.16875,
1688
+ "grad_norm": 0.5091086626052856,
1689
+ "learning_rate": 0.0002,
1690
+ "loss": 0.0082,
1691
+ "mean_token_accuracy": 1.0,
1692
+ "num_tokens": 693712.0,
1693
+ "step": 187
1694
+ },
1695
+ {
1696
+ "epoch": 1.175,
1697
+ "grad_norm": 0.32739120721817017,
1698
+ "learning_rate": 0.0002,
1699
+ "loss": 0.006,
1700
+ "mean_token_accuracy": 1.0,
1701
+ "num_tokens": 696768.0,
1702
+ "step": 188
1703
+ },
1704
+ {
1705
+ "epoch": 1.18125,
1706
+ "grad_norm": 2.958453416824341,
1707
+ "learning_rate": 0.0002,
1708
+ "loss": 0.0506,
1709
+ "mean_token_accuracy": 0.96875,
1710
+ "num_tokens": 699824.0,
1711
+ "step": 189
1712
+ },
1713
+ {
1714
+ "epoch": 1.1875,
1715
+ "grad_norm": 0.6987707018852234,
1716
+ "learning_rate": 0.0002,
1717
+ "loss": 0.0107,
1718
+ "mean_token_accuracy": 1.0,
1719
+ "num_tokens": 702880.0,
1720
+ "step": 190
1721
+ },
1722
+ {
1723
+ "epoch": 1.19375,
1724
+ "grad_norm": 1.3608295917510986,
1725
+ "learning_rate": 0.0002,
1726
+ "loss": 0.0287,
1727
+ "mean_token_accuracy": 1.0,
1728
+ "num_tokens": 705936.0,
1729
+ "step": 191
1730
+ },
1731
+ {
1732
+ "epoch": 1.2,
1733
+ "grad_norm": 0.14391912519931793,
1734
+ "learning_rate": 0.0002,
1735
+ "loss": 0.0021,
1736
+ "mean_token_accuracy": 1.0,
1737
+ "num_tokens": 708992.0,
1738
+ "step": 192
1739
+ },
1740
+ {
1741
+ "epoch": 1.20625,
1742
+ "grad_norm": 2.5519983768463135,
1743
+ "learning_rate": 0.0002,
1744
+ "loss": 0.1491,
1745
+ "mean_token_accuracy": 0.96875,
1746
+ "num_tokens": 712048.0,
1747
+ "step": 193
1748
+ },
1749
+ {
1750
+ "epoch": 1.2125,
1751
+ "grad_norm": 6.5541253089904785,
1752
+ "learning_rate": 0.0002,
1753
+ "loss": 0.1414,
1754
+ "mean_token_accuracy": 0.9375,
1755
+ "num_tokens": 715104.0,
1756
+ "step": 194
1757
+ },
1758
+ {
1759
+ "epoch": 1.21875,
1760
+ "grad_norm": 0.2820110619068146,
1761
+ "learning_rate": 0.0002,
1762
+ "loss": 0.003,
1763
+ "mean_token_accuracy": 1.0,
1764
+ "num_tokens": 718160.0,
1765
+ "step": 195
1766
+ },
1767
+ {
1768
+ "epoch": 1.225,
1769
+ "grad_norm": 2.9799859523773193,
1770
+ "learning_rate": 0.0002,
1771
+ "loss": 0.0506,
1772
+ "mean_token_accuracy": 0.96875,
1773
+ "num_tokens": 721216.0,
1774
+ "step": 196
1775
+ },
1776
+ {
1777
+ "epoch": 1.23125,
1778
+ "grad_norm": 1.1640214920043945,
1779
+ "learning_rate": 0.0002,
1780
+ "loss": 0.0182,
1781
+ "mean_token_accuracy": 1.0,
1782
+ "num_tokens": 724272.0,
1783
+ "step": 197
1784
+ },
1785
+ {
1786
+ "epoch": 1.2375,
1787
+ "grad_norm": 2.498863935470581,
1788
+ "learning_rate": 0.0002,
1789
+ "loss": 0.073,
1790
+ "mean_token_accuracy": 0.9375,
1791
+ "num_tokens": 727328.0,
1792
+ "step": 198
1793
+ },
1794
+ {
1795
+ "epoch": 1.24375,
1796
+ "grad_norm": 1.2676233053207397,
1797
+ "learning_rate": 0.0002,
1798
+ "loss": 0.0265,
1799
+ "mean_token_accuracy": 1.0,
1800
+ "num_tokens": 730384.0,
1801
+ "step": 199
1802
+ },
1803
+ {
1804
+ "epoch": 1.25,
1805
+ "grad_norm": 0.3039199113845825,
1806
+ "learning_rate": 0.0002,
1807
+ "loss": 0.0079,
1808
+ "step": 200
1809
+ },
1810
+ {
1811
+ "epoch": 1.25,
1812
+ "eval_loss": 0.07848864793777466,
1813
+ "eval_mean_token_accuracy": 0.9703125,
1814
+ "eval_model_preparation_time": 0.0042,
1815
+ "eval_num_tokens": 733440.0,
1816
+ "eval_runtime": 71.4875,
1817
+ "eval_samples_per_second": 8.953,
1818
+ "eval_steps_per_second": 0.56,
1819
+ "step": 200
1820
+ },
1821
+ {
1822
+ "epoch": 1.25625,
1823
+ "grad_norm": 3.308570623397827,
1824
+ "learning_rate": 0.0002,
1825
+ "loss": 0.0569,
1826
+ "mean_token_accuracy": 0.984375,
1827
+ "num_tokens": 736496.0,
1828
+ "step": 201
1829
+ },
1830
+ {
1831
+ "epoch": 1.2625,
1832
+ "grad_norm": 1.4555875062942505,
1833
+ "learning_rate": 0.0002,
1834
+ "loss": 0.0138,
1835
+ "mean_token_accuracy": 1.0,
1836
+ "num_tokens": 739552.0,
1837
+ "step": 202
1838
+ },
1839
+ {
1840
+ "epoch": 1.26875,
1841
+ "grad_norm": 0.012952354736626148,
1842
+ "learning_rate": 0.0002,
1843
+ "loss": 0.0001,
1844
+ "mean_token_accuracy": 1.0,
1845
+ "num_tokens": 742608.0,
1846
+ "step": 203
1847
+ },
1848
+ {
1849
+ "epoch": 1.275,
1850
+ "grad_norm": 5.3269829750061035,
1851
+ "learning_rate": 0.0002,
1852
+ "loss": 0.2567,
1853
+ "mean_token_accuracy": 0.9375,
1854
+ "num_tokens": 745664.0,
1855
+ "step": 204
1856
+ },
1857
+ {
1858
+ "epoch": 1.28125,
1859
+ "grad_norm": 0.021568113937973976,
1860
+ "learning_rate": 0.0002,
1861
+ "loss": 0.0002,
1862
+ "mean_token_accuracy": 1.0,
1863
+ "num_tokens": 748720.0,
1864
+ "step": 205
1865
+ },
1866
+ {
1867
+ "epoch": 1.2875,
1868
+ "grad_norm": 1.318519115447998,
1869
+ "learning_rate": 0.0002,
1870
+ "loss": 0.0142,
1871
+ "mean_token_accuracy": 1.0,
1872
+ "num_tokens": 751776.0,
1873
+ "step": 206
1874
+ },
1875
+ {
1876
+ "epoch": 1.29375,
1877
+ "grad_norm": 3.792602300643921,
1878
+ "learning_rate": 0.0002,
1879
+ "loss": 0.0546,
1880
+ "mean_token_accuracy": 0.96875,
1881
+ "num_tokens": 754832.0,
1882
+ "step": 207
1883
+ },
1884
+ {
1885
+ "epoch": 1.3,
1886
+ "grad_norm": 3.9029130935668945,
1887
+ "learning_rate": 0.0002,
1888
+ "loss": 0.0485,
1889
+ "mean_token_accuracy": 0.96875,
1890
+ "num_tokens": 757888.0,
1891
+ "step": 208
1892
+ },
1893
+ {
1894
+ "epoch": 1.30625,
1895
+ "grad_norm": 5.260088920593262,
1896
+ "learning_rate": 0.0002,
1897
+ "loss": 0.1518,
1898
+ "mean_token_accuracy": 0.96875,
1899
+ "num_tokens": 760944.0,
1900
+ "step": 209
1901
+ },
1902
+ {
1903
+ "epoch": 1.3125,
1904
+ "grad_norm": 7.682434558868408,
1905
+ "learning_rate": 0.0002,
1906
+ "loss": 0.1869,
1907
+ "mean_token_accuracy": 0.9375,
1908
+ "num_tokens": 764000.0,
1909
+ "step": 210
1910
+ },
1911
+ {
1912
+ "epoch": 1.31875,
1913
+ "grad_norm": 5.554214000701904,
1914
+ "learning_rate": 0.0002,
1915
+ "loss": 0.2424,
1916
+ "mean_token_accuracy": 0.96875,
1917
+ "num_tokens": 767056.0,
1918
+ "step": 211
1919
+ },
1920
+ {
1921
+ "epoch": 1.325,
1922
+ "grad_norm": 1.796474814414978,
1923
+ "learning_rate": 0.0002,
1924
+ "loss": 0.0379,
1925
+ "mean_token_accuracy": 0.96875,
1926
+ "num_tokens": 770112.0,
1927
+ "step": 212
1928
+ },
1929
+ {
1930
+ "epoch": 1.33125,
1931
+ "grad_norm": 0.8003765344619751,
1932
+ "learning_rate": 0.0002,
1933
+ "loss": 0.0088,
1934
+ "mean_token_accuracy": 1.0,
1935
+ "num_tokens": 773168.0,
1936
+ "step": 213
1937
+ },
1938
+ {
1939
+ "epoch": 1.3375,
1940
+ "grad_norm": 0.07899386435747147,
1941
+ "learning_rate": 0.0002,
1942
+ "loss": 0.001,
1943
+ "mean_token_accuracy": 1.0,
1944
+ "num_tokens": 776224.0,
1945
+ "step": 214
1946
+ },
1947
+ {
1948
+ "epoch": 1.34375,
1949
+ "grad_norm": 1.10870361328125,
1950
+ "learning_rate": 0.0002,
1951
+ "loss": 0.0102,
1952
+ "mean_token_accuracy": 1.0,
1953
+ "num_tokens": 779280.0,
1954
+ "step": 215
1955
+ },
1956
+ {
1957
+ "epoch": 1.35,
1958
+ "grad_norm": 4.3541107177734375,
1959
+ "learning_rate": 0.0002,
1960
+ "loss": 0.1525,
1961
+ "mean_token_accuracy": 0.96875,
1962
+ "num_tokens": 782336.0,
1963
+ "step": 216
1964
+ },
1965
+ {
1966
+ "epoch": 1.35625,
1967
+ "grad_norm": 0.10654051601886749,
1968
+ "learning_rate": 0.0002,
1969
+ "loss": 0.0018,
1970
+ "mean_token_accuracy": 1.0,
1971
+ "num_tokens": 785392.0,
1972
+ "step": 217
1973
+ },
1974
+ {
1975
+ "epoch": 1.3625,
1976
+ "grad_norm": 0.1457299143075943,
1977
+ "learning_rate": 0.0002,
1978
+ "loss": 0.0026,
1979
+ "mean_token_accuracy": 1.0,
1980
+ "num_tokens": 788448.0,
1981
+ "step": 218
1982
+ },
1983
+ {
1984
+ "epoch": 1.36875,
1985
+ "grad_norm": 0.3237255811691284,
1986
+ "learning_rate": 0.0002,
1987
+ "loss": 0.0031,
1988
+ "mean_token_accuracy": 1.0,
1989
+ "num_tokens": 791504.0,
1990
+ "step": 219
1991
+ },
1992
+ {
1993
+ "epoch": 1.375,
1994
+ "grad_norm": 1.9996052980422974,
1995
+ "learning_rate": 0.0002,
1996
+ "loss": 0.0206,
1997
+ "mean_token_accuracy": 1.0,
1998
+ "num_tokens": 794560.0,
1999
+ "step": 220
2000
+ },
2001
+ {
2002
+ "epoch": 1.38125,
2003
+ "grad_norm": 4.9992289543151855,
2004
+ "learning_rate": 0.0002,
2005
+ "loss": 0.1205,
2006
+ "mean_token_accuracy": 0.96875,
2007
+ "num_tokens": 797616.0,
2008
+ "step": 221
2009
+ },
2010
+ {
2011
+ "epoch": 1.3875,
2012
+ "grad_norm": 1.0994348526000977,
2013
+ "learning_rate": 0.0002,
2014
+ "loss": 0.01,
2015
+ "mean_token_accuracy": 1.0,
2016
+ "num_tokens": 800672.0,
2017
+ "step": 222
2018
+ },
2019
+ {
2020
+ "epoch": 1.39375,
2021
+ "grad_norm": 5.1901068687438965,
2022
+ "learning_rate": 0.0002,
2023
+ "loss": 0.0597,
2024
+ "mean_token_accuracy": 0.96875,
2025
+ "num_tokens": 803728.0,
2026
+ "step": 223
2027
+ },
2028
+ {
2029
+ "epoch": 1.4,
2030
+ "grad_norm": 8.030043601989746,
2031
+ "learning_rate": 0.0002,
2032
+ "loss": 0.1187,
2033
+ "mean_token_accuracy": 0.9375,
2034
+ "num_tokens": 806784.0,
2035
+ "step": 224
2036
+ },
2037
+ {
2038
+ "epoch": 1.40625,
2039
+ "grad_norm": 5.358715057373047,
2040
+ "learning_rate": 0.0002,
2041
+ "loss": 0.0635,
2042
+ "mean_token_accuracy": 0.96875,
2043
+ "num_tokens": 809840.0,
2044
+ "step": 225
2045
+ },
2046
+ {
2047
+ "epoch": 1.4125,
2048
+ "grad_norm": 5.855217933654785,
2049
+ "learning_rate": 0.0002,
2050
+ "loss": 0.1923,
2051
+ "mean_token_accuracy": 0.9375,
2052
+ "num_tokens": 812896.0,
2053
+ "step": 226
2054
+ },
2055
+ {
2056
+ "epoch": 1.41875,
2057
+ "grad_norm": 1.0728706121444702,
2058
+ "learning_rate": 0.0002,
2059
+ "loss": 0.0098,
2060
+ "mean_token_accuracy": 1.0,
2061
+ "num_tokens": 815952.0,
2062
+ "step": 227
2063
+ },
2064
+ {
2065
+ "epoch": 1.425,
2066
+ "grad_norm": 0.028943141922354698,
2067
+ "learning_rate": 0.0002,
2068
+ "loss": 0.0003,
2069
+ "mean_token_accuracy": 1.0,
2070
+ "num_tokens": 819008.0,
2071
+ "step": 228
2072
+ },
2073
+ {
2074
+ "epoch": 1.43125,
2075
+ "grad_norm": 4.162194728851318,
2076
+ "learning_rate": 0.0002,
2077
+ "loss": 0.0434,
2078
+ "mean_token_accuracy": 0.96875,
2079
+ "num_tokens": 822064.0,
2080
+ "step": 229
2081
+ },
2082
+ {
2083
+ "epoch": 1.4375,
2084
+ "grad_norm": 0.4618898630142212,
2085
+ "learning_rate": 0.0002,
2086
+ "loss": 0.005,
2087
+ "mean_token_accuracy": 1.0,
2088
+ "num_tokens": 825120.0,
2089
+ "step": 230
2090
+ },
2091
+ {
2092
+ "epoch": 1.44375,
2093
+ "grad_norm": 0.33329862356185913,
2094
+ "learning_rate": 0.0002,
2095
+ "loss": 0.0029,
2096
+ "mean_token_accuracy": 1.0,
2097
+ "num_tokens": 828176.0,
2098
+ "step": 231
2099
+ },
2100
+ {
2101
+ "epoch": 1.45,
2102
+ "grad_norm": 0.5071545839309692,
2103
+ "learning_rate": 0.0002,
2104
+ "loss": 0.0043,
2105
+ "mean_token_accuracy": 1.0,
2106
+ "num_tokens": 831232.0,
2107
+ "step": 232
2108
+ },
2109
+ {
2110
+ "epoch": 1.45625,
2111
+ "grad_norm": 0.6205556392669678,
2112
+ "learning_rate": 0.0002,
2113
+ "loss": 0.0057,
2114
+ "mean_token_accuracy": 1.0,
2115
+ "num_tokens": 834288.0,
2116
+ "step": 233
2117
+ },
2118
+ {
2119
+ "epoch": 1.4625,
2120
+ "grad_norm": 0.00851200520992279,
2121
+ "learning_rate": 0.0002,
2122
+ "loss": 0.0003,
2123
+ "mean_token_accuracy": 1.0,
2124
+ "num_tokens": 837344.0,
2125
+ "step": 234
2126
+ },
2127
+ {
2128
+ "epoch": 1.46875,
2129
+ "grad_norm": 0.04946492984890938,
2130
+ "learning_rate": 0.0002,
2131
+ "loss": 0.0005,
2132
+ "mean_token_accuracy": 1.0,
2133
+ "num_tokens": 840400.0,
2134
+ "step": 235
2135
+ },
2136
+ {
2137
+ "epoch": 1.475,
2138
+ "grad_norm": 3.8646678924560547,
2139
+ "learning_rate": 0.0002,
2140
+ "loss": 0.0244,
2141
+ "mean_token_accuracy": 1.0,
2142
+ "num_tokens": 843456.0,
2143
+ "step": 236
2144
+ },
2145
+ {
2146
+ "epoch": 1.48125,
2147
+ "grad_norm": 5.360538005828857,
2148
+ "learning_rate": 0.0002,
2149
+ "loss": 0.0776,
2150
+ "mean_token_accuracy": 0.96875,
2151
+ "num_tokens": 846512.0,
2152
+ "step": 237
2153
+ },
2154
+ {
2155
+ "epoch": 1.4875,
2156
+ "grad_norm": 1.9940391778945923,
2157
+ "learning_rate": 0.0002,
2158
+ "loss": 0.0203,
2159
+ "mean_token_accuracy": 1.0,
2160
+ "num_tokens": 849568.0,
2161
+ "step": 238
2162
+ },
2163
+ {
2164
+ "epoch": 1.49375,
2165
+ "grad_norm": 5.840383529663086,
2166
+ "learning_rate": 0.0002,
2167
+ "loss": 0.2391,
2168
+ "mean_token_accuracy": 0.9375,
2169
+ "num_tokens": 852624.0,
2170
+ "step": 239
2171
+ },
2172
+ {
2173
+ "epoch": 1.5,
2174
+ "grad_norm": 4.679400444030762,
2175
+ "learning_rate": 0.0002,
2176
+ "loss": 0.0533,
2177
+ "mean_token_accuracy": 0.96875,
2178
+ "num_tokens": 855680.0,
2179
+ "step": 240
2180
+ },
2181
+ {
2182
+ "epoch": 1.50625,
2183
+ "grad_norm": 2.590467691421509,
2184
+ "learning_rate": 0.0002,
2185
+ "loss": 0.0536,
2186
+ "mean_token_accuracy": 0.96875,
2187
+ "num_tokens": 858736.0,
2188
+ "step": 241
2189
+ },
2190
+ {
2191
+ "epoch": 1.5125,
2192
+ "grad_norm": 0.5134260654449463,
2193
+ "learning_rate": 0.0002,
2194
+ "loss": 0.0045,
2195
+ "mean_token_accuracy": 1.0,
2196
+ "num_tokens": 861792.0,
2197
+ "step": 242
2198
+ },
2199
+ {
2200
+ "epoch": 1.51875,
2201
+ "grad_norm": 1.5896207094192505,
2202
+ "learning_rate": 0.0002,
2203
+ "loss": 0.0131,
2204
+ "mean_token_accuracy": 1.0,
2205
+ "num_tokens": 864848.0,
2206
+ "step": 243
2207
+ },
2208
+ {
2209
+ "epoch": 1.525,
2210
+ "grad_norm": 0.48414427042007446,
2211
+ "learning_rate": 0.0002,
2212
+ "loss": 0.0037,
2213
+ "mean_token_accuracy": 1.0,
2214
+ "num_tokens": 867904.0,
2215
+ "step": 244
2216
+ },
2217
+ {
2218
+ "epoch": 1.53125,
2219
+ "grad_norm": 1.9993484020233154,
2220
+ "learning_rate": 0.0002,
2221
+ "loss": 0.0182,
2222
+ "mean_token_accuracy": 1.0,
2223
+ "num_tokens": 870960.0,
2224
+ "step": 245
2225
+ },
2226
+ {
2227
+ "epoch": 1.5375,
2228
+ "grad_norm": 0.592039942741394,
2229
+ "learning_rate": 0.0002,
2230
+ "loss": 0.0078,
2231
+ "mean_token_accuracy": 1.0,
2232
+ "num_tokens": 874016.0,
2233
+ "step": 246
2234
+ },
2235
+ {
2236
+ "epoch": 1.54375,
2237
+ "grad_norm": 4.422524929046631,
2238
+ "learning_rate": 0.0002,
2239
+ "loss": 0.0413,
2240
+ "mean_token_accuracy": 0.96875,
2241
+ "num_tokens": 877072.0,
2242
+ "step": 247
2243
+ },
2244
+ {
2245
+ "epoch": 1.55,
2246
+ "grad_norm": 4.080215930938721,
2247
+ "learning_rate": 0.0002,
2248
+ "loss": 0.1189,
2249
+ "mean_token_accuracy": 0.96875,
2250
+ "num_tokens": 880128.0,
2251
+ "step": 248
2252
+ },
2253
+ {
2254
+ "epoch": 1.55625,
2255
+ "grad_norm": 2.1959433555603027,
2256
+ "learning_rate": 0.0002,
2257
+ "loss": 0.0267,
2258
+ "mean_token_accuracy": 0.96875,
2259
+ "num_tokens": 883184.0,
2260
+ "step": 249
2261
+ },
2262
+ {
2263
+ "epoch": 1.5625,
2264
+ "grad_norm": 4.379952430725098,
2265
+ "learning_rate": 0.0002,
2266
+ "loss": 0.0507,
2267
+ "mean_token_accuracy": 0.96875,
2268
+ "num_tokens": 886240.0,
2269
+ "step": 250
2270
+ },
2271
+ {
2272
+ "epoch": 1.56875,
2273
+ "grad_norm": 2.473543882369995,
2274
+ "learning_rate": 0.0002,
2275
+ "loss": 0.0261,
2276
+ "mean_token_accuracy": 0.96875,
2277
+ "num_tokens": 889296.0,
2278
+ "step": 251
2279
+ },
2280
+ {
2281
+ "epoch": 1.575,
2282
+ "grad_norm": 0.6901207566261292,
2283
+ "learning_rate": 0.0002,
2284
+ "loss": 0.0043,
2285
+ "mean_token_accuracy": 1.0,
2286
+ "num_tokens": 892352.0,
2287
+ "step": 252
2288
+ },
2289
+ {
2290
+ "epoch": 1.58125,
2291
+ "grad_norm": 3.800703763961792,
2292
+ "learning_rate": 0.0002,
2293
+ "loss": 0.1643,
2294
+ "mean_token_accuracy": 0.96875,
2295
+ "num_tokens": 895408.0,
2296
+ "step": 253
2297
+ },
2298
+ {
2299
+ "epoch": 1.5875,
2300
+ "grad_norm": 1.523447036743164,
2301
+ "learning_rate": 0.0002,
2302
+ "loss": 0.4695,
2303
+ "mean_token_accuracy": 0.96875,
2304
+ "num_tokens": 898464.0,
2305
+ "step": 254
2306
+ },
2307
+ {
2308
+ "epoch": 1.59375,
2309
+ "grad_norm": 0.06425034999847412,
2310
+ "learning_rate": 0.0002,
2311
+ "loss": 0.0008,
2312
+ "mean_token_accuracy": 1.0,
2313
+ "num_tokens": 901520.0,
2314
+ "step": 255
2315
+ },
2316
+ {
2317
+ "epoch": 1.6,
2318
+ "grad_norm": 0.7315100431442261,
2319
+ "learning_rate": 0.0002,
2320
+ "loss": 0.0066,
2321
+ "mean_token_accuracy": 1.0,
2322
+ "num_tokens": 904576.0,
2323
+ "step": 256
2324
+ },
2325
+ {
2326
+ "epoch": 1.60625,
2327
+ "grad_norm": 0.39078789949417114,
2328
+ "learning_rate": 0.0002,
2329
+ "loss": 0.0042,
2330
+ "mean_token_accuracy": 1.0,
2331
+ "num_tokens": 907632.0,
2332
+ "step": 257
2333
+ },
2334
+ {
2335
+ "epoch": 1.6125,
2336
+ "grad_norm": 7.1944475173950195,
2337
+ "learning_rate": 0.0002,
2338
+ "loss": 0.2334,
2339
+ "mean_token_accuracy": 0.96875,
2340
+ "num_tokens": 910688.0,
2341
+ "step": 258
2342
+ },
2343
+ {
2344
+ "epoch": 1.61875,
2345
+ "grad_norm": 0.3877175748348236,
2346
+ "learning_rate": 0.0002,
2347
+ "loss": 0.0046,
2348
+ "mean_token_accuracy": 1.0,
2349
+ "num_tokens": 913744.0,
2350
+ "step": 259
2351
+ },
2352
+ {
2353
+ "epoch": 1.625,
2354
+ "grad_norm": 1.4738430976867676,
2355
+ "learning_rate": 0.0002,
2356
+ "loss": 0.0207,
2357
+ "mean_token_accuracy": 1.0,
2358
+ "num_tokens": 916800.0,
2359
+ "step": 260
2360
+ },
2361
+ {
2362
+ "epoch": 1.63125,
2363
+ "grad_norm": 3.2106664180755615,
2364
+ "learning_rate": 0.0002,
2365
+ "loss": 0.0356,
2366
+ "mean_token_accuracy": 1.0,
2367
+ "num_tokens": 919856.0,
2368
+ "step": 261
2369
+ },
2370
+ {
2371
+ "epoch": 1.6375,
2372
+ "grad_norm": 0.09959662705659866,
2373
+ "learning_rate": 0.0002,
2374
+ "loss": 0.0008,
2375
+ "mean_token_accuracy": 1.0,
2376
+ "num_tokens": 922912.0,
2377
+ "step": 262
2378
+ },
2379
+ {
2380
+ "epoch": 1.64375,
2381
+ "grad_norm": 3.7656471729278564,
2382
+ "learning_rate": 0.0002,
2383
+ "loss": 0.0371,
2384
+ "mean_token_accuracy": 0.96875,
2385
+ "num_tokens": 925968.0,
2386
+ "step": 263
2387
+ },
2388
+ {
2389
+ "epoch": 1.65,
2390
+ "grad_norm": 0.057951927185058594,
2391
+ "learning_rate": 0.0002,
2392
+ "loss": 0.0009,
2393
+ "mean_token_accuracy": 1.0,
2394
+ "num_tokens": 929024.0,
2395
+ "step": 264
2396
+ },
2397
+ {
2398
+ "epoch": 1.65625,
2399
+ "grad_norm": 0.7342841029167175,
2400
+ "learning_rate": 0.0002,
2401
+ "loss": 0.0082,
2402
+ "mean_token_accuracy": 1.0,
2403
+ "num_tokens": 932080.0,
2404
+ "step": 265
2405
+ },
2406
+ {
2407
+ "epoch": 1.6625,
2408
+ "grad_norm": 2.154627799987793,
2409
+ "learning_rate": 0.0002,
2410
+ "loss": 0.0259,
2411
+ "mean_token_accuracy": 0.96875,
2412
+ "num_tokens": 935136.0,
2413
+ "step": 266
2414
+ },
2415
+ {
2416
+ "epoch": 1.66875,
2417
+ "grad_norm": 0.036353010684251785,
2418
+ "learning_rate": 0.0002,
2419
+ "loss": 0.0008,
2420
+ "mean_token_accuracy": 1.0,
2421
+ "num_tokens": 938192.0,
2422
+ "step": 267
2423
+ },
2424
+ {
2425
+ "epoch": 1.675,
2426
+ "grad_norm": 0.013025197200477123,
2427
+ "learning_rate": 0.0002,
2428
+ "loss": 0.0002,
2429
+ "mean_token_accuracy": 1.0,
2430
+ "num_tokens": 941248.0,
2431
+ "step": 268
2432
+ },
2433
+ {
2434
+ "epoch": 1.68125,
2435
+ "grad_norm": 0.11635535210371017,
2436
+ "learning_rate": 0.0002,
2437
+ "loss": 0.0011,
2438
+ "mean_token_accuracy": 1.0,
2439
+ "num_tokens": 944304.0,
2440
+ "step": 269
2441
+ },
2442
+ {
2443
+ "epoch": 1.6875,
2444
+ "grad_norm": 1.7156617641448975,
2445
+ "learning_rate": 0.0002,
2446
+ "loss": 0.0207,
2447
+ "mean_token_accuracy": 1.0,
2448
+ "num_tokens": 947360.0,
2449
+ "step": 270
2450
+ },
2451
+ {
2452
+ "epoch": 1.69375,
2453
+ "grad_norm": 0.04577786847949028,
2454
+ "learning_rate": 0.0002,
2455
+ "loss": 0.001,
2456
+ "mean_token_accuracy": 1.0,
2457
+ "num_tokens": 950416.0,
2458
+ "step": 271
2459
+ },
2460
+ {
2461
+ "epoch": 1.7,
2462
+ "grad_norm": 0.22839251160621643,
2463
+ "learning_rate": 0.0002,
2464
+ "loss": 0.0017,
2465
+ "mean_token_accuracy": 1.0,
2466
+ "num_tokens": 953472.0,
2467
+ "step": 272
2468
+ },
2469
+ {
2470
+ "epoch": 1.70625,
2471
+ "grad_norm": 3.693096399307251,
2472
+ "learning_rate": 0.0002,
2473
+ "loss": 0.0622,
2474
+ "mean_token_accuracy": 0.96875,
2475
+ "num_tokens": 956528.0,
2476
+ "step": 273
2477
+ },
2478
+ {
2479
+ "epoch": 1.7125,
2480
+ "grad_norm": 0.9133250117301941,
2481
+ "learning_rate": 0.0002,
2482
+ "loss": 0.0083,
2483
+ "mean_token_accuracy": 1.0,
2484
+ "num_tokens": 959584.0,
2485
+ "step": 274
2486
+ },
2487
+ {
2488
+ "epoch": 1.71875,
2489
+ "grad_norm": 0.2589430809020996,
2490
+ "learning_rate": 0.0002,
2491
+ "loss": 0.0025,
2492
+ "mean_token_accuracy": 1.0,
2493
+ "num_tokens": 962640.0,
2494
+ "step": 275
2495
+ },
2496
+ {
2497
+ "epoch": 1.725,
2498
+ "grad_norm": 0.027116835117340088,
2499
+ "learning_rate": 0.0002,
2500
+ "loss": 0.0006,
2501
+ "mean_token_accuracy": 1.0,
2502
+ "num_tokens": 965696.0,
2503
+ "step": 276
2504
+ },
2505
+ {
2506
+ "epoch": 1.73125,
2507
+ "grad_norm": 2.58829402923584,
2508
+ "learning_rate": 0.0002,
2509
+ "loss": 0.0452,
2510
+ "mean_token_accuracy": 0.96875,
2511
+ "num_tokens": 968752.0,
2512
+ "step": 277
2513
+ },
2514
+ {
2515
+ "epoch": 1.7375,
2516
+ "grad_norm": 0.12397311627864838,
2517
+ "learning_rate": 0.0002,
2518
+ "loss": 0.0009,
2519
+ "mean_token_accuracy": 1.0,
2520
+ "num_tokens": 971808.0,
2521
+ "step": 278
2522
+ },
2523
+ {
2524
+ "epoch": 1.74375,
2525
+ "grad_norm": 0.032883170992136,
2526
+ "learning_rate": 0.0002,
2527
+ "loss": 0.0005,
2528
+ "mean_token_accuracy": 1.0,
2529
+ "num_tokens": 974864.0,
2530
+ "step": 279
2531
+ },
2532
+ {
2533
+ "epoch": 1.75,
2534
+ "grad_norm": 0.024885807186365128,
2535
+ "learning_rate": 0.0002,
2536
+ "loss": 0.0005,
2537
+ "mean_token_accuracy": 1.0,
2538
+ "num_tokens": 977920.0,
2539
+ "step": 280
2540
+ },
2541
+ {
2542
+ "epoch": 1.75625,
2543
+ "grad_norm": 0.11918771266937256,
2544
+ "learning_rate": 0.0002,
2545
+ "loss": 0.0019,
2546
+ "mean_token_accuracy": 1.0,
2547
+ "num_tokens": 980976.0,
2548
+ "step": 281
2549
+ },
2550
+ {
2551
+ "epoch": 1.7625,
2552
+ "grad_norm": 5.538094997406006,
2553
+ "learning_rate": 0.0002,
2554
+ "loss": 0.062,
2555
+ "mean_token_accuracy": 0.96875,
2556
+ "num_tokens": 984032.0,
2557
+ "step": 282
2558
+ },
2559
+ {
2560
+ "epoch": 1.76875,
2561
+ "grad_norm": 0.20568646490573883,
2562
+ "learning_rate": 0.0002,
2563
+ "loss": 0.0027,
2564
+ "mean_token_accuracy": 1.0,
2565
+ "num_tokens": 987088.0,
2566
+ "step": 283
2567
+ },
2568
+ {
2569
+ "epoch": 1.775,
2570
+ "grad_norm": 3.7988224029541016,
2571
+ "learning_rate": 0.0002,
2572
+ "loss": 0.0376,
2573
+ "mean_token_accuracy": 0.96875,
2574
+ "num_tokens": 990144.0,
2575
+ "step": 284
2576
+ },
2577
+ {
2578
+ "epoch": 1.78125,
2579
+ "grad_norm": 0.0016885192599147558,
2580
+ "learning_rate": 0.0002,
2581
+ "loss": 0.0,
2582
+ "mean_token_accuracy": 1.0,
2583
+ "num_tokens": 993200.0,
2584
+ "step": 285
2585
+ },
2586
+ {
2587
+ "epoch": 1.7875,
2588
+ "grad_norm": 0.08663017302751541,
2589
+ "learning_rate": 0.0002,
2590
+ "loss": 0.001,
2591
+ "mean_token_accuracy": 1.0,
2592
+ "num_tokens": 996256.0,
2593
+ "step": 286
2594
+ },
2595
+ {
2596
+ "epoch": 1.79375,
2597
+ "grad_norm": 3.149121046066284,
2598
+ "learning_rate": 0.0002,
2599
+ "loss": 0.0219,
2600
+ "mean_token_accuracy": 1.0,
2601
+ "num_tokens": 999312.0,
2602
+ "step": 287
2603
+ },
2604
+ {
2605
+ "epoch": 1.8,
2606
+ "grad_norm": 1.3945331573486328,
2607
+ "learning_rate": 0.0002,
2608
+ "loss": 0.0077,
2609
+ "mean_token_accuracy": 1.0,
2610
+ "num_tokens": 1002368.0,
2611
+ "step": 288
2612
+ },
2613
+ {
2614
+ "epoch": 1.80625,
2615
+ "grad_norm": 0.5408581495285034,
2616
+ "learning_rate": 0.0002,
2617
+ "loss": 0.0055,
2618
+ "mean_token_accuracy": 1.0,
2619
+ "num_tokens": 1005424.0,
2620
+ "step": 289
2621
+ },
2622
+ {
2623
+ "epoch": 1.8125,
2624
+ "grad_norm": 5.049882888793945,
2625
+ "learning_rate": 0.0002,
2626
+ "loss": 0.1378,
2627
+ "mean_token_accuracy": 0.96875,
2628
+ "num_tokens": 1008480.0,
2629
+ "step": 290
2630
+ },
2631
+ {
2632
+ "epoch": 1.81875,
2633
+ "grad_norm": 1.854457974433899,
2634
+ "learning_rate": 0.0002,
2635
+ "loss": 0.0117,
2636
+ "mean_token_accuracy": 1.0,
2637
+ "num_tokens": 1011536.0,
2638
+ "step": 291
2639
+ },
2640
+ {
2641
+ "epoch": 1.825,
2642
+ "grad_norm": 5.968977451324463,
2643
+ "learning_rate": 0.0002,
2644
+ "loss": 0.0976,
2645
+ "mean_token_accuracy": 0.96875,
2646
+ "num_tokens": 1014592.0,
2647
+ "step": 292
2648
+ },
2649
+ {
2650
+ "epoch": 1.83125,
2651
+ "grad_norm": 0.39035147428512573,
2652
+ "learning_rate": 0.0002,
2653
+ "loss": 0.0026,
2654
+ "mean_token_accuracy": 1.0,
2655
+ "num_tokens": 1017648.0,
2656
+ "step": 293
2657
+ },
2658
+ {
2659
+ "epoch": 1.8375,
2660
+ "grad_norm": 0.5014843940734863,
2661
+ "learning_rate": 0.0002,
2662
+ "loss": 0.0037,
2663
+ "mean_token_accuracy": 1.0,
2664
+ "num_tokens": 1020704.0,
2665
+ "step": 294
2666
+ },
2667
+ {
2668
+ "epoch": 1.84375,
2669
+ "grad_norm": 7.198896884918213,
2670
+ "learning_rate": 0.0002,
2671
+ "loss": 0.113,
2672
+ "mean_token_accuracy": 0.96875,
2673
+ "num_tokens": 1023760.0,
2674
+ "step": 295
2675
+ },
2676
+ {
2677
+ "epoch": 1.85,
2678
+ "grad_norm": 5.184747695922852,
2679
+ "learning_rate": 0.0002,
2680
+ "loss": 0.047,
2681
+ "mean_token_accuracy": 0.96875,
2682
+ "num_tokens": 1026816.0,
2683
+ "step": 296
2684
+ },
2685
+ {
2686
+ "epoch": 1.85625,
2687
+ "grad_norm": 0.03395215794444084,
2688
+ "learning_rate": 0.0002,
2689
+ "loss": 0.0004,
2690
+ "mean_token_accuracy": 1.0,
2691
+ "num_tokens": 1029872.0,
2692
+ "step": 297
2693
+ },
2694
+ {
2695
+ "epoch": 1.8625,
2696
+ "grad_norm": 0.002945690182968974,
2697
+ "learning_rate": 0.0002,
2698
+ "loss": 0.0001,
2699
+ "mean_token_accuracy": 1.0,
2700
+ "num_tokens": 1032928.0,
2701
+ "step": 298
2702
+ },
2703
+ {
2704
+ "epoch": 1.86875,
2705
+ "grad_norm": 0.009323005564510822,
2706
+ "learning_rate": 0.0002,
2707
+ "loss": 0.0001,
2708
+ "mean_token_accuracy": 1.0,
2709
+ "num_tokens": 1035984.0,
2710
+ "step": 299
2711
+ },
2712
+ {
2713
+ "epoch": 1.875,
2714
+ "grad_norm": 0.5545831918716431,
2715
+ "learning_rate": 0.0002,
2716
+ "loss": 0.0045,
2717
+ "mean_token_accuracy": 1.0,
2718
+ "num_tokens": 1039040.0,
2719
+ "step": 300
2720
+ },
2721
+ {
2722
+ "epoch": 1.88125,
2723
+ "grad_norm": 6.735720634460449,
2724
+ "learning_rate": 0.0002,
2725
+ "loss": 0.1614,
2726
+ "mean_token_accuracy": 0.96875,
2727
+ "num_tokens": 1042096.0,
2728
+ "step": 301
2729
+ },
2730
+ {
2731
+ "epoch": 1.8875,
2732
+ "grad_norm": 5.312285423278809,
2733
+ "learning_rate": 0.0002,
2734
+ "loss": 0.259,
2735
+ "mean_token_accuracy": 0.96875,
2736
+ "num_tokens": 1045152.0,
2737
+ "step": 302
2738
+ },
2739
+ {
2740
+ "epoch": 1.89375,
2741
+ "grad_norm": 3.6888339519500732,
2742
+ "learning_rate": 0.0002,
2743
+ "loss": 0.0775,
2744
+ "mean_token_accuracy": 0.96875,
2745
+ "num_tokens": 1048208.0,
2746
+ "step": 303
2747
+ },
2748
+ {
2749
+ "epoch": 1.9,
2750
+ "grad_norm": 1.1625453233718872,
2751
+ "learning_rate": 0.0002,
2752
+ "loss": 0.0084,
2753
+ "mean_token_accuracy": 1.0,
2754
+ "num_tokens": 1051264.0,
2755
+ "step": 304
2756
+ },
2757
+ {
2758
+ "epoch": 1.90625,
2759
+ "grad_norm": 2.7333149909973145,
2760
+ "learning_rate": 0.0002,
2761
+ "loss": 0.0198,
2762
+ "mean_token_accuracy": 1.0,
2763
+ "num_tokens": 1054320.0,
2764
+ "step": 305
2765
+ },
2766
+ {
2767
+ "epoch": 1.9125,
2768
+ "grad_norm": 6.946017265319824,
2769
+ "learning_rate": 0.0002,
2770
+ "loss": 0.0668,
2771
+ "mean_token_accuracy": 0.96875,
2772
+ "num_tokens": 1057376.0,
2773
+ "step": 306
2774
+ },
2775
+ {
2776
+ "epoch": 1.91875,
2777
+ "grad_norm": 0.8210978507995605,
2778
+ "learning_rate": 0.0002,
2779
+ "loss": 0.0047,
2780
+ "mean_token_accuracy": 1.0,
2781
+ "num_tokens": 1060432.0,
2782
+ "step": 307
2783
+ },
2784
+ {
2785
+ "epoch": 1.925,
2786
+ "grad_norm": 5.008805274963379,
2787
+ "learning_rate": 0.0002,
2788
+ "loss": 0.2898,
2789
+ "mean_token_accuracy": 0.96875,
2790
+ "num_tokens": 1063488.0,
2791
+ "step": 308
2792
+ },
2793
+ {
2794
+ "epoch": 1.93125,
2795
+ "grad_norm": 2.841658353805542,
2796
+ "learning_rate": 0.0002,
2797
+ "loss": 0.0158,
2798
+ "mean_token_accuracy": 1.0,
2799
+ "num_tokens": 1066544.0,
2800
+ "step": 309
2801
+ },
2802
+ {
2803
+ "epoch": 1.9375,
2804
+ "grad_norm": 5.415345191955566,
2805
+ "learning_rate": 0.0002,
2806
+ "loss": 0.3233,
2807
+ "mean_token_accuracy": 0.9375,
2808
+ "num_tokens": 1069600.0,
2809
+ "step": 310
2810
+ },
2811
+ {
2812
+ "epoch": 1.94375,
2813
+ "grad_norm": 4.167904853820801,
2814
+ "learning_rate": 0.0002,
2815
+ "loss": 0.0631,
2816
+ "mean_token_accuracy": 0.96875,
2817
+ "num_tokens": 1072656.0,
2818
+ "step": 311
2819
+ },
2820
+ {
2821
+ "epoch": 1.95,
2822
+ "grad_norm": 0.2593608498573303,
2823
+ "learning_rate": 0.0002,
2824
+ "loss": 0.0017,
2825
+ "mean_token_accuracy": 1.0,
2826
+ "num_tokens": 1075712.0,
2827
+ "step": 312
2828
+ },
2829
+ {
2830
+ "epoch": 1.95625,
2831
+ "grad_norm": 3.686906576156616,
2832
+ "learning_rate": 0.0002,
2833
+ "loss": 0.0414,
2834
+ "mean_token_accuracy": 0.96875,
2835
+ "num_tokens": 1078768.0,
2836
+ "step": 313
2837
+ },
2838
+ {
2839
+ "epoch": 1.9625,
2840
+ "grad_norm": 0.01392375398427248,
2841
+ "learning_rate": 0.0002,
2842
+ "loss": 0.0002,
2843
+ "mean_token_accuracy": 1.0,
2844
+ "num_tokens": 1081824.0,
2845
+ "step": 314
2846
+ },
2847
+ {
2848
+ "epoch": 1.96875,
2849
+ "grad_norm": 0.021308658644557,
2850
+ "learning_rate": 0.0002,
2851
+ "loss": 0.0002,
2852
+ "mean_token_accuracy": 1.0,
2853
+ "num_tokens": 1084880.0,
2854
+ "step": 315
2855
+ },
2856
+ {
2857
+ "epoch": 1.975,
2858
+ "grad_norm": 1.8390535116195679,
2859
+ "learning_rate": 0.0002,
2860
+ "loss": 0.0124,
2861
+ "mean_token_accuracy": 1.0,
2862
+ "num_tokens": 1087936.0,
2863
+ "step": 316
2864
+ },
2865
+ {
2866
+ "epoch": 1.98125,
2867
+ "grad_norm": 5.717790603637695,
2868
+ "learning_rate": 0.0002,
2869
+ "loss": 0.0658,
2870
+ "mean_token_accuracy": 0.96875,
2871
+ "num_tokens": 1090992.0,
2872
+ "step": 317
2873
+ },
2874
+ {
2875
+ "epoch": 1.9875,
2876
+ "grad_norm": 0.08841279149055481,
2877
+ "learning_rate": 0.0002,
2878
+ "loss": 0.0008,
2879
+ "mean_token_accuracy": 1.0,
2880
+ "num_tokens": 1094048.0,
2881
+ "step": 318
2882
+ },
2883
+ {
2884
+ "epoch": 1.99375,
2885
+ "grad_norm": 0.13375015556812286,
2886
+ "learning_rate": 0.0002,
2887
+ "loss": 0.0021,
2888
+ "mean_token_accuracy": 1.0,
2889
+ "num_tokens": 1097104.0,
2890
+ "step": 319
2891
+ },
2892
+ {
2893
+ "epoch": 2.0,
2894
+ "grad_norm": 0.138557568192482,
2895
+ "learning_rate": 0.0002,
2896
+ "loss": 0.0011,
2897
+ "mean_token_accuracy": 1.0,
2898
+ "num_tokens": 1100160.0,
2899
+ "step": 320
2900
+ }
2901
+ ],
2902
+ "logging_steps": 1,
2903
+ "max_steps": 320,
2904
+ "num_input_tokens_seen": 0,
2905
+ "num_train_epochs": 2,
2906
+ "save_steps": 200,
2907
+ "stateful_callbacks": {
2908
+ "TrainerControl": {
2909
+ "args": {
2910
+ "should_epoch_stop": false,
2911
+ "should_evaluate": false,
2912
+ "should_log": false,
2913
+ "should_save": true,
2914
+ "should_training_stop": true
2915
+ },
2916
+ "attributes": {}
2917
+ }
2918
+ },
2919
+ "total_flos": 4.548627979567104e+16,
2920
+ "train_batch_size": 16,
2921
+ "trial_name": null,
2922
+ "trial_params": null
2923
+ }
checkpoint-320/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a1e17f907302d781eecf6b58e9d4136a88ad4089312e1b26d13d23c8ff5af5cb
3
+ size 6161
checkpoint-320/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
16
+ ],
17
+ "eos_token": {
18
+ "content": "<|im_end|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "pad_token": {
25
+ "content": "<|endoftext|>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ }
31
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9c5ae00e602b8860cbd784ba82a8aa14e8feecec692e7076590d014d7b7fdafa
3
+ size 11421896
tokenizer_config.json ADDED
@@ -0,0 +1,207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ }
181
+ },
182
+ "additional_special_tokens": [
183
+ "<|im_start|>",
184
+ "<|im_end|>",
185
+ "<|object_ref_start|>",
186
+ "<|object_ref_end|>",
187
+ "<|box_start|>",
188
+ "<|box_end|>",
189
+ "<|quad_start|>",
190
+ "<|quad_end|>",
191
+ "<|vision_start|>",
192
+ "<|vision_end|>",
193
+ "<|vision_pad|>",
194
+ "<|image_pad|>",
195
+ "<|video_pad|>"
196
+ ],
197
+ "bos_token": null,
198
+ "clean_up_tokenization_spaces": false,
199
+ "eos_token": "<|im_end|>",
200
+ "errors": "replace",
201
+ "extra_special_tokens": {},
202
+ "model_max_length": 131072,
203
+ "pad_token": "<|endoftext|>",
204
+ "split_special_tokens": false,
205
+ "tokenizer_class": "Qwen2Tokenizer",
206
+ "unk_token": null
207
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a1e17f907302d781eecf6b58e9d4136a88ad4089312e1b26d13d23c8ff5af5cb
3
+ size 6161
training_args.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "lora_r": 16,
3
+ "lora_alpha": 16,
4
+ "lora_dropout": 0.05,
5
+ "lora_target_modules": "q_proj,v_proj",
6
+ "tuning_strategy": "lora",
7
+ "num_trainable_layers": 2,
8
+ "output_dir": "sft_prefill/prompt_id_3/qwen2-VL-7B-Instruct-syn-count-lora",
9
+ "num_train_epochs": 2,
10
+ "learning_rate": 0.0002,
11
+ "per_device_train_batch_size": 16,
12
+ "per_device_eval_batch_size": 16,
13
+ "gradient_accumulation_steps": 1,
14
+ "logging_steps": 10,
15
+ "eval_steps": 200,
16
+ "save_steps": 200,
17
+ "warmup_ratio": 0.03,
18
+ "weight_decay": 0.0,
19
+ "max_grad_norm": 0.3,
20
+ "lr_scheduler_type": "constant",
21
+ "bf16": true,
22
+ "tf32": true,
23
+ "gradient_checkpointing": true,
24
+ "optim": "adamw_torch_fused",
25
+ "ft_type": "SFT_Prefill",
26
+ "data_type": "small",
27
+ "prompt_id": 3
28
+ }
training_log.json ADDED
@@ -0,0 +1,1318 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "train_loss": [
3
+ {
4
+ "step": 1,
5
+ "loss": 0.4917
6
+ },
7
+ {
8
+ "step": 2,
9
+ "loss": 0.6049
10
+ },
11
+ {
12
+ "step": 3,
13
+ "loss": 0.6407
14
+ },
15
+ {
16
+ "step": 4,
17
+ "loss": 0.3972
18
+ },
19
+ {
20
+ "step": 5,
21
+ "loss": 0.453
22
+ },
23
+ {
24
+ "step": 6,
25
+ "loss": 0.418
26
+ },
27
+ {
28
+ "step": 7,
29
+ "loss": 0.303
30
+ },
31
+ {
32
+ "step": 8,
33
+ "loss": 0.1847
34
+ },
35
+ {
36
+ "step": 9,
37
+ "loss": 0.3514
38
+ },
39
+ {
40
+ "step": 10,
41
+ "loss": 0.2434
42
+ },
43
+ {
44
+ "step": 11,
45
+ "loss": 0.1577
46
+ },
47
+ {
48
+ "step": 12,
49
+ "loss": 0.209
50
+ },
51
+ {
52
+ "step": 13,
53
+ "loss": 0.2123
54
+ },
55
+ {
56
+ "step": 14,
57
+ "loss": 0.2712
58
+ },
59
+ {
60
+ "step": 15,
61
+ "loss": 0.2975
62
+ },
63
+ {
64
+ "step": 16,
65
+ "loss": 0.3977
66
+ },
67
+ {
68
+ "step": 17,
69
+ "loss": 0.3933
70
+ },
71
+ {
72
+ "step": 18,
73
+ "loss": 0.2866
74
+ },
75
+ {
76
+ "step": 19,
77
+ "loss": 0.4174
78
+ },
79
+ {
80
+ "step": 20,
81
+ "loss": 0.3588
82
+ },
83
+ {
84
+ "step": 21,
85
+ "loss": 0.1919
86
+ },
87
+ {
88
+ "step": 22,
89
+ "loss": 0.475
90
+ },
91
+ {
92
+ "step": 23,
93
+ "loss": 0.223
94
+ },
95
+ {
96
+ "step": 24,
97
+ "loss": 0.3825
98
+ },
99
+ {
100
+ "step": 25,
101
+ "loss": 0.1645
102
+ },
103
+ {
104
+ "step": 26,
105
+ "loss": 0.2517
106
+ },
107
+ {
108
+ "step": 27,
109
+ "loss": 0.1865
110
+ },
111
+ {
112
+ "step": 28,
113
+ "loss": 0.2887
114
+ },
115
+ {
116
+ "step": 29,
117
+ "loss": 0.2154
118
+ },
119
+ {
120
+ "step": 30,
121
+ "loss": 0.3605
122
+ },
123
+ {
124
+ "step": 31,
125
+ "loss": 0.1299
126
+ },
127
+ {
128
+ "step": 32,
129
+ "loss": 0.1772
130
+ },
131
+ {
132
+ "step": 33,
133
+ "loss": 0.3281
134
+ },
135
+ {
136
+ "step": 34,
137
+ "loss": 0.1973
138
+ },
139
+ {
140
+ "step": 35,
141
+ "loss": 0.2645
142
+ },
143
+ {
144
+ "step": 36,
145
+ "loss": 0.1938
146
+ },
147
+ {
148
+ "step": 37,
149
+ "loss": 0.1577
150
+ },
151
+ {
152
+ "step": 38,
153
+ "loss": 0.1917
154
+ },
155
+ {
156
+ "step": 39,
157
+ "loss": 0.0703
158
+ },
159
+ {
160
+ "step": 40,
161
+ "loss": 0.1402
162
+ },
163
+ {
164
+ "step": 41,
165
+ "loss": 0.1274
166
+ },
167
+ {
168
+ "step": 42,
169
+ "loss": 0.1871
170
+ },
171
+ {
172
+ "step": 43,
173
+ "loss": 0.2981
174
+ },
175
+ {
176
+ "step": 44,
177
+ "loss": 0.0962
178
+ },
179
+ {
180
+ "step": 45,
181
+ "loss": 0.1616
182
+ },
183
+ {
184
+ "step": 46,
185
+ "loss": 0.0614
186
+ },
187
+ {
188
+ "step": 47,
189
+ "loss": 0.1468
190
+ },
191
+ {
192
+ "step": 48,
193
+ "loss": 0.0482
194
+ },
195
+ {
196
+ "step": 49,
197
+ "loss": 0.1177
198
+ },
199
+ {
200
+ "step": 50,
201
+ "loss": 0.0788
202
+ },
203
+ {
204
+ "step": 51,
205
+ "loss": 0.1786
206
+ },
207
+ {
208
+ "step": 52,
209
+ "loss": 0.1685
210
+ },
211
+ {
212
+ "step": 53,
213
+ "loss": 0.1178
214
+ },
215
+ {
216
+ "step": 54,
217
+ "loss": 0.1016
218
+ },
219
+ {
220
+ "step": 55,
221
+ "loss": 0.1223
222
+ },
223
+ {
224
+ "step": 56,
225
+ "loss": 0.0331
226
+ },
227
+ {
228
+ "step": 57,
229
+ "loss": 0.1314
230
+ },
231
+ {
232
+ "step": 58,
233
+ "loss": 0.0504
234
+ },
235
+ {
236
+ "step": 59,
237
+ "loss": 0.0327
238
+ },
239
+ {
240
+ "step": 60,
241
+ "loss": 0.0766
242
+ },
243
+ {
244
+ "step": 61,
245
+ "loss": 0.0167
246
+ },
247
+ {
248
+ "step": 62,
249
+ "loss": 0.0176
250
+ },
251
+ {
252
+ "step": 63,
253
+ "loss": 0.1523
254
+ },
255
+ {
256
+ "step": 64,
257
+ "loss": 0.0172
258
+ },
259
+ {
260
+ "step": 65,
261
+ "loss": 0.1337
262
+ },
263
+ {
264
+ "step": 66,
265
+ "loss": 0.18
266
+ },
267
+ {
268
+ "step": 67,
269
+ "loss": 0.0076
270
+ },
271
+ {
272
+ "step": 68,
273
+ "loss": 0.0051
274
+ },
275
+ {
276
+ "step": 69,
277
+ "loss": 0.0408
278
+ },
279
+ {
280
+ "step": 70,
281
+ "loss": 0.0744
282
+ },
283
+ {
284
+ "step": 71,
285
+ "loss": 0.0123
286
+ },
287
+ {
288
+ "step": 72,
289
+ "loss": 0.006
290
+ },
291
+ {
292
+ "step": 73,
293
+ "loss": 0.3016
294
+ },
295
+ {
296
+ "step": 74,
297
+ "loss": 0.1201
298
+ },
299
+ {
300
+ "step": 75,
301
+ "loss": 0.037
302
+ },
303
+ {
304
+ "step": 76,
305
+ "loss": 0.0413
306
+ },
307
+ {
308
+ "step": 77,
309
+ "loss": 0.0697
310
+ },
311
+ {
312
+ "step": 78,
313
+ "loss": 0.0485
314
+ },
315
+ {
316
+ "step": 79,
317
+ "loss": 0.0686
318
+ },
319
+ {
320
+ "step": 80,
321
+ "loss": 0.0649
322
+ },
323
+ {
324
+ "step": 81,
325
+ "loss": 0.1002
326
+ },
327
+ {
328
+ "step": 82,
329
+ "loss": 0.0235
330
+ },
331
+ {
332
+ "step": 83,
333
+ "loss": 0.1955
334
+ },
335
+ {
336
+ "step": 84,
337
+ "loss": 0.1692
338
+ },
339
+ {
340
+ "step": 85,
341
+ "loss": 0.0178
342
+ },
343
+ {
344
+ "step": 86,
345
+ "loss": 0.0282
346
+ },
347
+ {
348
+ "step": 87,
349
+ "loss": 0.0191
350
+ },
351
+ {
352
+ "step": 88,
353
+ "loss": 0.0177
354
+ },
355
+ {
356
+ "step": 89,
357
+ "loss": 0.0207
358
+ },
359
+ {
360
+ "step": 90,
361
+ "loss": 0.0082
362
+ },
363
+ {
364
+ "step": 91,
365
+ "loss": 0.0583
366
+ },
367
+ {
368
+ "step": 92,
369
+ "loss": 0.259
370
+ },
371
+ {
372
+ "step": 93,
373
+ "loss": 0.0295
374
+ },
375
+ {
376
+ "step": 94,
377
+ "loss": 0.0014
378
+ },
379
+ {
380
+ "step": 95,
381
+ "loss": 0.024
382
+ },
383
+ {
384
+ "step": 96,
385
+ "loss": 0.0795
386
+ },
387
+ {
388
+ "step": 97,
389
+ "loss": 0.1903
390
+ },
391
+ {
392
+ "step": 98,
393
+ "loss": 0.2885
394
+ },
395
+ {
396
+ "step": 99,
397
+ "loss": 0.2661
398
+ },
399
+ {
400
+ "step": 100,
401
+ "loss": 0.0502
402
+ },
403
+ {
404
+ "step": 101,
405
+ "loss": 0.0041
406
+ },
407
+ {
408
+ "step": 102,
409
+ "loss": 0.0947
410
+ },
411
+ {
412
+ "step": 103,
413
+ "loss": 0.1131
414
+ },
415
+ {
416
+ "step": 104,
417
+ "loss": 0.1105
418
+ },
419
+ {
420
+ "step": 105,
421
+ "loss": 0.0794
422
+ },
423
+ {
424
+ "step": 106,
425
+ "loss": 0.0057
426
+ },
427
+ {
428
+ "step": 107,
429
+ "loss": 0.3953
430
+ },
431
+ {
432
+ "step": 108,
433
+ "loss": 0.1099
434
+ },
435
+ {
436
+ "step": 109,
437
+ "loss": 0.2198
438
+ },
439
+ {
440
+ "step": 110,
441
+ "loss": 0.0586
442
+ },
443
+ {
444
+ "step": 111,
445
+ "loss": 0.3492
446
+ },
447
+ {
448
+ "step": 112,
449
+ "loss": 0.003
450
+ },
451
+ {
452
+ "step": 113,
453
+ "loss": 0.0365
454
+ },
455
+ {
456
+ "step": 114,
457
+ "loss": 0.1228
458
+ },
459
+ {
460
+ "step": 115,
461
+ "loss": 0.0174
462
+ },
463
+ {
464
+ "step": 116,
465
+ "loss": 0.0473
466
+ },
467
+ {
468
+ "step": 117,
469
+ "loss": 0.073
470
+ },
471
+ {
472
+ "step": 118,
473
+ "loss": 0.0736
474
+ },
475
+ {
476
+ "step": 119,
477
+ "loss": 0.3511
478
+ },
479
+ {
480
+ "step": 120,
481
+ "loss": 0.1691
482
+ },
483
+ {
484
+ "step": 121,
485
+ "loss": 0.2357
486
+ },
487
+ {
488
+ "step": 122,
489
+ "loss": 0.1983
490
+ },
491
+ {
492
+ "step": 123,
493
+ "loss": 0.043
494
+ },
495
+ {
496
+ "step": 124,
497
+ "loss": 0.0069
498
+ },
499
+ {
500
+ "step": 125,
501
+ "loss": 0.0348
502
+ },
503
+ {
504
+ "step": 126,
505
+ "loss": 0.0071
506
+ },
507
+ {
508
+ "step": 127,
509
+ "loss": 0.0153
510
+ },
511
+ {
512
+ "step": 128,
513
+ "loss": 0.0071
514
+ },
515
+ {
516
+ "step": 129,
517
+ "loss": 0.114
518
+ },
519
+ {
520
+ "step": 130,
521
+ "loss": 0.027
522
+ },
523
+ {
524
+ "step": 131,
525
+ "loss": 0.2155
526
+ },
527
+ {
528
+ "step": 132,
529
+ "loss": 0.0732
530
+ },
531
+ {
532
+ "step": 133,
533
+ "loss": 0.0479
534
+ },
535
+ {
536
+ "step": 134,
537
+ "loss": 0.1001
538
+ },
539
+ {
540
+ "step": 135,
541
+ "loss": 0.0795
542
+ },
543
+ {
544
+ "step": 136,
545
+ "loss": 0.0143
546
+ },
547
+ {
548
+ "step": 137,
549
+ "loss": 0.025
550
+ },
551
+ {
552
+ "step": 138,
553
+ "loss": 0.0375
554
+ },
555
+ {
556
+ "step": 139,
557
+ "loss": 0.0464
558
+ },
559
+ {
560
+ "step": 140,
561
+ "loss": 0.135
562
+ },
563
+ {
564
+ "step": 141,
565
+ "loss": 0.0437
566
+ },
567
+ {
568
+ "step": 142,
569
+ "loss": 0.1026
570
+ },
571
+ {
572
+ "step": 143,
573
+ "loss": 0.104
574
+ },
575
+ {
576
+ "step": 144,
577
+ "loss": 0.16
578
+ },
579
+ {
580
+ "step": 145,
581
+ "loss": 0.1841
582
+ },
583
+ {
584
+ "step": 146,
585
+ "loss": 0.2612
586
+ },
587
+ {
588
+ "step": 147,
589
+ "loss": 0.047
590
+ },
591
+ {
592
+ "step": 148,
593
+ "loss": 0.0236
594
+ },
595
+ {
596
+ "step": 149,
597
+ "loss": 0.061
598
+ },
599
+ {
600
+ "step": 150,
601
+ "loss": 0.3053
602
+ },
603
+ {
604
+ "step": 151,
605
+ "loss": 0.1405
606
+ },
607
+ {
608
+ "step": 152,
609
+ "loss": 0.1131
610
+ },
611
+ {
612
+ "step": 153,
613
+ "loss": 0.0147
614
+ },
615
+ {
616
+ "step": 154,
617
+ "loss": 0.0888
618
+ },
619
+ {
620
+ "step": 155,
621
+ "loss": 0.0136
622
+ },
623
+ {
624
+ "step": 156,
625
+ "loss": 0.0091
626
+ },
627
+ {
628
+ "step": 157,
629
+ "loss": 0.011
630
+ },
631
+ {
632
+ "step": 158,
633
+ "loss": 0.0063
634
+ },
635
+ {
636
+ "step": 159,
637
+ "loss": 0.014
638
+ },
639
+ {
640
+ "step": 160,
641
+ "loss": 0.0344
642
+ },
643
+ {
644
+ "step": 161,
645
+ "loss": 0.0981
646
+ },
647
+ {
648
+ "step": 162,
649
+ "loss": 0.1965
650
+ },
651
+ {
652
+ "step": 163,
653
+ "loss": 0.0887
654
+ },
655
+ {
656
+ "step": 164,
657
+ "loss": 0.0409
658
+ },
659
+ {
660
+ "step": 165,
661
+ "loss": 0.0588
662
+ },
663
+ {
664
+ "step": 166,
665
+ "loss": 0.0632
666
+ },
667
+ {
668
+ "step": 167,
669
+ "loss": 0.0903
670
+ },
671
+ {
672
+ "step": 168,
673
+ "loss": 0.0796
674
+ },
675
+ {
676
+ "step": 169,
677
+ "loss": 0.0415
678
+ },
679
+ {
680
+ "step": 170,
681
+ "loss": 0.0989
682
+ },
683
+ {
684
+ "step": 171,
685
+ "loss": 0.0974
686
+ },
687
+ {
688
+ "step": 172,
689
+ "loss": 0.0363
690
+ },
691
+ {
692
+ "step": 173,
693
+ "loss": 0.0086
694
+ },
695
+ {
696
+ "step": 174,
697
+ "loss": 0.005
698
+ },
699
+ {
700
+ "step": 175,
701
+ "loss": 0.1765
702
+ },
703
+ {
704
+ "step": 176,
705
+ "loss": 0.2432
706
+ },
707
+ {
708
+ "step": 177,
709
+ "loss": 0.0805
710
+ },
711
+ {
712
+ "step": 178,
713
+ "loss": 0.0227
714
+ },
715
+ {
716
+ "step": 179,
717
+ "loss": 0.3566
718
+ },
719
+ {
720
+ "step": 180,
721
+ "loss": 0.0232
722
+ },
723
+ {
724
+ "step": 181,
725
+ "loss": 0.1591
726
+ },
727
+ {
728
+ "step": 182,
729
+ "loss": 0.0734
730
+ },
731
+ {
732
+ "step": 183,
733
+ "loss": 0.0099
734
+ },
735
+ {
736
+ "step": 184,
737
+ "loss": 0.0127
738
+ },
739
+ {
740
+ "step": 185,
741
+ "loss": 0.0028
742
+ },
743
+ {
744
+ "step": 186,
745
+ "loss": 0.0022
746
+ },
747
+ {
748
+ "step": 187,
749
+ "loss": 0.0082
750
+ },
751
+ {
752
+ "step": 188,
753
+ "loss": 0.006
754
+ },
755
+ {
756
+ "step": 189,
757
+ "loss": 0.0506
758
+ },
759
+ {
760
+ "step": 190,
761
+ "loss": 0.0107
762
+ },
763
+ {
764
+ "step": 191,
765
+ "loss": 0.0287
766
+ },
767
+ {
768
+ "step": 192,
769
+ "loss": 0.0021
770
+ },
771
+ {
772
+ "step": 193,
773
+ "loss": 0.1491
774
+ },
775
+ {
776
+ "step": 194,
777
+ "loss": 0.1414
778
+ },
779
+ {
780
+ "step": 195,
781
+ "loss": 0.003
782
+ },
783
+ {
784
+ "step": 196,
785
+ "loss": 0.0506
786
+ },
787
+ {
788
+ "step": 197,
789
+ "loss": 0.0182
790
+ },
791
+ {
792
+ "step": 198,
793
+ "loss": 0.073
794
+ },
795
+ {
796
+ "step": 199,
797
+ "loss": 0.0265
798
+ },
799
+ {
800
+ "step": 200,
801
+ "loss": 0.0079
802
+ },
803
+ {
804
+ "step": 201,
805
+ "loss": 0.0569
806
+ },
807
+ {
808
+ "step": 202,
809
+ "loss": 0.0138
810
+ },
811
+ {
812
+ "step": 203,
813
+ "loss": 0.0001
814
+ },
815
+ {
816
+ "step": 204,
817
+ "loss": 0.2567
818
+ },
819
+ {
820
+ "step": 205,
821
+ "loss": 0.0002
822
+ },
823
+ {
824
+ "step": 206,
825
+ "loss": 0.0142
826
+ },
827
+ {
828
+ "step": 207,
829
+ "loss": 0.0546
830
+ },
831
+ {
832
+ "step": 208,
833
+ "loss": 0.0485
834
+ },
835
+ {
836
+ "step": 209,
837
+ "loss": 0.1518
838
+ },
839
+ {
840
+ "step": 210,
841
+ "loss": 0.1869
842
+ },
843
+ {
844
+ "step": 211,
845
+ "loss": 0.2424
846
+ },
847
+ {
848
+ "step": 212,
849
+ "loss": 0.0379
850
+ },
851
+ {
852
+ "step": 213,
853
+ "loss": 0.0088
854
+ },
855
+ {
856
+ "step": 214,
857
+ "loss": 0.001
858
+ },
859
+ {
860
+ "step": 215,
861
+ "loss": 0.0102
862
+ },
863
+ {
864
+ "step": 216,
865
+ "loss": 0.1525
866
+ },
867
+ {
868
+ "step": 217,
869
+ "loss": 0.0018
870
+ },
871
+ {
872
+ "step": 218,
873
+ "loss": 0.0026
874
+ },
875
+ {
876
+ "step": 219,
877
+ "loss": 0.0031
878
+ },
879
+ {
880
+ "step": 220,
881
+ "loss": 0.0206
882
+ },
883
+ {
884
+ "step": 221,
885
+ "loss": 0.1205
886
+ },
887
+ {
888
+ "step": 222,
889
+ "loss": 0.01
890
+ },
891
+ {
892
+ "step": 223,
893
+ "loss": 0.0597
894
+ },
895
+ {
896
+ "step": 224,
897
+ "loss": 0.1187
898
+ },
899
+ {
900
+ "step": 225,
901
+ "loss": 0.0635
902
+ },
903
+ {
904
+ "step": 226,
905
+ "loss": 0.1923
906
+ },
907
+ {
908
+ "step": 227,
909
+ "loss": 0.0098
910
+ },
911
+ {
912
+ "step": 228,
913
+ "loss": 0.0003
914
+ },
915
+ {
916
+ "step": 229,
917
+ "loss": 0.0434
918
+ },
919
+ {
920
+ "step": 230,
921
+ "loss": 0.005
922
+ },
923
+ {
924
+ "step": 231,
925
+ "loss": 0.0029
926
+ },
927
+ {
928
+ "step": 232,
929
+ "loss": 0.0043
930
+ },
931
+ {
932
+ "step": 233,
933
+ "loss": 0.0057
934
+ },
935
+ {
936
+ "step": 234,
937
+ "loss": 0.0003
938
+ },
939
+ {
940
+ "step": 235,
941
+ "loss": 0.0005
942
+ },
943
+ {
944
+ "step": 236,
945
+ "loss": 0.0244
946
+ },
947
+ {
948
+ "step": 237,
949
+ "loss": 0.0776
950
+ },
951
+ {
952
+ "step": 238,
953
+ "loss": 0.0203
954
+ },
955
+ {
956
+ "step": 239,
957
+ "loss": 0.2391
958
+ },
959
+ {
960
+ "step": 240,
961
+ "loss": 0.0533
962
+ },
963
+ {
964
+ "step": 241,
965
+ "loss": 0.0536
966
+ },
967
+ {
968
+ "step": 242,
969
+ "loss": 0.0045
970
+ },
971
+ {
972
+ "step": 243,
973
+ "loss": 0.0131
974
+ },
975
+ {
976
+ "step": 244,
977
+ "loss": 0.0037
978
+ },
979
+ {
980
+ "step": 245,
981
+ "loss": 0.0182
982
+ },
983
+ {
984
+ "step": 246,
985
+ "loss": 0.0078
986
+ },
987
+ {
988
+ "step": 247,
989
+ "loss": 0.0413
990
+ },
991
+ {
992
+ "step": 248,
993
+ "loss": 0.1189
994
+ },
995
+ {
996
+ "step": 249,
997
+ "loss": 0.0267
998
+ },
999
+ {
1000
+ "step": 250,
1001
+ "loss": 0.0507
1002
+ },
1003
+ {
1004
+ "step": 251,
1005
+ "loss": 0.0261
1006
+ },
1007
+ {
1008
+ "step": 252,
1009
+ "loss": 0.0043
1010
+ },
1011
+ {
1012
+ "step": 253,
1013
+ "loss": 0.1643
1014
+ },
1015
+ {
1016
+ "step": 254,
1017
+ "loss": 0.4695
1018
+ },
1019
+ {
1020
+ "step": 255,
1021
+ "loss": 0.0008
1022
+ },
1023
+ {
1024
+ "step": 256,
1025
+ "loss": 0.0066
1026
+ },
1027
+ {
1028
+ "step": 257,
1029
+ "loss": 0.0042
1030
+ },
1031
+ {
1032
+ "step": 258,
1033
+ "loss": 0.2334
1034
+ },
1035
+ {
1036
+ "step": 259,
1037
+ "loss": 0.0046
1038
+ },
1039
+ {
1040
+ "step": 260,
1041
+ "loss": 0.0207
1042
+ },
1043
+ {
1044
+ "step": 261,
1045
+ "loss": 0.0356
1046
+ },
1047
+ {
1048
+ "step": 262,
1049
+ "loss": 0.0008
1050
+ },
1051
+ {
1052
+ "step": 263,
1053
+ "loss": 0.0371
1054
+ },
1055
+ {
1056
+ "step": 264,
1057
+ "loss": 0.0009
1058
+ },
1059
+ {
1060
+ "step": 265,
1061
+ "loss": 0.0082
1062
+ },
1063
+ {
1064
+ "step": 266,
1065
+ "loss": 0.0259
1066
+ },
1067
+ {
1068
+ "step": 267,
1069
+ "loss": 0.0008
1070
+ },
1071
+ {
1072
+ "step": 268,
1073
+ "loss": 0.0002
1074
+ },
1075
+ {
1076
+ "step": 269,
1077
+ "loss": 0.0011
1078
+ },
1079
+ {
1080
+ "step": 270,
1081
+ "loss": 0.0207
1082
+ },
1083
+ {
1084
+ "step": 271,
1085
+ "loss": 0.001
1086
+ },
1087
+ {
1088
+ "step": 272,
1089
+ "loss": 0.0017
1090
+ },
1091
+ {
1092
+ "step": 273,
1093
+ "loss": 0.0622
1094
+ },
1095
+ {
1096
+ "step": 274,
1097
+ "loss": 0.0083
1098
+ },
1099
+ {
1100
+ "step": 275,
1101
+ "loss": 0.0025
1102
+ },
1103
+ {
1104
+ "step": 276,
1105
+ "loss": 0.0006
1106
+ },
1107
+ {
1108
+ "step": 277,
1109
+ "loss": 0.0452
1110
+ },
1111
+ {
1112
+ "step": 278,
1113
+ "loss": 0.0009
1114
+ },
1115
+ {
1116
+ "step": 279,
1117
+ "loss": 0.0005
1118
+ },
1119
+ {
1120
+ "step": 280,
1121
+ "loss": 0.0005
1122
+ },
1123
+ {
1124
+ "step": 281,
1125
+ "loss": 0.0019
1126
+ },
1127
+ {
1128
+ "step": 282,
1129
+ "loss": 0.062
1130
+ },
1131
+ {
1132
+ "step": 283,
1133
+ "loss": 0.0027
1134
+ },
1135
+ {
1136
+ "step": 284,
1137
+ "loss": 0.0376
1138
+ },
1139
+ {
1140
+ "step": 285,
1141
+ "loss": 0.0
1142
+ },
1143
+ {
1144
+ "step": 286,
1145
+ "loss": 0.001
1146
+ },
1147
+ {
1148
+ "step": 287,
1149
+ "loss": 0.0219
1150
+ },
1151
+ {
1152
+ "step": 288,
1153
+ "loss": 0.0077
1154
+ },
1155
+ {
1156
+ "step": 289,
1157
+ "loss": 0.0055
1158
+ },
1159
+ {
1160
+ "step": 290,
1161
+ "loss": 0.1378
1162
+ },
1163
+ {
1164
+ "step": 291,
1165
+ "loss": 0.0117
1166
+ },
1167
+ {
1168
+ "step": 292,
1169
+ "loss": 0.0976
1170
+ },
1171
+ {
1172
+ "step": 293,
1173
+ "loss": 0.0026
1174
+ },
1175
+ {
1176
+ "step": 294,
1177
+ "loss": 0.0037
1178
+ },
1179
+ {
1180
+ "step": 295,
1181
+ "loss": 0.113
1182
+ },
1183
+ {
1184
+ "step": 296,
1185
+ "loss": 0.047
1186
+ },
1187
+ {
1188
+ "step": 297,
1189
+ "loss": 0.0004
1190
+ },
1191
+ {
1192
+ "step": 298,
1193
+ "loss": 0.0001
1194
+ },
1195
+ {
1196
+ "step": 299,
1197
+ "loss": 0.0001
1198
+ },
1199
+ {
1200
+ "step": 300,
1201
+ "loss": 0.0045
1202
+ },
1203
+ {
1204
+ "step": 301,
1205
+ "loss": 0.1614
1206
+ },
1207
+ {
1208
+ "step": 302,
1209
+ "loss": 0.259
1210
+ },
1211
+ {
1212
+ "step": 303,
1213
+ "loss": 0.0775
1214
+ },
1215
+ {
1216
+ "step": 304,
1217
+ "loss": 0.0084
1218
+ },
1219
+ {
1220
+ "step": 305,
1221
+ "loss": 0.0198
1222
+ },
1223
+ {
1224
+ "step": 306,
1225
+ "loss": 0.0668
1226
+ },
1227
+ {
1228
+ "step": 307,
1229
+ "loss": 0.0047
1230
+ },
1231
+ {
1232
+ "step": 308,
1233
+ "loss": 0.2898
1234
+ },
1235
+ {
1236
+ "step": 309,
1237
+ "loss": 0.0158
1238
+ },
1239
+ {
1240
+ "step": 310,
1241
+ "loss": 0.3233
1242
+ },
1243
+ {
1244
+ "step": 311,
1245
+ "loss": 0.0631
1246
+ },
1247
+ {
1248
+ "step": 312,
1249
+ "loss": 0.0017
1250
+ },
1251
+ {
1252
+ "step": 313,
1253
+ "loss": 0.0414
1254
+ },
1255
+ {
1256
+ "step": 314,
1257
+ "loss": 0.0002
1258
+ },
1259
+ {
1260
+ "step": 315,
1261
+ "loss": 0.0002
1262
+ },
1263
+ {
1264
+ "step": 316,
1265
+ "loss": 0.0124
1266
+ },
1267
+ {
1268
+ "step": 317,
1269
+ "loss": 0.0658
1270
+ },
1271
+ {
1272
+ "step": 318,
1273
+ "loss": 0.0008
1274
+ },
1275
+ {
1276
+ "step": 319,
1277
+ "loss": 0.0021
1278
+ },
1279
+ {
1280
+ "step": 320,
1281
+ "loss": 0.0011
1282
+ }
1283
+ ],
1284
+ "eval_loss": [
1285
+ {
1286
+ "step": 200,
1287
+ "eval_loss": 0.07848864793777466
1288
+ }
1289
+ ],
1290
+ "args": {
1291
+ "lora_r": 16,
1292
+ "lora_alpha": 16,
1293
+ "lora_dropout": 0.05,
1294
+ "lora_target_modules": "q_proj,v_proj",
1295
+ "tuning_strategy": "lora",
1296
+ "num_trainable_layers": 2,
1297
+ "output_dir": "sft_prefill/prompt_id_3/qwen2-VL-7B-Instruct-syn-count-lora",
1298
+ "num_train_epochs": 2,
1299
+ "learning_rate": 0.0002,
1300
+ "per_device_train_batch_size": 16,
1301
+ "per_device_eval_batch_size": 16,
1302
+ "gradient_accumulation_steps": 1,
1303
+ "logging_steps": 10,
1304
+ "eval_steps": 200,
1305
+ "save_steps": 200,
1306
+ "warmup_ratio": 0.03,
1307
+ "weight_decay": 0.0,
1308
+ "max_grad_norm": 0.3,
1309
+ "lr_scheduler_type": "constant",
1310
+ "bf16": true,
1311
+ "tf32": true,
1312
+ "gradient_checkpointing": true,
1313
+ "optim": "adamw_torch_fused",
1314
+ "ft_type": "SFT_Prefill",
1315
+ "data_type": "small",
1316
+ "prompt_id": 3
1317
+ }
1318
+ }
vocab.json ADDED
The diff for this file is too large to render. See raw diff