danhtran2mind committed on
Commit
dbb27a2
·
verified ·
1 Parent(s): 347094e

Add files using upload-large-folder tool

Browse files
.gitattributes CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
37
+ checkpoint-13200/tokenizer.json filter=lfs diff=lfs merge=lfs -text
38
+ checkpoint-13230/tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: unsloth/qwen3-0.6b-unsloth-bnb-4bit
3
+ library_name: peft
4
+ model_name: Qwen-3-0.6B-Reasoning-Vi-Medical-LoRA
5
+ tags:
6
+ - generated_from_trainer
7
+ - sft
8
+ - trl
9
+ - unsloth
10
+ licence: license
11
+ ---
12
+
13
+ # Model Card for Qwen-3-0.6B-Reasoning-Vi-Medical-LoRA
14
+
15
+ This model is a fine-tuned version of [unsloth/qwen3-0.6b-unsloth-bnb-4bit](https://huggingface.co/unsloth/qwen3-0.6b-unsloth-bnb-4bit).
16
+ It has been trained using [TRL](https://github.com/huggingface/trl).
17
+
18
+ ## Quick start
19
+
20
+ ```python
21
+ from transformers import pipeline
22
+
23
+ question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
24
+ generator = pipeline("text-generation", model="danhtran2mind/Qwen-3-0.6B-Reasoning-Vi-Medical-LoRA", device="cuda")
25
+ output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
26
+ print(output["generated_text"])
27
+ ```
28
+
29
+ ## Training procedure
30
+
31
+
32
+
33
+
34
+ This model was trained with SFT.
35
+
36
+ ### Framework versions
37
+
38
+ - PEFT: 0.14.0
39
+ - TRL: 0.19.1
40
+ - Transformers: 4.51.3
41
+ - Pytorch: 2.7.0
42
+ - Datasets: 3.6.0
43
+ - Tokenizers: 0.21.1
44
+
45
+ ## Citations
46
+
47
+
48
+
49
+ Cite TRL as:
50
+
51
+ ```bibtex
52
+ @misc{vonwerra2022trl,
53
+ title = {{TRL: Transformer Reinforcement Learning}},
54
+ author = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallou{\'e}dec},
55
+ year = 2020,
56
+ journal = {GitHub repository},
57
+ publisher = {GitHub},
58
+ howpublished = {\url{https://github.com/huggingface/trl}}
59
+ }
60
+ ```
adapter_config.json ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "unsloth/qwen3-0.6b-unsloth-bnb-4bit",
5
+ "bias": "none",
6
+ "eva_config": null,
7
+ "exclude_modules": null,
8
+ "fan_in_fan_out": false,
9
+ "inference_mode": true,
10
+ "init_lora_weights": true,
11
+ "layer_replication": null,
12
+ "layers_pattern": null,
13
+ "layers_to_transform": null,
14
+ "loftq_config": {},
15
+ "lora_alpha": 16,
16
+ "lora_bias": false,
17
+ "lora_dropout": 0,
18
+ "megatron_config": null,
19
+ "megatron_core": "megatron.core",
20
+ "modules_to_save": null,
21
+ "peft_type": "LORA",
22
+ "r": 16,
23
+ "rank_pattern": {},
24
+ "revision": null,
25
+ "target_modules": [
26
+ "up_proj",
27
+ "q_proj",
28
+ "down_proj",
29
+ "k_proj",
30
+ "gate_proj",
31
+ "v_proj",
32
+ "o_proj"
33
+ ],
34
+ "task_type": "CAUSAL_LM",
35
+ "use_dora": false,
36
+ "use_rslora": false
37
+ }
adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ff894fe2534f1ab5109d59bb2f7fb64770e0a5fdd6c145d04ef5ddb79169a22a
3
+ size 40422168
added_tokens.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</think>": 151668,
3
+ "</tool_call>": 151658,
4
+ "</tool_response>": 151666,
5
+ "<think>": 151667,
6
+ "<tool_call>": 151657,
7
+ "<tool_response>": 151665,
8
+ "<|box_end|>": 151649,
9
+ "<|box_start|>": 151648,
10
+ "<|endoftext|>": 151643,
11
+ "<|file_sep|>": 151664,
12
+ "<|fim_middle|>": 151660,
13
+ "<|fim_pad|>": 151662,
14
+ "<|fim_prefix|>": 151659,
15
+ "<|fim_suffix|>": 151661,
16
+ "<|im_end|>": 151645,
17
+ "<|im_start|>": 151644,
18
+ "<|image_pad|>": 151655,
19
+ "<|object_ref_end|>": 151647,
20
+ "<|object_ref_start|>": 151646,
21
+ "<|quad_end|>": 151651,
22
+ "<|quad_start|>": 151650,
23
+ "<|repo_name|>": 151663,
24
+ "<|video_pad|>": 151656,
25
+ "<|vision_end|>": 151653,
26
+ "<|vision_pad|>": 151654,
27
+ "<|vision_start|>": 151652
28
+ }
checkpoint-13200/README.md ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: unsloth/qwen3-0.6b-unsloth-bnb-4bit
3
+ library_name: peft
4
+ ---
5
+
6
+ # Model Card for Model ID
7
+
8
+ <!-- Provide a quick summary of what the model is/does. -->
9
+
10
+
11
+
12
+ ## Model Details
13
+
14
+ ### Model Description
15
+
16
+ <!-- Provide a longer summary of what this model is. -->
17
+
18
+
19
+
20
+ - **Developed by:** [More Information Needed]
21
+ - **Funded by [optional]:** [More Information Needed]
22
+ - **Shared by [optional]:** [More Information Needed]
23
+ - **Model type:** [More Information Needed]
24
+ - **Language(s) (NLP):** [More Information Needed]
25
+ - **License:** [More Information Needed]
26
+ - **Finetuned from model [optional]:** [More Information Needed]
27
+
28
+ ### Model Sources [optional]
29
+
30
+ <!-- Provide the basic links for the model. -->
31
+
32
+ - **Repository:** [More Information Needed]
33
+ - **Paper [optional]:** [More Information Needed]
34
+ - **Demo [optional]:** [More Information Needed]
35
+
36
+ ## Uses
37
+
38
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
39
+
40
+ ### Direct Use
41
+
42
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
43
+
44
+ [More Information Needed]
45
+
46
+ ### Downstream Use [optional]
47
+
48
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
49
+
50
+ [More Information Needed]
51
+
52
+ ### Out-of-Scope Use
53
+
54
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
55
+
56
+ [More Information Needed]
57
+
58
+ ## Bias, Risks, and Limitations
59
+
60
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
61
+
62
+ [More Information Needed]
63
+
64
+ ### Recommendations
65
+
66
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
67
+
68
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
69
+
70
+ ## How to Get Started with the Model
71
+
72
+ Use the code below to get started with the model.
73
+
74
+ [More Information Needed]
75
+
76
+ ## Training Details
77
+
78
+ ### Training Data
79
+
80
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
+
82
+ [More Information Needed]
83
+
84
+ ### Training Procedure
85
+
86
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
87
+
88
+ #### Preprocessing [optional]
89
+
90
+ [More Information Needed]
91
+
92
+
93
+ #### Training Hyperparameters
94
+
95
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
96
+
97
+ #### Speeds, Sizes, Times [optional]
98
+
99
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
100
+
101
+ [More Information Needed]
102
+
103
+ ## Evaluation
104
+
105
+ <!-- This section describes the evaluation protocols and provides the results. -->
106
+
107
+ ### Testing Data, Factors & Metrics
108
+
109
+ #### Testing Data
110
+
111
+ <!-- This should link to a Dataset Card if possible. -->
112
+
113
+ [More Information Needed]
114
+
115
+ #### Factors
116
+
117
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
118
+
119
+ [More Information Needed]
120
+
121
+ #### Metrics
122
+
123
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
124
+
125
+ [More Information Needed]
126
+
127
+ ### Results
128
+
129
+ [More Information Needed]
130
+
131
+ #### Summary
132
+
133
+
134
+
135
+ ## Model Examination [optional]
136
+
137
+ <!-- Relevant interpretability work for the model goes here -->
138
+
139
+ [More Information Needed]
140
+
141
+ ## Environmental Impact
142
+
143
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
144
+
145
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
146
+
147
+ - **Hardware Type:** [More Information Needed]
148
+ - **Hours used:** [More Information Needed]
149
+ - **Cloud Provider:** [More Information Needed]
150
+ - **Compute Region:** [More Information Needed]
151
+ - **Carbon Emitted:** [More Information Needed]
152
+
153
+ ## Technical Specifications [optional]
154
+
155
+ ### Model Architecture and Objective
156
+
157
+ [More Information Needed]
158
+
159
+ ### Compute Infrastructure
160
+
161
+ [More Information Needed]
162
+
163
+ #### Hardware
164
+
165
+ [More Information Needed]
166
+
167
+ #### Software
168
+
169
+ [More Information Needed]
170
+
171
+ ## Citation [optional]
172
+
173
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
174
+
175
+ **BibTeX:**
176
+
177
+ [More Information Needed]
178
+
179
+ **APA:**
180
+
181
+ [More Information Needed]
182
+
183
+ ## Glossary [optional]
184
+
185
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
186
+
187
+ [More Information Needed]
188
+
189
+ ## More Information [optional]
190
+
191
+ [More Information Needed]
192
+
193
+ ## Model Card Authors [optional]
194
+
195
+ [More Information Needed]
196
+
197
+ ## Model Card Contact
198
+
199
+ [More Information Needed]
200
+ ### Framework versions
201
+
202
+ - PEFT 0.14.0
checkpoint-13200/adapter_config.json ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "unsloth/qwen3-0.6b-unsloth-bnb-4bit",
5
+ "bias": "none",
6
+ "eva_config": null,
7
+ "exclude_modules": null,
8
+ "fan_in_fan_out": false,
9
+ "inference_mode": true,
10
+ "init_lora_weights": true,
11
+ "layer_replication": null,
12
+ "layers_pattern": null,
13
+ "layers_to_transform": null,
14
+ "loftq_config": {},
15
+ "lora_alpha": 16,
16
+ "lora_bias": false,
17
+ "lora_dropout": 0,
18
+ "megatron_config": null,
19
+ "megatron_core": "megatron.core",
20
+ "modules_to_save": null,
21
+ "peft_type": "LORA",
22
+ "r": 16,
23
+ "rank_pattern": {},
24
+ "revision": null,
25
+ "target_modules": [
26
+ "up_proj",
27
+ "q_proj",
28
+ "down_proj",
29
+ "k_proj",
30
+ "gate_proj",
31
+ "v_proj",
32
+ "o_proj"
33
+ ],
34
+ "task_type": "CAUSAL_LM",
35
+ "use_dora": false,
36
+ "use_rslora": false
37
+ }
checkpoint-13200/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:84502ed2f392d38f57c06b7fd9304bed0e967e0395b17a497ebe7b1b2b506c7b
3
+ size 40422168
checkpoint-13200/added_tokens.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</think>": 151668,
3
+ "</tool_call>": 151658,
4
+ "</tool_response>": 151666,
5
+ "<think>": 151667,
6
+ "<tool_call>": 151657,
7
+ "<tool_response>": 151665,
8
+ "<|box_end|>": 151649,
9
+ "<|box_start|>": 151648,
10
+ "<|endoftext|>": 151643,
11
+ "<|file_sep|>": 151664,
12
+ "<|fim_middle|>": 151660,
13
+ "<|fim_pad|>": 151662,
14
+ "<|fim_prefix|>": 151659,
15
+ "<|fim_suffix|>": 151661,
16
+ "<|im_end|>": 151645,
17
+ "<|im_start|>": 151644,
18
+ "<|image_pad|>": 151655,
19
+ "<|object_ref_end|>": 151647,
20
+ "<|object_ref_start|>": 151646,
21
+ "<|quad_end|>": 151651,
22
+ "<|quad_start|>": 151650,
23
+ "<|repo_name|>": 151663,
24
+ "<|video_pad|>": 151656,
25
+ "<|vision_end|>": 151653,
26
+ "<|vision_pad|>": 151654,
27
+ "<|vision_start|>": 151652
28
+ }
checkpoint-13200/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-13200/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:249a5babf0dd03cda652085d01b57d4fc950ea453e3055ff5c846ea64cf7be41
3
+ size 21979091
checkpoint-13200/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:970f20ed294a0d8701bfcccff463f693cc5687d59a27abebbf761ee3255ea47d
3
+ size 14645
checkpoint-13200/scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:81a899ec02bceca3d64b55f4fcc622ff203093ebd5adf78500204bfe722e04cc
3
+ size 1383
checkpoint-13200/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1b14be9cd05f4cd81d2d7a8c2206eda5a3e1010b885a3eccd6877bb7670b99e0
3
+ size 1465
checkpoint-13200/special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
16
+ ],
17
+ "eos_token": {
18
+ "content": "<|im_end|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "pad_token": {
25
+ "content": "<|vision_pad|>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ }
31
+ }
checkpoint-13200/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ae1a036a9837df9caeebb840d09d80e8feef0f6d2bae982970d1ad34f5946aff
3
+ size 11422753
checkpoint-13200/tokenizer_config.json ADDED
@@ -0,0 +1,241 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ },
181
+ "151665": {
182
+ "content": "<tool_response>",
183
+ "lstrip": false,
184
+ "normalized": false,
185
+ "rstrip": false,
186
+ "single_word": false,
187
+ "special": false
188
+ },
189
+ "151666": {
190
+ "content": "</tool_response>",
191
+ "lstrip": false,
192
+ "normalized": false,
193
+ "rstrip": false,
194
+ "single_word": false,
195
+ "special": false
196
+ },
197
+ "151667": {
198
+ "content": "<think>",
199
+ "lstrip": false,
200
+ "normalized": false,
201
+ "rstrip": false,
202
+ "single_word": false,
203
+ "special": false
204
+ },
205
+ "151668": {
206
+ "content": "</think>",
207
+ "lstrip": false,
208
+ "normalized": false,
209
+ "rstrip": false,
210
+ "single_word": false,
211
+ "special": false
212
+ }
213
+ },
214
+ "additional_special_tokens": [
215
+ "<|im_start|>",
216
+ "<|im_end|>",
217
+ "<|object_ref_start|>",
218
+ "<|object_ref_end|>",
219
+ "<|box_start|>",
220
+ "<|box_end|>",
221
+ "<|quad_start|>",
222
+ "<|quad_end|>",
223
+ "<|vision_start|>",
224
+ "<|vision_end|>",
225
+ "<|vision_pad|>",
226
+ "<|image_pad|>",
227
+ "<|video_pad|>"
228
+ ],
229
+ "bos_token": null,
230
+ "chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0].role == 'system' %}\n {{- messages[0].content + '\\n\\n' }}\n {%- endif %}\n {{- \"# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0].role == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0].content + '<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}\n{%- for forward_message in messages %}\n {%- set index = (messages|length - 1) - loop.index0 %}\n {%- set message = messages[index] %}\n {%- set current_content = message.content if message.content is defined and message.content is not none else '' %}\n {%- set tool_start = '<tool_response>' %}\n {%- set tool_start_length = tool_start|length %}\n {%- set start_of_message = current_content[:tool_start_length] %}\n {%- set tool_end = '</tool_response>' %}\n {%- set tool_end_length = tool_end|length %}\n {%- set start_pos = (current_content|length) - tool_end_length %}\n {%- if start_pos < 0 %}\n {%- set start_pos = 0 %}\n {%- endif %}\n {%- set end_of_message = current_content[start_pos:] %}\n {%- if ns.multi_step_tool and message.role == \"user\" and not(start_of_message == tool_start and end_of_message == tool_end) %}\n {%- set ns.multi_step_tool = false %}\n {%- set ns.last_query_index = index %}\n {%- endif %}\n{%- endfor %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) 
%}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {%- set m_content = message.content if message.content is defined and message.content is not none else '' %}\n {%- set content = m_content %}\n {%- set reasoning_content = '' %}\n {%- if message.reasoning_content is defined and message.reasoning_content is not none %}\n {%- set reasoning_content = message.reasoning_content %}\n {%- else %}\n {%- if '</think>' in m_content %}\n {%- set content = (m_content.split('</think>')|last).lstrip('\\n') %}\n {%- set reasoning_content = (m_content.split('</think>')|first).rstrip('\\n') %}\n {%- set reasoning_content = (reasoning_content.split('<think>')|last).lstrip('\\n') %}\n {%- endif %}\n {%- endif %}\n {%- if loop.index0 > ns.last_query_index %}\n {%- if loop.last or (not loop.last and (not reasoning_content.strip() == '')) %}\n {{- '<|im_start|>' + message.role + '\\n<think>\\n' + reasoning_content.strip('\\n') + '\\n</think>\\n\\n' + content.lstrip('\\n') }}\n {%- else %}\n {{- '<|im_start|>' + message.role + '\\n' + content }}\n {%- endif %}\n {%- else %}\n {{- '<|im_start|>' + message.role + '\\n' + content }}\n {%- endif %}\n {%- if message.tool_calls %}\n {%- for tool_call in message.tool_calls %}\n {%- if (loop.first and content) or (not loop.first) %}\n {{- '\\n' }}\n {%- endif %}\n {%- if tool_call.function %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {%- if tool_call.arguments is string %}\n {{- tool_call.arguments }}\n {%- else %}\n {{- tool_call.arguments | tojson }}\n {%- endif %}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {%- endif %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if loop.first or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {{- 
message.content }}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n {%- if enable_thinking is defined and enable_thinking is false %}\n {{- '<think>\\n\\n</think>\\n\\n' }}\n {%- endif %}\n{%- endif %}",
231
+ "clean_up_tokenization_spaces": false,
232
+ "eos_token": "<|im_end|>",
233
+ "errors": "replace",
234
+ "extra_special_tokens": {},
235
+ "model_max_length": 40960,
236
+ "pad_token": "<|vision_pad|>",
237
+ "padding_side": "right",
238
+ "split_special_tokens": false,
239
+ "tokenizer_class": "Qwen2Tokenizer",
240
+ "unk_token": null
241
+ }
checkpoint-13200/trainer_state.json ADDED
@@ -0,0 +1,2014 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": 12700,
3
+ "best_metric": 0.6587069034576416,
4
+ "best_model_checkpoint": "./Qwen-3-0.6B-Reasoning-Vi-Medical-LoRA/checkpoint-12700",
5
+ "epoch": 29.93091732729332,
6
+ "eval_steps": 100,
7
+ "global_step": 13200,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.22650056625141562,
14
+ "grad_norm": 0.32067689299583435,
15
+ "learning_rate": 0.0001936551724137931,
16
+ "loss": 1.3117,
17
+ "step": 100
18
+ },
19
+ {
20
+ "epoch": 0.22650056625141562,
21
+ "eval_loss": 1.2771011590957642,
22
+ "eval_runtime": 215.7306,
23
+ "eval_samples_per_second": 7.273,
24
+ "eval_steps_per_second": 0.913,
25
+ "step": 100
26
+ },
27
+ {
28
+ "epoch": 0.45300113250283125,
29
+ "grad_norm": 0.368208110332489,
30
+ "learning_rate": 0.00018445977011494254,
31
+ "loss": 1.2627,
32
+ "step": 200
33
+ },
34
+ {
35
+ "epoch": 0.45300113250283125,
36
+ "eval_loss": 1.234209418296814,
37
+ "eval_runtime": 215.6413,
38
+ "eval_samples_per_second": 7.276,
39
+ "eval_steps_per_second": 0.914,
40
+ "step": 200
41
+ },
42
+ {
43
+ "epoch": 0.6795016987542469,
44
+ "grad_norm": 0.34856918454170227,
45
+ "learning_rate": 0.00017526436781609196,
46
+ "loss": 1.2199,
47
+ "step": 300
48
+ },
49
+ {
50
+ "epoch": 0.6795016987542469,
51
+ "eval_loss": 1.2032462358474731,
52
+ "eval_runtime": 215.6165,
53
+ "eval_samples_per_second": 7.277,
54
+ "eval_steps_per_second": 0.914,
55
+ "step": 300
56
+ },
57
+ {
58
+ "epoch": 0.9060022650056625,
59
+ "grad_norm": 0.33870309591293335,
60
+ "learning_rate": 0.0001660689655172414,
61
+ "loss": 1.1941,
62
+ "step": 400
63
+ },
64
+ {
65
+ "epoch": 0.9060022650056625,
66
+ "eval_loss": 1.1780856847763062,
67
+ "eval_runtime": 215.6336,
68
+ "eval_samples_per_second": 7.276,
69
+ "eval_steps_per_second": 0.914,
70
+ "step": 400
71
+ },
72
+ {
73
+ "epoch": 1.1336353340883352,
74
+ "grad_norm": 0.38066866993904114,
75
+ "learning_rate": 0.00015687356321839082,
76
+ "loss": 1.1664,
77
+ "step": 500
78
+ },
79
+ {
80
+ "epoch": 1.1336353340883352,
81
+ "eval_loss": 1.1569600105285645,
82
+ "eval_runtime": 215.6796,
83
+ "eval_samples_per_second": 7.275,
84
+ "eval_steps_per_second": 0.913,
85
+ "step": 500
86
+ },
87
+ {
88
+ "epoch": 1.3601359003397508,
89
+ "grad_norm": 0.3768746852874756,
90
+ "learning_rate": 0.00014767816091954024,
91
+ "loss": 1.1276,
92
+ "step": 600
93
+ },
94
+ {
95
+ "epoch": 1.3601359003397508,
96
+ "eval_loss": 1.1395982503890991,
97
+ "eval_runtime": 215.7974,
98
+ "eval_samples_per_second": 7.271,
99
+ "eval_steps_per_second": 0.913,
100
+ "step": 600
101
+ },
102
+ {
103
+ "epoch": 1.5866364665911665,
104
+ "grad_norm": 0.3873467743396759,
105
+ "learning_rate": 0.00013848275862068967,
106
+ "loss": 1.1059,
107
+ "step": 700
108
+ },
109
+ {
110
+ "epoch": 1.5866364665911665,
111
+ "eval_loss": 1.1225543022155762,
112
+ "eval_runtime": 215.8352,
113
+ "eval_samples_per_second": 7.269,
114
+ "eval_steps_per_second": 0.913,
115
+ "step": 700
116
+ },
117
+ {
118
+ "epoch": 1.8131370328425822,
119
+ "grad_norm": 0.399214506149292,
120
+ "learning_rate": 0.00012928735632183907,
121
+ "loss": 1.087,
122
+ "step": 800
123
+ },
124
+ {
125
+ "epoch": 1.8131370328425822,
126
+ "eval_loss": 1.1066261529922485,
127
+ "eval_runtime": 215.8039,
128
+ "eval_samples_per_second": 7.27,
129
+ "eval_steps_per_second": 0.913,
130
+ "step": 800
131
+ },
132
+ {
133
+ "epoch": 2.0407701019252547,
134
+ "grad_norm": 0.3834548890590668,
135
+ "learning_rate": 0.00012009195402298852,
136
+ "loss": 1.0775,
137
+ "step": 900
138
+ },
139
+ {
140
+ "epoch": 2.0407701019252547,
141
+ "eval_loss": 1.0939360857009888,
142
+ "eval_runtime": 215.8709,
143
+ "eval_samples_per_second": 7.268,
144
+ "eval_steps_per_second": 0.913,
145
+ "step": 900
146
+ },
147
+ {
148
+ "epoch": 2.2672706681766703,
149
+ "grad_norm": 0.41335031390190125,
150
+ "learning_rate": 0.00011089655172413794,
151
+ "loss": 1.0294,
152
+ "step": 1000
153
+ },
154
+ {
155
+ "epoch": 2.2672706681766703,
156
+ "eval_loss": 1.081494688987732,
157
+ "eval_runtime": 215.8265,
158
+ "eval_samples_per_second": 7.27,
159
+ "eval_steps_per_second": 0.913,
160
+ "step": 1000
161
+ },
162
+ {
163
+ "epoch": 2.493771234428086,
164
+ "grad_norm": 0.43652597069740295,
165
+ "learning_rate": 0.00010170114942528736,
166
+ "loss": 1.0077,
167
+ "step": 1100
168
+ },
169
+ {
170
+ "epoch": 2.493771234428086,
171
+ "eval_loss": 1.066120982170105,
172
+ "eval_runtime": 215.7444,
173
+ "eval_samples_per_second": 7.272,
174
+ "eval_steps_per_second": 0.913,
175
+ "step": 1100
176
+ },
177
+ {
178
+ "epoch": 2.7202718006795017,
179
+ "grad_norm": 0.4415673613548279,
180
+ "learning_rate": 9.250574712643678e-05,
181
+ "loss": 0.9984,
182
+ "step": 1200
183
+ },
184
+ {
185
+ "epoch": 2.7202718006795017,
186
+ "eval_loss": 1.0539467334747314,
187
+ "eval_runtime": 215.873,
188
+ "eval_samples_per_second": 7.268,
189
+ "eval_steps_per_second": 0.913,
190
+ "step": 1200
191
+ },
192
+ {
193
+ "epoch": 2.9467723669309174,
194
+ "grad_norm": 0.44037237763404846,
195
+ "learning_rate": 8.33103448275862e-05,
196
+ "loss": 0.9837,
197
+ "step": 1300
198
+ },
199
+ {
200
+ "epoch": 2.9467723669309174,
201
+ "eval_loss": 1.044096827507019,
202
+ "eval_runtime": 215.7373,
203
+ "eval_samples_per_second": 7.273,
204
+ "eval_steps_per_second": 0.913,
205
+ "step": 1300
206
+ },
207
+ {
208
+ "epoch": 3.17440543601359,
209
+ "grad_norm": 0.5079408288002014,
210
+ "learning_rate": 7.411494252873563e-05,
211
+ "loss": 0.9642,
212
+ "step": 1400
213
+ },
214
+ {
215
+ "epoch": 3.17440543601359,
216
+ "eval_loss": 1.0374841690063477,
217
+ "eval_runtime": 215.6881,
218
+ "eval_samples_per_second": 7.274,
219
+ "eval_steps_per_second": 0.913,
220
+ "step": 1400
221
+ },
222
+ {
223
+ "epoch": 3.4009060022650055,
224
+ "grad_norm": 0.4961177408695221,
225
+ "learning_rate": 6.491954022988506e-05,
226
+ "loss": 0.9381,
227
+ "step": 1500
228
+ },
229
+ {
230
+ "epoch": 3.4009060022650055,
231
+ "eval_loss": 1.0282294750213623,
232
+ "eval_runtime": 215.6305,
233
+ "eval_samples_per_second": 7.276,
234
+ "eval_steps_per_second": 0.914,
235
+ "step": 1500
236
+ },
237
+ {
238
+ "epoch": 3.627406568516421,
239
+ "grad_norm": 0.5088583827018738,
240
+ "learning_rate": 5.5724137931034484e-05,
241
+ "loss": 0.9261,
242
+ "step": 1600
243
+ },
244
+ {
245
+ "epoch": 3.627406568516421,
246
+ "eval_loss": 1.0186975002288818,
247
+ "eval_runtime": 215.8306,
248
+ "eval_samples_per_second": 7.27,
249
+ "eval_steps_per_second": 0.913,
250
+ "step": 1600
251
+ },
252
+ {
253
+ "epoch": 3.853907134767837,
254
+ "grad_norm": 0.5300412178039551,
255
+ "learning_rate": 4.652873563218391e-05,
256
+ "loss": 0.9115,
257
+ "step": 1700
258
+ },
259
+ {
260
+ "epoch": 3.853907134767837,
261
+ "eval_loss": 1.011821985244751,
262
+ "eval_runtime": 215.7992,
263
+ "eval_samples_per_second": 7.271,
264
+ "eval_steps_per_second": 0.913,
265
+ "step": 1700
266
+ },
267
+ {
268
+ "epoch": 4.081540203850509,
269
+ "grad_norm": 0.5446588397026062,
270
+ "learning_rate": 3.733333333333334e-05,
271
+ "loss": 0.9033,
272
+ "step": 1800
273
+ },
274
+ {
275
+ "epoch": 4.081540203850509,
276
+ "eval_loss": 1.0095539093017578,
277
+ "eval_runtime": 215.7598,
278
+ "eval_samples_per_second": 7.272,
279
+ "eval_steps_per_second": 0.913,
280
+ "step": 1800
281
+ },
282
+ {
283
+ "epoch": 4.308040770101925,
284
+ "grad_norm": 0.5339412093162537,
285
+ "learning_rate": 2.813793103448276e-05,
286
+ "loss": 0.8647,
287
+ "step": 1900
288
+ },
289
+ {
290
+ "epoch": 4.308040770101925,
291
+ "eval_loss": 1.002418875694275,
292
+ "eval_runtime": 215.8479,
293
+ "eval_samples_per_second": 7.269,
294
+ "eval_steps_per_second": 0.913,
295
+ "step": 1900
296
+ },
297
+ {
298
+ "epoch": 4.534541336353341,
299
+ "grad_norm": 0.5594246983528137,
300
+ "learning_rate": 1.8942528735632184e-05,
301
+ "loss": 0.8718,
302
+ "step": 2000
303
+ },
304
+ {
305
+ "epoch": 4.534541336353341,
306
+ "eval_loss": 0.9978017807006836,
307
+ "eval_runtime": 215.8323,
308
+ "eval_samples_per_second": 7.27,
309
+ "eval_steps_per_second": 0.913,
310
+ "step": 2000
311
+ },
312
+ {
313
+ "epoch": 4.761041902604757,
314
+ "grad_norm": 0.5511975288391113,
315
+ "learning_rate": 9.747126436781609e-06,
316
+ "loss": 0.8611,
317
+ "step": 2100
318
+ },
319
+ {
320
+ "epoch": 4.761041902604757,
321
+ "eval_loss": 0.9954851269721985,
322
+ "eval_runtime": 215.843,
323
+ "eval_samples_per_second": 7.269,
324
+ "eval_steps_per_second": 0.913,
325
+ "step": 2100
326
+ },
327
+ {
328
+ "epoch": 4.987542468856172,
329
+ "grad_norm": 0.5623785853385925,
330
+ "learning_rate": 5.517241379310344e-07,
331
+ "loss": 0.8602,
332
+ "step": 2200
333
+ },
334
+ {
335
+ "epoch": 4.987542468856172,
336
+ "eval_loss": 0.9939414262771606,
337
+ "eval_runtime": 215.789,
338
+ "eval_samples_per_second": 7.271,
339
+ "eval_steps_per_second": 0.913,
340
+ "step": 2200
341
+ },
342
+ {
343
+ "epoch": 5.215175537938845,
344
+ "grad_norm": 0.6162961721420288,
345
+ "learning_rate": 9.639269406392695e-05,
346
+ "loss": 0.8571,
347
+ "step": 2300
348
+ },
349
+ {
350
+ "epoch": 5.215175537938845,
351
+ "eval_loss": 1.001980185508728,
352
+ "eval_runtime": 215.4279,
353
+ "eval_samples_per_second": 7.283,
354
+ "eval_steps_per_second": 0.914,
355
+ "step": 2300
356
+ },
357
+ {
358
+ "epoch": 5.44167610419026,
359
+ "grad_norm": 0.6526823043823242,
360
+ "learning_rate": 9.182648401826485e-05,
361
+ "loss": 0.8618,
362
+ "step": 2400
363
+ },
364
+ {
365
+ "epoch": 5.44167610419026,
366
+ "eval_loss": 0.9889749884605408,
367
+ "eval_runtime": 215.4552,
368
+ "eval_samples_per_second": 7.282,
369
+ "eval_steps_per_second": 0.914,
370
+ "step": 2400
371
+ },
372
+ {
373
+ "epoch": 5.668176670441676,
374
+ "grad_norm": 0.6778553128242493,
375
+ "learning_rate": 8.726027397260274e-05,
376
+ "loss": 0.8467,
377
+ "step": 2500
378
+ },
379
+ {
380
+ "epoch": 5.668176670441676,
381
+ "eval_loss": 0.9737924337387085,
382
+ "eval_runtime": 215.4971,
383
+ "eval_samples_per_second": 7.281,
384
+ "eval_steps_per_second": 0.914,
385
+ "step": 2500
386
+ },
387
+ {
388
+ "epoch": 5.8946772366930915,
389
+ "grad_norm": 0.6477532386779785,
390
+ "learning_rate": 8.269406392694065e-05,
391
+ "loss": 0.8449,
392
+ "step": 2600
393
+ },
394
+ {
395
+ "epoch": 5.8946772366930915,
396
+ "eval_loss": 0.9627026319503784,
397
+ "eval_runtime": 215.5931,
398
+ "eval_samples_per_second": 7.278,
399
+ "eval_steps_per_second": 0.914,
400
+ "step": 2600
401
+ },
402
+ {
403
+ "epoch": 6.122310305775764,
404
+ "grad_norm": 0.7134155035018921,
405
+ "learning_rate": 7.812785388127854e-05,
406
+ "loss": 0.8181,
407
+ "step": 2700
408
+ },
409
+ {
410
+ "epoch": 6.122310305775764,
411
+ "eval_loss": 0.9615710973739624,
412
+ "eval_runtime": 215.6571,
413
+ "eval_samples_per_second": 7.275,
414
+ "eval_steps_per_second": 0.913,
415
+ "step": 2700
416
+ },
417
+ {
418
+ "epoch": 6.34881087202718,
419
+ "grad_norm": 0.7087521553039551,
420
+ "learning_rate": 7.356164383561645e-05,
421
+ "loss": 0.7748,
422
+ "step": 2800
423
+ },
424
+ {
425
+ "epoch": 6.34881087202718,
426
+ "eval_loss": 0.9508717656135559,
427
+ "eval_runtime": 215.7854,
428
+ "eval_samples_per_second": 7.271,
429
+ "eval_steps_per_second": 0.913,
430
+ "step": 2800
431
+ },
432
+ {
433
+ "epoch": 6.575311438278596,
434
+ "grad_norm": 0.718744158744812,
435
+ "learning_rate": 6.899543378995434e-05,
436
+ "loss": 0.7679,
437
+ "step": 2900
438
+ },
439
+ {
440
+ "epoch": 6.575311438278596,
441
+ "eval_loss": 0.9414442181587219,
442
+ "eval_runtime": 215.8161,
443
+ "eval_samples_per_second": 7.27,
444
+ "eval_steps_per_second": 0.913,
445
+ "step": 2900
446
+ },
447
+ {
448
+ "epoch": 6.801812004530011,
449
+ "grad_norm": 0.7241001725196838,
450
+ "learning_rate": 6.442922374429225e-05,
451
+ "loss": 0.7708,
452
+ "step": 3000
453
+ },
454
+ {
455
+ "epoch": 6.801812004530011,
456
+ "eval_loss": 0.9294878840446472,
457
+ "eval_runtime": 215.8924,
458
+ "eval_samples_per_second": 7.268,
459
+ "eval_steps_per_second": 0.912,
460
+ "step": 3000
461
+ },
462
+ {
463
+ "epoch": 7.029445073612684,
464
+ "grad_norm": 0.7078688740730286,
465
+ "learning_rate": 5.986301369863014e-05,
466
+ "loss": 0.7602,
467
+ "step": 3100
468
+ },
469
+ {
470
+ "epoch": 7.029445073612684,
471
+ "eval_loss": 0.9215793013572693,
472
+ "eval_runtime": 215.8631,
473
+ "eval_samples_per_second": 7.268,
474
+ "eval_steps_per_second": 0.913,
475
+ "step": 3100
476
+ },
477
+ {
478
+ "epoch": 7.2559456398641,
479
+ "grad_norm": 0.7763993740081787,
480
+ "learning_rate": 5.529680365296805e-05,
481
+ "loss": 0.7042,
482
+ "step": 3200
483
+ },
484
+ {
485
+ "epoch": 7.2559456398641,
486
+ "eval_loss": 0.9177075624465942,
487
+ "eval_runtime": 215.8296,
488
+ "eval_samples_per_second": 7.27,
489
+ "eval_steps_per_second": 0.913,
490
+ "step": 3200
491
+ },
492
+ {
493
+ "epoch": 7.482446206115515,
494
+ "grad_norm": 0.7907236814498901,
495
+ "learning_rate": 5.0730593607305946e-05,
496
+ "loss": 0.7041,
497
+ "step": 3300
498
+ },
499
+ {
500
+ "epoch": 7.482446206115515,
501
+ "eval_loss": 0.9082866907119751,
502
+ "eval_runtime": 215.8314,
503
+ "eval_samples_per_second": 7.27,
504
+ "eval_steps_per_second": 0.913,
505
+ "step": 3300
506
+ },
507
+ {
508
+ "epoch": 7.7089467723669305,
509
+ "grad_norm": 0.809502899646759,
510
+ "learning_rate": 4.616438356164384e-05,
511
+ "loss": 0.6968,
512
+ "step": 3400
513
+ },
514
+ {
515
+ "epoch": 7.7089467723669305,
516
+ "eval_loss": 0.9002099633216858,
517
+ "eval_runtime": 215.7011,
518
+ "eval_samples_per_second": 7.274,
519
+ "eval_steps_per_second": 0.913,
520
+ "step": 3400
521
+ },
522
+ {
523
+ "epoch": 7.935447338618347,
524
+ "grad_norm": 0.8147194981575012,
525
+ "learning_rate": 4.159817351598174e-05,
526
+ "loss": 0.689,
527
+ "step": 3500
528
+ },
529
+ {
530
+ "epoch": 7.935447338618347,
531
+ "eval_loss": 0.887977123260498,
532
+ "eval_runtime": 215.5757,
533
+ "eval_samples_per_second": 7.278,
534
+ "eval_steps_per_second": 0.914,
535
+ "step": 3500
536
+ },
537
+ {
538
+ "epoch": 8.163080407701019,
539
+ "grad_norm": 0.7601930499076843,
540
+ "learning_rate": 3.703196347031964e-05,
541
+ "loss": 0.6579,
542
+ "step": 3600
543
+ },
544
+ {
545
+ "epoch": 8.163080407701019,
546
+ "eval_loss": 0.8895569443702698,
547
+ "eval_runtime": 215.5399,
548
+ "eval_samples_per_second": 7.279,
549
+ "eval_steps_per_second": 0.914,
550
+ "step": 3600
551
+ },
552
+ {
553
+ "epoch": 8.389580973952436,
554
+ "grad_norm": 0.9018923044204712,
555
+ "learning_rate": 3.246575342465754e-05,
556
+ "loss": 0.6431,
557
+ "step": 3700
558
+ },
559
+ {
560
+ "epoch": 8.389580973952436,
561
+ "eval_loss": 0.886945366859436,
562
+ "eval_runtime": 215.5014,
563
+ "eval_samples_per_second": 7.281,
564
+ "eval_steps_per_second": 0.914,
565
+ "step": 3700
566
+ },
567
+ {
568
+ "epoch": 8.61608154020385,
569
+ "grad_norm": 0.8826325535774231,
570
+ "learning_rate": 2.7899543378995436e-05,
571
+ "loss": 0.6343,
572
+ "step": 3800
573
+ },
574
+ {
575
+ "epoch": 8.61608154020385,
576
+ "eval_loss": 0.879227340221405,
577
+ "eval_runtime": 215.5183,
578
+ "eval_samples_per_second": 7.28,
579
+ "eval_steps_per_second": 0.914,
580
+ "step": 3800
581
+ },
582
+ {
583
+ "epoch": 8.842582106455266,
584
+ "grad_norm": 0.8975309133529663,
585
+ "learning_rate": 2.3333333333333336e-05,
586
+ "loss": 0.638,
587
+ "step": 3900
588
+ },
589
+ {
590
+ "epoch": 8.842582106455266,
591
+ "eval_loss": 0.8713725805282593,
592
+ "eval_runtime": 215.5235,
593
+ "eval_samples_per_second": 7.28,
594
+ "eval_steps_per_second": 0.914,
595
+ "step": 3900
596
+ },
597
+ {
598
+ "epoch": 9.070215175537939,
599
+ "grad_norm": 0.8930607438087463,
600
+ "learning_rate": 1.8767123287671235e-05,
601
+ "loss": 0.6248,
602
+ "step": 4000
603
+ },
604
+ {
605
+ "epoch": 9.070215175537939,
606
+ "eval_loss": 0.8720493316650391,
607
+ "eval_runtime": 215.4472,
608
+ "eval_samples_per_second": 7.283,
609
+ "eval_steps_per_second": 0.914,
610
+ "step": 4000
611
+ },
612
+ {
613
+ "epoch": 9.296715741789354,
614
+ "grad_norm": 0.8194226622581482,
615
+ "learning_rate": 1.4200913242009135e-05,
616
+ "loss": 0.5963,
617
+ "step": 4100
618
+ },
619
+ {
620
+ "epoch": 9.296715741789354,
621
+ "eval_loss": 0.8683505654335022,
622
+ "eval_runtime": 215.4449,
623
+ "eval_samples_per_second": 7.283,
624
+ "eval_steps_per_second": 0.914,
625
+ "step": 4100
626
+ },
627
+ {
628
+ "epoch": 9.52321630804077,
629
+ "grad_norm": 0.9078500270843506,
630
+ "learning_rate": 9.634703196347032e-06,
631
+ "loss": 0.5933,
632
+ "step": 4200
633
+ },
634
+ {
635
+ "epoch": 9.52321630804077,
636
+ "eval_loss": 0.8663039803504944,
637
+ "eval_runtime": 215.4014,
638
+ "eval_samples_per_second": 7.284,
639
+ "eval_steps_per_second": 0.915,
640
+ "step": 4200
641
+ },
642
+ {
643
+ "epoch": 9.749716874292186,
644
+ "grad_norm": 0.8323531746864319,
645
+ "learning_rate": 5.068493150684932e-06,
646
+ "loss": 0.6018,
647
+ "step": 4300
648
+ },
649
+ {
650
+ "epoch": 9.749716874292186,
651
+ "eval_loss": 0.864680826663971,
652
+ "eval_runtime": 215.5321,
653
+ "eval_samples_per_second": 7.28,
654
+ "eval_steps_per_second": 0.914,
655
+ "step": 4300
656
+ },
657
+ {
658
+ "epoch": 9.976217440543602,
659
+ "grad_norm": 0.8527385592460632,
660
+ "learning_rate": 5.022831050228311e-07,
661
+ "loss": 0.5912,
662
+ "step": 4400
663
+ },
664
+ {
665
+ "epoch": 9.976217440543602,
666
+ "eval_loss": 0.8630263209342957,
667
+ "eval_runtime": 215.6024,
668
+ "eval_samples_per_second": 7.277,
669
+ "eval_steps_per_second": 0.914,
670
+ "step": 4400
671
+ },
672
+ {
673
+ "epoch": 10.203850509626275,
674
+ "grad_norm": 1.06992506980896,
675
+ "learning_rate": 9.831626848691696e-05,
676
+ "loss": 0.6137,
677
+ "step": 4500
678
+ },
679
+ {
680
+ "epoch": 10.203850509626275,
681
+ "eval_loss": 0.8823357820510864,
682
+ "eval_runtime": 215.9148,
683
+ "eval_samples_per_second": 7.267,
684
+ "eval_steps_per_second": 0.912,
685
+ "step": 4500
686
+ },
687
+ {
688
+ "epoch": 10.43035107587769,
689
+ "grad_norm": 1.1724179983139038,
690
+ "learning_rate": 9.604095563139933e-05,
691
+ "loss": 0.6269,
692
+ "step": 4600
693
+ },
694
+ {
695
+ "epoch": 10.43035107587769,
696
+ "eval_loss": 0.8721606135368347,
697
+ "eval_runtime": 215.8627,
698
+ "eval_samples_per_second": 7.269,
699
+ "eval_steps_per_second": 0.913,
700
+ "step": 4600
701
+ },
702
+ {
703
+ "epoch": 10.656851642129105,
704
+ "grad_norm": 0.9768912196159363,
705
+ "learning_rate": 9.37656427758817e-05,
706
+ "loss": 0.6269,
707
+ "step": 4700
708
+ },
709
+ {
710
+ "epoch": 10.656851642129105,
711
+ "eval_loss": 0.857449471950531,
712
+ "eval_runtime": 215.9592,
713
+ "eval_samples_per_second": 7.265,
714
+ "eval_steps_per_second": 0.912,
715
+ "step": 4700
716
+ },
717
+ {
718
+ "epoch": 10.88335220838052,
719
+ "grad_norm": 0.9589354991912842,
720
+ "learning_rate": 9.149032992036407e-05,
721
+ "loss": 0.6214,
722
+ "step": 4800
723
+ },
724
+ {
725
+ "epoch": 10.88335220838052,
726
+ "eval_loss": 0.8469829559326172,
727
+ "eval_runtime": 216.0164,
728
+ "eval_samples_per_second": 7.263,
729
+ "eval_steps_per_second": 0.912,
730
+ "step": 4800
731
+ },
732
+ {
733
+ "epoch": 11.110985277463193,
734
+ "grad_norm": 1.1246964931488037,
735
+ "learning_rate": 8.921501706484642e-05,
736
+ "loss": 0.6,
737
+ "step": 4900
738
+ },
739
+ {
740
+ "epoch": 11.110985277463193,
741
+ "eval_loss": 0.83788001537323,
742
+ "eval_runtime": 215.9754,
743
+ "eval_samples_per_second": 7.265,
744
+ "eval_steps_per_second": 0.912,
745
+ "step": 4900
746
+ },
747
+ {
748
+ "epoch": 11.337485843714608,
749
+ "grad_norm": 1.138035535812378,
750
+ "learning_rate": 8.693970420932879e-05,
751
+ "loss": 0.5559,
752
+ "step": 5000
753
+ },
754
+ {
755
+ "epoch": 11.337485843714608,
756
+ "eval_loss": 0.8344744443893433,
757
+ "eval_runtime": 216.059,
758
+ "eval_samples_per_second": 7.262,
759
+ "eval_steps_per_second": 0.912,
760
+ "step": 5000
761
+ },
762
+ {
763
+ "epoch": 11.563986409966025,
764
+ "grad_norm": 1.1976112127304077,
765
+ "learning_rate": 4.6044039483675025e-05,
766
+ "loss": 0.5355,
767
+ "step": 5100
768
+ },
769
+ {
770
+ "epoch": 11.563986409966025,
771
+ "eval_loss": 0.8142299652099609,
772
+ "eval_runtime": 216.0109,
773
+ "eval_samples_per_second": 7.264,
774
+ "eval_steps_per_second": 0.912,
775
+ "step": 5100
776
+ },
777
+ {
778
+ "epoch": 11.79048697621744,
779
+ "grad_norm": 1.095481514930725,
780
+ "learning_rate": 4.3006833712984054e-05,
781
+ "loss": 0.5353,
782
+ "step": 5200
783
+ },
784
+ {
785
+ "epoch": 11.79048697621744,
786
+ "eval_loss": 0.8013305068016052,
787
+ "eval_runtime": 215.9277,
788
+ "eval_samples_per_second": 7.266,
789
+ "eval_steps_per_second": 0.912,
790
+ "step": 5200
791
+ },
792
+ {
793
+ "epoch": 12.018120045300114,
794
+ "grad_norm": 1.0251810550689697,
795
+ "learning_rate": 3.996962794229309e-05,
796
+ "loss": 0.5364,
797
+ "step": 5300
798
+ },
799
+ {
800
+ "epoch": 12.018120045300114,
801
+ "eval_loss": 0.7993968725204468,
802
+ "eval_runtime": 216.1353,
803
+ "eval_samples_per_second": 7.259,
804
+ "eval_steps_per_second": 0.911,
805
+ "step": 5300
806
+ },
807
+ {
808
+ "epoch": 12.244620611551529,
809
+ "grad_norm": 0.9645494222640991,
810
+ "learning_rate": 3.6932422171602125e-05,
811
+ "loss": 0.474,
812
+ "step": 5400
813
+ },
814
+ {
815
+ "epoch": 12.244620611551529,
816
+ "eval_loss": 0.7945672273635864,
817
+ "eval_runtime": 216.0266,
818
+ "eval_samples_per_second": 7.263,
819
+ "eval_steps_per_second": 0.912,
820
+ "step": 5400
821
+ },
822
+ {
823
+ "epoch": 12.471121177802944,
824
+ "grad_norm": 0.9431504011154175,
825
+ "learning_rate": 3.389521640091116e-05,
826
+ "loss": 0.48,
827
+ "step": 5500
828
+ },
829
+ {
830
+ "epoch": 12.471121177802944,
831
+ "eval_loss": 0.7862181067466736,
832
+ "eval_runtime": 216.036,
833
+ "eval_samples_per_second": 7.263,
834
+ "eval_steps_per_second": 0.912,
835
+ "step": 5500
836
+ },
837
+ {
838
+ "epoch": 12.69762174405436,
839
+ "grad_norm": 0.9955912232398987,
840
+ "learning_rate": 3.0858010630220196e-05,
841
+ "loss": 0.4733,
842
+ "step": 5600
843
+ },
844
+ {
845
+ "epoch": 12.69762174405436,
846
+ "eval_loss": 0.77918940782547,
847
+ "eval_runtime": 215.9528,
848
+ "eval_samples_per_second": 7.265,
849
+ "eval_steps_per_second": 0.912,
850
+ "step": 5600
851
+ },
852
+ {
853
+ "epoch": 12.924122310305776,
854
+ "grad_norm": 1.0505925416946411,
855
+ "learning_rate": 2.782080485952924e-05,
856
+ "loss": 0.4772,
857
+ "step": 5700
858
+ },
859
+ {
860
+ "epoch": 12.924122310305776,
861
+ "eval_loss": 0.7714400291442871,
862
+ "eval_runtime": 215.9696,
863
+ "eval_samples_per_second": 7.265,
864
+ "eval_steps_per_second": 0.912,
865
+ "step": 5700
866
+ },
867
+ {
868
+ "epoch": 13.15175537938845,
869
+ "grad_norm": 0.9624122381210327,
870
+ "learning_rate": 2.478359908883827e-05,
871
+ "loss": 0.4534,
872
+ "step": 5800
873
+ },
874
+ {
875
+ "epoch": 13.15175537938845,
876
+ "eval_loss": 0.7735848426818848,
877
+ "eval_runtime": 215.9988,
878
+ "eval_samples_per_second": 7.264,
879
+ "eval_steps_per_second": 0.912,
880
+ "step": 5800
881
+ },
882
+ {
883
+ "epoch": 13.378255945639864,
884
+ "grad_norm": 0.9842013716697693,
885
+ "learning_rate": 2.1746393318147306e-05,
886
+ "loss": 0.4437,
887
+ "step": 5900
888
+ },
889
+ {
890
+ "epoch": 13.378255945639864,
891
+ "eval_loss": 0.7693312764167786,
892
+ "eval_runtime": 216.0532,
893
+ "eval_samples_per_second": 7.262,
894
+ "eval_steps_per_second": 0.912,
895
+ "step": 5900
896
+ },
897
+ {
898
+ "epoch": 13.60475651189128,
899
+ "grad_norm": 0.9403685927391052,
900
+ "learning_rate": 1.8709187547456342e-05,
901
+ "loss": 0.4362,
902
+ "step": 6000
903
+ },
904
+ {
905
+ "epoch": 13.60475651189128,
906
+ "eval_loss": 0.762976348400116,
907
+ "eval_runtime": 216.071,
908
+ "eval_samples_per_second": 7.262,
909
+ "eval_steps_per_second": 0.912,
910
+ "step": 6000
911
+ },
912
+ {
913
+ "epoch": 13.831257078142695,
914
+ "grad_norm": 0.9903466105461121,
915
+ "learning_rate": 1.5671981776765377e-05,
916
+ "loss": 0.4349,
917
+ "step": 6100
918
+ },
919
+ {
920
+ "epoch": 13.831257078142695,
921
+ "eval_loss": 0.76067715883255,
922
+ "eval_runtime": 215.8384,
923
+ "eval_samples_per_second": 7.269,
924
+ "eval_steps_per_second": 0.913,
925
+ "step": 6100
926
+ },
927
+ {
928
+ "epoch": 14.058890147225368,
929
+ "grad_norm": 0.9362127780914307,
930
+ "learning_rate": 1.2634776006074411e-05,
931
+ "loss": 0.4385,
932
+ "step": 6200
933
+ },
934
+ {
935
+ "epoch": 14.058890147225368,
936
+ "eval_loss": 0.7605064511299133,
937
+ "eval_runtime": 215.995,
938
+ "eval_samples_per_second": 7.264,
939
+ "eval_steps_per_second": 0.912,
940
+ "step": 6200
941
+ },
942
+ {
943
+ "epoch": 14.285390713476783,
944
+ "grad_norm": 1.0396395921707153,
945
+ "learning_rate": 3.2087853220838355e-05,
946
+ "loss": 0.4171,
947
+ "step": 6300
948
+ },
949
+ {
950
+ "epoch": 14.285390713476783,
951
+ "eval_loss": 0.762564480304718,
952
+ "eval_runtime": 217.6902,
953
+ "eval_samples_per_second": 7.207,
954
+ "eval_steps_per_second": 0.905,
955
+ "step": 6300
956
+ },
957
+ {
958
+ "epoch": 14.5118912797282,
959
+ "grad_norm": 0.9981446266174316,
960
+ "learning_rate": 2.9409401366010447e-05,
961
+ "loss": 0.4262,
962
+ "step": 6400
963
+ },
964
+ {
965
+ "epoch": 14.5118912797282,
966
+ "eval_loss": 0.75795578956604,
967
+ "eval_runtime": 217.8007,
968
+ "eval_samples_per_second": 7.204,
969
+ "eval_steps_per_second": 0.904,
970
+ "step": 6400
971
+ },
972
+ {
973
+ "epoch": 14.738391845979615,
974
+ "grad_norm": 1.2502585649490356,
975
+ "learning_rate": 2.673094951118254e-05,
976
+ "loss": 0.423,
977
+ "step": 6500
978
+ },
979
+ {
980
+ "epoch": 14.738391845979615,
981
+ "eval_loss": 0.7491943836212158,
982
+ "eval_runtime": 217.9332,
983
+ "eval_samples_per_second": 7.199,
984
+ "eval_steps_per_second": 0.904,
985
+ "step": 6500
986
+ },
987
+ {
988
+ "epoch": 14.96489241223103,
989
+ "grad_norm": 1.0835864543914795,
990
+ "learning_rate": 2.405249765635463e-05,
991
+ "loss": 0.4219,
992
+ "step": 6600
993
+ },
994
+ {
995
+ "epoch": 14.96489241223103,
996
+ "eval_loss": 0.744881272315979,
997
+ "eval_runtime": 217.9821,
998
+ "eval_samples_per_second": 7.198,
999
+ "eval_steps_per_second": 0.904,
1000
+ "step": 6600
1001
+ },
1002
+ {
1003
+ "epoch": 15.192525481313703,
1004
+ "grad_norm": 1.0534075498580933,
1005
+ "learning_rate": 2.1374045801526718e-05,
1006
+ "loss": 0.4077,
1007
+ "step": 6700
1008
+ },
1009
+ {
1010
+ "epoch": 15.192525481313703,
1011
+ "eval_loss": 0.747353732585907,
1012
+ "eval_runtime": 217.9166,
1013
+ "eval_samples_per_second": 7.2,
1014
+ "eval_steps_per_second": 0.904,
1015
+ "step": 6700
1016
+ },
1017
+ {
1018
+ "epoch": 15.419026047565119,
1019
+ "grad_norm": 1.0463495254516602,
1020
+ "learning_rate": 1.869559394669881e-05,
1021
+ "loss": 0.3925,
1022
+ "step": 6800
1023
+ },
1024
+ {
1025
+ "epoch": 15.419026047565119,
1026
+ "eval_loss": 0.7430649399757385,
1027
+ "eval_runtime": 217.9436,
1028
+ "eval_samples_per_second": 7.199,
1029
+ "eval_steps_per_second": 0.904,
1030
+ "step": 6800
1031
+ },
1032
+ {
1033
+ "epoch": 15.645526613816534,
1034
+ "grad_norm": 0.9589468240737915,
1035
+ "learning_rate": 1.60171420918709e-05,
1036
+ "loss": 0.3932,
1037
+ "step": 6900
1038
+ },
1039
+ {
1040
+ "epoch": 15.645526613816534,
1041
+ "eval_loss": 0.7386749386787415,
1042
+ "eval_runtime": 217.8985,
1043
+ "eval_samples_per_second": 7.201,
1044
+ "eval_steps_per_second": 0.904,
1045
+ "step": 6900
1046
+ },
1047
+ {
1048
+ "epoch": 15.87202718006795,
1049
+ "grad_norm": 1.1412949562072754,
1050
+ "learning_rate": 1.333869023704299e-05,
1051
+ "loss": 0.3938,
1052
+ "step": 7000
1053
+ },
1054
+ {
1055
+ "epoch": 15.87202718006795,
1056
+ "eval_loss": 0.7356697916984558,
1057
+ "eval_runtime": 217.9654,
1058
+ "eval_samples_per_second": 7.198,
1059
+ "eval_steps_per_second": 0.904,
1060
+ "step": 7000
1061
+ },
1062
+ {
1063
+ "epoch": 16.099660249150624,
1064
+ "grad_norm": 0.9367544651031494,
1065
+ "learning_rate": 1.066023838221508e-05,
1066
+ "loss": 0.3897,
1067
+ "step": 7100
1068
+ },
1069
+ {
1070
+ "epoch": 16.099660249150624,
1071
+ "eval_loss": 0.7375982999801636,
1072
+ "eval_runtime": 217.89,
1073
+ "eval_samples_per_second": 7.201,
1074
+ "eval_steps_per_second": 0.904,
1075
+ "step": 7100
1076
+ },
1077
+ {
1078
+ "epoch": 16.326160815402037,
1079
+ "grad_norm": 1.0161197185516357,
1080
+ "learning_rate": 7.98178652738717e-06,
1081
+ "loss": 0.3713,
1082
+ "step": 7200
1083
+ },
1084
+ {
1085
+ "epoch": 16.326160815402037,
1086
+ "eval_loss": 0.7364110350608826,
1087
+ "eval_runtime": 217.9527,
1088
+ "eval_samples_per_second": 7.199,
1089
+ "eval_steps_per_second": 0.904,
1090
+ "step": 7200
1091
+ },
1092
+ {
1093
+ "epoch": 16.552661381653454,
1094
+ "grad_norm": 0.9260092377662659,
1095
+ "learning_rate": 5.303334672559261e-06,
1096
+ "loss": 0.3804,
1097
+ "step": 7300
1098
+ },
1099
+ {
1100
+ "epoch": 16.552661381653454,
1101
+ "eval_loss": 0.733728289604187,
1102
+ "eval_runtime": 217.9029,
1103
+ "eval_samples_per_second": 7.2,
1104
+ "eval_steps_per_second": 0.904,
1105
+ "step": 7300
1106
+ },
1107
+ {
1108
+ "epoch": 16.77916194790487,
1109
+ "grad_norm": 1.008174180984497,
1110
+ "learning_rate": 2.6248828177313514e-06,
1111
+ "loss": 0.3704,
1112
+ "step": 7400
1113
+ },
1114
+ {
1115
+ "epoch": 16.77916194790487,
1116
+ "eval_loss": 0.7327025532722473,
1117
+ "eval_runtime": 217.9462,
1118
+ "eval_samples_per_second": 7.199,
1119
+ "eval_steps_per_second": 0.904,
1120
+ "step": 7400
1121
+ },
1122
+ {
1123
+ "epoch": 17.006795016987542,
1124
+ "grad_norm": 0.88117915391922,
1125
+ "learning_rate": 3.0056882821387945e-05,
1126
+ "loss": 0.3357,
1127
+ "step": 7500
1128
+ },
1129
+ {
1130
+ "epoch": 17.006795016987542,
1131
+ "eval_loss": 0.7340475916862488,
1132
+ "eval_runtime": 215.1992,
1133
+ "eval_samples_per_second": 7.291,
1134
+ "eval_steps_per_second": 0.915,
1135
+ "step": 7500
1136
+ },
1137
+ {
1138
+ "epoch": 17.23329558323896,
1139
+ "grad_norm": 1.0653574466705322,
1140
+ "learning_rate": 2.7781569965870308e-05,
1141
+ "loss": 0.371,
1142
+ "step": 7600
1143
+ },
1144
+ {
1145
+ "epoch": 17.23329558323896,
1146
+ "eval_loss": 0.7350865602493286,
1147
+ "eval_runtime": 215.7811,
1148
+ "eval_samples_per_second": 7.271,
1149
+ "eval_steps_per_second": 0.913,
1150
+ "step": 7600
1151
+ },
1152
+ {
1153
+ "epoch": 17.459796149490373,
1154
+ "grad_norm": 1.0504379272460938,
1155
+ "learning_rate": 2.5506257110352678e-05,
1156
+ "loss": 0.3784,
1157
+ "step": 7700
1158
+ },
1159
+ {
1160
+ "epoch": 17.459796149490373,
1161
+ "eval_loss": 0.7296854257583618,
1162
+ "eval_runtime": 215.8568,
1163
+ "eval_samples_per_second": 7.269,
1164
+ "eval_steps_per_second": 0.913,
1165
+ "step": 7700
1166
+ },
1167
+ {
1168
+ "epoch": 17.68629671574179,
1169
+ "grad_norm": 1.2694584131240845,
1170
+ "learning_rate": 2.323094425483504e-05,
1171
+ "loss": 0.3769,
1172
+ "step": 7800
1173
+ },
1174
+ {
1175
+ "epoch": 17.68629671574179,
1176
+ "eval_loss": 0.7242677211761475,
1177
+ "eval_runtime": 215.9243,
1178
+ "eval_samples_per_second": 7.266,
1179
+ "eval_steps_per_second": 0.912,
1180
+ "step": 7800
1181
+ },
1182
+ {
1183
+ "epoch": 17.912797281993203,
1184
+ "grad_norm": 1.146080493927002,
1185
+ "learning_rate": 2.0955631399317408e-05,
1186
+ "loss": 0.3808,
1187
+ "step": 7900
1188
+ },
1189
+ {
1190
+ "epoch": 17.912797281993203,
1191
+ "eval_loss": 0.7212691307067871,
1192
+ "eval_runtime": 216.0431,
1193
+ "eval_samples_per_second": 7.262,
1194
+ "eval_steps_per_second": 0.912,
1195
+ "step": 7900
1196
+ },
1197
+ {
1198
+ "epoch": 18.140430351075878,
1199
+ "grad_norm": 1.0066969394683838,
1200
+ "learning_rate": 1.8680318543799774e-05,
1201
+ "loss": 0.3625,
1202
+ "step": 8000
1203
+ },
1204
+ {
1205
+ "epoch": 18.140430351075878,
1206
+ "eval_loss": 0.7235797643661499,
1207
+ "eval_runtime": 216.0282,
1208
+ "eval_samples_per_second": 7.263,
1209
+ "eval_steps_per_second": 0.912,
1210
+ "step": 8000
1211
+ },
1212
+ {
1213
+ "epoch": 18.366930917327295,
1214
+ "grad_norm": 1.040211796760559,
1215
+ "learning_rate": 1.6427758816837314e-05,
1216
+ "loss": 0.3506,
1217
+ "step": 8100
1218
+ },
1219
+ {
1220
+ "epoch": 18.366930917327295,
1221
+ "eval_loss": 0.7218917608261108,
1222
+ "eval_runtime": 216.0625,
1223
+ "eval_samples_per_second": 7.262,
1224
+ "eval_steps_per_second": 0.912,
1225
+ "step": 8100
1226
+ },
1227
+ {
1228
+ "epoch": 18.59343148357871,
1229
+ "grad_norm": 0.9982612133026123,
1230
+ "learning_rate": 1.4152445961319682e-05,
1231
+ "loss": 0.3531,
1232
+ "step": 8200
1233
+ },
1234
+ {
1235
+ "epoch": 18.59343148357871,
1236
+ "eval_loss": 0.7176269888877869,
1237
+ "eval_runtime": 215.9028,
1238
+ "eval_samples_per_second": 7.267,
1239
+ "eval_steps_per_second": 0.912,
1240
+ "step": 8200
1241
+ },
1242
+ {
1243
+ "epoch": 18.819932049830125,
1244
+ "grad_norm": 1.133723497390747,
1245
+ "learning_rate": 1.1877133105802047e-05,
1246
+ "loss": 0.3566,
1247
+ "step": 8300
1248
+ },
1249
+ {
1250
+ "epoch": 18.819932049830125,
1251
+ "eval_loss": 0.7127575874328613,
1252
+ "eval_runtime": 215.8405,
1253
+ "eval_samples_per_second": 7.269,
1254
+ "eval_steps_per_second": 0.913,
1255
+ "step": 8300
1256
+ },
1257
+ {
1258
+ "epoch": 19.047565118912797,
1259
+ "grad_norm": 0.9450774192810059,
1260
+ "learning_rate": 9.601820250284414e-06,
1261
+ "loss": 0.3553,
1262
+ "step": 8400
1263
+ },
1264
+ {
1265
+ "epoch": 19.047565118912797,
1266
+ "eval_loss": 0.7169352173805237,
1267
+ "eval_runtime": 215.9082,
1268
+ "eval_samples_per_second": 7.267,
1269
+ "eval_steps_per_second": 0.912,
1270
+ "step": 8400
1271
+ },
1272
+ {
1273
+ "epoch": 19.274065685164214,
1274
+ "grad_norm": 0.9257975816726685,
1275
+ "learning_rate": 7.326507394766781e-06,
1276
+ "loss": 0.3366,
1277
+ "step": 8500
1278
+ },
1279
+ {
1280
+ "epoch": 19.274065685164214,
1281
+ "eval_loss": 0.7128849625587463,
1282
+ "eval_runtime": 216.0119,
1283
+ "eval_samples_per_second": 7.263,
1284
+ "eval_steps_per_second": 0.912,
1285
+ "step": 8500
1286
+ },
1287
+ {
1288
+ "epoch": 19.500566251415627,
1289
+ "grad_norm": 1.0109655857086182,
1290
+ "learning_rate": 5.051194539249147e-06,
1291
+ "loss": 0.3335,
1292
+ "step": 8600
1293
+ },
1294
+ {
1295
+ "epoch": 19.500566251415627,
1296
+ "eval_loss": 0.7129219174385071,
1297
+ "eval_runtime": 215.981,
1298
+ "eval_samples_per_second": 7.265,
1299
+ "eval_steps_per_second": 0.912,
1300
+ "step": 8600
1301
+ },
1302
+ {
1303
+ "epoch": 19.727066817667044,
1304
+ "grad_norm": 1.021828293800354,
1305
+ "learning_rate": 2.7758816837315134e-06,
1306
+ "loss": 0.3398,
1307
+ "step": 8700
1308
+ },
1309
+ {
1310
+ "epoch": 19.727066817667044,
1311
+ "eval_loss": 0.7109224200248718,
1312
+ "eval_runtime": 215.6766,
1313
+ "eval_samples_per_second": 7.275,
1314
+ "eval_steps_per_second": 0.913,
1315
+ "step": 8700
1316
+ },
1317
+ {
1318
+ "epoch": 19.95356738391846,
1319
+ "grad_norm": 1.0146093368530273,
1320
+ "learning_rate": 5.005688282138794e-07,
1321
+ "loss": 0.3375,
1322
+ "step": 8800
1323
+ },
1324
+ {
1325
+ "epoch": 19.95356738391846,
1326
+ "eval_loss": 0.7103046774864197,
1327
+ "eval_runtime": 215.5394,
1328
+ "eval_samples_per_second": 7.279,
1329
+ "eval_steps_per_second": 0.914,
1330
+ "step": 8800
1331
+ },
1332
+ {
1333
+ "epoch": 20.181200453001132,
1334
+ "grad_norm": 1.1958893537521362,
1335
+ "learning_rate": 2.4621773954316228e-05,
1336
+ "loss": 0.3306,
1337
+ "step": 8900
1338
+ },
1339
+ {
1340
+ "epoch": 20.181200453001132,
1341
+ "eval_loss": 0.7169004082679749,
1342
+ "eval_runtime": 215.7176,
1343
+ "eval_samples_per_second": 7.273,
1344
+ "eval_steps_per_second": 0.913,
1345
+ "step": 8900
1346
+ },
1347
+ {
1348
+ "epoch": 20.40770101925255,
1349
+ "grad_norm": 1.2821824550628662,
1350
+ "learning_rate": 2.2644121427865127e-05,
1351
+ "loss": 0.3442,
1352
+ "step": 9000
1353
+ },
1354
+ {
1355
+ "epoch": 20.40770101925255,
1356
+ "eval_loss": 0.710050106048584,
1357
+ "eval_runtime": 215.7663,
1358
+ "eval_samples_per_second": 7.272,
1359
+ "eval_steps_per_second": 0.913,
1360
+ "step": 9000
1361
+ },
1362
+ {
1363
+ "epoch": 20.634201585503963,
1364
+ "grad_norm": 1.118553876876831,
1365
+ "learning_rate": 2.0666468901414023e-05,
1366
+ "loss": 0.3434,
1367
+ "step": 9100
1368
+ },
1369
+ {
1370
+ "epoch": 20.634201585503963,
1371
+ "eval_loss": 0.7061675190925598,
1372
+ "eval_runtime": 215.9231,
1373
+ "eval_samples_per_second": 7.266,
1374
+ "eval_steps_per_second": 0.912,
1375
+ "step": 9100
1376
+ },
1377
+ {
1378
+ "epoch": 20.86070215175538,
1379
+ "grad_norm": 1.131901502609253,
1380
+ "learning_rate": 1.868881637496292e-05,
1381
+ "loss": 0.3431,
1382
+ "step": 9200
1383
+ },
1384
+ {
1385
+ "epoch": 20.86070215175538,
1386
+ "eval_loss": 0.7025442123413086,
1387
+ "eval_runtime": 215.8012,
1388
+ "eval_samples_per_second": 7.271,
1389
+ "eval_steps_per_second": 0.913,
1390
+ "step": 9200
1391
+ },
1392
+ {
1393
+ "epoch": 21.08833522083805,
1394
+ "grad_norm": 1.063588261604309,
1395
+ "learning_rate": 1.6711163848511818e-05,
1396
+ "loss": 0.3361,
1397
+ "step": 9300
1398
+ },
1399
+ {
1400
+ "epoch": 21.08833522083805,
1401
+ "eval_loss": 0.7048903107643127,
1402
+ "eval_runtime": 215.6814,
1403
+ "eval_samples_per_second": 7.275,
1404
+ "eval_steps_per_second": 0.913,
1405
+ "step": 9300
1406
+ },
1407
+ {
1408
+ "epoch": 21.314835787089468,
1409
+ "grad_norm": 1.0730327367782593,
1410
+ "learning_rate": 1.4733511322060714e-05,
1411
+ "loss": 0.3201,
1412
+ "step": 9400
1413
+ },
1414
+ {
1415
+ "epoch": 21.314835787089468,
1416
+ "eval_loss": 0.7042800188064575,
1417
+ "eval_runtime": 215.7788,
1418
+ "eval_samples_per_second": 7.271,
1419
+ "eval_steps_per_second": 0.913,
1420
+ "step": 9400
1421
+ },
1422
+ {
1423
+ "epoch": 21.541336353340885,
1424
+ "grad_norm": 1.002642273902893,
1425
+ "learning_rate": 1.2755858795609612e-05,
1426
+ "loss": 0.3248,
1427
+ "step": 9500
1428
+ },
1429
+ {
1430
+ "epoch": 21.541336353340885,
1431
+ "eval_loss": 0.6989186406135559,
1432
+ "eval_runtime": 215.7873,
1433
+ "eval_samples_per_second": 7.271,
1434
+ "eval_steps_per_second": 0.913,
1435
+ "step": 9500
1436
+ },
1437
+ {
1438
+ "epoch": 21.7678369195923,
1439
+ "grad_norm": 1.184122920036316,
1440
+ "learning_rate": 1.077820626915851e-05,
1441
+ "loss": 0.3183,
1442
+ "step": 9600
1443
+ },
1444
+ {
1445
+ "epoch": 21.7678369195923,
1446
+ "eval_loss": 0.6965128183364868,
1447
+ "eval_runtime": 215.7509,
1448
+ "eval_samples_per_second": 7.272,
1449
+ "eval_steps_per_second": 0.913,
1450
+ "step": 9600
1451
+ },
1452
+ {
1453
+ "epoch": 21.994337485843715,
1454
+ "grad_norm": 1.00730562210083,
1455
+ "learning_rate": 8.800553742707407e-06,
1456
+ "loss": 0.3243,
1457
+ "step": 9700
1458
+ },
1459
+ {
1460
+ "epoch": 21.994337485843715,
1461
+ "eval_loss": 0.69374018907547,
1462
+ "eval_runtime": 215.6336,
1463
+ "eval_samples_per_second": 7.276,
1464
+ "eval_steps_per_second": 0.914,
1465
+ "step": 9700
1466
+ },
1467
+ {
1468
+ "epoch": 22.221970554926386,
1469
+ "grad_norm": 1.0043355226516724,
1470
+ "learning_rate": 6.822901216256304e-06,
1471
+ "loss": 0.3118,
1472
+ "step": 9800
1473
+ },
1474
+ {
1475
+ "epoch": 22.221970554926386,
1476
+ "eval_loss": 0.6979511380195618,
1477
+ "eval_runtime": 215.6267,
1478
+ "eval_samples_per_second": 7.276,
1479
+ "eval_steps_per_second": 0.914,
1480
+ "step": 9800
1481
+ },
1482
+ {
1483
+ "epoch": 22.448471121177803,
1484
+ "grad_norm": 1.0051404237747192,
1485
+ "learning_rate": 4.845248689805201e-06,
1486
+ "loss": 0.3068,
1487
+ "step": 9900
1488
+ },
1489
+ {
1490
+ "epoch": 22.448471121177803,
1491
+ "eval_loss": 0.6955877542495728,
1492
+ "eval_runtime": 215.5799,
1493
+ "eval_samples_per_second": 7.278,
1494
+ "eval_steps_per_second": 0.914,
1495
+ "step": 9900
1496
+ },
1497
+ {
1498
+ "epoch": 22.674971687429217,
1499
+ "grad_norm": 0.9242558479309082,
1500
+ "learning_rate": 2.8675961633540988e-06,
1501
+ "loss": 0.3094,
1502
+ "step": 10000
1503
+ },
1504
+ {
1505
+ "epoch": 22.674971687429217,
1506
+ "eval_loss": 0.6944177746772766,
1507
+ "eval_runtime": 215.6097,
1508
+ "eval_samples_per_second": 7.277,
1509
+ "eval_steps_per_second": 0.914,
1510
+ "step": 10000
1511
+ },
1512
+ {
1513
+ "epoch": 22.901472253680634,
1514
+ "grad_norm": 1.057633638381958,
1515
+ "learning_rate": 8.899436369029961e-07,
1516
+ "loss": 0.3092,
1517
+ "step": 10100
1518
+ },
1519
+ {
1520
+ "epoch": 22.901472253680634,
1521
+ "eval_loss": 0.6938230991363525,
1522
+ "eval_runtime": 215.7357,
1523
+ "eval_samples_per_second": 7.273,
1524
+ "eval_steps_per_second": 0.913,
1525
+ "step": 10100
1526
+ },
1527
+ {
1528
+ "epoch": 23.12910532276331,
1529
+ "grad_norm": 1.0774931907653809,
1530
+ "learning_rate": 1.5043201455206912e-05,
1531
+ "loss": 0.3076,
1532
+ "step": 10200
1533
+ },
1534
+ {
1535
+ "epoch": 23.12910532276331,
1536
+ "eval_loss": 0.6974820494651794,
1537
+ "eval_runtime": 214.8368,
1538
+ "eval_samples_per_second": 7.303,
1539
+ "eval_steps_per_second": 0.917,
1540
+ "step": 10200
1541
+ },
1542
+ {
1543
+ "epoch": 23.355605889014722,
1544
+ "grad_norm": 1.0320171117782593,
1545
+ "learning_rate": 1.3224192814915873e-05,
1546
+ "loss": 0.3084,
1547
+ "step": 10300
1548
+ },
1549
+ {
1550
+ "epoch": 23.355605889014722,
1551
+ "eval_loss": 0.6950554251670837,
1552
+ "eval_runtime": 214.9493,
1553
+ "eval_samples_per_second": 7.299,
1554
+ "eval_steps_per_second": 0.916,
1555
+ "step": 10300
1556
+ },
1557
+ {
1558
+ "epoch": 23.58210645526614,
1559
+ "grad_norm": 1.0282411575317383,
1560
+ "learning_rate": 1.142337426102774e-05,
1561
+ "loss": 0.3077,
1562
+ "step": 10400
1563
+ },
1564
+ {
1565
+ "epoch": 23.58210645526614,
1566
+ "eval_loss": 0.6931228637695312,
1567
+ "eval_runtime": 214.8764,
1568
+ "eval_samples_per_second": 7.302,
1569
+ "eval_steps_per_second": 0.917,
1570
+ "step": 10400
1571
+ },
1572
+ {
1573
+ "epoch": 23.808607021517552,
1574
+ "grad_norm": 1.0363404750823975,
1575
+ "learning_rate": 9.6043656207367e-06,
1576
+ "loss": 0.3088,
1577
+ "step": 10500
1578
+ },
1579
+ {
1580
+ "epoch": 23.808607021517552,
1581
+ "eval_loss": 0.6890929341316223,
1582
+ "eval_runtime": 214.9182,
1583
+ "eval_samples_per_second": 7.3,
1584
+ "eval_steps_per_second": 0.917,
1585
+ "step": 10500
1586
+ },
1587
+ {
1588
+ "epoch": 24.036240090600227,
1589
+ "grad_norm": 0.9383705854415894,
1590
+ "learning_rate": 7.785356980445657e-06,
1591
+ "loss": 0.3092,
1592
+ "step": 10600
1593
+ },
1594
+ {
1595
+ "epoch": 24.036240090600227,
1596
+ "eval_loss": 0.6934636831283569,
1597
+ "eval_runtime": 214.8602,
1598
+ "eval_samples_per_second": 7.302,
1599
+ "eval_steps_per_second": 0.917,
1600
+ "step": 10600
1601
+ },
1602
+ {
1603
+ "epoch": 24.26274065685164,
1604
+ "grad_norm": 0.905327320098877,
1605
+ "learning_rate": 5.966348340154616e-06,
1606
+ "loss": 0.2941,
1607
+ "step": 10700
1608
+ },
1609
+ {
1610
+ "epoch": 24.26274065685164,
1611
+ "eval_loss": 0.691253125667572,
1612
+ "eval_runtime": 215.1136,
1613
+ "eval_samples_per_second": 7.294,
1614
+ "eval_steps_per_second": 0.916,
1615
+ "step": 10700
1616
+ },
1617
+ {
1618
+ "epoch": 24.489241223103058,
1619
+ "grad_norm": 0.9248319864273071,
1620
+ "learning_rate": 4.147339699863575e-06,
1621
+ "loss": 0.2965,
1622
+ "step": 10800
1623
+ },
1624
+ {
1625
+ "epoch": 24.489241223103058,
1626
+ "eval_loss": 0.688778281211853,
1627
+ "eval_runtime": 215.1479,
1628
+ "eval_samples_per_second": 7.293,
1629
+ "eval_steps_per_second": 0.916,
1630
+ "step": 10800
1631
+ },
1632
+ {
1633
+ "epoch": 24.715741789354475,
1634
+ "grad_norm": 0.9588720202445984,
1635
+ "learning_rate": 2.3283310595725328e-06,
1636
+ "loss": 0.296,
1637
+ "step": 10900
1638
+ },
1639
+ {
1640
+ "epoch": 24.715741789354475,
1641
+ "eval_loss": 0.6882308125495911,
1642
+ "eval_runtime": 215.299,
1643
+ "eval_samples_per_second": 7.288,
1644
+ "eval_steps_per_second": 0.915,
1645
+ "step": 10900
1646
+ },
1647
+ {
1648
+ "epoch": 24.942242355605888,
1649
+ "grad_norm": 0.9268137216567993,
1650
+ "learning_rate": 5.093224192814915e-07,
1651
+ "loss": 0.2979,
1652
+ "step": 11000
1653
+ },
1654
+ {
1655
+ "epoch": 24.942242355605888,
1656
+ "eval_loss": 0.6880246996879578,
1657
+ "eval_runtime": 215.276,
1658
+ "eval_samples_per_second": 7.288,
1659
+ "eval_steps_per_second": 0.915,
1660
+ "step": 11000
1661
+ },
1662
+ {
1663
+ "epoch": 25.169875424688563,
1664
+ "grad_norm": 1.163781762123108,
1665
+ "learning_rate": 3.231818181818182e-05,
1666
+ "loss": 0.299,
1667
+ "step": 11100
1668
+ },
1669
+ {
1670
+ "epoch": 25.169875424688563,
1671
+ "eval_loss": 0.6973585486412048,
1672
+ "eval_runtime": 215.0775,
1673
+ "eval_samples_per_second": 7.295,
1674
+ "eval_steps_per_second": 0.916,
1675
+ "step": 11100
1676
+ },
1677
+ {
1678
+ "epoch": 25.396375990939976,
1679
+ "grad_norm": 1.3955178260803223,
1680
+ "learning_rate": 3.0803030303030304e-05,
1681
+ "loss": 0.3092,
1682
+ "step": 11200
1683
+ },
1684
+ {
1685
+ "epoch": 25.396375990939976,
1686
+ "eval_loss": 0.6922005414962769,
1687
+ "eval_runtime": 214.9326,
1688
+ "eval_samples_per_second": 7.3,
1689
+ "eval_steps_per_second": 0.917,
1690
+ "step": 11200
1691
+ },
1692
+ {
1693
+ "epoch": 25.622876557191393,
1694
+ "grad_norm": 1.630706548690796,
1695
+ "learning_rate": 2.9287878787878793e-05,
1696
+ "loss": 0.3106,
1697
+ "step": 11300
1698
+ },
1699
+ {
1700
+ "epoch": 25.622876557191393,
1701
+ "eval_loss": 0.6866771578788757,
1702
+ "eval_runtime": 215.0055,
1703
+ "eval_samples_per_second": 7.297,
1704
+ "eval_steps_per_second": 0.916,
1705
+ "step": 11300
1706
+ },
1707
+ {
1708
+ "epoch": 25.84937712344281,
1709
+ "grad_norm": 1.2561249732971191,
1710
+ "learning_rate": 2.7772727272727272e-05,
1711
+ "loss": 0.3113,
1712
+ "step": 11400
1713
+ },
1714
+ {
1715
+ "epoch": 25.84937712344281,
1716
+ "eval_loss": 0.6819568276405334,
1717
+ "eval_runtime": 214.834,
1718
+ "eval_samples_per_second": 7.303,
1719
+ "eval_steps_per_second": 0.917,
1720
+ "step": 11400
1721
+ },
1722
+ {
1723
+ "epoch": 26.07701019252548,
1724
+ "grad_norm": 1.5864068269729614,
1725
+ "learning_rate": 2.6257575757575757e-05,
1726
+ "loss": 0.3081,
1727
+ "step": 11500
1728
+ },
1729
+ {
1730
+ "epoch": 26.07701019252548,
1731
+ "eval_loss": 0.683868408203125,
1732
+ "eval_runtime": 214.7915,
1733
+ "eval_samples_per_second": 7.305,
1734
+ "eval_steps_per_second": 0.917,
1735
+ "step": 11500
1736
+ },
1737
+ {
1738
+ "epoch": 26.3035107587769,
1739
+ "grad_norm": 1.2162772417068481,
1740
+ "learning_rate": 2.475757575757576e-05,
1741
+ "loss": 0.2911,
1742
+ "step": 11600
1743
+ },
1744
+ {
1745
+ "epoch": 26.3035107587769,
1746
+ "eval_loss": 0.6809530258178711,
1747
+ "eval_runtime": 214.7759,
1748
+ "eval_samples_per_second": 7.305,
1749
+ "eval_steps_per_second": 0.917,
1750
+ "step": 11600
1751
+ },
1752
+ {
1753
+ "epoch": 26.530011325028312,
1754
+ "grad_norm": 1.2697758674621582,
1755
+ "learning_rate": 2.3242424242424243e-05,
1756
+ "loss": 0.289,
1757
+ "step": 11700
1758
+ },
1759
+ {
1760
+ "epoch": 26.530011325028312,
1761
+ "eval_loss": 0.6744586229324341,
1762
+ "eval_runtime": 214.8373,
1763
+ "eval_samples_per_second": 7.303,
1764
+ "eval_steps_per_second": 0.917,
1765
+ "step": 11700
1766
+ },
1767
+ {
1768
+ "epoch": 26.75651189127973,
1769
+ "grad_norm": 1.2038872241973877,
1770
+ "learning_rate": 2.172727272727273e-05,
1771
+ "loss": 0.293,
1772
+ "step": 11800
1773
+ },
1774
+ {
1775
+ "epoch": 26.75651189127973,
1776
+ "eval_loss": 0.6742814183235168,
1777
+ "eval_runtime": 214.8304,
1778
+ "eval_samples_per_second": 7.303,
1779
+ "eval_steps_per_second": 0.917,
1780
+ "step": 11800
1781
+ },
1782
+ {
1783
+ "epoch": 26.983012457531142,
1784
+ "grad_norm": 1.2992146015167236,
1785
+ "learning_rate": 2.0212121212121214e-05,
1786
+ "loss": 0.2921,
1787
+ "step": 11900
1788
+ },
1789
+ {
1790
+ "epoch": 26.983012457531142,
1791
+ "eval_loss": 0.6651853322982788,
1792
+ "eval_runtime": 214.8882,
1793
+ "eval_samples_per_second": 7.301,
1794
+ "eval_steps_per_second": 0.917,
1795
+ "step": 11900
1796
+ },
1797
+ {
1798
+ "epoch": 27.210645526613817,
1799
+ "grad_norm": 1.5248701572418213,
1800
+ "learning_rate": 1.86969696969697e-05,
1801
+ "loss": 0.2761,
1802
+ "step": 12000
1803
+ },
1804
+ {
1805
+ "epoch": 27.210645526613817,
1806
+ "eval_loss": 0.6715984344482422,
1807
+ "eval_runtime": 214.8134,
1808
+ "eval_samples_per_second": 7.304,
1809
+ "eval_steps_per_second": 0.917,
1810
+ "step": 12000
1811
+ },
1812
+ {
1813
+ "epoch": 27.43714609286523,
1814
+ "grad_norm": 1.3465806245803833,
1815
+ "learning_rate": 1.718181818181818e-05,
1816
+ "loss": 0.2778,
1817
+ "step": 12100
1818
+ },
1819
+ {
1820
+ "epoch": 27.43714609286523,
1821
+ "eval_loss": 0.6680512428283691,
1822
+ "eval_runtime": 214.8681,
1823
+ "eval_samples_per_second": 7.302,
1824
+ "eval_steps_per_second": 0.917,
1825
+ "step": 12100
1826
+ },
1827
+ {
1828
+ "epoch": 27.663646659116647,
1829
+ "grad_norm": 1.044279932975769,
1830
+ "learning_rate": 1.5666666666666667e-05,
1831
+ "loss": 0.2766,
1832
+ "step": 12200
1833
+ },
1834
+ {
1835
+ "epoch": 27.663646659116647,
1836
+ "eval_loss": 0.66595059633255,
1837
+ "eval_runtime": 214.8457,
1838
+ "eval_samples_per_second": 7.303,
1839
+ "eval_steps_per_second": 0.917,
1840
+ "step": 12200
1841
+ },
1842
+ {
1843
+ "epoch": 27.890147225368064,
1844
+ "grad_norm": 1.033585548400879,
1845
+ "learning_rate": 1.4151515151515152e-05,
1846
+ "loss": 0.2755,
1847
+ "step": 12300
1848
+ },
1849
+ {
1850
+ "epoch": 27.890147225368064,
1851
+ "eval_loss": 0.6637829542160034,
1852
+ "eval_runtime": 215.9457,
1853
+ "eval_samples_per_second": 7.266,
1854
+ "eval_steps_per_second": 0.912,
1855
+ "step": 12300
1856
+ },
1857
+ {
1858
+ "epoch": 28.117780294450736,
1859
+ "grad_norm": 1.020391821861267,
1860
+ "learning_rate": 1.2636363636363638e-05,
1861
+ "loss": 0.267,
1862
+ "step": 12400
1863
+ },
1864
+ {
1865
+ "epoch": 28.117780294450736,
1866
+ "eval_loss": 0.6649137139320374,
1867
+ "eval_runtime": 215.9471,
1868
+ "eval_samples_per_second": 7.266,
1869
+ "eval_steps_per_second": 0.912,
1870
+ "step": 12400
1871
+ },
1872
+ {
1873
+ "epoch": 28.344280860702153,
1874
+ "grad_norm": 0.961401641368866,
1875
+ "learning_rate": 1.1121212121212121e-05,
1876
+ "loss": 0.2603,
1877
+ "step": 12500
1878
+ },
1879
+ {
1880
+ "epoch": 28.344280860702153,
1881
+ "eval_loss": 0.6652226448059082,
1882
+ "eval_runtime": 215.9698,
1883
+ "eval_samples_per_second": 7.265,
1884
+ "eval_steps_per_second": 0.912,
1885
+ "step": 12500
1886
+ },
1887
+ {
1888
+ "epoch": 28.570781426953566,
1889
+ "grad_norm": 0.9490793347358704,
1890
+ "learning_rate": 9.606060606060607e-06,
1891
+ "loss": 0.2619,
1892
+ "step": 12600
1893
+ },
1894
+ {
1895
+ "epoch": 28.570781426953566,
1896
+ "eval_loss": 0.6631964445114136,
1897
+ "eval_runtime": 216.054,
1898
+ "eval_samples_per_second": 7.262,
1899
+ "eval_steps_per_second": 0.912,
1900
+ "step": 12600
1901
+ },
1902
+ {
1903
+ "epoch": 28.797281993204983,
1904
+ "grad_norm": 1.015561580657959,
1905
+ "learning_rate": 8.09090909090909e-06,
1906
+ "loss": 0.2648,
1907
+ "step": 12700
1908
+ },
1909
+ {
1910
+ "epoch": 28.797281993204983,
1911
+ "eval_loss": 0.6587069034576416,
1912
+ "eval_runtime": 216.0262,
1913
+ "eval_samples_per_second": 7.263,
1914
+ "eval_steps_per_second": 0.912,
1915
+ "step": 12700
1916
+ },
1917
+ {
1918
+ "epoch": 29.024915062287654,
1919
+ "grad_norm": 0.8936730623245239,
1920
+ "learning_rate": 6.575757575757575e-06,
1921
+ "loss": 0.2621,
1922
+ "step": 12800
1923
+ },
1924
+ {
1925
+ "epoch": 29.024915062287654,
1926
+ "eval_loss": 0.6623508930206299,
1927
+ "eval_runtime": 216.1177,
1928
+ "eval_samples_per_second": 7.26,
1929
+ "eval_steps_per_second": 0.912,
1930
+ "step": 12800
1931
+ },
1932
+ {
1933
+ "epoch": 29.25141562853907,
1934
+ "grad_norm": 0.8353849053382874,
1935
+ "learning_rate": 5.060606060606061e-06,
1936
+ "loss": 0.2497,
1937
+ "step": 12900
1938
+ },
1939
+ {
1940
+ "epoch": 29.25141562853907,
1941
+ "eval_loss": 0.6611577272415161,
1942
+ "eval_runtime": 216.148,
1943
+ "eval_samples_per_second": 7.259,
1944
+ "eval_steps_per_second": 0.911,
1945
+ "step": 12900
1946
+ },
1947
+ {
1948
+ "epoch": 29.477916194790488,
1949
+ "grad_norm": 0.9153599739074707,
1950
+ "learning_rate": 3.5454545454545454e-06,
1951
+ "loss": 0.2555,
1952
+ "step": 13000
1953
+ },
1954
+ {
1955
+ "epoch": 29.477916194790488,
1956
+ "eval_loss": 0.660586416721344,
1957
+ "eval_runtime": 216.1833,
1958
+ "eval_samples_per_second": 7.258,
1959
+ "eval_steps_per_second": 0.911,
1960
+ "step": 13000
1961
+ },
1962
+ {
1963
+ "epoch": 29.7044167610419,
1964
+ "grad_norm": 0.9058592319488525,
1965
+ "learning_rate": 2.0303030303030303e-06,
1966
+ "loss": 0.2502,
1967
+ "step": 13100
1968
+ },
1969
+ {
1970
+ "epoch": 29.7044167610419,
1971
+ "eval_loss": 0.6602495908737183,
1972
+ "eval_runtime": 216.1128,
1973
+ "eval_samples_per_second": 7.26,
1974
+ "eval_steps_per_second": 0.912,
1975
+ "step": 13100
1976
+ },
1977
+ {
1978
+ "epoch": 29.93091732729332,
1979
+ "grad_norm": 0.9629655480384827,
1980
+ "learning_rate": 5.151515151515152e-07,
1981
+ "loss": 0.2525,
1982
+ "step": 13200
1983
+ },
1984
+ {
1985
+ "epoch": 29.93091732729332,
1986
+ "eval_loss": 0.6595110297203064,
1987
+ "eval_runtime": 216.0902,
1988
+ "eval_samples_per_second": 7.261,
1989
+ "eval_steps_per_second": 0.912,
1990
+ "step": 13200
1991
+ }
1992
+ ],
1993
+ "logging_steps": 100,
1994
+ "max_steps": 13230,
1995
+ "num_input_tokens_seen": 0,
1996
+ "num_train_epochs": 30,
1997
+ "save_steps": 100,
1998
+ "stateful_callbacks": {
1999
+ "TrainerControl": {
2000
+ "args": {
2001
+ "should_epoch_stop": false,
2002
+ "should_evaluate": false,
2003
+ "should_log": false,
2004
+ "should_save": true,
2005
+ "should_training_stop": false
2006
+ },
2007
+ "attributes": {}
2008
+ }
2009
+ },
2010
+ "total_flos": 6.633805787136e+17,
2011
+ "train_batch_size": 8,
2012
+ "trial_name": null,
2013
+ "trial_params": null
2014
+ }
checkpoint-13200/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ddd1454bdddf5d6a918ffb8a8fbe7331b220c89a157922bc08953307b6efaaae
3
+ size 6161
checkpoint-13200/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-13230/README.md ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: unsloth/qwen3-0.6b-unsloth-bnb-4bit
3
+ library_name: peft
4
+ ---
5
+
6
+ # Model Card for Model ID
7
+
8
+ <!-- Provide a quick summary of what the model is/does. -->
9
+
10
+
11
+
12
+ ## Model Details
13
+
14
+ ### Model Description
15
+
16
+ <!-- Provide a longer summary of what this model is. -->
17
+
18
+
19
+
20
+ - **Developed by:** [More Information Needed]
21
+ - **Funded by [optional]:** [More Information Needed]
22
+ - **Shared by [optional]:** [More Information Needed]
23
+ - **Model type:** [More Information Needed]
24
+ - **Language(s) (NLP):** [More Information Needed]
25
+ - **License:** [More Information Needed]
26
+ - **Finetuned from model [optional]:** [More Information Needed]
27
+
28
+ ### Model Sources [optional]
29
+
30
+ <!-- Provide the basic links for the model. -->
31
+
32
+ - **Repository:** [More Information Needed]
33
+ - **Paper [optional]:** [More Information Needed]
34
+ - **Demo [optional]:** [More Information Needed]
35
+
36
+ ## Uses
37
+
38
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
39
+
40
+ ### Direct Use
41
+
42
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
43
+
44
+ [More Information Needed]
45
+
46
+ ### Downstream Use [optional]
47
+
48
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
49
+
50
+ [More Information Needed]
51
+
52
+ ### Out-of-Scope Use
53
+
54
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
55
+
56
+ [More Information Needed]
57
+
58
+ ## Bias, Risks, and Limitations
59
+
60
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
61
+
62
+ [More Information Needed]
63
+
64
+ ### Recommendations
65
+
66
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
67
+
68
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
69
+
70
+ ## How to Get Started with the Model
71
+
72
+ Use the code below to get started with the model.
73
+
74
+ [More Information Needed]
75
+
76
+ ## Training Details
77
+
78
+ ### Training Data
79
+
80
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
+
82
+ [More Information Needed]
83
+
84
+ ### Training Procedure
85
+
86
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
87
+
88
+ #### Preprocessing [optional]
89
+
90
+ [More Information Needed]
91
+
92
+
93
+ #### Training Hyperparameters
94
+
95
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
96
+
97
+ #### Speeds, Sizes, Times [optional]
98
+
99
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
100
+
101
+ [More Information Needed]
102
+
103
+ ## Evaluation
104
+
105
+ <!-- This section describes the evaluation protocols and provides the results. -->
106
+
107
+ ### Testing Data, Factors & Metrics
108
+
109
+ #### Testing Data
110
+
111
+ <!-- This should link to a Dataset Card if possible. -->
112
+
113
+ [More Information Needed]
114
+
115
+ #### Factors
116
+
117
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
118
+
119
+ [More Information Needed]
120
+
121
+ #### Metrics
122
+
123
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
124
+
125
+ [More Information Needed]
126
+
127
+ ### Results
128
+
129
+ [More Information Needed]
130
+
131
+ #### Summary
132
+
133
+
134
+
135
+ ## Model Examination [optional]
136
+
137
+ <!-- Relevant interpretability work for the model goes here -->
138
+
139
+ [More Information Needed]
140
+
141
+ ## Environmental Impact
142
+
143
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
144
+
145
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
146
+
147
+ - **Hardware Type:** [More Information Needed]
148
+ - **Hours used:** [More Information Needed]
149
+ - **Cloud Provider:** [More Information Needed]
150
+ - **Compute Region:** [More Information Needed]
151
+ - **Carbon Emitted:** [More Information Needed]
152
+
153
+ ## Technical Specifications [optional]
154
+
155
+ ### Model Architecture and Objective
156
+
157
+ [More Information Needed]
158
+
159
+ ### Compute Infrastructure
160
+
161
+ [More Information Needed]
162
+
163
+ #### Hardware
164
+
165
+ [More Information Needed]
166
+
167
+ #### Software
168
+
169
+ [More Information Needed]
170
+
171
+ ## Citation [optional]
172
+
173
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
174
+
175
+ **BibTeX:**
176
+
177
+ [More Information Needed]
178
+
179
+ **APA:**
180
+
181
+ [More Information Needed]
182
+
183
+ ## Glossary [optional]
184
+
185
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
186
+
187
+ [More Information Needed]
188
+
189
+ ## More Information [optional]
190
+
191
+ [More Information Needed]
192
+
193
+ ## Model Card Authors [optional]
194
+
195
+ [More Information Needed]
196
+
197
+ ## Model Card Contact
198
+
199
+ [More Information Needed]
200
+ ### Framework versions
201
+
202
+ - PEFT 0.14.0
checkpoint-13230/adapter_config.json ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "unsloth/qwen3-0.6b-unsloth-bnb-4bit",
5
+ "bias": "none",
6
+ "eva_config": null,
7
+ "exclude_modules": null,
8
+ "fan_in_fan_out": false,
9
+ "inference_mode": true,
10
+ "init_lora_weights": true,
11
+ "layer_replication": null,
12
+ "layers_pattern": null,
13
+ "layers_to_transform": null,
14
+ "loftq_config": {},
15
+ "lora_alpha": 16,
16
+ "lora_bias": false,
17
+ "lora_dropout": 0,
18
+ "megatron_config": null,
19
+ "megatron_core": "megatron.core",
20
+ "modules_to_save": null,
21
+ "peft_type": "LORA",
22
+ "r": 16,
23
+ "rank_pattern": {},
24
+ "revision": null,
25
+ "target_modules": [
26
+ "up_proj",
27
+ "q_proj",
28
+ "down_proj",
29
+ "k_proj",
30
+ "gate_proj",
31
+ "v_proj",
32
+ "o_proj"
33
+ ],
34
+ "task_type": "CAUSAL_LM",
35
+ "use_dora": false,
36
+ "use_rslora": false
37
+ }
checkpoint-13230/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6b03ffb6c9ccc44fbdbe87e699eebe64564f8c0b0421c09af94cbd7f6973fa93
3
+ size 40422168
checkpoint-13230/added_tokens.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</think>": 151668,
3
+ "</tool_call>": 151658,
4
+ "</tool_response>": 151666,
5
+ "<think>": 151667,
6
+ "<tool_call>": 151657,
7
+ "<tool_response>": 151665,
8
+ "<|box_end|>": 151649,
9
+ "<|box_start|>": 151648,
10
+ "<|endoftext|>": 151643,
11
+ "<|file_sep|>": 151664,
12
+ "<|fim_middle|>": 151660,
13
+ "<|fim_pad|>": 151662,
14
+ "<|fim_prefix|>": 151659,
15
+ "<|fim_suffix|>": 151661,
16
+ "<|im_end|>": 151645,
17
+ "<|im_start|>": 151644,
18
+ "<|image_pad|>": 151655,
19
+ "<|object_ref_end|>": 151647,
20
+ "<|object_ref_start|>": 151646,
21
+ "<|quad_end|>": 151651,
22
+ "<|quad_start|>": 151650,
23
+ "<|repo_name|>": 151663,
24
+ "<|video_pad|>": 151656,
25
+ "<|vision_end|>": 151653,
26
+ "<|vision_pad|>": 151654,
27
+ "<|vision_start|>": 151652
28
+ }
checkpoint-13230/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-13230/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4226c840dc3a8513d3f2a56be7c50eb6c773c320706985becb35b998593f7ae7
3
+ size 21979091
checkpoint-13230/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:970f20ed294a0d8701bfcccff463f693cc5687d59a27abebbf761ee3255ea47d
3
+ size 14645
checkpoint-13230/scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e50017faec0b4c5f5b0cd3658e2a583c5c4a15823c3e3313abbd4d8b97dbb79d
3
+ size 1383
checkpoint-13230/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9c6fd0219c5c2765064cacc396ba2623ebdd31f73f4547e5338da2d5e5ad8f8a
3
+ size 1465
checkpoint-13230/special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
16
+ ],
17
+ "eos_token": {
18
+ "content": "<|im_end|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "pad_token": {
25
+ "content": "<|vision_pad|>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ }
31
+ }
checkpoint-13230/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ae1a036a9837df9caeebb840d09d80e8feef0f6d2bae982970d1ad34f5946aff
3
+ size 11422753
checkpoint-13230/tokenizer_config.json ADDED
@@ -0,0 +1,241 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ },
181
+ "151665": {
182
+ "content": "<tool_response>",
183
+ "lstrip": false,
184
+ "normalized": false,
185
+ "rstrip": false,
186
+ "single_word": false,
187
+ "special": false
188
+ },
189
+ "151666": {
190
+ "content": "</tool_response>",
191
+ "lstrip": false,
192
+ "normalized": false,
193
+ "rstrip": false,
194
+ "single_word": false,
195
+ "special": false
196
+ },
197
+ "151667": {
198
+ "content": "<think>",
199
+ "lstrip": false,
200
+ "normalized": false,
201
+ "rstrip": false,
202
+ "single_word": false,
203
+ "special": false
204
+ },
205
+ "151668": {
206
+ "content": "</think>",
207
+ "lstrip": false,
208
+ "normalized": false,
209
+ "rstrip": false,
210
+ "single_word": false,
211
+ "special": false
212
+ }
213
+ },
214
+ "additional_special_tokens": [
215
+ "<|im_start|>",
216
+ "<|im_end|>",
217
+ "<|object_ref_start|>",
218
+ "<|object_ref_end|>",
219
+ "<|box_start|>",
220
+ "<|box_end|>",
221
+ "<|quad_start|>",
222
+ "<|quad_end|>",
223
+ "<|vision_start|>",
224
+ "<|vision_end|>",
225
+ "<|vision_pad|>",
226
+ "<|image_pad|>",
227
+ "<|video_pad|>"
228
+ ],
229
+ "bos_token": null,
230
+ "chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0].role == 'system' %}\n {{- messages[0].content + '\\n\\n' }}\n {%- endif %}\n {{- \"# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0].role == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0].content + '<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}\n{%- for forward_message in messages %}\n {%- set index = (messages|length - 1) - loop.index0 %}\n {%- set message = messages[index] %}\n {%- set current_content = message.content if message.content is defined and message.content is not none else '' %}\n {%- set tool_start = '<tool_response>' %}\n {%- set tool_start_length = tool_start|length %}\n {%- set start_of_message = current_content[:tool_start_length] %}\n {%- set tool_end = '</tool_response>' %}\n {%- set tool_end_length = tool_end|length %}\n {%- set start_pos = (current_content|length) - tool_end_length %}\n {%- if start_pos < 0 %}\n {%- set start_pos = 0 %}\n {%- endif %}\n {%- set end_of_message = current_content[start_pos:] %}\n {%- if ns.multi_step_tool and message.role == \"user\" and not(start_of_message == tool_start and end_of_message == tool_end) %}\n {%- set ns.multi_step_tool = false %}\n {%- set ns.last_query_index = index %}\n {%- endif %}\n{%- endfor %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) 
%}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {%- set m_content = message.content if message.content is defined and message.content is not none else '' %}\n {%- set content = m_content %}\n {%- set reasoning_content = '' %}\n {%- if message.reasoning_content is defined and message.reasoning_content is not none %}\n {%- set reasoning_content = message.reasoning_content %}\n {%- else %}\n {%- if '</think>' in m_content %}\n {%- set content = (m_content.split('</think>')|last).lstrip('\\n') %}\n {%- set reasoning_content = (m_content.split('</think>')|first).rstrip('\\n') %}\n {%- set reasoning_content = (reasoning_content.split('<think>')|last).lstrip('\\n') %}\n {%- endif %}\n {%- endif %}\n {%- if loop.index0 > ns.last_query_index %}\n {%- if loop.last or (not loop.last and (not reasoning_content.strip() == '')) %}\n {{- '<|im_start|>' + message.role + '\\n<think>\\n' + reasoning_content.strip('\\n') + '\\n</think>\\n\\n' + content.lstrip('\\n') }}\n {%- else %}\n {{- '<|im_start|>' + message.role + '\\n' + content }}\n {%- endif %}\n {%- else %}\n {{- '<|im_start|>' + message.role + '\\n' + content }}\n {%- endif %}\n {%- if message.tool_calls %}\n {%- for tool_call in message.tool_calls %}\n {%- if (loop.first and content) or (not loop.first) %}\n {{- '\\n' }}\n {%- endif %}\n {%- if tool_call.function %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {%- if tool_call.arguments is string %}\n {{- tool_call.arguments }}\n {%- else %}\n {{- tool_call.arguments | tojson }}\n {%- endif %}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {%- endif %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if loop.first or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {{- 
message.content }}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n {%- if enable_thinking is defined and enable_thinking is false %}\n {{- '<think>\\n\\n</think>\\n\\n' }}\n {%- endif %}\n{%- endif %}",
231
+ "clean_up_tokenization_spaces": false,
232
+ "eos_token": "<|im_end|>",
233
+ "errors": "replace",
234
+ "extra_special_tokens": {},
235
+ "model_max_length": 40960,
236
+ "pad_token": "<|vision_pad|>",
237
+ "padding_side": "right",
238
+ "split_special_tokens": false,
239
+ "tokenizer_class": "Qwen2Tokenizer",
240
+ "unk_token": null
241
+ }
checkpoint-13230/trainer_state.json ADDED
@@ -0,0 +1,2014 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": 12700,
3
+ "best_metric": 0.6587069034576416,
4
+ "best_model_checkpoint": "./Qwen-3-0.6B-Reasoning-Vi-Medical-LoRA/checkpoint-12700",
5
+ "epoch": 29.998867497168742,
6
+ "eval_steps": 100,
7
+ "global_step": 13230,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.22650056625141562,
14
+ "grad_norm": 0.32067689299583435,
15
+ "learning_rate": 0.0001936551724137931,
16
+ "loss": 1.3117,
17
+ "step": 100
18
+ },
19
+ {
20
+ "epoch": 0.22650056625141562,
21
+ "eval_loss": 1.2771011590957642,
22
+ "eval_runtime": 215.7306,
23
+ "eval_samples_per_second": 7.273,
24
+ "eval_steps_per_second": 0.913,
25
+ "step": 100
26
+ },
27
+ {
28
+ "epoch": 0.45300113250283125,
29
+ "grad_norm": 0.368208110332489,
30
+ "learning_rate": 0.00018445977011494254,
31
+ "loss": 1.2627,
32
+ "step": 200
33
+ },
34
+ {
35
+ "epoch": 0.45300113250283125,
36
+ "eval_loss": 1.234209418296814,
37
+ "eval_runtime": 215.6413,
38
+ "eval_samples_per_second": 7.276,
39
+ "eval_steps_per_second": 0.914,
40
+ "step": 200
41
+ },
42
+ {
43
+ "epoch": 0.6795016987542469,
44
+ "grad_norm": 0.34856918454170227,
45
+ "learning_rate": 0.00017526436781609196,
46
+ "loss": 1.2199,
47
+ "step": 300
48
+ },
49
+ {
50
+ "epoch": 0.6795016987542469,
51
+ "eval_loss": 1.2032462358474731,
52
+ "eval_runtime": 215.6165,
53
+ "eval_samples_per_second": 7.277,
54
+ "eval_steps_per_second": 0.914,
55
+ "step": 300
56
+ },
57
+ {
58
+ "epoch": 0.9060022650056625,
59
+ "grad_norm": 0.33870309591293335,
60
+ "learning_rate": 0.0001660689655172414,
61
+ "loss": 1.1941,
62
+ "step": 400
63
+ },
64
+ {
65
+ "epoch": 0.9060022650056625,
66
+ "eval_loss": 1.1780856847763062,
67
+ "eval_runtime": 215.6336,
68
+ "eval_samples_per_second": 7.276,
69
+ "eval_steps_per_second": 0.914,
70
+ "step": 400
71
+ },
72
+ {
73
+ "epoch": 1.1336353340883352,
74
+ "grad_norm": 0.38066866993904114,
75
+ "learning_rate": 0.00015687356321839082,
76
+ "loss": 1.1664,
77
+ "step": 500
78
+ },
79
+ {
80
+ "epoch": 1.1336353340883352,
81
+ "eval_loss": 1.1569600105285645,
82
+ "eval_runtime": 215.6796,
83
+ "eval_samples_per_second": 7.275,
84
+ "eval_steps_per_second": 0.913,
85
+ "step": 500
86
+ },
87
+ {
88
+ "epoch": 1.3601359003397508,
89
+ "grad_norm": 0.3768746852874756,
90
+ "learning_rate": 0.00014767816091954024,
91
+ "loss": 1.1276,
92
+ "step": 600
93
+ },
94
+ {
95
+ "epoch": 1.3601359003397508,
96
+ "eval_loss": 1.1395982503890991,
97
+ "eval_runtime": 215.7974,
98
+ "eval_samples_per_second": 7.271,
99
+ "eval_steps_per_second": 0.913,
100
+ "step": 600
101
+ },
102
+ {
103
+ "epoch": 1.5866364665911665,
104
+ "grad_norm": 0.3873467743396759,
105
+ "learning_rate": 0.00013848275862068967,
106
+ "loss": 1.1059,
107
+ "step": 700
108
+ },
109
+ {
110
+ "epoch": 1.5866364665911665,
111
+ "eval_loss": 1.1225543022155762,
112
+ "eval_runtime": 215.8352,
113
+ "eval_samples_per_second": 7.269,
114
+ "eval_steps_per_second": 0.913,
115
+ "step": 700
116
+ },
117
+ {
118
+ "epoch": 1.8131370328425822,
119
+ "grad_norm": 0.399214506149292,
120
+ "learning_rate": 0.00012928735632183907,
121
+ "loss": 1.087,
122
+ "step": 800
123
+ },
124
+ {
125
+ "epoch": 1.8131370328425822,
126
+ "eval_loss": 1.1066261529922485,
127
+ "eval_runtime": 215.8039,
128
+ "eval_samples_per_second": 7.27,
129
+ "eval_steps_per_second": 0.913,
130
+ "step": 800
131
+ },
132
+ {
133
+ "epoch": 2.0407701019252547,
134
+ "grad_norm": 0.3834548890590668,
135
+ "learning_rate": 0.00012009195402298852,
136
+ "loss": 1.0775,
137
+ "step": 900
138
+ },
139
+ {
140
+ "epoch": 2.0407701019252547,
141
+ "eval_loss": 1.0939360857009888,
142
+ "eval_runtime": 215.8709,
143
+ "eval_samples_per_second": 7.268,
144
+ "eval_steps_per_second": 0.913,
145
+ "step": 900
146
+ },
147
+ {
148
+ "epoch": 2.2672706681766703,
149
+ "grad_norm": 0.41335031390190125,
150
+ "learning_rate": 0.00011089655172413794,
151
+ "loss": 1.0294,
152
+ "step": 1000
153
+ },
154
+ {
155
+ "epoch": 2.2672706681766703,
156
+ "eval_loss": 1.081494688987732,
157
+ "eval_runtime": 215.8265,
158
+ "eval_samples_per_second": 7.27,
159
+ "eval_steps_per_second": 0.913,
160
+ "step": 1000
161
+ },
162
+ {
163
+ "epoch": 2.493771234428086,
164
+ "grad_norm": 0.43652597069740295,
165
+ "learning_rate": 0.00010170114942528736,
166
+ "loss": 1.0077,
167
+ "step": 1100
168
+ },
169
+ {
170
+ "epoch": 2.493771234428086,
171
+ "eval_loss": 1.066120982170105,
172
+ "eval_runtime": 215.7444,
173
+ "eval_samples_per_second": 7.272,
174
+ "eval_steps_per_second": 0.913,
175
+ "step": 1100
176
+ },
177
+ {
178
+ "epoch": 2.7202718006795017,
179
+ "grad_norm": 0.4415673613548279,
180
+ "learning_rate": 9.250574712643678e-05,
181
+ "loss": 0.9984,
182
+ "step": 1200
183
+ },
184
+ {
185
+ "epoch": 2.7202718006795017,
186
+ "eval_loss": 1.0539467334747314,
187
+ "eval_runtime": 215.873,
188
+ "eval_samples_per_second": 7.268,
189
+ "eval_steps_per_second": 0.913,
190
+ "step": 1200
191
+ },
192
+ {
193
+ "epoch": 2.9467723669309174,
194
+ "grad_norm": 0.44037237763404846,
195
+ "learning_rate": 8.33103448275862e-05,
196
+ "loss": 0.9837,
197
+ "step": 1300
198
+ },
199
+ {
200
+ "epoch": 2.9467723669309174,
201
+ "eval_loss": 1.044096827507019,
202
+ "eval_runtime": 215.7373,
203
+ "eval_samples_per_second": 7.273,
204
+ "eval_steps_per_second": 0.913,
205
+ "step": 1300
206
+ },
207
+ {
208
+ "epoch": 3.17440543601359,
209
+ "grad_norm": 0.5079408288002014,
210
+ "learning_rate": 7.411494252873563e-05,
211
+ "loss": 0.9642,
212
+ "step": 1400
213
+ },
214
+ {
215
+ "epoch": 3.17440543601359,
216
+ "eval_loss": 1.0374841690063477,
217
+ "eval_runtime": 215.6881,
218
+ "eval_samples_per_second": 7.274,
219
+ "eval_steps_per_second": 0.913,
220
+ "step": 1400
221
+ },
222
+ {
223
+ "epoch": 3.4009060022650055,
224
+ "grad_norm": 0.4961177408695221,
225
+ "learning_rate": 6.491954022988506e-05,
226
+ "loss": 0.9381,
227
+ "step": 1500
228
+ },
229
+ {
230
+ "epoch": 3.4009060022650055,
231
+ "eval_loss": 1.0282294750213623,
232
+ "eval_runtime": 215.6305,
233
+ "eval_samples_per_second": 7.276,
234
+ "eval_steps_per_second": 0.914,
235
+ "step": 1500
236
+ },
237
+ {
238
+ "epoch": 3.627406568516421,
239
+ "grad_norm": 0.5088583827018738,
240
+ "learning_rate": 5.5724137931034484e-05,
241
+ "loss": 0.9261,
242
+ "step": 1600
243
+ },
244
+ {
245
+ "epoch": 3.627406568516421,
246
+ "eval_loss": 1.0186975002288818,
247
+ "eval_runtime": 215.8306,
248
+ "eval_samples_per_second": 7.27,
249
+ "eval_steps_per_second": 0.913,
250
+ "step": 1600
251
+ },
252
+ {
253
+ "epoch": 3.853907134767837,
254
+ "grad_norm": 0.5300412178039551,
255
+ "learning_rate": 4.652873563218391e-05,
256
+ "loss": 0.9115,
257
+ "step": 1700
258
+ },
259
+ {
260
+ "epoch": 3.853907134767837,
261
+ "eval_loss": 1.011821985244751,
262
+ "eval_runtime": 215.7992,
263
+ "eval_samples_per_second": 7.271,
264
+ "eval_steps_per_second": 0.913,
265
+ "step": 1700
266
+ },
267
+ {
268
+ "epoch": 4.081540203850509,
269
+ "grad_norm": 0.5446588397026062,
270
+ "learning_rate": 3.733333333333334e-05,
271
+ "loss": 0.9033,
272
+ "step": 1800
273
+ },
274
+ {
275
+ "epoch": 4.081540203850509,
276
+ "eval_loss": 1.0095539093017578,
277
+ "eval_runtime": 215.7598,
278
+ "eval_samples_per_second": 7.272,
279
+ "eval_steps_per_second": 0.913,
280
+ "step": 1800
281
+ },
282
+ {
283
+ "epoch": 4.308040770101925,
284
+ "grad_norm": 0.5339412093162537,
285
+ "learning_rate": 2.813793103448276e-05,
286
+ "loss": 0.8647,
287
+ "step": 1900
288
+ },
289
+ {
290
+ "epoch": 4.308040770101925,
291
+ "eval_loss": 1.002418875694275,
292
+ "eval_runtime": 215.8479,
293
+ "eval_samples_per_second": 7.269,
294
+ "eval_steps_per_second": 0.913,
295
+ "step": 1900
296
+ },
297
+ {
298
+ "epoch": 4.534541336353341,
299
+ "grad_norm": 0.5594246983528137,
300
+ "learning_rate": 1.8942528735632184e-05,
301
+ "loss": 0.8718,
302
+ "step": 2000
303
+ },
304
+ {
305
+ "epoch": 4.534541336353341,
306
+ "eval_loss": 0.9978017807006836,
307
+ "eval_runtime": 215.8323,
308
+ "eval_samples_per_second": 7.27,
309
+ "eval_steps_per_second": 0.913,
310
+ "step": 2000
311
+ },
312
+ {
313
+ "epoch": 4.761041902604757,
314
+ "grad_norm": 0.5511975288391113,
315
+ "learning_rate": 9.747126436781609e-06,
316
+ "loss": 0.8611,
317
+ "step": 2100
318
+ },
319
+ {
320
+ "epoch": 4.761041902604757,
321
+ "eval_loss": 0.9954851269721985,
322
+ "eval_runtime": 215.843,
323
+ "eval_samples_per_second": 7.269,
324
+ "eval_steps_per_second": 0.913,
325
+ "step": 2100
326
+ },
327
+ {
328
+ "epoch": 4.987542468856172,
329
+ "grad_norm": 0.5623785853385925,
330
+ "learning_rate": 5.517241379310344e-07,
331
+ "loss": 0.8602,
332
+ "step": 2200
333
+ },
334
+ {
335
+ "epoch": 4.987542468856172,
336
+ "eval_loss": 0.9939414262771606,
337
+ "eval_runtime": 215.789,
338
+ "eval_samples_per_second": 7.271,
339
+ "eval_steps_per_second": 0.913,
340
+ "step": 2200
341
+ },
342
+ {
343
+ "epoch": 5.215175537938845,
344
+ "grad_norm": 0.6162961721420288,
345
+ "learning_rate": 9.639269406392695e-05,
346
+ "loss": 0.8571,
347
+ "step": 2300
348
+ },
349
+ {
350
+ "epoch": 5.215175537938845,
351
+ "eval_loss": 1.001980185508728,
352
+ "eval_runtime": 215.4279,
353
+ "eval_samples_per_second": 7.283,
354
+ "eval_steps_per_second": 0.914,
355
+ "step": 2300
356
+ },
357
+ {
358
+ "epoch": 5.44167610419026,
359
+ "grad_norm": 0.6526823043823242,
360
+ "learning_rate": 9.182648401826485e-05,
361
+ "loss": 0.8618,
362
+ "step": 2400
363
+ },
364
+ {
365
+ "epoch": 5.44167610419026,
366
+ "eval_loss": 0.9889749884605408,
367
+ "eval_runtime": 215.4552,
368
+ "eval_samples_per_second": 7.282,
369
+ "eval_steps_per_second": 0.914,
370
+ "step": 2400
371
+ },
372
+ {
373
+ "epoch": 5.668176670441676,
374
+ "grad_norm": 0.6778553128242493,
375
+ "learning_rate": 8.726027397260274e-05,
376
+ "loss": 0.8467,
377
+ "step": 2500
378
+ },
379
+ {
380
+ "epoch": 5.668176670441676,
381
+ "eval_loss": 0.9737924337387085,
382
+ "eval_runtime": 215.4971,
383
+ "eval_samples_per_second": 7.281,
384
+ "eval_steps_per_second": 0.914,
385
+ "step": 2500
386
+ },
387
+ {
388
+ "epoch": 5.8946772366930915,
389
+ "grad_norm": 0.6477532386779785,
390
+ "learning_rate": 8.269406392694065e-05,
391
+ "loss": 0.8449,
392
+ "step": 2600
393
+ },
394
+ {
395
+ "epoch": 5.8946772366930915,
396
+ "eval_loss": 0.9627026319503784,
397
+ "eval_runtime": 215.5931,
398
+ "eval_samples_per_second": 7.278,
399
+ "eval_steps_per_second": 0.914,
400
+ "step": 2600
401
+ },
402
+ {
403
+ "epoch": 6.122310305775764,
404
+ "grad_norm": 0.7134155035018921,
405
+ "learning_rate": 7.812785388127854e-05,
406
+ "loss": 0.8181,
407
+ "step": 2700
408
+ },
409
+ {
410
+ "epoch": 6.122310305775764,
411
+ "eval_loss": 0.9615710973739624,
412
+ "eval_runtime": 215.6571,
413
+ "eval_samples_per_second": 7.275,
414
+ "eval_steps_per_second": 0.913,
415
+ "step": 2700
416
+ },
417
+ {
418
+ "epoch": 6.34881087202718,
419
+ "grad_norm": 0.7087521553039551,
420
+ "learning_rate": 7.356164383561645e-05,
421
+ "loss": 0.7748,
422
+ "step": 2800
423
+ },
424
+ {
425
+ "epoch": 6.34881087202718,
426
+ "eval_loss": 0.9508717656135559,
427
+ "eval_runtime": 215.7854,
428
+ "eval_samples_per_second": 7.271,
429
+ "eval_steps_per_second": 0.913,
430
+ "step": 2800
431
+ },
432
+ {
433
+ "epoch": 6.575311438278596,
434
+ "grad_norm": 0.718744158744812,
435
+ "learning_rate": 6.899543378995434e-05,
436
+ "loss": 0.7679,
437
+ "step": 2900
438
+ },
439
+ {
440
+ "epoch": 6.575311438278596,
441
+ "eval_loss": 0.9414442181587219,
442
+ "eval_runtime": 215.8161,
443
+ "eval_samples_per_second": 7.27,
444
+ "eval_steps_per_second": 0.913,
445
+ "step": 2900
446
+ },
447
+ {
448
+ "epoch": 6.801812004530011,
449
+ "grad_norm": 0.7241001725196838,
450
+ "learning_rate": 6.442922374429225e-05,
451
+ "loss": 0.7708,
452
+ "step": 3000
453
+ },
454
+ {
455
+ "epoch": 6.801812004530011,
456
+ "eval_loss": 0.9294878840446472,
457
+ "eval_runtime": 215.8924,
458
+ "eval_samples_per_second": 7.268,
459
+ "eval_steps_per_second": 0.912,
460
+ "step": 3000
461
+ },
462
+ {
463
+ "epoch": 7.029445073612684,
464
+ "grad_norm": 0.7078688740730286,
465
+ "learning_rate": 5.986301369863014e-05,
466
+ "loss": 0.7602,
467
+ "step": 3100
468
+ },
469
+ {
470
+ "epoch": 7.029445073612684,
471
+ "eval_loss": 0.9215793013572693,
472
+ "eval_runtime": 215.8631,
473
+ "eval_samples_per_second": 7.268,
474
+ "eval_steps_per_second": 0.913,
475
+ "step": 3100
476
+ },
477
+ {
478
+ "epoch": 7.2559456398641,
479
+ "grad_norm": 0.7763993740081787,
480
+ "learning_rate": 5.529680365296805e-05,
481
+ "loss": 0.7042,
482
+ "step": 3200
483
+ },
484
+ {
485
+ "epoch": 7.2559456398641,
486
+ "eval_loss": 0.9177075624465942,
487
+ "eval_runtime": 215.8296,
488
+ "eval_samples_per_second": 7.27,
489
+ "eval_steps_per_second": 0.913,
490
+ "step": 3200
491
+ },
492
+ {
493
+ "epoch": 7.482446206115515,
494
+ "grad_norm": 0.7907236814498901,
495
+ "learning_rate": 5.0730593607305946e-05,
496
+ "loss": 0.7041,
497
+ "step": 3300
498
+ },
499
+ {
500
+ "epoch": 7.482446206115515,
501
+ "eval_loss": 0.9082866907119751,
502
+ "eval_runtime": 215.8314,
503
+ "eval_samples_per_second": 7.27,
504
+ "eval_steps_per_second": 0.913,
505
+ "step": 3300
506
+ },
507
+ {
508
+ "epoch": 7.7089467723669305,
509
+ "grad_norm": 0.809502899646759,
510
+ "learning_rate": 4.616438356164384e-05,
511
+ "loss": 0.6968,
512
+ "step": 3400
513
+ },
514
+ {
515
+ "epoch": 7.7089467723669305,
516
+ "eval_loss": 0.9002099633216858,
517
+ "eval_runtime": 215.7011,
518
+ "eval_samples_per_second": 7.274,
519
+ "eval_steps_per_second": 0.913,
520
+ "step": 3400
521
+ },
522
+ {
523
+ "epoch": 7.935447338618347,
524
+ "grad_norm": 0.8147194981575012,
525
+ "learning_rate": 4.159817351598174e-05,
526
+ "loss": 0.689,
527
+ "step": 3500
528
+ },
529
+ {
530
+ "epoch": 7.935447338618347,
531
+ "eval_loss": 0.887977123260498,
532
+ "eval_runtime": 215.5757,
533
+ "eval_samples_per_second": 7.278,
534
+ "eval_steps_per_second": 0.914,
535
+ "step": 3500
536
+ },
537
+ {
538
+ "epoch": 8.163080407701019,
539
+ "grad_norm": 0.7601930499076843,
540
+ "learning_rate": 3.703196347031964e-05,
541
+ "loss": 0.6579,
542
+ "step": 3600
543
+ },
544
+ {
545
+ "epoch": 8.163080407701019,
546
+ "eval_loss": 0.8895569443702698,
547
+ "eval_runtime": 215.5399,
548
+ "eval_samples_per_second": 7.279,
549
+ "eval_steps_per_second": 0.914,
550
+ "step": 3600
551
+ },
552
+ {
553
+ "epoch": 8.389580973952436,
554
+ "grad_norm": 0.9018923044204712,
555
+ "learning_rate": 3.246575342465754e-05,
556
+ "loss": 0.6431,
557
+ "step": 3700
558
+ },
559
+ {
560
+ "epoch": 8.389580973952436,
561
+ "eval_loss": 0.886945366859436,
562
+ "eval_runtime": 215.5014,
563
+ "eval_samples_per_second": 7.281,
564
+ "eval_steps_per_second": 0.914,
565
+ "step": 3700
566
+ },
567
+ {
568
+ "epoch": 8.61608154020385,
569
+ "grad_norm": 0.8826325535774231,
570
+ "learning_rate": 2.7899543378995436e-05,
571
+ "loss": 0.6343,
572
+ "step": 3800
573
+ },
574
+ {
575
+ "epoch": 8.61608154020385,
576
+ "eval_loss": 0.879227340221405,
577
+ "eval_runtime": 215.5183,
578
+ "eval_samples_per_second": 7.28,
579
+ "eval_steps_per_second": 0.914,
580
+ "step": 3800
581
+ },
582
+ {
583
+ "epoch": 8.842582106455266,
584
+ "grad_norm": 0.8975309133529663,
585
+ "learning_rate": 2.3333333333333336e-05,
586
+ "loss": 0.638,
587
+ "step": 3900
588
+ },
589
+ {
590
+ "epoch": 8.842582106455266,
591
+ "eval_loss": 0.8713725805282593,
592
+ "eval_runtime": 215.5235,
593
+ "eval_samples_per_second": 7.28,
594
+ "eval_steps_per_second": 0.914,
595
+ "step": 3900
596
+ },
597
+ {
598
+ "epoch": 9.070215175537939,
599
+ "grad_norm": 0.8930607438087463,
600
+ "learning_rate": 1.8767123287671235e-05,
601
+ "loss": 0.6248,
602
+ "step": 4000
603
+ },
604
+ {
605
+ "epoch": 9.070215175537939,
606
+ "eval_loss": 0.8720493316650391,
607
+ "eval_runtime": 215.4472,
608
+ "eval_samples_per_second": 7.283,
609
+ "eval_steps_per_second": 0.914,
610
+ "step": 4000
611
+ },
612
+ {
613
+ "epoch": 9.296715741789354,
614
+ "grad_norm": 0.8194226622581482,
615
+ "learning_rate": 1.4200913242009135e-05,
616
+ "loss": 0.5963,
617
+ "step": 4100
618
+ },
619
+ {
620
+ "epoch": 9.296715741789354,
621
+ "eval_loss": 0.8683505654335022,
622
+ "eval_runtime": 215.4449,
623
+ "eval_samples_per_second": 7.283,
624
+ "eval_steps_per_second": 0.914,
625
+ "step": 4100
626
+ },
627
+ {
628
+ "epoch": 9.52321630804077,
629
+ "grad_norm": 0.9078500270843506,
630
+ "learning_rate": 9.634703196347032e-06,
631
+ "loss": 0.5933,
632
+ "step": 4200
633
+ },
634
+ {
635
+ "epoch": 9.52321630804077,
636
+ "eval_loss": 0.8663039803504944,
637
+ "eval_runtime": 215.4014,
638
+ "eval_samples_per_second": 7.284,
639
+ "eval_steps_per_second": 0.915,
640
+ "step": 4200
641
+ },
642
+ {
643
+ "epoch": 9.749716874292186,
644
+ "grad_norm": 0.8323531746864319,
645
+ "learning_rate": 5.068493150684932e-06,
646
+ "loss": 0.6018,
647
+ "step": 4300
648
+ },
649
+ {
650
+ "epoch": 9.749716874292186,
651
+ "eval_loss": 0.864680826663971,
652
+ "eval_runtime": 215.5321,
653
+ "eval_samples_per_second": 7.28,
654
+ "eval_steps_per_second": 0.914,
655
+ "step": 4300
656
+ },
657
+ {
658
+ "epoch": 9.976217440543602,
659
+ "grad_norm": 0.8527385592460632,
660
+ "learning_rate": 5.022831050228311e-07,
661
+ "loss": 0.5912,
662
+ "step": 4400
663
+ },
664
+ {
665
+ "epoch": 9.976217440543602,
666
+ "eval_loss": 0.8630263209342957,
667
+ "eval_runtime": 215.6024,
668
+ "eval_samples_per_second": 7.277,
669
+ "eval_steps_per_second": 0.914,
670
+ "step": 4400
671
+ },
672
+ {
673
+ "epoch": 10.203850509626275,
674
+ "grad_norm": 1.06992506980896,
675
+ "learning_rate": 9.831626848691696e-05,
676
+ "loss": 0.6137,
677
+ "step": 4500
678
+ },
679
+ {
680
+ "epoch": 10.203850509626275,
681
+ "eval_loss": 0.8823357820510864,
682
+ "eval_runtime": 215.9148,
683
+ "eval_samples_per_second": 7.267,
684
+ "eval_steps_per_second": 0.912,
685
+ "step": 4500
686
+ },
687
+ {
688
+ "epoch": 10.43035107587769,
689
+ "grad_norm": 1.1724179983139038,
690
+ "learning_rate": 9.604095563139933e-05,
691
+ "loss": 0.6269,
692
+ "step": 4600
693
+ },
694
+ {
695
+ "epoch": 10.43035107587769,
696
+ "eval_loss": 0.8721606135368347,
697
+ "eval_runtime": 215.8627,
698
+ "eval_samples_per_second": 7.269,
699
+ "eval_steps_per_second": 0.913,
700
+ "step": 4600
701
+ },
702
+ {
703
+ "epoch": 10.656851642129105,
704
+ "grad_norm": 0.9768912196159363,
705
+ "learning_rate": 9.37656427758817e-05,
706
+ "loss": 0.6269,
707
+ "step": 4700
708
+ },
709
+ {
710
+ "epoch": 10.656851642129105,
711
+ "eval_loss": 0.857449471950531,
712
+ "eval_runtime": 215.9592,
713
+ "eval_samples_per_second": 7.265,
714
+ "eval_steps_per_second": 0.912,
715
+ "step": 4700
716
+ },
717
+ {
718
+ "epoch": 10.88335220838052,
719
+ "grad_norm": 0.9589354991912842,
720
+ "learning_rate": 9.149032992036407e-05,
721
+ "loss": 0.6214,
722
+ "step": 4800
723
+ },
724
+ {
725
+ "epoch": 10.88335220838052,
726
+ "eval_loss": 0.8469829559326172,
727
+ "eval_runtime": 216.0164,
728
+ "eval_samples_per_second": 7.263,
729
+ "eval_steps_per_second": 0.912,
730
+ "step": 4800
731
+ },
732
+ {
733
+ "epoch": 11.110985277463193,
734
+ "grad_norm": 1.1246964931488037,
735
+ "learning_rate": 8.921501706484642e-05,
736
+ "loss": 0.6,
737
+ "step": 4900
738
+ },
739
+ {
740
+ "epoch": 11.110985277463193,
741
+ "eval_loss": 0.83788001537323,
742
+ "eval_runtime": 215.9754,
743
+ "eval_samples_per_second": 7.265,
744
+ "eval_steps_per_second": 0.912,
745
+ "step": 4900
746
+ },
747
+ {
748
+ "epoch": 11.337485843714608,
749
+ "grad_norm": 1.138035535812378,
750
+ "learning_rate": 8.693970420932879e-05,
751
+ "loss": 0.5559,
752
+ "step": 5000
753
+ },
754
+ {
755
+ "epoch": 11.337485843714608,
756
+ "eval_loss": 0.8344744443893433,
757
+ "eval_runtime": 216.059,
758
+ "eval_samples_per_second": 7.262,
759
+ "eval_steps_per_second": 0.912,
760
+ "step": 5000
761
+ },
762
+ {
763
+ "epoch": 11.563986409966025,
764
+ "grad_norm": 1.1976112127304077,
765
+ "learning_rate": 4.6044039483675025e-05,
766
+ "loss": 0.5355,
767
+ "step": 5100
768
+ },
769
+ {
770
+ "epoch": 11.563986409966025,
771
+ "eval_loss": 0.8142299652099609,
772
+ "eval_runtime": 216.0109,
773
+ "eval_samples_per_second": 7.264,
774
+ "eval_steps_per_second": 0.912,
775
+ "step": 5100
776
+ },
777
+ {
778
+ "epoch": 11.79048697621744,
779
+ "grad_norm": 1.095481514930725,
780
+ "learning_rate": 4.3006833712984054e-05,
781
+ "loss": 0.5353,
782
+ "step": 5200
783
+ },
784
+ {
785
+ "epoch": 11.79048697621744,
786
+ "eval_loss": 0.8013305068016052,
787
+ "eval_runtime": 215.9277,
788
+ "eval_samples_per_second": 7.266,
789
+ "eval_steps_per_second": 0.912,
790
+ "step": 5200
791
+ },
792
+ {
793
+ "epoch": 12.018120045300114,
794
+ "grad_norm": 1.0251810550689697,
795
+ "learning_rate": 3.996962794229309e-05,
796
+ "loss": 0.5364,
797
+ "step": 5300
798
+ },
799
+ {
800
+ "epoch": 12.018120045300114,
801
+ "eval_loss": 0.7993968725204468,
802
+ "eval_runtime": 216.1353,
803
+ "eval_samples_per_second": 7.259,
804
+ "eval_steps_per_second": 0.911,
805
+ "step": 5300
806
+ },
807
+ {
808
+ "epoch": 12.244620611551529,
809
+ "grad_norm": 0.9645494222640991,
810
+ "learning_rate": 3.6932422171602125e-05,
811
+ "loss": 0.474,
812
+ "step": 5400
813
+ },
814
+ {
815
+ "epoch": 12.244620611551529,
816
+ "eval_loss": 0.7945672273635864,
817
+ "eval_runtime": 216.0266,
818
+ "eval_samples_per_second": 7.263,
819
+ "eval_steps_per_second": 0.912,
820
+ "step": 5400
821
+ },
822
+ {
823
+ "epoch": 12.471121177802944,
824
+ "grad_norm": 0.9431504011154175,
825
+ "learning_rate": 3.389521640091116e-05,
826
+ "loss": 0.48,
827
+ "step": 5500
828
+ },
829
+ {
830
+ "epoch": 12.471121177802944,
831
+ "eval_loss": 0.7862181067466736,
832
+ "eval_runtime": 216.036,
833
+ "eval_samples_per_second": 7.263,
834
+ "eval_steps_per_second": 0.912,
835
+ "step": 5500
836
+ },
837
+ {
838
+ "epoch": 12.69762174405436,
839
+ "grad_norm": 0.9955912232398987,
840
+ "learning_rate": 3.0858010630220196e-05,
841
+ "loss": 0.4733,
842
+ "step": 5600
843
+ },
844
+ {
845
+ "epoch": 12.69762174405436,
846
+ "eval_loss": 0.77918940782547,
847
+ "eval_runtime": 215.9528,
848
+ "eval_samples_per_second": 7.265,
849
+ "eval_steps_per_second": 0.912,
850
+ "step": 5600
851
+ },
852
+ {
853
+ "epoch": 12.924122310305776,
854
+ "grad_norm": 1.0505925416946411,
855
+ "learning_rate": 2.782080485952924e-05,
856
+ "loss": 0.4772,
857
+ "step": 5700
858
+ },
859
+ {
860
+ "epoch": 12.924122310305776,
861
+ "eval_loss": 0.7714400291442871,
862
+ "eval_runtime": 215.9696,
863
+ "eval_samples_per_second": 7.265,
864
+ "eval_steps_per_second": 0.912,
865
+ "step": 5700
866
+ },
867
+ {
868
+ "epoch": 13.15175537938845,
869
+ "grad_norm": 0.9624122381210327,
870
+ "learning_rate": 2.478359908883827e-05,
871
+ "loss": 0.4534,
872
+ "step": 5800
873
+ },
874
+ {
875
+ "epoch": 13.15175537938845,
876
+ "eval_loss": 0.7735848426818848,
877
+ "eval_runtime": 215.9988,
878
+ "eval_samples_per_second": 7.264,
879
+ "eval_steps_per_second": 0.912,
880
+ "step": 5800
881
+ },
882
+ {
883
+ "epoch": 13.378255945639864,
884
+ "grad_norm": 0.9842013716697693,
885
+ "learning_rate": 2.1746393318147306e-05,
886
+ "loss": 0.4437,
887
+ "step": 5900
888
+ },
889
+ {
890
+ "epoch": 13.378255945639864,
891
+ "eval_loss": 0.7693312764167786,
892
+ "eval_runtime": 216.0532,
893
+ "eval_samples_per_second": 7.262,
894
+ "eval_steps_per_second": 0.912,
895
+ "step": 5900
896
+ },
897
+ {
898
+ "epoch": 13.60475651189128,
899
+ "grad_norm": 0.9403685927391052,
900
+ "learning_rate": 1.8709187547456342e-05,
901
+ "loss": 0.4362,
902
+ "step": 6000
903
+ },
904
+ {
905
+ "epoch": 13.60475651189128,
906
+ "eval_loss": 0.762976348400116,
907
+ "eval_runtime": 216.071,
908
+ "eval_samples_per_second": 7.262,
909
+ "eval_steps_per_second": 0.912,
910
+ "step": 6000
911
+ },
912
+ {
913
+ "epoch": 13.831257078142695,
914
+ "grad_norm": 0.9903466105461121,
915
+ "learning_rate": 1.5671981776765377e-05,
916
+ "loss": 0.4349,
917
+ "step": 6100
918
+ },
919
+ {
920
+ "epoch": 13.831257078142695,
921
+ "eval_loss": 0.76067715883255,
922
+ "eval_runtime": 215.8384,
923
+ "eval_samples_per_second": 7.269,
924
+ "eval_steps_per_second": 0.913,
925
+ "step": 6100
926
+ },
927
+ {
928
+ "epoch": 14.058890147225368,
929
+ "grad_norm": 0.9362127780914307,
930
+ "learning_rate": 1.2634776006074411e-05,
931
+ "loss": 0.4385,
932
+ "step": 6200
933
+ },
934
+ {
935
+ "epoch": 14.058890147225368,
936
+ "eval_loss": 0.7605064511299133,
937
+ "eval_runtime": 215.995,
938
+ "eval_samples_per_second": 7.264,
939
+ "eval_steps_per_second": 0.912,
940
+ "step": 6200
941
+ },
942
+ {
943
+ "epoch": 14.285390713476783,
944
+ "grad_norm": 1.0396395921707153,
945
+ "learning_rate": 3.2087853220838355e-05,
946
+ "loss": 0.4171,
947
+ "step": 6300
948
+ },
949
+ {
950
+ "epoch": 14.285390713476783,
951
+ "eval_loss": 0.762564480304718,
952
+ "eval_runtime": 217.6902,
953
+ "eval_samples_per_second": 7.207,
954
+ "eval_steps_per_second": 0.905,
955
+ "step": 6300
956
+ },
957
+ {
958
+ "epoch": 14.5118912797282,
959
+ "grad_norm": 0.9981446266174316,
960
+ "learning_rate": 2.9409401366010447e-05,
961
+ "loss": 0.4262,
962
+ "step": 6400
963
+ },
964
+ {
965
+ "epoch": 14.5118912797282,
966
+ "eval_loss": 0.75795578956604,
967
+ "eval_runtime": 217.8007,
968
+ "eval_samples_per_second": 7.204,
969
+ "eval_steps_per_second": 0.904,
970
+ "step": 6400
971
+ },
972
+ {
973
+ "epoch": 14.738391845979615,
974
+ "grad_norm": 1.2502585649490356,
975
+ "learning_rate": 2.673094951118254e-05,
976
+ "loss": 0.423,
977
+ "step": 6500
978
+ },
979
+ {
980
+ "epoch": 14.738391845979615,
981
+ "eval_loss": 0.7491943836212158,
982
+ "eval_runtime": 217.9332,
983
+ "eval_samples_per_second": 7.199,
984
+ "eval_steps_per_second": 0.904,
985
+ "step": 6500
986
+ },
987
+ {
988
+ "epoch": 14.96489241223103,
989
+ "grad_norm": 1.0835864543914795,
990
+ "learning_rate": 2.405249765635463e-05,
991
+ "loss": 0.4219,
992
+ "step": 6600
993
+ },
994
+ {
995
+ "epoch": 14.96489241223103,
996
+ "eval_loss": 0.744881272315979,
997
+ "eval_runtime": 217.9821,
998
+ "eval_samples_per_second": 7.198,
999
+ "eval_steps_per_second": 0.904,
1000
+ "step": 6600
1001
+ },
1002
+ {
1003
+ "epoch": 15.192525481313703,
1004
+ "grad_norm": 1.0534075498580933,
1005
+ "learning_rate": 2.1374045801526718e-05,
1006
+ "loss": 0.4077,
1007
+ "step": 6700
1008
+ },
1009
+ {
1010
+ "epoch": 15.192525481313703,
1011
+ "eval_loss": 0.747353732585907,
1012
+ "eval_runtime": 217.9166,
1013
+ "eval_samples_per_second": 7.2,
1014
+ "eval_steps_per_second": 0.904,
1015
+ "step": 6700
1016
+ },
1017
+ {
1018
+ "epoch": 15.419026047565119,
1019
+ "grad_norm": 1.0463495254516602,
1020
+ "learning_rate": 1.869559394669881e-05,
1021
+ "loss": 0.3925,
1022
+ "step": 6800
1023
+ },
1024
+ {
1025
+ "epoch": 15.419026047565119,
1026
+ "eval_loss": 0.7430649399757385,
1027
+ "eval_runtime": 217.9436,
1028
+ "eval_samples_per_second": 7.199,
1029
+ "eval_steps_per_second": 0.904,
1030
+ "step": 6800
1031
+ },
1032
+ {
1033
+ "epoch": 15.645526613816534,
1034
+ "grad_norm": 0.9589468240737915,
1035
+ "learning_rate": 1.60171420918709e-05,
1036
+ "loss": 0.3932,
1037
+ "step": 6900
1038
+ },
1039
+ {
1040
+ "epoch": 15.645526613816534,
1041
+ "eval_loss": 0.7386749386787415,
1042
+ "eval_runtime": 217.8985,
1043
+ "eval_samples_per_second": 7.201,
1044
+ "eval_steps_per_second": 0.904,
1045
+ "step": 6900
1046
+ },
1047
+ {
1048
+ "epoch": 15.87202718006795,
1049
+ "grad_norm": 1.1412949562072754,
1050
+ "learning_rate": 1.333869023704299e-05,
1051
+ "loss": 0.3938,
1052
+ "step": 7000
1053
+ },
1054
+ {
1055
+ "epoch": 15.87202718006795,
1056
+ "eval_loss": 0.7356697916984558,
1057
+ "eval_runtime": 217.9654,
1058
+ "eval_samples_per_second": 7.198,
1059
+ "eval_steps_per_second": 0.904,
1060
+ "step": 7000
1061
+ },
1062
+ {
1063
+ "epoch": 16.099660249150624,
1064
+ "grad_norm": 0.9367544651031494,
1065
+ "learning_rate": 1.066023838221508e-05,
1066
+ "loss": 0.3897,
1067
+ "step": 7100
1068
+ },
1069
+ {
1070
+ "epoch": 16.099660249150624,
1071
+ "eval_loss": 0.7375982999801636,
1072
+ "eval_runtime": 217.89,
1073
+ "eval_samples_per_second": 7.201,
1074
+ "eval_steps_per_second": 0.904,
1075
+ "step": 7100
1076
+ },
1077
+ {
1078
+ "epoch": 16.326160815402037,
1079
+ "grad_norm": 1.0161197185516357,
1080
+ "learning_rate": 7.98178652738717e-06,
1081
+ "loss": 0.3713,
1082
+ "step": 7200
1083
+ },
1084
+ {
1085
+ "epoch": 16.326160815402037,
1086
+ "eval_loss": 0.7364110350608826,
1087
+ "eval_runtime": 217.9527,
1088
+ "eval_samples_per_second": 7.199,
1089
+ "eval_steps_per_second": 0.904,
1090
+ "step": 7200
1091
+ },
1092
+ {
1093
+ "epoch": 16.552661381653454,
1094
+ "grad_norm": 0.9260092377662659,
1095
+ "learning_rate": 5.303334672559261e-06,
1096
+ "loss": 0.3804,
1097
+ "step": 7300
1098
+ },
1099
+ {
1100
+ "epoch": 16.552661381653454,
1101
+ "eval_loss": 0.733728289604187,
1102
+ "eval_runtime": 217.9029,
1103
+ "eval_samples_per_second": 7.2,
1104
+ "eval_steps_per_second": 0.904,
1105
+ "step": 7300
1106
+ },
1107
+ {
1108
+ "epoch": 16.77916194790487,
1109
+ "grad_norm": 1.008174180984497,
1110
+ "learning_rate": 2.6248828177313514e-06,
1111
+ "loss": 0.3704,
1112
+ "step": 7400
1113
+ },
1114
+ {
1115
+ "epoch": 16.77916194790487,
1116
+ "eval_loss": 0.7327025532722473,
1117
+ "eval_runtime": 217.9462,
1118
+ "eval_samples_per_second": 7.199,
1119
+ "eval_steps_per_second": 0.904,
1120
+ "step": 7400
1121
+ },
1122
+ {
1123
+ "epoch": 17.006795016987542,
1124
+ "grad_norm": 0.88117915391922,
1125
+ "learning_rate": 3.0056882821387945e-05,
1126
+ "loss": 0.3357,
1127
+ "step": 7500
1128
+ },
1129
+ {
1130
+ "epoch": 17.006795016987542,
1131
+ "eval_loss": 0.7340475916862488,
1132
+ "eval_runtime": 215.1992,
1133
+ "eval_samples_per_second": 7.291,
1134
+ "eval_steps_per_second": 0.915,
1135
+ "step": 7500
1136
+ },
1137
+ {
1138
+ "epoch": 17.23329558323896,
1139
+ "grad_norm": 1.0653574466705322,
1140
+ "learning_rate": 2.7781569965870308e-05,
1141
+ "loss": 0.371,
1142
+ "step": 7600
1143
+ },
1144
+ {
1145
+ "epoch": 17.23329558323896,
1146
+ "eval_loss": 0.7350865602493286,
1147
+ "eval_runtime": 215.7811,
1148
+ "eval_samples_per_second": 7.271,
1149
+ "eval_steps_per_second": 0.913,
1150
+ "step": 7600
1151
+ },
1152
+ {
1153
+ "epoch": 17.459796149490373,
1154
+ "grad_norm": 1.0504379272460938,
1155
+ "learning_rate": 2.5506257110352678e-05,
1156
+ "loss": 0.3784,
1157
+ "step": 7700
1158
+ },
1159
+ {
1160
+ "epoch": 17.459796149490373,
1161
+ "eval_loss": 0.7296854257583618,
1162
+ "eval_runtime": 215.8568,
1163
+ "eval_samples_per_second": 7.269,
1164
+ "eval_steps_per_second": 0.913,
1165
+ "step": 7700
1166
+ },
1167
+ {
1168
+ "epoch": 17.68629671574179,
1169
+ "grad_norm": 1.2694584131240845,
1170
+ "learning_rate": 2.323094425483504e-05,
1171
+ "loss": 0.3769,
1172
+ "step": 7800
1173
+ },
1174
+ {
1175
+ "epoch": 17.68629671574179,
1176
+ "eval_loss": 0.7242677211761475,
1177
+ "eval_runtime": 215.9243,
1178
+ "eval_samples_per_second": 7.266,
1179
+ "eval_steps_per_second": 0.912,
1180
+ "step": 7800
1181
+ },
1182
+ {
1183
+ "epoch": 17.912797281993203,
1184
+ "grad_norm": 1.146080493927002,
1185
+ "learning_rate": 2.0955631399317408e-05,
1186
+ "loss": 0.3808,
1187
+ "step": 7900
1188
+ },
1189
+ {
1190
+ "epoch": 17.912797281993203,
1191
+ "eval_loss": 0.7212691307067871,
1192
+ "eval_runtime": 216.0431,
1193
+ "eval_samples_per_second": 7.262,
1194
+ "eval_steps_per_second": 0.912,
1195
+ "step": 7900
1196
+ },
1197
+ {
1198
+ "epoch": 18.140430351075878,
1199
+ "grad_norm": 1.0066969394683838,
1200
+ "learning_rate": 1.8680318543799774e-05,
1201
+ "loss": 0.3625,
1202
+ "step": 8000
1203
+ },
1204
+ {
1205
+ "epoch": 18.140430351075878,
1206
+ "eval_loss": 0.7235797643661499,
1207
+ "eval_runtime": 216.0282,
1208
+ "eval_samples_per_second": 7.263,
1209
+ "eval_steps_per_second": 0.912,
1210
+ "step": 8000
1211
+ },
1212
+ {
1213
+ "epoch": 18.366930917327295,
1214
+ "grad_norm": 1.040211796760559,
1215
+ "learning_rate": 1.6427758816837314e-05,
1216
+ "loss": 0.3506,
1217
+ "step": 8100
1218
+ },
1219
+ {
1220
+ "epoch": 18.366930917327295,
1221
+ "eval_loss": 0.7218917608261108,
1222
+ "eval_runtime": 216.0625,
1223
+ "eval_samples_per_second": 7.262,
1224
+ "eval_steps_per_second": 0.912,
1225
+ "step": 8100
1226
+ },
1227
+ {
1228
+ "epoch": 18.59343148357871,
1229
+ "grad_norm": 0.9982612133026123,
1230
+ "learning_rate": 1.4152445961319682e-05,
1231
+ "loss": 0.3531,
1232
+ "step": 8200
1233
+ },
1234
+ {
1235
+ "epoch": 18.59343148357871,
1236
+ "eval_loss": 0.7176269888877869,
1237
+ "eval_runtime": 215.9028,
1238
+ "eval_samples_per_second": 7.267,
1239
+ "eval_steps_per_second": 0.912,
1240
+ "step": 8200
1241
+ },
1242
+ {
1243
+ "epoch": 18.819932049830125,
1244
+ "grad_norm": 1.133723497390747,
1245
+ "learning_rate": 1.1877133105802047e-05,
1246
+ "loss": 0.3566,
1247
+ "step": 8300
1248
+ },
1249
+ {
1250
+ "epoch": 18.819932049830125,
1251
+ "eval_loss": 0.7127575874328613,
1252
+ "eval_runtime": 215.8405,
1253
+ "eval_samples_per_second": 7.269,
1254
+ "eval_steps_per_second": 0.913,
1255
+ "step": 8300
1256
+ },
1257
+ {
1258
+ "epoch": 19.047565118912797,
1259
+ "grad_norm": 0.9450774192810059,
1260
+ "learning_rate": 9.601820250284414e-06,
1261
+ "loss": 0.3553,
1262
+ "step": 8400
1263
+ },
1264
+ {
1265
+ "epoch": 19.047565118912797,
1266
+ "eval_loss": 0.7169352173805237,
1267
+ "eval_runtime": 215.9082,
1268
+ "eval_samples_per_second": 7.267,
1269
+ "eval_steps_per_second": 0.912,
1270
+ "step": 8400
1271
+ },
1272
+ {
1273
+ "epoch": 19.274065685164214,
1274
+ "grad_norm": 0.9257975816726685,
1275
+ "learning_rate": 7.326507394766781e-06,
1276
+ "loss": 0.3366,
1277
+ "step": 8500
1278
+ },
1279
+ {
1280
+ "epoch": 19.274065685164214,
1281
+ "eval_loss": 0.7128849625587463,
1282
+ "eval_runtime": 216.0119,
1283
+ "eval_samples_per_second": 7.263,
1284
+ "eval_steps_per_second": 0.912,
1285
+ "step": 8500
1286
+ },
1287
+ {
1288
+ "epoch": 19.500566251415627,
1289
+ "grad_norm": 1.0109655857086182,
1290
+ "learning_rate": 5.051194539249147e-06,
1291
+ "loss": 0.3335,
1292
+ "step": 8600
1293
+ },
1294
+ {
1295
+ "epoch": 19.500566251415627,
1296
+ "eval_loss": 0.7129219174385071,
1297
+ "eval_runtime": 215.981,
1298
+ "eval_samples_per_second": 7.265,
1299
+ "eval_steps_per_second": 0.912,
1300
+ "step": 8600
1301
+ },
1302
+ {
1303
+ "epoch": 19.727066817667044,
1304
+ "grad_norm": 1.021828293800354,
1305
+ "learning_rate": 2.7758816837315134e-06,
1306
+ "loss": 0.3398,
1307
+ "step": 8700
1308
+ },
1309
+ {
1310
+ "epoch": 19.727066817667044,
1311
+ "eval_loss": 0.7109224200248718,
1312
+ "eval_runtime": 215.6766,
1313
+ "eval_samples_per_second": 7.275,
1314
+ "eval_steps_per_second": 0.913,
1315
+ "step": 8700
1316
+ },
1317
+ {
1318
+ "epoch": 19.95356738391846,
1319
+ "grad_norm": 1.0146093368530273,
1320
+ "learning_rate": 5.005688282138794e-07,
1321
+ "loss": 0.3375,
1322
+ "step": 8800
1323
+ },
1324
+ {
1325
+ "epoch": 19.95356738391846,
1326
+ "eval_loss": 0.7103046774864197,
1327
+ "eval_runtime": 215.5394,
1328
+ "eval_samples_per_second": 7.279,
1329
+ "eval_steps_per_second": 0.914,
1330
+ "step": 8800
1331
+ },
1332
+ {
1333
+ "epoch": 20.181200453001132,
1334
+ "grad_norm": 1.1958893537521362,
1335
+ "learning_rate": 2.4621773954316228e-05,
1336
+ "loss": 0.3306,
1337
+ "step": 8900
1338
+ },
1339
+ {
1340
+ "epoch": 20.181200453001132,
1341
+ "eval_loss": 0.7169004082679749,
1342
+ "eval_runtime": 215.7176,
1343
+ "eval_samples_per_second": 7.273,
1344
+ "eval_steps_per_second": 0.913,
1345
+ "step": 8900
1346
+ },
1347
+ {
1348
+ "epoch": 20.40770101925255,
1349
+ "grad_norm": 1.2821824550628662,
1350
+ "learning_rate": 2.2644121427865127e-05,
1351
+ "loss": 0.3442,
1352
+ "step": 9000
1353
+ },
1354
+ {
1355
+ "epoch": 20.40770101925255,
1356
+ "eval_loss": 0.710050106048584,
1357
+ "eval_runtime": 215.7663,
1358
+ "eval_samples_per_second": 7.272,
1359
+ "eval_steps_per_second": 0.913,
1360
+ "step": 9000
1361
+ },
1362
+ {
1363
+ "epoch": 20.634201585503963,
1364
+ "grad_norm": 1.118553876876831,
1365
+ "learning_rate": 2.0666468901414023e-05,
1366
+ "loss": 0.3434,
1367
+ "step": 9100
1368
+ },
1369
+ {
1370
+ "epoch": 20.634201585503963,
1371
+ "eval_loss": 0.7061675190925598,
1372
+ "eval_runtime": 215.9231,
1373
+ "eval_samples_per_second": 7.266,
1374
+ "eval_steps_per_second": 0.912,
1375
+ "step": 9100
1376
+ },
1377
+ {
1378
+ "epoch": 20.86070215175538,
1379
+ "grad_norm": 1.131901502609253,
1380
+ "learning_rate": 1.868881637496292e-05,
1381
+ "loss": 0.3431,
1382
+ "step": 9200
1383
+ },
1384
+ {
1385
+ "epoch": 20.86070215175538,
1386
+ "eval_loss": 0.7025442123413086,
1387
+ "eval_runtime": 215.8012,
1388
+ "eval_samples_per_second": 7.271,
1389
+ "eval_steps_per_second": 0.913,
1390
+ "step": 9200
1391
+ },
1392
+ {
1393
+ "epoch": 21.08833522083805,
1394
+ "grad_norm": 1.063588261604309,
1395
+ "learning_rate": 1.6711163848511818e-05,
1396
+ "loss": 0.3361,
1397
+ "step": 9300
1398
+ },
1399
+ {
1400
+ "epoch": 21.08833522083805,
1401
+ "eval_loss": 0.7048903107643127,
1402
+ "eval_runtime": 215.6814,
1403
+ "eval_samples_per_second": 7.275,
1404
+ "eval_steps_per_second": 0.913,
1405
+ "step": 9300
1406
+ },
1407
+ {
1408
+ "epoch": 21.314835787089468,
1409
+ "grad_norm": 1.0730327367782593,
1410
+ "learning_rate": 1.4733511322060714e-05,
1411
+ "loss": 0.3201,
1412
+ "step": 9400
1413
+ },
1414
+ {
1415
+ "epoch": 21.314835787089468,
1416
+ "eval_loss": 0.7042800188064575,
1417
+ "eval_runtime": 215.7788,
1418
+ "eval_samples_per_second": 7.271,
1419
+ "eval_steps_per_second": 0.913,
1420
+ "step": 9400
1421
+ },
1422
+ {
1423
+ "epoch": 21.541336353340885,
1424
+ "grad_norm": 1.002642273902893,
1425
+ "learning_rate": 1.2755858795609612e-05,
1426
+ "loss": 0.3248,
1427
+ "step": 9500
1428
+ },
1429
+ {
1430
+ "epoch": 21.541336353340885,
1431
+ "eval_loss": 0.6989186406135559,
1432
+ "eval_runtime": 215.7873,
1433
+ "eval_samples_per_second": 7.271,
1434
+ "eval_steps_per_second": 0.913,
1435
+ "step": 9500
1436
+ },
1437
+ {
1438
+ "epoch": 21.7678369195923,
1439
+ "grad_norm": 1.184122920036316,
1440
+ "learning_rate": 1.077820626915851e-05,
1441
+ "loss": 0.3183,
1442
+ "step": 9600
1443
+ },
1444
+ {
1445
+ "epoch": 21.7678369195923,
1446
+ "eval_loss": 0.6965128183364868,
1447
+ "eval_runtime": 215.7509,
1448
+ "eval_samples_per_second": 7.272,
1449
+ "eval_steps_per_second": 0.913,
1450
+ "step": 9600
1451
+ },
1452
+ {
1453
+ "epoch": 21.994337485843715,
1454
+ "grad_norm": 1.00730562210083,
1455
+ "learning_rate": 8.800553742707407e-06,
1456
+ "loss": 0.3243,
1457
+ "step": 9700
1458
+ },
1459
+ {
1460
+ "epoch": 21.994337485843715,
1461
+ "eval_loss": 0.69374018907547,
1462
+ "eval_runtime": 215.6336,
1463
+ "eval_samples_per_second": 7.276,
1464
+ "eval_steps_per_second": 0.914,
1465
+ "step": 9700
1466
+ },
1467
+ {
1468
+ "epoch": 22.221970554926386,
1469
+ "grad_norm": 1.0043355226516724,
1470
+ "learning_rate": 6.822901216256304e-06,
1471
+ "loss": 0.3118,
1472
+ "step": 9800
1473
+ },
1474
+ {
1475
+ "epoch": 22.221970554926386,
1476
+ "eval_loss": 0.6979511380195618,
1477
+ "eval_runtime": 215.6267,
1478
+ "eval_samples_per_second": 7.276,
1479
+ "eval_steps_per_second": 0.914,
1480
+ "step": 9800
1481
+ },
1482
+ {
1483
+ "epoch": 22.448471121177803,
1484
+ "grad_norm": 1.0051404237747192,
1485
+ "learning_rate": 4.845248689805201e-06,
1486
+ "loss": 0.3068,
1487
+ "step": 9900
1488
+ },
1489
+ {
1490
+ "epoch": 22.448471121177803,
1491
+ "eval_loss": 0.6955877542495728,
1492
+ "eval_runtime": 215.5799,
1493
+ "eval_samples_per_second": 7.278,
1494
+ "eval_steps_per_second": 0.914,
1495
+ "step": 9900
1496
+ },
1497
+ {
1498
+ "epoch": 22.674971687429217,
1499
+ "grad_norm": 0.9242558479309082,
1500
+ "learning_rate": 2.8675961633540988e-06,
1501
+ "loss": 0.3094,
1502
+ "step": 10000
1503
+ },
1504
+ {
1505
+ "epoch": 22.674971687429217,
1506
+ "eval_loss": 0.6944177746772766,
1507
+ "eval_runtime": 215.6097,
1508
+ "eval_samples_per_second": 7.277,
1509
+ "eval_steps_per_second": 0.914,
1510
+ "step": 10000
1511
+ },
1512
+ {
1513
+ "epoch": 22.901472253680634,
1514
+ "grad_norm": 1.057633638381958,
1515
+ "learning_rate": 8.899436369029961e-07,
1516
+ "loss": 0.3092,
1517
+ "step": 10100
1518
+ },
1519
+ {
1520
+ "epoch": 22.901472253680634,
1521
+ "eval_loss": 0.6938230991363525,
1522
+ "eval_runtime": 215.7357,
1523
+ "eval_samples_per_second": 7.273,
1524
+ "eval_steps_per_second": 0.913,
1525
+ "step": 10100
1526
+ },
1527
+ {
1528
+ "epoch": 23.12910532276331,
1529
+ "grad_norm": 1.0774931907653809,
1530
+ "learning_rate": 1.5043201455206912e-05,
1531
+ "loss": 0.3076,
1532
+ "step": 10200
1533
+ },
1534
+ {
1535
+ "epoch": 23.12910532276331,
1536
+ "eval_loss": 0.6974820494651794,
1537
+ "eval_runtime": 214.8368,
1538
+ "eval_samples_per_second": 7.303,
1539
+ "eval_steps_per_second": 0.917,
1540
+ "step": 10200
1541
+ },
1542
+ {
1543
+ "epoch": 23.355605889014722,
1544
+ "grad_norm": 1.0320171117782593,
1545
+ "learning_rate": 1.3224192814915873e-05,
1546
+ "loss": 0.3084,
1547
+ "step": 10300
1548
+ },
1549
+ {
1550
+ "epoch": 23.355605889014722,
1551
+ "eval_loss": 0.6950554251670837,
1552
+ "eval_runtime": 214.9493,
1553
+ "eval_samples_per_second": 7.299,
1554
+ "eval_steps_per_second": 0.916,
1555
+ "step": 10300
1556
+ },
1557
+ {
1558
+ "epoch": 23.58210645526614,
1559
+ "grad_norm": 1.0282411575317383,
1560
+ "learning_rate": 1.142337426102774e-05,
1561
+ "loss": 0.3077,
1562
+ "step": 10400
1563
+ },
1564
+ {
1565
+ "epoch": 23.58210645526614,
1566
+ "eval_loss": 0.6931228637695312,
1567
+ "eval_runtime": 214.8764,
1568
+ "eval_samples_per_second": 7.302,
1569
+ "eval_steps_per_second": 0.917,
1570
+ "step": 10400
1571
+ },
1572
+ {
1573
+ "epoch": 23.808607021517552,
1574
+ "grad_norm": 1.0363404750823975,
1575
+ "learning_rate": 9.6043656207367e-06,
1576
+ "loss": 0.3088,
1577
+ "step": 10500
1578
+ },
1579
+ {
1580
+ "epoch": 23.808607021517552,
1581
+ "eval_loss": 0.6890929341316223,
1582
+ "eval_runtime": 214.9182,
1583
+ "eval_samples_per_second": 7.3,
1584
+ "eval_steps_per_second": 0.917,
1585
+ "step": 10500
1586
+ },
1587
+ {
1588
+ "epoch": 24.036240090600227,
1589
+ "grad_norm": 0.9383705854415894,
1590
+ "learning_rate": 7.785356980445657e-06,
1591
+ "loss": 0.3092,
1592
+ "step": 10600
1593
+ },
1594
+ {
1595
+ "epoch": 24.036240090600227,
1596
+ "eval_loss": 0.6934636831283569,
1597
+ "eval_runtime": 214.8602,
1598
+ "eval_samples_per_second": 7.302,
1599
+ "eval_steps_per_second": 0.917,
1600
+ "step": 10600
1601
+ },
1602
+ {
1603
+ "epoch": 24.26274065685164,
1604
+ "grad_norm": 0.905327320098877,
1605
+ "learning_rate": 5.966348340154616e-06,
1606
+ "loss": 0.2941,
1607
+ "step": 10700
1608
+ },
1609
+ {
1610
+ "epoch": 24.26274065685164,
1611
+ "eval_loss": 0.691253125667572,
1612
+ "eval_runtime": 215.1136,
1613
+ "eval_samples_per_second": 7.294,
1614
+ "eval_steps_per_second": 0.916,
1615
+ "step": 10700
1616
+ },
1617
+ {
1618
+ "epoch": 24.489241223103058,
1619
+ "grad_norm": 0.9248319864273071,
1620
+ "learning_rate": 4.147339699863575e-06,
1621
+ "loss": 0.2965,
1622
+ "step": 10800
1623
+ },
1624
+ {
1625
+ "epoch": 24.489241223103058,
1626
+ "eval_loss": 0.688778281211853,
1627
+ "eval_runtime": 215.1479,
1628
+ "eval_samples_per_second": 7.293,
1629
+ "eval_steps_per_second": 0.916,
1630
+ "step": 10800
1631
+ },
1632
+ {
1633
+ "epoch": 24.715741789354475,
1634
+ "grad_norm": 0.9588720202445984,
1635
+ "learning_rate": 2.3283310595725328e-06,
1636
+ "loss": 0.296,
1637
+ "step": 10900
1638
+ },
1639
+ {
1640
+ "epoch": 24.715741789354475,
1641
+ "eval_loss": 0.6882308125495911,
1642
+ "eval_runtime": 215.299,
1643
+ "eval_samples_per_second": 7.288,
1644
+ "eval_steps_per_second": 0.915,
1645
+ "step": 10900
1646
+ },
1647
+ {
1648
+ "epoch": 24.942242355605888,
1649
+ "grad_norm": 0.9268137216567993,
1650
+ "learning_rate": 5.093224192814915e-07,
1651
+ "loss": 0.2979,
1652
+ "step": 11000
1653
+ },
1654
+ {
1655
+ "epoch": 24.942242355605888,
1656
+ "eval_loss": 0.6880246996879578,
1657
+ "eval_runtime": 215.276,
1658
+ "eval_samples_per_second": 7.288,
1659
+ "eval_steps_per_second": 0.915,
1660
+ "step": 11000
1661
+ },
1662
+ {
1663
+ "epoch": 25.169875424688563,
1664
+ "grad_norm": 1.163781762123108,
1665
+ "learning_rate": 3.231818181818182e-05,
1666
+ "loss": 0.299,
1667
+ "step": 11100
1668
+ },
1669
+ {
1670
+ "epoch": 25.169875424688563,
1671
+ "eval_loss": 0.6973585486412048,
1672
+ "eval_runtime": 215.0775,
1673
+ "eval_samples_per_second": 7.295,
1674
+ "eval_steps_per_second": 0.916,
1675
+ "step": 11100
1676
+ },
1677
+ {
1678
+ "epoch": 25.396375990939976,
1679
+ "grad_norm": 1.3955178260803223,
1680
+ "learning_rate": 3.0803030303030304e-05,
1681
+ "loss": 0.3092,
1682
+ "step": 11200
1683
+ },
1684
+ {
1685
+ "epoch": 25.396375990939976,
1686
+ "eval_loss": 0.6922005414962769,
1687
+ "eval_runtime": 214.9326,
1688
+ "eval_samples_per_second": 7.3,
1689
+ "eval_steps_per_second": 0.917,
1690
+ "step": 11200
1691
+ },
1692
+ {
1693
+ "epoch": 25.622876557191393,
1694
+ "grad_norm": 1.630706548690796,
1695
+ "learning_rate": 2.9287878787878793e-05,
1696
+ "loss": 0.3106,
1697
+ "step": 11300
1698
+ },
1699
+ {
1700
+ "epoch": 25.622876557191393,
1701
+ "eval_loss": 0.6866771578788757,
1702
+ "eval_runtime": 215.0055,
1703
+ "eval_samples_per_second": 7.297,
1704
+ "eval_steps_per_second": 0.916,
1705
+ "step": 11300
1706
+ },
1707
+ {
1708
+ "epoch": 25.84937712344281,
1709
+ "grad_norm": 1.2561249732971191,
1710
+ "learning_rate": 2.7772727272727272e-05,
1711
+ "loss": 0.3113,
1712
+ "step": 11400
1713
+ },
1714
+ {
1715
+ "epoch": 25.84937712344281,
1716
+ "eval_loss": 0.6819568276405334,
1717
+ "eval_runtime": 214.834,
1718
+ "eval_samples_per_second": 7.303,
1719
+ "eval_steps_per_second": 0.917,
1720
+ "step": 11400
1721
+ },
1722
+ {
1723
+ "epoch": 26.07701019252548,
1724
+ "grad_norm": 1.5864068269729614,
1725
+ "learning_rate": 2.6257575757575757e-05,
1726
+ "loss": 0.3081,
1727
+ "step": 11500
1728
+ },
1729
+ {
1730
+ "epoch": 26.07701019252548,
1731
+ "eval_loss": 0.683868408203125,
1732
+ "eval_runtime": 214.7915,
1733
+ "eval_samples_per_second": 7.305,
1734
+ "eval_steps_per_second": 0.917,
1735
+ "step": 11500
1736
+ },
1737
+ {
1738
+ "epoch": 26.3035107587769,
1739
+ "grad_norm": 1.2162772417068481,
1740
+ "learning_rate": 2.475757575757576e-05,
1741
+ "loss": 0.2911,
1742
+ "step": 11600
1743
+ },
1744
+ {
1745
+ "epoch": 26.3035107587769,
1746
+ "eval_loss": 0.6809530258178711,
1747
+ "eval_runtime": 214.7759,
1748
+ "eval_samples_per_second": 7.305,
1749
+ "eval_steps_per_second": 0.917,
1750
+ "step": 11600
1751
+ },
1752
+ {
1753
+ "epoch": 26.530011325028312,
1754
+ "grad_norm": 1.2697758674621582,
1755
+ "learning_rate": 2.3242424242424243e-05,
1756
+ "loss": 0.289,
1757
+ "step": 11700
1758
+ },
1759
+ {
1760
+ "epoch": 26.530011325028312,
1761
+ "eval_loss": 0.6744586229324341,
1762
+ "eval_runtime": 214.8373,
1763
+ "eval_samples_per_second": 7.303,
1764
+ "eval_steps_per_second": 0.917,
1765
+ "step": 11700
1766
+ },
1767
+ {
1768
+ "epoch": 26.75651189127973,
1769
+ "grad_norm": 1.2038872241973877,
1770
+ "learning_rate": 2.172727272727273e-05,
1771
+ "loss": 0.293,
1772
+ "step": 11800
1773
+ },
1774
+ {
1775
+ "epoch": 26.75651189127973,
1776
+ "eval_loss": 0.6742814183235168,
1777
+ "eval_runtime": 214.8304,
1778
+ "eval_samples_per_second": 7.303,
1779
+ "eval_steps_per_second": 0.917,
1780
+ "step": 11800
1781
+ },
1782
+ {
1783
+ "epoch": 26.983012457531142,
1784
+ "grad_norm": 1.2992146015167236,
1785
+ "learning_rate": 2.0212121212121214e-05,
1786
+ "loss": 0.2921,
1787
+ "step": 11900
1788
+ },
1789
+ {
1790
+ "epoch": 26.983012457531142,
1791
+ "eval_loss": 0.6651853322982788,
1792
+ "eval_runtime": 214.8882,
1793
+ "eval_samples_per_second": 7.301,
1794
+ "eval_steps_per_second": 0.917,
1795
+ "step": 11900
1796
+ },
1797
+ {
1798
+ "epoch": 27.210645526613817,
1799
+ "grad_norm": 1.5248701572418213,
1800
+ "learning_rate": 1.86969696969697e-05,
1801
+ "loss": 0.2761,
1802
+ "step": 12000
1803
+ },
1804
+ {
1805
+ "epoch": 27.210645526613817,
1806
+ "eval_loss": 0.6715984344482422,
1807
+ "eval_runtime": 214.8134,
1808
+ "eval_samples_per_second": 7.304,
1809
+ "eval_steps_per_second": 0.917,
1810
+ "step": 12000
1811
+ },
1812
+ {
1813
+ "epoch": 27.43714609286523,
1814
+ "grad_norm": 1.3465806245803833,
1815
+ "learning_rate": 1.718181818181818e-05,
1816
+ "loss": 0.2778,
1817
+ "step": 12100
1818
+ },
1819
+ {
1820
+ "epoch": 27.43714609286523,
1821
+ "eval_loss": 0.6680512428283691,
1822
+ "eval_runtime": 214.8681,
1823
+ "eval_samples_per_second": 7.302,
1824
+ "eval_steps_per_second": 0.917,
1825
+ "step": 12100
1826
+ },
1827
+ {
1828
+ "epoch": 27.663646659116647,
1829
+ "grad_norm": 1.044279932975769,
1830
+ "learning_rate": 1.5666666666666667e-05,
1831
+ "loss": 0.2766,
1832
+ "step": 12200
1833
+ },
1834
+ {
1835
+ "epoch": 27.663646659116647,
1836
+ "eval_loss": 0.66595059633255,
1837
+ "eval_runtime": 214.8457,
1838
+ "eval_samples_per_second": 7.303,
1839
+ "eval_steps_per_second": 0.917,
1840
+ "step": 12200
1841
+ },
1842
+ {
1843
+ "epoch": 27.890147225368064,
1844
+ "grad_norm": 1.033585548400879,
1845
+ "learning_rate": 1.4151515151515152e-05,
1846
+ "loss": 0.2755,
1847
+ "step": 12300
1848
+ },
1849
+ {
1850
+ "epoch": 27.890147225368064,
1851
+ "eval_loss": 0.6637829542160034,
1852
+ "eval_runtime": 215.9457,
1853
+ "eval_samples_per_second": 7.266,
1854
+ "eval_steps_per_second": 0.912,
1855
+ "step": 12300
1856
+ },
1857
+ {
1858
+ "epoch": 28.117780294450736,
1859
+ "grad_norm": 1.020391821861267,
1860
+ "learning_rate": 1.2636363636363638e-05,
1861
+ "loss": 0.267,
1862
+ "step": 12400
1863
+ },
1864
+ {
1865
+ "epoch": 28.117780294450736,
1866
+ "eval_loss": 0.6649137139320374,
1867
+ "eval_runtime": 215.9471,
1868
+ "eval_samples_per_second": 7.266,
1869
+ "eval_steps_per_second": 0.912,
1870
+ "step": 12400
1871
+ },
1872
+ {
1873
+ "epoch": 28.344280860702153,
1874
+ "grad_norm": 0.961401641368866,
1875
+ "learning_rate": 1.1121212121212121e-05,
1876
+ "loss": 0.2603,
1877
+ "step": 12500
1878
+ },
1879
+ {
1880
+ "epoch": 28.344280860702153,
1881
+ "eval_loss": 0.6652226448059082,
1882
+ "eval_runtime": 215.9698,
1883
+ "eval_samples_per_second": 7.265,
1884
+ "eval_steps_per_second": 0.912,
1885
+ "step": 12500
1886
+ },
1887
+ {
1888
+ "epoch": 28.570781426953566,
1889
+ "grad_norm": 0.9490793347358704,
1890
+ "learning_rate": 9.606060606060607e-06,
1891
+ "loss": 0.2619,
1892
+ "step": 12600
1893
+ },
1894
+ {
1895
+ "epoch": 28.570781426953566,
1896
+ "eval_loss": 0.6631964445114136,
1897
+ "eval_runtime": 216.054,
1898
+ "eval_samples_per_second": 7.262,
1899
+ "eval_steps_per_second": 0.912,
1900
+ "step": 12600
1901
+ },
1902
+ {
1903
+ "epoch": 28.797281993204983,
1904
+ "grad_norm": 1.015561580657959,
1905
+ "learning_rate": 8.09090909090909e-06,
1906
+ "loss": 0.2648,
1907
+ "step": 12700
1908
+ },
1909
+ {
1910
+ "epoch": 28.797281993204983,
1911
+ "eval_loss": 0.6587069034576416,
1912
+ "eval_runtime": 216.0262,
1913
+ "eval_samples_per_second": 7.263,
1914
+ "eval_steps_per_second": 0.912,
1915
+ "step": 12700
1916
+ },
1917
+ {
1918
+ "epoch": 29.024915062287654,
1919
+ "grad_norm": 0.8936730623245239,
1920
+ "learning_rate": 6.575757575757575e-06,
1921
+ "loss": 0.2621,
1922
+ "step": 12800
1923
+ },
1924
+ {
1925
+ "epoch": 29.024915062287654,
1926
+ "eval_loss": 0.6623508930206299,
1927
+ "eval_runtime": 216.1177,
1928
+ "eval_samples_per_second": 7.26,
1929
+ "eval_steps_per_second": 0.912,
1930
+ "step": 12800
1931
+ },
1932
+ {
1933
+ "epoch": 29.25141562853907,
1934
+ "grad_norm": 0.8353849053382874,
1935
+ "learning_rate": 5.060606060606061e-06,
1936
+ "loss": 0.2497,
1937
+ "step": 12900
1938
+ },
1939
+ {
1940
+ "epoch": 29.25141562853907,
1941
+ "eval_loss": 0.6611577272415161,
1942
+ "eval_runtime": 216.148,
1943
+ "eval_samples_per_second": 7.259,
1944
+ "eval_steps_per_second": 0.911,
1945
+ "step": 12900
1946
+ },
1947
+ {
1948
+ "epoch": 29.477916194790488,
1949
+ "grad_norm": 0.9153599739074707,
1950
+ "learning_rate": 3.5454545454545454e-06,
1951
+ "loss": 0.2555,
1952
+ "step": 13000
1953
+ },
1954
+ {
1955
+ "epoch": 29.477916194790488,
1956
+ "eval_loss": 0.660586416721344,
1957
+ "eval_runtime": 216.1833,
1958
+ "eval_samples_per_second": 7.258,
1959
+ "eval_steps_per_second": 0.911,
1960
+ "step": 13000
1961
+ },
1962
+ {
1963
+ "epoch": 29.7044167610419,
1964
+ "grad_norm": 0.9058592319488525,
1965
+ "learning_rate": 2.0303030303030303e-06,
1966
+ "loss": 0.2502,
1967
+ "step": 13100
1968
+ },
1969
+ {
1970
+ "epoch": 29.7044167610419,
1971
+ "eval_loss": 0.6602495908737183,
1972
+ "eval_runtime": 216.1128,
1973
+ "eval_samples_per_second": 7.26,
1974
+ "eval_steps_per_second": 0.912,
1975
+ "step": 13100
1976
+ },
1977
+ {
1978
+ "epoch": 29.93091732729332,
1979
+ "grad_norm": 0.9629655480384827,
1980
+ "learning_rate": 5.151515151515152e-07,
1981
+ "loss": 0.2525,
1982
+ "step": 13200
1983
+ },
1984
+ {
1985
+ "epoch": 29.93091732729332,
1986
+ "eval_loss": 0.6595110297203064,
1987
+ "eval_runtime": 216.0902,
1988
+ "eval_samples_per_second": 7.261,
1989
+ "eval_steps_per_second": 0.912,
1990
+ "step": 13200
1991
+ }
1992
+ ],
1993
+ "logging_steps": 100,
1994
+ "max_steps": 13230,
1995
+ "num_input_tokens_seen": 0,
1996
+ "num_train_epochs": 30,
1997
+ "save_steps": 100,
1998
+ "stateful_callbacks": {
1999
+ "TrainerControl": {
2000
+ "args": {
2001
+ "should_epoch_stop": false,
2002
+ "should_evaluate": false,
2003
+ "should_log": false,
2004
+ "should_save": true,
2005
+ "should_training_stop": true
2006
+ },
2007
+ "attributes": {}
2008
+ }
2009
+ },
2010
+ "total_flos": 6.6487625048064e+17,
2011
+ "train_batch_size": 8,
2012
+ "trial_name": null,
2013
+ "trial_params": null
2014
+ }
checkpoint-13230/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ddd1454bdddf5d6a918ffb8a8fbe7331b220c89a157922bc08953307b6efaaae
3
+ size 6161
checkpoint-13230/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
16
+ ],
17
+ "eos_token": {
18
+ "content": "<|im_end|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "pad_token": {
25
+ "content": "<|vision_pad|>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ }
31
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aeb13307a71acd8fe81861d94ad54ab689df773318809eed3cbe794b4492dae4
3
+ size 11422654
tokenizer_config.json ADDED
@@ -0,0 +1,241 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ },
181
+ "151665": {
182
+ "content": "<tool_response>",
183
+ "lstrip": false,
184
+ "normalized": false,
185
+ "rstrip": false,
186
+ "single_word": false,
187
+ "special": false
188
+ },
189
+ "151666": {
190
+ "content": "</tool_response>",
191
+ "lstrip": false,
192
+ "normalized": false,
193
+ "rstrip": false,
194
+ "single_word": false,
195
+ "special": false
196
+ },
197
+ "151667": {
198
+ "content": "<think>",
199
+ "lstrip": false,
200
+ "normalized": false,
201
+ "rstrip": false,
202
+ "single_word": false,
203
+ "special": false
204
+ },
205
+ "151668": {
206
+ "content": "</think>",
207
+ "lstrip": false,
208
+ "normalized": false,
209
+ "rstrip": false,
210
+ "single_word": false,
211
+ "special": false
212
+ }
213
+ },
214
+ "additional_special_tokens": [
215
+ "<|im_start|>",
216
+ "<|im_end|>",
217
+ "<|object_ref_start|>",
218
+ "<|object_ref_end|>",
219
+ "<|box_start|>",
220
+ "<|box_end|>",
221
+ "<|quad_start|>",
222
+ "<|quad_end|>",
223
+ "<|vision_start|>",
224
+ "<|vision_end|>",
225
+ "<|vision_pad|>",
226
+ "<|image_pad|>",
227
+ "<|video_pad|>"
228
+ ],
229
+ "bos_token": null,
230
+ "chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0].role == 'system' %}\n {{- messages[0].content + '\\n\\n' }}\n {%- endif %}\n {{- \"# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0].role == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0].content + '<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}\n{%- for forward_message in messages %}\n {%- set index = (messages|length - 1) - loop.index0 %}\n {%- set message = messages[index] %}\n {%- set current_content = message.content if message.content is defined and message.content is not none else '' %}\n {%- set tool_start = '<tool_response>' %}\n {%- set tool_start_length = tool_start|length %}\n {%- set start_of_message = current_content[:tool_start_length] %}\n {%- set tool_end = '</tool_response>' %}\n {%- set tool_end_length = tool_end|length %}\n {%- set start_pos = (current_content|length) - tool_end_length %}\n {%- if start_pos < 0 %}\n {%- set start_pos = 0 %}\n {%- endif %}\n {%- set end_of_message = current_content[start_pos:] %}\n {%- if ns.multi_step_tool and message.role == \"user\" and not(start_of_message == tool_start and end_of_message == tool_end) %}\n {%- set ns.multi_step_tool = false %}\n {%- set ns.last_query_index = index %}\n {%- endif %}\n{%- endfor %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) 
%}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {%- set m_content = message.content if message.content is defined and message.content is not none else '' %}\n {%- set content = m_content %}\n {%- set reasoning_content = '' %}\n {%- if message.reasoning_content is defined and message.reasoning_content is not none %}\n {%- set reasoning_content = message.reasoning_content %}\n {%- else %}\n {%- if '</think>' in m_content %}\n {%- set content = (m_content.split('</think>')|last).lstrip('\\n') %}\n {%- set reasoning_content = (m_content.split('</think>')|first).rstrip('\\n') %}\n {%- set reasoning_content = (reasoning_content.split('<think>')|last).lstrip('\\n') %}\n {%- endif %}\n {%- endif %}\n {%- if loop.index0 > ns.last_query_index %}\n {%- if loop.last or (not loop.last and (not reasoning_content.strip() == '')) %}\n {{- '<|im_start|>' + message.role + '\\n<think>\\n' + reasoning_content.strip('\\n') + '\\n</think>\\n\\n' + content.lstrip('\\n') }}\n {%- else %}\n {{- '<|im_start|>' + message.role + '\\n' + content }}\n {%- endif %}\n {%- else %}\n {{- '<|im_start|>' + message.role + '\\n' + content }}\n {%- endif %}\n {%- if message.tool_calls %}\n {%- for tool_call in message.tool_calls %}\n {%- if (loop.first and content) or (not loop.first) %}\n {{- '\\n' }}\n {%- endif %}\n {%- if tool_call.function %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {%- if tool_call.arguments is string %}\n {{- tool_call.arguments }}\n {%- else %}\n {{- tool_call.arguments | tojson }}\n {%- endif %}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {%- endif %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if loop.first or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {{- 
message.content }}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n {%- if enable_thinking is defined and enable_thinking is false %}\n {{- '<think>\\n\\n</think>\\n\\n' }}\n {%- endif %}\n{%- endif %}",
231
+ "clean_up_tokenization_spaces": false,
232
+ "eos_token": "<|im_end|>",
233
+ "errors": "replace",
234
+ "extra_special_tokens": {},
235
+ "model_max_length": 40960,
236
+ "pad_token": "<|vision_pad|>",
237
+ "padding_side": "right",
238
+ "split_special_tokens": false,
239
+ "tokenizer_class": "Qwen2Tokenizer",
240
+ "unk_token": null
241
+ }
vocab.json ADDED
The diff for this file is too large to render. See raw diff