ZYXue committed on Sep 16

Commit

802532c

verified ·

1 Parent(s): 30aafe9

Upload folder using huggingface_hub

Browse files

Files changed (44) hide show

.gitattributes +3 -0
README.md +202 -0
adapter_config.json +29 -0
adapter_model.safetensors +3 -0
added_tokens.json +24 -0
chat_template.jinja +7 -0
checkpoint-200/README.md +202 -0
checkpoint-200/adapter_config.json +29 -0
checkpoint-200/adapter_model.safetensors +3 -0
checkpoint-200/added_tokens.json +24 -0
checkpoint-200/chat_template.jinja +7 -0
checkpoint-200/merges.txt +0 -0
checkpoint-200/optimizer.pt +3 -0
checkpoint-200/rng_state.pth +3 -0
checkpoint-200/scheduler.pt +3 -0
checkpoint-200/special_tokens_map.json +31 -0
checkpoint-200/tokenizer.json +3 -0
checkpoint-200/tokenizer_config.json +207 -0
checkpoint-200/trainer_state.json +1843 -0
checkpoint-200/training_args.bin +3 -0
checkpoint-200/vocab.json +0 -0
checkpoint-320/README.md +202 -0
checkpoint-320/adapter_config.json +29 -0
checkpoint-320/adapter_model.safetensors +3 -0
checkpoint-320/added_tokens.json +24 -0
checkpoint-320/chat_template.jinja +7 -0
checkpoint-320/merges.txt +0 -0
checkpoint-320/optimizer.pt +3 -0
checkpoint-320/rng_state.pth +3 -0
checkpoint-320/scheduler.pt +3 -0
checkpoint-320/special_tokens_map.json +31 -0
checkpoint-320/tokenizer.json +3 -0
checkpoint-320/tokenizer_config.json +207 -0
checkpoint-320/trainer_state.json +2923 -0
checkpoint-320/training_args.bin +3 -0
checkpoint-320/vocab.json +0 -0
merges.txt +0 -0
special_tokens_map.json +31 -0
tokenizer.json +3 -0
tokenizer_config.json +207 -0
training_args.bin +3 -0
training_args.json +28 -0
training_log.json +1318 -0
vocab.json +0 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+checkpoint-200/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoint-320/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text

README.md ADDED Viewed

	@@ -0,0 +1,202 @@

+---
+base_model: Qwen/Qwen2.5-VL-7B-Instruct
+library_name: peft
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.13.2

adapter_config.json ADDED Viewed

	@@ -0,0 +1,29 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "Qwen/Qwen2.5-VL-7B-Instruct",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "v_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f7f841b4bcf9a79ca58b01986f5212b5ae4951dfcec18a880ebf0db9e257d602
+size 20200056

added_tokens.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "</tool_call>": 151658,
+  "<tool_call>": 151657,
+  "<|box_end|>": 151649,
+  "<|box_start|>": 151648,
+  "<|endoftext|>": 151643,
+  "<|file_sep|>": 151664,
+  "<|fim_middle|>": 151660,
+  "<|fim_pad|>": 151662,
+  "<|fim_prefix|>": 151659,
+  "<|fim_suffix|>": 151661,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644,
+  "<|image_pad|>": 151655,
+  "<|object_ref_end|>": 151647,
+  "<|object_ref_start|>": 151646,
+  "<|quad_end|>": 151651,
+  "<|quad_start|>": 151650,
+  "<|repo_name|>": 151663,
+  "<|video_pad|>": 151656,
+  "<|vision_end|>": 151653,
+  "<|vision_pad|>": 151654,
+  "<|vision_start|>": 151652
+}

chat_template.jinja ADDED Viewed

	@@ -0,0 +1,7 @@

+{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system
+You are a helpful assistant.<|im_end|>
+{% endif %}<|im_start|>{{ message['role'] }}
+{% if message['content'] is string %}{{ message['content'] }}<|im_end|>
+{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>
+{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant
+{% endif %}

checkpoint-200/README.md ADDED Viewed

	@@ -0,0 +1,202 @@

+---
+base_model: Qwen/Qwen2.5-VL-7B-Instruct
+library_name: peft
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.13.2

checkpoint-200/adapter_config.json ADDED Viewed

	@@ -0,0 +1,29 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "Qwen/Qwen2.5-VL-7B-Instruct",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "v_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

checkpoint-200/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f7f841b4bcf9a79ca58b01986f5212b5ae4951dfcec18a880ebf0db9e257d602
+size 20200056

checkpoint-200/added_tokens.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "</tool_call>": 151658,
+  "<tool_call>": 151657,
+  "<|box_end|>": 151649,
+  "<|box_start|>": 151648,
+  "<|endoftext|>": 151643,
+  "<|file_sep|>": 151664,
+  "<|fim_middle|>": 151660,
+  "<|fim_pad|>": 151662,
+  "<|fim_prefix|>": 151659,
+  "<|fim_suffix|>": 151661,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644,
+  "<|image_pad|>": 151655,
+  "<|object_ref_end|>": 151647,
+  "<|object_ref_start|>": 151646,
+  "<|quad_end|>": 151651,
+  "<|quad_start|>": 151650,
+  "<|repo_name|>": 151663,
+  "<|video_pad|>": 151656,
+  "<|vision_end|>": 151653,
+  "<|vision_pad|>": 151654,
+  "<|vision_start|>": 151652
+}

checkpoint-200/chat_template.jinja ADDED Viewed

	@@ -0,0 +1,7 @@

+{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system
+You are a helpful assistant.<|im_end|>
+{% endif %}<|im_start|>{{ message['role'] }}
+{% if message['content'] is string %}{{ message['content'] }}<|im_end|>
+{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>
+{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant
+{% endif %}

checkpoint-200/merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoint-200/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ca2ac737939e8bca052e3213e1cb57c622467d7e1c0b35fef0e57e32cb6c8ffe
+size 40466443

checkpoint-200/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1ad5f4dac8c8f22dd51d6ff589c953bfcbee9552cc67d00c221a20372a23c5ec
+size 14645

checkpoint-200/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:97c9fe9cd09f40f700384cf87478f8d3de41c2a75de82450af7b385d04eebbd6
+size 1401

checkpoint-200/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,31 @@

+{
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "eos_token": {
+    "content": "<|im_end|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

checkpoint-200/tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9c5ae00e602b8860cbd784ba82a8aa14e8feecec692e7076590d014d7b7fdafa
+size 11421896

checkpoint-200/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,207 @@

+{
+  "add_bos_token": false,
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151645": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151646": {
+      "content": "<|object_ref_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151647": {
+      "content": "<|object_ref_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151648": {
+      "content": "<|box_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151649": {
+      "content": "<|box_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151650": {
+      "content": "<|quad_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151651": {
+      "content": "<|quad_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151652": {
+      "content": "<|vision_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151653": {
+      "content": "<|vision_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151654": {
+      "content": "<|vision_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151655": {
+      "content": "<|image_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151656": {
+      "content": "<|video_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151657": {
+      "content": "<tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151658": {
+      "content": "</tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151659": {
+      "content": "<|fim_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151660": {
+      "content": "<|fim_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151661": {
+      "content": "<|fim_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151662": {
+      "content": "<|fim_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151663": {
+      "content": "<|repo_name|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151664": {
+      "content": "<|file_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    }
+  },
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "bos_token": null,
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "extra_special_tokens": {},
+  "model_max_length": 131072,
+  "pad_token": "<|endoftext|>",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}

checkpoint-200/trainer_state.json ADDED Viewed

	@@ -0,0 +1,1843 @@

+{
+  "best_global_step": 200,
+  "best_metric": 0.07848864793777466,
+  "best_model_checkpoint": "sft_prefill/prompt_id_3/qwen2-VL-7B-Instruct-syn-count-lora/checkpoint-200",
+  "epoch": 1.25,
+  "eval_steps": 200,
+  "global_step": 200,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.00625,
+      "grad_norm": 0.9298801422119141,
+      "learning_rate": 0.0002,
+      "loss": 0.4917,
+      "mean_token_accuracy": 0.6875,
+      "num_tokens": 125296.0,
+      "step": 1
+    },
+    {
+      "epoch": 0.0125,
+      "grad_norm": 0.8382226824760437,
+      "learning_rate": 0.0002,
+      "loss": 0.6049,
+      "mean_token_accuracy": 0.75,
+      "num_tokens": 128352.0,
+      "step": 2
+    },
+    {
+      "epoch": 0.01875,
+      "grad_norm": 0.9454999566078186,
+      "learning_rate": 0.0002,
+      "loss": 0.6407,
+      "mean_token_accuracy": 0.65625,
+      "num_tokens": 131408.0,
+      "step": 3
+    },
+    {
+      "epoch": 0.025,
+      "grad_norm": 0.9727649092674255,
+      "learning_rate": 0.0002,
+      "loss": 0.3972,
+      "mean_token_accuracy": 0.90625,
+      "num_tokens": 134464.0,
+      "step": 4
+    },
+    {
+      "epoch": 0.03125,
+      "grad_norm": 0.785910964012146,
+      "learning_rate": 0.0002,
+      "loss": 0.453,
+      "mean_token_accuracy": 0.84375,
+      "num_tokens": 137520.0,
+      "step": 5
+    },
+    {
+      "epoch": 0.0375,
+      "grad_norm": 0.9556392431259155,
+      "learning_rate": 0.0002,
+      "loss": 0.418,
+      "mean_token_accuracy": 0.8125,
+      "num_tokens": 140576.0,
+      "step": 6
+    },
+    {
+      "epoch": 0.04375,
+      "grad_norm": 1.160456657409668,
+      "learning_rate": 0.0002,
+      "loss": 0.303,
+      "mean_token_accuracy": 0.875,
+      "num_tokens": 143632.0,
+      "step": 7
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 1.127308964729309,
+      "learning_rate": 0.0002,
+      "loss": 0.1847,
+      "mean_token_accuracy": 0.9375,
+      "num_tokens": 146688.0,
+      "step": 8
+    },
+    {
+      "epoch": 0.05625,
+      "grad_norm": 1.7690690755844116,
+      "learning_rate": 0.0002,
+      "loss": 0.3514,
+      "mean_token_accuracy": 0.84375,
+      "num_tokens": 149744.0,
+      "step": 9
+    },
+    {
+      "epoch": 0.0625,
+      "grad_norm": 0.8747241497039795,
+      "learning_rate": 0.0002,
+      "loss": 0.2434,
+      "mean_token_accuracy": 0.84375,
+      "num_tokens": 152800.0,
+      "step": 10
+    },
+    {
+      "epoch": 0.06875,
+      "grad_norm": 0.7658981084823608,
+      "learning_rate": 0.0002,
+      "loss": 0.1577,
+      "mean_token_accuracy": 0.9375,
+      "num_tokens": 155856.0,
+      "step": 11
+    },
+    {
+      "epoch": 0.075,
+      "grad_norm": 0.6385300755500793,
+      "learning_rate": 0.0002,
+      "loss": 0.209,
+      "mean_token_accuracy": 0.875,
+      "num_tokens": 158912.0,
+      "step": 12
+    },
+    {
+      "epoch": 0.08125,
+      "grad_norm": 1.2857609987258911,
+      "learning_rate": 0.0002,
+      "loss": 0.2123,
+      "mean_token_accuracy": 0.875,
+      "num_tokens": 161968.0,
+      "step": 13
+    },
+    {
+      "epoch": 0.0875,
+      "grad_norm": 0.5603316426277161,
+      "learning_rate": 0.0002,
+      "loss": 0.2712,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 165024.0,
+      "step": 14
+    },
+    {
+      "epoch": 0.09375,
+      "grad_norm": 1.0855859518051147,
+      "learning_rate": 0.0002,
+      "loss": 0.2975,
+      "mean_token_accuracy": 0.875,
+      "num_tokens": 168080.0,
+      "step": 15
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 2.589662790298462,
+      "learning_rate": 0.0002,
+      "loss": 0.3977,
+      "mean_token_accuracy": 0.78125,
+      "num_tokens": 171136.0,
+      "step": 16
+    },
+    {
+      "epoch": 0.10625,
+      "grad_norm": 1.242841124534607,
+      "learning_rate": 0.0002,
+      "loss": 0.3933,
+      "mean_token_accuracy": 0.90625,
+      "num_tokens": 174192.0,
+      "step": 17
+    },
+    {
+      "epoch": 0.1125,
+      "grad_norm": 1.0371448993682861,
+      "learning_rate": 0.0002,
+      "loss": 0.2866,
+      "mean_token_accuracy": 0.875,
+      "num_tokens": 177248.0,
+      "step": 18
+    },
+    {
+      "epoch": 0.11875,
+      "grad_norm": 2.609177827835083,
+      "learning_rate": 0.0002,
+      "loss": 0.4174,
+      "mean_token_accuracy": 0.75,
+      "num_tokens": 180304.0,
+      "step": 19
+    },
+    {
+      "epoch": 0.125,
+      "grad_norm": 1.229350209236145,
+      "learning_rate": 0.0002,
+      "loss": 0.3588,
+      "mean_token_accuracy": 0.84375,
+      "num_tokens": 183360.0,
+      "step": 20
+    },
+    {
+      "epoch": 0.13125,
+      "grad_norm": 1.2149115800857544,
+      "learning_rate": 0.0002,
+      "loss": 0.1919,
+      "mean_token_accuracy": 0.9375,
+      "num_tokens": 186416.0,
+      "step": 21
+    },
+    {
+      "epoch": 0.1375,
+      "grad_norm": 3.0612738132476807,
+      "learning_rate": 0.0002,
+      "loss": 0.475,
+      "mean_token_accuracy": 0.84375,
+      "num_tokens": 189472.0,
+      "step": 22
+    },
+    {
+      "epoch": 0.14375,
+      "grad_norm": 1.1256693601608276,
+      "learning_rate": 0.0002,
+      "loss": 0.223,
+      "mean_token_accuracy": 0.9375,
+      "num_tokens": 192528.0,
+      "step": 23
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 1.5483283996582031,
+      "learning_rate": 0.0002,
+      "loss": 0.3825,
+      "mean_token_accuracy": 0.78125,
+      "num_tokens": 195584.0,
+      "step": 24
+    },
+    {
+      "epoch": 0.15625,
+      "grad_norm": 0.8419892191886902,
+      "learning_rate": 0.0002,
+      "loss": 0.1645,
+      "mean_token_accuracy": 0.875,
+      "num_tokens": 198640.0,
+      "step": 25
+    },
+    {
+      "epoch": 0.1625,
+      "grad_norm": 1.5078842639923096,
+      "learning_rate": 0.0002,
+      "loss": 0.2517,
+      "mean_token_accuracy": 0.9375,
+      "num_tokens": 201696.0,
+      "step": 26
+    },
+    {
+      "epoch": 0.16875,
+      "grad_norm": 0.7717946171760559,
+      "learning_rate": 0.0002,
+      "loss": 0.1865,
+      "mean_token_accuracy": 0.9375,
+      "num_tokens": 204752.0,
+      "step": 27
+    },
+    {
+      "epoch": 0.175,
+      "grad_norm": 0.7887358069419861,
+      "learning_rate": 0.0002,
+      "loss": 0.2887,
+      "mean_token_accuracy": 0.90625,
+      "num_tokens": 207808.0,
+      "step": 28
+    },
+    {
+      "epoch": 0.18125,
+      "grad_norm": 1.0774492025375366,
+      "learning_rate": 0.0002,
+      "loss": 0.2154,
+      "mean_token_accuracy": 0.90625,
+      "num_tokens": 210864.0,
+      "step": 29
+    },
+    {
+      "epoch": 0.1875,
+      "grad_norm": 1.2108778953552246,
+      "learning_rate": 0.0002,
+      "loss": 0.3605,
+      "mean_token_accuracy": 0.84375,
+      "num_tokens": 213920.0,
+      "step": 30
+    },
+    {
+      "epoch": 0.19375,
+      "grad_norm": 0.4569832980632782,
+      "learning_rate": 0.0002,
+      "loss": 0.1299,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 216976.0,
+      "step": 31
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.789750337600708,
+      "learning_rate": 0.0002,
+      "loss": 0.1772,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 220032.0,
+      "step": 32
+    },
+    {
+      "epoch": 0.20625,
+      "grad_norm": 1.5008147954940796,
+      "learning_rate": 0.0002,
+      "loss": 0.3281,
+      "mean_token_accuracy": 0.84375,
+      "num_tokens": 223088.0,
+      "step": 33
+    },
+    {
+      "epoch": 0.2125,
+      "grad_norm": 1.082711935043335,
+      "learning_rate": 0.0002,
+      "loss": 0.1973,
+      "mean_token_accuracy": 0.9375,
+      "num_tokens": 226144.0,
+      "step": 34
+    },
+    {
+      "epoch": 0.21875,
+      "grad_norm": 0.6501540541648865,
+      "learning_rate": 0.0002,
+      "loss": 0.2645,
+      "mean_token_accuracy": 0.9375,
+      "num_tokens": 229200.0,
+      "step": 35
+    },
+    {
+      "epoch": 0.225,
+      "grad_norm": 0.6396588087081909,
+      "learning_rate": 0.0002,
+      "loss": 0.1938,
+      "mean_token_accuracy": 0.90625,
+      "num_tokens": 232256.0,
+      "step": 36
+    },
+    {
+      "epoch": 0.23125,
+      "grad_norm": 0.7765557169914246,
+      "learning_rate": 0.0002,
+      "loss": 0.1577,
+      "mean_token_accuracy": 0.90625,
+      "num_tokens": 235312.0,
+      "step": 37
+    },
+    {
+      "epoch": 0.2375,
+      "grad_norm": 1.0910521745681763,
+      "learning_rate": 0.0002,
+      "loss": 0.1917,
+      "mean_token_accuracy": 0.875,
+      "num_tokens": 238368.0,
+      "step": 38
+    },
+    {
+      "epoch": 0.24375,
+      "grad_norm": 0.7262992262840271,
+      "learning_rate": 0.0002,
+      "loss": 0.0703,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 241424.0,
+      "step": 39
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 0.598296582698822,
+      "learning_rate": 0.0002,
+      "loss": 0.1402,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 244480.0,
+      "step": 40
+    },
+    {
+      "epoch": 0.25625,
+      "grad_norm": 1.0417550802230835,
+      "learning_rate": 0.0002,
+      "loss": 0.1274,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 247536.0,
+      "step": 41
+    },
+    {
+      "epoch": 0.2625,
+      "grad_norm": 1.5009475946426392,
+      "learning_rate": 0.0002,
+      "loss": 0.1871,
+      "mean_token_accuracy": 0.9375,
+      "num_tokens": 250592.0,
+      "step": 42
+    },
+    {
+      "epoch": 0.26875,
+      "grad_norm": 2.052367925643921,
+      "learning_rate": 0.0002,
+      "loss": 0.2981,
+      "mean_token_accuracy": 0.875,
+      "num_tokens": 253648.0,
+      "step": 43
+    },
+    {
+      "epoch": 0.275,
+      "grad_norm": 0.6979355216026306,
+      "learning_rate": 0.0002,
+      "loss": 0.0962,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 256704.0,
+      "step": 44
+    },
+    {
+      "epoch": 0.28125,
+      "grad_norm": 1.2591170072555542,
+      "learning_rate": 0.0002,
+      "loss": 0.1616,
+      "mean_token_accuracy": 0.9375,
+      "num_tokens": 259760.0,
+      "step": 45
+    },
+    {
+      "epoch": 0.2875,
+      "grad_norm": 0.5454805493354797,
+      "learning_rate": 0.0002,
+      "loss": 0.0614,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 262816.0,
+      "step": 46
+    },
+    {
+      "epoch": 0.29375,
+      "grad_norm": 1.3832871913909912,
+      "learning_rate": 0.0002,
+      "loss": 0.1468,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 265872.0,
+      "step": 47
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 0.5263135433197021,
+      "learning_rate": 0.0002,
+      "loss": 0.0482,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 268928.0,
+      "step": 48
+    },
+    {
+      "epoch": 0.30625,
+      "grad_norm": 1.7050193548202515,
+      "learning_rate": 0.0002,
+      "loss": 0.1177,
+      "mean_token_accuracy": 0.9375,
+      "num_tokens": 271984.0,
+      "step": 49
+    },
+    {
+      "epoch": 0.3125,
+      "grad_norm": 1.4185047149658203,
+      "learning_rate": 0.0002,
+      "loss": 0.0788,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 275040.0,
+      "step": 50
+    },
+    {
+      "epoch": 0.31875,
+      "grad_norm": 1.192116379737854,
+      "learning_rate": 0.0002,
+      "loss": 0.1786,
+      "mean_token_accuracy": 0.90625,
+      "num_tokens": 278096.0,
+      "step": 51
+    },
+    {
+      "epoch": 0.325,
+      "grad_norm": 3.244797945022583,
+      "learning_rate": 0.0002,
+      "loss": 0.1685,
+      "mean_token_accuracy": 0.9375,
+      "num_tokens": 281152.0,
+      "step": 52
+    },
+    {
+      "epoch": 0.33125,
+      "grad_norm": 2.192440986633301,
+      "learning_rate": 0.0002,
+      "loss": 0.1178,
+      "mean_token_accuracy": 0.9375,
+      "num_tokens": 284208.0,
+      "step": 53
+    },
+    {
+      "epoch": 0.3375,
+      "grad_norm": 2.3711836338043213,
+      "learning_rate": 0.0002,
+      "loss": 0.1016,
+      "mean_token_accuracy": 0.9375,
+      "num_tokens": 287264.0,
+      "step": 54
+    },
+    {
+      "epoch": 0.34375,
+      "grad_norm": 2.2536869049072266,
+      "learning_rate": 0.0002,
+      "loss": 0.1223,
+      "mean_token_accuracy": 0.9375,
+      "num_tokens": 290320.0,
+      "step": 55
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 1.4965548515319824,
+      "learning_rate": 0.0002,
+      "loss": 0.0331,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 293376.0,
+      "step": 56
+    },
+    {
+      "epoch": 0.35625,
+      "grad_norm": 2.7128384113311768,
+      "learning_rate": 0.0002,
+      "loss": 0.1314,
+      "mean_token_accuracy": 0.9375,
+      "num_tokens": 296432.0,
+      "step": 57
+    },
+    {
+      "epoch": 0.3625,
+      "grad_norm": 1.1565158367156982,
+      "learning_rate": 0.0002,
+      "loss": 0.0504,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 299488.0,
+      "step": 58
+    },
+    {
+      "epoch": 0.36875,
+      "grad_norm": 1.363576054573059,
+      "learning_rate": 0.0002,
+      "loss": 0.0327,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 302544.0,
+      "step": 59
+    },
+    {
+      "epoch": 0.375,
+      "grad_norm": 1.7460052967071533,
+      "learning_rate": 0.0002,
+      "loss": 0.0766,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 305600.0,
+      "step": 60
+    },
+    {
+      "epoch": 0.38125,
+      "grad_norm": 0.4773832857608795,
+      "learning_rate": 0.0002,
+      "loss": 0.0167,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 308656.0,
+      "step": 61
+    },
+    {
+      "epoch": 0.3875,
+      "grad_norm": 0.5730915665626526,
+      "learning_rate": 0.0002,
+      "loss": 0.0176,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 311712.0,
+      "step": 62
+    },
+    {
+      "epoch": 0.39375,
+      "grad_norm": 3.6676278114318848,
+      "learning_rate": 0.0002,
+      "loss": 0.1523,
+      "mean_token_accuracy": 0.9375,
+      "num_tokens": 314768.0,
+      "step": 63
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 1.425026535987854,
+      "learning_rate": 0.0002,
+      "loss": 0.0172,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 317824.0,
+      "step": 64
+    },
+    {
+      "epoch": 0.40625,
+      "grad_norm": 2.4328842163085938,
+      "learning_rate": 0.0002,
+      "loss": 0.1337,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 320880.0,
+      "step": 65
+    },
+    {
+      "epoch": 0.4125,
+      "grad_norm": 2.682482957839966,
+      "learning_rate": 0.0002,
+      "loss": 0.18,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 323936.0,
+      "step": 66
+    },
+    {
+      "epoch": 0.41875,
+      "grad_norm": 0.41104015707969666,
+      "learning_rate": 0.0002,
+      "loss": 0.0076,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 326992.0,
+      "step": 67
+    },
+    {
+      "epoch": 0.425,
+      "grad_norm": 0.22206450998783112,
+      "learning_rate": 0.0002,
+      "loss": 0.0051,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 330048.0,
+      "step": 68
+    },
+    {
+      "epoch": 0.43125,
+      "grad_norm": 1.8949733972549438,
+      "learning_rate": 0.0002,
+      "loss": 0.0408,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 333104.0,
+      "step": 69
+    },
+    {
+      "epoch": 0.4375,
+      "grad_norm": 3.621080160140991,
+      "learning_rate": 0.0002,
+      "loss": 0.0744,
+      "mean_token_accuracy": 0.9375,
+      "num_tokens": 336160.0,
+      "step": 70
+    },
+    {
+      "epoch": 0.44375,
+      "grad_norm": 0.6198443174362183,
+      "learning_rate": 0.0002,
+      "loss": 0.0123,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 339216.0,
+      "step": 71
+    },
+    {
+      "epoch": 0.45,
+      "grad_norm": 0.46570342779159546,
+      "learning_rate": 0.0002,
+      "loss": 0.006,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 342272.0,
+      "step": 72
+    },
+    {
+      "epoch": 0.45625,
+      "grad_norm": 4.823789596557617,
+      "learning_rate": 0.0002,
+      "loss": 0.3016,
+      "mean_token_accuracy": 0.9375,
+      "num_tokens": 345328.0,
+      "step": 73
+    },
+    {
+      "epoch": 0.4625,
+      "grad_norm": 4.304416179656982,
+      "learning_rate": 0.0002,
+      "loss": 0.1201,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 348384.0,
+      "step": 74
+    },
+    {
+      "epoch": 0.46875,
+      "grad_norm": 1.780882716178894,
+      "learning_rate": 0.0002,
+      "loss": 0.037,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 351440.0,
+      "step": 75
+    },
+    {
+      "epoch": 0.475,
+      "grad_norm": 1.6134687662124634,
+      "learning_rate": 0.0002,
+      "loss": 0.0413,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 354496.0,
+      "step": 76
+    },
+    {
+      "epoch": 0.48125,
+      "grad_norm": 3.6872904300689697,
+      "learning_rate": 0.0002,
+      "loss": 0.0697,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 357552.0,
+      "step": 77
+    },
+    {
+      "epoch": 0.4875,
+      "grad_norm": 3.585052728652954,
+      "learning_rate": 0.0002,
+      "loss": 0.0485,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 360608.0,
+      "step": 78
+    },
+    {
+      "epoch": 0.49375,
+      "grad_norm": 3.138762950897217,
+      "learning_rate": 0.0002,
+      "loss": 0.0686,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 363664.0,
+      "step": 79
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 3.895049571990967,
+      "learning_rate": 0.0002,
+      "loss": 0.0649,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 366720.0,
+      "step": 80
+    },
+    {
+      "epoch": 0.50625,
+      "grad_norm": 3.5866286754608154,
+      "learning_rate": 0.0002,
+      "loss": 0.1002,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 369776.0,
+      "step": 81
+    },
+    {
+      "epoch": 0.5125,
+      "grad_norm": 2.039658546447754,
+      "learning_rate": 0.0002,
+      "loss": 0.0235,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 372832.0,
+      "step": 82
+    },
+    {
+      "epoch": 0.51875,
+      "grad_norm": 5.831839084625244,
+      "learning_rate": 0.0002,
+      "loss": 0.1955,
+      "mean_token_accuracy": 0.9375,
+      "num_tokens": 375888.0,
+      "step": 83
+    },
+    {
+      "epoch": 0.525,
+      "grad_norm": 3.6485228538513184,
+      "learning_rate": 0.0002,
+      "loss": 0.1692,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 378944.0,
+      "step": 84
+    },
+    {
+      "epoch": 0.53125,
+      "grad_norm": 1.0485862493515015,
+      "learning_rate": 0.0002,
+      "loss": 0.0178,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 382000.0,
+      "step": 85
+    },
+    {
+      "epoch": 0.5375,
+      "grad_norm": 2.2051405906677246,
+      "learning_rate": 0.0002,
+      "loss": 0.0282,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 385056.0,
+      "step": 86
+    },
+    {
+      "epoch": 0.54375,
+      "grad_norm": 1.2796984910964966,
+      "learning_rate": 0.0002,
+      "loss": 0.0191,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 388112.0,
+      "step": 87
+    },
+    {
+      "epoch": 0.55,
+      "grad_norm": 1.5338424444198608,
+      "learning_rate": 0.0002,
+      "loss": 0.0177,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 391168.0,
+      "step": 88
+    },
+    {
+      "epoch": 0.55625,
+      "grad_norm": 1.2561614513397217,
+      "learning_rate": 0.0002,
+      "loss": 0.0207,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 394224.0,
+      "step": 89
+    },
+    {
+      "epoch": 0.5625,
+      "grad_norm": 0.8224933743476868,
+      "learning_rate": 0.0002,
+      "loss": 0.0082,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 397280.0,
+      "step": 90
+    },
+    {
+      "epoch": 0.56875,
+      "grad_norm": 3.2320916652679443,
+      "learning_rate": 0.0002,
+      "loss": 0.0583,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 400336.0,
+      "step": 91
+    },
+    {
+      "epoch": 0.575,
+      "grad_norm": 7.057133197784424,
+      "learning_rate": 0.0002,
+      "loss": 0.259,
+      "mean_token_accuracy": 0.9375,
+      "num_tokens": 403392.0,
+      "step": 92
+    },
+    {
+      "epoch": 0.58125,
+      "grad_norm": 2.365320920944214,
+      "learning_rate": 0.0002,
+      "loss": 0.0295,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 406448.0,
+      "step": 93
+    },
+    {
+      "epoch": 0.5875,
+      "grad_norm": 0.06329375505447388,
+      "learning_rate": 0.0002,
+      "loss": 0.0014,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 409504.0,
+      "step": 94
+    },
+    {
+      "epoch": 0.59375,
+      "grad_norm": 1.554870367050171,
+      "learning_rate": 0.0002,
+      "loss": 0.024,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 412560.0,
+      "step": 95
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 2.3296000957489014,
+      "learning_rate": 0.0002,
+      "loss": 0.0795,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 415616.0,
+      "step": 96
+    },
+    {
+      "epoch": 0.60625,
+      "grad_norm": 6.749006271362305,
+      "learning_rate": 0.0002,
+      "loss": 0.1903,
+      "mean_token_accuracy": 0.90625,
+      "num_tokens": 418672.0,
+      "step": 97
+    },
+    {
+      "epoch": 0.6125,
+      "grad_norm": 10.803217887878418,
+      "learning_rate": 0.0002,
+      "loss": 0.2885,
+      "mean_token_accuracy": 0.90625,
+      "num_tokens": 421728.0,
+      "step": 98
+    },
+    {
+      "epoch": 0.61875,
+      "grad_norm": 2.8866775035858154,
+      "learning_rate": 0.0002,
+      "loss": 0.2661,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 424784.0,
+      "step": 99
+    },
+    {
+      "epoch": 0.625,
+      "grad_norm": 3.800149917602539,
+      "learning_rate": 0.0002,
+      "loss": 0.0502,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 427840.0,
+      "step": 100
+    },
+    {
+      "epoch": 0.63125,
+      "grad_norm": 0.4669257402420044,
+      "learning_rate": 0.0002,
+      "loss": 0.0041,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 430896.0,
+      "step": 101
+    },
+    {
+      "epoch": 0.6375,
+      "grad_norm": 3.8480825424194336,
+      "learning_rate": 0.0002,
+      "loss": 0.0947,
+      "mean_token_accuracy": 0.9375,
+      "num_tokens": 433952.0,
+      "step": 102
+    },
+    {
+      "epoch": 0.64375,
+      "grad_norm": 4.139023303985596,
+      "learning_rate": 0.0002,
+      "loss": 0.1131,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 437008.0,
+      "step": 103
+    },
+    {
+      "epoch": 0.65,
+      "grad_norm": 3.4455084800720215,
+      "learning_rate": 0.0002,
+      "loss": 0.1105,
+      "mean_token_accuracy": 0.9375,
+      "num_tokens": 440064.0,
+      "step": 104
+    },
+    {
+      "epoch": 0.65625,
+      "grad_norm": 2.1717028617858887,
+      "learning_rate": 0.0002,
+      "loss": 0.0794,
+      "mean_token_accuracy": 0.9375,
+      "num_tokens": 443120.0,
+      "step": 105
+    },
+    {
+      "epoch": 0.6625,
+      "grad_norm": 0.43985098600387573,
+      "learning_rate": 0.0002,
+      "loss": 0.0057,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 446176.0,
+      "step": 106
+    },
+    {
+      "epoch": 0.66875,
+      "grad_norm": 5.351183891296387,
+      "learning_rate": 0.0002,
+      "loss": 0.3953,
+      "mean_token_accuracy": 0.8125,
+      "num_tokens": 449232.0,
+      "step": 107
+    },
+    {
+      "epoch": 0.675,
+      "grad_norm": 4.6742048263549805,
+      "learning_rate": 0.0002,
+      "loss": 0.1099,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 452288.0,
+      "step": 108
+    },
+    {
+      "epoch": 0.68125,
+      "grad_norm": 4.851898670196533,
+      "learning_rate": 0.0002,
+      "loss": 0.2198,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 455344.0,
+      "step": 109
+    },
+    {
+      "epoch": 0.6875,
+      "grad_norm": 2.46817946434021,
+      "learning_rate": 0.0002,
+      "loss": 0.0586,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 458400.0,
+      "step": 110
+    },
+    {
+      "epoch": 0.69375,
+      "grad_norm": 5.507438659667969,
+      "learning_rate": 0.0002,
+      "loss": 0.3492,
+      "mean_token_accuracy": 0.9375,
+      "num_tokens": 461456.0,
+      "step": 111
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 0.21149688959121704,
+      "learning_rate": 0.0002,
+      "loss": 0.003,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 464512.0,
+      "step": 112
+    },
+    {
+      "epoch": 0.70625,
+      "grad_norm": 2.1346304416656494,
+      "learning_rate": 0.0002,
+      "loss": 0.0365,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 467568.0,
+      "step": 113
+    },
+    {
+      "epoch": 0.7125,
+      "grad_norm": 2.06246280670166,
+      "learning_rate": 0.0002,
+      "loss": 0.1228,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 470624.0,
+      "step": 114
+    },
+    {
+      "epoch": 0.71875,
+      "grad_norm": 1.1220771074295044,
+      "learning_rate": 0.0002,
+      "loss": 0.0174,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 473680.0,
+      "step": 115
+    },
+    {
+      "epoch": 0.725,
+      "grad_norm": 1.283275842666626,
+      "learning_rate": 0.0002,
+      "loss": 0.0473,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 476736.0,
+      "step": 116
+    },
+    {
+      "epoch": 0.73125,
+      "grad_norm": 3.0137429237365723,
+      "learning_rate": 0.0002,
+      "loss": 0.073,
+      "mean_token_accuracy": 0.9375,
+      "num_tokens": 479792.0,
+      "step": 117
+    },
+    {
+      "epoch": 0.7375,
+      "grad_norm": 3.159397840499878,
+      "learning_rate": 0.0002,
+      "loss": 0.0736,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 482848.0,
+      "step": 118
+    },
+    {
+      "epoch": 0.74375,
+      "grad_norm": 4.265467643737793,
+      "learning_rate": 0.0002,
+      "loss": 0.3511,
+      "mean_token_accuracy": 0.90625,
+      "num_tokens": 485904.0,
+      "step": 119
+    },
+    {
+      "epoch": 0.75,
+      "grad_norm": 4.885653495788574,
+      "learning_rate": 0.0002,
+      "loss": 0.1691,
+      "mean_token_accuracy": 0.875,
+      "num_tokens": 488960.0,
+      "step": 120
+    },
+    {
+      "epoch": 0.75625,
+      "grad_norm": 5.490376949310303,
+      "learning_rate": 0.0002,
+      "loss": 0.2357,
+      "mean_token_accuracy": 0.9375,
+      "num_tokens": 492016.0,
+      "step": 121
+    },
+    {
+      "epoch": 0.7625,
+      "grad_norm": 3.725234031677246,
+      "learning_rate": 0.0002,
+      "loss": 0.1983,
+      "mean_token_accuracy": 0.9375,
+      "num_tokens": 495072.0,
+      "step": 122
+    },
+    {
+      "epoch": 0.76875,
+      "grad_norm": 1.6382085084915161,
+      "learning_rate": 0.0002,
+      "loss": 0.043,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 498128.0,
+      "step": 123
+    },
+    {
+      "epoch": 0.775,
+      "grad_norm": 0.3425021171569824,
+      "learning_rate": 0.0002,
+      "loss": 0.0069,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 501184.0,
+      "step": 124
+    },
+    {
+      "epoch": 0.78125,
+      "grad_norm": 1.504239559173584,
+      "learning_rate": 0.0002,
+      "loss": 0.0348,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 504240.0,
+      "step": 125
+    },
+    {
+      "epoch": 0.7875,
+      "grad_norm": 0.16000418365001678,
+      "learning_rate": 0.0002,
+      "loss": 0.0071,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 507296.0,
+      "step": 126
+    },
+    {
+      "epoch": 0.79375,
+      "grad_norm": 0.48438990116119385,
+      "learning_rate": 0.0002,
+      "loss": 0.0153,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 510352.0,
+      "step": 127
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.3561486601829529,
+      "learning_rate": 0.0002,
+      "loss": 0.0071,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 513408.0,
+      "step": 128
+    },
+    {
+      "epoch": 0.80625,
+      "grad_norm": 3.513791084289551,
+      "learning_rate": 0.0002,
+      "loss": 0.114,
+      "mean_token_accuracy": 0.9375,
+      "num_tokens": 516464.0,
+      "step": 129
+    },
+    {
+      "epoch": 0.8125,
+      "grad_norm": 0.7476550340652466,
+      "learning_rate": 0.0002,
+      "loss": 0.027,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 519520.0,
+      "step": 130
+    },
+    {
+      "epoch": 0.81875,
+      "grad_norm": 4.814530372619629,
+      "learning_rate": 0.0002,
+      "loss": 0.2155,
+      "mean_token_accuracy": 0.9375,
+      "num_tokens": 522576.0,
+      "step": 131
+    },
+    {
+      "epoch": 0.825,
+      "grad_norm": 4.203996658325195,
+      "learning_rate": 0.0002,
+      "loss": 0.0732,
+      "mean_token_accuracy": 0.9375,
+      "num_tokens": 525632.0,
+      "step": 132
+    },
+    {
+      "epoch": 0.83125,
+      "grad_norm": 3.171036958694458,
+      "learning_rate": 0.0002,
+      "loss": 0.0479,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 528688.0,
+      "step": 133
+    },
+    {
+      "epoch": 0.8375,
+      "grad_norm": 3.515819549560547,
+      "learning_rate": 0.0002,
+      "loss": 0.1001,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 531744.0,
+      "step": 134
+    },
+    {
+      "epoch": 0.84375,
+      "grad_norm": 1.6061947345733643,
+      "learning_rate": 0.0002,
+      "loss": 0.0795,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 534800.0,
+      "step": 135
+    },
+    {
+      "epoch": 0.85,
+      "grad_norm": 0.5217976570129395,
+      "learning_rate": 0.0002,
+      "loss": 0.0143,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 537856.0,
+      "step": 136
+    },
+    {
+      "epoch": 0.85625,
+      "grad_norm": 1.6569266319274902,
+      "learning_rate": 0.0002,
+      "loss": 0.025,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 540912.0,
+      "step": 137
+    },
+    {
+      "epoch": 0.8625,
+      "grad_norm": 2.0939252376556396,
+      "learning_rate": 0.0002,
+      "loss": 0.0375,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 543968.0,
+      "step": 138
+    },
+    {
+      "epoch": 0.86875,
+      "grad_norm": 1.8286008834838867,
+      "learning_rate": 0.0002,
+      "loss": 0.0464,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 547024.0,
+      "step": 139
+    },
+    {
+      "epoch": 0.875,
+      "grad_norm": 6.246391296386719,
+      "learning_rate": 0.0002,
+      "loss": 0.135,
+      "mean_token_accuracy": 0.9375,
+      "num_tokens": 550080.0,
+      "step": 140
+    },
+    {
+      "epoch": 0.88125,
+      "grad_norm": 0.9313927292823792,
+      "learning_rate": 0.0002,
+      "loss": 0.0437,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 553136.0,
+      "step": 141
+    },
+    {
+      "epoch": 0.8875,
+      "grad_norm": 3.962883472442627,
+      "learning_rate": 0.0002,
+      "loss": 0.1026,
+      "mean_token_accuracy": 0.9375,
+      "num_tokens": 556192.0,
+      "step": 142
+    },
+    {
+      "epoch": 0.89375,
+      "grad_norm": 4.730908393859863,
+      "learning_rate": 0.0002,
+      "loss": 0.104,
+      "mean_token_accuracy": 0.90625,
+      "num_tokens": 559248.0,
+      "step": 143
+    },
+    {
+      "epoch": 0.9,
+      "grad_norm": 4.035704135894775,
+      "learning_rate": 0.0002,
+      "loss": 0.16,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 562304.0,
+      "step": 144
+    },
+    {
+      "epoch": 0.90625,
+      "grad_norm": 5.092385768890381,
+      "learning_rate": 0.0002,
+      "loss": 0.1841,
+      "mean_token_accuracy": 0.9375,
+      "num_tokens": 565360.0,
+      "step": 145
+    },
+    {
+      "epoch": 0.9125,
+      "grad_norm": 6.0376386642456055,
+      "learning_rate": 0.0002,
+      "loss": 0.2612,
+      "mean_token_accuracy": 0.875,
+      "num_tokens": 568416.0,
+      "step": 146
+    },
+    {
+      "epoch": 0.91875,
+      "grad_norm": 3.7592973709106445,
+      "learning_rate": 0.0002,
+      "loss": 0.047,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 571472.0,
+      "step": 147
+    },
+    {
+      "epoch": 0.925,
+      "grad_norm": 0.9640989899635315,
+      "learning_rate": 0.0002,
+      "loss": 0.0236,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 574528.0,
+      "step": 148
+    },
+    {
+      "epoch": 0.93125,
+      "grad_norm": 1.7307640314102173,
+      "learning_rate": 0.0002,
+      "loss": 0.061,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 577584.0,
+      "step": 149
+    },
+    {
+      "epoch": 0.9375,
+      "grad_norm": 2.1097357273101807,
+      "learning_rate": 0.0002,
+      "loss": 0.3053,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 580640.0,
+      "step": 150
+    },
+    {
+      "epoch": 0.94375,
+      "grad_norm": 3.9413912296295166,
+      "learning_rate": 0.0002,
+      "loss": 0.1405,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 583696.0,
+      "step": 151
+    },
+    {
+      "epoch": 0.95,
+      "grad_norm": 3.341071128845215,
+      "learning_rate": 0.0002,
+      "loss": 0.1131,
+      "mean_token_accuracy": 0.9375,
+      "num_tokens": 586752.0,
+      "step": 152
+    },
+    {
+      "epoch": 0.95625,
+      "grad_norm": 1.0671991109848022,
+      "learning_rate": 0.0002,
+      "loss": 0.0147,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 589808.0,
+      "step": 153
+    },
+    {
+      "epoch": 0.9625,
+      "grad_norm": 6.068633556365967,
+      "learning_rate": 0.0002,
+      "loss": 0.0888,
+      "mean_token_accuracy": 0.9375,
+      "num_tokens": 592864.0,
+      "step": 154
+    },
+    {
+      "epoch": 0.96875,
+      "grad_norm": 1.0805017948150635,
+      "learning_rate": 0.0002,
+      "loss": 0.0136,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 595920.0,
+      "step": 155
+    },
+    {
+      "epoch": 0.975,
+      "grad_norm": 0.40108004212379456,
+      "learning_rate": 0.0002,
+      "loss": 0.0091,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 598976.0,
+      "step": 156
+    },
+    {
+      "epoch": 0.98125,
+      "grad_norm": 0.7008132934570312,
+      "learning_rate": 0.0002,
+      "loss": 0.011,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 602032.0,
+      "step": 157
+    },
+    {
+      "epoch": 0.9875,
+      "grad_norm": 0.42773208022117615,
+      "learning_rate": 0.0002,
+      "loss": 0.0063,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 605088.0,
+      "step": 158
+    },
+    {
+      "epoch": 0.99375,
+      "grad_norm": 0.9735352396965027,
+      "learning_rate": 0.0002,
+      "loss": 0.014,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 608144.0,
+      "step": 159
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 2.4267780780792236,
+      "learning_rate": 0.0002,
+      "loss": 0.0344,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 611200.0,
+      "step": 160
+    },
+    {
+      "epoch": 1.00625,
+      "grad_norm": 4.375260829925537,
+      "learning_rate": 0.0002,
+      "loss": 0.0981,
+      "mean_token_accuracy": 0.9375,
+      "num_tokens": 614256.0,
+      "step": 161
+    },
+    {
+      "epoch": 1.0125,
+      "grad_norm": 2.6353538036346436,
+      "learning_rate": 0.0002,
+      "loss": 0.1965,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 617312.0,
+      "step": 162
+    },
+    {
+      "epoch": 1.01875,
+      "grad_norm": 2.906750440597534,
+      "learning_rate": 0.0002,
+      "loss": 0.0887,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 620368.0,
+      "step": 163
+    },
+    {
+      "epoch": 1.025,
+      "grad_norm": 2.4843482971191406,
+      "learning_rate": 0.0002,
+      "loss": 0.0409,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 623424.0,
+      "step": 164
+    },
+    {
+      "epoch": 1.03125,
+      "grad_norm": 2.4637041091918945,
+      "learning_rate": 0.0002,
+      "loss": 0.0588,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 626480.0,
+      "step": 165
+    },
+    {
+      "epoch": 1.0375,
+      "grad_norm": 3.256136655807495,
+      "learning_rate": 0.0002,
+      "loss": 0.0632,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 629536.0,
+      "step": 166
+    },
+    {
+      "epoch": 1.04375,
+      "grad_norm": 3.7600295543670654,
+      "learning_rate": 0.0002,
+      "loss": 0.0903,
+      "mean_token_accuracy": 0.9375,
+      "num_tokens": 632592.0,
+      "step": 167
+    },
+    {
+      "epoch": 1.05,
+      "grad_norm": 4.013559341430664,
+      "learning_rate": 0.0002,
+      "loss": 0.0796,
+      "mean_token_accuracy": 0.9375,
+      "num_tokens": 635648.0,
+      "step": 168
+    },
+    {
+      "epoch": 1.05625,
+      "grad_norm": 1.3275935649871826,
+      "learning_rate": 0.0002,
+      "loss": 0.0415,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 638704.0,
+      "step": 169
+    },
+    {
+      "epoch": 1.0625,
+      "grad_norm": 3.0830256938934326,
+      "learning_rate": 0.0002,
+      "loss": 0.0989,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 641760.0,
+      "step": 170
+    },
+    {
+      "epoch": 1.06875,
+      "grad_norm": 5.780794620513916,
+      "learning_rate": 0.0002,
+      "loss": 0.0974,
+      "mean_token_accuracy": 0.9375,
+      "num_tokens": 644816.0,
+      "step": 171
+    },
+    {
+      "epoch": 1.075,
+      "grad_norm": 1.1954656839370728,
+      "learning_rate": 0.0002,
+      "loss": 0.0363,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 647872.0,
+      "step": 172
+    },
+    {
+      "epoch": 1.08125,
+      "grad_norm": 0.6503368616104126,
+      "learning_rate": 0.0002,
+      "loss": 0.0086,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 650928.0,
+      "step": 173
+    },
+    {
+      "epoch": 1.0875,
+      "grad_norm": 0.2741992771625519,
+      "learning_rate": 0.0002,
+      "loss": 0.005,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 653984.0,
+      "step": 174
+    },
+    {
+      "epoch": 1.09375,
+      "grad_norm": 4.483480930328369,
+      "learning_rate": 0.0002,
+      "loss": 0.1765,
+      "mean_token_accuracy": 0.9375,
+      "num_tokens": 657040.0,
+      "step": 175
+    },
+    {
+      "epoch": 1.1,
+      "grad_norm": 2.0801796913146973,
+      "learning_rate": 0.0002,
+      "loss": 0.2432,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 660096.0,
+      "step": 176
+    },
+    {
+      "epoch": 1.10625,
+      "grad_norm": 4.300808906555176,
+      "learning_rate": 0.0002,
+      "loss": 0.0805,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 663152.0,
+      "step": 177
+    },
+    {
+      "epoch": 1.1125,
+      "grad_norm": 1.7215920686721802,
+      "learning_rate": 0.0002,
+      "loss": 0.0227,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 666208.0,
+      "step": 178
+    },
+    {
+      "epoch": 1.11875,
+      "grad_norm": 3.1990017890930176,
+      "learning_rate": 0.0002,
+      "loss": 0.3566,
+      "mean_token_accuracy": 0.9375,
+      "num_tokens": 669264.0,
+      "step": 179
+    },
+    {
+      "epoch": 1.125,
+      "grad_norm": 1.593938946723938,
+      "learning_rate": 0.0002,
+      "loss": 0.0232,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 672320.0,
+      "step": 180
+    },
+    {
+      "epoch": 1.13125,
+      "grad_norm": 5.725439071655273,
+      "learning_rate": 0.0002,
+      "loss": 0.1591,
+      "mean_token_accuracy": 0.9375,
+      "num_tokens": 675376.0,
+      "step": 181
+    },
+    {
+      "epoch": 1.1375,
+      "grad_norm": 4.107408046722412,
+      "learning_rate": 0.0002,
+      "loss": 0.0734,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 678432.0,
+      "step": 182
+    },
+    {
+      "epoch": 1.14375,
+      "grad_norm": 0.5303417444229126,
+      "learning_rate": 0.0002,
+      "loss": 0.0099,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 681488.0,
+      "step": 183
+    },
+    {
+      "epoch": 1.15,
+      "grad_norm": 0.7143228650093079,
+      "learning_rate": 0.0002,
+      "loss": 0.0127,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 684544.0,
+      "step": 184
+    },
+    {
+      "epoch": 1.15625,
+      "grad_norm": 0.10430416464805603,
+      "learning_rate": 0.0002,
+      "loss": 0.0028,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 687600.0,
+      "step": 185
+    },
+    {
+      "epoch": 1.1625,
+      "grad_norm": 0.10262572765350342,
+      "learning_rate": 0.0002,
+      "loss": 0.0022,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 690656.0,
+      "step": 186
+    },
+    {
+      "epoch": 1.16875,
+      "grad_norm": 0.5091086626052856,
+      "learning_rate": 0.0002,
+      "loss": 0.0082,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 693712.0,
+      "step": 187
+    },
+    {
+      "epoch": 1.175,
+      "grad_norm": 0.32739120721817017,
+      "learning_rate": 0.0002,
+      "loss": 0.006,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 696768.0,
+      "step": 188
+    },
+    {
+      "epoch": 1.18125,
+      "grad_norm": 2.958453416824341,
+      "learning_rate": 0.0002,
+      "loss": 0.0506,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 699824.0,
+      "step": 189
+    },
+    {
+      "epoch": 1.1875,
+      "grad_norm": 0.6987707018852234,
+      "learning_rate": 0.0002,
+      "loss": 0.0107,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 702880.0,
+      "step": 190
+    },
+    {
+      "epoch": 1.19375,
+      "grad_norm": 1.3608295917510986,
+      "learning_rate": 0.0002,
+      "loss": 0.0287,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 705936.0,
+      "step": 191
+    },
+    {
+      "epoch": 1.2,
+      "grad_norm": 0.14391912519931793,
+      "learning_rate": 0.0002,
+      "loss": 0.0021,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 708992.0,
+      "step": 192
+    },
+    {
+      "epoch": 1.20625,
+      "grad_norm": 2.5519983768463135,
+      "learning_rate": 0.0002,
+      "loss": 0.1491,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 712048.0,
+      "step": 193
+    },
+    {
+      "epoch": 1.2125,
+      "grad_norm": 6.5541253089904785,
+      "learning_rate": 0.0002,
+      "loss": 0.1414,
+      "mean_token_accuracy": 0.9375,
+      "num_tokens": 715104.0,
+      "step": 194
+    },
+    {
+      "epoch": 1.21875,
+      "grad_norm": 0.2820110619068146,
+      "learning_rate": 0.0002,
+      "loss": 0.003,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 718160.0,
+      "step": 195
+    },
+    {
+      "epoch": 1.225,
+      "grad_norm": 2.9799859523773193,
+      "learning_rate": 0.0002,
+      "loss": 0.0506,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 721216.0,
+      "step": 196
+    },
+    {
+      "epoch": 1.23125,
+      "grad_norm": 1.1640214920043945,
+      "learning_rate": 0.0002,
+      "loss": 0.0182,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 724272.0,
+      "step": 197
+    },
+    {
+      "epoch": 1.2375,
+      "grad_norm": 2.498863935470581,
+      "learning_rate": 0.0002,
+      "loss": 0.073,
+      "mean_token_accuracy": 0.9375,
+      "num_tokens": 727328.0,
+      "step": 198
+    },
+    {
+      "epoch": 1.24375,
+      "grad_norm": 1.2676233053207397,
+      "learning_rate": 0.0002,
+      "loss": 0.0265,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 730384.0,
+      "step": 199
+    },
+    {
+      "epoch": 1.25,
+      "grad_norm": 0.3039199113845825,
+      "learning_rate": 0.0002,
+      "loss": 0.0079,
+      "step": 200
+    },
+    {
+      "epoch": 1.25,
+      "eval_loss": 0.07848864793777466,
+      "eval_mean_token_accuracy": 0.9703125,
+      "eval_model_preparation_time": 0.0042,
+      "eval_num_tokens": 733440.0,
+      "eval_runtime": 71.4875,
+      "eval_samples_per_second": 8.953,
+      "eval_steps_per_second": 0.56,
+      "step": 200
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 320,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 200,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 2.84289248722944e+16,
+  "train_batch_size": 16,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoint-200/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a1e17f907302d781eecf6b58e9d4136a88ad4089312e1b26d13d23c8ff5af5cb
+size 6161

checkpoint-200/vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoint-320/README.md ADDED Viewed

	@@ -0,0 +1,202 @@

+---
+base_model: Qwen/Qwen2.5-VL-7B-Instruct
+library_name: peft
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.13.2

checkpoint-320/adapter_config.json ADDED Viewed

	@@ -0,0 +1,29 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "Qwen/Qwen2.5-VL-7B-Instruct",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "v_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

checkpoint-320/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:530fabf0be943088618405f86cd5303d922e0e5f96f279fad4af24d343946040
+size 20200056

checkpoint-320/added_tokens.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "</tool_call>": 151658,
+  "<tool_call>": 151657,
+  "<|box_end|>": 151649,
+  "<|box_start|>": 151648,
+  "<|endoftext|>": 151643,
+  "<|file_sep|>": 151664,
+  "<|fim_middle|>": 151660,
+  "<|fim_pad|>": 151662,
+  "<|fim_prefix|>": 151659,
+  "<|fim_suffix|>": 151661,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644,
+  "<|image_pad|>": 151655,
+  "<|object_ref_end|>": 151647,
+  "<|object_ref_start|>": 151646,
+  "<|quad_end|>": 151651,
+  "<|quad_start|>": 151650,
+  "<|repo_name|>": 151663,
+  "<|video_pad|>": 151656,
+  "<|vision_end|>": 151653,
+  "<|vision_pad|>": 151654,
+  "<|vision_start|>": 151652
+}

checkpoint-320/chat_template.jinja ADDED Viewed

	@@ -0,0 +1,7 @@

+{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system
+You are a helpful assistant.<|im_end|>
+{% endif %}<|im_start|>{{ message['role'] }}
+{% if message['content'] is string %}{{ message['content'] }}<|im_end|>
+{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>
+{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant
+{% endif %}

checkpoint-320/merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoint-320/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:75dcacc1adf3aa4ab6c7c8b31873a37f7e0cc4f47b760b246fdd98985afe218e
+size 40466443

checkpoint-320/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:819b28a03a83ccada450e3d740a066905c699b36484adb52e93a4911c6ad9bc4
+size 14645

checkpoint-320/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1a2a804a95a4754b7694d7b5887b5c0f89e1280617b9486e1f9fd471f0f33b52
+size 1465

checkpoint-320/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,31 @@

+{
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "eos_token": {
+    "content": "<|im_end|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

checkpoint-320/tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9c5ae00e602b8860cbd784ba82a8aa14e8feecec692e7076590d014d7b7fdafa
+size 11421896

checkpoint-320/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,207 @@

+{
+  "add_bos_token": false,
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151645": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151646": {
+      "content": "<|object_ref_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151647": {
+      "content": "<|object_ref_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151648": {
+      "content": "<|box_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151649": {
+      "content": "<|box_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151650": {
+      "content": "<|quad_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151651": {
+      "content": "<|quad_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151652": {
+      "content": "<|vision_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151653": {
+      "content": "<|vision_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151654": {
+      "content": "<|vision_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151655": {
+      "content": "<|image_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151656": {
+      "content": "<|video_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151657": {
+      "content": "<tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151658": {
+      "content": "</tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151659": {
+      "content": "<|fim_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151660": {
+      "content": "<|fim_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151661": {
+      "content": "<|fim_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151662": {
+      "content": "<|fim_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151663": {
+      "content": "<|repo_name|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151664": {
+      "content": "<|file_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    }
+  },
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "bos_token": null,
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "extra_special_tokens": {},
+  "model_max_length": 131072,
+  "pad_token": "<|endoftext|>",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}

checkpoint-320/trainer_state.json ADDED Viewed

	@@ -0,0 +1,2923 @@

+{
+  "best_global_step": 200,
+  "best_metric": 0.07848864793777466,
+  "best_model_checkpoint": "sft_prefill/prompt_id_3/qwen2-VL-7B-Instruct-syn-count-lora/checkpoint-200",
+  "epoch": 2.0,
+  "eval_steps": 200,
+  "global_step": 320,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.00625,
+      "grad_norm": 0.9298801422119141,
+      "learning_rate": 0.0002,
+      "loss": 0.4917,
+      "mean_token_accuracy": 0.6875,
+      "num_tokens": 125296.0,
+      "step": 1
+    },
+    {
+      "epoch": 0.0125,
+      "grad_norm": 0.8382226824760437,
+      "learning_rate": 0.0002,
+      "loss": 0.6049,
+      "mean_token_accuracy": 0.75,
+      "num_tokens": 128352.0,
+      "step": 2
+    },
+    {
+      "epoch": 0.01875,
+      "grad_norm": 0.9454999566078186,
+      "learning_rate": 0.0002,
+      "loss": 0.6407,
+      "mean_token_accuracy": 0.65625,
+      "num_tokens": 131408.0,
+      "step": 3
+    },
+    {
+      "epoch": 0.025,
+      "grad_norm": 0.9727649092674255,
+      "learning_rate": 0.0002,
+      "loss": 0.3972,
+      "mean_token_accuracy": 0.90625,
+      "num_tokens": 134464.0,
+      "step": 4
+    },
+    {
+      "epoch": 0.03125,
+      "grad_norm": 0.785910964012146,
+      "learning_rate": 0.0002,
+      "loss": 0.453,
+      "mean_token_accuracy": 0.84375,
+      "num_tokens": 137520.0,
+      "step": 5
+    },
+    {
+      "epoch": 0.0375,
+      "grad_norm": 0.9556392431259155,
+      "learning_rate": 0.0002,
+      "loss": 0.418,
+      "mean_token_accuracy": 0.8125,
+      "num_tokens": 140576.0,
+      "step": 6
+    },
+    {
+      "epoch": 0.04375,
+      "grad_norm": 1.160456657409668,
+      "learning_rate": 0.0002,
+      "loss": 0.303,
+      "mean_token_accuracy": 0.875,
+      "num_tokens": 143632.0,
+      "step": 7
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 1.127308964729309,
+      "learning_rate": 0.0002,
+      "loss": 0.1847,
+      "mean_token_accuracy": 0.9375,
+      "num_tokens": 146688.0,
+      "step": 8
+    },
+    {
+      "epoch": 0.05625,
+      "grad_norm": 1.7690690755844116,
+      "learning_rate": 0.0002,
+      "loss": 0.3514,
+      "mean_token_accuracy": 0.84375,
+      "num_tokens": 149744.0,
+      "step": 9
+    },
+    {
+      "epoch": 0.0625,
+      "grad_norm": 0.8747241497039795,
+      "learning_rate": 0.0002,
+      "loss": 0.2434,
+      "mean_token_accuracy": 0.84375,
+      "num_tokens": 152800.0,
+      "step": 10
+    },
+    {
+      "epoch": 0.06875,
+      "grad_norm": 0.7658981084823608,
+      "learning_rate": 0.0002,
+      "loss": 0.1577,
+      "mean_token_accuracy": 0.9375,
+      "num_tokens": 155856.0,
+      "step": 11
+    },
+    {
+      "epoch": 0.075,
+      "grad_norm": 0.6385300755500793,
+      "learning_rate": 0.0002,
+      "loss": 0.209,
+      "mean_token_accuracy": 0.875,
+      "num_tokens": 158912.0,
+      "step": 12
+    },
+    {
+      "epoch": 0.08125,
+      "grad_norm": 1.2857609987258911,
+      "learning_rate": 0.0002,
+      "loss": 0.2123,
+      "mean_token_accuracy": 0.875,
+      "num_tokens": 161968.0,
+      "step": 13
+    },
+    {
+      "epoch": 0.0875,
+      "grad_norm": 0.5603316426277161,
+      "learning_rate": 0.0002,
+      "loss": 0.2712,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 165024.0,
+      "step": 14
+    },
+    {
+      "epoch": 0.09375,
+      "grad_norm": 1.0855859518051147,
+      "learning_rate": 0.0002,
+      "loss": 0.2975,
+      "mean_token_accuracy": 0.875,
+      "num_tokens": 168080.0,
+      "step": 15
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 2.589662790298462,
+      "learning_rate": 0.0002,
+      "loss": 0.3977,
+      "mean_token_accuracy": 0.78125,
+      "num_tokens": 171136.0,
+      "step": 16
+    },
+    {
+      "epoch": 0.10625,
+      "grad_norm": 1.242841124534607,
+      "learning_rate": 0.0002,
+      "loss": 0.3933,
+      "mean_token_accuracy": 0.90625,
+      "num_tokens": 174192.0,
+      "step": 17
+    },
+    {
+      "epoch": 0.1125,
+      "grad_norm": 1.0371448993682861,
+      "learning_rate": 0.0002,
+      "loss": 0.2866,
+      "mean_token_accuracy": 0.875,
+      "num_tokens": 177248.0,
+      "step": 18
+    },
+    {
+      "epoch": 0.11875,
+      "grad_norm": 2.609177827835083,
+      "learning_rate": 0.0002,
+      "loss": 0.4174,
+      "mean_token_accuracy": 0.75,
+      "num_tokens": 180304.0,
+      "step": 19
+    },
+    {
+      "epoch": 0.125,
+      "grad_norm": 1.229350209236145,
+      "learning_rate": 0.0002,
+      "loss": 0.3588,
+      "mean_token_accuracy": 0.84375,
+      "num_tokens": 183360.0,
+      "step": 20
+    },
+    {
+      "epoch": 0.13125,
+      "grad_norm": 1.2149115800857544,
+      "learning_rate": 0.0002,
+      "loss": 0.1919,
+      "mean_token_accuracy": 0.9375,
+      "num_tokens": 186416.0,
+      "step": 21
+    },
+    {
+      "epoch": 0.1375,
+      "grad_norm": 3.0612738132476807,
+      "learning_rate": 0.0002,
+      "loss": 0.475,
+      "mean_token_accuracy": 0.84375,
+      "num_tokens": 189472.0,
+      "step": 22
+    },
+    {
+      "epoch": 0.14375,
+      "grad_norm": 1.1256693601608276,
+      "learning_rate": 0.0002,
+      "loss": 0.223,
+      "mean_token_accuracy": 0.9375,
+      "num_tokens": 192528.0,
+      "step": 23
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 1.5483283996582031,
+      "learning_rate": 0.0002,
+      "loss": 0.3825,
+      "mean_token_accuracy": 0.78125,
+      "num_tokens": 195584.0,
+      "step": 24
+    },
+    {
+      "epoch": 0.15625,
+      "grad_norm": 0.8419892191886902,
+      "learning_rate": 0.0002,
+      "loss": 0.1645,
+      "mean_token_accuracy": 0.875,
+      "num_tokens": 198640.0,
+      "step": 25
+    },
+    {
+      "epoch": 0.1625,
+      "grad_norm": 1.5078842639923096,
+      "learning_rate": 0.0002,
+      "loss": 0.2517,
+      "mean_token_accuracy": 0.9375,
+      "num_tokens": 201696.0,
+      "step": 26
+    },
+    {
+      "epoch": 0.16875,
+      "grad_norm": 0.7717946171760559,
+      "learning_rate": 0.0002,
+      "loss": 0.1865,
+      "mean_token_accuracy": 0.9375,
+      "num_tokens": 204752.0,
+      "step": 27
+    },
+    {
+      "epoch": 0.175,
+      "grad_norm": 0.7887358069419861,
+      "learning_rate": 0.0002,
+      "loss": 0.2887,
+      "mean_token_accuracy": 0.90625,
+      "num_tokens": 207808.0,
+      "step": 28
+    },
+    {
+      "epoch": 0.18125,
+      "grad_norm": 1.0774492025375366,
+      "learning_rate": 0.0002,
+      "loss": 0.2154,
+      "mean_token_accuracy": 0.90625,
+      "num_tokens": 210864.0,
+      "step": 29
+    },
+    {
+      "epoch": 0.1875,
+      "grad_norm": 1.2108778953552246,
+      "learning_rate": 0.0002,
+      "loss": 0.3605,
+      "mean_token_accuracy": 0.84375,
+      "num_tokens": 213920.0,
+      "step": 30
+    },
+    {
+      "epoch": 0.19375,
+      "grad_norm": 0.4569832980632782,
+      "learning_rate": 0.0002,
+      "loss": 0.1299,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 216976.0,
+      "step": 31
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.789750337600708,
+      "learning_rate": 0.0002,
+      "loss": 0.1772,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 220032.0,
+      "step": 32
+    },
+    {
+      "epoch": 0.20625,
+      "grad_norm": 1.5008147954940796,
+      "learning_rate": 0.0002,
+      "loss": 0.3281,
+      "mean_token_accuracy": 0.84375,
+      "num_tokens": 223088.0,
+      "step": 33
+    },
+    {
+      "epoch": 0.2125,
+      "grad_norm": 1.082711935043335,
+      "learning_rate": 0.0002,
+      "loss": 0.1973,
+      "mean_token_accuracy": 0.9375,
+      "num_tokens": 226144.0,
+      "step": 34
+    },
+    {
+      "epoch": 0.21875,
+      "grad_norm": 0.6501540541648865,
+      "learning_rate": 0.0002,
+      "loss": 0.2645,
+      "mean_token_accuracy": 0.9375,
+      "num_tokens": 229200.0,
+      "step": 35
+    },
+    {
+      "epoch": 0.225,
+      "grad_norm": 0.6396588087081909,
+      "learning_rate": 0.0002,
+      "loss": 0.1938,
+      "mean_token_accuracy": 0.90625,
+      "num_tokens": 232256.0,
+      "step": 36
+    },
+    {
+      "epoch": 0.23125,
+      "grad_norm": 0.7765557169914246,
+      "learning_rate": 0.0002,
+      "loss": 0.1577,
+      "mean_token_accuracy": 0.90625,
+      "num_tokens": 235312.0,
+      "step": 37
+    },
+    {
+      "epoch": 0.2375,
+      "grad_norm": 1.0910521745681763,
+      "learning_rate": 0.0002,
+      "loss": 0.1917,
+      "mean_token_accuracy": 0.875,
+      "num_tokens": 238368.0,
+      "step": 38
+    },
+    {
+      "epoch": 0.24375,
+      "grad_norm": 0.7262992262840271,
+      "learning_rate": 0.0002,
+      "loss": 0.0703,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 241424.0,
+      "step": 39
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 0.598296582698822,
+      "learning_rate": 0.0002,
+      "loss": 0.1402,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 244480.0,
+      "step": 40
+    },
+    {
+      "epoch": 0.25625,
+      "grad_norm": 1.0417550802230835,
+      "learning_rate": 0.0002,
+      "loss": 0.1274,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 247536.0,
+      "step": 41
+    },
+    {
+      "epoch": 0.2625,
+      "grad_norm": 1.5009475946426392,
+      "learning_rate": 0.0002,
+      "loss": 0.1871,
+      "mean_token_accuracy": 0.9375,
+      "num_tokens": 250592.0,
+      "step": 42
+    },
+    {
+      "epoch": 0.26875,
+      "grad_norm": 2.052367925643921,
+      "learning_rate": 0.0002,
+      "loss": 0.2981,
+      "mean_token_accuracy": 0.875,
+      "num_tokens": 253648.0,
+      "step": 43
+    },
+    {
+      "epoch": 0.275,
+      "grad_norm": 0.6979355216026306,
+      "learning_rate": 0.0002,
+      "loss": 0.0962,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 256704.0,
+      "step": 44
+    },
+    {
+      "epoch": 0.28125,
+      "grad_norm": 1.2591170072555542,
+      "learning_rate": 0.0002,
+      "loss": 0.1616,
+      "mean_token_accuracy": 0.9375,
+      "num_tokens": 259760.0,
+      "step": 45
+    },
+    {
+      "epoch": 0.2875,
+      "grad_norm": 0.5454805493354797,
+      "learning_rate": 0.0002,
+      "loss": 0.0614,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 262816.0,
+      "step": 46
+    },
+    {
+      "epoch": 0.29375,
+      "grad_norm": 1.3832871913909912,
+      "learning_rate": 0.0002,
+      "loss": 0.1468,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 265872.0,
+      "step": 47
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 0.5263135433197021,
+      "learning_rate": 0.0002,
+      "loss": 0.0482,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 268928.0,
+      "step": 48
+    },
+    {
+      "epoch": 0.30625,
+      "grad_norm": 1.7050193548202515,
+      "learning_rate": 0.0002,
+      "loss": 0.1177,
+      "mean_token_accuracy": 0.9375,
+      "num_tokens": 271984.0,
+      "step": 49
+    },
+    {
+      "epoch": 0.3125,
+      "grad_norm": 1.4185047149658203,
+      "learning_rate": 0.0002,
+      "loss": 0.0788,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 275040.0,
+      "step": 50
+    },
+    {
+      "epoch": 0.31875,
+      "grad_norm": 1.192116379737854,
+      "learning_rate": 0.0002,
+      "loss": 0.1786,
+      "mean_token_accuracy": 0.90625,
+      "num_tokens": 278096.0,
+      "step": 51
+    },
+    {
+      "epoch": 0.325,
+      "grad_norm": 3.244797945022583,
+      "learning_rate": 0.0002,
+      "loss": 0.1685,
+      "mean_token_accuracy": 0.9375,
+      "num_tokens": 281152.0,
+      "step": 52
+    },
+    {
+      "epoch": 0.33125,
+      "grad_norm": 2.192440986633301,
+      "learning_rate": 0.0002,
+      "loss": 0.1178,
+      "mean_token_accuracy": 0.9375,
+      "num_tokens": 284208.0,
+      "step": 53
+    },
+    {
+      "epoch": 0.3375,
+      "grad_norm": 2.3711836338043213,
+      "learning_rate": 0.0002,
+      "loss": 0.1016,
+      "mean_token_accuracy": 0.9375,
+      "num_tokens": 287264.0,
+      "step": 54
+    },
+    {
+      "epoch": 0.34375,
+      "grad_norm": 2.2536869049072266,
+      "learning_rate": 0.0002,
+      "loss": 0.1223,
+      "mean_token_accuracy": 0.9375,
+      "num_tokens": 290320.0,
+      "step": 55
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 1.4965548515319824,
+      "learning_rate": 0.0002,
+      "loss": 0.0331,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 293376.0,
+      "step": 56
+    },
+    {
+      "epoch": 0.35625,
+      "grad_norm": 2.7128384113311768,
+      "learning_rate": 0.0002,
+      "loss": 0.1314,
+      "mean_token_accuracy": 0.9375,
+      "num_tokens": 296432.0,
+      "step": 57
+    },
+    {
+      "epoch": 0.3625,
+      "grad_norm": 1.1565158367156982,
+      "learning_rate": 0.0002,
+      "loss": 0.0504,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 299488.0,
+      "step": 58
+    },
+    {
+      "epoch": 0.36875,
+      "grad_norm": 1.363576054573059,
+      "learning_rate": 0.0002,
+      "loss": 0.0327,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 302544.0,
+      "step": 59
+    },
+    {
+      "epoch": 0.375,
+      "grad_norm": 1.7460052967071533,
+      "learning_rate": 0.0002,
+      "loss": 0.0766,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 305600.0,
+      "step": 60
+    },
+    {
+      "epoch": 0.38125,
+      "grad_norm": 0.4773832857608795,
+      "learning_rate": 0.0002,
+      "loss": 0.0167,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 308656.0,
+      "step": 61
+    },
+    {
+      "epoch": 0.3875,
+      "grad_norm": 0.5730915665626526,
+      "learning_rate": 0.0002,
+      "loss": 0.0176,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 311712.0,
+      "step": 62
+    },
+    {
+      "epoch": 0.39375,
+      "grad_norm": 3.6676278114318848,
+      "learning_rate": 0.0002,
+      "loss": 0.1523,
+      "mean_token_accuracy": 0.9375,
+      "num_tokens": 314768.0,
+      "step": 63
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 1.425026535987854,
+      "learning_rate": 0.0002,
+      "loss": 0.0172,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 317824.0,
+      "step": 64
+    },
+    {
+      "epoch": 0.40625,
+      "grad_norm": 2.4328842163085938,
+      "learning_rate": 0.0002,
+      "loss": 0.1337,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 320880.0,
+      "step": 65
+    },
+    {
+      "epoch": 0.4125,
+      "grad_norm": 2.682482957839966,
+      "learning_rate": 0.0002,
+      "loss": 0.18,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 323936.0,
+      "step": 66
+    },
+    {
+      "epoch": 0.41875,
+      "grad_norm": 0.41104015707969666,
+      "learning_rate": 0.0002,
+      "loss": 0.0076,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 326992.0,
+      "step": 67
+    },
+    {
+      "epoch": 0.425,
+      "grad_norm": 0.22206450998783112,
+      "learning_rate": 0.0002,
+      "loss": 0.0051,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 330048.0,
+      "step": 68
+    },
+    {
+      "epoch": 0.43125,
+      "grad_norm": 1.8949733972549438,
+      "learning_rate": 0.0002,
+      "loss": 0.0408,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 333104.0,
+      "step": 69
+    },
+    {
+      "epoch": 0.4375,
+      "grad_norm": 3.621080160140991,
+      "learning_rate": 0.0002,
+      "loss": 0.0744,
+      "mean_token_accuracy": 0.9375,
+      "num_tokens": 336160.0,
+      "step": 70
+    },
+    {
+      "epoch": 0.44375,
+      "grad_norm": 0.6198443174362183,
+      "learning_rate": 0.0002,
+      "loss": 0.0123,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 339216.0,
+      "step": 71
+    },
+    {
+      "epoch": 0.45,
+      "grad_norm": 0.46570342779159546,
+      "learning_rate": 0.0002,
+      "loss": 0.006,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 342272.0,
+      "step": 72
+    },
+    {
+      "epoch": 0.45625,
+      "grad_norm": 4.823789596557617,
+      "learning_rate": 0.0002,
+      "loss": 0.3016,
+      "mean_token_accuracy": 0.9375,
+      "num_tokens": 345328.0,
+      "step": 73
+    },
+    {
+      "epoch": 0.4625,
+      "grad_norm": 4.304416179656982,
+      "learning_rate": 0.0002,
+      "loss": 0.1201,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 348384.0,
+      "step": 74
+    },
+    {
+      "epoch": 0.46875,
+      "grad_norm": 1.780882716178894,
+      "learning_rate": 0.0002,
+      "loss": 0.037,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 351440.0,
+      "step": 75
+    },
+    {
+      "epoch": 0.475,
+      "grad_norm": 1.6134687662124634,
+      "learning_rate": 0.0002,
+      "loss": 0.0413,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 354496.0,
+      "step": 76
+    },
+    {
+      "epoch": 0.48125,
+      "grad_norm": 3.6872904300689697,
+      "learning_rate": 0.0002,
+      "loss": 0.0697,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 357552.0,
+      "step": 77
+    },
+    {
+      "epoch": 0.4875,
+      "grad_norm": 3.585052728652954,
+      "learning_rate": 0.0002,
+      "loss": 0.0485,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 360608.0,
+      "step": 78
+    },
+    {
+      "epoch": 0.49375,
+      "grad_norm": 3.138762950897217,
+      "learning_rate": 0.0002,
+      "loss": 0.0686,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 363664.0,
+      "step": 79
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 3.895049571990967,
+      "learning_rate": 0.0002,
+      "loss": 0.0649,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 366720.0,
+      "step": 80
+    },
+    {
+      "epoch": 0.50625,
+      "grad_norm": 3.5866286754608154,
+      "learning_rate": 0.0002,
+      "loss": 0.1002,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 369776.0,
+      "step": 81
+    },
+    {
+      "epoch": 0.5125,
+      "grad_norm": 2.039658546447754,
+      "learning_rate": 0.0002,
+      "loss": 0.0235,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 372832.0,
+      "step": 82
+    },
+    {
+      "epoch": 0.51875,
+      "grad_norm": 5.831839084625244,
+      "learning_rate": 0.0002,
+      "loss": 0.1955,
+      "mean_token_accuracy": 0.9375,
+      "num_tokens": 375888.0,
+      "step": 83
+    },
+    {
+      "epoch": 0.525,
+      "grad_norm": 3.6485228538513184,
+      "learning_rate": 0.0002,
+      "loss": 0.1692,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 378944.0,
+      "step": 84
+    },
+    {
+      "epoch": 0.53125,
+      "grad_norm": 1.0485862493515015,
+      "learning_rate": 0.0002,
+      "loss": 0.0178,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 382000.0,
+      "step": 85
+    },
+    {
+      "epoch": 0.5375,
+      "grad_norm": 2.2051405906677246,
+      "learning_rate": 0.0002,
+      "loss": 0.0282,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 385056.0,
+      "step": 86
+    },
+    {
+      "epoch": 0.54375,
+      "grad_norm": 1.2796984910964966,
+      "learning_rate": 0.0002,
+      "loss": 0.0191,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 388112.0,
+      "step": 87
+    },
+    {
+      "epoch": 0.55,
+      "grad_norm": 1.5338424444198608,
+      "learning_rate": 0.0002,
+      "loss": 0.0177,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 391168.0,
+      "step": 88
+    },
+    {
+      "epoch": 0.55625,
+      "grad_norm": 1.2561614513397217,
+      "learning_rate": 0.0002,
+      "loss": 0.0207,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 394224.0,
+      "step": 89
+    },
+    {
+      "epoch": 0.5625,
+      "grad_norm": 0.8224933743476868,
+      "learning_rate": 0.0002,
+      "loss": 0.0082,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 397280.0,
+      "step": 90
+    },
+    {
+      "epoch": 0.56875,
+      "grad_norm": 3.2320916652679443,
+      "learning_rate": 0.0002,
+      "loss": 0.0583,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 400336.0,
+      "step": 91
+    },
+    {
+      "epoch": 0.575,
+      "grad_norm": 7.057133197784424,
+      "learning_rate": 0.0002,
+      "loss": 0.259,
+      "mean_token_accuracy": 0.9375,
+      "num_tokens": 403392.0,
+      "step": 92
+    },
+    {
+      "epoch": 0.58125,
+      "grad_norm": 2.365320920944214,
+      "learning_rate": 0.0002,
+      "loss": 0.0295,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 406448.0,
+      "step": 93
+    },
+    {
+      "epoch": 0.5875,
+      "grad_norm": 0.06329375505447388,
+      "learning_rate": 0.0002,
+      "loss": 0.0014,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 409504.0,
+      "step": 94
+    },
+    {
+      "epoch": 0.59375,
+      "grad_norm": 1.554870367050171,
+      "learning_rate": 0.0002,
+      "loss": 0.024,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 412560.0,
+      "step": 95
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 2.3296000957489014,
+      "learning_rate": 0.0002,
+      "loss": 0.0795,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 415616.0,
+      "step": 96
+    },
+    {
+      "epoch": 0.60625,
+      "grad_norm": 6.749006271362305,
+      "learning_rate": 0.0002,
+      "loss": 0.1903,
+      "mean_token_accuracy": 0.90625,
+      "num_tokens": 418672.0,
+      "step": 97
+    },
+    {
+      "epoch": 0.6125,
+      "grad_norm": 10.803217887878418,
+      "learning_rate": 0.0002,
+      "loss": 0.2885,
+      "mean_token_accuracy": 0.90625,
+      "num_tokens": 421728.0,
+      "step": 98
+    },
+    {
+      "epoch": 0.61875,
+      "grad_norm": 2.8866775035858154,
+      "learning_rate": 0.0002,
+      "loss": 0.2661,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 424784.0,
+      "step": 99
+    },
+    {
+      "epoch": 0.625,
+      "grad_norm": 3.800149917602539,
+      "learning_rate": 0.0002,
+      "loss": 0.0502,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 427840.0,
+      "step": 100
+    },
+    {
+      "epoch": 0.63125,
+      "grad_norm": 0.4669257402420044,
+      "learning_rate": 0.0002,
+      "loss": 0.0041,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 430896.0,
+      "step": 101
+    },
+    {
+      "epoch": 0.6375,
+      "grad_norm": 3.8480825424194336,
+      "learning_rate": 0.0002,
+      "loss": 0.0947,
+      "mean_token_accuracy": 0.9375,
+      "num_tokens": 433952.0,
+      "step": 102
+    },
+    {
+      "epoch": 0.64375,
+      "grad_norm": 4.139023303985596,
+      "learning_rate": 0.0002,
+      "loss": 0.1131,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 437008.0,
+      "step": 103
+    },
+    {
+      "epoch": 0.65,
+      "grad_norm": 3.4455084800720215,
+      "learning_rate": 0.0002,
+      "loss": 0.1105,
+      "mean_token_accuracy": 0.9375,
+      "num_tokens": 440064.0,
+      "step": 104
+    },
+    {
+      "epoch": 0.65625,
+      "grad_norm": 2.1717028617858887,
+      "learning_rate": 0.0002,
+      "loss": 0.0794,
+      "mean_token_accuracy": 0.9375,
+      "num_tokens": 443120.0,
+      "step": 105
+    },
+    {
+      "epoch": 0.6625,
+      "grad_norm": 0.43985098600387573,
+      "learning_rate": 0.0002,
+      "loss": 0.0057,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 446176.0,
+      "step": 106
+    },
+    {
+      "epoch": 0.66875,
+      "grad_norm": 5.351183891296387,
+      "learning_rate": 0.0002,
+      "loss": 0.3953,
+      "mean_token_accuracy": 0.8125,
+      "num_tokens": 449232.0,
+      "step": 107
+    },
+    {
+      "epoch": 0.675,
+      "grad_norm": 4.6742048263549805,
+      "learning_rate": 0.0002,
+      "loss": 0.1099,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 452288.0,
+      "step": 108
+    },
+    {
+      "epoch": 0.68125,
+      "grad_norm": 4.851898670196533,
+      "learning_rate": 0.0002,
+      "loss": 0.2198,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 455344.0,
+      "step": 109
+    },
+    {
+      "epoch": 0.6875,
+      "grad_norm": 2.46817946434021,
+      "learning_rate": 0.0002,
+      "loss": 0.0586,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 458400.0,
+      "step": 110
+    },
+    {
+      "epoch": 0.69375,
+      "grad_norm": 5.507438659667969,
+      "learning_rate": 0.0002,
+      "loss": 0.3492,
+      "mean_token_accuracy": 0.9375,
+      "num_tokens": 461456.0,
+      "step": 111
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 0.21149688959121704,
+      "learning_rate": 0.0002,
+      "loss": 0.003,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 464512.0,
+      "step": 112
+    },
+    {
+      "epoch": 0.70625,
+      "grad_norm": 2.1346304416656494,
+      "learning_rate": 0.0002,
+      "loss": 0.0365,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 467568.0,
+      "step": 113
+    },
+    {
+      "epoch": 0.7125,
+      "grad_norm": 2.06246280670166,
+      "learning_rate": 0.0002,
+      "loss": 0.1228,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 470624.0,
+      "step": 114
+    },
+    {
+      "epoch": 0.71875,
+      "grad_norm": 1.1220771074295044,
+      "learning_rate": 0.0002,
+      "loss": 0.0174,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 473680.0,
+      "step": 115
+    },
+    {
+      "epoch": 0.725,
+      "grad_norm": 1.283275842666626,
+      "learning_rate": 0.0002,
+      "loss": 0.0473,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 476736.0,
+      "step": 116
+    },
+    {
+      "epoch": 0.73125,
+      "grad_norm": 3.0137429237365723,
+      "learning_rate": 0.0002,
+      "loss": 0.073,
+      "mean_token_accuracy": 0.9375,
+      "num_tokens": 479792.0,
+      "step": 117
+    },
+    {
+      "epoch": 0.7375,
+      "grad_norm": 3.159397840499878,
+      "learning_rate": 0.0002,
+      "loss": 0.0736,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 482848.0,
+      "step": 118
+    },
+    {
+      "epoch": 0.74375,
+      "grad_norm": 4.265467643737793,
+      "learning_rate": 0.0002,
+      "loss": 0.3511,
+      "mean_token_accuracy": 0.90625,
+      "num_tokens": 485904.0,
+      "step": 119
+    },
+    {
+      "epoch": 0.75,
+      "grad_norm": 4.885653495788574,
+      "learning_rate": 0.0002,
+      "loss": 0.1691,
+      "mean_token_accuracy": 0.875,
+      "num_tokens": 488960.0,
+      "step": 120
+    },
+    {
+      "epoch": 0.75625,
+      "grad_norm": 5.490376949310303,
+      "learning_rate": 0.0002,
+      "loss": 0.2357,
+      "mean_token_accuracy": 0.9375,
+      "num_tokens": 492016.0,
+      "step": 121
+    },
+    {
+      "epoch": 0.7625,
+      "grad_norm": 3.725234031677246,
+      "learning_rate": 0.0002,
+      "loss": 0.1983,
+      "mean_token_accuracy": 0.9375,
+      "num_tokens": 495072.0,
+      "step": 122
+    },
+    {
+      "epoch": 0.76875,
+      "grad_norm": 1.6382085084915161,
+      "learning_rate": 0.0002,
+      "loss": 0.043,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 498128.0,
+      "step": 123
+    },
+    {
+      "epoch": 0.775,
+      "grad_norm": 0.3425021171569824,
+      "learning_rate": 0.0002,
+      "loss": 0.0069,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 501184.0,
+      "step": 124
+    },
+    {
+      "epoch": 0.78125,
+      "grad_norm": 1.504239559173584,
+      "learning_rate": 0.0002,
+      "loss": 0.0348,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 504240.0,
+      "step": 125
+    },
+    {
+      "epoch": 0.7875,
+      "grad_norm": 0.16000418365001678,
+      "learning_rate": 0.0002,
+      "loss": 0.0071,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 507296.0,
+      "step": 126
+    },
+    {
+      "epoch": 0.79375,
+      "grad_norm": 0.48438990116119385,
+      "learning_rate": 0.0002,
+      "loss": 0.0153,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 510352.0,
+      "step": 127
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.3561486601829529,
+      "learning_rate": 0.0002,
+      "loss": 0.0071,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 513408.0,
+      "step": 128
+    },
+    {
+      "epoch": 0.80625,
+      "grad_norm": 3.513791084289551,
+      "learning_rate": 0.0002,
+      "loss": 0.114,
+      "mean_token_accuracy": 0.9375,
+      "num_tokens": 516464.0,
+      "step": 129
+    },
+    {
+      "epoch": 0.8125,
+      "grad_norm": 0.7476550340652466,
+      "learning_rate": 0.0002,
+      "loss": 0.027,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 519520.0,
+      "step": 130
+    },
+    {
+      "epoch": 0.81875,
+      "grad_norm": 4.814530372619629,
+      "learning_rate": 0.0002,
+      "loss": 0.2155,
+      "mean_token_accuracy": 0.9375,
+      "num_tokens": 522576.0,
+      "step": 131
+    },
+    {
+      "epoch": 0.825,
+      "grad_norm": 4.203996658325195,
+      "learning_rate": 0.0002,
+      "loss": 0.0732,
+      "mean_token_accuracy": 0.9375,
+      "num_tokens": 525632.0,
+      "step": 132
+    },
+    {
+      "epoch": 0.83125,
+      "grad_norm": 3.171036958694458,
+      "learning_rate": 0.0002,
+      "loss": 0.0479,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 528688.0,
+      "step": 133
+    },
+    {
+      "epoch": 0.8375,
+      "grad_norm": 3.515819549560547,
+      "learning_rate": 0.0002,
+      "loss": 0.1001,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 531744.0,
+      "step": 134
+    },
+    {
+      "epoch": 0.84375,
+      "grad_norm": 1.6061947345733643,
+      "learning_rate": 0.0002,
+      "loss": 0.0795,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 534800.0,
+      "step": 135
+    },
+    {
+      "epoch": 0.85,
+      "grad_norm": 0.5217976570129395,
+      "learning_rate": 0.0002,
+      "loss": 0.0143,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 537856.0,
+      "step": 136
+    },
+    {
+      "epoch": 0.85625,
+      "grad_norm": 1.6569266319274902,
+      "learning_rate": 0.0002,
+      "loss": 0.025,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 540912.0,
+      "step": 137
+    },
+    {
+      "epoch": 0.8625,
+      "grad_norm": 2.0939252376556396,
+      "learning_rate": 0.0002,
+      "loss": 0.0375,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 543968.0,
+      "step": 138
+    },
+    {
+      "epoch": 0.86875,
+      "grad_norm": 1.8286008834838867,
+      "learning_rate": 0.0002,
+      "loss": 0.0464,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 547024.0,
+      "step": 139
+    },
+    {
+      "epoch": 0.875,
+      "grad_norm": 6.246391296386719,
+      "learning_rate": 0.0002,
+      "loss": 0.135,
+      "mean_token_accuracy": 0.9375,
+      "num_tokens": 550080.0,
+      "step": 140
+    },
+    {
+      "epoch": 0.88125,
+      "grad_norm": 0.9313927292823792,
+      "learning_rate": 0.0002,
+      "loss": 0.0437,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 553136.0,
+      "step": 141
+    },
+    {
+      "epoch": 0.8875,
+      "grad_norm": 3.962883472442627,
+      "learning_rate": 0.0002,
+      "loss": 0.1026,
+      "mean_token_accuracy": 0.9375,
+      "num_tokens": 556192.0,
+      "step": 142
+    },
+    {
+      "epoch": 0.89375,
+      "grad_norm": 4.730908393859863,
+      "learning_rate": 0.0002,
+      "loss": 0.104,
+      "mean_token_accuracy": 0.90625,
+      "num_tokens": 559248.0,
+      "step": 143
+    },
+    {
+      "epoch": 0.9,
+      "grad_norm": 4.035704135894775,
+      "learning_rate": 0.0002,
+      "loss": 0.16,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 562304.0,
+      "step": 144
+    },
+    {
+      "epoch": 0.90625,
+      "grad_norm": 5.092385768890381,
+      "learning_rate": 0.0002,
+      "loss": 0.1841,
+      "mean_token_accuracy": 0.9375,
+      "num_tokens": 565360.0,
+      "step": 145
+    },
+    {
+      "epoch": 0.9125,
+      "grad_norm": 6.0376386642456055,
+      "learning_rate": 0.0002,
+      "loss": 0.2612,
+      "mean_token_accuracy": 0.875,
+      "num_tokens": 568416.0,
+      "step": 146
+    },
+    {
+      "epoch": 0.91875,
+      "grad_norm": 3.7592973709106445,
+      "learning_rate": 0.0002,
+      "loss": 0.047,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 571472.0,
+      "step": 147
+    },
+    {
+      "epoch": 0.925,
+      "grad_norm": 0.9640989899635315,
+      "learning_rate": 0.0002,
+      "loss": 0.0236,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 574528.0,
+      "step": 148
+    },
+    {
+      "epoch": 0.93125,
+      "grad_norm": 1.7307640314102173,
+      "learning_rate": 0.0002,
+      "loss": 0.061,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 577584.0,
+      "step": 149
+    },
+    {
+      "epoch": 0.9375,
+      "grad_norm": 2.1097357273101807,
+      "learning_rate": 0.0002,
+      "loss": 0.3053,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 580640.0,
+      "step": 150
+    },
+    {
+      "epoch": 0.94375,
+      "grad_norm": 3.9413912296295166,
+      "learning_rate": 0.0002,
+      "loss": 0.1405,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 583696.0,
+      "step": 151
+    },
+    {
+      "epoch": 0.95,
+      "grad_norm": 3.341071128845215,
+      "learning_rate": 0.0002,
+      "loss": 0.1131,
+      "mean_token_accuracy": 0.9375,
+      "num_tokens": 586752.0,
+      "step": 152
+    },
+    {
+      "epoch": 0.95625,
+      "grad_norm": 1.0671991109848022,
+      "learning_rate": 0.0002,
+      "loss": 0.0147,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 589808.0,
+      "step": 153
+    },
+    {
+      "epoch": 0.9625,
+      "grad_norm": 6.068633556365967,
+      "learning_rate": 0.0002,
+      "loss": 0.0888,
+      "mean_token_accuracy": 0.9375,
+      "num_tokens": 592864.0,
+      "step": 154
+    },
+    {
+      "epoch": 0.96875,
+      "grad_norm": 1.0805017948150635,
+      "learning_rate": 0.0002,
+      "loss": 0.0136,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 595920.0,
+      "step": 155
+    },
+    {
+      "epoch": 0.975,
+      "grad_norm": 0.40108004212379456,
+      "learning_rate": 0.0002,
+      "loss": 0.0091,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 598976.0,
+      "step": 156
+    },
+    {
+      "epoch": 0.98125,
+      "grad_norm": 0.7008132934570312,
+      "learning_rate": 0.0002,
+      "loss": 0.011,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 602032.0,
+      "step": 157
+    },
+    {
+      "epoch": 0.9875,
+      "grad_norm": 0.42773208022117615,
+      "learning_rate": 0.0002,
+      "loss": 0.0063,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 605088.0,
+      "step": 158
+    },
+    {
+      "epoch": 0.99375,
+      "grad_norm": 0.9735352396965027,
+      "learning_rate": 0.0002,
+      "loss": 0.014,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 608144.0,
+      "step": 159
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 2.4267780780792236,
+      "learning_rate": 0.0002,
+      "loss": 0.0344,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 611200.0,
+      "step": 160
+    },
+    {
+      "epoch": 1.00625,
+      "grad_norm": 4.375260829925537,
+      "learning_rate": 0.0002,
+      "loss": 0.0981,
+      "mean_token_accuracy": 0.9375,
+      "num_tokens": 614256.0,
+      "step": 161
+    },
+    {
+      "epoch": 1.0125,
+      "grad_norm": 2.6353538036346436,
+      "learning_rate": 0.0002,
+      "loss": 0.1965,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 617312.0,
+      "step": 162
+    },
+    {
+      "epoch": 1.01875,
+      "grad_norm": 2.906750440597534,
+      "learning_rate": 0.0002,
+      "loss": 0.0887,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 620368.0,
+      "step": 163
+    },
+    {
+      "epoch": 1.025,
+      "grad_norm": 2.4843482971191406,
+      "learning_rate": 0.0002,
+      "loss": 0.0409,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 623424.0,
+      "step": 164
+    },
+    {
+      "epoch": 1.03125,
+      "grad_norm": 2.4637041091918945,
+      "learning_rate": 0.0002,
+      "loss": 0.0588,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 626480.0,
+      "step": 165
+    },
+    {
+      "epoch": 1.0375,
+      "grad_norm": 3.256136655807495,
+      "learning_rate": 0.0002,
+      "loss": 0.0632,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 629536.0,
+      "step": 166
+    },
+    {
+      "epoch": 1.04375,
+      "grad_norm": 3.7600295543670654,
+      "learning_rate": 0.0002,
+      "loss": 0.0903,
+      "mean_token_accuracy": 0.9375,
+      "num_tokens": 632592.0,
+      "step": 167
+    },
+    {
+      "epoch": 1.05,
+      "grad_norm": 4.013559341430664,
+      "learning_rate": 0.0002,
+      "loss": 0.0796,
+      "mean_token_accuracy": 0.9375,
+      "num_tokens": 635648.0,
+      "step": 168
+    },
+    {
+      "epoch": 1.05625,
+      "grad_norm": 1.3275935649871826,
+      "learning_rate": 0.0002,
+      "loss": 0.0415,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 638704.0,
+      "step": 169
+    },
+    {
+      "epoch": 1.0625,
+      "grad_norm": 3.0830256938934326,
+      "learning_rate": 0.0002,
+      "loss": 0.0989,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 641760.0,
+      "step": 170
+    },
+    {
+      "epoch": 1.06875,
+      "grad_norm": 5.780794620513916,
+      "learning_rate": 0.0002,
+      "loss": 0.0974,
+      "mean_token_accuracy": 0.9375,
+      "num_tokens": 644816.0,
+      "step": 171
+    },
+    {
+      "epoch": 1.075,
+      "grad_norm": 1.1954656839370728,
+      "learning_rate": 0.0002,
+      "loss": 0.0363,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 647872.0,
+      "step": 172
+    },
+    {
+      "epoch": 1.08125,
+      "grad_norm": 0.6503368616104126,
+      "learning_rate": 0.0002,
+      "loss": 0.0086,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 650928.0,
+      "step": 173
+    },
+    {
+      "epoch": 1.0875,
+      "grad_norm": 0.2741992771625519,
+      "learning_rate": 0.0002,
+      "loss": 0.005,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 653984.0,
+      "step": 174
+    },
+    {
+      "epoch": 1.09375,
+      "grad_norm": 4.483480930328369,
+      "learning_rate": 0.0002,
+      "loss": 0.1765,
+      "mean_token_accuracy": 0.9375,
+      "num_tokens": 657040.0,
+      "step": 175
+    },
+    {
+      "epoch": 1.1,
+      "grad_norm": 2.0801796913146973,
+      "learning_rate": 0.0002,
+      "loss": 0.2432,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 660096.0,
+      "step": 176
+    },
+    {
+      "epoch": 1.10625,
+      "grad_norm": 4.300808906555176,
+      "learning_rate": 0.0002,
+      "loss": 0.0805,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 663152.0,
+      "step": 177
+    },
+    {
+      "epoch": 1.1125,
+      "grad_norm": 1.7215920686721802,
+      "learning_rate": 0.0002,
+      "loss": 0.0227,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 666208.0,
+      "step": 178
+    },
+    {
+      "epoch": 1.11875,
+      "grad_norm": 3.1990017890930176,
+      "learning_rate": 0.0002,
+      "loss": 0.3566,
+      "mean_token_accuracy": 0.9375,
+      "num_tokens": 669264.0,
+      "step": 179
+    },
+    {
+      "epoch": 1.125,
+      "grad_norm": 1.593938946723938,
+      "learning_rate": 0.0002,
+      "loss": 0.0232,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 672320.0,
+      "step": 180
+    },
+    {
+      "epoch": 1.13125,
+      "grad_norm": 5.725439071655273,
+      "learning_rate": 0.0002,
+      "loss": 0.1591,
+      "mean_token_accuracy": 0.9375,
+      "num_tokens": 675376.0,
+      "step": 181
+    },
+    {
+      "epoch": 1.1375,
+      "grad_norm": 4.107408046722412,
+      "learning_rate": 0.0002,
+      "loss": 0.0734,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 678432.0,
+      "step": 182
+    },
+    {
+      "epoch": 1.14375,
+      "grad_norm": 0.5303417444229126,
+      "learning_rate": 0.0002,
+      "loss": 0.0099,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 681488.0,
+      "step": 183
+    },
+    {
+      "epoch": 1.15,
+      "grad_norm": 0.7143228650093079,
+      "learning_rate": 0.0002,
+      "loss": 0.0127,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 684544.0,
+      "step": 184
+    },
+    {
+      "epoch": 1.15625,
+      "grad_norm": 0.10430416464805603,
+      "learning_rate": 0.0002,
+      "loss": 0.0028,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 687600.0,
+      "step": 185
+    },
+    {
+      "epoch": 1.1625,
+      "grad_norm": 0.10262572765350342,
+      "learning_rate": 0.0002,
+      "loss": 0.0022,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 690656.0,
+      "step": 186
+    },
+    {
+      "epoch": 1.16875,
+      "grad_norm": 0.5091086626052856,
+      "learning_rate": 0.0002,
+      "loss": 0.0082,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 693712.0,
+      "step": 187
+    },
+    {
+      "epoch": 1.175,
+      "grad_norm": 0.32739120721817017,
+      "learning_rate": 0.0002,
+      "loss": 0.006,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 696768.0,
+      "step": 188
+    },
+    {
+      "epoch": 1.18125,
+      "grad_norm": 2.958453416824341,
+      "learning_rate": 0.0002,
+      "loss": 0.0506,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 699824.0,
+      "step": 189
+    },
+    {
+      "epoch": 1.1875,
+      "grad_norm": 0.6987707018852234,
+      "learning_rate": 0.0002,
+      "loss": 0.0107,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 702880.0,
+      "step": 190
+    },
+    {
+      "epoch": 1.19375,
+      "grad_norm": 1.3608295917510986,
+      "learning_rate": 0.0002,
+      "loss": 0.0287,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 705936.0,
+      "step": 191
+    },
+    {
+      "epoch": 1.2,
+      "grad_norm": 0.14391912519931793,
+      "learning_rate": 0.0002,
+      "loss": 0.0021,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 708992.0,
+      "step": 192
+    },
+    {
+      "epoch": 1.20625,
+      "grad_norm": 2.5519983768463135,
+      "learning_rate": 0.0002,
+      "loss": 0.1491,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 712048.0,
+      "step": 193
+    },
+    {
+      "epoch": 1.2125,
+      "grad_norm": 6.5541253089904785,
+      "learning_rate": 0.0002,
+      "loss": 0.1414,
+      "mean_token_accuracy": 0.9375,
+      "num_tokens": 715104.0,
+      "step": 194
+    },
+    {
+      "epoch": 1.21875,
+      "grad_norm": 0.2820110619068146,
+      "learning_rate": 0.0002,
+      "loss": 0.003,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 718160.0,
+      "step": 195
+    },
+    {
+      "epoch": 1.225,
+      "grad_norm": 2.9799859523773193,
+      "learning_rate": 0.0002,
+      "loss": 0.0506,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 721216.0,
+      "step": 196
+    },
+    {
+      "epoch": 1.23125,
+      "grad_norm": 1.1640214920043945,
+      "learning_rate": 0.0002,
+      "loss": 0.0182,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 724272.0,
+      "step": 197
+    },
+    {
+      "epoch": 1.2375,
+      "grad_norm": 2.498863935470581,
+      "learning_rate": 0.0002,
+      "loss": 0.073,
+      "mean_token_accuracy": 0.9375,
+      "num_tokens": 727328.0,
+      "step": 198
+    },
+    {
+      "epoch": 1.24375,
+      "grad_norm": 1.2676233053207397,
+      "learning_rate": 0.0002,
+      "loss": 0.0265,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 730384.0,
+      "step": 199
+    },
+    {
+      "epoch": 1.25,
+      "grad_norm": 0.3039199113845825,
+      "learning_rate": 0.0002,
+      "loss": 0.0079,
+      "step": 200
+    },
+    {
+      "epoch": 1.25,
+      "eval_loss": 0.07848864793777466,
+      "eval_mean_token_accuracy": 0.9703125,
+      "eval_model_preparation_time": 0.0042,
+      "eval_num_tokens": 733440.0,
+      "eval_runtime": 71.4875,
+      "eval_samples_per_second": 8.953,
+      "eval_steps_per_second": 0.56,
+      "step": 200
+    },
+    {
+      "epoch": 1.25625,
+      "grad_norm": 3.308570623397827,
+      "learning_rate": 0.0002,
+      "loss": 0.0569,
+      "mean_token_accuracy": 0.984375,
+      "num_tokens": 736496.0,
+      "step": 201
+    },
+    {
+      "epoch": 1.2625,
+      "grad_norm": 1.4555875062942505,
+      "learning_rate": 0.0002,
+      "loss": 0.0138,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 739552.0,
+      "step": 202
+    },
+    {
+      "epoch": 1.26875,
+      "grad_norm": 0.012952354736626148,
+      "learning_rate": 0.0002,
+      "loss": 0.0001,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 742608.0,
+      "step": 203
+    },
+    {
+      "epoch": 1.275,
+      "grad_norm": 5.3269829750061035,
+      "learning_rate": 0.0002,
+      "loss": 0.2567,
+      "mean_token_accuracy": 0.9375,
+      "num_tokens": 745664.0,
+      "step": 204
+    },
+    {
+      "epoch": 1.28125,
+      "grad_norm": 0.021568113937973976,
+      "learning_rate": 0.0002,
+      "loss": 0.0002,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 748720.0,
+      "step": 205
+    },
+    {
+      "epoch": 1.2875,
+      "grad_norm": 1.318519115447998,
+      "learning_rate": 0.0002,
+      "loss": 0.0142,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 751776.0,
+      "step": 206
+    },
+    {
+      "epoch": 1.29375,
+      "grad_norm": 3.792602300643921,
+      "learning_rate": 0.0002,
+      "loss": 0.0546,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 754832.0,
+      "step": 207
+    },
+    {
+      "epoch": 1.3,
+      "grad_norm": 3.9029130935668945,
+      "learning_rate": 0.0002,
+      "loss": 0.0485,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 757888.0,
+      "step": 208
+    },
+    {
+      "epoch": 1.30625,
+      "grad_norm": 5.260088920593262,
+      "learning_rate": 0.0002,
+      "loss": 0.1518,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 760944.0,
+      "step": 209
+    },
+    {
+      "epoch": 1.3125,
+      "grad_norm": 7.682434558868408,
+      "learning_rate": 0.0002,
+      "loss": 0.1869,
+      "mean_token_accuracy": 0.9375,
+      "num_tokens": 764000.0,
+      "step": 210
+    },
+    {
+      "epoch": 1.31875,
+      "grad_norm": 5.554214000701904,
+      "learning_rate": 0.0002,
+      "loss": 0.2424,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 767056.0,
+      "step": 211
+    },
+    {
+      "epoch": 1.325,
+      "grad_norm": 1.796474814414978,
+      "learning_rate": 0.0002,
+      "loss": 0.0379,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 770112.0,
+      "step": 212
+    },
+    {
+      "epoch": 1.33125,
+      "grad_norm": 0.8003765344619751,
+      "learning_rate": 0.0002,
+      "loss": 0.0088,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 773168.0,
+      "step": 213
+    },
+    {
+      "epoch": 1.3375,
+      "grad_norm": 0.07899386435747147,
+      "learning_rate": 0.0002,
+      "loss": 0.001,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 776224.0,
+      "step": 214
+    },
+    {
+      "epoch": 1.34375,
+      "grad_norm": 1.10870361328125,
+      "learning_rate": 0.0002,
+      "loss": 0.0102,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 779280.0,
+      "step": 215
+    },
+    {
+      "epoch": 1.35,
+      "grad_norm": 4.3541107177734375,
+      "learning_rate": 0.0002,
+      "loss": 0.1525,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 782336.0,
+      "step": 216
+    },
+    {
+      "epoch": 1.35625,
+      "grad_norm": 0.10654051601886749,
+      "learning_rate": 0.0002,
+      "loss": 0.0018,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 785392.0,
+      "step": 217
+    },
+    {
+      "epoch": 1.3625,
+      "grad_norm": 0.1457299143075943,
+      "learning_rate": 0.0002,
+      "loss": 0.0026,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 788448.0,
+      "step": 218
+    },
+    {
+      "epoch": 1.36875,
+      "grad_norm": 0.3237255811691284,
+      "learning_rate": 0.0002,
+      "loss": 0.0031,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 791504.0,
+      "step": 219
+    },
+    {
+      "epoch": 1.375,
+      "grad_norm": 1.9996052980422974,
+      "learning_rate": 0.0002,
+      "loss": 0.0206,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 794560.0,
+      "step": 220
+    },
+    {
+      "epoch": 1.38125,
+      "grad_norm": 4.9992289543151855,
+      "learning_rate": 0.0002,
+      "loss": 0.1205,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 797616.0,
+      "step": 221
+    },
+    {
+      "epoch": 1.3875,
+      "grad_norm": 1.0994348526000977,
+      "learning_rate": 0.0002,
+      "loss": 0.01,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 800672.0,
+      "step": 222
+    },
+    {
+      "epoch": 1.39375,
+      "grad_norm": 5.1901068687438965,
+      "learning_rate": 0.0002,
+      "loss": 0.0597,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 803728.0,
+      "step": 223
+    },
+    {
+      "epoch": 1.4,
+      "grad_norm": 8.030043601989746,
+      "learning_rate": 0.0002,
+      "loss": 0.1187,
+      "mean_token_accuracy": 0.9375,
+      "num_tokens": 806784.0,
+      "step": 224
+    },
+    {
+      "epoch": 1.40625,
+      "grad_norm": 5.358715057373047,
+      "learning_rate": 0.0002,
+      "loss": 0.0635,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 809840.0,
+      "step": 225
+    },
+    {
+      "epoch": 1.4125,
+      "grad_norm": 5.855217933654785,
+      "learning_rate": 0.0002,
+      "loss": 0.1923,
+      "mean_token_accuracy": 0.9375,
+      "num_tokens": 812896.0,
+      "step": 226
+    },
+    {
+      "epoch": 1.41875,
+      "grad_norm": 1.0728706121444702,
+      "learning_rate": 0.0002,
+      "loss": 0.0098,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 815952.0,
+      "step": 227
+    },
+    {
+      "epoch": 1.425,
+      "grad_norm": 0.028943141922354698,
+      "learning_rate": 0.0002,
+      "loss": 0.0003,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 819008.0,
+      "step": 228
+    },
+    {
+      "epoch": 1.43125,
+      "grad_norm": 4.162194728851318,
+      "learning_rate": 0.0002,
+      "loss": 0.0434,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 822064.0,
+      "step": 229
+    },
+    {
+      "epoch": 1.4375,
+      "grad_norm": 0.4618898630142212,
+      "learning_rate": 0.0002,
+      "loss": 0.005,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 825120.0,
+      "step": 230
+    },
+    {
+      "epoch": 1.44375,
+      "grad_norm": 0.33329862356185913,
+      "learning_rate": 0.0002,
+      "loss": 0.0029,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 828176.0,
+      "step": 231
+    },
+    {
+      "epoch": 1.45,
+      "grad_norm": 0.5071545839309692,
+      "learning_rate": 0.0002,
+      "loss": 0.0043,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 831232.0,
+      "step": 232
+    },
+    {
+      "epoch": 1.45625,
+      "grad_norm": 0.6205556392669678,
+      "learning_rate": 0.0002,
+      "loss": 0.0057,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 834288.0,
+      "step": 233
+    },
+    {
+      "epoch": 1.4625,
+      "grad_norm": 0.00851200520992279,
+      "learning_rate": 0.0002,
+      "loss": 0.0003,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 837344.0,
+      "step": 234
+    },
+    {
+      "epoch": 1.46875,
+      "grad_norm": 0.04946492984890938,
+      "learning_rate": 0.0002,
+      "loss": 0.0005,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 840400.0,
+      "step": 235
+    },
+    {
+      "epoch": 1.475,
+      "grad_norm": 3.8646678924560547,
+      "learning_rate": 0.0002,
+      "loss": 0.0244,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 843456.0,
+      "step": 236
+    },
+    {
+      "epoch": 1.48125,
+      "grad_norm": 5.360538005828857,
+      "learning_rate": 0.0002,
+      "loss": 0.0776,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 846512.0,
+      "step": 237
+    },
+    {
+      "epoch": 1.4875,
+      "grad_norm": 1.9940391778945923,
+      "learning_rate": 0.0002,
+      "loss": 0.0203,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 849568.0,
+      "step": 238
+    },
+    {
+      "epoch": 1.49375,
+      "grad_norm": 5.840383529663086,
+      "learning_rate": 0.0002,
+      "loss": 0.2391,
+      "mean_token_accuracy": 0.9375,
+      "num_tokens": 852624.0,
+      "step": 239
+    },
+    {
+      "epoch": 1.5,
+      "grad_norm": 4.679400444030762,
+      "learning_rate": 0.0002,
+      "loss": 0.0533,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 855680.0,
+      "step": 240
+    },
+    {
+      "epoch": 1.50625,
+      "grad_norm": 2.590467691421509,
+      "learning_rate": 0.0002,
+      "loss": 0.0536,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 858736.0,
+      "step": 241
+    },
+    {
+      "epoch": 1.5125,
+      "grad_norm": 0.5134260654449463,
+      "learning_rate": 0.0002,
+      "loss": 0.0045,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 861792.0,
+      "step": 242
+    },
+    {
+      "epoch": 1.51875,
+      "grad_norm": 1.5896207094192505,
+      "learning_rate": 0.0002,
+      "loss": 0.0131,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 864848.0,
+      "step": 243
+    },
+    {
+      "epoch": 1.525,
+      "grad_norm": 0.48414427042007446,
+      "learning_rate": 0.0002,
+      "loss": 0.0037,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 867904.0,
+      "step": 244
+    },
+    {
+      "epoch": 1.53125,
+      "grad_norm": 1.9993484020233154,
+      "learning_rate": 0.0002,
+      "loss": 0.0182,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 870960.0,
+      "step": 245
+    },
+    {
+      "epoch": 1.5375,
+      "grad_norm": 0.592039942741394,
+      "learning_rate": 0.0002,
+      "loss": 0.0078,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 874016.0,
+      "step": 246
+    },
+    {
+      "epoch": 1.54375,
+      "grad_norm": 4.422524929046631,
+      "learning_rate": 0.0002,
+      "loss": 0.0413,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 877072.0,
+      "step": 247
+    },
+    {
+      "epoch": 1.55,
+      "grad_norm": 4.080215930938721,
+      "learning_rate": 0.0002,
+      "loss": 0.1189,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 880128.0,
+      "step": 248
+    },
+    {
+      "epoch": 1.55625,
+      "grad_norm": 2.1959433555603027,
+      "learning_rate": 0.0002,
+      "loss": 0.0267,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 883184.0,
+      "step": 249
+    },
+    {
+      "epoch": 1.5625,
+      "grad_norm": 4.379952430725098,
+      "learning_rate": 0.0002,
+      "loss": 0.0507,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 886240.0,
+      "step": 250
+    },
+    {
+      "epoch": 1.56875,
+      "grad_norm": 2.473543882369995,
+      "learning_rate": 0.0002,
+      "loss": 0.0261,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 889296.0,
+      "step": 251
+    },
+    {
+      "epoch": 1.575,
+      "grad_norm": 0.6901207566261292,
+      "learning_rate": 0.0002,
+      "loss": 0.0043,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 892352.0,
+      "step": 252
+    },
+    {
+      "epoch": 1.58125,
+      "grad_norm": 3.800703763961792,
+      "learning_rate": 0.0002,
+      "loss": 0.1643,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 895408.0,
+      "step": 253
+    },
+    {
+      "epoch": 1.5875,
+      "grad_norm": 1.523447036743164,
+      "learning_rate": 0.0002,
+      "loss": 0.4695,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 898464.0,
+      "step": 254
+    },
+    {
+      "epoch": 1.59375,
+      "grad_norm": 0.06425034999847412,
+      "learning_rate": 0.0002,
+      "loss": 0.0008,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 901520.0,
+      "step": 255
+    },
+    {
+      "epoch": 1.6,
+      "grad_norm": 0.7315100431442261,
+      "learning_rate": 0.0002,
+      "loss": 0.0066,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 904576.0,
+      "step": 256
+    },
+    {
+      "epoch": 1.60625,
+      "grad_norm": 0.39078789949417114,
+      "learning_rate": 0.0002,
+      "loss": 0.0042,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 907632.0,
+      "step": 257
+    },
+    {
+      "epoch": 1.6125,
+      "grad_norm": 7.1944475173950195,
+      "learning_rate": 0.0002,
+      "loss": 0.2334,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 910688.0,
+      "step": 258
+    },
+    {
+      "epoch": 1.61875,
+      "grad_norm": 0.3877175748348236,
+      "learning_rate": 0.0002,
+      "loss": 0.0046,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 913744.0,
+      "step": 259
+    },
+    {
+      "epoch": 1.625,
+      "grad_norm": 1.4738430976867676,
+      "learning_rate": 0.0002,
+      "loss": 0.0207,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 916800.0,
+      "step": 260
+    },
+    {
+      "epoch": 1.63125,
+      "grad_norm": 3.2106664180755615,
+      "learning_rate": 0.0002,
+      "loss": 0.0356,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 919856.0,
+      "step": 261
+    },
+    {
+      "epoch": 1.6375,
+      "grad_norm": 0.09959662705659866,
+      "learning_rate": 0.0002,
+      "loss": 0.0008,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 922912.0,
+      "step": 262
+    },
+    {
+      "epoch": 1.64375,
+      "grad_norm": 3.7656471729278564,
+      "learning_rate": 0.0002,
+      "loss": 0.0371,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 925968.0,
+      "step": 263
+    },
+    {
+      "epoch": 1.65,
+      "grad_norm": 0.057951927185058594,
+      "learning_rate": 0.0002,
+      "loss": 0.0009,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 929024.0,
+      "step": 264
+    },
+    {
+      "epoch": 1.65625,
+      "grad_norm": 0.7342841029167175,
+      "learning_rate": 0.0002,
+      "loss": 0.0082,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 932080.0,
+      "step": 265
+    },
+    {
+      "epoch": 1.6625,
+      "grad_norm": 2.154627799987793,
+      "learning_rate": 0.0002,
+      "loss": 0.0259,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 935136.0,
+      "step": 266
+    },
+    {
+      "epoch": 1.66875,
+      "grad_norm": 0.036353010684251785,
+      "learning_rate": 0.0002,
+      "loss": 0.0008,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 938192.0,
+      "step": 267
+    },
+    {
+      "epoch": 1.675,
+      "grad_norm": 0.013025197200477123,
+      "learning_rate": 0.0002,
+      "loss": 0.0002,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 941248.0,
+      "step": 268
+    },
+    {
+      "epoch": 1.68125,
+      "grad_norm": 0.11635535210371017,
+      "learning_rate": 0.0002,
+      "loss": 0.0011,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 944304.0,
+      "step": 269
+    },
+    {
+      "epoch": 1.6875,
+      "grad_norm": 1.7156617641448975,
+      "learning_rate": 0.0002,
+      "loss": 0.0207,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 947360.0,
+      "step": 270
+    },
+    {
+      "epoch": 1.69375,
+      "grad_norm": 0.04577786847949028,
+      "learning_rate": 0.0002,
+      "loss": 0.001,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 950416.0,
+      "step": 271
+    },
+    {
+      "epoch": 1.7,
+      "grad_norm": 0.22839251160621643,
+      "learning_rate": 0.0002,
+      "loss": 0.0017,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 953472.0,
+      "step": 272
+    },
+    {
+      "epoch": 1.70625,
+      "grad_norm": 3.693096399307251,
+      "learning_rate": 0.0002,
+      "loss": 0.0622,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 956528.0,
+      "step": 273
+    },
+    {
+      "epoch": 1.7125,
+      "grad_norm": 0.9133250117301941,
+      "learning_rate": 0.0002,
+      "loss": 0.0083,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 959584.0,
+      "step": 274
+    },
+    {
+      "epoch": 1.71875,
+      "grad_norm": 0.2589430809020996,
+      "learning_rate": 0.0002,
+      "loss": 0.0025,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 962640.0,
+      "step": 275
+    },
+    {
+      "epoch": 1.725,
+      "grad_norm": 0.027116835117340088,
+      "learning_rate": 0.0002,
+      "loss": 0.0006,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 965696.0,
+      "step": 276
+    },
+    {
+      "epoch": 1.73125,
+      "grad_norm": 2.58829402923584,
+      "learning_rate": 0.0002,
+      "loss": 0.0452,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 968752.0,
+      "step": 277
+    },
+    {
+      "epoch": 1.7375,
+      "grad_norm": 0.12397311627864838,
+      "learning_rate": 0.0002,
+      "loss": 0.0009,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 971808.0,
+      "step": 278
+    },
+    {
+      "epoch": 1.74375,
+      "grad_norm": 0.032883170992136,
+      "learning_rate": 0.0002,
+      "loss": 0.0005,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 974864.0,
+      "step": 279
+    },
+    {
+      "epoch": 1.75,
+      "grad_norm": 0.024885807186365128,
+      "learning_rate": 0.0002,
+      "loss": 0.0005,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 977920.0,
+      "step": 280
+    },
+    {
+      "epoch": 1.75625,
+      "grad_norm": 0.11918771266937256,
+      "learning_rate": 0.0002,
+      "loss": 0.0019,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 980976.0,
+      "step": 281
+    },
+    {
+      "epoch": 1.7625,
+      "grad_norm": 5.538094997406006,
+      "learning_rate": 0.0002,
+      "loss": 0.062,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 984032.0,
+      "step": 282
+    },
+    {
+      "epoch": 1.76875,
+      "grad_norm": 0.20568646490573883,
+      "learning_rate": 0.0002,
+      "loss": 0.0027,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 987088.0,
+      "step": 283
+    },
+    {
+      "epoch": 1.775,
+      "grad_norm": 3.7988224029541016,
+      "learning_rate": 0.0002,
+      "loss": 0.0376,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 990144.0,
+      "step": 284
+    },
+    {
+      "epoch": 1.78125,
+      "grad_norm": 0.0016885192599147558,
+      "learning_rate": 0.0002,
+      "loss": 0.0,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 993200.0,
+      "step": 285
+    },
+    {
+      "epoch": 1.7875,
+      "grad_norm": 0.08663017302751541,
+      "learning_rate": 0.0002,
+      "loss": 0.001,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 996256.0,
+      "step": 286
+    },
+    {
+      "epoch": 1.79375,
+      "grad_norm": 3.149121046066284,
+      "learning_rate": 0.0002,
+      "loss": 0.0219,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 999312.0,
+      "step": 287
+    },
+    {
+      "epoch": 1.8,
+      "grad_norm": 1.3945331573486328,
+      "learning_rate": 0.0002,
+      "loss": 0.0077,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 1002368.0,
+      "step": 288
+    },
+    {
+      "epoch": 1.80625,
+      "grad_norm": 0.5408581495285034,
+      "learning_rate": 0.0002,
+      "loss": 0.0055,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 1005424.0,
+      "step": 289
+    },
+    {
+      "epoch": 1.8125,
+      "grad_norm": 5.049882888793945,
+      "learning_rate": 0.0002,
+      "loss": 0.1378,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 1008480.0,
+      "step": 290
+    },
+    {
+      "epoch": 1.81875,
+      "grad_norm": 1.854457974433899,
+      "learning_rate": 0.0002,
+      "loss": 0.0117,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 1011536.0,
+      "step": 291
+    },
+    {
+      "epoch": 1.825,
+      "grad_norm": 5.968977451324463,
+      "learning_rate": 0.0002,
+      "loss": 0.0976,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 1014592.0,
+      "step": 292
+    },
+    {
+      "epoch": 1.83125,
+      "grad_norm": 0.39035147428512573,
+      "learning_rate": 0.0002,
+      "loss": 0.0026,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 1017648.0,
+      "step": 293
+    },
+    {
+      "epoch": 1.8375,
+      "grad_norm": 0.5014843940734863,
+      "learning_rate": 0.0002,
+      "loss": 0.0037,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 1020704.0,
+      "step": 294
+    },
+    {
+      "epoch": 1.84375,
+      "grad_norm": 7.198896884918213,
+      "learning_rate": 0.0002,
+      "loss": 0.113,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 1023760.0,
+      "step": 295
+    },
+    {
+      "epoch": 1.85,
+      "grad_norm": 5.184747695922852,
+      "learning_rate": 0.0002,
+      "loss": 0.047,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 1026816.0,
+      "step": 296
+    },
+    {
+      "epoch": 1.85625,
+      "grad_norm": 0.03395215794444084,
+      "learning_rate": 0.0002,
+      "loss": 0.0004,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 1029872.0,
+      "step": 297
+    },
+    {
+      "epoch": 1.8625,
+      "grad_norm": 0.002945690182968974,
+      "learning_rate": 0.0002,
+      "loss": 0.0001,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 1032928.0,
+      "step": 298
+    },
+    {
+      "epoch": 1.86875,
+      "grad_norm": 0.009323005564510822,
+      "learning_rate": 0.0002,
+      "loss": 0.0001,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 1035984.0,
+      "step": 299
+    },
+    {
+      "epoch": 1.875,
+      "grad_norm": 0.5545831918716431,
+      "learning_rate": 0.0002,
+      "loss": 0.0045,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 1039040.0,
+      "step": 300
+    },
+    {
+      "epoch": 1.88125,
+      "grad_norm": 6.735720634460449,
+      "learning_rate": 0.0002,
+      "loss": 0.1614,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 1042096.0,
+      "step": 301
+    },
+    {
+      "epoch": 1.8875,
+      "grad_norm": 5.312285423278809,
+      "learning_rate": 0.0002,
+      "loss": 0.259,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 1045152.0,
+      "step": 302
+    },
+    {
+      "epoch": 1.89375,
+      "grad_norm": 3.6888339519500732,
+      "learning_rate": 0.0002,
+      "loss": 0.0775,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 1048208.0,
+      "step": 303
+    },
+    {
+      "epoch": 1.9,
+      "grad_norm": 1.1625453233718872,
+      "learning_rate": 0.0002,
+      "loss": 0.0084,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 1051264.0,
+      "step": 304
+    },
+    {
+      "epoch": 1.90625,
+      "grad_norm": 2.7333149909973145,
+      "learning_rate": 0.0002,
+      "loss": 0.0198,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 1054320.0,
+      "step": 305
+    },
+    {
+      "epoch": 1.9125,
+      "grad_norm": 6.946017265319824,
+      "learning_rate": 0.0002,
+      "loss": 0.0668,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 1057376.0,
+      "step": 306
+    },
+    {
+      "epoch": 1.91875,
+      "grad_norm": 0.8210978507995605,
+      "learning_rate": 0.0002,
+      "loss": 0.0047,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 1060432.0,
+      "step": 307
+    },
+    {
+      "epoch": 1.925,
+      "grad_norm": 5.008805274963379,
+      "learning_rate": 0.0002,
+      "loss": 0.2898,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 1063488.0,
+      "step": 308
+    },
+    {
+      "epoch": 1.93125,
+      "grad_norm": 2.841658353805542,
+      "learning_rate": 0.0002,
+      "loss": 0.0158,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 1066544.0,
+      "step": 309
+    },
+    {
+      "epoch": 1.9375,
+      "grad_norm": 5.415345191955566,
+      "learning_rate": 0.0002,
+      "loss": 0.3233,
+      "mean_token_accuracy": 0.9375,
+      "num_tokens": 1069600.0,
+      "step": 310
+    },
+    {
+      "epoch": 1.94375,
+      "grad_norm": 4.167904853820801,
+      "learning_rate": 0.0002,
+      "loss": 0.0631,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 1072656.0,
+      "step": 311
+    },
+    {
+      "epoch": 1.95,
+      "grad_norm": 0.2593608498573303,
+      "learning_rate": 0.0002,
+      "loss": 0.0017,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 1075712.0,
+      "step": 312
+    },
+    {
+      "epoch": 1.95625,
+      "grad_norm": 3.686906576156616,
+      "learning_rate": 0.0002,
+      "loss": 0.0414,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 1078768.0,
+      "step": 313
+    },
+    {
+      "epoch": 1.9625,
+      "grad_norm": 0.01392375398427248,
+      "learning_rate": 0.0002,
+      "loss": 0.0002,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 1081824.0,
+      "step": 314
+    },
+    {
+      "epoch": 1.96875,
+      "grad_norm": 0.021308658644557,
+      "learning_rate": 0.0002,
+      "loss": 0.0002,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 1084880.0,
+      "step": 315
+    },
+    {
+      "epoch": 1.975,
+      "grad_norm": 1.8390535116195679,
+      "learning_rate": 0.0002,
+      "loss": 0.0124,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 1087936.0,
+      "step": 316
+    },
+    {
+      "epoch": 1.98125,
+      "grad_norm": 5.717790603637695,
+      "learning_rate": 0.0002,
+      "loss": 0.0658,
+      "mean_token_accuracy": 0.96875,
+      "num_tokens": 1090992.0,
+      "step": 317
+    },
+    {
+      "epoch": 1.9875,
+      "grad_norm": 0.08841279149055481,
+      "learning_rate": 0.0002,
+      "loss": 0.0008,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 1094048.0,
+      "step": 318
+    },
+    {
+      "epoch": 1.99375,
+      "grad_norm": 0.13375015556812286,
+      "learning_rate": 0.0002,
+      "loss": 0.0021,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 1097104.0,
+      "step": 319
+    },
+    {
+      "epoch": 2.0,
+      "grad_norm": 0.138557568192482,
+      "learning_rate": 0.0002,
+      "loss": 0.0011,
+      "mean_token_accuracy": 1.0,
+      "num_tokens": 1100160.0,
+      "step": 320
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 320,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 200,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 4.548627979567104e+16,
+  "train_batch_size": 16,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoint-320/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a1e17f907302d781eecf6b58e9d4136a88ad4089312e1b26d13d23c8ff5af5cb
+size 6161

checkpoint-320/vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff

merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,31 @@

+{
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "eos_token": {
+    "content": "<|im_end|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9c5ae00e602b8860cbd784ba82a8aa14e8feecec692e7076590d014d7b7fdafa
+size 11421896

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,207 @@

+{
+  "add_bos_token": false,
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151645": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151646": {
+      "content": "<|object_ref_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151647": {
+      "content": "<|object_ref_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151648": {
+      "content": "<|box_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151649": {
+      "content": "<|box_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151650": {
+      "content": "<|quad_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151651": {
+      "content": "<|quad_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151652": {
+      "content": "<|vision_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151653": {
+      "content": "<|vision_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151654": {
+      "content": "<|vision_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151655": {
+      "content": "<|image_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151656": {
+      "content": "<|video_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151657": {
+      "content": "<tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151658": {
+      "content": "</tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151659": {
+      "content": "<|fim_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151660": {
+      "content": "<|fim_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151661": {
+      "content": "<|fim_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151662": {
+      "content": "<|fim_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151663": {
+      "content": "<|repo_name|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151664": {
+      "content": "<|file_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    }
+  },
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "bos_token": null,
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "extra_special_tokens": {},
+  "model_max_length": 131072,
+  "pad_token": "<|endoftext|>",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}

training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a1e17f907302d781eecf6b58e9d4136a88ad4089312e1b26d13d23c8ff5af5cb
+size 6161

training_args.json ADDED Viewed

	@@ -0,0 +1,28 @@

+{
+  "lora_r": 16,
+  "lora_alpha": 16,
+  "lora_dropout": 0.05,
+  "lora_target_modules": "q_proj,v_proj",
+  "tuning_strategy": "lora",
+  "num_trainable_layers": 2,
+  "output_dir": "sft_prefill/prompt_id_3/qwen2-VL-7B-Instruct-syn-count-lora",
+  "num_train_epochs": 2,
+  "learning_rate": 0.0002,
+  "per_device_train_batch_size": 16,
+  "per_device_eval_batch_size": 16,
+  "gradient_accumulation_steps": 1,
+  "logging_steps": 10,
+  "eval_steps": 200,
+  "save_steps": 200,
+  "warmup_ratio": 0.03,
+  "weight_decay": 0.0,
+  "max_grad_norm": 0.3,
+  "lr_scheduler_type": "constant",
+  "bf16": true,
+  "tf32": true,
+  "gradient_checkpointing": true,
+  "optim": "adamw_torch_fused",
+  "ft_type": "SFT_Prefill",
+  "data_type": "small",
+  "prompt_id": 3
+}

training_log.json ADDED Viewed

	@@ -0,0 +1,1318 @@

+{
+  "train_loss": [
+    {
+      "step": 1,
+      "loss": 0.4917
+    },
+    {
+      "step": 2,
+      "loss": 0.6049
+    },
+    {
+      "step": 3,
+      "loss": 0.6407
+    },
+    {
+      "step": 4,
+      "loss": 0.3972
+    },
+    {
+      "step": 5,
+      "loss": 0.453
+    },
+    {
+      "step": 6,
+      "loss": 0.418
+    },
+    {
+      "step": 7,
+      "loss": 0.303
+    },
+    {
+      "step": 8,
+      "loss": 0.1847
+    },
+    {
+      "step": 9,
+      "loss": 0.3514
+    },
+    {
+      "step": 10,
+      "loss": 0.2434
+    },
+    {
+      "step": 11,
+      "loss": 0.1577
+    },
+    {
+      "step": 12,
+      "loss": 0.209
+    },
+    {
+      "step": 13,
+      "loss": 0.2123
+    },
+    {
+      "step": 14,
+      "loss": 0.2712
+    },
+    {
+      "step": 15,
+      "loss": 0.2975
+    },
+    {
+      "step": 16,
+      "loss": 0.3977
+    },
+    {
+      "step": 17,
+      "loss": 0.3933
+    },
+    {
+      "step": 18,
+      "loss": 0.2866
+    },
+    {
+      "step": 19,
+      "loss": 0.4174
+    },
+    {
+      "step": 20,
+      "loss": 0.3588
+    },
+    {
+      "step": 21,
+      "loss": 0.1919
+    },
+    {
+      "step": 22,
+      "loss": 0.475
+    },
+    {
+      "step": 23,
+      "loss": 0.223
+    },
+    {
+      "step": 24,
+      "loss": 0.3825
+    },
+    {
+      "step": 25,
+      "loss": 0.1645
+    },
+    {
+      "step": 26,
+      "loss": 0.2517
+    },
+    {
+      "step": 27,
+      "loss": 0.1865
+    },
+    {
+      "step": 28,
+      "loss": 0.2887
+    },
+    {
+      "step": 29,
+      "loss": 0.2154
+    },
+    {
+      "step": 30,
+      "loss": 0.3605
+    },
+    {
+      "step": 31,
+      "loss": 0.1299
+    },
+    {
+      "step": 32,
+      "loss": 0.1772
+    },
+    {
+      "step": 33,
+      "loss": 0.3281
+    },
+    {
+      "step": 34,
+      "loss": 0.1973
+    },
+    {
+      "step": 35,
+      "loss": 0.2645
+    },
+    {
+      "step": 36,
+      "loss": 0.1938
+    },
+    {
+      "step": 37,
+      "loss": 0.1577
+    },
+    {
+      "step": 38,
+      "loss": 0.1917
+    },
+    {
+      "step": 39,
+      "loss": 0.0703
+    },
+    {
+      "step": 40,
+      "loss": 0.1402
+    },
+    {
+      "step": 41,
+      "loss": 0.1274
+    },
+    {
+      "step": 42,
+      "loss": 0.1871
+    },
+    {
+      "step": 43,
+      "loss": 0.2981
+    },
+    {
+      "step": 44,
+      "loss": 0.0962
+    },
+    {
+      "step": 45,
+      "loss": 0.1616
+    },
+    {
+      "step": 46,
+      "loss": 0.0614
+    },
+    {
+      "step": 47,
+      "loss": 0.1468
+    },
+    {
+      "step": 48,
+      "loss": 0.0482
+    },
+    {
+      "step": 49,
+      "loss": 0.1177
+    },
+    {
+      "step": 50,
+      "loss": 0.0788
+    },
+    {
+      "step": 51,
+      "loss": 0.1786
+    },
+    {
+      "step": 52,
+      "loss": 0.1685
+    },
+    {
+      "step": 53,
+      "loss": 0.1178
+    },
+    {
+      "step": 54,
+      "loss": 0.1016
+    },
+    {
+      "step": 55,
+      "loss": 0.1223
+    },
+    {
+      "step": 56,
+      "loss": 0.0331
+    },
+    {
+      "step": 57,
+      "loss": 0.1314
+    },
+    {
+      "step": 58,
+      "loss": 0.0504
+    },
+    {
+      "step": 59,
+      "loss": 0.0327
+    },
+    {
+      "step": 60,
+      "loss": 0.0766
+    },
+    {
+      "step": 61,
+      "loss": 0.0167
+    },
+    {
+      "step": 62,
+      "loss": 0.0176
+    },
+    {
+      "step": 63,
+      "loss": 0.1523
+    },
+    {
+      "step": 64,
+      "loss": 0.0172
+    },
+    {
+      "step": 65,
+      "loss": 0.1337
+    },
+    {
+      "step": 66,
+      "loss": 0.18
+    },
+    {
+      "step": 67,
+      "loss": 0.0076
+    },
+    {
+      "step": 68,
+      "loss": 0.0051
+    },
+    {
+      "step": 69,
+      "loss": 0.0408
+    },
+    {
+      "step": 70,
+      "loss": 0.0744
+    },
+    {
+      "step": 71,
+      "loss": 0.0123
+    },
+    {
+      "step": 72,
+      "loss": 0.006
+    },
+    {
+      "step": 73,
+      "loss": 0.3016
+    },
+    {
+      "step": 74,
+      "loss": 0.1201
+    },
+    {
+      "step": 75,
+      "loss": 0.037
+    },
+    {
+      "step": 76,
+      "loss": 0.0413
+    },
+    {
+      "step": 77,
+      "loss": 0.0697
+    },
+    {
+      "step": 78,
+      "loss": 0.0485
+    },
+    {
+      "step": 79,
+      "loss": 0.0686
+    },
+    {
+      "step": 80,
+      "loss": 0.0649
+    },
+    {
+      "step": 81,
+      "loss": 0.1002
+    },
+    {
+      "step": 82,
+      "loss": 0.0235
+    },
+    {
+      "step": 83,
+      "loss": 0.1955
+    },
+    {
+      "step": 84,
+      "loss": 0.1692
+    },
+    {
+      "step": 85,
+      "loss": 0.0178
+    },
+    {
+      "step": 86,
+      "loss": 0.0282
+    },
+    {
+      "step": 87,
+      "loss": 0.0191
+    },
+    {
+      "step": 88,
+      "loss": 0.0177
+    },
+    {
+      "step": 89,
+      "loss": 0.0207
+    },
+    {
+      "step": 90,
+      "loss": 0.0082
+    },
+    {
+      "step": 91,
+      "loss": 0.0583
+    },
+    {
+      "step": 92,
+      "loss": 0.259
+    },
+    {
+      "step": 93,
+      "loss": 0.0295
+    },
+    {
+      "step": 94,
+      "loss": 0.0014
+    },
+    {
+      "step": 95,
+      "loss": 0.024
+    },
+    {
+      "step": 96,
+      "loss": 0.0795
+    },
+    {
+      "step": 97,
+      "loss": 0.1903
+    },
+    {
+      "step": 98,
+      "loss": 0.2885
+    },
+    {
+      "step": 99,
+      "loss": 0.2661
+    },
+    {
+      "step": 100,
+      "loss": 0.0502
+    },
+    {
+      "step": 101,
+      "loss": 0.0041
+    },
+    {
+      "step": 102,
+      "loss": 0.0947
+    },
+    {
+      "step": 103,
+      "loss": 0.1131
+    },
+    {
+      "step": 104,
+      "loss": 0.1105
+    },
+    {
+      "step": 105,
+      "loss": 0.0794
+    },
+    {
+      "step": 106,
+      "loss": 0.0057
+    },
+    {
+      "step": 107,
+      "loss": 0.3953
+    },
+    {
+      "step": 108,
+      "loss": 0.1099
+    },
+    {
+      "step": 109,
+      "loss": 0.2198
+    },
+    {
+      "step": 110,
+      "loss": 0.0586
+    },
+    {
+      "step": 111,
+      "loss": 0.3492
+    },
+    {
+      "step": 112,
+      "loss": 0.003
+    },
+    {
+      "step": 113,
+      "loss": 0.0365
+    },
+    {
+      "step": 114,
+      "loss": 0.1228
+    },
+    {
+      "step": 115,
+      "loss": 0.0174
+    },
+    {
+      "step": 116,
+      "loss": 0.0473
+    },
+    {
+      "step": 117,
+      "loss": 0.073
+    },
+    {
+      "step": 118,
+      "loss": 0.0736
+    },
+    {
+      "step": 119,
+      "loss": 0.3511
+    },
+    {
+      "step": 120,
+      "loss": 0.1691
+    },
+    {
+      "step": 121,
+      "loss": 0.2357
+    },
+    {
+      "step": 122,
+      "loss": 0.1983
+    },
+    {
+      "step": 123,
+      "loss": 0.043
+    },
+    {
+      "step": 124,
+      "loss": 0.0069
+    },
+    {
+      "step": 125,
+      "loss": 0.0348
+    },
+    {
+      "step": 126,
+      "loss": 0.0071
+    },
+    {
+      "step": 127,
+      "loss": 0.0153
+    },
+    {
+      "step": 128,
+      "loss": 0.0071
+    },
+    {
+      "step": 129,
+      "loss": 0.114
+    },
+    {
+      "step": 130,
+      "loss": 0.027
+    },
+    {
+      "step": 131,
+      "loss": 0.2155
+    },
+    {
+      "step": 132,
+      "loss": 0.0732
+    },
+    {
+      "step": 133,
+      "loss": 0.0479
+    },
+    {
+      "step": 134,
+      "loss": 0.1001
+    },
+    {
+      "step": 135,
+      "loss": 0.0795
+    },
+    {
+      "step": 136,
+      "loss": 0.0143
+    },
+    {
+      "step": 137,
+      "loss": 0.025
+    },
+    {
+      "step": 138,
+      "loss": 0.0375
+    },
+    {
+      "step": 139,
+      "loss": 0.0464
+    },
+    {
+      "step": 140,
+      "loss": 0.135
+    },
+    {
+      "step": 141,
+      "loss": 0.0437
+    },
+    {
+      "step": 142,
+      "loss": 0.1026
+    },
+    {
+      "step": 143,
+      "loss": 0.104
+    },
+    {
+      "step": 144,
+      "loss": 0.16
+    },
+    {
+      "step": 145,
+      "loss": 0.1841
+    },
+    {
+      "step": 146,
+      "loss": 0.2612
+    },
+    {
+      "step": 147,
+      "loss": 0.047
+    },
+    {
+      "step": 148,
+      "loss": 0.0236
+    },
+    {
+      "step": 149,
+      "loss": 0.061
+    },
+    {
+      "step": 150,
+      "loss": 0.3053
+    },
+    {
+      "step": 151,
+      "loss": 0.1405
+    },
+    {
+      "step": 152,
+      "loss": 0.1131
+    },
+    {
+      "step": 153,
+      "loss": 0.0147
+    },
+    {
+      "step": 154,
+      "loss": 0.0888
+    },
+    {
+      "step": 155,
+      "loss": 0.0136
+    },
+    {
+      "step": 156,
+      "loss": 0.0091
+    },
+    {
+      "step": 157,
+      "loss": 0.011
+    },
+    {
+      "step": 158,
+      "loss": 0.0063
+    },
+    {
+      "step": 159,
+      "loss": 0.014
+    },
+    {
+      "step": 160,
+      "loss": 0.0344
+    },
+    {
+      "step": 161,
+      "loss": 0.0981
+    },
+    {
+      "step": 162,
+      "loss": 0.1965
+    },
+    {
+      "step": 163,
+      "loss": 0.0887
+    },
+    {
+      "step": 164,
+      "loss": 0.0409
+    },
+    {
+      "step": 165,
+      "loss": 0.0588
+    },
+    {
+      "step": 166,
+      "loss": 0.0632
+    },
+    {
+      "step": 167,
+      "loss": 0.0903
+    },
+    {
+      "step": 168,
+      "loss": 0.0796
+    },
+    {
+      "step": 169,
+      "loss": 0.0415
+    },
+    {
+      "step": 170,
+      "loss": 0.0989
+    },
+    {
+      "step": 171,
+      "loss": 0.0974
+    },
+    {
+      "step": 172,
+      "loss": 0.0363
+    },
+    {
+      "step": 173,
+      "loss": 0.0086
+    },
+    {
+      "step": 174,
+      "loss": 0.005
+    },
+    {
+      "step": 175,
+      "loss": 0.1765
+    },
+    {
+      "step": 176,
+      "loss": 0.2432
+    },
+    {
+      "step": 177,
+      "loss": 0.0805
+    },
+    {
+      "step": 178,
+      "loss": 0.0227
+    },
+    {
+      "step": 179,
+      "loss": 0.3566
+    },
+    {
+      "step": 180,
+      "loss": 0.0232
+    },
+    {
+      "step": 181,
+      "loss": 0.1591
+    },
+    {
+      "step": 182,
+      "loss": 0.0734
+    },
+    {
+      "step": 183,
+      "loss": 0.0099
+    },
+    {
+      "step": 184,
+      "loss": 0.0127
+    },
+    {
+      "step": 185,
+      "loss": 0.0028
+    },
+    {
+      "step": 186,
+      "loss": 0.0022
+    },
+    {
+      "step": 187,
+      "loss": 0.0082
+    },
+    {
+      "step": 188,
+      "loss": 0.006
+    },
+    {
+      "step": 189,
+      "loss": 0.0506
+    },
+    {
+      "step": 190,
+      "loss": 0.0107
+    },
+    {
+      "step": 191,
+      "loss": 0.0287
+    },
+    {
+      "step": 192,
+      "loss": 0.0021
+    },
+    {
+      "step": 193,
+      "loss": 0.1491
+    },
+    {
+      "step": 194,
+      "loss": 0.1414
+    },
+    {
+      "step": 195,
+      "loss": 0.003
+    },
+    {
+      "step": 196,
+      "loss": 0.0506
+    },
+    {
+      "step": 197,
+      "loss": 0.0182
+    },
+    {
+      "step": 198,
+      "loss": 0.073
+    },
+    {
+      "step": 199,
+      "loss": 0.0265
+    },
+    {
+      "step": 200,
+      "loss": 0.0079
+    },
+    {
+      "step": 201,
+      "loss": 0.0569
+    },
+    {
+      "step": 202,
+      "loss": 0.0138
+    },
+    {
+      "step": 203,
+      "loss": 0.0001
+    },
+    {
+      "step": 204,
+      "loss": 0.2567
+    },
+    {
+      "step": 205,
+      "loss": 0.0002
+    },
+    {
+      "step": 206,
+      "loss": 0.0142
+    },
+    {
+      "step": 207,
+      "loss": 0.0546
+    },
+    {
+      "step": 208,
+      "loss": 0.0485
+    },
+    {
+      "step": 209,
+      "loss": 0.1518
+    },
+    {
+      "step": 210,
+      "loss": 0.1869
+    },
+    {
+      "step": 211,
+      "loss": 0.2424
+    },
+    {
+      "step": 212,
+      "loss": 0.0379
+    },
+    {
+      "step": 213,
+      "loss": 0.0088
+    },
+    {
+      "step": 214,
+      "loss": 0.001
+    },
+    {
+      "step": 215,
+      "loss": 0.0102
+    },
+    {
+      "step": 216,
+      "loss": 0.1525
+    },
+    {
+      "step": 217,
+      "loss": 0.0018
+    },
+    {
+      "step": 218,
+      "loss": 0.0026
+    },
+    {
+      "step": 219,
+      "loss": 0.0031
+    },
+    {
+      "step": 220,
+      "loss": 0.0206
+    },
+    {
+      "step": 221,
+      "loss": 0.1205
+    },
+    {
+      "step": 222,
+      "loss": 0.01
+    },
+    {
+      "step": 223,
+      "loss": 0.0597
+    },
+    {
+      "step": 224,
+      "loss": 0.1187
+    },
+    {
+      "step": 225,
+      "loss": 0.0635
+    },
+    {
+      "step": 226,
+      "loss": 0.1923
+    },
+    {
+      "step": 227,
+      "loss": 0.0098
+    },
+    {
+      "step": 228,
+      "loss": 0.0003
+    },
+    {
+      "step": 229,
+      "loss": 0.0434
+    },
+    {
+      "step": 230,
+      "loss": 0.005
+    },
+    {
+      "step": 231,
+      "loss": 0.0029
+    },
+    {
+      "step": 232,
+      "loss": 0.0043
+    },
+    {
+      "step": 233,
+      "loss": 0.0057
+    },
+    {
+      "step": 234,
+      "loss": 0.0003
+    },
+    {
+      "step": 235,
+      "loss": 0.0005
+    },
+    {
+      "step": 236,
+      "loss": 0.0244
+    },
+    {
+      "step": 237,
+      "loss": 0.0776
+    },
+    {
+      "step": 238,
+      "loss": 0.0203
+    },
+    {
+      "step": 239,
+      "loss": 0.2391
+    },
+    {
+      "step": 240,
+      "loss": 0.0533
+    },
+    {
+      "step": 241,
+      "loss": 0.0536
+    },
+    {
+      "step": 242,
+      "loss": 0.0045
+    },
+    {
+      "step": 243,
+      "loss": 0.0131
+    },
+    {
+      "step": 244,
+      "loss": 0.0037
+    },
+    {
+      "step": 245,
+      "loss": 0.0182
+    },
+    {
+      "step": 246,
+      "loss": 0.0078
+    },
+    {
+      "step": 247,
+      "loss": 0.0413
+    },
+    {
+      "step": 248,
+      "loss": 0.1189
+    },
+    {
+      "step": 249,
+      "loss": 0.0267
+    },
+    {
+      "step": 250,
+      "loss": 0.0507
+    },
+    {
+      "step": 251,
+      "loss": 0.0261
+    },
+    {
+      "step": 252,
+      "loss": 0.0043
+    },
+    {
+      "step": 253,
+      "loss": 0.1643
+    },
+    {
+      "step": 254,
+      "loss": 0.4695
+    },
+    {
+      "step": 255,
+      "loss": 0.0008
+    },
+    {
+      "step": 256,
+      "loss": 0.0066
+    },
+    {
+      "step": 257,
+      "loss": 0.0042
+    },
+    {
+      "step": 258,
+      "loss": 0.2334
+    },
+    {
+      "step": 259,
+      "loss": 0.0046
+    },
+    {
+      "step": 260,
+      "loss": 0.0207
+    },
+    {
+      "step": 261,
+      "loss": 0.0356
+    },
+    {
+      "step": 262,
+      "loss": 0.0008
+    },
+    {
+      "step": 263,
+      "loss": 0.0371
+    },
+    {
+      "step": 264,
+      "loss": 0.0009
+    },
+    {
+      "step": 265,
+      "loss": 0.0082
+    },
+    {
+      "step": 266,
+      "loss": 0.0259
+    },
+    {
+      "step": 267,
+      "loss": 0.0008
+    },
+    {
+      "step": 268,
+      "loss": 0.0002
+    },
+    {
+      "step": 269,
+      "loss": 0.0011
+    },
+    {
+      "step": 270,
+      "loss": 0.0207
+    },
+    {
+      "step": 271,
+      "loss": 0.001
+    },
+    {
+      "step": 272,
+      "loss": 0.0017
+    },
+    {
+      "step": 273,
+      "loss": 0.0622
+    },
+    {
+      "step": 274,
+      "loss": 0.0083
+    },
+    {
+      "step": 275,
+      "loss": 0.0025
+    },
+    {
+      "step": 276,
+      "loss": 0.0006
+    },
+    {
+      "step": 277,
+      "loss": 0.0452
+    },
+    {
+      "step": 278,
+      "loss": 0.0009
+    },
+    {
+      "step": 279,
+      "loss": 0.0005
+    },
+    {
+      "step": 280,
+      "loss": 0.0005
+    },
+    {
+      "step": 281,
+      "loss": 0.0019
+    },
+    {
+      "step": 282,
+      "loss": 0.062
+    },
+    {
+      "step": 283,
+      "loss": 0.0027
+    },
+    {
+      "step": 284,
+      "loss": 0.0376
+    },
+    {
+      "step": 285,
+      "loss": 0.0
+    },
+    {
+      "step": 286,
+      "loss": 0.001
+    },
+    {
+      "step": 287,
+      "loss": 0.0219
+    },
+    {
+      "step": 288,
+      "loss": 0.0077
+    },
+    {
+      "step": 289,
+      "loss": 0.0055
+    },
+    {
+      "step": 290,
+      "loss": 0.1378
+    },
+    {
+      "step": 291,
+      "loss": 0.0117
+    },
+    {
+      "step": 292,
+      "loss": 0.0976
+    },
+    {
+      "step": 293,
+      "loss": 0.0026
+    },
+    {
+      "step": 294,
+      "loss": 0.0037
+    },
+    {
+      "step": 295,
+      "loss": 0.113
+    },
+    {
+      "step": 296,
+      "loss": 0.047
+    },
+    {
+      "step": 297,
+      "loss": 0.0004
+    },
+    {
+      "step": 298,
+      "loss": 0.0001
+    },
+    {
+      "step": 299,
+      "loss": 0.0001
+    },
+    {
+      "step": 300,
+      "loss": 0.0045
+    },
+    {
+      "step": 301,
+      "loss": 0.1614
+    },
+    {
+      "step": 302,
+      "loss": 0.259
+    },
+    {
+      "step": 303,
+      "loss": 0.0775
+    },
+    {
+      "step": 304,
+      "loss": 0.0084
+    },
+    {
+      "step": 305,
+      "loss": 0.0198
+    },
+    {
+      "step": 306,
+      "loss": 0.0668
+    },
+    {
+      "step": 307,
+      "loss": 0.0047
+    },
+    {
+      "step": 308,
+      "loss": 0.2898
+    },
+    {
+      "step": 309,
+      "loss": 0.0158
+    },
+    {
+      "step": 310,
+      "loss": 0.3233
+    },
+    {
+      "step": 311,
+      "loss": 0.0631
+    },
+    {
+      "step": 312,
+      "loss": 0.0017
+    },
+    {
+      "step": 313,
+      "loss": 0.0414
+    },
+    {
+      "step": 314,
+      "loss": 0.0002
+    },
+    {
+      "step": 315,
+      "loss": 0.0002
+    },
+    {
+      "step": 316,
+      "loss": 0.0124
+    },
+    {
+      "step": 317,
+      "loss": 0.0658
+    },
+    {
+      "step": 318,
+      "loss": 0.0008
+    },
+    {
+      "step": 319,
+      "loss": 0.0021
+    },
+    {
+      "step": 320,
+      "loss": 0.0011
+    }
+  ],
+  "eval_loss": [
+    {
+      "step": 200,
+      "eval_loss": 0.07848864793777466
+    }
+  ],
+  "args": {
+    "lora_r": 16,
+    "lora_alpha": 16,
+    "lora_dropout": 0.05,
+    "lora_target_modules": "q_proj,v_proj",
+    "tuning_strategy": "lora",
+    "num_trainable_layers": 2,
+    "output_dir": "sft_prefill/prompt_id_3/qwen2-VL-7B-Instruct-syn-count-lora",
+    "num_train_epochs": 2,
+    "learning_rate": 0.0002,
+    "per_device_train_batch_size": 16,
+    "per_device_eval_batch_size": 16,
+    "gradient_accumulation_steps": 1,
+    "logging_steps": 10,
+    "eval_steps": 200,
+    "save_steps": 200,
+    "warmup_ratio": 0.03,
+    "weight_decay": 0.0,
+    "max_grad_norm": 0.3,
+    "lr_scheduler_type": "constant",
+    "bf16": true,
+    "tf32": true,
+    "gradient_checkpointing": true,
+    "optim": "adamw_torch_fused",
+    "ft_type": "SFT_Prefill",
+    "data_type": "small",
+    "prompt_id": 3
+  }
+}

vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff