diff --git a/.gitattributes b/.gitattributes index a6344aac8c09253b3b630fb776ae94478aa0275b..99689f2064ae7705151b634b83c849b44e127847 100644 --- a/.gitattributes +++ b/.gitattributes @@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text +checkpoint-1976/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-2470/tokenizer.json filter=lfs diff=lfs merge=lfs -text +tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/.ipynb_checkpoints/README-checkpoint.md b/.ipynb_checkpoints/README-checkpoint.md new file mode 100644 index 0000000000000000000000000000000000000000..bbcf180165d2e37cca437506237ec52d6ff25aae --- /dev/null +++ b/.ipynb_checkpoints/README-checkpoint.md @@ -0,0 +1,61 @@ +--- +library_name: peft +license: other +base_model: meta-llama/Llama-3.3-70B-Instruct +tags: +- llama-factory +- lora +- generated_from_trainer +model-index: +- name: saves_fullp_ds3 + results: [] +--- + + + +# saves_fullp_ds3 + +This model is a fine-tuned version of [meta-llama/Llama-3.3-70B-Instruct](https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct) on the data_short_chunk dataset. + +## Model description + +More information needed + +## Intended uses & limitations + +More information needed + +## Training and evaluation data + +More information needed + +## Training procedure + +### Training hyperparameters + +The following hyperparameters were used during training: +- learning_rate: 1e-05 +- train_batch_size: 1 +- eval_batch_size: 8 +- seed: 42 +- distributed_type: multi-GPU +- num_devices: 8 +- total_train_batch_size: 8 +- total_eval_batch_size: 64 +- optimizer: Use OptimizerNames.ADAMW_TORCH with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments +- lr_scheduler_type: cosine +- lr_scheduler_warmup_ratio: 0.1 +- num_epochs: 10.0 + +### Training results + + + +### Framework versions + +- PEFT 0.15.2 +- Transformers 4.51.3 +- Pytorch 2.4.1+cu124 +- Datasets 3.6.0 +- Tokenizers 0.21.1 \ No newline at end of file diff --git a/.ipynb_checkpoints/training_loss-checkpoint.png b/.ipynb_checkpoints/training_loss-checkpoint.png new file mode 100644 index 0000000000000000000000000000000000000000..b726123e33dd1390382a6a679ce48247ee002686 Binary files /dev/null and b/.ipynb_checkpoints/training_loss-checkpoint.png differ diff --git a/README.md b/README.md new file mode 100644 index 0000000000000000000000000000000000000000..bbcf180165d2e37cca437506237ec52d6ff25aae --- /dev/null +++ b/README.md @@ -0,0 +1,61 @@ +--- +library_name: peft +license: other +base_model: meta-llama/Llama-3.3-70B-Instruct +tags: +- llama-factory +- lora +- generated_from_trainer +model-index: +- name: saves_fullp_ds3 + results: [] +--- + + + +# saves_fullp_ds3 + +This model is a fine-tuned version of [meta-llama/Llama-3.3-70B-Instruct](https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct) on the data_short_chunk dataset.
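A minimal loading sketch with Transformers and PEFT, assuming the rank-8 LoRA adapter in this repository is applied on top of the base model; the `adapter_id` path and the example prompt below are placeholders, not values taken from this repo:

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

base_id = "meta-llama/Llama-3.3-70B-Instruct"   # base model named in the card metadata
adapter_id = "path/to/saves_fullp_ds3"          # placeholder: local path or Hub id of this adapter repo

# Load the base model (bf16, sharded across available GPUs) and its tokenizer.
tokenizer = AutoTokenizer.from_pretrained(base_id)
base = AutoModelForCausalLM.from_pretrained(base_id, torch_dtype=torch.bfloat16, device_map="auto")

# Apply the LoRA weights stored in adapter_config.json / adapter_model.safetensors.
model = PeftModel.from_pretrained(base, adapter_id)

# Illustrative prompt only; replace with a real request.
messages = [{"role": "user", "content": "Summarize the following text: ..."}]
input_ids = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt").to(base.device)
output_ids = model.generate(input_ids, max_new_tokens=128)
print(tokenizer.decode(output_ids[0][input_ids.shape[-1]:], skip_special_tokens=True))
```

If a standalone checkpoint is preferred, the adapter can be folded into the base weights with `model.merge_and_unload()` before saving.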
+ +## Model description + +More information needed + +## Intended uses & limitations + +More information needed + +## Training and evaluation data + +More information needed + +## Training procedure + +### Training hyperparameters + +The following hyperparameters were used during training: +- learning_rate: 1e-05 +- train_batch_size: 1 +- eval_batch_size: 8 +- seed: 42 +- distributed_type: multi-GPU +- num_devices: 8 +- total_train_batch_size: 8 +- total_eval_batch_size: 64 +- optimizer: Use OptimizerNames.ADAMW_TORCH with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments +- lr_scheduler_type: cosine +- lr_scheduler_warmup_ratio: 0.1 +- num_epochs: 10.0 + +### Training results + + + +### Framework versions + +- PEFT 0.15.2 +- Transformers 4.51.3 +- Pytorch 2.4.1+cu124 +- Datasets 3.6.0 +- Tokenizers 0.21.1 \ No newline at end of file diff --git a/adapter_config.json b/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..066439893af62fc73d76190c54bbd164236f3c3f --- /dev/null +++ b/adapter_config.json @@ -0,0 +1,39 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/workspace/llms/Llama/Llama-3.3-70B-Instruct", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0.0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "o_proj", + "k_proj", + "q_proj", + "v_proj", + "down_proj", + "up_proj", + "gate_proj" + ], + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/adapter_model.safetensors b/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3a50b59e024ea094364c047037eda8ad502bc535 --- /dev/null +++ b/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e4759d9f6e45e8d965c676ba9f62be02b36b985ce2c373942f50ac32b1690f35 +size 207244392 diff --git a/all_results.json b/all_results.json new file mode 100644 index 0000000000000000000000000000000000000000..896e55bff219afc5b3341922c117c6239e776b4f --- /dev/null +++ b/all_results.json @@ -0,0 +1,8 @@ +{ + "epoch": 10.0, + "total_flos": 750948524032000.0, + "train_loss": 1.639819175053222, + "train_runtime": 7687.531, + "train_samples_per_second": 2.57, + "train_steps_per_second": 0.321 +} \ No newline at end of file diff --git a/checkpoint-1976/README.md b/checkpoint-1976/README.md new file mode 100644 index 0000000000000000000000000000000000000000..55b2fbb7c5bc2d687ad0ac5e2fdc9358b78dd42a --- /dev/null +++ b/checkpoint-1976/README.md @@ -0,0 +1,202 @@ +--- +base_model: /workspace/llms/Llama/Llama-3.3-70B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model 
[optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.15.2 \ No newline at end of file diff --git a/checkpoint-1976/adapter_config.json b/checkpoint-1976/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..066439893af62fc73d76190c54bbd164236f3c3f --- /dev/null +++ b/checkpoint-1976/adapter_config.json @@ -0,0 +1,39 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/workspace/llms/Llama/Llama-3.3-70B-Instruct", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0.0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + 
"o_proj", + "k_proj", + "q_proj", + "v_proj", + "down_proj", + "up_proj", + "gate_proj" + ], + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-1976/adapter_model.safetensors b/checkpoint-1976/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..31e0132ecec3a9d0c4fa8a3e90530fffbb82fcdc --- /dev/null +++ b/checkpoint-1976/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f2adfb02459cdb05462efaaa4361e35a0476287c07645f9ec42e20612bfe0a2 +size 207244392 diff --git a/checkpoint-1976/global_step1976/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/checkpoint-1976/global_step1976/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e112f1dc5f82153b43f37db501220904142894df --- /dev/null +++ b/checkpoint-1976/global_step1976/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5c83707a20845cfc060f8ca2574af60b8608b251f4685b502482e37b779d7dd4 +size 155324144 diff --git a/checkpoint-1976/global_step1976/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/checkpoint-1976/global_step1976/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b465e72f1009c0b5bca1c93a667c67dbb4c35bb8 --- /dev/null +++ b/checkpoint-1976/global_step1976/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:26aa9cb8a17c56d94c78a0f9fe3c72284e9f917908e20dd1cbca2b205519087c +size 155324144 diff --git a/checkpoint-1976/global_step1976/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/checkpoint-1976/global_step1976/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..68d6a41e14571dd21ac16468fab95834ab46a171 --- /dev/null +++ b/checkpoint-1976/global_step1976/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d0d5899dfae4c650af86251717711e708256259c98770f049a69ce3b02f886fe +size 155324144 diff --git a/checkpoint-1976/global_step1976/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/checkpoint-1976/global_step1976/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5faba48124b36375d24c51d05d8c37d1effb7574 --- /dev/null +++ b/checkpoint-1976/global_step1976/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:987a8b2903176897ff4562203067e63168039a218a36fe6216a442f0ee1cf315 +size 155324144 diff --git a/checkpoint-1976/global_step1976/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/checkpoint-1976/global_step1976/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3b477a750f37ff960ec076128b8bb089d71d2961 --- /dev/null +++ b/checkpoint-1976/global_step1976/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a28b5660a8055dd4b885c3e539b3e4d7ae4e9251f48cd3cef1ff61f3207eef7a +size 155324144 diff --git a/checkpoint-1976/global_step1976/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/checkpoint-1976/global_step1976/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 
0000000000000000000000000000000000000000..694ae310f3e9f404d74821681ba91b00a82addd0 --- /dev/null +++ b/checkpoint-1976/global_step1976/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0d2952861cbe30620d7836ba19844c5def9752caab3bd3e67121e94db2f1cfb7 +size 155324144 diff --git a/checkpoint-1976/global_step1976/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/checkpoint-1976/global_step1976/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cf3eb1d6575f0ab632c5c81c2385dcc3230b9a3b --- /dev/null +++ b/checkpoint-1976/global_step1976/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5f65ccb152a2427355b454c8af322e9a408cd7bdf6025c86f62fa6ab020916f2 +size 155324144 diff --git a/checkpoint-1976/global_step1976/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/checkpoint-1976/global_step1976/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b8770442f984371b1b9a747af333196bd4196c6a --- /dev/null +++ b/checkpoint-1976/global_step1976/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:20a640e4a3c2ed8f42673fa5a9eaa9057e881eeae6c4495a9971d5cf83faa4ce +size 155324144 diff --git a/checkpoint-1976/global_step1976/zero_pp_rank_0_mp_rank_00_model_states.pt b/checkpoint-1976/global_step1976/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..16db8fd9944bb8377978af25e41b71c60680fcb2 --- /dev/null +++ b/checkpoint-1976/global_step1976/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8a076608d13b1e49c5b8d5e479873b3071e3084ee2a13ec8f7b9f2d6547122a3 +size 1107654 diff --git a/checkpoint-1976/global_step1976/zero_pp_rank_1_mp_rank_00_model_states.pt b/checkpoint-1976/global_step1976/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9e28ab6a16eeb18489a747fbe3e8ceda7567c0a2 --- /dev/null +++ b/checkpoint-1976/global_step1976/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7edc091a40ee0b9625860c25d1f5c804b768f6e3397c94c3a07b933c942795b9 +size 1107654 diff --git a/checkpoint-1976/global_step1976/zero_pp_rank_2_mp_rank_00_model_states.pt b/checkpoint-1976/global_step1976/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f6fad17d57551c30f02ba5269ef9d77b31856c50 --- /dev/null +++ b/checkpoint-1976/global_step1976/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eb356b884e175bdd0dd1ea33e76a1854422133502463933df314640f8121df3d +size 1107654 diff --git a/checkpoint-1976/global_step1976/zero_pp_rank_3_mp_rank_00_model_states.pt b/checkpoint-1976/global_step1976/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c8043287b0519e9d01062e9ef09a659e601d7b33 --- /dev/null +++ b/checkpoint-1976/global_step1976/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9be29bc5dfc2aa9b97c622808068f9c4c9f048a7d93f60c90fb8f296b867f8ae +size 1107654 diff --git 
a/checkpoint-1976/global_step1976/zero_pp_rank_4_mp_rank_00_model_states.pt b/checkpoint-1976/global_step1976/zero_pp_rank_4_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ecd01f7a2611e1cb8fd3637fd8c0f1eea1e2bcc3 --- /dev/null +++ b/checkpoint-1976/global_step1976/zero_pp_rank_4_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6c74312a103f0d0ab4cf96479e01a5f1aea10abc4e37ec4f96137a6dd031512c +size 1107654 diff --git a/checkpoint-1976/global_step1976/zero_pp_rank_5_mp_rank_00_model_states.pt b/checkpoint-1976/global_step1976/zero_pp_rank_5_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c45efb6be60879303d21261e51277fb45efecf66 --- /dev/null +++ b/checkpoint-1976/global_step1976/zero_pp_rank_5_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e34ed257f37ac3d6ee6d2ea6af897497d5b7fbccc7b54d044e5efbbbdc702cce +size 1107654 diff --git a/checkpoint-1976/global_step1976/zero_pp_rank_6_mp_rank_00_model_states.pt b/checkpoint-1976/global_step1976/zero_pp_rank_6_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..277eb3a6185c60e576aa0eeeae8733979ba6e5c5 --- /dev/null +++ b/checkpoint-1976/global_step1976/zero_pp_rank_6_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2b37a5d11c13f54241747f361536d07d5cbe85f5c33676594bd07fc6fcd50c77 +size 1107654 diff --git a/checkpoint-1976/global_step1976/zero_pp_rank_7_mp_rank_00_model_states.pt b/checkpoint-1976/global_step1976/zero_pp_rank_7_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9241ebfe0b1b3a5adbf248f270abae9aec8f8363 --- /dev/null +++ b/checkpoint-1976/global_step1976/zero_pp_rank_7_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6ef7b70623aec703342b4a85a2e53e78ddda9d19dc10f4156e7dd64b23267cf2 +size 1107654 diff --git a/checkpoint-1976/latest b/checkpoint-1976/latest new file mode 100644 index 0000000000000000000000000000000000000000..544f68503ce3a470b8001ccd75000a197d21e271 --- /dev/null +++ b/checkpoint-1976/latest @@ -0,0 +1 @@ +global_step1976 \ No newline at end of file diff --git a/checkpoint-1976/rng_state_0.pth b/checkpoint-1976/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..6a74f25da28f01a2e6b66587824ee5f5cc9be737 --- /dev/null +++ b/checkpoint-1976/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ee195ebde9bf012f945f068f133e7fe22fef5450c496607e3ef11cc2034a186 +size 15984 diff --git a/checkpoint-1976/rng_state_1.pth b/checkpoint-1976/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..f44ddc47315653477728c971b4ea191a3df8b92c --- /dev/null +++ b/checkpoint-1976/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf0fe1a3315d60b197207c5cb249d0ce4f9ce6d7585e696276d9ffbcb5379893 +size 15984 diff --git a/checkpoint-1976/rng_state_2.pth b/checkpoint-1976/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..04636b9eca6484a4339eaa1e3acdf15d42d493b3 --- /dev/null +++ b/checkpoint-1976/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:01c5bd6eae04542162b3e94245555bd81312524066bc01d0ebbfc4fd8554240e +size 15984 diff --git a/checkpoint-1976/rng_state_3.pth 
b/checkpoint-1976/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..05435e407541728c3159054a4beb6705039a8ddf --- /dev/null +++ b/checkpoint-1976/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:45b74942c68b00d657cfce186b0eeb4aa8f52efa04b114803b605fee8de45972 +size 15984 diff --git a/checkpoint-1976/rng_state_4.pth b/checkpoint-1976/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..94fdf5f2c3e5df27424e6482bf52255531147a23 --- /dev/null +++ b/checkpoint-1976/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0cd66dd2ba958fc9929441817d8154abbd929c0aa9cd66ff3171965bdaaf5d78 +size 15984 diff --git a/checkpoint-1976/rng_state_5.pth b/checkpoint-1976/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..da6e37fc011d97a1512e1e746bdd410a738c018a --- /dev/null +++ b/checkpoint-1976/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:89eeedefdd62514d0130acc330a5c08e9774c95d38c60997905cfd65fc54b710 +size 15984 diff --git a/checkpoint-1976/rng_state_6.pth b/checkpoint-1976/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..751fd85c617e15dee9713bc0f0c533af5bd18c8e --- /dev/null +++ b/checkpoint-1976/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f43ced939100082608f57561a10e1888e69210c80675068db530c5815889910e +size 15984 diff --git a/checkpoint-1976/rng_state_7.pth b/checkpoint-1976/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..4aacf54fa8285b7e199a7cd62f1ee3d8b9beb5e5 --- /dev/null +++ b/checkpoint-1976/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0d8d6ee244d99525e7004ae3f02d44ae63082d81fbbab7306f641ac6aeeb736f +size 15984 diff --git a/checkpoint-1976/scheduler.pt b/checkpoint-1976/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..6a2f9e0492757703f7332095f086682c6525c68c --- /dev/null +++ b/checkpoint-1976/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c60cbbd964026139034626f734281459b8bb3677157bcb494c6d6e932b364a81 +size 1064 diff --git a/checkpoint-1976/special_tokens_map.json b/checkpoint-1976/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..3c1d04911c269b925af977a3151c9704e990e4d0 --- /dev/null +++ b/checkpoint-1976/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-1976/tokenizer.json b/checkpoint-1976/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/checkpoint-1976/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/checkpoint-1976/tokenizer_config.json b/checkpoint-1976/tokenizer_config.json new file mode 100644 index 
0000000000000000000000000000000000000000..a65829f8d45598369efc368800ef14b5dbd9f997 --- /dev/null +++ b/checkpoint-1976/tokenizer_config.json @@ -0,0 +1,2066 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": 
false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": 
false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + 
"content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": 
"<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": 
"<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": 
"<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": 
"<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": 
"<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": 
"<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": 
"<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": 
"<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- set date_string = \"26 Jul 2024\" %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message + builtin tools #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if builtin_tools is defined or tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{%- if builtin_tools is defined %}\n {{- \"Tools: \" + builtin_tools | reject('equalto', 'code_interpreter') | join(\", \") + \"\\n\\n\"}}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {%- if builtin_tools is defined and tool_call.name in builtin_tools %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- \"<|python_tag|>\" + tool_call.name + \".call(\" }}\n {%- for arg_name, arg_val in tool_call.arguments | items %}\n {{- arg_name + '=\"' + arg_val + '\"' }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \")\" }}\n {%- else %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {%- endif %}\n {%- if builtin_tools is defined %}\n {#- This means we're in ipython mode #}\n {{- \"<|eom_id|>\" }}\n {%- else %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|finetune_right_pad_id|>", + "padding_side": "right", + "split_special_tokens": false, + "tokenizer_class": "PreTrainedTokenizer" +} diff --git a/checkpoint-1976/trainer_state.json b/checkpoint-1976/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..23bf7917e6e9f65f4fcb033dc06921fbd66373a3 --- /dev/null +++ b/checkpoint-1976/trainer_state.json @@ -0,0 +1,13866 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 8.0, + "eval_steps": 500, + "global_step": 1976, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.004048582995951417, + "grad_norm": 0.46473725687854356, + "learning_rate": 0.0, + "loss": 2.5926, + "step": 1 + }, + { + "epoch": 0.008097165991902834, + "grad_norm": 0.7862315968268553, + "learning_rate": 4.0485829959514176e-08, + "loss": 2.9114, + "step": 2 + }, + { + "epoch": 0.012145748987854251, + "grad_norm": 0.6677933506680473, + "learning_rate": 8.097165991902835e-08, + "loss": 2.7471, + "step": 3 + }, + { + "epoch": 0.016194331983805668, + "grad_norm": 0.8630518959378011, + "learning_rate": 1.2145748987854252e-07, + "loss": 2.8706, + "step": 4 + }, + { + "epoch": 0.020242914979757085, + "grad_norm": 0.5173190139924537, + "learning_rate": 1.619433198380567e-07, + "loss": 2.9912, + "step": 5 + }, + { + "epoch": 0.024291497975708502, 
+ "grad_norm": 0.7759993718339214, + "learning_rate": 2.0242914979757086e-07, + "loss": 3.0072, + "step": 6 + }, + { + "epoch": 0.02834008097165992, + "grad_norm": 1.3755130452390263, + "learning_rate": 2.4291497975708504e-07, + "loss": 2.4721, + "step": 7 + }, + { + "epoch": 0.032388663967611336, + "grad_norm": 0.44121276912866286, + "learning_rate": 2.834008097165992e-07, + "loss": 2.843, + "step": 8 + }, + { + "epoch": 0.03643724696356275, + "grad_norm": 0.5559835506705462, + "learning_rate": 3.238866396761134e-07, + "loss": 2.9053, + "step": 9 + }, + { + "epoch": 0.04048582995951417, + "grad_norm": 0.6731704914870359, + "learning_rate": 3.6437246963562754e-07, + "loss": 2.7608, + "step": 10 + }, + { + "epoch": 0.044534412955465584, + "grad_norm": 0.43190024730085624, + "learning_rate": 4.048582995951417e-07, + "loss": 2.7074, + "step": 11 + }, + { + "epoch": 0.048582995951417005, + "grad_norm": 0.7594718614486027, + "learning_rate": 4.453441295546559e-07, + "loss": 2.7846, + "step": 12 + }, + { + "epoch": 0.05263157894736842, + "grad_norm": 0.4278958670654092, + "learning_rate": 4.858299595141701e-07, + "loss": 3.018, + "step": 13 + }, + { + "epoch": 0.05668016194331984, + "grad_norm": 0.48698492939265825, + "learning_rate": 5.263157894736843e-07, + "loss": 2.8131, + "step": 14 + }, + { + "epoch": 0.06072874493927125, + "grad_norm": 0.405274105300616, + "learning_rate": 5.668016194331984e-07, + "loss": 2.8777, + "step": 15 + }, + { + "epoch": 0.06477732793522267, + "grad_norm": 0.5554327831452092, + "learning_rate": 6.072874493927125e-07, + "loss": 2.9472, + "step": 16 + }, + { + "epoch": 0.06882591093117409, + "grad_norm": 0.44756530277540646, + "learning_rate": 6.477732793522268e-07, + "loss": 3.0157, + "step": 17 + }, + { + "epoch": 0.0728744939271255, + "grad_norm": 0.8072585997136504, + "learning_rate": 6.882591093117409e-07, + "loss": 2.7773, + "step": 18 + }, + { + "epoch": 0.07692307692307693, + "grad_norm": 0.5635933276885046, + "learning_rate": 7.287449392712551e-07, + "loss": 2.7169, + "step": 19 + }, + { + "epoch": 0.08097165991902834, + "grad_norm": 0.4673928500608582, + "learning_rate": 7.692307692307694e-07, + "loss": 2.7934, + "step": 20 + }, + { + "epoch": 0.08502024291497975, + "grad_norm": 1.3664880257539318, + "learning_rate": 8.097165991902834e-07, + "loss": 2.713, + "step": 21 + }, + { + "epoch": 0.08906882591093117, + "grad_norm": 0.6438340318121762, + "learning_rate": 8.502024291497976e-07, + "loss": 2.8722, + "step": 22 + }, + { + "epoch": 0.0931174089068826, + "grad_norm": 0.512121787489251, + "learning_rate": 8.906882591093118e-07, + "loss": 2.722, + "step": 23 + }, + { + "epoch": 0.09716599190283401, + "grad_norm": 1.023552604444706, + "learning_rate": 9.31174089068826e-07, + "loss": 2.5291, + "step": 24 + }, + { + "epoch": 0.10121457489878542, + "grad_norm": 0.556430330792241, + "learning_rate": 9.716599190283402e-07, + "loss": 2.7028, + "step": 25 + }, + { + "epoch": 0.10526315789473684, + "grad_norm": 1.0165779263195185, + "learning_rate": 1.0121457489878542e-06, + "loss": 2.7946, + "step": 26 + }, + { + "epoch": 0.10931174089068826, + "grad_norm": 0.8434539164732048, + "learning_rate": 1.0526315789473685e-06, + "loss": 2.6139, + "step": 27 + }, + { + "epoch": 0.11336032388663968, + "grad_norm": 0.6252954896694622, + "learning_rate": 1.0931174089068828e-06, + "loss": 2.469, + "step": 28 + }, + { + "epoch": 0.11740890688259109, + "grad_norm": 0.8618444900481227, + "learning_rate": 1.133603238866397e-06, + "loss": 2.6452, + "step": 29 + }, + { + "epoch": 
0.1214574898785425, + "grad_norm": 0.9066908581713439, + "learning_rate": 1.174089068825911e-06, + "loss": 2.4396, + "step": 30 + }, + { + "epoch": 0.12550607287449392, + "grad_norm": 0.528141325017682, + "learning_rate": 1.214574898785425e-06, + "loss": 2.469, + "step": 31 + }, + { + "epoch": 0.12955465587044535, + "grad_norm": 0.6378156052352336, + "learning_rate": 1.2550607287449393e-06, + "loss": 2.5795, + "step": 32 + }, + { + "epoch": 0.13360323886639677, + "grad_norm": 0.5624703100477139, + "learning_rate": 1.2955465587044536e-06, + "loss": 2.6768, + "step": 33 + }, + { + "epoch": 0.13765182186234817, + "grad_norm": 0.5821134471598685, + "learning_rate": 1.336032388663968e-06, + "loss": 2.8086, + "step": 34 + }, + { + "epoch": 0.1417004048582996, + "grad_norm": 0.6258194867082703, + "learning_rate": 1.3765182186234818e-06, + "loss": 2.3603, + "step": 35 + }, + { + "epoch": 0.145748987854251, + "grad_norm": 0.5477831289461287, + "learning_rate": 1.417004048582996e-06, + "loss": 2.7758, + "step": 36 + }, + { + "epoch": 0.14979757085020243, + "grad_norm": 0.5008051448479439, + "learning_rate": 1.4574898785425101e-06, + "loss": 2.7543, + "step": 37 + }, + { + "epoch": 0.15384615384615385, + "grad_norm": 0.5096264603702895, + "learning_rate": 1.4979757085020244e-06, + "loss": 2.7356, + "step": 38 + }, + { + "epoch": 0.15789473684210525, + "grad_norm": 0.6456644864025523, + "learning_rate": 1.5384615384615387e-06, + "loss": 3.0218, + "step": 39 + }, + { + "epoch": 0.16194331983805668, + "grad_norm": 0.5888424191973028, + "learning_rate": 1.5789473684210526e-06, + "loss": 2.6165, + "step": 40 + }, + { + "epoch": 0.1659919028340081, + "grad_norm": 0.7898553504446816, + "learning_rate": 1.6194331983805669e-06, + "loss": 2.6223, + "step": 41 + }, + { + "epoch": 0.1700404858299595, + "grad_norm": 0.6232472926548593, + "learning_rate": 1.6599190283400812e-06, + "loss": 2.7768, + "step": 42 + }, + { + "epoch": 0.17408906882591094, + "grad_norm": 0.6922764219271268, + "learning_rate": 1.7004048582995952e-06, + "loss": 2.479, + "step": 43 + }, + { + "epoch": 0.17813765182186234, + "grad_norm": 0.6679665416214551, + "learning_rate": 1.7408906882591095e-06, + "loss": 2.6842, + "step": 44 + }, + { + "epoch": 0.18218623481781376, + "grad_norm": 0.48868645690455986, + "learning_rate": 1.7813765182186236e-06, + "loss": 2.3611, + "step": 45 + }, + { + "epoch": 0.1862348178137652, + "grad_norm": 1.0959755351532565, + "learning_rate": 1.8218623481781379e-06, + "loss": 2.6644, + "step": 46 + }, + { + "epoch": 0.1902834008097166, + "grad_norm": 0.7403727047924632, + "learning_rate": 1.862348178137652e-06, + "loss": 2.7313, + "step": 47 + }, + { + "epoch": 0.19433198380566802, + "grad_norm": 0.5355809576361324, + "learning_rate": 1.902834008097166e-06, + "loss": 2.976, + "step": 48 + }, + { + "epoch": 0.19838056680161945, + "grad_norm": 0.6203117033335515, + "learning_rate": 1.9433198380566803e-06, + "loss": 2.8615, + "step": 49 + }, + { + "epoch": 0.20242914979757085, + "grad_norm": 0.6748602332749001, + "learning_rate": 1.9838056680161946e-06, + "loss": 2.7385, + "step": 50 + }, + { + "epoch": 0.20647773279352227, + "grad_norm": 0.6061522444778688, + "learning_rate": 2.0242914979757085e-06, + "loss": 2.7926, + "step": 51 + }, + { + "epoch": 0.21052631578947367, + "grad_norm": 0.5677094053210018, + "learning_rate": 2.0647773279352228e-06, + "loss": 2.8905, + "step": 52 + }, + { + "epoch": 0.2145748987854251, + "grad_norm": 0.7539663022721307, + "learning_rate": 2.105263157894737e-06, + "loss": 2.7044, + 
"step": 53 + }, + { + "epoch": 0.21862348178137653, + "grad_norm": 0.5511775427996539, + "learning_rate": 2.1457489878542513e-06, + "loss": 2.6044, + "step": 54 + }, + { + "epoch": 0.22267206477732793, + "grad_norm": 0.5001055873779205, + "learning_rate": 2.1862348178137656e-06, + "loss": 2.7154, + "step": 55 + }, + { + "epoch": 0.22672064777327935, + "grad_norm": 5.059433496293122, + "learning_rate": 2.2267206477732795e-06, + "loss": 2.6151, + "step": 56 + }, + { + "epoch": 0.23076923076923078, + "grad_norm": 0.5976992576491789, + "learning_rate": 2.267206477732794e-06, + "loss": 2.8561, + "step": 57 + }, + { + "epoch": 0.23481781376518218, + "grad_norm": 0.5650795458768608, + "learning_rate": 2.307692307692308e-06, + "loss": 2.994, + "step": 58 + }, + { + "epoch": 0.2388663967611336, + "grad_norm": 1.110043039226332, + "learning_rate": 2.348178137651822e-06, + "loss": 2.9581, + "step": 59 + }, + { + "epoch": 0.242914979757085, + "grad_norm": 0.8353821859752748, + "learning_rate": 2.3886639676113362e-06, + "loss": 2.9613, + "step": 60 + }, + { + "epoch": 0.24696356275303644, + "grad_norm": 0.7575324618871198, + "learning_rate": 2.42914979757085e-06, + "loss": 2.7295, + "step": 61 + }, + { + "epoch": 0.25101214574898784, + "grad_norm": 0.7791476828146748, + "learning_rate": 2.4696356275303644e-06, + "loss": 2.7126, + "step": 62 + }, + { + "epoch": 0.2550607287449393, + "grad_norm": 0.4809737260566304, + "learning_rate": 2.5101214574898787e-06, + "loss": 2.8892, + "step": 63 + }, + { + "epoch": 0.2591093117408907, + "grad_norm": 0.5968909877448142, + "learning_rate": 2.550607287449393e-06, + "loss": 2.6468, + "step": 64 + }, + { + "epoch": 0.2631578947368421, + "grad_norm": 0.7701935599652083, + "learning_rate": 2.5910931174089072e-06, + "loss": 2.5171, + "step": 65 + }, + { + "epoch": 0.26720647773279355, + "grad_norm": 0.49540617385936636, + "learning_rate": 2.631578947368421e-06, + "loss": 2.5617, + "step": 66 + }, + { + "epoch": 0.27125506072874495, + "grad_norm": 0.5880768265382437, + "learning_rate": 2.672064777327936e-06, + "loss": 2.6525, + "step": 67 + }, + { + "epoch": 0.27530364372469635, + "grad_norm": 0.8719044761766179, + "learning_rate": 2.7125506072874497e-06, + "loss": 2.5136, + "step": 68 + }, + { + "epoch": 0.2793522267206478, + "grad_norm": 0.7508384152907464, + "learning_rate": 2.7530364372469636e-06, + "loss": 2.7136, + "step": 69 + }, + { + "epoch": 0.2834008097165992, + "grad_norm": 0.7593508374848729, + "learning_rate": 2.7935222672064783e-06, + "loss": 2.5836, + "step": 70 + }, + { + "epoch": 0.2874493927125506, + "grad_norm": 0.6236865711432193, + "learning_rate": 2.834008097165992e-06, + "loss": 2.6042, + "step": 71 + }, + { + "epoch": 0.291497975708502, + "grad_norm": 0.9207439340534006, + "learning_rate": 2.8744939271255064e-06, + "loss": 2.4534, + "step": 72 + }, + { + "epoch": 0.29554655870445345, + "grad_norm": 0.9048216657065745, + "learning_rate": 2.9149797570850203e-06, + "loss": 2.7732, + "step": 73 + }, + { + "epoch": 0.29959514170040485, + "grad_norm": 1.0531213295224573, + "learning_rate": 2.955465587044535e-06, + "loss": 2.6927, + "step": 74 + }, + { + "epoch": 0.30364372469635625, + "grad_norm": 0.8889664393499657, + "learning_rate": 2.995951417004049e-06, + "loss": 2.7532, + "step": 75 + }, + { + "epoch": 0.3076923076923077, + "grad_norm": 0.678148296266936, + "learning_rate": 3.0364372469635627e-06, + "loss": 2.4982, + "step": 76 + }, + { + "epoch": 0.3117408906882591, + "grad_norm": 0.9143989903488097, + "learning_rate": 3.0769230769230774e-06, + 
"loss": 2.4821, + "step": 77 + }, + { + "epoch": 0.3157894736842105, + "grad_norm": 0.7430526887934812, + "learning_rate": 3.1174089068825913e-06, + "loss": 2.8892, + "step": 78 + }, + { + "epoch": 0.31983805668016196, + "grad_norm": 1.0967354490931058, + "learning_rate": 3.157894736842105e-06, + "loss": 2.5355, + "step": 79 + }, + { + "epoch": 0.32388663967611336, + "grad_norm": 0.6474936013842225, + "learning_rate": 3.19838056680162e-06, + "loss": 2.4627, + "step": 80 + }, + { + "epoch": 0.32793522267206476, + "grad_norm": 0.8223317792104156, + "learning_rate": 3.2388663967611337e-06, + "loss": 2.5097, + "step": 81 + }, + { + "epoch": 0.3319838056680162, + "grad_norm": 0.8471027758590536, + "learning_rate": 3.279352226720648e-06, + "loss": 2.5888, + "step": 82 + }, + { + "epoch": 0.3360323886639676, + "grad_norm": 0.4892443825365843, + "learning_rate": 3.3198380566801623e-06, + "loss": 2.4857, + "step": 83 + }, + { + "epoch": 0.340080971659919, + "grad_norm": 0.6329419393193343, + "learning_rate": 3.3603238866396766e-06, + "loss": 2.3704, + "step": 84 + }, + { + "epoch": 0.3441295546558704, + "grad_norm": 0.7450745621264726, + "learning_rate": 3.4008097165991905e-06, + "loss": 2.4814, + "step": 85 + }, + { + "epoch": 0.3481781376518219, + "grad_norm": 0.7915890438013479, + "learning_rate": 3.4412955465587043e-06, + "loss": 2.7336, + "step": 86 + }, + { + "epoch": 0.3522267206477733, + "grad_norm": 0.8224002727747803, + "learning_rate": 3.481781376518219e-06, + "loss": 2.6197, + "step": 87 + }, + { + "epoch": 0.3562753036437247, + "grad_norm": 0.7379097347027997, + "learning_rate": 3.522267206477733e-06, + "loss": 2.3123, + "step": 88 + }, + { + "epoch": 0.3603238866396761, + "grad_norm": 0.63590140796502, + "learning_rate": 3.562753036437247e-06, + "loss": 2.659, + "step": 89 + }, + { + "epoch": 0.3643724696356275, + "grad_norm": 0.9402424866754966, + "learning_rate": 3.6032388663967615e-06, + "loss": 2.6324, + "step": 90 + }, + { + "epoch": 0.3684210526315789, + "grad_norm": 0.7757246306456501, + "learning_rate": 3.6437246963562758e-06, + "loss": 2.5935, + "step": 91 + }, + { + "epoch": 0.3724696356275304, + "grad_norm": 0.7001956828085119, + "learning_rate": 3.6842105263157896e-06, + "loss": 2.8634, + "step": 92 + }, + { + "epoch": 0.3765182186234818, + "grad_norm": 0.6770880287428972, + "learning_rate": 3.724696356275304e-06, + "loss": 2.3526, + "step": 93 + }, + { + "epoch": 0.3805668016194332, + "grad_norm": 0.7469924696350099, + "learning_rate": 3.7651821862348182e-06, + "loss": 2.4551, + "step": 94 + }, + { + "epoch": 0.38461538461538464, + "grad_norm": 0.6156146016330529, + "learning_rate": 3.805668016194332e-06, + "loss": 2.441, + "step": 95 + }, + { + "epoch": 0.38866396761133604, + "grad_norm": 0.7142333380873401, + "learning_rate": 3.846153846153847e-06, + "loss": 2.5222, + "step": 96 + }, + { + "epoch": 0.39271255060728744, + "grad_norm": 0.6126483934481857, + "learning_rate": 3.886639676113361e-06, + "loss": 2.6018, + "step": 97 + }, + { + "epoch": 0.3967611336032389, + "grad_norm": 0.7531177478658849, + "learning_rate": 3.9271255060728745e-06, + "loss": 2.4227, + "step": 98 + }, + { + "epoch": 0.4008097165991903, + "grad_norm": 0.7172471080034739, + "learning_rate": 3.967611336032389e-06, + "loss": 2.4637, + "step": 99 + }, + { + "epoch": 0.4048582995951417, + "grad_norm": 0.7800438096349082, + "learning_rate": 4.008097165991903e-06, + "loss": 2.5228, + "step": 100 + }, + { + "epoch": 0.4089068825910931, + "grad_norm": 0.8009705607457139, + "learning_rate": 
4.048582995951417e-06, + "loss": 2.6356, + "step": 101 + }, + { + "epoch": 0.41295546558704455, + "grad_norm": 0.9574889353775141, + "learning_rate": 4.089068825910931e-06, + "loss": 2.3874, + "step": 102 + }, + { + "epoch": 0.41700404858299595, + "grad_norm": 0.7824043116812712, + "learning_rate": 4.1295546558704455e-06, + "loss": 2.6671, + "step": 103 + }, + { + "epoch": 0.42105263157894735, + "grad_norm": 0.7116660818199502, + "learning_rate": 4.170040485829959e-06, + "loss": 2.6795, + "step": 104 + }, + { + "epoch": 0.4251012145748988, + "grad_norm": 0.6234909516086495, + "learning_rate": 4.210526315789474e-06, + "loss": 2.4891, + "step": 105 + }, + { + "epoch": 0.4291497975708502, + "grad_norm": 0.7507042701110958, + "learning_rate": 4.251012145748988e-06, + "loss": 2.5374, + "step": 106 + }, + { + "epoch": 0.4331983805668016, + "grad_norm": 0.5830775553501698, + "learning_rate": 4.291497975708503e-06, + "loss": 2.4393, + "step": 107 + }, + { + "epoch": 0.43724696356275305, + "grad_norm": 0.8561666711107475, + "learning_rate": 4.3319838056680166e-06, + "loss": 2.3122, + "step": 108 + }, + { + "epoch": 0.44129554655870445, + "grad_norm": 0.914997362840242, + "learning_rate": 4.372469635627531e-06, + "loss": 2.5436, + "step": 109 + }, + { + "epoch": 0.44534412955465585, + "grad_norm": 0.6732155905531092, + "learning_rate": 4.412955465587045e-06, + "loss": 2.5005, + "step": 110 + }, + { + "epoch": 0.4493927125506073, + "grad_norm": 0.7462341368666683, + "learning_rate": 4.453441295546559e-06, + "loss": 2.4483, + "step": 111 + }, + { + "epoch": 0.4534412955465587, + "grad_norm": 0.8245738963488927, + "learning_rate": 4.493927125506074e-06, + "loss": 2.5333, + "step": 112 + }, + { + "epoch": 0.4574898785425101, + "grad_norm": 0.7702932505386926, + "learning_rate": 4.534412955465588e-06, + "loss": 2.5613, + "step": 113 + }, + { + "epoch": 0.46153846153846156, + "grad_norm": 1.3101615300934801, + "learning_rate": 4.5748987854251014e-06, + "loss": 2.973, + "step": 114 + }, + { + "epoch": 0.46558704453441296, + "grad_norm": 0.7651586289456958, + "learning_rate": 4.615384615384616e-06, + "loss": 2.5947, + "step": 115 + }, + { + "epoch": 0.46963562753036436, + "grad_norm": 0.8222224925704688, + "learning_rate": 4.65587044534413e-06, + "loss": 2.4581, + "step": 116 + }, + { + "epoch": 0.47368421052631576, + "grad_norm": 0.6556587501075568, + "learning_rate": 4.696356275303644e-06, + "loss": 2.4571, + "step": 117 + }, + { + "epoch": 0.4777327935222672, + "grad_norm": 0.821438637414972, + "learning_rate": 4.736842105263158e-06, + "loss": 2.6622, + "step": 118 + }, + { + "epoch": 0.4817813765182186, + "grad_norm": 0.6254867878515806, + "learning_rate": 4.7773279352226725e-06, + "loss": 2.3622, + "step": 119 + }, + { + "epoch": 0.48582995951417, + "grad_norm": 0.6606998242945233, + "learning_rate": 4.817813765182186e-06, + "loss": 2.4812, + "step": 120 + }, + { + "epoch": 0.4898785425101215, + "grad_norm": 0.9140647082414407, + "learning_rate": 4.8582995951417e-06, + "loss": 2.5297, + "step": 121 + }, + { + "epoch": 0.4939271255060729, + "grad_norm": 0.8543729933153993, + "learning_rate": 4.898785425101215e-06, + "loss": 2.5534, + "step": 122 + }, + { + "epoch": 0.4979757085020243, + "grad_norm": 0.9641287101724041, + "learning_rate": 4.939271255060729e-06, + "loss": 2.3909, + "step": 123 + }, + { + "epoch": 0.5020242914979757, + "grad_norm": 0.7562747998003689, + "learning_rate": 4.9797570850202435e-06, + "loss": 2.3104, + "step": 124 + }, + { + "epoch": 0.5060728744939271, + "grad_norm": 
0.9684058066200523, + "learning_rate": 5.020242914979757e-06, + "loss": 2.5894, + "step": 125 + }, + { + "epoch": 0.5101214574898786, + "grad_norm": 1.0833146453760147, + "learning_rate": 5.060728744939272e-06, + "loss": 2.686, + "step": 126 + }, + { + "epoch": 0.5141700404858299, + "grad_norm": 0.7212110120886743, + "learning_rate": 5.101214574898786e-06, + "loss": 2.5203, + "step": 127 + }, + { + "epoch": 0.5182186234817814, + "grad_norm": 0.9848467525032204, + "learning_rate": 5.1417004048583e-06, + "loss": 2.66, + "step": 128 + }, + { + "epoch": 0.5222672064777328, + "grad_norm": 0.78315965526943, + "learning_rate": 5.1821862348178145e-06, + "loss": 2.5008, + "step": 129 + }, + { + "epoch": 0.5263157894736842, + "grad_norm": 0.8583112834837245, + "learning_rate": 5.222672064777329e-06, + "loss": 2.3134, + "step": 130 + }, + { + "epoch": 0.5303643724696356, + "grad_norm": 0.7581206885647646, + "learning_rate": 5.263157894736842e-06, + "loss": 2.4191, + "step": 131 + }, + { + "epoch": 0.5344129554655871, + "grad_norm": 0.9695513408717512, + "learning_rate": 5.303643724696357e-06, + "loss": 2.5499, + "step": 132 + }, + { + "epoch": 0.5384615384615384, + "grad_norm": 6.764939321667699, + "learning_rate": 5.344129554655872e-06, + "loss": 2.4736, + "step": 133 + }, + { + "epoch": 0.5425101214574899, + "grad_norm": 1.0247610500949114, + "learning_rate": 5.384615384615385e-06, + "loss": 2.3723, + "step": 134 + }, + { + "epoch": 0.5465587044534413, + "grad_norm": 15.672428379790873, + "learning_rate": 5.425101214574899e-06, + "loss": 3.4815, + "step": 135 + }, + { + "epoch": 0.5506072874493927, + "grad_norm": 2.249245731133667, + "learning_rate": 5.465587044534414e-06, + "loss": 3.4231, + "step": 136 + }, + { + "epoch": 0.5546558704453441, + "grad_norm": 3.797144058522148, + "learning_rate": 5.506072874493927e-06, + "loss": 4.4025, + "step": 137 + }, + { + "epoch": 0.5587044534412956, + "grad_norm": 0.8114215476851966, + "learning_rate": 5.546558704453442e-06, + "loss": 2.3958, + "step": 138 + }, + { + "epoch": 0.562753036437247, + "grad_norm": 0.7631595156767096, + "learning_rate": 5.5870445344129565e-06, + "loss": 2.1963, + "step": 139 + }, + { + "epoch": 0.5668016194331984, + "grad_norm": 0.8648024420211529, + "learning_rate": 5.6275303643724695e-06, + "loss": 2.4664, + "step": 140 + }, + { + "epoch": 0.5708502024291497, + "grad_norm": 1.1398946486999715, + "learning_rate": 5.668016194331984e-06, + "loss": 2.2672, + "step": 141 + }, + { + "epoch": 0.5748987854251012, + "grad_norm": 0.7035715089344788, + "learning_rate": 5.708502024291498e-06, + "loss": 2.4001, + "step": 142 + }, + { + "epoch": 0.5789473684210527, + "grad_norm": 0.7842465817250697, + "learning_rate": 5.748987854251013e-06, + "loss": 2.2186, + "step": 143 + }, + { + "epoch": 0.582995951417004, + "grad_norm": 0.8358191441707306, + "learning_rate": 5.789473684210527e-06, + "loss": 2.5692, + "step": 144 + }, + { + "epoch": 0.5870445344129555, + "grad_norm": 0.7027969455146362, + "learning_rate": 5.8299595141700406e-06, + "loss": 2.3088, + "step": 145 + }, + { + "epoch": 0.5910931174089069, + "grad_norm": 0.7026752876788243, + "learning_rate": 5.870445344129555e-06, + "loss": 2.4148, + "step": 146 + }, + { + "epoch": 0.5951417004048583, + "grad_norm": 0.9049685837714232, + "learning_rate": 5.91093117408907e-06, + "loss": 2.146, + "step": 147 + }, + { + "epoch": 0.5991902834008097, + "grad_norm": 0.8388567349727308, + "learning_rate": 5.951417004048583e-06, + "loss": 2.0989, + "step": 148 + }, + { + "epoch": 0.6032388663967612, + 
"grad_norm": 0.773577497225349, + "learning_rate": 5.991902834008098e-06, + "loss": 2.2379, + "step": 149 + }, + { + "epoch": 0.6072874493927125, + "grad_norm": 0.7826979729986758, + "learning_rate": 6.0323886639676124e-06, + "loss": 2.18, + "step": 150 + }, + { + "epoch": 0.611336032388664, + "grad_norm": 0.8592925674032668, + "learning_rate": 6.0728744939271254e-06, + "loss": 2.4302, + "step": 151 + }, + { + "epoch": 0.6153846153846154, + "grad_norm": 0.6169427006453612, + "learning_rate": 6.11336032388664e-06, + "loss": 2.2208, + "step": 152 + }, + { + "epoch": 0.6194331983805668, + "grad_norm": 0.8979145279675816, + "learning_rate": 6.153846153846155e-06, + "loss": 2.3089, + "step": 153 + }, + { + "epoch": 0.6234817813765182, + "grad_norm": 0.8069478254920203, + "learning_rate": 6.194331983805668e-06, + "loss": 2.5248, + "step": 154 + }, + { + "epoch": 0.6275303643724697, + "grad_norm": 0.702872317531758, + "learning_rate": 6.234817813765183e-06, + "loss": 2.2786, + "step": 155 + }, + { + "epoch": 0.631578947368421, + "grad_norm": 1.1902510486781737, + "learning_rate": 6.275303643724697e-06, + "loss": 2.564, + "step": 156 + }, + { + "epoch": 0.6356275303643725, + "grad_norm": 0.7322358696471963, + "learning_rate": 6.31578947368421e-06, + "loss": 2.2575, + "step": 157 + }, + { + "epoch": 0.6396761133603239, + "grad_norm": 0.827272619073328, + "learning_rate": 6.356275303643725e-06, + "loss": 2.4085, + "step": 158 + }, + { + "epoch": 0.6437246963562753, + "grad_norm": 0.844449245612401, + "learning_rate": 6.39676113360324e-06, + "loss": 2.3392, + "step": 159 + }, + { + "epoch": 0.6477732793522267, + "grad_norm": 0.6963954379010507, + "learning_rate": 6.437246963562754e-06, + "loss": 2.3474, + "step": 160 + }, + { + "epoch": 0.6518218623481782, + "grad_norm": 1.0062158283533227, + "learning_rate": 6.4777327935222675e-06, + "loss": 2.206, + "step": 161 + }, + { + "epoch": 0.6558704453441295, + "grad_norm": 0.7010434692271018, + "learning_rate": 6.518218623481782e-06, + "loss": 2.4407, + "step": 162 + }, + { + "epoch": 0.659919028340081, + "grad_norm": 0.8546299950775236, + "learning_rate": 6.558704453441296e-06, + "loss": 2.3308, + "step": 163 + }, + { + "epoch": 0.6639676113360324, + "grad_norm": 0.9160069550133176, + "learning_rate": 6.599190283400811e-06, + "loss": 2.2799, + "step": 164 + }, + { + "epoch": 0.6680161943319838, + "grad_norm": 0.6991934828570997, + "learning_rate": 6.639676113360325e-06, + "loss": 2.3277, + "step": 165 + }, + { + "epoch": 0.6720647773279352, + "grad_norm": 2.441952914795693, + "learning_rate": 6.6801619433198385e-06, + "loss": 2.2357, + "step": 166 + }, + { + "epoch": 0.6761133603238867, + "grad_norm": 0.7134946099061733, + "learning_rate": 6.720647773279353e-06, + "loss": 2.1807, + "step": 167 + }, + { + "epoch": 0.680161943319838, + "grad_norm": 0.7920123504029117, + "learning_rate": 6.761133603238867e-06, + "loss": 2.4623, + "step": 168 + }, + { + "epoch": 0.6842105263157895, + "grad_norm": 0.7987244705898385, + "learning_rate": 6.801619433198381e-06, + "loss": 2.2289, + "step": 169 + }, + { + "epoch": 0.6882591093117408, + "grad_norm": 0.8092206406250949, + "learning_rate": 6.842105263157896e-06, + "loss": 2.3704, + "step": 170 + }, + { + "epoch": 0.6923076923076923, + "grad_norm": 0.7440145606342271, + "learning_rate": 6.882591093117409e-06, + "loss": 2.3322, + "step": 171 + }, + { + "epoch": 0.6963562753036437, + "grad_norm": 0.704685785309606, + "learning_rate": 6.923076923076923e-06, + "loss": 2.1067, + "step": 172 + }, + { + "epoch": 
0.7004048582995951, + "grad_norm": 0.8716057180507851, + "learning_rate": 6.963562753036438e-06, + "loss": 2.6915, + "step": 173 + }, + { + "epoch": 0.7044534412955465, + "grad_norm": 0.8610302596466904, + "learning_rate": 7.004048582995951e-06, + "loss": 2.3607, + "step": 174 + }, + { + "epoch": 0.708502024291498, + "grad_norm": 0.7454341645101108, + "learning_rate": 7.044534412955466e-06, + "loss": 2.0946, + "step": 175 + }, + { + "epoch": 0.7125506072874493, + "grad_norm": 0.775526558923258, + "learning_rate": 7.0850202429149805e-06, + "loss": 2.2197, + "step": 176 + }, + { + "epoch": 0.7165991902834008, + "grad_norm": 0.7425363416700347, + "learning_rate": 7.125506072874494e-06, + "loss": 2.2515, + "step": 177 + }, + { + "epoch": 0.7206477732793523, + "grad_norm": 0.799480261879121, + "learning_rate": 7.165991902834008e-06, + "loss": 2.2984, + "step": 178 + }, + { + "epoch": 0.7246963562753036, + "grad_norm": 1.208911299168472, + "learning_rate": 7.206477732793523e-06, + "loss": 2.3498, + "step": 179 + }, + { + "epoch": 0.728744939271255, + "grad_norm": 0.8451843361875137, + "learning_rate": 7.246963562753037e-06, + "loss": 2.3922, + "step": 180 + }, + { + "epoch": 0.7327935222672065, + "grad_norm": 0.6688748588442022, + "learning_rate": 7.2874493927125516e-06, + "loss": 2.2572, + "step": 181 + }, + { + "epoch": 0.7368421052631579, + "grad_norm": 1.1693138233285796, + "learning_rate": 7.327935222672065e-06, + "loss": 2.327, + "step": 182 + }, + { + "epoch": 0.7408906882591093, + "grad_norm": 1.6904745941237547, + "learning_rate": 7.368421052631579e-06, + "loss": 2.8703, + "step": 183 + }, + { + "epoch": 0.7449392712550608, + "grad_norm": 0.8844949083017518, + "learning_rate": 7.408906882591094e-06, + "loss": 2.2888, + "step": 184 + }, + { + "epoch": 0.7489878542510121, + "grad_norm": 0.8858477106782153, + "learning_rate": 7.449392712550608e-06, + "loss": 2.2582, + "step": 185 + }, + { + "epoch": 0.7530364372469636, + "grad_norm": 0.7394352987608678, + "learning_rate": 7.489878542510122e-06, + "loss": 2.0775, + "step": 186 + }, + { + "epoch": 0.757085020242915, + "grad_norm": 0.8834206013583122, + "learning_rate": 7.5303643724696364e-06, + "loss": 2.2682, + "step": 187 + }, + { + "epoch": 0.7611336032388664, + "grad_norm": 6.250751086281045, + "learning_rate": 7.570850202429151e-06, + "loss": 3.2512, + "step": 188 + }, + { + "epoch": 0.7651821862348178, + "grad_norm": 35.543626516502854, + "learning_rate": 7.611336032388664e-06, + "loss": 3.2673, + "step": 189 + }, + { + "epoch": 0.7692307692307693, + "grad_norm": 4.671464673421441, + "learning_rate": 7.651821862348178e-06, + "loss": 3.288, + "step": 190 + }, + { + "epoch": 0.7732793522267206, + "grad_norm": 0.8467043403003462, + "learning_rate": 7.692307692307694e-06, + "loss": 2.3525, + "step": 191 + }, + { + "epoch": 0.7773279352226721, + "grad_norm": 0.7553553742503454, + "learning_rate": 7.732793522267207e-06, + "loss": 2.4147, + "step": 192 + }, + { + "epoch": 0.7813765182186235, + "grad_norm": 0.6722184689731728, + "learning_rate": 7.773279352226721e-06, + "loss": 2.4408, + "step": 193 + }, + { + "epoch": 0.7854251012145749, + "grad_norm": 0.8742278117345931, + "learning_rate": 7.813765182186235e-06, + "loss": 2.2427, + "step": 194 + }, + { + "epoch": 0.7894736842105263, + "grad_norm": 0.7018298382516639, + "learning_rate": 7.854251012145749e-06, + "loss": 2.1401, + "step": 195 + }, + { + "epoch": 0.7935222672064778, + "grad_norm": 0.8441291024867053, + "learning_rate": 7.894736842105265e-06, + "loss": 2.417, + "step": 196 + }, + 
{ + "epoch": 0.7975708502024291, + "grad_norm": 0.8440780587728888, + "learning_rate": 7.935222672064778e-06, + "loss": 2.343, + "step": 197 + }, + { + "epoch": 0.8016194331983806, + "grad_norm": 0.7817852912155946, + "learning_rate": 7.975708502024292e-06, + "loss": 2.0718, + "step": 198 + }, + { + "epoch": 0.805668016194332, + "grad_norm": 0.8173811480736421, + "learning_rate": 8.016194331983806e-06, + "loss": 1.9574, + "step": 199 + }, + { + "epoch": 0.8097165991902834, + "grad_norm": 0.9130733429115842, + "learning_rate": 8.056680161943322e-06, + "loss": 2.1815, + "step": 200 + }, + { + "epoch": 0.8137651821862348, + "grad_norm": 0.9847086103025836, + "learning_rate": 8.097165991902834e-06, + "loss": 2.3515, + "step": 201 + }, + { + "epoch": 0.8178137651821862, + "grad_norm": 0.8676876881551969, + "learning_rate": 8.13765182186235e-06, + "loss": 2.0846, + "step": 202 + }, + { + "epoch": 0.8218623481781376, + "grad_norm": 13.90144045255743, + "learning_rate": 8.178137651821862e-06, + "loss": 2.901, + "step": 203 + }, + { + "epoch": 0.8259109311740891, + "grad_norm": 26.964637613541246, + "learning_rate": 8.218623481781377e-06, + "loss": 4.9217, + "step": 204 + }, + { + "epoch": 0.8299595141700404, + "grad_norm": 0.9450475296548486, + "learning_rate": 8.259109311740891e-06, + "loss": 2.213, + "step": 205 + }, + { + "epoch": 0.8340080971659919, + "grad_norm": 0.8251626027353501, + "learning_rate": 8.299595141700405e-06, + "loss": 2.1265, + "step": 206 + }, + { + "epoch": 0.8380566801619433, + "grad_norm": 1.5637444134794973, + "learning_rate": 8.340080971659919e-06, + "loss": 2.1168, + "step": 207 + }, + { + "epoch": 0.8421052631578947, + "grad_norm": 0.8572609413241875, + "learning_rate": 8.380566801619434e-06, + "loss": 2.2021, + "step": 208 + }, + { + "epoch": 0.8461538461538461, + "grad_norm": 0.8829526183041908, + "learning_rate": 8.421052631578948e-06, + "loss": 2.1197, + "step": 209 + }, + { + "epoch": 0.8502024291497976, + "grad_norm": 0.8230040936414714, + "learning_rate": 8.461538461538462e-06, + "loss": 2.1389, + "step": 210 + }, + { + "epoch": 0.854251012145749, + "grad_norm": 1.0630722291016348, + "learning_rate": 8.502024291497976e-06, + "loss": 2.2071, + "step": 211 + }, + { + "epoch": 0.8582995951417004, + "grad_norm": 0.8285650816893187, + "learning_rate": 8.54251012145749e-06, + "loss": 2.1278, + "step": 212 + }, + { + "epoch": 0.8623481781376519, + "grad_norm": 0.9374104368567024, + "learning_rate": 8.582995951417005e-06, + "loss": 2.2602, + "step": 213 + }, + { + "epoch": 0.8663967611336032, + "grad_norm": 0.9292432454800617, + "learning_rate": 8.62348178137652e-06, + "loss": 2.2139, + "step": 214 + }, + { + "epoch": 0.8704453441295547, + "grad_norm": 1.102816596900189, + "learning_rate": 8.663967611336033e-06, + "loss": 2.6954, + "step": 215 + }, + { + "epoch": 0.8744939271255061, + "grad_norm": 1.0693734533760941, + "learning_rate": 8.704453441295547e-06, + "loss": 2.6307, + "step": 216 + }, + { + "epoch": 0.8785425101214575, + "grad_norm": 0.9576307746487195, + "learning_rate": 8.744939271255063e-06, + "loss": 2.3637, + "step": 217 + }, + { + "epoch": 0.8825910931174089, + "grad_norm": 0.9705930148144204, + "learning_rate": 8.785425101214575e-06, + "loss": 2.2346, + "step": 218 + }, + { + "epoch": 0.8866396761133604, + "grad_norm": 1.0504776994181708, + "learning_rate": 8.82591093117409e-06, + "loss": 1.8973, + "step": 219 + }, + { + "epoch": 0.8906882591093117, + "grad_norm": 0.8931928814405187, + "learning_rate": 8.866396761133604e-06, + "loss": 2.2742, + "step": 
220 + }, + { + "epoch": 0.8947368421052632, + "grad_norm": 0.9688347208506803, + "learning_rate": 8.906882591093118e-06, + "loss": 2.2952, + "step": 221 + }, + { + "epoch": 0.8987854251012146, + "grad_norm": 0.978996274596435, + "learning_rate": 8.947368421052632e-06, + "loss": 2.0332, + "step": 222 + }, + { + "epoch": 0.902834008097166, + "grad_norm": 0.9073798024023706, + "learning_rate": 8.987854251012147e-06, + "loss": 2.0714, + "step": 223 + }, + { + "epoch": 0.9068825910931174, + "grad_norm": 1.1581613082581128, + "learning_rate": 9.02834008097166e-06, + "loss": 2.2157, + "step": 224 + }, + { + "epoch": 0.9109311740890689, + "grad_norm": 1.0884120135655109, + "learning_rate": 9.068825910931175e-06, + "loss": 1.7915, + "step": 225 + }, + { + "epoch": 0.9149797570850202, + "grad_norm": 0.9581672716343882, + "learning_rate": 9.109311740890689e-06, + "loss": 2.0722, + "step": 226 + }, + { + "epoch": 0.9190283400809717, + "grad_norm": 0.9523432975820123, + "learning_rate": 9.149797570850203e-06, + "loss": 2.0351, + "step": 227 + }, + { + "epoch": 0.9230769230769231, + "grad_norm": 0.9395262500295037, + "learning_rate": 9.190283400809717e-06, + "loss": 2.1823, + "step": 228 + }, + { + "epoch": 0.9271255060728745, + "grad_norm": 1.0734663585541728, + "learning_rate": 9.230769230769232e-06, + "loss": 2.2329, + "step": 229 + }, + { + "epoch": 0.9311740890688259, + "grad_norm": 5.915661456573777, + "learning_rate": 9.271255060728746e-06, + "loss": 2.142, + "step": 230 + }, + { + "epoch": 0.9352226720647774, + "grad_norm": 0.943964635554494, + "learning_rate": 9.31174089068826e-06, + "loss": 2.0151, + "step": 231 + }, + { + "epoch": 0.9392712550607287, + "grad_norm": 0.9400321772267921, + "learning_rate": 9.352226720647774e-06, + "loss": 1.9453, + "step": 232 + }, + { + "epoch": 0.9433198380566802, + "grad_norm": 1.0803744575815664, + "learning_rate": 9.392712550607288e-06, + "loss": 2.2879, + "step": 233 + }, + { + "epoch": 0.9473684210526315, + "grad_norm": 1.1375116889631114, + "learning_rate": 9.433198380566803e-06, + "loss": 1.997, + "step": 234 + }, + { + "epoch": 0.951417004048583, + "grad_norm": 1.0484948139162147, + "learning_rate": 9.473684210526315e-06, + "loss": 2.0557, + "step": 235 + }, + { + "epoch": 0.9554655870445344, + "grad_norm": 1.9953282124950078, + "learning_rate": 9.514170040485831e-06, + "loss": 2.2939, + "step": 236 + }, + { + "epoch": 0.9595141700404858, + "grad_norm": 0.976191957030197, + "learning_rate": 9.554655870445345e-06, + "loss": 2.0733, + "step": 237 + }, + { + "epoch": 0.9635627530364372, + "grad_norm": 1.2563869839657487, + "learning_rate": 9.595141700404859e-06, + "loss": 2.0464, + "step": 238 + }, + { + "epoch": 0.9676113360323887, + "grad_norm": 1.5608940397030466, + "learning_rate": 9.635627530364373e-06, + "loss": 2.336, + "step": 239 + }, + { + "epoch": 0.97165991902834, + "grad_norm": 1.3591514491532213, + "learning_rate": 9.676113360323888e-06, + "loss": 2.3022, + "step": 240 + }, + { + "epoch": 0.9757085020242915, + "grad_norm": 0.9384697642414853, + "learning_rate": 9.7165991902834e-06, + "loss": 2.0917, + "step": 241 + }, + { + "epoch": 0.979757085020243, + "grad_norm": 1.0921517070072044, + "learning_rate": 9.757085020242916e-06, + "loss": 2.2454, + "step": 242 + }, + { + "epoch": 0.9838056680161943, + "grad_norm": 1.0952417249590038, + "learning_rate": 9.79757085020243e-06, + "loss": 2.2731, + "step": 243 + }, + { + "epoch": 0.9878542510121457, + "grad_norm": 1.004948368911197, + "learning_rate": 9.838056680161944e-06, + "loss": 2.0318, + 
"step": 244 + }, + { + "epoch": 0.9919028340080972, + "grad_norm": 0.9149897248279167, + "learning_rate": 9.878542510121458e-06, + "loss": 2.0005, + "step": 245 + }, + { + "epoch": 0.9959514170040485, + "grad_norm": 0.8508821706595309, + "learning_rate": 9.919028340080973e-06, + "loss": 2.2101, + "step": 246 + }, + { + "epoch": 1.0, + "grad_norm": 1.0244113302231659, + "learning_rate": 9.959514170040487e-06, + "loss": 2.0861, + "step": 247 + }, + { + "epoch": 1.0040485829959513, + "grad_norm": 0.9985250389875123, + "learning_rate": 1e-05, + "loss": 2.1654, + "step": 248 + }, + { + "epoch": 1.008097165991903, + "grad_norm": 1.5212147724237604, + "learning_rate": 9.999995007009308e-06, + "loss": 2.3841, + "step": 249 + }, + { + "epoch": 1.0121457489878543, + "grad_norm": 1.5612489351031709, + "learning_rate": 9.999980028047207e-06, + "loss": 2.2013, + "step": 250 + }, + { + "epoch": 1.0161943319838056, + "grad_norm": 1.3355032190827423, + "learning_rate": 9.99995506314361e-06, + "loss": 2.3109, + "step": 251 + }, + { + "epoch": 1.0202429149797572, + "grad_norm": 1.309995468445311, + "learning_rate": 9.999920112348379e-06, + "loss": 2.5018, + "step": 252 + }, + { + "epoch": 1.0242914979757085, + "grad_norm": 1.4582415698006528, + "learning_rate": 9.999875175731316e-06, + "loss": 2.4387, + "step": 253 + }, + { + "epoch": 1.0283400809716599, + "grad_norm": 1.2959671971401512, + "learning_rate": 9.99982025338217e-06, + "loss": 2.0271, + "step": 254 + }, + { + "epoch": 1.0323886639676114, + "grad_norm": 1.3702661061884107, + "learning_rate": 9.999755345410628e-06, + "loss": 2.1942, + "step": 255 + }, + { + "epoch": 1.0364372469635628, + "grad_norm": 1.2343807344186972, + "learning_rate": 9.999680451946327e-06, + "loss": 2.3802, + "step": 256 + }, + { + "epoch": 1.040485829959514, + "grad_norm": 1.2422842542141688, + "learning_rate": 9.999595573138845e-06, + "loss": 2.1737, + "step": 257 + }, + { + "epoch": 1.0445344129554657, + "grad_norm": 1.0535455017417064, + "learning_rate": 9.9995007091577e-06, + "loss": 2.1892, + "step": 258 + }, + { + "epoch": 1.048582995951417, + "grad_norm": 1.1326643708775719, + "learning_rate": 9.999395860192354e-06, + "loss": 2.165, + "step": 259 + }, + { + "epoch": 1.0526315789473684, + "grad_norm": 1.1512147523566951, + "learning_rate": 9.99928102645221e-06, + "loss": 2.4136, + "step": 260 + }, + { + "epoch": 1.05668016194332, + "grad_norm": 1.161431041066393, + "learning_rate": 9.999156208166614e-06, + "loss": 2.2649, + "step": 261 + }, + { + "epoch": 1.0607287449392713, + "grad_norm": 1.0550067630684001, + "learning_rate": 9.999021405584855e-06, + "loss": 2.2776, + "step": 262 + }, + { + "epoch": 1.0647773279352226, + "grad_norm": 1.2456078968374804, + "learning_rate": 9.99887661897616e-06, + "loss": 2.2937, + "step": 263 + }, + { + "epoch": 1.0688259109311742, + "grad_norm": 2.6565909174287934, + "learning_rate": 9.998721848629691e-06, + "loss": 2.3373, + "step": 264 + }, + { + "epoch": 1.0728744939271255, + "grad_norm": 1.2585354952683687, + "learning_rate": 9.99855709485456e-06, + "loss": 2.1755, + "step": 265 + }, + { + "epoch": 1.0769230769230769, + "grad_norm": 1.0397633573741487, + "learning_rate": 9.99838235797981e-06, + "loss": 2.1224, + "step": 266 + }, + { + "epoch": 1.0809716599190284, + "grad_norm": 1.3490485543349722, + "learning_rate": 9.998197638354428e-06, + "loss": 2.162, + "step": 267 + }, + { + "epoch": 1.0850202429149798, + "grad_norm": 0.9779246835555004, + "learning_rate": 9.998002936347334e-06, + "loss": 2.0674, + "step": 268 + }, + { + 
"epoch": 1.0890688259109311, + "grad_norm": 1.326338728002689, + "learning_rate": 9.997798252347382e-06, + "loss": 2.1639, + "step": 269 + }, + { + "epoch": 1.0931174089068827, + "grad_norm": 1.0363012993300713, + "learning_rate": 9.99758358676337e-06, + "loss": 2.2088, + "step": 270 + }, + { + "epoch": 1.097165991902834, + "grad_norm": 1.0931184449284037, + "learning_rate": 9.99735894002403e-06, + "loss": 1.9417, + "step": 271 + }, + { + "epoch": 1.1012145748987854, + "grad_norm": 1.1142050270090365, + "learning_rate": 9.99712431257802e-06, + "loss": 2.1229, + "step": 272 + }, + { + "epoch": 1.1052631578947367, + "grad_norm": 1.1058458560003002, + "learning_rate": 9.99687970489394e-06, + "loss": 2.147, + "step": 273 + }, + { + "epoch": 1.1093117408906883, + "grad_norm": 1.1507827310584715, + "learning_rate": 9.996625117460319e-06, + "loss": 2.0305, + "step": 274 + }, + { + "epoch": 1.1133603238866396, + "grad_norm": 1.4399534822311415, + "learning_rate": 9.996360550785619e-06, + "loss": 1.993, + "step": 275 + }, + { + "epoch": 1.117408906882591, + "grad_norm": 1.3360646827911495, + "learning_rate": 9.996086005398228e-06, + "loss": 1.9789, + "step": 276 + }, + { + "epoch": 1.1214574898785425, + "grad_norm": 1.1287606232609018, + "learning_rate": 9.995801481846474e-06, + "loss": 1.9362, + "step": 277 + }, + { + "epoch": 1.125506072874494, + "grad_norm": 1.0926872380366626, + "learning_rate": 9.9955069806986e-06, + "loss": 1.8981, + "step": 278 + }, + { + "epoch": 1.1295546558704452, + "grad_norm": 1.225113996229143, + "learning_rate": 9.995202502542785e-06, + "loss": 1.877, + "step": 279 + }, + { + "epoch": 1.1336032388663968, + "grad_norm": 1.350566519940966, + "learning_rate": 9.99488804798713e-06, + "loss": 2.1812, + "step": 280 + }, + { + "epoch": 1.1376518218623481, + "grad_norm": 1.3946048118439773, + "learning_rate": 9.994563617659665e-06, + "loss": 2.0952, + "step": 281 + }, + { + "epoch": 1.1417004048582995, + "grad_norm": 1.016854167145539, + "learning_rate": 9.99422921220834e-06, + "loss": 1.7897, + "step": 282 + }, + { + "epoch": 1.145748987854251, + "grad_norm": 1.1675202565627227, + "learning_rate": 9.993884832301029e-06, + "loss": 2.1832, + "step": 283 + }, + { + "epoch": 1.1497975708502024, + "grad_norm": 1.1052537876752062, + "learning_rate": 9.993530478625524e-06, + "loss": 2.0419, + "step": 284 + }, + { + "epoch": 1.1538461538461537, + "grad_norm": 1.0339091939503424, + "learning_rate": 9.99316615188954e-06, + "loss": 2.1765, + "step": 285 + }, + { + "epoch": 1.1578947368421053, + "grad_norm": 1.224235640342616, + "learning_rate": 9.992791852820709e-06, + "loss": 2.414, + "step": 286 + }, + { + "epoch": 1.1619433198380567, + "grad_norm": 1.1077938277922803, + "learning_rate": 9.992407582166582e-06, + "loss": 2.0729, + "step": 287 + }, + { + "epoch": 1.165991902834008, + "grad_norm": 1.1047832453065312, + "learning_rate": 9.99201334069462e-06, + "loss": 2.0816, + "step": 288 + }, + { + "epoch": 1.1700404858299596, + "grad_norm": 1.020340791924455, + "learning_rate": 9.991609129192202e-06, + "loss": 2.4242, + "step": 289 + }, + { + "epoch": 1.174089068825911, + "grad_norm": 1.0597565636193305, + "learning_rate": 9.991194948466615e-06, + "loss": 1.9546, + "step": 290 + }, + { + "epoch": 1.1781376518218623, + "grad_norm": 2.733652108939615, + "learning_rate": 9.990770799345064e-06, + "loss": 2.0891, + "step": 291 + }, + { + "epoch": 1.1821862348178138, + "grad_norm": 1.06820787268932, + "learning_rate": 9.990336682674656e-06, + "loss": 1.8523, + "step": 292 + }, + { + 
"epoch": 1.1862348178137652, + "grad_norm": 2.087421429190754, + "learning_rate": 9.989892599322404e-06, + "loss": 2.0252, + "step": 293 + }, + { + "epoch": 1.1902834008097165, + "grad_norm": 1.0884298591172652, + "learning_rate": 9.989438550175235e-06, + "loss": 2.094, + "step": 294 + }, + { + "epoch": 1.194331983805668, + "grad_norm": 1.4465924376774404, + "learning_rate": 9.98897453613997e-06, + "loss": 2.2522, + "step": 295 + }, + { + "epoch": 1.1983805668016194, + "grad_norm": 1.2561153181877684, + "learning_rate": 9.988500558143337e-06, + "loss": 2.3174, + "step": 296 + }, + { + "epoch": 1.2024291497975708, + "grad_norm": 1.299592783957394, + "learning_rate": 9.988016617131966e-06, + "loss": 2.0626, + "step": 297 + }, + { + "epoch": 1.2064777327935223, + "grad_norm": 1.616312765069768, + "learning_rate": 9.987522714072377e-06, + "loss": 2.332, + "step": 298 + }, + { + "epoch": 1.2105263157894737, + "grad_norm": 1.1673730449379247, + "learning_rate": 9.987018849950996e-06, + "loss": 2.3944, + "step": 299 + }, + { + "epoch": 1.214574898785425, + "grad_norm": 1.143398053052611, + "learning_rate": 9.986505025774137e-06, + "loss": 2.1948, + "step": 300 + }, + { + "epoch": 1.2186234817813766, + "grad_norm": 1.097402992490867, + "learning_rate": 9.985981242568009e-06, + "loss": 2.0261, + "step": 301 + }, + { + "epoch": 1.222672064777328, + "grad_norm": 1.1862462194607237, + "learning_rate": 9.985447501378706e-06, + "loss": 2.0268, + "step": 302 + }, + { + "epoch": 1.2267206477732793, + "grad_norm": 1.1867953576661743, + "learning_rate": 9.984903803272216e-06, + "loss": 2.0609, + "step": 303 + }, + { + "epoch": 1.2307692307692308, + "grad_norm": 1.160233224133256, + "learning_rate": 9.984350149334415e-06, + "loss": 2.118, + "step": 304 + }, + { + "epoch": 1.2348178137651822, + "grad_norm": 1.1580496833430431, + "learning_rate": 9.983786540671052e-06, + "loss": 2.2939, + "step": 305 + }, + { + "epoch": 1.2388663967611335, + "grad_norm": 1.1904466983631679, + "learning_rate": 9.983212978407767e-06, + "loss": 2.2554, + "step": 306 + }, + { + "epoch": 1.242914979757085, + "grad_norm": 1.191066075711238, + "learning_rate": 9.982629463690075e-06, + "loss": 2.2252, + "step": 307 + }, + { + "epoch": 1.2469635627530364, + "grad_norm": 0.9748723838702108, + "learning_rate": 9.982035997683372e-06, + "loss": 2.0288, + "step": 308 + }, + { + "epoch": 1.2510121457489878, + "grad_norm": 1.0421752021046666, + "learning_rate": 9.981432581572925e-06, + "loss": 2.0528, + "step": 309 + }, + { + "epoch": 1.2550607287449393, + "grad_norm": 1.1354302953976132, + "learning_rate": 9.980819216563875e-06, + "loss": 2.1848, + "step": 310 + }, + { + "epoch": 1.2591093117408907, + "grad_norm": 1.1565556608606453, + "learning_rate": 9.980195903881231e-06, + "loss": 1.9964, + "step": 311 + }, + { + "epoch": 1.263157894736842, + "grad_norm": 1.0637756069428104, + "learning_rate": 9.979562644769871e-06, + "loss": 1.8735, + "step": 312 + }, + { + "epoch": 1.2672064777327936, + "grad_norm": 1.0699259387542537, + "learning_rate": 9.978919440494538e-06, + "loss": 2.0595, + "step": 313 + }, + { + "epoch": 1.271255060728745, + "grad_norm": 1.1179452169818913, + "learning_rate": 9.978266292339838e-06, + "loss": 2.1342, + "step": 314 + }, + { + "epoch": 1.2753036437246963, + "grad_norm": 0.9851906694579183, + "learning_rate": 9.977603201610236e-06, + "loss": 2.0658, + "step": 315 + }, + { + "epoch": 1.2793522267206479, + "grad_norm": 1.664317835506444, + "learning_rate": 9.976930169630052e-06, + "loss": 2.1478, + "step": 316 + }, + 
{ + "epoch": 1.2834008097165992, + "grad_norm": 2.1052363417173012, + "learning_rate": 9.976247197743465e-06, + "loss": 1.8522, + "step": 317 + }, + { + "epoch": 1.2874493927125505, + "grad_norm": 1.1846256759923113, + "learning_rate": 9.975554287314505e-06, + "loss": 1.9432, + "step": 318 + }, + { + "epoch": 1.291497975708502, + "grad_norm": 1.138896431387234, + "learning_rate": 9.974851439727045e-06, + "loss": 1.8181, + "step": 319 + }, + { + "epoch": 1.2955465587044535, + "grad_norm": 1.153796269934686, + "learning_rate": 9.974138656384815e-06, + "loss": 2.1573, + "step": 320 + }, + { + "epoch": 1.2995951417004048, + "grad_norm": 1.703181471948063, + "learning_rate": 9.973415938711383e-06, + "loss": 2.1787, + "step": 321 + }, + { + "epoch": 1.3036437246963564, + "grad_norm": 1.7096036636558702, + "learning_rate": 9.972683288150155e-06, + "loss": 1.9479, + "step": 322 + }, + { + "epoch": 1.3076923076923077, + "grad_norm": 1.1866073546875906, + "learning_rate": 9.97194070616438e-06, + "loss": 1.9284, + "step": 323 + }, + { + "epoch": 1.311740890688259, + "grad_norm": 1.0952591943942271, + "learning_rate": 9.971188194237141e-06, + "loss": 1.9908, + "step": 324 + }, + { + "epoch": 1.3157894736842106, + "grad_norm": 1.5313235105110092, + "learning_rate": 9.97042575387135e-06, + "loss": 2.0365, + "step": 325 + }, + { + "epoch": 1.319838056680162, + "grad_norm": 1.2326037015549494, + "learning_rate": 9.969653386589749e-06, + "loss": 1.9016, + "step": 326 + }, + { + "epoch": 1.3238866396761133, + "grad_norm": 1.08612437072456, + "learning_rate": 9.968871093934908e-06, + "loss": 1.9295, + "step": 327 + }, + { + "epoch": 1.3279352226720649, + "grad_norm": 1.1765201682452633, + "learning_rate": 9.968078877469221e-06, + "loss": 1.9057, + "step": 328 + }, + { + "epoch": 1.3319838056680162, + "grad_norm": 1.1266840563836074, + "learning_rate": 9.967276738774897e-06, + "loss": 1.7933, + "step": 329 + }, + { + "epoch": 1.3360323886639676, + "grad_norm": 1.096241365913634, + "learning_rate": 9.966464679453969e-06, + "loss": 1.8225, + "step": 330 + }, + { + "epoch": 1.3400809716599191, + "grad_norm": 1.0190613068454424, + "learning_rate": 9.965642701128273e-06, + "loss": 1.7548, + "step": 331 + }, + { + "epoch": 1.3441295546558705, + "grad_norm": 1.045370042720153, + "learning_rate": 9.964810805439464e-06, + "loss": 1.8602, + "step": 332 + }, + { + "epoch": 1.3481781376518218, + "grad_norm": 1.2609434903119947, + "learning_rate": 9.963968994049e-06, + "loss": 2.0594, + "step": 333 + }, + { + "epoch": 1.3522267206477734, + "grad_norm": 2.6150970483606812, + "learning_rate": 9.963117268638147e-06, + "loss": 1.8496, + "step": 334 + }, + { + "epoch": 1.3562753036437247, + "grad_norm": 1.2099371136718209, + "learning_rate": 9.962255630907964e-06, + "loss": 1.6494, + "step": 335 + }, + { + "epoch": 1.360323886639676, + "grad_norm": 1.313765722576788, + "learning_rate": 9.961384082579311e-06, + "loss": 1.9562, + "step": 336 + }, + { + "epoch": 1.3643724696356276, + "grad_norm": 1.2172159882432991, + "learning_rate": 9.96050262539284e-06, + "loss": 2.0155, + "step": 337 + }, + { + "epoch": 1.368421052631579, + "grad_norm": 1.2586156100651915, + "learning_rate": 9.959611261108999e-06, + "loss": 1.9085, + "step": 338 + }, + { + "epoch": 1.3724696356275303, + "grad_norm": 1.5183212778349207, + "learning_rate": 9.958709991508013e-06, + "loss": 2.0875, + "step": 339 + }, + { + "epoch": 1.376518218623482, + "grad_norm": 1.1522560111562028, + "learning_rate": 9.957798818389894e-06, + "loss": 1.619, + "step": 340 + }, + 
{ + "epoch": 1.3805668016194332, + "grad_norm": 1.1594845675041106, + "learning_rate": 9.956877743574437e-06, + "loss": 1.809, + "step": 341 + }, + { + "epoch": 1.3846153846153846, + "grad_norm": 1.1122066306670175, + "learning_rate": 9.955946768901207e-06, + "loss": 1.7047, + "step": 342 + }, + { + "epoch": 1.3886639676113361, + "grad_norm": 1.330314253280862, + "learning_rate": 9.955005896229543e-06, + "loss": 1.7574, + "step": 343 + }, + { + "epoch": 1.3927125506072875, + "grad_norm": 1.1715493987473338, + "learning_rate": 9.954055127438554e-06, + "loss": 1.903, + "step": 344 + }, + { + "epoch": 1.3967611336032388, + "grad_norm": 1.3791674988449036, + "learning_rate": 9.95309446442711e-06, + "loss": 1.7259, + "step": 345 + }, + { + "epoch": 1.4008097165991904, + "grad_norm": 1.1049829081327143, + "learning_rate": 9.952123909113842e-06, + "loss": 1.7903, + "step": 346 + }, + { + "epoch": 1.4048582995951417, + "grad_norm": 1.2032214776472194, + "learning_rate": 9.951143463437145e-06, + "loss": 1.8805, + "step": 347 + }, + { + "epoch": 1.408906882591093, + "grad_norm": 1.4430732870842997, + "learning_rate": 9.950153129355156e-06, + "loss": 1.963, + "step": 348 + }, + { + "epoch": 1.4129554655870447, + "grad_norm": 1.1510222292519288, + "learning_rate": 9.949152908845771e-06, + "loss": 1.8567, + "step": 349 + }, + { + "epoch": 1.417004048582996, + "grad_norm": 1.195578264117532, + "learning_rate": 9.948142803906623e-06, + "loss": 2.0649, + "step": 350 + }, + { + "epoch": 1.4210526315789473, + "grad_norm": 1.233691487377917, + "learning_rate": 9.947122816555091e-06, + "loss": 2.1272, + "step": 351 + }, + { + "epoch": 1.425101214574899, + "grad_norm": 1.1086448213277071, + "learning_rate": 9.94609294882829e-06, + "loss": 1.9559, + "step": 352 + }, + { + "epoch": 1.4291497975708503, + "grad_norm": 1.095236792272251, + "learning_rate": 9.94505320278307e-06, + "loss": 2.0925, + "step": 353 + }, + { + "epoch": 1.4331983805668016, + "grad_norm": 1.5358655904235856, + "learning_rate": 9.944003580496004e-06, + "loss": 2.1299, + "step": 354 + }, + { + "epoch": 1.4372469635627532, + "grad_norm": 4.618210545500014, + "learning_rate": 9.942944084063397e-06, + "loss": 1.906, + "step": 355 + }, + { + "epoch": 1.4412955465587045, + "grad_norm": 1.2771853507714968, + "learning_rate": 9.94187471560127e-06, + "loss": 1.8895, + "step": 356 + }, + { + "epoch": 1.4453441295546559, + "grad_norm": 1.503260525653169, + "learning_rate": 9.940795477245362e-06, + "loss": 2.123, + "step": 357 + }, + { + "epoch": 1.4493927125506074, + "grad_norm": 1.1357577615662766, + "learning_rate": 9.939706371151124e-06, + "loss": 1.9087, + "step": 358 + }, + { + "epoch": 1.4534412955465588, + "grad_norm": 1.3448821103990194, + "learning_rate": 9.938607399493714e-06, + "loss": 1.8989, + "step": 359 + }, + { + "epoch": 1.45748987854251, + "grad_norm": 1.3913310219583304, + "learning_rate": 9.937498564467993e-06, + "loss": 2.2799, + "step": 360 + }, + { + "epoch": 1.4615384615384617, + "grad_norm": 1.9605641433764716, + "learning_rate": 9.936379868288525e-06, + "loss": 2.5915, + "step": 361 + }, + { + "epoch": 1.465587044534413, + "grad_norm": 1.2844543412275256, + "learning_rate": 9.935251313189564e-06, + "loss": 2.1301, + "step": 362 + }, + { + "epoch": 1.4696356275303644, + "grad_norm": 1.034982029315575, + "learning_rate": 9.934112901425058e-06, + "loss": 2.0549, + "step": 363 + }, + { + "epoch": 1.4736842105263157, + "grad_norm": 1.204999735063322, + "learning_rate": 9.932964635268637e-06, + "loss": 1.9596, + "step": 364 + }, + { 
+ "epoch": 1.4777327935222673, + "grad_norm": 1.286601988495976, + "learning_rate": 9.931806517013612e-06, + "loss": 2.0348, + "step": 365 + }, + { + "epoch": 1.4817813765182186, + "grad_norm": 0.9482600934112612, + "learning_rate": 9.930638548972976e-06, + "loss": 1.9226, + "step": 366 + }, + { + "epoch": 1.48582995951417, + "grad_norm": 1.2527379198286719, + "learning_rate": 9.92946073347939e-06, + "loss": 1.9363, + "step": 367 + }, + { + "epoch": 1.4898785425101215, + "grad_norm": 1.416748811839403, + "learning_rate": 9.92827307288518e-06, + "loss": 1.8743, + "step": 368 + }, + { + "epoch": 1.4939271255060729, + "grad_norm": 1.4807677636442649, + "learning_rate": 9.927075569562342e-06, + "loss": 1.9204, + "step": 369 + }, + { + "epoch": 1.4979757085020242, + "grad_norm": 1.3869419977919077, + "learning_rate": 9.925868225902518e-06, + "loss": 1.8206, + "step": 370 + }, + { + "epoch": 1.5020242914979756, + "grad_norm": 1.1484019096824427, + "learning_rate": 9.924651044317017e-06, + "loss": 1.741, + "step": 371 + }, + { + "epoch": 1.5060728744939271, + "grad_norm": 1.33557569757452, + "learning_rate": 9.923424027236786e-06, + "loss": 2.0195, + "step": 372 + }, + { + "epoch": 1.5101214574898787, + "grad_norm": 1.3948710108814935, + "learning_rate": 9.922187177112422e-06, + "loss": 2.0682, + "step": 373 + }, + { + "epoch": 1.5141700404858298, + "grad_norm": 0.9670281862333157, + "learning_rate": 9.920940496414153e-06, + "loss": 2.0098, + "step": 374 + }, + { + "epoch": 1.5182186234817814, + "grad_norm": 1.1816940948972323, + "learning_rate": 9.919683987631849e-06, + "loss": 2.041, + "step": 375 + }, + { + "epoch": 1.522267206477733, + "grad_norm": 1.1912191018269882, + "learning_rate": 9.918417653275004e-06, + "loss": 1.9668, + "step": 376 + }, + { + "epoch": 1.526315789473684, + "grad_norm": 2.3568912806934783, + "learning_rate": 9.917141495872733e-06, + "loss": 1.737, + "step": 377 + }, + { + "epoch": 1.5303643724696356, + "grad_norm": 1.4730591126031292, + "learning_rate": 9.915855517973776e-06, + "loss": 1.8672, + "step": 378 + }, + { + "epoch": 1.5344129554655872, + "grad_norm": 1.5631199604094446, + "learning_rate": 9.914559722146483e-06, + "loss": 2.0038, + "step": 379 + }, + { + "epoch": 1.5384615384615383, + "grad_norm": 2.5148949693335014, + "learning_rate": 9.913254110978812e-06, + "loss": 2.0916, + "step": 380 + }, + { + "epoch": 1.54251012145749, + "grad_norm": 1.0936340454215232, + "learning_rate": 9.911938687078324e-06, + "loss": 1.9959, + "step": 381 + }, + { + "epoch": 1.5465587044534415, + "grad_norm": 9.59805118170954, + "learning_rate": 9.91061345307218e-06, + "loss": 2.6669, + "step": 382 + }, + { + "epoch": 1.5506072874493926, + "grad_norm": 5.341110768663029, + "learning_rate": 9.909278411607134e-06, + "loss": 2.7524, + "step": 383 + }, + { + "epoch": 1.5546558704453441, + "grad_norm": 6.319523626825805, + "learning_rate": 9.90793356534952e-06, + "loss": 3.2784, + "step": 384 + }, + { + "epoch": 1.5587044534412957, + "grad_norm": 1.1632747156326964, + "learning_rate": 9.906578916985267e-06, + "loss": 1.9441, + "step": 385 + }, + { + "epoch": 1.5627530364372468, + "grad_norm": 1.129320861281679, + "learning_rate": 9.90521446921987e-06, + "loss": 1.84, + "step": 386 + }, + { + "epoch": 1.5668016194331984, + "grad_norm": 1.0396625767769134, + "learning_rate": 9.9038402247784e-06, + "loss": 2.0999, + "step": 387 + }, + { + "epoch": 1.5708502024291497, + "grad_norm": 1.1109350507878293, + "learning_rate": 9.90245618640549e-06, + "loss": 1.7455, + "step": 388 + }, + { + 
"epoch": 1.574898785425101, + "grad_norm": 1.1573410708340344, + "learning_rate": 9.90106235686534e-06, + "loss": 2.1349, + "step": 389 + }, + { + "epoch": 1.5789473684210527, + "grad_norm": 1.0084157125260218, + "learning_rate": 9.8996587389417e-06, + "loss": 1.8406, + "step": 390 + }, + { + "epoch": 1.582995951417004, + "grad_norm": 1.1571333441837306, + "learning_rate": 9.89824533543787e-06, + "loss": 2.1231, + "step": 391 + }, + { + "epoch": 1.5870445344129553, + "grad_norm": 1.0697948256338023, + "learning_rate": 9.896822149176695e-06, + "loss": 1.9727, + "step": 392 + }, + { + "epoch": 1.591093117408907, + "grad_norm": 1.1795302734430202, + "learning_rate": 9.895389183000557e-06, + "loss": 1.9829, + "step": 393 + }, + { + "epoch": 1.5951417004048583, + "grad_norm": 1.3378200533531102, + "learning_rate": 9.893946439771369e-06, + "loss": 1.648, + "step": 394 + }, + { + "epoch": 1.5991902834008096, + "grad_norm": 1.190232768067943, + "learning_rate": 9.892493922370575e-06, + "loss": 1.6858, + "step": 395 + }, + { + "epoch": 1.6032388663967612, + "grad_norm": 1.1458315074040415, + "learning_rate": 9.891031633699135e-06, + "loss": 1.8744, + "step": 396 + }, + { + "epoch": 1.6072874493927125, + "grad_norm": 1.1819017581575564, + "learning_rate": 9.88955957667753e-06, + "loss": 1.7732, + "step": 397 + }, + { + "epoch": 1.6113360323886639, + "grad_norm": 1.8565903989047288, + "learning_rate": 9.888077754245741e-06, + "loss": 2.0753, + "step": 398 + }, + { + "epoch": 1.6153846153846154, + "grad_norm": 1.0244971639990994, + "learning_rate": 9.886586169363267e-06, + "loss": 1.9333, + "step": 399 + }, + { + "epoch": 1.6194331983805668, + "grad_norm": 1.249918723327364, + "learning_rate": 9.885084825009085e-06, + "loss": 1.8167, + "step": 400 + }, + { + "epoch": 1.623481781376518, + "grad_norm": 1.379879581099796, + "learning_rate": 9.883573724181683e-06, + "loss": 2.1783, + "step": 401 + }, + { + "epoch": 1.6275303643724697, + "grad_norm": 1.0714251364756116, + "learning_rate": 9.882052869899024e-06, + "loss": 1.9676, + "step": 402 + }, + { + "epoch": 1.631578947368421, + "grad_norm": 1.2237579067545878, + "learning_rate": 9.880522265198548e-06, + "loss": 2.154, + "step": 403 + }, + { + "epoch": 1.6356275303643724, + "grad_norm": 1.0681493200255976, + "learning_rate": 9.878981913137178e-06, + "loss": 1.8629, + "step": 404 + }, + { + "epoch": 1.639676113360324, + "grad_norm": 1.213978261543208, + "learning_rate": 9.877431816791299e-06, + "loss": 2.0544, + "step": 405 + }, + { + "epoch": 1.6437246963562753, + "grad_norm": 1.0906406926843764, + "learning_rate": 9.875871979256754e-06, + "loss": 2.0126, + "step": 406 + }, + { + "epoch": 1.6477732793522266, + "grad_norm": 1.1548847276751324, + "learning_rate": 9.87430240364885e-06, + "loss": 1.9896, + "step": 407 + }, + { + "epoch": 1.6518218623481782, + "grad_norm": 1.1007484969249457, + "learning_rate": 9.872723093102332e-06, + "loss": 1.8537, + "step": 408 + }, + { + "epoch": 1.6558704453441295, + "grad_norm": 1.4626798707839297, + "learning_rate": 9.871134050771398e-06, + "loss": 2.0636, + "step": 409 + }, + { + "epoch": 1.6599190283400809, + "grad_norm": 1.4362925135326843, + "learning_rate": 9.869535279829674e-06, + "loss": 1.892, + "step": 410 + }, + { + "epoch": 1.6639676113360324, + "grad_norm": 1.1158035130218342, + "learning_rate": 9.867926783470221e-06, + "loss": 2.0106, + "step": 411 + }, + { + "epoch": 1.6680161943319838, + "grad_norm": 1.094342494438384, + "learning_rate": 9.866308564905523e-06, + "loss": 2.0453, + "step": 412 + }, + { 
+ "epoch": 1.6720647773279351, + "grad_norm": 1.0432966613184569, + "learning_rate": 9.864680627367476e-06, + "loss": 1.9541, + "step": 413 + }, + { + "epoch": 1.6761133603238867, + "grad_norm": 1.2646590113938572, + "learning_rate": 9.863042974107395e-06, + "loss": 1.9078, + "step": 414 + }, + { + "epoch": 1.680161943319838, + "grad_norm": 1.4143613333940679, + "learning_rate": 9.861395608395993e-06, + "loss": 2.0498, + "step": 415 + }, + { + "epoch": 1.6842105263157894, + "grad_norm": 1.1227780591009553, + "learning_rate": 9.859738533523384e-06, + "loss": 1.8425, + "step": 416 + }, + { + "epoch": 1.688259109311741, + "grad_norm": 1.1478310296573677, + "learning_rate": 9.85807175279907e-06, + "loss": 1.9961, + "step": 417 + }, + { + "epoch": 1.6923076923076923, + "grad_norm": 1.1555612172711482, + "learning_rate": 9.856395269551941e-06, + "loss": 1.9982, + "step": 418 + }, + { + "epoch": 1.6963562753036436, + "grad_norm": 1.2453555718552303, + "learning_rate": 9.854709087130261e-06, + "loss": 1.8074, + "step": 419 + }, + { + "epoch": 1.7004048582995952, + "grad_norm": 1.3445248996792332, + "learning_rate": 9.85301320890167e-06, + "loss": 2.315, + "step": 420 + }, + { + "epoch": 1.7044534412955465, + "grad_norm": 1.37583724829167, + "learning_rate": 9.851307638253167e-06, + "loss": 2.0698, + "step": 421 + }, + { + "epoch": 1.708502024291498, + "grad_norm": 1.4100704184587762, + "learning_rate": 9.849592378591113e-06, + "loss": 1.7238, + "step": 422 + }, + { + "epoch": 1.7125506072874495, + "grad_norm": 1.2265807736330994, + "learning_rate": 9.847867433341218e-06, + "loss": 1.881, + "step": 423 + }, + { + "epoch": 1.7165991902834008, + "grad_norm": 1.192372006539784, + "learning_rate": 9.846132805948534e-06, + "loss": 1.9658, + "step": 424 + }, + { + "epoch": 1.7206477732793521, + "grad_norm": 1.307546713268623, + "learning_rate": 9.844388499877457e-06, + "loss": 1.873, + "step": 425 + }, + { + "epoch": 1.7246963562753037, + "grad_norm": 1.382722813051471, + "learning_rate": 9.842634518611705e-06, + "loss": 1.9664, + "step": 426 + }, + { + "epoch": 1.728744939271255, + "grad_norm": 1.4179302059943903, + "learning_rate": 9.840870865654323e-06, + "loss": 2.1073, + "step": 427 + }, + { + "epoch": 1.7327935222672064, + "grad_norm": 1.0508460965436048, + "learning_rate": 9.839097544527674e-06, + "loss": 1.9957, + "step": 428 + }, + { + "epoch": 1.736842105263158, + "grad_norm": 1.239601761065164, + "learning_rate": 9.837314558773427e-06, + "loss": 2.0381, + "step": 429 + }, + { + "epoch": 1.7408906882591093, + "grad_norm": 2.1485433652175137, + "learning_rate": 9.835521911952554e-06, + "loss": 2.6976, + "step": 430 + }, + { + "epoch": 1.7449392712550607, + "grad_norm": 1.2416619753926275, + "learning_rate": 9.833719607645325e-06, + "loss": 2.0715, + "step": 431 + }, + { + "epoch": 1.7489878542510122, + "grad_norm": 1.2591779562696075, + "learning_rate": 9.831907649451291e-06, + "loss": 1.9002, + "step": 432 + }, + { + "epoch": 1.7530364372469636, + "grad_norm": 1.1535891547143164, + "learning_rate": 9.830086040989294e-06, + "loss": 1.7871, + "step": 433 + }, + { + "epoch": 1.757085020242915, + "grad_norm": 1.1923358702044, + "learning_rate": 9.82825478589744e-06, + "loss": 1.9962, + "step": 434 + }, + { + "epoch": 1.7611336032388665, + "grad_norm": 4.275347299758622, + "learning_rate": 9.826413887833103e-06, + "loss": 2.9222, + "step": 435 + }, + { + "epoch": 1.7651821862348178, + "grad_norm": 4.287598045967039, + "learning_rate": 9.824563350472922e-06, + "loss": 2.8461, + "step": 436 + }, + { + 
"epoch": 1.7692307692307692, + "grad_norm": 10.935868536450831, + "learning_rate": 9.822703177512783e-06, + "loss": 2.7384, + "step": 437 + }, + { + "epoch": 1.7732793522267207, + "grad_norm": 1.3409883266265459, + "learning_rate": 9.820833372667813e-06, + "loss": 1.9939, + "step": 438 + }, + { + "epoch": 1.777327935222672, + "grad_norm": 1.3613081112789813, + "learning_rate": 9.818953939672382e-06, + "loss": 2.1821, + "step": 439 + }, + { + "epoch": 1.7813765182186234, + "grad_norm": 1.2675875076339627, + "learning_rate": 9.817064882280085e-06, + "loss": 2.2096, + "step": 440 + }, + { + "epoch": 1.785425101214575, + "grad_norm": 1.1133761183439654, + "learning_rate": 9.815166204263743e-06, + "loss": 2.0038, + "step": 441 + }, + { + "epoch": 1.7894736842105263, + "grad_norm": 1.0606754044873359, + "learning_rate": 9.813257909415384e-06, + "loss": 1.887, + "step": 442 + }, + { + "epoch": 1.7935222672064777, + "grad_norm": 1.2526447757224037, + "learning_rate": 9.811340001546252e-06, + "loss": 2.0549, + "step": 443 + }, + { + "epoch": 1.7975708502024292, + "grad_norm": 1.1262042906691425, + "learning_rate": 9.809412484486785e-06, + "loss": 2.077, + "step": 444 + }, + { + "epoch": 1.8016194331983806, + "grad_norm": 1.155022921046038, + "learning_rate": 9.80747536208661e-06, + "loss": 1.8171, + "step": 445 + }, + { + "epoch": 1.805668016194332, + "grad_norm": 1.1470501457250857, + "learning_rate": 9.805528638214543e-06, + "loss": 1.709, + "step": 446 + }, + { + "epoch": 1.8097165991902835, + "grad_norm": 1.254871859778204, + "learning_rate": 9.803572316758573e-06, + "loss": 2.005, + "step": 447 + }, + { + "epoch": 1.8137651821862348, + "grad_norm": 1.4428684006978485, + "learning_rate": 9.801606401625857e-06, + "loss": 2.0437, + "step": 448 + }, + { + "epoch": 1.8178137651821862, + "grad_norm": 1.1372709832560302, + "learning_rate": 9.799630896742716e-06, + "loss": 1.8053, + "step": 449 + }, + { + "epoch": 1.8218623481781377, + "grad_norm": 7.867540851479705, + "learning_rate": 9.797645806054617e-06, + "loss": 2.6057, + "step": 450 + }, + { + "epoch": 1.825910931174089, + "grad_norm": 17.828898730946783, + "learning_rate": 9.79565113352618e-06, + "loss": 4.1742, + "step": 451 + }, + { + "epoch": 1.8299595141700404, + "grad_norm": 1.3323533085958537, + "learning_rate": 9.793646883141155e-06, + "loss": 1.9001, + "step": 452 + }, + { + "epoch": 1.834008097165992, + "grad_norm": 1.2550944955882024, + "learning_rate": 9.791633058902424e-06, + "loss": 1.7789, + "step": 453 + }, + { + "epoch": 1.8380566801619433, + "grad_norm": 1.2515953723091495, + "learning_rate": 9.789609664831988e-06, + "loss": 1.8425, + "step": 454 + }, + { + "epoch": 1.8421052631578947, + "grad_norm": 1.1650016476570495, + "learning_rate": 9.787576704970965e-06, + "loss": 1.8701, + "step": 455 + }, + { + "epoch": 1.8461538461538463, + "grad_norm": 1.1568290050770706, + "learning_rate": 9.785534183379571e-06, + "loss": 1.8468, + "step": 456 + }, + { + "epoch": 1.8502024291497976, + "grad_norm": 1.1529373182216824, + "learning_rate": 9.783482104137127e-06, + "loss": 1.8772, + "step": 457 + }, + { + "epoch": 1.854251012145749, + "grad_norm": 1.3000516637827273, + "learning_rate": 9.781420471342035e-06, + "loss": 1.9477, + "step": 458 + }, + { + "epoch": 1.8582995951417005, + "grad_norm": 1.0258650008659411, + "learning_rate": 9.779349289111781e-06, + "loss": 1.8995, + "step": 459 + }, + { + "epoch": 1.8623481781376519, + "grad_norm": 1.2394575763975424, + "learning_rate": 9.777268561582921e-06, + "loss": 1.9406, + "step": 460 + }, 
+ { + "epoch": 1.8663967611336032, + "grad_norm": 1.2541685708518606, + "learning_rate": 9.77517829291108e-06, + "loss": 1.9325, + "step": 461 + }, + { + "epoch": 1.8704453441295548, + "grad_norm": 1.5330647366042962, + "learning_rate": 9.773078487270932e-06, + "loss": 2.4038, + "step": 462 + }, + { + "epoch": 1.874493927125506, + "grad_norm": 1.5015880335176561, + "learning_rate": 9.770969148856202e-06, + "loss": 2.3187, + "step": 463 + }, + { + "epoch": 1.8785425101214575, + "grad_norm": 1.4834304636666527, + "learning_rate": 9.768850281879651e-06, + "loss": 2.1105, + "step": 464 + }, + { + "epoch": 1.882591093117409, + "grad_norm": 1.2140714457469706, + "learning_rate": 9.766721890573075e-06, + "loss": 1.9824, + "step": 465 + }, + { + "epoch": 1.8866396761133604, + "grad_norm": 1.3661085878272685, + "learning_rate": 9.764583979187288e-06, + "loss": 1.5205, + "step": 466 + }, + { + "epoch": 1.8906882591093117, + "grad_norm": 1.2317311840953222, + "learning_rate": 9.762436551992117e-06, + "loss": 1.9872, + "step": 467 + }, + { + "epoch": 1.8947368421052633, + "grad_norm": 1.3883104250103875, + "learning_rate": 9.760279613276397e-06, + "loss": 2.0814, + "step": 468 + }, + { + "epoch": 1.8987854251012146, + "grad_norm": 1.1681713845582538, + "learning_rate": 9.75811316734796e-06, + "loss": 1.7849, + "step": 469 + }, + { + "epoch": 1.902834008097166, + "grad_norm": 1.15545443174025, + "learning_rate": 9.755937218533622e-06, + "loss": 1.8179, + "step": 470 + }, + { + "epoch": 1.9068825910931175, + "grad_norm": 1.5408624758508003, + "learning_rate": 9.753751771179177e-06, + "loss": 2.0286, + "step": 471 + }, + { + "epoch": 1.9109311740890689, + "grad_norm": 1.3817398480348058, + "learning_rate": 9.751556829649398e-06, + "loss": 1.5547, + "step": 472 + }, + { + "epoch": 1.9149797570850202, + "grad_norm": 1.3351696061966247, + "learning_rate": 9.74935239832801e-06, + "loss": 1.733, + "step": 473 + }, + { + "epoch": 1.9190283400809718, + "grad_norm": 1.264760117783077, + "learning_rate": 9.747138481617695e-06, + "loss": 1.767, + "step": 474 + }, + { + "epoch": 1.9230769230769231, + "grad_norm": 1.2863761477462097, + "learning_rate": 9.74491508394008e-06, + "loss": 2.0018, + "step": 475 + }, + { + "epoch": 1.9271255060728745, + "grad_norm": 1.5310497493928237, + "learning_rate": 9.742682209735727e-06, + "loss": 1.8865, + "step": 476 + }, + { + "epoch": 1.931174089068826, + "grad_norm": 1.711973469366144, + "learning_rate": 9.740439863464127e-06, + "loss": 1.9105, + "step": 477 + }, + { + "epoch": 1.9352226720647774, + "grad_norm": 1.249933707627717, + "learning_rate": 9.738188049603679e-06, + "loss": 1.7676, + "step": 478 + }, + { + "epoch": 1.9392712550607287, + "grad_norm": 1.2902981801333298, + "learning_rate": 9.735926772651703e-06, + "loss": 1.6493, + "step": 479 + }, + { + "epoch": 1.9433198380566803, + "grad_norm": 1.4792877192638219, + "learning_rate": 9.73365603712441e-06, + "loss": 1.9464, + "step": 480 + }, + { + "epoch": 1.9473684210526314, + "grad_norm": 1.3282266924987296, + "learning_rate": 9.731375847556905e-06, + "loss": 1.6826, + "step": 481 + }, + { + "epoch": 1.951417004048583, + "grad_norm": 1.4677668638223476, + "learning_rate": 9.729086208503174e-06, + "loss": 1.7014, + "step": 482 + }, + { + "epoch": 1.9554655870445345, + "grad_norm": 2.3808599607342855, + "learning_rate": 9.726787124536077e-06, + "loss": 1.9583, + "step": 483 + }, + { + "epoch": 1.9595141700404857, + "grad_norm": 1.3600754750050374, + "learning_rate": 9.724478600247333e-06, + "loss": 1.7925, + "step": 484 
+ }, + { + "epoch": 1.9635627530364372, + "grad_norm": 1.1666914976637783, + "learning_rate": 9.722160640247523e-06, + "loss": 1.8932, + "step": 485 + }, + { + "epoch": 1.9676113360323888, + "grad_norm": 1.3451750453053897, + "learning_rate": 9.719833249166061e-06, + "loss": 2.1332, + "step": 486 + }, + { + "epoch": 1.97165991902834, + "grad_norm": 1.9010105722641066, + "learning_rate": 9.717496431651212e-06, + "loss": 2.0526, + "step": 487 + }, + { + "epoch": 1.9757085020242915, + "grad_norm": 1.1672390815512188, + "learning_rate": 9.715150192370054e-06, + "loss": 1.8783, + "step": 488 + }, + { + "epoch": 1.979757085020243, + "grad_norm": 1.384114220461852, + "learning_rate": 9.712794536008488e-06, + "loss": 1.9859, + "step": 489 + }, + { + "epoch": 1.9838056680161942, + "grad_norm": 1.2933526518975824, + "learning_rate": 9.710429467271221e-06, + "loss": 2.0382, + "step": 490 + }, + { + "epoch": 1.9878542510121457, + "grad_norm": 1.423570288241044, + "learning_rate": 9.708054990881763e-06, + "loss": 1.8377, + "step": 491 + }, + { + "epoch": 1.9919028340080973, + "grad_norm": 1.2866158830707874, + "learning_rate": 9.705671111582406e-06, + "loss": 1.7694, + "step": 492 + }, + { + "epoch": 1.9959514170040484, + "grad_norm": 1.0521519412024614, + "learning_rate": 9.703277834134227e-06, + "loss": 2.0757, + "step": 493 + }, + { + "epoch": 2.0, + "grad_norm": 1.2995506674782646, + "learning_rate": 9.700875163317072e-06, + "loss": 1.8875, + "step": 494 + }, + { + "epoch": 2.0040485829959516, + "grad_norm": 1.1352855274001465, + "learning_rate": 9.698463103929542e-06, + "loss": 1.9618, + "step": 495 + }, + { + "epoch": 2.0080971659919027, + "grad_norm": 1.542269208448278, + "learning_rate": 9.696041660788997e-06, + "loss": 2.0888, + "step": 496 + }, + { + "epoch": 2.0121457489878543, + "grad_norm": 1.6780350902786914, + "learning_rate": 9.693610838731532e-06, + "loss": 1.9408, + "step": 497 + }, + { + "epoch": 2.016194331983806, + "grad_norm": 1.6035230575875041, + "learning_rate": 9.691170642611975e-06, + "loss": 2.0771, + "step": 498 + }, + { + "epoch": 2.020242914979757, + "grad_norm": 1.4671035377471024, + "learning_rate": 9.68872107730388e-06, + "loss": 2.3311, + "step": 499 + }, + { + "epoch": 2.0242914979757085, + "grad_norm": 1.5075955512152057, + "learning_rate": 9.686262147699507e-06, + "loss": 2.2077, + "step": 500 + }, + { + "epoch": 2.02834008097166, + "grad_norm": 1.5639916261560791, + "learning_rate": 9.683793858709821e-06, + "loss": 1.8546, + "step": 501 + }, + { + "epoch": 2.032388663967611, + "grad_norm": 1.5331421353363675, + "learning_rate": 9.681316215264481e-06, + "loss": 1.9004, + "step": 502 + }, + { + "epoch": 2.0364372469635628, + "grad_norm": 1.4656462364511347, + "learning_rate": 9.678829222311827e-06, + "loss": 2.1369, + "step": 503 + }, + { + "epoch": 2.0404858299595143, + "grad_norm": 1.7055289856989309, + "learning_rate": 9.67633288481887e-06, + "loss": 1.9294, + "step": 504 + }, + { + "epoch": 2.0445344129554655, + "grad_norm": 1.3320529357395552, + "learning_rate": 9.67382720777129e-06, + "loss": 1.9228, + "step": 505 + }, + { + "epoch": 2.048582995951417, + "grad_norm": 1.378485994628673, + "learning_rate": 9.671312196173413e-06, + "loss": 1.9005, + "step": 506 + }, + { + "epoch": 2.0526315789473686, + "grad_norm": 1.4519006220083899, + "learning_rate": 9.668787855048209e-06, + "loss": 2.0772, + "step": 507 + }, + { + "epoch": 2.0566801619433197, + "grad_norm": 1.46960243337033, + "learning_rate": 9.666254189437286e-06, + "loss": 1.9259, + "step": 508 + }, + { + 
"epoch": 2.0607287449392713, + "grad_norm": 1.3018755932293484, + "learning_rate": 9.663711204400872e-06, + "loss": 2.0637, + "step": 509 + }, + { + "epoch": 2.064777327935223, + "grad_norm": 1.4438151108336905, + "learning_rate": 9.661158905017804e-06, + "loss": 1.9998, + "step": 510 + }, + { + "epoch": 2.068825910931174, + "grad_norm": 1.5146888645164116, + "learning_rate": 9.658597296385527e-06, + "loss": 2.1032, + "step": 511 + }, + { + "epoch": 2.0728744939271255, + "grad_norm": 1.4173605487062464, + "learning_rate": 9.656026383620076e-06, + "loss": 1.9957, + "step": 512 + }, + { + "epoch": 2.076923076923077, + "grad_norm": 1.3186505882274318, + "learning_rate": 9.653446171856069e-06, + "loss": 1.9291, + "step": 513 + }, + { + "epoch": 2.080971659919028, + "grad_norm": 1.2929004725593367, + "learning_rate": 9.650856666246693e-06, + "loss": 1.9435, + "step": 514 + }, + { + "epoch": 2.08502024291498, + "grad_norm": 1.2511951269635655, + "learning_rate": 9.6482578719637e-06, + "loss": 1.9267, + "step": 515 + }, + { + "epoch": 2.0890688259109313, + "grad_norm": 1.9429673192553882, + "learning_rate": 9.645649794197394e-06, + "loss": 1.9435, + "step": 516 + }, + { + "epoch": 2.0931174089068825, + "grad_norm": 1.315419932054697, + "learning_rate": 9.643032438156616e-06, + "loss": 2.0396, + "step": 517 + }, + { + "epoch": 2.097165991902834, + "grad_norm": 1.3284199817957691, + "learning_rate": 9.640405809068743e-06, + "loss": 1.765, + "step": 518 + }, + { + "epoch": 2.1012145748987856, + "grad_norm": 1.4032585852247357, + "learning_rate": 9.637769912179664e-06, + "loss": 1.9292, + "step": 519 + }, + { + "epoch": 2.1052631578947367, + "grad_norm": 1.4202061741742247, + "learning_rate": 9.635124752753787e-06, + "loss": 1.9832, + "step": 520 + }, + { + "epoch": 2.1093117408906883, + "grad_norm": 1.4962037346644237, + "learning_rate": 9.632470336074009e-06, + "loss": 1.8461, + "step": 521 + }, + { + "epoch": 2.11336032388664, + "grad_norm": 1.829451958189404, + "learning_rate": 9.629806667441727e-06, + "loss": 1.7856, + "step": 522 + }, + { + "epoch": 2.117408906882591, + "grad_norm": 1.6374878381545, + "learning_rate": 9.627133752176809e-06, + "loss": 1.7441, + "step": 523 + }, + { + "epoch": 2.1214574898785425, + "grad_norm": 1.4010819404830996, + "learning_rate": 9.624451595617588e-06, + "loss": 1.7615, + "step": 524 + }, + { + "epoch": 2.125506072874494, + "grad_norm": 1.441999234959946, + "learning_rate": 9.62176020312086e-06, + "loss": 1.7378, + "step": 525 + }, + { + "epoch": 2.1295546558704452, + "grad_norm": 1.5770630911097265, + "learning_rate": 9.619059580061862e-06, + "loss": 1.7039, + "step": 526 + }, + { + "epoch": 2.133603238866397, + "grad_norm": 1.4591597594445938, + "learning_rate": 9.616349731834271e-06, + "loss": 2.0009, + "step": 527 + }, + { + "epoch": 2.1376518218623484, + "grad_norm": 1.6179185626843804, + "learning_rate": 9.613630663850184e-06, + "loss": 1.872, + "step": 528 + }, + { + "epoch": 2.1417004048582995, + "grad_norm": 1.3086175576058332, + "learning_rate": 9.610902381540115e-06, + "loss": 1.5977, + "step": 529 + }, + { + "epoch": 2.145748987854251, + "grad_norm": 1.444761778117532, + "learning_rate": 9.608164890352977e-06, + "loss": 2.0221, + "step": 530 + }, + { + "epoch": 2.1497975708502026, + "grad_norm": 1.4113693951603745, + "learning_rate": 9.605418195756077e-06, + "loss": 1.8497, + "step": 531 + }, + { + "epoch": 2.1538461538461537, + "grad_norm": 1.2987083078720463, + "learning_rate": 9.602662303235106e-06, + "loss": 1.9881, + "step": 532 + }, + { + 
"epoch": 2.1578947368421053, + "grad_norm": 1.5356679778352307, + "learning_rate": 9.599897218294122e-06, + "loss": 2.2169, + "step": 533 + }, + { + "epoch": 2.161943319838057, + "grad_norm": 1.2586253730389827, + "learning_rate": 9.597122946455539e-06, + "loss": 1.8884, + "step": 534 + }, + { + "epoch": 2.165991902834008, + "grad_norm": 1.3241548388576752, + "learning_rate": 9.594339493260127e-06, + "loss": 1.9169, + "step": 535 + }, + { + "epoch": 2.1700404858299596, + "grad_norm": 3.3161848122832627, + "learning_rate": 9.591546864266983e-06, + "loss": 2.3116, + "step": 536 + }, + { + "epoch": 2.174089068825911, + "grad_norm": 1.2785252284615238, + "learning_rate": 9.58874506505354e-06, + "loss": 1.7854, + "step": 537 + }, + { + "epoch": 2.1781376518218623, + "grad_norm": 1.4062987764786141, + "learning_rate": 9.58593410121554e-06, + "loss": 1.9564, + "step": 538 + }, + { + "epoch": 2.182186234817814, + "grad_norm": 1.1858759757574733, + "learning_rate": 9.583113978367026e-06, + "loss": 1.7449, + "step": 539 + }, + { + "epoch": 2.1862348178137654, + "grad_norm": 1.4958289357631562, + "learning_rate": 9.580284702140342e-06, + "loss": 1.8748, + "step": 540 + }, + { + "epoch": 2.1902834008097165, + "grad_norm": 1.271888181605562, + "learning_rate": 9.577446278186103e-06, + "loss": 1.944, + "step": 541 + }, + { + "epoch": 2.194331983805668, + "grad_norm": 1.6297569109832326, + "learning_rate": 9.574598712173202e-06, + "loss": 2.1136, + "step": 542 + }, + { + "epoch": 2.1983805668016196, + "grad_norm": 1.7294919253670684, + "learning_rate": 9.571742009788787e-06, + "loss": 2.1866, + "step": 543 + }, + { + "epoch": 2.2024291497975708, + "grad_norm": 1.5317790321439353, + "learning_rate": 9.568876176738251e-06, + "loss": 1.8859, + "step": 544 + }, + { + "epoch": 2.2064777327935223, + "grad_norm": 1.711554028884214, + "learning_rate": 9.56600121874523e-06, + "loss": 2.1936, + "step": 545 + }, + { + "epoch": 2.2105263157894735, + "grad_norm": 1.4435460877228636, + "learning_rate": 9.563117141551574e-06, + "loss": 2.2517, + "step": 546 + }, + { + "epoch": 2.214574898785425, + "grad_norm": 1.4961050962412457, + "learning_rate": 9.560223950917354e-06, + "loss": 2.041, + "step": 547 + }, + { + "epoch": 2.2186234817813766, + "grad_norm": 1.3247670963766616, + "learning_rate": 9.557321652620839e-06, + "loss": 1.8986, + "step": 548 + }, + { + "epoch": 2.2226720647773277, + "grad_norm": 1.4724998096864195, + "learning_rate": 9.554410252458489e-06, + "loss": 1.8568, + "step": 549 + }, + { + "epoch": 2.2267206477732793, + "grad_norm": 3.7991275518186196, + "learning_rate": 9.551489756244939e-06, + "loss": 1.9347, + "step": 550 + }, + { + "epoch": 2.230769230769231, + "grad_norm": 1.4010848185779328, + "learning_rate": 9.548560169812997e-06, + "loss": 1.8809, + "step": 551 + }, + { + "epoch": 2.234817813765182, + "grad_norm": 1.6221348693259423, + "learning_rate": 9.54562149901362e-06, + "loss": 2.0865, + "step": 552 + }, + { + "epoch": 2.2388663967611335, + "grad_norm": 1.4196865192753882, + "learning_rate": 9.54267374971591e-06, + "loss": 2.0449, + "step": 553 + }, + { + "epoch": 2.242914979757085, + "grad_norm": 1.4599787722592332, + "learning_rate": 9.539716927807102e-06, + "loss": 2.0083, + "step": 554 + }, + { + "epoch": 2.246963562753036, + "grad_norm": 1.251605201082177, + "learning_rate": 9.536751039192549e-06, + "loss": 1.8576, + "step": 555 + }, + { + "epoch": 2.251012145748988, + "grad_norm": 1.30407928376828, + "learning_rate": 9.533776089795712e-06, + "loss": 1.8923, + "step": 556 + }, + { + 
"epoch": 2.2550607287449393, + "grad_norm": 1.4348421622864604, + "learning_rate": 9.530792085558151e-06, + "loss": 1.9873, + "step": 557 + }, + { + "epoch": 2.2591093117408905, + "grad_norm": 1.4429474918555736, + "learning_rate": 9.527799032439506e-06, + "loss": 1.8211, + "step": 558 + }, + { + "epoch": 2.263157894736842, + "grad_norm": 1.338584745094179, + "learning_rate": 9.524796936417495e-06, + "loss": 1.7082, + "step": 559 + }, + { + "epoch": 2.2672064777327936, + "grad_norm": 1.329824996124572, + "learning_rate": 9.521785803487888e-06, + "loss": 1.9216, + "step": 560 + }, + { + "epoch": 2.2712550607287447, + "grad_norm": 1.3374675078915148, + "learning_rate": 9.518765639664512e-06, + "loss": 1.9723, + "step": 561 + }, + { + "epoch": 2.2753036437246963, + "grad_norm": 1.4689345418902104, + "learning_rate": 9.515736450979224e-06, + "loss": 1.953, + "step": 562 + }, + { + "epoch": 2.279352226720648, + "grad_norm": 1.6439512327159642, + "learning_rate": 9.512698243481914e-06, + "loss": 1.991, + "step": 563 + }, + { + "epoch": 2.283400809716599, + "grad_norm": 1.5280266119657933, + "learning_rate": 9.509651023240472e-06, + "loss": 1.7088, + "step": 564 + }, + { + "epoch": 2.2874493927125505, + "grad_norm": 1.5234607385845351, + "learning_rate": 9.5065947963408e-06, + "loss": 1.7975, + "step": 565 + }, + { + "epoch": 2.291497975708502, + "grad_norm": 1.4898313464385229, + "learning_rate": 9.50352956888678e-06, + "loss": 1.6643, + "step": 566 + }, + { + "epoch": 2.2955465587044532, + "grad_norm": 1.5049004900957001, + "learning_rate": 9.500455347000273e-06, + "loss": 2.0078, + "step": 567 + }, + { + "epoch": 2.299595141700405, + "grad_norm": 1.5268023276941818, + "learning_rate": 9.497372136821103e-06, + "loss": 2.0653, + "step": 568 + }, + { + "epoch": 2.3036437246963564, + "grad_norm": 1.5293343920918272, + "learning_rate": 9.49427994450705e-06, + "loss": 1.8078, + "step": 569 + }, + { + "epoch": 2.3076923076923075, + "grad_norm": 1.504441993367853, + "learning_rate": 9.491178776233825e-06, + "loss": 1.8219, + "step": 570 + }, + { + "epoch": 2.311740890688259, + "grad_norm": 1.3604060927952581, + "learning_rate": 9.488068638195072e-06, + "loss": 1.8582, + "step": 571 + }, + { + "epoch": 2.3157894736842106, + "grad_norm": 1.7336288728624165, + "learning_rate": 9.484949536602343e-06, + "loss": 1.8562, + "step": 572 + }, + { + "epoch": 2.3198380566801617, + "grad_norm": 1.536212130823414, + "learning_rate": 9.481821477685102e-06, + "loss": 1.7431, + "step": 573 + }, + { + "epoch": 2.3238866396761133, + "grad_norm": 1.4120913757834546, + "learning_rate": 9.478684467690693e-06, + "loss": 1.7586, + "step": 574 + }, + { + "epoch": 2.327935222672065, + "grad_norm": 1.453958520209467, + "learning_rate": 9.47553851288434e-06, + "loss": 1.7694, + "step": 575 + }, + { + "epoch": 2.331983805668016, + "grad_norm": 1.3935000424019952, + "learning_rate": 9.472383619549133e-06, + "loss": 1.6545, + "step": 576 + }, + { + "epoch": 2.3360323886639676, + "grad_norm": 1.3589610652505588, + "learning_rate": 9.469219793986016e-06, + "loss": 1.6896, + "step": 577 + }, + { + "epoch": 2.340080971659919, + "grad_norm": 1.7566987829139051, + "learning_rate": 9.466047042513767e-06, + "loss": 1.6272, + "step": 578 + }, + { + "epoch": 2.3441295546558703, + "grad_norm": 1.3287178155779462, + "learning_rate": 9.462865371468994e-06, + "loss": 1.7176, + "step": 579 + }, + { + "epoch": 2.348178137651822, + "grad_norm": 1.8490808825118674, + "learning_rate": 9.459674787206117e-06, + "loss": 1.9005, + "step": 580 + }, + { + 
"epoch": 2.3522267206477734, + "grad_norm": 1.8200114285326863, + "learning_rate": 9.45647529609736e-06, + "loss": 1.7493, + "step": 581 + }, + { + "epoch": 2.3562753036437245, + "grad_norm": 1.7944997812037724, + "learning_rate": 9.453266904532737e-06, + "loss": 1.4856, + "step": 582 + }, + { + "epoch": 2.360323886639676, + "grad_norm": 1.6449884777915886, + "learning_rate": 9.450049618920034e-06, + "loss": 1.8312, + "step": 583 + }, + { + "epoch": 2.3643724696356276, + "grad_norm": 1.6009358010430617, + "learning_rate": 9.4468234456848e-06, + "loss": 1.9048, + "step": 584 + }, + { + "epoch": 2.3684210526315788, + "grad_norm": 1.519230320705593, + "learning_rate": 9.44358839127034e-06, + "loss": 1.8077, + "step": 585 + }, + { + "epoch": 2.3724696356275303, + "grad_norm": 1.8694258750708748, + "learning_rate": 9.44034446213769e-06, + "loss": 1.9556, + "step": 586 + }, + { + "epoch": 2.376518218623482, + "grad_norm": 1.4302907644008036, + "learning_rate": 9.437091664765611e-06, + "loss": 1.5064, + "step": 587 + }, + { + "epoch": 2.380566801619433, + "grad_norm": 1.5423881317930213, + "learning_rate": 9.433830005650582e-06, + "loss": 1.69, + "step": 588 + }, + { + "epoch": 2.3846153846153846, + "grad_norm": 1.4747017336722326, + "learning_rate": 9.430559491306777e-06, + "loss": 1.5552, + "step": 589 + }, + { + "epoch": 2.388663967611336, + "grad_norm": 1.600482934018078, + "learning_rate": 9.427280128266049e-06, + "loss": 1.6106, + "step": 590 + }, + { + "epoch": 2.3927125506072873, + "grad_norm": 1.5014148151060753, + "learning_rate": 9.423991923077938e-06, + "loss": 1.7636, + "step": 591 + }, + { + "epoch": 2.396761133603239, + "grad_norm": 1.7672182274084831, + "learning_rate": 9.420694882309628e-06, + "loss": 1.5786, + "step": 592 + }, + { + "epoch": 2.4008097165991904, + "grad_norm": 1.440572594457583, + "learning_rate": 9.41738901254596e-06, + "loss": 1.6426, + "step": 593 + }, + { + "epoch": 2.4048582995951415, + "grad_norm": 1.5625132261883155, + "learning_rate": 9.414074320389403e-06, + "loss": 1.7306, + "step": 594 + }, + { + "epoch": 2.408906882591093, + "grad_norm": 1.683823244071828, + "learning_rate": 9.41075081246005e-06, + "loss": 1.821, + "step": 595 + }, + { + "epoch": 2.4129554655870447, + "grad_norm": 1.4314599370281114, + "learning_rate": 9.4074184953956e-06, + "loss": 1.6872, + "step": 596 + }, + { + "epoch": 2.417004048582996, + "grad_norm": 1.5657957134872598, + "learning_rate": 9.404077375851338e-06, + "loss": 1.9362, + "step": 597 + }, + { + "epoch": 2.4210526315789473, + "grad_norm": 1.6198467768431548, + "learning_rate": 9.400727460500141e-06, + "loss": 2.0139, + "step": 598 + }, + { + "epoch": 2.425101214574899, + "grad_norm": 1.4103077055466628, + "learning_rate": 9.397368756032445e-06, + "loss": 1.8485, + "step": 599 + }, + { + "epoch": 2.42914979757085, + "grad_norm": 1.3471173889103276, + "learning_rate": 9.394001269156245e-06, + "loss": 1.9812, + "step": 600 + }, + { + "epoch": 2.4331983805668016, + "grad_norm": 1.4234064588511484, + "learning_rate": 9.39062500659707e-06, + "loss": 2.0496, + "step": 601 + }, + { + "epoch": 2.437246963562753, + "grad_norm": 1.4784926767119206, + "learning_rate": 9.38723997509798e-06, + "loss": 1.837, + "step": 602 + }, + { + "epoch": 2.4412955465587043, + "grad_norm": 1.5518065193263646, + "learning_rate": 9.383846181419547e-06, + "loss": 1.765, + "step": 603 + }, + { + "epoch": 2.445344129554656, + "grad_norm": 1.3196666479973478, + "learning_rate": 9.380443632339845e-06, + "loss": 2.0255, + "step": 604 + }, + { + "epoch": 
2.4493927125506074, + "grad_norm": 1.440061740597458, + "learning_rate": 9.37703233465443e-06, + "loss": 1.7942, + "step": 605 + }, + { + "epoch": 2.4534412955465585, + "grad_norm": 1.5327759577164166, + "learning_rate": 9.373612295176333e-06, + "loss": 1.777, + "step": 606 + }, + { + "epoch": 2.45748987854251, + "grad_norm": 1.6814358499503075, + "learning_rate": 9.370183520736045e-06, + "loss": 2.185, + "step": 607 + }, + { + "epoch": 2.4615384615384617, + "grad_norm": 2.30393335895373, + "learning_rate": 9.366746018181503e-06, + "loss": 2.4563, + "step": 608 + }, + { + "epoch": 2.465587044534413, + "grad_norm": 1.8584859443814368, + "learning_rate": 9.363299794378072e-06, + "loss": 2.0155, + "step": 609 + }, + { + "epoch": 2.4696356275303644, + "grad_norm": 1.2803493212403667, + "learning_rate": 9.359844856208538e-06, + "loss": 1.9623, + "step": 610 + }, + { + "epoch": 2.473684210526316, + "grad_norm": 1.323092986933548, + "learning_rate": 9.356381210573092e-06, + "loss": 1.8725, + "step": 611 + }, + { + "epoch": 2.477732793522267, + "grad_norm": 1.716120944564361, + "learning_rate": 9.352908864389313e-06, + "loss": 1.9058, + "step": 612 + }, + { + "epoch": 2.4817813765182186, + "grad_norm": 1.1767574227433577, + "learning_rate": 9.349427824592157e-06, + "loss": 1.818, + "step": 613 + }, + { + "epoch": 2.48582995951417, + "grad_norm": 1.8646580879242294, + "learning_rate": 9.345938098133946e-06, + "loss": 1.8001, + "step": 614 + }, + { + "epoch": 2.4898785425101213, + "grad_norm": 1.7755724904128214, + "learning_rate": 9.342439691984346e-06, + "loss": 1.7282, + "step": 615 + }, + { + "epoch": 2.493927125506073, + "grad_norm": 1.7352293901651843, + "learning_rate": 9.338932613130363e-06, + "loss": 1.7961, + "step": 616 + }, + { + "epoch": 2.4979757085020244, + "grad_norm": 1.6153408388514847, + "learning_rate": 9.33541686857632e-06, + "loss": 1.662, + "step": 617 + }, + { + "epoch": 2.5020242914979756, + "grad_norm": 1.5099283023047843, + "learning_rate": 9.331892465343851e-06, + "loss": 1.588, + "step": 618 + }, + { + "epoch": 2.506072874493927, + "grad_norm": 1.730183741035281, + "learning_rate": 9.328359410471878e-06, + "loss": 1.8722, + "step": 619 + }, + { + "epoch": 2.5101214574898787, + "grad_norm": 1.7321761047223487, + "learning_rate": 9.324817711016609e-06, + "loss": 1.9167, + "step": 620 + }, + { + "epoch": 2.51417004048583, + "grad_norm": 1.2095836589724516, + "learning_rate": 9.32126737405151e-06, + "loss": 1.8743, + "step": 621 + }, + { + "epoch": 2.5182186234817814, + "grad_norm": 1.5485434750214813, + "learning_rate": 9.3177084066673e-06, + "loss": 1.89, + "step": 622 + }, + { + "epoch": 2.522267206477733, + "grad_norm": 1.5145693598054688, + "learning_rate": 9.31414081597194e-06, + "loss": 1.8321, + "step": 623 + }, + { + "epoch": 2.526315789473684, + "grad_norm": 1.8660173525702701, + "learning_rate": 9.310564609090605e-06, + "loss": 1.6178, + "step": 624 + }, + { + "epoch": 2.5303643724696356, + "grad_norm": 1.9092894315915314, + "learning_rate": 9.306979793165682e-06, + "loss": 1.718, + "step": 625 + }, + { + "epoch": 2.534412955465587, + "grad_norm": 2.1574694273419817, + "learning_rate": 9.303386375356752e-06, + "loss": 1.8536, + "step": 626 + }, + { + "epoch": 2.5384615384615383, + "grad_norm": 1.5187220263251169, + "learning_rate": 9.299784362840578e-06, + "loss": 2.0088, + "step": 627 + }, + { + "epoch": 2.54251012145749, + "grad_norm": 1.3524410374053388, + "learning_rate": 9.296173762811084e-06, + "loss": 1.8993, + "step": 628 + }, + { + "epoch": 
2.5465587044534415, + "grad_norm": 3.8294272400161993, + "learning_rate": 9.292554582479349e-06, + "loss": 2.3583, + "step": 629 + }, + { + "epoch": 2.5506072874493926, + "grad_norm": 6.070012543144345, + "learning_rate": 9.288926829073583e-06, + "loss": 2.4906, + "step": 630 + }, + { + "epoch": 2.554655870445344, + "grad_norm": 5.603752988478888, + "learning_rate": 9.285290509839126e-06, + "loss": 2.7822, + "step": 631 + }, + { + "epoch": 2.5587044534412957, + "grad_norm": 1.4481838054717586, + "learning_rate": 9.281645632038417e-06, + "loss": 1.8168, + "step": 632 + }, + { + "epoch": 2.562753036437247, + "grad_norm": 1.414449313894791, + "learning_rate": 9.277992202950996e-06, + "loss": 1.7136, + "step": 633 + }, + { + "epoch": 2.5668016194331984, + "grad_norm": 1.4634757861687506, + "learning_rate": 9.274330229873474e-06, + "loss": 2.0032, + "step": 634 + }, + { + "epoch": 2.57085020242915, + "grad_norm": 1.484422105707642, + "learning_rate": 9.270659720119533e-06, + "loss": 1.6359, + "step": 635 + }, + { + "epoch": 2.574898785425101, + "grad_norm": 1.4574650651898802, + "learning_rate": 9.266980681019902e-06, + "loss": 1.9962, + "step": 636 + }, + { + "epoch": 2.5789473684210527, + "grad_norm": 1.2408661225828688, + "learning_rate": 9.263293119922341e-06, + "loss": 1.7137, + "step": 637 + }, + { + "epoch": 2.582995951417004, + "grad_norm": 1.4397062187160998, + "learning_rate": 9.259597044191635e-06, + "loss": 1.9567, + "step": 638 + }, + { + "epoch": 2.5870445344129553, + "grad_norm": 1.3678454147168124, + "learning_rate": 9.255892461209574e-06, + "loss": 1.8607, + "step": 639 + }, + { + "epoch": 2.591093117408907, + "grad_norm": 1.51295578810032, + "learning_rate": 9.252179378374937e-06, + "loss": 1.8423, + "step": 640 + }, + { + "epoch": 2.5951417004048585, + "grad_norm": 1.493191888596024, + "learning_rate": 9.248457803103476e-06, + "loss": 1.5365, + "step": 641 + }, + { + "epoch": 2.5991902834008096, + "grad_norm": 1.4402174802959915, + "learning_rate": 9.24472774282791e-06, + "loss": 1.5837, + "step": 642 + }, + { + "epoch": 2.603238866396761, + "grad_norm": 1.3814570168249611, + "learning_rate": 9.240989204997903e-06, + "loss": 1.7433, + "step": 643 + }, + { + "epoch": 2.6072874493927127, + "grad_norm": 1.4229224856881553, + "learning_rate": 9.237242197080045e-06, + "loss": 1.6373, + "step": 644 + }, + { + "epoch": 2.611336032388664, + "grad_norm": 1.529255344732051, + "learning_rate": 9.23348672655785e-06, + "loss": 1.9638, + "step": 645 + }, + { + "epoch": 2.6153846153846154, + "grad_norm": 1.2990811736528833, + "learning_rate": 9.229722800931727e-06, + "loss": 1.8372, + "step": 646 + }, + { + "epoch": 2.619433198380567, + "grad_norm": 1.7287958707975635, + "learning_rate": 9.225950427718974e-06, + "loss": 1.665, + "step": 647 + }, + { + "epoch": 2.623481781376518, + "grad_norm": 1.631936855970988, + "learning_rate": 9.222169614453765e-06, + "loss": 2.052, + "step": 648 + }, + { + "epoch": 2.6275303643724697, + "grad_norm": 1.384358037456477, + "learning_rate": 9.21838036868712e-06, + "loss": 1.8437, + "step": 649 + }, + { + "epoch": 2.6315789473684212, + "grad_norm": 1.57010881393224, + "learning_rate": 9.21458269798691e-06, + "loss": 2.0542, + "step": 650 + }, + { + "epoch": 2.6356275303643724, + "grad_norm": 1.4074541953077098, + "learning_rate": 9.21077660993783e-06, + "loss": 1.7342, + "step": 651 + }, + { + "epoch": 2.639676113360324, + "grad_norm": 1.6189308816605772, + "learning_rate": 9.206962112141382e-06, + "loss": 1.9321, + "step": 652 + }, + { + "epoch": 
2.6437246963562755, + "grad_norm": 1.4090618348929758, + "learning_rate": 9.203139212215868e-06, + "loss": 1.871, + "step": 653 + }, + { + "epoch": 2.6477732793522266, + "grad_norm": 1.9494105407548425, + "learning_rate": 9.199307917796371e-06, + "loss": 1.8667, + "step": 654 + }, + { + "epoch": 2.651821862348178, + "grad_norm": 1.4331583331274316, + "learning_rate": 9.195468236534734e-06, + "loss": 1.7255, + "step": 655 + }, + { + "epoch": 2.6558704453441297, + "grad_norm": 1.5909315996217737, + "learning_rate": 9.191620176099559e-06, + "loss": 1.9444, + "step": 656 + }, + { + "epoch": 2.659919028340081, + "grad_norm": 1.7461445494408216, + "learning_rate": 9.187763744176175e-06, + "loss": 1.7728, + "step": 657 + }, + { + "epoch": 2.6639676113360324, + "grad_norm": 1.422126938114325, + "learning_rate": 9.183898948466633e-06, + "loss": 1.9077, + "step": 658 + }, + { + "epoch": 2.668016194331984, + "grad_norm": 1.4144043249974336, + "learning_rate": 9.180025796689692e-06, + "loss": 1.9331, + "step": 659 + }, + { + "epoch": 2.672064777327935, + "grad_norm": 2.7772861017132255, + "learning_rate": 9.176144296580794e-06, + "loss": 1.8667, + "step": 660 + }, + { + "epoch": 2.6761133603238867, + "grad_norm": 1.3064807850177453, + "learning_rate": 9.172254455892054e-06, + "loss": 1.8187, + "step": 661 + }, + { + "epoch": 2.6801619433198383, + "grad_norm": 1.7419083953095058, + "learning_rate": 9.168356282392253e-06, + "loss": 1.903, + "step": 662 + }, + { + "epoch": 2.6842105263157894, + "grad_norm": 1.4496863008780128, + "learning_rate": 9.164449783866802e-06, + "loss": 1.7048, + "step": 663 + }, + { + "epoch": 2.688259109311741, + "grad_norm": 1.491984655358695, + "learning_rate": 9.160534968117752e-06, + "loss": 1.8734, + "step": 664 + }, + { + "epoch": 2.6923076923076925, + "grad_norm": 1.5308194782439823, + "learning_rate": 9.156611842963753e-06, + "loss": 1.8788, + "step": 665 + }, + { + "epoch": 2.6963562753036436, + "grad_norm": 1.3476877228875297, + "learning_rate": 9.152680416240059e-06, + "loss": 1.7147, + "step": 666 + }, + { + "epoch": 2.700404858299595, + "grad_norm": 1.8151640153934792, + "learning_rate": 9.1487406957985e-06, + "loss": 2.2048, + "step": 667 + }, + { + "epoch": 2.7044534412955468, + "grad_norm": 1.7628995278188238, + "learning_rate": 9.144792689507471e-06, + "loss": 1.9635, + "step": 668 + }, + { + "epoch": 2.708502024291498, + "grad_norm": 1.602921120835359, + "learning_rate": 9.140836405251917e-06, + "loss": 1.5744, + "step": 669 + }, + { + "epoch": 2.7125506072874495, + "grad_norm": 1.490856129715411, + "learning_rate": 9.136871850933312e-06, + "loss": 1.7612, + "step": 670 + }, + { + "epoch": 2.716599190283401, + "grad_norm": 1.4382592619602368, + "learning_rate": 9.132899034469648e-06, + "loss": 1.8414, + "step": 671 + }, + { + "epoch": 2.720647773279352, + "grad_norm": 1.8014041637984994, + "learning_rate": 9.128917963795422e-06, + "loss": 1.7066, + "step": 672 + }, + { + "epoch": 2.7246963562753037, + "grad_norm": 1.7582254633750898, + "learning_rate": 9.124928646861613e-06, + "loss": 1.7925, + "step": 673 + }, + { + "epoch": 2.7287449392712553, + "grad_norm": 1.6343159265633571, + "learning_rate": 9.120931091635669e-06, + "loss": 1.9923, + "step": 674 + }, + { + "epoch": 2.7327935222672064, + "grad_norm": 1.3849537338720197, + "learning_rate": 9.116925306101494e-06, + "loss": 1.858, + "step": 675 + }, + { + "epoch": 2.736842105263158, + "grad_norm": 1.5938145614524974, + "learning_rate": 9.112911298259426e-06, + "loss": 1.8935, + "step": 676 + }, + { + 
"epoch": 2.7408906882591095, + "grad_norm": 2.232137755564454, + "learning_rate": 9.108889076126226e-06, + "loss": 2.5611, + "step": 677 + }, + { + "epoch": 2.7449392712550607, + "grad_norm": 1.597451641610388, + "learning_rate": 9.104858647735065e-06, + "loss": 1.9346, + "step": 678 + }, + { + "epoch": 2.748987854251012, + "grad_norm": 1.734843462936045, + "learning_rate": 9.100820021135495e-06, + "loss": 1.7738, + "step": 679 + }, + { + "epoch": 2.753036437246964, + "grad_norm": 1.5432674907856907, + "learning_rate": 9.09677320439345e-06, + "loss": 1.6451, + "step": 680 + }, + { + "epoch": 2.757085020242915, + "grad_norm": 1.4375865005427824, + "learning_rate": 9.092718205591213e-06, + "loss": 1.8788, + "step": 681 + }, + { + "epoch": 2.7611336032388665, + "grad_norm": 3.7437865438416433, + "learning_rate": 9.088655032827418e-06, + "loss": 2.6938, + "step": 682 + }, + { + "epoch": 2.765182186234818, + "grad_norm": 6.350052687447943, + "learning_rate": 9.084583694217012e-06, + "loss": 2.5299, + "step": 683 + }, + { + "epoch": 2.769230769230769, + "grad_norm": 4.945671727596882, + "learning_rate": 9.080504197891262e-06, + "loss": 2.4088, + "step": 684 + }, + { + "epoch": 2.7732793522267207, + "grad_norm": 1.6795835965091561, + "learning_rate": 9.076416551997721e-06, + "loss": 1.824, + "step": 685 + }, + { + "epoch": 2.7773279352226723, + "grad_norm": 1.5949270953831338, + "learning_rate": 9.072320764700223e-06, + "loss": 2.0511, + "step": 686 + }, + { + "epoch": 2.7813765182186234, + "grad_norm": 1.4556536124547252, + "learning_rate": 9.068216844178857e-06, + "loss": 2.0932, + "step": 687 + }, + { + "epoch": 2.785425101214575, + "grad_norm": 1.6439876597132232, + "learning_rate": 9.064104798629955e-06, + "loss": 1.8796, + "step": 688 + }, + { + "epoch": 2.7894736842105265, + "grad_norm": 1.4368651555210203, + "learning_rate": 9.059984636266082e-06, + "loss": 1.7757, + "step": 689 + }, + { + "epoch": 2.7935222672064777, + "grad_norm": 1.6510465877279545, + "learning_rate": 9.055856365316012e-06, + "loss": 1.9039, + "step": 690 + }, + { + "epoch": 2.7975708502024292, + "grad_norm": 1.5313446048549542, + "learning_rate": 9.051719994024711e-06, + "loss": 1.9171, + "step": 691 + }, + { + "epoch": 2.801619433198381, + "grad_norm": 1.5880262025571767, + "learning_rate": 9.047575530653324e-06, + "loss": 1.6852, + "step": 692 + }, + { + "epoch": 2.805668016194332, + "grad_norm": 1.4675446257129918, + "learning_rate": 9.043422983479158e-06, + "loss": 1.5727, + "step": 693 + }, + { + "epoch": 2.8097165991902835, + "grad_norm": 1.6282110219820332, + "learning_rate": 9.039262360795664e-06, + "loss": 1.9079, + "step": 694 + }, + { + "epoch": 2.813765182186235, + "grad_norm": 1.9452631088170542, + "learning_rate": 9.035093670912424e-06, + "loss": 1.9093, + "step": 695 + }, + { + "epoch": 2.817813765182186, + "grad_norm": 1.6299011643761043, + "learning_rate": 9.03091692215513e-06, + "loss": 1.6569, + "step": 696 + }, + { + "epoch": 2.8218623481781377, + "grad_norm": 7.734091901664539, + "learning_rate": 9.026732122865567e-06, + "loss": 2.4758, + "step": 697 + }, + { + "epoch": 2.8259109311740893, + "grad_norm": 18.1486281089367, + "learning_rate": 9.022539281401601e-06, + "loss": 3.9379, + "step": 698 + }, + { + "epoch": 2.8299595141700404, + "grad_norm": 1.7406474445735873, + "learning_rate": 9.01833840613716e-06, + "loss": 1.7599, + "step": 699 + }, + { + "epoch": 2.834008097165992, + "grad_norm": 1.7079549569427872, + "learning_rate": 9.014129505462217e-06, + "loss": 1.6112, + "step": 700 + }, + { + 
"epoch": 2.8380566801619436, + "grad_norm": 1.5492178198371753, + "learning_rate": 9.009912587782772e-06, + "loss": 1.719, + "step": 701 + }, + { + "epoch": 2.8421052631578947, + "grad_norm": 1.5966963855692302, + "learning_rate": 9.005687661520838e-06, + "loss": 1.7237, + "step": 702 + }, + { + "epoch": 2.8461538461538463, + "grad_norm": 1.5738987901659376, + "learning_rate": 9.00145473511442e-06, + "loss": 1.6892, + "step": 703 + }, + { + "epoch": 2.850202429149798, + "grad_norm": 1.6008695127081995, + "learning_rate": 8.997213817017508e-06, + "loss": 1.7534, + "step": 704 + }, + { + "epoch": 2.854251012145749, + "grad_norm": 1.8027657159531043, + "learning_rate": 8.99296491570004e-06, + "loss": 1.8313, + "step": 705 + }, + { + "epoch": 2.8582995951417005, + "grad_norm": 1.388477920242152, + "learning_rate": 8.98870803964791e-06, + "loss": 1.7662, + "step": 706 + }, + { + "epoch": 2.862348178137652, + "grad_norm": 1.697508321391829, + "learning_rate": 8.984443197362938e-06, + "loss": 1.7739, + "step": 707 + }, + { + "epoch": 2.866396761133603, + "grad_norm": 1.7051210953826448, + "learning_rate": 8.980170397362846e-06, + "loss": 1.7885, + "step": 708 + }, + { + "epoch": 2.8704453441295548, + "grad_norm": 2.112476620801928, + "learning_rate": 8.975889648181258e-06, + "loss": 2.2786, + "step": 709 + }, + { + "epoch": 2.8744939271255063, + "grad_norm": 1.9686852205718806, + "learning_rate": 8.971600958367668e-06, + "loss": 2.2033, + "step": 710 + }, + { + "epoch": 2.8785425101214575, + "grad_norm": 1.8858645037099275, + "learning_rate": 8.96730433648743e-06, + "loss": 1.9747, + "step": 711 + }, + { + "epoch": 2.882591093117409, + "grad_norm": 1.629389176480098, + "learning_rate": 8.962999791121745e-06, + "loss": 1.8561, + "step": 712 + }, + { + "epoch": 2.8866396761133606, + "grad_norm": 1.7283481294339973, + "learning_rate": 8.958687330867634e-06, + "loss": 1.3887, + "step": 713 + }, + { + "epoch": 2.8906882591093117, + "grad_norm": 1.5884187879059617, + "learning_rate": 8.954366964337926e-06, + "loss": 1.8757, + "step": 714 + }, + { + "epoch": 2.8947368421052633, + "grad_norm": 1.5310621607610841, + "learning_rate": 8.950038700161239e-06, + "loss": 1.9746, + "step": 715 + }, + { + "epoch": 2.898785425101215, + "grad_norm": 1.4608377788624507, + "learning_rate": 8.94570254698197e-06, + "loss": 1.6592, + "step": 716 + }, + { + "epoch": 2.902834008097166, + "grad_norm": 1.5297317667519899, + "learning_rate": 8.941358513460264e-06, + "loss": 1.722, + "step": 717 + }, + { + "epoch": 2.9068825910931175, + "grad_norm": 1.847621037937598, + "learning_rate": 8.937006608272009e-06, + "loss": 1.9182, + "step": 718 + }, + { + "epoch": 2.910931174089069, + "grad_norm": 1.6585955176413567, + "learning_rate": 8.932646840108818e-06, + "loss": 1.4523, + "step": 719 + }, + { + "epoch": 2.91497975708502, + "grad_norm": 1.807939122311604, + "learning_rate": 8.928279217677999e-06, + "loss": 1.5928, + "step": 720 + }, + { + "epoch": 2.919028340080972, + "grad_norm": 1.6812175947881611, + "learning_rate": 8.923903749702556e-06, + "loss": 1.6197, + "step": 721 + }, + { + "epoch": 2.9230769230769234, + "grad_norm": 1.5868810975571848, + "learning_rate": 8.919520444921153e-06, + "loss": 1.9066, + "step": 722 + }, + { + "epoch": 2.9271255060728745, + "grad_norm": 2.008002647816905, + "learning_rate": 8.915129312088112e-06, + "loss": 1.7547, + "step": 723 + }, + { + "epoch": 2.931174089068826, + "grad_norm": 2.2074435698181185, + "learning_rate": 8.910730359973386e-06, + "loss": 1.7851, + "step": 724 + }, + { + 
"epoch": 2.9352226720647776, + "grad_norm": 1.6720121053555042, + "learning_rate": 8.906323597362547e-06, + "loss": 1.6173, + "step": 725 + }, + { + "epoch": 2.9392712550607287, + "grad_norm": 1.7840437064722243, + "learning_rate": 8.901909033056763e-06, + "loss": 1.5244, + "step": 726 + }, + { + "epoch": 2.9433198380566803, + "grad_norm": 2.087404813784654, + "learning_rate": 8.89748667587279e-06, + "loss": 1.8108, + "step": 727 + }, + { + "epoch": 2.9473684210526314, + "grad_norm": 1.7622420447448541, + "learning_rate": 8.893056534642938e-06, + "loss": 1.5553, + "step": 728 + }, + { + "epoch": 2.951417004048583, + "grad_norm": 1.9454050876073625, + "learning_rate": 8.88861861821507e-06, + "loss": 1.5518, + "step": 729 + }, + { + "epoch": 2.9554655870445345, + "grad_norm": 3.180217232768608, + "learning_rate": 8.88417293545258e-06, + "loss": 1.7772, + "step": 730 + }, + { + "epoch": 2.9595141700404857, + "grad_norm": 3.564301283270782, + "learning_rate": 8.879719495234363e-06, + "loss": 1.6766, + "step": 731 + }, + { + "epoch": 2.9635627530364372, + "grad_norm": 1.5385071245811799, + "learning_rate": 8.875258306454814e-06, + "loss": 1.7823, + "step": 732 + }, + { + "epoch": 2.967611336032389, + "grad_norm": 1.8013008659956586, + "learning_rate": 8.87078937802381e-06, + "loss": 2.0096, + "step": 733 + }, + { + "epoch": 2.97165991902834, + "grad_norm": 2.38933092267862, + "learning_rate": 8.866312718866669e-06, + "loss": 1.9226, + "step": 734 + }, + { + "epoch": 2.9757085020242915, + "grad_norm": 1.5349029688081202, + "learning_rate": 8.861828337924164e-06, + "loss": 1.7634, + "step": 735 + }, + { + "epoch": 2.979757085020243, + "grad_norm": 1.7807993217999074, + "learning_rate": 8.85733624415248e-06, + "loss": 1.862, + "step": 736 + }, + { + "epoch": 2.983805668016194, + "grad_norm": 1.6270967039867585, + "learning_rate": 8.852836446523213e-06, + "loss": 1.9281, + "step": 737 + }, + { + "epoch": 2.9878542510121457, + "grad_norm": 1.8692589473995715, + "learning_rate": 8.848328954023342e-06, + "loss": 1.7317, + "step": 738 + }, + { + "epoch": 2.9919028340080973, + "grad_norm": 1.5874083562158485, + "learning_rate": 8.843813775655211e-06, + "loss": 1.6635, + "step": 739 + }, + { + "epoch": 2.9959514170040484, + "grad_norm": 1.3707872942838146, + "learning_rate": 8.83929092043652e-06, + "loss": 1.9759, + "step": 740 + }, + { + "epoch": 3.0, + "grad_norm": 1.7529361765269527, + "learning_rate": 8.8347603974003e-06, + "loss": 1.7407, + "step": 741 + }, + { + "epoch": 3.0040485829959516, + "grad_norm": 1.4847998012230224, + "learning_rate": 8.83022221559489e-06, + "loss": 1.8183, + "step": 742 + }, + { + "epoch": 3.0080971659919027, + "grad_norm": 2.0727143325799453, + "learning_rate": 8.825676384083936e-06, + "loss": 1.9566, + "step": 743 + }, + { + "epoch": 3.0121457489878543, + "grad_norm": 2.1863226369459072, + "learning_rate": 8.82112291194635e-06, + "loss": 1.8211, + "step": 744 + }, + { + "epoch": 3.016194331983806, + "grad_norm": 2.194214751548881, + "learning_rate": 8.816561808276312e-06, + "loss": 1.9756, + "step": 745 + }, + { + "epoch": 3.020242914979757, + "grad_norm": 1.8746800584359844, + "learning_rate": 8.811993082183243e-06, + "loss": 2.2277, + "step": 746 + }, + { + "epoch": 3.0242914979757085, + "grad_norm": 2.0032700627210636, + "learning_rate": 8.807416742791784e-06, + "loss": 2.0822, + "step": 747 + }, + { + "epoch": 3.02834008097166, + "grad_norm": 1.6874624326476195, + "learning_rate": 8.80283279924178e-06, + "loss": 1.7544, + "step": 748 + }, + { + "epoch": 
3.032388663967611, + "grad_norm": 1.981414959416955, + "learning_rate": 8.798241260688273e-06, + "loss": 1.7612, + "step": 749 + }, + { + "epoch": 3.0364372469635628, + "grad_norm": 1.85228853236934, + "learning_rate": 8.793642136301462e-06, + "loss": 2.0061, + "step": 750 + }, + { + "epoch": 3.0404858299595143, + "grad_norm": 1.839202167316395, + "learning_rate": 8.7890354352667e-06, + "loss": 1.8078, + "step": 751 + }, + { + "epoch": 3.0445344129554655, + "grad_norm": 1.664692242856933, + "learning_rate": 8.784421166784476e-06, + "loss": 1.7918, + "step": 752 + }, + { + "epoch": 3.048582995951417, + "grad_norm": 1.8125016947634567, + "learning_rate": 8.779799340070388e-06, + "loss": 1.7574, + "step": 753 + }, + { + "epoch": 3.0526315789473686, + "grad_norm": 1.922401307664431, + "learning_rate": 8.775169964355134e-06, + "loss": 1.8982, + "step": 754 + }, + { + "epoch": 3.0566801619433197, + "grad_norm": 1.893673085388173, + "learning_rate": 8.770533048884483e-06, + "loss": 1.7375, + "step": 755 + }, + { + "epoch": 3.0607287449392713, + "grad_norm": 1.7578051605078406, + "learning_rate": 8.765888602919266e-06, + "loss": 1.9075, + "step": 756 + }, + { + "epoch": 3.064777327935223, + "grad_norm": 1.8959640677324443, + "learning_rate": 8.761236635735353e-06, + "loss": 1.8378, + "step": 757 + }, + { + "epoch": 3.068825910931174, + "grad_norm": 1.9801599495189568, + "learning_rate": 8.756577156623636e-06, + "loss": 1.9702, + "step": 758 + }, + { + "epoch": 3.0728744939271255, + "grad_norm": 1.790845579793568, + "learning_rate": 8.751910174890009e-06, + "loss": 1.8932, + "step": 759 + }, + { + "epoch": 3.076923076923077, + "grad_norm": 1.8236903737287826, + "learning_rate": 8.74723569985535e-06, + "loss": 1.8215, + "step": 760 + }, + { + "epoch": 3.080971659919028, + "grad_norm": 1.7121510890543619, + "learning_rate": 8.742553740855507e-06, + "loss": 1.8237, + "step": 761 + }, + { + "epoch": 3.08502024291498, + "grad_norm": 1.6455567766467654, + "learning_rate": 8.737864307241266e-06, + "loss": 1.825, + "step": 762 + }, + { + "epoch": 3.0890688259109313, + "grad_norm": 2.004800789953328, + "learning_rate": 8.733167408378348e-06, + "loss": 1.83, + "step": 763 + }, + { + "epoch": 3.0931174089068825, + "grad_norm": 1.761656112643498, + "learning_rate": 8.728463053647382e-06, + "loss": 1.9209, + "step": 764 + }, + { + "epoch": 3.097165991902834, + "grad_norm": 1.7248736206433866, + "learning_rate": 8.723751252443891e-06, + "loss": 1.6591, + "step": 765 + }, + { + "epoch": 3.1012145748987856, + "grad_norm": 1.8246435273625035, + "learning_rate": 8.71903201417826e-06, + "loss": 1.8214, + "step": 766 + }, + { + "epoch": 3.1052631578947367, + "grad_norm": 1.8468962560997435, + "learning_rate": 8.71430534827574e-06, + "loss": 1.854, + "step": 767 + }, + { + "epoch": 3.1093117408906883, + "grad_norm": 1.9312402322655278, + "learning_rate": 8.709571264176408e-06, + "loss": 1.7321, + "step": 768 + }, + { + "epoch": 3.11336032388664, + "grad_norm": 2.316632605973664, + "learning_rate": 8.70482977133516e-06, + "loss": 1.6709, + "step": 769 + }, + { + "epoch": 3.117408906882591, + "grad_norm": 1.9879535887114659, + "learning_rate": 8.700080879221689e-06, + "loss": 1.6082, + "step": 770 + }, + { + "epoch": 3.1214574898785425, + "grad_norm": 1.8223147298487212, + "learning_rate": 8.69532459732046e-06, + "loss": 1.6324, + "step": 771 + }, + { + "epoch": 3.125506072874494, + "grad_norm": 1.9254678274105181, + "learning_rate": 8.690560935130708e-06, + "loss": 1.626, + "step": 772 + }, + { + "epoch": 
3.1295546558704452, + "grad_norm": 2.1237007524174683, + "learning_rate": 8.685789902166395e-06, + "loss": 1.5525, + "step": 773 + }, + { + "epoch": 3.133603238866397, + "grad_norm": 1.7727476948432017, + "learning_rate": 8.681011507956215e-06, + "loss": 1.8873, + "step": 774 + }, + { + "epoch": 3.1376518218623484, + "grad_norm": 2.049295618159139, + "learning_rate": 8.676225762043555e-06, + "loss": 1.7496, + "step": 775 + }, + { + "epoch": 3.1417004048582995, + "grad_norm": 1.5682714669220028, + "learning_rate": 8.671432673986493e-06, + "loss": 1.4753, + "step": 776 + }, + { + "epoch": 3.145748987854251, + "grad_norm": 1.8938048440408406, + "learning_rate": 8.666632253357767e-06, + "loss": 1.8963, + "step": 777 + }, + { + "epoch": 3.1497975708502026, + "grad_norm": 1.8936062118104038, + "learning_rate": 8.661824509744754e-06, + "loss": 1.7098, + "step": 778 + }, + { + "epoch": 3.1538461538461537, + "grad_norm": 1.6774875162585348, + "learning_rate": 8.657009452749466e-06, + "loss": 1.8596, + "step": 779 + }, + { + "epoch": 3.1578947368421053, + "grad_norm": 2.015957389549595, + "learning_rate": 8.652187091988516e-06, + "loss": 2.061, + "step": 780 + }, + { + "epoch": 3.161943319838057, + "grad_norm": 1.7186637319125118, + "learning_rate": 8.647357437093104e-06, + "loss": 1.7589, + "step": 781 + }, + { + "epoch": 3.165991902834008, + "grad_norm": 1.7941883707597104, + "learning_rate": 8.642520497709001e-06, + "loss": 1.8086, + "step": 782 + }, + { + "epoch": 3.1700404858299596, + "grad_norm": 1.774631391234699, + "learning_rate": 8.637676283496521e-06, + "loss": 2.2517, + "step": 783 + }, + { + "epoch": 3.174089068825911, + "grad_norm": 1.7904179919335834, + "learning_rate": 8.632824804130514e-06, + "loss": 1.6679, + "step": 784 + }, + { + "epoch": 3.1781376518218623, + "grad_norm": 1.972746622761643, + "learning_rate": 8.627966069300332e-06, + "loss": 1.8345, + "step": 785 + }, + { + "epoch": 3.182186234817814, + "grad_norm": 1.5336336477310177, + "learning_rate": 8.623100088709829e-06, + "loss": 1.6473, + "step": 786 + }, + { + "epoch": 3.1862348178137654, + "grad_norm": 1.9951657707171577, + "learning_rate": 8.618226872077315e-06, + "loss": 1.7821, + "step": 787 + }, + { + "epoch": 3.1902834008097165, + "grad_norm": 1.7282375741642677, + "learning_rate": 8.613346429135567e-06, + "loss": 1.8289, + "step": 788 + }, + { + "epoch": 3.194331983805668, + "grad_norm": 2.1277631117336675, + "learning_rate": 8.608458769631785e-06, + "loss": 2.0076, + "step": 789 + }, + { + "epoch": 3.1983805668016196, + "grad_norm": 1.8372643674137712, + "learning_rate": 8.603563903327582e-06, + "loss": 2.0805, + "step": 790 + }, + { + "epoch": 3.2024291497975708, + "grad_norm": 1.8065321863693007, + "learning_rate": 8.598661839998972e-06, + "loss": 1.7669, + "step": 791 + }, + { + "epoch": 3.2064777327935223, + "grad_norm": 2.031336948957746, + "learning_rate": 8.593752589436334e-06, + "loss": 2.0858, + "step": 792 + }, + { + "epoch": 3.2105263157894735, + "grad_norm": 1.8889862063112353, + "learning_rate": 8.588836161444405e-06, + "loss": 2.1341, + "step": 793 + }, + { + "epoch": 3.214574898785425, + "grad_norm": 1.8426615628835388, + "learning_rate": 8.583912565842258e-06, + "loss": 1.9304, + "step": 794 + }, + { + "epoch": 3.2186234817813766, + "grad_norm": 1.7414893453963287, + "learning_rate": 8.578981812463278e-06, + "loss": 1.7942, + "step": 795 + }, + { + "epoch": 3.2226720647773277, + "grad_norm": 1.9096193735192637, + "learning_rate": 8.574043911155148e-06, + "loss": 1.72, + "step": 796 + }, + { + 
"epoch": 3.2267206477732793, + "grad_norm": 1.8025258377815987, + "learning_rate": 8.569098871779828e-06, + "loss": 1.8542, + "step": 797 + }, + { + "epoch": 3.230769230769231, + "grad_norm": 1.8460762696682704, + "learning_rate": 8.56414670421353e-06, + "loss": 1.7101, + "step": 798 + }, + { + "epoch": 3.234817813765182, + "grad_norm": 1.9398991434247146, + "learning_rate": 8.559187418346703e-06, + "loss": 1.95, + "step": 799 + }, + { + "epoch": 3.2388663967611335, + "grad_norm": 1.8632306612622278, + "learning_rate": 8.554221024084019e-06, + "loss": 1.8895, + "step": 800 + }, + { + "epoch": 3.242914979757085, + "grad_norm": 1.893700967064052, + "learning_rate": 8.54924753134434e-06, + "loss": 1.873, + "step": 801 + }, + { + "epoch": 3.246963562753036, + "grad_norm": 1.7151529599583697, + "learning_rate": 8.544266950060706e-06, + "loss": 1.7236, + "step": 802 + }, + { + "epoch": 3.251012145748988, + "grad_norm": 1.7251248112215953, + "learning_rate": 8.539279290180315e-06, + "loss": 1.7693, + "step": 803 + }, + { + "epoch": 3.2550607287449393, + "grad_norm": 1.9817743209184147, + "learning_rate": 8.534284561664508e-06, + "loss": 1.8365, + "step": 804 + }, + { + "epoch": 3.2591093117408905, + "grad_norm": 1.8362666024929137, + "learning_rate": 8.529282774488731e-06, + "loss": 1.6791, + "step": 805 + }, + { + "epoch": 3.263157894736842, + "grad_norm": 1.9144972025615734, + "learning_rate": 8.524273938642539e-06, + "loss": 1.5622, + "step": 806 + }, + { + "epoch": 3.2672064777327936, + "grad_norm": 1.8150113569889472, + "learning_rate": 8.519258064129559e-06, + "loss": 1.8107, + "step": 807 + }, + { + "epoch": 3.2712550607287447, + "grad_norm": 1.8132774105922835, + "learning_rate": 8.514235160967476e-06, + "loss": 1.8382, + "step": 808 + }, + { + "epoch": 3.2753036437246963, + "grad_norm": 1.7178012200999808, + "learning_rate": 8.509205239188017e-06, + "loss": 1.8519, + "step": 809 + }, + { + "epoch": 3.279352226720648, + "grad_norm": 2.2519702448886845, + "learning_rate": 8.504168308836918e-06, + "loss": 1.8559, + "step": 810 + }, + { + "epoch": 3.283400809716599, + "grad_norm": 2.1015013370666513, + "learning_rate": 8.499124379973922e-06, + "loss": 1.5602, + "step": 811 + }, + { + "epoch": 3.2874493927125505, + "grad_norm": 2.1456515647605365, + "learning_rate": 8.494073462672743e-06, + "loss": 1.6597, + "step": 812 + }, + { + "epoch": 3.291497975708502, + "grad_norm": 2.1425091129883613, + "learning_rate": 8.489015567021054e-06, + "loss": 1.5311, + "step": 813 + }, + { + "epoch": 3.2955465587044532, + "grad_norm": 2.1055979919937693, + "learning_rate": 8.483950703120466e-06, + "loss": 1.8547, + "step": 814 + }, + { + "epoch": 3.299595141700405, + "grad_norm": 1.9678625432719996, + "learning_rate": 8.478878881086505e-06, + "loss": 1.9357, + "step": 815 + }, + { + "epoch": 3.3036437246963564, + "grad_norm": 2.0317817207691538, + "learning_rate": 8.473800111048598e-06, + "loss": 1.6684, + "step": 816 + }, + { + "epoch": 3.3076923076923075, + "grad_norm": 2.0379814335298843, + "learning_rate": 8.468714403150043e-06, + "loss": 1.6929, + "step": 817 + }, + { + "epoch": 3.311740890688259, + "grad_norm": 1.9848650286398888, + "learning_rate": 8.463621767547998e-06, + "loss": 1.7112, + "step": 818 + }, + { + "epoch": 3.3157894736842106, + "grad_norm": 2.274800378339576, + "learning_rate": 8.458522214413455e-06, + "loss": 1.7005, + "step": 819 + }, + { + "epoch": 3.3198380566801617, + "grad_norm": 2.170751690325617, + "learning_rate": 8.453415753931223e-06, + "loss": 1.5995, + "step": 820 + }, + { 
+ "epoch": 3.3238866396761133, + "grad_norm": 1.9913626012571344, + "learning_rate": 8.448302396299906e-06, + "loss": 1.6057, + "step": 821 + }, + { + "epoch": 3.327935222672065, + "grad_norm": 1.9395230430651595, + "learning_rate": 8.443182151731883e-06, + "loss": 1.6349, + "step": 822 + }, + { + "epoch": 3.331983805668016, + "grad_norm": 1.9091197381555691, + "learning_rate": 8.438055030453287e-06, + "loss": 1.5595, + "step": 823 + }, + { + "epoch": 3.3360323886639676, + "grad_norm": 1.8562911407114664, + "learning_rate": 8.432921042703985e-06, + "loss": 1.6019, + "step": 824 + }, + { + "epoch": 3.340080971659919, + "grad_norm": 1.7832079833064884, + "learning_rate": 8.42778019873756e-06, + "loss": 1.552, + "step": 825 + }, + { + "epoch": 3.3441295546558703, + "grad_norm": 1.8542638409385725, + "learning_rate": 8.422632508821284e-06, + "loss": 1.5851, + "step": 826 + }, + { + "epoch": 3.348178137651822, + "grad_norm": 2.1436195397021436, + "learning_rate": 8.417477983236107e-06, + "loss": 1.7666, + "step": 827 + }, + { + "epoch": 3.3522267206477734, + "grad_norm": 2.33071372223659, + "learning_rate": 8.412316632276627e-06, + "loss": 1.6497, + "step": 828 + }, + { + "epoch": 3.3562753036437245, + "grad_norm": 2.205436986044382, + "learning_rate": 8.407148466251072e-06, + "loss": 1.3523, + "step": 829 + }, + { + "epoch": 3.360323886639676, + "grad_norm": 2.2620315487409877, + "learning_rate": 8.401973495481289e-06, + "loss": 1.723, + "step": 830 + }, + { + "epoch": 3.3643724696356276, + "grad_norm": 2.180101120238927, + "learning_rate": 8.396791730302708e-06, + "loss": 1.8056, + "step": 831 + }, + { + "epoch": 3.3684210526315788, + "grad_norm": 1.990085418505961, + "learning_rate": 8.39160318106433e-06, + "loss": 1.7166, + "step": 832 + }, + { + "epoch": 3.3724696356275303, + "grad_norm": 2.40657553356096, + "learning_rate": 8.386407858128707e-06, + "loss": 1.8193, + "step": 833 + }, + { + "epoch": 3.376518218623482, + "grad_norm": 1.94489059367538, + "learning_rate": 8.381205771871918e-06, + "loss": 1.4172, + "step": 834 + }, + { + "epoch": 3.380566801619433, + "grad_norm": 2.150391672244522, + "learning_rate": 8.375996932683553e-06, + "loss": 1.5949, + "step": 835 + }, + { + "epoch": 3.3846153846153846, + "grad_norm": 2.0030590669894903, + "learning_rate": 8.370781350966683e-06, + "loss": 1.4156, + "step": 836 + }, + { + "epoch": 3.388663967611336, + "grad_norm": 2.197019034882382, + "learning_rate": 8.36555903713785e-06, + "loss": 1.4714, + "step": 837 + }, + { + "epoch": 3.3927125506072873, + "grad_norm": 2.078166195454461, + "learning_rate": 8.360330001627043e-06, + "loss": 1.6429, + "step": 838 + }, + { + "epoch": 3.396761133603239, + "grad_norm": 2.40629641977567, + "learning_rate": 8.355094254877665e-06, + "loss": 1.4713, + "step": 839 + }, + { + "epoch": 3.4008097165991904, + "grad_norm": 1.9645801904393803, + "learning_rate": 8.349851807346535e-06, + "loss": 1.5146, + "step": 840 + }, + { + "epoch": 3.4048582995951415, + "grad_norm": 1.9534289124567972, + "learning_rate": 8.344602669503849e-06, + "loss": 1.5871, + "step": 841 + }, + { + "epoch": 3.408906882591093, + "grad_norm": 2.3102884897188534, + "learning_rate": 8.339346851833163e-06, + "loss": 1.6862, + "step": 842 + }, + { + "epoch": 3.4129554655870447, + "grad_norm": 2.0401234182707406, + "learning_rate": 8.334084364831381e-06, + "loss": 1.5214, + "step": 843 + }, + { + "epoch": 3.417004048582996, + "grad_norm": 2.159768925630674, + "learning_rate": 8.328815219008719e-06, + "loss": 1.8219, + "step": 844 + }, + { + 
"epoch": 3.4210526315789473, + "grad_norm": 2.2204972461802757, + "learning_rate": 8.323539424888695e-06, + "loss": 1.8941, + "step": 845 + }, + { + "epoch": 3.425101214574899, + "grad_norm": 1.9873340221710971, + "learning_rate": 8.318256993008108e-06, + "loss": 1.7539, + "step": 846 + }, + { + "epoch": 3.42914979757085, + "grad_norm": 1.975202455896719, + "learning_rate": 8.31296793391701e-06, + "loss": 1.8598, + "step": 847 + }, + { + "epoch": 3.4331983805668016, + "grad_norm": 1.8415081642607933, + "learning_rate": 8.30767225817869e-06, + "loss": 1.9574, + "step": 848 + }, + { + "epoch": 3.437246963562753, + "grad_norm": 2.047274050267817, + "learning_rate": 8.302369976369651e-06, + "loss": 1.736, + "step": 849 + }, + { + "epoch": 3.4412955465587043, + "grad_norm": 2.1457366433830454, + "learning_rate": 8.297061099079592e-06, + "loss": 1.6581, + "step": 850 + }, + { + "epoch": 3.445344129554656, + "grad_norm": 1.8891113266245207, + "learning_rate": 8.291745636911382e-06, + "loss": 1.9183, + "step": 851 + }, + { + "epoch": 3.4493927125506074, + "grad_norm": 2.05347009046486, + "learning_rate": 8.286423600481044e-06, + "loss": 1.6869, + "step": 852 + }, + { + "epoch": 3.4534412955465585, + "grad_norm": 2.1578470259791795, + "learning_rate": 8.281095000417725e-06, + "loss": 1.6709, + "step": 853 + }, + { + "epoch": 3.45748987854251, + "grad_norm": 2.2158190833608606, + "learning_rate": 8.27575984736369e-06, + "loss": 2.079, + "step": 854 + }, + { + "epoch": 3.4615384615384617, + "grad_norm": 2.9226191862145265, + "learning_rate": 8.270418151974286e-06, + "loss": 2.3146, + "step": 855 + }, + { + "epoch": 3.465587044534413, + "grad_norm": 2.1657050143675205, + "learning_rate": 8.265069924917925e-06, + "loss": 1.9175, + "step": 856 + }, + { + "epoch": 3.4696356275303644, + "grad_norm": 1.7932680376129573, + "learning_rate": 8.259715176876069e-06, + "loss": 1.8725, + "step": 857 + }, + { + "epoch": 3.473684210526316, + "grad_norm": 1.8709685644083165, + "learning_rate": 8.254353918543199e-06, + "loss": 1.7809, + "step": 858 + }, + { + "epoch": 3.477732793522267, + "grad_norm": 2.4167400582718694, + "learning_rate": 8.2489861606268e-06, + "loss": 1.8016, + "step": 859 + }, + { + "epoch": 3.4817813765182186, + "grad_norm": 1.659768741074137, + "learning_rate": 8.243611913847337e-06, + "loss": 1.7188, + "step": 860 + }, + { + "epoch": 3.48582995951417, + "grad_norm": 2.1480568234600668, + "learning_rate": 8.238231188938237e-06, + "loss": 1.6913, + "step": 861 + }, + { + "epoch": 3.4898785425101213, + "grad_norm": 2.461283879827119, + "learning_rate": 8.232843996645865e-06, + "loss": 1.6242, + "step": 862 + }, + { + "epoch": 3.493927125506073, + "grad_norm": 2.3643514071925056, + "learning_rate": 8.2274503477295e-06, + "loss": 1.6881, + "step": 863 + }, + { + "epoch": 3.4979757085020244, + "grad_norm": 3.087293785042021, + "learning_rate": 8.222050252961318e-06, + "loss": 1.5087, + "step": 864 + }, + { + "epoch": 3.5020242914979756, + "grad_norm": 2.105684160210004, + "learning_rate": 8.216643723126367e-06, + "loss": 1.4331, + "step": 865 + }, + { + "epoch": 3.506072874493927, + "grad_norm": 2.420952436641065, + "learning_rate": 8.211230769022552e-06, + "loss": 1.7553, + "step": 866 + }, + { + "epoch": 3.5101214574898787, + "grad_norm": 2.2746665377354116, + "learning_rate": 8.2058114014606e-06, + "loss": 1.782, + "step": 867 + }, + { + "epoch": 3.51417004048583, + "grad_norm": 1.6776374980476494, + "learning_rate": 8.200385631264051e-06, + "loss": 1.7357, + "step": 868 + }, + { + "epoch": 
3.5182186234817814, + "grad_norm": 2.130957958265717, + "learning_rate": 8.19495346926924e-06, + "loss": 1.7569, + "step": 869 + }, + { + "epoch": 3.522267206477733, + "grad_norm": 2.1241420175580386, + "learning_rate": 8.189514926325255e-06, + "loss": 1.7036, + "step": 870 + }, + { + "epoch": 3.526315789473684, + "grad_norm": 2.397883462392177, + "learning_rate": 8.184070013293936e-06, + "loss": 1.4984, + "step": 871 + }, + { + "epoch": 3.5303643724696356, + "grad_norm": 2.676554915114245, + "learning_rate": 8.178618741049841e-06, + "loss": 1.5719, + "step": 872 + }, + { + "epoch": 3.534412955465587, + "grad_norm": 2.641036334787177, + "learning_rate": 8.173161120480232e-06, + "loss": 1.7235, + "step": 873 + }, + { + "epoch": 3.5384615384615383, + "grad_norm": 2.4283908813712127, + "learning_rate": 8.16769716248505e-06, + "loss": 1.8976, + "step": 874 + }, + { + "epoch": 3.54251012145749, + "grad_norm": 1.9109389793413394, + "learning_rate": 8.162226877976886e-06, + "loss": 1.797, + "step": 875 + }, + { + "epoch": 3.5465587044534415, + "grad_norm": 3.1765952449893073, + "learning_rate": 8.156750277880979e-06, + "loss": 2.2212, + "step": 876 + }, + { + "epoch": 3.5506072874493926, + "grad_norm": 6.740978753214387, + "learning_rate": 8.15126737313517e-06, + "loss": 2.2759, + "step": 877 + }, + { + "epoch": 3.554655870445344, + "grad_norm": 6.646199027432937, + "learning_rate": 8.145778174689897e-06, + "loss": 2.5045, + "step": 878 + }, + { + "epoch": 3.5587044534412957, + "grad_norm": 1.9732928727215509, + "learning_rate": 8.140282693508168e-06, + "loss": 1.702, + "step": 879 + }, + { + "epoch": 3.562753036437247, + "grad_norm": 1.923113895215325, + "learning_rate": 8.134780940565535e-06, + "loss": 1.5859, + "step": 880 + }, + { + "epoch": 3.5668016194331984, + "grad_norm": 1.888490124882663, + "learning_rate": 8.129272926850079e-06, + "loss": 1.9019, + "step": 881 + }, + { + "epoch": 3.57085020242915, + "grad_norm": 2.0879599313529247, + "learning_rate": 8.123758663362386e-06, + "loss": 1.5424, + "step": 882 + }, + { + "epoch": 3.574898785425101, + "grad_norm": 2.1113301524020778, + "learning_rate": 8.118238161115523e-06, + "loss": 1.8581, + "step": 883 + }, + { + "epoch": 3.5789473684210527, + "grad_norm": 1.7105898329062328, + "learning_rate": 8.112711431135014e-06, + "loss": 1.5914, + "step": 884 + }, + { + "epoch": 3.582995951417004, + "grad_norm": 1.9358089378047225, + "learning_rate": 8.107178484458825e-06, + "loss": 1.7957, + "step": 885 + }, + { + "epoch": 3.5870445344129553, + "grad_norm": 1.9092777164097747, + "learning_rate": 8.101639332137337e-06, + "loss": 1.7404, + "step": 886 + }, + { + "epoch": 3.591093117408907, + "grad_norm": 2.098080272876577, + "learning_rate": 8.096093985233323e-06, + "loss": 1.7127, + "step": 887 + }, + { + "epoch": 3.5951417004048585, + "grad_norm": 2.4907144738421065, + "learning_rate": 8.090542454821929e-06, + "loss": 1.4308, + "step": 888 + }, + { + "epoch": 3.5991902834008096, + "grad_norm": 1.8678109793168913, + "learning_rate": 8.084984751990652e-06, + "loss": 1.4797, + "step": 889 + }, + { + "epoch": 3.603238866396761, + "grad_norm": 1.8961480105884363, + "learning_rate": 8.079420887839316e-06, + "loss": 1.6173, + "step": 890 + }, + { + "epoch": 3.6072874493927127, + "grad_norm": 1.9539785870788862, + "learning_rate": 8.073850873480047e-06, + "loss": 1.4952, + "step": 891 + }, + { + "epoch": 3.611336032388664, + "grad_norm": 2.31450202449626, + "learning_rate": 8.068274720037261e-06, + "loss": 1.813, + "step": 892 + }, + { + "epoch": 
3.6153846153846154, + "grad_norm": 1.8087093273790038, + "learning_rate": 8.062692438647628e-06, + "loss": 1.7376, + "step": 893 + }, + { + "epoch": 3.619433198380567, + "grad_norm": 2.408589476299181, + "learning_rate": 8.057104040460062e-06, + "loss": 1.505, + "step": 894 + }, + { + "epoch": 3.623481781376518, + "grad_norm": 2.3231639351842035, + "learning_rate": 8.051509536635686e-06, + "loss": 1.9039, + "step": 895 + }, + { + "epoch": 3.6275303643724697, + "grad_norm": 1.9849491847712974, + "learning_rate": 8.045908938347828e-06, + "loss": 1.7125, + "step": 896 + }, + { + "epoch": 3.6315789473684212, + "grad_norm": 2.2249483352664026, + "learning_rate": 8.04030225678198e-06, + "loss": 1.9514, + "step": 897 + }, + { + "epoch": 3.6356275303643724, + "grad_norm": 2.005047614111562, + "learning_rate": 8.034689503135785e-06, + "loss": 1.597, + "step": 898 + }, + { + "epoch": 3.639676113360324, + "grad_norm": 2.2925145752574854, + "learning_rate": 8.029070688619013e-06, + "loss": 1.8072, + "step": 899 + }, + { + "epoch": 3.6437246963562755, + "grad_norm": 1.9475842419850795, + "learning_rate": 8.023445824453539e-06, + "loss": 1.7289, + "step": 900 + }, + { + "epoch": 3.6477732793522266, + "grad_norm": 2.071154449190338, + "learning_rate": 8.017814921873326e-06, + "loss": 1.7658, + "step": 901 + }, + { + "epoch": 3.651821862348178, + "grad_norm": 1.9935193669759015, + "learning_rate": 8.012177992124385e-06, + "loss": 1.6002, + "step": 902 + }, + { + "epoch": 3.6558704453441297, + "grad_norm": 2.2483209235168737, + "learning_rate": 8.006535046464774e-06, + "loss": 1.8275, + "step": 903 + }, + { + "epoch": 3.659919028340081, + "grad_norm": 2.5274264683222425, + "learning_rate": 8.000886096164564e-06, + "loss": 1.6502, + "step": 904 + }, + { + "epoch": 3.6639676113360324, + "grad_norm": 2.0119741262052195, + "learning_rate": 7.995231152505815e-06, + "loss": 1.8017, + "step": 905 + }, + { + "epoch": 3.668016194331984, + "grad_norm": 2.1027093845450233, + "learning_rate": 7.989570226782562e-06, + "loss": 1.8138, + "step": 906 + }, + { + "epoch": 3.672064777327935, + "grad_norm": 3.056649771146675, + "learning_rate": 7.983903330300782e-06, + "loss": 1.8128, + "step": 907 + }, + { + "epoch": 3.6761133603238867, + "grad_norm": 1.9139807090551522, + "learning_rate": 7.978230474378383e-06, + "loss": 1.7148, + "step": 908 + }, + { + "epoch": 3.6801619433198383, + "grad_norm": 2.416490627923619, + "learning_rate": 7.97255167034517e-06, + "loss": 1.7726, + "step": 909 + }, + { + "epoch": 3.6842105263157894, + "grad_norm": 2.053612332583323, + "learning_rate": 7.966866929542827e-06, + "loss": 1.5779, + "step": 910 + }, + { + "epoch": 3.688259109311741, + "grad_norm": 2.0666037215601505, + "learning_rate": 7.961176263324902e-06, + "loss": 1.7465, + "step": 911 + }, + { + "epoch": 3.6923076923076925, + "grad_norm": 2.1463137742100327, + "learning_rate": 7.955479683056767e-06, + "loss": 1.7608, + "step": 912 + }, + { + "epoch": 3.6963562753036436, + "grad_norm": 1.9232481327470194, + "learning_rate": 7.949777200115617e-06, + "loss": 1.5992, + "step": 913 + }, + { + "epoch": 3.700404858299595, + "grad_norm": 2.5029604743639515, + "learning_rate": 7.944068825890424e-06, + "loss": 2.089, + "step": 914 + }, + { + "epoch": 3.7044534412955468, + "grad_norm": 2.425403056999352, + "learning_rate": 7.938354571781933e-06, + "loss": 1.8514, + "step": 915 + }, + { + "epoch": 3.708502024291498, + "grad_norm": 2.2889869162476315, + "learning_rate": 7.932634449202635e-06, + "loss": 1.4493, + "step": 916 + }, + { + "epoch": 
3.7125506072874495, + "grad_norm": 2.0245599708625988, + "learning_rate": 7.92690846957673e-06, + "loss": 1.6351, + "step": 917 + }, + { + "epoch": 3.716599190283401, + "grad_norm": 1.997997696536965, + "learning_rate": 7.921176644340132e-06, + "loss": 1.7253, + "step": 918 + }, + { + "epoch": 3.720647773279352, + "grad_norm": 2.344635708570945, + "learning_rate": 7.915438984940415e-06, + "loss": 1.5384, + "step": 919 + }, + { + "epoch": 3.7246963562753037, + "grad_norm": 2.399788568220564, + "learning_rate": 7.909695502836814e-06, + "loss": 1.6518, + "step": 920 + }, + { + "epoch": 3.7287449392712553, + "grad_norm": 2.258204100694036, + "learning_rate": 7.903946209500189e-06, + "loss": 1.8741, + "step": 921 + }, + { + "epoch": 3.7327935222672064, + "grad_norm": 1.9355255173187593, + "learning_rate": 7.898191116413007e-06, + "loss": 1.6996, + "step": 922 + }, + { + "epoch": 3.736842105263158, + "grad_norm": 2.1474241115417425, + "learning_rate": 7.892430235069317e-06, + "loss": 1.7427, + "step": 923 + }, + { + "epoch": 3.7408906882591095, + "grad_norm": 3.071687208613463, + "learning_rate": 7.886663576974733e-06, + "loss": 2.4106, + "step": 924 + }, + { + "epoch": 3.7449392712550607, + "grad_norm": 2.0799708188253465, + "learning_rate": 7.880891153646401e-06, + "loss": 1.808, + "step": 925 + }, + { + "epoch": 3.748987854251012, + "grad_norm": 2.4353787137639453, + "learning_rate": 7.875112976612984e-06, + "loss": 1.6368, + "step": 926 + }, + { + "epoch": 3.753036437246964, + "grad_norm": 2.159792334487355, + "learning_rate": 7.869329057414635e-06, + "loss": 1.5175, + "step": 927 + }, + { + "epoch": 3.757085020242915, + "grad_norm": 2.0548605804443274, + "learning_rate": 7.863539407602976e-06, + "loss": 1.7423, + "step": 928 + }, + { + "epoch": 3.7611336032388665, + "grad_norm": 3.9628857560933324, + "learning_rate": 7.857744038741076e-06, + "loss": 2.5332, + "step": 929 + }, + { + "epoch": 3.765182186234818, + "grad_norm": 4.514218437938051, + "learning_rate": 7.85194296240342e-06, + "loss": 2.3287, + "step": 930 + }, + { + "epoch": 3.769230769230769, + "grad_norm": 5.356074790215057, + "learning_rate": 7.846136190175901e-06, + "loss": 2.1714, + "step": 931 + }, + { + "epoch": 3.7732793522267207, + "grad_norm": 2.238703863406207, + "learning_rate": 7.84032373365578e-06, + "loss": 1.671, + "step": 932 + }, + { + "epoch": 3.7773279352226723, + "grad_norm": 2.194562792441507, + "learning_rate": 7.834505604451672e-06, + "loss": 1.9108, + "step": 933 + }, + { + "epoch": 3.7813765182186234, + "grad_norm": 2.085928113902739, + "learning_rate": 7.828681814183527e-06, + "loss": 1.9396, + "step": 934 + }, + { + "epoch": 3.785425101214575, + "grad_norm": 2.215253557008417, + "learning_rate": 7.822852374482597e-06, + "loss": 1.7587, + "step": 935 + }, + { + "epoch": 3.7894736842105265, + "grad_norm": 3.010107826761077, + "learning_rate": 7.817017296991411e-06, + "loss": 1.6507, + "step": 936 + }, + { + "epoch": 3.7935222672064777, + "grad_norm": 2.25886892537205, + "learning_rate": 7.811176593363771e-06, + "loss": 1.7372, + "step": 937 + }, + { + "epoch": 3.7975708502024292, + "grad_norm": 2.2130344020805297, + "learning_rate": 7.805330275264707e-06, + "loss": 1.7485, + "step": 938 + }, + { + "epoch": 3.801619433198381, + "grad_norm": 2.0367189537336907, + "learning_rate": 7.79947835437046e-06, + "loss": 1.5515, + "step": 939 + }, + { + "epoch": 3.805668016194332, + "grad_norm": 2.070856690389127, + "learning_rate": 7.79362084236847e-06, + "loss": 1.4447, + "step": 940 + }, + { + "epoch": 
3.8097165991902835, + "grad_norm": 2.1857926637124794, + "learning_rate": 7.787757750957335e-06, + "loss": 1.8015, + "step": 941 + }, + { + "epoch": 3.813765182186235, + "grad_norm": 2.6872149719652305, + "learning_rate": 7.781889091846799e-06, + "loss": 1.7528, + "step": 942 + }, + { + "epoch": 3.817813765182186, + "grad_norm": 2.3048135110635264, + "learning_rate": 7.776014876757727e-06, + "loss": 1.5226, + "step": 943 + }, + { + "epoch": 3.8218623481781377, + "grad_norm": 8.991127581731243, + "learning_rate": 7.77013511742208e-06, + "loss": 2.3966, + "step": 944 + }, + { + "epoch": 3.8259109311740893, + "grad_norm": 19.276037930316928, + "learning_rate": 7.76424982558289e-06, + "loss": 3.7738, + "step": 945 + }, + { + "epoch": 3.8299595141700404, + "grad_norm": 2.4583074183525677, + "learning_rate": 7.758359012994242e-06, + "loss": 1.6137, + "step": 946 + }, + { + "epoch": 3.834008097165992, + "grad_norm": 2.405931055156567, + "learning_rate": 7.752462691421245e-06, + "loss": 1.4666, + "step": 947 + }, + { + "epoch": 3.8380566801619436, + "grad_norm": 2.114379083785604, + "learning_rate": 7.746560872640007e-06, + "loss": 1.5791, + "step": 948 + }, + { + "epoch": 3.8421052631578947, + "grad_norm": 2.1946059502111845, + "learning_rate": 7.740653568437623e-06, + "loss": 1.5937, + "step": 949 + }, + { + "epoch": 3.8461538461538463, + "grad_norm": 2.3168344745949456, + "learning_rate": 7.734740790612137e-06, + "loss": 1.5169, + "step": 950 + }, + { + "epoch": 3.850202429149798, + "grad_norm": 2.3139829718351197, + "learning_rate": 7.728822550972523e-06, + "loss": 1.6162, + "step": 951 + }, + { + "epoch": 3.854251012145749, + "grad_norm": 2.5483408296020764, + "learning_rate": 7.722898861338674e-06, + "loss": 1.7001, + "step": 952 + }, + { + "epoch": 3.8582995951417005, + "grad_norm": 1.917540396918308, + "learning_rate": 7.716969733541357e-06, + "loss": 1.6257, + "step": 953 + }, + { + "epoch": 3.862348178137652, + "grad_norm": 2.4091479518780177, + "learning_rate": 7.711035179422205e-06, + "loss": 1.6058, + "step": 954 + }, + { + "epoch": 3.866396761133603, + "grad_norm": 2.4390857592479183, + "learning_rate": 7.705095210833687e-06, + "loss": 1.6468, + "step": 955 + }, + { + "epoch": 3.8704453441295548, + "grad_norm": 3.01025731676863, + "learning_rate": 7.699149839639086e-06, + "loss": 2.1392, + "step": 956 + }, + { + "epoch": 3.8744939271255063, + "grad_norm": 2.6957364897623473, + "learning_rate": 7.693199077712476e-06, + "loss": 2.0741, + "step": 957 + }, + { + "epoch": 3.8785425101214575, + "grad_norm": 2.6726767004932395, + "learning_rate": 7.687242936938694e-06, + "loss": 1.8205, + "step": 958 + }, + { + "epoch": 3.882591093117409, + "grad_norm": 2.3223231672079727, + "learning_rate": 7.681281429213328e-06, + "loss": 1.7239, + "step": 959 + }, + { + "epoch": 3.8866396761133606, + "grad_norm": 2.4223424195591505, + "learning_rate": 7.675314566442673e-06, + "loss": 1.2702, + "step": 960 + }, + { + "epoch": 3.8906882591093117, + "grad_norm": 2.1111739790928024, + "learning_rate": 7.669342360543727e-06, + "loss": 1.7654, + "step": 961 + }, + { + "epoch": 3.8947368421052633, + "grad_norm": 2.0865715931939968, + "learning_rate": 7.663364823444157e-06, + "loss": 1.8567, + "step": 962 + }, + { + "epoch": 3.898785425101215, + "grad_norm": 1.9521945713254736, + "learning_rate": 7.65738196708228e-06, + "loss": 1.5513, + "step": 963 + }, + { + "epoch": 3.902834008097166, + "grad_norm": 2.252893420029499, + "learning_rate": 7.651393803407032e-06, + "loss": 1.6101, + "step": 964 + }, + { + 
"epoch": 3.9068825910931175, + "grad_norm": 2.445627287506017, + "learning_rate": 7.645400344377953e-06, + "loss": 1.7802, + "step": 965 + }, + { + "epoch": 3.910931174089069, + "grad_norm": 2.206311718559999, + "learning_rate": 7.639401601965158e-06, + "loss": 1.3433, + "step": 966 + }, + { + "epoch": 3.91497975708502, + "grad_norm": 2.5126306064577935, + "learning_rate": 7.63339758814931e-06, + "loss": 1.4571, + "step": 967 + }, + { + "epoch": 3.919028340080972, + "grad_norm": 2.301201962037062, + "learning_rate": 7.627388314921602e-06, + "loss": 1.4798, + "step": 968 + }, + { + "epoch": 3.9230769230769234, + "grad_norm": 2.0505587515987265, + "learning_rate": 7.621373794283735e-06, + "loss": 1.7924, + "step": 969 + }, + { + "epoch": 3.9271255060728745, + "grad_norm": 2.716118255543476, + "learning_rate": 7.615354038247889e-06, + "loss": 1.6337, + "step": 970 + }, + { + "epoch": 3.931174089068826, + "grad_norm": 2.636209282969381, + "learning_rate": 7.609329058836694e-06, + "loss": 1.6699, + "step": 971 + }, + { + "epoch": 3.9352226720647776, + "grad_norm": 2.3802398786409107, + "learning_rate": 7.6032988680832195e-06, + "loss": 1.4692, + "step": 972 + }, + { + "epoch": 3.9392712550607287, + "grad_norm": 2.5735078826994844, + "learning_rate": 7.597263478030939e-06, + "loss": 1.3909, + "step": 973 + }, + { + "epoch": 3.9433198380566803, + "grad_norm": 2.986329351018389, + "learning_rate": 7.59122290073371e-06, + "loss": 1.6787, + "step": 974 + }, + { + "epoch": 3.9473684210526314, + "grad_norm": 2.4407323865905015, + "learning_rate": 7.5851771482557535e-06, + "loss": 1.4349, + "step": 975 + }, + { + "epoch": 3.951417004048583, + "grad_norm": 2.8570555841909657, + "learning_rate": 7.579126232671621e-06, + "loss": 1.4016, + "step": 976 + }, + { + "epoch": 3.9554655870445345, + "grad_norm": 3.322338952206099, + "learning_rate": 7.5730701660661795e-06, + "loss": 1.6104, + "step": 977 + }, + { + "epoch": 3.9595141700404857, + "grad_norm": 2.5182830088343082, + "learning_rate": 7.567008960534585e-06, + "loss": 1.6231, + "step": 978 + }, + { + "epoch": 3.9635627530364372, + "grad_norm": 2.1739951186703923, + "learning_rate": 7.560942628182251e-06, + "loss": 1.6679, + "step": 979 + }, + { + "epoch": 3.967611336032389, + "grad_norm": 2.5756124639646982, + "learning_rate": 7.554871181124836e-06, + "loss": 1.8633, + "step": 980 + }, + { + "epoch": 3.97165991902834, + "grad_norm": 3.073388081199716, + "learning_rate": 7.548794631488211e-06, + "loss": 1.768, + "step": 981 + }, + { + "epoch": 3.9757085020242915, + "grad_norm": 2.1012291254049797, + "learning_rate": 7.5427129914084385e-06, + "loss": 1.6442, + "step": 982 + }, + { + "epoch": 3.979757085020243, + "grad_norm": 2.351295674425286, + "learning_rate": 7.536626273031747e-06, + "loss": 1.7358, + "step": 983 + }, + { + "epoch": 3.983805668016194, + "grad_norm": 2.115853749649768, + "learning_rate": 7.530534488514507e-06, + "loss": 1.8024, + "step": 984 + }, + { + "epoch": 3.9878542510121457, + "grad_norm": 2.454948116388734, + "learning_rate": 7.524437650023211e-06, + "loss": 1.6063, + "step": 985 + }, + { + "epoch": 3.9919028340080973, + "grad_norm": 2.043008387794743, + "learning_rate": 7.5183357697344395e-06, + "loss": 1.5544, + "step": 986 + }, + { + "epoch": 3.9959514170040484, + "grad_norm": 1.8968397388893163, + "learning_rate": 7.512228859834845e-06, + "loss": 1.8733, + "step": 987 + }, + { + "epoch": 4.0, + "grad_norm": 2.2142162316932255, + "learning_rate": 7.506116932521127e-06, + "loss": 1.6136, + "step": 988 + }, + { + "epoch": 
4.004048582995951, + "grad_norm": 2.080064737878757, + "learning_rate": 7.500000000000001e-06, + "loss": 1.6735, + "step": 989 + }, + { + "epoch": 4.008097165991903, + "grad_norm": 2.8195577020771863, + "learning_rate": 7.493878074488184e-06, + "loss": 1.8144, + "step": 990 + }, + { + "epoch": 4.012145748987854, + "grad_norm": 2.861434123319288, + "learning_rate": 7.4877511682123635e-06, + "loss": 1.6734, + "step": 991 + }, + { + "epoch": 4.016194331983805, + "grad_norm": 3.0695960191225247, + "learning_rate": 7.481619293409173e-06, + "loss": 1.8495, + "step": 992 + }, + { + "epoch": 4.020242914979757, + "grad_norm": 2.580474309033628, + "learning_rate": 7.475482462325169e-06, + "loss": 2.099, + "step": 993 + }, + { + "epoch": 4.0242914979757085, + "grad_norm": 2.721243409721488, + "learning_rate": 7.469340687216809e-06, + "loss": 1.9446, + "step": 994 + }, + { + "epoch": 4.02834008097166, + "grad_norm": 2.3410049191202074, + "learning_rate": 7.4631939803504215e-06, + "loss": 1.6196, + "step": 995 + }, + { + "epoch": 4.032388663967612, + "grad_norm": 2.720885518023577, + "learning_rate": 7.4570423540021905e-06, + "loss": 1.6221, + "step": 996 + }, + { + "epoch": 4.036437246963563, + "grad_norm": 2.5413861683291996, + "learning_rate": 7.450885820458117e-06, + "loss": 1.8749, + "step": 997 + }, + { + "epoch": 4.040485829959514, + "grad_norm": 2.5863690862096957, + "learning_rate": 7.44472439201401e-06, + "loss": 1.6649, + "step": 998 + }, + { + "epoch": 4.044534412955466, + "grad_norm": 2.371552718771952, + "learning_rate": 7.438558080975449e-06, + "loss": 1.6799, + "step": 999 + }, + { + "epoch": 4.048582995951417, + "grad_norm": 2.5691951258164063, + "learning_rate": 7.4323868996577696e-06, + "loss": 1.63, + "step": 1000 + }, + { + "epoch": 4.052631578947368, + "grad_norm": 2.675468998968646, + "learning_rate": 7.426210860386032e-06, + "loss": 1.7354, + "step": 1001 + }, + { + "epoch": 4.05668016194332, + "grad_norm": 2.58607973493479, + "learning_rate": 7.420029975494996e-06, + "loss": 1.5703, + "step": 1002 + }, + { + "epoch": 4.060728744939271, + "grad_norm": 2.475852723612659, + "learning_rate": 7.413844257329104e-06, + "loss": 1.749, + "step": 1003 + }, + { + "epoch": 4.064777327935222, + "grad_norm": 2.625704853477589, + "learning_rate": 7.407653718242449e-06, + "loss": 1.6948, + "step": 1004 + }, + { + "epoch": 4.068825910931174, + "grad_norm": 2.7272435081151283, + "learning_rate": 7.401458370598753e-06, + "loss": 1.8281, + "step": 1005 + }, + { + "epoch": 4.0728744939271255, + "grad_norm": 2.507953052399452, + "learning_rate": 7.395258226771341e-06, + "loss": 1.7673, + "step": 1006 + }, + { + "epoch": 4.076923076923077, + "grad_norm": 2.5085283118904074, + "learning_rate": 7.3890532991431174e-06, + "loss": 1.6958, + "step": 1007 + }, + { + "epoch": 4.080971659919029, + "grad_norm": 2.388953051348741, + "learning_rate": 7.382843600106539e-06, + "loss": 1.7112, + "step": 1008 + }, + { + "epoch": 4.08502024291498, + "grad_norm": 2.2236808085380644, + "learning_rate": 7.376629142063597e-06, + "loss": 1.7162, + "step": 1009 + }, + { + "epoch": 4.089068825910931, + "grad_norm": 2.7412048035286505, + "learning_rate": 7.370409937425781e-06, + "loss": 1.7045, + "step": 1010 + }, + { + "epoch": 4.093117408906883, + "grad_norm": 2.3839251838504367, + "learning_rate": 7.364185998614064e-06, + "loss": 1.7854, + "step": 1011 + }, + { + "epoch": 4.097165991902834, + "grad_norm": 2.383572557144146, + "learning_rate": 7.357957338058873e-06, + "loss": 1.534, + "step": 1012 + }, + { + "epoch": 
4.101214574898785, + "grad_norm": 2.7483936941368996, + "learning_rate": 7.3517239682000675e-06, + "loss": 1.7001, + "step": 1013 + }, + { + "epoch": 4.105263157894737, + "grad_norm": 2.6910416116843257, + "learning_rate": 7.345485901486908e-06, + "loss": 1.7037, + "step": 1014 + }, + { + "epoch": 4.109311740890688, + "grad_norm": 2.677750230508956, + "learning_rate": 7.33924315037804e-06, + "loss": 1.6197, + "step": 1015 + }, + { + "epoch": 4.113360323886639, + "grad_norm": 3.1184294482443717, + "learning_rate": 7.332995727341462e-06, + "loss": 1.5587, + "step": 1016 + }, + { + "epoch": 4.117408906882591, + "grad_norm": 2.697817221643411, + "learning_rate": 7.326743644854504e-06, + "loss": 1.4804, + "step": 1017 + }, + { + "epoch": 4.1214574898785425, + "grad_norm": 2.5533427892436364, + "learning_rate": 7.3204869154038015e-06, + "loss": 1.5149, + "step": 1018 + }, + { + "epoch": 4.125506072874494, + "grad_norm": 2.7058477331519604, + "learning_rate": 7.314225551485273e-06, + "loss": 1.5156, + "step": 1019 + }, + { + "epoch": 4.129554655870446, + "grad_norm": 2.8633359493766384, + "learning_rate": 7.30795956560409e-06, + "loss": 1.4187, + "step": 1020 + }, + { + "epoch": 4.133603238866397, + "grad_norm": 2.346585899707522, + "learning_rate": 7.301688970274655e-06, + "loss": 1.7718, + "step": 1021 + }, + { + "epoch": 4.137651821862348, + "grad_norm": 2.8346595314782568, + "learning_rate": 7.295413778020579e-06, + "loss": 1.6181, + "step": 1022 + }, + { + "epoch": 4.1417004048583, + "grad_norm": 2.1328033209542046, + "learning_rate": 7.289134001374654e-06, + "loss": 1.3513, + "step": 1023 + }, + { + "epoch": 4.145748987854251, + "grad_norm": 2.723527413205223, + "learning_rate": 7.282849652878824e-06, + "loss": 1.7449, + "step": 1024 + }, + { + "epoch": 4.149797570850202, + "grad_norm": 2.6296530406635648, + "learning_rate": 7.276560745084167e-06, + "loss": 1.56, + "step": 1025 + }, + { + "epoch": 4.153846153846154, + "grad_norm": 2.3607444563571645, + "learning_rate": 7.2702672905508656e-06, + "loss": 1.7373, + "step": 1026 + }, + { + "epoch": 4.157894736842105, + "grad_norm": 2.857459652562985, + "learning_rate": 7.263969301848188e-06, + "loss": 1.8929, + "step": 1027 + }, + { + "epoch": 4.161943319838056, + "grad_norm": 2.416479591453608, + "learning_rate": 7.257666791554448e-06, + "loss": 1.6155, + "step": 1028 + }, + { + "epoch": 4.165991902834008, + "grad_norm": 2.485932817739182, + "learning_rate": 7.251359772256998e-06, + "loss": 1.6856, + "step": 1029 + }, + { + "epoch": 4.17004048582996, + "grad_norm": 2.2601305066652664, + "learning_rate": 7.245048256552195e-06, + "loss": 2.1658, + "step": 1030 + }, + { + "epoch": 4.174089068825911, + "grad_norm": 2.4736185296097566, + "learning_rate": 7.2387322570453724e-06, + "loss": 1.5329, + "step": 1031 + }, + { + "epoch": 4.178137651821863, + "grad_norm": 2.902522379367228, + "learning_rate": 7.232411786350824e-06, + "loss": 1.7115, + "step": 1032 + }, + { + "epoch": 4.182186234817814, + "grad_norm": 2.1213589715944594, + "learning_rate": 7.226086857091765e-06, + "loss": 1.5227, + "step": 1033 + }, + { + "epoch": 4.186234817813765, + "grad_norm": 2.8619121355527968, + "learning_rate": 7.219757481900325e-06, + "loss": 1.6826, + "step": 1034 + }, + { + "epoch": 4.190283400809717, + "grad_norm": 2.5322052891357867, + "learning_rate": 7.213423673417508e-06, + "loss": 1.7019, + "step": 1035 + }, + { + "epoch": 4.194331983805668, + "grad_norm": 2.868097930235534, + "learning_rate": 7.207085444293172e-06, + "loss": 1.8899, + "step": 1036 + }, + { 
+ "epoch": 4.198380566801619, + "grad_norm": 2.5521158066560288, + "learning_rate": 7.2007428071860045e-06, + "loss": 1.9495, + "step": 1037 + }, + { + "epoch": 4.202429149797571, + "grad_norm": 2.63283746068705, + "learning_rate": 7.194395774763496e-06, + "loss": 1.6451, + "step": 1038 + }, + { + "epoch": 4.206477732793522, + "grad_norm": 3.020988257996165, + "learning_rate": 7.188044359701917e-06, + "loss": 1.9686, + "step": 1039 + }, + { + "epoch": 4.2105263157894735, + "grad_norm": 2.7497468285413267, + "learning_rate": 7.181688574686292e-06, + "loss": 2.0078, + "step": 1040 + }, + { + "epoch": 4.2145748987854255, + "grad_norm": 2.4897799224246873, + "learning_rate": 7.175328432410367e-06, + "loss": 1.7921, + "step": 1041 + }, + { + "epoch": 4.218623481781377, + "grad_norm": 2.470322521256254, + "learning_rate": 7.168963945576597e-06, + "loss": 1.6719, + "step": 1042 + }, + { + "epoch": 4.222672064777328, + "grad_norm": 2.6592137837660266, + "learning_rate": 7.162595126896111e-06, + "loss": 1.5749, + "step": 1043 + }, + { + "epoch": 4.22672064777328, + "grad_norm": 2.533296478811204, + "learning_rate": 7.15622198908869e-06, + "loss": 1.7352, + "step": 1044 + }, + { + "epoch": 4.230769230769231, + "grad_norm": 2.5992050846283354, + "learning_rate": 7.149844544882742e-06, + "loss": 1.5639, + "step": 1045 + }, + { + "epoch": 4.234817813765182, + "grad_norm": 2.7675121593200367, + "learning_rate": 7.143462807015271e-06, + "loss": 1.8108, + "step": 1046 + }, + { + "epoch": 4.238866396761134, + "grad_norm": 2.658793190465704, + "learning_rate": 7.137076788231865e-06, + "loss": 1.7457, + "step": 1047 + }, + { + "epoch": 4.242914979757085, + "grad_norm": 2.604959217646965, + "learning_rate": 7.130686501286655e-06, + "loss": 1.7451, + "step": 1048 + }, + { + "epoch": 4.246963562753036, + "grad_norm": 2.5111072223063897, + "learning_rate": 7.1242919589422974e-06, + "loss": 1.5808, + "step": 1049 + }, + { + "epoch": 4.251012145748988, + "grad_norm": 2.4705422975939775, + "learning_rate": 7.11789317396995e-06, + "loss": 1.6597, + "step": 1050 + }, + { + "epoch": 4.255060728744939, + "grad_norm": 2.8012872307046726, + "learning_rate": 7.1114901591492404e-06, + "loss": 1.6728, + "step": 1051 + }, + { + "epoch": 4.2591093117408905, + "grad_norm": 2.376781495157912, + "learning_rate": 7.105082927268247e-06, + "loss": 1.561, + "step": 1052 + }, + { + "epoch": 4.2631578947368425, + "grad_norm": 2.5702431118604423, + "learning_rate": 7.0986714911234715e-06, + "loss": 1.4172, + "step": 1053 + }, + { + "epoch": 4.267206477732794, + "grad_norm": 2.508325280537679, + "learning_rate": 7.092255863519806e-06, + "loss": 1.6779, + "step": 1054 + }, + { + "epoch": 4.271255060728745, + "grad_norm": 2.540012700506, + "learning_rate": 7.085836057270521e-06, + "loss": 1.6985, + "step": 1055 + }, + { + "epoch": 4.275303643724697, + "grad_norm": 2.471796434580062, + "learning_rate": 7.079412085197229e-06, + "loss": 1.7301, + "step": 1056 + }, + { + "epoch": 4.279352226720648, + "grad_norm": 3.3244889584848107, + "learning_rate": 7.072983960129862e-06, + "loss": 1.7094, + "step": 1057 + }, + { + "epoch": 4.283400809716599, + "grad_norm": 2.983349503659567, + "learning_rate": 7.066551694906651e-06, + "loss": 1.3989, + "step": 1058 + }, + { + "epoch": 4.287449392712551, + "grad_norm": 3.036520426590972, + "learning_rate": 7.060115302374087e-06, + "loss": 1.5257, + "step": 1059 + }, + { + "epoch": 4.291497975708502, + "grad_norm": 3.2696461082092068, + "learning_rate": 7.053674795386914e-06, + "loss": 1.3769, + "step": 1060 
+ }, + { + "epoch": 4.295546558704453, + "grad_norm": 3.066097380387373, + "learning_rate": 7.047230186808085e-06, + "loss": 1.6842, + "step": 1061 + }, + { + "epoch": 4.299595141700405, + "grad_norm": 2.6903089198270855, + "learning_rate": 7.04078148950875e-06, + "loss": 1.8088, + "step": 1062 + }, + { + "epoch": 4.303643724696356, + "grad_norm": 2.8258995708159773, + "learning_rate": 7.034328716368224e-06, + "loss": 1.5156, + "step": 1063 + }, + { + "epoch": 4.3076923076923075, + "grad_norm": 2.858420747113862, + "learning_rate": 7.027871880273959e-06, + "loss": 1.5394, + "step": 1064 + }, + { + "epoch": 4.3117408906882595, + "grad_norm": 2.7740108493498323, + "learning_rate": 7.021410994121525e-06, + "loss": 1.549, + "step": 1065 + }, + { + "epoch": 4.315789473684211, + "grad_norm": 3.219790325593576, + "learning_rate": 7.014946070814583e-06, + "loss": 1.5296, + "step": 1066 + }, + { + "epoch": 4.319838056680162, + "grad_norm": 3.0526696821998094, + "learning_rate": 7.008477123264849e-06, + "loss": 1.4361, + "step": 1067 + }, + { + "epoch": 4.323886639676114, + "grad_norm": 2.9571662763160136, + "learning_rate": 7.0020041643920826e-06, + "loss": 1.4498, + "step": 1068 + }, + { + "epoch": 4.327935222672065, + "grad_norm": 2.819893094328226, + "learning_rate": 6.995527207124053e-06, + "loss": 1.4853, + "step": 1069 + }, + { + "epoch": 4.331983805668016, + "grad_norm": 2.7252255526223625, + "learning_rate": 6.989046264396516e-06, + "loss": 1.4535, + "step": 1070 + }, + { + "epoch": 4.336032388663968, + "grad_norm": 2.6189552228263753, + "learning_rate": 6.982561349153188e-06, + "loss": 1.5022, + "step": 1071 + }, + { + "epoch": 4.340080971659919, + "grad_norm": 2.568082005220546, + "learning_rate": 6.976072474345713e-06, + "loss": 1.4532, + "step": 1072 + }, + { + "epoch": 4.34412955465587, + "grad_norm": 2.623502257576312, + "learning_rate": 6.96957965293365e-06, + "loss": 1.4399, + "step": 1073 + }, + { + "epoch": 4.348178137651822, + "grad_norm": 3.1483597392827045, + "learning_rate": 6.963082897884439e-06, + "loss": 1.615, + "step": 1074 + }, + { + "epoch": 4.352226720647773, + "grad_norm": 3.8022601065423123, + "learning_rate": 6.956582222173374e-06, + "loss": 1.5412, + "step": 1075 + }, + { + "epoch": 4.3562753036437245, + "grad_norm": 3.177062030751366, + "learning_rate": 6.9500776387835785e-06, + "loss": 1.2047, + "step": 1076 + }, + { + "epoch": 4.3603238866396765, + "grad_norm": 3.185748452470112, + "learning_rate": 6.943569160705985e-06, + "loss": 1.6101, + "step": 1077 + }, + { + "epoch": 4.364372469635628, + "grad_norm": 2.9943825828047954, + "learning_rate": 6.9370568009393e-06, + "loss": 1.6897, + "step": 1078 + }, + { + "epoch": 4.368421052631579, + "grad_norm": 2.8396585705303297, + "learning_rate": 6.9305405724899876e-06, + "loss": 1.6066, + "step": 1079 + }, + { + "epoch": 4.372469635627531, + "grad_norm": 3.4103100269352504, + "learning_rate": 6.924020488372229e-06, + "loss": 1.6845, + "step": 1080 + }, + { + "epoch": 4.376518218623482, + "grad_norm": 2.8184107943036323, + "learning_rate": 6.917496561607915e-06, + "loss": 1.3205, + "step": 1081 + }, + { + "epoch": 4.380566801619433, + "grad_norm": 3.152451887221124, + "learning_rate": 6.91096880522661e-06, + "loss": 1.4827, + "step": 1082 + }, + { + "epoch": 4.384615384615385, + "grad_norm": 2.8506198416780317, + "learning_rate": 6.904437232265521e-06, + "loss": 1.2814, + "step": 1083 + }, + { + "epoch": 4.388663967611336, + "grad_norm": 3.2465586785242033, + "learning_rate": 6.897901855769483e-06, + "loss": 1.3431, + 
"step": 1084 + }, + { + "epoch": 4.392712550607287, + "grad_norm": 3.077940405612511, + "learning_rate": 6.891362688790925e-06, + "loss": 1.5208, + "step": 1085 + }, + { + "epoch": 4.396761133603239, + "grad_norm": 3.4135560109047005, + "learning_rate": 6.884819744389848e-06, + "loss": 1.3629, + "step": 1086 + }, + { + "epoch": 4.40080971659919, + "grad_norm": 2.6507174805524727, + "learning_rate": 6.878273035633795e-06, + "loss": 1.3853, + "step": 1087 + }, + { + "epoch": 4.4048582995951415, + "grad_norm": 2.5895703393651637, + "learning_rate": 6.871722575597829e-06, + "loss": 1.4423, + "step": 1088 + }, + { + "epoch": 4.4089068825910935, + "grad_norm": 3.2322118670425777, + "learning_rate": 6.865168377364506e-06, + "loss": 1.5468, + "step": 1089 + }, + { + "epoch": 4.412955465587045, + "grad_norm": 2.942042054251793, + "learning_rate": 6.858610454023842e-06, + "loss": 1.36, + "step": 1090 + }, + { + "epoch": 4.417004048582996, + "grad_norm": 3.122031784641475, + "learning_rate": 6.8520488186733e-06, + "loss": 1.6917, + "step": 1091 + }, + { + "epoch": 4.421052631578947, + "grad_norm": 3.2313772685904847, + "learning_rate": 6.845483484417756e-06, + "loss": 1.7526, + "step": 1092 + }, + { + "epoch": 4.425101214574899, + "grad_norm": 2.8735793462178023, + "learning_rate": 6.838914464369467e-06, + "loss": 1.6487, + "step": 1093 + }, + { + "epoch": 4.42914979757085, + "grad_norm": 2.954566180150772, + "learning_rate": 6.832341771648057e-06, + "loss": 1.7096, + "step": 1094 + }, + { + "epoch": 4.433198380566802, + "grad_norm": 2.587188799407319, + "learning_rate": 6.825765419380484e-06, + "loss": 1.8456, + "step": 1095 + }, + { + "epoch": 4.437246963562753, + "grad_norm": 3.0518891038101925, + "learning_rate": 6.819185420701011e-06, + "loss": 1.6224, + "step": 1096 + }, + { + "epoch": 4.441295546558704, + "grad_norm": 3.118348281802091, + "learning_rate": 6.812601788751192e-06, + "loss": 1.5498, + "step": 1097 + }, + { + "epoch": 4.445344129554655, + "grad_norm": 2.894711350660116, + "learning_rate": 6.806014536679828e-06, + "loss": 1.8041, + "step": 1098 + }, + { + "epoch": 4.449392712550607, + "grad_norm": 3.062471930595446, + "learning_rate": 6.7994236776429555e-06, + "loss": 1.5815, + "step": 1099 + }, + { + "epoch": 4.4534412955465585, + "grad_norm": 3.0993288240233263, + "learning_rate": 6.792829224803816e-06, + "loss": 1.5695, + "step": 1100 + }, + { + "epoch": 4.4574898785425106, + "grad_norm": 3.149585012325393, + "learning_rate": 6.7862311913328235e-06, + "loss": 1.9487, + "step": 1101 + }, + { + "epoch": 4.461538461538462, + "grad_norm": 4.120477147155456, + "learning_rate": 6.779629590407547e-06, + "loss": 2.1517, + "step": 1102 + }, + { + "epoch": 4.465587044534413, + "grad_norm": 3.1988261301020855, + "learning_rate": 6.773024435212678e-06, + "loss": 1.79, + "step": 1103 + }, + { + "epoch": 4.469635627530364, + "grad_norm": 2.6369221757485457, + "learning_rate": 6.7664157389400095e-06, + "loss": 1.7651, + "step": 1104 + }, + { + "epoch": 4.473684210526316, + "grad_norm": 2.7091701203884364, + "learning_rate": 6.7598035147884055e-06, + "loss": 1.6839, + "step": 1105 + }, + { + "epoch": 4.477732793522267, + "grad_norm": 3.4306069422759005, + "learning_rate": 6.753187775963773e-06, + "loss": 1.692, + "step": 1106 + }, + { + "epoch": 4.481781376518219, + "grad_norm": 2.386072562379964, + "learning_rate": 6.746568535679041e-06, + "loss": 1.6155, + "step": 1107 + }, + { + "epoch": 4.48582995951417, + "grad_norm": 2.851423578739297, + "learning_rate": 6.739945807154136e-06, + "loss": 
1.5755, + "step": 1108 + }, + { + "epoch": 4.489878542510121, + "grad_norm": 3.3510139502859206, + "learning_rate": 6.733319603615941e-06, + "loss": 1.5105, + "step": 1109 + }, + { + "epoch": 4.493927125506072, + "grad_norm": 3.329100996808692, + "learning_rate": 6.726689938298289e-06, + "loss": 1.568, + "step": 1110 + }, + { + "epoch": 4.497975708502024, + "grad_norm": 2.7974205212393057, + "learning_rate": 6.72005682444192e-06, + "loss": 1.4162, + "step": 1111 + }, + { + "epoch": 4.502024291497976, + "grad_norm": 2.9991024909175676, + "learning_rate": 6.713420275294467e-06, + "loss": 1.2872, + "step": 1112 + }, + { + "epoch": 4.506072874493928, + "grad_norm": 3.341853790054196, + "learning_rate": 6.70678030411042e-06, + "loss": 1.6404, + "step": 1113 + }, + { + "epoch": 4.510121457489879, + "grad_norm": 3.2032309023708687, + "learning_rate": 6.700136924151104e-06, + "loss": 1.6321, + "step": 1114 + }, + { + "epoch": 4.51417004048583, + "grad_norm": 2.446695841899921, + "learning_rate": 6.693490148684654e-06, + "loss": 1.5906, + "step": 1115 + }, + { + "epoch": 4.518218623481781, + "grad_norm": 3.030284559367058, + "learning_rate": 6.686839990985984e-06, + "loss": 1.6148, + "step": 1116 + }, + { + "epoch": 4.522267206477733, + "grad_norm": 3.0612075992794665, + "learning_rate": 6.680186464336767e-06, + "loss": 1.5678, + "step": 1117 + }, + { + "epoch": 4.526315789473684, + "grad_norm": 3.4922710550140685, + "learning_rate": 6.673529582025398e-06, + "loss": 1.3788, + "step": 1118 + }, + { + "epoch": 4.530364372469636, + "grad_norm": 3.4134796811660166, + "learning_rate": 6.666869357346979e-06, + "loss": 1.4428, + "step": 1119 + }, + { + "epoch": 4.534412955465587, + "grad_norm": 3.6649442008937383, + "learning_rate": 6.660205803603286e-06, + "loss": 1.5671, + "step": 1120 + }, + { + "epoch": 4.538461538461538, + "grad_norm": 3.108830354735827, + "learning_rate": 6.653538934102743e-06, + "loss": 1.7903, + "step": 1121 + }, + { + "epoch": 4.5425101214574894, + "grad_norm": 2.719205109719932, + "learning_rate": 6.646868762160399e-06, + "loss": 1.6907, + "step": 1122 + }, + { + "epoch": 4.5465587044534415, + "grad_norm": 15.861026319110369, + "learning_rate": 6.640195301097896e-06, + "loss": 2.0735, + "step": 1123 + }, + { + "epoch": 4.550607287449393, + "grad_norm": 7.357015627613091, + "learning_rate": 6.633518564243442e-06, + "loss": 2.1046, + "step": 1124 + }, + { + "epoch": 4.554655870445345, + "grad_norm": 6.67996402988713, + "learning_rate": 6.626838564931797e-06, + "loss": 2.3423, + "step": 1125 + }, + { + "epoch": 4.558704453441296, + "grad_norm": 2.790707731153053, + "learning_rate": 6.620155316504225e-06, + "loss": 1.5771, + "step": 1126 + }, + { + "epoch": 4.562753036437247, + "grad_norm": 2.6424764643365544, + "learning_rate": 6.6134688323084884e-06, + "loss": 1.4544, + "step": 1127 + }, + { + "epoch": 4.566801619433198, + "grad_norm": 4.460650672408528, + "learning_rate": 6.606779125698808e-06, + "loss": 1.7848, + "step": 1128 + }, + { + "epoch": 4.57085020242915, + "grad_norm": 2.81766092171609, + "learning_rate": 6.600086210035841e-06, + "loss": 1.4465, + "step": 1129 + }, + { + "epoch": 4.574898785425101, + "grad_norm": 2.7934258737790794, + "learning_rate": 6.593390098686653e-06, + "loss": 1.7079, + "step": 1130 + }, + { + "epoch": 4.578947368421053, + "grad_norm": 2.357159807197533, + "learning_rate": 6.586690805024692e-06, + "loss": 1.4715, + "step": 1131 + }, + { + "epoch": 4.582995951417004, + "grad_norm": 2.8201575354409876, + "learning_rate": 6.579988342429764e-06, + 
"loss": 1.6256, + "step": 1132 + }, + { + "epoch": 4.587044534412955, + "grad_norm": 2.748728982741463, + "learning_rate": 6.573282724288001e-06, + "loss": 1.6067, + "step": 1133 + }, + { + "epoch": 4.5910931174089065, + "grad_norm": 3.0721591492986526, + "learning_rate": 6.566573963991839e-06, + "loss": 1.5832, + "step": 1134 + }, + { + "epoch": 4.5951417004048585, + "grad_norm": 2.8487748202828924, + "learning_rate": 6.559862074939989e-06, + "loss": 1.3233, + "step": 1135 + }, + { + "epoch": 4.59919028340081, + "grad_norm": 2.590591556134, + "learning_rate": 6.553147070537413e-06, + "loss": 1.3674, + "step": 1136 + }, + { + "epoch": 4.603238866396762, + "grad_norm": 2.6607589757127186, + "learning_rate": 6.546428964195289e-06, + "loss": 1.4813, + "step": 1137 + }, + { + "epoch": 4.607287449392713, + "grad_norm": 2.936419659787077, + "learning_rate": 6.539707769330995e-06, + "loss": 1.3335, + "step": 1138 + }, + { + "epoch": 4.611336032388664, + "grad_norm": 5.647454932081391, + "learning_rate": 6.532983499368078e-06, + "loss": 1.631, + "step": 1139 + }, + { + "epoch": 4.615384615384615, + "grad_norm": 2.672027285236729, + "learning_rate": 6.526256167736224e-06, + "loss": 1.6247, + "step": 1140 + }, + { + "epoch": 4.619433198380567, + "grad_norm": 3.585540725187652, + "learning_rate": 6.519525787871235e-06, + "loss": 1.365, + "step": 1141 + }, + { + "epoch": 4.623481781376518, + "grad_norm": 3.509608711468321, + "learning_rate": 6.512792373215e-06, + "loss": 1.7573, + "step": 1142 + }, + { + "epoch": 4.62753036437247, + "grad_norm": 2.971185622782078, + "learning_rate": 6.506055937215471e-06, + "loss": 1.561, + "step": 1143 + }, + { + "epoch": 4.631578947368421, + "grad_norm": 3.2949915313334035, + "learning_rate": 6.499316493326631e-06, + "loss": 1.836, + "step": 1144 + }, + { + "epoch": 4.635627530364372, + "grad_norm": 2.861710933431733, + "learning_rate": 6.492574055008474e-06, + "loss": 1.4458, + "step": 1145 + }, + { + "epoch": 4.6396761133603235, + "grad_norm": 3.3593193695088828, + "learning_rate": 6.4858286357269716e-06, + "loss": 1.6806, + "step": 1146 + }, + { + "epoch": 4.6437246963562755, + "grad_norm": 2.7995829454110317, + "learning_rate": 6.4790802489540495e-06, + "loss": 1.5849, + "step": 1147 + }, + { + "epoch": 4.647773279352227, + "grad_norm": 2.9650473845995617, + "learning_rate": 6.472328908167562e-06, + "loss": 1.6598, + "step": 1148 + }, + { + "epoch": 4.651821862348179, + "grad_norm": 2.7905940219323475, + "learning_rate": 6.465574626851262e-06, + "loss": 1.4666, + "step": 1149 + }, + { + "epoch": 4.65587044534413, + "grad_norm": 3.2553490418837323, + "learning_rate": 6.4588174184947725e-06, + "loss": 1.6918, + "step": 1150 + }, + { + "epoch": 4.659919028340081, + "grad_norm": 3.55927475882226, + "learning_rate": 6.452057296593568e-06, + "loss": 1.5207, + "step": 1151 + }, + { + "epoch": 4.663967611336032, + "grad_norm": 2.9162925097777954, + "learning_rate": 6.445294274648937e-06, + "loss": 1.6745, + "step": 1152 + }, + { + "epoch": 4.668016194331984, + "grad_norm": 2.987151078867793, + "learning_rate": 6.4385283661679624e-06, + "loss": 1.6752, + "step": 1153 + }, + { + "epoch": 4.672064777327935, + "grad_norm": 3.186333717498487, + "learning_rate": 6.431759584663492e-06, + "loss": 1.753, + "step": 1154 + }, + { + "epoch": 4.676113360323887, + "grad_norm": 9.509020769435434, + "learning_rate": 6.424987943654109e-06, + "loss": 1.6195, + "step": 1155 + }, + { + "epoch": 4.680161943319838, + "grad_norm": 3.356709601234609, + "learning_rate": 6.418213456664111e-06, + 
"loss": 1.6311, + "step": 1156 + }, + { + "epoch": 4.684210526315789, + "grad_norm": 2.921816366789115, + "learning_rate": 6.411436137223479e-06, + "loss": 1.4584, + "step": 1157 + }, + { + "epoch": 4.6882591093117405, + "grad_norm": 2.8660981524508338, + "learning_rate": 6.4046559988678485e-06, + "loss": 1.6084, + "step": 1158 + }, + { + "epoch": 4.6923076923076925, + "grad_norm": 3.0730207415431954, + "learning_rate": 6.397873055138487e-06, + "loss": 1.6274, + "step": 1159 + }, + { + "epoch": 4.696356275303644, + "grad_norm": 2.766004464269283, + "learning_rate": 6.391087319582264e-06, + "loss": 1.4697, + "step": 1160 + }, + { + "epoch": 4.700404858299595, + "grad_norm": 3.6099089118584136, + "learning_rate": 6.384298805751626e-06, + "loss": 1.9489, + "step": 1161 + }, + { + "epoch": 4.704453441295547, + "grad_norm": 3.442626114825173, + "learning_rate": 6.37750752720457e-06, + "loss": 1.727, + "step": 1162 + }, + { + "epoch": 4.708502024291498, + "grad_norm": 3.341066779383342, + "learning_rate": 6.370713497504607e-06, + "loss": 1.3178, + "step": 1163 + }, + { + "epoch": 4.712550607287449, + "grad_norm": 2.8791145178147386, + "learning_rate": 6.363916730220752e-06, + "loss": 1.4908, + "step": 1164 + }, + { + "epoch": 4.716599190283401, + "grad_norm": 2.8558993301680076, + "learning_rate": 6.357117238927481e-06, + "loss": 1.588, + "step": 1165 + }, + { + "epoch": 4.720647773279352, + "grad_norm": 3.403507251743757, + "learning_rate": 6.350315037204714e-06, + "loss": 1.3794, + "step": 1166 + }, + { + "epoch": 4.724696356275303, + "grad_norm": 3.28937405397847, + "learning_rate": 6.343510138637783e-06, + "loss": 1.535, + "step": 1167 + }, + { + "epoch": 4.728744939271255, + "grad_norm": 3.182353899970667, + "learning_rate": 6.336702556817405e-06, + "loss": 1.7416, + "step": 1168 + }, + { + "epoch": 4.732793522267206, + "grad_norm": 2.8393068837004285, + "learning_rate": 6.329892305339659e-06, + "loss": 1.521, + "step": 1169 + }, + { + "epoch": 4.7368421052631575, + "grad_norm": 3.0526645906441585, + "learning_rate": 6.323079397805951e-06, + "loss": 1.6001, + "step": 1170 + }, + { + "epoch": 4.7408906882591095, + "grad_norm": 3.453365846818349, + "learning_rate": 6.3162638478229965e-06, + "loss": 2.244, + "step": 1171 + }, + { + "epoch": 4.744939271255061, + "grad_norm": 2.930549437132931, + "learning_rate": 6.309445669002787e-06, + "loss": 1.6859, + "step": 1172 + }, + { + "epoch": 4.748987854251012, + "grad_norm": 3.513459131175886, + "learning_rate": 6.302624874962563e-06, + "loss": 1.5138, + "step": 1173 + }, + { + "epoch": 4.753036437246964, + "grad_norm": 3.101847130962305, + "learning_rate": 6.295801479324788e-06, + "loss": 1.4048, + "step": 1174 + }, + { + "epoch": 4.757085020242915, + "grad_norm": 2.9351108422638625, + "learning_rate": 6.288975495717124e-06, + "loss": 1.5932, + "step": 1175 + }, + { + "epoch": 4.761133603238866, + "grad_norm": 4.674100976432621, + "learning_rate": 6.282146937772399e-06, + "loss": 2.3515, + "step": 1176 + }, + { + "epoch": 4.765182186234818, + "grad_norm": 5.182394350357637, + "learning_rate": 6.2753158191285844e-06, + "loss": 2.1322, + "step": 1177 + }, + { + "epoch": 4.769230769230769, + "grad_norm": 6.057045402676707, + "learning_rate": 6.268482153428763e-06, + "loss": 2.0072, + "step": 1178 + }, + { + "epoch": 4.77327935222672, + "grad_norm": 3.1068830892655726, + "learning_rate": 6.261645954321109e-06, + "loss": 1.5127, + "step": 1179 + }, + { + "epoch": 4.777327935222672, + "grad_norm": 3.0244265678427213, + "learning_rate": 
6.254807235458853e-06, + "loss": 1.7728, + "step": 1180 + }, + { + "epoch": 4.781376518218623, + "grad_norm": 2.949903538067424, + "learning_rate": 6.247966010500258e-06, + "loss": 1.78, + "step": 1181 + }, + { + "epoch": 4.7854251012145745, + "grad_norm": 3.1823383170218946, + "learning_rate": 6.241122293108594e-06, + "loss": 1.6101, + "step": 1182 + }, + { + "epoch": 4.7894736842105265, + "grad_norm": 3.0390422214285975, + "learning_rate": 6.2342760969521085e-06, + "loss": 1.5326, + "step": 1183 + }, + { + "epoch": 4.793522267206478, + "grad_norm": 3.136764973756456, + "learning_rate": 6.227427435703997e-06, + "loss": 1.5671, + "step": 1184 + }, + { + "epoch": 4.797570850202429, + "grad_norm": 3.358208559803108, + "learning_rate": 6.220576323042381e-06, + "loss": 1.5746, + "step": 1185 + }, + { + "epoch": 4.801619433198381, + "grad_norm": 2.8750507177466305, + "learning_rate": 6.213722772650277e-06, + "loss": 1.4246, + "step": 1186 + }, + { + "epoch": 4.805668016194332, + "grad_norm": 3.028809163189934, + "learning_rate": 6.206866798215571e-06, + "loss": 1.317, + "step": 1187 + }, + { + "epoch": 4.809716599190283, + "grad_norm": 3.126804073645922, + "learning_rate": 6.2000084134309905e-06, + "loss": 1.6821, + "step": 1188 + }, + { + "epoch": 4.813765182186235, + "grad_norm": 3.71033178556479, + "learning_rate": 6.193147631994073e-06, + "loss": 1.5786, + "step": 1189 + }, + { + "epoch": 4.817813765182186, + "grad_norm": 3.2129146658285346, + "learning_rate": 6.186284467607149e-06, + "loss": 1.3971, + "step": 1190 + }, + { + "epoch": 4.821862348178137, + "grad_norm": 10.210146232119035, + "learning_rate": 6.179418933977301e-06, + "loss": 2.3347, + "step": 1191 + }, + { + "epoch": 4.825910931174089, + "grad_norm": 21.275577852601224, + "learning_rate": 6.1725510448163516e-06, + "loss": 3.6222, + "step": 1192 + }, + { + "epoch": 4.82995951417004, + "grad_norm": 3.4666551476237584, + "learning_rate": 6.165680813840822e-06, + "loss": 1.4645, + "step": 1193 + }, + { + "epoch": 4.834008097165992, + "grad_norm": 3.4458166986644443, + "learning_rate": 6.1588082547719095e-06, + "loss": 1.3391, + "step": 1194 + }, + { + "epoch": 4.838056680161944, + "grad_norm": 2.919273388343095, + "learning_rate": 6.151933381335468e-06, + "loss": 1.4313, + "step": 1195 + }, + { + "epoch": 4.842105263157895, + "grad_norm": 3.0732467672720736, + "learning_rate": 6.1450562072619635e-06, + "loss": 1.4611, + "step": 1196 + }, + { + "epoch": 4.846153846153846, + "grad_norm": 3.2736024865252493, + "learning_rate": 6.138176746286468e-06, + "loss": 1.3333, + "step": 1197 + }, + { + "epoch": 4.850202429149798, + "grad_norm": 3.3437325068102486, + "learning_rate": 6.131295012148613e-06, + "loss": 1.4833, + "step": 1198 + }, + { + "epoch": 4.854251012145749, + "grad_norm": 3.6058736308138766, + "learning_rate": 6.124411018592568e-06, + "loss": 1.5521, + "step": 1199 + }, + { + "epoch": 4.8582995951417, + "grad_norm": 2.6980859324752267, + "learning_rate": 6.117524779367027e-06, + "loss": 1.4743, + "step": 1200 + }, + { + "epoch": 4.862348178137652, + "grad_norm": 3.4307422256171947, + "learning_rate": 6.110636308225157e-06, + "loss": 1.4612, + "step": 1201 + }, + { + "epoch": 4.866396761133603, + "grad_norm": 3.4665359414620625, + "learning_rate": 6.103745618924587e-06, + "loss": 1.4922, + "step": 1202 + }, + { + "epoch": 4.870445344129554, + "grad_norm": 4.034402333282032, + "learning_rate": 6.096852725227378e-06, + "loss": 1.9715, + "step": 1203 + }, + { + "epoch": 4.874493927125506, + "grad_norm": 3.6881022424154097, + 
"learning_rate": 6.089957640899988e-06, + "loss": 1.9107, + "step": 1204 + }, + { + "epoch": 4.8785425101214575, + "grad_norm": 3.862338875685726, + "learning_rate": 6.0830603797132574e-06, + "loss": 1.661, + "step": 1205 + }, + { + "epoch": 4.882591093117409, + "grad_norm": 3.384483266395071, + "learning_rate": 6.076160955442369e-06, + "loss": 1.5689, + "step": 1206 + }, + { + "epoch": 4.886639676113361, + "grad_norm": 3.345513039253192, + "learning_rate": 6.069259381866827e-06, + "loss": 1.1468, + "step": 1207 + }, + { + "epoch": 4.890688259109312, + "grad_norm": 2.8964038452697847, + "learning_rate": 6.0623556727704306e-06, + "loss": 1.6516, + "step": 1208 + }, + { + "epoch": 4.894736842105263, + "grad_norm": 2.9136386786268895, + "learning_rate": 6.055449841941238e-06, + "loss": 1.7215, + "step": 1209 + }, + { + "epoch": 4.898785425101215, + "grad_norm": 2.7655346557671248, + "learning_rate": 6.048541903171552e-06, + "loss": 1.4413, + "step": 1210 + }, + { + "epoch": 4.902834008097166, + "grad_norm": 3.2433937012234715, + "learning_rate": 6.041631870257882e-06, + "loss": 1.4725, + "step": 1211 + }, + { + "epoch": 4.906882591093117, + "grad_norm": 3.4688660789200325, + "learning_rate": 6.034719757000918e-06, + "loss": 1.6069, + "step": 1212 + }, + { + "epoch": 4.910931174089069, + "grad_norm": 3.106070985660449, + "learning_rate": 6.0278055772055075e-06, + "loss": 1.2312, + "step": 1213 + }, + { + "epoch": 4.91497975708502, + "grad_norm": 3.4926777350408664, + "learning_rate": 6.020889344680627e-06, + "loss": 1.3252, + "step": 1214 + }, + { + "epoch": 4.919028340080971, + "grad_norm": 3.31474250904695, + "learning_rate": 6.013971073239346e-06, + "loss": 1.3404, + "step": 1215 + }, + { + "epoch": 4.923076923076923, + "grad_norm": 2.7200582966885953, + "learning_rate": 6.007050776698816e-06, + "loss": 1.6668, + "step": 1216 + }, + { + "epoch": 4.9271255060728745, + "grad_norm": 4.194613418220712, + "learning_rate": 6.000128468880223e-06, + "loss": 1.5178, + "step": 1217 + }, + { + "epoch": 4.931174089068826, + "grad_norm": 3.6956716885492047, + "learning_rate": 5.993204163608776e-06, + "loss": 1.5313, + "step": 1218 + }, + { + "epoch": 4.935222672064778, + "grad_norm": 3.42386071095716, + "learning_rate": 5.986277874713672e-06, + "loss": 1.315, + "step": 1219 + }, + { + "epoch": 4.939271255060729, + "grad_norm": 3.4411238008448497, + "learning_rate": 5.979349616028067e-06, + "loss": 1.2599, + "step": 1220 + }, + { + "epoch": 4.94331983805668, + "grad_norm": 4.136849869910849, + "learning_rate": 5.972419401389058e-06, + "loss": 1.5671, + "step": 1221 + }, + { + "epoch": 4.947368421052632, + "grad_norm": 3.3509710910402344, + "learning_rate": 5.96548724463764e-06, + "loss": 1.3098, + "step": 1222 + }, + { + "epoch": 4.951417004048583, + "grad_norm": 3.826301738234217, + "learning_rate": 5.958553159618693e-06, + "loss": 1.2627, + "step": 1223 + }, + { + "epoch": 4.955465587044534, + "grad_norm": 4.211383102056784, + "learning_rate": 5.951617160180944e-06, + "loss": 1.4866, + "step": 1224 + }, + { + "epoch": 4.959514170040486, + "grad_norm": 3.9784296755787043, + "learning_rate": 5.944679260176947e-06, + "loss": 1.5416, + "step": 1225 + }, + { + "epoch": 4.963562753036437, + "grad_norm": 3.121952186318371, + "learning_rate": 5.937739473463047e-06, + "loss": 1.5505, + "step": 1226 + }, + { + "epoch": 4.967611336032388, + "grad_norm": 3.717226187124744, + "learning_rate": 5.930797813899364e-06, + "loss": 1.6869, + "step": 1227 + }, + { + "epoch": 4.97165991902834, + "grad_norm": 
4.139266573612088, + "learning_rate": 5.923854295349751e-06, + "loss": 1.5989, + "step": 1228 + }, + { + "epoch": 4.9757085020242915, + "grad_norm": 2.8954471867608937, + "learning_rate": 5.916908931681781e-06, + "loss": 1.5245, + "step": 1229 + }, + { + "epoch": 4.979757085020243, + "grad_norm": 3.153595083245072, + "learning_rate": 5.9099617367667065e-06, + "loss": 1.6063, + "step": 1230 + }, + { + "epoch": 4.983805668016195, + "grad_norm": 2.8400997626861173, + "learning_rate": 5.9030127244794385e-06, + "loss": 1.6715, + "step": 1231 + }, + { + "epoch": 4.987854251012146, + "grad_norm": 3.2491090209153874, + "learning_rate": 5.896061908698521e-06, + "loss": 1.4666, + "step": 1232 + }, + { + "epoch": 4.991902834008097, + "grad_norm": 2.6679775725786286, + "learning_rate": 5.8891093033060945e-06, + "loss": 1.4425, + "step": 1233 + }, + { + "epoch": 4.995951417004049, + "grad_norm": 2.6288454727168067, + "learning_rate": 5.8821549221878795e-06, + "loss": 1.7597, + "step": 1234 + }, + { + "epoch": 5.0, + "grad_norm": 2.885385124366649, + "learning_rate": 5.8751987792331365e-06, + "loss": 1.4922, + "step": 1235 + }, + { + "epoch": 5.004048582995951, + "grad_norm": 2.87961175357714, + "learning_rate": 5.8682408883346535e-06, + "loss": 1.5315, + "step": 1236 + }, + { + "epoch": 5.008097165991903, + "grad_norm": 3.895617299101059, + "learning_rate": 5.861281263388699e-06, + "loss": 1.6767, + "step": 1237 + }, + { + "epoch": 5.012145748987854, + "grad_norm": 3.762686290641399, + "learning_rate": 5.854319918295012e-06, + "loss": 1.5156, + "step": 1238 + }, + { + "epoch": 5.016194331983805, + "grad_norm": 4.177708865223027, + "learning_rate": 5.8473568669567645e-06, + "loss": 1.7157, + "step": 1239 + }, + { + "epoch": 5.020242914979757, + "grad_norm": 3.5866973777228996, + "learning_rate": 5.84039212328054e-06, + "loss": 1.9457, + "step": 1240 + }, + { + "epoch": 5.0242914979757085, + "grad_norm": 3.7038579253911434, + "learning_rate": 5.833425701176294e-06, + "loss": 1.8054, + "step": 1241 + }, + { + "epoch": 5.02834008097166, + "grad_norm": 3.053021737504678, + "learning_rate": 5.826457614557342e-06, + "loss": 1.4846, + "step": 1242 + }, + { + "epoch": 5.032388663967612, + "grad_norm": 3.7131269515944236, + "learning_rate": 5.819487877340318e-06, + "loss": 1.4864, + "step": 1243 + }, + { + "epoch": 5.036437246963563, + "grad_norm": 3.47442806634264, + "learning_rate": 5.812516503445158e-06, + "loss": 1.7235, + "step": 1244 + }, + { + "epoch": 5.040485829959514, + "grad_norm": 3.509517402822926, + "learning_rate": 5.805543506795063e-06, + "loss": 1.517, + "step": 1245 + }, + { + "epoch": 5.044534412955466, + "grad_norm": 3.3619188629392305, + "learning_rate": 5.798568901316475e-06, + "loss": 1.5768, + "step": 1246 + }, + { + "epoch": 5.048582995951417, + "grad_norm": 3.557428062968091, + "learning_rate": 5.79159270093905e-06, + "loss": 1.5018, + "step": 1247 + }, + { + "epoch": 5.052631578947368, + "grad_norm": 3.7281770232445295, + "learning_rate": 5.784614919595631e-06, + "loss": 1.5785, + "step": 1248 + }, + { + "epoch": 5.05668016194332, + "grad_norm": 3.517681869861109, + "learning_rate": 5.7776355712222165e-06, + "loss": 1.4217, + "step": 1249 + }, + { + "epoch": 5.060728744939271, + "grad_norm": 3.487707428141539, + "learning_rate": 5.770654669757935e-06, + "loss": 1.5864, + "step": 1250 + }, + { + "epoch": 5.064777327935222, + "grad_norm": 3.79463286822166, + "learning_rate": 5.763672229145015e-06, + "loss": 1.5406, + "step": 1251 + }, + { + "epoch": 5.068825910931174, + "grad_norm": 
3.9587280022782623, + "learning_rate": 5.756688263328762e-06, + "loss": 1.6808, + "step": 1252 + }, + { + "epoch": 5.0728744939271255, + "grad_norm": 3.574038459442136, + "learning_rate": 5.749702786257529e-06, + "loss": 1.6199, + "step": 1253 + }, + { + "epoch": 5.076923076923077, + "grad_norm": 3.9239619763747666, + "learning_rate": 5.742715811882682e-06, + "loss": 1.5554, + "step": 1254 + }, + { + "epoch": 5.080971659919029, + "grad_norm": 3.3525677000904435, + "learning_rate": 5.735727354158581e-06, + "loss": 1.5965, + "step": 1255 + }, + { + "epoch": 5.08502024291498, + "grad_norm": 3.14038896931749, + "learning_rate": 5.7287374270425475e-06, + "loss": 1.5955, + "step": 1256 + }, + { + "epoch": 5.089068825910931, + "grad_norm": 3.800313028867603, + "learning_rate": 5.721746044494838e-06, + "loss": 1.5594, + "step": 1257 + }, + { + "epoch": 5.093117408906883, + "grad_norm": 3.5079921931841707, + "learning_rate": 5.714753220478616e-06, + "loss": 1.6374, + "step": 1258 + }, + { + "epoch": 5.097165991902834, + "grad_norm": 3.3722158742610033, + "learning_rate": 5.707758968959923e-06, + "loss": 1.3947, + "step": 1259 + }, + { + "epoch": 5.101214574898785, + "grad_norm": 3.690572058964337, + "learning_rate": 5.7007633039076535e-06, + "loss": 1.5641, + "step": 1260 + }, + { + "epoch": 5.105263157894737, + "grad_norm": 3.868480542932687, + "learning_rate": 5.693766239293522e-06, + "loss": 1.5403, + "step": 1261 + }, + { + "epoch": 5.109311740890688, + "grad_norm": 3.642440736287873, + "learning_rate": 5.686767789092041e-06, + "loss": 1.4899, + "step": 1262 + }, + { + "epoch": 5.113360323886639, + "grad_norm": 4.407879993174004, + "learning_rate": 5.67976796728049e-06, + "loss": 1.4415, + "step": 1263 + }, + { + "epoch": 5.117408906882591, + "grad_norm": 3.9268283691257166, + "learning_rate": 5.672766787838884e-06, + "loss": 1.349, + "step": 1264 + }, + { + "epoch": 5.1214574898785425, + "grad_norm": 3.5424496240381282, + "learning_rate": 5.6657642647499545e-06, + "loss": 1.4005, + "step": 1265 + }, + { + "epoch": 5.125506072874494, + "grad_norm": 3.714267182183359, + "learning_rate": 5.658760411999115e-06, + "loss": 1.4047, + "step": 1266 + }, + { + "epoch": 5.129554655870446, + "grad_norm": 4.1352520308511425, + "learning_rate": 5.6517552435744325e-06, + "loss": 1.3041, + "step": 1267 + }, + { + "epoch": 5.133603238866397, + "grad_norm": 3.1992855070868185, + "learning_rate": 5.644748773466606e-06, + "loss": 1.6559, + "step": 1268 + }, + { + "epoch": 5.137651821862348, + "grad_norm": 3.852499540993822, + "learning_rate": 5.637741015668929e-06, + "loss": 1.4822, + "step": 1269 + }, + { + "epoch": 5.1417004048583, + "grad_norm": 3.0057363516680513, + "learning_rate": 5.630731984177269e-06, + "loss": 1.2246, + "step": 1270 + }, + { + "epoch": 5.145748987854251, + "grad_norm": 3.8748912975587544, + "learning_rate": 5.62372169299004e-06, + "loss": 1.5924, + "step": 1271 + }, + { + "epoch": 5.149797570850202, + "grad_norm": 3.5771984578664875, + "learning_rate": 5.616710156108167e-06, + "loss": 1.4133, + "step": 1272 + }, + { + "epoch": 5.153846153846154, + "grad_norm": 3.2086974588686576, + "learning_rate": 5.609697387535068e-06, + "loss": 1.621, + "step": 1273 + }, + { + "epoch": 5.157894736842105, + "grad_norm": 3.984819835501151, + "learning_rate": 5.6026834012766155e-06, + "loss": 1.7158, + "step": 1274 + }, + { + "epoch": 5.161943319838056, + "grad_norm": 3.2013860532982337, + "learning_rate": 5.5956682113411184e-06, + "loss": 1.4746, + "step": 1275 + }, + { + "epoch": 5.165991902834008, + 
"grad_norm": 3.450642934981606, + "learning_rate": 5.588651831739289e-06, + "loss": 1.5543, + "step": 1276 + }, + { + "epoch": 5.17004048582996, + "grad_norm": 3.093776549631426, + "learning_rate": 5.581634276484211e-06, + "loss": 2.074, + "step": 1277 + }, + { + "epoch": 5.174089068825911, + "grad_norm": 3.545758099078526, + "learning_rate": 5.574615559591323e-06, + "loss": 1.3906, + "step": 1278 + }, + { + "epoch": 5.178137651821863, + "grad_norm": 4.14672203994261, + "learning_rate": 5.567595695078379e-06, + "loss": 1.5738, + "step": 1279 + }, + { + "epoch": 5.182186234817814, + "grad_norm": 2.9347838837502294, + "learning_rate": 5.560574696965425e-06, + "loss": 1.3815, + "step": 1280 + }, + { + "epoch": 5.186234817813765, + "grad_norm": 3.90774860265149, + "learning_rate": 5.553552579274775e-06, + "loss": 1.5673, + "step": 1281 + }, + { + "epoch": 5.190283400809717, + "grad_norm": 3.578616704951525, + "learning_rate": 5.546529356030974e-06, + "loss": 1.5733, + "step": 1282 + }, + { + "epoch": 5.194331983805668, + "grad_norm": 4.0010401720998185, + "learning_rate": 5.539505041260779e-06, + "loss": 1.757, + "step": 1283 + }, + { + "epoch": 5.198380566801619, + "grad_norm": 3.509112575984563, + "learning_rate": 5.532479648993122e-06, + "loss": 1.8081, + "step": 1284 + }, + { + "epoch": 5.202429149797571, + "grad_norm": 3.5347317901565556, + "learning_rate": 5.525453193259094e-06, + "loss": 1.5116, + "step": 1285 + }, + { + "epoch": 5.206477732793522, + "grad_norm": 3.4675375372116184, + "learning_rate": 5.518425688091906e-06, + "loss": 1.8506, + "step": 1286 + }, + { + "epoch": 5.2105263157894735, + "grad_norm": 3.6323230014040306, + "learning_rate": 5.511397147526862e-06, + "loss": 1.8682, + "step": 1287 + }, + { + "epoch": 5.2145748987854255, + "grad_norm": 3.5536336190454048, + "learning_rate": 5.504367585601342e-06, + "loss": 1.6388, + "step": 1288 + }, + { + "epoch": 5.218623481781377, + "grad_norm": 3.6273876631462905, + "learning_rate": 5.497337016354757e-06, + "loss": 1.5266, + "step": 1289 + }, + { + "epoch": 5.222672064777328, + "grad_norm": 3.605955542328613, + "learning_rate": 5.490305453828534e-06, + "loss": 1.4274, + "step": 1290 + }, + { + "epoch": 5.22672064777328, + "grad_norm": 3.594834856645006, + "learning_rate": 5.483272912066084e-06, + "loss": 1.6117, + "step": 1291 + }, + { + "epoch": 5.230769230769231, + "grad_norm": 3.6817183177194295, + "learning_rate": 5.476239405112775e-06, + "loss": 1.4265, + "step": 1292 + }, + { + "epoch": 5.234817813765182, + "grad_norm": 4.022022675891982, + "learning_rate": 5.469204947015897e-06, + "loss": 1.668, + "step": 1293 + }, + { + "epoch": 5.238866396761134, + "grad_norm": 3.889168025126557, + "learning_rate": 5.462169551824648e-06, + "loss": 1.6076, + "step": 1294 + }, + { + "epoch": 5.242914979757085, + "grad_norm": 3.6700082316334273, + "learning_rate": 5.45513323359009e-06, + "loss": 1.6171, + "step": 1295 + }, + { + "epoch": 5.246963562753036, + "grad_norm": 3.6748741609947855, + "learning_rate": 5.448096006365132e-06, + "loss": 1.4488, + "step": 1296 + }, + { + "epoch": 5.251012145748988, + "grad_norm": 3.6290737200114993, + "learning_rate": 5.4410578842045e-06, + "loss": 1.5478, + "step": 1297 + }, + { + "epoch": 5.255060728744939, + "grad_norm": 3.8478048256636357, + "learning_rate": 5.434018881164702e-06, + "loss": 1.523, + "step": 1298 + }, + { + "epoch": 5.2591093117408905, + "grad_norm": 3.312410066611835, + "learning_rate": 5.426979011304012e-06, + "loss": 1.4463, + "step": 1299 + }, + { + "epoch": 5.2631578947368425, 
+ "grad_norm": 3.647621711678499, + "learning_rate": 5.41993828868243e-06, + "loss": 1.2639, + "step": 1300 + }, + { + "epoch": 5.267206477732794, + "grad_norm": 3.5536727878739205, + "learning_rate": 5.412896727361663e-06, + "loss": 1.5401, + "step": 1301 + }, + { + "epoch": 5.271255060728745, + "grad_norm": 3.539451611896165, + "learning_rate": 5.405854341405088e-06, + "loss": 1.5594, + "step": 1302 + }, + { + "epoch": 5.275303643724697, + "grad_norm": 3.4030202032336394, + "learning_rate": 5.398811144877733e-06, + "loss": 1.5997, + "step": 1303 + }, + { + "epoch": 5.279352226720648, + "grad_norm": 4.605755727643003, + "learning_rate": 5.391767151846247e-06, + "loss": 1.5551, + "step": 1304 + }, + { + "epoch": 5.283400809716599, + "grad_norm": 4.210060420659593, + "learning_rate": 5.384722376378861e-06, + "loss": 1.2388, + "step": 1305 + }, + { + "epoch": 5.287449392712551, + "grad_norm": 4.288644203676987, + "learning_rate": 5.377676832545377e-06, + "loss": 1.3926, + "step": 1306 + }, + { + "epoch": 5.291497975708502, + "grad_norm": 4.344641505323721, + "learning_rate": 5.370630534417133e-06, + "loss": 1.2335, + "step": 1307 + }, + { + "epoch": 5.295546558704453, + "grad_norm": 4.293842456125265, + "learning_rate": 5.363583496066963e-06, + "loss": 1.5097, + "step": 1308 + }, + { + "epoch": 5.299595141700405, + "grad_norm": 3.5889617840380956, + "learning_rate": 5.356535731569189e-06, + "loss": 1.6798, + "step": 1309 + }, + { + "epoch": 5.303643724696356, + "grad_norm": 3.8949744261018844, + "learning_rate": 5.349487254999579e-06, + "loss": 1.3501, + "step": 1310 + }, + { + "epoch": 5.3076923076923075, + "grad_norm": 3.8938141628185394, + "learning_rate": 5.342438080435325e-06, + "loss": 1.3823, + "step": 1311 + }, + { + "epoch": 5.3117408906882595, + "grad_norm": 3.7811284620632146, + "learning_rate": 5.335388221955012e-06, + "loss": 1.4001, + "step": 1312 + }, + { + "epoch": 5.315789473684211, + "grad_norm": 4.504485300390198, + "learning_rate": 5.328337693638591e-06, + "loss": 1.3433, + "step": 1313 + }, + { + "epoch": 5.319838056680162, + "grad_norm": 3.9863561932252, + "learning_rate": 5.321286509567351e-06, + "loss": 1.2701, + "step": 1314 + }, + { + "epoch": 5.323886639676114, + "grad_norm": 4.103946070839009, + "learning_rate": 5.314234683823892e-06, + "loss": 1.2979, + "step": 1315 + }, + { + "epoch": 5.327935222672065, + "grad_norm": 3.9048810862002896, + "learning_rate": 5.307182230492089e-06, + "loss": 1.3284, + "step": 1316 + }, + { + "epoch": 5.331983805668016, + "grad_norm": 3.802962634621348, + "learning_rate": 5.300129163657081e-06, + "loss": 1.3376, + "step": 1317 + }, + { + "epoch": 5.336032388663968, + "grad_norm": 3.6151941699291696, + "learning_rate": 5.2930754974052245e-06, + "loss": 1.3976, + "step": 1318 + }, + { + "epoch": 5.340080971659919, + "grad_norm": 3.4851660754400124, + "learning_rate": 5.286021245824075e-06, + "loss": 1.3431, + "step": 1319 + }, + { + "epoch": 5.34412955465587, + "grad_norm": 3.7167755157754008, + "learning_rate": 5.2789664230023595e-06, + "loss": 1.295, + "step": 1320 + }, + { + "epoch": 5.348178137651822, + "grad_norm": 4.41974802384744, + "learning_rate": 5.2719110430299416e-06, + "loss": 1.4491, + "step": 1321 + }, + { + "epoch": 5.352226720647773, + "grad_norm": 4.277030621050548, + "learning_rate": 5.264855119997803e-06, + "loss": 1.4354, + "step": 1322 + }, + { + "epoch": 5.3562753036437245, + "grad_norm": 4.194929698692418, + "learning_rate": 5.257798667998003e-06, + "loss": 1.0844, + "step": 1323 + }, + { + "epoch": 
5.3603238866396765, + "grad_norm": 4.472113694740598, + "learning_rate": 5.2507417011236625e-06, + "loss": 1.4929, + "step": 1324 + }, + { + "epoch": 5.364372469635628, + "grad_norm": 3.9849001434928866, + "learning_rate": 5.243684233468933e-06, + "loss": 1.5648, + "step": 1325 + }, + { + "epoch": 5.368421052631579, + "grad_norm": 3.864302824850682, + "learning_rate": 5.236626279128958e-06, + "loss": 1.473, + "step": 1326 + }, + { + "epoch": 5.372469635627531, + "grad_norm": 4.810968253503194, + "learning_rate": 5.22956785219986e-06, + "loss": 1.5456, + "step": 1327 + }, + { + "epoch": 5.376518218623482, + "grad_norm": 4.111208820335583, + "learning_rate": 5.222508966778702e-06, + "loss": 1.2098, + "step": 1328 + }, + { + "epoch": 5.380566801619433, + "grad_norm": 4.534807999665865, + "learning_rate": 5.2154496369634645e-06, + "loss": 1.363, + "step": 1329 + }, + { + "epoch": 5.384615384615385, + "grad_norm": 4.048755500092308, + "learning_rate": 5.208389876853014e-06, + "loss": 1.1592, + "step": 1330 + }, + { + "epoch": 5.388663967611336, + "grad_norm": 4.75370785314969, + "learning_rate": 5.201329700547077e-06, + "loss": 1.226, + "step": 1331 + }, + { + "epoch": 5.392712550607287, + "grad_norm": 4.367024994722068, + "learning_rate": 5.194269122146211e-06, + "loss": 1.4048, + "step": 1332 + }, + { + "epoch": 5.396761133603239, + "grad_norm": 4.918852915006795, + "learning_rate": 5.187208155751779e-06, + "loss": 1.2387, + "step": 1333 + }, + { + "epoch": 5.40080971659919, + "grad_norm": 3.6289200014371894, + "learning_rate": 5.180146815465915e-06, + "loss": 1.2571, + "step": 1334 + }, + { + "epoch": 5.4048582995951415, + "grad_norm": 3.7443218122005266, + "learning_rate": 5.173085115391502e-06, + "loss": 1.3062, + "step": 1335 + }, + { + "epoch": 5.4089068825910935, + "grad_norm": 4.7017026873802426, + "learning_rate": 5.16602306963214e-06, + "loss": 1.4154, + "step": 1336 + }, + { + "epoch": 5.412955465587045, + "grad_norm": 4.150505086067103, + "learning_rate": 5.158960692292122e-06, + "loss": 1.2259, + "step": 1337 + }, + { + "epoch": 5.417004048582996, + "grad_norm": 4.482184582986182, + "learning_rate": 5.151897997476403e-06, + "loss": 1.5583, + "step": 1338 + }, + { + "epoch": 5.421052631578947, + "grad_norm": 4.682227327595727, + "learning_rate": 5.144834999290567e-06, + "loss": 1.598, + "step": 1339 + }, + { + "epoch": 5.425101214574899, + "grad_norm": 4.008926002575055, + "learning_rate": 5.137771711840811e-06, + "loss": 1.5379, + "step": 1340 + }, + { + "epoch": 5.42914979757085, + "grad_norm": 4.302820633137393, + "learning_rate": 5.130708149233905e-06, + "loss": 1.5569, + "step": 1341 + }, + { + "epoch": 5.433198380566802, + "grad_norm": 3.5969352441824007, + "learning_rate": 5.123644325577168e-06, + "loss": 1.7237, + "step": 1342 + }, + { + "epoch": 5.437246963562753, + "grad_norm": 4.1865532032949035, + "learning_rate": 5.116580254978447e-06, + "loss": 1.4932, + "step": 1343 + }, + { + "epoch": 5.441295546558704, + "grad_norm": 4.443537220527738, + "learning_rate": 5.1095159515460736e-06, + "loss": 1.4349, + "step": 1344 + }, + { + "epoch": 5.445344129554655, + "grad_norm": 3.8400638359623653, + "learning_rate": 5.10245142938885e-06, + "loss": 1.6808, + "step": 1345 + }, + { + "epoch": 5.449392712550607, + "grad_norm": 4.456713357432363, + "learning_rate": 5.095386702616012e-06, + "loss": 1.4753, + "step": 1346 + }, + { + "epoch": 5.4534412955465585, + "grad_norm": 4.371248488578587, + "learning_rate": 5.088321785337207e-06, + "loss": 1.4634, + "step": 1347 + }, + { + 
"epoch": 5.4574898785425106, + "grad_norm": 4.503939177016205, + "learning_rate": 5.0812566916624624e-06, + "loss": 1.8175, + "step": 1348 + }, + { + "epoch": 5.461538461538462, + "grad_norm": 5.8661687643019444, + "learning_rate": 5.074191435702155e-06, + "loss": 1.9684, + "step": 1349 + }, + { + "epoch": 5.465587044534413, + "grad_norm": 4.324067092257868, + "learning_rate": 5.067126031566988e-06, + "loss": 1.6405, + "step": 1350 + }, + { + "epoch": 5.469635627530364, + "grad_norm": 3.796039870689883, + "learning_rate": 5.060060493367961e-06, + "loss": 1.6486, + "step": 1351 + }, + { + "epoch": 5.473684210526316, + "grad_norm": 3.738600525398421, + "learning_rate": 5.05299483521634e-06, + "loss": 1.5872, + "step": 1352 + }, + { + "epoch": 5.477732793522267, + "grad_norm": 4.6006758703016, + "learning_rate": 5.045929071223633e-06, + "loss": 1.5976, + "step": 1353 + }, + { + "epoch": 5.481781376518219, + "grad_norm": 3.3463637296184854, + "learning_rate": 5.038863215501555e-06, + "loss": 1.5156, + "step": 1354 + }, + { + "epoch": 5.48582995951417, + "grad_norm": 3.8425032487043813, + "learning_rate": 5.031797282162007e-06, + "loss": 1.4631, + "step": 1355 + }, + { + "epoch": 5.489878542510121, + "grad_norm": 4.548619092337232, + "learning_rate": 5.024731285317046e-06, + "loss": 1.3972, + "step": 1356 + }, + { + "epoch": 5.493927125506072, + "grad_norm": 4.814717659012562, + "learning_rate": 5.017665239078854e-06, + "loss": 1.4267, + "step": 1357 + }, + { + "epoch": 5.497975708502024, + "grad_norm": 3.6552584947768096, + "learning_rate": 5.010599157559713e-06, + "loss": 1.2966, + "step": 1358 + }, + { + "epoch": 5.502024291497976, + "grad_norm": 4.204585823006649, + "learning_rate": 5.003533054871973e-06, + "loss": 1.15, + "step": 1359 + }, + { + "epoch": 5.506072874493928, + "grad_norm": 4.634653281785678, + "learning_rate": 4.996466945128029e-06, + "loss": 1.5181, + "step": 1360 + }, + { + "epoch": 5.510121457489879, + "grad_norm": 4.3188079424314, + "learning_rate": 4.98940084244029e-06, + "loss": 1.4787, + "step": 1361 + }, + { + "epoch": 5.51417004048583, + "grad_norm": 3.332377152961891, + "learning_rate": 4.982334760921149e-06, + "loss": 1.4434, + "step": 1362 + }, + { + "epoch": 5.518218623481781, + "grad_norm": 4.271374565670683, + "learning_rate": 4.975268714682956e-06, + "loss": 1.4766, + "step": 1363 + }, + { + "epoch": 5.522267206477733, + "grad_norm": 4.388046491535482, + "learning_rate": 4.968202717837996e-06, + "loss": 1.4244, + "step": 1364 + }, + { + "epoch": 5.526315789473684, + "grad_norm": 4.81529396324836, + "learning_rate": 4.961136784498448e-06, + "loss": 1.2532, + "step": 1365 + }, + { + "epoch": 5.530364372469636, + "grad_norm": 4.589391225576633, + "learning_rate": 4.9540709287763685e-06, + "loss": 1.3152, + "step": 1366 + }, + { + "epoch": 5.534412955465587, + "grad_norm": 5.101062956149816, + "learning_rate": 4.947005164783661e-06, + "loss": 1.409, + "step": 1367 + }, + { + "epoch": 5.538461538461538, + "grad_norm": 4.286443288173012, + "learning_rate": 4.939939506632041e-06, + "loss": 1.6652, + "step": 1368 + }, + { + "epoch": 5.5425101214574894, + "grad_norm": 3.857994197551904, + "learning_rate": 4.932873968433014e-06, + "loss": 1.5821, + "step": 1369 + }, + { + "epoch": 5.5465587044534415, + "grad_norm": 82.8177825176114, + "learning_rate": 4.925808564297847e-06, + "loss": 2.0481, + "step": 1370 + }, + { + "epoch": 5.550607287449393, + "grad_norm": 8.294269069115597, + "learning_rate": 4.918743308337539e-06, + "loss": 1.9382, + "step": 1371 + }, + { + 
"epoch": 5.554655870445345, + "grad_norm": 8.675625865701205, + "learning_rate": 4.911678214662795e-06, + "loss": 2.2234, + "step": 1372 + }, + { + "epoch": 5.558704453441296, + "grad_norm": 3.9912695390847595, + "learning_rate": 4.9046132973839895e-06, + "loss": 1.4514, + "step": 1373 + }, + { + "epoch": 5.562753036437247, + "grad_norm": 3.603893380101875, + "learning_rate": 4.897548570611153e-06, + "loss": 1.3266, + "step": 1374 + }, + { + "epoch": 5.566801619433198, + "grad_norm": 3.6938504736682054, + "learning_rate": 4.890484048453928e-06, + "loss": 1.704, + "step": 1375 + }, + { + "epoch": 5.57085020242915, + "grad_norm": 4.1771900748802135, + "learning_rate": 4.883419745021554e-06, + "loss": 1.3432, + "step": 1376 + }, + { + "epoch": 5.574898785425101, + "grad_norm": 4.029068029464602, + "learning_rate": 4.8763556744228324e-06, + "loss": 1.5548, + "step": 1377 + }, + { + "epoch": 5.578947368421053, + "grad_norm": 3.1723858445451776, + "learning_rate": 4.869291850766097e-06, + "loss": 1.3556, + "step": 1378 + }, + { + "epoch": 5.582995951417004, + "grad_norm": 3.9383901181787118, + "learning_rate": 4.862228288159191e-06, + "loss": 1.4828, + "step": 1379 + }, + { + "epoch": 5.587044534412955, + "grad_norm": 3.8742071296776883, + "learning_rate": 4.855165000709434e-06, + "loss": 1.4776, + "step": 1380 + }, + { + "epoch": 5.5910931174089065, + "grad_norm": 4.320505162169018, + "learning_rate": 4.848102002523597e-06, + "loss": 1.4632, + "step": 1381 + }, + { + "epoch": 5.5951417004048585, + "grad_norm": 3.8728016571115496, + "learning_rate": 4.841039307707878e-06, + "loss": 1.1957, + "step": 1382 + }, + { + "epoch": 5.59919028340081, + "grad_norm": 3.492753062395854, + "learning_rate": 4.833976930367859e-06, + "loss": 1.2615, + "step": 1383 + }, + { + "epoch": 5.603238866396762, + "grad_norm": 3.5488104026542513, + "learning_rate": 4.8269148846085e-06, + "loss": 1.3531, + "step": 1384 + }, + { + "epoch": 5.607287449392713, + "grad_norm": 4.068763646311401, + "learning_rate": 4.819853184534085e-06, + "loss": 1.1753, + "step": 1385 + }, + { + "epoch": 5.611336032388664, + "grad_norm": 4.377905274086795, + "learning_rate": 4.812791844248223e-06, + "loss": 1.4958, + "step": 1386 + }, + { + "epoch": 5.615384615384615, + "grad_norm": 3.6007003800569386, + "learning_rate": 4.80573087785379e-06, + "loss": 1.4974, + "step": 1387 + }, + { + "epoch": 5.619433198380567, + "grad_norm": 4.802311568406072, + "learning_rate": 4.798670299452926e-06, + "loss": 1.2282, + "step": 1388 + }, + { + "epoch": 5.623481781376518, + "grad_norm": 4.7745139328350135, + "learning_rate": 4.7916101231469886e-06, + "loss": 1.6082, + "step": 1389 + }, + { + "epoch": 5.62753036437247, + "grad_norm": 4.123643145041474, + "learning_rate": 4.784550363036539e-06, + "loss": 1.4134, + "step": 1390 + }, + { + "epoch": 5.631578947368421, + "grad_norm": 4.402507798104486, + "learning_rate": 4.7774910332213005e-06, + "loss": 1.6983, + "step": 1391 + }, + { + "epoch": 5.635627530364372, + "grad_norm": 3.8264895380697355, + "learning_rate": 4.770432147800141e-06, + "loss": 1.2975, + "step": 1392 + }, + { + "epoch": 5.6396761133603235, + "grad_norm": 4.517127158006528, + "learning_rate": 4.763373720871044e-06, + "loss": 1.5541, + "step": 1393 + }, + { + "epoch": 5.6437246963562755, + "grad_norm": 3.773516174749104, + "learning_rate": 4.756315766531069e-06, + "loss": 1.4461, + "step": 1394 + }, + { + "epoch": 5.647773279352227, + "grad_norm": 4.115306809751942, + "learning_rate": 4.749258298876338e-06, + "loss": 1.5498, + "step": 1395 + 
}, + { + "epoch": 5.651821862348179, + "grad_norm": 3.6874924730709413, + "learning_rate": 4.742201332001998e-06, + "loss": 1.333, + "step": 1396 + }, + { + "epoch": 5.65587044534413, + "grad_norm": 4.445009061040838, + "learning_rate": 4.735144880002199e-06, + "loss": 1.556, + "step": 1397 + }, + { + "epoch": 5.659919028340081, + "grad_norm": 4.819457563644938, + "learning_rate": 4.728088956970059e-06, + "loss": 1.3788, + "step": 1398 + }, + { + "epoch": 5.663967611336032, + "grad_norm": 3.9520027905188275, + "learning_rate": 4.721033576997641e-06, + "loss": 1.5347, + "step": 1399 + }, + { + "epoch": 5.668016194331984, + "grad_norm": 4.124422632263573, + "learning_rate": 4.713978754175926e-06, + "loss": 1.5292, + "step": 1400 + }, + { + "epoch": 5.672064777327935, + "grad_norm": 4.475410908220464, + "learning_rate": 4.706924502594777e-06, + "loss": 1.6549, + "step": 1401 + }, + { + "epoch": 5.676113360323887, + "grad_norm": 9.027913146446028, + "learning_rate": 4.69987083634292e-06, + "loss": 1.5814, + "step": 1402 + }, + { + "epoch": 5.680161943319838, + "grad_norm": 4.584849302385236, + "learning_rate": 4.692817769507912e-06, + "loss": 1.4982, + "step": 1403 + }, + { + "epoch": 5.684210526315789, + "grad_norm": 4.088441988479735, + "learning_rate": 4.685765316176111e-06, + "loss": 1.3453, + "step": 1404 + }, + { + "epoch": 5.6882591093117405, + "grad_norm": 3.94840157844417, + "learning_rate": 4.67871349043265e-06, + "loss": 1.4717, + "step": 1405 + }, + { + "epoch": 5.6923076923076925, + "grad_norm": 4.252654676588602, + "learning_rate": 4.671662306361409e-06, + "loss": 1.4891, + "step": 1406 + }, + { + "epoch": 5.696356275303644, + "grad_norm": 3.784433251453805, + "learning_rate": 4.664611778044988e-06, + "loss": 1.3408, + "step": 1407 + }, + { + "epoch": 5.700404858299595, + "grad_norm": 4.988371722598511, + "learning_rate": 4.657561919564675e-06, + "loss": 1.8095, + "step": 1408 + }, + { + "epoch": 5.704453441295547, + "grad_norm": 4.664322457086443, + "learning_rate": 4.6505127450004216e-06, + "loss": 1.6024, + "step": 1409 + }, + { + "epoch": 5.708502024291498, + "grad_norm": 4.600715197938257, + "learning_rate": 4.643464268430812e-06, + "loss": 1.2021, + "step": 1410 + }, + { + "epoch": 5.712550607287449, + "grad_norm": 3.9099782560794503, + "learning_rate": 4.636416503933038e-06, + "loss": 1.3472, + "step": 1411 + }, + { + "epoch": 5.716599190283401, + "grad_norm": 3.9111543599245757, + "learning_rate": 4.62936946558287e-06, + "loss": 1.4523, + "step": 1412 + }, + { + "epoch": 5.720647773279352, + "grad_norm": 4.6487019160659, + "learning_rate": 4.622323167454623e-06, + "loss": 1.2302, + "step": 1413 + }, + { + "epoch": 5.724696356275303, + "grad_norm": 4.4548900152472815, + "learning_rate": 4.6152776236211415e-06, + "loss": 1.4256, + "step": 1414 + }, + { + "epoch": 5.728744939271255, + "grad_norm": 4.058092491633072, + "learning_rate": 4.608232848153757e-06, + "loss": 1.6055, + "step": 1415 + }, + { + "epoch": 5.732793522267206, + "grad_norm": 4.025502584936106, + "learning_rate": 4.601188855122269e-06, + "loss": 1.3484, + "step": 1416 + }, + { + "epoch": 5.7368421052631575, + "grad_norm": 4.1244592308665275, + "learning_rate": 4.594145658594914e-06, + "loss": 1.4537, + "step": 1417 + }, + { + "epoch": 5.7408906882591095, + "grad_norm": 4.167306098888644, + "learning_rate": 4.587103272638339e-06, + "loss": 2.0785, + "step": 1418 + }, + { + "epoch": 5.744939271255061, + "grad_norm": 3.858307172453616, + "learning_rate": 4.580061711317571e-06, + "loss": 1.5669, + "step": 1419 + 
}, + { + "epoch": 5.748987854251012, + "grad_norm": 4.76966444820156, + "learning_rate": 4.57302098869599e-06, + "loss": 1.3901, + "step": 1420 + }, + { + "epoch": 5.753036437246964, + "grad_norm": 4.3778097624694166, + "learning_rate": 4.565981118835299e-06, + "loss": 1.291, + "step": 1421 + }, + { + "epoch": 5.757085020242915, + "grad_norm": 4.090411706131635, + "learning_rate": 4.558942115795502e-06, + "loss": 1.4406, + "step": 1422 + }, + { + "epoch": 5.761133603238866, + "grad_norm": 5.337161250566187, + "learning_rate": 4.551903993634869e-06, + "loss": 2.1851, + "step": 1423 + }, + { + "epoch": 5.765182186234818, + "grad_norm": 6.286779559937267, + "learning_rate": 4.5448667664099125e-06, + "loss": 1.9602, + "step": 1424 + }, + { + "epoch": 5.769230769230769, + "grad_norm": 6.765386541677961, + "learning_rate": 4.537830448175354e-06, + "loss": 1.8644, + "step": 1425 + }, + { + "epoch": 5.77327935222672, + "grad_norm": 4.009998051124011, + "learning_rate": 4.530795052984104e-06, + "loss": 1.3677, + "step": 1426 + }, + { + "epoch": 5.777327935222672, + "grad_norm": 4.067144464386327, + "learning_rate": 4.523760594887228e-06, + "loss": 1.6488, + "step": 1427 + }, + { + "epoch": 5.781376518218623, + "grad_norm": 3.900176884022236, + "learning_rate": 4.5167270879339165e-06, + "loss": 1.6378, + "step": 1428 + }, + { + "epoch": 5.7854251012145745, + "grad_norm": 4.307053870196715, + "learning_rate": 4.509694546171468e-06, + "loss": 1.458, + "step": 1429 + }, + { + "epoch": 5.7894736842105265, + "grad_norm": 4.202185719713703, + "learning_rate": 4.5026629836452445e-06, + "loss": 1.3863, + "step": 1430 + }, + { + "epoch": 5.793522267206478, + "grad_norm": 4.276979157413732, + "learning_rate": 4.495632414398659e-06, + "loss": 1.4133, + "step": 1431 + }, + { + "epoch": 5.797570850202429, + "grad_norm": 4.560387387278901, + "learning_rate": 4.488602852473138e-06, + "loss": 1.4313, + "step": 1432 + }, + { + "epoch": 5.801619433198381, + "grad_norm": 3.900998231009241, + "learning_rate": 4.481574311908096e-06, + "loss": 1.3065, + "step": 1433 + }, + { + "epoch": 5.805668016194332, + "grad_norm": 3.971785106076469, + "learning_rate": 4.4745468067409055e-06, + "loss": 1.1997, + "step": 1434 + }, + { + "epoch": 5.809716599190283, + "grad_norm": 4.230506562739517, + "learning_rate": 4.467520351006878e-06, + "loss": 1.5584, + "step": 1435 + }, + { + "epoch": 5.813765182186235, + "grad_norm": 5.12301466025395, + "learning_rate": 4.460494958739223e-06, + "loss": 1.4086, + "step": 1436 + }, + { + "epoch": 5.817813765182186, + "grad_norm": 4.360480527706543, + "learning_rate": 4.453470643969027e-06, + "loss": 1.2759, + "step": 1437 + }, + { + "epoch": 5.821862348178137, + "grad_norm": 11.774868013423882, + "learning_rate": 4.446447420725227e-06, + "loss": 2.2866, + "step": 1438 + }, + { + "epoch": 5.825910931174089, + "grad_norm": 23.795049320685568, + "learning_rate": 4.439425303034576e-06, + "loss": 3.4094, + "step": 1439 + }, + { + "epoch": 5.82995951417004, + "grad_norm": 4.607383270222987, + "learning_rate": 4.432404304921624e-06, + "loss": 1.3129, + "step": 1440 + }, + { + "epoch": 5.834008097165992, + "grad_norm": 4.67077067966415, + "learning_rate": 4.4253844404086785e-06, + "loss": 1.2285, + "step": 1441 + }, + { + "epoch": 5.838056680161944, + "grad_norm": 3.9312338569636394, + "learning_rate": 4.418365723515791e-06, + "loss": 1.286, + "step": 1442 + }, + { + "epoch": 5.842105263157895, + "grad_norm": 4.003272377775398, + "learning_rate": 4.411348168260713e-06, + "loss": 1.3394, + "step": 1443 + 
}, + { + "epoch": 5.846153846153846, + "grad_norm": 4.140441268173913, + "learning_rate": 4.404331788658882e-06, + "loss": 1.1712, + "step": 1444 + }, + { + "epoch": 5.850202429149798, + "grad_norm": 4.57761440040013, + "learning_rate": 4.397316598723385e-06, + "loss": 1.3548, + "step": 1445 + }, + { + "epoch": 5.854251012145749, + "grad_norm": 4.860966996025116, + "learning_rate": 4.390302612464934e-06, + "loss": 1.4071, + "step": 1446 + }, + { + "epoch": 5.8582995951417, + "grad_norm": 3.557234324926702, + "learning_rate": 4.383289843891835e-06, + "loss": 1.3334, + "step": 1447 + }, + { + "epoch": 5.862348178137652, + "grad_norm": 4.6167043083990515, + "learning_rate": 4.376278307009962e-06, + "loss": 1.332, + "step": 1448 + }, + { + "epoch": 5.866396761133603, + "grad_norm": 4.529476800833651, + "learning_rate": 4.369268015822733e-06, + "loss": 1.336, + "step": 1449 + }, + { + "epoch": 5.870445344129554, + "grad_norm": 5.460345634297291, + "learning_rate": 4.362258984331074e-06, + "loss": 1.7992, + "step": 1450 + }, + { + "epoch": 5.874493927125506, + "grad_norm": 4.852544977047948, + "learning_rate": 4.355251226533396e-06, + "loss": 1.7401, + "step": 1451 + }, + { + "epoch": 5.8785425101214575, + "grad_norm": 5.091561572959863, + "learning_rate": 4.348244756425569e-06, + "loss": 1.4945, + "step": 1452 + }, + { + "epoch": 5.882591093117409, + "grad_norm": 4.66519342749034, + "learning_rate": 4.341239588000887e-06, + "loss": 1.4193, + "step": 1453 + }, + { + "epoch": 5.886639676113361, + "grad_norm": 4.442060928034546, + "learning_rate": 4.334235735250047e-06, + "loss": 1.0274, + "step": 1454 + }, + { + "epoch": 5.890688259109312, + "grad_norm": 3.911256400148853, + "learning_rate": 4.327233212161118e-06, + "loss": 1.5401, + "step": 1455 + }, + { + "epoch": 5.894736842105263, + "grad_norm": 3.8807011184816846, + "learning_rate": 4.320232032719511e-06, + "loss": 1.5831, + "step": 1456 + }, + { + "epoch": 5.898785425101215, + "grad_norm": 3.58685678874274, + "learning_rate": 4.313232210907959e-06, + "loss": 1.3268, + "step": 1457 + }, + { + "epoch": 5.902834008097166, + "grad_norm": 4.318238652473736, + "learning_rate": 4.306233760706478e-06, + "loss": 1.3389, + "step": 1458 + }, + { + "epoch": 5.906882591093117, + "grad_norm": 4.611379978717958, + "learning_rate": 4.299236696092347e-06, + "loss": 1.4306, + "step": 1459 + }, + { + "epoch": 5.910931174089069, + "grad_norm": 3.900073554354451, + "learning_rate": 4.292241031040077e-06, + "loss": 1.1163, + "step": 1460 + }, + { + "epoch": 5.91497975708502, + "grad_norm": 4.550673982692945, + "learning_rate": 4.285246779521384e-06, + "loss": 1.2052, + "step": 1461 + }, + { + "epoch": 5.919028340080971, + "grad_norm": 4.574548958146505, + "learning_rate": 4.278253955505163e-06, + "loss": 1.213, + "step": 1462 + }, + { + "epoch": 5.923076923076923, + "grad_norm": 3.5603964829525725, + "learning_rate": 4.271262572957453e-06, + "loss": 1.5401, + "step": 1463 + }, + { + "epoch": 5.9271255060728745, + "grad_norm": 4.899646738920418, + "learning_rate": 4.264272645841419e-06, + "loss": 1.3832, + "step": 1464 + }, + { + "epoch": 5.931174089068826, + "grad_norm": 4.936217075017478, + "learning_rate": 4.2572841881173205e-06, + "loss": 1.3896, + "step": 1465 + }, + { + "epoch": 5.935222672064778, + "grad_norm": 4.841906645627207, + "learning_rate": 4.250297213742473e-06, + "loss": 1.173, + "step": 1466 + }, + { + "epoch": 5.939271255060729, + "grad_norm": 4.652957613099752, + "learning_rate": 4.243311736671239e-06, + "loss": 1.1544, + "step": 1467 + }, + { 
+ "epoch": 5.94331983805668, + "grad_norm": 5.5395351930289864, + "learning_rate": 4.236327770854987e-06, + "loss": 1.4593, + "step": 1468 + }, + { + "epoch": 5.947368421052632, + "grad_norm": 4.423876597754868, + "learning_rate": 4.229345330242067e-06, + "loss": 1.1935, + "step": 1469 + }, + { + "epoch": 5.951417004048583, + "grad_norm": 5.270192860869612, + "learning_rate": 4.222364428777786e-06, + "loss": 1.1325, + "step": 1470 + }, + { + "epoch": 5.955465587044534, + "grad_norm": 5.410786507887627, + "learning_rate": 4.2153850804043706e-06, + "loss": 1.3971, + "step": 1471 + }, + { + "epoch": 5.959514170040486, + "grad_norm": 4.884826922400209, + "learning_rate": 4.2084072990609505e-06, + "loss": 1.4698, + "step": 1472 + }, + { + "epoch": 5.963562753036437, + "grad_norm": 4.313211329480648, + "learning_rate": 4.201431098683527e-06, + "loss": 1.4382, + "step": 1473 + }, + { + "epoch": 5.967611336032388, + "grad_norm": 5.213303398147368, + "learning_rate": 4.194456493204939e-06, + "loss": 1.5175, + "step": 1474 + }, + { + "epoch": 5.97165991902834, + "grad_norm": 5.448304606946485, + "learning_rate": 4.187483496554844e-06, + "loss": 1.433, + "step": 1475 + }, + { + "epoch": 5.9757085020242915, + "grad_norm": 3.801193566372591, + "learning_rate": 4.1805121226596826e-06, + "loss": 1.4114, + "step": 1476 + }, + { + "epoch": 5.979757085020243, + "grad_norm": 4.17077172984551, + "learning_rate": 4.173542385442659e-06, + "loss": 1.4847, + "step": 1477 + }, + { + "epoch": 5.983805668016195, + "grad_norm": 3.8042786020089285, + "learning_rate": 4.166574298823707e-06, + "loss": 1.5417, + "step": 1478 + }, + { + "epoch": 5.987854251012146, + "grad_norm": 4.0974559638165795, + "learning_rate": 4.1596078767194615e-06, + "loss": 1.3383, + "step": 1479 + }, + { + "epoch": 5.991902834008097, + "grad_norm": 3.4327656830127844, + "learning_rate": 4.152643133043236e-06, + "loss": 1.3384, + "step": 1480 + }, + { + "epoch": 5.995951417004049, + "grad_norm": 3.615327810634163, + "learning_rate": 4.145680081704989e-06, + "loss": 1.6541, + "step": 1481 + }, + { + "epoch": 6.0, + "grad_norm": 3.8329106879075594, + "learning_rate": 4.138718736611302e-06, + "loss": 1.3694, + "step": 1482 + }, + { + "epoch": 6.004048582995951, + "grad_norm": 3.830450157141594, + "learning_rate": 4.131759111665349e-06, + "loss": 1.4049, + "step": 1483 + }, + { + "epoch": 6.008097165991903, + "grad_norm": 5.1111426342190684, + "learning_rate": 4.1248012207668635e-06, + "loss": 1.5639, + "step": 1484 + }, + { + "epoch": 6.012145748987854, + "grad_norm": 4.83681122900061, + "learning_rate": 4.117845077812122e-06, + "loss": 1.3693, + "step": 1485 + }, + { + "epoch": 6.016194331983805, + "grad_norm": 5.4329470747052255, + "learning_rate": 4.110890696693906e-06, + "loss": 1.5831, + "step": 1486 + }, + { + "epoch": 6.020242914979757, + "grad_norm": 4.6500916905003535, + "learning_rate": 4.103938091301479e-06, + "loss": 1.7881, + "step": 1487 + }, + { + "epoch": 6.0242914979757085, + "grad_norm": 4.885048703930011, + "learning_rate": 4.096987275520562e-06, + "loss": 1.6668, + "step": 1488 + }, + { + "epoch": 6.02834008097166, + "grad_norm": 4.13626291343727, + "learning_rate": 4.090038263233294e-06, + "loss": 1.3587, + "step": 1489 + }, + { + "epoch": 6.032388663967612, + "grad_norm": 4.904165295750069, + "learning_rate": 4.08309106831822e-06, + "loss": 1.3678, + "step": 1490 + }, + { + "epoch": 6.036437246963563, + "grad_norm": 4.636168977638758, + "learning_rate": 4.0761457046502515e-06, + "loss": 1.5829, + "step": 1491 + }, + { + 
"epoch": 6.040485829959514, + "grad_norm": 4.665143753358694, + "learning_rate": 4.0692021861006386e-06, + "loss": 1.382, + "step": 1492 + }, + { + "epoch": 6.044534412955466, + "grad_norm": 4.58626969694099, + "learning_rate": 4.062260526536955e-06, + "loss": 1.4891, + "step": 1493 + }, + { + "epoch": 6.048582995951417, + "grad_norm": 4.689483058767236, + "learning_rate": 4.055320739823057e-06, + "loss": 1.3764, + "step": 1494 + }, + { + "epoch": 6.052631578947368, + "grad_norm": 5.0699840890954535, + "learning_rate": 4.048382839819058e-06, + "loss": 1.4399, + "step": 1495 + }, + { + "epoch": 6.05668016194332, + "grad_norm": 4.582891853100069, + "learning_rate": 4.041446840381309e-06, + "loss": 1.2964, + "step": 1496 + }, + { + "epoch": 6.060728744939271, + "grad_norm": 4.596209939663152, + "learning_rate": 4.034512755362361e-06, + "loss": 1.4451, + "step": 1497 + }, + { + "epoch": 6.064777327935222, + "grad_norm": 5.077809534848778, + "learning_rate": 4.027580598610943e-06, + "loss": 1.3934, + "step": 1498 + }, + { + "epoch": 6.068825910931174, + "grad_norm": 5.121648526362897, + "learning_rate": 4.0206503839719335e-06, + "loss": 1.5479, + "step": 1499 + }, + { + "epoch": 6.0728744939271255, + "grad_norm": 4.611548299373776, + "learning_rate": 4.01372212528633e-06, + "loss": 1.4704, + "step": 1500 + }, + { + "epoch": 6.076923076923077, + "grad_norm": 5.312277841332635, + "learning_rate": 4.006795836391226e-06, + "loss": 1.4155, + "step": 1501 + }, + { + "epoch": 6.080971659919029, + "grad_norm": 4.964246172799465, + "learning_rate": 3.999871531119779e-06, + "loss": 1.4857, + "step": 1502 + }, + { + "epoch": 6.08502024291498, + "grad_norm": 4.070954622733409, + "learning_rate": 3.992949223301185e-06, + "loss": 1.4726, + "step": 1503 + }, + { + "epoch": 6.089068825910931, + "grad_norm": 4.91594481744365, + "learning_rate": 3.986028926760655e-06, + "loss": 1.4183, + "step": 1504 + }, + { + "epoch": 6.093117408906883, + "grad_norm": 4.691943755517188, + "learning_rate": 3.9791106553193746e-06, + "loss": 1.497, + "step": 1505 + }, + { + "epoch": 6.097165991902834, + "grad_norm": 4.475695489598384, + "learning_rate": 3.972194422794493e-06, + "loss": 1.2572, + "step": 1506 + }, + { + "epoch": 6.101214574898785, + "grad_norm": 4.947241370368582, + "learning_rate": 3.965280242999083e-06, + "loss": 1.4398, + "step": 1507 + }, + { + "epoch": 6.105263157894737, + "grad_norm": 5.319805507480639, + "learning_rate": 3.9583681297421194e-06, + "loss": 1.3871, + "step": 1508 + }, + { + "epoch": 6.109311740890688, + "grad_norm": 4.749559720069604, + "learning_rate": 3.951458096828449e-06, + "loss": 1.375, + "step": 1509 + }, + { + "epoch": 6.113360323886639, + "grad_norm": 5.727885976477068, + "learning_rate": 3.944550158058762e-06, + "loss": 1.3195, + "step": 1510 + }, + { + "epoch": 6.117408906882591, + "grad_norm": 5.227063382939529, + "learning_rate": 3.937644327229572e-06, + "loss": 1.2256, + "step": 1511 + }, + { + "epoch": 6.1214574898785425, + "grad_norm": 4.738297898420654, + "learning_rate": 3.930740618133173e-06, + "loss": 1.2919, + "step": 1512 + }, + { + "epoch": 6.125506072874494, + "grad_norm": 4.796528713602936, + "learning_rate": 3.923839044557632e-06, + "loss": 1.3028, + "step": 1513 + }, + { + "epoch": 6.129554655870446, + "grad_norm": 5.590663766511934, + "learning_rate": 3.916939620286743e-06, + "loss": 1.1917, + "step": 1514 + }, + { + "epoch": 6.133603238866397, + "grad_norm": 4.16713103068686, + "learning_rate": 3.9100423591000124e-06, + "loss": 1.54, + "step": 1515 + }, + { + 
"epoch": 6.137651821862348, + "grad_norm": 5.035939317822777, + "learning_rate": 3.903147274772624e-06, + "loss": 1.3571, + "step": 1516 + }, + { + "epoch": 6.1417004048583, + "grad_norm": 4.0009552855543955, + "learning_rate": 3.896254381075416e-06, + "loss": 1.1103, + "step": 1517 + }, + { + "epoch": 6.145748987854251, + "grad_norm": 5.217383616489112, + "learning_rate": 3.8893636917748455e-06, + "loss": 1.4538, + "step": 1518 + }, + { + "epoch": 6.149797570850202, + "grad_norm": 4.709807039436491, + "learning_rate": 3.882475220632975e-06, + "loss": 1.2834, + "step": 1519 + }, + { + "epoch": 6.153846153846154, + "grad_norm": 4.179087376153956, + "learning_rate": 3.875588981407433e-06, + "loss": 1.5023, + "step": 1520 + }, + { + "epoch": 6.157894736842105, + "grad_norm": 5.387448869675948, + "learning_rate": 3.86870498785139e-06, + "loss": 1.5494, + "step": 1521 + }, + { + "epoch": 6.161943319838056, + "grad_norm": 4.138048095732358, + "learning_rate": 3.861823253713535e-06, + "loss": 1.3442, + "step": 1522 + }, + { + "epoch": 6.165991902834008, + "grad_norm": 4.522673016609398, + "learning_rate": 3.854943792738037e-06, + "loss": 1.4306, + "step": 1523 + }, + { + "epoch": 6.17004048582996, + "grad_norm": 4.04807524846957, + "learning_rate": 3.848066618664534e-06, + "loss": 1.9855, + "step": 1524 + }, + { + "epoch": 6.174089068825911, + "grad_norm": 4.797553089745047, + "learning_rate": 3.841191745228091e-06, + "loss": 1.2562, + "step": 1525 + }, + { + "epoch": 6.178137651821863, + "grad_norm": 5.562886515767805, + "learning_rate": 3.834319186159179e-06, + "loss": 1.4532, + "step": 1526 + }, + { + "epoch": 6.182186234817814, + "grad_norm": 3.8582598799938315, + "learning_rate": 3.82744895518365e-06, + "loss": 1.2517, + "step": 1527 + }, + { + "epoch": 6.186234817813765, + "grad_norm": 4.976499846840885, + "learning_rate": 3.8205810660227e-06, + "loss": 1.4395, + "step": 1528 + }, + { + "epoch": 6.190283400809717, + "grad_norm": 5.013759086459238, + "learning_rate": 3.8137155323928526e-06, + "loss": 1.4579, + "step": 1529 + }, + { + "epoch": 6.194331983805668, + "grad_norm": 5.210004353191725, + "learning_rate": 3.8068523680059287e-06, + "loss": 1.6307, + "step": 1530 + }, + { + "epoch": 6.198380566801619, + "grad_norm": 4.444756027075356, + "learning_rate": 3.799991586569012e-06, + "loss": 1.6785, + "step": 1531 + }, + { + "epoch": 6.202429149797571, + "grad_norm": 4.581599022941181, + "learning_rate": 3.7931332017844302e-06, + "loss": 1.3911, + "step": 1532 + }, + { + "epoch": 6.206477732793522, + "grad_norm": 4.426732929526946, + "learning_rate": 3.786277227349724e-06, + "loss": 1.7226, + "step": 1533 + }, + { + "epoch": 6.2105263157894735, + "grad_norm": 4.573503321332187, + "learning_rate": 3.77942367695762e-06, + "loss": 1.7276, + "step": 1534 + }, + { + "epoch": 6.2145748987854255, + "grad_norm": 4.632474175205992, + "learning_rate": 3.7725725642960047e-06, + "loss": 1.4984, + "step": 1535 + }, + { + "epoch": 6.218623481781377, + "grad_norm": 5.004422527391663, + "learning_rate": 3.7657239030478927e-06, + "loss": 1.3822, + "step": 1536 + }, + { + "epoch": 6.222672064777328, + "grad_norm": 4.730329238431976, + "learning_rate": 3.758877706891407e-06, + "loss": 1.3005, + "step": 1537 + }, + { + "epoch": 6.22672064777328, + "grad_norm": 4.696618081800561, + "learning_rate": 3.752033989499742e-06, + "loss": 1.4995, + "step": 1538 + }, + { + "epoch": 6.230769230769231, + "grad_norm": 4.819216438393582, + "learning_rate": 3.7451927645411466e-06, + "loss": 1.2958, + "step": 1539 + }, + { + 
"epoch": 6.234817813765182, + "grad_norm": 5.4741629869641315, + "learning_rate": 3.7383540456788915e-06, + "loss": 1.5321, + "step": 1540 + }, + { + "epoch": 6.238866396761134, + "grad_norm": 5.271140694357475, + "learning_rate": 3.7315178465712364e-06, + "loss": 1.4701, + "step": 1541 + }, + { + "epoch": 6.242914979757085, + "grad_norm": 4.870369052928556, + "learning_rate": 3.7246841808714172e-06, + "loss": 1.4965, + "step": 1542 + }, + { + "epoch": 6.246963562753036, + "grad_norm": 4.627274116359122, + "learning_rate": 3.717853062227604e-06, + "loss": 1.3376, + "step": 1543 + }, + { + "epoch": 6.251012145748988, + "grad_norm": 4.862725711210235, + "learning_rate": 3.7110245042828786e-06, + "loss": 1.436, + "step": 1544 + }, + { + "epoch": 6.255060728744939, + "grad_norm": 4.948809530195508, + "learning_rate": 3.704198520675214e-06, + "loss": 1.3922, + "step": 1545 + }, + { + "epoch": 6.2591093117408905, + "grad_norm": 4.36897138423846, + "learning_rate": 3.69737512503744e-06, + "loss": 1.3391, + "step": 1546 + }, + { + "epoch": 6.2631578947368425, + "grad_norm": 4.774874457232701, + "learning_rate": 3.690554330997215e-06, + "loss": 1.1307, + "step": 1547 + }, + { + "epoch": 6.267206477732794, + "grad_norm": 4.560395256546156, + "learning_rate": 3.6837361521770056e-06, + "loss": 1.4205, + "step": 1548 + }, + { + "epoch": 6.271255060728745, + "grad_norm": 4.657377226532245, + "learning_rate": 3.6769206021940505e-06, + "loss": 1.4284, + "step": 1549 + }, + { + "epoch": 6.275303643724697, + "grad_norm": 4.523918352960143, + "learning_rate": 3.670107694660343e-06, + "loss": 1.4865, + "step": 1550 + }, + { + "epoch": 6.279352226720648, + "grad_norm": 6.060799013063325, + "learning_rate": 3.6632974431825965e-06, + "loss": 1.4177, + "step": 1551 + }, + { + "epoch": 6.283400809716599, + "grad_norm": 5.508975855268233, + "learning_rate": 3.656489861362218e-06, + "loss": 1.0975, + "step": 1552 + }, + { + "epoch": 6.287449392712551, + "grad_norm": 5.591620230854365, + "learning_rate": 3.6496849627952875e-06, + "loss": 1.2607, + "step": 1553 + }, + { + "epoch": 6.291497975708502, + "grad_norm": 5.501342695470275, + "learning_rate": 3.6428827610725203e-06, + "loss": 1.113, + "step": 1554 + }, + { + "epoch": 6.295546558704453, + "grad_norm": 5.371568603468503, + "learning_rate": 3.636083269779249e-06, + "loss": 1.3579, + "step": 1555 + }, + { + "epoch": 6.299595141700405, + "grad_norm": 4.658495618502483, + "learning_rate": 3.6292865024953945e-06, + "loss": 1.5612, + "step": 1556 + }, + { + "epoch": 6.303643724696356, + "grad_norm": 5.171922327948163, + "learning_rate": 3.622492472795432e-06, + "loss": 1.196, + "step": 1557 + }, + { + "epoch": 6.3076923076923075, + "grad_norm": 5.187630245267101, + "learning_rate": 3.615701194248375e-06, + "loss": 1.2403, + "step": 1558 + }, + { + "epoch": 6.3117408906882595, + "grad_norm": 4.739560149771274, + "learning_rate": 3.6089126804177373e-06, + "loss": 1.2748, + "step": 1559 + }, + { + "epoch": 6.315789473684211, + "grad_norm": 5.8421200692609405, + "learning_rate": 3.6021269448615148e-06, + "loss": 1.1801, + "step": 1560 + }, + { + "epoch": 6.319838056680162, + "grad_norm": 5.003939683781086, + "learning_rate": 3.595344001132154e-06, + "loss": 1.1334, + "step": 1561 + }, + { + "epoch": 6.323886639676114, + "grad_norm": 5.213320704625486, + "learning_rate": 3.5885638627765228e-06, + "loss": 1.1662, + "step": 1562 + }, + { + "epoch": 6.327935222672065, + "grad_norm": 5.12672208334294, + "learning_rate": 3.5817865433358902e-06, + "loss": 1.1897, + "step": 1563 
+ }, + { + "epoch": 6.331983805668016, + "grad_norm": 4.990310131147776, + "learning_rate": 3.5750120563458924e-06, + "loss": 1.2197, + "step": 1564 + }, + { + "epoch": 6.336032388663968, + "grad_norm": 5.404582388895142, + "learning_rate": 3.568240415336509e-06, + "loss": 1.2979, + "step": 1565 + }, + { + "epoch": 6.340080971659919, + "grad_norm": 4.459387759024826, + "learning_rate": 3.5614716338320384e-06, + "loss": 1.2379, + "step": 1566 + }, + { + "epoch": 6.34412955465587, + "grad_norm": 4.906670384808422, + "learning_rate": 3.554705725351063e-06, + "loss": 1.1656, + "step": 1567 + }, + { + "epoch": 6.348178137651822, + "grad_norm": 5.788345645390745, + "learning_rate": 3.547942703406433e-06, + "loss": 1.3082, + "step": 1568 + }, + { + "epoch": 6.352226720647773, + "grad_norm": 5.367912057539721, + "learning_rate": 3.5411825815052296e-06, + "loss": 1.313, + "step": 1569 + }, + { + "epoch": 6.3562753036437245, + "grad_norm": 5.326205519895874, + "learning_rate": 3.534425373148741e-06, + "loss": 0.9762, + "step": 1570 + }, + { + "epoch": 6.3603238866396765, + "grad_norm": 5.708844505808687, + "learning_rate": 3.52767109183244e-06, + "loss": 1.373, + "step": 1571 + }, + { + "epoch": 6.364372469635628, + "grad_norm": 4.876273122171325, + "learning_rate": 3.5209197510459526e-06, + "loss": 1.448, + "step": 1572 + }, + { + "epoch": 6.368421052631579, + "grad_norm": 4.935122614604545, + "learning_rate": 3.5141713642730305e-06, + "loss": 1.3476, + "step": 1573 + }, + { + "epoch": 6.372469635627531, + "grad_norm": 6.109929961302762, + "learning_rate": 3.507425944991529e-06, + "loss": 1.4072, + "step": 1574 + }, + { + "epoch": 6.376518218623482, + "grad_norm": 5.409803828147351, + "learning_rate": 3.5006835066733707e-06, + "loss": 1.0987, + "step": 1575 + }, + { + "epoch": 6.380566801619433, + "grad_norm": 5.907878971006872, + "learning_rate": 3.4939440627845305e-06, + "loss": 1.2467, + "step": 1576 + }, + { + "epoch": 6.384615384615385, + "grad_norm": 5.060588652380501, + "learning_rate": 3.4872076267850015e-06, + "loss": 1.0512, + "step": 1577 + }, + { + "epoch": 6.388663967611336, + "grad_norm": 6.199263715395586, + "learning_rate": 3.480474212128766e-06, + "loss": 1.1192, + "step": 1578 + }, + { + "epoch": 6.392712550607287, + "grad_norm": 5.68773960369221, + "learning_rate": 3.473743832263778e-06, + "loss": 1.2989, + "step": 1579 + }, + { + "epoch": 6.396761133603239, + "grad_norm": 6.5411566006758886, + "learning_rate": 3.4670165006319236e-06, + "loss": 1.1125, + "step": 1580 + }, + { + "epoch": 6.40080971659919, + "grad_norm": 4.779266992013558, + "learning_rate": 3.4602922306690062e-06, + "loss": 1.1461, + "step": 1581 + }, + { + "epoch": 6.4048582995951415, + "grad_norm": 4.983422698218311, + "learning_rate": 3.453571035804714e-06, + "loss": 1.1805, + "step": 1582 + }, + { + "epoch": 6.4089068825910935, + "grad_norm": 6.281439869347411, + "learning_rate": 3.4468529294625895e-06, + "loss": 1.2865, + "step": 1583 + }, + { + "epoch": 6.412955465587045, + "grad_norm": 5.447638251945489, + "learning_rate": 3.4401379250600124e-06, + "loss": 1.112, + "step": 1584 + }, + { + "epoch": 6.417004048582996, + "grad_norm": 6.031371603465583, + "learning_rate": 3.433426036008163e-06, + "loss": 1.4222, + "step": 1585 + }, + { + "epoch": 6.421052631578947, + "grad_norm": 6.344172383462025, + "learning_rate": 3.4267172757120005e-06, + "loss": 1.4558, + "step": 1586 + }, + { + "epoch": 6.425101214574899, + "grad_norm": 5.253990555737164, + "learning_rate": 3.420011657570238e-06, + "loss": 1.4408, + 
"step": 1587 + }, + { + "epoch": 6.42914979757085, + "grad_norm": 5.944240629250275, + "learning_rate": 3.413309194975309e-06, + "loss": 1.4281, + "step": 1588 + }, + { + "epoch": 6.433198380566802, + "grad_norm": 4.690048614883703, + "learning_rate": 3.406609901313349e-06, + "loss": 1.6038, + "step": 1589 + }, + { + "epoch": 6.437246963562753, + "grad_norm": 5.538761343018897, + "learning_rate": 3.39991378996416e-06, + "loss": 1.3818, + "step": 1590 + }, + { + "epoch": 6.441295546558704, + "grad_norm": 5.904913245197766, + "learning_rate": 3.393220874301193e-06, + "loss": 1.324, + "step": 1591 + }, + { + "epoch": 6.445344129554655, + "grad_norm": 4.935839021246995, + "learning_rate": 3.386531167691512e-06, + "loss": 1.569, + "step": 1592 + }, + { + "epoch": 6.449392712550607, + "grad_norm": 5.96200793571726, + "learning_rate": 3.379844683495775e-06, + "loss": 1.3697, + "step": 1593 + }, + { + "epoch": 6.4534412955465585, + "grad_norm": 5.74218375449931, + "learning_rate": 3.3731614350682045e-06, + "loss": 1.3591, + "step": 1594 + }, + { + "epoch": 6.4574898785425106, + "grad_norm": 5.819819829923634, + "learning_rate": 3.36648143575656e-06, + "loss": 1.7039, + "step": 1595 + }, + { + "epoch": 6.461538461538462, + "grad_norm": 7.530849687169004, + "learning_rate": 3.3598046989021073e-06, + "loss": 1.8161, + "step": 1596 + }, + { + "epoch": 6.465587044534413, + "grad_norm": 5.773184926893142, + "learning_rate": 3.3531312378396026e-06, + "loss": 1.506, + "step": 1597 + }, + { + "epoch": 6.469635627530364, + "grad_norm": 5.095389257052112, + "learning_rate": 3.3464610658972584e-06, + "loss": 1.5432, + "step": 1598 + }, + { + "epoch": 6.473684210526316, + "grad_norm": 4.864855264853332, + "learning_rate": 3.3397941963967162e-06, + "loss": 1.502, + "step": 1599 + }, + { + "epoch": 6.477732793522267, + "grad_norm": 6.57365780985993, + "learning_rate": 3.333130642653024e-06, + "loss": 1.5104, + "step": 1600 + }, + { + "epoch": 6.481781376518219, + "grad_norm": 4.515682901106996, + "learning_rate": 3.326470417974604e-06, + "loss": 1.4218, + "step": 1601 + }, + { + "epoch": 6.48582995951417, + "grad_norm": 5.044572956084713, + "learning_rate": 3.3198135356632353e-06, + "loss": 1.3685, + "step": 1602 + }, + { + "epoch": 6.489878542510121, + "grad_norm": 6.114856919793026, + "learning_rate": 3.313160009014017e-06, + "loss": 1.3026, + "step": 1603 + }, + { + "epoch": 6.493927125506072, + "grad_norm": 6.169486015477941, + "learning_rate": 3.3065098513153473e-06, + "loss": 1.2931, + "step": 1604 + }, + { + "epoch": 6.497975708502024, + "grad_norm": 4.671907121620305, + "learning_rate": 3.299863075848898e-06, + "loss": 1.203, + "step": 1605 + }, + { + "epoch": 6.502024291497976, + "grad_norm": 5.556963177721959, + "learning_rate": 3.2932196958895816e-06, + "loss": 1.0369, + "step": 1606 + }, + { + "epoch": 6.506072874493928, + "grad_norm": 6.041668515369977, + "learning_rate": 3.2865797247055354e-06, + "loss": 1.4057, + "step": 1607 + }, + { + "epoch": 6.510121457489879, + "grad_norm": 5.622532023329238, + "learning_rate": 3.2799431755580814e-06, + "loss": 1.3496, + "step": 1608 + }, + { + "epoch": 6.51417004048583, + "grad_norm": 4.164381858883872, + "learning_rate": 3.2733100617017126e-06, + "loss": 1.3227, + "step": 1609 + }, + { + "epoch": 6.518218623481781, + "grad_norm": 5.565945707547888, + "learning_rate": 3.266680396384061e-06, + "loss": 1.3552, + "step": 1610 + }, + { + "epoch": 6.522267206477733, + "grad_norm": 6.1834705735871855, + "learning_rate": 3.2600541928458664e-06, + "loss": 1.2943, + 
"step": 1611 + }, + { + "epoch": 6.526315789473684, + "grad_norm": 6.088692550743796, + "learning_rate": 3.2534314643209597e-06, + "loss": 1.132, + "step": 1612 + }, + { + "epoch": 6.530364372469636, + "grad_norm": 5.618439646445004, + "learning_rate": 3.2468122240362287e-06, + "loss": 1.2075, + "step": 1613 + }, + { + "epoch": 6.534412955465587, + "grad_norm": 6.117262117177891, + "learning_rate": 3.2401964852115954e-06, + "loss": 1.2648, + "step": 1614 + }, + { + "epoch": 6.538461538461538, + "grad_norm": 5.488938699999532, + "learning_rate": 3.233584261059991e-06, + "loss": 1.5484, + "step": 1615 + }, + { + "epoch": 6.5425101214574894, + "grad_norm": 4.965386729846099, + "learning_rate": 3.226975564787322e-06, + "loss": 1.486, + "step": 1616 + }, + { + "epoch": 6.5465587044534415, + "grad_norm": 18.62707478890267, + "learning_rate": 3.2203704095924536e-06, + "loss": 2.0005, + "step": 1617 + }, + { + "epoch": 6.550607287449393, + "grad_norm": 9.55782070389464, + "learning_rate": 3.213768808667177e-06, + "loss": 1.7957, + "step": 1618 + }, + { + "epoch": 6.554655870445345, + "grad_norm": 9.720812117855125, + "learning_rate": 3.2071707751961838e-06, + "loss": 2.144, + "step": 1619 + }, + { + "epoch": 6.558704453441296, + "grad_norm": 5.342719089296339, + "learning_rate": 3.200576322357044e-06, + "loss": 1.3436, + "step": 1620 + }, + { + "epoch": 6.562753036437247, + "grad_norm": 4.64296304030207, + "learning_rate": 3.1939854633201727e-06, + "loss": 1.2129, + "step": 1621 + }, + { + "epoch": 6.566801619433198, + "grad_norm": 4.806685098084674, + "learning_rate": 3.187398211248811e-06, + "loss": 1.5973, + "step": 1622 + }, + { + "epoch": 6.57085020242915, + "grad_norm": 5.159929877257071, + "learning_rate": 3.1808145792989914e-06, + "loss": 1.2471, + "step": 1623 + }, + { + "epoch": 6.574898785425101, + "grad_norm": 4.881818219879603, + "learning_rate": 3.1742345806195196e-06, + "loss": 1.4285, + "step": 1624 + }, + { + "epoch": 6.578947368421053, + "grad_norm": 4.079931587528226, + "learning_rate": 3.1676582283519454e-06, + "loss": 1.2586, + "step": 1625 + }, + { + "epoch": 6.582995951417004, + "grad_norm": 5.067504014062879, + "learning_rate": 3.1610855356305354e-06, + "loss": 1.3673, + "step": 1626 + }, + { + "epoch": 6.587044534412955, + "grad_norm": 4.954367681109359, + "learning_rate": 3.1545165155822453e-06, + "loss": 1.3681, + "step": 1627 + }, + { + "epoch": 6.5910931174089065, + "grad_norm": 5.605429782413848, + "learning_rate": 3.1479511813267006e-06, + "loss": 1.3636, + "step": 1628 + }, + { + "epoch": 6.5951417004048585, + "grad_norm": 4.958815188693233, + "learning_rate": 3.141389545976159e-06, + "loss": 1.0862, + "step": 1629 + }, + { + "epoch": 6.59919028340081, + "grad_norm": 4.427052082332069, + "learning_rate": 3.134831622635496e-06, + "loss": 1.1727, + "step": 1630 + }, + { + "epoch": 6.603238866396762, + "grad_norm": 4.453414798921641, + "learning_rate": 3.1282774244021717e-06, + "loss": 1.2508, + "step": 1631 + }, + { + "epoch": 6.607287449392713, + "grad_norm": 5.086142474437995, + "learning_rate": 3.1217269643662063e-06, + "loss": 1.0497, + "step": 1632 + }, + { + "epoch": 6.611336032388664, + "grad_norm": 5.252726223787453, + "learning_rate": 3.115180255610154e-06, + "loss": 1.352, + "step": 1633 + }, + { + "epoch": 6.615384615384615, + "grad_norm": 4.618158368136601, + "learning_rate": 3.1086373112090762e-06, + "loss": 1.3803, + "step": 1634 + }, + { + "epoch": 6.619433198380567, + "grad_norm": 5.797639722448207, + "learning_rate": 3.1020981442305187e-06, + "loss": 
1.1187, + "step": 1635 + }, + { + "epoch": 6.623481781376518, + "grad_norm": 5.892627204449989, + "learning_rate": 3.095562767734481e-06, + "loss": 1.4805, + "step": 1636 + }, + { + "epoch": 6.62753036437247, + "grad_norm": 4.995284041826363, + "learning_rate": 3.089031194773392e-06, + "loss": 1.2999, + "step": 1637 + }, + { + "epoch": 6.631578947368421, + "grad_norm": 5.424221812925032, + "learning_rate": 3.082503438392086e-06, + "loss": 1.5812, + "step": 1638 + }, + { + "epoch": 6.635627530364372, + "grad_norm": 4.773802128035484, + "learning_rate": 3.0759795116277723e-06, + "loss": 1.1799, + "step": 1639 + }, + { + "epoch": 6.6396761133603235, + "grad_norm": 5.573651737656804, + "learning_rate": 3.069459427510014e-06, + "loss": 1.4498, + "step": 1640 + }, + { + "epoch": 6.6437246963562755, + "grad_norm": 4.742522853775909, + "learning_rate": 3.0629431990607e-06, + "loss": 1.3417, + "step": 1641 + }, + { + "epoch": 6.647773279352227, + "grad_norm": 5.292712065001537, + "learning_rate": 3.056430839294015e-06, + "loss": 1.45, + "step": 1642 + }, + { + "epoch": 6.651821862348179, + "grad_norm": 4.5550435224065335, + "learning_rate": 3.049922361216422e-06, + "loss": 1.2275, + "step": 1643 + }, + { + "epoch": 6.65587044534413, + "grad_norm": 5.633966620000232, + "learning_rate": 3.043417777826627e-06, + "loss": 1.4383, + "step": 1644 + }, + { + "epoch": 6.659919028340081, + "grad_norm": 5.977264180838899, + "learning_rate": 3.036917102115561e-06, + "loss": 1.2502, + "step": 1645 + }, + { + "epoch": 6.663967611336032, + "grad_norm": 5.050359221231472, + "learning_rate": 3.0304203470663507e-06, + "loss": 1.4135, + "step": 1646 + }, + { + "epoch": 6.668016194331984, + "grad_norm": 5.3518078778159435, + "learning_rate": 3.023927525654288e-06, + "loss": 1.4064, + "step": 1647 + }, + { + "epoch": 6.672064777327935, + "grad_norm": 5.575471681679863, + "learning_rate": 3.017438650846815e-06, + "loss": 1.5635, + "step": 1648 + }, + { + "epoch": 6.676113360323887, + "grad_norm": 4.758858070207382, + "learning_rate": 3.0109537356034856e-06, + "loss": 1.5306, + "step": 1649 + }, + { + "epoch": 6.680161943319838, + "grad_norm": 5.646630068141117, + "learning_rate": 3.0044727928759487e-06, + "loss": 1.3876, + "step": 1650 + }, + { + "epoch": 6.684210526315789, + "grad_norm": 5.245224305674558, + "learning_rate": 2.9979958356079195e-06, + "loss": 1.2497, + "step": 1651 + }, + { + "epoch": 6.6882591093117405, + "grad_norm": 4.976281468525487, + "learning_rate": 2.991522876735154e-06, + "loss": 1.3506, + "step": 1652 + }, + { + "epoch": 6.6923076923076925, + "grad_norm": 5.375432065764104, + "learning_rate": 2.98505392918542e-06, + "loss": 1.3676, + "step": 1653 + }, + { + "epoch": 6.696356275303644, + "grad_norm": 4.849539565202561, + "learning_rate": 2.978589005878476e-06, + "loss": 1.2348, + "step": 1654 + }, + { + "epoch": 6.700404858299595, + "grad_norm": 6.373782199327902, + "learning_rate": 2.9721281197260427e-06, + "loss": 1.6916, + "step": 1655 + }, + { + "epoch": 6.704453441295547, + "grad_norm": 5.797065404713431, + "learning_rate": 2.965671283631778e-06, + "loss": 1.4917, + "step": 1656 + }, + { + "epoch": 6.708502024291498, + "grad_norm": 5.561054188837486, + "learning_rate": 2.959218510491252e-06, + "loss": 1.1089, + "step": 1657 + }, + { + "epoch": 6.712550607287449, + "grad_norm": 4.841361841602314, + "learning_rate": 2.9527698131919156e-06, + "loss": 1.2314, + "step": 1658 + }, + { + "epoch": 6.716599190283401, + "grad_norm": 4.961647413029597, + "learning_rate": 2.9463252046130884e-06, + 
"loss": 1.3488, + "step": 1659 + }, + { + "epoch": 6.720647773279352, + "grad_norm": 6.030520417168003, + "learning_rate": 2.9398846976259136e-06, + "loss": 1.1124, + "step": 1660 + }, + { + "epoch": 6.724696356275303, + "grad_norm": 5.376150681226648, + "learning_rate": 2.9334483050933506e-06, + "loss": 1.3305, + "step": 1661 + }, + { + "epoch": 6.728744939271255, + "grad_norm": 4.997899902629033, + "learning_rate": 2.9270160398701387e-06, + "loss": 1.4987, + "step": 1662 + }, + { + "epoch": 6.732793522267206, + "grad_norm": 5.003930672267123, + "learning_rate": 2.920587914802772e-06, + "loss": 1.2143, + "step": 1663 + }, + { + "epoch": 6.7368421052631575, + "grad_norm": 5.099065318842715, + "learning_rate": 2.91416394272948e-06, + "loss": 1.3239, + "step": 1664 + }, + { + "epoch": 6.7408906882591095, + "grad_norm": 5.065783888856437, + "learning_rate": 2.907744136480194e-06, + "loss": 1.9473, + "step": 1665 + }, + { + "epoch": 6.744939271255061, + "grad_norm": 4.828636889161134, + "learning_rate": 2.901328508876531e-06, + "loss": 1.4691, + "step": 1666 + }, + { + "epoch": 6.748987854251012, + "grad_norm": 5.887659634670204, + "learning_rate": 2.894917072731753e-06, + "loss": 1.2826, + "step": 1667 + }, + { + "epoch": 6.753036437246964, + "grad_norm": 5.421606621102472, + "learning_rate": 2.88850984085076e-06, + "loss": 1.1948, + "step": 1668 + }, + { + "epoch": 6.757085020242915, + "grad_norm": 5.2144985221753615, + "learning_rate": 2.8821068260300505e-06, + "loss": 1.3159, + "step": 1669 + }, + { + "epoch": 6.761133603238866, + "grad_norm": 6.35388499196324, + "learning_rate": 2.8757080410577042e-06, + "loss": 2.064, + "step": 1670 + }, + { + "epoch": 6.765182186234818, + "grad_norm": 6.533956411029131, + "learning_rate": 2.8693134987133464e-06, + "loss": 1.8202, + "step": 1671 + }, + { + "epoch": 6.769230769230769, + "grad_norm": 7.388143224357747, + "learning_rate": 2.8629232117681354e-06, + "loss": 1.7417, + "step": 1672 + }, + { + "epoch": 6.77327935222672, + "grad_norm": 4.928577825497661, + "learning_rate": 2.8565371929847286e-06, + "loss": 1.2534, + "step": 1673 + }, + { + "epoch": 6.777327935222672, + "grad_norm": 5.033866214652084, + "learning_rate": 2.8501554551172613e-06, + "loss": 1.5421, + "step": 1674 + }, + { + "epoch": 6.781376518218623, + "grad_norm": 4.739685237811317, + "learning_rate": 2.843778010911311e-06, + "loss": 1.5263, + "step": 1675 + }, + { + "epoch": 6.7854251012145745, + "grad_norm": 5.136372890884333, + "learning_rate": 2.83740487310389e-06, + "loss": 1.3327, + "step": 1676 + }, + { + "epoch": 6.7894736842105265, + "grad_norm": 4.941908173697463, + "learning_rate": 2.8310360544234057e-06, + "loss": 1.2674, + "step": 1677 + }, + { + "epoch": 6.793522267206478, + "grad_norm": 5.393271110505753, + "learning_rate": 2.8246715675896354e-06, + "loss": 1.2836, + "step": 1678 + }, + { + "epoch": 6.797570850202429, + "grad_norm": 5.454849249006355, + "learning_rate": 2.81831142531371e-06, + "loss": 1.3156, + "step": 1679 + }, + { + "epoch": 6.801619433198381, + "grad_norm": 4.939088394387297, + "learning_rate": 2.811955640298083e-06, + "loss": 1.2068, + "step": 1680 + }, + { + "epoch": 6.805668016194332, + "grad_norm": 4.809916773128364, + "learning_rate": 2.8056042252365046e-06, + "loss": 1.0997, + "step": 1681 + }, + { + "epoch": 6.809716599190283, + "grad_norm": 5.329896547784682, + "learning_rate": 2.7992571928139984e-06, + "loss": 1.4471, + "step": 1682 + }, + { + "epoch": 6.813765182186235, + "grad_norm": 6.511906878209839, + "learning_rate": 
2.7929145557068303e-06, + "loss": 1.2595, + "step": 1683 + }, + { + "epoch": 6.817813765182186, + "grad_norm": 5.372364570471038, + "learning_rate": 2.786576326582493e-06, + "loss": 1.1699, + "step": 1684 + }, + { + "epoch": 6.821862348178137, + "grad_norm": 13.8652581579135, + "learning_rate": 2.780242518099675e-06, + "loss": 2.2106, + "step": 1685 + }, + { + "epoch": 6.825910931174089, + "grad_norm": 25.171093577196388, + "learning_rate": 2.7739131429082373e-06, + "loss": 3.2586, + "step": 1686 + }, + { + "epoch": 6.82995951417004, + "grad_norm": 5.726221697590718, + "learning_rate": 2.7675882136491795e-06, + "loss": 1.1889, + "step": 1687 + }, + { + "epoch": 6.834008097165992, + "grad_norm": 5.969801910273205, + "learning_rate": 2.761267742954629e-06, + "loss": 1.1408, + "step": 1688 + }, + { + "epoch": 6.838056680161944, + "grad_norm": 5.061214863990714, + "learning_rate": 2.7549517434478063e-06, + "loss": 1.1687, + "step": 1689 + }, + { + "epoch": 6.842105263157895, + "grad_norm": 4.867474293725249, + "learning_rate": 2.7486402277430026e-06, + "loss": 1.2449, + "step": 1690 + }, + { + "epoch": 6.846153846153846, + "grad_norm": 5.1018055774076645, + "learning_rate": 2.7423332084455543e-06, + "loss": 1.0478, + "step": 1691 + }, + { + "epoch": 6.850202429149798, + "grad_norm": 6.018705752891283, + "learning_rate": 2.736030698151815e-06, + "loss": 1.2496, + "step": 1692 + }, + { + "epoch": 6.854251012145749, + "grad_norm": 6.104939352615399, + "learning_rate": 2.7297327094491344e-06, + "loss": 1.287, + "step": 1693 + }, + { + "epoch": 6.8582995951417, + "grad_norm": 4.340656711987505, + "learning_rate": 2.723439254915834e-06, + "loss": 1.2266, + "step": 1694 + }, + { + "epoch": 6.862348178137652, + "grad_norm": 5.698807470646283, + "learning_rate": 2.717150347121177e-06, + "loss": 1.2273, + "step": 1695 + }, + { + "epoch": 6.866396761133603, + "grad_norm": 5.5042411488110154, + "learning_rate": 2.710865998625348e-06, + "loss": 1.2081, + "step": 1696 + }, + { + "epoch": 6.870445344129554, + "grad_norm": 6.8240067723829405, + "learning_rate": 2.704586221979422e-06, + "loss": 1.6486, + "step": 1697 + }, + { + "epoch": 6.874493927125506, + "grad_norm": 5.905111755452213, + "learning_rate": 2.698311029725346e-06, + "loss": 1.5976, + "step": 1698 + }, + { + "epoch": 6.8785425101214575, + "grad_norm": 6.1571466759316, + "learning_rate": 2.6920404343959106e-06, + "loss": 1.3605, + "step": 1699 + }, + { + "epoch": 6.882591093117409, + "grad_norm": 5.716713309024074, + "learning_rate": 2.6857744485147286e-06, + "loss": 1.2964, + "step": 1700 + }, + { + "epoch": 6.886639676113361, + "grad_norm": 5.42925803199323, + "learning_rate": 2.6795130845961993e-06, + "loss": 0.9267, + "step": 1701 + }, + { + "epoch": 6.890688259109312, + "grad_norm": 4.919365319165041, + "learning_rate": 2.673256355145499e-06, + "loss": 1.4449, + "step": 1702 + }, + { + "epoch": 6.894736842105263, + "grad_norm": 4.863542774795551, + "learning_rate": 2.667004272658541e-06, + "loss": 1.4657, + "step": 1703 + }, + { + "epoch": 6.898785425101215, + "grad_norm": 4.299136007306504, + "learning_rate": 2.660756849621962e-06, + "loss": 1.2369, + "step": 1704 + }, + { + "epoch": 6.902834008097166, + "grad_norm": 5.213129071990759, + "learning_rate": 2.6545140985130934e-06, + "loss": 1.2244, + "step": 1705 + }, + { + "epoch": 6.906882591093117, + "grad_norm": 5.578872418777055, + "learning_rate": 2.6482760317999338e-06, + "loss": 1.2811, + "step": 1706 + }, + { + "epoch": 6.910931174089069, + "grad_norm": 4.626194423109011, + 
"learning_rate": 2.642042661941129e-06, + "loss": 1.0198, + "step": 1707 + }, + { + "epoch": 6.91497975708502, + "grad_norm": 5.352887557319016, + "learning_rate": 2.635814001385938e-06, + "loss": 1.1012, + "step": 1708 + }, + { + "epoch": 6.919028340080971, + "grad_norm": 5.579613506703107, + "learning_rate": 2.629590062574221e-06, + "loss": 1.1085, + "step": 1709 + }, + { + "epoch": 6.923076923076923, + "grad_norm": 4.252011072382573, + "learning_rate": 2.623370857936404e-06, + "loss": 1.431, + "step": 1710 + }, + { + "epoch": 6.9271255060728745, + "grad_norm": 5.916388957924838, + "learning_rate": 2.6171563998934605e-06, + "loss": 1.2774, + "step": 1711 + }, + { + "epoch": 6.931174089068826, + "grad_norm": 5.953432162823518, + "learning_rate": 2.610946700856885e-06, + "loss": 1.2618, + "step": 1712 + }, + { + "epoch": 6.935222672064778, + "grad_norm": 6.19929364838639, + "learning_rate": 2.604741773228661e-06, + "loss": 1.0577, + "step": 1713 + }, + { + "epoch": 6.939271255060729, + "grad_norm": 5.789164804068839, + "learning_rate": 2.5985416294012487e-06, + "loss": 1.0688, + "step": 1714 + }, + { + "epoch": 6.94331983805668, + "grad_norm": 6.659571736165462, + "learning_rate": 2.592346281757552e-06, + "loss": 1.3636, + "step": 1715 + }, + { + "epoch": 6.947368421052632, + "grad_norm": 5.314697446259228, + "learning_rate": 2.586155742670897e-06, + "loss": 1.0952, + "step": 1716 + }, + { + "epoch": 6.951417004048583, + "grad_norm": 6.659337503952005, + "learning_rate": 2.5799700245050074e-06, + "loss": 1.0229, + "step": 1717 + }, + { + "epoch": 6.955465587044534, + "grad_norm": 6.65312440022192, + "learning_rate": 2.5737891396139713e-06, + "loss": 1.3201, + "step": 1718 + }, + { + "epoch": 6.959514170040486, + "grad_norm": 5.938881485697329, + "learning_rate": 2.5676131003422317e-06, + "loss": 1.3962, + "step": 1719 + }, + { + "epoch": 6.963562753036437, + "grad_norm": 5.4389936951171025, + "learning_rate": 2.561441919024551e-06, + "loss": 1.346, + "step": 1720 + }, + { + "epoch": 6.967611336032388, + "grad_norm": 6.814603646499591, + "learning_rate": 2.5552756079859904e-06, + "loss": 1.3755, + "step": 1721 + }, + { + "epoch": 6.97165991902834, + "grad_norm": 6.557034047725967, + "learning_rate": 2.549114179541884e-06, + "loss": 1.2917, + "step": 1722 + }, + { + "epoch": 6.9757085020242915, + "grad_norm": 4.666089006915814, + "learning_rate": 2.542957645997811e-06, + "loss": 1.3178, + "step": 1723 + }, + { + "epoch": 6.979757085020243, + "grad_norm": 5.4101007526641, + "learning_rate": 2.5368060196495785e-06, + "loss": 1.3848, + "step": 1724 + }, + { + "epoch": 6.983805668016195, + "grad_norm": 5.003638917729553, + "learning_rate": 2.530659312783192e-06, + "loss": 1.4391, + "step": 1725 + }, + { + "epoch": 6.987854251012146, + "grad_norm": 4.982884862825928, + "learning_rate": 2.5245175376748334e-06, + "loss": 1.2329, + "step": 1726 + }, + { + "epoch": 6.991902834008097, + "grad_norm": 4.383040697186735, + "learning_rate": 2.5183807065908296e-06, + "loss": 1.2466, + "step": 1727 + }, + { + "epoch": 6.995951417004049, + "grad_norm": 4.833585025134396, + "learning_rate": 2.512248831787639e-06, + "loss": 1.5637, + "step": 1728 + }, + { + "epoch": 7.0, + "grad_norm": 4.848560799578388, + "learning_rate": 2.5061219255118186e-06, + "loss": 1.2677, + "step": 1729 + }, + { + "epoch": 7.004048582995951, + "grad_norm": 4.901375359150507, + "learning_rate": 2.5000000000000015e-06, + "loss": 1.3023, + "step": 1730 + }, + { + "epoch": 7.008097165991903, + "grad_norm": 6.545083705424055, + 
"learning_rate": 2.4938830674788756e-06, + "loss": 1.4651, + "step": 1731 + }, + { + "epoch": 7.012145748987854, + "grad_norm": 6.141277943301318, + "learning_rate": 2.4877711401651562e-06, + "loss": 1.2554, + "step": 1732 + }, + { + "epoch": 7.016194331983805, + "grad_norm": 6.544269798324027, + "learning_rate": 2.4816642302655634e-06, + "loss": 1.479, + "step": 1733 + }, + { + "epoch": 7.020242914979757, + "grad_norm": 5.746379418360751, + "learning_rate": 2.475562349976791e-06, + "loss": 1.656, + "step": 1734 + }, + { + "epoch": 7.0242914979757085, + "grad_norm": 6.035436258524213, + "learning_rate": 2.4694655114854936e-06, + "loss": 1.5592, + "step": 1735 + }, + { + "epoch": 7.02834008097166, + "grad_norm": 5.223633858026752, + "learning_rate": 2.4633737269682546e-06, + "loss": 1.2619, + "step": 1736 + }, + { + "epoch": 7.032388663967612, + "grad_norm": 5.890887028411126, + "learning_rate": 2.4572870085915628e-06, + "loss": 1.2686, + "step": 1737 + }, + { + "epoch": 7.036437246963563, + "grad_norm": 5.4867419263331785, + "learning_rate": 2.4512053685117916e-06, + "loss": 1.4711, + "step": 1738 + }, + { + "epoch": 7.040485829959514, + "grad_norm": 5.856066296731616, + "learning_rate": 2.445128818875166e-06, + "loss": 1.2784, + "step": 1739 + }, + { + "epoch": 7.044534412955466, + "grad_norm": 5.685747261263775, + "learning_rate": 2.4390573718177507e-06, + "loss": 1.4178, + "step": 1740 + }, + { + "epoch": 7.048582995951417, + "grad_norm": 5.580589694434444, + "learning_rate": 2.4329910394654167e-06, + "loss": 1.2819, + "step": 1741 + }, + { + "epoch": 7.052631578947368, + "grad_norm": 6.1734653161832345, + "learning_rate": 2.4269298339338205e-06, + "loss": 1.3334, + "step": 1742 + }, + { + "epoch": 7.05668016194332, + "grad_norm": 5.647156467107709, + "learning_rate": 2.4208737673283818e-06, + "loss": 1.1932, + "step": 1743 + }, + { + "epoch": 7.060728744939271, + "grad_norm": 5.571147412614646, + "learning_rate": 2.414822851744249e-06, + "loss": 1.3354, + "step": 1744 + }, + { + "epoch": 7.064777327935222, + "grad_norm": 6.222421117643815, + "learning_rate": 2.408777099266291e-06, + "loss": 1.2747, + "step": 1745 + }, + { + "epoch": 7.068825910931174, + "grad_norm": 6.251859136759403, + "learning_rate": 2.4027365219690617e-06, + "loss": 1.444, + "step": 1746 + }, + { + "epoch": 7.0728744939271255, + "grad_norm": 5.555376265690771, + "learning_rate": 2.3967011319167804e-06, + "loss": 1.3478, + "step": 1747 + }, + { + "epoch": 7.076923076923077, + "grad_norm": 6.222350987405198, + "learning_rate": 2.3906709411633073e-06, + "loss": 1.3069, + "step": 1748 + }, + { + "epoch": 7.080971659919029, + "grad_norm": 5.290175219718593, + "learning_rate": 2.384645961752113e-06, + "loss": 1.4103, + "step": 1749 + }, + { + "epoch": 7.08502024291498, + "grad_norm": 4.882921637643386, + "learning_rate": 2.378626205716265e-06, + "loss": 1.3698, + "step": 1750 + }, + { + "epoch": 7.089068825910931, + "grad_norm": 5.893035167375215, + "learning_rate": 2.3726116850783987e-06, + "loss": 1.3153, + "step": 1751 + }, + { + "epoch": 7.093117408906883, + "grad_norm": 5.440462022348463, + "learning_rate": 2.3666024118506937e-06, + "loss": 1.3918, + "step": 1752 + }, + { + "epoch": 7.097165991902834, + "grad_norm": 5.298541554798929, + "learning_rate": 2.3605983980348446e-06, + "loss": 1.1493, + "step": 1753 + }, + { + "epoch": 7.101214574898785, + "grad_norm": 5.873912109321258, + "learning_rate": 2.354599655622049e-06, + "loss": 1.3419, + "step": 1754 + }, + { + "epoch": 7.105263157894737, + "grad_norm": 
6.515086572176515, + "learning_rate": 2.3486061965929695e-06, + "loss": 1.2658, + "step": 1755 + }, + { + "epoch": 7.109311740890688, + "grad_norm": 5.640239544492155, + "learning_rate": 2.3426180329177217e-06, + "loss": 1.2778, + "step": 1756 + }, + { + "epoch": 7.113360323886639, + "grad_norm": 6.602620889096045, + "learning_rate": 2.3366351765558437e-06, + "loss": 1.2168, + "step": 1757 + }, + { + "epoch": 7.117408906882591, + "grad_norm": 6.23335605433251, + "learning_rate": 2.3306576394562748e-06, + "loss": 1.1279, + "step": 1758 + }, + { + "epoch": 7.1214574898785425, + "grad_norm": 5.812741962332591, + "learning_rate": 2.3246854335573303e-06, + "loss": 1.2, + "step": 1759 + }, + { + "epoch": 7.125506072874494, + "grad_norm": 5.7653076766991465, + "learning_rate": 2.318718570786675e-06, + "loss": 1.2204, + "step": 1760 + }, + { + "epoch": 7.129554655870446, + "grad_norm": 6.592268657435819, + "learning_rate": 2.3127570630613064e-06, + "loss": 1.0923, + "step": 1761 + }, + { + "epoch": 7.133603238866397, + "grad_norm": 5.105109462079527, + "learning_rate": 2.3068009222875256e-06, + "loss": 1.4491, + "step": 1762 + }, + { + "epoch": 7.137651821862348, + "grad_norm": 6.139171319338175, + "learning_rate": 2.3008501603609147e-06, + "loss": 1.2557, + "step": 1763 + }, + { + "epoch": 7.1417004048583, + "grad_norm": 4.871725004057816, + "learning_rate": 2.294904789166315e-06, + "loss": 1.023, + "step": 1764 + }, + { + "epoch": 7.145748987854251, + "grad_norm": 6.491293356249618, + "learning_rate": 2.288964820577797e-06, + "loss": 1.3439, + "step": 1765 + }, + { + "epoch": 7.149797570850202, + "grad_norm": 5.837952957007555, + "learning_rate": 2.283030266458644e-06, + "loss": 1.182, + "step": 1766 + }, + { + "epoch": 7.153846153846154, + "grad_norm": 5.104308775866129, + "learning_rate": 2.2771011386613268e-06, + "loss": 1.4117, + "step": 1767 + }, + { + "epoch": 7.157894736842105, + "grad_norm": 6.518827958790034, + "learning_rate": 2.2711774490274767e-06, + "loss": 1.4173, + "step": 1768 + }, + { + "epoch": 7.161943319838056, + "grad_norm": 4.94266123667569, + "learning_rate": 2.265259209387867e-06, + "loss": 1.2429, + "step": 1769 + }, + { + "epoch": 7.165991902834008, + "grad_norm": 5.473631523594278, + "learning_rate": 2.259346431562379e-06, + "loss": 1.3316, + "step": 1770 + }, + { + "epoch": 7.17004048582996, + "grad_norm": 5.001369544056481, + "learning_rate": 2.2534391273599937e-06, + "loss": 1.9136, + "step": 1771 + }, + { + "epoch": 7.174089068825911, + "grad_norm": 5.913295650699435, + "learning_rate": 2.2475373085787568e-06, + "loss": 1.1497, + "step": 1772 + }, + { + "epoch": 7.178137651821863, + "grad_norm": 6.952533318275522, + "learning_rate": 2.2416409870057577e-06, + "loss": 1.353, + "step": 1773 + }, + { + "epoch": 7.182186234817814, + "grad_norm": 4.723432595191292, + "learning_rate": 2.2357501744171105e-06, + "loss": 1.1492, + "step": 1774 + }, + { + "epoch": 7.186234817813765, + "grad_norm": 6.058020017509188, + "learning_rate": 2.229864882577921e-06, + "loss": 1.3322, + "step": 1775 + }, + { + "epoch": 7.190283400809717, + "grad_norm": 5.788151410477542, + "learning_rate": 2.2239851232422736e-06, + "loss": 1.3631, + "step": 1776 + }, + { + "epoch": 7.194331983805668, + "grad_norm": 6.262252651618726, + "learning_rate": 2.218110908153202e-06, + "loss": 1.5276, + "step": 1777 + }, + { + "epoch": 7.198380566801619, + "grad_norm": 5.208163192867401, + "learning_rate": 2.2122422490426676e-06, + "loss": 1.5831, + "step": 1778 + }, + { + "epoch": 7.202429149797571, + 
"grad_norm": 5.390523496529594, + "learning_rate": 2.206379157631532e-06, + "loss": 1.2908, + "step": 1779 + }, + { + "epoch": 7.206477732793522, + "grad_norm": 5.162249120166779, + "learning_rate": 2.200521645629542e-06, + "loss": 1.6171, + "step": 1780 + }, + { + "epoch": 7.2105263157894735, + "grad_norm": 5.391588507251084, + "learning_rate": 2.194669724735296e-06, + "loss": 1.6111, + "step": 1781 + }, + { + "epoch": 7.2145748987854255, + "grad_norm": 6.1034967557731665, + "learning_rate": 2.1888234066362303e-06, + "loss": 1.3854, + "step": 1782 + }, + { + "epoch": 7.218623481781377, + "grad_norm": 6.167454760308808, + "learning_rate": 2.18298270300859e-06, + "loss": 1.2693, + "step": 1783 + }, + { + "epoch": 7.222672064777328, + "grad_norm": 5.69770152013801, + "learning_rate": 2.1771476255174056e-06, + "loss": 1.2078, + "step": 1784 + }, + { + "epoch": 7.22672064777328, + "grad_norm": 5.460410860926906, + "learning_rate": 2.1713181858164746e-06, + "loss": 1.413, + "step": 1785 + }, + { + "epoch": 7.230769230769231, + "grad_norm": 5.566118830424516, + "learning_rate": 2.165494395548329e-06, + "loss": 1.1968, + "step": 1786 + }, + { + "epoch": 7.234817813765182, + "grad_norm": 6.43649848295101, + "learning_rate": 2.159676266344222e-06, + "loss": 1.4229, + "step": 1787 + }, + { + "epoch": 7.238866396761134, + "grad_norm": 6.290508191897902, + "learning_rate": 2.1538638098241e-06, + "loss": 1.3623, + "step": 1788 + }, + { + "epoch": 7.242914979757085, + "grad_norm": 5.730502481155649, + "learning_rate": 2.14805703759658e-06, + "loss": 1.396, + "step": 1789 + }, + { + "epoch": 7.246963562753036, + "grad_norm": 5.437978852325137, + "learning_rate": 2.1422559612589266e-06, + "loss": 1.252, + "step": 1790 + }, + { + "epoch": 7.251012145748988, + "grad_norm": 5.7552412936402435, + "learning_rate": 2.136460592397025e-06, + "loss": 1.344, + "step": 1791 + }, + { + "epoch": 7.255060728744939, + "grad_norm": 5.804592913810575, + "learning_rate": 2.1306709425853663e-06, + "loss": 1.291, + "step": 1792 + }, + { + "epoch": 7.2591093117408905, + "grad_norm": 5.304611515686778, + "learning_rate": 2.124887023387017e-06, + "loss": 1.25, + "step": 1793 + }, + { + "epoch": 7.2631578947368425, + "grad_norm": 5.579310956319717, + "learning_rate": 2.1191088463535997e-06, + "loss": 1.0352, + "step": 1794 + }, + { + "epoch": 7.267206477732794, + "grad_norm": 5.280713442914896, + "learning_rate": 2.113336423025269e-06, + "loss": 1.3293, + "step": 1795 + }, + { + "epoch": 7.271255060728745, + "grad_norm": 5.695843923044428, + "learning_rate": 2.1075697649306838e-06, + "loss": 1.3279, + "step": 1796 + }, + { + "epoch": 7.275303643724697, + "grad_norm": 5.537225853611836, + "learning_rate": 2.1018088835869943e-06, + "loss": 1.4052, + "step": 1797 + }, + { + "epoch": 7.279352226720648, + "grad_norm": 7.310804417037736, + "learning_rate": 2.0960537904998113e-06, + "loss": 1.3052, + "step": 1798 + }, + { + "epoch": 7.283400809716599, + "grad_norm": 6.5207473345683455, + "learning_rate": 2.0903044971631854e-06, + "loss": 0.9953, + "step": 1799 + }, + { + "epoch": 7.287449392712551, + "grad_norm": 6.891390925467454, + "learning_rate": 2.084561015059585e-06, + "loss": 1.1524, + "step": 1800 + }, + { + "epoch": 7.291497975708502, + "grad_norm": 6.511458265596788, + "learning_rate": 2.0788233556598688e-06, + "loss": 1.019, + "step": 1801 + }, + { + "epoch": 7.295546558704453, + "grad_norm": 6.525945460785431, + "learning_rate": 2.0730915304232692e-06, + "loss": 1.2347, + "step": 1802 + }, + { + "epoch": 7.299595141700405, 
+ "grad_norm": 5.806148576127675, + "learning_rate": 2.067365550797367e-06, + "loss": 1.4674, + "step": 1803 + }, + { + "epoch": 7.303643724696356, + "grad_norm": 6.6525694728213685, + "learning_rate": 2.061645428218067e-06, + "loss": 1.0762, + "step": 1804 + }, + { + "epoch": 7.3076923076923075, + "grad_norm": 6.212203279710177, + "learning_rate": 2.055931174109579e-06, + "loss": 1.1289, + "step": 1805 + }, + { + "epoch": 7.3117408906882595, + "grad_norm": 5.666269345071883, + "learning_rate": 2.050222799884387e-06, + "loss": 1.1799, + "step": 1806 + }, + { + "epoch": 7.315789473684211, + "grad_norm": 7.0629439288873, + "learning_rate": 2.044520316943235e-06, + "loss": 1.0631, + "step": 1807 + }, + { + "epoch": 7.319838056680162, + "grad_norm": 6.059126520843265, + "learning_rate": 2.0388237366751005e-06, + "loss": 1.03, + "step": 1808 + }, + { + "epoch": 7.323886639676114, + "grad_norm": 6.3174918869462635, + "learning_rate": 2.0331330704571746e-06, + "loss": 1.0775, + "step": 1809 + }, + { + "epoch": 7.327935222672065, + "grad_norm": 6.098595972628923, + "learning_rate": 2.027448329654832e-06, + "loss": 1.0956, + "step": 1810 + }, + { + "epoch": 7.331983805668016, + "grad_norm": 6.07010789176819, + "learning_rate": 2.02176952562162e-06, + "loss": 1.132, + "step": 1811 + }, + { + "epoch": 7.336032388663968, + "grad_norm": 5.673793373139681, + "learning_rate": 2.0160966696992195e-06, + "loss": 1.235, + "step": 1812 + }, + { + "epoch": 7.340080971659919, + "grad_norm": 5.42325757234182, + "learning_rate": 2.0104297732174403e-06, + "loss": 1.1607, + "step": 1813 + }, + { + "epoch": 7.34412955465587, + "grad_norm": 5.845384796389491, + "learning_rate": 2.004768847494186e-06, + "loss": 1.069, + "step": 1814 + }, + { + "epoch": 7.348178137651822, + "grad_norm": 6.716611305618001, + "learning_rate": 1.999113903835438e-06, + "loss": 1.2088, + "step": 1815 + }, + { + "epoch": 7.352226720647773, + "grad_norm": 6.335024142337415, + "learning_rate": 1.9934649535352286e-06, + "loss": 1.215, + "step": 1816 + }, + { + "epoch": 7.3562753036437245, + "grad_norm": 6.074016020941024, + "learning_rate": 1.987822007875617e-06, + "loss": 0.8957, + "step": 1817 + }, + { + "epoch": 7.3603238866396765, + "grad_norm": 6.669356187358129, + "learning_rate": 1.982185078126676e-06, + "loss": 1.2878, + "step": 1818 + }, + { + "epoch": 7.364372469635628, + "grad_norm": 5.5205879930863055, + "learning_rate": 1.9765541755464605e-06, + "loss": 1.3594, + "step": 1819 + }, + { + "epoch": 7.368421052631579, + "grad_norm": 5.791173021479898, + "learning_rate": 1.9709293113809876e-06, + "loss": 1.2518, + "step": 1820 + }, + { + "epoch": 7.372469635627531, + "grad_norm": 7.085668027134953, + "learning_rate": 1.965310496864217e-06, + "loss": 1.3044, + "step": 1821 + }, + { + "epoch": 7.376518218623482, + "grad_norm": 6.30070905341863, + "learning_rate": 1.9596977432180212e-06, + "loss": 1.0096, + "step": 1822 + }, + { + "epoch": 7.380566801619433, + "grad_norm": 6.668544077573982, + "learning_rate": 1.954091061652172e-06, + "loss": 1.1521, + "step": 1823 + }, + { + "epoch": 7.384615384615385, + "grad_norm": 5.685627571377497, + "learning_rate": 1.948490463364313e-06, + "loss": 0.9629, + "step": 1824 + }, + { + "epoch": 7.388663967611336, + "grad_norm": 7.099232364097355, + "learning_rate": 1.942895959539939e-06, + "loss": 1.0332, + "step": 1825 + }, + { + "epoch": 7.392712550607287, + "grad_norm": 6.449023103797025, + "learning_rate": 1.9373075613523728e-06, + "loss": 1.219, + "step": 1826 + }, + { + "epoch": 7.396761133603239, 
+ "grad_norm": 7.603243728006548, + "learning_rate": 1.9317252799627393e-06, + "loss": 1.0144, + "step": 1827 + }, + { + "epoch": 7.40080971659919, + "grad_norm": 5.630823437903324, + "learning_rate": 1.9261491265199526e-06, + "loss": 1.0604, + "step": 1828 + }, + { + "epoch": 7.4048582995951415, + "grad_norm": 5.804060941623419, + "learning_rate": 1.920579112160685e-06, + "loss": 1.0906, + "step": 1829 + }, + { + "epoch": 7.4089068825910935, + "grad_norm": 7.107387654645546, + "learning_rate": 1.915015248009348e-06, + "loss": 1.1866, + "step": 1830 + }, + { + "epoch": 7.412955465587045, + "grad_norm": 6.216151169357513, + "learning_rate": 1.9094575451780727e-06, + "loss": 1.0234, + "step": 1831 + }, + { + "epoch": 7.417004048582996, + "grad_norm": 7.173346243896998, + "learning_rate": 1.903906014766681e-06, + "loss": 1.3152, + "step": 1832 + }, + { + "epoch": 7.421052631578947, + "grad_norm": 7.353654026214847, + "learning_rate": 1.8983606678626665e-06, + "loss": 1.3466, + "step": 1833 + }, + { + "epoch": 7.425101214574899, + "grad_norm": 6.168388032585026, + "learning_rate": 1.8928215155411773e-06, + "loss": 1.3615, + "step": 1834 + }, + { + "epoch": 7.42914979757085, + "grad_norm": 7.177909922740221, + "learning_rate": 1.8872885688649879e-06, + "loss": 1.3325, + "step": 1835 + }, + { + "epoch": 7.433198380566802, + "grad_norm": 5.5067246147195315, + "learning_rate": 1.8817618388844783e-06, + "loss": 1.5126, + "step": 1836 + }, + { + "epoch": 7.437246963562753, + "grad_norm": 6.480398605143195, + "learning_rate": 1.8762413366376159e-06, + "loss": 1.2967, + "step": 1837 + }, + { + "epoch": 7.441295546558704, + "grad_norm": 7.239184730466869, + "learning_rate": 1.8707270731499223e-06, + "loss": 1.2391, + "step": 1838 + }, + { + "epoch": 7.445344129554655, + "grad_norm": 5.881764731806458, + "learning_rate": 1.865219059434467e-06, + "loss": 1.4892, + "step": 1839 + }, + { + "epoch": 7.449392712550607, + "grad_norm": 7.287338664223354, + "learning_rate": 1.8597173064918333e-06, + "loss": 1.2865, + "step": 1840 + }, + { + "epoch": 7.4534412955465585, + "grad_norm": 6.989877908949274, + "learning_rate": 1.854221825310103e-06, + "loss": 1.2753, + "step": 1841 + }, + { + "epoch": 7.4574898785425106, + "grad_norm": 6.967142936381031, + "learning_rate": 1.8487326268648314e-06, + "loss": 1.6209, + "step": 1842 + }, + { + "epoch": 7.461538461538462, + "grad_norm": 9.165493801033026, + "learning_rate": 1.8432497221190227e-06, + "loss": 1.7021, + "step": 1843 + }, + { + "epoch": 7.465587044534413, + "grad_norm": 7.201939055537971, + "learning_rate": 1.8377731220231144e-06, + "loss": 1.4113, + "step": 1844 + }, + { + "epoch": 7.469635627530364, + "grad_norm": 6.447673122675899, + "learning_rate": 1.832302837514952e-06, + "loss": 1.4683, + "step": 1845 + }, + { + "epoch": 7.473684210526316, + "grad_norm": 5.915439909033562, + "learning_rate": 1.8268388795197683e-06, + "loss": 1.4386, + "step": 1846 + }, + { + "epoch": 7.477732793522267, + "grad_norm": 7.791713816072655, + "learning_rate": 1.8213812589501611e-06, + "loss": 1.4409, + "step": 1847 + }, + { + "epoch": 7.481781376518219, + "grad_norm": 5.76907536016399, + "learning_rate": 1.815929986706066e-06, + "loss": 1.357, + "step": 1848 + }, + { + "epoch": 7.48582995951417, + "grad_norm": 6.324576322221301, + "learning_rate": 1.8104850736747458e-06, + "loss": 1.3014, + "step": 1849 + }, + { + "epoch": 7.489878542510121, + "grad_norm": 7.955436278806627, + "learning_rate": 1.8050465307307602e-06, + "loss": 1.2541, + "step": 1850 + }, + { + "epoch": 
7.493927125506072, + "grad_norm": 8.3800061367103, + "learning_rate": 1.7996143687359475e-06, + "loss": 1.2069, + "step": 1851 + }, + { + "epoch": 7.497975708502024, + "grad_norm": 5.859852613078974, + "learning_rate": 1.7941885985394025e-06, + "loss": 1.1389, + "step": 1852 + }, + { + "epoch": 7.502024291497976, + "grad_norm": 6.714230939191411, + "learning_rate": 1.78876923097745e-06, + "loss": 0.96, + "step": 1853 + }, + { + "epoch": 7.506072874493928, + "grad_norm": 7.478771265211495, + "learning_rate": 1.783356276873633e-06, + "loss": 1.3238, + "step": 1854 + }, + { + "epoch": 7.510121457489879, + "grad_norm": 6.964602737040841, + "learning_rate": 1.7779497470386826e-06, + "loss": 1.2515, + "step": 1855 + }, + { + "epoch": 7.51417004048583, + "grad_norm": 5.135869484791375, + "learning_rate": 1.7725496522704998e-06, + "loss": 1.2487, + "step": 1856 + }, + { + "epoch": 7.518218623481781, + "grad_norm": 6.736233605627823, + "learning_rate": 1.7671560033541364e-06, + "loss": 1.2647, + "step": 1857 + }, + { + "epoch": 7.522267206477733, + "grad_norm": 7.4340596808517585, + "learning_rate": 1.7617688110617653e-06, + "loss": 1.1983, + "step": 1858 + }, + { + "epoch": 7.526315789473684, + "grad_norm": 7.142575001524021, + "learning_rate": 1.7563880861526656e-06, + "loss": 1.037, + "step": 1859 + }, + { + "epoch": 7.530364372469636, + "grad_norm": 6.461217060280809, + "learning_rate": 1.7510138393732029e-06, + "loss": 1.125, + "step": 1860 + }, + { + "epoch": 7.534412955465587, + "grad_norm": 7.120411669751328, + "learning_rate": 1.7456460814568032e-06, + "loss": 1.1532, + "step": 1861 + }, + { + "epoch": 7.538461538461538, + "grad_norm": 6.677578923600314, + "learning_rate": 1.7402848231239317e-06, + "loss": 1.447, + "step": 1862 + }, + { + "epoch": 7.5425101214574894, + "grad_norm": 5.995680414752151, + "learning_rate": 1.7349300750820758e-06, + "loss": 1.414, + "step": 1863 + }, + { + "epoch": 7.5465587044534415, + "grad_norm": 70.49787838581857, + "learning_rate": 1.7295818480257148e-06, + "loss": 1.9394, + "step": 1864 + }, + { + "epoch": 7.550607287449393, + "grad_norm": 11.227616663799225, + "learning_rate": 1.7242401526363095e-06, + "loss": 1.6974, + "step": 1865 + }, + { + "epoch": 7.554655870445345, + "grad_norm": 15.917128296917474, + "learning_rate": 1.7189049995822748e-06, + "loss": 2.0666, + "step": 1866 + }, + { + "epoch": 7.558704453441296, + "grad_norm": 6.5545578057982254, + "learning_rate": 1.7135763995189574e-06, + "loss": 1.2566, + "step": 1867 + }, + { + "epoch": 7.562753036437247, + "grad_norm": 5.608919892200609, + "learning_rate": 1.70825436308862e-06, + "loss": 1.1258, + "step": 1868 + }, + { + "epoch": 7.566801619433198, + "grad_norm": 5.78898827199352, + "learning_rate": 1.70293890092041e-06, + "loss": 1.511, + "step": 1869 + }, + { + "epoch": 7.57085020242915, + "grad_norm": 6.1957471468572605, + "learning_rate": 1.6976300236303505e-06, + "loss": 1.1713, + "step": 1870 + }, + { + "epoch": 7.574898785425101, + "grad_norm": 5.919353556112893, + "learning_rate": 1.692327741821312e-06, + "loss": 1.3418, + "step": 1871 + }, + { + "epoch": 7.578947368421053, + "grad_norm": 4.818508692645506, + "learning_rate": 1.6870320660829908e-06, + "loss": 1.1787, + "step": 1872 + }, + { + "epoch": 7.582995951417004, + "grad_norm": 6.074378707133634, + "learning_rate": 1.6817430069918939e-06, + "loss": 1.2772, + "step": 1873 + }, + { + "epoch": 7.587044534412955, + "grad_norm": 6.043486629250494, + "learning_rate": 1.676460575111306e-06, + "loss": 1.2858, + "step": 1874 + }, + { + 
"epoch": 7.5910931174089065, + "grad_norm": 6.824574202718084, + "learning_rate": 1.671184780991283e-06, + "loss": 1.2792, + "step": 1875 + }, + { + "epoch": 7.5951417004048585, + "grad_norm": 6.003146333113679, + "learning_rate": 1.6659156351686202e-06, + "loss": 0.9987, + "step": 1876 + }, + { + "epoch": 7.59919028340081, + "grad_norm": 5.257435712843031, + "learning_rate": 1.6606531481668364e-06, + "loss": 1.1001, + "step": 1877 + }, + { + "epoch": 7.603238866396762, + "grad_norm": 5.19698994619142, + "learning_rate": 1.6553973304961528e-06, + "loss": 1.1799, + "step": 1878 + }, + { + "epoch": 7.607287449392713, + "grad_norm": 5.841701091792967, + "learning_rate": 1.6501481926534658e-06, + "loss": 0.9594, + "step": 1879 + }, + { + "epoch": 7.611336032388664, + "grad_norm": 6.19240531240544, + "learning_rate": 1.6449057451223354e-06, + "loss": 1.2521, + "step": 1880 + }, + { + "epoch": 7.615384615384615, + "grad_norm": 5.549994801931837, + "learning_rate": 1.639669998372958e-06, + "loss": 1.2949, + "step": 1881 + }, + { + "epoch": 7.619433198380567, + "grad_norm": 6.675501333896787, + "learning_rate": 1.6344409628621482e-06, + "loss": 1.0393, + "step": 1882 + }, + { + "epoch": 7.623481781376518, + "grad_norm": 6.8185578077235025, + "learning_rate": 1.6292186490333172e-06, + "loss": 1.3907, + "step": 1883 + }, + { + "epoch": 7.62753036437247, + "grad_norm": 5.788785194808056, + "learning_rate": 1.6240030673164492e-06, + "loss": 1.2266, + "step": 1884 + }, + { + "epoch": 7.631578947368421, + "grad_norm": 6.240532210004539, + "learning_rate": 1.6187942281280838e-06, + "loss": 1.4968, + "step": 1885 + }, + { + "epoch": 7.635627530364372, + "grad_norm": 5.438972394942183, + "learning_rate": 1.6135921418712959e-06, + "loss": 1.0917, + "step": 1886 + }, + { + "epoch": 7.6396761133603235, + "grad_norm": 6.412673367253676, + "learning_rate": 1.6083968189356724e-06, + "loss": 1.3789, + "step": 1887 + }, + { + "epoch": 7.6437246963562755, + "grad_norm": 5.536347657482411, + "learning_rate": 1.6032082696972945e-06, + "loss": 1.2638, + "step": 1888 + }, + { + "epoch": 7.647773279352227, + "grad_norm": 6.127206089252584, + "learning_rate": 1.5980265045187139e-06, + "loss": 1.3732, + "step": 1889 + }, + { + "epoch": 7.651821862348179, + "grad_norm": 5.193216915475832, + "learning_rate": 1.5928515337489292e-06, + "loss": 1.1536, + "step": 1890 + }, + { + "epoch": 7.65587044534413, + "grad_norm": 6.4405008029321635, + "learning_rate": 1.5876833677233754e-06, + "loss": 1.3585, + "step": 1891 + }, + { + "epoch": 7.659919028340081, + "grad_norm": 6.735596126416384, + "learning_rate": 1.5825220167638945e-06, + "loss": 1.1643, + "step": 1892 + }, + { + "epoch": 7.663967611336032, + "grad_norm": 5.578067115309463, + "learning_rate": 1.5773674911787157e-06, + "loss": 1.3335, + "step": 1893 + }, + { + "epoch": 7.668016194331984, + "grad_norm": 5.847753238206834, + "learning_rate": 1.5722198012624418e-06, + "loss": 1.3156, + "step": 1894 + }, + { + "epoch": 7.672064777327935, + "grad_norm": 6.167981268598202, + "learning_rate": 1.567078957296016e-06, + "loss": 1.4919, + "step": 1895 + }, + { + "epoch": 7.676113360323887, + "grad_norm": 5.209386411212645, + "learning_rate": 1.5619449695467142e-06, + "loss": 1.4698, + "step": 1896 + }, + { + "epoch": 7.680161943319838, + "grad_norm": 6.423491328339259, + "learning_rate": 1.556817848268118e-06, + "loss": 1.3083, + "step": 1897 + }, + { + "epoch": 7.684210526315789, + "grad_norm": 6.099826757015211, + "learning_rate": 1.5516976037000941e-06, + "loss": 1.1861, + 
"step": 1898 + }, + { + "epoch": 7.6882591093117405, + "grad_norm": 5.753586753644626, + "learning_rate": 1.5465842460687786e-06, + "loss": 1.2721, + "step": 1899 + }, + { + "epoch": 7.6923076923076925, + "grad_norm": 6.272583592648715, + "learning_rate": 1.5414777855865466e-06, + "loss": 1.2911, + "step": 1900 + }, + { + "epoch": 7.696356275303644, + "grad_norm": 5.68165710538138, + "learning_rate": 1.5363782324520033e-06, + "loss": 1.1648, + "step": 1901 + }, + { + "epoch": 7.700404858299595, + "grad_norm": 7.460829794563436, + "learning_rate": 1.5312855968499574e-06, + "loss": 1.6084, + "step": 1902 + }, + { + "epoch": 7.704453441295547, + "grad_norm": 6.5692354666682276, + "learning_rate": 1.5261998889514017e-06, + "loss": 1.4184, + "step": 1903 + }, + { + "epoch": 7.708502024291498, + "grad_norm": 6.3186571601325525, + "learning_rate": 1.5211211189134955e-06, + "loss": 1.0412, + "step": 1904 + }, + { + "epoch": 7.712550607287449, + "grad_norm": 5.682537504028156, + "learning_rate": 1.516049296879535e-06, + "loss": 1.1573, + "step": 1905 + }, + { + "epoch": 7.716599190283401, + "grad_norm": 5.812434487226451, + "learning_rate": 1.510984432978947e-06, + "loss": 1.2783, + "step": 1906 + }, + { + "epoch": 7.720647773279352, + "grad_norm": 7.075156192084278, + "learning_rate": 1.5059265373272574e-06, + "loss": 1.0288, + "step": 1907 + }, + { + "epoch": 7.724696356275303, + "grad_norm": 6.467523066478314, + "learning_rate": 1.5008756200260776e-06, + "loss": 1.2684, + "step": 1908 + }, + { + "epoch": 7.728744939271255, + "grad_norm": 5.838154690826828, + "learning_rate": 1.4958316911630827e-06, + "loss": 1.4278, + "step": 1909 + }, + { + "epoch": 7.732793522267206, + "grad_norm": 5.866932075199195, + "learning_rate": 1.4907947608119866e-06, + "loss": 1.1213, + "step": 1910 + }, + { + "epoch": 7.7368421052631575, + "grad_norm": 6.005636196644713, + "learning_rate": 1.4857648390325257e-06, + "loss": 1.2309, + "step": 1911 + }, + { + "epoch": 7.7408906882591095, + "grad_norm": 5.736349178634425, + "learning_rate": 1.4807419358704433e-06, + "loss": 1.8603, + "step": 1912 + }, + { + "epoch": 7.744939271255061, + "grad_norm": 5.608575893991077, + "learning_rate": 1.475726061357463e-06, + "loss": 1.4053, + "step": 1913 + }, + { + "epoch": 7.748987854251012, + "grad_norm": 6.949290018272913, + "learning_rate": 1.47071722551127e-06, + "loss": 1.2025, + "step": 1914 + }, + { + "epoch": 7.753036437246964, + "grad_norm": 6.470859543707123, + "learning_rate": 1.4657154383354948e-06, + "loss": 1.1287, + "step": 1915 + }, + { + "epoch": 7.757085020242915, + "grad_norm": 6.10955142295277, + "learning_rate": 1.4607207098196851e-06, + "loss": 1.2334, + "step": 1916 + }, + { + "epoch": 7.761133603238866, + "grad_norm": 6.5763762413068045, + "learning_rate": 1.4557330499392952e-06, + "loss": 1.9826, + "step": 1917 + }, + { + "epoch": 7.765182186234818, + "grad_norm": 7.723579817578996, + "learning_rate": 1.4507524686556612e-06, + "loss": 1.721, + "step": 1918 + }, + { + "epoch": 7.769230769230769, + "grad_norm": 8.397235796894286, + "learning_rate": 1.4457789759159813e-06, + "loss": 1.6659, + "step": 1919 + }, + { + "epoch": 7.77327935222672, + "grad_norm": 5.642365455166119, + "learning_rate": 1.4408125816532981e-06, + "loss": 1.1808, + "step": 1920 + }, + { + "epoch": 7.777327935222672, + "grad_norm": 5.725043241965928, + "learning_rate": 1.435853295786473e-06, + "loss": 1.4747, + "step": 1921 + }, + { + "epoch": 7.781376518218623, + "grad_norm": 5.394430714546486, + "learning_rate": 1.430901128220174e-06, + 
"loss": 1.4528, + "step": 1922 + }, + { + "epoch": 7.7854251012145745, + "grad_norm": 5.930712388463373, + "learning_rate": 1.4259560888448526e-06, + "loss": 1.2558, + "step": 1923 + }, + { + "epoch": 7.7894736842105265, + "grad_norm": 5.519869867138563, + "learning_rate": 1.4210181875367229e-06, + "loss": 1.1873, + "step": 1924 + }, + { + "epoch": 7.793522267206478, + "grad_norm": 6.265126307081154, + "learning_rate": 1.4160874341577447e-06, + "loss": 1.1916, + "step": 1925 + }, + { + "epoch": 7.797570850202429, + "grad_norm": 6.13894194733797, + "learning_rate": 1.4111638385555965e-06, + "loss": 1.2401, + "step": 1926 + }, + { + "epoch": 7.801619433198381, + "grad_norm": 5.721727948891365, + "learning_rate": 1.406247410563667e-06, + "loss": 1.1375, + "step": 1927 + }, + { + "epoch": 7.805668016194332, + "grad_norm": 5.409329610323807, + "learning_rate": 1.4013381600010278e-06, + "loss": 1.0394, + "step": 1928 + }, + { + "epoch": 7.809716599190283, + "grad_norm": 5.946216975378077, + "learning_rate": 1.396436096672416e-06, + "loss": 1.3717, + "step": 1929 + }, + { + "epoch": 7.813765182186235, + "grad_norm": 7.501336587253134, + "learning_rate": 1.3915412303682162e-06, + "loss": 1.1632, + "step": 1930 + }, + { + "epoch": 7.817813765182186, + "grad_norm": 6.192994323170135, + "learning_rate": 1.3866535708644335e-06, + "loss": 1.095, + "step": 1931 + }, + { + "epoch": 7.821862348178137, + "grad_norm": 14.576419437798382, + "learning_rate": 1.3817731279226843e-06, + "loss": 2.1725, + "step": 1932 + }, + { + "epoch": 7.825910931174089, + "grad_norm": 25.425127776950244, + "learning_rate": 1.376899911290172e-06, + "loss": 3.1191, + "step": 1933 + }, + { + "epoch": 7.82995951417004, + "grad_norm": 6.5130908283906574, + "learning_rate": 1.3720339306996666e-06, + "loss": 1.1065, + "step": 1934 + }, + { + "epoch": 7.834008097165992, + "grad_norm": 6.8625067545378755, + "learning_rate": 1.367175195869488e-06, + "loss": 1.076, + "step": 1935 + }, + { + "epoch": 7.838056680161944, + "grad_norm": 5.862839226770468, + "learning_rate": 1.3623237165034807e-06, + "loss": 1.0877, + "step": 1936 + }, + { + "epoch": 7.842105263157895, + "grad_norm": 5.587464620521552, + "learning_rate": 1.3574795022910014e-06, + "loss": 1.181, + "step": 1937 + }, + { + "epoch": 7.846153846153846, + "grad_norm": 5.741544735607096, + "learning_rate": 1.3526425629068968e-06, + "loss": 0.9695, + "step": 1938 + }, + { + "epoch": 7.850202429149798, + "grad_norm": 7.078793165923023, + "learning_rate": 1.347812908011485e-06, + "loss": 1.1728, + "step": 1939 + }, + { + "epoch": 7.854251012145749, + "grad_norm": 7.029454395604512, + "learning_rate": 1.3429905472505344e-06, + "loss": 1.2049, + "step": 1940 + }, + { + "epoch": 7.8582995951417, + "grad_norm": 4.858460051035453, + "learning_rate": 1.3381754902552474e-06, + "loss": 1.1544, + "step": 1941 + }, + { + "epoch": 7.862348178137652, + "grad_norm": 6.543690353473279, + "learning_rate": 1.3333677466422357e-06, + "loss": 1.1535, + "step": 1942 + }, + { + "epoch": 7.866396761133603, + "grad_norm": 6.2618770897927165, + "learning_rate": 1.3285673260135073e-06, + "loss": 1.1238, + "step": 1943 + }, + { + "epoch": 7.870445344129554, + "grad_norm": 7.787458993836756, + "learning_rate": 1.323774237956445e-06, + "loss": 1.5443, + "step": 1944 + }, + { + "epoch": 7.874493927125506, + "grad_norm": 6.60339760790844, + "learning_rate": 1.3189884920437867e-06, + "loss": 1.4939, + "step": 1945 + }, + { + "epoch": 7.8785425101214575, + "grad_norm": 6.952377816462855, + "learning_rate": 
1.314210097833607e-06, + "loss": 1.2695, + "step": 1946 + }, + { + "epoch": 7.882591093117409, + "grad_norm": 6.440482664289205, + "learning_rate": 1.309439064869295e-06, + "loss": 1.2076, + "step": 1947 + }, + { + "epoch": 7.886639676113361, + "grad_norm": 5.96904543777947, + "learning_rate": 1.3046754026795406e-06, + "loss": 0.8564, + "step": 1948 + }, + { + "epoch": 7.890688259109312, + "grad_norm": 5.611903455141828, + "learning_rate": 1.2999191207783129e-06, + "loss": 1.3827, + "step": 1949 + }, + { + "epoch": 7.894736842105263, + "grad_norm": 5.50366242354655, + "learning_rate": 1.2951702286648399e-06, + "loss": 1.3867, + "step": 1950 + }, + { + "epoch": 7.898785425101215, + "grad_norm": 4.771234777762805, + "learning_rate": 1.290428735823593e-06, + "loss": 1.1739, + "step": 1951 + }, + { + "epoch": 7.902834008097166, + "grad_norm": 5.7833279202719075, + "learning_rate": 1.2856946517242608e-06, + "loss": 1.1495, + "step": 1952 + }, + { + "epoch": 7.906882591093117, + "grad_norm": 6.107712126684077, + "learning_rate": 1.28096798582174e-06, + "loss": 1.1842, + "step": 1953 + }, + { + "epoch": 7.910931174089069, + "grad_norm": 5.059953747053966, + "learning_rate": 1.2762487475561109e-06, + "loss": 0.9544, + "step": 1954 + }, + { + "epoch": 7.91497975708502, + "grad_norm": 5.819489630730656, + "learning_rate": 1.2715369463526173e-06, + "loss": 1.0285, + "step": 1955 + }, + { + "epoch": 7.919028340080971, + "grad_norm": 6.14238425845007, + "learning_rate": 1.2668325916216534e-06, + "loss": 1.0359, + "step": 1956 + }, + { + "epoch": 7.923076923076923, + "grad_norm": 4.708687979766823, + "learning_rate": 1.2621356927587353e-06, + "loss": 1.3581, + "step": 1957 + }, + { + "epoch": 7.9271255060728745, + "grad_norm": 6.6570477016899074, + "learning_rate": 1.257446259144494e-06, + "loss": 1.2012, + "step": 1958 + }, + { + "epoch": 7.931174089068826, + "grad_norm": 6.636474405464404, + "learning_rate": 1.2527643001446493e-06, + "loss": 1.181, + "step": 1959 + }, + { + "epoch": 7.935222672064778, + "grad_norm": 6.89647738144804, + "learning_rate": 1.248089825109991e-06, + "loss": 0.9855, + "step": 1960 + }, + { + "epoch": 7.939271255060729, + "grad_norm": 6.54652294560363, + "learning_rate": 1.2434228433763657e-06, + "loss": 1.0055, + "step": 1961 + }, + { + "epoch": 7.94331983805668, + "grad_norm": 7.466794850354919, + "learning_rate": 1.2387633642646501e-06, + "loss": 1.2977, + "step": 1962 + }, + { + "epoch": 7.947368421052632, + "grad_norm": 5.859347969468438, + "learning_rate": 1.2341113970807368e-06, + "loss": 1.0272, + "step": 1963 + }, + { + "epoch": 7.951417004048583, + "grad_norm": 7.526875704374519, + "learning_rate": 1.2294669511155193e-06, + "loss": 0.939, + "step": 1964 + }, + { + "epoch": 7.955465587044534, + "grad_norm": 7.225249295703587, + "learning_rate": 1.224830035644868e-06, + "loss": 1.2616, + "step": 1965 + }, + { + "epoch": 7.959514170040486, + "grad_norm": 6.683599476135708, + "learning_rate": 1.2202006599296122e-06, + "loss": 1.3384, + "step": 1966 + }, + { + "epoch": 7.963562753036437, + "grad_norm": 6.087314726468543, + "learning_rate": 1.215578833215526e-06, + "loss": 1.2777, + "step": 1967 + }, + { + "epoch": 7.967611336032388, + "grad_norm": 7.6203305950770766, + "learning_rate": 1.2109645647333018e-06, + "loss": 1.2766, + "step": 1968 + }, + { + "epoch": 7.97165991902834, + "grad_norm": 7.4075603041461155, + "learning_rate": 1.2063578636985402e-06, + "loss": 1.2, + "step": 1969 + }, + { + "epoch": 7.9757085020242915, + "grad_norm": 5.356896060806783, + 
"learning_rate": 1.201758739311728e-06, + "loss": 1.2542, + "step": 1970 + }, + { + "epoch": 7.979757085020243, + "grad_norm": 6.6184401008685, + "learning_rate": 1.1971672007582192e-06, + "loss": 1.3138, + "step": 1971 + }, + { + "epoch": 7.983805668016195, + "grad_norm": 5.952389025814739, + "learning_rate": 1.1925832572082184e-06, + "loss": 1.3645, + "step": 1972 + }, + { + "epoch": 7.987854251012146, + "grad_norm": 5.869009321326924, + "learning_rate": 1.1880069178167586e-06, + "loss": 1.1615, + "step": 1973 + }, + { + "epoch": 7.991902834008097, + "grad_norm": 5.240716232576427, + "learning_rate": 1.1834381917236881e-06, + "loss": 1.1793, + "step": 1974 + }, + { + "epoch": 7.995951417004049, + "grad_norm": 6.017014067933477, + "learning_rate": 1.178877088053651e-06, + "loss": 1.5002, + "step": 1975 + }, + { + "epoch": 8.0, + "grad_norm": 5.843845057775898, + "learning_rate": 1.1743236159160654e-06, + "loss": 1.2012, + "step": 1976 + } + ], + "logging_steps": 1, + "max_steps": 2470, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 1976, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 600758819225600.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-1976/training_args.bin b/checkpoint-1976/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..78bb788b48fdaeefa100fcca732cd4ad5de338f1 --- /dev/null +++ b/checkpoint-1976/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db617e3c3ae788b627938f09c1b4708215392619dbc3a2b63a88ab23d37b875b +size 7608 diff --git a/checkpoint-1976/zero_to_fp32.py b/checkpoint-1976/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/checkpoint-1976/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. 
+from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + total_files = 
len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + 
print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. 
Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. 
+ """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. + """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + 
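+ # GatheredTensor is a lazy view over the rank-sharded flat groups: the concatenation into a contiguous fp32 tensor is deferred until .contiguous() is called, so parameters can be materialized on demand rather than all at once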
state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. + Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. 
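+ # The shard plan below is computed on empty (shape/dtype only) tensors, so the full fp32 state dict is never materialized at once; each shard is then gathered, written out and freed in turn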
+ weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. 
+ + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/checkpoint-2470/README.md b/checkpoint-2470/README.md new file mode 100644 index 0000000000000000000000000000000000000000..55b2fbb7c5bc2d687ad0ac5e2fdc9358b78dd42a --- /dev/null +++ b/checkpoint-2470/README.md @@ -0,0 +1,202 @@ +--- +base_model: /workspace/llms/Llama/Llama-3.3-70B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. 
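A minimal loading sketch (assumptions: the base checkpoint corresponds to meta-llama/Llama-3.3-70B-Instruct, for which adapter_config.json records a local path; `ADAPTER_PATH` is a placeholder for this repository's id or a local checkout; transformers and peft are installed; adjust dtype and device_map to your hardware):

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

BASE_ID = "meta-llama/Llama-3.3-70B-Instruct"     # assumed hub id of the base model the LoRA (r=8, alpha=16) adapter targets
ADAPTER_PATH = "path-or-repo-id-of-this-adapter"  # placeholder: point at this repository or a local copy

tokenizer = AutoTokenizer.from_pretrained(BASE_ID)
base = AutoModelForCausalLM.from_pretrained(BASE_ID, torch_dtype=torch.bfloat16, device_map="auto")
model = PeftModel.from_pretrained(base, ADAPTER_PATH)  # attaches the LoRA weights on top of the frozen base

inputs = tokenizer("Hello!", return_tensors="pt").to(base.device)
out = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(out[0], skip_special_tokens=True))
```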
+ +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.15.2 \ No newline at end of file diff --git a/checkpoint-2470/adapter_config.json b/checkpoint-2470/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..066439893af62fc73d76190c54bbd164236f3c3f --- /dev/null +++ b/checkpoint-2470/adapter_config.json @@ -0,0 +1,39 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/workspace/llms/Llama/Llama-3.3-70B-Instruct", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0.0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "o_proj", + "k_proj", + "q_proj", + "v_proj", + "down_proj", + "up_proj", + "gate_proj" + ], + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-2470/adapter_model.safetensors b/checkpoint-2470/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3a50b59e024ea094364c047037eda8ad502bc535 --- /dev/null +++ b/checkpoint-2470/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e4759d9f6e45e8d965c676ba9f62be02b36b985ce2c373942f50ac32b1690f35 +size 207244392 diff --git a/checkpoint-2470/global_step2470/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt 
b/checkpoint-2470/global_step2470/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0a98fc210e8293786c686c9436fde2b0fa417f53 --- /dev/null +++ b/checkpoint-2470/global_step2470/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6e81bd06f822c96c7b3700d034bfa3cf860f38e555b12b175c2aeadcf9e6554d +size 155324144 diff --git a/checkpoint-2470/global_step2470/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/checkpoint-2470/global_step2470/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6740ab5bdaaab0613bac12c6dc20ac61253e695b --- /dev/null +++ b/checkpoint-2470/global_step2470/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e8d9177b7708178e5adc57a79c2dfc39df2145c42d22ca55900507efeb6d10ce +size 155324144 diff --git a/checkpoint-2470/global_step2470/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/checkpoint-2470/global_step2470/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7b93dfe8be76e03e284c746310f743ca260dbd0a --- /dev/null +++ b/checkpoint-2470/global_step2470/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:73c57ed236d69391c5124ef82119c8b1510556125897e8ff870f07752eaecca9 +size 155324144 diff --git a/checkpoint-2470/global_step2470/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/checkpoint-2470/global_step2470/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a80b21b729913415f903abc268ab4fb1b3f7ff11 --- /dev/null +++ b/checkpoint-2470/global_step2470/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cb8f40be01b6a8850dccd4d618f4f5d09235adfbc9d4b76f07e224cdfd65a78d +size 155324144 diff --git a/checkpoint-2470/global_step2470/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/checkpoint-2470/global_step2470/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..567876ea82f4958093951164e8a77ef4200c5927 --- /dev/null +++ b/checkpoint-2470/global_step2470/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:04c6f42dce2e076965e068be3b02579d9002eac8210dd75d86531af5f561c5e0 +size 155324144 diff --git a/checkpoint-2470/global_step2470/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/checkpoint-2470/global_step2470/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2de7204b107d1134a936d7925c4e1c48b354a1d8 --- /dev/null +++ b/checkpoint-2470/global_step2470/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bc7db2a2be880257a2403d6c39db41b60675732605d954fdc39e46ec4dbd412a +size 155324144 diff --git a/checkpoint-2470/global_step2470/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/checkpoint-2470/global_step2470/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9a3da877dc9d4df892a7f4930b8c6bf53916437e --- /dev/null +++ b/checkpoint-2470/global_step2470/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:7bf6cf8bb277f61cc2f08c56784aa4002b393b2d513a75dae7b915e9a5cd614a +size 155324144 diff --git a/checkpoint-2470/global_step2470/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/checkpoint-2470/global_step2470/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6f4a37941a1cd8733afedb0917a2f87cb501efa3 --- /dev/null +++ b/checkpoint-2470/global_step2470/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:395ce47e7b55dbebdf1a7cb6f3055d0c4ab84732a9303633d05fb7ed90f25154 +size 155324144 diff --git a/checkpoint-2470/global_step2470/zero_pp_rank_0_mp_rank_00_model_states.pt b/checkpoint-2470/global_step2470/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6ca5a4be31d9beef70f620cbe48cfe23711d8be4 --- /dev/null +++ b/checkpoint-2470/global_step2470/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:affd4e7919b620507e3da11c75ba593d01b10acb05783df32b4fd760db6e419e +size 1107654 diff --git a/checkpoint-2470/global_step2470/zero_pp_rank_1_mp_rank_00_model_states.pt b/checkpoint-2470/global_step2470/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b35bd6aaad9f80b3acb11318a90ad62cf7236bc6 --- /dev/null +++ b/checkpoint-2470/global_step2470/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cdec7a3e68a625d1941528fb0bede957591336c41acc3aa3c855764935602326 +size 1107654 diff --git a/checkpoint-2470/global_step2470/zero_pp_rank_2_mp_rank_00_model_states.pt b/checkpoint-2470/global_step2470/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..36867566e182714eb487efd0a554f9d9d2ceded2 --- /dev/null +++ b/checkpoint-2470/global_step2470/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ade0f0a841def37fef3341dedeac5099445bd32e5d68ad35f8049f8b8da38d79 +size 1107654 diff --git a/checkpoint-2470/global_step2470/zero_pp_rank_3_mp_rank_00_model_states.pt b/checkpoint-2470/global_step2470/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2683a25d6b30074df7b85641a9ae35acefb2b3ff --- /dev/null +++ b/checkpoint-2470/global_step2470/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:13646763d500f3e15302d6e1a67610634f223a8c3add58d52b09056030776934 +size 1107654 diff --git a/checkpoint-2470/global_step2470/zero_pp_rank_4_mp_rank_00_model_states.pt b/checkpoint-2470/global_step2470/zero_pp_rank_4_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..80495fd75abc87516cac4f89e974fb6d1f2db4c5 --- /dev/null +++ b/checkpoint-2470/global_step2470/zero_pp_rank_4_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8c12c11f6ccfc456f712088841cf4144de17ac4681c5677223b24fe21d9f0c37 +size 1107654 diff --git a/checkpoint-2470/global_step2470/zero_pp_rank_5_mp_rank_00_model_states.pt b/checkpoint-2470/global_step2470/zero_pp_rank_5_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d36dfa4de75d508faab3292dd8b52872cf846f14 --- /dev/null +++ 
b/checkpoint-2470/global_step2470/zero_pp_rank_5_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ee6f6f420927fdebec00b3c7e3531179bf95bb26a50754053daa11ca92b01ecd +size 1107654 diff --git a/checkpoint-2470/global_step2470/zero_pp_rank_6_mp_rank_00_model_states.pt b/checkpoint-2470/global_step2470/zero_pp_rank_6_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f5b6aafb1f2db7666335c4990b56534e0e16f6fe --- /dev/null +++ b/checkpoint-2470/global_step2470/zero_pp_rank_6_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9c4fcb63df0b2b6e387d14f8ca3bbaa1fc5148630f799ea0aa54f11ad3044e0d +size 1107654 diff --git a/checkpoint-2470/global_step2470/zero_pp_rank_7_mp_rank_00_model_states.pt b/checkpoint-2470/global_step2470/zero_pp_rank_7_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ebf41867cd7674598531dd12626c1aa59fe59024 --- /dev/null +++ b/checkpoint-2470/global_step2470/zero_pp_rank_7_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e8fe9871ce7e3a31541d975463599293afbf9a955bedaca9c2e83cecaa25c12b +size 1107654 diff --git a/checkpoint-2470/latest b/checkpoint-2470/latest new file mode 100644 index 0000000000000000000000000000000000000000..f6de6dfb6a5757a34892a32207d1190963fa0c54 --- /dev/null +++ b/checkpoint-2470/latest @@ -0,0 +1 @@ +global_step2470 \ No newline at end of file diff --git a/checkpoint-2470/rng_state_0.pth b/checkpoint-2470/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..d4ade713ef57d0535c32a9251c786bc57de03d06 --- /dev/null +++ b/checkpoint-2470/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb1165242405b17b3d6a8186ae61b13dcb1faa5a54320bebd74ef8d71b964bf7 +size 15984 diff --git a/checkpoint-2470/rng_state_1.pth b/checkpoint-2470/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..d91c511b147b4dd17988903c57adcefb6c1f20b0 --- /dev/null +++ b/checkpoint-2470/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:562c262916c9997ec644c42fed9655ab28706b74fca20290ca921c4761d6a4b0 +size 15984 diff --git a/checkpoint-2470/rng_state_2.pth b/checkpoint-2470/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..f71e829b3e3570a540263d07783c4e906a78a803 --- /dev/null +++ b/checkpoint-2470/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a8d40f8118f513299624ded0a9bcf09778b961635615090409394d4f96f928f6 +size 15984 diff --git a/checkpoint-2470/rng_state_3.pth b/checkpoint-2470/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..be7f0176676a7c526bb10cbb336b2afa89d8841c --- /dev/null +++ b/checkpoint-2470/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b4391f924238a4cb855c4cbdc6d1a14954f785431c75997d05c7a4ee6615dae7 +size 15984 diff --git a/checkpoint-2470/rng_state_4.pth b/checkpoint-2470/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..8dd1a877dd1f03799067fd08739e82b9f2cd2ad3 --- /dev/null +++ b/checkpoint-2470/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be7b19bb9543a16bf9f4cd96466ac581436f63070f5815f3a7ba57980608994f +size 15984 diff --git a/checkpoint-2470/rng_state_5.pth 
b/checkpoint-2470/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..dcf1b720014f72a27a09ab9ef8570430a8e3c96d --- /dev/null +++ b/checkpoint-2470/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:97da4a1ede0a3e0f96411cacd5bfdf84d9355198f7aadc9bcb8be41122043f63 +size 15984 diff --git a/checkpoint-2470/rng_state_6.pth b/checkpoint-2470/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..2b58cbeed7b25ef61c6439aced60df473cbaf6d4 --- /dev/null +++ b/checkpoint-2470/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:544cb6421b975bd5d2b2360a4e666003794e6197ae654d2ad963cd6572a86ede +size 15984 diff --git a/checkpoint-2470/rng_state_7.pth b/checkpoint-2470/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..36a7dcefe0e0264868d40586546699306878a454 --- /dev/null +++ b/checkpoint-2470/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f8d6eb32a23f3bef6262bbcb2eda724b2fd6f5e579969aa27c71a5971331722b +size 15984 diff --git a/checkpoint-2470/scheduler.pt b/checkpoint-2470/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..ea730ec23a0ee4e5cd602fcb5ba398b046e7d0c4 --- /dev/null +++ b/checkpoint-2470/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9cad4c8a22df26c9949239d14bee6e49e26816f66764a12dcd0ee7c7684a0341 +size 1064 diff --git a/checkpoint-2470/special_tokens_map.json b/checkpoint-2470/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..3c1d04911c269b925af977a3151c9704e990e4d0 --- /dev/null +++ b/checkpoint-2470/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-2470/tokenizer.json b/checkpoint-2470/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/checkpoint-2470/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/checkpoint-2470/tokenizer_config.json b/checkpoint-2470/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a65829f8d45598369efc368800ef14b5dbd9f997 --- /dev/null +++ b/checkpoint-2470/tokenizer_config.json @@ -0,0 +1,2066 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + 
"rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + 
"special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + 
"128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": 
"<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": 
"<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": 
"<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": 
"<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": 
"<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": 
"<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": 
"<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is 
defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- set date_string = \"26 Jul 2024\" %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message + builtin tools #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if builtin_tools is defined or tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{%- if builtin_tools is defined %}\n {{- \"Tools: \" + builtin_tools | reject('equalto', 'code_interpreter') | join(\", \") + \"\\n\\n\"}}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {%- if builtin_tools is defined and tool_call.name in builtin_tools %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- \"<|python_tag|>\" + tool_call.name + \".call(\" }}\n {%- for arg_name, arg_val in tool_call.arguments | items %}\n {{- arg_name + '=\"' + arg_val + '\"' }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \")\" }}\n {%- else %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {%- endif %}\n {%- if builtin_tools is defined %}\n {#- This means we're in ipython mode #}\n {{- \"<|eom_id|>\" }}\n {%- else %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|finetune_right_pad_id|>", + "padding_side": "right", + "split_special_tokens": false, + "tokenizer_class": "PreTrainedTokenizer" +} diff --git a/checkpoint-2470/trainer_state.json b/checkpoint-2470/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..7181b1d5c656003bc5a93a22a9e7c02095c25a91 --- /dev/null +++ b/checkpoint-2470/trainer_state.json @@ -0,0 +1,17324 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 10.0, + "eval_steps": 500, + "global_step": 2470, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.004048582995951417, + "grad_norm": 0.46473725687854356, + "learning_rate": 0.0, + "loss": 2.5926, + "step": 1 + }, + { + "epoch": 0.008097165991902834, + "grad_norm": 0.7862315968268553, + "learning_rate": 4.0485829959514176e-08, + "loss": 2.9114, + "step": 2 + }, + { + "epoch": 0.012145748987854251, + "grad_norm": 0.6677933506680473, + "learning_rate": 8.097165991902835e-08, + "loss": 2.7471, + "step": 3 + }, + { + "epoch": 0.016194331983805668, + "grad_norm": 0.8630518959378011, + "learning_rate": 1.2145748987854252e-07, + "loss": 2.8706, + "step": 4 + }, + { + "epoch": 0.020242914979757085, + "grad_norm": 0.5173190139924537, + "learning_rate": 1.619433198380567e-07, + "loss": 2.9912, + "step": 5 + }, + { + "epoch": 0.024291497975708502, 
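
The checkpoint-2470 directory above bundles the full tokenizer stack: special_tokens_map.json (bos `<|begin_of_text|>`, eos `<|eot_id|>`, pad `<|finetune_right_pad_id|>`), the LFS-tracked tokenizer.json, and tokenizer_config.json carrying the Llama 3 chat template. A minimal sketch, not part of the repository, of how these files might be loaded and the chat template rendered with `transformers`, assuming the checkpoint directory has been downloaded locally:

```python
# Sketch only: load the tokenizer saved alongside checkpoint-2470 and render a
# prompt through its bundled chat template. The local path is an assumption;
# adjust it to wherever the checkpoint files live on your machine.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("checkpoint-2470")

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Summarize the training setup in one sentence."},
]

# The chat_template stored in tokenizer_config.json adds the
# <|start_header_id|>/<|eot_id|> framing; add_generation_prompt appends the
# assistant header so generation can start right after the prompt.
prompt = tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
print(prompt)

# Padding is configured for right-padding with the dedicated pad token.
print(tokenizer.pad_token)  # "<|finetune_right_pad_id|>", per special_tokens_map.json
```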
+ "grad_norm": 0.7759993718339214, + "learning_rate": 2.0242914979757086e-07, + "loss": 3.0072, + "step": 6 + }, + { + "epoch": 0.02834008097165992, + "grad_norm": 1.3755130452390263, + "learning_rate": 2.4291497975708504e-07, + "loss": 2.4721, + "step": 7 + }, + { + "epoch": 0.032388663967611336, + "grad_norm": 0.44121276912866286, + "learning_rate": 2.834008097165992e-07, + "loss": 2.843, + "step": 8 + }, + { + "epoch": 0.03643724696356275, + "grad_norm": 0.5559835506705462, + "learning_rate": 3.238866396761134e-07, + "loss": 2.9053, + "step": 9 + }, + { + "epoch": 0.04048582995951417, + "grad_norm": 0.6731704914870359, + "learning_rate": 3.6437246963562754e-07, + "loss": 2.7608, + "step": 10 + }, + { + "epoch": 0.044534412955465584, + "grad_norm": 0.43190024730085624, + "learning_rate": 4.048582995951417e-07, + "loss": 2.7074, + "step": 11 + }, + { + "epoch": 0.048582995951417005, + "grad_norm": 0.7594718614486027, + "learning_rate": 4.453441295546559e-07, + "loss": 2.7846, + "step": 12 + }, + { + "epoch": 0.05263157894736842, + "grad_norm": 0.4278958670654092, + "learning_rate": 4.858299595141701e-07, + "loss": 3.018, + "step": 13 + }, + { + "epoch": 0.05668016194331984, + "grad_norm": 0.48698492939265825, + "learning_rate": 5.263157894736843e-07, + "loss": 2.8131, + "step": 14 + }, + { + "epoch": 0.06072874493927125, + "grad_norm": 0.405274105300616, + "learning_rate": 5.668016194331984e-07, + "loss": 2.8777, + "step": 15 + }, + { + "epoch": 0.06477732793522267, + "grad_norm": 0.5554327831452092, + "learning_rate": 6.072874493927125e-07, + "loss": 2.9472, + "step": 16 + }, + { + "epoch": 0.06882591093117409, + "grad_norm": 0.44756530277540646, + "learning_rate": 6.477732793522268e-07, + "loss": 3.0157, + "step": 17 + }, + { + "epoch": 0.0728744939271255, + "grad_norm": 0.8072585997136504, + "learning_rate": 6.882591093117409e-07, + "loss": 2.7773, + "step": 18 + }, + { + "epoch": 0.07692307692307693, + "grad_norm": 0.5635933276885046, + "learning_rate": 7.287449392712551e-07, + "loss": 2.7169, + "step": 19 + }, + { + "epoch": 0.08097165991902834, + "grad_norm": 0.4673928500608582, + "learning_rate": 7.692307692307694e-07, + "loss": 2.7934, + "step": 20 + }, + { + "epoch": 0.08502024291497975, + "grad_norm": 1.3664880257539318, + "learning_rate": 8.097165991902834e-07, + "loss": 2.713, + "step": 21 + }, + { + "epoch": 0.08906882591093117, + "grad_norm": 0.6438340318121762, + "learning_rate": 8.502024291497976e-07, + "loss": 2.8722, + "step": 22 + }, + { + "epoch": 0.0931174089068826, + "grad_norm": 0.512121787489251, + "learning_rate": 8.906882591093118e-07, + "loss": 2.722, + "step": 23 + }, + { + "epoch": 0.09716599190283401, + "grad_norm": 1.023552604444706, + "learning_rate": 9.31174089068826e-07, + "loss": 2.5291, + "step": 24 + }, + { + "epoch": 0.10121457489878542, + "grad_norm": 0.556430330792241, + "learning_rate": 9.716599190283402e-07, + "loss": 2.7028, + "step": 25 + }, + { + "epoch": 0.10526315789473684, + "grad_norm": 1.0165779263195185, + "learning_rate": 1.0121457489878542e-06, + "loss": 2.7946, + "step": 26 + }, + { + "epoch": 0.10931174089068826, + "grad_norm": 0.8434539164732048, + "learning_rate": 1.0526315789473685e-06, + "loss": 2.6139, + "step": 27 + }, + { + "epoch": 0.11336032388663968, + "grad_norm": 0.6252954896694622, + "learning_rate": 1.0931174089068828e-06, + "loss": 2.469, + "step": 28 + }, + { + "epoch": 0.11740890688259109, + "grad_norm": 0.8618444900481227, + "learning_rate": 1.133603238866397e-06, + "loss": 2.6452, + "step": 29 + }, + { + "epoch": 
0.1214574898785425, + "grad_norm": 0.9066908581713439, + "learning_rate": 1.174089068825911e-06, + "loss": 2.4396, + "step": 30 + }, + { + "epoch": 0.12550607287449392, + "grad_norm": 0.528141325017682, + "learning_rate": 1.214574898785425e-06, + "loss": 2.469, + "step": 31 + }, + { + "epoch": 0.12955465587044535, + "grad_norm": 0.6378156052352336, + "learning_rate": 1.2550607287449393e-06, + "loss": 2.5795, + "step": 32 + }, + { + "epoch": 0.13360323886639677, + "grad_norm": 0.5624703100477139, + "learning_rate": 1.2955465587044536e-06, + "loss": 2.6768, + "step": 33 + }, + { + "epoch": 0.13765182186234817, + "grad_norm": 0.5821134471598685, + "learning_rate": 1.336032388663968e-06, + "loss": 2.8086, + "step": 34 + }, + { + "epoch": 0.1417004048582996, + "grad_norm": 0.6258194867082703, + "learning_rate": 1.3765182186234818e-06, + "loss": 2.3603, + "step": 35 + }, + { + "epoch": 0.145748987854251, + "grad_norm": 0.5477831289461287, + "learning_rate": 1.417004048582996e-06, + "loss": 2.7758, + "step": 36 + }, + { + "epoch": 0.14979757085020243, + "grad_norm": 0.5008051448479439, + "learning_rate": 1.4574898785425101e-06, + "loss": 2.7543, + "step": 37 + }, + { + "epoch": 0.15384615384615385, + "grad_norm": 0.5096264603702895, + "learning_rate": 1.4979757085020244e-06, + "loss": 2.7356, + "step": 38 + }, + { + "epoch": 0.15789473684210525, + "grad_norm": 0.6456644864025523, + "learning_rate": 1.5384615384615387e-06, + "loss": 3.0218, + "step": 39 + }, + { + "epoch": 0.16194331983805668, + "grad_norm": 0.5888424191973028, + "learning_rate": 1.5789473684210526e-06, + "loss": 2.6165, + "step": 40 + }, + { + "epoch": 0.1659919028340081, + "grad_norm": 0.7898553504446816, + "learning_rate": 1.6194331983805669e-06, + "loss": 2.6223, + "step": 41 + }, + { + "epoch": 0.1700404858299595, + "grad_norm": 0.6232472926548593, + "learning_rate": 1.6599190283400812e-06, + "loss": 2.7768, + "step": 42 + }, + { + "epoch": 0.17408906882591094, + "grad_norm": 0.6922764219271268, + "learning_rate": 1.7004048582995952e-06, + "loss": 2.479, + "step": 43 + }, + { + "epoch": 0.17813765182186234, + "grad_norm": 0.6679665416214551, + "learning_rate": 1.7408906882591095e-06, + "loss": 2.6842, + "step": 44 + }, + { + "epoch": 0.18218623481781376, + "grad_norm": 0.48868645690455986, + "learning_rate": 1.7813765182186236e-06, + "loss": 2.3611, + "step": 45 + }, + { + "epoch": 0.1862348178137652, + "grad_norm": 1.0959755351532565, + "learning_rate": 1.8218623481781379e-06, + "loss": 2.6644, + "step": 46 + }, + { + "epoch": 0.1902834008097166, + "grad_norm": 0.7403727047924632, + "learning_rate": 1.862348178137652e-06, + "loss": 2.7313, + "step": 47 + }, + { + "epoch": 0.19433198380566802, + "grad_norm": 0.5355809576361324, + "learning_rate": 1.902834008097166e-06, + "loss": 2.976, + "step": 48 + }, + { + "epoch": 0.19838056680161945, + "grad_norm": 0.6203117033335515, + "learning_rate": 1.9433198380566803e-06, + "loss": 2.8615, + "step": 49 + }, + { + "epoch": 0.20242914979757085, + "grad_norm": 0.6748602332749001, + "learning_rate": 1.9838056680161946e-06, + "loss": 2.7385, + "step": 50 + }, + { + "epoch": 0.20647773279352227, + "grad_norm": 0.6061522444778688, + "learning_rate": 2.0242914979757085e-06, + "loss": 2.7926, + "step": 51 + }, + { + "epoch": 0.21052631578947367, + "grad_norm": 0.5677094053210018, + "learning_rate": 2.0647773279352228e-06, + "loss": 2.8905, + "step": 52 + }, + { + "epoch": 0.2145748987854251, + "grad_norm": 0.7539663022721307, + "learning_rate": 2.105263157894737e-06, + "loss": 2.7044, + 
"step": 53 + }, + { + "epoch": 0.21862348178137653, + "grad_norm": 0.5511775427996539, + "learning_rate": 2.1457489878542513e-06, + "loss": 2.6044, + "step": 54 + }, + { + "epoch": 0.22267206477732793, + "grad_norm": 0.5001055873779205, + "learning_rate": 2.1862348178137656e-06, + "loss": 2.7154, + "step": 55 + }, + { + "epoch": 0.22672064777327935, + "grad_norm": 5.059433496293122, + "learning_rate": 2.2267206477732795e-06, + "loss": 2.6151, + "step": 56 + }, + { + "epoch": 0.23076923076923078, + "grad_norm": 0.5976992576491789, + "learning_rate": 2.267206477732794e-06, + "loss": 2.8561, + "step": 57 + }, + { + "epoch": 0.23481781376518218, + "grad_norm": 0.5650795458768608, + "learning_rate": 2.307692307692308e-06, + "loss": 2.994, + "step": 58 + }, + { + "epoch": 0.2388663967611336, + "grad_norm": 1.110043039226332, + "learning_rate": 2.348178137651822e-06, + "loss": 2.9581, + "step": 59 + }, + { + "epoch": 0.242914979757085, + "grad_norm": 0.8353821859752748, + "learning_rate": 2.3886639676113362e-06, + "loss": 2.9613, + "step": 60 + }, + { + "epoch": 0.24696356275303644, + "grad_norm": 0.7575324618871198, + "learning_rate": 2.42914979757085e-06, + "loss": 2.7295, + "step": 61 + }, + { + "epoch": 0.25101214574898784, + "grad_norm": 0.7791476828146748, + "learning_rate": 2.4696356275303644e-06, + "loss": 2.7126, + "step": 62 + }, + { + "epoch": 0.2550607287449393, + "grad_norm": 0.4809737260566304, + "learning_rate": 2.5101214574898787e-06, + "loss": 2.8892, + "step": 63 + }, + { + "epoch": 0.2591093117408907, + "grad_norm": 0.5968909877448142, + "learning_rate": 2.550607287449393e-06, + "loss": 2.6468, + "step": 64 + }, + { + "epoch": 0.2631578947368421, + "grad_norm": 0.7701935599652083, + "learning_rate": 2.5910931174089072e-06, + "loss": 2.5171, + "step": 65 + }, + { + "epoch": 0.26720647773279355, + "grad_norm": 0.49540617385936636, + "learning_rate": 2.631578947368421e-06, + "loss": 2.5617, + "step": 66 + }, + { + "epoch": 0.27125506072874495, + "grad_norm": 0.5880768265382437, + "learning_rate": 2.672064777327936e-06, + "loss": 2.6525, + "step": 67 + }, + { + "epoch": 0.27530364372469635, + "grad_norm": 0.8719044761766179, + "learning_rate": 2.7125506072874497e-06, + "loss": 2.5136, + "step": 68 + }, + { + "epoch": 0.2793522267206478, + "grad_norm": 0.7508384152907464, + "learning_rate": 2.7530364372469636e-06, + "loss": 2.7136, + "step": 69 + }, + { + "epoch": 0.2834008097165992, + "grad_norm": 0.7593508374848729, + "learning_rate": 2.7935222672064783e-06, + "loss": 2.5836, + "step": 70 + }, + { + "epoch": 0.2874493927125506, + "grad_norm": 0.6236865711432193, + "learning_rate": 2.834008097165992e-06, + "loss": 2.6042, + "step": 71 + }, + { + "epoch": 0.291497975708502, + "grad_norm": 0.9207439340534006, + "learning_rate": 2.8744939271255064e-06, + "loss": 2.4534, + "step": 72 + }, + { + "epoch": 0.29554655870445345, + "grad_norm": 0.9048216657065745, + "learning_rate": 2.9149797570850203e-06, + "loss": 2.7732, + "step": 73 + }, + { + "epoch": 0.29959514170040485, + "grad_norm": 1.0531213295224573, + "learning_rate": 2.955465587044535e-06, + "loss": 2.6927, + "step": 74 + }, + { + "epoch": 0.30364372469635625, + "grad_norm": 0.8889664393499657, + "learning_rate": 2.995951417004049e-06, + "loss": 2.7532, + "step": 75 + }, + { + "epoch": 0.3076923076923077, + "grad_norm": 0.678148296266936, + "learning_rate": 3.0364372469635627e-06, + "loss": 2.4982, + "step": 76 + }, + { + "epoch": 0.3117408906882591, + "grad_norm": 0.9143989903488097, + "learning_rate": 3.0769230769230774e-06, + 
"loss": 2.4821, + "step": 77 + }, + { + "epoch": 0.3157894736842105, + "grad_norm": 0.7430526887934812, + "learning_rate": 3.1174089068825913e-06, + "loss": 2.8892, + "step": 78 + }, + { + "epoch": 0.31983805668016196, + "grad_norm": 1.0967354490931058, + "learning_rate": 3.157894736842105e-06, + "loss": 2.5355, + "step": 79 + }, + { + "epoch": 0.32388663967611336, + "grad_norm": 0.6474936013842225, + "learning_rate": 3.19838056680162e-06, + "loss": 2.4627, + "step": 80 + }, + { + "epoch": 0.32793522267206476, + "grad_norm": 0.8223317792104156, + "learning_rate": 3.2388663967611337e-06, + "loss": 2.5097, + "step": 81 + }, + { + "epoch": 0.3319838056680162, + "grad_norm": 0.8471027758590536, + "learning_rate": 3.279352226720648e-06, + "loss": 2.5888, + "step": 82 + }, + { + "epoch": 0.3360323886639676, + "grad_norm": 0.4892443825365843, + "learning_rate": 3.3198380566801623e-06, + "loss": 2.4857, + "step": 83 + }, + { + "epoch": 0.340080971659919, + "grad_norm": 0.6329419393193343, + "learning_rate": 3.3603238866396766e-06, + "loss": 2.3704, + "step": 84 + }, + { + "epoch": 0.3441295546558704, + "grad_norm": 0.7450745621264726, + "learning_rate": 3.4008097165991905e-06, + "loss": 2.4814, + "step": 85 + }, + { + "epoch": 0.3481781376518219, + "grad_norm": 0.7915890438013479, + "learning_rate": 3.4412955465587043e-06, + "loss": 2.7336, + "step": 86 + }, + { + "epoch": 0.3522267206477733, + "grad_norm": 0.8224002727747803, + "learning_rate": 3.481781376518219e-06, + "loss": 2.6197, + "step": 87 + }, + { + "epoch": 0.3562753036437247, + "grad_norm": 0.7379097347027997, + "learning_rate": 3.522267206477733e-06, + "loss": 2.3123, + "step": 88 + }, + { + "epoch": 0.3603238866396761, + "grad_norm": 0.63590140796502, + "learning_rate": 3.562753036437247e-06, + "loss": 2.659, + "step": 89 + }, + { + "epoch": 0.3643724696356275, + "grad_norm": 0.9402424866754966, + "learning_rate": 3.6032388663967615e-06, + "loss": 2.6324, + "step": 90 + }, + { + "epoch": 0.3684210526315789, + "grad_norm": 0.7757246306456501, + "learning_rate": 3.6437246963562758e-06, + "loss": 2.5935, + "step": 91 + }, + { + "epoch": 0.3724696356275304, + "grad_norm": 0.7001956828085119, + "learning_rate": 3.6842105263157896e-06, + "loss": 2.8634, + "step": 92 + }, + { + "epoch": 0.3765182186234818, + "grad_norm": 0.6770880287428972, + "learning_rate": 3.724696356275304e-06, + "loss": 2.3526, + "step": 93 + }, + { + "epoch": 0.3805668016194332, + "grad_norm": 0.7469924696350099, + "learning_rate": 3.7651821862348182e-06, + "loss": 2.4551, + "step": 94 + }, + { + "epoch": 0.38461538461538464, + "grad_norm": 0.6156146016330529, + "learning_rate": 3.805668016194332e-06, + "loss": 2.441, + "step": 95 + }, + { + "epoch": 0.38866396761133604, + "grad_norm": 0.7142333380873401, + "learning_rate": 3.846153846153847e-06, + "loss": 2.5222, + "step": 96 + }, + { + "epoch": 0.39271255060728744, + "grad_norm": 0.6126483934481857, + "learning_rate": 3.886639676113361e-06, + "loss": 2.6018, + "step": 97 + }, + { + "epoch": 0.3967611336032389, + "grad_norm": 0.7531177478658849, + "learning_rate": 3.9271255060728745e-06, + "loss": 2.4227, + "step": 98 + }, + { + "epoch": 0.4008097165991903, + "grad_norm": 0.7172471080034739, + "learning_rate": 3.967611336032389e-06, + "loss": 2.4637, + "step": 99 + }, + { + "epoch": 0.4048582995951417, + "grad_norm": 0.7800438096349082, + "learning_rate": 4.008097165991903e-06, + "loss": 2.5228, + "step": 100 + }, + { + "epoch": 0.4089068825910931, + "grad_norm": 0.8009705607457139, + "learning_rate": 
4.048582995951417e-06, + "loss": 2.6356, + "step": 101 + }, + { + "epoch": 0.41295546558704455, + "grad_norm": 0.9574889353775141, + "learning_rate": 4.089068825910931e-06, + "loss": 2.3874, + "step": 102 + }, + { + "epoch": 0.41700404858299595, + "grad_norm": 0.7824043116812712, + "learning_rate": 4.1295546558704455e-06, + "loss": 2.6671, + "step": 103 + }, + { + "epoch": 0.42105263157894735, + "grad_norm": 0.7116660818199502, + "learning_rate": 4.170040485829959e-06, + "loss": 2.6795, + "step": 104 + }, + { + "epoch": 0.4251012145748988, + "grad_norm": 0.6234909516086495, + "learning_rate": 4.210526315789474e-06, + "loss": 2.4891, + "step": 105 + }, + { + "epoch": 0.4291497975708502, + "grad_norm": 0.7507042701110958, + "learning_rate": 4.251012145748988e-06, + "loss": 2.5374, + "step": 106 + }, + { + "epoch": 0.4331983805668016, + "grad_norm": 0.5830775553501698, + "learning_rate": 4.291497975708503e-06, + "loss": 2.4393, + "step": 107 + }, + { + "epoch": 0.43724696356275305, + "grad_norm": 0.8561666711107475, + "learning_rate": 4.3319838056680166e-06, + "loss": 2.3122, + "step": 108 + }, + { + "epoch": 0.44129554655870445, + "grad_norm": 0.914997362840242, + "learning_rate": 4.372469635627531e-06, + "loss": 2.5436, + "step": 109 + }, + { + "epoch": 0.44534412955465585, + "grad_norm": 0.6732155905531092, + "learning_rate": 4.412955465587045e-06, + "loss": 2.5005, + "step": 110 + }, + { + "epoch": 0.4493927125506073, + "grad_norm": 0.7462341368666683, + "learning_rate": 4.453441295546559e-06, + "loss": 2.4483, + "step": 111 + }, + { + "epoch": 0.4534412955465587, + "grad_norm": 0.8245738963488927, + "learning_rate": 4.493927125506074e-06, + "loss": 2.5333, + "step": 112 + }, + { + "epoch": 0.4574898785425101, + "grad_norm": 0.7702932505386926, + "learning_rate": 4.534412955465588e-06, + "loss": 2.5613, + "step": 113 + }, + { + "epoch": 0.46153846153846156, + "grad_norm": 1.3101615300934801, + "learning_rate": 4.5748987854251014e-06, + "loss": 2.973, + "step": 114 + }, + { + "epoch": 0.46558704453441296, + "grad_norm": 0.7651586289456958, + "learning_rate": 4.615384615384616e-06, + "loss": 2.5947, + "step": 115 + }, + { + "epoch": 0.46963562753036436, + "grad_norm": 0.8222224925704688, + "learning_rate": 4.65587044534413e-06, + "loss": 2.4581, + "step": 116 + }, + { + "epoch": 0.47368421052631576, + "grad_norm": 0.6556587501075568, + "learning_rate": 4.696356275303644e-06, + "loss": 2.4571, + "step": 117 + }, + { + "epoch": 0.4777327935222672, + "grad_norm": 0.821438637414972, + "learning_rate": 4.736842105263158e-06, + "loss": 2.6622, + "step": 118 + }, + { + "epoch": 0.4817813765182186, + "grad_norm": 0.6254867878515806, + "learning_rate": 4.7773279352226725e-06, + "loss": 2.3622, + "step": 119 + }, + { + "epoch": 0.48582995951417, + "grad_norm": 0.6606998242945233, + "learning_rate": 4.817813765182186e-06, + "loss": 2.4812, + "step": 120 + }, + { + "epoch": 0.4898785425101215, + "grad_norm": 0.9140647082414407, + "learning_rate": 4.8582995951417e-06, + "loss": 2.5297, + "step": 121 + }, + { + "epoch": 0.4939271255060729, + "grad_norm": 0.8543729933153993, + "learning_rate": 4.898785425101215e-06, + "loss": 2.5534, + "step": 122 + }, + { + "epoch": 0.4979757085020243, + "grad_norm": 0.9641287101724041, + "learning_rate": 4.939271255060729e-06, + "loss": 2.3909, + "step": 123 + }, + { + "epoch": 0.5020242914979757, + "grad_norm": 0.7562747998003689, + "learning_rate": 4.9797570850202435e-06, + "loss": 2.3104, + "step": 124 + }, + { + "epoch": 0.5060728744939271, + "grad_norm": 
0.9684058066200523, + "learning_rate": 5.020242914979757e-06, + "loss": 2.5894, + "step": 125 + }, + { + "epoch": 0.5101214574898786, + "grad_norm": 1.0833146453760147, + "learning_rate": 5.060728744939272e-06, + "loss": 2.686, + "step": 126 + }, + { + "epoch": 0.5141700404858299, + "grad_norm": 0.7212110120886743, + "learning_rate": 5.101214574898786e-06, + "loss": 2.5203, + "step": 127 + }, + { + "epoch": 0.5182186234817814, + "grad_norm": 0.9848467525032204, + "learning_rate": 5.1417004048583e-06, + "loss": 2.66, + "step": 128 + }, + { + "epoch": 0.5222672064777328, + "grad_norm": 0.78315965526943, + "learning_rate": 5.1821862348178145e-06, + "loss": 2.5008, + "step": 129 + }, + { + "epoch": 0.5263157894736842, + "grad_norm": 0.8583112834837245, + "learning_rate": 5.222672064777329e-06, + "loss": 2.3134, + "step": 130 + }, + { + "epoch": 0.5303643724696356, + "grad_norm": 0.7581206885647646, + "learning_rate": 5.263157894736842e-06, + "loss": 2.4191, + "step": 131 + }, + { + "epoch": 0.5344129554655871, + "grad_norm": 0.9695513408717512, + "learning_rate": 5.303643724696357e-06, + "loss": 2.5499, + "step": 132 + }, + { + "epoch": 0.5384615384615384, + "grad_norm": 6.764939321667699, + "learning_rate": 5.344129554655872e-06, + "loss": 2.4736, + "step": 133 + }, + { + "epoch": 0.5425101214574899, + "grad_norm": 1.0247610500949114, + "learning_rate": 5.384615384615385e-06, + "loss": 2.3723, + "step": 134 + }, + { + "epoch": 0.5465587044534413, + "grad_norm": 15.672428379790873, + "learning_rate": 5.425101214574899e-06, + "loss": 3.4815, + "step": 135 + }, + { + "epoch": 0.5506072874493927, + "grad_norm": 2.249245731133667, + "learning_rate": 5.465587044534414e-06, + "loss": 3.4231, + "step": 136 + }, + { + "epoch": 0.5546558704453441, + "grad_norm": 3.797144058522148, + "learning_rate": 5.506072874493927e-06, + "loss": 4.4025, + "step": 137 + }, + { + "epoch": 0.5587044534412956, + "grad_norm": 0.8114215476851966, + "learning_rate": 5.546558704453442e-06, + "loss": 2.3958, + "step": 138 + }, + { + "epoch": 0.562753036437247, + "grad_norm": 0.7631595156767096, + "learning_rate": 5.5870445344129565e-06, + "loss": 2.1963, + "step": 139 + }, + { + "epoch": 0.5668016194331984, + "grad_norm": 0.8648024420211529, + "learning_rate": 5.6275303643724695e-06, + "loss": 2.4664, + "step": 140 + }, + { + "epoch": 0.5708502024291497, + "grad_norm": 1.1398946486999715, + "learning_rate": 5.668016194331984e-06, + "loss": 2.2672, + "step": 141 + }, + { + "epoch": 0.5748987854251012, + "grad_norm": 0.7035715089344788, + "learning_rate": 5.708502024291498e-06, + "loss": 2.4001, + "step": 142 + }, + { + "epoch": 0.5789473684210527, + "grad_norm": 0.7842465817250697, + "learning_rate": 5.748987854251013e-06, + "loss": 2.2186, + "step": 143 + }, + { + "epoch": 0.582995951417004, + "grad_norm": 0.8358191441707306, + "learning_rate": 5.789473684210527e-06, + "loss": 2.5692, + "step": 144 + }, + { + "epoch": 0.5870445344129555, + "grad_norm": 0.7027969455146362, + "learning_rate": 5.8299595141700406e-06, + "loss": 2.3088, + "step": 145 + }, + { + "epoch": 0.5910931174089069, + "grad_norm": 0.7026752876788243, + "learning_rate": 5.870445344129555e-06, + "loss": 2.4148, + "step": 146 + }, + { + "epoch": 0.5951417004048583, + "grad_norm": 0.9049685837714232, + "learning_rate": 5.91093117408907e-06, + "loss": 2.146, + "step": 147 + }, + { + "epoch": 0.5991902834008097, + "grad_norm": 0.8388567349727308, + "learning_rate": 5.951417004048583e-06, + "loss": 2.0989, + "step": 148 + }, + { + "epoch": 0.6032388663967612, + 
"grad_norm": 0.773577497225349, + "learning_rate": 5.991902834008098e-06, + "loss": 2.2379, + "step": 149 + }, + { + "epoch": 0.6072874493927125, + "grad_norm": 0.7826979729986758, + "learning_rate": 6.0323886639676124e-06, + "loss": 2.18, + "step": 150 + }, + { + "epoch": 0.611336032388664, + "grad_norm": 0.8592925674032668, + "learning_rate": 6.0728744939271254e-06, + "loss": 2.4302, + "step": 151 + }, + { + "epoch": 0.6153846153846154, + "grad_norm": 0.6169427006453612, + "learning_rate": 6.11336032388664e-06, + "loss": 2.2208, + "step": 152 + }, + { + "epoch": 0.6194331983805668, + "grad_norm": 0.8979145279675816, + "learning_rate": 6.153846153846155e-06, + "loss": 2.3089, + "step": 153 + }, + { + "epoch": 0.6234817813765182, + "grad_norm": 0.8069478254920203, + "learning_rate": 6.194331983805668e-06, + "loss": 2.5248, + "step": 154 + }, + { + "epoch": 0.6275303643724697, + "grad_norm": 0.702872317531758, + "learning_rate": 6.234817813765183e-06, + "loss": 2.2786, + "step": 155 + }, + { + "epoch": 0.631578947368421, + "grad_norm": 1.1902510486781737, + "learning_rate": 6.275303643724697e-06, + "loss": 2.564, + "step": 156 + }, + { + "epoch": 0.6356275303643725, + "grad_norm": 0.7322358696471963, + "learning_rate": 6.31578947368421e-06, + "loss": 2.2575, + "step": 157 + }, + { + "epoch": 0.6396761133603239, + "grad_norm": 0.827272619073328, + "learning_rate": 6.356275303643725e-06, + "loss": 2.4085, + "step": 158 + }, + { + "epoch": 0.6437246963562753, + "grad_norm": 0.844449245612401, + "learning_rate": 6.39676113360324e-06, + "loss": 2.3392, + "step": 159 + }, + { + "epoch": 0.6477732793522267, + "grad_norm": 0.6963954379010507, + "learning_rate": 6.437246963562754e-06, + "loss": 2.3474, + "step": 160 + }, + { + "epoch": 0.6518218623481782, + "grad_norm": 1.0062158283533227, + "learning_rate": 6.4777327935222675e-06, + "loss": 2.206, + "step": 161 + }, + { + "epoch": 0.6558704453441295, + "grad_norm": 0.7010434692271018, + "learning_rate": 6.518218623481782e-06, + "loss": 2.4407, + "step": 162 + }, + { + "epoch": 0.659919028340081, + "grad_norm": 0.8546299950775236, + "learning_rate": 6.558704453441296e-06, + "loss": 2.3308, + "step": 163 + }, + { + "epoch": 0.6639676113360324, + "grad_norm": 0.9160069550133176, + "learning_rate": 6.599190283400811e-06, + "loss": 2.2799, + "step": 164 + }, + { + "epoch": 0.6680161943319838, + "grad_norm": 0.6991934828570997, + "learning_rate": 6.639676113360325e-06, + "loss": 2.3277, + "step": 165 + }, + { + "epoch": 0.6720647773279352, + "grad_norm": 2.441952914795693, + "learning_rate": 6.6801619433198385e-06, + "loss": 2.2357, + "step": 166 + }, + { + "epoch": 0.6761133603238867, + "grad_norm": 0.7134946099061733, + "learning_rate": 6.720647773279353e-06, + "loss": 2.1807, + "step": 167 + }, + { + "epoch": 0.680161943319838, + "grad_norm": 0.7920123504029117, + "learning_rate": 6.761133603238867e-06, + "loss": 2.4623, + "step": 168 + }, + { + "epoch": 0.6842105263157895, + "grad_norm": 0.7987244705898385, + "learning_rate": 6.801619433198381e-06, + "loss": 2.2289, + "step": 169 + }, + { + "epoch": 0.6882591093117408, + "grad_norm": 0.8092206406250949, + "learning_rate": 6.842105263157896e-06, + "loss": 2.3704, + "step": 170 + }, + { + "epoch": 0.6923076923076923, + "grad_norm": 0.7440145606342271, + "learning_rate": 6.882591093117409e-06, + "loss": 2.3322, + "step": 171 + }, + { + "epoch": 0.6963562753036437, + "grad_norm": 0.704685785309606, + "learning_rate": 6.923076923076923e-06, + "loss": 2.1067, + "step": 172 + }, + { + "epoch": 
0.7004048582995951, + "grad_norm": 0.8716057180507851, + "learning_rate": 6.963562753036438e-06, + "loss": 2.6915, + "step": 173 + }, + { + "epoch": 0.7044534412955465, + "grad_norm": 0.8610302596466904, + "learning_rate": 7.004048582995951e-06, + "loss": 2.3607, + "step": 174 + }, + { + "epoch": 0.708502024291498, + "grad_norm": 0.7454341645101108, + "learning_rate": 7.044534412955466e-06, + "loss": 2.0946, + "step": 175 + }, + { + "epoch": 0.7125506072874493, + "grad_norm": 0.775526558923258, + "learning_rate": 7.0850202429149805e-06, + "loss": 2.2197, + "step": 176 + }, + { + "epoch": 0.7165991902834008, + "grad_norm": 0.7425363416700347, + "learning_rate": 7.125506072874494e-06, + "loss": 2.2515, + "step": 177 + }, + { + "epoch": 0.7206477732793523, + "grad_norm": 0.799480261879121, + "learning_rate": 7.165991902834008e-06, + "loss": 2.2984, + "step": 178 + }, + { + "epoch": 0.7246963562753036, + "grad_norm": 1.208911299168472, + "learning_rate": 7.206477732793523e-06, + "loss": 2.3498, + "step": 179 + }, + { + "epoch": 0.728744939271255, + "grad_norm": 0.8451843361875137, + "learning_rate": 7.246963562753037e-06, + "loss": 2.3922, + "step": 180 + }, + { + "epoch": 0.7327935222672065, + "grad_norm": 0.6688748588442022, + "learning_rate": 7.2874493927125516e-06, + "loss": 2.2572, + "step": 181 + }, + { + "epoch": 0.7368421052631579, + "grad_norm": 1.1693138233285796, + "learning_rate": 7.327935222672065e-06, + "loss": 2.327, + "step": 182 + }, + { + "epoch": 0.7408906882591093, + "grad_norm": 1.6904745941237547, + "learning_rate": 7.368421052631579e-06, + "loss": 2.8703, + "step": 183 + }, + { + "epoch": 0.7449392712550608, + "grad_norm": 0.8844949083017518, + "learning_rate": 7.408906882591094e-06, + "loss": 2.2888, + "step": 184 + }, + { + "epoch": 0.7489878542510121, + "grad_norm": 0.8858477106782153, + "learning_rate": 7.449392712550608e-06, + "loss": 2.2582, + "step": 185 + }, + { + "epoch": 0.7530364372469636, + "grad_norm": 0.7394352987608678, + "learning_rate": 7.489878542510122e-06, + "loss": 2.0775, + "step": 186 + }, + { + "epoch": 0.757085020242915, + "grad_norm": 0.8834206013583122, + "learning_rate": 7.5303643724696364e-06, + "loss": 2.2682, + "step": 187 + }, + { + "epoch": 0.7611336032388664, + "grad_norm": 6.250751086281045, + "learning_rate": 7.570850202429151e-06, + "loss": 3.2512, + "step": 188 + }, + { + "epoch": 0.7651821862348178, + "grad_norm": 35.543626516502854, + "learning_rate": 7.611336032388664e-06, + "loss": 3.2673, + "step": 189 + }, + { + "epoch": 0.7692307692307693, + "grad_norm": 4.671464673421441, + "learning_rate": 7.651821862348178e-06, + "loss": 3.288, + "step": 190 + }, + { + "epoch": 0.7732793522267206, + "grad_norm": 0.8467043403003462, + "learning_rate": 7.692307692307694e-06, + "loss": 2.3525, + "step": 191 + }, + { + "epoch": 0.7773279352226721, + "grad_norm": 0.7553553742503454, + "learning_rate": 7.732793522267207e-06, + "loss": 2.4147, + "step": 192 + }, + { + "epoch": 0.7813765182186235, + "grad_norm": 0.6722184689731728, + "learning_rate": 7.773279352226721e-06, + "loss": 2.4408, + "step": 193 + }, + { + "epoch": 0.7854251012145749, + "grad_norm": 0.8742278117345931, + "learning_rate": 7.813765182186235e-06, + "loss": 2.2427, + "step": 194 + }, + { + "epoch": 0.7894736842105263, + "grad_norm": 0.7018298382516639, + "learning_rate": 7.854251012145749e-06, + "loss": 2.1401, + "step": 195 + }, + { + "epoch": 0.7935222672064778, + "grad_norm": 0.8441291024867053, + "learning_rate": 7.894736842105265e-06, + "loss": 2.417, + "step": 196 + }, + 
{ + "epoch": 0.7975708502024291, + "grad_norm": 0.8440780587728888, + "learning_rate": 7.935222672064778e-06, + "loss": 2.343, + "step": 197 + }, + { + "epoch": 0.8016194331983806, + "grad_norm": 0.7817852912155946, + "learning_rate": 7.975708502024292e-06, + "loss": 2.0718, + "step": 198 + }, + { + "epoch": 0.805668016194332, + "grad_norm": 0.8173811480736421, + "learning_rate": 8.016194331983806e-06, + "loss": 1.9574, + "step": 199 + }, + { + "epoch": 0.8097165991902834, + "grad_norm": 0.9130733429115842, + "learning_rate": 8.056680161943322e-06, + "loss": 2.1815, + "step": 200 + }, + { + "epoch": 0.8137651821862348, + "grad_norm": 0.9847086103025836, + "learning_rate": 8.097165991902834e-06, + "loss": 2.3515, + "step": 201 + }, + { + "epoch": 0.8178137651821862, + "grad_norm": 0.8676876881551969, + "learning_rate": 8.13765182186235e-06, + "loss": 2.0846, + "step": 202 + }, + { + "epoch": 0.8218623481781376, + "grad_norm": 13.90144045255743, + "learning_rate": 8.178137651821862e-06, + "loss": 2.901, + "step": 203 + }, + { + "epoch": 0.8259109311740891, + "grad_norm": 26.964637613541246, + "learning_rate": 8.218623481781377e-06, + "loss": 4.9217, + "step": 204 + }, + { + "epoch": 0.8299595141700404, + "grad_norm": 0.9450475296548486, + "learning_rate": 8.259109311740891e-06, + "loss": 2.213, + "step": 205 + }, + { + "epoch": 0.8340080971659919, + "grad_norm": 0.8251626027353501, + "learning_rate": 8.299595141700405e-06, + "loss": 2.1265, + "step": 206 + }, + { + "epoch": 0.8380566801619433, + "grad_norm": 1.5637444134794973, + "learning_rate": 8.340080971659919e-06, + "loss": 2.1168, + "step": 207 + }, + { + "epoch": 0.8421052631578947, + "grad_norm": 0.8572609413241875, + "learning_rate": 8.380566801619434e-06, + "loss": 2.2021, + "step": 208 + }, + { + "epoch": 0.8461538461538461, + "grad_norm": 0.8829526183041908, + "learning_rate": 8.421052631578948e-06, + "loss": 2.1197, + "step": 209 + }, + { + "epoch": 0.8502024291497976, + "grad_norm": 0.8230040936414714, + "learning_rate": 8.461538461538462e-06, + "loss": 2.1389, + "step": 210 + }, + { + "epoch": 0.854251012145749, + "grad_norm": 1.0630722291016348, + "learning_rate": 8.502024291497976e-06, + "loss": 2.2071, + "step": 211 + }, + { + "epoch": 0.8582995951417004, + "grad_norm": 0.8285650816893187, + "learning_rate": 8.54251012145749e-06, + "loss": 2.1278, + "step": 212 + }, + { + "epoch": 0.8623481781376519, + "grad_norm": 0.9374104368567024, + "learning_rate": 8.582995951417005e-06, + "loss": 2.2602, + "step": 213 + }, + { + "epoch": 0.8663967611336032, + "grad_norm": 0.9292432454800617, + "learning_rate": 8.62348178137652e-06, + "loss": 2.2139, + "step": 214 + }, + { + "epoch": 0.8704453441295547, + "grad_norm": 1.102816596900189, + "learning_rate": 8.663967611336033e-06, + "loss": 2.6954, + "step": 215 + }, + { + "epoch": 0.8744939271255061, + "grad_norm": 1.0693734533760941, + "learning_rate": 8.704453441295547e-06, + "loss": 2.6307, + "step": 216 + }, + { + "epoch": 0.8785425101214575, + "grad_norm": 0.9576307746487195, + "learning_rate": 8.744939271255063e-06, + "loss": 2.3637, + "step": 217 + }, + { + "epoch": 0.8825910931174089, + "grad_norm": 0.9705930148144204, + "learning_rate": 8.785425101214575e-06, + "loss": 2.2346, + "step": 218 + }, + { + "epoch": 0.8866396761133604, + "grad_norm": 1.0504776994181708, + "learning_rate": 8.82591093117409e-06, + "loss": 1.8973, + "step": 219 + }, + { + "epoch": 0.8906882591093117, + "grad_norm": 0.8931928814405187, + "learning_rate": 8.866396761133604e-06, + "loss": 2.2742, + "step": 
220 + }, + { + "epoch": 0.8947368421052632, + "grad_norm": 0.9688347208506803, + "learning_rate": 8.906882591093118e-06, + "loss": 2.2952, + "step": 221 + }, + { + "epoch": 0.8987854251012146, + "grad_norm": 0.978996274596435, + "learning_rate": 8.947368421052632e-06, + "loss": 2.0332, + "step": 222 + }, + { + "epoch": 0.902834008097166, + "grad_norm": 0.9073798024023706, + "learning_rate": 8.987854251012147e-06, + "loss": 2.0714, + "step": 223 + }, + { + "epoch": 0.9068825910931174, + "grad_norm": 1.1581613082581128, + "learning_rate": 9.02834008097166e-06, + "loss": 2.2157, + "step": 224 + }, + { + "epoch": 0.9109311740890689, + "grad_norm": 1.0884120135655109, + "learning_rate": 9.068825910931175e-06, + "loss": 1.7915, + "step": 225 + }, + { + "epoch": 0.9149797570850202, + "grad_norm": 0.9581672716343882, + "learning_rate": 9.109311740890689e-06, + "loss": 2.0722, + "step": 226 + }, + { + "epoch": 0.9190283400809717, + "grad_norm": 0.9523432975820123, + "learning_rate": 9.149797570850203e-06, + "loss": 2.0351, + "step": 227 + }, + { + "epoch": 0.9230769230769231, + "grad_norm": 0.9395262500295037, + "learning_rate": 9.190283400809717e-06, + "loss": 2.1823, + "step": 228 + }, + { + "epoch": 0.9271255060728745, + "grad_norm": 1.0734663585541728, + "learning_rate": 9.230769230769232e-06, + "loss": 2.2329, + "step": 229 + }, + { + "epoch": 0.9311740890688259, + "grad_norm": 5.915661456573777, + "learning_rate": 9.271255060728746e-06, + "loss": 2.142, + "step": 230 + }, + { + "epoch": 0.9352226720647774, + "grad_norm": 0.943964635554494, + "learning_rate": 9.31174089068826e-06, + "loss": 2.0151, + "step": 231 + }, + { + "epoch": 0.9392712550607287, + "grad_norm": 0.9400321772267921, + "learning_rate": 9.352226720647774e-06, + "loss": 1.9453, + "step": 232 + }, + { + "epoch": 0.9433198380566802, + "grad_norm": 1.0803744575815664, + "learning_rate": 9.392712550607288e-06, + "loss": 2.2879, + "step": 233 + }, + { + "epoch": 0.9473684210526315, + "grad_norm": 1.1375116889631114, + "learning_rate": 9.433198380566803e-06, + "loss": 1.997, + "step": 234 + }, + { + "epoch": 0.951417004048583, + "grad_norm": 1.0484948139162147, + "learning_rate": 9.473684210526315e-06, + "loss": 2.0557, + "step": 235 + }, + { + "epoch": 0.9554655870445344, + "grad_norm": 1.9953282124950078, + "learning_rate": 9.514170040485831e-06, + "loss": 2.2939, + "step": 236 + }, + { + "epoch": 0.9595141700404858, + "grad_norm": 0.976191957030197, + "learning_rate": 9.554655870445345e-06, + "loss": 2.0733, + "step": 237 + }, + { + "epoch": 0.9635627530364372, + "grad_norm": 1.2563869839657487, + "learning_rate": 9.595141700404859e-06, + "loss": 2.0464, + "step": 238 + }, + { + "epoch": 0.9676113360323887, + "grad_norm": 1.5608940397030466, + "learning_rate": 9.635627530364373e-06, + "loss": 2.336, + "step": 239 + }, + { + "epoch": 0.97165991902834, + "grad_norm": 1.3591514491532213, + "learning_rate": 9.676113360323888e-06, + "loss": 2.3022, + "step": 240 + }, + { + "epoch": 0.9757085020242915, + "grad_norm": 0.9384697642414853, + "learning_rate": 9.7165991902834e-06, + "loss": 2.0917, + "step": 241 + }, + { + "epoch": 0.979757085020243, + "grad_norm": 1.0921517070072044, + "learning_rate": 9.757085020242916e-06, + "loss": 2.2454, + "step": 242 + }, + { + "epoch": 0.9838056680161943, + "grad_norm": 1.0952417249590038, + "learning_rate": 9.79757085020243e-06, + "loss": 2.2731, + "step": 243 + }, + { + "epoch": 0.9878542510121457, + "grad_norm": 1.004948368911197, + "learning_rate": 9.838056680161944e-06, + "loss": 2.0318, + 
"step": 244 + }, + { + "epoch": 0.9919028340080972, + "grad_norm": 0.9149897248279167, + "learning_rate": 9.878542510121458e-06, + "loss": 2.0005, + "step": 245 + }, + { + "epoch": 0.9959514170040485, + "grad_norm": 0.8508821706595309, + "learning_rate": 9.919028340080973e-06, + "loss": 2.2101, + "step": 246 + }, + { + "epoch": 1.0, + "grad_norm": 1.0244113302231659, + "learning_rate": 9.959514170040487e-06, + "loss": 2.0861, + "step": 247 + }, + { + "epoch": 1.0040485829959513, + "grad_norm": 0.9985250389875123, + "learning_rate": 1e-05, + "loss": 2.1654, + "step": 248 + }, + { + "epoch": 1.008097165991903, + "grad_norm": 1.5212147724237604, + "learning_rate": 9.999995007009308e-06, + "loss": 2.3841, + "step": 249 + }, + { + "epoch": 1.0121457489878543, + "grad_norm": 1.5612489351031709, + "learning_rate": 9.999980028047207e-06, + "loss": 2.2013, + "step": 250 + }, + { + "epoch": 1.0161943319838056, + "grad_norm": 1.3355032190827423, + "learning_rate": 9.99995506314361e-06, + "loss": 2.3109, + "step": 251 + }, + { + "epoch": 1.0202429149797572, + "grad_norm": 1.309995468445311, + "learning_rate": 9.999920112348379e-06, + "loss": 2.5018, + "step": 252 + }, + { + "epoch": 1.0242914979757085, + "grad_norm": 1.4582415698006528, + "learning_rate": 9.999875175731316e-06, + "loss": 2.4387, + "step": 253 + }, + { + "epoch": 1.0283400809716599, + "grad_norm": 1.2959671971401512, + "learning_rate": 9.99982025338217e-06, + "loss": 2.0271, + "step": 254 + }, + { + "epoch": 1.0323886639676114, + "grad_norm": 1.3702661061884107, + "learning_rate": 9.999755345410628e-06, + "loss": 2.1942, + "step": 255 + }, + { + "epoch": 1.0364372469635628, + "grad_norm": 1.2343807344186972, + "learning_rate": 9.999680451946327e-06, + "loss": 2.3802, + "step": 256 + }, + { + "epoch": 1.040485829959514, + "grad_norm": 1.2422842542141688, + "learning_rate": 9.999595573138845e-06, + "loss": 2.1737, + "step": 257 + }, + { + "epoch": 1.0445344129554657, + "grad_norm": 1.0535455017417064, + "learning_rate": 9.9995007091577e-06, + "loss": 2.1892, + "step": 258 + }, + { + "epoch": 1.048582995951417, + "grad_norm": 1.1326643708775719, + "learning_rate": 9.999395860192354e-06, + "loss": 2.165, + "step": 259 + }, + { + "epoch": 1.0526315789473684, + "grad_norm": 1.1512147523566951, + "learning_rate": 9.99928102645221e-06, + "loss": 2.4136, + "step": 260 + }, + { + "epoch": 1.05668016194332, + "grad_norm": 1.161431041066393, + "learning_rate": 9.999156208166614e-06, + "loss": 2.2649, + "step": 261 + }, + { + "epoch": 1.0607287449392713, + "grad_norm": 1.0550067630684001, + "learning_rate": 9.999021405584855e-06, + "loss": 2.2776, + "step": 262 + }, + { + "epoch": 1.0647773279352226, + "grad_norm": 1.2456078968374804, + "learning_rate": 9.99887661897616e-06, + "loss": 2.2937, + "step": 263 + }, + { + "epoch": 1.0688259109311742, + "grad_norm": 2.6565909174287934, + "learning_rate": 9.998721848629691e-06, + "loss": 2.3373, + "step": 264 + }, + { + "epoch": 1.0728744939271255, + "grad_norm": 1.2585354952683687, + "learning_rate": 9.99855709485456e-06, + "loss": 2.1755, + "step": 265 + }, + { + "epoch": 1.0769230769230769, + "grad_norm": 1.0397633573741487, + "learning_rate": 9.99838235797981e-06, + "loss": 2.1224, + "step": 266 + }, + { + "epoch": 1.0809716599190284, + "grad_norm": 1.3490485543349722, + "learning_rate": 9.998197638354428e-06, + "loss": 2.162, + "step": 267 + }, + { + "epoch": 1.0850202429149798, + "grad_norm": 0.9779246835555004, + "learning_rate": 9.998002936347334e-06, + "loss": 2.0674, + "step": 268 + }, + { + 
"epoch": 1.0890688259109311, + "grad_norm": 1.326338728002689, + "learning_rate": 9.997798252347382e-06, + "loss": 2.1639, + "step": 269 + }, + { + "epoch": 1.0931174089068827, + "grad_norm": 1.0363012993300713, + "learning_rate": 9.99758358676337e-06, + "loss": 2.2088, + "step": 270 + }, + { + "epoch": 1.097165991902834, + "grad_norm": 1.0931184449284037, + "learning_rate": 9.99735894002403e-06, + "loss": 1.9417, + "step": 271 + }, + { + "epoch": 1.1012145748987854, + "grad_norm": 1.1142050270090365, + "learning_rate": 9.99712431257802e-06, + "loss": 2.1229, + "step": 272 + }, + { + "epoch": 1.1052631578947367, + "grad_norm": 1.1058458560003002, + "learning_rate": 9.99687970489394e-06, + "loss": 2.147, + "step": 273 + }, + { + "epoch": 1.1093117408906883, + "grad_norm": 1.1507827310584715, + "learning_rate": 9.996625117460319e-06, + "loss": 2.0305, + "step": 274 + }, + { + "epoch": 1.1133603238866396, + "grad_norm": 1.4399534822311415, + "learning_rate": 9.996360550785619e-06, + "loss": 1.993, + "step": 275 + }, + { + "epoch": 1.117408906882591, + "grad_norm": 1.3360646827911495, + "learning_rate": 9.996086005398228e-06, + "loss": 1.9789, + "step": 276 + }, + { + "epoch": 1.1214574898785425, + "grad_norm": 1.1287606232609018, + "learning_rate": 9.995801481846474e-06, + "loss": 1.9362, + "step": 277 + }, + { + "epoch": 1.125506072874494, + "grad_norm": 1.0926872380366626, + "learning_rate": 9.9955069806986e-06, + "loss": 1.8981, + "step": 278 + }, + { + "epoch": 1.1295546558704452, + "grad_norm": 1.225113996229143, + "learning_rate": 9.995202502542785e-06, + "loss": 1.877, + "step": 279 + }, + { + "epoch": 1.1336032388663968, + "grad_norm": 1.350566519940966, + "learning_rate": 9.99488804798713e-06, + "loss": 2.1812, + "step": 280 + }, + { + "epoch": 1.1376518218623481, + "grad_norm": 1.3946048118439773, + "learning_rate": 9.994563617659665e-06, + "loss": 2.0952, + "step": 281 + }, + { + "epoch": 1.1417004048582995, + "grad_norm": 1.016854167145539, + "learning_rate": 9.99422921220834e-06, + "loss": 1.7897, + "step": 282 + }, + { + "epoch": 1.145748987854251, + "grad_norm": 1.1675202565627227, + "learning_rate": 9.993884832301029e-06, + "loss": 2.1832, + "step": 283 + }, + { + "epoch": 1.1497975708502024, + "grad_norm": 1.1052537876752062, + "learning_rate": 9.993530478625524e-06, + "loss": 2.0419, + "step": 284 + }, + { + "epoch": 1.1538461538461537, + "grad_norm": 1.0339091939503424, + "learning_rate": 9.99316615188954e-06, + "loss": 2.1765, + "step": 285 + }, + { + "epoch": 1.1578947368421053, + "grad_norm": 1.224235640342616, + "learning_rate": 9.992791852820709e-06, + "loss": 2.414, + "step": 286 + }, + { + "epoch": 1.1619433198380567, + "grad_norm": 1.1077938277922803, + "learning_rate": 9.992407582166582e-06, + "loss": 2.0729, + "step": 287 + }, + { + "epoch": 1.165991902834008, + "grad_norm": 1.1047832453065312, + "learning_rate": 9.99201334069462e-06, + "loss": 2.0816, + "step": 288 + }, + { + "epoch": 1.1700404858299596, + "grad_norm": 1.020340791924455, + "learning_rate": 9.991609129192202e-06, + "loss": 2.4242, + "step": 289 + }, + { + "epoch": 1.174089068825911, + "grad_norm": 1.0597565636193305, + "learning_rate": 9.991194948466615e-06, + "loss": 1.9546, + "step": 290 + }, + { + "epoch": 1.1781376518218623, + "grad_norm": 2.733652108939615, + "learning_rate": 9.990770799345064e-06, + "loss": 2.0891, + "step": 291 + }, + { + "epoch": 1.1821862348178138, + "grad_norm": 1.06820787268932, + "learning_rate": 9.990336682674656e-06, + "loss": 1.8523, + "step": 292 + }, + { + 
"epoch": 1.1862348178137652, + "grad_norm": 2.087421429190754, + "learning_rate": 9.989892599322404e-06, + "loss": 2.0252, + "step": 293 + }, + { + "epoch": 1.1902834008097165, + "grad_norm": 1.0884298591172652, + "learning_rate": 9.989438550175235e-06, + "loss": 2.094, + "step": 294 + }, + { + "epoch": 1.194331983805668, + "grad_norm": 1.4465924376774404, + "learning_rate": 9.98897453613997e-06, + "loss": 2.2522, + "step": 295 + }, + { + "epoch": 1.1983805668016194, + "grad_norm": 1.2561153181877684, + "learning_rate": 9.988500558143337e-06, + "loss": 2.3174, + "step": 296 + }, + { + "epoch": 1.2024291497975708, + "grad_norm": 1.299592783957394, + "learning_rate": 9.988016617131966e-06, + "loss": 2.0626, + "step": 297 + }, + { + "epoch": 1.2064777327935223, + "grad_norm": 1.616312765069768, + "learning_rate": 9.987522714072377e-06, + "loss": 2.332, + "step": 298 + }, + { + "epoch": 1.2105263157894737, + "grad_norm": 1.1673730449379247, + "learning_rate": 9.987018849950996e-06, + "loss": 2.3944, + "step": 299 + }, + { + "epoch": 1.214574898785425, + "grad_norm": 1.143398053052611, + "learning_rate": 9.986505025774137e-06, + "loss": 2.1948, + "step": 300 + }, + { + "epoch": 1.2186234817813766, + "grad_norm": 1.097402992490867, + "learning_rate": 9.985981242568009e-06, + "loss": 2.0261, + "step": 301 + }, + { + "epoch": 1.222672064777328, + "grad_norm": 1.1862462194607237, + "learning_rate": 9.985447501378706e-06, + "loss": 2.0268, + "step": 302 + }, + { + "epoch": 1.2267206477732793, + "grad_norm": 1.1867953576661743, + "learning_rate": 9.984903803272216e-06, + "loss": 2.0609, + "step": 303 + }, + { + "epoch": 1.2307692307692308, + "grad_norm": 1.160233224133256, + "learning_rate": 9.984350149334415e-06, + "loss": 2.118, + "step": 304 + }, + { + "epoch": 1.2348178137651822, + "grad_norm": 1.1580496833430431, + "learning_rate": 9.983786540671052e-06, + "loss": 2.2939, + "step": 305 + }, + { + "epoch": 1.2388663967611335, + "grad_norm": 1.1904466983631679, + "learning_rate": 9.983212978407767e-06, + "loss": 2.2554, + "step": 306 + }, + { + "epoch": 1.242914979757085, + "grad_norm": 1.191066075711238, + "learning_rate": 9.982629463690075e-06, + "loss": 2.2252, + "step": 307 + }, + { + "epoch": 1.2469635627530364, + "grad_norm": 0.9748723838702108, + "learning_rate": 9.982035997683372e-06, + "loss": 2.0288, + "step": 308 + }, + { + "epoch": 1.2510121457489878, + "grad_norm": 1.0421752021046666, + "learning_rate": 9.981432581572925e-06, + "loss": 2.0528, + "step": 309 + }, + { + "epoch": 1.2550607287449393, + "grad_norm": 1.1354302953976132, + "learning_rate": 9.980819216563875e-06, + "loss": 2.1848, + "step": 310 + }, + { + "epoch": 1.2591093117408907, + "grad_norm": 1.1565556608606453, + "learning_rate": 9.980195903881231e-06, + "loss": 1.9964, + "step": 311 + }, + { + "epoch": 1.263157894736842, + "grad_norm": 1.0637756069428104, + "learning_rate": 9.979562644769871e-06, + "loss": 1.8735, + "step": 312 + }, + { + "epoch": 1.2672064777327936, + "grad_norm": 1.0699259387542537, + "learning_rate": 9.978919440494538e-06, + "loss": 2.0595, + "step": 313 + }, + { + "epoch": 1.271255060728745, + "grad_norm": 1.1179452169818913, + "learning_rate": 9.978266292339838e-06, + "loss": 2.1342, + "step": 314 + }, + { + "epoch": 1.2753036437246963, + "grad_norm": 0.9851906694579183, + "learning_rate": 9.977603201610236e-06, + "loss": 2.0658, + "step": 315 + }, + { + "epoch": 1.2793522267206479, + "grad_norm": 1.664317835506444, + "learning_rate": 9.976930169630052e-06, + "loss": 2.1478, + "step": 316 + }, + 
{ + "epoch": 1.2834008097165992, + "grad_norm": 2.1052363417173012, + "learning_rate": 9.976247197743465e-06, + "loss": 1.8522, + "step": 317 + }, + { + "epoch": 1.2874493927125505, + "grad_norm": 1.1846256759923113, + "learning_rate": 9.975554287314505e-06, + "loss": 1.9432, + "step": 318 + }, + { + "epoch": 1.291497975708502, + "grad_norm": 1.138896431387234, + "learning_rate": 9.974851439727045e-06, + "loss": 1.8181, + "step": 319 + }, + { + "epoch": 1.2955465587044535, + "grad_norm": 1.153796269934686, + "learning_rate": 9.974138656384815e-06, + "loss": 2.1573, + "step": 320 + }, + { + "epoch": 1.2995951417004048, + "grad_norm": 1.703181471948063, + "learning_rate": 9.973415938711383e-06, + "loss": 2.1787, + "step": 321 + }, + { + "epoch": 1.3036437246963564, + "grad_norm": 1.7096036636558702, + "learning_rate": 9.972683288150155e-06, + "loss": 1.9479, + "step": 322 + }, + { + "epoch": 1.3076923076923077, + "grad_norm": 1.1866073546875906, + "learning_rate": 9.97194070616438e-06, + "loss": 1.9284, + "step": 323 + }, + { + "epoch": 1.311740890688259, + "grad_norm": 1.0952591943942271, + "learning_rate": 9.971188194237141e-06, + "loss": 1.9908, + "step": 324 + }, + { + "epoch": 1.3157894736842106, + "grad_norm": 1.5313235105110092, + "learning_rate": 9.97042575387135e-06, + "loss": 2.0365, + "step": 325 + }, + { + "epoch": 1.319838056680162, + "grad_norm": 1.2326037015549494, + "learning_rate": 9.969653386589749e-06, + "loss": 1.9016, + "step": 326 + }, + { + "epoch": 1.3238866396761133, + "grad_norm": 1.08612437072456, + "learning_rate": 9.968871093934908e-06, + "loss": 1.9295, + "step": 327 + }, + { + "epoch": 1.3279352226720649, + "grad_norm": 1.1765201682452633, + "learning_rate": 9.968078877469221e-06, + "loss": 1.9057, + "step": 328 + }, + { + "epoch": 1.3319838056680162, + "grad_norm": 1.1266840563836074, + "learning_rate": 9.967276738774897e-06, + "loss": 1.7933, + "step": 329 + }, + { + "epoch": 1.3360323886639676, + "grad_norm": 1.096241365913634, + "learning_rate": 9.966464679453969e-06, + "loss": 1.8225, + "step": 330 + }, + { + "epoch": 1.3400809716599191, + "grad_norm": 1.0190613068454424, + "learning_rate": 9.965642701128273e-06, + "loss": 1.7548, + "step": 331 + }, + { + "epoch": 1.3441295546558705, + "grad_norm": 1.045370042720153, + "learning_rate": 9.964810805439464e-06, + "loss": 1.8602, + "step": 332 + }, + { + "epoch": 1.3481781376518218, + "grad_norm": 1.2609434903119947, + "learning_rate": 9.963968994049e-06, + "loss": 2.0594, + "step": 333 + }, + { + "epoch": 1.3522267206477734, + "grad_norm": 2.6150970483606812, + "learning_rate": 9.963117268638147e-06, + "loss": 1.8496, + "step": 334 + }, + { + "epoch": 1.3562753036437247, + "grad_norm": 1.2099371136718209, + "learning_rate": 9.962255630907964e-06, + "loss": 1.6494, + "step": 335 + }, + { + "epoch": 1.360323886639676, + "grad_norm": 1.313765722576788, + "learning_rate": 9.961384082579311e-06, + "loss": 1.9562, + "step": 336 + }, + { + "epoch": 1.3643724696356276, + "grad_norm": 1.2172159882432991, + "learning_rate": 9.96050262539284e-06, + "loss": 2.0155, + "step": 337 + }, + { + "epoch": 1.368421052631579, + "grad_norm": 1.2586156100651915, + "learning_rate": 9.959611261108999e-06, + "loss": 1.9085, + "step": 338 + }, + { + "epoch": 1.3724696356275303, + "grad_norm": 1.5183212778349207, + "learning_rate": 9.958709991508013e-06, + "loss": 2.0875, + "step": 339 + }, + { + "epoch": 1.376518218623482, + "grad_norm": 1.1522560111562028, + "learning_rate": 9.957798818389894e-06, + "loss": 1.619, + "step": 340 + }, + 
{ + "epoch": 1.3805668016194332, + "grad_norm": 1.1594845675041106, + "learning_rate": 9.956877743574437e-06, + "loss": 1.809, + "step": 341 + }, + { + "epoch": 1.3846153846153846, + "grad_norm": 1.1122066306670175, + "learning_rate": 9.955946768901207e-06, + "loss": 1.7047, + "step": 342 + }, + { + "epoch": 1.3886639676113361, + "grad_norm": 1.330314253280862, + "learning_rate": 9.955005896229543e-06, + "loss": 1.7574, + "step": 343 + }, + { + "epoch": 1.3927125506072875, + "grad_norm": 1.1715493987473338, + "learning_rate": 9.954055127438554e-06, + "loss": 1.903, + "step": 344 + }, + { + "epoch": 1.3967611336032388, + "grad_norm": 1.3791674988449036, + "learning_rate": 9.95309446442711e-06, + "loss": 1.7259, + "step": 345 + }, + { + "epoch": 1.4008097165991904, + "grad_norm": 1.1049829081327143, + "learning_rate": 9.952123909113842e-06, + "loss": 1.7903, + "step": 346 + }, + { + "epoch": 1.4048582995951417, + "grad_norm": 1.2032214776472194, + "learning_rate": 9.951143463437145e-06, + "loss": 1.8805, + "step": 347 + }, + { + "epoch": 1.408906882591093, + "grad_norm": 1.4430732870842997, + "learning_rate": 9.950153129355156e-06, + "loss": 1.963, + "step": 348 + }, + { + "epoch": 1.4129554655870447, + "grad_norm": 1.1510222292519288, + "learning_rate": 9.949152908845771e-06, + "loss": 1.8567, + "step": 349 + }, + { + "epoch": 1.417004048582996, + "grad_norm": 1.195578264117532, + "learning_rate": 9.948142803906623e-06, + "loss": 2.0649, + "step": 350 + }, + { + "epoch": 1.4210526315789473, + "grad_norm": 1.233691487377917, + "learning_rate": 9.947122816555091e-06, + "loss": 2.1272, + "step": 351 + }, + { + "epoch": 1.425101214574899, + "grad_norm": 1.1086448213277071, + "learning_rate": 9.94609294882829e-06, + "loss": 1.9559, + "step": 352 + }, + { + "epoch": 1.4291497975708503, + "grad_norm": 1.095236792272251, + "learning_rate": 9.94505320278307e-06, + "loss": 2.0925, + "step": 353 + }, + { + "epoch": 1.4331983805668016, + "grad_norm": 1.5358655904235856, + "learning_rate": 9.944003580496004e-06, + "loss": 2.1299, + "step": 354 + }, + { + "epoch": 1.4372469635627532, + "grad_norm": 4.618210545500014, + "learning_rate": 9.942944084063397e-06, + "loss": 1.906, + "step": 355 + }, + { + "epoch": 1.4412955465587045, + "grad_norm": 1.2771853507714968, + "learning_rate": 9.94187471560127e-06, + "loss": 1.8895, + "step": 356 + }, + { + "epoch": 1.4453441295546559, + "grad_norm": 1.503260525653169, + "learning_rate": 9.940795477245362e-06, + "loss": 2.123, + "step": 357 + }, + { + "epoch": 1.4493927125506074, + "grad_norm": 1.1357577615662766, + "learning_rate": 9.939706371151124e-06, + "loss": 1.9087, + "step": 358 + }, + { + "epoch": 1.4534412955465588, + "grad_norm": 1.3448821103990194, + "learning_rate": 9.938607399493714e-06, + "loss": 1.8989, + "step": 359 + }, + { + "epoch": 1.45748987854251, + "grad_norm": 1.3913310219583304, + "learning_rate": 9.937498564467993e-06, + "loss": 2.2799, + "step": 360 + }, + { + "epoch": 1.4615384615384617, + "grad_norm": 1.9605641433764716, + "learning_rate": 9.936379868288525e-06, + "loss": 2.5915, + "step": 361 + }, + { + "epoch": 1.465587044534413, + "grad_norm": 1.2844543412275256, + "learning_rate": 9.935251313189564e-06, + "loss": 2.1301, + "step": 362 + }, + { + "epoch": 1.4696356275303644, + "grad_norm": 1.034982029315575, + "learning_rate": 9.934112901425058e-06, + "loss": 2.0549, + "step": 363 + }, + { + "epoch": 1.4736842105263157, + "grad_norm": 1.204999735063322, + "learning_rate": 9.932964635268637e-06, + "loss": 1.9596, + "step": 364 + }, + { 
+ "epoch": 1.4777327935222673, + "grad_norm": 1.286601988495976, + "learning_rate": 9.931806517013612e-06, + "loss": 2.0348, + "step": 365 + }, + { + "epoch": 1.4817813765182186, + "grad_norm": 0.9482600934112612, + "learning_rate": 9.930638548972976e-06, + "loss": 1.9226, + "step": 366 + }, + { + "epoch": 1.48582995951417, + "grad_norm": 1.2527379198286719, + "learning_rate": 9.92946073347939e-06, + "loss": 1.9363, + "step": 367 + }, + { + "epoch": 1.4898785425101215, + "grad_norm": 1.416748811839403, + "learning_rate": 9.92827307288518e-06, + "loss": 1.8743, + "step": 368 + }, + { + "epoch": 1.4939271255060729, + "grad_norm": 1.4807677636442649, + "learning_rate": 9.927075569562342e-06, + "loss": 1.9204, + "step": 369 + }, + { + "epoch": 1.4979757085020242, + "grad_norm": 1.3869419977919077, + "learning_rate": 9.925868225902518e-06, + "loss": 1.8206, + "step": 370 + }, + { + "epoch": 1.5020242914979756, + "grad_norm": 1.1484019096824427, + "learning_rate": 9.924651044317017e-06, + "loss": 1.741, + "step": 371 + }, + { + "epoch": 1.5060728744939271, + "grad_norm": 1.33557569757452, + "learning_rate": 9.923424027236786e-06, + "loss": 2.0195, + "step": 372 + }, + { + "epoch": 1.5101214574898787, + "grad_norm": 1.3948710108814935, + "learning_rate": 9.922187177112422e-06, + "loss": 2.0682, + "step": 373 + }, + { + "epoch": 1.5141700404858298, + "grad_norm": 0.9670281862333157, + "learning_rate": 9.920940496414153e-06, + "loss": 2.0098, + "step": 374 + }, + { + "epoch": 1.5182186234817814, + "grad_norm": 1.1816940948972323, + "learning_rate": 9.919683987631849e-06, + "loss": 2.041, + "step": 375 + }, + { + "epoch": 1.522267206477733, + "grad_norm": 1.1912191018269882, + "learning_rate": 9.918417653275004e-06, + "loss": 1.9668, + "step": 376 + }, + { + "epoch": 1.526315789473684, + "grad_norm": 2.3568912806934783, + "learning_rate": 9.917141495872733e-06, + "loss": 1.737, + "step": 377 + }, + { + "epoch": 1.5303643724696356, + "grad_norm": 1.4730591126031292, + "learning_rate": 9.915855517973776e-06, + "loss": 1.8672, + "step": 378 + }, + { + "epoch": 1.5344129554655872, + "grad_norm": 1.5631199604094446, + "learning_rate": 9.914559722146483e-06, + "loss": 2.0038, + "step": 379 + }, + { + "epoch": 1.5384615384615383, + "grad_norm": 2.5148949693335014, + "learning_rate": 9.913254110978812e-06, + "loss": 2.0916, + "step": 380 + }, + { + "epoch": 1.54251012145749, + "grad_norm": 1.0936340454215232, + "learning_rate": 9.911938687078324e-06, + "loss": 1.9959, + "step": 381 + }, + { + "epoch": 1.5465587044534415, + "grad_norm": 9.59805118170954, + "learning_rate": 9.91061345307218e-06, + "loss": 2.6669, + "step": 382 + }, + { + "epoch": 1.5506072874493926, + "grad_norm": 5.341110768663029, + "learning_rate": 9.909278411607134e-06, + "loss": 2.7524, + "step": 383 + }, + { + "epoch": 1.5546558704453441, + "grad_norm": 6.319523626825805, + "learning_rate": 9.90793356534952e-06, + "loss": 3.2784, + "step": 384 + }, + { + "epoch": 1.5587044534412957, + "grad_norm": 1.1632747156326964, + "learning_rate": 9.906578916985267e-06, + "loss": 1.9441, + "step": 385 + }, + { + "epoch": 1.5627530364372468, + "grad_norm": 1.129320861281679, + "learning_rate": 9.90521446921987e-06, + "loss": 1.84, + "step": 386 + }, + { + "epoch": 1.5668016194331984, + "grad_norm": 1.0396625767769134, + "learning_rate": 9.9038402247784e-06, + "loss": 2.0999, + "step": 387 + }, + { + "epoch": 1.5708502024291497, + "grad_norm": 1.1109350507878293, + "learning_rate": 9.90245618640549e-06, + "loss": 1.7455, + "step": 388 + }, + { + 
"epoch": 1.574898785425101, + "grad_norm": 1.1573410708340344, + "learning_rate": 9.90106235686534e-06, + "loss": 2.1349, + "step": 389 + }, + { + "epoch": 1.5789473684210527, + "grad_norm": 1.0084157125260218, + "learning_rate": 9.8996587389417e-06, + "loss": 1.8406, + "step": 390 + }, + { + "epoch": 1.582995951417004, + "grad_norm": 1.1571333441837306, + "learning_rate": 9.89824533543787e-06, + "loss": 2.1231, + "step": 391 + }, + { + "epoch": 1.5870445344129553, + "grad_norm": 1.0697948256338023, + "learning_rate": 9.896822149176695e-06, + "loss": 1.9727, + "step": 392 + }, + { + "epoch": 1.591093117408907, + "grad_norm": 1.1795302734430202, + "learning_rate": 9.895389183000557e-06, + "loss": 1.9829, + "step": 393 + }, + { + "epoch": 1.5951417004048583, + "grad_norm": 1.3378200533531102, + "learning_rate": 9.893946439771369e-06, + "loss": 1.648, + "step": 394 + }, + { + "epoch": 1.5991902834008096, + "grad_norm": 1.190232768067943, + "learning_rate": 9.892493922370575e-06, + "loss": 1.6858, + "step": 395 + }, + { + "epoch": 1.6032388663967612, + "grad_norm": 1.1458315074040415, + "learning_rate": 9.891031633699135e-06, + "loss": 1.8744, + "step": 396 + }, + { + "epoch": 1.6072874493927125, + "grad_norm": 1.1819017581575564, + "learning_rate": 9.88955957667753e-06, + "loss": 1.7732, + "step": 397 + }, + { + "epoch": 1.6113360323886639, + "grad_norm": 1.8565903989047288, + "learning_rate": 9.888077754245741e-06, + "loss": 2.0753, + "step": 398 + }, + { + "epoch": 1.6153846153846154, + "grad_norm": 1.0244971639990994, + "learning_rate": 9.886586169363267e-06, + "loss": 1.9333, + "step": 399 + }, + { + "epoch": 1.6194331983805668, + "grad_norm": 1.249918723327364, + "learning_rate": 9.885084825009085e-06, + "loss": 1.8167, + "step": 400 + }, + { + "epoch": 1.623481781376518, + "grad_norm": 1.379879581099796, + "learning_rate": 9.883573724181683e-06, + "loss": 2.1783, + "step": 401 + }, + { + "epoch": 1.6275303643724697, + "grad_norm": 1.0714251364756116, + "learning_rate": 9.882052869899024e-06, + "loss": 1.9676, + "step": 402 + }, + { + "epoch": 1.631578947368421, + "grad_norm": 1.2237579067545878, + "learning_rate": 9.880522265198548e-06, + "loss": 2.154, + "step": 403 + }, + { + "epoch": 1.6356275303643724, + "grad_norm": 1.0681493200255976, + "learning_rate": 9.878981913137178e-06, + "loss": 1.8629, + "step": 404 + }, + { + "epoch": 1.639676113360324, + "grad_norm": 1.213978261543208, + "learning_rate": 9.877431816791299e-06, + "loss": 2.0544, + "step": 405 + }, + { + "epoch": 1.6437246963562753, + "grad_norm": 1.0906406926843764, + "learning_rate": 9.875871979256754e-06, + "loss": 2.0126, + "step": 406 + }, + { + "epoch": 1.6477732793522266, + "grad_norm": 1.1548847276751324, + "learning_rate": 9.87430240364885e-06, + "loss": 1.9896, + "step": 407 + }, + { + "epoch": 1.6518218623481782, + "grad_norm": 1.1007484969249457, + "learning_rate": 9.872723093102332e-06, + "loss": 1.8537, + "step": 408 + }, + { + "epoch": 1.6558704453441295, + "grad_norm": 1.4626798707839297, + "learning_rate": 9.871134050771398e-06, + "loss": 2.0636, + "step": 409 + }, + { + "epoch": 1.6599190283400809, + "grad_norm": 1.4362925135326843, + "learning_rate": 9.869535279829674e-06, + "loss": 1.892, + "step": 410 + }, + { + "epoch": 1.6639676113360324, + "grad_norm": 1.1158035130218342, + "learning_rate": 9.867926783470221e-06, + "loss": 2.0106, + "step": 411 + }, + { + "epoch": 1.6680161943319838, + "grad_norm": 1.094342494438384, + "learning_rate": 9.866308564905523e-06, + "loss": 2.0453, + "step": 412 + }, + { 
+ "epoch": 1.6720647773279351, + "grad_norm": 1.0432966613184569, + "learning_rate": 9.864680627367476e-06, + "loss": 1.9541, + "step": 413 + }, + { + "epoch": 1.6761133603238867, + "grad_norm": 1.2646590113938572, + "learning_rate": 9.863042974107395e-06, + "loss": 1.9078, + "step": 414 + }, + { + "epoch": 1.680161943319838, + "grad_norm": 1.4143613333940679, + "learning_rate": 9.861395608395993e-06, + "loss": 2.0498, + "step": 415 + }, + { + "epoch": 1.6842105263157894, + "grad_norm": 1.1227780591009553, + "learning_rate": 9.859738533523384e-06, + "loss": 1.8425, + "step": 416 + }, + { + "epoch": 1.688259109311741, + "grad_norm": 1.1478310296573677, + "learning_rate": 9.85807175279907e-06, + "loss": 1.9961, + "step": 417 + }, + { + "epoch": 1.6923076923076923, + "grad_norm": 1.1555612172711482, + "learning_rate": 9.856395269551941e-06, + "loss": 1.9982, + "step": 418 + }, + { + "epoch": 1.6963562753036436, + "grad_norm": 1.2453555718552303, + "learning_rate": 9.854709087130261e-06, + "loss": 1.8074, + "step": 419 + }, + { + "epoch": 1.7004048582995952, + "grad_norm": 1.3445248996792332, + "learning_rate": 9.85301320890167e-06, + "loss": 2.315, + "step": 420 + }, + { + "epoch": 1.7044534412955465, + "grad_norm": 1.37583724829167, + "learning_rate": 9.851307638253167e-06, + "loss": 2.0698, + "step": 421 + }, + { + "epoch": 1.708502024291498, + "grad_norm": 1.4100704184587762, + "learning_rate": 9.849592378591113e-06, + "loss": 1.7238, + "step": 422 + }, + { + "epoch": 1.7125506072874495, + "grad_norm": 1.2265807736330994, + "learning_rate": 9.847867433341218e-06, + "loss": 1.881, + "step": 423 + }, + { + "epoch": 1.7165991902834008, + "grad_norm": 1.192372006539784, + "learning_rate": 9.846132805948534e-06, + "loss": 1.9658, + "step": 424 + }, + { + "epoch": 1.7206477732793521, + "grad_norm": 1.307546713268623, + "learning_rate": 9.844388499877457e-06, + "loss": 1.873, + "step": 425 + }, + { + "epoch": 1.7246963562753037, + "grad_norm": 1.382722813051471, + "learning_rate": 9.842634518611705e-06, + "loss": 1.9664, + "step": 426 + }, + { + "epoch": 1.728744939271255, + "grad_norm": 1.4179302059943903, + "learning_rate": 9.840870865654323e-06, + "loss": 2.1073, + "step": 427 + }, + { + "epoch": 1.7327935222672064, + "grad_norm": 1.0508460965436048, + "learning_rate": 9.839097544527674e-06, + "loss": 1.9957, + "step": 428 + }, + { + "epoch": 1.736842105263158, + "grad_norm": 1.239601761065164, + "learning_rate": 9.837314558773427e-06, + "loss": 2.0381, + "step": 429 + }, + { + "epoch": 1.7408906882591093, + "grad_norm": 2.1485433652175137, + "learning_rate": 9.835521911952554e-06, + "loss": 2.6976, + "step": 430 + }, + { + "epoch": 1.7449392712550607, + "grad_norm": 1.2416619753926275, + "learning_rate": 9.833719607645325e-06, + "loss": 2.0715, + "step": 431 + }, + { + "epoch": 1.7489878542510122, + "grad_norm": 1.2591779562696075, + "learning_rate": 9.831907649451291e-06, + "loss": 1.9002, + "step": 432 + }, + { + "epoch": 1.7530364372469636, + "grad_norm": 1.1535891547143164, + "learning_rate": 9.830086040989294e-06, + "loss": 1.7871, + "step": 433 + }, + { + "epoch": 1.757085020242915, + "grad_norm": 1.1923358702044, + "learning_rate": 9.82825478589744e-06, + "loss": 1.9962, + "step": 434 + }, + { + "epoch": 1.7611336032388665, + "grad_norm": 4.275347299758622, + "learning_rate": 9.826413887833103e-06, + "loss": 2.9222, + "step": 435 + }, + { + "epoch": 1.7651821862348178, + "grad_norm": 4.287598045967039, + "learning_rate": 9.824563350472922e-06, + "loss": 2.8461, + "step": 436 + }, + { + 
"epoch": 1.7692307692307692, + "grad_norm": 10.935868536450831, + "learning_rate": 9.822703177512783e-06, + "loss": 2.7384, + "step": 437 + }, + { + "epoch": 1.7732793522267207, + "grad_norm": 1.3409883266265459, + "learning_rate": 9.820833372667813e-06, + "loss": 1.9939, + "step": 438 + }, + { + "epoch": 1.777327935222672, + "grad_norm": 1.3613081112789813, + "learning_rate": 9.818953939672382e-06, + "loss": 2.1821, + "step": 439 + }, + { + "epoch": 1.7813765182186234, + "grad_norm": 1.2675875076339627, + "learning_rate": 9.817064882280085e-06, + "loss": 2.2096, + "step": 440 + }, + { + "epoch": 1.785425101214575, + "grad_norm": 1.1133761183439654, + "learning_rate": 9.815166204263743e-06, + "loss": 2.0038, + "step": 441 + }, + { + "epoch": 1.7894736842105263, + "grad_norm": 1.0606754044873359, + "learning_rate": 9.813257909415384e-06, + "loss": 1.887, + "step": 442 + }, + { + "epoch": 1.7935222672064777, + "grad_norm": 1.2526447757224037, + "learning_rate": 9.811340001546252e-06, + "loss": 2.0549, + "step": 443 + }, + { + "epoch": 1.7975708502024292, + "grad_norm": 1.1262042906691425, + "learning_rate": 9.809412484486785e-06, + "loss": 2.077, + "step": 444 + }, + { + "epoch": 1.8016194331983806, + "grad_norm": 1.155022921046038, + "learning_rate": 9.80747536208661e-06, + "loss": 1.8171, + "step": 445 + }, + { + "epoch": 1.805668016194332, + "grad_norm": 1.1470501457250857, + "learning_rate": 9.805528638214543e-06, + "loss": 1.709, + "step": 446 + }, + { + "epoch": 1.8097165991902835, + "grad_norm": 1.254871859778204, + "learning_rate": 9.803572316758573e-06, + "loss": 2.005, + "step": 447 + }, + { + "epoch": 1.8137651821862348, + "grad_norm": 1.4428684006978485, + "learning_rate": 9.801606401625857e-06, + "loss": 2.0437, + "step": 448 + }, + { + "epoch": 1.8178137651821862, + "grad_norm": 1.1372709832560302, + "learning_rate": 9.799630896742716e-06, + "loss": 1.8053, + "step": 449 + }, + { + "epoch": 1.8218623481781377, + "grad_norm": 7.867540851479705, + "learning_rate": 9.797645806054617e-06, + "loss": 2.6057, + "step": 450 + }, + { + "epoch": 1.825910931174089, + "grad_norm": 17.828898730946783, + "learning_rate": 9.79565113352618e-06, + "loss": 4.1742, + "step": 451 + }, + { + "epoch": 1.8299595141700404, + "grad_norm": 1.3323533085958537, + "learning_rate": 9.793646883141155e-06, + "loss": 1.9001, + "step": 452 + }, + { + "epoch": 1.834008097165992, + "grad_norm": 1.2550944955882024, + "learning_rate": 9.791633058902424e-06, + "loss": 1.7789, + "step": 453 + }, + { + "epoch": 1.8380566801619433, + "grad_norm": 1.2515953723091495, + "learning_rate": 9.789609664831988e-06, + "loss": 1.8425, + "step": 454 + }, + { + "epoch": 1.8421052631578947, + "grad_norm": 1.1650016476570495, + "learning_rate": 9.787576704970965e-06, + "loss": 1.8701, + "step": 455 + }, + { + "epoch": 1.8461538461538463, + "grad_norm": 1.1568290050770706, + "learning_rate": 9.785534183379571e-06, + "loss": 1.8468, + "step": 456 + }, + { + "epoch": 1.8502024291497976, + "grad_norm": 1.1529373182216824, + "learning_rate": 9.783482104137127e-06, + "loss": 1.8772, + "step": 457 + }, + { + "epoch": 1.854251012145749, + "grad_norm": 1.3000516637827273, + "learning_rate": 9.781420471342035e-06, + "loss": 1.9477, + "step": 458 + }, + { + "epoch": 1.8582995951417005, + "grad_norm": 1.0258650008659411, + "learning_rate": 9.779349289111781e-06, + "loss": 1.8995, + "step": 459 + }, + { + "epoch": 1.8623481781376519, + "grad_norm": 1.2394575763975424, + "learning_rate": 9.777268561582921e-06, + "loss": 1.9406, + "step": 460 + }, 
+ { + "epoch": 1.8663967611336032, + "grad_norm": 1.2541685708518606, + "learning_rate": 9.77517829291108e-06, + "loss": 1.9325, + "step": 461 + }, + { + "epoch": 1.8704453441295548, + "grad_norm": 1.5330647366042962, + "learning_rate": 9.773078487270932e-06, + "loss": 2.4038, + "step": 462 + }, + { + "epoch": 1.874493927125506, + "grad_norm": 1.5015880335176561, + "learning_rate": 9.770969148856202e-06, + "loss": 2.3187, + "step": 463 + }, + { + "epoch": 1.8785425101214575, + "grad_norm": 1.4834304636666527, + "learning_rate": 9.768850281879651e-06, + "loss": 2.1105, + "step": 464 + }, + { + "epoch": 1.882591093117409, + "grad_norm": 1.2140714457469706, + "learning_rate": 9.766721890573075e-06, + "loss": 1.9824, + "step": 465 + }, + { + "epoch": 1.8866396761133604, + "grad_norm": 1.3661085878272685, + "learning_rate": 9.764583979187288e-06, + "loss": 1.5205, + "step": 466 + }, + { + "epoch": 1.8906882591093117, + "grad_norm": 1.2317311840953222, + "learning_rate": 9.762436551992117e-06, + "loss": 1.9872, + "step": 467 + }, + { + "epoch": 1.8947368421052633, + "grad_norm": 1.3883104250103875, + "learning_rate": 9.760279613276397e-06, + "loss": 2.0814, + "step": 468 + }, + { + "epoch": 1.8987854251012146, + "grad_norm": 1.1681713845582538, + "learning_rate": 9.75811316734796e-06, + "loss": 1.7849, + "step": 469 + }, + { + "epoch": 1.902834008097166, + "grad_norm": 1.15545443174025, + "learning_rate": 9.755937218533622e-06, + "loss": 1.8179, + "step": 470 + }, + { + "epoch": 1.9068825910931175, + "grad_norm": 1.5408624758508003, + "learning_rate": 9.753751771179177e-06, + "loss": 2.0286, + "step": 471 + }, + { + "epoch": 1.9109311740890689, + "grad_norm": 1.3817398480348058, + "learning_rate": 9.751556829649398e-06, + "loss": 1.5547, + "step": 472 + }, + { + "epoch": 1.9149797570850202, + "grad_norm": 1.3351696061966247, + "learning_rate": 9.74935239832801e-06, + "loss": 1.733, + "step": 473 + }, + { + "epoch": 1.9190283400809718, + "grad_norm": 1.264760117783077, + "learning_rate": 9.747138481617695e-06, + "loss": 1.767, + "step": 474 + }, + { + "epoch": 1.9230769230769231, + "grad_norm": 1.2863761477462097, + "learning_rate": 9.74491508394008e-06, + "loss": 2.0018, + "step": 475 + }, + { + "epoch": 1.9271255060728745, + "grad_norm": 1.5310497493928237, + "learning_rate": 9.742682209735727e-06, + "loss": 1.8865, + "step": 476 + }, + { + "epoch": 1.931174089068826, + "grad_norm": 1.711973469366144, + "learning_rate": 9.740439863464127e-06, + "loss": 1.9105, + "step": 477 + }, + { + "epoch": 1.9352226720647774, + "grad_norm": 1.249933707627717, + "learning_rate": 9.738188049603679e-06, + "loss": 1.7676, + "step": 478 + }, + { + "epoch": 1.9392712550607287, + "grad_norm": 1.2902981801333298, + "learning_rate": 9.735926772651703e-06, + "loss": 1.6493, + "step": 479 + }, + { + "epoch": 1.9433198380566803, + "grad_norm": 1.4792877192638219, + "learning_rate": 9.73365603712441e-06, + "loss": 1.9464, + "step": 480 + }, + { + "epoch": 1.9473684210526314, + "grad_norm": 1.3282266924987296, + "learning_rate": 9.731375847556905e-06, + "loss": 1.6826, + "step": 481 + }, + { + "epoch": 1.951417004048583, + "grad_norm": 1.4677668638223476, + "learning_rate": 9.729086208503174e-06, + "loss": 1.7014, + "step": 482 + }, + { + "epoch": 1.9554655870445345, + "grad_norm": 2.3808599607342855, + "learning_rate": 9.726787124536077e-06, + "loss": 1.9583, + "step": 483 + }, + { + "epoch": 1.9595141700404857, + "grad_norm": 1.3600754750050374, + "learning_rate": 9.724478600247333e-06, + "loss": 1.7925, + "step": 484 
+ }, + { + "epoch": 1.9635627530364372, + "grad_norm": 1.1666914976637783, + "learning_rate": 9.722160640247523e-06, + "loss": 1.8932, + "step": 485 + }, + { + "epoch": 1.9676113360323888, + "grad_norm": 1.3451750453053897, + "learning_rate": 9.719833249166061e-06, + "loss": 2.1332, + "step": 486 + }, + { + "epoch": 1.97165991902834, + "grad_norm": 1.9010105722641066, + "learning_rate": 9.717496431651212e-06, + "loss": 2.0526, + "step": 487 + }, + { + "epoch": 1.9757085020242915, + "grad_norm": 1.1672390815512188, + "learning_rate": 9.715150192370054e-06, + "loss": 1.8783, + "step": 488 + }, + { + "epoch": 1.979757085020243, + "grad_norm": 1.384114220461852, + "learning_rate": 9.712794536008488e-06, + "loss": 1.9859, + "step": 489 + }, + { + "epoch": 1.9838056680161942, + "grad_norm": 1.2933526518975824, + "learning_rate": 9.710429467271221e-06, + "loss": 2.0382, + "step": 490 + }, + { + "epoch": 1.9878542510121457, + "grad_norm": 1.423570288241044, + "learning_rate": 9.708054990881763e-06, + "loss": 1.8377, + "step": 491 + }, + { + "epoch": 1.9919028340080973, + "grad_norm": 1.2866158830707874, + "learning_rate": 9.705671111582406e-06, + "loss": 1.7694, + "step": 492 + }, + { + "epoch": 1.9959514170040484, + "grad_norm": 1.0521519412024614, + "learning_rate": 9.703277834134227e-06, + "loss": 2.0757, + "step": 493 + }, + { + "epoch": 2.0, + "grad_norm": 1.2995506674782646, + "learning_rate": 9.700875163317072e-06, + "loss": 1.8875, + "step": 494 + }, + { + "epoch": 2.0040485829959516, + "grad_norm": 1.1352855274001465, + "learning_rate": 9.698463103929542e-06, + "loss": 1.9618, + "step": 495 + }, + { + "epoch": 2.0080971659919027, + "grad_norm": 1.542269208448278, + "learning_rate": 9.696041660788997e-06, + "loss": 2.0888, + "step": 496 + }, + { + "epoch": 2.0121457489878543, + "grad_norm": 1.6780350902786914, + "learning_rate": 9.693610838731532e-06, + "loss": 1.9408, + "step": 497 + }, + { + "epoch": 2.016194331983806, + "grad_norm": 1.6035230575875041, + "learning_rate": 9.691170642611975e-06, + "loss": 2.0771, + "step": 498 + }, + { + "epoch": 2.020242914979757, + "grad_norm": 1.4671035377471024, + "learning_rate": 9.68872107730388e-06, + "loss": 2.3311, + "step": 499 + }, + { + "epoch": 2.0242914979757085, + "grad_norm": 1.5075955512152057, + "learning_rate": 9.686262147699507e-06, + "loss": 2.2077, + "step": 500 + }, + { + "epoch": 2.02834008097166, + "grad_norm": 1.5639916261560791, + "learning_rate": 9.683793858709821e-06, + "loss": 1.8546, + "step": 501 + }, + { + "epoch": 2.032388663967611, + "grad_norm": 1.5331421353363675, + "learning_rate": 9.681316215264481e-06, + "loss": 1.9004, + "step": 502 + }, + { + "epoch": 2.0364372469635628, + "grad_norm": 1.4656462364511347, + "learning_rate": 9.678829222311827e-06, + "loss": 2.1369, + "step": 503 + }, + { + "epoch": 2.0404858299595143, + "grad_norm": 1.7055289856989309, + "learning_rate": 9.67633288481887e-06, + "loss": 1.9294, + "step": 504 + }, + { + "epoch": 2.0445344129554655, + "grad_norm": 1.3320529357395552, + "learning_rate": 9.67382720777129e-06, + "loss": 1.9228, + "step": 505 + }, + { + "epoch": 2.048582995951417, + "grad_norm": 1.378485994628673, + "learning_rate": 9.671312196173413e-06, + "loss": 1.9005, + "step": 506 + }, + { + "epoch": 2.0526315789473686, + "grad_norm": 1.4519006220083899, + "learning_rate": 9.668787855048209e-06, + "loss": 2.0772, + "step": 507 + }, + { + "epoch": 2.0566801619433197, + "grad_norm": 1.46960243337033, + "learning_rate": 9.666254189437286e-06, + "loss": 1.9259, + "step": 508 + }, + { + 
"epoch": 2.0607287449392713, + "grad_norm": 1.3018755932293484, + "learning_rate": 9.663711204400872e-06, + "loss": 2.0637, + "step": 509 + }, + { + "epoch": 2.064777327935223, + "grad_norm": 1.4438151108336905, + "learning_rate": 9.661158905017804e-06, + "loss": 1.9998, + "step": 510 + }, + { + "epoch": 2.068825910931174, + "grad_norm": 1.5146888645164116, + "learning_rate": 9.658597296385527e-06, + "loss": 2.1032, + "step": 511 + }, + { + "epoch": 2.0728744939271255, + "grad_norm": 1.4173605487062464, + "learning_rate": 9.656026383620076e-06, + "loss": 1.9957, + "step": 512 + }, + { + "epoch": 2.076923076923077, + "grad_norm": 1.3186505882274318, + "learning_rate": 9.653446171856069e-06, + "loss": 1.9291, + "step": 513 + }, + { + "epoch": 2.080971659919028, + "grad_norm": 1.2929004725593367, + "learning_rate": 9.650856666246693e-06, + "loss": 1.9435, + "step": 514 + }, + { + "epoch": 2.08502024291498, + "grad_norm": 1.2511951269635655, + "learning_rate": 9.6482578719637e-06, + "loss": 1.9267, + "step": 515 + }, + { + "epoch": 2.0890688259109313, + "grad_norm": 1.9429673192553882, + "learning_rate": 9.645649794197394e-06, + "loss": 1.9435, + "step": 516 + }, + { + "epoch": 2.0931174089068825, + "grad_norm": 1.315419932054697, + "learning_rate": 9.643032438156616e-06, + "loss": 2.0396, + "step": 517 + }, + { + "epoch": 2.097165991902834, + "grad_norm": 1.3284199817957691, + "learning_rate": 9.640405809068743e-06, + "loss": 1.765, + "step": 518 + }, + { + "epoch": 2.1012145748987856, + "grad_norm": 1.4032585852247357, + "learning_rate": 9.637769912179664e-06, + "loss": 1.9292, + "step": 519 + }, + { + "epoch": 2.1052631578947367, + "grad_norm": 1.4202061741742247, + "learning_rate": 9.635124752753787e-06, + "loss": 1.9832, + "step": 520 + }, + { + "epoch": 2.1093117408906883, + "grad_norm": 1.4962037346644237, + "learning_rate": 9.632470336074009e-06, + "loss": 1.8461, + "step": 521 + }, + { + "epoch": 2.11336032388664, + "grad_norm": 1.829451958189404, + "learning_rate": 9.629806667441727e-06, + "loss": 1.7856, + "step": 522 + }, + { + "epoch": 2.117408906882591, + "grad_norm": 1.6374878381545, + "learning_rate": 9.627133752176809e-06, + "loss": 1.7441, + "step": 523 + }, + { + "epoch": 2.1214574898785425, + "grad_norm": 1.4010819404830996, + "learning_rate": 9.624451595617588e-06, + "loss": 1.7615, + "step": 524 + }, + { + "epoch": 2.125506072874494, + "grad_norm": 1.441999234959946, + "learning_rate": 9.62176020312086e-06, + "loss": 1.7378, + "step": 525 + }, + { + "epoch": 2.1295546558704452, + "grad_norm": 1.5770630911097265, + "learning_rate": 9.619059580061862e-06, + "loss": 1.7039, + "step": 526 + }, + { + "epoch": 2.133603238866397, + "grad_norm": 1.4591597594445938, + "learning_rate": 9.616349731834271e-06, + "loss": 2.0009, + "step": 527 + }, + { + "epoch": 2.1376518218623484, + "grad_norm": 1.6179185626843804, + "learning_rate": 9.613630663850184e-06, + "loss": 1.872, + "step": 528 + }, + { + "epoch": 2.1417004048582995, + "grad_norm": 1.3086175576058332, + "learning_rate": 9.610902381540115e-06, + "loss": 1.5977, + "step": 529 + }, + { + "epoch": 2.145748987854251, + "grad_norm": 1.444761778117532, + "learning_rate": 9.608164890352977e-06, + "loss": 2.0221, + "step": 530 + }, + { + "epoch": 2.1497975708502026, + "grad_norm": 1.4113693951603745, + "learning_rate": 9.605418195756077e-06, + "loss": 1.8497, + "step": 531 + }, + { + "epoch": 2.1538461538461537, + "grad_norm": 1.2987083078720463, + "learning_rate": 9.602662303235106e-06, + "loss": 1.9881, + "step": 532 + }, + { + 
"epoch": 2.1578947368421053, + "grad_norm": 1.5356679778352307, + "learning_rate": 9.599897218294122e-06, + "loss": 2.2169, + "step": 533 + }, + { + "epoch": 2.161943319838057, + "grad_norm": 1.2586253730389827, + "learning_rate": 9.597122946455539e-06, + "loss": 1.8884, + "step": 534 + }, + { + "epoch": 2.165991902834008, + "grad_norm": 1.3241548388576752, + "learning_rate": 9.594339493260127e-06, + "loss": 1.9169, + "step": 535 + }, + { + "epoch": 2.1700404858299596, + "grad_norm": 3.3161848122832627, + "learning_rate": 9.591546864266983e-06, + "loss": 2.3116, + "step": 536 + }, + { + "epoch": 2.174089068825911, + "grad_norm": 1.2785252284615238, + "learning_rate": 9.58874506505354e-06, + "loss": 1.7854, + "step": 537 + }, + { + "epoch": 2.1781376518218623, + "grad_norm": 1.4062987764786141, + "learning_rate": 9.58593410121554e-06, + "loss": 1.9564, + "step": 538 + }, + { + "epoch": 2.182186234817814, + "grad_norm": 1.1858759757574733, + "learning_rate": 9.583113978367026e-06, + "loss": 1.7449, + "step": 539 + }, + { + "epoch": 2.1862348178137654, + "grad_norm": 1.4958289357631562, + "learning_rate": 9.580284702140342e-06, + "loss": 1.8748, + "step": 540 + }, + { + "epoch": 2.1902834008097165, + "grad_norm": 1.271888181605562, + "learning_rate": 9.577446278186103e-06, + "loss": 1.944, + "step": 541 + }, + { + "epoch": 2.194331983805668, + "grad_norm": 1.6297569109832326, + "learning_rate": 9.574598712173202e-06, + "loss": 2.1136, + "step": 542 + }, + { + "epoch": 2.1983805668016196, + "grad_norm": 1.7294919253670684, + "learning_rate": 9.571742009788787e-06, + "loss": 2.1866, + "step": 543 + }, + { + "epoch": 2.2024291497975708, + "grad_norm": 1.5317790321439353, + "learning_rate": 9.568876176738251e-06, + "loss": 1.8859, + "step": 544 + }, + { + "epoch": 2.2064777327935223, + "grad_norm": 1.711554028884214, + "learning_rate": 9.56600121874523e-06, + "loss": 2.1936, + "step": 545 + }, + { + "epoch": 2.2105263157894735, + "grad_norm": 1.4435460877228636, + "learning_rate": 9.563117141551574e-06, + "loss": 2.2517, + "step": 546 + }, + { + "epoch": 2.214574898785425, + "grad_norm": 1.4961050962412457, + "learning_rate": 9.560223950917354e-06, + "loss": 2.041, + "step": 547 + }, + { + "epoch": 2.2186234817813766, + "grad_norm": 1.3247670963766616, + "learning_rate": 9.557321652620839e-06, + "loss": 1.8986, + "step": 548 + }, + { + "epoch": 2.2226720647773277, + "grad_norm": 1.4724998096864195, + "learning_rate": 9.554410252458489e-06, + "loss": 1.8568, + "step": 549 + }, + { + "epoch": 2.2267206477732793, + "grad_norm": 3.7991275518186196, + "learning_rate": 9.551489756244939e-06, + "loss": 1.9347, + "step": 550 + }, + { + "epoch": 2.230769230769231, + "grad_norm": 1.4010848185779328, + "learning_rate": 9.548560169812997e-06, + "loss": 1.8809, + "step": 551 + }, + { + "epoch": 2.234817813765182, + "grad_norm": 1.6221348693259423, + "learning_rate": 9.54562149901362e-06, + "loss": 2.0865, + "step": 552 + }, + { + "epoch": 2.2388663967611335, + "grad_norm": 1.4196865192753882, + "learning_rate": 9.54267374971591e-06, + "loss": 2.0449, + "step": 553 + }, + { + "epoch": 2.242914979757085, + "grad_norm": 1.4599787722592332, + "learning_rate": 9.539716927807102e-06, + "loss": 2.0083, + "step": 554 + }, + { + "epoch": 2.246963562753036, + "grad_norm": 1.251605201082177, + "learning_rate": 9.536751039192549e-06, + "loss": 1.8576, + "step": 555 + }, + { + "epoch": 2.251012145748988, + "grad_norm": 1.30407928376828, + "learning_rate": 9.533776089795712e-06, + "loss": 1.8923, + "step": 556 + }, + { + 
"epoch": 2.2550607287449393, + "grad_norm": 1.4348421622864604, + "learning_rate": 9.530792085558151e-06, + "loss": 1.9873, + "step": 557 + }, + { + "epoch": 2.2591093117408905, + "grad_norm": 1.4429474918555736, + "learning_rate": 9.527799032439506e-06, + "loss": 1.8211, + "step": 558 + }, + { + "epoch": 2.263157894736842, + "grad_norm": 1.338584745094179, + "learning_rate": 9.524796936417495e-06, + "loss": 1.7082, + "step": 559 + }, + { + "epoch": 2.2672064777327936, + "grad_norm": 1.329824996124572, + "learning_rate": 9.521785803487888e-06, + "loss": 1.9216, + "step": 560 + }, + { + "epoch": 2.2712550607287447, + "grad_norm": 1.3374675078915148, + "learning_rate": 9.518765639664512e-06, + "loss": 1.9723, + "step": 561 + }, + { + "epoch": 2.2753036437246963, + "grad_norm": 1.4689345418902104, + "learning_rate": 9.515736450979224e-06, + "loss": 1.953, + "step": 562 + }, + { + "epoch": 2.279352226720648, + "grad_norm": 1.6439512327159642, + "learning_rate": 9.512698243481914e-06, + "loss": 1.991, + "step": 563 + }, + { + "epoch": 2.283400809716599, + "grad_norm": 1.5280266119657933, + "learning_rate": 9.509651023240472e-06, + "loss": 1.7088, + "step": 564 + }, + { + "epoch": 2.2874493927125505, + "grad_norm": 1.5234607385845351, + "learning_rate": 9.5065947963408e-06, + "loss": 1.7975, + "step": 565 + }, + { + "epoch": 2.291497975708502, + "grad_norm": 1.4898313464385229, + "learning_rate": 9.50352956888678e-06, + "loss": 1.6643, + "step": 566 + }, + { + "epoch": 2.2955465587044532, + "grad_norm": 1.5049004900957001, + "learning_rate": 9.500455347000273e-06, + "loss": 2.0078, + "step": 567 + }, + { + "epoch": 2.299595141700405, + "grad_norm": 1.5268023276941818, + "learning_rate": 9.497372136821103e-06, + "loss": 2.0653, + "step": 568 + }, + { + "epoch": 2.3036437246963564, + "grad_norm": 1.5293343920918272, + "learning_rate": 9.49427994450705e-06, + "loss": 1.8078, + "step": 569 + }, + { + "epoch": 2.3076923076923075, + "grad_norm": 1.504441993367853, + "learning_rate": 9.491178776233825e-06, + "loss": 1.8219, + "step": 570 + }, + { + "epoch": 2.311740890688259, + "grad_norm": 1.3604060927952581, + "learning_rate": 9.488068638195072e-06, + "loss": 1.8582, + "step": 571 + }, + { + "epoch": 2.3157894736842106, + "grad_norm": 1.7336288728624165, + "learning_rate": 9.484949536602343e-06, + "loss": 1.8562, + "step": 572 + }, + { + "epoch": 2.3198380566801617, + "grad_norm": 1.536212130823414, + "learning_rate": 9.481821477685102e-06, + "loss": 1.7431, + "step": 573 + }, + { + "epoch": 2.3238866396761133, + "grad_norm": 1.4120913757834546, + "learning_rate": 9.478684467690693e-06, + "loss": 1.7586, + "step": 574 + }, + { + "epoch": 2.327935222672065, + "grad_norm": 1.453958520209467, + "learning_rate": 9.47553851288434e-06, + "loss": 1.7694, + "step": 575 + }, + { + "epoch": 2.331983805668016, + "grad_norm": 1.3935000424019952, + "learning_rate": 9.472383619549133e-06, + "loss": 1.6545, + "step": 576 + }, + { + "epoch": 2.3360323886639676, + "grad_norm": 1.3589610652505588, + "learning_rate": 9.469219793986016e-06, + "loss": 1.6896, + "step": 577 + }, + { + "epoch": 2.340080971659919, + "grad_norm": 1.7566987829139051, + "learning_rate": 9.466047042513767e-06, + "loss": 1.6272, + "step": 578 + }, + { + "epoch": 2.3441295546558703, + "grad_norm": 1.3287178155779462, + "learning_rate": 9.462865371468994e-06, + "loss": 1.7176, + "step": 579 + }, + { + "epoch": 2.348178137651822, + "grad_norm": 1.8490808825118674, + "learning_rate": 9.459674787206117e-06, + "loss": 1.9005, + "step": 580 + }, + { + 
"epoch": 2.3522267206477734, + "grad_norm": 1.8200114285326863, + "learning_rate": 9.45647529609736e-06, + "loss": 1.7493, + "step": 581 + }, + { + "epoch": 2.3562753036437245, + "grad_norm": 1.7944997812037724, + "learning_rate": 9.453266904532737e-06, + "loss": 1.4856, + "step": 582 + }, + { + "epoch": 2.360323886639676, + "grad_norm": 1.6449884777915886, + "learning_rate": 9.450049618920034e-06, + "loss": 1.8312, + "step": 583 + }, + { + "epoch": 2.3643724696356276, + "grad_norm": 1.6009358010430617, + "learning_rate": 9.4468234456848e-06, + "loss": 1.9048, + "step": 584 + }, + { + "epoch": 2.3684210526315788, + "grad_norm": 1.519230320705593, + "learning_rate": 9.44358839127034e-06, + "loss": 1.8077, + "step": 585 + }, + { + "epoch": 2.3724696356275303, + "grad_norm": 1.8694258750708748, + "learning_rate": 9.44034446213769e-06, + "loss": 1.9556, + "step": 586 + }, + { + "epoch": 2.376518218623482, + "grad_norm": 1.4302907644008036, + "learning_rate": 9.437091664765611e-06, + "loss": 1.5064, + "step": 587 + }, + { + "epoch": 2.380566801619433, + "grad_norm": 1.5423881317930213, + "learning_rate": 9.433830005650582e-06, + "loss": 1.69, + "step": 588 + }, + { + "epoch": 2.3846153846153846, + "grad_norm": 1.4747017336722326, + "learning_rate": 9.430559491306777e-06, + "loss": 1.5552, + "step": 589 + }, + { + "epoch": 2.388663967611336, + "grad_norm": 1.600482934018078, + "learning_rate": 9.427280128266049e-06, + "loss": 1.6106, + "step": 590 + }, + { + "epoch": 2.3927125506072873, + "grad_norm": 1.5014148151060753, + "learning_rate": 9.423991923077938e-06, + "loss": 1.7636, + "step": 591 + }, + { + "epoch": 2.396761133603239, + "grad_norm": 1.7672182274084831, + "learning_rate": 9.420694882309628e-06, + "loss": 1.5786, + "step": 592 + }, + { + "epoch": 2.4008097165991904, + "grad_norm": 1.440572594457583, + "learning_rate": 9.41738901254596e-06, + "loss": 1.6426, + "step": 593 + }, + { + "epoch": 2.4048582995951415, + "grad_norm": 1.5625132261883155, + "learning_rate": 9.414074320389403e-06, + "loss": 1.7306, + "step": 594 + }, + { + "epoch": 2.408906882591093, + "grad_norm": 1.683823244071828, + "learning_rate": 9.41075081246005e-06, + "loss": 1.821, + "step": 595 + }, + { + "epoch": 2.4129554655870447, + "grad_norm": 1.4314599370281114, + "learning_rate": 9.4074184953956e-06, + "loss": 1.6872, + "step": 596 + }, + { + "epoch": 2.417004048582996, + "grad_norm": 1.5657957134872598, + "learning_rate": 9.404077375851338e-06, + "loss": 1.9362, + "step": 597 + }, + { + "epoch": 2.4210526315789473, + "grad_norm": 1.6198467768431548, + "learning_rate": 9.400727460500141e-06, + "loss": 2.0139, + "step": 598 + }, + { + "epoch": 2.425101214574899, + "grad_norm": 1.4103077055466628, + "learning_rate": 9.397368756032445e-06, + "loss": 1.8485, + "step": 599 + }, + { + "epoch": 2.42914979757085, + "grad_norm": 1.3471173889103276, + "learning_rate": 9.394001269156245e-06, + "loss": 1.9812, + "step": 600 + }, + { + "epoch": 2.4331983805668016, + "grad_norm": 1.4234064588511484, + "learning_rate": 9.39062500659707e-06, + "loss": 2.0496, + "step": 601 + }, + { + "epoch": 2.437246963562753, + "grad_norm": 1.4784926767119206, + "learning_rate": 9.38723997509798e-06, + "loss": 1.837, + "step": 602 + }, + { + "epoch": 2.4412955465587043, + "grad_norm": 1.5518065193263646, + "learning_rate": 9.383846181419547e-06, + "loss": 1.765, + "step": 603 + }, + { + "epoch": 2.445344129554656, + "grad_norm": 1.3196666479973478, + "learning_rate": 9.380443632339845e-06, + "loss": 2.0255, + "step": 604 + }, + { + "epoch": 
2.4493927125506074, + "grad_norm": 1.440061740597458, + "learning_rate": 9.37703233465443e-06, + "loss": 1.7942, + "step": 605 + }, + { + "epoch": 2.4534412955465585, + "grad_norm": 1.5327759577164166, + "learning_rate": 9.373612295176333e-06, + "loss": 1.777, + "step": 606 + }, + { + "epoch": 2.45748987854251, + "grad_norm": 1.6814358499503075, + "learning_rate": 9.370183520736045e-06, + "loss": 2.185, + "step": 607 + }, + { + "epoch": 2.4615384615384617, + "grad_norm": 2.30393335895373, + "learning_rate": 9.366746018181503e-06, + "loss": 2.4563, + "step": 608 + }, + { + "epoch": 2.465587044534413, + "grad_norm": 1.8584859443814368, + "learning_rate": 9.363299794378072e-06, + "loss": 2.0155, + "step": 609 + }, + { + "epoch": 2.4696356275303644, + "grad_norm": 1.2803493212403667, + "learning_rate": 9.359844856208538e-06, + "loss": 1.9623, + "step": 610 + }, + { + "epoch": 2.473684210526316, + "grad_norm": 1.323092986933548, + "learning_rate": 9.356381210573092e-06, + "loss": 1.8725, + "step": 611 + }, + { + "epoch": 2.477732793522267, + "grad_norm": 1.716120944564361, + "learning_rate": 9.352908864389313e-06, + "loss": 1.9058, + "step": 612 + }, + { + "epoch": 2.4817813765182186, + "grad_norm": 1.1767574227433577, + "learning_rate": 9.349427824592157e-06, + "loss": 1.818, + "step": 613 + }, + { + "epoch": 2.48582995951417, + "grad_norm": 1.8646580879242294, + "learning_rate": 9.345938098133946e-06, + "loss": 1.8001, + "step": 614 + }, + { + "epoch": 2.4898785425101213, + "grad_norm": 1.7755724904128214, + "learning_rate": 9.342439691984346e-06, + "loss": 1.7282, + "step": 615 + }, + { + "epoch": 2.493927125506073, + "grad_norm": 1.7352293901651843, + "learning_rate": 9.338932613130363e-06, + "loss": 1.7961, + "step": 616 + }, + { + "epoch": 2.4979757085020244, + "grad_norm": 1.6153408388514847, + "learning_rate": 9.33541686857632e-06, + "loss": 1.662, + "step": 617 + }, + { + "epoch": 2.5020242914979756, + "grad_norm": 1.5099283023047843, + "learning_rate": 9.331892465343851e-06, + "loss": 1.588, + "step": 618 + }, + { + "epoch": 2.506072874493927, + "grad_norm": 1.730183741035281, + "learning_rate": 9.328359410471878e-06, + "loss": 1.8722, + "step": 619 + }, + { + "epoch": 2.5101214574898787, + "grad_norm": 1.7321761047223487, + "learning_rate": 9.324817711016609e-06, + "loss": 1.9167, + "step": 620 + }, + { + "epoch": 2.51417004048583, + "grad_norm": 1.2095836589724516, + "learning_rate": 9.32126737405151e-06, + "loss": 1.8743, + "step": 621 + }, + { + "epoch": 2.5182186234817814, + "grad_norm": 1.5485434750214813, + "learning_rate": 9.3177084066673e-06, + "loss": 1.89, + "step": 622 + }, + { + "epoch": 2.522267206477733, + "grad_norm": 1.5145693598054688, + "learning_rate": 9.31414081597194e-06, + "loss": 1.8321, + "step": 623 + }, + { + "epoch": 2.526315789473684, + "grad_norm": 1.8660173525702701, + "learning_rate": 9.310564609090605e-06, + "loss": 1.6178, + "step": 624 + }, + { + "epoch": 2.5303643724696356, + "grad_norm": 1.9092894315915314, + "learning_rate": 9.306979793165682e-06, + "loss": 1.718, + "step": 625 + }, + { + "epoch": 2.534412955465587, + "grad_norm": 2.1574694273419817, + "learning_rate": 9.303386375356752e-06, + "loss": 1.8536, + "step": 626 + }, + { + "epoch": 2.5384615384615383, + "grad_norm": 1.5187220263251169, + "learning_rate": 9.299784362840578e-06, + "loss": 2.0088, + "step": 627 + }, + { + "epoch": 2.54251012145749, + "grad_norm": 1.3524410374053388, + "learning_rate": 9.296173762811084e-06, + "loss": 1.8993, + "step": 628 + }, + { + "epoch": 
2.5465587044534415, + "grad_norm": 3.8294272400161993, + "learning_rate": 9.292554582479349e-06, + "loss": 2.3583, + "step": 629 + }, + { + "epoch": 2.5506072874493926, + "grad_norm": 6.070012543144345, + "learning_rate": 9.288926829073583e-06, + "loss": 2.4906, + "step": 630 + }, + { + "epoch": 2.554655870445344, + "grad_norm": 5.603752988478888, + "learning_rate": 9.285290509839126e-06, + "loss": 2.7822, + "step": 631 + }, + { + "epoch": 2.5587044534412957, + "grad_norm": 1.4481838054717586, + "learning_rate": 9.281645632038417e-06, + "loss": 1.8168, + "step": 632 + }, + { + "epoch": 2.562753036437247, + "grad_norm": 1.414449313894791, + "learning_rate": 9.277992202950996e-06, + "loss": 1.7136, + "step": 633 + }, + { + "epoch": 2.5668016194331984, + "grad_norm": 1.4634757861687506, + "learning_rate": 9.274330229873474e-06, + "loss": 2.0032, + "step": 634 + }, + { + "epoch": 2.57085020242915, + "grad_norm": 1.484422105707642, + "learning_rate": 9.270659720119533e-06, + "loss": 1.6359, + "step": 635 + }, + { + "epoch": 2.574898785425101, + "grad_norm": 1.4574650651898802, + "learning_rate": 9.266980681019902e-06, + "loss": 1.9962, + "step": 636 + }, + { + "epoch": 2.5789473684210527, + "grad_norm": 1.2408661225828688, + "learning_rate": 9.263293119922341e-06, + "loss": 1.7137, + "step": 637 + }, + { + "epoch": 2.582995951417004, + "grad_norm": 1.4397062187160998, + "learning_rate": 9.259597044191635e-06, + "loss": 1.9567, + "step": 638 + }, + { + "epoch": 2.5870445344129553, + "grad_norm": 1.3678454147168124, + "learning_rate": 9.255892461209574e-06, + "loss": 1.8607, + "step": 639 + }, + { + "epoch": 2.591093117408907, + "grad_norm": 1.51295578810032, + "learning_rate": 9.252179378374937e-06, + "loss": 1.8423, + "step": 640 + }, + { + "epoch": 2.5951417004048585, + "grad_norm": 1.493191888596024, + "learning_rate": 9.248457803103476e-06, + "loss": 1.5365, + "step": 641 + }, + { + "epoch": 2.5991902834008096, + "grad_norm": 1.4402174802959915, + "learning_rate": 9.24472774282791e-06, + "loss": 1.5837, + "step": 642 + }, + { + "epoch": 2.603238866396761, + "grad_norm": 1.3814570168249611, + "learning_rate": 9.240989204997903e-06, + "loss": 1.7433, + "step": 643 + }, + { + "epoch": 2.6072874493927127, + "grad_norm": 1.4229224856881553, + "learning_rate": 9.237242197080045e-06, + "loss": 1.6373, + "step": 644 + }, + { + "epoch": 2.611336032388664, + "grad_norm": 1.529255344732051, + "learning_rate": 9.23348672655785e-06, + "loss": 1.9638, + "step": 645 + }, + { + "epoch": 2.6153846153846154, + "grad_norm": 1.2990811736528833, + "learning_rate": 9.229722800931727e-06, + "loss": 1.8372, + "step": 646 + }, + { + "epoch": 2.619433198380567, + "grad_norm": 1.7287958707975635, + "learning_rate": 9.225950427718974e-06, + "loss": 1.665, + "step": 647 + }, + { + "epoch": 2.623481781376518, + "grad_norm": 1.631936855970988, + "learning_rate": 9.222169614453765e-06, + "loss": 2.052, + "step": 648 + }, + { + "epoch": 2.6275303643724697, + "grad_norm": 1.384358037456477, + "learning_rate": 9.21838036868712e-06, + "loss": 1.8437, + "step": 649 + }, + { + "epoch": 2.6315789473684212, + "grad_norm": 1.57010881393224, + "learning_rate": 9.21458269798691e-06, + "loss": 2.0542, + "step": 650 + }, + { + "epoch": 2.6356275303643724, + "grad_norm": 1.4074541953077098, + "learning_rate": 9.21077660993783e-06, + "loss": 1.7342, + "step": 651 + }, + { + "epoch": 2.639676113360324, + "grad_norm": 1.6189308816605772, + "learning_rate": 9.206962112141382e-06, + "loss": 1.9321, + "step": 652 + }, + { + "epoch": 
2.6437246963562755, + "grad_norm": 1.4090618348929758, + "learning_rate": 9.203139212215868e-06, + "loss": 1.871, + "step": 653 + }, + { + "epoch": 2.6477732793522266, + "grad_norm": 1.9494105407548425, + "learning_rate": 9.199307917796371e-06, + "loss": 1.8667, + "step": 654 + }, + { + "epoch": 2.651821862348178, + "grad_norm": 1.4331583331274316, + "learning_rate": 9.195468236534734e-06, + "loss": 1.7255, + "step": 655 + }, + { + "epoch": 2.6558704453441297, + "grad_norm": 1.5909315996217737, + "learning_rate": 9.191620176099559e-06, + "loss": 1.9444, + "step": 656 + }, + { + "epoch": 2.659919028340081, + "grad_norm": 1.7461445494408216, + "learning_rate": 9.187763744176175e-06, + "loss": 1.7728, + "step": 657 + }, + { + "epoch": 2.6639676113360324, + "grad_norm": 1.422126938114325, + "learning_rate": 9.183898948466633e-06, + "loss": 1.9077, + "step": 658 + }, + { + "epoch": 2.668016194331984, + "grad_norm": 1.4144043249974336, + "learning_rate": 9.180025796689692e-06, + "loss": 1.9331, + "step": 659 + }, + { + "epoch": 2.672064777327935, + "grad_norm": 2.7772861017132255, + "learning_rate": 9.176144296580794e-06, + "loss": 1.8667, + "step": 660 + }, + { + "epoch": 2.6761133603238867, + "grad_norm": 1.3064807850177453, + "learning_rate": 9.172254455892054e-06, + "loss": 1.8187, + "step": 661 + }, + { + "epoch": 2.6801619433198383, + "grad_norm": 1.7419083953095058, + "learning_rate": 9.168356282392253e-06, + "loss": 1.903, + "step": 662 + }, + { + "epoch": 2.6842105263157894, + "grad_norm": 1.4496863008780128, + "learning_rate": 9.164449783866802e-06, + "loss": 1.7048, + "step": 663 + }, + { + "epoch": 2.688259109311741, + "grad_norm": 1.491984655358695, + "learning_rate": 9.160534968117752e-06, + "loss": 1.8734, + "step": 664 + }, + { + "epoch": 2.6923076923076925, + "grad_norm": 1.5308194782439823, + "learning_rate": 9.156611842963753e-06, + "loss": 1.8788, + "step": 665 + }, + { + "epoch": 2.6963562753036436, + "grad_norm": 1.3476877228875297, + "learning_rate": 9.152680416240059e-06, + "loss": 1.7147, + "step": 666 + }, + { + "epoch": 2.700404858299595, + "grad_norm": 1.8151640153934792, + "learning_rate": 9.1487406957985e-06, + "loss": 2.2048, + "step": 667 + }, + { + "epoch": 2.7044534412955468, + "grad_norm": 1.7628995278188238, + "learning_rate": 9.144792689507471e-06, + "loss": 1.9635, + "step": 668 + }, + { + "epoch": 2.708502024291498, + "grad_norm": 1.602921120835359, + "learning_rate": 9.140836405251917e-06, + "loss": 1.5744, + "step": 669 + }, + { + "epoch": 2.7125506072874495, + "grad_norm": 1.490856129715411, + "learning_rate": 9.136871850933312e-06, + "loss": 1.7612, + "step": 670 + }, + { + "epoch": 2.716599190283401, + "grad_norm": 1.4382592619602368, + "learning_rate": 9.132899034469648e-06, + "loss": 1.8414, + "step": 671 + }, + { + "epoch": 2.720647773279352, + "grad_norm": 1.8014041637984994, + "learning_rate": 9.128917963795422e-06, + "loss": 1.7066, + "step": 672 + }, + { + "epoch": 2.7246963562753037, + "grad_norm": 1.7582254633750898, + "learning_rate": 9.124928646861613e-06, + "loss": 1.7925, + "step": 673 + }, + { + "epoch": 2.7287449392712553, + "grad_norm": 1.6343159265633571, + "learning_rate": 9.120931091635669e-06, + "loss": 1.9923, + "step": 674 + }, + { + "epoch": 2.7327935222672064, + "grad_norm": 1.3849537338720197, + "learning_rate": 9.116925306101494e-06, + "loss": 1.858, + "step": 675 + }, + { + "epoch": 2.736842105263158, + "grad_norm": 1.5938145614524974, + "learning_rate": 9.112911298259426e-06, + "loss": 1.8935, + "step": 676 + }, + { + 
"epoch": 2.7408906882591095, + "grad_norm": 2.232137755564454, + "learning_rate": 9.108889076126226e-06, + "loss": 2.5611, + "step": 677 + }, + { + "epoch": 2.7449392712550607, + "grad_norm": 1.597451641610388, + "learning_rate": 9.104858647735065e-06, + "loss": 1.9346, + "step": 678 + }, + { + "epoch": 2.748987854251012, + "grad_norm": 1.734843462936045, + "learning_rate": 9.100820021135495e-06, + "loss": 1.7738, + "step": 679 + }, + { + "epoch": 2.753036437246964, + "grad_norm": 1.5432674907856907, + "learning_rate": 9.09677320439345e-06, + "loss": 1.6451, + "step": 680 + }, + { + "epoch": 2.757085020242915, + "grad_norm": 1.4375865005427824, + "learning_rate": 9.092718205591213e-06, + "loss": 1.8788, + "step": 681 + }, + { + "epoch": 2.7611336032388665, + "grad_norm": 3.7437865438416433, + "learning_rate": 9.088655032827418e-06, + "loss": 2.6938, + "step": 682 + }, + { + "epoch": 2.765182186234818, + "grad_norm": 6.350052687447943, + "learning_rate": 9.084583694217012e-06, + "loss": 2.5299, + "step": 683 + }, + { + "epoch": 2.769230769230769, + "grad_norm": 4.945671727596882, + "learning_rate": 9.080504197891262e-06, + "loss": 2.4088, + "step": 684 + }, + { + "epoch": 2.7732793522267207, + "grad_norm": 1.6795835965091561, + "learning_rate": 9.076416551997721e-06, + "loss": 1.824, + "step": 685 + }, + { + "epoch": 2.7773279352226723, + "grad_norm": 1.5949270953831338, + "learning_rate": 9.072320764700223e-06, + "loss": 2.0511, + "step": 686 + }, + { + "epoch": 2.7813765182186234, + "grad_norm": 1.4556536124547252, + "learning_rate": 9.068216844178857e-06, + "loss": 2.0932, + "step": 687 + }, + { + "epoch": 2.785425101214575, + "grad_norm": 1.6439876597132232, + "learning_rate": 9.064104798629955e-06, + "loss": 1.8796, + "step": 688 + }, + { + "epoch": 2.7894736842105265, + "grad_norm": 1.4368651555210203, + "learning_rate": 9.059984636266082e-06, + "loss": 1.7757, + "step": 689 + }, + { + "epoch": 2.7935222672064777, + "grad_norm": 1.6510465877279545, + "learning_rate": 9.055856365316012e-06, + "loss": 1.9039, + "step": 690 + }, + { + "epoch": 2.7975708502024292, + "grad_norm": 1.5313446048549542, + "learning_rate": 9.051719994024711e-06, + "loss": 1.9171, + "step": 691 + }, + { + "epoch": 2.801619433198381, + "grad_norm": 1.5880262025571767, + "learning_rate": 9.047575530653324e-06, + "loss": 1.6852, + "step": 692 + }, + { + "epoch": 2.805668016194332, + "grad_norm": 1.4675446257129918, + "learning_rate": 9.043422983479158e-06, + "loss": 1.5727, + "step": 693 + }, + { + "epoch": 2.8097165991902835, + "grad_norm": 1.6282110219820332, + "learning_rate": 9.039262360795664e-06, + "loss": 1.9079, + "step": 694 + }, + { + "epoch": 2.813765182186235, + "grad_norm": 1.9452631088170542, + "learning_rate": 9.035093670912424e-06, + "loss": 1.9093, + "step": 695 + }, + { + "epoch": 2.817813765182186, + "grad_norm": 1.6299011643761043, + "learning_rate": 9.03091692215513e-06, + "loss": 1.6569, + "step": 696 + }, + { + "epoch": 2.8218623481781377, + "grad_norm": 7.734091901664539, + "learning_rate": 9.026732122865567e-06, + "loss": 2.4758, + "step": 697 + }, + { + "epoch": 2.8259109311740893, + "grad_norm": 18.1486281089367, + "learning_rate": 9.022539281401601e-06, + "loss": 3.9379, + "step": 698 + }, + { + "epoch": 2.8299595141700404, + "grad_norm": 1.7406474445735873, + "learning_rate": 9.01833840613716e-06, + "loss": 1.7599, + "step": 699 + }, + { + "epoch": 2.834008097165992, + "grad_norm": 1.7079549569427872, + "learning_rate": 9.014129505462217e-06, + "loss": 1.6112, + "step": 700 + }, + { + 
"epoch": 2.8380566801619436, + "grad_norm": 1.5492178198371753, + "learning_rate": 9.009912587782772e-06, + "loss": 1.719, + "step": 701 + }, + { + "epoch": 2.8421052631578947, + "grad_norm": 1.5966963855692302, + "learning_rate": 9.005687661520838e-06, + "loss": 1.7237, + "step": 702 + }, + { + "epoch": 2.8461538461538463, + "grad_norm": 1.5738987901659376, + "learning_rate": 9.00145473511442e-06, + "loss": 1.6892, + "step": 703 + }, + { + "epoch": 2.850202429149798, + "grad_norm": 1.6008695127081995, + "learning_rate": 8.997213817017508e-06, + "loss": 1.7534, + "step": 704 + }, + { + "epoch": 2.854251012145749, + "grad_norm": 1.8027657159531043, + "learning_rate": 8.99296491570004e-06, + "loss": 1.8313, + "step": 705 + }, + { + "epoch": 2.8582995951417005, + "grad_norm": 1.388477920242152, + "learning_rate": 8.98870803964791e-06, + "loss": 1.7662, + "step": 706 + }, + { + "epoch": 2.862348178137652, + "grad_norm": 1.697508321391829, + "learning_rate": 8.984443197362938e-06, + "loss": 1.7739, + "step": 707 + }, + { + "epoch": 2.866396761133603, + "grad_norm": 1.7051210953826448, + "learning_rate": 8.980170397362846e-06, + "loss": 1.7885, + "step": 708 + }, + { + "epoch": 2.8704453441295548, + "grad_norm": 2.112476620801928, + "learning_rate": 8.975889648181258e-06, + "loss": 2.2786, + "step": 709 + }, + { + "epoch": 2.8744939271255063, + "grad_norm": 1.9686852205718806, + "learning_rate": 8.971600958367668e-06, + "loss": 2.2033, + "step": 710 + }, + { + "epoch": 2.8785425101214575, + "grad_norm": 1.8858645037099275, + "learning_rate": 8.96730433648743e-06, + "loss": 1.9747, + "step": 711 + }, + { + "epoch": 2.882591093117409, + "grad_norm": 1.629389176480098, + "learning_rate": 8.962999791121745e-06, + "loss": 1.8561, + "step": 712 + }, + { + "epoch": 2.8866396761133606, + "grad_norm": 1.7283481294339973, + "learning_rate": 8.958687330867634e-06, + "loss": 1.3887, + "step": 713 + }, + { + "epoch": 2.8906882591093117, + "grad_norm": 1.5884187879059617, + "learning_rate": 8.954366964337926e-06, + "loss": 1.8757, + "step": 714 + }, + { + "epoch": 2.8947368421052633, + "grad_norm": 1.5310621607610841, + "learning_rate": 8.950038700161239e-06, + "loss": 1.9746, + "step": 715 + }, + { + "epoch": 2.898785425101215, + "grad_norm": 1.4608377788624507, + "learning_rate": 8.94570254698197e-06, + "loss": 1.6592, + "step": 716 + }, + { + "epoch": 2.902834008097166, + "grad_norm": 1.5297317667519899, + "learning_rate": 8.941358513460264e-06, + "loss": 1.722, + "step": 717 + }, + { + "epoch": 2.9068825910931175, + "grad_norm": 1.847621037937598, + "learning_rate": 8.937006608272009e-06, + "loss": 1.9182, + "step": 718 + }, + { + "epoch": 2.910931174089069, + "grad_norm": 1.6585955176413567, + "learning_rate": 8.932646840108818e-06, + "loss": 1.4523, + "step": 719 + }, + { + "epoch": 2.91497975708502, + "grad_norm": 1.807939122311604, + "learning_rate": 8.928279217677999e-06, + "loss": 1.5928, + "step": 720 + }, + { + "epoch": 2.919028340080972, + "grad_norm": 1.6812175947881611, + "learning_rate": 8.923903749702556e-06, + "loss": 1.6197, + "step": 721 + }, + { + "epoch": 2.9230769230769234, + "grad_norm": 1.5868810975571848, + "learning_rate": 8.919520444921153e-06, + "loss": 1.9066, + "step": 722 + }, + { + "epoch": 2.9271255060728745, + "grad_norm": 2.008002647816905, + "learning_rate": 8.915129312088112e-06, + "loss": 1.7547, + "step": 723 + }, + { + "epoch": 2.931174089068826, + "grad_norm": 2.2074435698181185, + "learning_rate": 8.910730359973386e-06, + "loss": 1.7851, + "step": 724 + }, + { + 
"epoch": 2.9352226720647776, + "grad_norm": 1.6720121053555042, + "learning_rate": 8.906323597362547e-06, + "loss": 1.6173, + "step": 725 + }, + { + "epoch": 2.9392712550607287, + "grad_norm": 1.7840437064722243, + "learning_rate": 8.901909033056763e-06, + "loss": 1.5244, + "step": 726 + }, + { + "epoch": 2.9433198380566803, + "grad_norm": 2.087404813784654, + "learning_rate": 8.89748667587279e-06, + "loss": 1.8108, + "step": 727 + }, + { + "epoch": 2.9473684210526314, + "grad_norm": 1.7622420447448541, + "learning_rate": 8.893056534642938e-06, + "loss": 1.5553, + "step": 728 + }, + { + "epoch": 2.951417004048583, + "grad_norm": 1.9454050876073625, + "learning_rate": 8.88861861821507e-06, + "loss": 1.5518, + "step": 729 + }, + { + "epoch": 2.9554655870445345, + "grad_norm": 3.180217232768608, + "learning_rate": 8.88417293545258e-06, + "loss": 1.7772, + "step": 730 + }, + { + "epoch": 2.9595141700404857, + "grad_norm": 3.564301283270782, + "learning_rate": 8.879719495234363e-06, + "loss": 1.6766, + "step": 731 + }, + { + "epoch": 2.9635627530364372, + "grad_norm": 1.5385071245811799, + "learning_rate": 8.875258306454814e-06, + "loss": 1.7823, + "step": 732 + }, + { + "epoch": 2.967611336032389, + "grad_norm": 1.8013008659956586, + "learning_rate": 8.87078937802381e-06, + "loss": 2.0096, + "step": 733 + }, + { + "epoch": 2.97165991902834, + "grad_norm": 2.38933092267862, + "learning_rate": 8.866312718866669e-06, + "loss": 1.9226, + "step": 734 + }, + { + "epoch": 2.9757085020242915, + "grad_norm": 1.5349029688081202, + "learning_rate": 8.861828337924164e-06, + "loss": 1.7634, + "step": 735 + }, + { + "epoch": 2.979757085020243, + "grad_norm": 1.7807993217999074, + "learning_rate": 8.85733624415248e-06, + "loss": 1.862, + "step": 736 + }, + { + "epoch": 2.983805668016194, + "grad_norm": 1.6270967039867585, + "learning_rate": 8.852836446523213e-06, + "loss": 1.9281, + "step": 737 + }, + { + "epoch": 2.9878542510121457, + "grad_norm": 1.8692589473995715, + "learning_rate": 8.848328954023342e-06, + "loss": 1.7317, + "step": 738 + }, + { + "epoch": 2.9919028340080973, + "grad_norm": 1.5874083562158485, + "learning_rate": 8.843813775655211e-06, + "loss": 1.6635, + "step": 739 + }, + { + "epoch": 2.9959514170040484, + "grad_norm": 1.3707872942838146, + "learning_rate": 8.83929092043652e-06, + "loss": 1.9759, + "step": 740 + }, + { + "epoch": 3.0, + "grad_norm": 1.7529361765269527, + "learning_rate": 8.8347603974003e-06, + "loss": 1.7407, + "step": 741 + }, + { + "epoch": 3.0040485829959516, + "grad_norm": 1.4847998012230224, + "learning_rate": 8.83022221559489e-06, + "loss": 1.8183, + "step": 742 + }, + { + "epoch": 3.0080971659919027, + "grad_norm": 2.0727143325799453, + "learning_rate": 8.825676384083936e-06, + "loss": 1.9566, + "step": 743 + }, + { + "epoch": 3.0121457489878543, + "grad_norm": 2.1863226369459072, + "learning_rate": 8.82112291194635e-06, + "loss": 1.8211, + "step": 744 + }, + { + "epoch": 3.016194331983806, + "grad_norm": 2.194214751548881, + "learning_rate": 8.816561808276312e-06, + "loss": 1.9756, + "step": 745 + }, + { + "epoch": 3.020242914979757, + "grad_norm": 1.8746800584359844, + "learning_rate": 8.811993082183243e-06, + "loss": 2.2277, + "step": 746 + }, + { + "epoch": 3.0242914979757085, + "grad_norm": 2.0032700627210636, + "learning_rate": 8.807416742791784e-06, + "loss": 2.0822, + "step": 747 + }, + { + "epoch": 3.02834008097166, + "grad_norm": 1.6874624326476195, + "learning_rate": 8.80283279924178e-06, + "loss": 1.7544, + "step": 748 + }, + { + "epoch": 
3.032388663967611, + "grad_norm": 1.981414959416955, + "learning_rate": 8.798241260688273e-06, + "loss": 1.7612, + "step": 749 + }, + { + "epoch": 3.0364372469635628, + "grad_norm": 1.85228853236934, + "learning_rate": 8.793642136301462e-06, + "loss": 2.0061, + "step": 750 + }, + { + "epoch": 3.0404858299595143, + "grad_norm": 1.839202167316395, + "learning_rate": 8.7890354352667e-06, + "loss": 1.8078, + "step": 751 + }, + { + "epoch": 3.0445344129554655, + "grad_norm": 1.664692242856933, + "learning_rate": 8.784421166784476e-06, + "loss": 1.7918, + "step": 752 + }, + { + "epoch": 3.048582995951417, + "grad_norm": 1.8125016947634567, + "learning_rate": 8.779799340070388e-06, + "loss": 1.7574, + "step": 753 + }, + { + "epoch": 3.0526315789473686, + "grad_norm": 1.922401307664431, + "learning_rate": 8.775169964355134e-06, + "loss": 1.8982, + "step": 754 + }, + { + "epoch": 3.0566801619433197, + "grad_norm": 1.893673085388173, + "learning_rate": 8.770533048884483e-06, + "loss": 1.7375, + "step": 755 + }, + { + "epoch": 3.0607287449392713, + "grad_norm": 1.7578051605078406, + "learning_rate": 8.765888602919266e-06, + "loss": 1.9075, + "step": 756 + }, + { + "epoch": 3.064777327935223, + "grad_norm": 1.8959640677324443, + "learning_rate": 8.761236635735353e-06, + "loss": 1.8378, + "step": 757 + }, + { + "epoch": 3.068825910931174, + "grad_norm": 1.9801599495189568, + "learning_rate": 8.756577156623636e-06, + "loss": 1.9702, + "step": 758 + }, + { + "epoch": 3.0728744939271255, + "grad_norm": 1.790845579793568, + "learning_rate": 8.751910174890009e-06, + "loss": 1.8932, + "step": 759 + }, + { + "epoch": 3.076923076923077, + "grad_norm": 1.8236903737287826, + "learning_rate": 8.74723569985535e-06, + "loss": 1.8215, + "step": 760 + }, + { + "epoch": 3.080971659919028, + "grad_norm": 1.7121510890543619, + "learning_rate": 8.742553740855507e-06, + "loss": 1.8237, + "step": 761 + }, + { + "epoch": 3.08502024291498, + "grad_norm": 1.6455567766467654, + "learning_rate": 8.737864307241266e-06, + "loss": 1.825, + "step": 762 + }, + { + "epoch": 3.0890688259109313, + "grad_norm": 2.004800789953328, + "learning_rate": 8.733167408378348e-06, + "loss": 1.83, + "step": 763 + }, + { + "epoch": 3.0931174089068825, + "grad_norm": 1.761656112643498, + "learning_rate": 8.728463053647382e-06, + "loss": 1.9209, + "step": 764 + }, + { + "epoch": 3.097165991902834, + "grad_norm": 1.7248736206433866, + "learning_rate": 8.723751252443891e-06, + "loss": 1.6591, + "step": 765 + }, + { + "epoch": 3.1012145748987856, + "grad_norm": 1.8246435273625035, + "learning_rate": 8.71903201417826e-06, + "loss": 1.8214, + "step": 766 + }, + { + "epoch": 3.1052631578947367, + "grad_norm": 1.8468962560997435, + "learning_rate": 8.71430534827574e-06, + "loss": 1.854, + "step": 767 + }, + { + "epoch": 3.1093117408906883, + "grad_norm": 1.9312402322655278, + "learning_rate": 8.709571264176408e-06, + "loss": 1.7321, + "step": 768 + }, + { + "epoch": 3.11336032388664, + "grad_norm": 2.316632605973664, + "learning_rate": 8.70482977133516e-06, + "loss": 1.6709, + "step": 769 + }, + { + "epoch": 3.117408906882591, + "grad_norm": 1.9879535887114659, + "learning_rate": 8.700080879221689e-06, + "loss": 1.6082, + "step": 770 + }, + { + "epoch": 3.1214574898785425, + "grad_norm": 1.8223147298487212, + "learning_rate": 8.69532459732046e-06, + "loss": 1.6324, + "step": 771 + }, + { + "epoch": 3.125506072874494, + "grad_norm": 1.9254678274105181, + "learning_rate": 8.690560935130708e-06, + "loss": 1.626, + "step": 772 + }, + { + "epoch": 
3.1295546558704452, + "grad_norm": 2.1237007524174683, + "learning_rate": 8.685789902166395e-06, + "loss": 1.5525, + "step": 773 + }, + { + "epoch": 3.133603238866397, + "grad_norm": 1.7727476948432017, + "learning_rate": 8.681011507956215e-06, + "loss": 1.8873, + "step": 774 + }, + { + "epoch": 3.1376518218623484, + "grad_norm": 2.049295618159139, + "learning_rate": 8.676225762043555e-06, + "loss": 1.7496, + "step": 775 + }, + { + "epoch": 3.1417004048582995, + "grad_norm": 1.5682714669220028, + "learning_rate": 8.671432673986493e-06, + "loss": 1.4753, + "step": 776 + }, + { + "epoch": 3.145748987854251, + "grad_norm": 1.8938048440408406, + "learning_rate": 8.666632253357767e-06, + "loss": 1.8963, + "step": 777 + }, + { + "epoch": 3.1497975708502026, + "grad_norm": 1.8936062118104038, + "learning_rate": 8.661824509744754e-06, + "loss": 1.7098, + "step": 778 + }, + { + "epoch": 3.1538461538461537, + "grad_norm": 1.6774875162585348, + "learning_rate": 8.657009452749466e-06, + "loss": 1.8596, + "step": 779 + }, + { + "epoch": 3.1578947368421053, + "grad_norm": 2.015957389549595, + "learning_rate": 8.652187091988516e-06, + "loss": 2.061, + "step": 780 + }, + { + "epoch": 3.161943319838057, + "grad_norm": 1.7186637319125118, + "learning_rate": 8.647357437093104e-06, + "loss": 1.7589, + "step": 781 + }, + { + "epoch": 3.165991902834008, + "grad_norm": 1.7941883707597104, + "learning_rate": 8.642520497709001e-06, + "loss": 1.8086, + "step": 782 + }, + { + "epoch": 3.1700404858299596, + "grad_norm": 1.774631391234699, + "learning_rate": 8.637676283496521e-06, + "loss": 2.2517, + "step": 783 + }, + { + "epoch": 3.174089068825911, + "grad_norm": 1.7904179919335834, + "learning_rate": 8.632824804130514e-06, + "loss": 1.6679, + "step": 784 + }, + { + "epoch": 3.1781376518218623, + "grad_norm": 1.972746622761643, + "learning_rate": 8.627966069300332e-06, + "loss": 1.8345, + "step": 785 + }, + { + "epoch": 3.182186234817814, + "grad_norm": 1.5336336477310177, + "learning_rate": 8.623100088709829e-06, + "loss": 1.6473, + "step": 786 + }, + { + "epoch": 3.1862348178137654, + "grad_norm": 1.9951657707171577, + "learning_rate": 8.618226872077315e-06, + "loss": 1.7821, + "step": 787 + }, + { + "epoch": 3.1902834008097165, + "grad_norm": 1.7282375741642677, + "learning_rate": 8.613346429135567e-06, + "loss": 1.8289, + "step": 788 + }, + { + "epoch": 3.194331983805668, + "grad_norm": 2.1277631117336675, + "learning_rate": 8.608458769631785e-06, + "loss": 2.0076, + "step": 789 + }, + { + "epoch": 3.1983805668016196, + "grad_norm": 1.8372643674137712, + "learning_rate": 8.603563903327582e-06, + "loss": 2.0805, + "step": 790 + }, + { + "epoch": 3.2024291497975708, + "grad_norm": 1.8065321863693007, + "learning_rate": 8.598661839998972e-06, + "loss": 1.7669, + "step": 791 + }, + { + "epoch": 3.2064777327935223, + "grad_norm": 2.031336948957746, + "learning_rate": 8.593752589436334e-06, + "loss": 2.0858, + "step": 792 + }, + { + "epoch": 3.2105263157894735, + "grad_norm": 1.8889862063112353, + "learning_rate": 8.588836161444405e-06, + "loss": 2.1341, + "step": 793 + }, + { + "epoch": 3.214574898785425, + "grad_norm": 1.8426615628835388, + "learning_rate": 8.583912565842258e-06, + "loss": 1.9304, + "step": 794 + }, + { + "epoch": 3.2186234817813766, + "grad_norm": 1.7414893453963287, + "learning_rate": 8.578981812463278e-06, + "loss": 1.7942, + "step": 795 + }, + { + "epoch": 3.2226720647773277, + "grad_norm": 1.9096193735192637, + "learning_rate": 8.574043911155148e-06, + "loss": 1.72, + "step": 796 + }, + { + 
"epoch": 3.2267206477732793, + "grad_norm": 1.8025258377815987, + "learning_rate": 8.569098871779828e-06, + "loss": 1.8542, + "step": 797 + }, + { + "epoch": 3.230769230769231, + "grad_norm": 1.8460762696682704, + "learning_rate": 8.56414670421353e-06, + "loss": 1.7101, + "step": 798 + }, + { + "epoch": 3.234817813765182, + "grad_norm": 1.9398991434247146, + "learning_rate": 8.559187418346703e-06, + "loss": 1.95, + "step": 799 + }, + { + "epoch": 3.2388663967611335, + "grad_norm": 1.8632306612622278, + "learning_rate": 8.554221024084019e-06, + "loss": 1.8895, + "step": 800 + }, + { + "epoch": 3.242914979757085, + "grad_norm": 1.893700967064052, + "learning_rate": 8.54924753134434e-06, + "loss": 1.873, + "step": 801 + }, + { + "epoch": 3.246963562753036, + "grad_norm": 1.7151529599583697, + "learning_rate": 8.544266950060706e-06, + "loss": 1.7236, + "step": 802 + }, + { + "epoch": 3.251012145748988, + "grad_norm": 1.7251248112215953, + "learning_rate": 8.539279290180315e-06, + "loss": 1.7693, + "step": 803 + }, + { + "epoch": 3.2550607287449393, + "grad_norm": 1.9817743209184147, + "learning_rate": 8.534284561664508e-06, + "loss": 1.8365, + "step": 804 + }, + { + "epoch": 3.2591093117408905, + "grad_norm": 1.8362666024929137, + "learning_rate": 8.529282774488731e-06, + "loss": 1.6791, + "step": 805 + }, + { + "epoch": 3.263157894736842, + "grad_norm": 1.9144972025615734, + "learning_rate": 8.524273938642539e-06, + "loss": 1.5622, + "step": 806 + }, + { + "epoch": 3.2672064777327936, + "grad_norm": 1.8150113569889472, + "learning_rate": 8.519258064129559e-06, + "loss": 1.8107, + "step": 807 + }, + { + "epoch": 3.2712550607287447, + "grad_norm": 1.8132774105922835, + "learning_rate": 8.514235160967476e-06, + "loss": 1.8382, + "step": 808 + }, + { + "epoch": 3.2753036437246963, + "grad_norm": 1.7178012200999808, + "learning_rate": 8.509205239188017e-06, + "loss": 1.8519, + "step": 809 + }, + { + "epoch": 3.279352226720648, + "grad_norm": 2.2519702448886845, + "learning_rate": 8.504168308836918e-06, + "loss": 1.8559, + "step": 810 + }, + { + "epoch": 3.283400809716599, + "grad_norm": 2.1015013370666513, + "learning_rate": 8.499124379973922e-06, + "loss": 1.5602, + "step": 811 + }, + { + "epoch": 3.2874493927125505, + "grad_norm": 2.1456515647605365, + "learning_rate": 8.494073462672743e-06, + "loss": 1.6597, + "step": 812 + }, + { + "epoch": 3.291497975708502, + "grad_norm": 2.1425091129883613, + "learning_rate": 8.489015567021054e-06, + "loss": 1.5311, + "step": 813 + }, + { + "epoch": 3.2955465587044532, + "grad_norm": 2.1055979919937693, + "learning_rate": 8.483950703120466e-06, + "loss": 1.8547, + "step": 814 + }, + { + "epoch": 3.299595141700405, + "grad_norm": 1.9678625432719996, + "learning_rate": 8.478878881086505e-06, + "loss": 1.9357, + "step": 815 + }, + { + "epoch": 3.3036437246963564, + "grad_norm": 2.0317817207691538, + "learning_rate": 8.473800111048598e-06, + "loss": 1.6684, + "step": 816 + }, + { + "epoch": 3.3076923076923075, + "grad_norm": 2.0379814335298843, + "learning_rate": 8.468714403150043e-06, + "loss": 1.6929, + "step": 817 + }, + { + "epoch": 3.311740890688259, + "grad_norm": 1.9848650286398888, + "learning_rate": 8.463621767547998e-06, + "loss": 1.7112, + "step": 818 + }, + { + "epoch": 3.3157894736842106, + "grad_norm": 2.274800378339576, + "learning_rate": 8.458522214413455e-06, + "loss": 1.7005, + "step": 819 + }, + { + "epoch": 3.3198380566801617, + "grad_norm": 2.170751690325617, + "learning_rate": 8.453415753931223e-06, + "loss": 1.5995, + "step": 820 + }, + { 
+ "epoch": 3.3238866396761133, + "grad_norm": 1.9913626012571344, + "learning_rate": 8.448302396299906e-06, + "loss": 1.6057, + "step": 821 + }, + { + "epoch": 3.327935222672065, + "grad_norm": 1.9395230430651595, + "learning_rate": 8.443182151731883e-06, + "loss": 1.6349, + "step": 822 + }, + { + "epoch": 3.331983805668016, + "grad_norm": 1.9091197381555691, + "learning_rate": 8.438055030453287e-06, + "loss": 1.5595, + "step": 823 + }, + { + "epoch": 3.3360323886639676, + "grad_norm": 1.8562911407114664, + "learning_rate": 8.432921042703985e-06, + "loss": 1.6019, + "step": 824 + }, + { + "epoch": 3.340080971659919, + "grad_norm": 1.7832079833064884, + "learning_rate": 8.42778019873756e-06, + "loss": 1.552, + "step": 825 + }, + { + "epoch": 3.3441295546558703, + "grad_norm": 1.8542638409385725, + "learning_rate": 8.422632508821284e-06, + "loss": 1.5851, + "step": 826 + }, + { + "epoch": 3.348178137651822, + "grad_norm": 2.1436195397021436, + "learning_rate": 8.417477983236107e-06, + "loss": 1.7666, + "step": 827 + }, + { + "epoch": 3.3522267206477734, + "grad_norm": 2.33071372223659, + "learning_rate": 8.412316632276627e-06, + "loss": 1.6497, + "step": 828 + }, + { + "epoch": 3.3562753036437245, + "grad_norm": 2.205436986044382, + "learning_rate": 8.407148466251072e-06, + "loss": 1.3523, + "step": 829 + }, + { + "epoch": 3.360323886639676, + "grad_norm": 2.2620315487409877, + "learning_rate": 8.401973495481289e-06, + "loss": 1.723, + "step": 830 + }, + { + "epoch": 3.3643724696356276, + "grad_norm": 2.180101120238927, + "learning_rate": 8.396791730302708e-06, + "loss": 1.8056, + "step": 831 + }, + { + "epoch": 3.3684210526315788, + "grad_norm": 1.990085418505961, + "learning_rate": 8.39160318106433e-06, + "loss": 1.7166, + "step": 832 + }, + { + "epoch": 3.3724696356275303, + "grad_norm": 2.40657553356096, + "learning_rate": 8.386407858128707e-06, + "loss": 1.8193, + "step": 833 + }, + { + "epoch": 3.376518218623482, + "grad_norm": 1.94489059367538, + "learning_rate": 8.381205771871918e-06, + "loss": 1.4172, + "step": 834 + }, + { + "epoch": 3.380566801619433, + "grad_norm": 2.150391672244522, + "learning_rate": 8.375996932683553e-06, + "loss": 1.5949, + "step": 835 + }, + { + "epoch": 3.3846153846153846, + "grad_norm": 2.0030590669894903, + "learning_rate": 8.370781350966683e-06, + "loss": 1.4156, + "step": 836 + }, + { + "epoch": 3.388663967611336, + "grad_norm": 2.197019034882382, + "learning_rate": 8.36555903713785e-06, + "loss": 1.4714, + "step": 837 + }, + { + "epoch": 3.3927125506072873, + "grad_norm": 2.078166195454461, + "learning_rate": 8.360330001627043e-06, + "loss": 1.6429, + "step": 838 + }, + { + "epoch": 3.396761133603239, + "grad_norm": 2.40629641977567, + "learning_rate": 8.355094254877665e-06, + "loss": 1.4713, + "step": 839 + }, + { + "epoch": 3.4008097165991904, + "grad_norm": 1.9645801904393803, + "learning_rate": 8.349851807346535e-06, + "loss": 1.5146, + "step": 840 + }, + { + "epoch": 3.4048582995951415, + "grad_norm": 1.9534289124567972, + "learning_rate": 8.344602669503849e-06, + "loss": 1.5871, + "step": 841 + }, + { + "epoch": 3.408906882591093, + "grad_norm": 2.3102884897188534, + "learning_rate": 8.339346851833163e-06, + "loss": 1.6862, + "step": 842 + }, + { + "epoch": 3.4129554655870447, + "grad_norm": 2.0401234182707406, + "learning_rate": 8.334084364831381e-06, + "loss": 1.5214, + "step": 843 + }, + { + "epoch": 3.417004048582996, + "grad_norm": 2.159768925630674, + "learning_rate": 8.328815219008719e-06, + "loss": 1.8219, + "step": 844 + }, + { + 
"epoch": 3.4210526315789473, + "grad_norm": 2.2204972461802757, + "learning_rate": 8.323539424888695e-06, + "loss": 1.8941, + "step": 845 + }, + { + "epoch": 3.425101214574899, + "grad_norm": 1.9873340221710971, + "learning_rate": 8.318256993008108e-06, + "loss": 1.7539, + "step": 846 + }, + { + "epoch": 3.42914979757085, + "grad_norm": 1.975202455896719, + "learning_rate": 8.31296793391701e-06, + "loss": 1.8598, + "step": 847 + }, + { + "epoch": 3.4331983805668016, + "grad_norm": 1.8415081642607933, + "learning_rate": 8.30767225817869e-06, + "loss": 1.9574, + "step": 848 + }, + { + "epoch": 3.437246963562753, + "grad_norm": 2.047274050267817, + "learning_rate": 8.302369976369651e-06, + "loss": 1.736, + "step": 849 + }, + { + "epoch": 3.4412955465587043, + "grad_norm": 2.1457366433830454, + "learning_rate": 8.297061099079592e-06, + "loss": 1.6581, + "step": 850 + }, + { + "epoch": 3.445344129554656, + "grad_norm": 1.8891113266245207, + "learning_rate": 8.291745636911382e-06, + "loss": 1.9183, + "step": 851 + }, + { + "epoch": 3.4493927125506074, + "grad_norm": 2.05347009046486, + "learning_rate": 8.286423600481044e-06, + "loss": 1.6869, + "step": 852 + }, + { + "epoch": 3.4534412955465585, + "grad_norm": 2.1578470259791795, + "learning_rate": 8.281095000417725e-06, + "loss": 1.6709, + "step": 853 + }, + { + "epoch": 3.45748987854251, + "grad_norm": 2.2158190833608606, + "learning_rate": 8.27575984736369e-06, + "loss": 2.079, + "step": 854 + }, + { + "epoch": 3.4615384615384617, + "grad_norm": 2.9226191862145265, + "learning_rate": 8.270418151974286e-06, + "loss": 2.3146, + "step": 855 + }, + { + "epoch": 3.465587044534413, + "grad_norm": 2.1657050143675205, + "learning_rate": 8.265069924917925e-06, + "loss": 1.9175, + "step": 856 + }, + { + "epoch": 3.4696356275303644, + "grad_norm": 1.7932680376129573, + "learning_rate": 8.259715176876069e-06, + "loss": 1.8725, + "step": 857 + }, + { + "epoch": 3.473684210526316, + "grad_norm": 1.8709685644083165, + "learning_rate": 8.254353918543199e-06, + "loss": 1.7809, + "step": 858 + }, + { + "epoch": 3.477732793522267, + "grad_norm": 2.4167400582718694, + "learning_rate": 8.2489861606268e-06, + "loss": 1.8016, + "step": 859 + }, + { + "epoch": 3.4817813765182186, + "grad_norm": 1.659768741074137, + "learning_rate": 8.243611913847337e-06, + "loss": 1.7188, + "step": 860 + }, + { + "epoch": 3.48582995951417, + "grad_norm": 2.1480568234600668, + "learning_rate": 8.238231188938237e-06, + "loss": 1.6913, + "step": 861 + }, + { + "epoch": 3.4898785425101213, + "grad_norm": 2.461283879827119, + "learning_rate": 8.232843996645865e-06, + "loss": 1.6242, + "step": 862 + }, + { + "epoch": 3.493927125506073, + "grad_norm": 2.3643514071925056, + "learning_rate": 8.2274503477295e-06, + "loss": 1.6881, + "step": 863 + }, + { + "epoch": 3.4979757085020244, + "grad_norm": 3.087293785042021, + "learning_rate": 8.222050252961318e-06, + "loss": 1.5087, + "step": 864 + }, + { + "epoch": 3.5020242914979756, + "grad_norm": 2.105684160210004, + "learning_rate": 8.216643723126367e-06, + "loss": 1.4331, + "step": 865 + }, + { + "epoch": 3.506072874493927, + "grad_norm": 2.420952436641065, + "learning_rate": 8.211230769022552e-06, + "loss": 1.7553, + "step": 866 + }, + { + "epoch": 3.5101214574898787, + "grad_norm": 2.2746665377354116, + "learning_rate": 8.2058114014606e-06, + "loss": 1.782, + "step": 867 + }, + { + "epoch": 3.51417004048583, + "grad_norm": 1.6776374980476494, + "learning_rate": 8.200385631264051e-06, + "loss": 1.7357, + "step": 868 + }, + { + "epoch": 
3.5182186234817814, + "grad_norm": 2.130957958265717, + "learning_rate": 8.19495346926924e-06, + "loss": 1.7569, + "step": 869 + }, + { + "epoch": 3.522267206477733, + "grad_norm": 2.1241420175580386, + "learning_rate": 8.189514926325255e-06, + "loss": 1.7036, + "step": 870 + }, + { + "epoch": 3.526315789473684, + "grad_norm": 2.397883462392177, + "learning_rate": 8.184070013293936e-06, + "loss": 1.4984, + "step": 871 + }, + { + "epoch": 3.5303643724696356, + "grad_norm": 2.676554915114245, + "learning_rate": 8.178618741049841e-06, + "loss": 1.5719, + "step": 872 + }, + { + "epoch": 3.534412955465587, + "grad_norm": 2.641036334787177, + "learning_rate": 8.173161120480232e-06, + "loss": 1.7235, + "step": 873 + }, + { + "epoch": 3.5384615384615383, + "grad_norm": 2.4283908813712127, + "learning_rate": 8.16769716248505e-06, + "loss": 1.8976, + "step": 874 + }, + { + "epoch": 3.54251012145749, + "grad_norm": 1.9109389793413394, + "learning_rate": 8.162226877976886e-06, + "loss": 1.797, + "step": 875 + }, + { + "epoch": 3.5465587044534415, + "grad_norm": 3.1765952449893073, + "learning_rate": 8.156750277880979e-06, + "loss": 2.2212, + "step": 876 + }, + { + "epoch": 3.5506072874493926, + "grad_norm": 6.740978753214387, + "learning_rate": 8.15126737313517e-06, + "loss": 2.2759, + "step": 877 + }, + { + "epoch": 3.554655870445344, + "grad_norm": 6.646199027432937, + "learning_rate": 8.145778174689897e-06, + "loss": 2.5045, + "step": 878 + }, + { + "epoch": 3.5587044534412957, + "grad_norm": 1.9732928727215509, + "learning_rate": 8.140282693508168e-06, + "loss": 1.702, + "step": 879 + }, + { + "epoch": 3.562753036437247, + "grad_norm": 1.923113895215325, + "learning_rate": 8.134780940565535e-06, + "loss": 1.5859, + "step": 880 + }, + { + "epoch": 3.5668016194331984, + "grad_norm": 1.888490124882663, + "learning_rate": 8.129272926850079e-06, + "loss": 1.9019, + "step": 881 + }, + { + "epoch": 3.57085020242915, + "grad_norm": 2.0879599313529247, + "learning_rate": 8.123758663362386e-06, + "loss": 1.5424, + "step": 882 + }, + { + "epoch": 3.574898785425101, + "grad_norm": 2.1113301524020778, + "learning_rate": 8.118238161115523e-06, + "loss": 1.8581, + "step": 883 + }, + { + "epoch": 3.5789473684210527, + "grad_norm": 1.7105898329062328, + "learning_rate": 8.112711431135014e-06, + "loss": 1.5914, + "step": 884 + }, + { + "epoch": 3.582995951417004, + "grad_norm": 1.9358089378047225, + "learning_rate": 8.107178484458825e-06, + "loss": 1.7957, + "step": 885 + }, + { + "epoch": 3.5870445344129553, + "grad_norm": 1.9092777164097747, + "learning_rate": 8.101639332137337e-06, + "loss": 1.7404, + "step": 886 + }, + { + "epoch": 3.591093117408907, + "grad_norm": 2.098080272876577, + "learning_rate": 8.096093985233323e-06, + "loss": 1.7127, + "step": 887 + }, + { + "epoch": 3.5951417004048585, + "grad_norm": 2.4907144738421065, + "learning_rate": 8.090542454821929e-06, + "loss": 1.4308, + "step": 888 + }, + { + "epoch": 3.5991902834008096, + "grad_norm": 1.8678109793168913, + "learning_rate": 8.084984751990652e-06, + "loss": 1.4797, + "step": 889 + }, + { + "epoch": 3.603238866396761, + "grad_norm": 1.8961480105884363, + "learning_rate": 8.079420887839316e-06, + "loss": 1.6173, + "step": 890 + }, + { + "epoch": 3.6072874493927127, + "grad_norm": 1.9539785870788862, + "learning_rate": 8.073850873480047e-06, + "loss": 1.4952, + "step": 891 + }, + { + "epoch": 3.611336032388664, + "grad_norm": 2.31450202449626, + "learning_rate": 8.068274720037261e-06, + "loss": 1.813, + "step": 892 + }, + { + "epoch": 
3.6153846153846154, + "grad_norm": 1.8087093273790038, + "learning_rate": 8.062692438647628e-06, + "loss": 1.7376, + "step": 893 + }, + { + "epoch": 3.619433198380567, + "grad_norm": 2.408589476299181, + "learning_rate": 8.057104040460062e-06, + "loss": 1.505, + "step": 894 + }, + { + "epoch": 3.623481781376518, + "grad_norm": 2.3231639351842035, + "learning_rate": 8.051509536635686e-06, + "loss": 1.9039, + "step": 895 + }, + { + "epoch": 3.6275303643724697, + "grad_norm": 1.9849491847712974, + "learning_rate": 8.045908938347828e-06, + "loss": 1.7125, + "step": 896 + }, + { + "epoch": 3.6315789473684212, + "grad_norm": 2.2249483352664026, + "learning_rate": 8.04030225678198e-06, + "loss": 1.9514, + "step": 897 + }, + { + "epoch": 3.6356275303643724, + "grad_norm": 2.005047614111562, + "learning_rate": 8.034689503135785e-06, + "loss": 1.597, + "step": 898 + }, + { + "epoch": 3.639676113360324, + "grad_norm": 2.2925145752574854, + "learning_rate": 8.029070688619013e-06, + "loss": 1.8072, + "step": 899 + }, + { + "epoch": 3.6437246963562755, + "grad_norm": 1.9475842419850795, + "learning_rate": 8.023445824453539e-06, + "loss": 1.7289, + "step": 900 + }, + { + "epoch": 3.6477732793522266, + "grad_norm": 2.071154449190338, + "learning_rate": 8.017814921873326e-06, + "loss": 1.7658, + "step": 901 + }, + { + "epoch": 3.651821862348178, + "grad_norm": 1.9935193669759015, + "learning_rate": 8.012177992124385e-06, + "loss": 1.6002, + "step": 902 + }, + { + "epoch": 3.6558704453441297, + "grad_norm": 2.2483209235168737, + "learning_rate": 8.006535046464774e-06, + "loss": 1.8275, + "step": 903 + }, + { + "epoch": 3.659919028340081, + "grad_norm": 2.5274264683222425, + "learning_rate": 8.000886096164564e-06, + "loss": 1.6502, + "step": 904 + }, + { + "epoch": 3.6639676113360324, + "grad_norm": 2.0119741262052195, + "learning_rate": 7.995231152505815e-06, + "loss": 1.8017, + "step": 905 + }, + { + "epoch": 3.668016194331984, + "grad_norm": 2.1027093845450233, + "learning_rate": 7.989570226782562e-06, + "loss": 1.8138, + "step": 906 + }, + { + "epoch": 3.672064777327935, + "grad_norm": 3.056649771146675, + "learning_rate": 7.983903330300782e-06, + "loss": 1.8128, + "step": 907 + }, + { + "epoch": 3.6761133603238867, + "grad_norm": 1.9139807090551522, + "learning_rate": 7.978230474378383e-06, + "loss": 1.7148, + "step": 908 + }, + { + "epoch": 3.6801619433198383, + "grad_norm": 2.416490627923619, + "learning_rate": 7.97255167034517e-06, + "loss": 1.7726, + "step": 909 + }, + { + "epoch": 3.6842105263157894, + "grad_norm": 2.053612332583323, + "learning_rate": 7.966866929542827e-06, + "loss": 1.5779, + "step": 910 + }, + { + "epoch": 3.688259109311741, + "grad_norm": 2.0666037215601505, + "learning_rate": 7.961176263324902e-06, + "loss": 1.7465, + "step": 911 + }, + { + "epoch": 3.6923076923076925, + "grad_norm": 2.1463137742100327, + "learning_rate": 7.955479683056767e-06, + "loss": 1.7608, + "step": 912 + }, + { + "epoch": 3.6963562753036436, + "grad_norm": 1.9232481327470194, + "learning_rate": 7.949777200115617e-06, + "loss": 1.5992, + "step": 913 + }, + { + "epoch": 3.700404858299595, + "grad_norm": 2.5029604743639515, + "learning_rate": 7.944068825890424e-06, + "loss": 2.089, + "step": 914 + }, + { + "epoch": 3.7044534412955468, + "grad_norm": 2.425403056999352, + "learning_rate": 7.938354571781933e-06, + "loss": 1.8514, + "step": 915 + }, + { + "epoch": 3.708502024291498, + "grad_norm": 2.2889869162476315, + "learning_rate": 7.932634449202635e-06, + "loss": 1.4493, + "step": 916 + }, + { + "epoch": 
3.7125506072874495, + "grad_norm": 2.0245599708625988, + "learning_rate": 7.92690846957673e-06, + "loss": 1.6351, + "step": 917 + }, + { + "epoch": 3.716599190283401, + "grad_norm": 1.997997696536965, + "learning_rate": 7.921176644340132e-06, + "loss": 1.7253, + "step": 918 + }, + { + "epoch": 3.720647773279352, + "grad_norm": 2.344635708570945, + "learning_rate": 7.915438984940415e-06, + "loss": 1.5384, + "step": 919 + }, + { + "epoch": 3.7246963562753037, + "grad_norm": 2.399788568220564, + "learning_rate": 7.909695502836814e-06, + "loss": 1.6518, + "step": 920 + }, + { + "epoch": 3.7287449392712553, + "grad_norm": 2.258204100694036, + "learning_rate": 7.903946209500189e-06, + "loss": 1.8741, + "step": 921 + }, + { + "epoch": 3.7327935222672064, + "grad_norm": 1.9355255173187593, + "learning_rate": 7.898191116413007e-06, + "loss": 1.6996, + "step": 922 + }, + { + "epoch": 3.736842105263158, + "grad_norm": 2.1474241115417425, + "learning_rate": 7.892430235069317e-06, + "loss": 1.7427, + "step": 923 + }, + { + "epoch": 3.7408906882591095, + "grad_norm": 3.071687208613463, + "learning_rate": 7.886663576974733e-06, + "loss": 2.4106, + "step": 924 + }, + { + "epoch": 3.7449392712550607, + "grad_norm": 2.0799708188253465, + "learning_rate": 7.880891153646401e-06, + "loss": 1.808, + "step": 925 + }, + { + "epoch": 3.748987854251012, + "grad_norm": 2.4353787137639453, + "learning_rate": 7.875112976612984e-06, + "loss": 1.6368, + "step": 926 + }, + { + "epoch": 3.753036437246964, + "grad_norm": 2.159792334487355, + "learning_rate": 7.869329057414635e-06, + "loss": 1.5175, + "step": 927 + }, + { + "epoch": 3.757085020242915, + "grad_norm": 2.0548605804443274, + "learning_rate": 7.863539407602976e-06, + "loss": 1.7423, + "step": 928 + }, + { + "epoch": 3.7611336032388665, + "grad_norm": 3.9628857560933324, + "learning_rate": 7.857744038741076e-06, + "loss": 2.5332, + "step": 929 + }, + { + "epoch": 3.765182186234818, + "grad_norm": 4.514218437938051, + "learning_rate": 7.85194296240342e-06, + "loss": 2.3287, + "step": 930 + }, + { + "epoch": 3.769230769230769, + "grad_norm": 5.356074790215057, + "learning_rate": 7.846136190175901e-06, + "loss": 2.1714, + "step": 931 + }, + { + "epoch": 3.7732793522267207, + "grad_norm": 2.238703863406207, + "learning_rate": 7.84032373365578e-06, + "loss": 1.671, + "step": 932 + }, + { + "epoch": 3.7773279352226723, + "grad_norm": 2.194562792441507, + "learning_rate": 7.834505604451672e-06, + "loss": 1.9108, + "step": 933 + }, + { + "epoch": 3.7813765182186234, + "grad_norm": 2.085928113902739, + "learning_rate": 7.828681814183527e-06, + "loss": 1.9396, + "step": 934 + }, + { + "epoch": 3.785425101214575, + "grad_norm": 2.215253557008417, + "learning_rate": 7.822852374482597e-06, + "loss": 1.7587, + "step": 935 + }, + { + "epoch": 3.7894736842105265, + "grad_norm": 3.010107826761077, + "learning_rate": 7.817017296991411e-06, + "loss": 1.6507, + "step": 936 + }, + { + "epoch": 3.7935222672064777, + "grad_norm": 2.25886892537205, + "learning_rate": 7.811176593363771e-06, + "loss": 1.7372, + "step": 937 + }, + { + "epoch": 3.7975708502024292, + "grad_norm": 2.2130344020805297, + "learning_rate": 7.805330275264707e-06, + "loss": 1.7485, + "step": 938 + }, + { + "epoch": 3.801619433198381, + "grad_norm": 2.0367189537336907, + "learning_rate": 7.79947835437046e-06, + "loss": 1.5515, + "step": 939 + }, + { + "epoch": 3.805668016194332, + "grad_norm": 2.070856690389127, + "learning_rate": 7.79362084236847e-06, + "loss": 1.4447, + "step": 940 + }, + { + "epoch": 
3.8097165991902835, + "grad_norm": 2.1857926637124794, + "learning_rate": 7.787757750957335e-06, + "loss": 1.8015, + "step": 941 + }, + { + "epoch": 3.813765182186235, + "grad_norm": 2.6872149719652305, + "learning_rate": 7.781889091846799e-06, + "loss": 1.7528, + "step": 942 + }, + { + "epoch": 3.817813765182186, + "grad_norm": 2.3048135110635264, + "learning_rate": 7.776014876757727e-06, + "loss": 1.5226, + "step": 943 + }, + { + "epoch": 3.8218623481781377, + "grad_norm": 8.991127581731243, + "learning_rate": 7.77013511742208e-06, + "loss": 2.3966, + "step": 944 + }, + { + "epoch": 3.8259109311740893, + "grad_norm": 19.276037930316928, + "learning_rate": 7.76424982558289e-06, + "loss": 3.7738, + "step": 945 + }, + { + "epoch": 3.8299595141700404, + "grad_norm": 2.4583074183525677, + "learning_rate": 7.758359012994242e-06, + "loss": 1.6137, + "step": 946 + }, + { + "epoch": 3.834008097165992, + "grad_norm": 2.405931055156567, + "learning_rate": 7.752462691421245e-06, + "loss": 1.4666, + "step": 947 + }, + { + "epoch": 3.8380566801619436, + "grad_norm": 2.114379083785604, + "learning_rate": 7.746560872640007e-06, + "loss": 1.5791, + "step": 948 + }, + { + "epoch": 3.8421052631578947, + "grad_norm": 2.1946059502111845, + "learning_rate": 7.740653568437623e-06, + "loss": 1.5937, + "step": 949 + }, + { + "epoch": 3.8461538461538463, + "grad_norm": 2.3168344745949456, + "learning_rate": 7.734740790612137e-06, + "loss": 1.5169, + "step": 950 + }, + { + "epoch": 3.850202429149798, + "grad_norm": 2.3139829718351197, + "learning_rate": 7.728822550972523e-06, + "loss": 1.6162, + "step": 951 + }, + { + "epoch": 3.854251012145749, + "grad_norm": 2.5483408296020764, + "learning_rate": 7.722898861338674e-06, + "loss": 1.7001, + "step": 952 + }, + { + "epoch": 3.8582995951417005, + "grad_norm": 1.917540396918308, + "learning_rate": 7.716969733541357e-06, + "loss": 1.6257, + "step": 953 + }, + { + "epoch": 3.862348178137652, + "grad_norm": 2.4091479518780177, + "learning_rate": 7.711035179422205e-06, + "loss": 1.6058, + "step": 954 + }, + { + "epoch": 3.866396761133603, + "grad_norm": 2.4390857592479183, + "learning_rate": 7.705095210833687e-06, + "loss": 1.6468, + "step": 955 + }, + { + "epoch": 3.8704453441295548, + "grad_norm": 3.01025731676863, + "learning_rate": 7.699149839639086e-06, + "loss": 2.1392, + "step": 956 + }, + { + "epoch": 3.8744939271255063, + "grad_norm": 2.6957364897623473, + "learning_rate": 7.693199077712476e-06, + "loss": 2.0741, + "step": 957 + }, + { + "epoch": 3.8785425101214575, + "grad_norm": 2.6726767004932395, + "learning_rate": 7.687242936938694e-06, + "loss": 1.8205, + "step": 958 + }, + { + "epoch": 3.882591093117409, + "grad_norm": 2.3223231672079727, + "learning_rate": 7.681281429213328e-06, + "loss": 1.7239, + "step": 959 + }, + { + "epoch": 3.8866396761133606, + "grad_norm": 2.4223424195591505, + "learning_rate": 7.675314566442673e-06, + "loss": 1.2702, + "step": 960 + }, + { + "epoch": 3.8906882591093117, + "grad_norm": 2.1111739790928024, + "learning_rate": 7.669342360543727e-06, + "loss": 1.7654, + "step": 961 + }, + { + "epoch": 3.8947368421052633, + "grad_norm": 2.0865715931939968, + "learning_rate": 7.663364823444157e-06, + "loss": 1.8567, + "step": 962 + }, + { + "epoch": 3.898785425101215, + "grad_norm": 1.9521945713254736, + "learning_rate": 7.65738196708228e-06, + "loss": 1.5513, + "step": 963 + }, + { + "epoch": 3.902834008097166, + "grad_norm": 2.252893420029499, + "learning_rate": 7.651393803407032e-06, + "loss": 1.6101, + "step": 964 + }, + { + 
"epoch": 3.9068825910931175, + "grad_norm": 2.445627287506017, + "learning_rate": 7.645400344377953e-06, + "loss": 1.7802, + "step": 965 + }, + { + "epoch": 3.910931174089069, + "grad_norm": 2.206311718559999, + "learning_rate": 7.639401601965158e-06, + "loss": 1.3433, + "step": 966 + }, + { + "epoch": 3.91497975708502, + "grad_norm": 2.5126306064577935, + "learning_rate": 7.63339758814931e-06, + "loss": 1.4571, + "step": 967 + }, + { + "epoch": 3.919028340080972, + "grad_norm": 2.301201962037062, + "learning_rate": 7.627388314921602e-06, + "loss": 1.4798, + "step": 968 + }, + { + "epoch": 3.9230769230769234, + "grad_norm": 2.0505587515987265, + "learning_rate": 7.621373794283735e-06, + "loss": 1.7924, + "step": 969 + }, + { + "epoch": 3.9271255060728745, + "grad_norm": 2.716118255543476, + "learning_rate": 7.615354038247889e-06, + "loss": 1.6337, + "step": 970 + }, + { + "epoch": 3.931174089068826, + "grad_norm": 2.636209282969381, + "learning_rate": 7.609329058836694e-06, + "loss": 1.6699, + "step": 971 + }, + { + "epoch": 3.9352226720647776, + "grad_norm": 2.3802398786409107, + "learning_rate": 7.6032988680832195e-06, + "loss": 1.4692, + "step": 972 + }, + { + "epoch": 3.9392712550607287, + "grad_norm": 2.5735078826994844, + "learning_rate": 7.597263478030939e-06, + "loss": 1.3909, + "step": 973 + }, + { + "epoch": 3.9433198380566803, + "grad_norm": 2.986329351018389, + "learning_rate": 7.59122290073371e-06, + "loss": 1.6787, + "step": 974 + }, + { + "epoch": 3.9473684210526314, + "grad_norm": 2.4407323865905015, + "learning_rate": 7.5851771482557535e-06, + "loss": 1.4349, + "step": 975 + }, + { + "epoch": 3.951417004048583, + "grad_norm": 2.8570555841909657, + "learning_rate": 7.579126232671621e-06, + "loss": 1.4016, + "step": 976 + }, + { + "epoch": 3.9554655870445345, + "grad_norm": 3.322338952206099, + "learning_rate": 7.5730701660661795e-06, + "loss": 1.6104, + "step": 977 + }, + { + "epoch": 3.9595141700404857, + "grad_norm": 2.5182830088343082, + "learning_rate": 7.567008960534585e-06, + "loss": 1.6231, + "step": 978 + }, + { + "epoch": 3.9635627530364372, + "grad_norm": 2.1739951186703923, + "learning_rate": 7.560942628182251e-06, + "loss": 1.6679, + "step": 979 + }, + { + "epoch": 3.967611336032389, + "grad_norm": 2.5756124639646982, + "learning_rate": 7.554871181124836e-06, + "loss": 1.8633, + "step": 980 + }, + { + "epoch": 3.97165991902834, + "grad_norm": 3.073388081199716, + "learning_rate": 7.548794631488211e-06, + "loss": 1.768, + "step": 981 + }, + { + "epoch": 3.9757085020242915, + "grad_norm": 2.1012291254049797, + "learning_rate": 7.5427129914084385e-06, + "loss": 1.6442, + "step": 982 + }, + { + "epoch": 3.979757085020243, + "grad_norm": 2.351295674425286, + "learning_rate": 7.536626273031747e-06, + "loss": 1.7358, + "step": 983 + }, + { + "epoch": 3.983805668016194, + "grad_norm": 2.115853749649768, + "learning_rate": 7.530534488514507e-06, + "loss": 1.8024, + "step": 984 + }, + { + "epoch": 3.9878542510121457, + "grad_norm": 2.454948116388734, + "learning_rate": 7.524437650023211e-06, + "loss": 1.6063, + "step": 985 + }, + { + "epoch": 3.9919028340080973, + "grad_norm": 2.043008387794743, + "learning_rate": 7.5183357697344395e-06, + "loss": 1.5544, + "step": 986 + }, + { + "epoch": 3.9959514170040484, + "grad_norm": 1.8968397388893163, + "learning_rate": 7.512228859834845e-06, + "loss": 1.8733, + "step": 987 + }, + { + "epoch": 4.0, + "grad_norm": 2.2142162316932255, + "learning_rate": 7.506116932521127e-06, + "loss": 1.6136, + "step": 988 + }, + { + "epoch": 
4.004048582995951, + "grad_norm": 2.080064737878757, + "learning_rate": 7.500000000000001e-06, + "loss": 1.6735, + "step": 989 + }, + { + "epoch": 4.008097165991903, + "grad_norm": 2.8195577020771863, + "learning_rate": 7.493878074488184e-06, + "loss": 1.8144, + "step": 990 + }, + { + "epoch": 4.012145748987854, + "grad_norm": 2.861434123319288, + "learning_rate": 7.4877511682123635e-06, + "loss": 1.6734, + "step": 991 + }, + { + "epoch": 4.016194331983805, + "grad_norm": 3.0695960191225247, + "learning_rate": 7.481619293409173e-06, + "loss": 1.8495, + "step": 992 + }, + { + "epoch": 4.020242914979757, + "grad_norm": 2.580474309033628, + "learning_rate": 7.475482462325169e-06, + "loss": 2.099, + "step": 993 + }, + { + "epoch": 4.0242914979757085, + "grad_norm": 2.721243409721488, + "learning_rate": 7.469340687216809e-06, + "loss": 1.9446, + "step": 994 + }, + { + "epoch": 4.02834008097166, + "grad_norm": 2.3410049191202074, + "learning_rate": 7.4631939803504215e-06, + "loss": 1.6196, + "step": 995 + }, + { + "epoch": 4.032388663967612, + "grad_norm": 2.720885518023577, + "learning_rate": 7.4570423540021905e-06, + "loss": 1.6221, + "step": 996 + }, + { + "epoch": 4.036437246963563, + "grad_norm": 2.5413861683291996, + "learning_rate": 7.450885820458117e-06, + "loss": 1.8749, + "step": 997 + }, + { + "epoch": 4.040485829959514, + "grad_norm": 2.5863690862096957, + "learning_rate": 7.44472439201401e-06, + "loss": 1.6649, + "step": 998 + }, + { + "epoch": 4.044534412955466, + "grad_norm": 2.371552718771952, + "learning_rate": 7.438558080975449e-06, + "loss": 1.6799, + "step": 999 + }, + { + "epoch": 4.048582995951417, + "grad_norm": 2.5691951258164063, + "learning_rate": 7.4323868996577696e-06, + "loss": 1.63, + "step": 1000 + }, + { + "epoch": 4.052631578947368, + "grad_norm": 2.675468998968646, + "learning_rate": 7.426210860386032e-06, + "loss": 1.7354, + "step": 1001 + }, + { + "epoch": 4.05668016194332, + "grad_norm": 2.58607973493479, + "learning_rate": 7.420029975494996e-06, + "loss": 1.5703, + "step": 1002 + }, + { + "epoch": 4.060728744939271, + "grad_norm": 2.475852723612659, + "learning_rate": 7.413844257329104e-06, + "loss": 1.749, + "step": 1003 + }, + { + "epoch": 4.064777327935222, + "grad_norm": 2.625704853477589, + "learning_rate": 7.407653718242449e-06, + "loss": 1.6948, + "step": 1004 + }, + { + "epoch": 4.068825910931174, + "grad_norm": 2.7272435081151283, + "learning_rate": 7.401458370598753e-06, + "loss": 1.8281, + "step": 1005 + }, + { + "epoch": 4.0728744939271255, + "grad_norm": 2.507953052399452, + "learning_rate": 7.395258226771341e-06, + "loss": 1.7673, + "step": 1006 + }, + { + "epoch": 4.076923076923077, + "grad_norm": 2.5085283118904074, + "learning_rate": 7.3890532991431174e-06, + "loss": 1.6958, + "step": 1007 + }, + { + "epoch": 4.080971659919029, + "grad_norm": 2.388953051348741, + "learning_rate": 7.382843600106539e-06, + "loss": 1.7112, + "step": 1008 + }, + { + "epoch": 4.08502024291498, + "grad_norm": 2.2236808085380644, + "learning_rate": 7.376629142063597e-06, + "loss": 1.7162, + "step": 1009 + }, + { + "epoch": 4.089068825910931, + "grad_norm": 2.7412048035286505, + "learning_rate": 7.370409937425781e-06, + "loss": 1.7045, + "step": 1010 + }, + { + "epoch": 4.093117408906883, + "grad_norm": 2.3839251838504367, + "learning_rate": 7.364185998614064e-06, + "loss": 1.7854, + "step": 1011 + }, + { + "epoch": 4.097165991902834, + "grad_norm": 2.383572557144146, + "learning_rate": 7.357957338058873e-06, + "loss": 1.534, + "step": 1012 + }, + { + "epoch": 
4.101214574898785, + "grad_norm": 2.7483936941368996, + "learning_rate": 7.3517239682000675e-06, + "loss": 1.7001, + "step": 1013 + }, + { + "epoch": 4.105263157894737, + "grad_norm": 2.6910416116843257, + "learning_rate": 7.345485901486908e-06, + "loss": 1.7037, + "step": 1014 + }, + { + "epoch": 4.109311740890688, + "grad_norm": 2.677750230508956, + "learning_rate": 7.33924315037804e-06, + "loss": 1.6197, + "step": 1015 + }, + { + "epoch": 4.113360323886639, + "grad_norm": 3.1184294482443717, + "learning_rate": 7.332995727341462e-06, + "loss": 1.5587, + "step": 1016 + }, + { + "epoch": 4.117408906882591, + "grad_norm": 2.697817221643411, + "learning_rate": 7.326743644854504e-06, + "loss": 1.4804, + "step": 1017 + }, + { + "epoch": 4.1214574898785425, + "grad_norm": 2.5533427892436364, + "learning_rate": 7.3204869154038015e-06, + "loss": 1.5149, + "step": 1018 + }, + { + "epoch": 4.125506072874494, + "grad_norm": 2.7058477331519604, + "learning_rate": 7.314225551485273e-06, + "loss": 1.5156, + "step": 1019 + }, + { + "epoch": 4.129554655870446, + "grad_norm": 2.8633359493766384, + "learning_rate": 7.30795956560409e-06, + "loss": 1.4187, + "step": 1020 + }, + { + "epoch": 4.133603238866397, + "grad_norm": 2.346585899707522, + "learning_rate": 7.301688970274655e-06, + "loss": 1.7718, + "step": 1021 + }, + { + "epoch": 4.137651821862348, + "grad_norm": 2.8346595314782568, + "learning_rate": 7.295413778020579e-06, + "loss": 1.6181, + "step": 1022 + }, + { + "epoch": 4.1417004048583, + "grad_norm": 2.1328033209542046, + "learning_rate": 7.289134001374654e-06, + "loss": 1.3513, + "step": 1023 + }, + { + "epoch": 4.145748987854251, + "grad_norm": 2.723527413205223, + "learning_rate": 7.282849652878824e-06, + "loss": 1.7449, + "step": 1024 + }, + { + "epoch": 4.149797570850202, + "grad_norm": 2.6296530406635648, + "learning_rate": 7.276560745084167e-06, + "loss": 1.56, + "step": 1025 + }, + { + "epoch": 4.153846153846154, + "grad_norm": 2.3607444563571645, + "learning_rate": 7.2702672905508656e-06, + "loss": 1.7373, + "step": 1026 + }, + { + "epoch": 4.157894736842105, + "grad_norm": 2.857459652562985, + "learning_rate": 7.263969301848188e-06, + "loss": 1.8929, + "step": 1027 + }, + { + "epoch": 4.161943319838056, + "grad_norm": 2.416479591453608, + "learning_rate": 7.257666791554448e-06, + "loss": 1.6155, + "step": 1028 + }, + { + "epoch": 4.165991902834008, + "grad_norm": 2.485932817739182, + "learning_rate": 7.251359772256998e-06, + "loss": 1.6856, + "step": 1029 + }, + { + "epoch": 4.17004048582996, + "grad_norm": 2.2601305066652664, + "learning_rate": 7.245048256552195e-06, + "loss": 2.1658, + "step": 1030 + }, + { + "epoch": 4.174089068825911, + "grad_norm": 2.4736185296097566, + "learning_rate": 7.2387322570453724e-06, + "loss": 1.5329, + "step": 1031 + }, + { + "epoch": 4.178137651821863, + "grad_norm": 2.902522379367228, + "learning_rate": 7.232411786350824e-06, + "loss": 1.7115, + "step": 1032 + }, + { + "epoch": 4.182186234817814, + "grad_norm": 2.1213589715944594, + "learning_rate": 7.226086857091765e-06, + "loss": 1.5227, + "step": 1033 + }, + { + "epoch": 4.186234817813765, + "grad_norm": 2.8619121355527968, + "learning_rate": 7.219757481900325e-06, + "loss": 1.6826, + "step": 1034 + }, + { + "epoch": 4.190283400809717, + "grad_norm": 2.5322052891357867, + "learning_rate": 7.213423673417508e-06, + "loss": 1.7019, + "step": 1035 + }, + { + "epoch": 4.194331983805668, + "grad_norm": 2.868097930235534, + "learning_rate": 7.207085444293172e-06, + "loss": 1.8899, + "step": 1036 + }, + { 
+ "epoch": 4.198380566801619, + "grad_norm": 2.5521158066560288, + "learning_rate": 7.2007428071860045e-06, + "loss": 1.9495, + "step": 1037 + }, + { + "epoch": 4.202429149797571, + "grad_norm": 2.63283746068705, + "learning_rate": 7.194395774763496e-06, + "loss": 1.6451, + "step": 1038 + }, + { + "epoch": 4.206477732793522, + "grad_norm": 3.020988257996165, + "learning_rate": 7.188044359701917e-06, + "loss": 1.9686, + "step": 1039 + }, + { + "epoch": 4.2105263157894735, + "grad_norm": 2.7497468285413267, + "learning_rate": 7.181688574686292e-06, + "loss": 2.0078, + "step": 1040 + }, + { + "epoch": 4.2145748987854255, + "grad_norm": 2.4897799224246873, + "learning_rate": 7.175328432410367e-06, + "loss": 1.7921, + "step": 1041 + }, + { + "epoch": 4.218623481781377, + "grad_norm": 2.470322521256254, + "learning_rate": 7.168963945576597e-06, + "loss": 1.6719, + "step": 1042 + }, + { + "epoch": 4.222672064777328, + "grad_norm": 2.6592137837660266, + "learning_rate": 7.162595126896111e-06, + "loss": 1.5749, + "step": 1043 + }, + { + "epoch": 4.22672064777328, + "grad_norm": 2.533296478811204, + "learning_rate": 7.15622198908869e-06, + "loss": 1.7352, + "step": 1044 + }, + { + "epoch": 4.230769230769231, + "grad_norm": 2.5992050846283354, + "learning_rate": 7.149844544882742e-06, + "loss": 1.5639, + "step": 1045 + }, + { + "epoch": 4.234817813765182, + "grad_norm": 2.7675121593200367, + "learning_rate": 7.143462807015271e-06, + "loss": 1.8108, + "step": 1046 + }, + { + "epoch": 4.238866396761134, + "grad_norm": 2.658793190465704, + "learning_rate": 7.137076788231865e-06, + "loss": 1.7457, + "step": 1047 + }, + { + "epoch": 4.242914979757085, + "grad_norm": 2.604959217646965, + "learning_rate": 7.130686501286655e-06, + "loss": 1.7451, + "step": 1048 + }, + { + "epoch": 4.246963562753036, + "grad_norm": 2.5111072223063897, + "learning_rate": 7.1242919589422974e-06, + "loss": 1.5808, + "step": 1049 + }, + { + "epoch": 4.251012145748988, + "grad_norm": 2.4705422975939775, + "learning_rate": 7.11789317396995e-06, + "loss": 1.6597, + "step": 1050 + }, + { + "epoch": 4.255060728744939, + "grad_norm": 2.8012872307046726, + "learning_rate": 7.1114901591492404e-06, + "loss": 1.6728, + "step": 1051 + }, + { + "epoch": 4.2591093117408905, + "grad_norm": 2.376781495157912, + "learning_rate": 7.105082927268247e-06, + "loss": 1.561, + "step": 1052 + }, + { + "epoch": 4.2631578947368425, + "grad_norm": 2.5702431118604423, + "learning_rate": 7.0986714911234715e-06, + "loss": 1.4172, + "step": 1053 + }, + { + "epoch": 4.267206477732794, + "grad_norm": 2.508325280537679, + "learning_rate": 7.092255863519806e-06, + "loss": 1.6779, + "step": 1054 + }, + { + "epoch": 4.271255060728745, + "grad_norm": 2.540012700506, + "learning_rate": 7.085836057270521e-06, + "loss": 1.6985, + "step": 1055 + }, + { + "epoch": 4.275303643724697, + "grad_norm": 2.471796434580062, + "learning_rate": 7.079412085197229e-06, + "loss": 1.7301, + "step": 1056 + }, + { + "epoch": 4.279352226720648, + "grad_norm": 3.3244889584848107, + "learning_rate": 7.072983960129862e-06, + "loss": 1.7094, + "step": 1057 + }, + { + "epoch": 4.283400809716599, + "grad_norm": 2.983349503659567, + "learning_rate": 7.066551694906651e-06, + "loss": 1.3989, + "step": 1058 + }, + { + "epoch": 4.287449392712551, + "grad_norm": 3.036520426590972, + "learning_rate": 7.060115302374087e-06, + "loss": 1.5257, + "step": 1059 + }, + { + "epoch": 4.291497975708502, + "grad_norm": 3.2696461082092068, + "learning_rate": 7.053674795386914e-06, + "loss": 1.3769, + "step": 1060 
+ }, + { + "epoch": 4.295546558704453, + "grad_norm": 3.066097380387373, + "learning_rate": 7.047230186808085e-06, + "loss": 1.6842, + "step": 1061 + }, + { + "epoch": 4.299595141700405, + "grad_norm": 2.6903089198270855, + "learning_rate": 7.04078148950875e-06, + "loss": 1.8088, + "step": 1062 + }, + { + "epoch": 4.303643724696356, + "grad_norm": 2.8258995708159773, + "learning_rate": 7.034328716368224e-06, + "loss": 1.5156, + "step": 1063 + }, + { + "epoch": 4.3076923076923075, + "grad_norm": 2.858420747113862, + "learning_rate": 7.027871880273959e-06, + "loss": 1.5394, + "step": 1064 + }, + { + "epoch": 4.3117408906882595, + "grad_norm": 2.7740108493498323, + "learning_rate": 7.021410994121525e-06, + "loss": 1.549, + "step": 1065 + }, + { + "epoch": 4.315789473684211, + "grad_norm": 3.219790325593576, + "learning_rate": 7.014946070814583e-06, + "loss": 1.5296, + "step": 1066 + }, + { + "epoch": 4.319838056680162, + "grad_norm": 3.0526696821998094, + "learning_rate": 7.008477123264849e-06, + "loss": 1.4361, + "step": 1067 + }, + { + "epoch": 4.323886639676114, + "grad_norm": 2.9571662763160136, + "learning_rate": 7.0020041643920826e-06, + "loss": 1.4498, + "step": 1068 + }, + { + "epoch": 4.327935222672065, + "grad_norm": 2.819893094328226, + "learning_rate": 6.995527207124053e-06, + "loss": 1.4853, + "step": 1069 + }, + { + "epoch": 4.331983805668016, + "grad_norm": 2.7252255526223625, + "learning_rate": 6.989046264396516e-06, + "loss": 1.4535, + "step": 1070 + }, + { + "epoch": 4.336032388663968, + "grad_norm": 2.6189552228263753, + "learning_rate": 6.982561349153188e-06, + "loss": 1.5022, + "step": 1071 + }, + { + "epoch": 4.340080971659919, + "grad_norm": 2.568082005220546, + "learning_rate": 6.976072474345713e-06, + "loss": 1.4532, + "step": 1072 + }, + { + "epoch": 4.34412955465587, + "grad_norm": 2.623502257576312, + "learning_rate": 6.96957965293365e-06, + "loss": 1.4399, + "step": 1073 + }, + { + "epoch": 4.348178137651822, + "grad_norm": 3.1483597392827045, + "learning_rate": 6.963082897884439e-06, + "loss": 1.615, + "step": 1074 + }, + { + "epoch": 4.352226720647773, + "grad_norm": 3.8022601065423123, + "learning_rate": 6.956582222173374e-06, + "loss": 1.5412, + "step": 1075 + }, + { + "epoch": 4.3562753036437245, + "grad_norm": 3.177062030751366, + "learning_rate": 6.9500776387835785e-06, + "loss": 1.2047, + "step": 1076 + }, + { + "epoch": 4.3603238866396765, + "grad_norm": 3.185748452470112, + "learning_rate": 6.943569160705985e-06, + "loss": 1.6101, + "step": 1077 + }, + { + "epoch": 4.364372469635628, + "grad_norm": 2.9943825828047954, + "learning_rate": 6.9370568009393e-06, + "loss": 1.6897, + "step": 1078 + }, + { + "epoch": 4.368421052631579, + "grad_norm": 2.8396585705303297, + "learning_rate": 6.9305405724899876e-06, + "loss": 1.6066, + "step": 1079 + }, + { + "epoch": 4.372469635627531, + "grad_norm": 3.4103100269352504, + "learning_rate": 6.924020488372229e-06, + "loss": 1.6845, + "step": 1080 + }, + { + "epoch": 4.376518218623482, + "grad_norm": 2.8184107943036323, + "learning_rate": 6.917496561607915e-06, + "loss": 1.3205, + "step": 1081 + }, + { + "epoch": 4.380566801619433, + "grad_norm": 3.152451887221124, + "learning_rate": 6.91096880522661e-06, + "loss": 1.4827, + "step": 1082 + }, + { + "epoch": 4.384615384615385, + "grad_norm": 2.8506198416780317, + "learning_rate": 6.904437232265521e-06, + "loss": 1.2814, + "step": 1083 + }, + { + "epoch": 4.388663967611336, + "grad_norm": 3.2465586785242033, + "learning_rate": 6.897901855769483e-06, + "loss": 1.3431, + 
"step": 1084 + }, + { + "epoch": 4.392712550607287, + "grad_norm": 3.077940405612511, + "learning_rate": 6.891362688790925e-06, + "loss": 1.5208, + "step": 1085 + }, + { + "epoch": 4.396761133603239, + "grad_norm": 3.4135560109047005, + "learning_rate": 6.884819744389848e-06, + "loss": 1.3629, + "step": 1086 + }, + { + "epoch": 4.40080971659919, + "grad_norm": 2.6507174805524727, + "learning_rate": 6.878273035633795e-06, + "loss": 1.3853, + "step": 1087 + }, + { + "epoch": 4.4048582995951415, + "grad_norm": 2.5895703393651637, + "learning_rate": 6.871722575597829e-06, + "loss": 1.4423, + "step": 1088 + }, + { + "epoch": 4.4089068825910935, + "grad_norm": 3.2322118670425777, + "learning_rate": 6.865168377364506e-06, + "loss": 1.5468, + "step": 1089 + }, + { + "epoch": 4.412955465587045, + "grad_norm": 2.942042054251793, + "learning_rate": 6.858610454023842e-06, + "loss": 1.36, + "step": 1090 + }, + { + "epoch": 4.417004048582996, + "grad_norm": 3.122031784641475, + "learning_rate": 6.8520488186733e-06, + "loss": 1.6917, + "step": 1091 + }, + { + "epoch": 4.421052631578947, + "grad_norm": 3.2313772685904847, + "learning_rate": 6.845483484417756e-06, + "loss": 1.7526, + "step": 1092 + }, + { + "epoch": 4.425101214574899, + "grad_norm": 2.8735793462178023, + "learning_rate": 6.838914464369467e-06, + "loss": 1.6487, + "step": 1093 + }, + { + "epoch": 4.42914979757085, + "grad_norm": 2.954566180150772, + "learning_rate": 6.832341771648057e-06, + "loss": 1.7096, + "step": 1094 + }, + { + "epoch": 4.433198380566802, + "grad_norm": 2.587188799407319, + "learning_rate": 6.825765419380484e-06, + "loss": 1.8456, + "step": 1095 + }, + { + "epoch": 4.437246963562753, + "grad_norm": 3.0518891038101925, + "learning_rate": 6.819185420701011e-06, + "loss": 1.6224, + "step": 1096 + }, + { + "epoch": 4.441295546558704, + "grad_norm": 3.118348281802091, + "learning_rate": 6.812601788751192e-06, + "loss": 1.5498, + "step": 1097 + }, + { + "epoch": 4.445344129554655, + "grad_norm": 2.894711350660116, + "learning_rate": 6.806014536679828e-06, + "loss": 1.8041, + "step": 1098 + }, + { + "epoch": 4.449392712550607, + "grad_norm": 3.062471930595446, + "learning_rate": 6.7994236776429555e-06, + "loss": 1.5815, + "step": 1099 + }, + { + "epoch": 4.4534412955465585, + "grad_norm": 3.0993288240233263, + "learning_rate": 6.792829224803816e-06, + "loss": 1.5695, + "step": 1100 + }, + { + "epoch": 4.4574898785425106, + "grad_norm": 3.149585012325393, + "learning_rate": 6.7862311913328235e-06, + "loss": 1.9487, + "step": 1101 + }, + { + "epoch": 4.461538461538462, + "grad_norm": 4.120477147155456, + "learning_rate": 6.779629590407547e-06, + "loss": 2.1517, + "step": 1102 + }, + { + "epoch": 4.465587044534413, + "grad_norm": 3.1988261301020855, + "learning_rate": 6.773024435212678e-06, + "loss": 1.79, + "step": 1103 + }, + { + "epoch": 4.469635627530364, + "grad_norm": 2.6369221757485457, + "learning_rate": 6.7664157389400095e-06, + "loss": 1.7651, + "step": 1104 + }, + { + "epoch": 4.473684210526316, + "grad_norm": 2.7091701203884364, + "learning_rate": 6.7598035147884055e-06, + "loss": 1.6839, + "step": 1105 + }, + { + "epoch": 4.477732793522267, + "grad_norm": 3.4306069422759005, + "learning_rate": 6.753187775963773e-06, + "loss": 1.692, + "step": 1106 + }, + { + "epoch": 4.481781376518219, + "grad_norm": 2.386072562379964, + "learning_rate": 6.746568535679041e-06, + "loss": 1.6155, + "step": 1107 + }, + { + "epoch": 4.48582995951417, + "grad_norm": 2.851423578739297, + "learning_rate": 6.739945807154136e-06, + "loss": 
1.5755, + "step": 1108 + }, + { + "epoch": 4.489878542510121, + "grad_norm": 3.3510139502859206, + "learning_rate": 6.733319603615941e-06, + "loss": 1.5105, + "step": 1109 + }, + { + "epoch": 4.493927125506072, + "grad_norm": 3.329100996808692, + "learning_rate": 6.726689938298289e-06, + "loss": 1.568, + "step": 1110 + }, + { + "epoch": 4.497975708502024, + "grad_norm": 2.7974205212393057, + "learning_rate": 6.72005682444192e-06, + "loss": 1.4162, + "step": 1111 + }, + { + "epoch": 4.502024291497976, + "grad_norm": 2.9991024909175676, + "learning_rate": 6.713420275294467e-06, + "loss": 1.2872, + "step": 1112 + }, + { + "epoch": 4.506072874493928, + "grad_norm": 3.341853790054196, + "learning_rate": 6.70678030411042e-06, + "loss": 1.6404, + "step": 1113 + }, + { + "epoch": 4.510121457489879, + "grad_norm": 3.2032309023708687, + "learning_rate": 6.700136924151104e-06, + "loss": 1.6321, + "step": 1114 + }, + { + "epoch": 4.51417004048583, + "grad_norm": 2.446695841899921, + "learning_rate": 6.693490148684654e-06, + "loss": 1.5906, + "step": 1115 + }, + { + "epoch": 4.518218623481781, + "grad_norm": 3.030284559367058, + "learning_rate": 6.686839990985984e-06, + "loss": 1.6148, + "step": 1116 + }, + { + "epoch": 4.522267206477733, + "grad_norm": 3.0612075992794665, + "learning_rate": 6.680186464336767e-06, + "loss": 1.5678, + "step": 1117 + }, + { + "epoch": 4.526315789473684, + "grad_norm": 3.4922710550140685, + "learning_rate": 6.673529582025398e-06, + "loss": 1.3788, + "step": 1118 + }, + { + "epoch": 4.530364372469636, + "grad_norm": 3.4134796811660166, + "learning_rate": 6.666869357346979e-06, + "loss": 1.4428, + "step": 1119 + }, + { + "epoch": 4.534412955465587, + "grad_norm": 3.6649442008937383, + "learning_rate": 6.660205803603286e-06, + "loss": 1.5671, + "step": 1120 + }, + { + "epoch": 4.538461538461538, + "grad_norm": 3.108830354735827, + "learning_rate": 6.653538934102743e-06, + "loss": 1.7903, + "step": 1121 + }, + { + "epoch": 4.5425101214574894, + "grad_norm": 2.719205109719932, + "learning_rate": 6.646868762160399e-06, + "loss": 1.6907, + "step": 1122 + }, + { + "epoch": 4.5465587044534415, + "grad_norm": 15.861026319110369, + "learning_rate": 6.640195301097896e-06, + "loss": 2.0735, + "step": 1123 + }, + { + "epoch": 4.550607287449393, + "grad_norm": 7.357015627613091, + "learning_rate": 6.633518564243442e-06, + "loss": 2.1046, + "step": 1124 + }, + { + "epoch": 4.554655870445345, + "grad_norm": 6.67996402988713, + "learning_rate": 6.626838564931797e-06, + "loss": 2.3423, + "step": 1125 + }, + { + "epoch": 4.558704453441296, + "grad_norm": 2.790707731153053, + "learning_rate": 6.620155316504225e-06, + "loss": 1.5771, + "step": 1126 + }, + { + "epoch": 4.562753036437247, + "grad_norm": 2.6424764643365544, + "learning_rate": 6.6134688323084884e-06, + "loss": 1.4544, + "step": 1127 + }, + { + "epoch": 4.566801619433198, + "grad_norm": 4.460650672408528, + "learning_rate": 6.606779125698808e-06, + "loss": 1.7848, + "step": 1128 + }, + { + "epoch": 4.57085020242915, + "grad_norm": 2.81766092171609, + "learning_rate": 6.600086210035841e-06, + "loss": 1.4465, + "step": 1129 + }, + { + "epoch": 4.574898785425101, + "grad_norm": 2.7934258737790794, + "learning_rate": 6.593390098686653e-06, + "loss": 1.7079, + "step": 1130 + }, + { + "epoch": 4.578947368421053, + "grad_norm": 2.357159807197533, + "learning_rate": 6.586690805024692e-06, + "loss": 1.4715, + "step": 1131 + }, + { + "epoch": 4.582995951417004, + "grad_norm": 2.8201575354409876, + "learning_rate": 6.579988342429764e-06, + 
"loss": 1.6256, + "step": 1132 + }, + { + "epoch": 4.587044534412955, + "grad_norm": 2.748728982741463, + "learning_rate": 6.573282724288001e-06, + "loss": 1.6067, + "step": 1133 + }, + { + "epoch": 4.5910931174089065, + "grad_norm": 3.0721591492986526, + "learning_rate": 6.566573963991839e-06, + "loss": 1.5832, + "step": 1134 + }, + { + "epoch": 4.5951417004048585, + "grad_norm": 2.8487748202828924, + "learning_rate": 6.559862074939989e-06, + "loss": 1.3233, + "step": 1135 + }, + { + "epoch": 4.59919028340081, + "grad_norm": 2.590591556134, + "learning_rate": 6.553147070537413e-06, + "loss": 1.3674, + "step": 1136 + }, + { + "epoch": 4.603238866396762, + "grad_norm": 2.6607589757127186, + "learning_rate": 6.546428964195289e-06, + "loss": 1.4813, + "step": 1137 + }, + { + "epoch": 4.607287449392713, + "grad_norm": 2.936419659787077, + "learning_rate": 6.539707769330995e-06, + "loss": 1.3335, + "step": 1138 + }, + { + "epoch": 4.611336032388664, + "grad_norm": 5.647454932081391, + "learning_rate": 6.532983499368078e-06, + "loss": 1.631, + "step": 1139 + }, + { + "epoch": 4.615384615384615, + "grad_norm": 2.672027285236729, + "learning_rate": 6.526256167736224e-06, + "loss": 1.6247, + "step": 1140 + }, + { + "epoch": 4.619433198380567, + "grad_norm": 3.585540725187652, + "learning_rate": 6.519525787871235e-06, + "loss": 1.365, + "step": 1141 + }, + { + "epoch": 4.623481781376518, + "grad_norm": 3.509608711468321, + "learning_rate": 6.512792373215e-06, + "loss": 1.7573, + "step": 1142 + }, + { + "epoch": 4.62753036437247, + "grad_norm": 2.971185622782078, + "learning_rate": 6.506055937215471e-06, + "loss": 1.561, + "step": 1143 + }, + { + "epoch": 4.631578947368421, + "grad_norm": 3.2949915313334035, + "learning_rate": 6.499316493326631e-06, + "loss": 1.836, + "step": 1144 + }, + { + "epoch": 4.635627530364372, + "grad_norm": 2.861710933431733, + "learning_rate": 6.492574055008474e-06, + "loss": 1.4458, + "step": 1145 + }, + { + "epoch": 4.6396761133603235, + "grad_norm": 3.3593193695088828, + "learning_rate": 6.4858286357269716e-06, + "loss": 1.6806, + "step": 1146 + }, + { + "epoch": 4.6437246963562755, + "grad_norm": 2.7995829454110317, + "learning_rate": 6.4790802489540495e-06, + "loss": 1.5849, + "step": 1147 + }, + { + "epoch": 4.647773279352227, + "grad_norm": 2.9650473845995617, + "learning_rate": 6.472328908167562e-06, + "loss": 1.6598, + "step": 1148 + }, + { + "epoch": 4.651821862348179, + "grad_norm": 2.7905940219323475, + "learning_rate": 6.465574626851262e-06, + "loss": 1.4666, + "step": 1149 + }, + { + "epoch": 4.65587044534413, + "grad_norm": 3.2553490418837323, + "learning_rate": 6.4588174184947725e-06, + "loss": 1.6918, + "step": 1150 + }, + { + "epoch": 4.659919028340081, + "grad_norm": 3.55927475882226, + "learning_rate": 6.452057296593568e-06, + "loss": 1.5207, + "step": 1151 + }, + { + "epoch": 4.663967611336032, + "grad_norm": 2.9162925097777954, + "learning_rate": 6.445294274648937e-06, + "loss": 1.6745, + "step": 1152 + }, + { + "epoch": 4.668016194331984, + "grad_norm": 2.987151078867793, + "learning_rate": 6.4385283661679624e-06, + "loss": 1.6752, + "step": 1153 + }, + { + "epoch": 4.672064777327935, + "grad_norm": 3.186333717498487, + "learning_rate": 6.431759584663492e-06, + "loss": 1.753, + "step": 1154 + }, + { + "epoch": 4.676113360323887, + "grad_norm": 9.509020769435434, + "learning_rate": 6.424987943654109e-06, + "loss": 1.6195, + "step": 1155 + }, + { + "epoch": 4.680161943319838, + "grad_norm": 3.356709601234609, + "learning_rate": 6.418213456664111e-06, + 
"loss": 1.6311, + "step": 1156 + }, + { + "epoch": 4.684210526315789, + "grad_norm": 2.921816366789115, + "learning_rate": 6.411436137223479e-06, + "loss": 1.4584, + "step": 1157 + }, + { + "epoch": 4.6882591093117405, + "grad_norm": 2.8660981524508338, + "learning_rate": 6.4046559988678485e-06, + "loss": 1.6084, + "step": 1158 + }, + { + "epoch": 4.6923076923076925, + "grad_norm": 3.0730207415431954, + "learning_rate": 6.397873055138487e-06, + "loss": 1.6274, + "step": 1159 + }, + { + "epoch": 4.696356275303644, + "grad_norm": 2.766004464269283, + "learning_rate": 6.391087319582264e-06, + "loss": 1.4697, + "step": 1160 + }, + { + "epoch": 4.700404858299595, + "grad_norm": 3.6099089118584136, + "learning_rate": 6.384298805751626e-06, + "loss": 1.9489, + "step": 1161 + }, + { + "epoch": 4.704453441295547, + "grad_norm": 3.442626114825173, + "learning_rate": 6.37750752720457e-06, + "loss": 1.727, + "step": 1162 + }, + { + "epoch": 4.708502024291498, + "grad_norm": 3.341066779383342, + "learning_rate": 6.370713497504607e-06, + "loss": 1.3178, + "step": 1163 + }, + { + "epoch": 4.712550607287449, + "grad_norm": 2.8791145178147386, + "learning_rate": 6.363916730220752e-06, + "loss": 1.4908, + "step": 1164 + }, + { + "epoch": 4.716599190283401, + "grad_norm": 2.8558993301680076, + "learning_rate": 6.357117238927481e-06, + "loss": 1.588, + "step": 1165 + }, + { + "epoch": 4.720647773279352, + "grad_norm": 3.403507251743757, + "learning_rate": 6.350315037204714e-06, + "loss": 1.3794, + "step": 1166 + }, + { + "epoch": 4.724696356275303, + "grad_norm": 3.28937405397847, + "learning_rate": 6.343510138637783e-06, + "loss": 1.535, + "step": 1167 + }, + { + "epoch": 4.728744939271255, + "grad_norm": 3.182353899970667, + "learning_rate": 6.336702556817405e-06, + "loss": 1.7416, + "step": 1168 + }, + { + "epoch": 4.732793522267206, + "grad_norm": 2.8393068837004285, + "learning_rate": 6.329892305339659e-06, + "loss": 1.521, + "step": 1169 + }, + { + "epoch": 4.7368421052631575, + "grad_norm": 3.0526645906441585, + "learning_rate": 6.323079397805951e-06, + "loss": 1.6001, + "step": 1170 + }, + { + "epoch": 4.7408906882591095, + "grad_norm": 3.453365846818349, + "learning_rate": 6.3162638478229965e-06, + "loss": 2.244, + "step": 1171 + }, + { + "epoch": 4.744939271255061, + "grad_norm": 2.930549437132931, + "learning_rate": 6.309445669002787e-06, + "loss": 1.6859, + "step": 1172 + }, + { + "epoch": 4.748987854251012, + "grad_norm": 3.513459131175886, + "learning_rate": 6.302624874962563e-06, + "loss": 1.5138, + "step": 1173 + }, + { + "epoch": 4.753036437246964, + "grad_norm": 3.101847130962305, + "learning_rate": 6.295801479324788e-06, + "loss": 1.4048, + "step": 1174 + }, + { + "epoch": 4.757085020242915, + "grad_norm": 2.9351108422638625, + "learning_rate": 6.288975495717124e-06, + "loss": 1.5932, + "step": 1175 + }, + { + "epoch": 4.761133603238866, + "grad_norm": 4.674100976432621, + "learning_rate": 6.282146937772399e-06, + "loss": 2.3515, + "step": 1176 + }, + { + "epoch": 4.765182186234818, + "grad_norm": 5.182394350357637, + "learning_rate": 6.2753158191285844e-06, + "loss": 2.1322, + "step": 1177 + }, + { + "epoch": 4.769230769230769, + "grad_norm": 6.057045402676707, + "learning_rate": 6.268482153428763e-06, + "loss": 2.0072, + "step": 1178 + }, + { + "epoch": 4.77327935222672, + "grad_norm": 3.1068830892655726, + "learning_rate": 6.261645954321109e-06, + "loss": 1.5127, + "step": 1179 + }, + { + "epoch": 4.777327935222672, + "grad_norm": 3.0244265678427213, + "learning_rate": 
6.254807235458853e-06, + "loss": 1.7728, + "step": 1180 + }, + { + "epoch": 4.781376518218623, + "grad_norm": 2.949903538067424, + "learning_rate": 6.247966010500258e-06, + "loss": 1.78, + "step": 1181 + }, + { + "epoch": 4.7854251012145745, + "grad_norm": 3.1823383170218946, + "learning_rate": 6.241122293108594e-06, + "loss": 1.6101, + "step": 1182 + }, + { + "epoch": 4.7894736842105265, + "grad_norm": 3.0390422214285975, + "learning_rate": 6.2342760969521085e-06, + "loss": 1.5326, + "step": 1183 + }, + { + "epoch": 4.793522267206478, + "grad_norm": 3.136764973756456, + "learning_rate": 6.227427435703997e-06, + "loss": 1.5671, + "step": 1184 + }, + { + "epoch": 4.797570850202429, + "grad_norm": 3.358208559803108, + "learning_rate": 6.220576323042381e-06, + "loss": 1.5746, + "step": 1185 + }, + { + "epoch": 4.801619433198381, + "grad_norm": 2.8750507177466305, + "learning_rate": 6.213722772650277e-06, + "loss": 1.4246, + "step": 1186 + }, + { + "epoch": 4.805668016194332, + "grad_norm": 3.028809163189934, + "learning_rate": 6.206866798215571e-06, + "loss": 1.317, + "step": 1187 + }, + { + "epoch": 4.809716599190283, + "grad_norm": 3.126804073645922, + "learning_rate": 6.2000084134309905e-06, + "loss": 1.6821, + "step": 1188 + }, + { + "epoch": 4.813765182186235, + "grad_norm": 3.71033178556479, + "learning_rate": 6.193147631994073e-06, + "loss": 1.5786, + "step": 1189 + }, + { + "epoch": 4.817813765182186, + "grad_norm": 3.2129146658285346, + "learning_rate": 6.186284467607149e-06, + "loss": 1.3971, + "step": 1190 + }, + { + "epoch": 4.821862348178137, + "grad_norm": 10.210146232119035, + "learning_rate": 6.179418933977301e-06, + "loss": 2.3347, + "step": 1191 + }, + { + "epoch": 4.825910931174089, + "grad_norm": 21.275577852601224, + "learning_rate": 6.1725510448163516e-06, + "loss": 3.6222, + "step": 1192 + }, + { + "epoch": 4.82995951417004, + "grad_norm": 3.4666551476237584, + "learning_rate": 6.165680813840822e-06, + "loss": 1.4645, + "step": 1193 + }, + { + "epoch": 4.834008097165992, + "grad_norm": 3.4458166986644443, + "learning_rate": 6.1588082547719095e-06, + "loss": 1.3391, + "step": 1194 + }, + { + "epoch": 4.838056680161944, + "grad_norm": 2.919273388343095, + "learning_rate": 6.151933381335468e-06, + "loss": 1.4313, + "step": 1195 + }, + { + "epoch": 4.842105263157895, + "grad_norm": 3.0732467672720736, + "learning_rate": 6.1450562072619635e-06, + "loss": 1.4611, + "step": 1196 + }, + { + "epoch": 4.846153846153846, + "grad_norm": 3.2736024865252493, + "learning_rate": 6.138176746286468e-06, + "loss": 1.3333, + "step": 1197 + }, + { + "epoch": 4.850202429149798, + "grad_norm": 3.3437325068102486, + "learning_rate": 6.131295012148613e-06, + "loss": 1.4833, + "step": 1198 + }, + { + "epoch": 4.854251012145749, + "grad_norm": 3.6058736308138766, + "learning_rate": 6.124411018592568e-06, + "loss": 1.5521, + "step": 1199 + }, + { + "epoch": 4.8582995951417, + "grad_norm": 2.6980859324752267, + "learning_rate": 6.117524779367027e-06, + "loss": 1.4743, + "step": 1200 + }, + { + "epoch": 4.862348178137652, + "grad_norm": 3.4307422256171947, + "learning_rate": 6.110636308225157e-06, + "loss": 1.4612, + "step": 1201 + }, + { + "epoch": 4.866396761133603, + "grad_norm": 3.4665359414620625, + "learning_rate": 6.103745618924587e-06, + "loss": 1.4922, + "step": 1202 + }, + { + "epoch": 4.870445344129554, + "grad_norm": 4.034402333282032, + "learning_rate": 6.096852725227378e-06, + "loss": 1.9715, + "step": 1203 + }, + { + "epoch": 4.874493927125506, + "grad_norm": 3.6881022424154097, + 
"learning_rate": 6.089957640899988e-06, + "loss": 1.9107, + "step": 1204 + }, + { + "epoch": 4.8785425101214575, + "grad_norm": 3.862338875685726, + "learning_rate": 6.0830603797132574e-06, + "loss": 1.661, + "step": 1205 + }, + { + "epoch": 4.882591093117409, + "grad_norm": 3.384483266395071, + "learning_rate": 6.076160955442369e-06, + "loss": 1.5689, + "step": 1206 + }, + { + "epoch": 4.886639676113361, + "grad_norm": 3.345513039253192, + "learning_rate": 6.069259381866827e-06, + "loss": 1.1468, + "step": 1207 + }, + { + "epoch": 4.890688259109312, + "grad_norm": 2.8964038452697847, + "learning_rate": 6.0623556727704306e-06, + "loss": 1.6516, + "step": 1208 + }, + { + "epoch": 4.894736842105263, + "grad_norm": 2.9136386786268895, + "learning_rate": 6.055449841941238e-06, + "loss": 1.7215, + "step": 1209 + }, + { + "epoch": 4.898785425101215, + "grad_norm": 2.7655346557671248, + "learning_rate": 6.048541903171552e-06, + "loss": 1.4413, + "step": 1210 + }, + { + "epoch": 4.902834008097166, + "grad_norm": 3.2433937012234715, + "learning_rate": 6.041631870257882e-06, + "loss": 1.4725, + "step": 1211 + }, + { + "epoch": 4.906882591093117, + "grad_norm": 3.4688660789200325, + "learning_rate": 6.034719757000918e-06, + "loss": 1.6069, + "step": 1212 + }, + { + "epoch": 4.910931174089069, + "grad_norm": 3.106070985660449, + "learning_rate": 6.0278055772055075e-06, + "loss": 1.2312, + "step": 1213 + }, + { + "epoch": 4.91497975708502, + "grad_norm": 3.4926777350408664, + "learning_rate": 6.020889344680627e-06, + "loss": 1.3252, + "step": 1214 + }, + { + "epoch": 4.919028340080971, + "grad_norm": 3.31474250904695, + "learning_rate": 6.013971073239346e-06, + "loss": 1.3404, + "step": 1215 + }, + { + "epoch": 4.923076923076923, + "grad_norm": 2.7200582966885953, + "learning_rate": 6.007050776698816e-06, + "loss": 1.6668, + "step": 1216 + }, + { + "epoch": 4.9271255060728745, + "grad_norm": 4.194613418220712, + "learning_rate": 6.000128468880223e-06, + "loss": 1.5178, + "step": 1217 + }, + { + "epoch": 4.931174089068826, + "grad_norm": 3.6956716885492047, + "learning_rate": 5.993204163608776e-06, + "loss": 1.5313, + "step": 1218 + }, + { + "epoch": 4.935222672064778, + "grad_norm": 3.42386071095716, + "learning_rate": 5.986277874713672e-06, + "loss": 1.315, + "step": 1219 + }, + { + "epoch": 4.939271255060729, + "grad_norm": 3.4411238008448497, + "learning_rate": 5.979349616028067e-06, + "loss": 1.2599, + "step": 1220 + }, + { + "epoch": 4.94331983805668, + "grad_norm": 4.136849869910849, + "learning_rate": 5.972419401389058e-06, + "loss": 1.5671, + "step": 1221 + }, + { + "epoch": 4.947368421052632, + "grad_norm": 3.3509710910402344, + "learning_rate": 5.96548724463764e-06, + "loss": 1.3098, + "step": 1222 + }, + { + "epoch": 4.951417004048583, + "grad_norm": 3.826301738234217, + "learning_rate": 5.958553159618693e-06, + "loss": 1.2627, + "step": 1223 + }, + { + "epoch": 4.955465587044534, + "grad_norm": 4.211383102056784, + "learning_rate": 5.951617160180944e-06, + "loss": 1.4866, + "step": 1224 + }, + { + "epoch": 4.959514170040486, + "grad_norm": 3.9784296755787043, + "learning_rate": 5.944679260176947e-06, + "loss": 1.5416, + "step": 1225 + }, + { + "epoch": 4.963562753036437, + "grad_norm": 3.121952186318371, + "learning_rate": 5.937739473463047e-06, + "loss": 1.5505, + "step": 1226 + }, + { + "epoch": 4.967611336032388, + "grad_norm": 3.717226187124744, + "learning_rate": 5.930797813899364e-06, + "loss": 1.6869, + "step": 1227 + }, + { + "epoch": 4.97165991902834, + "grad_norm": 
4.139266573612088, + "learning_rate": 5.923854295349751e-06, + "loss": 1.5989, + "step": 1228 + }, + { + "epoch": 4.9757085020242915, + "grad_norm": 2.8954471867608937, + "learning_rate": 5.916908931681781e-06, + "loss": 1.5245, + "step": 1229 + }, + { + "epoch": 4.979757085020243, + "grad_norm": 3.153595083245072, + "learning_rate": 5.9099617367667065e-06, + "loss": 1.6063, + "step": 1230 + }, + { + "epoch": 4.983805668016195, + "grad_norm": 2.8400997626861173, + "learning_rate": 5.9030127244794385e-06, + "loss": 1.6715, + "step": 1231 + }, + { + "epoch": 4.987854251012146, + "grad_norm": 3.2491090209153874, + "learning_rate": 5.896061908698521e-06, + "loss": 1.4666, + "step": 1232 + }, + { + "epoch": 4.991902834008097, + "grad_norm": 2.6679775725786286, + "learning_rate": 5.8891093033060945e-06, + "loss": 1.4425, + "step": 1233 + }, + { + "epoch": 4.995951417004049, + "grad_norm": 2.6288454727168067, + "learning_rate": 5.8821549221878795e-06, + "loss": 1.7597, + "step": 1234 + }, + { + "epoch": 5.0, + "grad_norm": 2.885385124366649, + "learning_rate": 5.8751987792331365e-06, + "loss": 1.4922, + "step": 1235 + }, + { + "epoch": 5.004048582995951, + "grad_norm": 2.87961175357714, + "learning_rate": 5.8682408883346535e-06, + "loss": 1.5315, + "step": 1236 + }, + { + "epoch": 5.008097165991903, + "grad_norm": 3.895617299101059, + "learning_rate": 5.861281263388699e-06, + "loss": 1.6767, + "step": 1237 + }, + { + "epoch": 5.012145748987854, + "grad_norm": 3.762686290641399, + "learning_rate": 5.854319918295012e-06, + "loss": 1.5156, + "step": 1238 + }, + { + "epoch": 5.016194331983805, + "grad_norm": 4.177708865223027, + "learning_rate": 5.8473568669567645e-06, + "loss": 1.7157, + "step": 1239 + }, + { + "epoch": 5.020242914979757, + "grad_norm": 3.5866973777228996, + "learning_rate": 5.84039212328054e-06, + "loss": 1.9457, + "step": 1240 + }, + { + "epoch": 5.0242914979757085, + "grad_norm": 3.7038579253911434, + "learning_rate": 5.833425701176294e-06, + "loss": 1.8054, + "step": 1241 + }, + { + "epoch": 5.02834008097166, + "grad_norm": 3.053021737504678, + "learning_rate": 5.826457614557342e-06, + "loss": 1.4846, + "step": 1242 + }, + { + "epoch": 5.032388663967612, + "grad_norm": 3.7131269515944236, + "learning_rate": 5.819487877340318e-06, + "loss": 1.4864, + "step": 1243 + }, + { + "epoch": 5.036437246963563, + "grad_norm": 3.47442806634264, + "learning_rate": 5.812516503445158e-06, + "loss": 1.7235, + "step": 1244 + }, + { + "epoch": 5.040485829959514, + "grad_norm": 3.509517402822926, + "learning_rate": 5.805543506795063e-06, + "loss": 1.517, + "step": 1245 + }, + { + "epoch": 5.044534412955466, + "grad_norm": 3.3619188629392305, + "learning_rate": 5.798568901316475e-06, + "loss": 1.5768, + "step": 1246 + }, + { + "epoch": 5.048582995951417, + "grad_norm": 3.557428062968091, + "learning_rate": 5.79159270093905e-06, + "loss": 1.5018, + "step": 1247 + }, + { + "epoch": 5.052631578947368, + "grad_norm": 3.7281770232445295, + "learning_rate": 5.784614919595631e-06, + "loss": 1.5785, + "step": 1248 + }, + { + "epoch": 5.05668016194332, + "grad_norm": 3.517681869861109, + "learning_rate": 5.7776355712222165e-06, + "loss": 1.4217, + "step": 1249 + }, + { + "epoch": 5.060728744939271, + "grad_norm": 3.487707428141539, + "learning_rate": 5.770654669757935e-06, + "loss": 1.5864, + "step": 1250 + }, + { + "epoch": 5.064777327935222, + "grad_norm": 3.79463286822166, + "learning_rate": 5.763672229145015e-06, + "loss": 1.5406, + "step": 1251 + }, + { + "epoch": 5.068825910931174, + "grad_norm": 
3.9587280022782623, + "learning_rate": 5.756688263328762e-06, + "loss": 1.6808, + "step": 1252 + }, + { + "epoch": 5.0728744939271255, + "grad_norm": 3.574038459442136, + "learning_rate": 5.749702786257529e-06, + "loss": 1.6199, + "step": 1253 + }, + { + "epoch": 5.076923076923077, + "grad_norm": 3.9239619763747666, + "learning_rate": 5.742715811882682e-06, + "loss": 1.5554, + "step": 1254 + }, + { + "epoch": 5.080971659919029, + "grad_norm": 3.3525677000904435, + "learning_rate": 5.735727354158581e-06, + "loss": 1.5965, + "step": 1255 + }, + { + "epoch": 5.08502024291498, + "grad_norm": 3.14038896931749, + "learning_rate": 5.7287374270425475e-06, + "loss": 1.5955, + "step": 1256 + }, + { + "epoch": 5.089068825910931, + "grad_norm": 3.800313028867603, + "learning_rate": 5.721746044494838e-06, + "loss": 1.5594, + "step": 1257 + }, + { + "epoch": 5.093117408906883, + "grad_norm": 3.5079921931841707, + "learning_rate": 5.714753220478616e-06, + "loss": 1.6374, + "step": 1258 + }, + { + "epoch": 5.097165991902834, + "grad_norm": 3.3722158742610033, + "learning_rate": 5.707758968959923e-06, + "loss": 1.3947, + "step": 1259 + }, + { + "epoch": 5.101214574898785, + "grad_norm": 3.690572058964337, + "learning_rate": 5.7007633039076535e-06, + "loss": 1.5641, + "step": 1260 + }, + { + "epoch": 5.105263157894737, + "grad_norm": 3.868480542932687, + "learning_rate": 5.693766239293522e-06, + "loss": 1.5403, + "step": 1261 + }, + { + "epoch": 5.109311740890688, + "grad_norm": 3.642440736287873, + "learning_rate": 5.686767789092041e-06, + "loss": 1.4899, + "step": 1262 + }, + { + "epoch": 5.113360323886639, + "grad_norm": 4.407879993174004, + "learning_rate": 5.67976796728049e-06, + "loss": 1.4415, + "step": 1263 + }, + { + "epoch": 5.117408906882591, + "grad_norm": 3.9268283691257166, + "learning_rate": 5.672766787838884e-06, + "loss": 1.349, + "step": 1264 + }, + { + "epoch": 5.1214574898785425, + "grad_norm": 3.5424496240381282, + "learning_rate": 5.6657642647499545e-06, + "loss": 1.4005, + "step": 1265 + }, + { + "epoch": 5.125506072874494, + "grad_norm": 3.714267182183359, + "learning_rate": 5.658760411999115e-06, + "loss": 1.4047, + "step": 1266 + }, + { + "epoch": 5.129554655870446, + "grad_norm": 4.1352520308511425, + "learning_rate": 5.6517552435744325e-06, + "loss": 1.3041, + "step": 1267 + }, + { + "epoch": 5.133603238866397, + "grad_norm": 3.1992855070868185, + "learning_rate": 5.644748773466606e-06, + "loss": 1.6559, + "step": 1268 + }, + { + "epoch": 5.137651821862348, + "grad_norm": 3.852499540993822, + "learning_rate": 5.637741015668929e-06, + "loss": 1.4822, + "step": 1269 + }, + { + "epoch": 5.1417004048583, + "grad_norm": 3.0057363516680513, + "learning_rate": 5.630731984177269e-06, + "loss": 1.2246, + "step": 1270 + }, + { + "epoch": 5.145748987854251, + "grad_norm": 3.8748912975587544, + "learning_rate": 5.62372169299004e-06, + "loss": 1.5924, + "step": 1271 + }, + { + "epoch": 5.149797570850202, + "grad_norm": 3.5771984578664875, + "learning_rate": 5.616710156108167e-06, + "loss": 1.4133, + "step": 1272 + }, + { + "epoch": 5.153846153846154, + "grad_norm": 3.2086974588686576, + "learning_rate": 5.609697387535068e-06, + "loss": 1.621, + "step": 1273 + }, + { + "epoch": 5.157894736842105, + "grad_norm": 3.984819835501151, + "learning_rate": 5.6026834012766155e-06, + "loss": 1.7158, + "step": 1274 + }, + { + "epoch": 5.161943319838056, + "grad_norm": 3.2013860532982337, + "learning_rate": 5.5956682113411184e-06, + "loss": 1.4746, + "step": 1275 + }, + { + "epoch": 5.165991902834008, + 
"grad_norm": 3.450642934981606, + "learning_rate": 5.588651831739289e-06, + "loss": 1.5543, + "step": 1276 + }, + { + "epoch": 5.17004048582996, + "grad_norm": 3.093776549631426, + "learning_rate": 5.581634276484211e-06, + "loss": 2.074, + "step": 1277 + }, + { + "epoch": 5.174089068825911, + "grad_norm": 3.545758099078526, + "learning_rate": 5.574615559591323e-06, + "loss": 1.3906, + "step": 1278 + }, + { + "epoch": 5.178137651821863, + "grad_norm": 4.14672203994261, + "learning_rate": 5.567595695078379e-06, + "loss": 1.5738, + "step": 1279 + }, + { + "epoch": 5.182186234817814, + "grad_norm": 2.9347838837502294, + "learning_rate": 5.560574696965425e-06, + "loss": 1.3815, + "step": 1280 + }, + { + "epoch": 5.186234817813765, + "grad_norm": 3.90774860265149, + "learning_rate": 5.553552579274775e-06, + "loss": 1.5673, + "step": 1281 + }, + { + "epoch": 5.190283400809717, + "grad_norm": 3.578616704951525, + "learning_rate": 5.546529356030974e-06, + "loss": 1.5733, + "step": 1282 + }, + { + "epoch": 5.194331983805668, + "grad_norm": 4.0010401720998185, + "learning_rate": 5.539505041260779e-06, + "loss": 1.757, + "step": 1283 + }, + { + "epoch": 5.198380566801619, + "grad_norm": 3.509112575984563, + "learning_rate": 5.532479648993122e-06, + "loss": 1.8081, + "step": 1284 + }, + { + "epoch": 5.202429149797571, + "grad_norm": 3.5347317901565556, + "learning_rate": 5.525453193259094e-06, + "loss": 1.5116, + "step": 1285 + }, + { + "epoch": 5.206477732793522, + "grad_norm": 3.4675375372116184, + "learning_rate": 5.518425688091906e-06, + "loss": 1.8506, + "step": 1286 + }, + { + "epoch": 5.2105263157894735, + "grad_norm": 3.6323230014040306, + "learning_rate": 5.511397147526862e-06, + "loss": 1.8682, + "step": 1287 + }, + { + "epoch": 5.2145748987854255, + "grad_norm": 3.5536336190454048, + "learning_rate": 5.504367585601342e-06, + "loss": 1.6388, + "step": 1288 + }, + { + "epoch": 5.218623481781377, + "grad_norm": 3.6273876631462905, + "learning_rate": 5.497337016354757e-06, + "loss": 1.5266, + "step": 1289 + }, + { + "epoch": 5.222672064777328, + "grad_norm": 3.605955542328613, + "learning_rate": 5.490305453828534e-06, + "loss": 1.4274, + "step": 1290 + }, + { + "epoch": 5.22672064777328, + "grad_norm": 3.594834856645006, + "learning_rate": 5.483272912066084e-06, + "loss": 1.6117, + "step": 1291 + }, + { + "epoch": 5.230769230769231, + "grad_norm": 3.6817183177194295, + "learning_rate": 5.476239405112775e-06, + "loss": 1.4265, + "step": 1292 + }, + { + "epoch": 5.234817813765182, + "grad_norm": 4.022022675891982, + "learning_rate": 5.469204947015897e-06, + "loss": 1.668, + "step": 1293 + }, + { + "epoch": 5.238866396761134, + "grad_norm": 3.889168025126557, + "learning_rate": 5.462169551824648e-06, + "loss": 1.6076, + "step": 1294 + }, + { + "epoch": 5.242914979757085, + "grad_norm": 3.6700082316334273, + "learning_rate": 5.45513323359009e-06, + "loss": 1.6171, + "step": 1295 + }, + { + "epoch": 5.246963562753036, + "grad_norm": 3.6748741609947855, + "learning_rate": 5.448096006365132e-06, + "loss": 1.4488, + "step": 1296 + }, + { + "epoch": 5.251012145748988, + "grad_norm": 3.6290737200114993, + "learning_rate": 5.4410578842045e-06, + "loss": 1.5478, + "step": 1297 + }, + { + "epoch": 5.255060728744939, + "grad_norm": 3.8478048256636357, + "learning_rate": 5.434018881164702e-06, + "loss": 1.523, + "step": 1298 + }, + { + "epoch": 5.2591093117408905, + "grad_norm": 3.312410066611835, + "learning_rate": 5.426979011304012e-06, + "loss": 1.4463, + "step": 1299 + }, + { + "epoch": 5.2631578947368425, 
+ "grad_norm": 3.647621711678499, + "learning_rate": 5.41993828868243e-06, + "loss": 1.2639, + "step": 1300 + }, + { + "epoch": 5.267206477732794, + "grad_norm": 3.5536727878739205, + "learning_rate": 5.412896727361663e-06, + "loss": 1.5401, + "step": 1301 + }, + { + "epoch": 5.271255060728745, + "grad_norm": 3.539451611896165, + "learning_rate": 5.405854341405088e-06, + "loss": 1.5594, + "step": 1302 + }, + { + "epoch": 5.275303643724697, + "grad_norm": 3.4030202032336394, + "learning_rate": 5.398811144877733e-06, + "loss": 1.5997, + "step": 1303 + }, + { + "epoch": 5.279352226720648, + "grad_norm": 4.605755727643003, + "learning_rate": 5.391767151846247e-06, + "loss": 1.5551, + "step": 1304 + }, + { + "epoch": 5.283400809716599, + "grad_norm": 4.210060420659593, + "learning_rate": 5.384722376378861e-06, + "loss": 1.2388, + "step": 1305 + }, + { + "epoch": 5.287449392712551, + "grad_norm": 4.288644203676987, + "learning_rate": 5.377676832545377e-06, + "loss": 1.3926, + "step": 1306 + }, + { + "epoch": 5.291497975708502, + "grad_norm": 4.344641505323721, + "learning_rate": 5.370630534417133e-06, + "loss": 1.2335, + "step": 1307 + }, + { + "epoch": 5.295546558704453, + "grad_norm": 4.293842456125265, + "learning_rate": 5.363583496066963e-06, + "loss": 1.5097, + "step": 1308 + }, + { + "epoch": 5.299595141700405, + "grad_norm": 3.5889617840380956, + "learning_rate": 5.356535731569189e-06, + "loss": 1.6798, + "step": 1309 + }, + { + "epoch": 5.303643724696356, + "grad_norm": 3.8949744261018844, + "learning_rate": 5.349487254999579e-06, + "loss": 1.3501, + "step": 1310 + }, + { + "epoch": 5.3076923076923075, + "grad_norm": 3.8938141628185394, + "learning_rate": 5.342438080435325e-06, + "loss": 1.3823, + "step": 1311 + }, + { + "epoch": 5.3117408906882595, + "grad_norm": 3.7811284620632146, + "learning_rate": 5.335388221955012e-06, + "loss": 1.4001, + "step": 1312 + }, + { + "epoch": 5.315789473684211, + "grad_norm": 4.504485300390198, + "learning_rate": 5.328337693638591e-06, + "loss": 1.3433, + "step": 1313 + }, + { + "epoch": 5.319838056680162, + "grad_norm": 3.9863561932252, + "learning_rate": 5.321286509567351e-06, + "loss": 1.2701, + "step": 1314 + }, + { + "epoch": 5.323886639676114, + "grad_norm": 4.103946070839009, + "learning_rate": 5.314234683823892e-06, + "loss": 1.2979, + "step": 1315 + }, + { + "epoch": 5.327935222672065, + "grad_norm": 3.9048810862002896, + "learning_rate": 5.307182230492089e-06, + "loss": 1.3284, + "step": 1316 + }, + { + "epoch": 5.331983805668016, + "grad_norm": 3.802962634621348, + "learning_rate": 5.300129163657081e-06, + "loss": 1.3376, + "step": 1317 + }, + { + "epoch": 5.336032388663968, + "grad_norm": 3.6151941699291696, + "learning_rate": 5.2930754974052245e-06, + "loss": 1.3976, + "step": 1318 + }, + { + "epoch": 5.340080971659919, + "grad_norm": 3.4851660754400124, + "learning_rate": 5.286021245824075e-06, + "loss": 1.3431, + "step": 1319 + }, + { + "epoch": 5.34412955465587, + "grad_norm": 3.7167755157754008, + "learning_rate": 5.2789664230023595e-06, + "loss": 1.295, + "step": 1320 + }, + { + "epoch": 5.348178137651822, + "grad_norm": 4.41974802384744, + "learning_rate": 5.2719110430299416e-06, + "loss": 1.4491, + "step": 1321 + }, + { + "epoch": 5.352226720647773, + "grad_norm": 4.277030621050548, + "learning_rate": 5.264855119997803e-06, + "loss": 1.4354, + "step": 1322 + }, + { + "epoch": 5.3562753036437245, + "grad_norm": 4.194929698692418, + "learning_rate": 5.257798667998003e-06, + "loss": 1.0844, + "step": 1323 + }, + { + "epoch": 
5.3603238866396765, + "grad_norm": 4.472113694740598, + "learning_rate": 5.2507417011236625e-06, + "loss": 1.4929, + "step": 1324 + }, + { + "epoch": 5.364372469635628, + "grad_norm": 3.9849001434928866, + "learning_rate": 5.243684233468933e-06, + "loss": 1.5648, + "step": 1325 + }, + { + "epoch": 5.368421052631579, + "grad_norm": 3.864302824850682, + "learning_rate": 5.236626279128958e-06, + "loss": 1.473, + "step": 1326 + }, + { + "epoch": 5.372469635627531, + "grad_norm": 4.810968253503194, + "learning_rate": 5.22956785219986e-06, + "loss": 1.5456, + "step": 1327 + }, + { + "epoch": 5.376518218623482, + "grad_norm": 4.111208820335583, + "learning_rate": 5.222508966778702e-06, + "loss": 1.2098, + "step": 1328 + }, + { + "epoch": 5.380566801619433, + "grad_norm": 4.534807999665865, + "learning_rate": 5.2154496369634645e-06, + "loss": 1.363, + "step": 1329 + }, + { + "epoch": 5.384615384615385, + "grad_norm": 4.048755500092308, + "learning_rate": 5.208389876853014e-06, + "loss": 1.1592, + "step": 1330 + }, + { + "epoch": 5.388663967611336, + "grad_norm": 4.75370785314969, + "learning_rate": 5.201329700547077e-06, + "loss": 1.226, + "step": 1331 + }, + { + "epoch": 5.392712550607287, + "grad_norm": 4.367024994722068, + "learning_rate": 5.194269122146211e-06, + "loss": 1.4048, + "step": 1332 + }, + { + "epoch": 5.396761133603239, + "grad_norm": 4.918852915006795, + "learning_rate": 5.187208155751779e-06, + "loss": 1.2387, + "step": 1333 + }, + { + "epoch": 5.40080971659919, + "grad_norm": 3.6289200014371894, + "learning_rate": 5.180146815465915e-06, + "loss": 1.2571, + "step": 1334 + }, + { + "epoch": 5.4048582995951415, + "grad_norm": 3.7443218122005266, + "learning_rate": 5.173085115391502e-06, + "loss": 1.3062, + "step": 1335 + }, + { + "epoch": 5.4089068825910935, + "grad_norm": 4.7017026873802426, + "learning_rate": 5.16602306963214e-06, + "loss": 1.4154, + "step": 1336 + }, + { + "epoch": 5.412955465587045, + "grad_norm": 4.150505086067103, + "learning_rate": 5.158960692292122e-06, + "loss": 1.2259, + "step": 1337 + }, + { + "epoch": 5.417004048582996, + "grad_norm": 4.482184582986182, + "learning_rate": 5.151897997476403e-06, + "loss": 1.5583, + "step": 1338 + }, + { + "epoch": 5.421052631578947, + "grad_norm": 4.682227327595727, + "learning_rate": 5.144834999290567e-06, + "loss": 1.598, + "step": 1339 + }, + { + "epoch": 5.425101214574899, + "grad_norm": 4.008926002575055, + "learning_rate": 5.137771711840811e-06, + "loss": 1.5379, + "step": 1340 + }, + { + "epoch": 5.42914979757085, + "grad_norm": 4.302820633137393, + "learning_rate": 5.130708149233905e-06, + "loss": 1.5569, + "step": 1341 + }, + { + "epoch": 5.433198380566802, + "grad_norm": 3.5969352441824007, + "learning_rate": 5.123644325577168e-06, + "loss": 1.7237, + "step": 1342 + }, + { + "epoch": 5.437246963562753, + "grad_norm": 4.1865532032949035, + "learning_rate": 5.116580254978447e-06, + "loss": 1.4932, + "step": 1343 + }, + { + "epoch": 5.441295546558704, + "grad_norm": 4.443537220527738, + "learning_rate": 5.1095159515460736e-06, + "loss": 1.4349, + "step": 1344 + }, + { + "epoch": 5.445344129554655, + "grad_norm": 3.8400638359623653, + "learning_rate": 5.10245142938885e-06, + "loss": 1.6808, + "step": 1345 + }, + { + "epoch": 5.449392712550607, + "grad_norm": 4.456713357432363, + "learning_rate": 5.095386702616012e-06, + "loss": 1.4753, + "step": 1346 + }, + { + "epoch": 5.4534412955465585, + "grad_norm": 4.371248488578587, + "learning_rate": 5.088321785337207e-06, + "loss": 1.4634, + "step": 1347 + }, + { + 
"epoch": 5.4574898785425106, + "grad_norm": 4.503939177016205, + "learning_rate": 5.0812566916624624e-06, + "loss": 1.8175, + "step": 1348 + }, + { + "epoch": 5.461538461538462, + "grad_norm": 5.8661687643019444, + "learning_rate": 5.074191435702155e-06, + "loss": 1.9684, + "step": 1349 + }, + { + "epoch": 5.465587044534413, + "grad_norm": 4.324067092257868, + "learning_rate": 5.067126031566988e-06, + "loss": 1.6405, + "step": 1350 + }, + { + "epoch": 5.469635627530364, + "grad_norm": 3.796039870689883, + "learning_rate": 5.060060493367961e-06, + "loss": 1.6486, + "step": 1351 + }, + { + "epoch": 5.473684210526316, + "grad_norm": 3.738600525398421, + "learning_rate": 5.05299483521634e-06, + "loss": 1.5872, + "step": 1352 + }, + { + "epoch": 5.477732793522267, + "grad_norm": 4.6006758703016, + "learning_rate": 5.045929071223633e-06, + "loss": 1.5976, + "step": 1353 + }, + { + "epoch": 5.481781376518219, + "grad_norm": 3.3463637296184854, + "learning_rate": 5.038863215501555e-06, + "loss": 1.5156, + "step": 1354 + }, + { + "epoch": 5.48582995951417, + "grad_norm": 3.8425032487043813, + "learning_rate": 5.031797282162007e-06, + "loss": 1.4631, + "step": 1355 + }, + { + "epoch": 5.489878542510121, + "grad_norm": 4.548619092337232, + "learning_rate": 5.024731285317046e-06, + "loss": 1.3972, + "step": 1356 + }, + { + "epoch": 5.493927125506072, + "grad_norm": 4.814717659012562, + "learning_rate": 5.017665239078854e-06, + "loss": 1.4267, + "step": 1357 + }, + { + "epoch": 5.497975708502024, + "grad_norm": 3.6552584947768096, + "learning_rate": 5.010599157559713e-06, + "loss": 1.2966, + "step": 1358 + }, + { + "epoch": 5.502024291497976, + "grad_norm": 4.204585823006649, + "learning_rate": 5.003533054871973e-06, + "loss": 1.15, + "step": 1359 + }, + { + "epoch": 5.506072874493928, + "grad_norm": 4.634653281785678, + "learning_rate": 4.996466945128029e-06, + "loss": 1.5181, + "step": 1360 + }, + { + "epoch": 5.510121457489879, + "grad_norm": 4.3188079424314, + "learning_rate": 4.98940084244029e-06, + "loss": 1.4787, + "step": 1361 + }, + { + "epoch": 5.51417004048583, + "grad_norm": 3.332377152961891, + "learning_rate": 4.982334760921149e-06, + "loss": 1.4434, + "step": 1362 + }, + { + "epoch": 5.518218623481781, + "grad_norm": 4.271374565670683, + "learning_rate": 4.975268714682956e-06, + "loss": 1.4766, + "step": 1363 + }, + { + "epoch": 5.522267206477733, + "grad_norm": 4.388046491535482, + "learning_rate": 4.968202717837996e-06, + "loss": 1.4244, + "step": 1364 + }, + { + "epoch": 5.526315789473684, + "grad_norm": 4.81529396324836, + "learning_rate": 4.961136784498448e-06, + "loss": 1.2532, + "step": 1365 + }, + { + "epoch": 5.530364372469636, + "grad_norm": 4.589391225576633, + "learning_rate": 4.9540709287763685e-06, + "loss": 1.3152, + "step": 1366 + }, + { + "epoch": 5.534412955465587, + "grad_norm": 5.101062956149816, + "learning_rate": 4.947005164783661e-06, + "loss": 1.409, + "step": 1367 + }, + { + "epoch": 5.538461538461538, + "grad_norm": 4.286443288173012, + "learning_rate": 4.939939506632041e-06, + "loss": 1.6652, + "step": 1368 + }, + { + "epoch": 5.5425101214574894, + "grad_norm": 3.857994197551904, + "learning_rate": 4.932873968433014e-06, + "loss": 1.5821, + "step": 1369 + }, + { + "epoch": 5.5465587044534415, + "grad_norm": 82.8177825176114, + "learning_rate": 4.925808564297847e-06, + "loss": 2.0481, + "step": 1370 + }, + { + "epoch": 5.550607287449393, + "grad_norm": 8.294269069115597, + "learning_rate": 4.918743308337539e-06, + "loss": 1.9382, + "step": 1371 + }, + { + 
"epoch": 5.554655870445345, + "grad_norm": 8.675625865701205, + "learning_rate": 4.911678214662795e-06, + "loss": 2.2234, + "step": 1372 + }, + { + "epoch": 5.558704453441296, + "grad_norm": 3.9912695390847595, + "learning_rate": 4.9046132973839895e-06, + "loss": 1.4514, + "step": 1373 + }, + { + "epoch": 5.562753036437247, + "grad_norm": 3.603893380101875, + "learning_rate": 4.897548570611153e-06, + "loss": 1.3266, + "step": 1374 + }, + { + "epoch": 5.566801619433198, + "grad_norm": 3.6938504736682054, + "learning_rate": 4.890484048453928e-06, + "loss": 1.704, + "step": 1375 + }, + { + "epoch": 5.57085020242915, + "grad_norm": 4.1771900748802135, + "learning_rate": 4.883419745021554e-06, + "loss": 1.3432, + "step": 1376 + }, + { + "epoch": 5.574898785425101, + "grad_norm": 4.029068029464602, + "learning_rate": 4.8763556744228324e-06, + "loss": 1.5548, + "step": 1377 + }, + { + "epoch": 5.578947368421053, + "grad_norm": 3.1723858445451776, + "learning_rate": 4.869291850766097e-06, + "loss": 1.3556, + "step": 1378 + }, + { + "epoch": 5.582995951417004, + "grad_norm": 3.9383901181787118, + "learning_rate": 4.862228288159191e-06, + "loss": 1.4828, + "step": 1379 + }, + { + "epoch": 5.587044534412955, + "grad_norm": 3.8742071296776883, + "learning_rate": 4.855165000709434e-06, + "loss": 1.4776, + "step": 1380 + }, + { + "epoch": 5.5910931174089065, + "grad_norm": 4.320505162169018, + "learning_rate": 4.848102002523597e-06, + "loss": 1.4632, + "step": 1381 + }, + { + "epoch": 5.5951417004048585, + "grad_norm": 3.8728016571115496, + "learning_rate": 4.841039307707878e-06, + "loss": 1.1957, + "step": 1382 + }, + { + "epoch": 5.59919028340081, + "grad_norm": 3.492753062395854, + "learning_rate": 4.833976930367859e-06, + "loss": 1.2615, + "step": 1383 + }, + { + "epoch": 5.603238866396762, + "grad_norm": 3.5488104026542513, + "learning_rate": 4.8269148846085e-06, + "loss": 1.3531, + "step": 1384 + }, + { + "epoch": 5.607287449392713, + "grad_norm": 4.068763646311401, + "learning_rate": 4.819853184534085e-06, + "loss": 1.1753, + "step": 1385 + }, + { + "epoch": 5.611336032388664, + "grad_norm": 4.377905274086795, + "learning_rate": 4.812791844248223e-06, + "loss": 1.4958, + "step": 1386 + }, + { + "epoch": 5.615384615384615, + "grad_norm": 3.6007003800569386, + "learning_rate": 4.80573087785379e-06, + "loss": 1.4974, + "step": 1387 + }, + { + "epoch": 5.619433198380567, + "grad_norm": 4.802311568406072, + "learning_rate": 4.798670299452926e-06, + "loss": 1.2282, + "step": 1388 + }, + { + "epoch": 5.623481781376518, + "grad_norm": 4.7745139328350135, + "learning_rate": 4.7916101231469886e-06, + "loss": 1.6082, + "step": 1389 + }, + { + "epoch": 5.62753036437247, + "grad_norm": 4.123643145041474, + "learning_rate": 4.784550363036539e-06, + "loss": 1.4134, + "step": 1390 + }, + { + "epoch": 5.631578947368421, + "grad_norm": 4.402507798104486, + "learning_rate": 4.7774910332213005e-06, + "loss": 1.6983, + "step": 1391 + }, + { + "epoch": 5.635627530364372, + "grad_norm": 3.8264895380697355, + "learning_rate": 4.770432147800141e-06, + "loss": 1.2975, + "step": 1392 + }, + { + "epoch": 5.6396761133603235, + "grad_norm": 4.517127158006528, + "learning_rate": 4.763373720871044e-06, + "loss": 1.5541, + "step": 1393 + }, + { + "epoch": 5.6437246963562755, + "grad_norm": 3.773516174749104, + "learning_rate": 4.756315766531069e-06, + "loss": 1.4461, + "step": 1394 + }, + { + "epoch": 5.647773279352227, + "grad_norm": 4.115306809751942, + "learning_rate": 4.749258298876338e-06, + "loss": 1.5498, + "step": 1395 + 
}, + { + "epoch": 5.651821862348179, + "grad_norm": 3.6874924730709413, + "learning_rate": 4.742201332001998e-06, + "loss": 1.333, + "step": 1396 + }, + { + "epoch": 5.65587044534413, + "grad_norm": 4.445009061040838, + "learning_rate": 4.735144880002199e-06, + "loss": 1.556, + "step": 1397 + }, + { + "epoch": 5.659919028340081, + "grad_norm": 4.819457563644938, + "learning_rate": 4.728088956970059e-06, + "loss": 1.3788, + "step": 1398 + }, + { + "epoch": 5.663967611336032, + "grad_norm": 3.9520027905188275, + "learning_rate": 4.721033576997641e-06, + "loss": 1.5347, + "step": 1399 + }, + { + "epoch": 5.668016194331984, + "grad_norm": 4.124422632263573, + "learning_rate": 4.713978754175926e-06, + "loss": 1.5292, + "step": 1400 + }, + { + "epoch": 5.672064777327935, + "grad_norm": 4.475410908220464, + "learning_rate": 4.706924502594777e-06, + "loss": 1.6549, + "step": 1401 + }, + { + "epoch": 5.676113360323887, + "grad_norm": 9.027913146446028, + "learning_rate": 4.69987083634292e-06, + "loss": 1.5814, + "step": 1402 + }, + { + "epoch": 5.680161943319838, + "grad_norm": 4.584849302385236, + "learning_rate": 4.692817769507912e-06, + "loss": 1.4982, + "step": 1403 + }, + { + "epoch": 5.684210526315789, + "grad_norm": 4.088441988479735, + "learning_rate": 4.685765316176111e-06, + "loss": 1.3453, + "step": 1404 + }, + { + "epoch": 5.6882591093117405, + "grad_norm": 3.94840157844417, + "learning_rate": 4.67871349043265e-06, + "loss": 1.4717, + "step": 1405 + }, + { + "epoch": 5.6923076923076925, + "grad_norm": 4.252654676588602, + "learning_rate": 4.671662306361409e-06, + "loss": 1.4891, + "step": 1406 + }, + { + "epoch": 5.696356275303644, + "grad_norm": 3.784433251453805, + "learning_rate": 4.664611778044988e-06, + "loss": 1.3408, + "step": 1407 + }, + { + "epoch": 5.700404858299595, + "grad_norm": 4.988371722598511, + "learning_rate": 4.657561919564675e-06, + "loss": 1.8095, + "step": 1408 + }, + { + "epoch": 5.704453441295547, + "grad_norm": 4.664322457086443, + "learning_rate": 4.6505127450004216e-06, + "loss": 1.6024, + "step": 1409 + }, + { + "epoch": 5.708502024291498, + "grad_norm": 4.600715197938257, + "learning_rate": 4.643464268430812e-06, + "loss": 1.2021, + "step": 1410 + }, + { + "epoch": 5.712550607287449, + "grad_norm": 3.9099782560794503, + "learning_rate": 4.636416503933038e-06, + "loss": 1.3472, + "step": 1411 + }, + { + "epoch": 5.716599190283401, + "grad_norm": 3.9111543599245757, + "learning_rate": 4.62936946558287e-06, + "loss": 1.4523, + "step": 1412 + }, + { + "epoch": 5.720647773279352, + "grad_norm": 4.6487019160659, + "learning_rate": 4.622323167454623e-06, + "loss": 1.2302, + "step": 1413 + }, + { + "epoch": 5.724696356275303, + "grad_norm": 4.4548900152472815, + "learning_rate": 4.6152776236211415e-06, + "loss": 1.4256, + "step": 1414 + }, + { + "epoch": 5.728744939271255, + "grad_norm": 4.058092491633072, + "learning_rate": 4.608232848153757e-06, + "loss": 1.6055, + "step": 1415 + }, + { + "epoch": 5.732793522267206, + "grad_norm": 4.025502584936106, + "learning_rate": 4.601188855122269e-06, + "loss": 1.3484, + "step": 1416 + }, + { + "epoch": 5.7368421052631575, + "grad_norm": 4.1244592308665275, + "learning_rate": 4.594145658594914e-06, + "loss": 1.4537, + "step": 1417 + }, + { + "epoch": 5.7408906882591095, + "grad_norm": 4.167306098888644, + "learning_rate": 4.587103272638339e-06, + "loss": 2.0785, + "step": 1418 + }, + { + "epoch": 5.744939271255061, + "grad_norm": 3.858307172453616, + "learning_rate": 4.580061711317571e-06, + "loss": 1.5669, + "step": 1419 + 
}, + { + "epoch": 5.748987854251012, + "grad_norm": 4.76966444820156, + "learning_rate": 4.57302098869599e-06, + "loss": 1.3901, + "step": 1420 + }, + { + "epoch": 5.753036437246964, + "grad_norm": 4.3778097624694166, + "learning_rate": 4.565981118835299e-06, + "loss": 1.291, + "step": 1421 + }, + { + "epoch": 5.757085020242915, + "grad_norm": 4.090411706131635, + "learning_rate": 4.558942115795502e-06, + "loss": 1.4406, + "step": 1422 + }, + { + "epoch": 5.761133603238866, + "grad_norm": 5.337161250566187, + "learning_rate": 4.551903993634869e-06, + "loss": 2.1851, + "step": 1423 + }, + { + "epoch": 5.765182186234818, + "grad_norm": 6.286779559937267, + "learning_rate": 4.5448667664099125e-06, + "loss": 1.9602, + "step": 1424 + }, + { + "epoch": 5.769230769230769, + "grad_norm": 6.765386541677961, + "learning_rate": 4.537830448175354e-06, + "loss": 1.8644, + "step": 1425 + }, + { + "epoch": 5.77327935222672, + "grad_norm": 4.009998051124011, + "learning_rate": 4.530795052984104e-06, + "loss": 1.3677, + "step": 1426 + }, + { + "epoch": 5.777327935222672, + "grad_norm": 4.067144464386327, + "learning_rate": 4.523760594887228e-06, + "loss": 1.6488, + "step": 1427 + }, + { + "epoch": 5.781376518218623, + "grad_norm": 3.900176884022236, + "learning_rate": 4.5167270879339165e-06, + "loss": 1.6378, + "step": 1428 + }, + { + "epoch": 5.7854251012145745, + "grad_norm": 4.307053870196715, + "learning_rate": 4.509694546171468e-06, + "loss": 1.458, + "step": 1429 + }, + { + "epoch": 5.7894736842105265, + "grad_norm": 4.202185719713703, + "learning_rate": 4.5026629836452445e-06, + "loss": 1.3863, + "step": 1430 + }, + { + "epoch": 5.793522267206478, + "grad_norm": 4.276979157413732, + "learning_rate": 4.495632414398659e-06, + "loss": 1.4133, + "step": 1431 + }, + { + "epoch": 5.797570850202429, + "grad_norm": 4.560387387278901, + "learning_rate": 4.488602852473138e-06, + "loss": 1.4313, + "step": 1432 + }, + { + "epoch": 5.801619433198381, + "grad_norm": 3.900998231009241, + "learning_rate": 4.481574311908096e-06, + "loss": 1.3065, + "step": 1433 + }, + { + "epoch": 5.805668016194332, + "grad_norm": 3.971785106076469, + "learning_rate": 4.4745468067409055e-06, + "loss": 1.1997, + "step": 1434 + }, + { + "epoch": 5.809716599190283, + "grad_norm": 4.230506562739517, + "learning_rate": 4.467520351006878e-06, + "loss": 1.5584, + "step": 1435 + }, + { + "epoch": 5.813765182186235, + "grad_norm": 5.12301466025395, + "learning_rate": 4.460494958739223e-06, + "loss": 1.4086, + "step": 1436 + }, + { + "epoch": 5.817813765182186, + "grad_norm": 4.360480527706543, + "learning_rate": 4.453470643969027e-06, + "loss": 1.2759, + "step": 1437 + }, + { + "epoch": 5.821862348178137, + "grad_norm": 11.774868013423882, + "learning_rate": 4.446447420725227e-06, + "loss": 2.2866, + "step": 1438 + }, + { + "epoch": 5.825910931174089, + "grad_norm": 23.795049320685568, + "learning_rate": 4.439425303034576e-06, + "loss": 3.4094, + "step": 1439 + }, + { + "epoch": 5.82995951417004, + "grad_norm": 4.607383270222987, + "learning_rate": 4.432404304921624e-06, + "loss": 1.3129, + "step": 1440 + }, + { + "epoch": 5.834008097165992, + "grad_norm": 4.67077067966415, + "learning_rate": 4.4253844404086785e-06, + "loss": 1.2285, + "step": 1441 + }, + { + "epoch": 5.838056680161944, + "grad_norm": 3.9312338569636394, + "learning_rate": 4.418365723515791e-06, + "loss": 1.286, + "step": 1442 + }, + { + "epoch": 5.842105263157895, + "grad_norm": 4.003272377775398, + "learning_rate": 4.411348168260713e-06, + "loss": 1.3394, + "step": 1443 + 
}, + { + "epoch": 5.846153846153846, + "grad_norm": 4.140441268173913, + "learning_rate": 4.404331788658882e-06, + "loss": 1.1712, + "step": 1444 + }, + { + "epoch": 5.850202429149798, + "grad_norm": 4.57761440040013, + "learning_rate": 4.397316598723385e-06, + "loss": 1.3548, + "step": 1445 + }, + { + "epoch": 5.854251012145749, + "grad_norm": 4.860966996025116, + "learning_rate": 4.390302612464934e-06, + "loss": 1.4071, + "step": 1446 + }, + { + "epoch": 5.8582995951417, + "grad_norm": 3.557234324926702, + "learning_rate": 4.383289843891835e-06, + "loss": 1.3334, + "step": 1447 + }, + { + "epoch": 5.862348178137652, + "grad_norm": 4.6167043083990515, + "learning_rate": 4.376278307009962e-06, + "loss": 1.332, + "step": 1448 + }, + { + "epoch": 5.866396761133603, + "grad_norm": 4.529476800833651, + "learning_rate": 4.369268015822733e-06, + "loss": 1.336, + "step": 1449 + }, + { + "epoch": 5.870445344129554, + "grad_norm": 5.460345634297291, + "learning_rate": 4.362258984331074e-06, + "loss": 1.7992, + "step": 1450 + }, + { + "epoch": 5.874493927125506, + "grad_norm": 4.852544977047948, + "learning_rate": 4.355251226533396e-06, + "loss": 1.7401, + "step": 1451 + }, + { + "epoch": 5.8785425101214575, + "grad_norm": 5.091561572959863, + "learning_rate": 4.348244756425569e-06, + "loss": 1.4945, + "step": 1452 + }, + { + "epoch": 5.882591093117409, + "grad_norm": 4.66519342749034, + "learning_rate": 4.341239588000887e-06, + "loss": 1.4193, + "step": 1453 + }, + { + "epoch": 5.886639676113361, + "grad_norm": 4.442060928034546, + "learning_rate": 4.334235735250047e-06, + "loss": 1.0274, + "step": 1454 + }, + { + "epoch": 5.890688259109312, + "grad_norm": 3.911256400148853, + "learning_rate": 4.327233212161118e-06, + "loss": 1.5401, + "step": 1455 + }, + { + "epoch": 5.894736842105263, + "grad_norm": 3.8807011184816846, + "learning_rate": 4.320232032719511e-06, + "loss": 1.5831, + "step": 1456 + }, + { + "epoch": 5.898785425101215, + "grad_norm": 3.58685678874274, + "learning_rate": 4.313232210907959e-06, + "loss": 1.3268, + "step": 1457 + }, + { + "epoch": 5.902834008097166, + "grad_norm": 4.318238652473736, + "learning_rate": 4.306233760706478e-06, + "loss": 1.3389, + "step": 1458 + }, + { + "epoch": 5.906882591093117, + "grad_norm": 4.611379978717958, + "learning_rate": 4.299236696092347e-06, + "loss": 1.4306, + "step": 1459 + }, + { + "epoch": 5.910931174089069, + "grad_norm": 3.900073554354451, + "learning_rate": 4.292241031040077e-06, + "loss": 1.1163, + "step": 1460 + }, + { + "epoch": 5.91497975708502, + "grad_norm": 4.550673982692945, + "learning_rate": 4.285246779521384e-06, + "loss": 1.2052, + "step": 1461 + }, + { + "epoch": 5.919028340080971, + "grad_norm": 4.574548958146505, + "learning_rate": 4.278253955505163e-06, + "loss": 1.213, + "step": 1462 + }, + { + "epoch": 5.923076923076923, + "grad_norm": 3.5603964829525725, + "learning_rate": 4.271262572957453e-06, + "loss": 1.5401, + "step": 1463 + }, + { + "epoch": 5.9271255060728745, + "grad_norm": 4.899646738920418, + "learning_rate": 4.264272645841419e-06, + "loss": 1.3832, + "step": 1464 + }, + { + "epoch": 5.931174089068826, + "grad_norm": 4.936217075017478, + "learning_rate": 4.2572841881173205e-06, + "loss": 1.3896, + "step": 1465 + }, + { + "epoch": 5.935222672064778, + "grad_norm": 4.841906645627207, + "learning_rate": 4.250297213742473e-06, + "loss": 1.173, + "step": 1466 + }, + { + "epoch": 5.939271255060729, + "grad_norm": 4.652957613099752, + "learning_rate": 4.243311736671239e-06, + "loss": 1.1544, + "step": 1467 + }, + { 
+ "epoch": 5.94331983805668, + "grad_norm": 5.5395351930289864, + "learning_rate": 4.236327770854987e-06, + "loss": 1.4593, + "step": 1468 + }, + { + "epoch": 5.947368421052632, + "grad_norm": 4.423876597754868, + "learning_rate": 4.229345330242067e-06, + "loss": 1.1935, + "step": 1469 + }, + { + "epoch": 5.951417004048583, + "grad_norm": 5.270192860869612, + "learning_rate": 4.222364428777786e-06, + "loss": 1.1325, + "step": 1470 + }, + { + "epoch": 5.955465587044534, + "grad_norm": 5.410786507887627, + "learning_rate": 4.2153850804043706e-06, + "loss": 1.3971, + "step": 1471 + }, + { + "epoch": 5.959514170040486, + "grad_norm": 4.884826922400209, + "learning_rate": 4.2084072990609505e-06, + "loss": 1.4698, + "step": 1472 + }, + { + "epoch": 5.963562753036437, + "grad_norm": 4.313211329480648, + "learning_rate": 4.201431098683527e-06, + "loss": 1.4382, + "step": 1473 + }, + { + "epoch": 5.967611336032388, + "grad_norm": 5.213303398147368, + "learning_rate": 4.194456493204939e-06, + "loss": 1.5175, + "step": 1474 + }, + { + "epoch": 5.97165991902834, + "grad_norm": 5.448304606946485, + "learning_rate": 4.187483496554844e-06, + "loss": 1.433, + "step": 1475 + }, + { + "epoch": 5.9757085020242915, + "grad_norm": 3.801193566372591, + "learning_rate": 4.1805121226596826e-06, + "loss": 1.4114, + "step": 1476 + }, + { + "epoch": 5.979757085020243, + "grad_norm": 4.17077172984551, + "learning_rate": 4.173542385442659e-06, + "loss": 1.4847, + "step": 1477 + }, + { + "epoch": 5.983805668016195, + "grad_norm": 3.8042786020089285, + "learning_rate": 4.166574298823707e-06, + "loss": 1.5417, + "step": 1478 + }, + { + "epoch": 5.987854251012146, + "grad_norm": 4.0974559638165795, + "learning_rate": 4.1596078767194615e-06, + "loss": 1.3383, + "step": 1479 + }, + { + "epoch": 5.991902834008097, + "grad_norm": 3.4327656830127844, + "learning_rate": 4.152643133043236e-06, + "loss": 1.3384, + "step": 1480 + }, + { + "epoch": 5.995951417004049, + "grad_norm": 3.615327810634163, + "learning_rate": 4.145680081704989e-06, + "loss": 1.6541, + "step": 1481 + }, + { + "epoch": 6.0, + "grad_norm": 3.8329106879075594, + "learning_rate": 4.138718736611302e-06, + "loss": 1.3694, + "step": 1482 + }, + { + "epoch": 6.004048582995951, + "grad_norm": 3.830450157141594, + "learning_rate": 4.131759111665349e-06, + "loss": 1.4049, + "step": 1483 + }, + { + "epoch": 6.008097165991903, + "grad_norm": 5.1111426342190684, + "learning_rate": 4.1248012207668635e-06, + "loss": 1.5639, + "step": 1484 + }, + { + "epoch": 6.012145748987854, + "grad_norm": 4.83681122900061, + "learning_rate": 4.117845077812122e-06, + "loss": 1.3693, + "step": 1485 + }, + { + "epoch": 6.016194331983805, + "grad_norm": 5.4329470747052255, + "learning_rate": 4.110890696693906e-06, + "loss": 1.5831, + "step": 1486 + }, + { + "epoch": 6.020242914979757, + "grad_norm": 4.6500916905003535, + "learning_rate": 4.103938091301479e-06, + "loss": 1.7881, + "step": 1487 + }, + { + "epoch": 6.0242914979757085, + "grad_norm": 4.885048703930011, + "learning_rate": 4.096987275520562e-06, + "loss": 1.6668, + "step": 1488 + }, + { + "epoch": 6.02834008097166, + "grad_norm": 4.13626291343727, + "learning_rate": 4.090038263233294e-06, + "loss": 1.3587, + "step": 1489 + }, + { + "epoch": 6.032388663967612, + "grad_norm": 4.904165295750069, + "learning_rate": 4.08309106831822e-06, + "loss": 1.3678, + "step": 1490 + }, + { + "epoch": 6.036437246963563, + "grad_norm": 4.636168977638758, + "learning_rate": 4.0761457046502515e-06, + "loss": 1.5829, + "step": 1491 + }, + { + 
"epoch": 6.040485829959514, + "grad_norm": 4.665143753358694, + "learning_rate": 4.0692021861006386e-06, + "loss": 1.382, + "step": 1492 + }, + { + "epoch": 6.044534412955466, + "grad_norm": 4.58626969694099, + "learning_rate": 4.062260526536955e-06, + "loss": 1.4891, + "step": 1493 + }, + { + "epoch": 6.048582995951417, + "grad_norm": 4.689483058767236, + "learning_rate": 4.055320739823057e-06, + "loss": 1.3764, + "step": 1494 + }, + { + "epoch": 6.052631578947368, + "grad_norm": 5.0699840890954535, + "learning_rate": 4.048382839819058e-06, + "loss": 1.4399, + "step": 1495 + }, + { + "epoch": 6.05668016194332, + "grad_norm": 4.582891853100069, + "learning_rate": 4.041446840381309e-06, + "loss": 1.2964, + "step": 1496 + }, + { + "epoch": 6.060728744939271, + "grad_norm": 4.596209939663152, + "learning_rate": 4.034512755362361e-06, + "loss": 1.4451, + "step": 1497 + }, + { + "epoch": 6.064777327935222, + "grad_norm": 5.077809534848778, + "learning_rate": 4.027580598610943e-06, + "loss": 1.3934, + "step": 1498 + }, + { + "epoch": 6.068825910931174, + "grad_norm": 5.121648526362897, + "learning_rate": 4.0206503839719335e-06, + "loss": 1.5479, + "step": 1499 + }, + { + "epoch": 6.0728744939271255, + "grad_norm": 4.611548299373776, + "learning_rate": 4.01372212528633e-06, + "loss": 1.4704, + "step": 1500 + }, + { + "epoch": 6.076923076923077, + "grad_norm": 5.312277841332635, + "learning_rate": 4.006795836391226e-06, + "loss": 1.4155, + "step": 1501 + }, + { + "epoch": 6.080971659919029, + "grad_norm": 4.964246172799465, + "learning_rate": 3.999871531119779e-06, + "loss": 1.4857, + "step": 1502 + }, + { + "epoch": 6.08502024291498, + "grad_norm": 4.070954622733409, + "learning_rate": 3.992949223301185e-06, + "loss": 1.4726, + "step": 1503 + }, + { + "epoch": 6.089068825910931, + "grad_norm": 4.91594481744365, + "learning_rate": 3.986028926760655e-06, + "loss": 1.4183, + "step": 1504 + }, + { + "epoch": 6.093117408906883, + "grad_norm": 4.691943755517188, + "learning_rate": 3.9791106553193746e-06, + "loss": 1.497, + "step": 1505 + }, + { + "epoch": 6.097165991902834, + "grad_norm": 4.475695489598384, + "learning_rate": 3.972194422794493e-06, + "loss": 1.2572, + "step": 1506 + }, + { + "epoch": 6.101214574898785, + "grad_norm": 4.947241370368582, + "learning_rate": 3.965280242999083e-06, + "loss": 1.4398, + "step": 1507 + }, + { + "epoch": 6.105263157894737, + "grad_norm": 5.319805507480639, + "learning_rate": 3.9583681297421194e-06, + "loss": 1.3871, + "step": 1508 + }, + { + "epoch": 6.109311740890688, + "grad_norm": 4.749559720069604, + "learning_rate": 3.951458096828449e-06, + "loss": 1.375, + "step": 1509 + }, + { + "epoch": 6.113360323886639, + "grad_norm": 5.727885976477068, + "learning_rate": 3.944550158058762e-06, + "loss": 1.3195, + "step": 1510 + }, + { + "epoch": 6.117408906882591, + "grad_norm": 5.227063382939529, + "learning_rate": 3.937644327229572e-06, + "loss": 1.2256, + "step": 1511 + }, + { + "epoch": 6.1214574898785425, + "grad_norm": 4.738297898420654, + "learning_rate": 3.930740618133173e-06, + "loss": 1.2919, + "step": 1512 + }, + { + "epoch": 6.125506072874494, + "grad_norm": 4.796528713602936, + "learning_rate": 3.923839044557632e-06, + "loss": 1.3028, + "step": 1513 + }, + { + "epoch": 6.129554655870446, + "grad_norm": 5.590663766511934, + "learning_rate": 3.916939620286743e-06, + "loss": 1.1917, + "step": 1514 + }, + { + "epoch": 6.133603238866397, + "grad_norm": 4.16713103068686, + "learning_rate": 3.9100423591000124e-06, + "loss": 1.54, + "step": 1515 + }, + { + 
"epoch": 6.137651821862348, + "grad_norm": 5.035939317822777, + "learning_rate": 3.903147274772624e-06, + "loss": 1.3571, + "step": 1516 + }, + { + "epoch": 6.1417004048583, + "grad_norm": 4.0009552855543955, + "learning_rate": 3.896254381075416e-06, + "loss": 1.1103, + "step": 1517 + }, + { + "epoch": 6.145748987854251, + "grad_norm": 5.217383616489112, + "learning_rate": 3.8893636917748455e-06, + "loss": 1.4538, + "step": 1518 + }, + { + "epoch": 6.149797570850202, + "grad_norm": 4.709807039436491, + "learning_rate": 3.882475220632975e-06, + "loss": 1.2834, + "step": 1519 + }, + { + "epoch": 6.153846153846154, + "grad_norm": 4.179087376153956, + "learning_rate": 3.875588981407433e-06, + "loss": 1.5023, + "step": 1520 + }, + { + "epoch": 6.157894736842105, + "grad_norm": 5.387448869675948, + "learning_rate": 3.86870498785139e-06, + "loss": 1.5494, + "step": 1521 + }, + { + "epoch": 6.161943319838056, + "grad_norm": 4.138048095732358, + "learning_rate": 3.861823253713535e-06, + "loss": 1.3442, + "step": 1522 + }, + { + "epoch": 6.165991902834008, + "grad_norm": 4.522673016609398, + "learning_rate": 3.854943792738037e-06, + "loss": 1.4306, + "step": 1523 + }, + { + "epoch": 6.17004048582996, + "grad_norm": 4.04807524846957, + "learning_rate": 3.848066618664534e-06, + "loss": 1.9855, + "step": 1524 + }, + { + "epoch": 6.174089068825911, + "grad_norm": 4.797553089745047, + "learning_rate": 3.841191745228091e-06, + "loss": 1.2562, + "step": 1525 + }, + { + "epoch": 6.178137651821863, + "grad_norm": 5.562886515767805, + "learning_rate": 3.834319186159179e-06, + "loss": 1.4532, + "step": 1526 + }, + { + "epoch": 6.182186234817814, + "grad_norm": 3.8582598799938315, + "learning_rate": 3.82744895518365e-06, + "loss": 1.2517, + "step": 1527 + }, + { + "epoch": 6.186234817813765, + "grad_norm": 4.976499846840885, + "learning_rate": 3.8205810660227e-06, + "loss": 1.4395, + "step": 1528 + }, + { + "epoch": 6.190283400809717, + "grad_norm": 5.013759086459238, + "learning_rate": 3.8137155323928526e-06, + "loss": 1.4579, + "step": 1529 + }, + { + "epoch": 6.194331983805668, + "grad_norm": 5.210004353191725, + "learning_rate": 3.8068523680059287e-06, + "loss": 1.6307, + "step": 1530 + }, + { + "epoch": 6.198380566801619, + "grad_norm": 4.444756027075356, + "learning_rate": 3.799991586569012e-06, + "loss": 1.6785, + "step": 1531 + }, + { + "epoch": 6.202429149797571, + "grad_norm": 4.581599022941181, + "learning_rate": 3.7931332017844302e-06, + "loss": 1.3911, + "step": 1532 + }, + { + "epoch": 6.206477732793522, + "grad_norm": 4.426732929526946, + "learning_rate": 3.786277227349724e-06, + "loss": 1.7226, + "step": 1533 + }, + { + "epoch": 6.2105263157894735, + "grad_norm": 4.573503321332187, + "learning_rate": 3.77942367695762e-06, + "loss": 1.7276, + "step": 1534 + }, + { + "epoch": 6.2145748987854255, + "grad_norm": 4.632474175205992, + "learning_rate": 3.7725725642960047e-06, + "loss": 1.4984, + "step": 1535 + }, + { + "epoch": 6.218623481781377, + "grad_norm": 5.004422527391663, + "learning_rate": 3.7657239030478927e-06, + "loss": 1.3822, + "step": 1536 + }, + { + "epoch": 6.222672064777328, + "grad_norm": 4.730329238431976, + "learning_rate": 3.758877706891407e-06, + "loss": 1.3005, + "step": 1537 + }, + { + "epoch": 6.22672064777328, + "grad_norm": 4.696618081800561, + "learning_rate": 3.752033989499742e-06, + "loss": 1.4995, + "step": 1538 + }, + { + "epoch": 6.230769230769231, + "grad_norm": 4.819216438393582, + "learning_rate": 3.7451927645411466e-06, + "loss": 1.2958, + "step": 1539 + }, + { + 
"epoch": 6.234817813765182, + "grad_norm": 5.4741629869641315, + "learning_rate": 3.7383540456788915e-06, + "loss": 1.5321, + "step": 1540 + }, + { + "epoch": 6.238866396761134, + "grad_norm": 5.271140694357475, + "learning_rate": 3.7315178465712364e-06, + "loss": 1.4701, + "step": 1541 + }, + { + "epoch": 6.242914979757085, + "grad_norm": 4.870369052928556, + "learning_rate": 3.7246841808714172e-06, + "loss": 1.4965, + "step": 1542 + }, + { + "epoch": 6.246963562753036, + "grad_norm": 4.627274116359122, + "learning_rate": 3.717853062227604e-06, + "loss": 1.3376, + "step": 1543 + }, + { + "epoch": 6.251012145748988, + "grad_norm": 4.862725711210235, + "learning_rate": 3.7110245042828786e-06, + "loss": 1.436, + "step": 1544 + }, + { + "epoch": 6.255060728744939, + "grad_norm": 4.948809530195508, + "learning_rate": 3.704198520675214e-06, + "loss": 1.3922, + "step": 1545 + }, + { + "epoch": 6.2591093117408905, + "grad_norm": 4.36897138423846, + "learning_rate": 3.69737512503744e-06, + "loss": 1.3391, + "step": 1546 + }, + { + "epoch": 6.2631578947368425, + "grad_norm": 4.774874457232701, + "learning_rate": 3.690554330997215e-06, + "loss": 1.1307, + "step": 1547 + }, + { + "epoch": 6.267206477732794, + "grad_norm": 4.560395256546156, + "learning_rate": 3.6837361521770056e-06, + "loss": 1.4205, + "step": 1548 + }, + { + "epoch": 6.271255060728745, + "grad_norm": 4.657377226532245, + "learning_rate": 3.6769206021940505e-06, + "loss": 1.4284, + "step": 1549 + }, + { + "epoch": 6.275303643724697, + "grad_norm": 4.523918352960143, + "learning_rate": 3.670107694660343e-06, + "loss": 1.4865, + "step": 1550 + }, + { + "epoch": 6.279352226720648, + "grad_norm": 6.060799013063325, + "learning_rate": 3.6632974431825965e-06, + "loss": 1.4177, + "step": 1551 + }, + { + "epoch": 6.283400809716599, + "grad_norm": 5.508975855268233, + "learning_rate": 3.656489861362218e-06, + "loss": 1.0975, + "step": 1552 + }, + { + "epoch": 6.287449392712551, + "grad_norm": 5.591620230854365, + "learning_rate": 3.6496849627952875e-06, + "loss": 1.2607, + "step": 1553 + }, + { + "epoch": 6.291497975708502, + "grad_norm": 5.501342695470275, + "learning_rate": 3.6428827610725203e-06, + "loss": 1.113, + "step": 1554 + }, + { + "epoch": 6.295546558704453, + "grad_norm": 5.371568603468503, + "learning_rate": 3.636083269779249e-06, + "loss": 1.3579, + "step": 1555 + }, + { + "epoch": 6.299595141700405, + "grad_norm": 4.658495618502483, + "learning_rate": 3.6292865024953945e-06, + "loss": 1.5612, + "step": 1556 + }, + { + "epoch": 6.303643724696356, + "grad_norm": 5.171922327948163, + "learning_rate": 3.622492472795432e-06, + "loss": 1.196, + "step": 1557 + }, + { + "epoch": 6.3076923076923075, + "grad_norm": 5.187630245267101, + "learning_rate": 3.615701194248375e-06, + "loss": 1.2403, + "step": 1558 + }, + { + "epoch": 6.3117408906882595, + "grad_norm": 4.739560149771274, + "learning_rate": 3.6089126804177373e-06, + "loss": 1.2748, + "step": 1559 + }, + { + "epoch": 6.315789473684211, + "grad_norm": 5.8421200692609405, + "learning_rate": 3.6021269448615148e-06, + "loss": 1.1801, + "step": 1560 + }, + { + "epoch": 6.319838056680162, + "grad_norm": 5.003939683781086, + "learning_rate": 3.595344001132154e-06, + "loss": 1.1334, + "step": 1561 + }, + { + "epoch": 6.323886639676114, + "grad_norm": 5.213320704625486, + "learning_rate": 3.5885638627765228e-06, + "loss": 1.1662, + "step": 1562 + }, + { + "epoch": 6.327935222672065, + "grad_norm": 5.12672208334294, + "learning_rate": 3.5817865433358902e-06, + "loss": 1.1897, + "step": 1563 
+ }, + { + "epoch": 6.331983805668016, + "grad_norm": 4.990310131147776, + "learning_rate": 3.5750120563458924e-06, + "loss": 1.2197, + "step": 1564 + }, + { + "epoch": 6.336032388663968, + "grad_norm": 5.404582388895142, + "learning_rate": 3.568240415336509e-06, + "loss": 1.2979, + "step": 1565 + }, + { + "epoch": 6.340080971659919, + "grad_norm": 4.459387759024826, + "learning_rate": 3.5614716338320384e-06, + "loss": 1.2379, + "step": 1566 + }, + { + "epoch": 6.34412955465587, + "grad_norm": 4.906670384808422, + "learning_rate": 3.554705725351063e-06, + "loss": 1.1656, + "step": 1567 + }, + { + "epoch": 6.348178137651822, + "grad_norm": 5.788345645390745, + "learning_rate": 3.547942703406433e-06, + "loss": 1.3082, + "step": 1568 + }, + { + "epoch": 6.352226720647773, + "grad_norm": 5.367912057539721, + "learning_rate": 3.5411825815052296e-06, + "loss": 1.313, + "step": 1569 + }, + { + "epoch": 6.3562753036437245, + "grad_norm": 5.326205519895874, + "learning_rate": 3.534425373148741e-06, + "loss": 0.9762, + "step": 1570 + }, + { + "epoch": 6.3603238866396765, + "grad_norm": 5.708844505808687, + "learning_rate": 3.52767109183244e-06, + "loss": 1.373, + "step": 1571 + }, + { + "epoch": 6.364372469635628, + "grad_norm": 4.876273122171325, + "learning_rate": 3.5209197510459526e-06, + "loss": 1.448, + "step": 1572 + }, + { + "epoch": 6.368421052631579, + "grad_norm": 4.935122614604545, + "learning_rate": 3.5141713642730305e-06, + "loss": 1.3476, + "step": 1573 + }, + { + "epoch": 6.372469635627531, + "grad_norm": 6.109929961302762, + "learning_rate": 3.507425944991529e-06, + "loss": 1.4072, + "step": 1574 + }, + { + "epoch": 6.376518218623482, + "grad_norm": 5.409803828147351, + "learning_rate": 3.5006835066733707e-06, + "loss": 1.0987, + "step": 1575 + }, + { + "epoch": 6.380566801619433, + "grad_norm": 5.907878971006872, + "learning_rate": 3.4939440627845305e-06, + "loss": 1.2467, + "step": 1576 + }, + { + "epoch": 6.384615384615385, + "grad_norm": 5.060588652380501, + "learning_rate": 3.4872076267850015e-06, + "loss": 1.0512, + "step": 1577 + }, + { + "epoch": 6.388663967611336, + "grad_norm": 6.199263715395586, + "learning_rate": 3.480474212128766e-06, + "loss": 1.1192, + "step": 1578 + }, + { + "epoch": 6.392712550607287, + "grad_norm": 5.68773960369221, + "learning_rate": 3.473743832263778e-06, + "loss": 1.2989, + "step": 1579 + }, + { + "epoch": 6.396761133603239, + "grad_norm": 6.5411566006758886, + "learning_rate": 3.4670165006319236e-06, + "loss": 1.1125, + "step": 1580 + }, + { + "epoch": 6.40080971659919, + "grad_norm": 4.779266992013558, + "learning_rate": 3.4602922306690062e-06, + "loss": 1.1461, + "step": 1581 + }, + { + "epoch": 6.4048582995951415, + "grad_norm": 4.983422698218311, + "learning_rate": 3.453571035804714e-06, + "loss": 1.1805, + "step": 1582 + }, + { + "epoch": 6.4089068825910935, + "grad_norm": 6.281439869347411, + "learning_rate": 3.4468529294625895e-06, + "loss": 1.2865, + "step": 1583 + }, + { + "epoch": 6.412955465587045, + "grad_norm": 5.447638251945489, + "learning_rate": 3.4401379250600124e-06, + "loss": 1.112, + "step": 1584 + }, + { + "epoch": 6.417004048582996, + "grad_norm": 6.031371603465583, + "learning_rate": 3.433426036008163e-06, + "loss": 1.4222, + "step": 1585 + }, + { + "epoch": 6.421052631578947, + "grad_norm": 6.344172383462025, + "learning_rate": 3.4267172757120005e-06, + "loss": 1.4558, + "step": 1586 + }, + { + "epoch": 6.425101214574899, + "grad_norm": 5.253990555737164, + "learning_rate": 3.420011657570238e-06, + "loss": 1.4408, + 
"step": 1587 + }, + { + "epoch": 6.42914979757085, + "grad_norm": 5.944240629250275, + "learning_rate": 3.413309194975309e-06, + "loss": 1.4281, + "step": 1588 + }, + { + "epoch": 6.433198380566802, + "grad_norm": 4.690048614883703, + "learning_rate": 3.406609901313349e-06, + "loss": 1.6038, + "step": 1589 + }, + { + "epoch": 6.437246963562753, + "grad_norm": 5.538761343018897, + "learning_rate": 3.39991378996416e-06, + "loss": 1.3818, + "step": 1590 + }, + { + "epoch": 6.441295546558704, + "grad_norm": 5.904913245197766, + "learning_rate": 3.393220874301193e-06, + "loss": 1.324, + "step": 1591 + }, + { + "epoch": 6.445344129554655, + "grad_norm": 4.935839021246995, + "learning_rate": 3.386531167691512e-06, + "loss": 1.569, + "step": 1592 + }, + { + "epoch": 6.449392712550607, + "grad_norm": 5.96200793571726, + "learning_rate": 3.379844683495775e-06, + "loss": 1.3697, + "step": 1593 + }, + { + "epoch": 6.4534412955465585, + "grad_norm": 5.74218375449931, + "learning_rate": 3.3731614350682045e-06, + "loss": 1.3591, + "step": 1594 + }, + { + "epoch": 6.4574898785425106, + "grad_norm": 5.819819829923634, + "learning_rate": 3.36648143575656e-06, + "loss": 1.7039, + "step": 1595 + }, + { + "epoch": 6.461538461538462, + "grad_norm": 7.530849687169004, + "learning_rate": 3.3598046989021073e-06, + "loss": 1.8161, + "step": 1596 + }, + { + "epoch": 6.465587044534413, + "grad_norm": 5.773184926893142, + "learning_rate": 3.3531312378396026e-06, + "loss": 1.506, + "step": 1597 + }, + { + "epoch": 6.469635627530364, + "grad_norm": 5.095389257052112, + "learning_rate": 3.3464610658972584e-06, + "loss": 1.5432, + "step": 1598 + }, + { + "epoch": 6.473684210526316, + "grad_norm": 4.864855264853332, + "learning_rate": 3.3397941963967162e-06, + "loss": 1.502, + "step": 1599 + }, + { + "epoch": 6.477732793522267, + "grad_norm": 6.57365780985993, + "learning_rate": 3.333130642653024e-06, + "loss": 1.5104, + "step": 1600 + }, + { + "epoch": 6.481781376518219, + "grad_norm": 4.515682901106996, + "learning_rate": 3.326470417974604e-06, + "loss": 1.4218, + "step": 1601 + }, + { + "epoch": 6.48582995951417, + "grad_norm": 5.044572956084713, + "learning_rate": 3.3198135356632353e-06, + "loss": 1.3685, + "step": 1602 + }, + { + "epoch": 6.489878542510121, + "grad_norm": 6.114856919793026, + "learning_rate": 3.313160009014017e-06, + "loss": 1.3026, + "step": 1603 + }, + { + "epoch": 6.493927125506072, + "grad_norm": 6.169486015477941, + "learning_rate": 3.3065098513153473e-06, + "loss": 1.2931, + "step": 1604 + }, + { + "epoch": 6.497975708502024, + "grad_norm": 4.671907121620305, + "learning_rate": 3.299863075848898e-06, + "loss": 1.203, + "step": 1605 + }, + { + "epoch": 6.502024291497976, + "grad_norm": 5.556963177721959, + "learning_rate": 3.2932196958895816e-06, + "loss": 1.0369, + "step": 1606 + }, + { + "epoch": 6.506072874493928, + "grad_norm": 6.041668515369977, + "learning_rate": 3.2865797247055354e-06, + "loss": 1.4057, + "step": 1607 + }, + { + "epoch": 6.510121457489879, + "grad_norm": 5.622532023329238, + "learning_rate": 3.2799431755580814e-06, + "loss": 1.3496, + "step": 1608 + }, + { + "epoch": 6.51417004048583, + "grad_norm": 4.164381858883872, + "learning_rate": 3.2733100617017126e-06, + "loss": 1.3227, + "step": 1609 + }, + { + "epoch": 6.518218623481781, + "grad_norm": 5.565945707547888, + "learning_rate": 3.266680396384061e-06, + "loss": 1.3552, + "step": 1610 + }, + { + "epoch": 6.522267206477733, + "grad_norm": 6.1834705735871855, + "learning_rate": 3.2600541928458664e-06, + "loss": 1.2943, + 
"step": 1611 + }, + { + "epoch": 6.526315789473684, + "grad_norm": 6.088692550743796, + "learning_rate": 3.2534314643209597e-06, + "loss": 1.132, + "step": 1612 + }, + { + "epoch": 6.530364372469636, + "grad_norm": 5.618439646445004, + "learning_rate": 3.2468122240362287e-06, + "loss": 1.2075, + "step": 1613 + }, + { + "epoch": 6.534412955465587, + "grad_norm": 6.117262117177891, + "learning_rate": 3.2401964852115954e-06, + "loss": 1.2648, + "step": 1614 + }, + { + "epoch": 6.538461538461538, + "grad_norm": 5.488938699999532, + "learning_rate": 3.233584261059991e-06, + "loss": 1.5484, + "step": 1615 + }, + { + "epoch": 6.5425101214574894, + "grad_norm": 4.965386729846099, + "learning_rate": 3.226975564787322e-06, + "loss": 1.486, + "step": 1616 + }, + { + "epoch": 6.5465587044534415, + "grad_norm": 18.62707478890267, + "learning_rate": 3.2203704095924536e-06, + "loss": 2.0005, + "step": 1617 + }, + { + "epoch": 6.550607287449393, + "grad_norm": 9.55782070389464, + "learning_rate": 3.213768808667177e-06, + "loss": 1.7957, + "step": 1618 + }, + { + "epoch": 6.554655870445345, + "grad_norm": 9.720812117855125, + "learning_rate": 3.2071707751961838e-06, + "loss": 2.144, + "step": 1619 + }, + { + "epoch": 6.558704453441296, + "grad_norm": 5.342719089296339, + "learning_rate": 3.200576322357044e-06, + "loss": 1.3436, + "step": 1620 + }, + { + "epoch": 6.562753036437247, + "grad_norm": 4.64296304030207, + "learning_rate": 3.1939854633201727e-06, + "loss": 1.2129, + "step": 1621 + }, + { + "epoch": 6.566801619433198, + "grad_norm": 4.806685098084674, + "learning_rate": 3.187398211248811e-06, + "loss": 1.5973, + "step": 1622 + }, + { + "epoch": 6.57085020242915, + "grad_norm": 5.159929877257071, + "learning_rate": 3.1808145792989914e-06, + "loss": 1.2471, + "step": 1623 + }, + { + "epoch": 6.574898785425101, + "grad_norm": 4.881818219879603, + "learning_rate": 3.1742345806195196e-06, + "loss": 1.4285, + "step": 1624 + }, + { + "epoch": 6.578947368421053, + "grad_norm": 4.079931587528226, + "learning_rate": 3.1676582283519454e-06, + "loss": 1.2586, + "step": 1625 + }, + { + "epoch": 6.582995951417004, + "grad_norm": 5.067504014062879, + "learning_rate": 3.1610855356305354e-06, + "loss": 1.3673, + "step": 1626 + }, + { + "epoch": 6.587044534412955, + "grad_norm": 4.954367681109359, + "learning_rate": 3.1545165155822453e-06, + "loss": 1.3681, + "step": 1627 + }, + { + "epoch": 6.5910931174089065, + "grad_norm": 5.605429782413848, + "learning_rate": 3.1479511813267006e-06, + "loss": 1.3636, + "step": 1628 + }, + { + "epoch": 6.5951417004048585, + "grad_norm": 4.958815188693233, + "learning_rate": 3.141389545976159e-06, + "loss": 1.0862, + "step": 1629 + }, + { + "epoch": 6.59919028340081, + "grad_norm": 4.427052082332069, + "learning_rate": 3.134831622635496e-06, + "loss": 1.1727, + "step": 1630 + }, + { + "epoch": 6.603238866396762, + "grad_norm": 4.453414798921641, + "learning_rate": 3.1282774244021717e-06, + "loss": 1.2508, + "step": 1631 + }, + { + "epoch": 6.607287449392713, + "grad_norm": 5.086142474437995, + "learning_rate": 3.1217269643662063e-06, + "loss": 1.0497, + "step": 1632 + }, + { + "epoch": 6.611336032388664, + "grad_norm": 5.252726223787453, + "learning_rate": 3.115180255610154e-06, + "loss": 1.352, + "step": 1633 + }, + { + "epoch": 6.615384615384615, + "grad_norm": 4.618158368136601, + "learning_rate": 3.1086373112090762e-06, + "loss": 1.3803, + "step": 1634 + }, + { + "epoch": 6.619433198380567, + "grad_norm": 5.797639722448207, + "learning_rate": 3.1020981442305187e-06, + "loss": 
1.1187, + "step": 1635 + }, + { + "epoch": 6.623481781376518, + "grad_norm": 5.892627204449989, + "learning_rate": 3.095562767734481e-06, + "loss": 1.4805, + "step": 1636 + }, + { + "epoch": 6.62753036437247, + "grad_norm": 4.995284041826363, + "learning_rate": 3.089031194773392e-06, + "loss": 1.2999, + "step": 1637 + }, + { + "epoch": 6.631578947368421, + "grad_norm": 5.424221812925032, + "learning_rate": 3.082503438392086e-06, + "loss": 1.5812, + "step": 1638 + }, + { + "epoch": 6.635627530364372, + "grad_norm": 4.773802128035484, + "learning_rate": 3.0759795116277723e-06, + "loss": 1.1799, + "step": 1639 + }, + { + "epoch": 6.6396761133603235, + "grad_norm": 5.573651737656804, + "learning_rate": 3.069459427510014e-06, + "loss": 1.4498, + "step": 1640 + }, + { + "epoch": 6.6437246963562755, + "grad_norm": 4.742522853775909, + "learning_rate": 3.0629431990607e-06, + "loss": 1.3417, + "step": 1641 + }, + { + "epoch": 6.647773279352227, + "grad_norm": 5.292712065001537, + "learning_rate": 3.056430839294015e-06, + "loss": 1.45, + "step": 1642 + }, + { + "epoch": 6.651821862348179, + "grad_norm": 4.5550435224065335, + "learning_rate": 3.049922361216422e-06, + "loss": 1.2275, + "step": 1643 + }, + { + "epoch": 6.65587044534413, + "grad_norm": 5.633966620000232, + "learning_rate": 3.043417777826627e-06, + "loss": 1.4383, + "step": 1644 + }, + { + "epoch": 6.659919028340081, + "grad_norm": 5.977264180838899, + "learning_rate": 3.036917102115561e-06, + "loss": 1.2502, + "step": 1645 + }, + { + "epoch": 6.663967611336032, + "grad_norm": 5.050359221231472, + "learning_rate": 3.0304203470663507e-06, + "loss": 1.4135, + "step": 1646 + }, + { + "epoch": 6.668016194331984, + "grad_norm": 5.3518078778159435, + "learning_rate": 3.023927525654288e-06, + "loss": 1.4064, + "step": 1647 + }, + { + "epoch": 6.672064777327935, + "grad_norm": 5.575471681679863, + "learning_rate": 3.017438650846815e-06, + "loss": 1.5635, + "step": 1648 + }, + { + "epoch": 6.676113360323887, + "grad_norm": 4.758858070207382, + "learning_rate": 3.0109537356034856e-06, + "loss": 1.5306, + "step": 1649 + }, + { + "epoch": 6.680161943319838, + "grad_norm": 5.646630068141117, + "learning_rate": 3.0044727928759487e-06, + "loss": 1.3876, + "step": 1650 + }, + { + "epoch": 6.684210526315789, + "grad_norm": 5.245224305674558, + "learning_rate": 2.9979958356079195e-06, + "loss": 1.2497, + "step": 1651 + }, + { + "epoch": 6.6882591093117405, + "grad_norm": 4.976281468525487, + "learning_rate": 2.991522876735154e-06, + "loss": 1.3506, + "step": 1652 + }, + { + "epoch": 6.6923076923076925, + "grad_norm": 5.375432065764104, + "learning_rate": 2.98505392918542e-06, + "loss": 1.3676, + "step": 1653 + }, + { + "epoch": 6.696356275303644, + "grad_norm": 4.849539565202561, + "learning_rate": 2.978589005878476e-06, + "loss": 1.2348, + "step": 1654 + }, + { + "epoch": 6.700404858299595, + "grad_norm": 6.373782199327902, + "learning_rate": 2.9721281197260427e-06, + "loss": 1.6916, + "step": 1655 + }, + { + "epoch": 6.704453441295547, + "grad_norm": 5.797065404713431, + "learning_rate": 2.965671283631778e-06, + "loss": 1.4917, + "step": 1656 + }, + { + "epoch": 6.708502024291498, + "grad_norm": 5.561054188837486, + "learning_rate": 2.959218510491252e-06, + "loss": 1.1089, + "step": 1657 + }, + { + "epoch": 6.712550607287449, + "grad_norm": 4.841361841602314, + "learning_rate": 2.9527698131919156e-06, + "loss": 1.2314, + "step": 1658 + }, + { + "epoch": 6.716599190283401, + "grad_norm": 4.961647413029597, + "learning_rate": 2.9463252046130884e-06, + 
"loss": 1.3488, + "step": 1659 + }, + { + "epoch": 6.720647773279352, + "grad_norm": 6.030520417168003, + "learning_rate": 2.9398846976259136e-06, + "loss": 1.1124, + "step": 1660 + }, + { + "epoch": 6.724696356275303, + "grad_norm": 5.376150681226648, + "learning_rate": 2.9334483050933506e-06, + "loss": 1.3305, + "step": 1661 + }, + { + "epoch": 6.728744939271255, + "grad_norm": 4.997899902629033, + "learning_rate": 2.9270160398701387e-06, + "loss": 1.4987, + "step": 1662 + }, + { + "epoch": 6.732793522267206, + "grad_norm": 5.003930672267123, + "learning_rate": 2.920587914802772e-06, + "loss": 1.2143, + "step": 1663 + }, + { + "epoch": 6.7368421052631575, + "grad_norm": 5.099065318842715, + "learning_rate": 2.91416394272948e-06, + "loss": 1.3239, + "step": 1664 + }, + { + "epoch": 6.7408906882591095, + "grad_norm": 5.065783888856437, + "learning_rate": 2.907744136480194e-06, + "loss": 1.9473, + "step": 1665 + }, + { + "epoch": 6.744939271255061, + "grad_norm": 4.828636889161134, + "learning_rate": 2.901328508876531e-06, + "loss": 1.4691, + "step": 1666 + }, + { + "epoch": 6.748987854251012, + "grad_norm": 5.887659634670204, + "learning_rate": 2.894917072731753e-06, + "loss": 1.2826, + "step": 1667 + }, + { + "epoch": 6.753036437246964, + "grad_norm": 5.421606621102472, + "learning_rate": 2.88850984085076e-06, + "loss": 1.1948, + "step": 1668 + }, + { + "epoch": 6.757085020242915, + "grad_norm": 5.2144985221753615, + "learning_rate": 2.8821068260300505e-06, + "loss": 1.3159, + "step": 1669 + }, + { + "epoch": 6.761133603238866, + "grad_norm": 6.35388499196324, + "learning_rate": 2.8757080410577042e-06, + "loss": 2.064, + "step": 1670 + }, + { + "epoch": 6.765182186234818, + "grad_norm": 6.533956411029131, + "learning_rate": 2.8693134987133464e-06, + "loss": 1.8202, + "step": 1671 + }, + { + "epoch": 6.769230769230769, + "grad_norm": 7.388143224357747, + "learning_rate": 2.8629232117681354e-06, + "loss": 1.7417, + "step": 1672 + }, + { + "epoch": 6.77327935222672, + "grad_norm": 4.928577825497661, + "learning_rate": 2.8565371929847286e-06, + "loss": 1.2534, + "step": 1673 + }, + { + "epoch": 6.777327935222672, + "grad_norm": 5.033866214652084, + "learning_rate": 2.8501554551172613e-06, + "loss": 1.5421, + "step": 1674 + }, + { + "epoch": 6.781376518218623, + "grad_norm": 4.739685237811317, + "learning_rate": 2.843778010911311e-06, + "loss": 1.5263, + "step": 1675 + }, + { + "epoch": 6.7854251012145745, + "grad_norm": 5.136372890884333, + "learning_rate": 2.83740487310389e-06, + "loss": 1.3327, + "step": 1676 + }, + { + "epoch": 6.7894736842105265, + "grad_norm": 4.941908173697463, + "learning_rate": 2.8310360544234057e-06, + "loss": 1.2674, + "step": 1677 + }, + { + "epoch": 6.793522267206478, + "grad_norm": 5.393271110505753, + "learning_rate": 2.8246715675896354e-06, + "loss": 1.2836, + "step": 1678 + }, + { + "epoch": 6.797570850202429, + "grad_norm": 5.454849249006355, + "learning_rate": 2.81831142531371e-06, + "loss": 1.3156, + "step": 1679 + }, + { + "epoch": 6.801619433198381, + "grad_norm": 4.939088394387297, + "learning_rate": 2.811955640298083e-06, + "loss": 1.2068, + "step": 1680 + }, + { + "epoch": 6.805668016194332, + "grad_norm": 4.809916773128364, + "learning_rate": 2.8056042252365046e-06, + "loss": 1.0997, + "step": 1681 + }, + { + "epoch": 6.809716599190283, + "grad_norm": 5.329896547784682, + "learning_rate": 2.7992571928139984e-06, + "loss": 1.4471, + "step": 1682 + }, + { + "epoch": 6.813765182186235, + "grad_norm": 6.511906878209839, + "learning_rate": 
2.7929145557068303e-06, + "loss": 1.2595, + "step": 1683 + }, + { + "epoch": 6.817813765182186, + "grad_norm": 5.372364570471038, + "learning_rate": 2.786576326582493e-06, + "loss": 1.1699, + "step": 1684 + }, + { + "epoch": 6.821862348178137, + "grad_norm": 13.8652581579135, + "learning_rate": 2.780242518099675e-06, + "loss": 2.2106, + "step": 1685 + }, + { + "epoch": 6.825910931174089, + "grad_norm": 25.171093577196388, + "learning_rate": 2.7739131429082373e-06, + "loss": 3.2586, + "step": 1686 + }, + { + "epoch": 6.82995951417004, + "grad_norm": 5.726221697590718, + "learning_rate": 2.7675882136491795e-06, + "loss": 1.1889, + "step": 1687 + }, + { + "epoch": 6.834008097165992, + "grad_norm": 5.969801910273205, + "learning_rate": 2.761267742954629e-06, + "loss": 1.1408, + "step": 1688 + }, + { + "epoch": 6.838056680161944, + "grad_norm": 5.061214863990714, + "learning_rate": 2.7549517434478063e-06, + "loss": 1.1687, + "step": 1689 + }, + { + "epoch": 6.842105263157895, + "grad_norm": 4.867474293725249, + "learning_rate": 2.7486402277430026e-06, + "loss": 1.2449, + "step": 1690 + }, + { + "epoch": 6.846153846153846, + "grad_norm": 5.1018055774076645, + "learning_rate": 2.7423332084455543e-06, + "loss": 1.0478, + "step": 1691 + }, + { + "epoch": 6.850202429149798, + "grad_norm": 6.018705752891283, + "learning_rate": 2.736030698151815e-06, + "loss": 1.2496, + "step": 1692 + }, + { + "epoch": 6.854251012145749, + "grad_norm": 6.104939352615399, + "learning_rate": 2.7297327094491344e-06, + "loss": 1.287, + "step": 1693 + }, + { + "epoch": 6.8582995951417, + "grad_norm": 4.340656711987505, + "learning_rate": 2.723439254915834e-06, + "loss": 1.2266, + "step": 1694 + }, + { + "epoch": 6.862348178137652, + "grad_norm": 5.698807470646283, + "learning_rate": 2.717150347121177e-06, + "loss": 1.2273, + "step": 1695 + }, + { + "epoch": 6.866396761133603, + "grad_norm": 5.5042411488110154, + "learning_rate": 2.710865998625348e-06, + "loss": 1.2081, + "step": 1696 + }, + { + "epoch": 6.870445344129554, + "grad_norm": 6.8240067723829405, + "learning_rate": 2.704586221979422e-06, + "loss": 1.6486, + "step": 1697 + }, + { + "epoch": 6.874493927125506, + "grad_norm": 5.905111755452213, + "learning_rate": 2.698311029725346e-06, + "loss": 1.5976, + "step": 1698 + }, + { + "epoch": 6.8785425101214575, + "grad_norm": 6.1571466759316, + "learning_rate": 2.6920404343959106e-06, + "loss": 1.3605, + "step": 1699 + }, + { + "epoch": 6.882591093117409, + "grad_norm": 5.716713309024074, + "learning_rate": 2.6857744485147286e-06, + "loss": 1.2964, + "step": 1700 + }, + { + "epoch": 6.886639676113361, + "grad_norm": 5.42925803199323, + "learning_rate": 2.6795130845961993e-06, + "loss": 0.9267, + "step": 1701 + }, + { + "epoch": 6.890688259109312, + "grad_norm": 4.919365319165041, + "learning_rate": 2.673256355145499e-06, + "loss": 1.4449, + "step": 1702 + }, + { + "epoch": 6.894736842105263, + "grad_norm": 4.863542774795551, + "learning_rate": 2.667004272658541e-06, + "loss": 1.4657, + "step": 1703 + }, + { + "epoch": 6.898785425101215, + "grad_norm": 4.299136007306504, + "learning_rate": 2.660756849621962e-06, + "loss": 1.2369, + "step": 1704 + }, + { + "epoch": 6.902834008097166, + "grad_norm": 5.213129071990759, + "learning_rate": 2.6545140985130934e-06, + "loss": 1.2244, + "step": 1705 + }, + { + "epoch": 6.906882591093117, + "grad_norm": 5.578872418777055, + "learning_rate": 2.6482760317999338e-06, + "loss": 1.2811, + "step": 1706 + }, + { + "epoch": 6.910931174089069, + "grad_norm": 4.626194423109011, + 
"learning_rate": 2.642042661941129e-06, + "loss": 1.0198, + "step": 1707 + }, + { + "epoch": 6.91497975708502, + "grad_norm": 5.352887557319016, + "learning_rate": 2.635814001385938e-06, + "loss": 1.1012, + "step": 1708 + }, + { + "epoch": 6.919028340080971, + "grad_norm": 5.579613506703107, + "learning_rate": 2.629590062574221e-06, + "loss": 1.1085, + "step": 1709 + }, + { + "epoch": 6.923076923076923, + "grad_norm": 4.252011072382573, + "learning_rate": 2.623370857936404e-06, + "loss": 1.431, + "step": 1710 + }, + { + "epoch": 6.9271255060728745, + "grad_norm": 5.916388957924838, + "learning_rate": 2.6171563998934605e-06, + "loss": 1.2774, + "step": 1711 + }, + { + "epoch": 6.931174089068826, + "grad_norm": 5.953432162823518, + "learning_rate": 2.610946700856885e-06, + "loss": 1.2618, + "step": 1712 + }, + { + "epoch": 6.935222672064778, + "grad_norm": 6.19929364838639, + "learning_rate": 2.604741773228661e-06, + "loss": 1.0577, + "step": 1713 + }, + { + "epoch": 6.939271255060729, + "grad_norm": 5.789164804068839, + "learning_rate": 2.5985416294012487e-06, + "loss": 1.0688, + "step": 1714 + }, + { + "epoch": 6.94331983805668, + "grad_norm": 6.659571736165462, + "learning_rate": 2.592346281757552e-06, + "loss": 1.3636, + "step": 1715 + }, + { + "epoch": 6.947368421052632, + "grad_norm": 5.314697446259228, + "learning_rate": 2.586155742670897e-06, + "loss": 1.0952, + "step": 1716 + }, + { + "epoch": 6.951417004048583, + "grad_norm": 6.659337503952005, + "learning_rate": 2.5799700245050074e-06, + "loss": 1.0229, + "step": 1717 + }, + { + "epoch": 6.955465587044534, + "grad_norm": 6.65312440022192, + "learning_rate": 2.5737891396139713e-06, + "loss": 1.3201, + "step": 1718 + }, + { + "epoch": 6.959514170040486, + "grad_norm": 5.938881485697329, + "learning_rate": 2.5676131003422317e-06, + "loss": 1.3962, + "step": 1719 + }, + { + "epoch": 6.963562753036437, + "grad_norm": 5.4389936951171025, + "learning_rate": 2.561441919024551e-06, + "loss": 1.346, + "step": 1720 + }, + { + "epoch": 6.967611336032388, + "grad_norm": 6.814603646499591, + "learning_rate": 2.5552756079859904e-06, + "loss": 1.3755, + "step": 1721 + }, + { + "epoch": 6.97165991902834, + "grad_norm": 6.557034047725967, + "learning_rate": 2.549114179541884e-06, + "loss": 1.2917, + "step": 1722 + }, + { + "epoch": 6.9757085020242915, + "grad_norm": 4.666089006915814, + "learning_rate": 2.542957645997811e-06, + "loss": 1.3178, + "step": 1723 + }, + { + "epoch": 6.979757085020243, + "grad_norm": 5.4101007526641, + "learning_rate": 2.5368060196495785e-06, + "loss": 1.3848, + "step": 1724 + }, + { + "epoch": 6.983805668016195, + "grad_norm": 5.003638917729553, + "learning_rate": 2.530659312783192e-06, + "loss": 1.4391, + "step": 1725 + }, + { + "epoch": 6.987854251012146, + "grad_norm": 4.982884862825928, + "learning_rate": 2.5245175376748334e-06, + "loss": 1.2329, + "step": 1726 + }, + { + "epoch": 6.991902834008097, + "grad_norm": 4.383040697186735, + "learning_rate": 2.5183807065908296e-06, + "loss": 1.2466, + "step": 1727 + }, + { + "epoch": 6.995951417004049, + "grad_norm": 4.833585025134396, + "learning_rate": 2.512248831787639e-06, + "loss": 1.5637, + "step": 1728 + }, + { + "epoch": 7.0, + "grad_norm": 4.848560799578388, + "learning_rate": 2.5061219255118186e-06, + "loss": 1.2677, + "step": 1729 + }, + { + "epoch": 7.004048582995951, + "grad_norm": 4.901375359150507, + "learning_rate": 2.5000000000000015e-06, + "loss": 1.3023, + "step": 1730 + }, + { + "epoch": 7.008097165991903, + "grad_norm": 6.545083705424055, + 
"learning_rate": 2.4938830674788756e-06, + "loss": 1.4651, + "step": 1731 + }, + { + "epoch": 7.012145748987854, + "grad_norm": 6.141277943301318, + "learning_rate": 2.4877711401651562e-06, + "loss": 1.2554, + "step": 1732 + }, + { + "epoch": 7.016194331983805, + "grad_norm": 6.544269798324027, + "learning_rate": 2.4816642302655634e-06, + "loss": 1.479, + "step": 1733 + }, + { + "epoch": 7.020242914979757, + "grad_norm": 5.746379418360751, + "learning_rate": 2.475562349976791e-06, + "loss": 1.656, + "step": 1734 + }, + { + "epoch": 7.0242914979757085, + "grad_norm": 6.035436258524213, + "learning_rate": 2.4694655114854936e-06, + "loss": 1.5592, + "step": 1735 + }, + { + "epoch": 7.02834008097166, + "grad_norm": 5.223633858026752, + "learning_rate": 2.4633737269682546e-06, + "loss": 1.2619, + "step": 1736 + }, + { + "epoch": 7.032388663967612, + "grad_norm": 5.890887028411126, + "learning_rate": 2.4572870085915628e-06, + "loss": 1.2686, + "step": 1737 + }, + { + "epoch": 7.036437246963563, + "grad_norm": 5.4867419263331785, + "learning_rate": 2.4512053685117916e-06, + "loss": 1.4711, + "step": 1738 + }, + { + "epoch": 7.040485829959514, + "grad_norm": 5.856066296731616, + "learning_rate": 2.445128818875166e-06, + "loss": 1.2784, + "step": 1739 + }, + { + "epoch": 7.044534412955466, + "grad_norm": 5.685747261263775, + "learning_rate": 2.4390573718177507e-06, + "loss": 1.4178, + "step": 1740 + }, + { + "epoch": 7.048582995951417, + "grad_norm": 5.580589694434444, + "learning_rate": 2.4329910394654167e-06, + "loss": 1.2819, + "step": 1741 + }, + { + "epoch": 7.052631578947368, + "grad_norm": 6.1734653161832345, + "learning_rate": 2.4269298339338205e-06, + "loss": 1.3334, + "step": 1742 + }, + { + "epoch": 7.05668016194332, + "grad_norm": 5.647156467107709, + "learning_rate": 2.4208737673283818e-06, + "loss": 1.1932, + "step": 1743 + }, + { + "epoch": 7.060728744939271, + "grad_norm": 5.571147412614646, + "learning_rate": 2.414822851744249e-06, + "loss": 1.3354, + "step": 1744 + }, + { + "epoch": 7.064777327935222, + "grad_norm": 6.222421117643815, + "learning_rate": 2.408777099266291e-06, + "loss": 1.2747, + "step": 1745 + }, + { + "epoch": 7.068825910931174, + "grad_norm": 6.251859136759403, + "learning_rate": 2.4027365219690617e-06, + "loss": 1.444, + "step": 1746 + }, + { + "epoch": 7.0728744939271255, + "grad_norm": 5.555376265690771, + "learning_rate": 2.3967011319167804e-06, + "loss": 1.3478, + "step": 1747 + }, + { + "epoch": 7.076923076923077, + "grad_norm": 6.222350987405198, + "learning_rate": 2.3906709411633073e-06, + "loss": 1.3069, + "step": 1748 + }, + { + "epoch": 7.080971659919029, + "grad_norm": 5.290175219718593, + "learning_rate": 2.384645961752113e-06, + "loss": 1.4103, + "step": 1749 + }, + { + "epoch": 7.08502024291498, + "grad_norm": 4.882921637643386, + "learning_rate": 2.378626205716265e-06, + "loss": 1.3698, + "step": 1750 + }, + { + "epoch": 7.089068825910931, + "grad_norm": 5.893035167375215, + "learning_rate": 2.3726116850783987e-06, + "loss": 1.3153, + "step": 1751 + }, + { + "epoch": 7.093117408906883, + "grad_norm": 5.440462022348463, + "learning_rate": 2.3666024118506937e-06, + "loss": 1.3918, + "step": 1752 + }, + { + "epoch": 7.097165991902834, + "grad_norm": 5.298541554798929, + "learning_rate": 2.3605983980348446e-06, + "loss": 1.1493, + "step": 1753 + }, + { + "epoch": 7.101214574898785, + "grad_norm": 5.873912109321258, + "learning_rate": 2.354599655622049e-06, + "loss": 1.3419, + "step": 1754 + }, + { + "epoch": 7.105263157894737, + "grad_norm": 
6.515086572176515, + "learning_rate": 2.3486061965929695e-06, + "loss": 1.2658, + "step": 1755 + }, + { + "epoch": 7.109311740890688, + "grad_norm": 5.640239544492155, + "learning_rate": 2.3426180329177217e-06, + "loss": 1.2778, + "step": 1756 + }, + { + "epoch": 7.113360323886639, + "grad_norm": 6.602620889096045, + "learning_rate": 2.3366351765558437e-06, + "loss": 1.2168, + "step": 1757 + }, + { + "epoch": 7.117408906882591, + "grad_norm": 6.23335605433251, + "learning_rate": 2.3306576394562748e-06, + "loss": 1.1279, + "step": 1758 + }, + { + "epoch": 7.1214574898785425, + "grad_norm": 5.812741962332591, + "learning_rate": 2.3246854335573303e-06, + "loss": 1.2, + "step": 1759 + }, + { + "epoch": 7.125506072874494, + "grad_norm": 5.7653076766991465, + "learning_rate": 2.318718570786675e-06, + "loss": 1.2204, + "step": 1760 + }, + { + "epoch": 7.129554655870446, + "grad_norm": 6.592268657435819, + "learning_rate": 2.3127570630613064e-06, + "loss": 1.0923, + "step": 1761 + }, + { + "epoch": 7.133603238866397, + "grad_norm": 5.105109462079527, + "learning_rate": 2.3068009222875256e-06, + "loss": 1.4491, + "step": 1762 + }, + { + "epoch": 7.137651821862348, + "grad_norm": 6.139171319338175, + "learning_rate": 2.3008501603609147e-06, + "loss": 1.2557, + "step": 1763 + }, + { + "epoch": 7.1417004048583, + "grad_norm": 4.871725004057816, + "learning_rate": 2.294904789166315e-06, + "loss": 1.023, + "step": 1764 + }, + { + "epoch": 7.145748987854251, + "grad_norm": 6.491293356249618, + "learning_rate": 2.288964820577797e-06, + "loss": 1.3439, + "step": 1765 + }, + { + "epoch": 7.149797570850202, + "grad_norm": 5.837952957007555, + "learning_rate": 2.283030266458644e-06, + "loss": 1.182, + "step": 1766 + }, + { + "epoch": 7.153846153846154, + "grad_norm": 5.104308775866129, + "learning_rate": 2.2771011386613268e-06, + "loss": 1.4117, + "step": 1767 + }, + { + "epoch": 7.157894736842105, + "grad_norm": 6.518827958790034, + "learning_rate": 2.2711774490274767e-06, + "loss": 1.4173, + "step": 1768 + }, + { + "epoch": 7.161943319838056, + "grad_norm": 4.94266123667569, + "learning_rate": 2.265259209387867e-06, + "loss": 1.2429, + "step": 1769 + }, + { + "epoch": 7.165991902834008, + "grad_norm": 5.473631523594278, + "learning_rate": 2.259346431562379e-06, + "loss": 1.3316, + "step": 1770 + }, + { + "epoch": 7.17004048582996, + "grad_norm": 5.001369544056481, + "learning_rate": 2.2534391273599937e-06, + "loss": 1.9136, + "step": 1771 + }, + { + "epoch": 7.174089068825911, + "grad_norm": 5.913295650699435, + "learning_rate": 2.2475373085787568e-06, + "loss": 1.1497, + "step": 1772 + }, + { + "epoch": 7.178137651821863, + "grad_norm": 6.952533318275522, + "learning_rate": 2.2416409870057577e-06, + "loss": 1.353, + "step": 1773 + }, + { + "epoch": 7.182186234817814, + "grad_norm": 4.723432595191292, + "learning_rate": 2.2357501744171105e-06, + "loss": 1.1492, + "step": 1774 + }, + { + "epoch": 7.186234817813765, + "grad_norm": 6.058020017509188, + "learning_rate": 2.229864882577921e-06, + "loss": 1.3322, + "step": 1775 + }, + { + "epoch": 7.190283400809717, + "grad_norm": 5.788151410477542, + "learning_rate": 2.2239851232422736e-06, + "loss": 1.3631, + "step": 1776 + }, + { + "epoch": 7.194331983805668, + "grad_norm": 6.262252651618726, + "learning_rate": 2.218110908153202e-06, + "loss": 1.5276, + "step": 1777 + }, + { + "epoch": 7.198380566801619, + "grad_norm": 5.208163192867401, + "learning_rate": 2.2122422490426676e-06, + "loss": 1.5831, + "step": 1778 + }, + { + "epoch": 7.202429149797571, + 
"grad_norm": 5.390523496529594, + "learning_rate": 2.206379157631532e-06, + "loss": 1.2908, + "step": 1779 + }, + { + "epoch": 7.206477732793522, + "grad_norm": 5.162249120166779, + "learning_rate": 2.200521645629542e-06, + "loss": 1.6171, + "step": 1780 + }, + { + "epoch": 7.2105263157894735, + "grad_norm": 5.391588507251084, + "learning_rate": 2.194669724735296e-06, + "loss": 1.6111, + "step": 1781 + }, + { + "epoch": 7.2145748987854255, + "grad_norm": 6.1034967557731665, + "learning_rate": 2.1888234066362303e-06, + "loss": 1.3854, + "step": 1782 + }, + { + "epoch": 7.218623481781377, + "grad_norm": 6.167454760308808, + "learning_rate": 2.18298270300859e-06, + "loss": 1.2693, + "step": 1783 + }, + { + "epoch": 7.222672064777328, + "grad_norm": 5.69770152013801, + "learning_rate": 2.1771476255174056e-06, + "loss": 1.2078, + "step": 1784 + }, + { + "epoch": 7.22672064777328, + "grad_norm": 5.460410860926906, + "learning_rate": 2.1713181858164746e-06, + "loss": 1.413, + "step": 1785 + }, + { + "epoch": 7.230769230769231, + "grad_norm": 5.566118830424516, + "learning_rate": 2.165494395548329e-06, + "loss": 1.1968, + "step": 1786 + }, + { + "epoch": 7.234817813765182, + "grad_norm": 6.43649848295101, + "learning_rate": 2.159676266344222e-06, + "loss": 1.4229, + "step": 1787 + }, + { + "epoch": 7.238866396761134, + "grad_norm": 6.290508191897902, + "learning_rate": 2.1538638098241e-06, + "loss": 1.3623, + "step": 1788 + }, + { + "epoch": 7.242914979757085, + "grad_norm": 5.730502481155649, + "learning_rate": 2.14805703759658e-06, + "loss": 1.396, + "step": 1789 + }, + { + "epoch": 7.246963562753036, + "grad_norm": 5.437978852325137, + "learning_rate": 2.1422559612589266e-06, + "loss": 1.252, + "step": 1790 + }, + { + "epoch": 7.251012145748988, + "grad_norm": 5.7552412936402435, + "learning_rate": 2.136460592397025e-06, + "loss": 1.344, + "step": 1791 + }, + { + "epoch": 7.255060728744939, + "grad_norm": 5.804592913810575, + "learning_rate": 2.1306709425853663e-06, + "loss": 1.291, + "step": 1792 + }, + { + "epoch": 7.2591093117408905, + "grad_norm": 5.304611515686778, + "learning_rate": 2.124887023387017e-06, + "loss": 1.25, + "step": 1793 + }, + { + "epoch": 7.2631578947368425, + "grad_norm": 5.579310956319717, + "learning_rate": 2.1191088463535997e-06, + "loss": 1.0352, + "step": 1794 + }, + { + "epoch": 7.267206477732794, + "grad_norm": 5.280713442914896, + "learning_rate": 2.113336423025269e-06, + "loss": 1.3293, + "step": 1795 + }, + { + "epoch": 7.271255060728745, + "grad_norm": 5.695843923044428, + "learning_rate": 2.1075697649306838e-06, + "loss": 1.3279, + "step": 1796 + }, + { + "epoch": 7.275303643724697, + "grad_norm": 5.537225853611836, + "learning_rate": 2.1018088835869943e-06, + "loss": 1.4052, + "step": 1797 + }, + { + "epoch": 7.279352226720648, + "grad_norm": 7.310804417037736, + "learning_rate": 2.0960537904998113e-06, + "loss": 1.3052, + "step": 1798 + }, + { + "epoch": 7.283400809716599, + "grad_norm": 6.5207473345683455, + "learning_rate": 2.0903044971631854e-06, + "loss": 0.9953, + "step": 1799 + }, + { + "epoch": 7.287449392712551, + "grad_norm": 6.891390925467454, + "learning_rate": 2.084561015059585e-06, + "loss": 1.1524, + "step": 1800 + }, + { + "epoch": 7.291497975708502, + "grad_norm": 6.511458265596788, + "learning_rate": 2.0788233556598688e-06, + "loss": 1.019, + "step": 1801 + }, + { + "epoch": 7.295546558704453, + "grad_norm": 6.525945460785431, + "learning_rate": 2.0730915304232692e-06, + "loss": 1.2347, + "step": 1802 + }, + { + "epoch": 7.299595141700405, 
+ "grad_norm": 5.806148576127675, + "learning_rate": 2.067365550797367e-06, + "loss": 1.4674, + "step": 1803 + }, + { + "epoch": 7.303643724696356, + "grad_norm": 6.6525694728213685, + "learning_rate": 2.061645428218067e-06, + "loss": 1.0762, + "step": 1804 + }, + { + "epoch": 7.3076923076923075, + "grad_norm": 6.212203279710177, + "learning_rate": 2.055931174109579e-06, + "loss": 1.1289, + "step": 1805 + }, + { + "epoch": 7.3117408906882595, + "grad_norm": 5.666269345071883, + "learning_rate": 2.050222799884387e-06, + "loss": 1.1799, + "step": 1806 + }, + { + "epoch": 7.315789473684211, + "grad_norm": 7.0629439288873, + "learning_rate": 2.044520316943235e-06, + "loss": 1.0631, + "step": 1807 + }, + { + "epoch": 7.319838056680162, + "grad_norm": 6.059126520843265, + "learning_rate": 2.0388237366751005e-06, + "loss": 1.03, + "step": 1808 + }, + { + "epoch": 7.323886639676114, + "grad_norm": 6.3174918869462635, + "learning_rate": 2.0331330704571746e-06, + "loss": 1.0775, + "step": 1809 + }, + { + "epoch": 7.327935222672065, + "grad_norm": 6.098595972628923, + "learning_rate": 2.027448329654832e-06, + "loss": 1.0956, + "step": 1810 + }, + { + "epoch": 7.331983805668016, + "grad_norm": 6.07010789176819, + "learning_rate": 2.02176952562162e-06, + "loss": 1.132, + "step": 1811 + }, + { + "epoch": 7.336032388663968, + "grad_norm": 5.673793373139681, + "learning_rate": 2.0160966696992195e-06, + "loss": 1.235, + "step": 1812 + }, + { + "epoch": 7.340080971659919, + "grad_norm": 5.42325757234182, + "learning_rate": 2.0104297732174403e-06, + "loss": 1.1607, + "step": 1813 + }, + { + "epoch": 7.34412955465587, + "grad_norm": 5.845384796389491, + "learning_rate": 2.004768847494186e-06, + "loss": 1.069, + "step": 1814 + }, + { + "epoch": 7.348178137651822, + "grad_norm": 6.716611305618001, + "learning_rate": 1.999113903835438e-06, + "loss": 1.2088, + "step": 1815 + }, + { + "epoch": 7.352226720647773, + "grad_norm": 6.335024142337415, + "learning_rate": 1.9934649535352286e-06, + "loss": 1.215, + "step": 1816 + }, + { + "epoch": 7.3562753036437245, + "grad_norm": 6.074016020941024, + "learning_rate": 1.987822007875617e-06, + "loss": 0.8957, + "step": 1817 + }, + { + "epoch": 7.3603238866396765, + "grad_norm": 6.669356187358129, + "learning_rate": 1.982185078126676e-06, + "loss": 1.2878, + "step": 1818 + }, + { + "epoch": 7.364372469635628, + "grad_norm": 5.5205879930863055, + "learning_rate": 1.9765541755464605e-06, + "loss": 1.3594, + "step": 1819 + }, + { + "epoch": 7.368421052631579, + "grad_norm": 5.791173021479898, + "learning_rate": 1.9709293113809876e-06, + "loss": 1.2518, + "step": 1820 + }, + { + "epoch": 7.372469635627531, + "grad_norm": 7.085668027134953, + "learning_rate": 1.965310496864217e-06, + "loss": 1.3044, + "step": 1821 + }, + { + "epoch": 7.376518218623482, + "grad_norm": 6.30070905341863, + "learning_rate": 1.9596977432180212e-06, + "loss": 1.0096, + "step": 1822 + }, + { + "epoch": 7.380566801619433, + "grad_norm": 6.668544077573982, + "learning_rate": 1.954091061652172e-06, + "loss": 1.1521, + "step": 1823 + }, + { + "epoch": 7.384615384615385, + "grad_norm": 5.685627571377497, + "learning_rate": 1.948490463364313e-06, + "loss": 0.9629, + "step": 1824 + }, + { + "epoch": 7.388663967611336, + "grad_norm": 7.099232364097355, + "learning_rate": 1.942895959539939e-06, + "loss": 1.0332, + "step": 1825 + }, + { + "epoch": 7.392712550607287, + "grad_norm": 6.449023103797025, + "learning_rate": 1.9373075613523728e-06, + "loss": 1.219, + "step": 1826 + }, + { + "epoch": 7.396761133603239, 
+ "grad_norm": 7.603243728006548, + "learning_rate": 1.9317252799627393e-06, + "loss": 1.0144, + "step": 1827 + }, + { + "epoch": 7.40080971659919, + "grad_norm": 5.630823437903324, + "learning_rate": 1.9261491265199526e-06, + "loss": 1.0604, + "step": 1828 + }, + { + "epoch": 7.4048582995951415, + "grad_norm": 5.804060941623419, + "learning_rate": 1.920579112160685e-06, + "loss": 1.0906, + "step": 1829 + }, + { + "epoch": 7.4089068825910935, + "grad_norm": 7.107387654645546, + "learning_rate": 1.915015248009348e-06, + "loss": 1.1866, + "step": 1830 + }, + { + "epoch": 7.412955465587045, + "grad_norm": 6.216151169357513, + "learning_rate": 1.9094575451780727e-06, + "loss": 1.0234, + "step": 1831 + }, + { + "epoch": 7.417004048582996, + "grad_norm": 7.173346243896998, + "learning_rate": 1.903906014766681e-06, + "loss": 1.3152, + "step": 1832 + }, + { + "epoch": 7.421052631578947, + "grad_norm": 7.353654026214847, + "learning_rate": 1.8983606678626665e-06, + "loss": 1.3466, + "step": 1833 + }, + { + "epoch": 7.425101214574899, + "grad_norm": 6.168388032585026, + "learning_rate": 1.8928215155411773e-06, + "loss": 1.3615, + "step": 1834 + }, + { + "epoch": 7.42914979757085, + "grad_norm": 7.177909922740221, + "learning_rate": 1.8872885688649879e-06, + "loss": 1.3325, + "step": 1835 + }, + { + "epoch": 7.433198380566802, + "grad_norm": 5.5067246147195315, + "learning_rate": 1.8817618388844783e-06, + "loss": 1.5126, + "step": 1836 + }, + { + "epoch": 7.437246963562753, + "grad_norm": 6.480398605143195, + "learning_rate": 1.8762413366376159e-06, + "loss": 1.2967, + "step": 1837 + }, + { + "epoch": 7.441295546558704, + "grad_norm": 7.239184730466869, + "learning_rate": 1.8707270731499223e-06, + "loss": 1.2391, + "step": 1838 + }, + { + "epoch": 7.445344129554655, + "grad_norm": 5.881764731806458, + "learning_rate": 1.865219059434467e-06, + "loss": 1.4892, + "step": 1839 + }, + { + "epoch": 7.449392712550607, + "grad_norm": 7.287338664223354, + "learning_rate": 1.8597173064918333e-06, + "loss": 1.2865, + "step": 1840 + }, + { + "epoch": 7.4534412955465585, + "grad_norm": 6.989877908949274, + "learning_rate": 1.854221825310103e-06, + "loss": 1.2753, + "step": 1841 + }, + { + "epoch": 7.4574898785425106, + "grad_norm": 6.967142936381031, + "learning_rate": 1.8487326268648314e-06, + "loss": 1.6209, + "step": 1842 + }, + { + "epoch": 7.461538461538462, + "grad_norm": 9.165493801033026, + "learning_rate": 1.8432497221190227e-06, + "loss": 1.7021, + "step": 1843 + }, + { + "epoch": 7.465587044534413, + "grad_norm": 7.201939055537971, + "learning_rate": 1.8377731220231144e-06, + "loss": 1.4113, + "step": 1844 + }, + { + "epoch": 7.469635627530364, + "grad_norm": 6.447673122675899, + "learning_rate": 1.832302837514952e-06, + "loss": 1.4683, + "step": 1845 + }, + { + "epoch": 7.473684210526316, + "grad_norm": 5.915439909033562, + "learning_rate": 1.8268388795197683e-06, + "loss": 1.4386, + "step": 1846 + }, + { + "epoch": 7.477732793522267, + "grad_norm": 7.791713816072655, + "learning_rate": 1.8213812589501611e-06, + "loss": 1.4409, + "step": 1847 + }, + { + "epoch": 7.481781376518219, + "grad_norm": 5.76907536016399, + "learning_rate": 1.815929986706066e-06, + "loss": 1.357, + "step": 1848 + }, + { + "epoch": 7.48582995951417, + "grad_norm": 6.324576322221301, + "learning_rate": 1.8104850736747458e-06, + "loss": 1.3014, + "step": 1849 + }, + { + "epoch": 7.489878542510121, + "grad_norm": 7.955436278806627, + "learning_rate": 1.8050465307307602e-06, + "loss": 1.2541, + "step": 1850 + }, + { + "epoch": 
7.493927125506072, + "grad_norm": 8.3800061367103, + "learning_rate": 1.7996143687359475e-06, + "loss": 1.2069, + "step": 1851 + }, + { + "epoch": 7.497975708502024, + "grad_norm": 5.859852613078974, + "learning_rate": 1.7941885985394025e-06, + "loss": 1.1389, + "step": 1852 + }, + { + "epoch": 7.502024291497976, + "grad_norm": 6.714230939191411, + "learning_rate": 1.78876923097745e-06, + "loss": 0.96, + "step": 1853 + }, + { + "epoch": 7.506072874493928, + "grad_norm": 7.478771265211495, + "learning_rate": 1.783356276873633e-06, + "loss": 1.3238, + "step": 1854 + }, + { + "epoch": 7.510121457489879, + "grad_norm": 6.964602737040841, + "learning_rate": 1.7779497470386826e-06, + "loss": 1.2515, + "step": 1855 + }, + { + "epoch": 7.51417004048583, + "grad_norm": 5.135869484791375, + "learning_rate": 1.7725496522704998e-06, + "loss": 1.2487, + "step": 1856 + }, + { + "epoch": 7.518218623481781, + "grad_norm": 6.736233605627823, + "learning_rate": 1.7671560033541364e-06, + "loss": 1.2647, + "step": 1857 + }, + { + "epoch": 7.522267206477733, + "grad_norm": 7.4340596808517585, + "learning_rate": 1.7617688110617653e-06, + "loss": 1.1983, + "step": 1858 + }, + { + "epoch": 7.526315789473684, + "grad_norm": 7.142575001524021, + "learning_rate": 1.7563880861526656e-06, + "loss": 1.037, + "step": 1859 + }, + { + "epoch": 7.530364372469636, + "grad_norm": 6.461217060280809, + "learning_rate": 1.7510138393732029e-06, + "loss": 1.125, + "step": 1860 + }, + { + "epoch": 7.534412955465587, + "grad_norm": 7.120411669751328, + "learning_rate": 1.7456460814568032e-06, + "loss": 1.1532, + "step": 1861 + }, + { + "epoch": 7.538461538461538, + "grad_norm": 6.677578923600314, + "learning_rate": 1.7402848231239317e-06, + "loss": 1.447, + "step": 1862 + }, + { + "epoch": 7.5425101214574894, + "grad_norm": 5.995680414752151, + "learning_rate": 1.7349300750820758e-06, + "loss": 1.414, + "step": 1863 + }, + { + "epoch": 7.5465587044534415, + "grad_norm": 70.49787838581857, + "learning_rate": 1.7295818480257148e-06, + "loss": 1.9394, + "step": 1864 + }, + { + "epoch": 7.550607287449393, + "grad_norm": 11.227616663799225, + "learning_rate": 1.7242401526363095e-06, + "loss": 1.6974, + "step": 1865 + }, + { + "epoch": 7.554655870445345, + "grad_norm": 15.917128296917474, + "learning_rate": 1.7189049995822748e-06, + "loss": 2.0666, + "step": 1866 + }, + { + "epoch": 7.558704453441296, + "grad_norm": 6.5545578057982254, + "learning_rate": 1.7135763995189574e-06, + "loss": 1.2566, + "step": 1867 + }, + { + "epoch": 7.562753036437247, + "grad_norm": 5.608919892200609, + "learning_rate": 1.70825436308862e-06, + "loss": 1.1258, + "step": 1868 + }, + { + "epoch": 7.566801619433198, + "grad_norm": 5.78898827199352, + "learning_rate": 1.70293890092041e-06, + "loss": 1.511, + "step": 1869 + }, + { + "epoch": 7.57085020242915, + "grad_norm": 6.1957471468572605, + "learning_rate": 1.6976300236303505e-06, + "loss": 1.1713, + "step": 1870 + }, + { + "epoch": 7.574898785425101, + "grad_norm": 5.919353556112893, + "learning_rate": 1.692327741821312e-06, + "loss": 1.3418, + "step": 1871 + }, + { + "epoch": 7.578947368421053, + "grad_norm": 4.818508692645506, + "learning_rate": 1.6870320660829908e-06, + "loss": 1.1787, + "step": 1872 + }, + { + "epoch": 7.582995951417004, + "grad_norm": 6.074378707133634, + "learning_rate": 1.6817430069918939e-06, + "loss": 1.2772, + "step": 1873 + }, + { + "epoch": 7.587044534412955, + "grad_norm": 6.043486629250494, + "learning_rate": 1.676460575111306e-06, + "loss": 1.2858, + "step": 1874 + }, + { + 
"epoch": 7.5910931174089065, + "grad_norm": 6.824574202718084, + "learning_rate": 1.671184780991283e-06, + "loss": 1.2792, + "step": 1875 + }, + { + "epoch": 7.5951417004048585, + "grad_norm": 6.003146333113679, + "learning_rate": 1.6659156351686202e-06, + "loss": 0.9987, + "step": 1876 + }, + { + "epoch": 7.59919028340081, + "grad_norm": 5.257435712843031, + "learning_rate": 1.6606531481668364e-06, + "loss": 1.1001, + "step": 1877 + }, + { + "epoch": 7.603238866396762, + "grad_norm": 5.19698994619142, + "learning_rate": 1.6553973304961528e-06, + "loss": 1.1799, + "step": 1878 + }, + { + "epoch": 7.607287449392713, + "grad_norm": 5.841701091792967, + "learning_rate": 1.6501481926534658e-06, + "loss": 0.9594, + "step": 1879 + }, + { + "epoch": 7.611336032388664, + "grad_norm": 6.19240531240544, + "learning_rate": 1.6449057451223354e-06, + "loss": 1.2521, + "step": 1880 + }, + { + "epoch": 7.615384615384615, + "grad_norm": 5.549994801931837, + "learning_rate": 1.639669998372958e-06, + "loss": 1.2949, + "step": 1881 + }, + { + "epoch": 7.619433198380567, + "grad_norm": 6.675501333896787, + "learning_rate": 1.6344409628621482e-06, + "loss": 1.0393, + "step": 1882 + }, + { + "epoch": 7.623481781376518, + "grad_norm": 6.8185578077235025, + "learning_rate": 1.6292186490333172e-06, + "loss": 1.3907, + "step": 1883 + }, + { + "epoch": 7.62753036437247, + "grad_norm": 5.788785194808056, + "learning_rate": 1.6240030673164492e-06, + "loss": 1.2266, + "step": 1884 + }, + { + "epoch": 7.631578947368421, + "grad_norm": 6.240532210004539, + "learning_rate": 1.6187942281280838e-06, + "loss": 1.4968, + "step": 1885 + }, + { + "epoch": 7.635627530364372, + "grad_norm": 5.438972394942183, + "learning_rate": 1.6135921418712959e-06, + "loss": 1.0917, + "step": 1886 + }, + { + "epoch": 7.6396761133603235, + "grad_norm": 6.412673367253676, + "learning_rate": 1.6083968189356724e-06, + "loss": 1.3789, + "step": 1887 + }, + { + "epoch": 7.6437246963562755, + "grad_norm": 5.536347657482411, + "learning_rate": 1.6032082696972945e-06, + "loss": 1.2638, + "step": 1888 + }, + { + "epoch": 7.647773279352227, + "grad_norm": 6.127206089252584, + "learning_rate": 1.5980265045187139e-06, + "loss": 1.3732, + "step": 1889 + }, + { + "epoch": 7.651821862348179, + "grad_norm": 5.193216915475832, + "learning_rate": 1.5928515337489292e-06, + "loss": 1.1536, + "step": 1890 + }, + { + "epoch": 7.65587044534413, + "grad_norm": 6.4405008029321635, + "learning_rate": 1.5876833677233754e-06, + "loss": 1.3585, + "step": 1891 + }, + { + "epoch": 7.659919028340081, + "grad_norm": 6.735596126416384, + "learning_rate": 1.5825220167638945e-06, + "loss": 1.1643, + "step": 1892 + }, + { + "epoch": 7.663967611336032, + "grad_norm": 5.578067115309463, + "learning_rate": 1.5773674911787157e-06, + "loss": 1.3335, + "step": 1893 + }, + { + "epoch": 7.668016194331984, + "grad_norm": 5.847753238206834, + "learning_rate": 1.5722198012624418e-06, + "loss": 1.3156, + "step": 1894 + }, + { + "epoch": 7.672064777327935, + "grad_norm": 6.167981268598202, + "learning_rate": 1.567078957296016e-06, + "loss": 1.4919, + "step": 1895 + }, + { + "epoch": 7.676113360323887, + "grad_norm": 5.209386411212645, + "learning_rate": 1.5619449695467142e-06, + "loss": 1.4698, + "step": 1896 + }, + { + "epoch": 7.680161943319838, + "grad_norm": 6.423491328339259, + "learning_rate": 1.556817848268118e-06, + "loss": 1.3083, + "step": 1897 + }, + { + "epoch": 7.684210526315789, + "grad_norm": 6.099826757015211, + "learning_rate": 1.5516976037000941e-06, + "loss": 1.1861, + 
"step": 1898 + }, + { + "epoch": 7.6882591093117405, + "grad_norm": 5.753586753644626, + "learning_rate": 1.5465842460687786e-06, + "loss": 1.2721, + "step": 1899 + }, + { + "epoch": 7.6923076923076925, + "grad_norm": 6.272583592648715, + "learning_rate": 1.5414777855865466e-06, + "loss": 1.2911, + "step": 1900 + }, + { + "epoch": 7.696356275303644, + "grad_norm": 5.68165710538138, + "learning_rate": 1.5363782324520033e-06, + "loss": 1.1648, + "step": 1901 + }, + { + "epoch": 7.700404858299595, + "grad_norm": 7.460829794563436, + "learning_rate": 1.5312855968499574e-06, + "loss": 1.6084, + "step": 1902 + }, + { + "epoch": 7.704453441295547, + "grad_norm": 6.5692354666682276, + "learning_rate": 1.5261998889514017e-06, + "loss": 1.4184, + "step": 1903 + }, + { + "epoch": 7.708502024291498, + "grad_norm": 6.3186571601325525, + "learning_rate": 1.5211211189134955e-06, + "loss": 1.0412, + "step": 1904 + }, + { + "epoch": 7.712550607287449, + "grad_norm": 5.682537504028156, + "learning_rate": 1.516049296879535e-06, + "loss": 1.1573, + "step": 1905 + }, + { + "epoch": 7.716599190283401, + "grad_norm": 5.812434487226451, + "learning_rate": 1.510984432978947e-06, + "loss": 1.2783, + "step": 1906 + }, + { + "epoch": 7.720647773279352, + "grad_norm": 7.075156192084278, + "learning_rate": 1.5059265373272574e-06, + "loss": 1.0288, + "step": 1907 + }, + { + "epoch": 7.724696356275303, + "grad_norm": 6.467523066478314, + "learning_rate": 1.5008756200260776e-06, + "loss": 1.2684, + "step": 1908 + }, + { + "epoch": 7.728744939271255, + "grad_norm": 5.838154690826828, + "learning_rate": 1.4958316911630827e-06, + "loss": 1.4278, + "step": 1909 + }, + { + "epoch": 7.732793522267206, + "grad_norm": 5.866932075199195, + "learning_rate": 1.4907947608119866e-06, + "loss": 1.1213, + "step": 1910 + }, + { + "epoch": 7.7368421052631575, + "grad_norm": 6.005636196644713, + "learning_rate": 1.4857648390325257e-06, + "loss": 1.2309, + "step": 1911 + }, + { + "epoch": 7.7408906882591095, + "grad_norm": 5.736349178634425, + "learning_rate": 1.4807419358704433e-06, + "loss": 1.8603, + "step": 1912 + }, + { + "epoch": 7.744939271255061, + "grad_norm": 5.608575893991077, + "learning_rate": 1.475726061357463e-06, + "loss": 1.4053, + "step": 1913 + }, + { + "epoch": 7.748987854251012, + "grad_norm": 6.949290018272913, + "learning_rate": 1.47071722551127e-06, + "loss": 1.2025, + "step": 1914 + }, + { + "epoch": 7.753036437246964, + "grad_norm": 6.470859543707123, + "learning_rate": 1.4657154383354948e-06, + "loss": 1.1287, + "step": 1915 + }, + { + "epoch": 7.757085020242915, + "grad_norm": 6.10955142295277, + "learning_rate": 1.4607207098196851e-06, + "loss": 1.2334, + "step": 1916 + }, + { + "epoch": 7.761133603238866, + "grad_norm": 6.5763762413068045, + "learning_rate": 1.4557330499392952e-06, + "loss": 1.9826, + "step": 1917 + }, + { + "epoch": 7.765182186234818, + "grad_norm": 7.723579817578996, + "learning_rate": 1.4507524686556612e-06, + "loss": 1.721, + "step": 1918 + }, + { + "epoch": 7.769230769230769, + "grad_norm": 8.397235796894286, + "learning_rate": 1.4457789759159813e-06, + "loss": 1.6659, + "step": 1919 + }, + { + "epoch": 7.77327935222672, + "grad_norm": 5.642365455166119, + "learning_rate": 1.4408125816532981e-06, + "loss": 1.1808, + "step": 1920 + }, + { + "epoch": 7.777327935222672, + "grad_norm": 5.725043241965928, + "learning_rate": 1.435853295786473e-06, + "loss": 1.4747, + "step": 1921 + }, + { + "epoch": 7.781376518218623, + "grad_norm": 5.394430714546486, + "learning_rate": 1.430901128220174e-06, + 
"loss": 1.4528, + "step": 1922 + }, + { + "epoch": 7.7854251012145745, + "grad_norm": 5.930712388463373, + "learning_rate": 1.4259560888448526e-06, + "loss": 1.2558, + "step": 1923 + }, + { + "epoch": 7.7894736842105265, + "grad_norm": 5.519869867138563, + "learning_rate": 1.4210181875367229e-06, + "loss": 1.1873, + "step": 1924 + }, + { + "epoch": 7.793522267206478, + "grad_norm": 6.265126307081154, + "learning_rate": 1.4160874341577447e-06, + "loss": 1.1916, + "step": 1925 + }, + { + "epoch": 7.797570850202429, + "grad_norm": 6.13894194733797, + "learning_rate": 1.4111638385555965e-06, + "loss": 1.2401, + "step": 1926 + }, + { + "epoch": 7.801619433198381, + "grad_norm": 5.721727948891365, + "learning_rate": 1.406247410563667e-06, + "loss": 1.1375, + "step": 1927 + }, + { + "epoch": 7.805668016194332, + "grad_norm": 5.409329610323807, + "learning_rate": 1.4013381600010278e-06, + "loss": 1.0394, + "step": 1928 + }, + { + "epoch": 7.809716599190283, + "grad_norm": 5.946216975378077, + "learning_rate": 1.396436096672416e-06, + "loss": 1.3717, + "step": 1929 + }, + { + "epoch": 7.813765182186235, + "grad_norm": 7.501336587253134, + "learning_rate": 1.3915412303682162e-06, + "loss": 1.1632, + "step": 1930 + }, + { + "epoch": 7.817813765182186, + "grad_norm": 6.192994323170135, + "learning_rate": 1.3866535708644335e-06, + "loss": 1.095, + "step": 1931 + }, + { + "epoch": 7.821862348178137, + "grad_norm": 14.576419437798382, + "learning_rate": 1.3817731279226843e-06, + "loss": 2.1725, + "step": 1932 + }, + { + "epoch": 7.825910931174089, + "grad_norm": 25.425127776950244, + "learning_rate": 1.376899911290172e-06, + "loss": 3.1191, + "step": 1933 + }, + { + "epoch": 7.82995951417004, + "grad_norm": 6.5130908283906574, + "learning_rate": 1.3720339306996666e-06, + "loss": 1.1065, + "step": 1934 + }, + { + "epoch": 7.834008097165992, + "grad_norm": 6.8625067545378755, + "learning_rate": 1.367175195869488e-06, + "loss": 1.076, + "step": 1935 + }, + { + "epoch": 7.838056680161944, + "grad_norm": 5.862839226770468, + "learning_rate": 1.3623237165034807e-06, + "loss": 1.0877, + "step": 1936 + }, + { + "epoch": 7.842105263157895, + "grad_norm": 5.587464620521552, + "learning_rate": 1.3574795022910014e-06, + "loss": 1.181, + "step": 1937 + }, + { + "epoch": 7.846153846153846, + "grad_norm": 5.741544735607096, + "learning_rate": 1.3526425629068968e-06, + "loss": 0.9695, + "step": 1938 + }, + { + "epoch": 7.850202429149798, + "grad_norm": 7.078793165923023, + "learning_rate": 1.347812908011485e-06, + "loss": 1.1728, + "step": 1939 + }, + { + "epoch": 7.854251012145749, + "grad_norm": 7.029454395604512, + "learning_rate": 1.3429905472505344e-06, + "loss": 1.2049, + "step": 1940 + }, + { + "epoch": 7.8582995951417, + "grad_norm": 4.858460051035453, + "learning_rate": 1.3381754902552474e-06, + "loss": 1.1544, + "step": 1941 + }, + { + "epoch": 7.862348178137652, + "grad_norm": 6.543690353473279, + "learning_rate": 1.3333677466422357e-06, + "loss": 1.1535, + "step": 1942 + }, + { + "epoch": 7.866396761133603, + "grad_norm": 6.2618770897927165, + "learning_rate": 1.3285673260135073e-06, + "loss": 1.1238, + "step": 1943 + }, + { + "epoch": 7.870445344129554, + "grad_norm": 7.787458993836756, + "learning_rate": 1.323774237956445e-06, + "loss": 1.5443, + "step": 1944 + }, + { + "epoch": 7.874493927125506, + "grad_norm": 6.60339760790844, + "learning_rate": 1.3189884920437867e-06, + "loss": 1.4939, + "step": 1945 + }, + { + "epoch": 7.8785425101214575, + "grad_norm": 6.952377816462855, + "learning_rate": 
1.314210097833607e-06, + "loss": 1.2695, + "step": 1946 + }, + { + "epoch": 7.882591093117409, + "grad_norm": 6.440482664289205, + "learning_rate": 1.309439064869295e-06, + "loss": 1.2076, + "step": 1947 + }, + { + "epoch": 7.886639676113361, + "grad_norm": 5.96904543777947, + "learning_rate": 1.3046754026795406e-06, + "loss": 0.8564, + "step": 1948 + }, + { + "epoch": 7.890688259109312, + "grad_norm": 5.611903455141828, + "learning_rate": 1.2999191207783129e-06, + "loss": 1.3827, + "step": 1949 + }, + { + "epoch": 7.894736842105263, + "grad_norm": 5.50366242354655, + "learning_rate": 1.2951702286648399e-06, + "loss": 1.3867, + "step": 1950 + }, + { + "epoch": 7.898785425101215, + "grad_norm": 4.771234777762805, + "learning_rate": 1.290428735823593e-06, + "loss": 1.1739, + "step": 1951 + }, + { + "epoch": 7.902834008097166, + "grad_norm": 5.7833279202719075, + "learning_rate": 1.2856946517242608e-06, + "loss": 1.1495, + "step": 1952 + }, + { + "epoch": 7.906882591093117, + "grad_norm": 6.107712126684077, + "learning_rate": 1.28096798582174e-06, + "loss": 1.1842, + "step": 1953 + }, + { + "epoch": 7.910931174089069, + "grad_norm": 5.059953747053966, + "learning_rate": 1.2762487475561109e-06, + "loss": 0.9544, + "step": 1954 + }, + { + "epoch": 7.91497975708502, + "grad_norm": 5.819489630730656, + "learning_rate": 1.2715369463526173e-06, + "loss": 1.0285, + "step": 1955 + }, + { + "epoch": 7.919028340080971, + "grad_norm": 6.14238425845007, + "learning_rate": 1.2668325916216534e-06, + "loss": 1.0359, + "step": 1956 + }, + { + "epoch": 7.923076923076923, + "grad_norm": 4.708687979766823, + "learning_rate": 1.2621356927587353e-06, + "loss": 1.3581, + "step": 1957 + }, + { + "epoch": 7.9271255060728745, + "grad_norm": 6.6570477016899074, + "learning_rate": 1.257446259144494e-06, + "loss": 1.2012, + "step": 1958 + }, + { + "epoch": 7.931174089068826, + "grad_norm": 6.636474405464404, + "learning_rate": 1.2527643001446493e-06, + "loss": 1.181, + "step": 1959 + }, + { + "epoch": 7.935222672064778, + "grad_norm": 6.89647738144804, + "learning_rate": 1.248089825109991e-06, + "loss": 0.9855, + "step": 1960 + }, + { + "epoch": 7.939271255060729, + "grad_norm": 6.54652294560363, + "learning_rate": 1.2434228433763657e-06, + "loss": 1.0055, + "step": 1961 + }, + { + "epoch": 7.94331983805668, + "grad_norm": 7.466794850354919, + "learning_rate": 1.2387633642646501e-06, + "loss": 1.2977, + "step": 1962 + }, + { + "epoch": 7.947368421052632, + "grad_norm": 5.859347969468438, + "learning_rate": 1.2341113970807368e-06, + "loss": 1.0272, + "step": 1963 + }, + { + "epoch": 7.951417004048583, + "grad_norm": 7.526875704374519, + "learning_rate": 1.2294669511155193e-06, + "loss": 0.939, + "step": 1964 + }, + { + "epoch": 7.955465587044534, + "grad_norm": 7.225249295703587, + "learning_rate": 1.224830035644868e-06, + "loss": 1.2616, + "step": 1965 + }, + { + "epoch": 7.959514170040486, + "grad_norm": 6.683599476135708, + "learning_rate": 1.2202006599296122e-06, + "loss": 1.3384, + "step": 1966 + }, + { + "epoch": 7.963562753036437, + "grad_norm": 6.087314726468543, + "learning_rate": 1.215578833215526e-06, + "loss": 1.2777, + "step": 1967 + }, + { + "epoch": 7.967611336032388, + "grad_norm": 7.6203305950770766, + "learning_rate": 1.2109645647333018e-06, + "loss": 1.2766, + "step": 1968 + }, + { + "epoch": 7.97165991902834, + "grad_norm": 7.4075603041461155, + "learning_rate": 1.2063578636985402e-06, + "loss": 1.2, + "step": 1969 + }, + { + "epoch": 7.9757085020242915, + "grad_norm": 5.356896060806783, + 
"learning_rate": 1.201758739311728e-06, + "loss": 1.2542, + "step": 1970 + }, + { + "epoch": 7.979757085020243, + "grad_norm": 6.6184401008685, + "learning_rate": 1.1971672007582192e-06, + "loss": 1.3138, + "step": 1971 + }, + { + "epoch": 7.983805668016195, + "grad_norm": 5.952389025814739, + "learning_rate": 1.1925832572082184e-06, + "loss": 1.3645, + "step": 1972 + }, + { + "epoch": 7.987854251012146, + "grad_norm": 5.869009321326924, + "learning_rate": 1.1880069178167586e-06, + "loss": 1.1615, + "step": 1973 + }, + { + "epoch": 7.991902834008097, + "grad_norm": 5.240716232576427, + "learning_rate": 1.1834381917236881e-06, + "loss": 1.1793, + "step": 1974 + }, + { + "epoch": 7.995951417004049, + "grad_norm": 6.017014067933477, + "learning_rate": 1.178877088053651e-06, + "loss": 1.5002, + "step": 1975 + }, + { + "epoch": 8.0, + "grad_norm": 5.843845057775898, + "learning_rate": 1.1743236159160654e-06, + "loss": 1.2012, + "step": 1976 + }, + { + "epoch": 8.004048582995951, + "grad_norm": 5.731134271109451, + "learning_rate": 1.1697777844051105e-06, + "loss": 1.2312, + "step": 1977 + }, + { + "epoch": 8.008097165991902, + "grad_norm": 8.123089091980212, + "learning_rate": 1.165239602599702e-06, + "loss": 1.4044, + "step": 1978 + }, + { + "epoch": 8.012145748987853, + "grad_norm": 7.3997346838307045, + "learning_rate": 1.1607090795634802e-06, + "loss": 1.179, + "step": 1979 + }, + { + "epoch": 8.016194331983806, + "grad_norm": 7.893381080795837, + "learning_rate": 1.156186224344789e-06, + "loss": 1.4132, + "step": 1980 + }, + { + "epoch": 8.020242914979757, + "grad_norm": 6.767976836554466, + "learning_rate": 1.1516710459766589e-06, + "loss": 1.5665, + "step": 1981 + }, + { + "epoch": 8.024291497975709, + "grad_norm": 7.315990265277637, + "learning_rate": 1.1471635534767877e-06, + "loss": 1.4869, + "step": 1982 + }, + { + "epoch": 8.02834008097166, + "grad_norm": 6.313789360006903, + "learning_rate": 1.1426637558475206e-06, + "loss": 1.1981, + "step": 1983 + }, + { + "epoch": 8.03238866396761, + "grad_norm": 7.201178679428242, + "learning_rate": 1.138171662075837e-06, + "loss": 1.2025, + "step": 1984 + }, + { + "epoch": 8.036437246963562, + "grad_norm": 6.52677540701035, + "learning_rate": 1.133687281133331e-06, + "loss": 1.4043, + "step": 1985 + }, + { + "epoch": 8.040485829959515, + "grad_norm": 6.870989950025807, + "learning_rate": 1.1292106219761928e-06, + "loss": 1.2134, + "step": 1986 + }, + { + "epoch": 8.044534412955466, + "grad_norm": 6.875288304164971, + "learning_rate": 1.1247416935451855e-06, + "loss": 1.3732, + "step": 1987 + }, + { + "epoch": 8.048582995951417, + "grad_norm": 6.61190406665116, + "learning_rate": 1.1202805047656406e-06, + "loss": 1.2149, + "step": 1988 + }, + { + "epoch": 8.052631578947368, + "grad_norm": 7.378314345746476, + "learning_rate": 1.1158270645474233e-06, + "loss": 1.2651, + "step": 1989 + }, + { + "epoch": 8.05668016194332, + "grad_norm": 6.525622834951594, + "learning_rate": 1.1113813817849312e-06, + "loss": 1.1235, + "step": 1990 + }, + { + "epoch": 8.06072874493927, + "grad_norm": 6.596016576904695, + "learning_rate": 1.1069434653570633e-06, + "loss": 1.2623, + "step": 1991 + }, + { + "epoch": 8.064777327935223, + "grad_norm": 7.280600264284795, + "learning_rate": 1.1025133241272113e-06, + "loss": 1.1959, + "step": 1992 + }, + { + "epoch": 8.068825910931174, + "grad_norm": 7.346457081658032, + "learning_rate": 1.0980909669432376e-06, + "loss": 1.3747, + "step": 1993 + }, + { + "epoch": 8.072874493927126, + "grad_norm": 6.3969953968688, + 
"learning_rate": 1.0936764026374547e-06, + "loss": 1.2673, + "step": 1994 + }, + { + "epoch": 8.076923076923077, + "grad_norm": 7.087695501441698, + "learning_rate": 1.0892696400266151e-06, + "loss": 1.2309, + "step": 1995 + }, + { + "epoch": 8.080971659919028, + "grad_norm": 6.045859729929738, + "learning_rate": 1.0848706879118893e-06, + "loss": 1.3544, + "step": 1996 + }, + { + "epoch": 8.085020242914979, + "grad_norm": 5.591583983778709, + "learning_rate": 1.0804795550788473e-06, + "loss": 1.3016, + "step": 1997 + }, + { + "epoch": 8.089068825910932, + "grad_norm": 6.782167877710207, + "learning_rate": 1.0760962502974453e-06, + "loss": 1.2539, + "step": 1998 + }, + { + "epoch": 8.093117408906883, + "grad_norm": 6.256971136931457, + "learning_rate": 1.0717207823220005e-06, + "loss": 1.3311, + "step": 1999 + }, + { + "epoch": 8.097165991902834, + "grad_norm": 5.902238719165329, + "learning_rate": 1.0673531598911824e-06, + "loss": 1.0787, + "step": 2000 + }, + { + "epoch": 8.101214574898785, + "grad_norm": 6.625744512089742, + "learning_rate": 1.0629933917279906e-06, + "loss": 1.2767, + "step": 2001 + }, + { + "epoch": 8.105263157894736, + "grad_norm": 7.073772146380111, + "learning_rate": 1.0586414865397381e-06, + "loss": 1.1861, + "step": 2002 + }, + { + "epoch": 8.109311740890687, + "grad_norm": 6.262732530690249, + "learning_rate": 1.0542974530180327e-06, + "loss": 1.2172, + "step": 2003 + }, + { + "epoch": 8.11336032388664, + "grad_norm": 7.393380584551558, + "learning_rate": 1.0499612998387621e-06, + "loss": 1.1485, + "step": 2004 + }, + { + "epoch": 8.117408906882591, + "grad_norm": 6.857359399326426, + "learning_rate": 1.0456330356620758e-06, + "loss": 1.0672, + "step": 2005 + }, + { + "epoch": 8.121457489878543, + "grad_norm": 6.5740346675087205, + "learning_rate": 1.0413126691323667e-06, + "loss": 1.1479, + "step": 2006 + }, + { + "epoch": 8.125506072874494, + "grad_norm": 6.267695688330783, + "learning_rate": 1.0370002088782555e-06, + "loss": 1.165, + "step": 2007 + }, + { + "epoch": 8.129554655870445, + "grad_norm": 7.133762320289656, + "learning_rate": 1.0326956635125707e-06, + "loss": 1.0247, + "step": 2008 + }, + { + "epoch": 8.133603238866396, + "grad_norm": 5.586702351654256, + "learning_rate": 1.0283990416323336e-06, + "loss": 1.3881, + "step": 2009 + }, + { + "epoch": 8.137651821862349, + "grad_norm": 6.806616706670472, + "learning_rate": 1.0241103518187433e-06, + "loss": 1.1919, + "step": 2010 + }, + { + "epoch": 8.1417004048583, + "grad_norm": 5.430435103612442, + "learning_rate": 1.019829602637154e-06, + "loss": 0.9674, + "step": 2011 + }, + { + "epoch": 8.145748987854251, + "grad_norm": 7.14447897307659, + "learning_rate": 1.0155568026370637e-06, + "loss": 1.2791, + "step": 2012 + }, + { + "epoch": 8.149797570850202, + "grad_norm": 6.3472462119415525, + "learning_rate": 1.0112919603520898e-06, + "loss": 1.1158, + "step": 2013 + }, + { + "epoch": 8.153846153846153, + "grad_norm": 5.6608952411216125, + "learning_rate": 1.0070350842999622e-06, + "loss": 1.357, + "step": 2014 + }, + { + "epoch": 8.157894736842104, + "grad_norm": 7.080132640290096, + "learning_rate": 1.0027861829824953e-06, + "loss": 1.3434, + "step": 2015 + }, + { + "epoch": 8.161943319838057, + "grad_norm": 5.51924122267234, + "learning_rate": 9.985452648855803e-07, + "loss": 1.1787, + "step": 2016 + }, + { + "epoch": 8.165991902834008, + "grad_norm": 6.025963555073775, + "learning_rate": 9.943123384791632e-07, + "loss": 1.2719, + "step": 2017 + }, + { + "epoch": 8.17004048582996, + "grad_norm": 
5.336299411323149, + "learning_rate": 9.900874122172294e-07, + "loss": 1.8638, + "step": 2018 + }, + { + "epoch": 8.17408906882591, + "grad_norm": 6.492484439551155, + "learning_rate": 9.85870494537784e-07, + "loss": 1.0806, + "step": 2019 + }, + { + "epoch": 8.178137651821862, + "grad_norm": 7.726948183355687, + "learning_rate": 9.816615938628409e-07, + "loss": 1.2902, + "step": 2020 + }, + { + "epoch": 8.182186234817813, + "grad_norm": 5.250851031576059, + "learning_rate": 9.774607185984004e-07, + "loss": 1.0877, + "step": 2021 + }, + { + "epoch": 8.186234817813766, + "grad_norm": 6.904468404911272, + "learning_rate": 9.732678771344344e-07, + "loss": 1.2729, + "step": 2022 + }, + { + "epoch": 8.190283400809717, + "grad_norm": 6.477961038997859, + "learning_rate": 9.690830778448723e-07, + "loss": 1.2954, + "step": 2023 + }, + { + "epoch": 8.194331983805668, + "grad_norm": 6.901383952123393, + "learning_rate": 9.649063290875771e-07, + "loss": 1.4598, + "step": 2024 + }, + { + "epoch": 8.19838056680162, + "grad_norm": 5.697192396337908, + "learning_rate": 9.607376392043366e-07, + "loss": 1.5219, + "step": 2025 + }, + { + "epoch": 8.20242914979757, + "grad_norm": 5.828533006715791, + "learning_rate": 9.565770165208432e-07, + "loss": 1.2267, + "step": 2026 + }, + { + "epoch": 8.206477732793521, + "grad_norm": 5.737519140655703, + "learning_rate": 9.524244693466773e-07, + "loss": 1.5547, + "step": 2027 + }, + { + "epoch": 8.210526315789474, + "grad_norm": 5.906148707150362, + "learning_rate": 9.482800059752911e-07, + "loss": 1.5423, + "step": 2028 + }, + { + "epoch": 8.214574898785425, + "grad_norm": 6.246342403732729, + "learning_rate": 9.441436346839894e-07, + "loss": 1.3284, + "step": 2029 + }, + { + "epoch": 8.218623481781377, + "grad_norm": 6.542108201095842, + "learning_rate": 9.400153637339182e-07, + "loss": 1.2057, + "step": 2030 + }, + { + "epoch": 8.222672064777328, + "grad_norm": 6.355801787175163, + "learning_rate": 9.358952013700462e-07, + "loss": 1.1541, + "step": 2031 + }, + { + "epoch": 8.226720647773279, + "grad_norm": 6.0083830127963465, + "learning_rate": 9.317831558211449e-07, + "loss": 1.3599, + "step": 2032 + }, + { + "epoch": 8.23076923076923, + "grad_norm": 6.143312349563429, + "learning_rate": 9.276792352997782e-07, + "loss": 1.1424, + "step": 2033 + }, + { + "epoch": 8.234817813765183, + "grad_norm": 7.026565648122738, + "learning_rate": 9.235834480022788e-07, + "loss": 1.361, + "step": 2034 + }, + { + "epoch": 8.238866396761134, + "grad_norm": 6.79010834147561, + "learning_rate": 9.19495802108738e-07, + "loss": 1.2944, + "step": 2035 + }, + { + "epoch": 8.242914979757085, + "grad_norm": 6.262899466718926, + "learning_rate": 9.154163057829879e-07, + "loss": 1.3301, + "step": 2036 + }, + { + "epoch": 8.246963562753036, + "grad_norm": 5.784122192100412, + "learning_rate": 9.113449671725832e-07, + "loss": 1.1986, + "step": 2037 + }, + { + "epoch": 8.251012145748987, + "grad_norm": 6.13085712005476, + "learning_rate": 9.072817944087875e-07, + "loss": 1.284, + "step": 2038 + }, + { + "epoch": 8.255060728744938, + "grad_norm": 6.317294175666071, + "learning_rate": 9.032267956065516e-07, + "loss": 1.2274, + "step": 2039 + }, + { + "epoch": 8.259109311740891, + "grad_norm": 5.586217657876971, + "learning_rate": 8.991799788645067e-07, + "loss": 1.1896, + "step": 2040 + }, + { + "epoch": 8.263157894736842, + "grad_norm": 6.088327462827803, + "learning_rate": 8.951413522649372e-07, + "loss": 0.9771, + "step": 2041 + }, + { + "epoch": 8.267206477732794, + "grad_norm": 
5.76590382121624, + "learning_rate": 8.911109238737748e-07, + "loss": 1.2758, + "step": 2042 + }, + { + "epoch": 8.271255060728745, + "grad_norm": 6.211464855564121, + "learning_rate": 8.870887017405761e-07, + "loss": 1.273, + "step": 2043 + }, + { + "epoch": 8.275303643724696, + "grad_norm": 6.06402110401488, + "learning_rate": 8.830746938985091e-07, + "loss": 1.356, + "step": 2044 + }, + { + "epoch": 8.279352226720647, + "grad_norm": 7.891296000946273, + "learning_rate": 8.790689083643328e-07, + "loss": 1.2355, + "step": 2045 + }, + { + "epoch": 8.2834008097166, + "grad_norm": 6.919823315708994, + "learning_rate": 8.750713531383886e-07, + "loss": 0.9371, + "step": 2046 + }, + { + "epoch": 8.287449392712551, + "grad_norm": 7.599246572003176, + "learning_rate": 8.710820362045791e-07, + "loss": 1.0832, + "step": 2047 + }, + { + "epoch": 8.291497975708502, + "grad_norm": 7.084253293886639, + "learning_rate": 8.671009655303531e-07, + "loss": 0.9594, + "step": 2048 + }, + { + "epoch": 8.295546558704453, + "grad_norm": 7.266404675494076, + "learning_rate": 8.631281490666915e-07, + "loss": 1.1647, + "step": 2049 + }, + { + "epoch": 8.299595141700404, + "grad_norm": 6.465250431675959, + "learning_rate": 8.591635947480854e-07, + "loss": 1.4079, + "step": 2050 + }, + { + "epoch": 8.303643724696355, + "grad_norm": 7.279071790902037, + "learning_rate": 8.552073104925296e-07, + "loss": 1.0049, + "step": 2051 + }, + { + "epoch": 8.307692307692308, + "grad_norm": 6.756555724272831, + "learning_rate": 8.512593042015005e-07, + "loss": 1.0616, + "step": 2052 + }, + { + "epoch": 8.31174089068826, + "grad_norm": 6.254507577162332, + "learning_rate": 8.473195837599419e-07, + "loss": 1.1174, + "step": 2053 + }, + { + "epoch": 8.31578947368421, + "grad_norm": 7.727840849711051, + "learning_rate": 8.433881570362484e-07, + "loss": 0.9914, + "step": 2054 + }, + { + "epoch": 8.319838056680162, + "grad_norm": 6.756642529850463, + "learning_rate": 8.3946503188225e-07, + "loss": 0.9647, + "step": 2055 + }, + { + "epoch": 8.323886639676113, + "grad_norm": 6.963802700325999, + "learning_rate": 8.355502161331985e-07, + "loss": 1.0237, + "step": 2056 + }, + { + "epoch": 8.327935222672064, + "grad_norm": 6.731503726472556, + "learning_rate": 8.316437176077491e-07, + "loss": 1.0387, + "step": 2057 + }, + { + "epoch": 8.331983805668017, + "grad_norm": 6.8290712150235375, + "learning_rate": 8.277455441079463e-07, + "loss": 1.0816, + "step": 2058 + }, + { + "epoch": 8.336032388663968, + "grad_norm": 6.2806024635481625, + "learning_rate": 8.238557034192085e-07, + "loss": 1.189, + "step": 2059 + }, + { + "epoch": 8.34008097165992, + "grad_norm": 6.184192289516359, + "learning_rate": 8.199742033103091e-07, + "loss": 1.1119, + "step": 2060 + }, + { + "epoch": 8.34412955465587, + "grad_norm": 6.540385862485887, + "learning_rate": 8.161010515333662e-07, + "loss": 1.0109, + "step": 2061 + }, + { + "epoch": 8.348178137651821, + "grad_norm": 7.727191651616888, + "learning_rate": 8.12236255823825e-07, + "loss": 1.1502, + "step": 2062 + }, + { + "epoch": 8.352226720647772, + "grad_norm": 6.9096677414157535, + "learning_rate": 8.083798239004408e-07, + "loss": 1.1497, + "step": 2063 + }, + { + "epoch": 8.356275303643725, + "grad_norm": 6.414815740722037, + "learning_rate": 8.045317634652661e-07, + "loss": 0.842, + "step": 2064 + }, + { + "epoch": 8.360323886639677, + "grad_norm": 7.487469807700361, + "learning_rate": 8.006920822036307e-07, + "loss": 1.2308, + "step": 2065 + }, + { + "epoch": 8.364372469635628, + "grad_norm": 
6.046114352668178, + "learning_rate": 7.968607877841333e-07, + "loss": 1.3044, + "step": 2066 + }, + { + "epoch": 8.368421052631579, + "grad_norm": 6.435316234936995, + "learning_rate": 7.930378878586198e-07, + "loss": 1.1938, + "step": 2067 + }, + { + "epoch": 8.37246963562753, + "grad_norm": 7.906119816359948, + "learning_rate": 7.89223390062172e-07, + "loss": 1.2389, + "step": 2068 + }, + { + "epoch": 8.376518218623481, + "grad_norm": 6.803177488562893, + "learning_rate": 7.854173020130906e-07, + "loss": 0.9517, + "step": 2069 + }, + { + "epoch": 8.380566801619434, + "grad_norm": 7.234612181909552, + "learning_rate": 7.816196313128821e-07, + "loss": 1.0982, + "step": 2070 + }, + { + "epoch": 8.384615384615385, + "grad_norm": 6.204452258594293, + "learning_rate": 7.778303855462382e-07, + "loss": 0.913, + "step": 2071 + }, + { + "epoch": 8.388663967611336, + "grad_norm": 7.6652434424714375, + "learning_rate": 7.740495722810271e-07, + "loss": 0.9799, + "step": 2072 + }, + { + "epoch": 8.392712550607287, + "grad_norm": 7.028162715790928, + "learning_rate": 7.702771990682745e-07, + "loss": 1.1741, + "step": 2073 + }, + { + "epoch": 8.396761133603238, + "grad_norm": 8.20693379504055, + "learning_rate": 7.66513273442151e-07, + "loss": 0.9586, + "step": 2074 + }, + { + "epoch": 8.40080971659919, + "grad_norm": 6.1595198687647255, + "learning_rate": 7.627578029199562e-07, + "loss": 1.0087, + "step": 2075 + }, + { + "epoch": 8.404858299595142, + "grad_norm": 6.186971527710178, + "learning_rate": 7.590107950020987e-07, + "loss": 1.0385, + "step": 2076 + }, + { + "epoch": 8.408906882591094, + "grad_norm": 7.634025112115446, + "learning_rate": 7.552722571720899e-07, + "loss": 1.1273, + "step": 2077 + }, + { + "epoch": 8.412955465587045, + "grad_norm": 6.509921676918103, + "learning_rate": 7.515421968965242e-07, + "loss": 0.9676, + "step": 2078 + }, + { + "epoch": 8.417004048582996, + "grad_norm": 7.549987787462475, + "learning_rate": 7.478206216250644e-07, + "loss": 1.2442, + "step": 2079 + }, + { + "epoch": 8.421052631578947, + "grad_norm": 7.555658247599405, + "learning_rate": 7.441075387904267e-07, + "loss": 1.2719, + "step": 2080 + }, + { + "epoch": 8.425101214574898, + "grad_norm": 6.5153535549196215, + "learning_rate": 7.404029558083653e-07, + "loss": 1.3106, + "step": 2081 + }, + { + "epoch": 8.429149797570851, + "grad_norm": 7.532432754634663, + "learning_rate": 7.367068800776594e-07, + "loss": 1.2708, + "step": 2082 + }, + { + "epoch": 8.433198380566802, + "grad_norm": 5.774787588818044, + "learning_rate": 7.330193189800994e-07, + "loss": 1.4544, + "step": 2083 + }, + { + "epoch": 8.437246963562753, + "grad_norm": 6.8245265398524495, + "learning_rate": 7.293402798804667e-07, + "loss": 1.2466, + "step": 2084 + }, + { + "epoch": 8.441295546558704, + "grad_norm": 7.77462252770274, + "learning_rate": 7.25669770126527e-07, + "loss": 1.1822, + "step": 2085 + }, + { + "epoch": 8.445344129554655, + "grad_norm": 6.148198383672424, + "learning_rate": 7.220077970490058e-07, + "loss": 1.4383, + "step": 2086 + }, + { + "epoch": 8.449392712550607, + "grad_norm": 7.866867275378799, + "learning_rate": 7.183543679615834e-07, + "loss": 1.2326, + "step": 2087 + }, + { + "epoch": 8.45344129554656, + "grad_norm": 7.546182288687263, + "learning_rate": 7.147094901608748e-07, + "loss": 1.2273, + "step": 2088 + }, + { + "epoch": 8.45748987854251, + "grad_norm": 7.579378068549671, + "learning_rate": 7.110731709264163e-07, + "loss": 1.57, + "step": 2089 + }, + { + "epoch": 8.461538461538462, + "grad_norm": 
10.015081237740555, + "learning_rate": 7.074454175206524e-07, + "loss": 1.6365, + "step": 2090 + }, + { + "epoch": 8.465587044534413, + "grad_norm": 8.048721435929364, + "learning_rate": 7.03826237188916e-07, + "loss": 1.3541, + "step": 2091 + }, + { + "epoch": 8.469635627530364, + "grad_norm": 7.174284170679616, + "learning_rate": 7.002156371594237e-07, + "loss": 1.4242, + "step": 2092 + }, + { + "epoch": 8.473684210526315, + "grad_norm": 6.580944869519028, + "learning_rate": 6.966136246432492e-07, + "loss": 1.3988, + "step": 2093 + }, + { + "epoch": 8.477732793522268, + "grad_norm": 8.860518049025139, + "learning_rate": 6.930202068343206e-07, + "loss": 1.387, + "step": 2094 + }, + { + "epoch": 8.481781376518219, + "grad_norm": 6.5840932268783785, + "learning_rate": 6.894353909093976e-07, + "loss": 1.3236, + "step": 2095 + }, + { + "epoch": 8.48582995951417, + "grad_norm": 7.278031801942817, + "learning_rate": 6.858591840280627e-07, + "loss": 1.2652, + "step": 2096 + }, + { + "epoch": 8.489878542510121, + "grad_norm": 9.369040288696132, + "learning_rate": 6.822915933327012e-07, + "loss": 1.2337, + "step": 2097 + }, + { + "epoch": 8.493927125506072, + "grad_norm": 9.575953105863487, + "learning_rate": 6.787326259484922e-07, + "loss": 1.154, + "step": 2098 + }, + { + "epoch": 8.497975708502024, + "grad_norm": 6.611344155786181, + "learning_rate": 6.751822889833926e-07, + "loss": 1.0993, + "step": 2099 + }, + { + "epoch": 8.502024291497976, + "grad_norm": 7.493254499061418, + "learning_rate": 6.716405895281225e-07, + "loss": 0.9173, + "step": 2100 + }, + { + "epoch": 8.506072874493928, + "grad_norm": 8.70918178876987, + "learning_rate": 6.681075346561517e-07, + "loss": 1.2742, + "step": 2101 + }, + { + "epoch": 8.510121457489879, + "grad_norm": 8.138470526559217, + "learning_rate": 6.645831314236817e-07, + "loss": 1.2013, + "step": 2102 + }, + { + "epoch": 8.51417004048583, + "grad_norm": 6.09672586428882, + "learning_rate": 6.610673868696387e-07, + "loss": 1.2136, + "step": 2103 + }, + { + "epoch": 8.518218623481781, + "grad_norm": 7.785839558420341, + "learning_rate": 6.57560308015655e-07, + "loss": 1.2265, + "step": 2104 + }, + { + "epoch": 8.522267206477732, + "grad_norm": 8.953731258403018, + "learning_rate": 6.540619018660555e-07, + "loss": 1.15, + "step": 2105 + }, + { + "epoch": 8.526315789473685, + "grad_norm": 8.20742839178438, + "learning_rate": 6.505721754078443e-07, + "loss": 0.9784, + "step": 2106 + }, + { + "epoch": 8.530364372469636, + "grad_norm": 7.275127493366391, + "learning_rate": 6.470911356106885e-07, + "loss": 1.0741, + "step": 2107 + }, + { + "epoch": 8.534412955465587, + "grad_norm": 8.35403564959096, + "learning_rate": 6.436187894269086e-07, + "loss": 1.0919, + "step": 2108 + }, + { + "epoch": 8.538461538461538, + "grad_norm": 7.970907113068403, + "learning_rate": 6.401551437914621e-07, + "loss": 1.3919, + "step": 2109 + }, + { + "epoch": 8.54251012145749, + "grad_norm": 7.075118724352275, + "learning_rate": 6.367002056219285e-07, + "loss": 1.3732, + "step": 2110 + }, + { + "epoch": 8.54655870445344, + "grad_norm": 41.2546881055469, + "learning_rate": 6.332539818184985e-07, + "loss": 1.9685, + "step": 2111 + }, + { + "epoch": 8.550607287449393, + "grad_norm": 12.366409360208545, + "learning_rate": 6.298164792639555e-07, + "loss": 1.6408, + "step": 2112 + }, + { + "epoch": 8.554655870445345, + "grad_norm": 11.261791719061787, + "learning_rate": 6.263877048236683e-07, + "loss": 2.024, + "step": 2113 + }, + { + "epoch": 8.558704453441296, + "grad_norm": 
7.759335659536259, + "learning_rate": 6.229676653455719e-07, + "loss": 1.2075, + "step": 2114 + }, + { + "epoch": 8.562753036437247, + "grad_norm": 6.714063146216, + "learning_rate": 6.195563676601563e-07, + "loss": 1.0819, + "step": 2115 + }, + { + "epoch": 8.566801619433198, + "grad_norm": 6.884464537310286, + "learning_rate": 6.161538185804544e-07, + "loss": 1.4577, + "step": 2116 + }, + { + "epoch": 8.570850202429149, + "grad_norm": 7.270354106890159, + "learning_rate": 6.127600249020216e-07, + "loss": 1.1355, + "step": 2117 + }, + { + "epoch": 8.574898785425102, + "grad_norm": 7.112708598170683, + "learning_rate": 6.09374993402932e-07, + "loss": 1.2906, + "step": 2118 + }, + { + "epoch": 8.578947368421053, + "grad_norm": 5.581752289480177, + "learning_rate": 6.059987308437565e-07, + "loss": 1.1301, + "step": 2119 + }, + { + "epoch": 8.582995951417004, + "grad_norm": 7.0619445923701205, + "learning_rate": 6.026312439675553e-07, + "loss": 1.2177, + "step": 2120 + }, + { + "epoch": 8.587044534412955, + "grad_norm": 7.241742089371334, + "learning_rate": 5.992725394998594e-07, + "loss": 1.2383, + "step": 2121 + }, + { + "epoch": 8.591093117408906, + "grad_norm": 7.799009343624035, + "learning_rate": 5.959226241486632e-07, + "loss": 1.2195, + "step": 2122 + }, + { + "epoch": 8.595141700404858, + "grad_norm": 6.849477991699955, + "learning_rate": 5.925815046044026e-07, + "loss": 0.9419, + "step": 2123 + }, + { + "epoch": 8.59919028340081, + "grad_norm": 5.8814800267073, + "learning_rate": 5.892491875399503e-07, + "loss": 1.0593, + "step": 2124 + }, + { + "epoch": 8.603238866396762, + "grad_norm": 6.030196066850317, + "learning_rate": 5.859256796105972e-07, + "loss": 1.141, + "step": 2125 + }, + { + "epoch": 8.607287449392713, + "grad_norm": 6.614777125339002, + "learning_rate": 5.826109874540409e-07, + "loss": 0.9086, + "step": 2126 + }, + { + "epoch": 8.611336032388664, + "grad_norm": 6.948469573714514, + "learning_rate": 5.793051176903736e-07, + "loss": 1.1918, + "step": 2127 + }, + { + "epoch": 8.615384615384615, + "grad_norm": 6.3016655419950425, + "learning_rate": 5.760080769220644e-07, + "loss": 1.2405, + "step": 2128 + }, + { + "epoch": 8.619433198380566, + "grad_norm": 7.302956053566254, + "learning_rate": 5.727198717339511e-07, + "loss": 0.9801, + "step": 2129 + }, + { + "epoch": 8.623481781376519, + "grad_norm": 7.80182069701434, + "learning_rate": 5.694405086932248e-07, + "loss": 1.3353, + "step": 2130 + }, + { + "epoch": 8.62753036437247, + "grad_norm": 6.639707488466264, + "learning_rate": 5.661699943494181e-07, + "loss": 1.1811, + "step": 2131 + }, + { + "epoch": 8.631578947368421, + "grad_norm": 6.899791220581512, + "learning_rate": 5.6290833523439e-07, + "loss": 1.4413, + "step": 2132 + }, + { + "epoch": 8.635627530364372, + "grad_norm": 5.939504101979065, + "learning_rate": 5.596555378623126e-07, + "loss": 1.0383, + "step": 2133 + }, + { + "epoch": 8.639676113360323, + "grad_norm": 6.991218523438056, + "learning_rate": 5.564116087296618e-07, + "loss": 1.3329, + "step": 2134 + }, + { + "epoch": 8.643724696356275, + "grad_norm": 6.159504500789582, + "learning_rate": 5.531765543152002e-07, + "loss": 1.2145, + "step": 2135 + }, + { + "epoch": 8.647773279352228, + "grad_norm": 6.654163633145904, + "learning_rate": 5.499503810799667e-07, + "loss": 1.3227, + "step": 2136 + }, + { + "epoch": 8.651821862348179, + "grad_norm": 5.691830923037683, + "learning_rate": 5.467330954672639e-07, + "loss": 1.1114, + "step": 2137 + }, + { + "epoch": 8.65587044534413, + "grad_norm": 
6.9549489463225775, + "learning_rate": 5.435247039026398e-07, + "loss": 1.3092, + "step": 2138 + }, + { + "epoch": 8.65991902834008, + "grad_norm": 7.2060016669462295, + "learning_rate": 5.403252127938841e-07, + "loss": 1.1114, + "step": 2139 + }, + { + "epoch": 8.663967611336032, + "grad_norm": 5.860230883158528, + "learning_rate": 5.371346285310075e-07, + "loss": 1.2813, + "step": 2140 + }, + { + "epoch": 8.668016194331983, + "grad_norm": 6.007997589736958, + "learning_rate": 5.33952957486234e-07, + "loss": 1.2645, + "step": 2141 + }, + { + "epoch": 8.672064777327936, + "grad_norm": 6.380138387786136, + "learning_rate": 5.30780206013985e-07, + "loss": 1.4488, + "step": 2142 + }, + { + "epoch": 8.676113360323887, + "grad_norm": 5.389279452380071, + "learning_rate": 5.276163804508671e-07, + "loss": 1.436, + "step": 2143 + }, + { + "epoch": 8.680161943319838, + "grad_norm": 6.700282421943868, + "learning_rate": 5.244614871156612e-07, + "loss": 1.2596, + "step": 2144 + }, + { + "epoch": 8.68421052631579, + "grad_norm": 6.474231798689995, + "learning_rate": 5.213155323093094e-07, + "loss": 1.1446, + "step": 2145 + }, + { + "epoch": 8.68825910931174, + "grad_norm": 6.124423836263308, + "learning_rate": 5.181785223148999e-07, + "loss": 1.2253, + "step": 2146 + }, + { + "epoch": 8.692307692307692, + "grad_norm": 6.653036436702035, + "learning_rate": 5.150504633976572e-07, + "loss": 1.2426, + "step": 2147 + }, + { + "epoch": 8.696356275303645, + "grad_norm": 5.972447093623402, + "learning_rate": 5.119313618049309e-07, + "loss": 1.126, + "step": 2148 + }, + { + "epoch": 8.700404858299596, + "grad_norm": 7.753561648240651, + "learning_rate": 5.088212237661766e-07, + "loss": 1.5606, + "step": 2149 + }, + { + "epoch": 8.704453441295547, + "grad_norm": 6.583094267044065, + "learning_rate": 5.057200554929509e-07, + "loss": 1.3713, + "step": 2150 + }, + { + "epoch": 8.708502024291498, + "grad_norm": 6.552630239242199, + "learning_rate": 5.026278631788967e-07, + "loss": 1.001, + "step": 2151 + }, + { + "epoch": 8.712550607287449, + "grad_norm": 6.025634072841221, + "learning_rate": 4.995446529997283e-07, + "loss": 1.1171, + "step": 2152 + }, + { + "epoch": 8.7165991902834, + "grad_norm": 6.005875203639586, + "learning_rate": 4.964704311132224e-07, + "loss": 1.2367, + "step": 2153 + }, + { + "epoch": 8.720647773279353, + "grad_norm": 7.40464404684989, + "learning_rate": 4.934052036592018e-07, + "loss": 0.9859, + "step": 2154 + }, + { + "epoch": 8.724696356275304, + "grad_norm": 6.8264153711344155, + "learning_rate": 4.903489767595287e-07, + "loss": 1.2321, + "step": 2155 + }, + { + "epoch": 8.728744939271255, + "grad_norm": 6.019858796316467, + "learning_rate": 4.873017565180871e-07, + "loss": 1.3846, + "step": 2156 + }, + { + "epoch": 8.732793522267206, + "grad_norm": 6.09572311582674, + "learning_rate": 4.842635490207747e-07, + "loss": 1.0707, + "step": 2157 + }, + { + "epoch": 8.736842105263158, + "grad_norm": 6.364401455402685, + "learning_rate": 4.812343603354896e-07, + "loss": 1.1865, + "step": 2158 + }, + { + "epoch": 8.740890688259109, + "grad_norm": 5.987439148552965, + "learning_rate": 4.782141965121129e-07, + "loss": 1.8135, + "step": 2159 + }, + { + "epoch": 8.744939271255062, + "grad_norm": 5.829819359945272, + "learning_rate": 4.752030635825067e-07, + "loss": 1.3722, + "step": 2160 + }, + { + "epoch": 8.748987854251013, + "grad_norm": 7.34437549498405, + "learning_rate": 4.7220096756049384e-07, + "loss": 1.1621, + "step": 2161 + }, + { + "epoch": 8.753036437246964, + "grad_norm": 
6.883954168990045, + "learning_rate": 4.6920791444184934e-07, + "loss": 1.0939, + "step": 2162 + }, + { + "epoch": 8.757085020242915, + "grad_norm": 6.597939900390535, + "learning_rate": 4.662239102042887e-07, + "loss": 1.1937, + "step": 2163 + }, + { + "epoch": 8.761133603238866, + "grad_norm": 7.598759077469902, + "learning_rate": 4.6324896080745254e-07, + "loss": 1.9407, + "step": 2164 + }, + { + "epoch": 8.765182186234817, + "grad_norm": 8.104460939111814, + "learning_rate": 4.602830721928997e-07, + "loss": 1.6736, + "step": 2165 + }, + { + "epoch": 8.76923076923077, + "grad_norm": 8.243030121308468, + "learning_rate": 4.573262502840914e-07, + "loss": 1.6042, + "step": 2166 + }, + { + "epoch": 8.773279352226721, + "grad_norm": 5.94145259338888, + "learning_rate": 4.54378500986381e-07, + "loss": 1.144, + "step": 2167 + }, + { + "epoch": 8.777327935222672, + "grad_norm": 6.040011156691804, + "learning_rate": 4.5143983018700485e-07, + "loss": 1.4426, + "step": 2168 + }, + { + "epoch": 8.781376518218623, + "grad_norm": 5.588028348172228, + "learning_rate": 4.48510243755062e-07, + "loss": 1.4161, + "step": 2169 + }, + { + "epoch": 8.785425101214575, + "grad_norm": 6.060131286333249, + "learning_rate": 4.455897475415133e-07, + "loss": 1.2124, + "step": 2170 + }, + { + "epoch": 8.789473684210526, + "grad_norm": 5.82973877308885, + "learning_rate": 4.4267834737916295e-07, + "loss": 1.1482, + "step": 2171 + }, + { + "epoch": 8.793522267206479, + "grad_norm": 6.657446024745296, + "learning_rate": 4.39776049082648e-07, + "loss": 1.1481, + "step": 2172 + }, + { + "epoch": 8.79757085020243, + "grad_norm": 6.3615249922064825, + "learning_rate": 4.3688285844842747e-07, + "loss": 1.1969, + "step": 2173 + }, + { + "epoch": 8.80161943319838, + "grad_norm": 6.112293290660962, + "learning_rate": 4.33998781254773e-07, + "loss": 1.101, + "step": 2174 + }, + { + "epoch": 8.805668016194332, + "grad_norm": 5.743019279125439, + "learning_rate": 4.3112382326174987e-07, + "loss": 1.0094, + "step": 2175 + }, + { + "epoch": 8.809716599190283, + "grad_norm": 6.20487988377178, + "learning_rate": 4.2825799021121493e-07, + "loss": 1.3299, + "step": 2176 + }, + { + "epoch": 8.813765182186234, + "grad_norm": 8.142675422332248, + "learning_rate": 4.2540128782679934e-07, + "loss": 1.1159, + "step": 2177 + }, + { + "epoch": 8.817813765182187, + "grad_norm": 6.620797295984167, + "learning_rate": 4.225537218138981e-07, + "loss": 1.0554, + "step": 2178 + }, + { + "epoch": 8.821862348178138, + "grad_norm": 15.187486943860346, + "learning_rate": 4.197152978596608e-07, + "loss": 2.1386, + "step": 2179 + }, + { + "epoch": 8.82591093117409, + "grad_norm": 25.220825856193247, + "learning_rate": 4.1688602163297564e-07, + "loss": 3.054, + "step": 2180 + }, + { + "epoch": 8.82995951417004, + "grad_norm": 6.966275914312463, + "learning_rate": 4.1406589878446257e-07, + "loss": 1.0687, + "step": 2181 + }, + { + "epoch": 8.834008097165992, + "grad_norm": 7.297199095572435, + "learning_rate": 4.112549349464606e-07, + "loss": 1.046, + "step": 2182 + }, + { + "epoch": 8.838056680161943, + "grad_norm": 6.240254431183654, + "learning_rate": 4.0845313573301736e-07, + "loss": 1.0509, + "step": 2183 + }, + { + "epoch": 8.842105263157894, + "grad_norm": 6.003677827425577, + "learning_rate": 4.05660506739875e-07, + "loss": 1.1568, + "step": 2184 + }, + { + "epoch": 8.846153846153847, + "grad_norm": 6.079105511128487, + "learning_rate": 4.0287705354446147e-07, + "loss": 0.9318, + "step": 2185 + }, + { + "epoch": 8.850202429149798, + "grad_norm": 
7.515196540628712, + "learning_rate": 4.001027817058789e-07, + "loss": 1.1372, + "step": 2186 + }, + { + "epoch": 8.854251012145749, + "grad_norm": 7.448967050580141, + "learning_rate": 3.973376967648934e-07, + "loss": 1.1666, + "step": 2187 + }, + { + "epoch": 8.8582995951417, + "grad_norm": 5.169545843583342, + "learning_rate": 3.945818042439226e-07, + "loss": 1.126, + "step": 2188 + }, + { + "epoch": 8.862348178137651, + "grad_norm": 6.900120551493389, + "learning_rate": 3.9183510964702463e-07, + "loss": 1.1207, + "step": 2189 + }, + { + "epoch": 8.866396761133604, + "grad_norm": 6.665292349596718, + "learning_rate": 3.890976184598866e-07, + "loss": 1.0898, + "step": 2190 + }, + { + "epoch": 8.870445344129555, + "grad_norm": 8.446843694582483, + "learning_rate": 3.863693361498161e-07, + "loss": 1.4988, + "step": 2191 + }, + { + "epoch": 8.874493927125506, + "grad_norm": 6.954595155730788, + "learning_rate": 3.836502681657289e-07, + "loss": 1.4457, + "step": 2192 + }, + { + "epoch": 8.878542510121457, + "grad_norm": 7.3364795797424405, + "learning_rate": 3.809404199381378e-07, + "loss": 1.2321, + "step": 2193 + }, + { + "epoch": 8.882591093117409, + "grad_norm": 6.790035951291051, + "learning_rate": 3.7823979687914125e-07, + "loss": 1.1646, + "step": 2194 + }, + { + "epoch": 8.88663967611336, + "grad_norm": 6.034067768687113, + "learning_rate": 3.755484043824131e-07, + "loss": 0.8228, + "step": 2195 + }, + { + "epoch": 8.89068825910931, + "grad_norm": 5.919062923007496, + "learning_rate": 3.728662478231926e-07, + "loss": 1.3459, + "step": 2196 + }, + { + "epoch": 8.894736842105264, + "grad_norm": 5.697673544149653, + "learning_rate": 3.7019333255827404e-07, + "loss": 1.3481, + "step": 2197 + }, + { + "epoch": 8.898785425101215, + "grad_norm": 4.97146657305071, + "learning_rate": 3.675296639259912e-07, + "loss": 1.1434, + "step": 2198 + }, + { + "epoch": 8.902834008097166, + "grad_norm": 5.924967675956248, + "learning_rate": 3.6487524724621526e-07, + "loss": 1.1156, + "step": 2199 + }, + { + "epoch": 8.906882591093117, + "grad_norm": 6.321158341478768, + "learning_rate": 3.6223008782033773e-07, + "loss": 1.1401, + "step": 2200 + }, + { + "epoch": 8.910931174089068, + "grad_norm": 5.228841709292987, + "learning_rate": 3.595941909312595e-07, + "loss": 0.9237, + "step": 2201 + }, + { + "epoch": 8.914979757085021, + "grad_norm": 5.870938886560629, + "learning_rate": 3.569675618433849e-07, + "loss": 0.9947, + "step": 2202 + }, + { + "epoch": 8.919028340080972, + "grad_norm": 6.134128784345386, + "learning_rate": 3.543502058026071e-07, + "loss": 0.9978, + "step": 2203 + }, + { + "epoch": 8.923076923076923, + "grad_norm": 4.8433258387597204, + "learning_rate": 3.517421280363004e-07, + "loss": 1.324, + "step": 2204 + }, + { + "epoch": 8.927125506072874, + "grad_norm": 6.975094443340041, + "learning_rate": 3.49143333753309e-07, + "loss": 1.1632, + "step": 2205 + }, + { + "epoch": 8.931174089068826, + "grad_norm": 6.7182527407351875, + "learning_rate": 3.4655382814393346e-07, + "loss": 1.1421, + "step": 2206 + }, + { + "epoch": 8.935222672064777, + "grad_norm": 6.849744331194626, + "learning_rate": 3.439736163799251e-07, + "loss": 0.9506, + "step": 2207 + }, + { + "epoch": 8.939271255060728, + "grad_norm": 6.491862758103337, + "learning_rate": 3.4140270361447405e-07, + "loss": 0.9707, + "step": 2208 + }, + { + "epoch": 8.94331983805668, + "grad_norm": 7.477773915394873, + "learning_rate": 3.388410949821969e-07, + "loss": 1.2587, + "step": 2209 + }, + { + "epoch": 8.947368421052632, + "grad_norm": 
5.85145672480451, + "learning_rate": 3.362887955991301e-07, + "loss": 0.9956, + "step": 2210 + }, + { + "epoch": 8.951417004048583, + "grad_norm": 7.534940191759425, + "learning_rate": 3.337458105627145e-07, + "loss": 0.8958, + "step": 2211 + }, + { + "epoch": 8.955465587044534, + "grad_norm": 6.907958693442255, + "learning_rate": 3.3121214495179187e-07, + "loss": 1.2205, + "step": 2212 + }, + { + "epoch": 8.959514170040485, + "grad_norm": 6.615135062562839, + "learning_rate": 3.2868780382658895e-07, + "loss": 1.306, + "step": 2213 + }, + { + "epoch": 8.963562753036438, + "grad_norm": 5.859430885728825, + "learning_rate": 3.261727922287111e-07, + "loss": 1.237, + "step": 2214 + }, + { + "epoch": 8.96761133603239, + "grad_norm": 7.375138881839548, + "learning_rate": 3.236671151811305e-07, + "loss": 1.2228, + "step": 2215 + }, + { + "epoch": 8.97165991902834, + "grad_norm": 7.316291202886882, + "learning_rate": 3.2117077768817395e-07, + "loss": 1.1567, + "step": 2216 + }, + { + "epoch": 8.975708502024291, + "grad_norm": 5.4215558137774424, + "learning_rate": 3.1868378473551953e-07, + "loss": 1.2206, + "step": 2217 + }, + { + "epoch": 8.979757085020243, + "grad_norm": 6.8474837571580975, + "learning_rate": 3.16206141290179e-07, + "loss": 1.2698, + "step": 2218 + }, + { + "epoch": 8.983805668016194, + "grad_norm": 6.26046466508966, + "learning_rate": 3.1373785230049356e-07, + "loss": 1.3288, + "step": 2219 + }, + { + "epoch": 8.987854251012145, + "grad_norm": 6.114392604475578, + "learning_rate": 3.1127892269612103e-07, + "loss": 1.1248, + "step": 2220 + }, + { + "epoch": 8.991902834008098, + "grad_norm": 5.409545428136062, + "learning_rate": 3.0882935738802467e-07, + "loss": 1.1403, + "step": 2221 + }, + { + "epoch": 8.995951417004049, + "grad_norm": 6.090885634530899, + "learning_rate": 3.0638916126846885e-07, + "loss": 1.4643, + "step": 2222 + }, + { + "epoch": 9.0, + "grad_norm": 6.00173008385886, + "learning_rate": 3.039583392110046e-07, + "loss": 1.163, + "step": 2223 + }, + { + "epoch": 9.004048582995951, + "grad_norm": 5.810667276177312, + "learning_rate": 3.015368960704584e-07, + "loss": 1.192, + "step": 2224 + }, + { + "epoch": 9.008097165991902, + "grad_norm": 8.416776164216913, + "learning_rate": 2.991248366829291e-07, + "loss": 1.3682, + "step": 2225 + }, + { + "epoch": 9.012145748987853, + "grad_norm": 7.726907247606117, + "learning_rate": 2.9672216586577317e-07, + "loss": 1.1359, + "step": 2226 + }, + { + "epoch": 9.016194331983806, + "grad_norm": 8.263363535299403, + "learning_rate": 2.9432888841759434e-07, + "loss": 1.3759, + "step": 2227 + }, + { + "epoch": 9.020242914979757, + "grad_norm": 7.120837123754598, + "learning_rate": 2.91945009118238e-07, + "loss": 1.5184, + "step": 2228 + }, + { + "epoch": 9.024291497975709, + "grad_norm": 7.716424876152555, + "learning_rate": 2.8957053272877957e-07, + "loss": 1.4498, + "step": 2229 + }, + { + "epoch": 9.02834008097166, + "grad_norm": 6.703211372735481, + "learning_rate": 2.8720546399151395e-07, + "loss": 1.1665, + "step": 2230 + }, + { + "epoch": 9.03238866396761, + "grad_norm": 8.03492140351389, + "learning_rate": 2.848498076299483e-07, + "loss": 1.1768, + "step": 2231 + }, + { + "epoch": 9.036437246963562, + "grad_norm": 7.280365321268138, + "learning_rate": 2.8250356834878924e-07, + "loss": 1.3754, + "step": 2232 + }, + { + "epoch": 9.040485829959515, + "grad_norm": 7.806791539178109, + "learning_rate": 2.801667508339384e-07, + "loss": 1.1804, + "step": 2233 + }, + { + "epoch": 9.044534412955466, + "grad_norm": 
7.769785235769093, + "learning_rate": 2.7783935975247867e-07, + "loss": 1.3509, + "step": 2234 + }, + { + "epoch": 9.048582995951417, + "grad_norm": 7.356545707849527, + "learning_rate": 2.7552139975266677e-07, + "loss": 1.1764, + "step": 2235 + }, + { + "epoch": 9.052631578947368, + "grad_norm": 8.282177365997146, + "learning_rate": 2.732128754639246e-07, + "loss": 1.2262, + "step": 2236 + }, + { + "epoch": 9.05668016194332, + "grad_norm": 7.26327016277894, + "learning_rate": 2.7091379149682683e-07, + "loss": 1.0832, + "step": 2237 + }, + { + "epoch": 9.06072874493927, + "grad_norm": 7.62051448590103, + "learning_rate": 2.68624152443096e-07, + "loss": 1.2281, + "step": 2238 + }, + { + "epoch": 9.064777327935223, + "grad_norm": 8.277202550605427, + "learning_rate": 2.6634396287559094e-07, + "loss": 1.1544, + "step": 2239 + }, + { + "epoch": 9.068825910931174, + "grad_norm": 8.275018851850179, + "learning_rate": 2.6407322734829763e-07, + "loss": 1.3331, + "step": 2240 + }, + { + "epoch": 9.072874493927126, + "grad_norm": 7.051944979723943, + "learning_rate": 2.6181195039632123e-07, + "loss": 1.2182, + "step": 2241 + }, + { + "epoch": 9.076923076923077, + "grad_norm": 8.156861245995605, + "learning_rate": 2.5956013653587465e-07, + "loss": 1.1883, + "step": 2242 + }, + { + "epoch": 9.080971659919028, + "grad_norm": 6.8935448308674925, + "learning_rate": 2.573177902642726e-07, + "loss": 1.3277, + "step": 2243 + }, + { + "epoch": 9.085020242914979, + "grad_norm": 6.261215775969084, + "learning_rate": 2.5508491605992003e-07, + "loss": 1.2689, + "step": 2244 + }, + { + "epoch": 9.089068825910932, + "grad_norm": 7.558850743447256, + "learning_rate": 2.528615183823058e-07, + "loss": 1.2173, + "step": 2245 + }, + { + "epoch": 9.093117408906883, + "grad_norm": 7.25014149962062, + "learning_rate": 2.506476016719922e-07, + "loss": 1.3017, + "step": 2246 + }, + { + "epoch": 9.097165991902834, + "grad_norm": 6.612210773327197, + "learning_rate": 2.4844317035060407e-07, + "loss": 1.0426, + "step": 2247 + }, + { + "epoch": 9.101214574898785, + "grad_norm": 7.546815676413145, + "learning_rate": 2.462482288208234e-07, + "loss": 1.2441, + "step": 2248 + }, + { + "epoch": 9.105263157894736, + "grad_norm": 7.807941903755463, + "learning_rate": 2.440627814663804e-07, + "loss": 1.1408, + "step": 2249 + }, + { + "epoch": 9.109311740890687, + "grad_norm": 7.10859572976939, + "learning_rate": 2.4188683265204125e-07, + "loss": 1.1815, + "step": 2250 + }, + { + "epoch": 9.11336032388664, + "grad_norm": 8.441440064069806, + "learning_rate": 2.397203867236031e-07, + "loss": 1.1018, + "step": 2251 + }, + { + "epoch": 9.117408906882591, + "grad_norm": 8.011527442183498, + "learning_rate": 2.3756344800788421e-07, + "loss": 1.0407, + "step": 2252 + }, + { + "epoch": 9.121457489878543, + "grad_norm": 7.2749461666868465, + "learning_rate": 2.354160208127143e-07, + "loss": 1.1121, + "step": 2253 + }, + { + "epoch": 9.125506072874494, + "grad_norm": 7.0804097957451235, + "learning_rate": 2.3327810942692653e-07, + "loss": 1.1386, + "step": 2254 + }, + { + "epoch": 9.129554655870445, + "grad_norm": 8.144395859046373, + "learning_rate": 2.3114971812034981e-07, + "loss": 0.9948, + "step": 2255 + }, + { + "epoch": 9.133603238866396, + "grad_norm": 6.168572056202311, + "learning_rate": 2.290308511437994e-07, + "loss": 1.3591, + "step": 2256 + }, + { + "epoch": 9.137651821862349, + "grad_norm": 7.533575763657583, + "learning_rate": 2.2692151272906916e-07, + "loss": 1.1552, + "step": 2257 + }, + { + "epoch": 9.1417004048583, + 
"grad_norm": 6.008485334299444, + "learning_rate": 2.2482170708892083e-07, + "loss": 0.9343, + "step": 2258 + }, + { + "epoch": 9.145748987854251, + "grad_norm": 7.964276904785252, + "learning_rate": 2.2273143841707922e-07, + "loss": 1.2456, + "step": 2259 + }, + { + "epoch": 9.149797570850202, + "grad_norm": 6.9266949606644035, + "learning_rate": 2.2065071088822055e-07, + "loss": 1.0812, + "step": 2260 + }, + { + "epoch": 9.153846153846153, + "grad_norm": 6.230772654822159, + "learning_rate": 2.1857952865796616e-07, + "loss": 1.3311, + "step": 2261 + }, + { + "epoch": 9.157894736842104, + "grad_norm": 7.819811201692324, + "learning_rate": 2.1651789586287442e-07, + "loss": 1.3046, + "step": 2262 + }, + { + "epoch": 9.161943319838057, + "grad_norm": 5.846834886948424, + "learning_rate": 2.1446581662042943e-07, + "loss": 1.1448, + "step": 2263 + }, + { + "epoch": 9.165991902834008, + "grad_norm": 6.59622872926416, + "learning_rate": 2.124232950290367e-07, + "loss": 1.2402, + "step": 2264 + }, + { + "epoch": 9.17004048582996, + "grad_norm": 5.697102047320491, + "learning_rate": 2.1039033516801255e-07, + "loss": 1.8377, + "step": 2265 + }, + { + "epoch": 9.17408906882591, + "grad_norm": 6.9465123167698755, + "learning_rate": 2.0836694109757748e-07, + "loss": 1.0402, + "step": 2266 + }, + { + "epoch": 9.178137651821862, + "grad_norm": 8.09589282023309, + "learning_rate": 2.0635311685884675e-07, + "loss": 1.2518, + "step": 2267 + }, + { + "epoch": 9.182186234817813, + "grad_norm": 5.634877933410695, + "learning_rate": 2.0434886647382135e-07, + "loss": 1.0571, + "step": 2268 + }, + { + "epoch": 9.186234817813766, + "grad_norm": 7.37849880722757, + "learning_rate": 2.0235419394538324e-07, + "loss": 1.2413, + "step": 2269 + }, + { + "epoch": 9.190283400809717, + "grad_norm": 6.864858377563145, + "learning_rate": 2.0036910325728521e-07, + "loss": 1.2594, + "step": 2270 + }, + { + "epoch": 9.194331983805668, + "grad_norm": 7.401494019521915, + "learning_rate": 1.9839359837414308e-07, + "loss": 1.4279, + "step": 2271 + }, + { + "epoch": 9.19838056680162, + "grad_norm": 5.926231403832184, + "learning_rate": 1.9642768324142803e-07, + "loss": 1.493, + "step": 2272 + }, + { + "epoch": 9.20242914979757, + "grad_norm": 6.068250986585262, + "learning_rate": 1.9447136178545766e-07, + "loss": 1.1961, + "step": 2273 + }, + { + "epoch": 9.206477732793521, + "grad_norm": 5.98199665934451, + "learning_rate": 1.9252463791339048e-07, + "loss": 1.5216, + "step": 2274 + }, + { + "epoch": 9.210526315789474, + "grad_norm": 6.111430931522919, + "learning_rate": 1.9058751551321642e-07, + "loss": 1.5102, + "step": 2275 + }, + { + "epoch": 9.214574898785425, + "grad_norm": 6.500698071562023, + "learning_rate": 1.8865999845374794e-07, + "loss": 1.2951, + "step": 2276 + }, + { + "epoch": 9.218623481781377, + "grad_norm": 6.682267491506086, + "learning_rate": 1.8674209058461624e-07, + "loss": 1.1742, + "step": 2277 + }, + { + "epoch": 9.222672064777328, + "grad_norm": 6.627947152950901, + "learning_rate": 1.8483379573625948e-07, + "loss": 1.1273, + "step": 2278 + }, + { + "epoch": 9.226720647773279, + "grad_norm": 6.274948615311735, + "learning_rate": 1.8293511771991624e-07, + "loss": 1.3395, + "step": 2279 + }, + { + "epoch": 9.23076923076923, + "grad_norm": 6.379156536924719, + "learning_rate": 1.8104606032761985e-07, + "loss": 1.1154, + "step": 2280 + }, + { + "epoch": 9.234817813765183, + "grad_norm": 7.295750767606294, + "learning_rate": 1.7916662733218848e-07, + "loss": 1.3337, + "step": 2281 + }, + { + "epoch": 
9.238866396761134, + "grad_norm": 6.984077825794193, + "learning_rate": 1.7729682248721848e-07, + "loss": 1.2648, + "step": 2282 + }, + { + "epoch": 9.242914979757085, + "grad_norm": 6.47007551460488, + "learning_rate": 1.7543664952707817e-07, + "loss": 1.3006, + "step": 2283 + }, + { + "epoch": 9.246963562753036, + "grad_norm": 5.809021360652588, + "learning_rate": 1.7358611216689692e-07, + "loss": 1.1748, + "step": 2284 + }, + { + "epoch": 9.251012145748987, + "grad_norm": 6.188595429945637, + "learning_rate": 1.7174521410256162e-07, + "loss": 1.2565, + "step": 2285 + }, + { + "epoch": 9.255060728744938, + "grad_norm": 6.458346476599245, + "learning_rate": 1.6991395901070685e-07, + "loss": 1.196, + "step": 2286 + }, + { + "epoch": 9.259109311740891, + "grad_norm": 5.698951444763946, + "learning_rate": 1.6809235054870865e-07, + "loss": 1.1668, + "step": 2287 + }, + { + "epoch": 9.263157894736842, + "grad_norm": 6.160438664948591, + "learning_rate": 1.6628039235467686e-07, + "loss": 0.9512, + "step": 2288 + }, + { + "epoch": 9.267206477732794, + "grad_norm": 5.767381827035687, + "learning_rate": 1.6447808804744668e-07, + "loss": 1.251, + "step": 2289 + }, + { + "epoch": 9.271255060728745, + "grad_norm": 6.25287181158909, + "learning_rate": 1.6268544122657437e-07, + "loss": 1.2493, + "step": 2290 + }, + { + "epoch": 9.275303643724696, + "grad_norm": 5.97072781413541, + "learning_rate": 1.6090245547232707e-07, + "loss": 1.3238, + "step": 2291 + }, + { + "epoch": 9.279352226720647, + "grad_norm": 7.620403877713194, + "learning_rate": 1.5912913434567746e-07, + "loss": 1.1986, + "step": 2292 + }, + { + "epoch": 9.2834008097166, + "grad_norm": 6.770502426251539, + "learning_rate": 1.5736548138829632e-07, + "loss": 0.9094, + "step": 2293 + }, + { + "epoch": 9.287449392712551, + "grad_norm": 7.538773236194581, + "learning_rate": 1.5561150012254446e-07, + "loss": 1.0554, + "step": 2294 + }, + { + "epoch": 9.291497975708502, + "grad_norm": 6.916240771142258, + "learning_rate": 1.5386719405146633e-07, + "loss": 0.9282, + "step": 2295 + }, + { + "epoch": 9.295546558704453, + "grad_norm": 7.325465270488264, + "learning_rate": 1.5213256665878362e-07, + "loss": 1.1336, + "step": 2296 + }, + { + "epoch": 9.299595141700404, + "grad_norm": 6.427562714147759, + "learning_rate": 1.5040762140888843e-07, + "loss": 1.3822, + "step": 2297 + }, + { + "epoch": 9.303643724696355, + "grad_norm": 7.247038384844516, + "learning_rate": 1.4869236174683443e-07, + "loss": 0.9789, + "step": 2298 + }, + { + "epoch": 9.307692307692308, + "grad_norm": 6.708599062077499, + "learning_rate": 1.4698679109833192e-07, + "loss": 1.034, + "step": 2299 + }, + { + "epoch": 9.31174089068826, + "grad_norm": 6.139085096565993, + "learning_rate": 1.4529091286973994e-07, + "loss": 1.0888, + "step": 2300 + }, + { + "epoch": 9.31578947368421, + "grad_norm": 8.100679017208485, + "learning_rate": 1.4360473044806033e-07, + "loss": 0.9574, + "step": 2301 + }, + { + "epoch": 9.319838056680162, + "grad_norm": 6.847505127280567, + "learning_rate": 1.419282472009309e-07, + "loss": 0.9404, + "step": 2302 + }, + { + "epoch": 9.323886639676113, + "grad_norm": 6.82409548432895, + "learning_rate": 1.402614664766172e-07, + "loss": 0.9955, + "step": 2303 + }, + { + "epoch": 9.327935222672064, + "grad_norm": 6.583985934044893, + "learning_rate": 1.3860439160400808e-07, + "loss": 1.0107, + "step": 2304 + }, + { + "epoch": 9.331983805668017, + "grad_norm": 6.894705234751225, + "learning_rate": 1.369570258926062e-07, + "loss": 1.0625, + "step": 2305 + }, + { + 
"epoch": 9.336032388663968, + "grad_norm": 6.246760039026441, + "learning_rate": 1.353193726325247e-07, + "loss": 1.1625, + "step": 2306 + }, + { + "epoch": 9.34008097165992, + "grad_norm": 6.209505844080315, + "learning_rate": 1.3369143509447903e-07, + "loss": 1.0921, + "step": 2307 + }, + { + "epoch": 9.34412955465587, + "grad_norm": 6.524439333514229, + "learning_rate": 1.3207321652977944e-07, + "loss": 0.9869, + "step": 2308 + }, + { + "epoch": 9.348178137651821, + "grad_norm": 7.626966177840634, + "learning_rate": 1.3046472017032685e-07, + "loss": 1.1164, + "step": 2309 + }, + { + "epoch": 9.352226720647772, + "grad_norm": 6.971248132841646, + "learning_rate": 1.288659492286032e-07, + "loss": 1.1228, + "step": 2310 + }, + { + "epoch": 9.356275303643725, + "grad_norm": 6.400589915934499, + "learning_rate": 1.2727690689766814e-07, + "loss": 0.8228, + "step": 2311 + }, + { + "epoch": 9.360323886639677, + "grad_norm": 7.5084173636270295, + "learning_rate": 1.2569759635115086e-07, + "loss": 1.2048, + "step": 2312 + }, + { + "epoch": 9.364372469635628, + "grad_norm": 6.0650715287124894, + "learning_rate": 1.2412802074324548e-07, + "loss": 1.2833, + "step": 2313 + }, + { + "epoch": 9.368421052631579, + "grad_norm": 6.326873844433714, + "learning_rate": 1.2256818320870224e-07, + "loss": 1.1706, + "step": 2314 + }, + { + "epoch": 9.37246963562753, + "grad_norm": 7.9973063918471805, + "learning_rate": 1.210180868628219e-07, + "loss": 1.2185, + "step": 2315 + }, + { + "epoch": 9.376518218623481, + "grad_norm": 6.867236440906254, + "learning_rate": 1.1947773480145198e-07, + "loss": 0.9325, + "step": 2316 + }, + { + "epoch": 9.380566801619434, + "grad_norm": 7.266878939706752, + "learning_rate": 1.179471301009777e-07, + "loss": 1.0759, + "step": 2317 + }, + { + "epoch": 9.384615384615385, + "grad_norm": 6.264264519092, + "learning_rate": 1.1642627581831767e-07, + "loss": 0.8937, + "step": 2318 + }, + { + "epoch": 9.388663967611336, + "grad_norm": 7.548189919698031, + "learning_rate": 1.1491517499091498e-07, + "loss": 0.9544, + "step": 2319 + }, + { + "epoch": 9.392712550607287, + "grad_norm": 6.995200625215076, + "learning_rate": 1.134138306367355e-07, + "loss": 1.1477, + "step": 2320 + }, + { + "epoch": 9.396761133603238, + "grad_norm": 8.233516260798435, + "learning_rate": 1.1192224575425848e-07, + "loss": 0.9354, + "step": 2321 + }, + { + "epoch": 9.40080971659919, + "grad_norm": 6.111526040757406, + "learning_rate": 1.1044042332247152e-07, + "loss": 0.9898, + "step": 2322 + }, + { + "epoch": 9.404858299595142, + "grad_norm": 6.132400329765274, + "learning_rate": 1.089683663008656e-07, + "loss": 1.0196, + "step": 2323 + }, + { + "epoch": 9.408906882591094, + "grad_norm": 7.514103964897205, + "learning_rate": 1.0750607762942622e-07, + "loss": 1.0973, + "step": 2324 + }, + { + "epoch": 9.412955465587045, + "grad_norm": 6.336921580711171, + "learning_rate": 1.0605356022863167e-07, + "loss": 0.9423, + "step": 2325 + }, + { + "epoch": 9.417004048582996, + "grad_norm": 7.452160362644236, + "learning_rate": 1.0461081699944475e-07, + "loss": 1.2153, + "step": 2326 + }, + { + "epoch": 9.421052631578947, + "grad_norm": 7.267471861431043, + "learning_rate": 1.0317785082330555e-07, + "loss": 1.2392, + "step": 2327 + }, + { + "epoch": 9.425101214574898, + "grad_norm": 6.361292365622441, + "learning_rate": 1.0175466456213034e-07, + "loss": 1.2855, + "step": 2328 + }, + { + "epoch": 9.429149797570851, + "grad_norm": 7.344161066683579, + "learning_rate": 1.0034126105830099e-07, + "loss": 1.2406, + "step": 2329 
+ }, + { + "epoch": 9.433198380566802, + "grad_norm": 5.605300392098185, + "learning_rate": 9.89376431346606e-08, + "loss": 1.4292, + "step": 2330 + }, + { + "epoch": 9.437246963562753, + "grad_norm": 6.679019603222645, + "learning_rate": 9.75438135945106e-08, + "loss": 1.2237, + "step": 2331 + }, + { + "epoch": 9.441295546558704, + "grad_norm": 7.5278062818077265, + "learning_rate": 9.615977522160147e-08, + "loss": 1.1524, + "step": 2332 + }, + { + "epoch": 9.445344129554655, + "grad_norm": 5.917194701522402, + "learning_rate": 9.478553078013042e-08, + "loss": 1.4128, + "step": 2333 + }, + { + "epoch": 9.449392712550607, + "grad_norm": 7.670080650819539, + "learning_rate": 9.342108301473308e-08, + "loss": 1.2044, + "step": 2334 + }, + { + "epoch": 9.45344129554656, + "grad_norm": 7.247183570995367, + "learning_rate": 9.206643465047904e-08, + "loss": 1.1982, + "step": 2335 + }, + { + "epoch": 9.45748987854251, + "grad_norm": 7.097006981905352, + "learning_rate": 9.072158839286748e-08, + "loss": 1.5372, + "step": 2336 + }, + { + "epoch": 9.461538461538462, + "grad_norm": 9.659107122936096, + "learning_rate": 8.938654692781989e-08, + "loss": 1.6061, + "step": 2337 + }, + { + "epoch": 9.465587044534413, + "grad_norm": 7.6683985186556916, + "learning_rate": 8.80613129216762e-08, + "loss": 1.3296, + "step": 2338 + }, + { + "epoch": 9.469635627530364, + "grad_norm": 6.895670007789144, + "learning_rate": 8.674588902118919e-08, + "loss": 1.3934, + "step": 2339 + }, + { + "epoch": 9.473684210526315, + "grad_norm": 6.372186547142852, + "learning_rate": 8.544027785351794e-08, + "loss": 1.3763, + "step": 2340 + }, + { + "epoch": 9.477732793522268, + "grad_norm": 8.632382677132219, + "learning_rate": 8.414448202622494e-08, + "loss": 1.3484, + "step": 2341 + }, + { + "epoch": 9.481781376518219, + "grad_norm": 6.30259565246508, + "learning_rate": 8.285850412726837e-08, + "loss": 1.2994, + "step": 2342 + }, + { + "epoch": 9.48582995951417, + "grad_norm": 7.059555129137264, + "learning_rate": 8.15823467249982e-08, + "loss": 1.2402, + "step": 2343 + }, + { + "epoch": 9.489878542510121, + "grad_norm": 9.014694685087083, + "learning_rate": 8.031601236815234e-08, + "loss": 1.2015, + "step": 2344 + }, + { + "epoch": 9.493927125506072, + "grad_norm": 9.570550816859026, + "learning_rate": 7.905950358584768e-08, + "loss": 1.1243, + "step": 2345 + }, + { + "epoch": 9.497975708502024, + "grad_norm": 6.594135266471429, + "learning_rate": 7.781282288757963e-08, + "loss": 1.0819, + "step": 2346 + }, + { + "epoch": 9.502024291497976, + "grad_norm": 7.353743236682782, + "learning_rate": 7.657597276321427e-08, + "loss": 0.896, + "step": 2347 + }, + { + "epoch": 9.506072874493928, + "grad_norm": 8.664718298165978, + "learning_rate": 7.534895568298395e-08, + "loss": 1.2481, + "step": 2348 + }, + { + "epoch": 9.510121457489879, + "grad_norm": 7.984423053875158, + "learning_rate": 7.413177409748284e-08, + "loss": 1.1753, + "step": 2349 + }, + { + "epoch": 9.51417004048583, + "grad_norm": 6.100693202631448, + "learning_rate": 7.292443043766085e-08, + "loss": 1.1947, + "step": 2350 + }, + { + "epoch": 9.518218623481781, + "grad_norm": 7.891833225357146, + "learning_rate": 7.172692711482022e-08, + "loss": 1.2041, + "step": 2351 + }, + { + "epoch": 9.522267206477732, + "grad_norm": 8.957858980675372, + "learning_rate": 7.053926652061116e-08, + "loss": 1.1271, + "step": 2352 + }, + { + "epoch": 9.526315789473685, + "grad_norm": 8.198107387088085, + "learning_rate": 6.936145102702407e-08, + "loss": 0.9474, + "step": 2353 + }, + { + 
"epoch": 9.530364372469636, + "grad_norm": 7.311121090177834, + "learning_rate": 6.819348298638839e-08, + "loss": 1.0489, + "step": 2354 + }, + { + "epoch": 9.534412955465587, + "grad_norm": 8.543656183569171, + "learning_rate": 6.703536473136486e-08, + "loss": 1.0637, + "step": 2355 + }, + { + "epoch": 9.538461538461538, + "grad_norm": 8.23797152083669, + "learning_rate": 6.588709857494324e-08, + "loss": 1.3686, + "step": 2356 + }, + { + "epoch": 9.54251012145749, + "grad_norm": 7.299808933974355, + "learning_rate": 6.474868681043578e-08, + "loss": 1.3526, + "step": 2357 + }, + { + "epoch": 9.54655870445344, + "grad_norm": 18.439147602111802, + "learning_rate": 6.36201317114754e-08, + "loss": 1.9396, + "step": 2358 + }, + { + "epoch": 9.550607287449393, + "grad_norm": 12.741322699860357, + "learning_rate": 6.250143553200694e-08, + "loss": 1.622, + "step": 2359 + }, + { + "epoch": 9.554655870445345, + "grad_norm": 13.74018979539314, + "learning_rate": 6.13926005062876e-08, + "loss": 2.0152, + "step": 2360 + }, + { + "epoch": 9.558704453441296, + "grad_norm": 8.35774310624565, + "learning_rate": 6.029362884887757e-08, + "loss": 1.1873, + "step": 2361 + }, + { + "epoch": 9.562753036437247, + "grad_norm": 7.0374249328430425, + "learning_rate": 5.920452275463895e-08, + "loss": 1.0601, + "step": 2362 + }, + { + "epoch": 9.566801619433198, + "grad_norm": 7.424773261082573, + "learning_rate": 5.8125284398730666e-08, + "loss": 1.4362, + "step": 2363 + }, + { + "epoch": 9.570850202429149, + "grad_norm": 7.8574653437082995, + "learning_rate": 5.705591593660353e-08, + "loss": 1.1286, + "step": 2364 + }, + { + "epoch": 9.574898785425102, + "grad_norm": 7.670640887580393, + "learning_rate": 5.5996419503996924e-08, + "loss": 1.2739, + "step": 2365 + }, + { + "epoch": 9.578947368421053, + "grad_norm": 6.048412628961693, + "learning_rate": 5.4946797216931524e-08, + "loss": 1.1107, + "step": 2366 + }, + { + "epoch": 9.582995951417004, + "grad_norm": 7.759659089797845, + "learning_rate": 5.390705117171047e-08, + "loss": 1.1965, + "step": 2367 + }, + { + "epoch": 9.587044534412955, + "grad_norm": 8.079375188718355, + "learning_rate": 5.2877183444909885e-08, + "loss": 1.2208, + "step": 2368 + }, + { + "epoch": 9.591093117408906, + "grad_norm": 8.586695205665634, + "learning_rate": 5.185719609337836e-08, + "loss": 1.2005, + "step": 2369 + }, + { + "epoch": 9.595141700404858, + "grad_norm": 7.467079614176582, + "learning_rate": 5.084709115423081e-08, + "loss": 0.9196, + "step": 2370 + }, + { + "epoch": 9.59919028340081, + "grad_norm": 6.378703085070227, + "learning_rate": 4.9846870644844616e-08, + "loss": 1.0393, + "step": 2371 + }, + { + "epoch": 9.603238866396762, + "grad_norm": 6.619001766419402, + "learning_rate": 4.885653656285627e-08, + "loss": 1.1261, + "step": 2372 + }, + { + "epoch": 9.607287449392713, + "grad_norm": 7.375768177311922, + "learning_rate": 4.7876090886158074e-08, + "loss": 0.8888, + "step": 2373 + }, + { + "epoch": 9.611336032388664, + "grad_norm": 7.765928006199822, + "learning_rate": 4.6905535572892015e-08, + "loss": 1.1768, + "step": 2374 + }, + { + "epoch": 9.615384615384615, + "grad_norm": 7.136769572282673, + "learning_rate": 4.5944872561448084e-08, + "loss": 1.2218, + "step": 2375 + }, + { + "epoch": 9.619433198380566, + "grad_norm": 8.252293420134516, + "learning_rate": 4.499410377045765e-08, + "loss": 0.9572, + "step": 2376 + }, + { + "epoch": 9.623481781376519, + "grad_norm": 9.183057155873392, + "learning_rate": 4.4053231098794e-08, + "loss": 1.3123, + "step": 2377 + }, + { + 
"epoch": 9.62753036437247, + "grad_norm": 7.870175412604239, + "learning_rate": 4.3122256425563444e-08, + "loss": 1.1711, + "step": 2378 + }, + { + "epoch": 9.631578947368421, + "grad_norm": 7.805673333471194, + "learning_rate": 4.220118161010589e-08, + "loss": 1.4233, + "step": 2379 + }, + { + "epoch": 9.635627530364372, + "grad_norm": 6.513022878132265, + "learning_rate": 4.129000849198872e-08, + "loss": 1.0134, + "step": 2380 + }, + { + "epoch": 9.639676113360323, + "grad_norm": 7.767006697052738, + "learning_rate": 4.038873889100237e-08, + "loss": 1.3095, + "step": 2381 + }, + { + "epoch": 9.643724696356275, + "grad_norm": 6.987879134939129, + "learning_rate": 3.94973746071603e-08, + "loss": 1.196, + "step": 2382 + }, + { + "epoch": 9.647773279352228, + "grad_norm": 7.371753491139924, + "learning_rate": 3.861591742069071e-08, + "loss": 1.2979, + "step": 2383 + }, + { + "epoch": 9.651821862348179, + "grad_norm": 6.3171055856339065, + "learning_rate": 3.77443690920376e-08, + "loss": 1.09, + "step": 2384 + }, + { + "epoch": 9.65587044534413, + "grad_norm": 7.731889028234551, + "learning_rate": 3.688273136185416e-08, + "loss": 1.2847, + "step": 2385 + }, + { + "epoch": 9.65991902834008, + "grad_norm": 7.905740698031348, + "learning_rate": 3.60310059509994e-08, + "loss": 1.0873, + "step": 2386 + }, + { + "epoch": 9.663967611336032, + "grad_norm": 6.39452809376175, + "learning_rate": 3.518919456053649e-08, + "loss": 1.2574, + "step": 2387 + }, + { + "epoch": 9.668016194331983, + "grad_norm": 6.930382161581412, + "learning_rate": 3.4357298871727786e-08, + "loss": 1.2518, + "step": 2388 + }, + { + "epoch": 9.672064777327936, + "grad_norm": 7.172023867992301, + "learning_rate": 3.353532054603203e-08, + "loss": 1.4336, + "step": 2389 + }, + { + "epoch": 9.676113360323887, + "grad_norm": 6.17036083047538, + "learning_rate": 3.2723261225102164e-08, + "loss": 1.4305, + "step": 2390 + }, + { + "epoch": 9.680161943319838, + "grad_norm": 7.2582494975076814, + "learning_rate": 3.192112253077973e-08, + "loss": 1.2413, + "step": 2391 + }, + { + "epoch": 9.68421052631579, + "grad_norm": 7.1326480428223125, + "learning_rate": 3.1128906065092666e-08, + "loss": 1.1222, + "step": 2392 + }, + { + "epoch": 9.68825910931174, + "grad_norm": 6.710146846106117, + "learning_rate": 3.034661341025258e-08, + "loss": 1.2007, + "step": 2393 + }, + { + "epoch": 9.692307692307692, + "grad_norm": 7.167152573789711, + "learning_rate": 2.957424612865245e-08, + "loss": 1.2133, + "step": 2394 + }, + { + "epoch": 9.696356275303645, + "grad_norm": 6.4961462238366074, + "learning_rate": 2.8811805762860578e-08, + "loss": 1.1056, + "step": 2395 + }, + { + "epoch": 9.700404858299596, + "grad_norm": 8.566659166804367, + "learning_rate": 2.8059293835620006e-08, + "loss": 1.5372, + "step": 2396 + }, + { + "epoch": 9.704453441295547, + "grad_norm": 7.003050839114324, + "learning_rate": 2.731671184984519e-08, + "loss": 1.3512, + "step": 2397 + }, + { + "epoch": 9.708502024291498, + "grad_norm": 7.022553930422816, + "learning_rate": 2.6584061288617568e-08, + "loss": 0.9806, + "step": 2398 + }, + { + "epoch": 9.712550607287449, + "grad_norm": 6.5247192423891365, + "learning_rate": 2.5861343615184997e-08, + "loss": 1.1002, + "step": 2399 + }, + { + "epoch": 9.7165991902834, + "grad_norm": 6.583205928052207, + "learning_rate": 2.514856027295509e-08, + "loss": 1.2214, + "step": 2400 + }, + { + "epoch": 9.720647773279353, + "grad_norm": 8.023483107929737, + "learning_rate": 2.4445712685498e-08, + "loss": 0.9652, + "step": 2401 + }, + { + 
"epoch": 9.724696356275304, + "grad_norm": 7.31774753878339, + "learning_rate": 2.3752802256536423e-08, + "loss": 1.2102, + "step": 2402 + }, + { + "epoch": 9.728744939271255, + "grad_norm": 6.611110640043861, + "learning_rate": 2.3069830369949474e-08, + "loss": 1.3616, + "step": 2403 + }, + { + "epoch": 9.732793522267206, + "grad_norm": 6.752098669828748, + "learning_rate": 2.239679838976605e-08, + "loss": 1.051, + "step": 2404 + }, + { + "epoch": 9.736842105263158, + "grad_norm": 6.8895429841274884, + "learning_rate": 2.173370766016314e-08, + "loss": 1.1644, + "step": 2405 + }, + { + "epoch": 9.740890688259109, + "grad_norm": 12.580788775503475, + "learning_rate": 2.1080559505462504e-08, + "loss": 1.7934, + "step": 2406 + }, + { + "epoch": 9.744939271255062, + "grad_norm": 6.238998597279842, + "learning_rate": 2.043735523013013e-08, + "loss": 1.3521, + "step": 2407 + }, + { + "epoch": 9.748987854251013, + "grad_norm": 7.960224187441654, + "learning_rate": 1.98040961187701e-08, + "loss": 1.1426, + "step": 2408 + }, + { + "epoch": 9.753036437246964, + "grad_norm": 7.433530949559287, + "learning_rate": 1.918078343612628e-08, + "loss": 1.0778, + "step": 2409 + }, + { + "epoch": 9.757085020242915, + "grad_norm": 7.048067805939395, + "learning_rate": 1.85674184270751e-08, + "loss": 1.1728, + "step": 2410 + }, + { + "epoch": 9.761133603238866, + "grad_norm": 7.803352024996819, + "learning_rate": 1.7964002316628316e-08, + "loss": 1.9261, + "step": 2411 + }, + { + "epoch": 9.765182186234817, + "grad_norm": 8.772283334117857, + "learning_rate": 1.73705363099258e-08, + "loss": 1.6529, + "step": 2412 + }, + { + "epoch": 9.76923076923077, + "grad_norm": 8.84391299820265, + "learning_rate": 1.6787021592234998e-08, + "loss": 1.5923, + "step": 2413 + }, + { + "epoch": 9.773279352226721, + "grad_norm": 6.27175422673062, + "learning_rate": 1.6213459328950355e-08, + "loss": 1.1256, + "step": 2414 + }, + { + "epoch": 9.777327935222672, + "grad_norm": 6.32330852422478, + "learning_rate": 1.5649850665587217e-08, + "loss": 1.4239, + "step": 2415 + }, + { + "epoch": 9.781376518218623, + "grad_norm": 5.832326997964023, + "learning_rate": 1.5096196727783508e-08, + "loss": 1.3997, + "step": 2416 + }, + { + "epoch": 9.785425101214575, + "grad_norm": 6.464815391686391, + "learning_rate": 1.4552498621295264e-08, + "loss": 1.1972, + "step": 2417 + }, + { + "epoch": 9.789473684210526, + "grad_norm": 6.243813602787722, + "learning_rate": 1.4018757431992769e-08, + "loss": 1.1337, + "step": 2418 + }, + { + "epoch": 9.793522267206479, + "grad_norm": 7.081278097894398, + "learning_rate": 1.3494974225863322e-08, + "loss": 1.1298, + "step": 2419 + }, + { + "epoch": 9.79757085020243, + "grad_norm": 6.774603048357779, + "learning_rate": 1.2981150049004021e-08, + "loss": 1.1853, + "step": 2420 + }, + { + "epoch": 9.80161943319838, + "grad_norm": 6.425581343136603, + "learning_rate": 1.2477285927622873e-08, + "loss": 1.0838, + "step": 2421 + }, + { + "epoch": 9.805668016194332, + "grad_norm": 6.08994981446175, + "learning_rate": 1.1983382868036019e-08, + "loss": 0.9978, + "step": 2422 + }, + { + "epoch": 9.809716599190283, + "grad_norm": 6.5758131521979095, + "learning_rate": 1.1499441856663296e-08, + "loss": 1.3163, + "step": 2423 + }, + { + "epoch": 9.813765182186234, + "grad_norm": 8.400761856632263, + "learning_rate": 1.102546386003156e-08, + "loss": 1.0923, + "step": 2424 + }, + { + "epoch": 9.817813765182187, + "grad_norm": 6.878983386606489, + "learning_rate": 1.0561449824766367e-08, + "loss": 1.0377, + "step": 2425 + }, + 
{ + "epoch": 9.821862348178138, + "grad_norm": 15.301731593931011, + "learning_rate": 1.0107400677596413e-08, + "loss": 2.149, + "step": 2426 + }, + { + "epoch": 9.82591093117409, + "grad_norm": 25.109744644655137, + "learning_rate": 9.663317325345756e-09, + "loss": 3.0471, + "step": 2427 + }, + { + "epoch": 9.82995951417004, + "grad_norm": 7.260711360623296, + "learning_rate": 9.229200654936599e-09, + "loss": 1.0529, + "step": 2428 + }, + { + "epoch": 9.834008097165992, + "grad_norm": 7.562582026280667, + "learning_rate": 8.805051533384846e-09, + "loss": 1.0334, + "step": 2429 + }, + { + "epoch": 9.838056680161943, + "grad_norm": 6.362275189041613, + "learning_rate": 8.390870807799545e-09, + "loss": 1.0283, + "step": 2430 + }, + { + "epoch": 9.842105263157894, + "grad_norm": 6.219234617919187, + "learning_rate": 7.986659305380672e-09, + "loss": 1.1448, + "step": 2431 + }, + { + "epoch": 9.846153846153847, + "grad_norm": 6.2936591308815215, + "learning_rate": 7.59241783341913e-09, + "loss": 0.9191, + "step": 2432 + }, + { + "epoch": 9.850202429149798, + "grad_norm": 7.885631371578948, + "learning_rate": 7.2081471792911914e-09, + "loss": 1.1249, + "step": 2433 + }, + { + "epoch": 9.854251012145749, + "grad_norm": 7.734452586270721, + "learning_rate": 6.833848110461283e-09, + "loss": 1.1522, + "step": 2434 + }, + { + "epoch": 9.8582995951417, + "grad_norm": 5.289881410244237, + "learning_rate": 6.469521374477539e-09, + "loss": 1.1116, + "step": 2435 + }, + { + "epoch": 9.862348178137651, + "grad_norm": 7.185956641577389, + "learning_rate": 6.115167698972912e-09, + "loss": 1.11, + "step": 2436 + }, + { + "epoch": 9.866396761133604, + "grad_norm": 6.862756557846077, + "learning_rate": 5.770787791661292e-09, + "loss": 1.0761, + "step": 2437 + }, + { + "epoch": 9.870445344129555, + "grad_norm": 8.513662562199336, + "learning_rate": 5.436382340335833e-09, + "loss": 1.4742, + "step": 2438 + }, + { + "epoch": 9.874493927125506, + "grad_norm": 7.096665938831381, + "learning_rate": 5.111952012870624e-09, + "loss": 1.4265, + "step": 2439 + }, + { + "epoch": 9.878542510121457, + "grad_norm": 7.574301958544817, + "learning_rate": 4.797497457216804e-09, + "loss": 1.2196, + "step": 2440 + }, + { + "epoch": 9.882591093117409, + "grad_norm": 6.916168526679012, + "learning_rate": 4.493019301401447e-09, + "loss": 1.1487, + "step": 2441 + }, + { + "epoch": 9.88663967611336, + "grad_norm": 6.12111410450182, + "learning_rate": 4.198518153527009e-09, + "loss": 0.8072, + "step": 2442 + }, + { + "epoch": 9.89068825910931, + "grad_norm": 5.942490477342795, + "learning_rate": 3.9139946017713315e-09, + "loss": 1.3326, + "step": 2443 + }, + { + "epoch": 9.894736842105264, + "grad_norm": 5.858437003056172, + "learning_rate": 3.6394492143820847e-09, + "loss": 1.3361, + "step": 2444 + }, + { + "epoch": 9.898785425101215, + "grad_norm": 5.054070594148292, + "learning_rate": 3.3748825396817675e-09, + "loss": 1.1313, + "step": 2445 + }, + { + "epoch": 9.902834008097166, + "grad_norm": 5.963132718905433, + "learning_rate": 3.120295106060489e-09, + "loss": 1.1024, + "step": 2446 + }, + { + "epoch": 9.906882591093117, + "grad_norm": 6.438564124074531, + "learning_rate": 2.875687421980966e-09, + "loss": 1.1277, + "step": 2447 + }, + { + "epoch": 9.910931174089068, + "grad_norm": 5.278764051706532, + "learning_rate": 2.6410599759713052e-09, + "loss": 0.9121, + "step": 2448 + }, + { + "epoch": 9.914979757085021, + "grad_norm": 5.930913466658232, + "learning_rate": 2.4164132366294444e-09, + "loss": 0.985, + "step": 2449 + }, + { + 
"epoch": 9.919028340080972, + "grad_norm": 6.134886702370438, + "learning_rate": 2.201747652618713e-09, + "loss": 0.9875, + "step": 2450 + }, + { + "epoch": 9.923076923076923, + "grad_norm": 4.919043912938238, + "learning_rate": 1.997063652668385e-09, + "loss": 1.3108, + "step": 2451 + }, + { + "epoch": 9.927125506072874, + "grad_norm": 6.59710619456491, + "learning_rate": 1.8023616455731253e-09, + "loss": 1.1487, + "step": 2452 + }, + { + "epoch": 9.931174089068826, + "grad_norm": 6.7268729919729955, + "learning_rate": 1.6176420201902132e-09, + "loss": 1.1327, + "step": 2453 + }, + { + "epoch": 9.935222672064777, + "grad_norm": 6.801182722691009, + "learning_rate": 1.4429051454412092e-09, + "loss": 0.9427, + "step": 2454 + }, + { + "epoch": 9.939271255060728, + "grad_norm": 6.409116476021893, + "learning_rate": 1.2781513703102877e-09, + "loss": 0.9617, + "step": 2455 + }, + { + "epoch": 9.94331983805668, + "grad_norm": 7.345877062431902, + "learning_rate": 1.1233810238425735e-09, + "loss": 1.2441, + "step": 2456 + }, + { + "epoch": 9.947368421052632, + "grad_norm": 5.764298273469222, + "learning_rate": 9.78594415145806e-10, + "loss": 0.9852, + "step": 2457 + }, + { + "epoch": 9.951417004048583, + "grad_norm": 7.41040208403738, + "learning_rate": 8.437918333864537e-10, + "loss": 0.8827, + "step": 2458 + }, + { + "epoch": 9.955465587044534, + "grad_norm": 6.690719160118015, + "learning_rate": 7.189735477913795e-10, + "loss": 1.207, + "step": 2459 + }, + { + "epoch": 9.959514170040485, + "grad_norm": 6.426110798176273, + "learning_rate": 6.041398076478411e-10, + "loss": 1.2944, + "step": 2460 + }, + { + "epoch": 9.963562753036438, + "grad_norm": 5.67612330623261, + "learning_rate": 4.99290842301825e-10, + "loss": 1.2245, + "step": 2461 + }, + { + "epoch": 9.96761133603239, + "grad_norm": 7.12228691332672, + "learning_rate": 4.0442686115582665e-10, + "loss": 1.2106, + "step": 2462 + }, + { + "epoch": 9.97165991902834, + "grad_norm": 7.171942174513077, + "learning_rate": 3.195480536732909e-10, + "loss": 1.1455, + "step": 2463 + }, + { + "epoch": 9.975708502024291, + "grad_norm": 5.323715551715951, + "learning_rate": 2.446545893730612e-10, + "loss": 1.2116, + "step": 2464 + }, + { + "epoch": 9.979757085020243, + "grad_norm": 6.645046974916862, + "learning_rate": 1.797466178327101e-10, + "loss": 1.2553, + "step": 2465 + }, + { + "epoch": 9.983805668016194, + "grad_norm": 6.179117377649965, + "learning_rate": 1.2482426868520858e-10, + "loss": 1.3211, + "step": 2466 + }, + { + "epoch": 9.987854251012145, + "grad_norm": 5.9598947487901075, + "learning_rate": 7.988765162225687e-11, + "loss": 1.1147, + "step": 2467 + }, + { + "epoch": 9.991902834008098, + "grad_norm": 5.288287586968503, + "learning_rate": 4.4936856390398465e-11, + "loss": 1.1298, + "step": 2468 + }, + { + "epoch": 9.995951417004049, + "grad_norm": 5.809768480823839, + "learning_rate": 1.9971952793240713e-11, + "loss": 1.454, + "step": 2469 + }, + { + "epoch": 10.0, + "grad_norm": 5.8240784074284795, + "learning_rate": 4.992990691454758e-12, + "loss": 1.1531, + "step": 2470 + } + ], + "logging_steps": 1, + "max_steps": 2470, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 1976, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 750948524032000.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} 
diff --git a/checkpoint-2470/training_args.bin b/checkpoint-2470/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..78bb788b48fdaeefa100fcca732cd4ad5de338f1 --- /dev/null +++ b/checkpoint-2470/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db617e3c3ae788b627938f09c1b4708215392619dbc3a2b63a88ab23d37b875b +size 7608 diff --git a/checkpoint-2470/zero_to_fp32.py b/checkpoint-2470/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..0e759146cadd92ddfefab3680146c2bd6a2b5c04 --- /dev/null +++ b/checkpoint-2470/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return 
get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." 
+ ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + 
merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: 
{FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. 
+ """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, 
zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pseudo tensors instead of torch tensors, which is more memory efficient. + Convert a pseudo tensor to a torch tensor by calling ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint.
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.items(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it is no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big.
+ weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # a memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model on cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model``: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough, use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note that once this has been run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+ + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..3c1d04911c269b925af977a3151c9704e990e4d0 --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a65829f8d45598369efc368800ef14b5dbd9f997 --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,2066 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + 
"rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": 
"<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": 
"<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": 
"<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": 
"<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": 
"<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": 
"<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": 
"<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": 
"<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": 
"<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": 
"<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": 
"<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": 
"<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- set date_string = \"26 Jul 2024\" %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message + builtin tools #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if builtin_tools is defined or tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{%- if builtin_tools is defined %}\n {{- \"Tools: \" + builtin_tools | reject('equalto', 'code_interpreter') | join(\", \") + \"\\n\\n\"}}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {%- if builtin_tools is defined and tool_call.name in builtin_tools %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- \"<|python_tag|>\" + tool_call.name + \".call(\" }}\n {%- for arg_name, arg_val in tool_call.arguments | items %}\n {{- arg_name + '=\"' + arg_val + '\"' }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \")\" }}\n {%- else %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {%- endif %}\n {%- if builtin_tools is defined %}\n {#- This means we're in ipython mode #}\n {{- \"<|eom_id|>\" }}\n {%- else %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|finetune_right_pad_id|>", + "padding_side": "right", + "split_special_tokens": false, + "tokenizer_class": "PreTrainedTokenizer" +} diff --git a/train_results.json b/train_results.json new file mode 100644 index 0000000000000000000000000000000000000000..896e55bff219afc5b3341922c117c6239e776b4f --- /dev/null +++ b/train_results.json @@ -0,0 +1,8 @@ +{ + "epoch": 10.0, + "total_flos": 750948524032000.0, + "train_loss": 1.639819175053222, + "train_runtime": 7687.531, + "train_samples_per_second": 2.57, + "train_steps_per_second": 0.321 +} \ No newline at end of file diff --git a/trainer_log.jsonl b/trainer_log.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..028150375533a54bd93e3ef7dd39ce6f50dcc4da --- /dev/null +++ b/trainer_log.jsonl @@ -0,0 +1,2471 @@ +{"current_steps": 1, "total_steps": 2470, "loss": 2.5926, "lr": 0.0, "epoch": 0.004048582995951417, "percentage": 0.04, "elapsed_time": "0:00:06", "remaining_time": "4:30:53"} +{"current_steps": 2, "total_steps": 2470, "loss": 2.9114, "lr": 4.0485829959514176e-08, "epoch": 0.008097165991902834, "percentage": 0.08, "elapsed_time": "0:00:12", "remaining_time": "4:20:59"} +{"current_steps": 3, "total_steps": 2470, "loss": 2.7471, "lr": 8.097165991902835e-08, "epoch": 0.012145748987854251, "percentage": 0.12, "elapsed_time": "0:00:15", "remaining_time": "3:35:39"} +{"current_steps": 4, "total_steps": 2470, "loss": 2.8706, "lr": 
1.2145748987854252e-07, "epoch": 0.016194331983805668, "percentage": 0.16, "elapsed_time": "0:00:18", "remaining_time": "3:13:22"} +{"current_steps": 5, "total_steps": 2470, "loss": 2.9912, "lr": 1.619433198380567e-07, "epoch": 0.020242914979757085, "percentage": 0.2, "elapsed_time": "0:00:21", "remaining_time": "2:59:40"} +{"current_steps": 6, "total_steps": 2470, "loss": 3.0072, "lr": 2.0242914979757086e-07, "epoch": 0.024291497975708502, "percentage": 0.24, "elapsed_time": "0:00:24", "remaining_time": "2:50:31"} +{"current_steps": 7, "total_steps": 2470, "loss": 2.4721, "lr": 2.4291497975708504e-07, "epoch": 0.02834008097165992, "percentage": 0.28, "elapsed_time": "0:00:27", "remaining_time": "2:43:51"} +{"current_steps": 8, "total_steps": 2470, "loss": 2.843, "lr": 2.834008097165992e-07, "epoch": 0.032388663967611336, "percentage": 0.32, "elapsed_time": "0:00:30", "remaining_time": "2:38:51"} +{"current_steps": 9, "total_steps": 2470, "loss": 2.9053, "lr": 3.238866396761134e-07, "epoch": 0.03643724696356275, "percentage": 0.36, "elapsed_time": "0:00:34", "remaining_time": "2:35:19"} +{"current_steps": 10, "total_steps": 2470, "loss": 2.7608, "lr": 3.6437246963562754e-07, "epoch": 0.04048582995951417, "percentage": 0.4, "elapsed_time": "0:00:37", "remaining_time": "2:32:31"} +{"current_steps": 11, "total_steps": 2470, "loss": 2.7074, "lr": 4.048582995951417e-07, "epoch": 0.044534412955465584, "percentage": 0.45, "elapsed_time": "0:00:40", "remaining_time": "2:30:01"} +{"current_steps": 12, "total_steps": 2470, "loss": 2.7846, "lr": 4.453441295546559e-07, "epoch": 0.048582995951417005, "percentage": 0.49, "elapsed_time": "0:00:43", "remaining_time": "2:28:11"} +{"current_steps": 13, "total_steps": 2470, "loss": 3.018, "lr": 4.858299595141701e-07, "epoch": 0.05263157894736842, "percentage": 0.53, "elapsed_time": "0:00:46", "remaining_time": "2:26:36"} +{"current_steps": 14, "total_steps": 2470, "loss": 2.8131, "lr": 5.263157894736843e-07, "epoch": 0.05668016194331984, "percentage": 0.57, "elapsed_time": "0:00:49", "remaining_time": "2:25:00"} +{"current_steps": 15, "total_steps": 2470, "loss": 2.8777, "lr": 5.668016194331984e-07, "epoch": 0.06072874493927125, "percentage": 0.61, "elapsed_time": "0:00:52", "remaining_time": "2:23:38"} +{"current_steps": 16, "total_steps": 2470, "loss": 2.9472, "lr": 6.072874493927125e-07, "epoch": 0.06477732793522267, "percentage": 0.65, "elapsed_time": "0:00:55", "remaining_time": "2:22:30"} +{"current_steps": 17, "total_steps": 2470, "loss": 3.0157, "lr": 6.477732793522268e-07, "epoch": 0.06882591093117409, "percentage": 0.69, "elapsed_time": "0:00:58", "remaining_time": "2:21:28"} +{"current_steps": 18, "total_steps": 2470, "loss": 2.7773, "lr": 6.882591093117409e-07, "epoch": 0.0728744939271255, "percentage": 0.73, "elapsed_time": "0:01:01", "remaining_time": "2:20:32"} +{"current_steps": 19, "total_steps": 2470, "loss": 2.7169, "lr": 7.287449392712551e-07, "epoch": 0.07692307692307693, "percentage": 0.77, "elapsed_time": "0:01:04", "remaining_time": "2:19:39"} +{"current_steps": 20, "total_steps": 2470, "loss": 2.7934, "lr": 7.692307692307694e-07, "epoch": 0.08097165991902834, "percentage": 0.81, "elapsed_time": "0:01:08", "remaining_time": "2:18:50"} +{"current_steps": 21, "total_steps": 2470, "loss": 2.713, "lr": 8.097165991902834e-07, "epoch": 0.08502024291497975, "percentage": 0.85, "elapsed_time": "0:01:11", "remaining_time": "2:18:07"} +{"current_steps": 22, "total_steps": 2470, "loss": 2.8722, "lr": 8.502024291497976e-07, "epoch": 
0.08906882591093117, "percentage": 0.89, "elapsed_time": "0:01:14", "remaining_time": "2:17:38"} +{"current_steps": 23, "total_steps": 2470, "loss": 2.722, "lr": 8.906882591093118e-07, "epoch": 0.0931174089068826, "percentage": 0.93, "elapsed_time": "0:01:17", "remaining_time": "2:17:00"} +{"current_steps": 24, "total_steps": 2470, "loss": 2.5291, "lr": 9.31174089068826e-07, "epoch": 0.09716599190283401, "percentage": 0.97, "elapsed_time": "0:01:20", "remaining_time": "2:16:23"} +{"current_steps": 25, "total_steps": 2470, "loss": 2.7028, "lr": 9.716599190283402e-07, "epoch": 0.10121457489878542, "percentage": 1.01, "elapsed_time": "0:01:23", "remaining_time": "2:15:48"} +{"current_steps": 26, "total_steps": 2470, "loss": 2.7946, "lr": 1.0121457489878542e-06, "epoch": 0.10526315789473684, "percentage": 1.05, "elapsed_time": "0:01:26", "remaining_time": "2:15:16"} +{"current_steps": 27, "total_steps": 2470, "loss": 2.6139, "lr": 1.0526315789473685e-06, "epoch": 0.10931174089068826, "percentage": 1.09, "elapsed_time": "0:01:29", "remaining_time": "2:14:52"} +{"current_steps": 28, "total_steps": 2470, "loss": 2.469, "lr": 1.0931174089068828e-06, "epoch": 0.11336032388663968, "percentage": 1.13, "elapsed_time": "0:01:32", "remaining_time": "2:14:25"} +{"current_steps": 29, "total_steps": 2470, "loss": 2.6452, "lr": 1.133603238866397e-06, "epoch": 0.11740890688259109, "percentage": 1.17, "elapsed_time": "0:01:35", "remaining_time": "2:14:00"} +{"current_steps": 30, "total_steps": 2470, "loss": 2.4396, "lr": 1.174089068825911e-06, "epoch": 0.1214574898785425, "percentage": 1.21, "elapsed_time": "0:01:38", "remaining_time": "2:13:37"} +{"current_steps": 31, "total_steps": 2470, "loss": 2.469, "lr": 1.214574898785425e-06, "epoch": 0.12550607287449392, "percentage": 1.26, "elapsed_time": "0:01:41", "remaining_time": "2:13:21"} +{"current_steps": 32, "total_steps": 2470, "loss": 2.5795, "lr": 1.2550607287449393e-06, "epoch": 0.12955465587044535, "percentage": 1.3, "elapsed_time": "0:01:44", "remaining_time": "2:13:05"} +{"current_steps": 33, "total_steps": 2470, "loss": 2.6768, "lr": 1.2955465587044536e-06, "epoch": 0.13360323886639677, "percentage": 1.34, "elapsed_time": "0:01:47", "remaining_time": "2:12:45"} +{"current_steps": 34, "total_steps": 2470, "loss": 2.8086, "lr": 1.336032388663968e-06, "epoch": 0.13765182186234817, "percentage": 1.38, "elapsed_time": "0:01:50", "remaining_time": "2:12:25"} +{"current_steps": 35, "total_steps": 2470, "loss": 2.3603, "lr": 1.3765182186234818e-06, "epoch": 0.1417004048582996, "percentage": 1.42, "elapsed_time": "0:01:53", "remaining_time": "2:12:05"} +{"current_steps": 36, "total_steps": 2470, "loss": 2.7758, "lr": 1.417004048582996e-06, "epoch": 0.145748987854251, "percentage": 1.46, "elapsed_time": "0:01:56", "remaining_time": "2:11:45"} +{"current_steps": 37, "total_steps": 2470, "loss": 2.7543, "lr": 1.4574898785425101e-06, "epoch": 0.14979757085020243, "percentage": 1.5, "elapsed_time": "0:01:59", "remaining_time": "2:11:27"} +{"current_steps": 38, "total_steps": 2470, "loss": 2.7356, "lr": 1.4979757085020244e-06, "epoch": 0.15384615384615385, "percentage": 1.54, "elapsed_time": "0:02:03", "remaining_time": "2:11:13"} +{"current_steps": 39, "total_steps": 2470, "loss": 3.0218, "lr": 1.5384615384615387e-06, "epoch": 0.15789473684210525, "percentage": 1.58, "elapsed_time": "0:02:06", "remaining_time": "2:10:57"} +{"current_steps": 40, "total_steps": 2470, "loss": 2.6165, "lr": 1.5789473684210526e-06, "epoch": 0.16194331983805668, "percentage": 1.62, 
"elapsed_time": "0:02:09", "remaining_time": "2:10:40"} +{"current_steps": 41, "total_steps": 2470, "loss": 2.6223, "lr": 1.6194331983805669e-06, "epoch": 0.1659919028340081, "percentage": 1.66, "elapsed_time": "0:02:12", "remaining_time": "2:10:25"} +{"current_steps": 42, "total_steps": 2470, "loss": 2.7768, "lr": 1.6599190283400812e-06, "epoch": 0.1700404858299595, "percentage": 1.7, "elapsed_time": "0:02:15", "remaining_time": "2:10:41"} +{"current_steps": 43, "total_steps": 2470, "loss": 2.479, "lr": 1.7004048582995952e-06, "epoch": 0.17408906882591094, "percentage": 1.74, "elapsed_time": "0:02:19", "remaining_time": "2:10:53"} +{"current_steps": 44, "total_steps": 2470, "loss": 2.6842, "lr": 1.7408906882591095e-06, "epoch": 0.17813765182186234, "percentage": 1.78, "elapsed_time": "0:02:22", "remaining_time": "2:10:40"} +{"current_steps": 45, "total_steps": 2470, "loss": 2.3611, "lr": 1.7813765182186236e-06, "epoch": 0.18218623481781376, "percentage": 1.82, "elapsed_time": "0:02:25", "remaining_time": "2:10:27"} +{"current_steps": 46, "total_steps": 2470, "loss": 2.6644, "lr": 1.8218623481781379e-06, "epoch": 0.1862348178137652, "percentage": 1.86, "elapsed_time": "0:02:28", "remaining_time": "2:10:12"} +{"current_steps": 47, "total_steps": 2470, "loss": 2.7313, "lr": 1.862348178137652e-06, "epoch": 0.1902834008097166, "percentage": 1.9, "elapsed_time": "0:02:31", "remaining_time": "2:09:59"} +{"current_steps": 48, "total_steps": 2470, "loss": 2.976, "lr": 1.902834008097166e-06, "epoch": 0.19433198380566802, "percentage": 1.94, "elapsed_time": "0:02:34", "remaining_time": "2:09:45"} +{"current_steps": 49, "total_steps": 2470, "loss": 2.8615, "lr": 1.9433198380566803e-06, "epoch": 0.19838056680161945, "percentage": 1.98, "elapsed_time": "0:02:37", "remaining_time": "2:09:34"} +{"current_steps": 50, "total_steps": 2470, "loss": 2.7385, "lr": 1.9838056680161946e-06, "epoch": 0.20242914979757085, "percentage": 2.02, "elapsed_time": "0:02:40", "remaining_time": "2:09:21"} +{"current_steps": 51, "total_steps": 2470, "loss": 2.7926, "lr": 2.0242914979757085e-06, "epoch": 0.20647773279352227, "percentage": 2.06, "elapsed_time": "0:02:43", "remaining_time": "2:09:09"} +{"current_steps": 52, "total_steps": 2470, "loss": 2.8905, "lr": 2.0647773279352228e-06, "epoch": 0.21052631578947367, "percentage": 2.11, "elapsed_time": "0:02:46", "remaining_time": "2:08:57"} +{"current_steps": 53, "total_steps": 2470, "loss": 2.7044, "lr": 2.105263157894737e-06, "epoch": 0.2145748987854251, "percentage": 2.15, "elapsed_time": "0:02:49", "remaining_time": "2:08:49"} +{"current_steps": 54, "total_steps": 2470, "loss": 2.6044, "lr": 2.1457489878542513e-06, "epoch": 0.21862348178137653, "percentage": 2.19, "elapsed_time": "0:02:52", "remaining_time": "2:08:38"} +{"current_steps": 55, "total_steps": 2470, "loss": 2.7154, "lr": 2.1862348178137656e-06, "epoch": 0.22267206477732793, "percentage": 2.23, "elapsed_time": "0:02:55", "remaining_time": "2:08:28"} +{"current_steps": 56, "total_steps": 2470, "loss": 2.6151, "lr": 2.2267206477732795e-06, "epoch": 0.22672064777327935, "percentage": 2.27, "elapsed_time": "0:02:58", "remaining_time": "2:08:21"} +{"current_steps": 57, "total_steps": 2470, "loss": 2.8561, "lr": 2.267206477732794e-06, "epoch": 0.23076923076923078, "percentage": 2.31, "elapsed_time": "0:03:01", "remaining_time": "2:08:11"} +{"current_steps": 58, "total_steps": 2470, "loss": 2.994, "lr": 2.307692307692308e-06, "epoch": 0.23481781376518218, "percentage": 2.35, "elapsed_time": "0:03:04", 
"remaining_time": "2:08:01"} +{"current_steps": 59, "total_steps": 2470, "loss": 2.9581, "lr": 2.348178137651822e-06, "epoch": 0.2388663967611336, "percentage": 2.39, "elapsed_time": "0:03:07", "remaining_time": "2:07:53"} +{"current_steps": 60, "total_steps": 2470, "loss": 2.9613, "lr": 2.3886639676113362e-06, "epoch": 0.242914979757085, "percentage": 2.43, "elapsed_time": "0:03:10", "remaining_time": "2:07:44"} +{"current_steps": 61, "total_steps": 2470, "loss": 2.7295, "lr": 2.42914979757085e-06, "epoch": 0.24696356275303644, "percentage": 2.47, "elapsed_time": "0:03:13", "remaining_time": "2:07:34"} +{"current_steps": 62, "total_steps": 2470, "loss": 2.7126, "lr": 2.4696356275303644e-06, "epoch": 0.25101214574898784, "percentage": 2.51, "elapsed_time": "0:03:16", "remaining_time": "2:07:25"} +{"current_steps": 63, "total_steps": 2470, "loss": 2.8892, "lr": 2.5101214574898787e-06, "epoch": 0.2550607287449393, "percentage": 2.55, "elapsed_time": "0:03:19", "remaining_time": "2:07:16"} +{"current_steps": 64, "total_steps": 2470, "loss": 2.6468, "lr": 2.550607287449393e-06, "epoch": 0.2591093117408907, "percentage": 2.59, "elapsed_time": "0:03:22", "remaining_time": "2:07:07"} +{"current_steps": 65, "total_steps": 2470, "loss": 2.5171, "lr": 2.5910931174089072e-06, "epoch": 0.2631578947368421, "percentage": 2.63, "elapsed_time": "0:03:25", "remaining_time": "2:06:59"} +{"current_steps": 66, "total_steps": 2470, "loss": 2.5617, "lr": 2.631578947368421e-06, "epoch": 0.26720647773279355, "percentage": 2.67, "elapsed_time": "0:03:28", "remaining_time": "2:06:50"} +{"current_steps": 67, "total_steps": 2470, "loss": 2.6525, "lr": 2.672064777327936e-06, "epoch": 0.27125506072874495, "percentage": 2.71, "elapsed_time": "0:03:31", "remaining_time": "2:06:41"} +{"current_steps": 68, "total_steps": 2470, "loss": 2.5136, "lr": 2.7125506072874497e-06, "epoch": 0.27530364372469635, "percentage": 2.75, "elapsed_time": "0:03:34", "remaining_time": "2:06:33"} +{"current_steps": 69, "total_steps": 2470, "loss": 2.7136, "lr": 2.7530364372469636e-06, "epoch": 0.2793522267206478, "percentage": 2.79, "elapsed_time": "0:03:38", "remaining_time": "2:06:26"} +{"current_steps": 70, "total_steps": 2470, "loss": 2.5836, "lr": 2.7935222672064783e-06, "epoch": 0.2834008097165992, "percentage": 2.83, "elapsed_time": "0:03:41", "remaining_time": "2:06:19"} +{"current_steps": 71, "total_steps": 2470, "loss": 2.6042, "lr": 2.834008097165992e-06, "epoch": 0.2874493927125506, "percentage": 2.87, "elapsed_time": "0:03:44", "remaining_time": "2:06:12"} +{"current_steps": 72, "total_steps": 2470, "loss": 2.4534, "lr": 2.8744939271255064e-06, "epoch": 0.291497975708502, "percentage": 2.91, "elapsed_time": "0:03:47", "remaining_time": "2:06:04"} +{"current_steps": 73, "total_steps": 2470, "loss": 2.7732, "lr": 2.9149797570850203e-06, "epoch": 0.29554655870445345, "percentage": 2.96, "elapsed_time": "0:03:50", "remaining_time": "2:05:57"} +{"current_steps": 74, "total_steps": 2470, "loss": 2.6927, "lr": 2.955465587044535e-06, "epoch": 0.29959514170040485, "percentage": 3.0, "elapsed_time": "0:03:53", "remaining_time": "2:05:50"} +{"current_steps": 75, "total_steps": 2470, "loss": 2.7532, "lr": 2.995951417004049e-06, "epoch": 0.30364372469635625, "percentage": 3.04, "elapsed_time": "0:03:56", "remaining_time": "2:05:44"} +{"current_steps": 76, "total_steps": 2470, "loss": 2.4982, "lr": 3.0364372469635627e-06, "epoch": 0.3076923076923077, "percentage": 3.08, "elapsed_time": "0:03:59", "remaining_time": "2:05:37"} +{"current_steps": 
77, "total_steps": 2470, "loss": 2.4821, "lr": 3.0769230769230774e-06, "epoch": 0.3117408906882591, "percentage": 3.12, "elapsed_time": "0:04:02", "remaining_time": "2:05:30"} +{"current_steps": 78, "total_steps": 2470, "loss": 2.8892, "lr": 3.1174089068825913e-06, "epoch": 0.3157894736842105, "percentage": 3.16, "elapsed_time": "0:04:05", "remaining_time": "2:05:24"} +{"current_steps": 79, "total_steps": 2470, "loss": 2.5355, "lr": 3.157894736842105e-06, "epoch": 0.31983805668016196, "percentage": 3.2, "elapsed_time": "0:04:08", "remaining_time": "2:05:18"} +{"current_steps": 80, "total_steps": 2470, "loss": 2.4627, "lr": 3.19838056680162e-06, "epoch": 0.32388663967611336, "percentage": 3.24, "elapsed_time": "0:04:11", "remaining_time": "2:05:11"} +{"current_steps": 81, "total_steps": 2470, "loss": 2.5097, "lr": 3.2388663967611337e-06, "epoch": 0.32793522267206476, "percentage": 3.28, "elapsed_time": "0:04:14", "remaining_time": "2:05:05"} +{"current_steps": 82, "total_steps": 2470, "loss": 2.5888, "lr": 3.279352226720648e-06, "epoch": 0.3319838056680162, "percentage": 3.32, "elapsed_time": "0:04:17", "remaining_time": "2:04:58"} +{"current_steps": 83, "total_steps": 2470, "loss": 2.4857, "lr": 3.3198380566801623e-06, "epoch": 0.3360323886639676, "percentage": 3.36, "elapsed_time": "0:04:20", "remaining_time": "2:04:52"} +{"current_steps": 84, "total_steps": 2470, "loss": 2.3704, "lr": 3.3603238866396766e-06, "epoch": 0.340080971659919, "percentage": 3.4, "elapsed_time": "0:04:23", "remaining_time": "2:04:46"} +{"current_steps": 85, "total_steps": 2470, "loss": 2.4814, "lr": 3.4008097165991905e-06, "epoch": 0.3441295546558704, "percentage": 3.44, "elapsed_time": "0:04:26", "remaining_time": "2:04:39"} +{"current_steps": 86, "total_steps": 2470, "loss": 2.7336, "lr": 3.4412955465587043e-06, "epoch": 0.3481781376518219, "percentage": 3.48, "elapsed_time": "0:04:29", "remaining_time": "2:04:34"} +{"current_steps": 87, "total_steps": 2470, "loss": 2.6197, "lr": 3.481781376518219e-06, "epoch": 0.3522267206477733, "percentage": 3.52, "elapsed_time": "0:04:33", "remaining_time": "2:04:41"} +{"current_steps": 88, "total_steps": 2470, "loss": 2.3123, "lr": 3.522267206477733e-06, "epoch": 0.3562753036437247, "percentage": 3.56, "elapsed_time": "0:04:36", "remaining_time": "2:04:35"} +{"current_steps": 89, "total_steps": 2470, "loss": 2.659, "lr": 3.562753036437247e-06, "epoch": 0.3603238866396761, "percentage": 3.6, "elapsed_time": "0:04:39", "remaining_time": "2:04:29"} +{"current_steps": 90, "total_steps": 2470, "loss": 2.6324, "lr": 3.6032388663967615e-06, "epoch": 0.3643724696356275, "percentage": 3.64, "elapsed_time": "0:04:42", "remaining_time": "2:04:35"} +{"current_steps": 91, "total_steps": 2470, "loss": 2.5935, "lr": 3.6437246963562758e-06, "epoch": 0.3684210526315789, "percentage": 3.68, "elapsed_time": "0:04:45", "remaining_time": "2:04:29"} +{"current_steps": 92, "total_steps": 2470, "loss": 2.8634, "lr": 3.6842105263157896e-06, "epoch": 0.3724696356275304, "percentage": 3.72, "elapsed_time": "0:04:48", "remaining_time": "2:04:23"} +{"current_steps": 93, "total_steps": 2470, "loss": 2.3526, "lr": 3.724696356275304e-06, "epoch": 0.3765182186234818, "percentage": 3.77, "elapsed_time": "0:04:51", "remaining_time": "2:04:17"} +{"current_steps": 94, "total_steps": 2470, "loss": 2.4551, "lr": 3.7651821862348182e-06, "epoch": 0.3805668016194332, "percentage": 3.81, "elapsed_time": "0:04:54", "remaining_time": "2:04:12"} +{"current_steps": 95, "total_steps": 2470, "loss": 2.441, "lr": 
3.805668016194332e-06, "epoch": 0.38461538461538464, "percentage": 3.85, "elapsed_time": "0:04:57", "remaining_time": "2:04:06"} +{"current_steps": 96, "total_steps": 2470, "loss": 2.5222, "lr": 3.846153846153847e-06, "epoch": 0.38866396761133604, "percentage": 3.89, "elapsed_time": "0:05:00", "remaining_time": "2:04:00"} +{"current_steps": 97, "total_steps": 2470, "loss": 2.6018, "lr": 3.886639676113361e-06, "epoch": 0.39271255060728744, "percentage": 3.93, "elapsed_time": "0:05:03", "remaining_time": "2:03:54"} +{"current_steps": 98, "total_steps": 2470, "loss": 2.4227, "lr": 3.9271255060728745e-06, "epoch": 0.3967611336032389, "percentage": 3.97, "elapsed_time": "0:05:06", "remaining_time": "2:03:48"} +{"current_steps": 99, "total_steps": 2470, "loss": 2.4637, "lr": 3.967611336032389e-06, "epoch": 0.4008097165991903, "percentage": 4.01, "elapsed_time": "0:05:09", "remaining_time": "2:03:43"} +{"current_steps": 100, "total_steps": 2470, "loss": 2.5228, "lr": 4.008097165991903e-06, "epoch": 0.4048582995951417, "percentage": 4.05, "elapsed_time": "0:05:13", "remaining_time": "2:03:38"} +{"current_steps": 101, "total_steps": 2470, "loss": 2.6356, "lr": 4.048582995951417e-06, "epoch": 0.4089068825910931, "percentage": 4.09, "elapsed_time": "0:05:16", "remaining_time": "2:03:32"} +{"current_steps": 102, "total_steps": 2470, "loss": 2.3874, "lr": 4.089068825910931e-06, "epoch": 0.41295546558704455, "percentage": 4.13, "elapsed_time": "0:05:19", "remaining_time": "2:03:28"} +{"current_steps": 103, "total_steps": 2470, "loss": 2.6671, "lr": 4.1295546558704455e-06, "epoch": 0.41700404858299595, "percentage": 4.17, "elapsed_time": "0:05:22", "remaining_time": "2:03:23"} +{"current_steps": 104, "total_steps": 2470, "loss": 2.6795, "lr": 4.170040485829959e-06, "epoch": 0.42105263157894735, "percentage": 4.21, "elapsed_time": "0:05:25", "remaining_time": "2:03:17"} +{"current_steps": 105, "total_steps": 2470, "loss": 2.4891, "lr": 4.210526315789474e-06, "epoch": 0.4251012145748988, "percentage": 4.25, "elapsed_time": "0:05:28", "remaining_time": "2:03:12"} +{"current_steps": 106, "total_steps": 2470, "loss": 2.5374, "lr": 4.251012145748988e-06, "epoch": 0.4291497975708502, "percentage": 4.29, "elapsed_time": "0:05:31", "remaining_time": "2:03:06"} +{"current_steps": 107, "total_steps": 2470, "loss": 2.4393, "lr": 4.291497975708503e-06, "epoch": 0.4331983805668016, "percentage": 4.33, "elapsed_time": "0:05:34", "remaining_time": "2:03:01"} +{"current_steps": 108, "total_steps": 2470, "loss": 2.3122, "lr": 4.3319838056680166e-06, "epoch": 0.43724696356275305, "percentage": 4.37, "elapsed_time": "0:05:37", "remaining_time": "2:02:55"} +{"current_steps": 109, "total_steps": 2470, "loss": 2.5436, "lr": 4.372469635627531e-06, "epoch": 0.44129554655870445, "percentage": 4.41, "elapsed_time": "0:05:40", "remaining_time": "2:02:50"} +{"current_steps": 110, "total_steps": 2470, "loss": 2.5005, "lr": 4.412955465587045e-06, "epoch": 0.44534412955465585, "percentage": 4.45, "elapsed_time": "0:05:43", "remaining_time": "2:02:45"} +{"current_steps": 111, "total_steps": 2470, "loss": 2.4483, "lr": 4.453441295546559e-06, "epoch": 0.4493927125506073, "percentage": 4.49, "elapsed_time": "0:05:46", "remaining_time": "2:02:39"} +{"current_steps": 112, "total_steps": 2470, "loss": 2.5333, "lr": 4.493927125506074e-06, "epoch": 0.4534412955465587, "percentage": 4.53, "elapsed_time": "0:05:49", "remaining_time": "2:02:34"} +{"current_steps": 113, "total_steps": 2470, "loss": 2.5613, "lr": 4.534412955465588e-06, "epoch": 
0.4574898785425101, "percentage": 4.57, "elapsed_time": "0:05:52", "remaining_time": "2:02:29"} +{"current_steps": 114, "total_steps": 2470, "loss": 2.973, "lr": 4.5748987854251014e-06, "epoch": 0.46153846153846156, "percentage": 4.62, "elapsed_time": "0:05:55", "remaining_time": "2:02:24"} +{"current_steps": 115, "total_steps": 2470, "loss": 2.5947, "lr": 4.615384615384616e-06, "epoch": 0.46558704453441296, "percentage": 4.66, "elapsed_time": "0:05:58", "remaining_time": "2:02:18"} +{"current_steps": 116, "total_steps": 2470, "loss": 2.4581, "lr": 4.65587044534413e-06, "epoch": 0.46963562753036436, "percentage": 4.7, "elapsed_time": "0:06:01", "remaining_time": "2:02:13"} +{"current_steps": 117, "total_steps": 2470, "loss": 2.4571, "lr": 4.696356275303644e-06, "epoch": 0.47368421052631576, "percentage": 4.74, "elapsed_time": "0:06:04", "remaining_time": "2:02:09"} +{"current_steps": 118, "total_steps": 2470, "loss": 2.6622, "lr": 4.736842105263158e-06, "epoch": 0.4777327935222672, "percentage": 4.78, "elapsed_time": "0:06:07", "remaining_time": "2:02:04"} +{"current_steps": 119, "total_steps": 2470, "loss": 2.3622, "lr": 4.7773279352226725e-06, "epoch": 0.4817813765182186, "percentage": 4.82, "elapsed_time": "0:06:10", "remaining_time": "2:01:59"} +{"current_steps": 120, "total_steps": 2470, "loss": 2.4812, "lr": 4.817813765182186e-06, "epoch": 0.48582995951417, "percentage": 4.86, "elapsed_time": "0:06:13", "remaining_time": "2:01:55"} +{"current_steps": 121, "total_steps": 2470, "loss": 2.5297, "lr": 4.8582995951417e-06, "epoch": 0.4898785425101215, "percentage": 4.9, "elapsed_time": "0:06:16", "remaining_time": "2:01:50"} +{"current_steps": 122, "total_steps": 2470, "loss": 2.5534, "lr": 4.898785425101215e-06, "epoch": 0.4939271255060729, "percentage": 4.94, "elapsed_time": "0:06:19", "remaining_time": "2:01:45"} +{"current_steps": 123, "total_steps": 2470, "loss": 2.3909, "lr": 4.939271255060729e-06, "epoch": 0.4979757085020243, "percentage": 4.98, "elapsed_time": "0:06:22", "remaining_time": "2:01:41"} +{"current_steps": 124, "total_steps": 2470, "loss": 2.3104, "lr": 4.9797570850202435e-06, "epoch": 0.5020242914979757, "percentage": 5.02, "elapsed_time": "0:06:25", "remaining_time": "2:01:37"} +{"current_steps": 125, "total_steps": 2470, "loss": 2.5894, "lr": 5.020242914979757e-06, "epoch": 0.5060728744939271, "percentage": 5.06, "elapsed_time": "0:06:28", "remaining_time": "2:01:32"} +{"current_steps": 126, "total_steps": 2470, "loss": 2.686, "lr": 5.060728744939272e-06, "epoch": 0.5101214574898786, "percentage": 5.1, "elapsed_time": "0:06:31", "remaining_time": "2:01:28"} +{"current_steps": 127, "total_steps": 2470, "loss": 2.5203, "lr": 5.101214574898786e-06, "epoch": 0.5141700404858299, "percentage": 5.14, "elapsed_time": "0:06:34", "remaining_time": "2:01:23"} +{"current_steps": 128, "total_steps": 2470, "loss": 2.66, "lr": 5.1417004048583e-06, "epoch": 0.5182186234817814, "percentage": 5.18, "elapsed_time": "0:06:37", "remaining_time": "2:01:18"} +{"current_steps": 129, "total_steps": 2470, "loss": 2.5008, "lr": 5.1821862348178145e-06, "epoch": 0.5222672064777328, "percentage": 5.22, "elapsed_time": "0:06:40", "remaining_time": "2:01:14"} +{"current_steps": 130, "total_steps": 2470, "loss": 2.3134, "lr": 5.222672064777329e-06, "epoch": 0.5263157894736842, "percentage": 5.26, "elapsed_time": "0:06:43", "remaining_time": "2:01:09"} +{"current_steps": 131, "total_steps": 2470, "loss": 2.4191, "lr": 5.263157894736842e-06, "epoch": 0.5303643724696356, "percentage": 5.3, 
"elapsed_time": "0:06:46", "remaining_time": "2:01:05"} +{"current_steps": 132, "total_steps": 2470, "loss": 2.5499, "lr": 5.303643724696357e-06, "epoch": 0.5344129554655871, "percentage": 5.34, "elapsed_time": "0:06:49", "remaining_time": "2:01:01"} +{"current_steps": 133, "total_steps": 2470, "loss": 2.4736, "lr": 5.344129554655872e-06, "epoch": 0.5384615384615384, "percentage": 5.38, "elapsed_time": "0:06:52", "remaining_time": "2:00:56"} +{"current_steps": 134, "total_steps": 2470, "loss": 2.3723, "lr": 5.384615384615385e-06, "epoch": 0.5425101214574899, "percentage": 5.43, "elapsed_time": "0:06:55", "remaining_time": "2:00:51"} +{"current_steps": 135, "total_steps": 2470, "loss": 3.4815, "lr": 5.425101214574899e-06, "epoch": 0.5465587044534413, "percentage": 5.47, "elapsed_time": "0:06:59", "remaining_time": "2:00:55"} +{"current_steps": 136, "total_steps": 2470, "loss": 3.4231, "lr": 5.465587044534414e-06, "epoch": 0.5506072874493927, "percentage": 5.51, "elapsed_time": "0:07:02", "remaining_time": "2:00:50"} +{"current_steps": 137, "total_steps": 2470, "loss": 4.4025, "lr": 5.506072874493927e-06, "epoch": 0.5546558704453441, "percentage": 5.55, "elapsed_time": "0:07:05", "remaining_time": "2:00:46"} +{"current_steps": 138, "total_steps": 2470, "loss": 2.3958, "lr": 5.546558704453442e-06, "epoch": 0.5587044534412956, "percentage": 5.59, "elapsed_time": "0:07:08", "remaining_time": "2:00:41"} +{"current_steps": 139, "total_steps": 2470, "loss": 2.1963, "lr": 5.5870445344129565e-06, "epoch": 0.562753036437247, "percentage": 5.63, "elapsed_time": "0:07:11", "remaining_time": "2:00:37"} +{"current_steps": 140, "total_steps": 2470, "loss": 2.4664, "lr": 5.6275303643724695e-06, "epoch": 0.5668016194331984, "percentage": 5.67, "elapsed_time": "0:07:14", "remaining_time": "2:00:32"} +{"current_steps": 141, "total_steps": 2470, "loss": 2.2672, "lr": 5.668016194331984e-06, "epoch": 0.5708502024291497, "percentage": 5.71, "elapsed_time": "0:07:18", "remaining_time": "2:00:35"} +{"current_steps": 142, "total_steps": 2470, "loss": 2.4001, "lr": 5.708502024291498e-06, "epoch": 0.5748987854251012, "percentage": 5.75, "elapsed_time": "0:07:21", "remaining_time": "2:00:30"} +{"current_steps": 143, "total_steps": 2470, "loss": 2.2186, "lr": 5.748987854251013e-06, "epoch": 0.5789473684210527, "percentage": 5.79, "elapsed_time": "0:07:24", "remaining_time": "2:00:26"} +{"current_steps": 144, "total_steps": 2470, "loss": 2.5692, "lr": 5.789473684210527e-06, "epoch": 0.582995951417004, "percentage": 5.83, "elapsed_time": "0:07:27", "remaining_time": "2:00:21"} +{"current_steps": 145, "total_steps": 2470, "loss": 2.3088, "lr": 5.8299595141700406e-06, "epoch": 0.5870445344129555, "percentage": 5.87, "elapsed_time": "0:07:30", "remaining_time": "2:00:16"} +{"current_steps": 146, "total_steps": 2470, "loss": 2.4148, "lr": 5.870445344129555e-06, "epoch": 0.5910931174089069, "percentage": 5.91, "elapsed_time": "0:07:33", "remaining_time": "2:00:12"} +{"current_steps": 147, "total_steps": 2470, "loss": 2.146, "lr": 5.91093117408907e-06, "epoch": 0.5951417004048583, "percentage": 5.95, "elapsed_time": "0:07:36", "remaining_time": "2:00:08"} +{"current_steps": 148, "total_steps": 2470, "loss": 2.0989, "lr": 5.951417004048583e-06, "epoch": 0.5991902834008097, "percentage": 5.99, "elapsed_time": "0:07:39", "remaining_time": "2:00:04"} +{"current_steps": 149, "total_steps": 2470, "loss": 2.2379, "lr": 5.991902834008098e-06, "epoch": 0.6032388663967612, "percentage": 6.03, "elapsed_time": "0:07:42", "remaining_time": 
"2:00:00"} +{"current_steps": 150, "total_steps": 2470, "loss": 2.18, "lr": 6.0323886639676124e-06, "epoch": 0.6072874493927125, "percentage": 6.07, "elapsed_time": "0:07:45", "remaining_time": "1:59:55"} +{"current_steps": 151, "total_steps": 2470, "loss": 2.4302, "lr": 6.0728744939271254e-06, "epoch": 0.611336032388664, "percentage": 6.11, "elapsed_time": "0:07:48", "remaining_time": "1:59:52"} +{"current_steps": 152, "total_steps": 2470, "loss": 2.2208, "lr": 6.11336032388664e-06, "epoch": 0.6153846153846154, "percentage": 6.15, "elapsed_time": "0:07:51", "remaining_time": "1:59:47"} +{"current_steps": 153, "total_steps": 2470, "loss": 2.3089, "lr": 6.153846153846155e-06, "epoch": 0.6194331983805668, "percentage": 6.19, "elapsed_time": "0:07:54", "remaining_time": "1:59:43"} +{"current_steps": 154, "total_steps": 2470, "loss": 2.5248, "lr": 6.194331983805668e-06, "epoch": 0.6234817813765182, "percentage": 6.23, "elapsed_time": "0:07:57", "remaining_time": "1:59:39"} +{"current_steps": 155, "total_steps": 2470, "loss": 2.2786, "lr": 6.234817813765183e-06, "epoch": 0.6275303643724697, "percentage": 6.28, "elapsed_time": "0:08:00", "remaining_time": "1:59:34"} +{"current_steps": 156, "total_steps": 2470, "loss": 2.564, "lr": 6.275303643724697e-06, "epoch": 0.631578947368421, "percentage": 6.32, "elapsed_time": "0:08:03", "remaining_time": "1:59:30"} +{"current_steps": 157, "total_steps": 2470, "loss": 2.2575, "lr": 6.31578947368421e-06, "epoch": 0.6356275303643725, "percentage": 6.36, "elapsed_time": "0:08:06", "remaining_time": "1:59:26"} +{"current_steps": 158, "total_steps": 2470, "loss": 2.4085, "lr": 6.356275303643725e-06, "epoch": 0.6396761133603239, "percentage": 6.4, "elapsed_time": "0:08:09", "remaining_time": "1:59:22"} +{"current_steps": 159, "total_steps": 2470, "loss": 2.3392, "lr": 6.39676113360324e-06, "epoch": 0.6437246963562753, "percentage": 6.44, "elapsed_time": "0:08:12", "remaining_time": "1:59:17"} +{"current_steps": 160, "total_steps": 2470, "loss": 2.3474, "lr": 6.437246963562754e-06, "epoch": 0.6477732793522267, "percentage": 6.48, "elapsed_time": "0:08:15", "remaining_time": "1:59:13"} +{"current_steps": 161, "total_steps": 2470, "loss": 2.206, "lr": 6.4777327935222675e-06, "epoch": 0.6518218623481782, "percentage": 6.52, "elapsed_time": "0:08:18", "remaining_time": "1:59:10"} +{"current_steps": 162, "total_steps": 2470, "loss": 2.4407, "lr": 6.518218623481782e-06, "epoch": 0.6558704453441295, "percentage": 6.56, "elapsed_time": "0:08:21", "remaining_time": "1:59:06"} +{"current_steps": 163, "total_steps": 2470, "loss": 2.3308, "lr": 6.558704453441296e-06, "epoch": 0.659919028340081, "percentage": 6.6, "elapsed_time": "0:08:24", "remaining_time": "1:59:02"} +{"current_steps": 164, "total_steps": 2470, "loss": 2.2799, "lr": 6.599190283400811e-06, "epoch": 0.6639676113360324, "percentage": 6.64, "elapsed_time": "0:08:27", "remaining_time": "1:58:57"} +{"current_steps": 165, "total_steps": 2470, "loss": 2.3277, "lr": 6.639676113360325e-06, "epoch": 0.6680161943319838, "percentage": 6.68, "elapsed_time": "0:08:30", "remaining_time": "1:58:53"} +{"current_steps": 166, "total_steps": 2470, "loss": 2.2357, "lr": 6.6801619433198385e-06, "epoch": 0.6720647773279352, "percentage": 6.72, "elapsed_time": "0:08:33", "remaining_time": "1:58:49"} +{"current_steps": 167, "total_steps": 2470, "loss": 2.1807, "lr": 6.720647773279353e-06, "epoch": 0.6761133603238867, "percentage": 6.76, "elapsed_time": "0:08:36", "remaining_time": "1:58:45"} +{"current_steps": 168, "total_steps": 
2470, "loss": 2.4623, "lr": 6.761133603238867e-06, "epoch": 0.680161943319838, "percentage": 6.8, "elapsed_time": "0:08:39", "remaining_time": "1:58:41"} +{"current_steps": 169, "total_steps": 2470, "loss": 2.2289, "lr": 6.801619433198381e-06, "epoch": 0.6842105263157895, "percentage": 6.84, "elapsed_time": "0:08:42", "remaining_time": "1:58:37"} +{"current_steps": 170, "total_steps": 2470, "loss": 2.3704, "lr": 6.842105263157896e-06, "epoch": 0.6882591093117408, "percentage": 6.88, "elapsed_time": "0:08:45", "remaining_time": "1:58:33"} +{"current_steps": 171, "total_steps": 2470, "loss": 2.3322, "lr": 6.882591093117409e-06, "epoch": 0.6923076923076923, "percentage": 6.92, "elapsed_time": "0:08:48", "remaining_time": "1:58:28"} +{"current_steps": 172, "total_steps": 2470, "loss": 2.1067, "lr": 6.923076923076923e-06, "epoch": 0.6963562753036437, "percentage": 6.96, "elapsed_time": "0:08:51", "remaining_time": "1:58:24"} +{"current_steps": 173, "total_steps": 2470, "loss": 2.6915, "lr": 6.963562753036438e-06, "epoch": 0.7004048582995951, "percentage": 7.0, "elapsed_time": "0:08:54", "remaining_time": "1:58:20"} +{"current_steps": 174, "total_steps": 2470, "loss": 2.3607, "lr": 7.004048582995951e-06, "epoch": 0.7044534412955465, "percentage": 7.04, "elapsed_time": "0:08:57", "remaining_time": "1:58:16"} +{"current_steps": 175, "total_steps": 2470, "loss": 2.0946, "lr": 7.044534412955466e-06, "epoch": 0.708502024291498, "percentage": 7.09, "elapsed_time": "0:09:00", "remaining_time": "1:58:12"} +{"current_steps": 176, "total_steps": 2470, "loss": 2.2197, "lr": 7.0850202429149805e-06, "epoch": 0.7125506072874493, "percentage": 7.13, "elapsed_time": "0:09:03", "remaining_time": "1:58:08"} +{"current_steps": 177, "total_steps": 2470, "loss": 2.2515, "lr": 7.125506072874494e-06, "epoch": 0.7165991902834008, "percentage": 7.17, "elapsed_time": "0:09:06", "remaining_time": "1:58:03"} +{"current_steps": 178, "total_steps": 2470, "loss": 2.2984, "lr": 7.165991902834008e-06, "epoch": 0.7206477732793523, "percentage": 7.21, "elapsed_time": "0:09:09", "remaining_time": "1:57:59"} +{"current_steps": 179, "total_steps": 2470, "loss": 2.3498, "lr": 7.206477732793523e-06, "epoch": 0.7246963562753036, "percentage": 7.25, "elapsed_time": "0:09:13", "remaining_time": "1:58:01"} +{"current_steps": 180, "total_steps": 2470, "loss": 2.3922, "lr": 7.246963562753037e-06, "epoch": 0.728744939271255, "percentage": 7.29, "elapsed_time": "0:09:16", "remaining_time": "1:57:57"} +{"current_steps": 181, "total_steps": 2470, "loss": 2.2572, "lr": 7.2874493927125516e-06, "epoch": 0.7327935222672065, "percentage": 7.33, "elapsed_time": "0:09:19", "remaining_time": "1:57:53"} +{"current_steps": 182, "total_steps": 2470, "loss": 2.327, "lr": 7.327935222672065e-06, "epoch": 0.7368421052631579, "percentage": 7.37, "elapsed_time": "0:09:22", "remaining_time": "1:57:49"} +{"current_steps": 183, "total_steps": 2470, "loss": 2.8703, "lr": 7.368421052631579e-06, "epoch": 0.7408906882591093, "percentage": 7.41, "elapsed_time": "0:09:25", "remaining_time": "1:57:45"} +{"current_steps": 184, "total_steps": 2470, "loss": 2.2888, "lr": 7.408906882591094e-06, "epoch": 0.7449392712550608, "percentage": 7.45, "elapsed_time": "0:09:28", "remaining_time": "1:57:42"} +{"current_steps": 185, "total_steps": 2470, "loss": 2.2582, "lr": 7.449392712550608e-06, "epoch": 0.7489878542510121, "percentage": 7.49, "elapsed_time": "0:09:31", "remaining_time": "1:57:42"} +{"current_steps": 186, "total_steps": 2470, "loss": 2.0775, "lr": 
7.489878542510122e-06, "epoch": 0.7530364372469636, "percentage": 7.53, "elapsed_time": "0:09:34", "remaining_time": "1:57:38"} +{"current_steps": 187, "total_steps": 2470, "loss": 2.2682, "lr": 7.5303643724696364e-06, "epoch": 0.757085020242915, "percentage": 7.57, "elapsed_time": "0:09:37", "remaining_time": "1:57:35"} +{"current_steps": 188, "total_steps": 2470, "loss": 3.2512, "lr": 7.570850202429151e-06, "epoch": 0.7611336032388664, "percentage": 7.61, "elapsed_time": "0:09:40", "remaining_time": "1:57:31"} +{"current_steps": 189, "total_steps": 2470, "loss": 3.2673, "lr": 7.611336032388664e-06, "epoch": 0.7651821862348178, "percentage": 7.65, "elapsed_time": "0:09:43", "remaining_time": "1:57:27"} +{"current_steps": 190, "total_steps": 2470, "loss": 3.288, "lr": 7.651821862348178e-06, "epoch": 0.7692307692307693, "percentage": 7.69, "elapsed_time": "0:09:46", "remaining_time": "1:57:23"} +{"current_steps": 191, "total_steps": 2470, "loss": 2.3525, "lr": 7.692307692307694e-06, "epoch": 0.7732793522267206, "percentage": 7.73, "elapsed_time": "0:09:49", "remaining_time": "1:57:19"} +{"current_steps": 192, "total_steps": 2470, "loss": 2.4147, "lr": 7.732793522267207e-06, "epoch": 0.7773279352226721, "percentage": 7.77, "elapsed_time": "0:09:53", "remaining_time": "1:57:16"} +{"current_steps": 193, "total_steps": 2470, "loss": 2.4408, "lr": 7.773279352226721e-06, "epoch": 0.7813765182186235, "percentage": 7.81, "elapsed_time": "0:09:56", "remaining_time": "1:57:12"} +{"current_steps": 194, "total_steps": 2470, "loss": 2.2427, "lr": 7.813765182186235e-06, "epoch": 0.7854251012145749, "percentage": 7.85, "elapsed_time": "0:09:59", "remaining_time": "1:57:08"} +{"current_steps": 195, "total_steps": 2470, "loss": 2.1401, "lr": 7.854251012145749e-06, "epoch": 0.7894736842105263, "percentage": 7.89, "elapsed_time": "0:10:02", "remaining_time": "1:57:04"} +{"current_steps": 196, "total_steps": 2470, "loss": 2.417, "lr": 7.894736842105265e-06, "epoch": 0.7935222672064778, "percentage": 7.94, "elapsed_time": "0:10:05", "remaining_time": "1:57:00"} +{"current_steps": 197, "total_steps": 2470, "loss": 2.343, "lr": 7.935222672064778e-06, "epoch": 0.7975708502024291, "percentage": 7.98, "elapsed_time": "0:10:08", "remaining_time": "1:56:56"} +{"current_steps": 198, "total_steps": 2470, "loss": 2.0718, "lr": 7.975708502024292e-06, "epoch": 0.8016194331983806, "percentage": 8.02, "elapsed_time": "0:10:11", "remaining_time": "1:56:53"} +{"current_steps": 199, "total_steps": 2470, "loss": 1.9574, "lr": 8.016194331983806e-06, "epoch": 0.805668016194332, "percentage": 8.06, "elapsed_time": "0:10:14", "remaining_time": "1:56:49"} +{"current_steps": 200, "total_steps": 2470, "loss": 2.1815, "lr": 8.056680161943322e-06, "epoch": 0.8097165991902834, "percentage": 8.1, "elapsed_time": "0:10:17", "remaining_time": "1:56:45"} +{"current_steps": 201, "total_steps": 2470, "loss": 2.3515, "lr": 8.097165991902834e-06, "epoch": 0.8137651821862348, "percentage": 8.14, "elapsed_time": "0:10:20", "remaining_time": "1:56:42"} +{"current_steps": 202, "total_steps": 2470, "loss": 2.0846, "lr": 8.13765182186235e-06, "epoch": 0.8178137651821862, "percentage": 8.18, "elapsed_time": "0:10:23", "remaining_time": "1:56:38"} +{"current_steps": 203, "total_steps": 2470, "loss": 2.901, "lr": 8.178137651821862e-06, "epoch": 0.8218623481781376, "percentage": 8.22, "elapsed_time": "0:10:26", "remaining_time": "1:56:34"} +{"current_steps": 204, "total_steps": 2470, "loss": 4.9217, "lr": 8.218623481781377e-06, "epoch": 0.8259109311740891, 
"percentage": 8.26, "elapsed_time": "0:10:29", "remaining_time": "1:56:30"} +{"current_steps": 205, "total_steps": 2470, "loss": 2.213, "lr": 8.259109311740891e-06, "epoch": 0.8299595141700404, "percentage": 8.3, "elapsed_time": "0:10:32", "remaining_time": "1:56:26"} +{"current_steps": 206, "total_steps": 2470, "loss": 2.1265, "lr": 8.299595141700405e-06, "epoch": 0.8340080971659919, "percentage": 8.34, "elapsed_time": "0:10:35", "remaining_time": "1:56:22"} +{"current_steps": 207, "total_steps": 2470, "loss": 2.1168, "lr": 8.340080971659919e-06, "epoch": 0.8380566801619433, "percentage": 8.38, "elapsed_time": "0:10:38", "remaining_time": "1:56:18"} +{"current_steps": 208, "total_steps": 2470, "loss": 2.2021, "lr": 8.380566801619434e-06, "epoch": 0.8421052631578947, "percentage": 8.42, "elapsed_time": "0:10:41", "remaining_time": "1:56:14"} +{"current_steps": 209, "total_steps": 2470, "loss": 2.1197, "lr": 8.421052631578948e-06, "epoch": 0.8461538461538461, "percentage": 8.46, "elapsed_time": "0:10:44", "remaining_time": "1:56:11"} +{"current_steps": 210, "total_steps": 2470, "loss": 2.1389, "lr": 8.461538461538462e-06, "epoch": 0.8502024291497976, "percentage": 8.5, "elapsed_time": "0:10:47", "remaining_time": "1:56:07"} +{"current_steps": 211, "total_steps": 2470, "loss": 2.2071, "lr": 8.502024291497976e-06, "epoch": 0.854251012145749, "percentage": 8.54, "elapsed_time": "0:10:50", "remaining_time": "1:56:03"} +{"current_steps": 212, "total_steps": 2470, "loss": 2.1278, "lr": 8.54251012145749e-06, "epoch": 0.8582995951417004, "percentage": 8.58, "elapsed_time": "0:10:53", "remaining_time": "1:55:59"} +{"current_steps": 213, "total_steps": 2470, "loss": 2.2602, "lr": 8.582995951417005e-06, "epoch": 0.8623481781376519, "percentage": 8.62, "elapsed_time": "0:10:56", "remaining_time": "1:55:56"} +{"current_steps": 214, "total_steps": 2470, "loss": 2.2139, "lr": 8.62348178137652e-06, "epoch": 0.8663967611336032, "percentage": 8.66, "elapsed_time": "0:10:59", "remaining_time": "1:55:52"} +{"current_steps": 215, "total_steps": 2470, "loss": 2.6954, "lr": 8.663967611336033e-06, "epoch": 0.8704453441295547, "percentage": 8.7, "elapsed_time": "0:11:02", "remaining_time": "1:55:48"} +{"current_steps": 216, "total_steps": 2470, "loss": 2.6307, "lr": 8.704453441295547e-06, "epoch": 0.8744939271255061, "percentage": 8.74, "elapsed_time": "0:11:05", "remaining_time": "1:55:44"} +{"current_steps": 217, "total_steps": 2470, "loss": 2.3637, "lr": 8.744939271255063e-06, "epoch": 0.8785425101214575, "percentage": 8.79, "elapsed_time": "0:11:08", "remaining_time": "1:55:40"} +{"current_steps": 218, "total_steps": 2470, "loss": 2.2346, "lr": 8.785425101214575e-06, "epoch": 0.8825910931174089, "percentage": 8.83, "elapsed_time": "0:11:11", "remaining_time": "1:55:36"} +{"current_steps": 219, "total_steps": 2470, "loss": 1.8973, "lr": 8.82591093117409e-06, "epoch": 0.8866396761133604, "percentage": 8.87, "elapsed_time": "0:11:14", "remaining_time": "1:55:33"} +{"current_steps": 220, "total_steps": 2470, "loss": 2.2742, "lr": 8.866396761133604e-06, "epoch": 0.8906882591093117, "percentage": 8.91, "elapsed_time": "0:11:17", "remaining_time": "1:55:29"} +{"current_steps": 221, "total_steps": 2470, "loss": 2.2952, "lr": 8.906882591093118e-06, "epoch": 0.8947368421052632, "percentage": 8.95, "elapsed_time": "0:11:20", "remaining_time": "1:55:25"} +{"current_steps": 222, "total_steps": 2470, "loss": 2.0332, "lr": 8.947368421052632e-06, "epoch": 0.8987854251012146, "percentage": 8.99, "elapsed_time": "0:11:23", 
"remaining_time": "1:55:22"} +{"current_steps": 223, "total_steps": 2470, "loss": 2.0714, "lr": 8.987854251012147e-06, "epoch": 0.902834008097166, "percentage": 9.03, "elapsed_time": "0:11:26", "remaining_time": "1:55:18"} +{"current_steps": 224, "total_steps": 2470, "loss": 2.2157, "lr": 9.02834008097166e-06, "epoch": 0.9068825910931174, "percentage": 9.07, "elapsed_time": "0:11:29", "remaining_time": "1:55:14"} +{"current_steps": 225, "total_steps": 2470, "loss": 1.7915, "lr": 9.068825910931175e-06, "epoch": 0.9109311740890689, "percentage": 9.11, "elapsed_time": "0:11:32", "remaining_time": "1:55:11"} +{"current_steps": 226, "total_steps": 2470, "loss": 2.0722, "lr": 9.109311740890689e-06, "epoch": 0.9149797570850202, "percentage": 9.15, "elapsed_time": "0:11:35", "remaining_time": "1:55:07"} +{"current_steps": 227, "total_steps": 2470, "loss": 2.0351, "lr": 9.149797570850203e-06, "epoch": 0.9190283400809717, "percentage": 9.19, "elapsed_time": "0:11:39", "remaining_time": "1:55:11"} +{"current_steps": 228, "total_steps": 2470, "loss": 2.1823, "lr": 9.190283400809717e-06, "epoch": 0.9230769230769231, "percentage": 9.23, "elapsed_time": "0:11:42", "remaining_time": "1:55:07"} +{"current_steps": 229, "total_steps": 2470, "loss": 2.2329, "lr": 9.230769230769232e-06, "epoch": 0.9271255060728745, "percentage": 9.27, "elapsed_time": "0:11:45", "remaining_time": "1:55:04"} +{"current_steps": 230, "total_steps": 2470, "loss": 2.142, "lr": 9.271255060728746e-06, "epoch": 0.9311740890688259, "percentage": 9.31, "elapsed_time": "0:11:48", "remaining_time": "1:55:00"} +{"current_steps": 231, "total_steps": 2470, "loss": 2.0151, "lr": 9.31174089068826e-06, "epoch": 0.9352226720647774, "percentage": 9.35, "elapsed_time": "0:11:51", "remaining_time": "1:54:56"} +{"current_steps": 232, "total_steps": 2470, "loss": 1.9453, "lr": 9.352226720647774e-06, "epoch": 0.9392712550607287, "percentage": 9.39, "elapsed_time": "0:11:54", "remaining_time": "1:54:53"} +{"current_steps": 233, "total_steps": 2470, "loss": 2.2879, "lr": 9.392712550607288e-06, "epoch": 0.9433198380566802, "percentage": 9.43, "elapsed_time": "0:11:57", "remaining_time": "1:54:50"} +{"current_steps": 234, "total_steps": 2470, "loss": 1.997, "lr": 9.433198380566803e-06, "epoch": 0.9473684210526315, "percentage": 9.47, "elapsed_time": "0:12:00", "remaining_time": "1:54:46"} +{"current_steps": 235, "total_steps": 2470, "loss": 2.0557, "lr": 9.473684210526315e-06, "epoch": 0.951417004048583, "percentage": 9.51, "elapsed_time": "0:12:03", "remaining_time": "1:54:42"} +{"current_steps": 236, "total_steps": 2470, "loss": 2.2939, "lr": 9.514170040485831e-06, "epoch": 0.9554655870445344, "percentage": 9.55, "elapsed_time": "0:12:06", "remaining_time": "1:54:38"} +{"current_steps": 237, "total_steps": 2470, "loss": 2.0733, "lr": 9.554655870445345e-06, "epoch": 0.9595141700404858, "percentage": 9.6, "elapsed_time": "0:12:10", "remaining_time": "1:54:38"} +{"current_steps": 238, "total_steps": 2470, "loss": 2.0464, "lr": 9.595141700404859e-06, "epoch": 0.9635627530364372, "percentage": 9.64, "elapsed_time": "0:12:13", "remaining_time": "1:54:34"} +{"current_steps": 239, "total_steps": 2470, "loss": 2.336, "lr": 9.635627530364373e-06, "epoch": 0.9676113360323887, "percentage": 9.68, "elapsed_time": "0:12:16", "remaining_time": "1:54:31"} +{"current_steps": 240, "total_steps": 2470, "loss": 2.3022, "lr": 9.676113360323888e-06, "epoch": 0.97165991902834, "percentage": 9.72, "elapsed_time": "0:12:19", "remaining_time": "1:54:27"} +{"current_steps": 241, 
"total_steps": 2470, "loss": 2.0917, "lr": 9.7165991902834e-06, "epoch": 0.9757085020242915, "percentage": 9.76, "elapsed_time": "0:12:22", "remaining_time": "1:54:23"} +{"current_steps": 242, "total_steps": 2470, "loss": 2.2454, "lr": 9.757085020242916e-06, "epoch": 0.979757085020243, "percentage": 9.8, "elapsed_time": "0:12:25", "remaining_time": "1:54:20"} +{"current_steps": 243, "total_steps": 2470, "loss": 2.2731, "lr": 9.79757085020243e-06, "epoch": 0.9838056680161943, "percentage": 9.84, "elapsed_time": "0:12:28", "remaining_time": "1:54:16"} +{"current_steps": 244, "total_steps": 2470, "loss": 2.0318, "lr": 9.838056680161944e-06, "epoch": 0.9878542510121457, "percentage": 9.88, "elapsed_time": "0:12:31", "remaining_time": "1:54:12"} +{"current_steps": 245, "total_steps": 2470, "loss": 2.0005, "lr": 9.878542510121458e-06, "epoch": 0.9919028340080972, "percentage": 9.92, "elapsed_time": "0:12:34", "remaining_time": "1:54:09"} +{"current_steps": 246, "total_steps": 2470, "loss": 2.2101, "lr": 9.919028340080973e-06, "epoch": 0.9959514170040485, "percentage": 9.96, "elapsed_time": "0:12:37", "remaining_time": "1:54:05"} +{"current_steps": 247, "total_steps": 2470, "loss": 2.0861, "lr": 9.959514170040487e-06, "epoch": 1.0, "percentage": 10.0, "elapsed_time": "0:12:40", "remaining_time": "1:54:01"} +{"current_steps": 248, "total_steps": 2470, "loss": 2.1654, "lr": 1e-05, "epoch": 1.0040485829959513, "percentage": 10.04, "elapsed_time": "0:12:43", "remaining_time": "1:53:58"} +{"current_steps": 249, "total_steps": 2470, "loss": 2.3841, "lr": 9.999995007009308e-06, "epoch": 1.008097165991903, "percentage": 10.08, "elapsed_time": "0:12:46", "remaining_time": "1:53:54"} +{"current_steps": 250, "total_steps": 2470, "loss": 2.2013, "lr": 9.999980028047207e-06, "epoch": 1.0121457489878543, "percentage": 10.12, "elapsed_time": "0:12:49", "remaining_time": "1:53:50"} +{"current_steps": 251, "total_steps": 2470, "loss": 2.3109, "lr": 9.99995506314361e-06, "epoch": 1.0161943319838056, "percentage": 10.16, "elapsed_time": "0:12:52", "remaining_time": "1:53:47"} +{"current_steps": 252, "total_steps": 2470, "loss": 2.5018, "lr": 9.999920112348379e-06, "epoch": 1.0202429149797572, "percentage": 10.2, "elapsed_time": "0:12:55", "remaining_time": "1:53:43"} +{"current_steps": 253, "total_steps": 2470, "loss": 2.4387, "lr": 9.999875175731316e-06, "epoch": 1.0242914979757085, "percentage": 10.24, "elapsed_time": "0:12:58", "remaining_time": "1:53:40"} +{"current_steps": 254, "total_steps": 2470, "loss": 2.0271, "lr": 9.99982025338217e-06, "epoch": 1.0283400809716599, "percentage": 10.28, "elapsed_time": "0:13:01", "remaining_time": "1:53:36"} +{"current_steps": 255, "total_steps": 2470, "loss": 2.1942, "lr": 9.999755345410628e-06, "epoch": 1.0323886639676114, "percentage": 10.32, "elapsed_time": "0:13:04", "remaining_time": "1:53:32"} +{"current_steps": 256, "total_steps": 2470, "loss": 2.3802, "lr": 9.999680451946327e-06, "epoch": 1.0364372469635628, "percentage": 10.36, "elapsed_time": "0:13:07", "remaining_time": "1:53:29"} +{"current_steps": 257, "total_steps": 2470, "loss": 2.1737, "lr": 9.999595573138845e-06, "epoch": 1.040485829959514, "percentage": 10.4, "elapsed_time": "0:13:10", "remaining_time": "1:53:25"} +{"current_steps": 258, "total_steps": 2470, "loss": 2.1892, "lr": 9.9995007091577e-06, "epoch": 1.0445344129554657, "percentage": 10.45, "elapsed_time": "0:13:13", "remaining_time": "1:53:22"} +{"current_steps": 259, "total_steps": 2470, "loss": 2.165, "lr": 9.999395860192354e-06, "epoch": 
1.048582995951417, "percentage": 10.49, "elapsed_time": "0:13:16", "remaining_time": "1:53:18"} +{"current_steps": 260, "total_steps": 2470, "loss": 2.4136, "lr": 9.99928102645221e-06, "epoch": 1.0526315789473684, "percentage": 10.53, "elapsed_time": "0:13:19", "remaining_time": "1:53:15"} +{"current_steps": 261, "total_steps": 2470, "loss": 2.2649, "lr": 9.999156208166614e-06, "epoch": 1.05668016194332, "percentage": 10.57, "elapsed_time": "0:13:22", "remaining_time": "1:53:11"} +{"current_steps": 262, "total_steps": 2470, "loss": 2.2776, "lr": 9.999021405584855e-06, "epoch": 1.0607287449392713, "percentage": 10.61, "elapsed_time": "0:13:25", "remaining_time": "1:53:08"} +{"current_steps": 263, "total_steps": 2470, "loss": 2.2937, "lr": 9.99887661897616e-06, "epoch": 1.0647773279352226, "percentage": 10.65, "elapsed_time": "0:13:28", "remaining_time": "1:53:04"} +{"current_steps": 264, "total_steps": 2470, "loss": 2.3373, "lr": 9.998721848629691e-06, "epoch": 1.0688259109311742, "percentage": 10.69, "elapsed_time": "0:13:31", "remaining_time": "1:53:01"} +{"current_steps": 265, "total_steps": 2470, "loss": 2.1755, "lr": 9.99855709485456e-06, "epoch": 1.0728744939271255, "percentage": 10.73, "elapsed_time": "0:13:34", "remaining_time": "1:52:58"} +{"current_steps": 266, "total_steps": 2470, "loss": 2.1224, "lr": 9.99838235797981e-06, "epoch": 1.0769230769230769, "percentage": 10.77, "elapsed_time": "0:13:37", "remaining_time": "1:52:54"} +{"current_steps": 267, "total_steps": 2470, "loss": 2.162, "lr": 9.998197638354428e-06, "epoch": 1.0809716599190284, "percentage": 10.81, "elapsed_time": "0:13:40", "remaining_time": "1:52:50"} +{"current_steps": 268, "total_steps": 2470, "loss": 2.0674, "lr": 9.998002936347334e-06, "epoch": 1.0850202429149798, "percentage": 10.85, "elapsed_time": "0:13:43", "remaining_time": "1:52:47"} +{"current_steps": 269, "total_steps": 2470, "loss": 2.1639, "lr": 9.997798252347382e-06, "epoch": 1.0890688259109311, "percentage": 10.89, "elapsed_time": "0:13:46", "remaining_time": "1:52:43"} +{"current_steps": 270, "total_steps": 2470, "loss": 2.2088, "lr": 9.99758358676337e-06, "epoch": 1.0931174089068827, "percentage": 10.93, "elapsed_time": "0:13:49", "remaining_time": "1:52:40"} +{"current_steps": 271, "total_steps": 2470, "loss": 1.9417, "lr": 9.99735894002403e-06, "epoch": 1.097165991902834, "percentage": 10.97, "elapsed_time": "0:13:52", "remaining_time": "1:52:36"} +{"current_steps": 272, "total_steps": 2470, "loss": 2.1229, "lr": 9.99712431257802e-06, "epoch": 1.1012145748987854, "percentage": 11.01, "elapsed_time": "0:13:55", "remaining_time": "1:52:33"} +{"current_steps": 273, "total_steps": 2470, "loss": 2.147, "lr": 9.99687970489394e-06, "epoch": 1.1052631578947367, "percentage": 11.05, "elapsed_time": "0:13:58", "remaining_time": "1:52:29"} +{"current_steps": 274, "total_steps": 2470, "loss": 2.0305, "lr": 9.996625117460319e-06, "epoch": 1.1093117408906883, "percentage": 11.09, "elapsed_time": "0:14:01", "remaining_time": "1:52:26"} +{"current_steps": 275, "total_steps": 2470, "loss": 1.993, "lr": 9.996360550785619e-06, "epoch": 1.1133603238866396, "percentage": 11.13, "elapsed_time": "0:14:04", "remaining_time": "1:52:22"} +{"current_steps": 276, "total_steps": 2470, "loss": 1.9789, "lr": 9.996086005398228e-06, "epoch": 1.117408906882591, "percentage": 11.17, "elapsed_time": "0:14:07", "remaining_time": "1:52:19"} +{"current_steps": 277, "total_steps": 2470, "loss": 1.9362, "lr": 9.995801481846474e-06, "epoch": 1.1214574898785425, "percentage": 11.21, 
"elapsed_time": "0:14:10", "remaining_time": "1:52:15"} +{"current_steps": 278, "total_steps": 2470, "loss": 1.8981, "lr": 9.9955069806986e-06, "epoch": 1.125506072874494, "percentage": 11.26, "elapsed_time": "0:14:14", "remaining_time": "1:52:16"} +{"current_steps": 279, "total_steps": 2470, "loss": 1.877, "lr": 9.995202502542785e-06, "epoch": 1.1295546558704452, "percentage": 11.3, "elapsed_time": "0:14:17", "remaining_time": "1:52:12"} +{"current_steps": 280, "total_steps": 2470, "loss": 2.1812, "lr": 9.99488804798713e-06, "epoch": 1.1336032388663968, "percentage": 11.34, "elapsed_time": "0:14:20", "remaining_time": "1:52:09"} +{"current_steps": 281, "total_steps": 2470, "loss": 2.0952, "lr": 9.994563617659665e-06, "epoch": 1.1376518218623481, "percentage": 11.38, "elapsed_time": "0:14:23", "remaining_time": "1:52:05"} +{"current_steps": 282, "total_steps": 2470, "loss": 1.7897, "lr": 9.99422921220834e-06, "epoch": 1.1417004048582995, "percentage": 11.42, "elapsed_time": "0:14:26", "remaining_time": "1:52:02"} +{"current_steps": 283, "total_steps": 2470, "loss": 2.1832, "lr": 9.993884832301029e-06, "epoch": 1.145748987854251, "percentage": 11.46, "elapsed_time": "0:14:29", "remaining_time": "1:51:58"} +{"current_steps": 284, "total_steps": 2470, "loss": 2.0419, "lr": 9.993530478625524e-06, "epoch": 1.1497975708502024, "percentage": 11.5, "elapsed_time": "0:14:32", "remaining_time": "1:51:54"} +{"current_steps": 285, "total_steps": 2470, "loss": 2.1765, "lr": 9.99316615188954e-06, "epoch": 1.1538461538461537, "percentage": 11.54, "elapsed_time": "0:14:35", "remaining_time": "1:51:54"} +{"current_steps": 286, "total_steps": 2470, "loss": 2.414, "lr": 9.992791852820709e-06, "epoch": 1.1578947368421053, "percentage": 11.58, "elapsed_time": "0:14:38", "remaining_time": "1:51:50"} +{"current_steps": 287, "total_steps": 2470, "loss": 2.0729, "lr": 9.992407582166582e-06, "epoch": 1.1619433198380567, "percentage": 11.62, "elapsed_time": "0:14:41", "remaining_time": "1:51:47"} +{"current_steps": 288, "total_steps": 2470, "loss": 2.0816, "lr": 9.99201334069462e-06, "epoch": 1.165991902834008, "percentage": 11.66, "elapsed_time": "0:14:44", "remaining_time": "1:51:43"} +{"current_steps": 289, "total_steps": 2470, "loss": 2.4242, "lr": 9.991609129192202e-06, "epoch": 1.1700404858299596, "percentage": 11.7, "elapsed_time": "0:14:47", "remaining_time": "1:51:40"} +{"current_steps": 290, "total_steps": 2470, "loss": 1.9546, "lr": 9.991194948466615e-06, "epoch": 1.174089068825911, "percentage": 11.74, "elapsed_time": "0:14:50", "remaining_time": "1:51:36"} +{"current_steps": 291, "total_steps": 2470, "loss": 2.0891, "lr": 9.990770799345064e-06, "epoch": 1.1781376518218623, "percentage": 11.78, "elapsed_time": "0:14:53", "remaining_time": "1:51:33"} +{"current_steps": 292, "total_steps": 2470, "loss": 1.8523, "lr": 9.990336682674656e-06, "epoch": 1.1821862348178138, "percentage": 11.82, "elapsed_time": "0:14:56", "remaining_time": "1:51:30"} +{"current_steps": 293, "total_steps": 2470, "loss": 2.0252, "lr": 9.989892599322404e-06, "epoch": 1.1862348178137652, "percentage": 11.86, "elapsed_time": "0:14:59", "remaining_time": "1:51:26"} +{"current_steps": 294, "total_steps": 2470, "loss": 2.094, "lr": 9.989438550175235e-06, "epoch": 1.1902834008097165, "percentage": 11.9, "elapsed_time": "0:15:02", "remaining_time": "1:51:23"} +{"current_steps": 295, "total_steps": 2470, "loss": 2.2522, "lr": 9.98897453613997e-06, "epoch": 1.194331983805668, "percentage": 11.94, "elapsed_time": "0:15:05", "remaining_time": 
"1:51:19"} +{"current_steps": 296, "total_steps": 2470, "loss": 2.3174, "lr": 9.988500558143337e-06, "epoch": 1.1983805668016194, "percentage": 11.98, "elapsed_time": "0:15:09", "remaining_time": "1:51:16"} +{"current_steps": 297, "total_steps": 2470, "loss": 2.0626, "lr": 9.988016617131966e-06, "epoch": 1.2024291497975708, "percentage": 12.02, "elapsed_time": "0:15:12", "remaining_time": "1:51:12"} +{"current_steps": 298, "total_steps": 2470, "loss": 2.332, "lr": 9.987522714072377e-06, "epoch": 1.2064777327935223, "percentage": 12.06, "elapsed_time": "0:15:15", "remaining_time": "1:51:10"} +{"current_steps": 299, "total_steps": 2470, "loss": 2.3944, "lr": 9.987018849950996e-06, "epoch": 1.2105263157894737, "percentage": 12.11, "elapsed_time": "0:15:18", "remaining_time": "1:51:06"} +{"current_steps": 300, "total_steps": 2470, "loss": 2.1948, "lr": 9.986505025774137e-06, "epoch": 1.214574898785425, "percentage": 12.15, "elapsed_time": "0:15:21", "remaining_time": "1:51:03"} +{"current_steps": 301, "total_steps": 2470, "loss": 2.0261, "lr": 9.985981242568009e-06, "epoch": 1.2186234817813766, "percentage": 12.19, "elapsed_time": "0:15:24", "remaining_time": "1:50:59"} +{"current_steps": 302, "total_steps": 2470, "loss": 2.0268, "lr": 9.985447501378706e-06, "epoch": 1.222672064777328, "percentage": 12.23, "elapsed_time": "0:15:27", "remaining_time": "1:50:56"} +{"current_steps": 303, "total_steps": 2470, "loss": 2.0609, "lr": 9.984903803272216e-06, "epoch": 1.2267206477732793, "percentage": 12.27, "elapsed_time": "0:15:30", "remaining_time": "1:50:52"} +{"current_steps": 304, "total_steps": 2470, "loss": 2.118, "lr": 9.984350149334415e-06, "epoch": 1.2307692307692308, "percentage": 12.31, "elapsed_time": "0:15:33", "remaining_time": "1:50:49"} +{"current_steps": 305, "total_steps": 2470, "loss": 2.2939, "lr": 9.983786540671052e-06, "epoch": 1.2348178137651822, "percentage": 12.35, "elapsed_time": "0:15:36", "remaining_time": "1:50:46"} +{"current_steps": 306, "total_steps": 2470, "loss": 2.2554, "lr": 9.983212978407767e-06, "epoch": 1.2388663967611335, "percentage": 12.39, "elapsed_time": "0:15:39", "remaining_time": "1:50:42"} +{"current_steps": 307, "total_steps": 2470, "loss": 2.2252, "lr": 9.982629463690075e-06, "epoch": 1.242914979757085, "percentage": 12.43, "elapsed_time": "0:15:42", "remaining_time": "1:50:39"} +{"current_steps": 308, "total_steps": 2470, "loss": 2.0288, "lr": 9.982035997683372e-06, "epoch": 1.2469635627530364, "percentage": 12.47, "elapsed_time": "0:15:45", "remaining_time": "1:50:35"} +{"current_steps": 309, "total_steps": 2470, "loss": 2.0528, "lr": 9.981432581572925e-06, "epoch": 1.2510121457489878, "percentage": 12.51, "elapsed_time": "0:15:48", "remaining_time": "1:50:32"} +{"current_steps": 310, "total_steps": 2470, "loss": 2.1848, "lr": 9.980819216563875e-06, "epoch": 1.2550607287449393, "percentage": 12.55, "elapsed_time": "0:15:51", "remaining_time": "1:50:28"} +{"current_steps": 311, "total_steps": 2470, "loss": 1.9964, "lr": 9.980195903881231e-06, "epoch": 1.2591093117408907, "percentage": 12.59, "elapsed_time": "0:15:54", "remaining_time": "1:50:25"} +{"current_steps": 312, "total_steps": 2470, "loss": 1.8735, "lr": 9.979562644769871e-06, "epoch": 1.263157894736842, "percentage": 12.63, "elapsed_time": "0:15:57", "remaining_time": "1:50:22"} +{"current_steps": 313, "total_steps": 2470, "loss": 2.0595, "lr": 9.978919440494538e-06, "epoch": 1.2672064777327936, "percentage": 12.67, "elapsed_time": "0:16:00", "remaining_time": "1:50:18"} +{"current_steps": 
314, "total_steps": 2470, "loss": 2.1342, "lr": 9.978266292339838e-06, "epoch": 1.271255060728745, "percentage": 12.71, "elapsed_time": "0:16:03", "remaining_time": "1:50:15"} +{"current_steps": 315, "total_steps": 2470, "loss": 2.0658, "lr": 9.977603201610236e-06, "epoch": 1.2753036437246963, "percentage": 12.75, "elapsed_time": "0:16:06", "remaining_time": "1:50:11"} +{"current_steps": 316, "total_steps": 2470, "loss": 2.1478, "lr": 9.976930169630052e-06, "epoch": 1.2793522267206479, "percentage": 12.79, "elapsed_time": "0:16:09", "remaining_time": "1:50:08"} +{"current_steps": 317, "total_steps": 2470, "loss": 1.8522, "lr": 9.976247197743465e-06, "epoch": 1.2834008097165992, "percentage": 12.83, "elapsed_time": "0:16:12", "remaining_time": "1:50:05"} +{"current_steps": 318, "total_steps": 2470, "loss": 1.9432, "lr": 9.975554287314505e-06, "epoch": 1.2874493927125505, "percentage": 12.87, "elapsed_time": "0:16:15", "remaining_time": "1:50:02"} +{"current_steps": 319, "total_steps": 2470, "loss": 1.8181, "lr": 9.974851439727045e-06, "epoch": 1.291497975708502, "percentage": 12.91, "elapsed_time": "0:16:18", "remaining_time": "1:49:58"} +{"current_steps": 320, "total_steps": 2470, "loss": 2.1573, "lr": 9.974138656384815e-06, "epoch": 1.2955465587044535, "percentage": 12.96, "elapsed_time": "0:16:21", "remaining_time": "1:49:55"} +{"current_steps": 321, "total_steps": 2470, "loss": 2.1787, "lr": 9.973415938711383e-06, "epoch": 1.2995951417004048, "percentage": 13.0, "elapsed_time": "0:16:24", "remaining_time": "1:49:51"} +{"current_steps": 322, "total_steps": 2470, "loss": 1.9479, "lr": 9.972683288150155e-06, "epoch": 1.3036437246963564, "percentage": 13.04, "elapsed_time": "0:16:28", "remaining_time": "1:49:52"} +{"current_steps": 323, "total_steps": 2470, "loss": 1.9284, "lr": 9.97194070616438e-06, "epoch": 1.3076923076923077, "percentage": 13.08, "elapsed_time": "0:16:31", "remaining_time": "1:49:49"} +{"current_steps": 324, "total_steps": 2470, "loss": 1.9908, "lr": 9.971188194237141e-06, "epoch": 1.311740890688259, "percentage": 13.12, "elapsed_time": "0:16:34", "remaining_time": "1:49:46"} +{"current_steps": 325, "total_steps": 2470, "loss": 2.0365, "lr": 9.97042575387135e-06, "epoch": 1.3157894736842106, "percentage": 13.16, "elapsed_time": "0:16:37", "remaining_time": "1:49:42"} +{"current_steps": 326, "total_steps": 2470, "loss": 1.9016, "lr": 9.969653386589749e-06, "epoch": 1.319838056680162, "percentage": 13.2, "elapsed_time": "0:16:40", "remaining_time": "1:49:39"} +{"current_steps": 327, "total_steps": 2470, "loss": 1.9295, "lr": 9.968871093934908e-06, "epoch": 1.3238866396761133, "percentage": 13.24, "elapsed_time": "0:16:43", "remaining_time": "1:49:36"} +{"current_steps": 328, "total_steps": 2470, "loss": 1.9057, "lr": 9.968078877469221e-06, "epoch": 1.3279352226720649, "percentage": 13.28, "elapsed_time": "0:16:46", "remaining_time": "1:49:32"} +{"current_steps": 329, "total_steps": 2470, "loss": 1.7933, "lr": 9.967276738774897e-06, "epoch": 1.3319838056680162, "percentage": 13.32, "elapsed_time": "0:16:49", "remaining_time": "1:49:29"} +{"current_steps": 330, "total_steps": 2470, "loss": 1.8225, "lr": 9.966464679453969e-06, "epoch": 1.3360323886639676, "percentage": 13.36, "elapsed_time": "0:16:52", "remaining_time": "1:49:25"} +{"current_steps": 331, "total_steps": 2470, "loss": 1.7548, "lr": 9.965642701128273e-06, "epoch": 1.3400809716599191, "percentage": 13.4, "elapsed_time": "0:16:55", "remaining_time": "1:49:22"} +{"current_steps": 332, "total_steps": 2470, "loss": 
1.8602, "lr": 9.964810805439464e-06, "epoch": 1.3441295546558705, "percentage": 13.44, "elapsed_time": "0:16:58", "remaining_time": "1:49:18"} +{"current_steps": 333, "total_steps": 2470, "loss": 2.0594, "lr": 9.963968994049e-06, "epoch": 1.3481781376518218, "percentage": 13.48, "elapsed_time": "0:17:01", "remaining_time": "1:49:18"} +{"current_steps": 334, "total_steps": 2470, "loss": 1.8496, "lr": 9.963117268638147e-06, "epoch": 1.3522267206477734, "percentage": 13.52, "elapsed_time": "0:17:04", "remaining_time": "1:49:14"} +{"current_steps": 335, "total_steps": 2470, "loss": 1.6494, "lr": 9.962255630907964e-06, "epoch": 1.3562753036437247, "percentage": 13.56, "elapsed_time": "0:17:07", "remaining_time": "1:49:11"} +{"current_steps": 336, "total_steps": 2470, "loss": 1.9562, "lr": 9.961384082579311e-06, "epoch": 1.360323886639676, "percentage": 13.6, "elapsed_time": "0:17:10", "remaining_time": "1:49:08"} +{"current_steps": 337, "total_steps": 2470, "loss": 2.0155, "lr": 9.96050262539284e-06, "epoch": 1.3643724696356276, "percentage": 13.64, "elapsed_time": "0:17:14", "remaining_time": "1:49:05"} +{"current_steps": 338, "total_steps": 2470, "loss": 1.9085, "lr": 9.959611261108999e-06, "epoch": 1.368421052631579, "percentage": 13.68, "elapsed_time": "0:17:17", "remaining_time": "1:49:01"} +{"current_steps": 339, "total_steps": 2470, "loss": 2.0875, "lr": 9.958709991508013e-06, "epoch": 1.3724696356275303, "percentage": 13.72, "elapsed_time": "0:17:20", "remaining_time": "1:48:58"} +{"current_steps": 340, "total_steps": 2470, "loss": 1.619, "lr": 9.957798818389894e-06, "epoch": 1.376518218623482, "percentage": 13.77, "elapsed_time": "0:17:23", "remaining_time": "1:48:55"} +{"current_steps": 341, "total_steps": 2470, "loss": 1.809, "lr": 9.956877743574437e-06, "epoch": 1.3805668016194332, "percentage": 13.81, "elapsed_time": "0:17:26", "remaining_time": "1:48:52"} +{"current_steps": 342, "total_steps": 2470, "loss": 1.7047, "lr": 9.955946768901207e-06, "epoch": 1.3846153846153846, "percentage": 13.85, "elapsed_time": "0:17:29", "remaining_time": "1:48:48"} +{"current_steps": 343, "total_steps": 2470, "loss": 1.7574, "lr": 9.955005896229543e-06, "epoch": 1.3886639676113361, "percentage": 13.89, "elapsed_time": "0:17:32", "remaining_time": "1:48:45"} +{"current_steps": 344, "total_steps": 2470, "loss": 1.903, "lr": 9.954055127438554e-06, "epoch": 1.3927125506072875, "percentage": 13.93, "elapsed_time": "0:17:35", "remaining_time": "1:48:42"} +{"current_steps": 345, "total_steps": 2470, "loss": 1.7259, "lr": 9.95309446442711e-06, "epoch": 1.3967611336032388, "percentage": 13.97, "elapsed_time": "0:17:38", "remaining_time": "1:48:38"} +{"current_steps": 346, "total_steps": 2470, "loss": 1.7903, "lr": 9.952123909113842e-06, "epoch": 1.4008097165991904, "percentage": 14.01, "elapsed_time": "0:17:41", "remaining_time": "1:48:35"} +{"current_steps": 347, "total_steps": 2470, "loss": 1.8805, "lr": 9.951143463437145e-06, "epoch": 1.4048582995951417, "percentage": 14.05, "elapsed_time": "0:17:44", "remaining_time": "1:48:31"} +{"current_steps": 348, "total_steps": 2470, "loss": 1.963, "lr": 9.950153129355156e-06, "epoch": 1.408906882591093, "percentage": 14.09, "elapsed_time": "0:17:47", "remaining_time": "1:48:28"} +{"current_steps": 349, "total_steps": 2470, "loss": 1.8567, "lr": 9.949152908845771e-06, "epoch": 1.4129554655870447, "percentage": 14.13, "elapsed_time": "0:17:50", "remaining_time": "1:48:25"} +{"current_steps": 350, "total_steps": 2470, "loss": 2.0649, "lr": 9.948142803906623e-06, 
"epoch": 1.417004048582996, "percentage": 14.17, "elapsed_time": "0:17:53", "remaining_time": "1:48:21"} +{"current_steps": 351, "total_steps": 2470, "loss": 2.1272, "lr": 9.947122816555091e-06, "epoch": 1.4210526315789473, "percentage": 14.21, "elapsed_time": "0:17:56", "remaining_time": "1:48:18"} +{"current_steps": 352, "total_steps": 2470, "loss": 1.9559, "lr": 9.94609294882829e-06, "epoch": 1.425101214574899, "percentage": 14.25, "elapsed_time": "0:17:59", "remaining_time": "1:48:15"} +{"current_steps": 353, "total_steps": 2470, "loss": 2.0925, "lr": 9.94505320278307e-06, "epoch": 1.4291497975708503, "percentage": 14.29, "elapsed_time": "0:18:02", "remaining_time": "1:48:11"} +{"current_steps": 354, "total_steps": 2470, "loss": 2.1299, "lr": 9.944003580496004e-06, "epoch": 1.4331983805668016, "percentage": 14.33, "elapsed_time": "0:18:05", "remaining_time": "1:48:08"} +{"current_steps": 355, "total_steps": 2470, "loss": 1.906, "lr": 9.942944084063397e-06, "epoch": 1.4372469635627532, "percentage": 14.37, "elapsed_time": "0:18:08", "remaining_time": "1:48:05"} +{"current_steps": 356, "total_steps": 2470, "loss": 1.8895, "lr": 9.94187471560127e-06, "epoch": 1.4412955465587045, "percentage": 14.41, "elapsed_time": "0:18:11", "remaining_time": "1:48:01"} +{"current_steps": 357, "total_steps": 2470, "loss": 2.123, "lr": 9.940795477245362e-06, "epoch": 1.4453441295546559, "percentage": 14.45, "elapsed_time": "0:18:14", "remaining_time": "1:47:58"} +{"current_steps": 358, "total_steps": 2470, "loss": 1.9087, "lr": 9.939706371151124e-06, "epoch": 1.4493927125506074, "percentage": 14.49, "elapsed_time": "0:18:17", "remaining_time": "1:47:55"} +{"current_steps": 359, "total_steps": 2470, "loss": 1.8989, "lr": 9.938607399493714e-06, "epoch": 1.4534412955465588, "percentage": 14.53, "elapsed_time": "0:18:20", "remaining_time": "1:47:52"} +{"current_steps": 360, "total_steps": 2470, "loss": 2.2799, "lr": 9.937498564467993e-06, "epoch": 1.45748987854251, "percentage": 14.57, "elapsed_time": "0:18:23", "remaining_time": "1:47:49"} +{"current_steps": 361, "total_steps": 2470, "loss": 2.5915, "lr": 9.936379868288525e-06, "epoch": 1.4615384615384617, "percentage": 14.62, "elapsed_time": "0:18:26", "remaining_time": "1:47:45"} +{"current_steps": 362, "total_steps": 2470, "loss": 2.1301, "lr": 9.935251313189564e-06, "epoch": 1.465587044534413, "percentage": 14.66, "elapsed_time": "0:18:29", "remaining_time": "1:47:42"} +{"current_steps": 363, "total_steps": 2470, "loss": 2.0549, "lr": 9.934112901425058e-06, "epoch": 1.4696356275303644, "percentage": 14.7, "elapsed_time": "0:18:32", "remaining_time": "1:47:39"} +{"current_steps": 364, "total_steps": 2470, "loss": 1.9596, "lr": 9.932964635268637e-06, "epoch": 1.4736842105263157, "percentage": 14.74, "elapsed_time": "0:18:35", "remaining_time": "1:47:36"} +{"current_steps": 365, "total_steps": 2470, "loss": 2.0348, "lr": 9.931806517013612e-06, "epoch": 1.4777327935222673, "percentage": 14.78, "elapsed_time": "0:18:38", "remaining_time": "1:47:33"} +{"current_steps": 366, "total_steps": 2470, "loss": 1.9226, "lr": 9.930638548972976e-06, "epoch": 1.4817813765182186, "percentage": 14.82, "elapsed_time": "0:18:41", "remaining_time": "1:47:29"} +{"current_steps": 367, "total_steps": 2470, "loss": 1.9363, "lr": 9.92946073347939e-06, "epoch": 1.48582995951417, "percentage": 14.86, "elapsed_time": "0:18:45", "remaining_time": "1:47:27"} +{"current_steps": 368, "total_steps": 2470, "loss": 1.8743, "lr": 9.92827307288518e-06, "epoch": 1.4898785425101215, 
"percentage": 14.9, "elapsed_time": "0:18:48", "remaining_time": "1:47:23"} +{"current_steps": 369, "total_steps": 2470, "loss": 1.9204, "lr": 9.927075569562342e-06, "epoch": 1.4939271255060729, "percentage": 14.94, "elapsed_time": "0:18:51", "remaining_time": "1:47:20"} +{"current_steps": 370, "total_steps": 2470, "loss": 1.8206, "lr": 9.925868225902518e-06, "epoch": 1.4979757085020242, "percentage": 14.98, "elapsed_time": "0:18:54", "remaining_time": "1:47:17"} +{"current_steps": 371, "total_steps": 2470, "loss": 1.741, "lr": 9.924651044317017e-06, "epoch": 1.5020242914979756, "percentage": 15.02, "elapsed_time": "0:18:57", "remaining_time": "1:47:14"} +{"current_steps": 372, "total_steps": 2470, "loss": 2.0195, "lr": 9.923424027236786e-06, "epoch": 1.5060728744939271, "percentage": 15.06, "elapsed_time": "0:19:00", "remaining_time": "1:47:10"} +{"current_steps": 373, "total_steps": 2470, "loss": 2.0682, "lr": 9.922187177112422e-06, "epoch": 1.5101214574898787, "percentage": 15.1, "elapsed_time": "0:19:03", "remaining_time": "1:47:07"} +{"current_steps": 374, "total_steps": 2470, "loss": 2.0098, "lr": 9.920940496414153e-06, "epoch": 1.5141700404858298, "percentage": 15.14, "elapsed_time": "0:19:06", "remaining_time": "1:47:07"} +{"current_steps": 375, "total_steps": 2470, "loss": 2.041, "lr": 9.919683987631849e-06, "epoch": 1.5182186234817814, "percentage": 15.18, "elapsed_time": "0:19:09", "remaining_time": "1:47:04"} +{"current_steps": 376, "total_steps": 2470, "loss": 1.9668, "lr": 9.918417653275004e-06, "epoch": 1.522267206477733, "percentage": 15.22, "elapsed_time": "0:19:12", "remaining_time": "1:47:01"} +{"current_steps": 377, "total_steps": 2470, "loss": 1.737, "lr": 9.917141495872733e-06, "epoch": 1.526315789473684, "percentage": 15.26, "elapsed_time": "0:19:15", "remaining_time": "1:46:57"} +{"current_steps": 378, "total_steps": 2470, "loss": 1.8672, "lr": 9.915855517973776e-06, "epoch": 1.5303643724696356, "percentage": 15.3, "elapsed_time": "0:19:19", "remaining_time": "1:46:54"} +{"current_steps": 379, "total_steps": 2470, "loss": 2.0038, "lr": 9.914559722146483e-06, "epoch": 1.5344129554655872, "percentage": 15.34, "elapsed_time": "0:19:22", "remaining_time": "1:46:51"} +{"current_steps": 380, "total_steps": 2470, "loss": 2.0916, "lr": 9.913254110978812e-06, "epoch": 1.5384615384615383, "percentage": 15.38, "elapsed_time": "0:19:25", "remaining_time": "1:46:48"} +{"current_steps": 381, "total_steps": 2470, "loss": 1.9959, "lr": 9.911938687078324e-06, "epoch": 1.54251012145749, "percentage": 15.43, "elapsed_time": "0:19:28", "remaining_time": "1:46:46"} +{"current_steps": 382, "total_steps": 2470, "loss": 2.6669, "lr": 9.91061345307218e-06, "epoch": 1.5465587044534415, "percentage": 15.47, "elapsed_time": "0:19:31", "remaining_time": "1:46:43"} +{"current_steps": 383, "total_steps": 2470, "loss": 2.7524, "lr": 9.909278411607134e-06, "epoch": 1.5506072874493926, "percentage": 15.51, "elapsed_time": "0:19:34", "remaining_time": "1:46:40"} +{"current_steps": 384, "total_steps": 2470, "loss": 3.2784, "lr": 9.90793356534952e-06, "epoch": 1.5546558704453441, "percentage": 15.55, "elapsed_time": "0:19:37", "remaining_time": "1:46:36"} +{"current_steps": 385, "total_steps": 2470, "loss": 1.9441, "lr": 9.906578916985267e-06, "epoch": 1.5587044534412957, "percentage": 15.59, "elapsed_time": "0:19:40", "remaining_time": "1:46:33"} +{"current_steps": 386, "total_steps": 2470, "loss": 1.84, "lr": 9.90521446921987e-06, "epoch": 1.5627530364372468, "percentage": 15.63, "elapsed_time": 
"0:19:43", "remaining_time": "1:46:30"} +{"current_steps": 387, "total_steps": 2470, "loss": 2.0999, "lr": 9.9038402247784e-06, "epoch": 1.5668016194331984, "percentage": 15.67, "elapsed_time": "0:19:46", "remaining_time": "1:46:27"} +{"current_steps": 388, "total_steps": 2470, "loss": 1.7455, "lr": 9.90245618640549e-06, "epoch": 1.5708502024291497, "percentage": 15.71, "elapsed_time": "0:19:49", "remaining_time": "1:46:24"} +{"current_steps": 389, "total_steps": 2470, "loss": 2.1349, "lr": 9.90106235686534e-06, "epoch": 1.574898785425101, "percentage": 15.75, "elapsed_time": "0:19:52", "remaining_time": "1:46:20"} +{"current_steps": 390, "total_steps": 2470, "loss": 1.8406, "lr": 9.8996587389417e-06, "epoch": 1.5789473684210527, "percentage": 15.79, "elapsed_time": "0:19:55", "remaining_time": "1:46:17"} +{"current_steps": 391, "total_steps": 2470, "loss": 2.1231, "lr": 9.89824533543787e-06, "epoch": 1.582995951417004, "percentage": 15.83, "elapsed_time": "0:19:58", "remaining_time": "1:46:14"} +{"current_steps": 392, "total_steps": 2470, "loss": 1.9727, "lr": 9.896822149176695e-06, "epoch": 1.5870445344129553, "percentage": 15.87, "elapsed_time": "0:20:01", "remaining_time": "1:46:11"} +{"current_steps": 393, "total_steps": 2470, "loss": 1.9829, "lr": 9.895389183000557e-06, "epoch": 1.591093117408907, "percentage": 15.91, "elapsed_time": "0:20:04", "remaining_time": "1:46:07"} +{"current_steps": 394, "total_steps": 2470, "loss": 1.648, "lr": 9.893946439771369e-06, "epoch": 1.5951417004048583, "percentage": 15.95, "elapsed_time": "0:20:07", "remaining_time": "1:46:04"} +{"current_steps": 395, "total_steps": 2470, "loss": 1.6858, "lr": 9.892493922370575e-06, "epoch": 1.5991902834008096, "percentage": 15.99, "elapsed_time": "0:20:10", "remaining_time": "1:46:01"} +{"current_steps": 396, "total_steps": 2470, "loss": 1.8744, "lr": 9.891031633699135e-06, "epoch": 1.6032388663967612, "percentage": 16.03, "elapsed_time": "0:20:13", "remaining_time": "1:45:57"} +{"current_steps": 397, "total_steps": 2470, "loss": 1.7732, "lr": 9.88955957667753e-06, "epoch": 1.6072874493927125, "percentage": 16.07, "elapsed_time": "0:20:16", "remaining_time": "1:45:54"} +{"current_steps": 398, "total_steps": 2470, "loss": 2.0753, "lr": 9.888077754245741e-06, "epoch": 1.6113360323886639, "percentage": 16.11, "elapsed_time": "0:20:19", "remaining_time": "1:45:51"} +{"current_steps": 399, "total_steps": 2470, "loss": 1.9333, "lr": 9.886586169363267e-06, "epoch": 1.6153846153846154, "percentage": 16.15, "elapsed_time": "0:20:22", "remaining_time": "1:45:47"} +{"current_steps": 400, "total_steps": 2470, "loss": 1.8167, "lr": 9.885084825009085e-06, "epoch": 1.6194331983805668, "percentage": 16.19, "elapsed_time": "0:20:26", "remaining_time": "1:45:44"} +{"current_steps": 401, "total_steps": 2470, "loss": 2.1783, "lr": 9.883573724181683e-06, "epoch": 1.623481781376518, "percentage": 16.23, "elapsed_time": "0:20:29", "remaining_time": "1:45:41"} +{"current_steps": 402, "total_steps": 2470, "loss": 1.9676, "lr": 9.882052869899024e-06, "epoch": 1.6275303643724697, "percentage": 16.28, "elapsed_time": "0:20:32", "remaining_time": "1:45:38"} +{"current_steps": 403, "total_steps": 2470, "loss": 2.154, "lr": 9.880522265198548e-06, "epoch": 1.631578947368421, "percentage": 16.32, "elapsed_time": "0:20:35", "remaining_time": "1:45:34"} +{"current_steps": 404, "total_steps": 2470, "loss": 1.8629, "lr": 9.878981913137178e-06, "epoch": 1.6356275303643724, "percentage": 16.36, "elapsed_time": "0:20:38", "remaining_time": "1:45:31"} 
+{"current_steps": 405, "total_steps": 2470, "loss": 2.0544, "lr": 9.877431816791299e-06, "epoch": 1.639676113360324, "percentage": 16.4, "elapsed_time": "0:20:41", "remaining_time": "1:45:28"} +{"current_steps": 406, "total_steps": 2470, "loss": 2.0126, "lr": 9.875871979256754e-06, "epoch": 1.6437246963562753, "percentage": 16.44, "elapsed_time": "0:20:44", "remaining_time": "1:45:24"} +{"current_steps": 407, "total_steps": 2470, "loss": 1.9896, "lr": 9.87430240364885e-06, "epoch": 1.6477732793522266, "percentage": 16.48, "elapsed_time": "0:20:47", "remaining_time": "1:45:21"} +{"current_steps": 408, "total_steps": 2470, "loss": 1.8537, "lr": 9.872723093102332e-06, "epoch": 1.6518218623481782, "percentage": 16.52, "elapsed_time": "0:20:50", "remaining_time": "1:45:18"} +{"current_steps": 409, "total_steps": 2470, "loss": 2.0636, "lr": 9.871134050771398e-06, "epoch": 1.6558704453441295, "percentage": 16.56, "elapsed_time": "0:20:53", "remaining_time": "1:45:14"} +{"current_steps": 410, "total_steps": 2470, "loss": 1.892, "lr": 9.869535279829674e-06, "epoch": 1.6599190283400809, "percentage": 16.6, "elapsed_time": "0:20:56", "remaining_time": "1:45:11"} +{"current_steps": 411, "total_steps": 2470, "loss": 2.0106, "lr": 9.867926783470221e-06, "epoch": 1.6639676113360324, "percentage": 16.64, "elapsed_time": "0:20:59", "remaining_time": "1:45:08"} +{"current_steps": 412, "total_steps": 2470, "loss": 2.0453, "lr": 9.866308564905523e-06, "epoch": 1.6680161943319838, "percentage": 16.68, "elapsed_time": "0:21:02", "remaining_time": "1:45:05"} +{"current_steps": 413, "total_steps": 2470, "loss": 1.9541, "lr": 9.864680627367476e-06, "epoch": 1.6720647773279351, "percentage": 16.72, "elapsed_time": "0:21:05", "remaining_time": "1:45:02"} +{"current_steps": 414, "total_steps": 2470, "loss": 1.9078, "lr": 9.863042974107395e-06, "epoch": 1.6761133603238867, "percentage": 16.76, "elapsed_time": "0:21:08", "remaining_time": "1:44:58"} +{"current_steps": 415, "total_steps": 2470, "loss": 2.0498, "lr": 9.861395608395993e-06, "epoch": 1.680161943319838, "percentage": 16.8, "elapsed_time": "0:21:11", "remaining_time": "1:44:55"} +{"current_steps": 416, "total_steps": 2470, "loss": 1.8425, "lr": 9.859738533523384e-06, "epoch": 1.6842105263157894, "percentage": 16.84, "elapsed_time": "0:21:14", "remaining_time": "1:44:52"} +{"current_steps": 417, "total_steps": 2470, "loss": 1.9961, "lr": 9.85807175279907e-06, "epoch": 1.688259109311741, "percentage": 16.88, "elapsed_time": "0:21:17", "remaining_time": "1:44:48"} +{"current_steps": 418, "total_steps": 2470, "loss": 1.9982, "lr": 9.856395269551941e-06, "epoch": 1.6923076923076923, "percentage": 16.92, "elapsed_time": "0:21:20", "remaining_time": "1:44:45"} +{"current_steps": 419, "total_steps": 2470, "loss": 1.8074, "lr": 9.854709087130261e-06, "epoch": 1.6963562753036436, "percentage": 16.96, "elapsed_time": "0:21:23", "remaining_time": "1:44:42"} +{"current_steps": 420, "total_steps": 2470, "loss": 2.315, "lr": 9.85301320890167e-06, "epoch": 1.7004048582995952, "percentage": 17.0, "elapsed_time": "0:21:26", "remaining_time": "1:44:39"} +{"current_steps": 421, "total_steps": 2470, "loss": 2.0698, "lr": 9.851307638253167e-06, "epoch": 1.7044534412955465, "percentage": 17.04, "elapsed_time": "0:21:29", "remaining_time": "1:44:35"} +{"current_steps": 422, "total_steps": 2470, "loss": 1.7238, "lr": 9.849592378591113e-06, "epoch": 1.708502024291498, "percentage": 17.09, "elapsed_time": "0:21:33", "remaining_time": "1:44:35"} +{"current_steps": 423, "total_steps": 
2470, "loss": 1.881, "lr": 9.847867433341218e-06, "epoch": 1.7125506072874495, "percentage": 17.13, "elapsed_time": "0:21:36", "remaining_time": "1:44:32"} +{"current_steps": 424, "total_steps": 2470, "loss": 1.9658, "lr": 9.846132805948534e-06, "epoch": 1.7165991902834008, "percentage": 17.17, "elapsed_time": "0:21:39", "remaining_time": "1:44:28"} +{"current_steps": 425, "total_steps": 2470, "loss": 1.873, "lr": 9.844388499877457e-06, "epoch": 1.7206477732793521, "percentage": 17.21, "elapsed_time": "0:21:42", "remaining_time": "1:44:25"} +{"current_steps": 426, "total_steps": 2470, "loss": 1.9664, "lr": 9.842634518611705e-06, "epoch": 1.7246963562753037, "percentage": 17.25, "elapsed_time": "0:21:45", "remaining_time": "1:44:22"} +{"current_steps": 427, "total_steps": 2470, "loss": 2.1073, "lr": 9.840870865654323e-06, "epoch": 1.728744939271255, "percentage": 17.29, "elapsed_time": "0:21:48", "remaining_time": "1:44:19"} +{"current_steps": 428, "total_steps": 2470, "loss": 1.9957, "lr": 9.839097544527674e-06, "epoch": 1.7327935222672064, "percentage": 17.33, "elapsed_time": "0:21:51", "remaining_time": "1:44:15"} +{"current_steps": 429, "total_steps": 2470, "loss": 2.0381, "lr": 9.837314558773427e-06, "epoch": 1.736842105263158, "percentage": 17.37, "elapsed_time": "0:21:54", "remaining_time": "1:44:14"} +{"current_steps": 430, "total_steps": 2470, "loss": 2.6976, "lr": 9.835521911952554e-06, "epoch": 1.7408906882591093, "percentage": 17.41, "elapsed_time": "0:21:57", "remaining_time": "1:44:11"} +{"current_steps": 431, "total_steps": 2470, "loss": 2.0715, "lr": 9.833719607645325e-06, "epoch": 1.7449392712550607, "percentage": 17.45, "elapsed_time": "0:22:00", "remaining_time": "1:44:08"} +{"current_steps": 432, "total_steps": 2470, "loss": 1.9002, "lr": 9.831907649451291e-06, "epoch": 1.7489878542510122, "percentage": 17.49, "elapsed_time": "0:22:03", "remaining_time": "1:44:04"} +{"current_steps": 433, "total_steps": 2470, "loss": 1.7871, "lr": 9.830086040989294e-06, "epoch": 1.7530364372469636, "percentage": 17.53, "elapsed_time": "0:22:06", "remaining_time": "1:44:01"} +{"current_steps": 434, "total_steps": 2470, "loss": 1.9962, "lr": 9.82825478589744e-06, "epoch": 1.757085020242915, "percentage": 17.57, "elapsed_time": "0:22:09", "remaining_time": "1:43:58"} +{"current_steps": 435, "total_steps": 2470, "loss": 2.9222, "lr": 9.826413887833103e-06, "epoch": 1.7611336032388665, "percentage": 17.61, "elapsed_time": "0:22:12", "remaining_time": "1:43:55"} +{"current_steps": 436, "total_steps": 2470, "loss": 2.8461, "lr": 9.824563350472922e-06, "epoch": 1.7651821862348178, "percentage": 17.65, "elapsed_time": "0:22:15", "remaining_time": "1:43:51"} +{"current_steps": 437, "total_steps": 2470, "loss": 2.7384, "lr": 9.822703177512783e-06, "epoch": 1.7692307692307692, "percentage": 17.69, "elapsed_time": "0:22:18", "remaining_time": "1:43:48"} +{"current_steps": 438, "total_steps": 2470, "loss": 1.9939, "lr": 9.820833372667813e-06, "epoch": 1.7732793522267207, "percentage": 17.73, "elapsed_time": "0:22:21", "remaining_time": "1:43:45"} +{"current_steps": 439, "total_steps": 2470, "loss": 2.1821, "lr": 9.818953939672382e-06, "epoch": 1.777327935222672, "percentage": 17.77, "elapsed_time": "0:22:24", "remaining_time": "1:43:42"} +{"current_steps": 440, "total_steps": 2470, "loss": 2.2096, "lr": 9.817064882280085e-06, "epoch": 1.7813765182186234, "percentage": 17.81, "elapsed_time": "0:22:27", "remaining_time": "1:43:38"} +{"current_steps": 441, "total_steps": 2470, "loss": 2.0038, "lr": 
9.815166204263743e-06, "epoch": 1.785425101214575, "percentage": 17.85, "elapsed_time": "0:22:30", "remaining_time": "1:43:35"} +{"current_steps": 442, "total_steps": 2470, "loss": 1.887, "lr": 9.813257909415384e-06, "epoch": 1.7894736842105263, "percentage": 17.89, "elapsed_time": "0:22:33", "remaining_time": "1:43:32"} +{"current_steps": 443, "total_steps": 2470, "loss": 2.0549, "lr": 9.811340001546252e-06, "epoch": 1.7935222672064777, "percentage": 17.94, "elapsed_time": "0:22:36", "remaining_time": "1:43:29"} +{"current_steps": 444, "total_steps": 2470, "loss": 2.077, "lr": 9.809412484486785e-06, "epoch": 1.7975708502024292, "percentage": 17.98, "elapsed_time": "0:22:40", "remaining_time": "1:43:25"} +{"current_steps": 445, "total_steps": 2470, "loss": 1.8171, "lr": 9.80747536208661e-06, "epoch": 1.8016194331983806, "percentage": 18.02, "elapsed_time": "0:22:43", "remaining_time": "1:43:22"} +{"current_steps": 446, "total_steps": 2470, "loss": 1.709, "lr": 9.805528638214543e-06, "epoch": 1.805668016194332, "percentage": 18.06, "elapsed_time": "0:22:46", "remaining_time": "1:43:19"} +{"current_steps": 447, "total_steps": 2470, "loss": 2.005, "lr": 9.803572316758573e-06, "epoch": 1.8097165991902835, "percentage": 18.1, "elapsed_time": "0:22:49", "remaining_time": "1:43:16"} +{"current_steps": 448, "total_steps": 2470, "loss": 2.0437, "lr": 9.801606401625857e-06, "epoch": 1.8137651821862348, "percentage": 18.14, "elapsed_time": "0:22:52", "remaining_time": "1:43:12"} +{"current_steps": 449, "total_steps": 2470, "loss": 1.8053, "lr": 9.799630896742716e-06, "epoch": 1.8178137651821862, "percentage": 18.18, "elapsed_time": "0:22:55", "remaining_time": "1:43:09"} +{"current_steps": 450, "total_steps": 2470, "loss": 2.6057, "lr": 9.797645806054617e-06, "epoch": 1.8218623481781377, "percentage": 18.22, "elapsed_time": "0:22:58", "remaining_time": "1:43:06"} +{"current_steps": 451, "total_steps": 2470, "loss": 4.1742, "lr": 9.79565113352618e-06, "epoch": 1.825910931174089, "percentage": 18.26, "elapsed_time": "0:23:01", "remaining_time": "1:43:02"} +{"current_steps": 452, "total_steps": 2470, "loss": 1.9001, "lr": 9.793646883141155e-06, "epoch": 1.8299595141700404, "percentage": 18.3, "elapsed_time": "0:23:04", "remaining_time": "1:42:59"} +{"current_steps": 453, "total_steps": 2470, "loss": 1.7789, "lr": 9.791633058902424e-06, "epoch": 1.834008097165992, "percentage": 18.34, "elapsed_time": "0:23:07", "remaining_time": "1:42:56"} +{"current_steps": 454, "total_steps": 2470, "loss": 1.8425, "lr": 9.789609664831988e-06, "epoch": 1.8380566801619433, "percentage": 18.38, "elapsed_time": "0:23:10", "remaining_time": "1:42:53"} +{"current_steps": 455, "total_steps": 2470, "loss": 1.8701, "lr": 9.787576704970965e-06, "epoch": 1.8421052631578947, "percentage": 18.42, "elapsed_time": "0:23:13", "remaining_time": "1:42:49"} +{"current_steps": 456, "total_steps": 2470, "loss": 1.8468, "lr": 9.785534183379571e-06, "epoch": 1.8461538461538463, "percentage": 18.46, "elapsed_time": "0:23:16", "remaining_time": "1:42:46"} +{"current_steps": 457, "total_steps": 2470, "loss": 1.8772, "lr": 9.783482104137127e-06, "epoch": 1.8502024291497976, "percentage": 18.5, "elapsed_time": "0:23:19", "remaining_time": "1:42:43"} +{"current_steps": 458, "total_steps": 2470, "loss": 1.9477, "lr": 9.781420471342035e-06, "epoch": 1.854251012145749, "percentage": 18.54, "elapsed_time": "0:23:22", "remaining_time": "1:42:40"} +{"current_steps": 459, "total_steps": 2470, "loss": 1.8995, "lr": 9.779349289111781e-06, "epoch": 
1.8582995951417005, "percentage": 18.58, "elapsed_time": "0:23:25", "remaining_time": "1:42:36"} +{"current_steps": 460, "total_steps": 2470, "loss": 1.9406, "lr": 9.777268561582921e-06, "epoch": 1.8623481781376519, "percentage": 18.62, "elapsed_time": "0:23:28", "remaining_time": "1:42:33"} +{"current_steps": 461, "total_steps": 2470, "loss": 1.9325, "lr": 9.77517829291108e-06, "epoch": 1.8663967611336032, "percentage": 18.66, "elapsed_time": "0:23:31", "remaining_time": "1:42:30"} +{"current_steps": 462, "total_steps": 2470, "loss": 2.4038, "lr": 9.773078487270932e-06, "epoch": 1.8704453441295548, "percentage": 18.7, "elapsed_time": "0:23:34", "remaining_time": "1:42:27"} +{"current_steps": 463, "total_steps": 2470, "loss": 2.3187, "lr": 9.770969148856202e-06, "epoch": 1.874493927125506, "percentage": 18.74, "elapsed_time": "0:23:37", "remaining_time": "1:42:24"} +{"current_steps": 464, "total_steps": 2470, "loss": 2.1105, "lr": 9.768850281879651e-06, "epoch": 1.8785425101214575, "percentage": 18.79, "elapsed_time": "0:23:40", "remaining_time": "1:42:20"} +{"current_steps": 465, "total_steps": 2470, "loss": 1.9824, "lr": 9.766721890573075e-06, "epoch": 1.882591093117409, "percentage": 18.83, "elapsed_time": "0:23:43", "remaining_time": "1:42:17"} +{"current_steps": 466, "total_steps": 2470, "loss": 1.5205, "lr": 9.764583979187288e-06, "epoch": 1.8866396761133604, "percentage": 18.87, "elapsed_time": "0:23:46", "remaining_time": "1:42:14"} +{"current_steps": 467, "total_steps": 2470, "loss": 1.9872, "lr": 9.762436551992117e-06, "epoch": 1.8906882591093117, "percentage": 18.91, "elapsed_time": "0:23:49", "remaining_time": "1:42:11"} +{"current_steps": 468, "total_steps": 2470, "loss": 2.0814, "lr": 9.760279613276397e-06, "epoch": 1.8947368421052633, "percentage": 18.95, "elapsed_time": "0:23:52", "remaining_time": "1:42:07"} +{"current_steps": 469, "total_steps": 2470, "loss": 1.7849, "lr": 9.75811316734796e-06, "epoch": 1.8987854251012146, "percentage": 18.99, "elapsed_time": "0:23:55", "remaining_time": "1:42:04"} +{"current_steps": 470, "total_steps": 2470, "loss": 1.8179, "lr": 9.755937218533622e-06, "epoch": 1.902834008097166, "percentage": 19.03, "elapsed_time": "0:23:59", "remaining_time": "1:42:04"} +{"current_steps": 471, "total_steps": 2470, "loss": 2.0286, "lr": 9.753751771179177e-06, "epoch": 1.9068825910931175, "percentage": 19.07, "elapsed_time": "0:24:02", "remaining_time": "1:42:00"} +{"current_steps": 472, "total_steps": 2470, "loss": 1.5547, "lr": 9.751556829649398e-06, "epoch": 1.9109311740890689, "percentage": 19.11, "elapsed_time": "0:24:05", "remaining_time": "1:41:57"} +{"current_steps": 473, "total_steps": 2470, "loss": 1.733, "lr": 9.74935239832801e-06, "epoch": 1.9149797570850202, "percentage": 19.15, "elapsed_time": "0:24:08", "remaining_time": "1:41:54"} +{"current_steps": 474, "total_steps": 2470, "loss": 1.767, "lr": 9.747138481617695e-06, "epoch": 1.9190283400809718, "percentage": 19.19, "elapsed_time": "0:24:11", "remaining_time": "1:41:51"} +{"current_steps": 475, "total_steps": 2470, "loss": 2.0018, "lr": 9.74491508394008e-06, "epoch": 1.9230769230769231, "percentage": 19.23, "elapsed_time": "0:24:14", "remaining_time": "1:41:48"} +{"current_steps": 476, "total_steps": 2470, "loss": 1.8865, "lr": 9.742682209735727e-06, "epoch": 1.9271255060728745, "percentage": 19.27, "elapsed_time": "0:24:17", "remaining_time": "1:41:44"} +{"current_steps": 477, "total_steps": 2470, "loss": 1.9105, "lr": 9.740439863464127e-06, "epoch": 1.931174089068826, "percentage": 
19.31, "elapsed_time": "0:24:20", "remaining_time": "1:41:43"} +{"current_steps": 478, "total_steps": 2470, "loss": 1.7676, "lr": 9.738188049603679e-06, "epoch": 1.9352226720647774, "percentage": 19.35, "elapsed_time": "0:24:23", "remaining_time": "1:41:39"} +{"current_steps": 479, "total_steps": 2470, "loss": 1.6493, "lr": 9.735926772651703e-06, "epoch": 1.9392712550607287, "percentage": 19.39, "elapsed_time": "0:24:26", "remaining_time": "1:41:36"} +{"current_steps": 480, "total_steps": 2470, "loss": 1.9464, "lr": 9.73365603712441e-06, "epoch": 1.9433198380566803, "percentage": 19.43, "elapsed_time": "0:24:29", "remaining_time": "1:41:33"} +{"current_steps": 481, "total_steps": 2470, "loss": 1.6826, "lr": 9.731375847556905e-06, "epoch": 1.9473684210526314, "percentage": 19.47, "elapsed_time": "0:24:32", "remaining_time": "1:41:30"} +{"current_steps": 482, "total_steps": 2470, "loss": 1.7014, "lr": 9.729086208503174e-06, "epoch": 1.951417004048583, "percentage": 19.51, "elapsed_time": "0:24:35", "remaining_time": "1:41:27"} +{"current_steps": 483, "total_steps": 2470, "loss": 1.9583, "lr": 9.726787124536077e-06, "epoch": 1.9554655870445345, "percentage": 19.55, "elapsed_time": "0:24:38", "remaining_time": "1:41:23"} +{"current_steps": 484, "total_steps": 2470, "loss": 1.7925, "lr": 9.724478600247333e-06, "epoch": 1.9595141700404857, "percentage": 19.6, "elapsed_time": "0:24:41", "remaining_time": "1:41:20"} +{"current_steps": 485, "total_steps": 2470, "loss": 1.8932, "lr": 9.722160640247523e-06, "epoch": 1.9635627530364372, "percentage": 19.64, "elapsed_time": "0:24:44", "remaining_time": "1:41:17"} +{"current_steps": 486, "total_steps": 2470, "loss": 2.1332, "lr": 9.719833249166061e-06, "epoch": 1.9676113360323888, "percentage": 19.68, "elapsed_time": "0:24:47", "remaining_time": "1:41:14"} +{"current_steps": 487, "total_steps": 2470, "loss": 2.0526, "lr": 9.717496431651212e-06, "epoch": 1.97165991902834, "percentage": 19.72, "elapsed_time": "0:24:51", "remaining_time": "1:41:11"} +{"current_steps": 488, "total_steps": 2470, "loss": 1.8783, "lr": 9.715150192370054e-06, "epoch": 1.9757085020242915, "percentage": 19.76, "elapsed_time": "0:24:54", "remaining_time": "1:41:07"} +{"current_steps": 489, "total_steps": 2470, "loss": 1.9859, "lr": 9.712794536008488e-06, "epoch": 1.979757085020243, "percentage": 19.8, "elapsed_time": "0:24:57", "remaining_time": "1:41:04"} +{"current_steps": 490, "total_steps": 2470, "loss": 2.0382, "lr": 9.710429467271221e-06, "epoch": 1.9838056680161942, "percentage": 19.84, "elapsed_time": "0:25:00", "remaining_time": "1:41:01"} +{"current_steps": 491, "total_steps": 2470, "loss": 1.8377, "lr": 9.708054990881763e-06, "epoch": 1.9878542510121457, "percentage": 19.88, "elapsed_time": "0:25:03", "remaining_time": "1:40:58"} +{"current_steps": 492, "total_steps": 2470, "loss": 1.7694, "lr": 9.705671111582406e-06, "epoch": 1.9919028340080973, "percentage": 19.92, "elapsed_time": "0:25:06", "remaining_time": "1:40:55"} +{"current_steps": 493, "total_steps": 2470, "loss": 2.0757, "lr": 9.703277834134227e-06, "epoch": 1.9959514170040484, "percentage": 19.96, "elapsed_time": "0:25:09", "remaining_time": "1:40:52"} +{"current_steps": 494, "total_steps": 2470, "loss": 1.8875, "lr": 9.700875163317072e-06, "epoch": 2.0, "percentage": 20.0, "elapsed_time": "0:25:12", "remaining_time": "1:40:48"} +{"current_steps": 495, "total_steps": 2470, "loss": 1.9618, "lr": 9.698463103929542e-06, "epoch": 2.0040485829959516, "percentage": 20.04, "elapsed_time": "0:25:15", 
"remaining_time": "1:40:45"} +{"current_steps": 496, "total_steps": 2470, "loss": 2.0888, "lr": 9.696041660788997e-06, "epoch": 2.0080971659919027, "percentage": 20.08, "elapsed_time": "0:25:18", "remaining_time": "1:40:42"} +{"current_steps": 497, "total_steps": 2470, "loss": 1.9408, "lr": 9.693610838731532e-06, "epoch": 2.0121457489878543, "percentage": 20.12, "elapsed_time": "0:25:21", "remaining_time": "1:40:39"} +{"current_steps": 498, "total_steps": 2470, "loss": 2.0771, "lr": 9.691170642611975e-06, "epoch": 2.016194331983806, "percentage": 20.16, "elapsed_time": "0:25:24", "remaining_time": "1:40:35"} +{"current_steps": 499, "total_steps": 2470, "loss": 2.3311, "lr": 9.68872107730388e-06, "epoch": 2.020242914979757, "percentage": 20.2, "elapsed_time": "0:25:27", "remaining_time": "1:40:32"} +{"current_steps": 500, "total_steps": 2470, "loss": 2.2077, "lr": 9.686262147699507e-06, "epoch": 2.0242914979757085, "percentage": 20.24, "elapsed_time": "0:25:30", "remaining_time": "1:40:29"} +{"current_steps": 501, "total_steps": 2470, "loss": 1.8546, "lr": 9.683793858709821e-06, "epoch": 2.02834008097166, "percentage": 20.28, "elapsed_time": "0:25:33", "remaining_time": "1:40:26"} +{"current_steps": 502, "total_steps": 2470, "loss": 1.9004, "lr": 9.681316215264481e-06, "epoch": 2.032388663967611, "percentage": 20.32, "elapsed_time": "0:25:36", "remaining_time": "1:40:23"} +{"current_steps": 503, "total_steps": 2470, "loss": 2.1369, "lr": 9.678829222311827e-06, "epoch": 2.0364372469635628, "percentage": 20.36, "elapsed_time": "0:25:39", "remaining_time": "1:40:19"} +{"current_steps": 504, "total_steps": 2470, "loss": 1.9294, "lr": 9.67633288481887e-06, "epoch": 2.0404858299595143, "percentage": 20.4, "elapsed_time": "0:25:42", "remaining_time": "1:40:16"} +{"current_steps": 505, "total_steps": 2470, "loss": 1.9228, "lr": 9.67382720777129e-06, "epoch": 2.0445344129554655, "percentage": 20.45, "elapsed_time": "0:25:45", "remaining_time": "1:40:13"} +{"current_steps": 506, "total_steps": 2470, "loss": 1.9005, "lr": 9.671312196173413e-06, "epoch": 2.048582995951417, "percentage": 20.49, "elapsed_time": "0:25:48", "remaining_time": "1:40:10"} +{"current_steps": 507, "total_steps": 2470, "loss": 2.0772, "lr": 9.668787855048209e-06, "epoch": 2.0526315789473686, "percentage": 20.53, "elapsed_time": "0:25:51", "remaining_time": "1:40:07"} +{"current_steps": 508, "total_steps": 2470, "loss": 1.9259, "lr": 9.666254189437286e-06, "epoch": 2.0566801619433197, "percentage": 20.57, "elapsed_time": "0:25:54", "remaining_time": "1:40:03"} +{"current_steps": 509, "total_steps": 2470, "loss": 2.0637, "lr": 9.663711204400872e-06, "epoch": 2.0607287449392713, "percentage": 20.61, "elapsed_time": "0:25:57", "remaining_time": "1:40:00"} +{"current_steps": 510, "total_steps": 2470, "loss": 1.9998, "lr": 9.661158905017804e-06, "epoch": 2.064777327935223, "percentage": 20.65, "elapsed_time": "0:26:00", "remaining_time": "1:39:57"} +{"current_steps": 511, "total_steps": 2470, "loss": 2.1032, "lr": 9.658597296385527e-06, "epoch": 2.068825910931174, "percentage": 20.69, "elapsed_time": "0:26:03", "remaining_time": "1:39:54"} +{"current_steps": 512, "total_steps": 2470, "loss": 1.9957, "lr": 9.656026383620076e-06, "epoch": 2.0728744939271255, "percentage": 20.73, "elapsed_time": "0:26:06", "remaining_time": "1:39:51"} +{"current_steps": 513, "total_steps": 2470, "loss": 1.9291, "lr": 9.653446171856069e-06, "epoch": 2.076923076923077, "percentage": 20.77, "elapsed_time": "0:26:09", "remaining_time": "1:39:47"} 
+{"current_steps": 514, "total_steps": 2470, "loss": 1.9435, "lr": 9.650856666246693e-06, "epoch": 2.080971659919028, "percentage": 20.81, "elapsed_time": "0:26:12", "remaining_time": "1:39:44"} +{"current_steps": 515, "total_steps": 2470, "loss": 1.9267, "lr": 9.6482578719637e-06, "epoch": 2.08502024291498, "percentage": 20.85, "elapsed_time": "0:26:15", "remaining_time": "1:39:41"} +{"current_steps": 516, "total_steps": 2470, "loss": 1.9435, "lr": 9.645649794197394e-06, "epoch": 2.0890688259109313, "percentage": 20.89, "elapsed_time": "0:26:18", "remaining_time": "1:39:38"} +{"current_steps": 517, "total_steps": 2470, "loss": 2.0396, "lr": 9.643032438156616e-06, "epoch": 2.0931174089068825, "percentage": 20.93, "elapsed_time": "0:26:21", "remaining_time": "1:39:35"} +{"current_steps": 518, "total_steps": 2470, "loss": 1.765, "lr": 9.640405809068743e-06, "epoch": 2.097165991902834, "percentage": 20.97, "elapsed_time": "0:26:25", "remaining_time": "1:39:34"} +{"current_steps": 519, "total_steps": 2470, "loss": 1.9292, "lr": 9.637769912179664e-06, "epoch": 2.1012145748987856, "percentage": 21.01, "elapsed_time": "0:26:28", "remaining_time": "1:39:30"} +{"current_steps": 520, "total_steps": 2470, "loss": 1.9832, "lr": 9.635124752753787e-06, "epoch": 2.1052631578947367, "percentage": 21.05, "elapsed_time": "0:26:31", "remaining_time": "1:39:27"} +{"current_steps": 521, "total_steps": 2470, "loss": 1.8461, "lr": 9.632470336074009e-06, "epoch": 2.1093117408906883, "percentage": 21.09, "elapsed_time": "0:26:34", "remaining_time": "1:39:24"} +{"current_steps": 522, "total_steps": 2470, "loss": 1.7856, "lr": 9.629806667441727e-06, "epoch": 2.11336032388664, "percentage": 21.13, "elapsed_time": "0:26:37", "remaining_time": "1:39:21"} +{"current_steps": 523, "total_steps": 2470, "loss": 1.7441, "lr": 9.627133752176809e-06, "epoch": 2.117408906882591, "percentage": 21.17, "elapsed_time": "0:26:40", "remaining_time": "1:39:18"} +{"current_steps": 524, "total_steps": 2470, "loss": 1.7615, "lr": 9.624451595617588e-06, "epoch": 2.1214574898785425, "percentage": 21.21, "elapsed_time": "0:26:43", "remaining_time": "1:39:14"} +{"current_steps": 525, "total_steps": 2470, "loss": 1.7378, "lr": 9.62176020312086e-06, "epoch": 2.125506072874494, "percentage": 21.26, "elapsed_time": "0:26:46", "remaining_time": "1:39:13"} +{"current_steps": 526, "total_steps": 2470, "loss": 1.7039, "lr": 9.619059580061862e-06, "epoch": 2.1295546558704452, "percentage": 21.3, "elapsed_time": "0:26:49", "remaining_time": "1:39:10"} +{"current_steps": 527, "total_steps": 2470, "loss": 2.0009, "lr": 9.616349731834271e-06, "epoch": 2.133603238866397, "percentage": 21.34, "elapsed_time": "0:26:53", "remaining_time": "1:39:06"} +{"current_steps": 528, "total_steps": 2470, "loss": 1.872, "lr": 9.613630663850184e-06, "epoch": 2.1376518218623484, "percentage": 21.38, "elapsed_time": "0:26:56", "remaining_time": "1:39:03"} +{"current_steps": 529, "total_steps": 2470, "loss": 1.5977, "lr": 9.610902381540115e-06, "epoch": 2.1417004048582995, "percentage": 21.42, "elapsed_time": "0:26:59", "remaining_time": "1:39:00"} +{"current_steps": 530, "total_steps": 2470, "loss": 2.0221, "lr": 9.608164890352977e-06, "epoch": 2.145748987854251, "percentage": 21.46, "elapsed_time": "0:27:02", "remaining_time": "1:38:57"} +{"current_steps": 531, "total_steps": 2470, "loss": 1.8497, "lr": 9.605418195756077e-06, "epoch": 2.1497975708502026, "percentage": 21.5, "elapsed_time": "0:27:05", "remaining_time": "1:38:54"} +{"current_steps": 532, "total_steps": 
2470, "loss": 1.9881, "lr": 9.602662303235106e-06, "epoch": 2.1538461538461537, "percentage": 21.54, "elapsed_time": "0:27:08", "remaining_time": "1:38:51"} +{"current_steps": 533, "total_steps": 2470, "loss": 2.2169, "lr": 9.599897218294122e-06, "epoch": 2.1578947368421053, "percentage": 21.58, "elapsed_time": "0:27:11", "remaining_time": "1:38:47"} +{"current_steps": 534, "total_steps": 2470, "loss": 1.8884, "lr": 9.597122946455539e-06, "epoch": 2.161943319838057, "percentage": 21.62, "elapsed_time": "0:27:14", "remaining_time": "1:38:44"} +{"current_steps": 535, "total_steps": 2470, "loss": 1.9169, "lr": 9.594339493260127e-06, "epoch": 2.165991902834008, "percentage": 21.66, "elapsed_time": "0:27:17", "remaining_time": "1:38:41"} +{"current_steps": 536, "total_steps": 2470, "loss": 2.3116, "lr": 9.591546864266983e-06, "epoch": 2.1700404858299596, "percentage": 21.7, "elapsed_time": "0:27:20", "remaining_time": "1:38:38"} +{"current_steps": 537, "total_steps": 2470, "loss": 1.7854, "lr": 9.58874506505354e-06, "epoch": 2.174089068825911, "percentage": 21.74, "elapsed_time": "0:27:23", "remaining_time": "1:38:34"} +{"current_steps": 538, "total_steps": 2470, "loss": 1.9564, "lr": 9.58593410121554e-06, "epoch": 2.1781376518218623, "percentage": 21.78, "elapsed_time": "0:27:26", "remaining_time": "1:38:31"} +{"current_steps": 539, "total_steps": 2470, "loss": 1.7449, "lr": 9.583113978367026e-06, "epoch": 2.182186234817814, "percentage": 21.82, "elapsed_time": "0:27:29", "remaining_time": "1:38:28"} +{"current_steps": 540, "total_steps": 2470, "loss": 1.8748, "lr": 9.580284702140342e-06, "epoch": 2.1862348178137654, "percentage": 21.86, "elapsed_time": "0:27:32", "remaining_time": "1:38:25"} +{"current_steps": 541, "total_steps": 2470, "loss": 1.944, "lr": 9.577446278186103e-06, "epoch": 2.1902834008097165, "percentage": 21.9, "elapsed_time": "0:27:35", "remaining_time": "1:38:22"} +{"current_steps": 542, "total_steps": 2470, "loss": 2.1136, "lr": 9.574598712173202e-06, "epoch": 2.194331983805668, "percentage": 21.94, "elapsed_time": "0:27:38", "remaining_time": "1:38:18"} +{"current_steps": 543, "total_steps": 2470, "loss": 2.1866, "lr": 9.571742009788787e-06, "epoch": 2.1983805668016196, "percentage": 21.98, "elapsed_time": "0:27:41", "remaining_time": "1:38:15"} +{"current_steps": 544, "total_steps": 2470, "loss": 1.8859, "lr": 9.568876176738251e-06, "epoch": 2.2024291497975708, "percentage": 22.02, "elapsed_time": "0:27:44", "remaining_time": "1:38:12"} +{"current_steps": 545, "total_steps": 2470, "loss": 2.1936, "lr": 9.56600121874523e-06, "epoch": 2.2064777327935223, "percentage": 22.06, "elapsed_time": "0:27:47", "remaining_time": "1:38:09"} +{"current_steps": 546, "total_steps": 2470, "loss": 2.2517, "lr": 9.563117141551574e-06, "epoch": 2.2105263157894735, "percentage": 22.11, "elapsed_time": "0:27:50", "remaining_time": "1:38:06"} +{"current_steps": 547, "total_steps": 2470, "loss": 2.041, "lr": 9.560223950917354e-06, "epoch": 2.214574898785425, "percentage": 22.15, "elapsed_time": "0:27:53", "remaining_time": "1:38:02"} +{"current_steps": 548, "total_steps": 2470, "loss": 1.8986, "lr": 9.557321652620839e-06, "epoch": 2.2186234817813766, "percentage": 22.19, "elapsed_time": "0:27:56", "remaining_time": "1:37:59"} +{"current_steps": 549, "total_steps": 2470, "loss": 1.8568, "lr": 9.554410252458489e-06, "epoch": 2.2226720647773277, "percentage": 22.23, "elapsed_time": "0:27:59", "remaining_time": "1:37:56"} +{"current_steps": 550, "total_steps": 2470, "loss": 1.9347, "lr": 
9.551489756244939e-06, "epoch": 2.2267206477732793, "percentage": 22.27, "elapsed_time": "0:28:02", "remaining_time": "1:37:53"} +{"current_steps": 551, "total_steps": 2470, "loss": 1.8809, "lr": 9.548560169812997e-06, "epoch": 2.230769230769231, "percentage": 22.31, "elapsed_time": "0:28:05", "remaining_time": "1:37:50"} +{"current_steps": 552, "total_steps": 2470, "loss": 2.0865, "lr": 9.54562149901362e-06, "epoch": 2.234817813765182, "percentage": 22.35, "elapsed_time": "0:28:08", "remaining_time": "1:37:47"} +{"current_steps": 553, "total_steps": 2470, "loss": 2.0449, "lr": 9.54267374971591e-06, "epoch": 2.2388663967611335, "percentage": 22.39, "elapsed_time": "0:28:11", "remaining_time": "1:37:43"} +{"current_steps": 554, "total_steps": 2470, "loss": 2.0083, "lr": 9.539716927807102e-06, "epoch": 2.242914979757085, "percentage": 22.43, "elapsed_time": "0:28:14", "remaining_time": "1:37:40"} +{"current_steps": 555, "total_steps": 2470, "loss": 1.8576, "lr": 9.536751039192549e-06, "epoch": 2.246963562753036, "percentage": 22.47, "elapsed_time": "0:28:17", "remaining_time": "1:37:37"} +{"current_steps": 556, "total_steps": 2470, "loss": 1.8923, "lr": 9.533776089795712e-06, "epoch": 2.251012145748988, "percentage": 22.51, "elapsed_time": "0:28:20", "remaining_time": "1:37:34"} +{"current_steps": 557, "total_steps": 2470, "loss": 1.9873, "lr": 9.530792085558151e-06, "epoch": 2.2550607287449393, "percentage": 22.55, "elapsed_time": "0:28:23", "remaining_time": "1:37:31"} +{"current_steps": 558, "total_steps": 2470, "loss": 1.8211, "lr": 9.527799032439506e-06, "epoch": 2.2591093117408905, "percentage": 22.59, "elapsed_time": "0:28:26", "remaining_time": "1:37:27"} +{"current_steps": 559, "total_steps": 2470, "loss": 1.7082, "lr": 9.524796936417495e-06, "epoch": 2.263157894736842, "percentage": 22.63, "elapsed_time": "0:28:29", "remaining_time": "1:37:24"} +{"current_steps": 560, "total_steps": 2470, "loss": 1.9216, "lr": 9.521785803487888e-06, "epoch": 2.2672064777327936, "percentage": 22.67, "elapsed_time": "0:28:32", "remaining_time": "1:37:21"} +{"current_steps": 561, "total_steps": 2470, "loss": 1.9723, "lr": 9.518765639664512e-06, "epoch": 2.2712550607287447, "percentage": 22.71, "elapsed_time": "0:28:35", "remaining_time": "1:37:18"} +{"current_steps": 562, "total_steps": 2470, "loss": 1.953, "lr": 9.515736450979224e-06, "epoch": 2.2753036437246963, "percentage": 22.75, "elapsed_time": "0:28:38", "remaining_time": "1:37:15"} +{"current_steps": 563, "total_steps": 2470, "loss": 1.991, "lr": 9.512698243481914e-06, "epoch": 2.279352226720648, "percentage": 22.79, "elapsed_time": "0:28:41", "remaining_time": "1:37:12"} +{"current_steps": 564, "total_steps": 2470, "loss": 1.7088, "lr": 9.509651023240472e-06, "epoch": 2.283400809716599, "percentage": 22.83, "elapsed_time": "0:28:44", "remaining_time": "1:37:09"} +{"current_steps": 565, "total_steps": 2470, "loss": 1.7975, "lr": 9.5065947963408e-06, "epoch": 2.2874493927125505, "percentage": 22.87, "elapsed_time": "0:28:47", "remaining_time": "1:37:05"} +{"current_steps": 566, "total_steps": 2470, "loss": 1.6643, "lr": 9.50352956888678e-06, "epoch": 2.291497975708502, "percentage": 22.91, "elapsed_time": "0:28:51", "remaining_time": "1:37:04"} +{"current_steps": 567, "total_steps": 2470, "loss": 2.0078, "lr": 9.500455347000273e-06, "epoch": 2.2955465587044532, "percentage": 22.96, "elapsed_time": "0:28:54", "remaining_time": "1:37:01"} +{"current_steps": 568, "total_steps": 2470, "loss": 2.0653, "lr": 9.497372136821103e-06, "epoch": 
2.299595141700405, "percentage": 23.0, "elapsed_time": "0:28:57", "remaining_time": "1:36:58"} +{"current_steps": 569, "total_steps": 2470, "loss": 1.8078, "lr": 9.49427994450705e-06, "epoch": 2.3036437246963564, "percentage": 23.04, "elapsed_time": "0:29:00", "remaining_time": "1:36:55"} +{"current_steps": 570, "total_steps": 2470, "loss": 1.8219, "lr": 9.491178776233825e-06, "epoch": 2.3076923076923075, "percentage": 23.08, "elapsed_time": "0:29:03", "remaining_time": "1:36:52"} +{"current_steps": 571, "total_steps": 2470, "loss": 1.8582, "lr": 9.488068638195072e-06, "epoch": 2.311740890688259, "percentage": 23.12, "elapsed_time": "0:29:06", "remaining_time": "1:36:48"} +{"current_steps": 572, "total_steps": 2470, "loss": 1.8562, "lr": 9.484949536602343e-06, "epoch": 2.3157894736842106, "percentage": 23.16, "elapsed_time": "0:29:09", "remaining_time": "1:36:45"} +{"current_steps": 573, "total_steps": 2470, "loss": 1.7431, "lr": 9.481821477685102e-06, "epoch": 2.3198380566801617, "percentage": 23.2, "elapsed_time": "0:29:13", "remaining_time": "1:36:43"} +{"current_steps": 574, "total_steps": 2470, "loss": 1.7586, "lr": 9.478684467690693e-06, "epoch": 2.3238866396761133, "percentage": 23.24, "elapsed_time": "0:29:16", "remaining_time": "1:36:40"} +{"current_steps": 575, "total_steps": 2470, "loss": 1.7694, "lr": 9.47553851288434e-06, "epoch": 2.327935222672065, "percentage": 23.28, "elapsed_time": "0:29:19", "remaining_time": "1:36:37"} +{"current_steps": 576, "total_steps": 2470, "loss": 1.6545, "lr": 9.472383619549133e-06, "epoch": 2.331983805668016, "percentage": 23.32, "elapsed_time": "0:29:22", "remaining_time": "1:36:34"} +{"current_steps": 577, "total_steps": 2470, "loss": 1.6896, "lr": 9.469219793986016e-06, "epoch": 2.3360323886639676, "percentage": 23.36, "elapsed_time": "0:29:25", "remaining_time": "1:36:31"} +{"current_steps": 578, "total_steps": 2470, "loss": 1.6272, "lr": 9.466047042513767e-06, "epoch": 2.340080971659919, "percentage": 23.4, "elapsed_time": "0:29:28", "remaining_time": "1:36:28"} +{"current_steps": 579, "total_steps": 2470, "loss": 1.7176, "lr": 9.462865371468994e-06, "epoch": 2.3441295546558703, "percentage": 23.44, "elapsed_time": "0:29:31", "remaining_time": "1:36:24"} +{"current_steps": 580, "total_steps": 2470, "loss": 1.9005, "lr": 9.459674787206117e-06, "epoch": 2.348178137651822, "percentage": 23.48, "elapsed_time": "0:29:34", "remaining_time": "1:36:21"} +{"current_steps": 581, "total_steps": 2470, "loss": 1.7493, "lr": 9.45647529609736e-06, "epoch": 2.3522267206477734, "percentage": 23.52, "elapsed_time": "0:29:37", "remaining_time": "1:36:18"} +{"current_steps": 582, "total_steps": 2470, "loss": 1.4856, "lr": 9.453266904532737e-06, "epoch": 2.3562753036437245, "percentage": 23.56, "elapsed_time": "0:29:40", "remaining_time": "1:36:15"} +{"current_steps": 583, "total_steps": 2470, "loss": 1.8312, "lr": 9.450049618920034e-06, "epoch": 2.360323886639676, "percentage": 23.6, "elapsed_time": "0:29:43", "remaining_time": "1:36:12"} +{"current_steps": 584, "total_steps": 2470, "loss": 1.9048, "lr": 9.4468234456848e-06, "epoch": 2.3643724696356276, "percentage": 23.64, "elapsed_time": "0:29:46", "remaining_time": "1:36:08"} +{"current_steps": 585, "total_steps": 2470, "loss": 1.8077, "lr": 9.44358839127034e-06, "epoch": 2.3684210526315788, "percentage": 23.68, "elapsed_time": "0:29:49", "remaining_time": "1:36:05"} +{"current_steps": 586, "total_steps": 2470, "loss": 1.9556, "lr": 9.44034446213769e-06, "epoch": 2.3724696356275303, "percentage": 23.72, 
"elapsed_time": "0:29:52", "remaining_time": "1:36:02"} +{"current_steps": 587, "total_steps": 2470, "loss": 1.5064, "lr": 9.437091664765611e-06, "epoch": 2.376518218623482, "percentage": 23.77, "elapsed_time": "0:29:55", "remaining_time": "1:35:59"} +{"current_steps": 588, "total_steps": 2470, "loss": 1.69, "lr": 9.433830005650582e-06, "epoch": 2.380566801619433, "percentage": 23.81, "elapsed_time": "0:29:58", "remaining_time": "1:35:56"} +{"current_steps": 589, "total_steps": 2470, "loss": 1.5552, "lr": 9.430559491306777e-06, "epoch": 2.3846153846153846, "percentage": 23.85, "elapsed_time": "0:30:01", "remaining_time": "1:35:53"} +{"current_steps": 590, "total_steps": 2470, "loss": 1.6106, "lr": 9.427280128266049e-06, "epoch": 2.388663967611336, "percentage": 23.89, "elapsed_time": "0:30:04", "remaining_time": "1:35:49"} +{"current_steps": 591, "total_steps": 2470, "loss": 1.7636, "lr": 9.423991923077938e-06, "epoch": 2.3927125506072873, "percentage": 23.93, "elapsed_time": "0:30:07", "remaining_time": "1:35:46"} +{"current_steps": 592, "total_steps": 2470, "loss": 1.5786, "lr": 9.420694882309628e-06, "epoch": 2.396761133603239, "percentage": 23.97, "elapsed_time": "0:30:10", "remaining_time": "1:35:43"} +{"current_steps": 593, "total_steps": 2470, "loss": 1.6426, "lr": 9.41738901254596e-06, "epoch": 2.4008097165991904, "percentage": 24.01, "elapsed_time": "0:30:13", "remaining_time": "1:35:40"} +{"current_steps": 594, "total_steps": 2470, "loss": 1.7306, "lr": 9.414074320389403e-06, "epoch": 2.4048582995951415, "percentage": 24.05, "elapsed_time": "0:30:16", "remaining_time": "1:35:37"} +{"current_steps": 595, "total_steps": 2470, "loss": 1.821, "lr": 9.41075081246005e-06, "epoch": 2.408906882591093, "percentage": 24.09, "elapsed_time": "0:30:19", "remaining_time": "1:35:34"} +{"current_steps": 596, "total_steps": 2470, "loss": 1.6872, "lr": 9.4074184953956e-06, "epoch": 2.4129554655870447, "percentage": 24.13, "elapsed_time": "0:30:22", "remaining_time": "1:35:30"} +{"current_steps": 597, "total_steps": 2470, "loss": 1.9362, "lr": 9.404077375851338e-06, "epoch": 2.417004048582996, "percentage": 24.17, "elapsed_time": "0:30:25", "remaining_time": "1:35:27"} +{"current_steps": 598, "total_steps": 2470, "loss": 2.0139, "lr": 9.400727460500141e-06, "epoch": 2.4210526315789473, "percentage": 24.21, "elapsed_time": "0:30:28", "remaining_time": "1:35:24"} +{"current_steps": 599, "total_steps": 2470, "loss": 1.8485, "lr": 9.397368756032445e-06, "epoch": 2.425101214574899, "percentage": 24.25, "elapsed_time": "0:30:31", "remaining_time": "1:35:21"} +{"current_steps": 600, "total_steps": 2470, "loss": 1.9812, "lr": 9.394001269156245e-06, "epoch": 2.42914979757085, "percentage": 24.29, "elapsed_time": "0:30:34", "remaining_time": "1:35:18"} +{"current_steps": 601, "total_steps": 2470, "loss": 2.0496, "lr": 9.39062500659707e-06, "epoch": 2.4331983805668016, "percentage": 24.33, "elapsed_time": "0:30:37", "remaining_time": "1:35:15"} +{"current_steps": 602, "total_steps": 2470, "loss": 1.837, "lr": 9.38723997509798e-06, "epoch": 2.437246963562753, "percentage": 24.37, "elapsed_time": "0:30:40", "remaining_time": "1:35:11"} +{"current_steps": 603, "total_steps": 2470, "loss": 1.765, "lr": 9.383846181419547e-06, "epoch": 2.4412955465587043, "percentage": 24.41, "elapsed_time": "0:30:43", "remaining_time": "1:35:08"} +{"current_steps": 604, "total_steps": 2470, "loss": 2.0255, "lr": 9.380443632339845e-06, "epoch": 2.445344129554656, "percentage": 24.45, "elapsed_time": "0:30:46", "remaining_time": 
"1:35:05"} +{"current_steps": 605, "total_steps": 2470, "loss": 1.7942, "lr": 9.37703233465443e-06, "epoch": 2.4493927125506074, "percentage": 24.49, "elapsed_time": "0:30:49", "remaining_time": "1:35:02"} +{"current_steps": 606, "total_steps": 2470, "loss": 1.777, "lr": 9.373612295176333e-06, "epoch": 2.4534412955465585, "percentage": 24.53, "elapsed_time": "0:30:52", "remaining_time": "1:34:59"} +{"current_steps": 607, "total_steps": 2470, "loss": 2.185, "lr": 9.370183520736045e-06, "epoch": 2.45748987854251, "percentage": 24.57, "elapsed_time": "0:30:55", "remaining_time": "1:34:56"} +{"current_steps": 608, "total_steps": 2470, "loss": 2.4563, "lr": 9.366746018181503e-06, "epoch": 2.4615384615384617, "percentage": 24.62, "elapsed_time": "0:30:58", "remaining_time": "1:34:53"} +{"current_steps": 609, "total_steps": 2470, "loss": 2.0155, "lr": 9.363299794378072e-06, "epoch": 2.465587044534413, "percentage": 24.66, "elapsed_time": "0:31:01", "remaining_time": "1:34:49"} +{"current_steps": 610, "total_steps": 2470, "loss": 1.9623, "lr": 9.359844856208538e-06, "epoch": 2.4696356275303644, "percentage": 24.7, "elapsed_time": "0:31:05", "remaining_time": "1:34:46"} +{"current_steps": 611, "total_steps": 2470, "loss": 1.8725, "lr": 9.356381210573092e-06, "epoch": 2.473684210526316, "percentage": 24.74, "elapsed_time": "0:31:08", "remaining_time": "1:34:43"} +{"current_steps": 612, "total_steps": 2470, "loss": 1.9058, "lr": 9.352908864389313e-06, "epoch": 2.477732793522267, "percentage": 24.78, "elapsed_time": "0:31:11", "remaining_time": "1:34:40"} +{"current_steps": 613, "total_steps": 2470, "loss": 1.818, "lr": 9.349427824592157e-06, "epoch": 2.4817813765182186, "percentage": 24.82, "elapsed_time": "0:31:14", "remaining_time": "1:34:37"} +{"current_steps": 614, "total_steps": 2470, "loss": 1.8001, "lr": 9.345938098133946e-06, "epoch": 2.48582995951417, "percentage": 24.86, "elapsed_time": "0:31:17", "remaining_time": "1:34:36"} +{"current_steps": 615, "total_steps": 2470, "loss": 1.7282, "lr": 9.342439691984346e-06, "epoch": 2.4898785425101213, "percentage": 24.9, "elapsed_time": "0:31:20", "remaining_time": "1:34:32"} +{"current_steps": 616, "total_steps": 2470, "loss": 1.7961, "lr": 9.338932613130363e-06, "epoch": 2.493927125506073, "percentage": 24.94, "elapsed_time": "0:31:23", "remaining_time": "1:34:29"} +{"current_steps": 617, "total_steps": 2470, "loss": 1.662, "lr": 9.33541686857632e-06, "epoch": 2.4979757085020244, "percentage": 24.98, "elapsed_time": "0:31:26", "remaining_time": "1:34:26"} +{"current_steps": 618, "total_steps": 2470, "loss": 1.588, "lr": 9.331892465343851e-06, "epoch": 2.5020242914979756, "percentage": 25.02, "elapsed_time": "0:31:29", "remaining_time": "1:34:23"} +{"current_steps": 619, "total_steps": 2470, "loss": 1.8722, "lr": 9.328359410471878e-06, "epoch": 2.506072874493927, "percentage": 25.06, "elapsed_time": "0:31:32", "remaining_time": "1:34:20"} +{"current_steps": 620, "total_steps": 2470, "loss": 1.9167, "lr": 9.324817711016609e-06, "epoch": 2.5101214574898787, "percentage": 25.1, "elapsed_time": "0:31:35", "remaining_time": "1:34:17"} +{"current_steps": 621, "total_steps": 2470, "loss": 1.8743, "lr": 9.32126737405151e-06, "epoch": 2.51417004048583, "percentage": 25.14, "elapsed_time": "0:31:39", "remaining_time": "1:34:15"} +{"current_steps": 622, "total_steps": 2470, "loss": 1.89, "lr": 9.3177084066673e-06, "epoch": 2.5182186234817814, "percentage": 25.18, "elapsed_time": "0:31:42", "remaining_time": "1:34:11"} +{"current_steps": 623, "total_steps": 
2470, "loss": 1.8321, "lr": 9.31414081597194e-06, "epoch": 2.522267206477733, "percentage": 25.22, "elapsed_time": "0:31:45", "remaining_time": "1:34:08"} +{"current_steps": 624, "total_steps": 2470, "loss": 1.6178, "lr": 9.310564609090605e-06, "epoch": 2.526315789473684, "percentage": 25.26, "elapsed_time": "0:31:48", "remaining_time": "1:34:05"} +{"current_steps": 625, "total_steps": 2470, "loss": 1.718, "lr": 9.306979793165682e-06, "epoch": 2.5303643724696356, "percentage": 25.3, "elapsed_time": "0:31:51", "remaining_time": "1:34:02"} +{"current_steps": 626, "total_steps": 2470, "loss": 1.8536, "lr": 9.303386375356752e-06, "epoch": 2.534412955465587, "percentage": 25.34, "elapsed_time": "0:31:54", "remaining_time": "1:33:59"} +{"current_steps": 627, "total_steps": 2470, "loss": 2.0088, "lr": 9.299784362840578e-06, "epoch": 2.5384615384615383, "percentage": 25.38, "elapsed_time": "0:31:57", "remaining_time": "1:33:56"} +{"current_steps": 628, "total_steps": 2470, "loss": 1.8993, "lr": 9.296173762811084e-06, "epoch": 2.54251012145749, "percentage": 25.43, "elapsed_time": "0:32:00", "remaining_time": "1:33:52"} +{"current_steps": 629, "total_steps": 2470, "loss": 2.3583, "lr": 9.292554582479349e-06, "epoch": 2.5465587044534415, "percentage": 25.47, "elapsed_time": "0:32:03", "remaining_time": "1:33:49"} +{"current_steps": 630, "total_steps": 2470, "loss": 2.4906, "lr": 9.288926829073583e-06, "epoch": 2.5506072874493926, "percentage": 25.51, "elapsed_time": "0:32:06", "remaining_time": "1:33:46"} +{"current_steps": 631, "total_steps": 2470, "loss": 2.7822, "lr": 9.285290509839126e-06, "epoch": 2.554655870445344, "percentage": 25.55, "elapsed_time": "0:32:09", "remaining_time": "1:33:43"} +{"current_steps": 632, "total_steps": 2470, "loss": 1.8168, "lr": 9.281645632038417e-06, "epoch": 2.5587044534412957, "percentage": 25.59, "elapsed_time": "0:32:12", "remaining_time": "1:33:40"} +{"current_steps": 633, "total_steps": 2470, "loss": 1.7136, "lr": 9.277992202950996e-06, "epoch": 2.562753036437247, "percentage": 25.63, "elapsed_time": "0:32:15", "remaining_time": "1:33:37"} +{"current_steps": 634, "total_steps": 2470, "loss": 2.0032, "lr": 9.274330229873474e-06, "epoch": 2.5668016194331984, "percentage": 25.67, "elapsed_time": "0:32:18", "remaining_time": "1:33:33"} +{"current_steps": 635, "total_steps": 2470, "loss": 1.6359, "lr": 9.270659720119533e-06, "epoch": 2.57085020242915, "percentage": 25.71, "elapsed_time": "0:32:21", "remaining_time": "1:33:30"} +{"current_steps": 636, "total_steps": 2470, "loss": 1.9962, "lr": 9.266980681019902e-06, "epoch": 2.574898785425101, "percentage": 25.75, "elapsed_time": "0:32:24", "remaining_time": "1:33:27"} +{"current_steps": 637, "total_steps": 2470, "loss": 1.7137, "lr": 9.263293119922341e-06, "epoch": 2.5789473684210527, "percentage": 25.79, "elapsed_time": "0:32:27", "remaining_time": "1:33:24"} +{"current_steps": 638, "total_steps": 2470, "loss": 1.9567, "lr": 9.259597044191635e-06, "epoch": 2.582995951417004, "percentage": 25.83, "elapsed_time": "0:32:30", "remaining_time": "1:33:21"} +{"current_steps": 639, "total_steps": 2470, "loss": 1.8607, "lr": 9.255892461209574e-06, "epoch": 2.5870445344129553, "percentage": 25.87, "elapsed_time": "0:32:33", "remaining_time": "1:33:18"} +{"current_steps": 640, "total_steps": 2470, "loss": 1.8423, "lr": 9.252179378374937e-06, "epoch": 2.591093117408907, "percentage": 25.91, "elapsed_time": "0:32:36", "remaining_time": "1:33:15"} +{"current_steps": 641, "total_steps": 2470, "loss": 1.5365, "lr": 
9.248457803103476e-06, "epoch": 2.5951417004048585, "percentage": 25.95, "elapsed_time": "0:32:39", "remaining_time": "1:33:11"} +{"current_steps": 642, "total_steps": 2470, "loss": 1.5837, "lr": 9.24472774282791e-06, "epoch": 2.5991902834008096, "percentage": 25.99, "elapsed_time": "0:32:42", "remaining_time": "1:33:08"} +{"current_steps": 643, "total_steps": 2470, "loss": 1.7433, "lr": 9.240989204997903e-06, "epoch": 2.603238866396761, "percentage": 26.03, "elapsed_time": "0:32:45", "remaining_time": "1:33:05"} +{"current_steps": 644, "total_steps": 2470, "loss": 1.6373, "lr": 9.237242197080045e-06, "epoch": 2.6072874493927127, "percentage": 26.07, "elapsed_time": "0:32:48", "remaining_time": "1:33:02"} +{"current_steps": 645, "total_steps": 2470, "loss": 1.9638, "lr": 9.23348672655785e-06, "epoch": 2.611336032388664, "percentage": 26.11, "elapsed_time": "0:32:51", "remaining_time": "1:32:59"} +{"current_steps": 646, "total_steps": 2470, "loss": 1.8372, "lr": 9.229722800931727e-06, "epoch": 2.6153846153846154, "percentage": 26.15, "elapsed_time": "0:32:54", "remaining_time": "1:32:56"} +{"current_steps": 647, "total_steps": 2470, "loss": 1.665, "lr": 9.225950427718974e-06, "epoch": 2.619433198380567, "percentage": 26.19, "elapsed_time": "0:32:57", "remaining_time": "1:32:52"} +{"current_steps": 648, "total_steps": 2470, "loss": 2.052, "lr": 9.222169614453765e-06, "epoch": 2.623481781376518, "percentage": 26.23, "elapsed_time": "0:33:00", "remaining_time": "1:32:49"} +{"current_steps": 649, "total_steps": 2470, "loss": 1.8437, "lr": 9.21838036868712e-06, "epoch": 2.6275303643724697, "percentage": 26.28, "elapsed_time": "0:33:03", "remaining_time": "1:32:46"} +{"current_steps": 650, "total_steps": 2470, "loss": 2.0542, "lr": 9.21458269798691e-06, "epoch": 2.6315789473684212, "percentage": 26.32, "elapsed_time": "0:33:06", "remaining_time": "1:32:43"} +{"current_steps": 651, "total_steps": 2470, "loss": 1.7342, "lr": 9.21077660993783e-06, "epoch": 2.6356275303643724, "percentage": 26.36, "elapsed_time": "0:33:09", "remaining_time": "1:32:40"} +{"current_steps": 652, "total_steps": 2470, "loss": 1.9321, "lr": 9.206962112141382e-06, "epoch": 2.639676113360324, "percentage": 26.4, "elapsed_time": "0:33:12", "remaining_time": "1:32:36"} +{"current_steps": 653, "total_steps": 2470, "loss": 1.871, "lr": 9.203139212215868e-06, "epoch": 2.6437246963562755, "percentage": 26.44, "elapsed_time": "0:33:15", "remaining_time": "1:32:33"} +{"current_steps": 654, "total_steps": 2470, "loss": 1.8667, "lr": 9.199307917796371e-06, "epoch": 2.6477732793522266, "percentage": 26.48, "elapsed_time": "0:33:18", "remaining_time": "1:32:30"} +{"current_steps": 655, "total_steps": 2470, "loss": 1.7255, "lr": 9.195468236534734e-06, "epoch": 2.651821862348178, "percentage": 26.52, "elapsed_time": "0:33:21", "remaining_time": "1:32:27"} +{"current_steps": 656, "total_steps": 2470, "loss": 1.9444, "lr": 9.191620176099559e-06, "epoch": 2.6558704453441297, "percentage": 26.56, "elapsed_time": "0:33:25", "remaining_time": "1:32:24"} +{"current_steps": 657, "total_steps": 2470, "loss": 1.7728, "lr": 9.187763744176175e-06, "epoch": 2.659919028340081, "percentage": 26.6, "elapsed_time": "0:33:28", "remaining_time": "1:32:21"} +{"current_steps": 658, "total_steps": 2470, "loss": 1.9077, "lr": 9.183898948466633e-06, "epoch": 2.6639676113360324, "percentage": 26.64, "elapsed_time": "0:33:31", "remaining_time": "1:32:18"} +{"current_steps": 659, "total_steps": 2470, "loss": 1.9331, "lr": 9.180025796689692e-06, "epoch": 
2.668016194331984, "percentage": 26.68, "elapsed_time": "0:33:34", "remaining_time": "1:32:15"} +{"current_steps": 660, "total_steps": 2470, "loss": 1.8667, "lr": 9.176144296580794e-06, "epoch": 2.672064777327935, "percentage": 26.72, "elapsed_time": "0:33:37", "remaining_time": "1:32:12"} +{"current_steps": 661, "total_steps": 2470, "loss": 1.8187, "lr": 9.172254455892054e-06, "epoch": 2.6761133603238867, "percentage": 26.76, "elapsed_time": "0:33:40", "remaining_time": "1:32:08"} +{"current_steps": 662, "total_steps": 2470, "loss": 1.903, "lr": 9.168356282392253e-06, "epoch": 2.6801619433198383, "percentage": 26.8, "elapsed_time": "0:33:43", "remaining_time": "1:32:07"} +{"current_steps": 663, "total_steps": 2470, "loss": 1.7048, "lr": 9.164449783866802e-06, "epoch": 2.6842105263157894, "percentage": 26.84, "elapsed_time": "0:33:46", "remaining_time": "1:32:04"} +{"current_steps": 664, "total_steps": 2470, "loss": 1.8734, "lr": 9.160534968117752e-06, "epoch": 2.688259109311741, "percentage": 26.88, "elapsed_time": "0:33:49", "remaining_time": "1:32:01"} +{"current_steps": 665, "total_steps": 2470, "loss": 1.8788, "lr": 9.156611842963753e-06, "epoch": 2.6923076923076925, "percentage": 26.92, "elapsed_time": "0:33:52", "remaining_time": "1:31:58"} +{"current_steps": 666, "total_steps": 2470, "loss": 1.7147, "lr": 9.152680416240059e-06, "epoch": 2.6963562753036436, "percentage": 26.96, "elapsed_time": "0:33:55", "remaining_time": "1:31:54"} +{"current_steps": 667, "total_steps": 2470, "loss": 2.2048, "lr": 9.1487406957985e-06, "epoch": 2.700404858299595, "percentage": 27.0, "elapsed_time": "0:33:58", "remaining_time": "1:31:51"} +{"current_steps": 668, "total_steps": 2470, "loss": 1.9635, "lr": 9.144792689507471e-06, "epoch": 2.7044534412955468, "percentage": 27.04, "elapsed_time": "0:34:01", "remaining_time": "1:31:48"} +{"current_steps": 669, "total_steps": 2470, "loss": 1.5744, "lr": 9.140836405251917e-06, "epoch": 2.708502024291498, "percentage": 27.09, "elapsed_time": "0:34:05", "remaining_time": "1:31:46"} +{"current_steps": 670, "total_steps": 2470, "loss": 1.7612, "lr": 9.136871850933312e-06, "epoch": 2.7125506072874495, "percentage": 27.13, "elapsed_time": "0:34:08", "remaining_time": "1:31:43"} +{"current_steps": 671, "total_steps": 2470, "loss": 1.8414, "lr": 9.132899034469648e-06, "epoch": 2.716599190283401, "percentage": 27.17, "elapsed_time": "0:34:11", "remaining_time": "1:31:40"} +{"current_steps": 672, "total_steps": 2470, "loss": 1.7066, "lr": 9.128917963795422e-06, "epoch": 2.720647773279352, "percentage": 27.21, "elapsed_time": "0:34:14", "remaining_time": "1:31:36"} +{"current_steps": 673, "total_steps": 2470, "loss": 1.7925, "lr": 9.124928646861613e-06, "epoch": 2.7246963562753037, "percentage": 27.25, "elapsed_time": "0:34:17", "remaining_time": "1:31:33"} +{"current_steps": 674, "total_steps": 2470, "loss": 1.9923, "lr": 9.120931091635669e-06, "epoch": 2.7287449392712553, "percentage": 27.29, "elapsed_time": "0:34:20", "remaining_time": "1:31:30"} +{"current_steps": 675, "total_steps": 2470, "loss": 1.858, "lr": 9.116925306101494e-06, "epoch": 2.7327935222672064, "percentage": 27.33, "elapsed_time": "0:34:23", "remaining_time": "1:31:27"} +{"current_steps": 676, "total_steps": 2470, "loss": 1.8935, "lr": 9.112911298259426e-06, "epoch": 2.736842105263158, "percentage": 27.37, "elapsed_time": "0:34:26", "remaining_time": "1:31:24"} +{"current_steps": 677, "total_steps": 2470, "loss": 2.5611, "lr": 9.108889076126226e-06, "epoch": 2.7408906882591095, "percentage": 27.41, 
"elapsed_time": "0:34:29", "remaining_time": "1:31:20"} +{"current_steps": 678, "total_steps": 2470, "loss": 1.9346, "lr": 9.104858647735065e-06, "epoch": 2.7449392712550607, "percentage": 27.45, "elapsed_time": "0:34:32", "remaining_time": "1:31:17"} +{"current_steps": 679, "total_steps": 2470, "loss": 1.7738, "lr": 9.100820021135495e-06, "epoch": 2.748987854251012, "percentage": 27.49, "elapsed_time": "0:34:35", "remaining_time": "1:31:14"} +{"current_steps": 680, "total_steps": 2470, "loss": 1.6451, "lr": 9.09677320439345e-06, "epoch": 2.753036437246964, "percentage": 27.53, "elapsed_time": "0:34:38", "remaining_time": "1:31:11"} +{"current_steps": 681, "total_steps": 2470, "loss": 1.8788, "lr": 9.092718205591213e-06, "epoch": 2.757085020242915, "percentage": 27.57, "elapsed_time": "0:34:41", "remaining_time": "1:31:08"} +{"current_steps": 682, "total_steps": 2470, "loss": 2.6938, "lr": 9.088655032827418e-06, "epoch": 2.7611336032388665, "percentage": 27.61, "elapsed_time": "0:34:44", "remaining_time": "1:31:05"} +{"current_steps": 683, "total_steps": 2470, "loss": 2.5299, "lr": 9.084583694217012e-06, "epoch": 2.765182186234818, "percentage": 27.65, "elapsed_time": "0:34:47", "remaining_time": "1:31:02"} +{"current_steps": 684, "total_steps": 2470, "loss": 2.4088, "lr": 9.080504197891262e-06, "epoch": 2.769230769230769, "percentage": 27.69, "elapsed_time": "0:34:50", "remaining_time": "1:30:58"} +{"current_steps": 685, "total_steps": 2470, "loss": 1.824, "lr": 9.076416551997721e-06, "epoch": 2.7732793522267207, "percentage": 27.73, "elapsed_time": "0:34:53", "remaining_time": "1:30:55"} +{"current_steps": 686, "total_steps": 2470, "loss": 2.0511, "lr": 9.072320764700223e-06, "epoch": 2.7773279352226723, "percentage": 27.77, "elapsed_time": "0:34:56", "remaining_time": "1:30:52"} +{"current_steps": 687, "total_steps": 2470, "loss": 2.0932, "lr": 9.068216844178857e-06, "epoch": 2.7813765182186234, "percentage": 27.81, "elapsed_time": "0:34:59", "remaining_time": "1:30:49"} +{"current_steps": 688, "total_steps": 2470, "loss": 1.8796, "lr": 9.064104798629955e-06, "epoch": 2.785425101214575, "percentage": 27.85, "elapsed_time": "0:35:02", "remaining_time": "1:30:46"} +{"current_steps": 689, "total_steps": 2470, "loss": 1.7757, "lr": 9.059984636266082e-06, "epoch": 2.7894736842105265, "percentage": 27.89, "elapsed_time": "0:35:05", "remaining_time": "1:30:43"} +{"current_steps": 690, "total_steps": 2470, "loss": 1.9039, "lr": 9.055856365316012e-06, "epoch": 2.7935222672064777, "percentage": 27.94, "elapsed_time": "0:35:08", "remaining_time": "1:30:40"} +{"current_steps": 691, "total_steps": 2470, "loss": 1.9171, "lr": 9.051719994024711e-06, "epoch": 2.7975708502024292, "percentage": 27.98, "elapsed_time": "0:35:11", "remaining_time": "1:30:37"} +{"current_steps": 692, "total_steps": 2470, "loss": 1.6852, "lr": 9.047575530653324e-06, "epoch": 2.801619433198381, "percentage": 28.02, "elapsed_time": "0:35:14", "remaining_time": "1:30:33"} +{"current_steps": 693, "total_steps": 2470, "loss": 1.5727, "lr": 9.043422983479158e-06, "epoch": 2.805668016194332, "percentage": 28.06, "elapsed_time": "0:35:17", "remaining_time": "1:30:30"} +{"current_steps": 694, "total_steps": 2470, "loss": 1.9079, "lr": 9.039262360795664e-06, "epoch": 2.8097165991902835, "percentage": 28.1, "elapsed_time": "0:35:20", "remaining_time": "1:30:27"} +{"current_steps": 695, "total_steps": 2470, "loss": 1.9093, "lr": 9.035093670912424e-06, "epoch": 2.813765182186235, "percentage": 28.14, "elapsed_time": "0:35:23", 
"remaining_time": "1:30:24"} +{"current_steps": 696, "total_steps": 2470, "loss": 1.6569, "lr": 9.03091692215513e-06, "epoch": 2.817813765182186, "percentage": 28.18, "elapsed_time": "0:35:26", "remaining_time": "1:30:21"} +{"current_steps": 697, "total_steps": 2470, "loss": 2.4758, "lr": 9.026732122865567e-06, "epoch": 2.8218623481781377, "percentage": 28.22, "elapsed_time": "0:35:29", "remaining_time": "1:30:18"} +{"current_steps": 698, "total_steps": 2470, "loss": 3.9379, "lr": 9.022539281401601e-06, "epoch": 2.8259109311740893, "percentage": 28.26, "elapsed_time": "0:35:32", "remaining_time": "1:30:14"} +{"current_steps": 699, "total_steps": 2470, "loss": 1.7599, "lr": 9.01833840613716e-06, "epoch": 2.8299595141700404, "percentage": 28.3, "elapsed_time": "0:35:35", "remaining_time": "1:30:11"} +{"current_steps": 700, "total_steps": 2470, "loss": 1.6112, "lr": 9.014129505462217e-06, "epoch": 2.834008097165992, "percentage": 28.34, "elapsed_time": "0:35:38", "remaining_time": "1:30:08"} +{"current_steps": 701, "total_steps": 2470, "loss": 1.719, "lr": 9.009912587782772e-06, "epoch": 2.8380566801619436, "percentage": 28.38, "elapsed_time": "0:35:42", "remaining_time": "1:30:05"} +{"current_steps": 702, "total_steps": 2470, "loss": 1.7237, "lr": 9.005687661520838e-06, "epoch": 2.8421052631578947, "percentage": 28.42, "elapsed_time": "0:35:45", "remaining_time": "1:30:02"} +{"current_steps": 703, "total_steps": 2470, "loss": 1.6892, "lr": 9.00145473511442e-06, "epoch": 2.8461538461538463, "percentage": 28.46, "elapsed_time": "0:35:48", "remaining_time": "1:29:59"} +{"current_steps": 704, "total_steps": 2470, "loss": 1.7534, "lr": 8.997213817017508e-06, "epoch": 2.850202429149798, "percentage": 28.5, "elapsed_time": "0:35:51", "remaining_time": "1:29:55"} +{"current_steps": 705, "total_steps": 2470, "loss": 1.8313, "lr": 8.99296491570004e-06, "epoch": 2.854251012145749, "percentage": 28.54, "elapsed_time": "0:35:54", "remaining_time": "1:29:52"} +{"current_steps": 706, "total_steps": 2470, "loss": 1.7662, "lr": 8.98870803964791e-06, "epoch": 2.8582995951417005, "percentage": 28.58, "elapsed_time": "0:35:57", "remaining_time": "1:29:49"} +{"current_steps": 707, "total_steps": 2470, "loss": 1.7739, "lr": 8.984443197362938e-06, "epoch": 2.862348178137652, "percentage": 28.62, "elapsed_time": "0:36:00", "remaining_time": "1:29:46"} +{"current_steps": 708, "total_steps": 2470, "loss": 1.7885, "lr": 8.980170397362846e-06, "epoch": 2.866396761133603, "percentage": 28.66, "elapsed_time": "0:36:03", "remaining_time": "1:29:43"} +{"current_steps": 709, "total_steps": 2470, "loss": 2.2786, "lr": 8.975889648181258e-06, "epoch": 2.8704453441295548, "percentage": 28.7, "elapsed_time": "0:36:06", "remaining_time": "1:29:40"} +{"current_steps": 710, "total_steps": 2470, "loss": 2.2033, "lr": 8.971600958367668e-06, "epoch": 2.8744939271255063, "percentage": 28.74, "elapsed_time": "0:36:09", "remaining_time": "1:29:38"} +{"current_steps": 711, "total_steps": 2470, "loss": 1.9747, "lr": 8.96730433648743e-06, "epoch": 2.8785425101214575, "percentage": 28.79, "elapsed_time": "0:36:12", "remaining_time": "1:29:35"} +{"current_steps": 712, "total_steps": 2470, "loss": 1.8561, "lr": 8.962999791121745e-06, "epoch": 2.882591093117409, "percentage": 28.83, "elapsed_time": "0:36:15", "remaining_time": "1:29:32"} +{"current_steps": 713, "total_steps": 2470, "loss": 1.3887, "lr": 8.958687330867634e-06, "epoch": 2.8866396761133606, "percentage": 28.87, "elapsed_time": "0:36:18", "remaining_time": "1:29:29"} 
+{"current_steps": 714, "total_steps": 2470, "loss": 1.8757, "lr": 8.954366964337926e-06, "epoch": 2.8906882591093117, "percentage": 28.91, "elapsed_time": "0:36:21", "remaining_time": "1:29:25"} +{"current_steps": 715, "total_steps": 2470, "loss": 1.9746, "lr": 8.950038700161239e-06, "epoch": 2.8947368421052633, "percentage": 28.95, "elapsed_time": "0:36:24", "remaining_time": "1:29:22"} +{"current_steps": 716, "total_steps": 2470, "loss": 1.6592, "lr": 8.94570254698197e-06, "epoch": 2.898785425101215, "percentage": 28.99, "elapsed_time": "0:36:27", "remaining_time": "1:29:19"} +{"current_steps": 717, "total_steps": 2470, "loss": 1.722, "lr": 8.941358513460264e-06, "epoch": 2.902834008097166, "percentage": 29.03, "elapsed_time": "0:36:31", "remaining_time": "1:29:17"} +{"current_steps": 718, "total_steps": 2470, "loss": 1.9182, "lr": 8.937006608272009e-06, "epoch": 2.9068825910931175, "percentage": 29.07, "elapsed_time": "0:36:34", "remaining_time": "1:29:14"} +{"current_steps": 719, "total_steps": 2470, "loss": 1.4523, "lr": 8.932646840108818e-06, "epoch": 2.910931174089069, "percentage": 29.11, "elapsed_time": "0:36:37", "remaining_time": "1:29:11"} +{"current_steps": 720, "total_steps": 2470, "loss": 1.5928, "lr": 8.928279217677999e-06, "epoch": 2.91497975708502, "percentage": 29.15, "elapsed_time": "0:36:40", "remaining_time": "1:29:08"} +{"current_steps": 721, "total_steps": 2470, "loss": 1.6197, "lr": 8.923903749702556e-06, "epoch": 2.919028340080972, "percentage": 29.19, "elapsed_time": "0:36:43", "remaining_time": "1:29:05"} +{"current_steps": 722, "total_steps": 2470, "loss": 1.9066, "lr": 8.919520444921153e-06, "epoch": 2.9230769230769234, "percentage": 29.23, "elapsed_time": "0:36:46", "remaining_time": "1:29:01"} +{"current_steps": 723, "total_steps": 2470, "loss": 1.7547, "lr": 8.915129312088112e-06, "epoch": 2.9271255060728745, "percentage": 29.27, "elapsed_time": "0:36:49", "remaining_time": "1:28:58"} +{"current_steps": 724, "total_steps": 2470, "loss": 1.7851, "lr": 8.910730359973386e-06, "epoch": 2.931174089068826, "percentage": 29.31, "elapsed_time": "0:36:52", "remaining_time": "1:28:55"} +{"current_steps": 725, "total_steps": 2470, "loss": 1.6173, "lr": 8.906323597362547e-06, "epoch": 2.9352226720647776, "percentage": 29.35, "elapsed_time": "0:36:55", "remaining_time": "1:28:52"} +{"current_steps": 726, "total_steps": 2470, "loss": 1.5244, "lr": 8.901909033056763e-06, "epoch": 2.9392712550607287, "percentage": 29.39, "elapsed_time": "0:36:58", "remaining_time": "1:28:49"} +{"current_steps": 727, "total_steps": 2470, "loss": 1.8108, "lr": 8.89748667587279e-06, "epoch": 2.9433198380566803, "percentage": 29.43, "elapsed_time": "0:37:01", "remaining_time": "1:28:46"} +{"current_steps": 728, "total_steps": 2470, "loss": 1.5553, "lr": 8.893056534642938e-06, "epoch": 2.9473684210526314, "percentage": 29.47, "elapsed_time": "0:37:04", "remaining_time": "1:28:43"} +{"current_steps": 729, "total_steps": 2470, "loss": 1.5518, "lr": 8.88861861821507e-06, "epoch": 2.951417004048583, "percentage": 29.51, "elapsed_time": "0:37:07", "remaining_time": "1:28:40"} +{"current_steps": 730, "total_steps": 2470, "loss": 1.7772, "lr": 8.88417293545258e-06, "epoch": 2.9554655870445345, "percentage": 29.55, "elapsed_time": "0:37:10", "remaining_time": "1:28:36"} +{"current_steps": 731, "total_steps": 2470, "loss": 1.6766, "lr": 8.879719495234363e-06, "epoch": 2.9595141700404857, "percentage": 29.6, "elapsed_time": "0:37:13", "remaining_time": "1:28:33"} +{"current_steps": 732, "total_steps": 
2470, "loss": 1.7823, "lr": 8.875258306454814e-06, "epoch": 2.9635627530364372, "percentage": 29.64, "elapsed_time": "0:37:16", "remaining_time": "1:28:30"} +{"current_steps": 733, "total_steps": 2470, "loss": 2.0096, "lr": 8.87078937802381e-06, "epoch": 2.967611336032389, "percentage": 29.68, "elapsed_time": "0:37:19", "remaining_time": "1:28:27"} +{"current_steps": 734, "total_steps": 2470, "loss": 1.9226, "lr": 8.866312718866669e-06, "epoch": 2.97165991902834, "percentage": 29.72, "elapsed_time": "0:37:22", "remaining_time": "1:28:24"} +{"current_steps": 735, "total_steps": 2470, "loss": 1.7634, "lr": 8.861828337924164e-06, "epoch": 2.9757085020242915, "percentage": 29.76, "elapsed_time": "0:37:25", "remaining_time": "1:28:21"} +{"current_steps": 736, "total_steps": 2470, "loss": 1.862, "lr": 8.85733624415248e-06, "epoch": 2.979757085020243, "percentage": 29.8, "elapsed_time": "0:37:28", "remaining_time": "1:28:18"} +{"current_steps": 737, "total_steps": 2470, "loss": 1.9281, "lr": 8.852836446523213e-06, "epoch": 2.983805668016194, "percentage": 29.84, "elapsed_time": "0:37:31", "remaining_time": "1:28:14"} +{"current_steps": 738, "total_steps": 2470, "loss": 1.7317, "lr": 8.848328954023342e-06, "epoch": 2.9878542510121457, "percentage": 29.88, "elapsed_time": "0:37:34", "remaining_time": "1:28:11"} +{"current_steps": 739, "total_steps": 2470, "loss": 1.6635, "lr": 8.843813775655211e-06, "epoch": 2.9919028340080973, "percentage": 29.92, "elapsed_time": "0:37:37", "remaining_time": "1:28:08"} +{"current_steps": 740, "total_steps": 2470, "loss": 1.9759, "lr": 8.83929092043652e-06, "epoch": 2.9959514170040484, "percentage": 29.96, "elapsed_time": "0:37:40", "remaining_time": "1:28:05"} +{"current_steps": 741, "total_steps": 2470, "loss": 1.7407, "lr": 8.8347603974003e-06, "epoch": 3.0, "percentage": 30.0, "elapsed_time": "0:37:43", "remaining_time": "1:28:02"} +{"current_steps": 742, "total_steps": 2470, "loss": 1.8183, "lr": 8.83022221559489e-06, "epoch": 3.0040485829959516, "percentage": 30.04, "elapsed_time": "0:37:46", "remaining_time": "1:27:59"} +{"current_steps": 743, "total_steps": 2470, "loss": 1.9566, "lr": 8.825676384083936e-06, "epoch": 3.0080971659919027, "percentage": 30.08, "elapsed_time": "0:37:49", "remaining_time": "1:27:56"} +{"current_steps": 744, "total_steps": 2470, "loss": 1.8211, "lr": 8.82112291194635e-06, "epoch": 3.0121457489878543, "percentage": 30.12, "elapsed_time": "0:37:53", "remaining_time": "1:27:53"} +{"current_steps": 745, "total_steps": 2470, "loss": 1.9756, "lr": 8.816561808276312e-06, "epoch": 3.016194331983806, "percentage": 30.16, "elapsed_time": "0:37:56", "remaining_time": "1:27:50"} +{"current_steps": 746, "total_steps": 2470, "loss": 2.2277, "lr": 8.811993082183243e-06, "epoch": 3.020242914979757, "percentage": 30.2, "elapsed_time": "0:37:59", "remaining_time": "1:27:46"} +{"current_steps": 747, "total_steps": 2470, "loss": 2.0822, "lr": 8.807416742791784e-06, "epoch": 3.0242914979757085, "percentage": 30.24, "elapsed_time": "0:38:02", "remaining_time": "1:27:43"} +{"current_steps": 748, "total_steps": 2470, "loss": 1.7544, "lr": 8.80283279924178e-06, "epoch": 3.02834008097166, "percentage": 30.28, "elapsed_time": "0:38:05", "remaining_time": "1:27:40"} +{"current_steps": 749, "total_steps": 2470, "loss": 1.7612, "lr": 8.798241260688273e-06, "epoch": 3.032388663967611, "percentage": 30.32, "elapsed_time": "0:38:08", "remaining_time": "1:27:37"} +{"current_steps": 750, "total_steps": 2470, "loss": 2.0061, "lr": 8.793642136301462e-06, "epoch": 
3.0364372469635628, "percentage": 30.36, "elapsed_time": "0:38:11", "remaining_time": "1:27:34"} +{"current_steps": 751, "total_steps": 2470, "loss": 1.8078, "lr": 8.7890354352667e-06, "epoch": 3.0404858299595143, "percentage": 30.4, "elapsed_time": "0:38:14", "remaining_time": "1:27:31"} +{"current_steps": 752, "total_steps": 2470, "loss": 1.7918, "lr": 8.784421166784476e-06, "epoch": 3.0445344129554655, "percentage": 30.45, "elapsed_time": "0:38:17", "remaining_time": "1:27:28"} +{"current_steps": 753, "total_steps": 2470, "loss": 1.7574, "lr": 8.779799340070388e-06, "epoch": 3.048582995951417, "percentage": 30.49, "elapsed_time": "0:38:20", "remaining_time": "1:27:24"} +{"current_steps": 754, "total_steps": 2470, "loss": 1.8982, "lr": 8.775169964355134e-06, "epoch": 3.0526315789473686, "percentage": 30.53, "elapsed_time": "0:38:23", "remaining_time": "1:27:21"} +{"current_steps": 755, "total_steps": 2470, "loss": 1.7375, "lr": 8.770533048884483e-06, "epoch": 3.0566801619433197, "percentage": 30.57, "elapsed_time": "0:38:26", "remaining_time": "1:27:18"} +{"current_steps": 756, "total_steps": 2470, "loss": 1.9075, "lr": 8.765888602919266e-06, "epoch": 3.0607287449392713, "percentage": 30.61, "elapsed_time": "0:38:29", "remaining_time": "1:27:15"} +{"current_steps": 757, "total_steps": 2470, "loss": 1.8378, "lr": 8.761236635735353e-06, "epoch": 3.064777327935223, "percentage": 30.65, "elapsed_time": "0:38:32", "remaining_time": "1:27:12"} +{"current_steps": 758, "total_steps": 2470, "loss": 1.9702, "lr": 8.756577156623636e-06, "epoch": 3.068825910931174, "percentage": 30.69, "elapsed_time": "0:38:35", "remaining_time": "1:27:10"} +{"current_steps": 759, "total_steps": 2470, "loss": 1.8932, "lr": 8.751910174890009e-06, "epoch": 3.0728744939271255, "percentage": 30.73, "elapsed_time": "0:38:38", "remaining_time": "1:27:07"} +{"current_steps": 760, "total_steps": 2470, "loss": 1.8215, "lr": 8.74723569985535e-06, "epoch": 3.076923076923077, "percentage": 30.77, "elapsed_time": "0:38:42", "remaining_time": "1:27:04"} +{"current_steps": 761, "total_steps": 2470, "loss": 1.8237, "lr": 8.742553740855507e-06, "epoch": 3.080971659919028, "percentage": 30.81, "elapsed_time": "0:38:45", "remaining_time": "1:27:01"} +{"current_steps": 762, "total_steps": 2470, "loss": 1.825, "lr": 8.737864307241266e-06, "epoch": 3.08502024291498, "percentage": 30.85, "elapsed_time": "0:38:48", "remaining_time": "1:26:58"} +{"current_steps": 763, "total_steps": 2470, "loss": 1.83, "lr": 8.733167408378348e-06, "epoch": 3.0890688259109313, "percentage": 30.89, "elapsed_time": "0:38:51", "remaining_time": "1:26:55"} +{"current_steps": 764, "total_steps": 2470, "loss": 1.9209, "lr": 8.728463053647382e-06, "epoch": 3.0931174089068825, "percentage": 30.93, "elapsed_time": "0:38:54", "remaining_time": "1:26:52"} +{"current_steps": 765, "total_steps": 2470, "loss": 1.6591, "lr": 8.723751252443891e-06, "epoch": 3.097165991902834, "percentage": 30.97, "elapsed_time": "0:38:57", "remaining_time": "1:26:49"} +{"current_steps": 766, "total_steps": 2470, "loss": 1.8214, "lr": 8.71903201417826e-06, "epoch": 3.1012145748987856, "percentage": 31.01, "elapsed_time": "0:39:00", "remaining_time": "1:26:46"} +{"current_steps": 767, "total_steps": 2470, "loss": 1.854, "lr": 8.71430534827574e-06, "epoch": 3.1052631578947367, "percentage": 31.05, "elapsed_time": "0:39:03", "remaining_time": "1:26:43"} +{"current_steps": 768, "total_steps": 2470, "loss": 1.7321, "lr": 8.709571264176408e-06, "epoch": 3.1093117408906883, "percentage": 31.09, 
"elapsed_time": "0:39:06", "remaining_time": "1:26:40"} +{"current_steps": 769, "total_steps": 2470, "loss": 1.6709, "lr": 8.70482977133516e-06, "epoch": 3.11336032388664, "percentage": 31.13, "elapsed_time": "0:39:09", "remaining_time": "1:26:37"} +{"current_steps": 770, "total_steps": 2470, "loss": 1.6082, "lr": 8.700080879221689e-06, "epoch": 3.117408906882591, "percentage": 31.17, "elapsed_time": "0:39:12", "remaining_time": "1:26:34"} +{"current_steps": 771, "total_steps": 2470, "loss": 1.6324, "lr": 8.69532459732046e-06, "epoch": 3.1214574898785425, "percentage": 31.21, "elapsed_time": "0:39:15", "remaining_time": "1:26:31"} +{"current_steps": 772, "total_steps": 2470, "loss": 1.626, "lr": 8.690560935130708e-06, "epoch": 3.125506072874494, "percentage": 31.26, "elapsed_time": "0:39:18", "remaining_time": "1:26:28"} +{"current_steps": 773, "total_steps": 2470, "loss": 1.5525, "lr": 8.685789902166395e-06, "epoch": 3.1295546558704452, "percentage": 31.3, "elapsed_time": "0:39:21", "remaining_time": "1:26:24"} +{"current_steps": 774, "total_steps": 2470, "loss": 1.8873, "lr": 8.681011507956215e-06, "epoch": 3.133603238866397, "percentage": 31.34, "elapsed_time": "0:39:24", "remaining_time": "1:26:21"} +{"current_steps": 775, "total_steps": 2470, "loss": 1.7496, "lr": 8.676225762043555e-06, "epoch": 3.1376518218623484, "percentage": 31.38, "elapsed_time": "0:39:27", "remaining_time": "1:26:18"} +{"current_steps": 776, "total_steps": 2470, "loss": 1.4753, "lr": 8.671432673986493e-06, "epoch": 3.1417004048582995, "percentage": 31.42, "elapsed_time": "0:39:30", "remaining_time": "1:26:15"} +{"current_steps": 777, "total_steps": 2470, "loss": 1.8963, "lr": 8.666632253357767e-06, "epoch": 3.145748987854251, "percentage": 31.46, "elapsed_time": "0:39:33", "remaining_time": "1:26:12"} +{"current_steps": 778, "total_steps": 2470, "loss": 1.7098, "lr": 8.661824509744754e-06, "epoch": 3.1497975708502026, "percentage": 31.5, "elapsed_time": "0:39:37", "remaining_time": "1:26:09"} +{"current_steps": 779, "total_steps": 2470, "loss": 1.8596, "lr": 8.657009452749466e-06, "epoch": 3.1538461538461537, "percentage": 31.54, "elapsed_time": "0:39:40", "remaining_time": "1:26:06"} +{"current_steps": 780, "total_steps": 2470, "loss": 2.061, "lr": 8.652187091988516e-06, "epoch": 3.1578947368421053, "percentage": 31.58, "elapsed_time": "0:39:43", "remaining_time": "1:26:03"} +{"current_steps": 781, "total_steps": 2470, "loss": 1.7589, "lr": 8.647357437093104e-06, "epoch": 3.161943319838057, "percentage": 31.62, "elapsed_time": "0:39:46", "remaining_time": "1:26:00"} +{"current_steps": 782, "total_steps": 2470, "loss": 1.8086, "lr": 8.642520497709001e-06, "epoch": 3.165991902834008, "percentage": 31.66, "elapsed_time": "0:39:49", "remaining_time": "1:25:57"} +{"current_steps": 783, "total_steps": 2470, "loss": 2.2517, "lr": 8.637676283496521e-06, "epoch": 3.1700404858299596, "percentage": 31.7, "elapsed_time": "0:39:52", "remaining_time": "1:25:53"} +{"current_steps": 784, "total_steps": 2470, "loss": 1.6679, "lr": 8.632824804130514e-06, "epoch": 3.174089068825911, "percentage": 31.74, "elapsed_time": "0:39:55", "remaining_time": "1:25:50"} +{"current_steps": 785, "total_steps": 2470, "loss": 1.8345, "lr": 8.627966069300332e-06, "epoch": 3.1781376518218623, "percentage": 31.78, "elapsed_time": "0:39:58", "remaining_time": "1:25:47"} +{"current_steps": 786, "total_steps": 2470, "loss": 1.6473, "lr": 8.623100088709829e-06, "epoch": 3.182186234817814, "percentage": 31.82, "elapsed_time": "0:40:01", 
"remaining_time": "1:25:44"} +{"current_steps": 787, "total_steps": 2470, "loss": 1.7821, "lr": 8.618226872077315e-06, "epoch": 3.1862348178137654, "percentage": 31.86, "elapsed_time": "0:40:04", "remaining_time": "1:25:41"} +{"current_steps": 788, "total_steps": 2470, "loss": 1.8289, "lr": 8.613346429135567e-06, "epoch": 3.1902834008097165, "percentage": 31.9, "elapsed_time": "0:40:07", "remaining_time": "1:25:38"} +{"current_steps": 789, "total_steps": 2470, "loss": 2.0076, "lr": 8.608458769631785e-06, "epoch": 3.194331983805668, "percentage": 31.94, "elapsed_time": "0:40:10", "remaining_time": "1:25:35"} +{"current_steps": 790, "total_steps": 2470, "loss": 2.0805, "lr": 8.603563903327582e-06, "epoch": 3.1983805668016196, "percentage": 31.98, "elapsed_time": "0:40:13", "remaining_time": "1:25:32"} +{"current_steps": 791, "total_steps": 2470, "loss": 1.7669, "lr": 8.598661839998972e-06, "epoch": 3.2024291497975708, "percentage": 32.02, "elapsed_time": "0:40:16", "remaining_time": "1:25:28"} +{"current_steps": 792, "total_steps": 2470, "loss": 2.0858, "lr": 8.593752589436334e-06, "epoch": 3.2064777327935223, "percentage": 32.06, "elapsed_time": "0:40:19", "remaining_time": "1:25:25"} +{"current_steps": 793, "total_steps": 2470, "loss": 2.1341, "lr": 8.588836161444405e-06, "epoch": 3.2105263157894735, "percentage": 32.11, "elapsed_time": "0:40:22", "remaining_time": "1:25:22"} +{"current_steps": 794, "total_steps": 2470, "loss": 1.9304, "lr": 8.583912565842258e-06, "epoch": 3.214574898785425, "percentage": 32.15, "elapsed_time": "0:40:25", "remaining_time": "1:25:19"} +{"current_steps": 795, "total_steps": 2470, "loss": 1.7942, "lr": 8.578981812463278e-06, "epoch": 3.2186234817813766, "percentage": 32.19, "elapsed_time": "0:40:28", "remaining_time": "1:25:16"} +{"current_steps": 796, "total_steps": 2470, "loss": 1.72, "lr": 8.574043911155148e-06, "epoch": 3.2226720647773277, "percentage": 32.23, "elapsed_time": "0:40:31", "remaining_time": "1:25:13"} +{"current_steps": 797, "total_steps": 2470, "loss": 1.8542, "lr": 8.569098871779828e-06, "epoch": 3.2267206477732793, "percentage": 32.27, "elapsed_time": "0:40:34", "remaining_time": "1:25:10"} +{"current_steps": 798, "total_steps": 2470, "loss": 1.7101, "lr": 8.56414670421353e-06, "epoch": 3.230769230769231, "percentage": 32.31, "elapsed_time": "0:40:37", "remaining_time": "1:25:06"} +{"current_steps": 799, "total_steps": 2470, "loss": 1.95, "lr": 8.559187418346703e-06, "epoch": 3.234817813765182, "percentage": 32.35, "elapsed_time": "0:40:40", "remaining_time": "1:25:03"} +{"current_steps": 800, "total_steps": 2470, "loss": 1.8895, "lr": 8.554221024084019e-06, "epoch": 3.2388663967611335, "percentage": 32.39, "elapsed_time": "0:40:43", "remaining_time": "1:25:00"} +{"current_steps": 801, "total_steps": 2470, "loss": 1.873, "lr": 8.54924753134434e-06, "epoch": 3.242914979757085, "percentage": 32.43, "elapsed_time": "0:40:46", "remaining_time": "1:24:57"} +{"current_steps": 802, "total_steps": 2470, "loss": 1.7236, "lr": 8.544266950060706e-06, "epoch": 3.246963562753036, "percentage": 32.47, "elapsed_time": "0:40:49", "remaining_time": "1:24:54"} +{"current_steps": 803, "total_steps": 2470, "loss": 1.7693, "lr": 8.539279290180315e-06, "epoch": 3.251012145748988, "percentage": 32.51, "elapsed_time": "0:40:52", "remaining_time": "1:24:51"} +{"current_steps": 804, "total_steps": 2470, "loss": 1.8365, "lr": 8.534284561664508e-06, "epoch": 3.2550607287449393, "percentage": 32.55, "elapsed_time": "0:40:55", "remaining_time": "1:24:48"} 
+{"current_steps": 805, "total_steps": 2470, "loss": 1.6791, "lr": 8.529282774488731e-06, "epoch": 3.2591093117408905, "percentage": 32.59, "elapsed_time": "0:40:58", "remaining_time": "1:24:45"} +{"current_steps": 806, "total_steps": 2470, "loss": 1.5622, "lr": 8.524273938642539e-06, "epoch": 3.263157894736842, "percentage": 32.63, "elapsed_time": "0:41:02", "remaining_time": "1:24:43"} +{"current_steps": 807, "total_steps": 2470, "loss": 1.8107, "lr": 8.519258064129559e-06, "epoch": 3.2672064777327936, "percentage": 32.67, "elapsed_time": "0:41:05", "remaining_time": "1:24:40"} +{"current_steps": 808, "total_steps": 2470, "loss": 1.8382, "lr": 8.514235160967476e-06, "epoch": 3.2712550607287447, "percentage": 32.71, "elapsed_time": "0:41:08", "remaining_time": "1:24:36"} +{"current_steps": 809, "total_steps": 2470, "loss": 1.8519, "lr": 8.509205239188017e-06, "epoch": 3.2753036437246963, "percentage": 32.75, "elapsed_time": "0:41:11", "remaining_time": "1:24:33"} +{"current_steps": 810, "total_steps": 2470, "loss": 1.8559, "lr": 8.504168308836918e-06, "epoch": 3.279352226720648, "percentage": 32.79, "elapsed_time": "0:41:14", "remaining_time": "1:24:30"} +{"current_steps": 811, "total_steps": 2470, "loss": 1.5602, "lr": 8.499124379973922e-06, "epoch": 3.283400809716599, "percentage": 32.83, "elapsed_time": "0:41:17", "remaining_time": "1:24:27"} +{"current_steps": 812, "total_steps": 2470, "loss": 1.6597, "lr": 8.494073462672743e-06, "epoch": 3.2874493927125505, "percentage": 32.87, "elapsed_time": "0:41:20", "remaining_time": "1:24:24"} +{"current_steps": 813, "total_steps": 2470, "loss": 1.5311, "lr": 8.489015567021054e-06, "epoch": 3.291497975708502, "percentage": 32.91, "elapsed_time": "0:41:23", "remaining_time": "1:24:22"} +{"current_steps": 814, "total_steps": 2470, "loss": 1.8547, "lr": 8.483950703120466e-06, "epoch": 3.2955465587044532, "percentage": 32.96, "elapsed_time": "0:41:26", "remaining_time": "1:24:19"} +{"current_steps": 815, "total_steps": 2470, "loss": 1.9357, "lr": 8.478878881086505e-06, "epoch": 3.299595141700405, "percentage": 33.0, "elapsed_time": "0:41:29", "remaining_time": "1:24:16"} +{"current_steps": 816, "total_steps": 2470, "loss": 1.6684, "lr": 8.473800111048598e-06, "epoch": 3.3036437246963564, "percentage": 33.04, "elapsed_time": "0:41:32", "remaining_time": "1:24:13"} +{"current_steps": 817, "total_steps": 2470, "loss": 1.6929, "lr": 8.468714403150043e-06, "epoch": 3.3076923076923075, "percentage": 33.08, "elapsed_time": "0:41:36", "remaining_time": "1:24:10"} +{"current_steps": 818, "total_steps": 2470, "loss": 1.7112, "lr": 8.463621767547998e-06, "epoch": 3.311740890688259, "percentage": 33.12, "elapsed_time": "0:41:39", "remaining_time": "1:24:07"} +{"current_steps": 819, "total_steps": 2470, "loss": 1.7005, "lr": 8.458522214413455e-06, "epoch": 3.3157894736842106, "percentage": 33.16, "elapsed_time": "0:41:42", "remaining_time": "1:24:04"} +{"current_steps": 820, "total_steps": 2470, "loss": 1.5995, "lr": 8.453415753931223e-06, "epoch": 3.3198380566801617, "percentage": 33.2, "elapsed_time": "0:41:45", "remaining_time": "1:24:00"} +{"current_steps": 821, "total_steps": 2470, "loss": 1.6057, "lr": 8.448302396299906e-06, "epoch": 3.3238866396761133, "percentage": 33.24, "elapsed_time": "0:41:48", "remaining_time": "1:23:57"} +{"current_steps": 822, "total_steps": 2470, "loss": 1.6349, "lr": 8.443182151731883e-06, "epoch": 3.327935222672065, "percentage": 33.28, "elapsed_time": "0:41:51", "remaining_time": "1:23:54"} +{"current_steps": 823, 
"total_steps": 2470, "loss": 1.5595, "lr": 8.438055030453287e-06, "epoch": 3.331983805668016, "percentage": 33.32, "elapsed_time": "0:41:54", "remaining_time": "1:23:51"} +{"current_steps": 824, "total_steps": 2470, "loss": 1.6019, "lr": 8.432921042703985e-06, "epoch": 3.3360323886639676, "percentage": 33.36, "elapsed_time": "0:41:57", "remaining_time": "1:23:48"} +{"current_steps": 825, "total_steps": 2470, "loss": 1.552, "lr": 8.42778019873756e-06, "epoch": 3.340080971659919, "percentage": 33.4, "elapsed_time": "0:42:00", "remaining_time": "1:23:45"} +{"current_steps": 826, "total_steps": 2470, "loss": 1.5851, "lr": 8.422632508821284e-06, "epoch": 3.3441295546558703, "percentage": 33.44, "elapsed_time": "0:42:03", "remaining_time": "1:23:42"} +{"current_steps": 827, "total_steps": 2470, "loss": 1.7666, "lr": 8.417477983236107e-06, "epoch": 3.348178137651822, "percentage": 33.48, "elapsed_time": "0:42:06", "remaining_time": "1:23:39"} +{"current_steps": 828, "total_steps": 2470, "loss": 1.6497, "lr": 8.412316632276627e-06, "epoch": 3.3522267206477734, "percentage": 33.52, "elapsed_time": "0:42:09", "remaining_time": "1:23:36"} +{"current_steps": 829, "total_steps": 2470, "loss": 1.3523, "lr": 8.407148466251072e-06, "epoch": 3.3562753036437245, "percentage": 33.56, "elapsed_time": "0:42:12", "remaining_time": "1:23:33"} +{"current_steps": 830, "total_steps": 2470, "loss": 1.723, "lr": 8.401973495481289e-06, "epoch": 3.360323886639676, "percentage": 33.6, "elapsed_time": "0:42:15", "remaining_time": "1:23:30"} +{"current_steps": 831, "total_steps": 2470, "loss": 1.8056, "lr": 8.396791730302708e-06, "epoch": 3.3643724696356276, "percentage": 33.64, "elapsed_time": "0:42:18", "remaining_time": "1:23:27"} +{"current_steps": 832, "total_steps": 2470, "loss": 1.7166, "lr": 8.39160318106433e-06, "epoch": 3.3684210526315788, "percentage": 33.68, "elapsed_time": "0:42:21", "remaining_time": "1:23:24"} +{"current_steps": 833, "total_steps": 2470, "loss": 1.8193, "lr": 8.386407858128707e-06, "epoch": 3.3724696356275303, "percentage": 33.72, "elapsed_time": "0:42:24", "remaining_time": "1:23:20"} +{"current_steps": 834, "total_steps": 2470, "loss": 1.4172, "lr": 8.381205771871918e-06, "epoch": 3.376518218623482, "percentage": 33.77, "elapsed_time": "0:42:27", "remaining_time": "1:23:17"} +{"current_steps": 835, "total_steps": 2470, "loss": 1.5949, "lr": 8.375996932683553e-06, "epoch": 3.380566801619433, "percentage": 33.81, "elapsed_time": "0:42:30", "remaining_time": "1:23:14"} +{"current_steps": 836, "total_steps": 2470, "loss": 1.4156, "lr": 8.370781350966683e-06, "epoch": 3.3846153846153846, "percentage": 33.85, "elapsed_time": "0:42:33", "remaining_time": "1:23:11"} +{"current_steps": 837, "total_steps": 2470, "loss": 1.4714, "lr": 8.36555903713785e-06, "epoch": 3.388663967611336, "percentage": 33.89, "elapsed_time": "0:42:36", "remaining_time": "1:23:08"} +{"current_steps": 838, "total_steps": 2470, "loss": 1.6429, "lr": 8.360330001627043e-06, "epoch": 3.3927125506072873, "percentage": 33.93, "elapsed_time": "0:42:39", "remaining_time": "1:23:05"} +{"current_steps": 839, "total_steps": 2470, "loss": 1.4713, "lr": 8.355094254877665e-06, "epoch": 3.396761133603239, "percentage": 33.97, "elapsed_time": "0:42:42", "remaining_time": "1:23:02"} +{"current_steps": 840, "total_steps": 2470, "loss": 1.5146, "lr": 8.349851807346535e-06, "epoch": 3.4008097165991904, "percentage": 34.01, "elapsed_time": "0:42:46", "remaining_time": "1:22:59"} +{"current_steps": 841, "total_steps": 2470, "loss": 1.5871, 
"lr": 8.344602669503849e-06, "epoch": 3.4048582995951415, "percentage": 34.05, "elapsed_time": "0:42:49", "remaining_time": "1:22:56"} +{"current_steps": 842, "total_steps": 2470, "loss": 1.6862, "lr": 8.339346851833163e-06, "epoch": 3.408906882591093, "percentage": 34.09, "elapsed_time": "0:42:52", "remaining_time": "1:22:53"} +{"current_steps": 843, "total_steps": 2470, "loss": 1.5214, "lr": 8.334084364831381e-06, "epoch": 3.4129554655870447, "percentage": 34.13, "elapsed_time": "0:42:55", "remaining_time": "1:22:50"} +{"current_steps": 844, "total_steps": 2470, "loss": 1.8219, "lr": 8.328815219008719e-06, "epoch": 3.417004048582996, "percentage": 34.17, "elapsed_time": "0:42:58", "remaining_time": "1:22:46"} +{"current_steps": 845, "total_steps": 2470, "loss": 1.8941, "lr": 8.323539424888695e-06, "epoch": 3.4210526315789473, "percentage": 34.21, "elapsed_time": "0:43:01", "remaining_time": "1:22:43"} +{"current_steps": 846, "total_steps": 2470, "loss": 1.7539, "lr": 8.318256993008108e-06, "epoch": 3.425101214574899, "percentage": 34.25, "elapsed_time": "0:43:04", "remaining_time": "1:22:40"} +{"current_steps": 847, "total_steps": 2470, "loss": 1.8598, "lr": 8.31296793391701e-06, "epoch": 3.42914979757085, "percentage": 34.29, "elapsed_time": "0:43:07", "remaining_time": "1:22:37"} +{"current_steps": 848, "total_steps": 2470, "loss": 1.9574, "lr": 8.30767225817869e-06, "epoch": 3.4331983805668016, "percentage": 34.33, "elapsed_time": "0:43:10", "remaining_time": "1:22:34"} +{"current_steps": 849, "total_steps": 2470, "loss": 1.736, "lr": 8.302369976369651e-06, "epoch": 3.437246963562753, "percentage": 34.37, "elapsed_time": "0:43:13", "remaining_time": "1:22:31"} +{"current_steps": 850, "total_steps": 2470, "loss": 1.6581, "lr": 8.297061099079592e-06, "epoch": 3.4412955465587043, "percentage": 34.41, "elapsed_time": "0:43:16", "remaining_time": "1:22:28"} +{"current_steps": 851, "total_steps": 2470, "loss": 1.9183, "lr": 8.291745636911382e-06, "epoch": 3.445344129554656, "percentage": 34.45, "elapsed_time": "0:43:19", "remaining_time": "1:22:25"} +{"current_steps": 852, "total_steps": 2470, "loss": 1.6869, "lr": 8.286423600481044e-06, "epoch": 3.4493927125506074, "percentage": 34.49, "elapsed_time": "0:43:22", "remaining_time": "1:22:22"} +{"current_steps": 853, "total_steps": 2470, "loss": 1.6709, "lr": 8.281095000417725e-06, "epoch": 3.4534412955465585, "percentage": 34.53, "elapsed_time": "0:43:25", "remaining_time": "1:22:19"} +{"current_steps": 854, "total_steps": 2470, "loss": 2.079, "lr": 8.27575984736369e-06, "epoch": 3.45748987854251, "percentage": 34.57, "elapsed_time": "0:43:29", "remaining_time": "1:22:17"} +{"current_steps": 855, "total_steps": 2470, "loss": 2.3146, "lr": 8.270418151974286e-06, "epoch": 3.4615384615384617, "percentage": 34.62, "elapsed_time": "0:43:32", "remaining_time": "1:22:14"} +{"current_steps": 856, "total_steps": 2470, "loss": 1.9175, "lr": 8.265069924917925e-06, "epoch": 3.465587044534413, "percentage": 34.66, "elapsed_time": "0:43:35", "remaining_time": "1:22:10"} +{"current_steps": 857, "total_steps": 2470, "loss": 1.8725, "lr": 8.259715176876069e-06, "epoch": 3.4696356275303644, "percentage": 34.7, "elapsed_time": "0:43:38", "remaining_time": "1:22:07"} +{"current_steps": 858, "total_steps": 2470, "loss": 1.7809, "lr": 8.254353918543199e-06, "epoch": 3.473684210526316, "percentage": 34.74, "elapsed_time": "0:43:41", "remaining_time": "1:22:04"} +{"current_steps": 859, "total_steps": 2470, "loss": 1.8016, "lr": 8.2489861606268e-06, "epoch": 
3.477732793522267, "percentage": 34.78, "elapsed_time": "0:43:44", "remaining_time": "1:22:01"} +{"current_steps": 860, "total_steps": 2470, "loss": 1.7188, "lr": 8.243611913847337e-06, "epoch": 3.4817813765182186, "percentage": 34.82, "elapsed_time": "0:43:47", "remaining_time": "1:21:58"} +{"current_steps": 861, "total_steps": 2470, "loss": 1.6913, "lr": 8.238231188938237e-06, "epoch": 3.48582995951417, "percentage": 34.86, "elapsed_time": "0:43:50", "remaining_time": "1:21:56"} +{"current_steps": 862, "total_steps": 2470, "loss": 1.6242, "lr": 8.232843996645865e-06, "epoch": 3.4898785425101213, "percentage": 34.9, "elapsed_time": "0:43:53", "remaining_time": "1:21:53"} +{"current_steps": 863, "total_steps": 2470, "loss": 1.6881, "lr": 8.2274503477295e-06, "epoch": 3.493927125506073, "percentage": 34.94, "elapsed_time": "0:43:56", "remaining_time": "1:21:50"} +{"current_steps": 864, "total_steps": 2470, "loss": 1.5087, "lr": 8.222050252961318e-06, "epoch": 3.4979757085020244, "percentage": 34.98, "elapsed_time": "0:43:59", "remaining_time": "1:21:47"} +{"current_steps": 865, "total_steps": 2470, "loss": 1.4331, "lr": 8.216643723126367e-06, "epoch": 3.5020242914979756, "percentage": 35.02, "elapsed_time": "0:44:02", "remaining_time": "1:21:43"} +{"current_steps": 866, "total_steps": 2470, "loss": 1.7553, "lr": 8.211230769022552e-06, "epoch": 3.506072874493927, "percentage": 35.06, "elapsed_time": "0:44:05", "remaining_time": "1:21:40"} +{"current_steps": 867, "total_steps": 2470, "loss": 1.782, "lr": 8.2058114014606e-06, "epoch": 3.5101214574898787, "percentage": 35.1, "elapsed_time": "0:44:08", "remaining_time": "1:21:37"} +{"current_steps": 868, "total_steps": 2470, "loss": 1.7357, "lr": 8.200385631264051e-06, "epoch": 3.51417004048583, "percentage": 35.14, "elapsed_time": "0:44:12", "remaining_time": "1:21:34"} +{"current_steps": 869, "total_steps": 2470, "loss": 1.7569, "lr": 8.19495346926924e-06, "epoch": 3.5182186234817814, "percentage": 35.18, "elapsed_time": "0:44:15", "remaining_time": "1:21:31"} +{"current_steps": 870, "total_steps": 2470, "loss": 1.7036, "lr": 8.189514926325255e-06, "epoch": 3.522267206477733, "percentage": 35.22, "elapsed_time": "0:44:18", "remaining_time": "1:21:28"} +{"current_steps": 871, "total_steps": 2470, "loss": 1.4984, "lr": 8.184070013293936e-06, "epoch": 3.526315789473684, "percentage": 35.26, "elapsed_time": "0:44:21", "remaining_time": "1:21:25"} +{"current_steps": 872, "total_steps": 2470, "loss": 1.5719, "lr": 8.178618741049841e-06, "epoch": 3.5303643724696356, "percentage": 35.3, "elapsed_time": "0:44:24", "remaining_time": "1:21:22"} +{"current_steps": 873, "total_steps": 2470, "loss": 1.7235, "lr": 8.173161120480232e-06, "epoch": 3.534412955465587, "percentage": 35.34, "elapsed_time": "0:44:27", "remaining_time": "1:21:19"} +{"current_steps": 874, "total_steps": 2470, "loss": 1.8976, "lr": 8.16769716248505e-06, "epoch": 3.5384615384615383, "percentage": 35.38, "elapsed_time": "0:44:30", "remaining_time": "1:21:16"} +{"current_steps": 875, "total_steps": 2470, "loss": 1.797, "lr": 8.162226877976886e-06, "epoch": 3.54251012145749, "percentage": 35.43, "elapsed_time": "0:44:33", "remaining_time": "1:21:13"} +{"current_steps": 876, "total_steps": 2470, "loss": 2.2212, "lr": 8.156750277880979e-06, "epoch": 3.5465587044534415, "percentage": 35.47, "elapsed_time": "0:44:36", "remaining_time": "1:21:09"} +{"current_steps": 877, "total_steps": 2470, "loss": 2.2759, "lr": 8.15126737313517e-06, "epoch": 3.5506072874493926, "percentage": 35.51, 
"elapsed_time": "0:44:39", "remaining_time": "1:21:06"} +{"current_steps": 878, "total_steps": 2470, "loss": 2.5045, "lr": 8.145778174689897e-06, "epoch": 3.554655870445344, "percentage": 35.55, "elapsed_time": "0:44:42", "remaining_time": "1:21:03"} +{"current_steps": 879, "total_steps": 2470, "loss": 1.702, "lr": 8.140282693508168e-06, "epoch": 3.5587044534412957, "percentage": 35.59, "elapsed_time": "0:44:45", "remaining_time": "1:21:00"} +{"current_steps": 880, "total_steps": 2470, "loss": 1.5859, "lr": 8.134780940565535e-06, "epoch": 3.562753036437247, "percentage": 35.63, "elapsed_time": "0:44:48", "remaining_time": "1:20:57"} +{"current_steps": 881, "total_steps": 2470, "loss": 1.9019, "lr": 8.129272926850079e-06, "epoch": 3.5668016194331984, "percentage": 35.67, "elapsed_time": "0:44:51", "remaining_time": "1:20:54"} +{"current_steps": 882, "total_steps": 2470, "loss": 1.5424, "lr": 8.123758663362386e-06, "epoch": 3.57085020242915, "percentage": 35.71, "elapsed_time": "0:44:54", "remaining_time": "1:20:51"} +{"current_steps": 883, "total_steps": 2470, "loss": 1.8581, "lr": 8.118238161115523e-06, "epoch": 3.574898785425101, "percentage": 35.75, "elapsed_time": "0:44:57", "remaining_time": "1:20:48"} +{"current_steps": 884, "total_steps": 2470, "loss": 1.5914, "lr": 8.112711431135014e-06, "epoch": 3.5789473684210527, "percentage": 35.79, "elapsed_time": "0:45:00", "remaining_time": "1:20:45"} +{"current_steps": 885, "total_steps": 2470, "loss": 1.7957, "lr": 8.107178484458825e-06, "epoch": 3.582995951417004, "percentage": 35.83, "elapsed_time": "0:45:03", "remaining_time": "1:20:42"} +{"current_steps": 886, "total_steps": 2470, "loss": 1.7404, "lr": 8.101639332137337e-06, "epoch": 3.5870445344129553, "percentage": 35.87, "elapsed_time": "0:45:06", "remaining_time": "1:20:39"} +{"current_steps": 887, "total_steps": 2470, "loss": 1.7127, "lr": 8.096093985233323e-06, "epoch": 3.591093117408907, "percentage": 35.91, "elapsed_time": "0:45:09", "remaining_time": "1:20:35"} +{"current_steps": 888, "total_steps": 2470, "loss": 1.4308, "lr": 8.090542454821929e-06, "epoch": 3.5951417004048585, "percentage": 35.95, "elapsed_time": "0:45:12", "remaining_time": "1:20:32"} +{"current_steps": 889, "total_steps": 2470, "loss": 1.4797, "lr": 8.084984751990652e-06, "epoch": 3.5991902834008096, "percentage": 35.99, "elapsed_time": "0:45:15", "remaining_time": "1:20:29"} +{"current_steps": 890, "total_steps": 2470, "loss": 1.6173, "lr": 8.079420887839316e-06, "epoch": 3.603238866396761, "percentage": 36.03, "elapsed_time": "0:45:18", "remaining_time": "1:20:26"} +{"current_steps": 891, "total_steps": 2470, "loss": 1.4952, "lr": 8.073850873480047e-06, "epoch": 3.6072874493927127, "percentage": 36.07, "elapsed_time": "0:45:21", "remaining_time": "1:20:23"} +{"current_steps": 892, "total_steps": 2470, "loss": 1.813, "lr": 8.068274720037261e-06, "epoch": 3.611336032388664, "percentage": 36.11, "elapsed_time": "0:45:24", "remaining_time": "1:20:20"} +{"current_steps": 893, "total_steps": 2470, "loss": 1.7376, "lr": 8.062692438647628e-06, "epoch": 3.6153846153846154, "percentage": 36.15, "elapsed_time": "0:45:27", "remaining_time": "1:20:17"} +{"current_steps": 894, "total_steps": 2470, "loss": 1.505, "lr": 8.057104040460062e-06, "epoch": 3.619433198380567, "percentage": 36.19, "elapsed_time": "0:45:30", "remaining_time": "1:20:14"} +{"current_steps": 895, "total_steps": 2470, "loss": 1.9039, "lr": 8.051509536635686e-06, "epoch": 3.623481781376518, "percentage": 36.23, "elapsed_time": "0:45:34", 
"remaining_time": "1:20:11"} +{"current_steps": 896, "total_steps": 2470, "loss": 1.7125, "lr": 8.045908938347828e-06, "epoch": 3.6275303643724697, "percentage": 36.28, "elapsed_time": "0:45:37", "remaining_time": "1:20:08"} +{"current_steps": 897, "total_steps": 2470, "loss": 1.9514, "lr": 8.04030225678198e-06, "epoch": 3.6315789473684212, "percentage": 36.32, "elapsed_time": "0:45:40", "remaining_time": "1:20:05"} +{"current_steps": 898, "total_steps": 2470, "loss": 1.597, "lr": 8.034689503135785e-06, "epoch": 3.6356275303643724, "percentage": 36.36, "elapsed_time": "0:45:43", "remaining_time": "1:20:02"} +{"current_steps": 899, "total_steps": 2470, "loss": 1.8072, "lr": 8.029070688619013e-06, "epoch": 3.639676113360324, "percentage": 36.4, "elapsed_time": "0:45:46", "remaining_time": "1:19:58"} +{"current_steps": 900, "total_steps": 2470, "loss": 1.7289, "lr": 8.023445824453539e-06, "epoch": 3.6437246963562755, "percentage": 36.44, "elapsed_time": "0:45:49", "remaining_time": "1:19:55"} +{"current_steps": 901, "total_steps": 2470, "loss": 1.7658, "lr": 8.017814921873326e-06, "epoch": 3.6477732793522266, "percentage": 36.48, "elapsed_time": "0:45:52", "remaining_time": "1:19:52"} +{"current_steps": 902, "total_steps": 2470, "loss": 1.6002, "lr": 8.012177992124385e-06, "epoch": 3.651821862348178, "percentage": 36.52, "elapsed_time": "0:45:55", "remaining_time": "1:19:50"} +{"current_steps": 903, "total_steps": 2470, "loss": 1.8275, "lr": 8.006535046464774e-06, "epoch": 3.6558704453441297, "percentage": 36.56, "elapsed_time": "0:45:58", "remaining_time": "1:19:47"} +{"current_steps": 904, "total_steps": 2470, "loss": 1.6502, "lr": 8.000886096164564e-06, "epoch": 3.659919028340081, "percentage": 36.6, "elapsed_time": "0:46:01", "remaining_time": "1:19:44"} +{"current_steps": 905, "total_steps": 2470, "loss": 1.8017, "lr": 7.995231152505815e-06, "epoch": 3.6639676113360324, "percentage": 36.64, "elapsed_time": "0:46:04", "remaining_time": "1:19:41"} +{"current_steps": 906, "total_steps": 2470, "loss": 1.8138, "lr": 7.989570226782562e-06, "epoch": 3.668016194331984, "percentage": 36.68, "elapsed_time": "0:46:07", "remaining_time": "1:19:38"} +{"current_steps": 907, "total_steps": 2470, "loss": 1.8128, "lr": 7.983903330300782e-06, "epoch": 3.672064777327935, "percentage": 36.72, "elapsed_time": "0:46:10", "remaining_time": "1:19:35"} +{"current_steps": 908, "total_steps": 2470, "loss": 1.7148, "lr": 7.978230474378383e-06, "epoch": 3.6761133603238867, "percentage": 36.76, "elapsed_time": "0:46:14", "remaining_time": "1:19:32"} +{"current_steps": 909, "total_steps": 2470, "loss": 1.7726, "lr": 7.97255167034517e-06, "epoch": 3.6801619433198383, "percentage": 36.8, "elapsed_time": "0:46:17", "remaining_time": "1:19:29"} +{"current_steps": 910, "total_steps": 2470, "loss": 1.5779, "lr": 7.966866929542827e-06, "epoch": 3.6842105263157894, "percentage": 36.84, "elapsed_time": "0:46:20", "remaining_time": "1:19:26"} +{"current_steps": 911, "total_steps": 2470, "loss": 1.7465, "lr": 7.961176263324902e-06, "epoch": 3.688259109311741, "percentage": 36.88, "elapsed_time": "0:46:23", "remaining_time": "1:19:23"} +{"current_steps": 912, "total_steps": 2470, "loss": 1.7608, "lr": 7.955479683056767e-06, "epoch": 3.6923076923076925, "percentage": 36.92, "elapsed_time": "0:46:26", "remaining_time": "1:19:20"} +{"current_steps": 913, "total_steps": 2470, "loss": 1.5992, "lr": 7.949777200115617e-06, "epoch": 3.6963562753036436, "percentage": 36.96, "elapsed_time": "0:46:29", "remaining_time": "1:19:17"} 
+{"current_steps": 914, "total_steps": 2470, "loss": 2.089, "lr": 7.944068825890424e-06, "epoch": 3.700404858299595, "percentage": 37.0, "elapsed_time": "0:46:32", "remaining_time": "1:19:14"} +{"current_steps": 915, "total_steps": 2470, "loss": 1.8514, "lr": 7.938354571781933e-06, "epoch": 3.7044534412955468, "percentage": 37.04, "elapsed_time": "0:46:35", "remaining_time": "1:19:11"} +{"current_steps": 916, "total_steps": 2470, "loss": 1.4493, "lr": 7.932634449202635e-06, "epoch": 3.708502024291498, "percentage": 37.09, "elapsed_time": "0:46:38", "remaining_time": "1:19:07"} +{"current_steps": 917, "total_steps": 2470, "loss": 1.6351, "lr": 7.92690846957673e-06, "epoch": 3.7125506072874495, "percentage": 37.13, "elapsed_time": "0:46:41", "remaining_time": "1:19:04"} +{"current_steps": 918, "total_steps": 2470, "loss": 1.7253, "lr": 7.921176644340132e-06, "epoch": 3.716599190283401, "percentage": 37.17, "elapsed_time": "0:46:44", "remaining_time": "1:19:01"} +{"current_steps": 919, "total_steps": 2470, "loss": 1.5384, "lr": 7.915438984940415e-06, "epoch": 3.720647773279352, "percentage": 37.21, "elapsed_time": "0:46:47", "remaining_time": "1:18:58"} +{"current_steps": 920, "total_steps": 2470, "loss": 1.6518, "lr": 7.909695502836814e-06, "epoch": 3.7246963562753037, "percentage": 37.25, "elapsed_time": "0:46:50", "remaining_time": "1:18:55"} +{"current_steps": 921, "total_steps": 2470, "loss": 1.8741, "lr": 7.903946209500189e-06, "epoch": 3.7287449392712553, "percentage": 37.29, "elapsed_time": "0:46:53", "remaining_time": "1:18:52"} +{"current_steps": 922, "total_steps": 2470, "loss": 1.6996, "lr": 7.898191116413007e-06, "epoch": 3.7327935222672064, "percentage": 37.33, "elapsed_time": "0:46:56", "remaining_time": "1:18:49"} +{"current_steps": 923, "total_steps": 2470, "loss": 1.7427, "lr": 7.892430235069317e-06, "epoch": 3.736842105263158, "percentage": 37.37, "elapsed_time": "0:46:59", "remaining_time": "1:18:46"} +{"current_steps": 924, "total_steps": 2470, "loss": 2.4106, "lr": 7.886663576974733e-06, "epoch": 3.7408906882591095, "percentage": 37.41, "elapsed_time": "0:47:02", "remaining_time": "1:18:43"} +{"current_steps": 925, "total_steps": 2470, "loss": 1.808, "lr": 7.880891153646401e-06, "epoch": 3.7449392712550607, "percentage": 37.45, "elapsed_time": "0:47:05", "remaining_time": "1:18:40"} +{"current_steps": 926, "total_steps": 2470, "loss": 1.6368, "lr": 7.875112976612984e-06, "epoch": 3.748987854251012, "percentage": 37.49, "elapsed_time": "0:47:08", "remaining_time": "1:18:37"} +{"current_steps": 927, "total_steps": 2470, "loss": 1.5175, "lr": 7.869329057414635e-06, "epoch": 3.753036437246964, "percentage": 37.53, "elapsed_time": "0:47:12", "remaining_time": "1:18:33"} +{"current_steps": 928, "total_steps": 2470, "loss": 1.7423, "lr": 7.863539407602976e-06, "epoch": 3.757085020242915, "percentage": 37.57, "elapsed_time": "0:47:15", "remaining_time": "1:18:30"} +{"current_steps": 929, "total_steps": 2470, "loss": 2.5332, "lr": 7.857744038741076e-06, "epoch": 3.7611336032388665, "percentage": 37.61, "elapsed_time": "0:47:18", "remaining_time": "1:18:27"} +{"current_steps": 930, "total_steps": 2470, "loss": 2.3287, "lr": 7.85194296240342e-06, "epoch": 3.765182186234818, "percentage": 37.65, "elapsed_time": "0:47:21", "remaining_time": "1:18:24"} +{"current_steps": 931, "total_steps": 2470, "loss": 2.1714, "lr": 7.846136190175901e-06, "epoch": 3.769230769230769, "percentage": 37.69, "elapsed_time": "0:47:24", "remaining_time": "1:18:21"} +{"current_steps": 932, "total_steps": 
2470, "loss": 1.671, "lr": 7.84032373365578e-06, "epoch": 3.7732793522267207, "percentage": 37.73, "elapsed_time": "0:47:27", "remaining_time": "1:18:18"} +{"current_steps": 933, "total_steps": 2470, "loss": 1.9108, "lr": 7.834505604451672e-06, "epoch": 3.7773279352226723, "percentage": 37.77, "elapsed_time": "0:47:30", "remaining_time": "1:18:15"} +{"current_steps": 934, "total_steps": 2470, "loss": 1.9396, "lr": 7.828681814183527e-06, "epoch": 3.7813765182186234, "percentage": 37.81, "elapsed_time": "0:47:33", "remaining_time": "1:18:12"} +{"current_steps": 935, "total_steps": 2470, "loss": 1.7587, "lr": 7.822852374482597e-06, "epoch": 3.785425101214575, "percentage": 37.85, "elapsed_time": "0:47:36", "remaining_time": "1:18:09"} +{"current_steps": 936, "total_steps": 2470, "loss": 1.6507, "lr": 7.817017296991411e-06, "epoch": 3.7894736842105265, "percentage": 37.89, "elapsed_time": "0:47:39", "remaining_time": "1:18:05"} +{"current_steps": 937, "total_steps": 2470, "loss": 1.7372, "lr": 7.811176593363771e-06, "epoch": 3.7935222672064777, "percentage": 37.94, "elapsed_time": "0:47:42", "remaining_time": "1:18:02"} +{"current_steps": 938, "total_steps": 2470, "loss": 1.7485, "lr": 7.805330275264707e-06, "epoch": 3.7975708502024292, "percentage": 37.98, "elapsed_time": "0:47:45", "remaining_time": "1:17:59"} +{"current_steps": 939, "total_steps": 2470, "loss": 1.5515, "lr": 7.79947835437046e-06, "epoch": 3.801619433198381, "percentage": 38.02, "elapsed_time": "0:47:48", "remaining_time": "1:17:56"} +{"current_steps": 940, "total_steps": 2470, "loss": 1.4447, "lr": 7.79362084236847e-06, "epoch": 3.805668016194332, "percentage": 38.06, "elapsed_time": "0:47:51", "remaining_time": "1:17:53"} +{"current_steps": 941, "total_steps": 2470, "loss": 1.8015, "lr": 7.787757750957335e-06, "epoch": 3.8097165991902835, "percentage": 38.1, "elapsed_time": "0:47:54", "remaining_time": "1:17:50"} +{"current_steps": 942, "total_steps": 2470, "loss": 1.7528, "lr": 7.781889091846799e-06, "epoch": 3.813765182186235, "percentage": 38.14, "elapsed_time": "0:47:57", "remaining_time": "1:17:47"} +{"current_steps": 943, "total_steps": 2470, "loss": 1.5226, "lr": 7.776014876757727e-06, "epoch": 3.817813765182186, "percentage": 38.18, "elapsed_time": "0:48:00", "remaining_time": "1:17:44"} +{"current_steps": 944, "total_steps": 2470, "loss": 2.3966, "lr": 7.77013511742208e-06, "epoch": 3.8218623481781377, "percentage": 38.22, "elapsed_time": "0:48:03", "remaining_time": "1:17:41"} +{"current_steps": 945, "total_steps": 2470, "loss": 3.7738, "lr": 7.76424982558289e-06, "epoch": 3.8259109311740893, "percentage": 38.26, "elapsed_time": "0:48:06", "remaining_time": "1:17:38"} +{"current_steps": 946, "total_steps": 2470, "loss": 1.6137, "lr": 7.758359012994242e-06, "epoch": 3.8299595141700404, "percentage": 38.3, "elapsed_time": "0:48:09", "remaining_time": "1:17:35"} +{"current_steps": 947, "total_steps": 2470, "loss": 1.4666, "lr": 7.752462691421245e-06, "epoch": 3.834008097165992, "percentage": 38.34, "elapsed_time": "0:48:12", "remaining_time": "1:17:32"} +{"current_steps": 948, "total_steps": 2470, "loss": 1.5791, "lr": 7.746560872640007e-06, "epoch": 3.8380566801619436, "percentage": 38.38, "elapsed_time": "0:48:15", "remaining_time": "1:17:28"} +{"current_steps": 949, "total_steps": 2470, "loss": 1.5937, "lr": 7.740653568437623e-06, "epoch": 3.8421052631578947, "percentage": 38.42, "elapsed_time": "0:48:18", "remaining_time": "1:17:25"} +{"current_steps": 950, "total_steps": 2470, "loss": 1.5169, "lr": 
7.734740790612137e-06, "epoch": 3.8461538461538463, "percentage": 38.46, "elapsed_time": "0:48:22", "remaining_time": "1:17:23"} +{"current_steps": 951, "total_steps": 2470, "loss": 1.6162, "lr": 7.728822550972523e-06, "epoch": 3.850202429149798, "percentage": 38.5, "elapsed_time": "0:48:25", "remaining_time": "1:17:20"} +{"current_steps": 952, "total_steps": 2470, "loss": 1.7001, "lr": 7.722898861338674e-06, "epoch": 3.854251012145749, "percentage": 38.54, "elapsed_time": "0:48:28", "remaining_time": "1:17:17"} +{"current_steps": 953, "total_steps": 2470, "loss": 1.6257, "lr": 7.716969733541357e-06, "epoch": 3.8582995951417005, "percentage": 38.58, "elapsed_time": "0:48:31", "remaining_time": "1:17:14"} +{"current_steps": 954, "total_steps": 2470, "loss": 1.6058, "lr": 7.711035179422205e-06, "epoch": 3.862348178137652, "percentage": 38.62, "elapsed_time": "0:48:34", "remaining_time": "1:17:11"} +{"current_steps": 955, "total_steps": 2470, "loss": 1.6468, "lr": 7.705095210833687e-06, "epoch": 3.866396761133603, "percentage": 38.66, "elapsed_time": "0:48:37", "remaining_time": "1:17:08"} +{"current_steps": 956, "total_steps": 2470, "loss": 2.1392, "lr": 7.699149839639086e-06, "epoch": 3.8704453441295548, "percentage": 38.7, "elapsed_time": "0:48:40", "remaining_time": "1:17:05"} +{"current_steps": 957, "total_steps": 2470, "loss": 2.0741, "lr": 7.693199077712476e-06, "epoch": 3.8744939271255063, "percentage": 38.74, "elapsed_time": "0:48:43", "remaining_time": "1:17:02"} +{"current_steps": 958, "total_steps": 2470, "loss": 1.8205, "lr": 7.687242936938694e-06, "epoch": 3.8785425101214575, "percentage": 38.79, "elapsed_time": "0:48:46", "remaining_time": "1:16:59"} +{"current_steps": 959, "total_steps": 2470, "loss": 1.7239, "lr": 7.681281429213328e-06, "epoch": 3.882591093117409, "percentage": 38.83, "elapsed_time": "0:48:49", "remaining_time": "1:16:56"} +{"current_steps": 960, "total_steps": 2470, "loss": 1.2702, "lr": 7.675314566442673e-06, "epoch": 3.8866396761133606, "percentage": 38.87, "elapsed_time": "0:48:52", "remaining_time": "1:16:53"} +{"current_steps": 961, "total_steps": 2470, "loss": 1.7654, "lr": 7.669342360543727e-06, "epoch": 3.8906882591093117, "percentage": 38.91, "elapsed_time": "0:48:55", "remaining_time": "1:16:50"} +{"current_steps": 962, "total_steps": 2470, "loss": 1.8567, "lr": 7.663364823444157e-06, "epoch": 3.8947368421052633, "percentage": 38.95, "elapsed_time": "0:48:58", "remaining_time": "1:16:47"} +{"current_steps": 963, "total_steps": 2470, "loss": 1.5513, "lr": 7.65738196708228e-06, "epoch": 3.898785425101215, "percentage": 38.99, "elapsed_time": "0:49:02", "remaining_time": "1:16:43"} +{"current_steps": 964, "total_steps": 2470, "loss": 1.6101, "lr": 7.651393803407032e-06, "epoch": 3.902834008097166, "percentage": 39.03, "elapsed_time": "0:49:05", "remaining_time": "1:16:40"} +{"current_steps": 965, "total_steps": 2470, "loss": 1.7802, "lr": 7.645400344377953e-06, "epoch": 3.9068825910931175, "percentage": 39.07, "elapsed_time": "0:49:08", "remaining_time": "1:16:37"} +{"current_steps": 966, "total_steps": 2470, "loss": 1.3433, "lr": 7.639401601965158e-06, "epoch": 3.910931174089069, "percentage": 39.11, "elapsed_time": "0:49:11", "remaining_time": "1:16:34"} +{"current_steps": 967, "total_steps": 2470, "loss": 1.4571, "lr": 7.63339758814931e-06, "epoch": 3.91497975708502, "percentage": 39.15, "elapsed_time": "0:49:14", "remaining_time": "1:16:31"} +{"current_steps": 968, "total_steps": 2470, "loss": 1.4798, "lr": 7.627388314921602e-06, "epoch": 
3.919028340080972, "percentage": 39.19, "elapsed_time": "0:49:17", "remaining_time": "1:16:28"} +{"current_steps": 969, "total_steps": 2470, "loss": 1.7924, "lr": 7.621373794283735e-06, "epoch": 3.9230769230769234, "percentage": 39.23, "elapsed_time": "0:49:20", "remaining_time": "1:16:25"} +{"current_steps": 970, "total_steps": 2470, "loss": 1.6337, "lr": 7.615354038247889e-06, "epoch": 3.9271255060728745, "percentage": 39.27, "elapsed_time": "0:49:23", "remaining_time": "1:16:22"} +{"current_steps": 971, "total_steps": 2470, "loss": 1.6699, "lr": 7.609329058836694e-06, "epoch": 3.931174089068826, "percentage": 39.31, "elapsed_time": "0:49:26", "remaining_time": "1:16:19"} +{"current_steps": 972, "total_steps": 2470, "loss": 1.4692, "lr": 7.6032988680832195e-06, "epoch": 3.9352226720647776, "percentage": 39.35, "elapsed_time": "0:49:29", "remaining_time": "1:16:16"} +{"current_steps": 973, "total_steps": 2470, "loss": 1.3909, "lr": 7.597263478030939e-06, "epoch": 3.9392712550607287, "percentage": 39.39, "elapsed_time": "0:49:32", "remaining_time": "1:16:12"} +{"current_steps": 974, "total_steps": 2470, "loss": 1.6787, "lr": 7.59122290073371e-06, "epoch": 3.9433198380566803, "percentage": 39.43, "elapsed_time": "0:49:35", "remaining_time": "1:16:09"} +{"current_steps": 975, "total_steps": 2470, "loss": 1.4349, "lr": 7.5851771482557535e-06, "epoch": 3.9473684210526314, "percentage": 39.47, "elapsed_time": "0:49:38", "remaining_time": "1:16:06"} +{"current_steps": 976, "total_steps": 2470, "loss": 1.4016, "lr": 7.579126232671621e-06, "epoch": 3.951417004048583, "percentage": 39.51, "elapsed_time": "0:49:41", "remaining_time": "1:16:03"} +{"current_steps": 977, "total_steps": 2470, "loss": 1.6104, "lr": 7.5730701660661795e-06, "epoch": 3.9554655870445345, "percentage": 39.55, "elapsed_time": "0:49:44", "remaining_time": "1:16:00"} +{"current_steps": 978, "total_steps": 2470, "loss": 1.6231, "lr": 7.567008960534585e-06, "epoch": 3.9595141700404857, "percentage": 39.6, "elapsed_time": "0:49:47", "remaining_time": "1:15:57"} +{"current_steps": 979, "total_steps": 2470, "loss": 1.6679, "lr": 7.560942628182251e-06, "epoch": 3.9635627530364372, "percentage": 39.64, "elapsed_time": "0:49:50", "remaining_time": "1:15:54"} +{"current_steps": 980, "total_steps": 2470, "loss": 1.8633, "lr": 7.554871181124836e-06, "epoch": 3.967611336032389, "percentage": 39.68, "elapsed_time": "0:49:53", "remaining_time": "1:15:51"} +{"current_steps": 981, "total_steps": 2470, "loss": 1.768, "lr": 7.548794631488211e-06, "epoch": 3.97165991902834, "percentage": 39.72, "elapsed_time": "0:49:56", "remaining_time": "1:15:48"} +{"current_steps": 982, "total_steps": 2470, "loss": 1.6442, "lr": 7.5427129914084385e-06, "epoch": 3.9757085020242915, "percentage": 39.76, "elapsed_time": "0:49:59", "remaining_time": "1:15:45"} +{"current_steps": 983, "total_steps": 2470, "loss": 1.7358, "lr": 7.536626273031747e-06, "epoch": 3.979757085020243, "percentage": 39.8, "elapsed_time": "0:50:02", "remaining_time": "1:15:41"} +{"current_steps": 984, "total_steps": 2470, "loss": 1.8024, "lr": 7.530534488514507e-06, "epoch": 3.983805668016194, "percentage": 39.84, "elapsed_time": "0:50:05", "remaining_time": "1:15:38"} +{"current_steps": 985, "total_steps": 2470, "loss": 1.6063, "lr": 7.524437650023211e-06, "epoch": 3.9878542510121457, "percentage": 39.88, "elapsed_time": "0:50:08", "remaining_time": "1:15:35"} +{"current_steps": 986, "total_steps": 2470, "loss": 1.5544, "lr": 7.5183357697344395e-06, "epoch": 3.9919028340080973, "percentage": 
39.92, "elapsed_time": "0:50:11", "remaining_time": "1:15:32"} +{"current_steps": 987, "total_steps": 2470, "loss": 1.8733, "lr": 7.512228859834845e-06, "epoch": 3.9959514170040484, "percentage": 39.96, "elapsed_time": "0:50:14", "remaining_time": "1:15:29"} +{"current_steps": 988, "total_steps": 2470, "loss": 1.6136, "lr": 7.506116932521127e-06, "epoch": 4.0, "percentage": 40.0, "elapsed_time": "0:50:17", "remaining_time": "1:15:26"} +{"current_steps": 989, "total_steps": 2470, "loss": 1.6735, "lr": 7.500000000000001e-06, "epoch": 4.004048582995951, "percentage": 40.04, "elapsed_time": "0:50:20", "remaining_time": "1:15:23"} +{"current_steps": 990, "total_steps": 2470, "loss": 1.8144, "lr": 7.493878074488184e-06, "epoch": 4.008097165991903, "percentage": 40.08, "elapsed_time": "0:50:23", "remaining_time": "1:15:20"} +{"current_steps": 991, "total_steps": 2470, "loss": 1.6734, "lr": 7.4877511682123635e-06, "epoch": 4.012145748987854, "percentage": 40.12, "elapsed_time": "0:50:26", "remaining_time": "1:15:17"} +{"current_steps": 992, "total_steps": 2470, "loss": 1.8495, "lr": 7.481619293409173e-06, "epoch": 4.016194331983805, "percentage": 40.16, "elapsed_time": "0:50:29", "remaining_time": "1:15:13"} +{"current_steps": 993, "total_steps": 2470, "loss": 2.099, "lr": 7.475482462325169e-06, "epoch": 4.020242914979757, "percentage": 40.2, "elapsed_time": "0:50:32", "remaining_time": "1:15:10"} +{"current_steps": 994, "total_steps": 2470, "loss": 1.9446, "lr": 7.469340687216809e-06, "epoch": 4.0242914979757085, "percentage": 40.24, "elapsed_time": "0:50:35", "remaining_time": "1:15:07"} +{"current_steps": 995, "total_steps": 2470, "loss": 1.6196, "lr": 7.4631939803504215e-06, "epoch": 4.02834008097166, "percentage": 40.28, "elapsed_time": "0:50:38", "remaining_time": "1:15:04"} +{"current_steps": 996, "total_steps": 2470, "loss": 1.6221, "lr": 7.4570423540021905e-06, "epoch": 4.032388663967612, "percentage": 40.32, "elapsed_time": "0:50:41", "remaining_time": "1:15:01"} +{"current_steps": 997, "total_steps": 2470, "loss": 1.8749, "lr": 7.450885820458117e-06, "epoch": 4.036437246963563, "percentage": 40.36, "elapsed_time": "0:50:44", "remaining_time": "1:14:58"} +{"current_steps": 998, "total_steps": 2470, "loss": 1.6649, "lr": 7.44472439201401e-06, "epoch": 4.040485829959514, "percentage": 40.4, "elapsed_time": "0:50:48", "remaining_time": "1:14:56"} +{"current_steps": 999, "total_steps": 2470, "loss": 1.6799, "lr": 7.438558080975449e-06, "epoch": 4.044534412955466, "percentage": 40.45, "elapsed_time": "0:50:51", "remaining_time": "1:14:53"} +{"current_steps": 1000, "total_steps": 2470, "loss": 1.63, "lr": 7.4323868996577696e-06, "epoch": 4.048582995951417, "percentage": 40.49, "elapsed_time": "0:50:54", "remaining_time": "1:14:49"} +{"current_steps": 1001, "total_steps": 2470, "loss": 1.7354, "lr": 7.426210860386032e-06, "epoch": 4.052631578947368, "percentage": 40.53, "elapsed_time": "0:50:57", "remaining_time": "1:14:46"} +{"current_steps": 1002, "total_steps": 2470, "loss": 1.5703, "lr": 7.420029975494996e-06, "epoch": 4.05668016194332, "percentage": 40.57, "elapsed_time": "0:51:00", "remaining_time": "1:14:43"} +{"current_steps": 1003, "total_steps": 2470, "loss": 1.749, "lr": 7.413844257329104e-06, "epoch": 4.060728744939271, "percentage": 40.61, "elapsed_time": "0:51:03", "remaining_time": "1:14:40"} +{"current_steps": 1004, "total_steps": 2470, "loss": 1.6948, "lr": 7.407653718242449e-06, "epoch": 4.064777327935222, "percentage": 40.65, "elapsed_time": "0:51:06", "remaining_time": 
"1:14:37"} +{"current_steps": 1005, "total_steps": 2470, "loss": 1.8281, "lr": 7.401458370598753e-06, "epoch": 4.068825910931174, "percentage": 40.69, "elapsed_time": "0:51:09", "remaining_time": "1:14:34"} +{"current_steps": 1006, "total_steps": 2470, "loss": 1.7673, "lr": 7.395258226771341e-06, "epoch": 4.0728744939271255, "percentage": 40.73, "elapsed_time": "0:51:12", "remaining_time": "1:14:31"} +{"current_steps": 1007, "total_steps": 2470, "loss": 1.6958, "lr": 7.3890532991431174e-06, "epoch": 4.076923076923077, "percentage": 40.77, "elapsed_time": "0:51:15", "remaining_time": "1:14:28"} +{"current_steps": 1008, "total_steps": 2470, "loss": 1.7112, "lr": 7.382843600106539e-06, "epoch": 4.080971659919029, "percentage": 40.81, "elapsed_time": "0:51:18", "remaining_time": "1:14:25"} +{"current_steps": 1009, "total_steps": 2470, "loss": 1.7162, "lr": 7.376629142063597e-06, "epoch": 4.08502024291498, "percentage": 40.85, "elapsed_time": "0:51:21", "remaining_time": "1:14:22"} +{"current_steps": 1010, "total_steps": 2470, "loss": 1.7045, "lr": 7.370409937425781e-06, "epoch": 4.089068825910931, "percentage": 40.89, "elapsed_time": "0:51:24", "remaining_time": "1:14:19"} +{"current_steps": 1011, "total_steps": 2470, "loss": 1.7854, "lr": 7.364185998614064e-06, "epoch": 4.093117408906883, "percentage": 40.93, "elapsed_time": "0:51:27", "remaining_time": "1:14:16"} +{"current_steps": 1012, "total_steps": 2470, "loss": 1.534, "lr": 7.357957338058873e-06, "epoch": 4.097165991902834, "percentage": 40.97, "elapsed_time": "0:51:30", "remaining_time": "1:14:13"} +{"current_steps": 1013, "total_steps": 2470, "loss": 1.7001, "lr": 7.3517239682000675e-06, "epoch": 4.101214574898785, "percentage": 41.01, "elapsed_time": "0:51:33", "remaining_time": "1:14:09"} +{"current_steps": 1014, "total_steps": 2470, "loss": 1.7037, "lr": 7.345485901486908e-06, "epoch": 4.105263157894737, "percentage": 41.05, "elapsed_time": "0:51:36", "remaining_time": "1:14:06"} +{"current_steps": 1015, "total_steps": 2470, "loss": 1.6197, "lr": 7.33924315037804e-06, "epoch": 4.109311740890688, "percentage": 41.09, "elapsed_time": "0:51:39", "remaining_time": "1:14:03"} +{"current_steps": 1016, "total_steps": 2470, "loss": 1.5587, "lr": 7.332995727341462e-06, "epoch": 4.113360323886639, "percentage": 41.13, "elapsed_time": "0:51:42", "remaining_time": "1:14:00"} +{"current_steps": 1017, "total_steps": 2470, "loss": 1.4804, "lr": 7.326743644854504e-06, "epoch": 4.117408906882591, "percentage": 41.17, "elapsed_time": "0:51:45", "remaining_time": "1:13:57"} +{"current_steps": 1018, "total_steps": 2470, "loss": 1.5149, "lr": 7.3204869154038015e-06, "epoch": 4.1214574898785425, "percentage": 41.21, "elapsed_time": "0:51:49", "remaining_time": "1:13:54"} +{"current_steps": 1019, "total_steps": 2470, "loss": 1.5156, "lr": 7.314225551485273e-06, "epoch": 4.125506072874494, "percentage": 41.26, "elapsed_time": "0:51:52", "remaining_time": "1:13:51"} +{"current_steps": 1020, "total_steps": 2470, "loss": 1.4187, "lr": 7.30795956560409e-06, "epoch": 4.129554655870446, "percentage": 41.3, "elapsed_time": "0:51:55", "remaining_time": "1:13:48"} +{"current_steps": 1021, "total_steps": 2470, "loss": 1.7718, "lr": 7.301688970274655e-06, "epoch": 4.133603238866397, "percentage": 41.34, "elapsed_time": "0:51:58", "remaining_time": "1:13:45"} +{"current_steps": 1022, "total_steps": 2470, "loss": 1.6181, "lr": 7.295413778020579e-06, "epoch": 4.137651821862348, "percentage": 41.38, "elapsed_time": "0:52:01", "remaining_time": "1:13:42"} 
+{"current_steps": 1023, "total_steps": 2470, "loss": 1.3513, "lr": 7.289134001374654e-06, "epoch": 4.1417004048583, "percentage": 41.42, "elapsed_time": "0:52:04", "remaining_time": "1:13:38"} +{"current_steps": 1024, "total_steps": 2470, "loss": 1.7449, "lr": 7.282849652878824e-06, "epoch": 4.145748987854251, "percentage": 41.46, "elapsed_time": "0:52:07", "remaining_time": "1:13:35"} +{"current_steps": 1025, "total_steps": 2470, "loss": 1.56, "lr": 7.276560745084167e-06, "epoch": 4.149797570850202, "percentage": 41.5, "elapsed_time": "0:52:10", "remaining_time": "1:13:32"} +{"current_steps": 1026, "total_steps": 2470, "loss": 1.7373, "lr": 7.2702672905508656e-06, "epoch": 4.153846153846154, "percentage": 41.54, "elapsed_time": "0:52:13", "remaining_time": "1:13:29"} +{"current_steps": 1027, "total_steps": 2470, "loss": 1.8929, "lr": 7.263969301848188e-06, "epoch": 4.157894736842105, "percentage": 41.58, "elapsed_time": "0:52:16", "remaining_time": "1:13:26"} +{"current_steps": 1028, "total_steps": 2470, "loss": 1.6155, "lr": 7.257666791554448e-06, "epoch": 4.161943319838056, "percentage": 41.62, "elapsed_time": "0:52:19", "remaining_time": "1:13:23"} +{"current_steps": 1029, "total_steps": 2470, "loss": 1.6856, "lr": 7.251359772256998e-06, "epoch": 4.165991902834008, "percentage": 41.66, "elapsed_time": "0:52:22", "remaining_time": "1:13:20"} +{"current_steps": 1030, "total_steps": 2470, "loss": 2.1658, "lr": 7.245048256552195e-06, "epoch": 4.17004048582996, "percentage": 41.7, "elapsed_time": "0:52:25", "remaining_time": "1:13:17"} +{"current_steps": 1031, "total_steps": 2470, "loss": 1.5329, "lr": 7.2387322570453724e-06, "epoch": 4.174089068825911, "percentage": 41.74, "elapsed_time": "0:52:28", "remaining_time": "1:13:14"} +{"current_steps": 1032, "total_steps": 2470, "loss": 1.7115, "lr": 7.232411786350824e-06, "epoch": 4.178137651821863, "percentage": 41.78, "elapsed_time": "0:52:31", "remaining_time": "1:13:11"} +{"current_steps": 1033, "total_steps": 2470, "loss": 1.5227, "lr": 7.226086857091765e-06, "epoch": 4.182186234817814, "percentage": 41.82, "elapsed_time": "0:52:34", "remaining_time": "1:13:08"} +{"current_steps": 1034, "total_steps": 2470, "loss": 1.6826, "lr": 7.219757481900325e-06, "epoch": 4.186234817813765, "percentage": 41.86, "elapsed_time": "0:52:37", "remaining_time": "1:13:04"} +{"current_steps": 1035, "total_steps": 2470, "loss": 1.7019, "lr": 7.213423673417508e-06, "epoch": 4.190283400809717, "percentage": 41.9, "elapsed_time": "0:52:40", "remaining_time": "1:13:01"} +{"current_steps": 1036, "total_steps": 2470, "loss": 1.8899, "lr": 7.207085444293172e-06, "epoch": 4.194331983805668, "percentage": 41.94, "elapsed_time": "0:52:43", "remaining_time": "1:12:58"} +{"current_steps": 1037, "total_steps": 2470, "loss": 1.9495, "lr": 7.2007428071860045e-06, "epoch": 4.198380566801619, "percentage": 41.98, "elapsed_time": "0:52:46", "remaining_time": "1:12:55"} +{"current_steps": 1038, "total_steps": 2470, "loss": 1.6451, "lr": 7.194395774763496e-06, "epoch": 4.202429149797571, "percentage": 42.02, "elapsed_time": "0:52:49", "remaining_time": "1:12:52"} +{"current_steps": 1039, "total_steps": 2470, "loss": 1.9686, "lr": 7.188044359701917e-06, "epoch": 4.206477732793522, "percentage": 42.06, "elapsed_time": "0:52:52", "remaining_time": "1:12:49"} +{"current_steps": 1040, "total_steps": 2470, "loss": 2.0078, "lr": 7.181688574686292e-06, "epoch": 4.2105263157894735, "percentage": 42.11, "elapsed_time": "0:52:55", "remaining_time": "1:12:46"} +{"current_steps": 1041, 
"total_steps": 2470, "loss": 1.7921, "lr": 7.175328432410367e-06, "epoch": 4.2145748987854255, "percentage": 42.15, "elapsed_time": "0:52:58", "remaining_time": "1:12:43"} +{"current_steps": 1042, "total_steps": 2470, "loss": 1.6719, "lr": 7.168963945576597e-06, "epoch": 4.218623481781377, "percentage": 42.19, "elapsed_time": "0:53:01", "remaining_time": "1:12:40"} +{"current_steps": 1043, "total_steps": 2470, "loss": 1.5749, "lr": 7.162595126896111e-06, "epoch": 4.222672064777328, "percentage": 42.23, "elapsed_time": "0:53:04", "remaining_time": "1:12:37"} +{"current_steps": 1044, "total_steps": 2470, "loss": 1.7352, "lr": 7.15622198908869e-06, "epoch": 4.22672064777328, "percentage": 42.27, "elapsed_time": "0:53:07", "remaining_time": "1:12:34"} +{"current_steps": 1045, "total_steps": 2470, "loss": 1.5639, "lr": 7.149844544882742e-06, "epoch": 4.230769230769231, "percentage": 42.31, "elapsed_time": "0:53:10", "remaining_time": "1:12:30"} +{"current_steps": 1046, "total_steps": 2470, "loss": 1.8108, "lr": 7.143462807015271e-06, "epoch": 4.234817813765182, "percentage": 42.35, "elapsed_time": "0:53:14", "remaining_time": "1:12:28"} +{"current_steps": 1047, "total_steps": 2470, "loss": 1.7457, "lr": 7.137076788231865e-06, "epoch": 4.238866396761134, "percentage": 42.39, "elapsed_time": "0:53:17", "remaining_time": "1:12:25"} +{"current_steps": 1048, "total_steps": 2470, "loss": 1.7451, "lr": 7.130686501286655e-06, "epoch": 4.242914979757085, "percentage": 42.43, "elapsed_time": "0:53:20", "remaining_time": "1:12:22"} +{"current_steps": 1049, "total_steps": 2470, "loss": 1.5808, "lr": 7.1242919589422974e-06, "epoch": 4.246963562753036, "percentage": 42.47, "elapsed_time": "0:53:23", "remaining_time": "1:12:19"} +{"current_steps": 1050, "total_steps": 2470, "loss": 1.6597, "lr": 7.11789317396995e-06, "epoch": 4.251012145748988, "percentage": 42.51, "elapsed_time": "0:53:26", "remaining_time": "1:12:16"} +{"current_steps": 1051, "total_steps": 2470, "loss": 1.6728, "lr": 7.1114901591492404e-06, "epoch": 4.255060728744939, "percentage": 42.55, "elapsed_time": "0:53:29", "remaining_time": "1:12:13"} +{"current_steps": 1052, "total_steps": 2470, "loss": 1.561, "lr": 7.105082927268247e-06, "epoch": 4.2591093117408905, "percentage": 42.59, "elapsed_time": "0:53:32", "remaining_time": "1:12:10"} +{"current_steps": 1053, "total_steps": 2470, "loss": 1.4172, "lr": 7.0986714911234715e-06, "epoch": 4.2631578947368425, "percentage": 42.63, "elapsed_time": "0:53:35", "remaining_time": "1:12:07"} +{"current_steps": 1054, "total_steps": 2470, "loss": 1.6779, "lr": 7.092255863519806e-06, "epoch": 4.267206477732794, "percentage": 42.67, "elapsed_time": "0:53:38", "remaining_time": "1:12:04"} +{"current_steps": 1055, "total_steps": 2470, "loss": 1.6985, "lr": 7.085836057270521e-06, "epoch": 4.271255060728745, "percentage": 42.71, "elapsed_time": "0:53:41", "remaining_time": "1:12:01"} +{"current_steps": 1056, "total_steps": 2470, "loss": 1.7301, "lr": 7.079412085197229e-06, "epoch": 4.275303643724697, "percentage": 42.75, "elapsed_time": "0:53:44", "remaining_time": "1:11:58"} +{"current_steps": 1057, "total_steps": 2470, "loss": 1.7094, "lr": 7.072983960129862e-06, "epoch": 4.279352226720648, "percentage": 42.79, "elapsed_time": "0:53:47", "remaining_time": "1:11:55"} +{"current_steps": 1058, "total_steps": 2470, "loss": 1.3989, "lr": 7.066551694906651e-06, "epoch": 4.283400809716599, "percentage": 42.83, "elapsed_time": "0:53:50", "remaining_time": "1:11:51"} +{"current_steps": 1059, "total_steps": 2470, 
"loss": 1.5257, "lr": 7.060115302374087e-06, "epoch": 4.287449392712551, "percentage": 42.87, "elapsed_time": "0:53:53", "remaining_time": "1:11:48"} +{"current_steps": 1060, "total_steps": 2470, "loss": 1.3769, "lr": 7.053674795386914e-06, "epoch": 4.291497975708502, "percentage": 42.91, "elapsed_time": "0:53:57", "remaining_time": "1:11:45"} +{"current_steps": 1061, "total_steps": 2470, "loss": 1.6842, "lr": 7.047230186808085e-06, "epoch": 4.295546558704453, "percentage": 42.96, "elapsed_time": "0:54:00", "remaining_time": "1:11:42"} +{"current_steps": 1062, "total_steps": 2470, "loss": 1.8088, "lr": 7.04078148950875e-06, "epoch": 4.299595141700405, "percentage": 43.0, "elapsed_time": "0:54:03", "remaining_time": "1:11:39"} +{"current_steps": 1063, "total_steps": 2470, "loss": 1.5156, "lr": 7.034328716368224e-06, "epoch": 4.303643724696356, "percentage": 43.04, "elapsed_time": "0:54:06", "remaining_time": "1:11:36"} +{"current_steps": 1064, "total_steps": 2470, "loss": 1.5394, "lr": 7.027871880273959e-06, "epoch": 4.3076923076923075, "percentage": 43.08, "elapsed_time": "0:54:09", "remaining_time": "1:11:33"} +{"current_steps": 1065, "total_steps": 2470, "loss": 1.549, "lr": 7.021410994121525e-06, "epoch": 4.3117408906882595, "percentage": 43.12, "elapsed_time": "0:54:12", "remaining_time": "1:11:30"} +{"current_steps": 1066, "total_steps": 2470, "loss": 1.5296, "lr": 7.014946070814583e-06, "epoch": 4.315789473684211, "percentage": 43.16, "elapsed_time": "0:54:15", "remaining_time": "1:11:27"} +{"current_steps": 1067, "total_steps": 2470, "loss": 1.4361, "lr": 7.008477123264849e-06, "epoch": 4.319838056680162, "percentage": 43.2, "elapsed_time": "0:54:18", "remaining_time": "1:11:24"} +{"current_steps": 1068, "total_steps": 2470, "loss": 1.4498, "lr": 7.0020041643920826e-06, "epoch": 4.323886639676114, "percentage": 43.24, "elapsed_time": "0:54:21", "remaining_time": "1:11:21"} +{"current_steps": 1069, "total_steps": 2470, "loss": 1.4853, "lr": 6.995527207124053e-06, "epoch": 4.327935222672065, "percentage": 43.28, "elapsed_time": "0:54:24", "remaining_time": "1:11:18"} +{"current_steps": 1070, "total_steps": 2470, "loss": 1.4535, "lr": 6.989046264396516e-06, "epoch": 4.331983805668016, "percentage": 43.32, "elapsed_time": "0:54:27", "remaining_time": "1:11:15"} +{"current_steps": 1071, "total_steps": 2470, "loss": 1.5022, "lr": 6.982561349153188e-06, "epoch": 4.336032388663968, "percentage": 43.36, "elapsed_time": "0:54:30", "remaining_time": "1:11:11"} +{"current_steps": 1072, "total_steps": 2470, "loss": 1.4532, "lr": 6.976072474345713e-06, "epoch": 4.340080971659919, "percentage": 43.4, "elapsed_time": "0:54:33", "remaining_time": "1:11:08"} +{"current_steps": 1073, "total_steps": 2470, "loss": 1.4399, "lr": 6.96957965293365e-06, "epoch": 4.34412955465587, "percentage": 43.44, "elapsed_time": "0:54:36", "remaining_time": "1:11:05"} +{"current_steps": 1074, "total_steps": 2470, "loss": 1.615, "lr": 6.963082897884439e-06, "epoch": 4.348178137651822, "percentage": 43.48, "elapsed_time": "0:54:39", "remaining_time": "1:11:02"} +{"current_steps": 1075, "total_steps": 2470, "loss": 1.5412, "lr": 6.956582222173374e-06, "epoch": 4.352226720647773, "percentage": 43.52, "elapsed_time": "0:54:42", "remaining_time": "1:10:59"} +{"current_steps": 1076, "total_steps": 2470, "loss": 1.2047, "lr": 6.9500776387835785e-06, "epoch": 4.3562753036437245, "percentage": 43.56, "elapsed_time": "0:54:45", "remaining_time": "1:10:56"} +{"current_steps": 1077, "total_steps": 2470, "loss": 1.6101, "lr": 
6.943569160705985e-06, "epoch": 4.3603238866396765, "percentage": 43.6, "elapsed_time": "0:54:48", "remaining_time": "1:10:53"} +{"current_steps": 1078, "total_steps": 2470, "loss": 1.6897, "lr": 6.9370568009393e-06, "epoch": 4.364372469635628, "percentage": 43.64, "elapsed_time": "0:54:51", "remaining_time": "1:10:50"} +{"current_steps": 1079, "total_steps": 2470, "loss": 1.6066, "lr": 6.9305405724899876e-06, "epoch": 4.368421052631579, "percentage": 43.68, "elapsed_time": "0:54:54", "remaining_time": "1:10:47"} +{"current_steps": 1080, "total_steps": 2470, "loss": 1.6845, "lr": 6.924020488372229e-06, "epoch": 4.372469635627531, "percentage": 43.72, "elapsed_time": "0:54:57", "remaining_time": "1:10:44"} +{"current_steps": 1081, "total_steps": 2470, "loss": 1.3205, "lr": 6.917496561607915e-06, "epoch": 4.376518218623482, "percentage": 43.77, "elapsed_time": "0:55:00", "remaining_time": "1:10:41"} +{"current_steps": 1082, "total_steps": 2470, "loss": 1.4827, "lr": 6.91096880522661e-06, "epoch": 4.380566801619433, "percentage": 43.81, "elapsed_time": "0:55:04", "remaining_time": "1:10:38"} +{"current_steps": 1083, "total_steps": 2470, "loss": 1.2814, "lr": 6.904437232265521e-06, "epoch": 4.384615384615385, "percentage": 43.85, "elapsed_time": "0:55:07", "remaining_time": "1:10:35"} +{"current_steps": 1084, "total_steps": 2470, "loss": 1.3431, "lr": 6.897901855769483e-06, "epoch": 4.388663967611336, "percentage": 43.89, "elapsed_time": "0:55:10", "remaining_time": "1:10:32"} +{"current_steps": 1085, "total_steps": 2470, "loss": 1.5208, "lr": 6.891362688790925e-06, "epoch": 4.392712550607287, "percentage": 43.93, "elapsed_time": "0:55:13", "remaining_time": "1:10:29"} +{"current_steps": 1086, "total_steps": 2470, "loss": 1.3629, "lr": 6.884819744389848e-06, "epoch": 4.396761133603239, "percentage": 43.97, "elapsed_time": "0:55:16", "remaining_time": "1:10:26"} +{"current_steps": 1087, "total_steps": 2470, "loss": 1.3853, "lr": 6.878273035633795e-06, "epoch": 4.40080971659919, "percentage": 44.01, "elapsed_time": "0:55:19", "remaining_time": "1:10:23"} +{"current_steps": 1088, "total_steps": 2470, "loss": 1.4423, "lr": 6.871722575597829e-06, "epoch": 4.4048582995951415, "percentage": 44.05, "elapsed_time": "0:55:22", "remaining_time": "1:10:20"} +{"current_steps": 1089, "total_steps": 2470, "loss": 1.5468, "lr": 6.865168377364506e-06, "epoch": 4.4089068825910935, "percentage": 44.09, "elapsed_time": "0:55:25", "remaining_time": "1:10:17"} +{"current_steps": 1090, "total_steps": 2470, "loss": 1.36, "lr": 6.858610454023842e-06, "epoch": 4.412955465587045, "percentage": 44.13, "elapsed_time": "0:55:28", "remaining_time": "1:10:13"} +{"current_steps": 1091, "total_steps": 2470, "loss": 1.6917, "lr": 6.8520488186733e-06, "epoch": 4.417004048582996, "percentage": 44.17, "elapsed_time": "0:55:31", "remaining_time": "1:10:10"} +{"current_steps": 1092, "total_steps": 2470, "loss": 1.7526, "lr": 6.845483484417756e-06, "epoch": 4.421052631578947, "percentage": 44.21, "elapsed_time": "0:55:34", "remaining_time": "1:10:07"} +{"current_steps": 1093, "total_steps": 2470, "loss": 1.6487, "lr": 6.838914464369467e-06, "epoch": 4.425101214574899, "percentage": 44.25, "elapsed_time": "0:55:37", "remaining_time": "1:10:04"} +{"current_steps": 1094, "total_steps": 2470, "loss": 1.7096, "lr": 6.832341771648057e-06, "epoch": 4.42914979757085, "percentage": 44.29, "elapsed_time": "0:55:41", "remaining_time": "1:10:02"} +{"current_steps": 1095, "total_steps": 2470, "loss": 1.8456, "lr": 6.825765419380484e-06, "epoch": 
4.433198380566802, "percentage": 44.33, "elapsed_time": "0:55:44", "remaining_time": "1:09:59"} +{"current_steps": 1096, "total_steps": 2470, "loss": 1.6224, "lr": 6.819185420701011e-06, "epoch": 4.437246963562753, "percentage": 44.37, "elapsed_time": "0:55:47", "remaining_time": "1:09:56"} +{"current_steps": 1097, "total_steps": 2470, "loss": 1.5498, "lr": 6.812601788751192e-06, "epoch": 4.441295546558704, "percentage": 44.41, "elapsed_time": "0:55:50", "remaining_time": "1:09:53"} +{"current_steps": 1098, "total_steps": 2470, "loss": 1.8041, "lr": 6.806014536679828e-06, "epoch": 4.445344129554655, "percentage": 44.45, "elapsed_time": "0:55:53", "remaining_time": "1:09:50"} +{"current_steps": 1099, "total_steps": 2470, "loss": 1.5815, "lr": 6.7994236776429555e-06, "epoch": 4.449392712550607, "percentage": 44.49, "elapsed_time": "0:55:56", "remaining_time": "1:09:47"} +{"current_steps": 1100, "total_steps": 2470, "loss": 1.5695, "lr": 6.792829224803816e-06, "epoch": 4.4534412955465585, "percentage": 44.53, "elapsed_time": "0:55:59", "remaining_time": "1:09:44"} +{"current_steps": 1101, "total_steps": 2470, "loss": 1.9487, "lr": 6.7862311913328235e-06, "epoch": 4.4574898785425106, "percentage": 44.57, "elapsed_time": "0:56:03", "remaining_time": "1:09:41"} +{"current_steps": 1102, "total_steps": 2470, "loss": 2.1517, "lr": 6.779629590407547e-06, "epoch": 4.461538461538462, "percentage": 44.62, "elapsed_time": "0:56:06", "remaining_time": "1:09:38"} +{"current_steps": 1103, "total_steps": 2470, "loss": 1.79, "lr": 6.773024435212678e-06, "epoch": 4.465587044534413, "percentage": 44.66, "elapsed_time": "0:56:09", "remaining_time": "1:09:35"} +{"current_steps": 1104, "total_steps": 2470, "loss": 1.7651, "lr": 6.7664157389400095e-06, "epoch": 4.469635627530364, "percentage": 44.7, "elapsed_time": "0:56:12", "remaining_time": "1:09:32"} +{"current_steps": 1105, "total_steps": 2470, "loss": 1.6839, "lr": 6.7598035147884055e-06, "epoch": 4.473684210526316, "percentage": 44.74, "elapsed_time": "0:56:15", "remaining_time": "1:09:29"} +{"current_steps": 1106, "total_steps": 2470, "loss": 1.692, "lr": 6.753187775963773e-06, "epoch": 4.477732793522267, "percentage": 44.78, "elapsed_time": "0:56:18", "remaining_time": "1:09:26"} +{"current_steps": 1107, "total_steps": 2470, "loss": 1.6155, "lr": 6.746568535679041e-06, "epoch": 4.481781376518219, "percentage": 44.82, "elapsed_time": "0:56:21", "remaining_time": "1:09:23"} +{"current_steps": 1108, "total_steps": 2470, "loss": 1.5755, "lr": 6.739945807154136e-06, "epoch": 4.48582995951417, "percentage": 44.86, "elapsed_time": "0:56:24", "remaining_time": "1:09:20"} +{"current_steps": 1109, "total_steps": 2470, "loss": 1.5105, "lr": 6.733319603615941e-06, "epoch": 4.489878542510121, "percentage": 44.9, "elapsed_time": "0:56:27", "remaining_time": "1:09:17"} +{"current_steps": 1110, "total_steps": 2470, "loss": 1.568, "lr": 6.726689938298289e-06, "epoch": 4.493927125506072, "percentage": 44.94, "elapsed_time": "0:56:30", "remaining_time": "1:09:14"} +{"current_steps": 1111, "total_steps": 2470, "loss": 1.4162, "lr": 6.72005682444192e-06, "epoch": 4.497975708502024, "percentage": 44.98, "elapsed_time": "0:56:33", "remaining_time": "1:09:11"} +{"current_steps": 1112, "total_steps": 2470, "loss": 1.2872, "lr": 6.713420275294467e-06, "epoch": 4.502024291497976, "percentage": 45.02, "elapsed_time": "0:56:36", "remaining_time": "1:09:08"} +{"current_steps": 1113, "total_steps": 2470, "loss": 1.6404, "lr": 6.70678030411042e-06, "epoch": 4.506072874493928, 
"percentage": 45.06, "elapsed_time": "0:56:39", "remaining_time": "1:09:05"} +{"current_steps": 1114, "total_steps": 2470, "loss": 1.6321, "lr": 6.700136924151104e-06, "epoch": 4.510121457489879, "percentage": 45.1, "elapsed_time": "0:56:42", "remaining_time": "1:09:02"} +{"current_steps": 1115, "total_steps": 2470, "loss": 1.5906, "lr": 6.693490148684654e-06, "epoch": 4.51417004048583, "percentage": 45.14, "elapsed_time": "0:56:45", "remaining_time": "1:08:58"} +{"current_steps": 1116, "total_steps": 2470, "loss": 1.6148, "lr": 6.686839990985984e-06, "epoch": 4.518218623481781, "percentage": 45.18, "elapsed_time": "0:56:48", "remaining_time": "1:08:55"} +{"current_steps": 1117, "total_steps": 2470, "loss": 1.5678, "lr": 6.680186464336767e-06, "epoch": 4.522267206477733, "percentage": 45.22, "elapsed_time": "0:56:51", "remaining_time": "1:08:52"} +{"current_steps": 1118, "total_steps": 2470, "loss": 1.3788, "lr": 6.673529582025398e-06, "epoch": 4.526315789473684, "percentage": 45.26, "elapsed_time": "0:56:54", "remaining_time": "1:08:49"} +{"current_steps": 1119, "total_steps": 2470, "loss": 1.4428, "lr": 6.666869357346979e-06, "epoch": 4.530364372469636, "percentage": 45.3, "elapsed_time": "0:56:57", "remaining_time": "1:08:46"} +{"current_steps": 1120, "total_steps": 2470, "loss": 1.5671, "lr": 6.660205803603286e-06, "epoch": 4.534412955465587, "percentage": 45.34, "elapsed_time": "0:57:01", "remaining_time": "1:08:43"} +{"current_steps": 1121, "total_steps": 2470, "loss": 1.7903, "lr": 6.653538934102743e-06, "epoch": 4.538461538461538, "percentage": 45.38, "elapsed_time": "0:57:04", "remaining_time": "1:08:40"} +{"current_steps": 1122, "total_steps": 2470, "loss": 1.6907, "lr": 6.646868762160399e-06, "epoch": 4.5425101214574894, "percentage": 45.43, "elapsed_time": "0:57:07", "remaining_time": "1:08:37"} +{"current_steps": 1123, "total_steps": 2470, "loss": 2.0735, "lr": 6.640195301097896e-06, "epoch": 4.5465587044534415, "percentage": 45.47, "elapsed_time": "0:57:10", "remaining_time": "1:08:34"} +{"current_steps": 1124, "total_steps": 2470, "loss": 2.1046, "lr": 6.633518564243442e-06, "epoch": 4.550607287449393, "percentage": 45.51, "elapsed_time": "0:57:13", "remaining_time": "1:08:31"} +{"current_steps": 1125, "total_steps": 2470, "loss": 2.3423, "lr": 6.626838564931797e-06, "epoch": 4.554655870445345, "percentage": 45.55, "elapsed_time": "0:57:16", "remaining_time": "1:08:28"} +{"current_steps": 1126, "total_steps": 2470, "loss": 1.5771, "lr": 6.620155316504225e-06, "epoch": 4.558704453441296, "percentage": 45.59, "elapsed_time": "0:57:19", "remaining_time": "1:08:25"} +{"current_steps": 1127, "total_steps": 2470, "loss": 1.4544, "lr": 6.6134688323084884e-06, "epoch": 4.562753036437247, "percentage": 45.63, "elapsed_time": "0:57:22", "remaining_time": "1:08:22"} +{"current_steps": 1128, "total_steps": 2470, "loss": 1.7848, "lr": 6.606779125698808e-06, "epoch": 4.566801619433198, "percentage": 45.67, "elapsed_time": "0:57:25", "remaining_time": "1:08:19"} +{"current_steps": 1129, "total_steps": 2470, "loss": 1.4465, "lr": 6.600086210035841e-06, "epoch": 4.57085020242915, "percentage": 45.71, "elapsed_time": "0:57:28", "remaining_time": "1:08:15"} +{"current_steps": 1130, "total_steps": 2470, "loss": 1.7079, "lr": 6.593390098686653e-06, "epoch": 4.574898785425101, "percentage": 45.75, "elapsed_time": "0:57:31", "remaining_time": "1:08:12"} +{"current_steps": 1131, "total_steps": 2470, "loss": 1.4715, "lr": 6.586690805024692e-06, "epoch": 4.578947368421053, "percentage": 45.79, 
"elapsed_time": "0:57:34", "remaining_time": "1:08:09"} +{"current_steps": 1132, "total_steps": 2470, "loss": 1.6256, "lr": 6.579988342429764e-06, "epoch": 4.582995951417004, "percentage": 45.83, "elapsed_time": "0:57:37", "remaining_time": "1:08:06"} +{"current_steps": 1133, "total_steps": 2470, "loss": 1.6067, "lr": 6.573282724288001e-06, "epoch": 4.587044534412955, "percentage": 45.87, "elapsed_time": "0:57:40", "remaining_time": "1:08:03"} +{"current_steps": 1134, "total_steps": 2470, "loss": 1.5832, "lr": 6.566573963991839e-06, "epoch": 4.5910931174089065, "percentage": 45.91, "elapsed_time": "0:57:43", "remaining_time": "1:08:00"} +{"current_steps": 1135, "total_steps": 2470, "loss": 1.3233, "lr": 6.559862074939989e-06, "epoch": 4.5951417004048585, "percentage": 45.95, "elapsed_time": "0:57:46", "remaining_time": "1:07:57"} +{"current_steps": 1136, "total_steps": 2470, "loss": 1.3674, "lr": 6.553147070537413e-06, "epoch": 4.59919028340081, "percentage": 45.99, "elapsed_time": "0:57:49", "remaining_time": "1:07:54"} +{"current_steps": 1137, "total_steps": 2470, "loss": 1.4813, "lr": 6.546428964195289e-06, "epoch": 4.603238866396762, "percentage": 46.03, "elapsed_time": "0:57:52", "remaining_time": "1:07:51"} +{"current_steps": 1138, "total_steps": 2470, "loss": 1.3335, "lr": 6.539707769330995e-06, "epoch": 4.607287449392713, "percentage": 46.07, "elapsed_time": "0:57:55", "remaining_time": "1:07:48"} +{"current_steps": 1139, "total_steps": 2470, "loss": 1.631, "lr": 6.532983499368078e-06, "epoch": 4.611336032388664, "percentage": 46.11, "elapsed_time": "0:57:58", "remaining_time": "1:07:45"} +{"current_steps": 1140, "total_steps": 2470, "loss": 1.6247, "lr": 6.526256167736224e-06, "epoch": 4.615384615384615, "percentage": 46.15, "elapsed_time": "0:58:01", "remaining_time": "1:07:42"} +{"current_steps": 1141, "total_steps": 2470, "loss": 1.365, "lr": 6.519525787871235e-06, "epoch": 4.619433198380567, "percentage": 46.19, "elapsed_time": "0:58:04", "remaining_time": "1:07:39"} +{"current_steps": 1142, "total_steps": 2470, "loss": 1.7573, "lr": 6.512792373215e-06, "epoch": 4.623481781376518, "percentage": 46.23, "elapsed_time": "0:58:08", "remaining_time": "1:07:36"} +{"current_steps": 1143, "total_steps": 2470, "loss": 1.561, "lr": 6.506055937215471e-06, "epoch": 4.62753036437247, "percentage": 46.28, "elapsed_time": "0:58:11", "remaining_time": "1:07:33"} +{"current_steps": 1144, "total_steps": 2470, "loss": 1.836, "lr": 6.499316493326631e-06, "epoch": 4.631578947368421, "percentage": 46.32, "elapsed_time": "0:58:14", "remaining_time": "1:07:30"} +{"current_steps": 1145, "total_steps": 2470, "loss": 1.4458, "lr": 6.492574055008474e-06, "epoch": 4.635627530364372, "percentage": 46.36, "elapsed_time": "0:58:17", "remaining_time": "1:07:27"} +{"current_steps": 1146, "total_steps": 2470, "loss": 1.6806, "lr": 6.4858286357269716e-06, "epoch": 4.6396761133603235, "percentage": 46.4, "elapsed_time": "0:58:20", "remaining_time": "1:07:24"} +{"current_steps": 1147, "total_steps": 2470, "loss": 1.5849, "lr": 6.4790802489540495e-06, "epoch": 4.6437246963562755, "percentage": 46.44, "elapsed_time": "0:58:23", "remaining_time": "1:07:21"} +{"current_steps": 1148, "total_steps": 2470, "loss": 1.6598, "lr": 6.472328908167562e-06, "epoch": 4.647773279352227, "percentage": 46.48, "elapsed_time": "0:58:26", "remaining_time": "1:07:18"} +{"current_steps": 1149, "total_steps": 2470, "loss": 1.4666, "lr": 6.465574626851262e-06, "epoch": 4.651821862348179, "percentage": 46.52, "elapsed_time": "0:58:30", 
"remaining_time": "1:07:15"} +{"current_steps": 1150, "total_steps": 2470, "loss": 1.6918, "lr": 6.4588174184947725e-06, "epoch": 4.65587044534413, "percentage": 46.56, "elapsed_time": "0:58:33", "remaining_time": "1:07:12"} +{"current_steps": 1151, "total_steps": 2470, "loss": 1.5207, "lr": 6.452057296593568e-06, "epoch": 4.659919028340081, "percentage": 46.6, "elapsed_time": "0:58:36", "remaining_time": "1:07:09"} +{"current_steps": 1152, "total_steps": 2470, "loss": 1.6745, "lr": 6.445294274648937e-06, "epoch": 4.663967611336032, "percentage": 46.64, "elapsed_time": "0:58:39", "remaining_time": "1:07:06"} +{"current_steps": 1153, "total_steps": 2470, "loss": 1.6752, "lr": 6.4385283661679624e-06, "epoch": 4.668016194331984, "percentage": 46.68, "elapsed_time": "0:58:42", "remaining_time": "1:07:03"} +{"current_steps": 1154, "total_steps": 2470, "loss": 1.753, "lr": 6.431759584663492e-06, "epoch": 4.672064777327935, "percentage": 46.72, "elapsed_time": "0:58:45", "remaining_time": "1:07:00"} +{"current_steps": 1155, "total_steps": 2470, "loss": 1.6195, "lr": 6.424987943654109e-06, "epoch": 4.676113360323887, "percentage": 46.76, "elapsed_time": "0:58:48", "remaining_time": "1:06:57"} +{"current_steps": 1156, "total_steps": 2470, "loss": 1.6311, "lr": 6.418213456664111e-06, "epoch": 4.680161943319838, "percentage": 46.8, "elapsed_time": "0:58:51", "remaining_time": "1:06:54"} +{"current_steps": 1157, "total_steps": 2470, "loss": 1.4584, "lr": 6.411436137223479e-06, "epoch": 4.684210526315789, "percentage": 46.84, "elapsed_time": "0:58:54", "remaining_time": "1:06:51"} +{"current_steps": 1158, "total_steps": 2470, "loss": 1.6084, "lr": 6.4046559988678485e-06, "epoch": 4.6882591093117405, "percentage": 46.88, "elapsed_time": "0:58:57", "remaining_time": "1:06:48"} +{"current_steps": 1159, "total_steps": 2470, "loss": 1.6274, "lr": 6.397873055138487e-06, "epoch": 4.6923076923076925, "percentage": 46.92, "elapsed_time": "0:59:00", "remaining_time": "1:06:45"} +{"current_steps": 1160, "total_steps": 2470, "loss": 1.4697, "lr": 6.391087319582264e-06, "epoch": 4.696356275303644, "percentage": 46.96, "elapsed_time": "0:59:03", "remaining_time": "1:06:42"} +{"current_steps": 1161, "total_steps": 2470, "loss": 1.9489, "lr": 6.384298805751626e-06, "epoch": 4.700404858299595, "percentage": 47.0, "elapsed_time": "0:59:06", "remaining_time": "1:06:39"} +{"current_steps": 1162, "total_steps": 2470, "loss": 1.727, "lr": 6.37750752720457e-06, "epoch": 4.704453441295547, "percentage": 47.04, "elapsed_time": "0:59:09", "remaining_time": "1:06:36"} +{"current_steps": 1163, "total_steps": 2470, "loss": 1.3178, "lr": 6.370713497504607e-06, "epoch": 4.708502024291498, "percentage": 47.09, "elapsed_time": "0:59:12", "remaining_time": "1:06:32"} +{"current_steps": 1164, "total_steps": 2470, "loss": 1.4908, "lr": 6.363916730220752e-06, "epoch": 4.712550607287449, "percentage": 47.13, "elapsed_time": "0:59:16", "remaining_time": "1:06:29"} +{"current_steps": 1165, "total_steps": 2470, "loss": 1.588, "lr": 6.357117238927481e-06, "epoch": 4.716599190283401, "percentage": 47.17, "elapsed_time": "0:59:19", "remaining_time": "1:06:26"} +{"current_steps": 1166, "total_steps": 2470, "loss": 1.3794, "lr": 6.350315037204714e-06, "epoch": 4.720647773279352, "percentage": 47.21, "elapsed_time": "0:59:22", "remaining_time": "1:06:23"} +{"current_steps": 1167, "total_steps": 2470, "loss": 1.535, "lr": 6.343510138637783e-06, "epoch": 4.724696356275303, "percentage": 47.25, "elapsed_time": "0:59:25", "remaining_time": "1:06:20"} 
+{"current_steps": 1168, "total_steps": 2470, "loss": 1.7416, "lr": 6.336702556817405e-06, "epoch": 4.728744939271255, "percentage": 47.29, "elapsed_time": "0:59:28", "remaining_time": "1:06:17"} +{"current_steps": 1169, "total_steps": 2470, "loss": 1.521, "lr": 6.329892305339659e-06, "epoch": 4.732793522267206, "percentage": 47.33, "elapsed_time": "0:59:31", "remaining_time": "1:06:14"} +{"current_steps": 1170, "total_steps": 2470, "loss": 1.6001, "lr": 6.323079397805951e-06, "epoch": 4.7368421052631575, "percentage": 47.37, "elapsed_time": "0:59:34", "remaining_time": "1:06:11"} +{"current_steps": 1171, "total_steps": 2470, "loss": 2.244, "lr": 6.3162638478229965e-06, "epoch": 4.7408906882591095, "percentage": 47.41, "elapsed_time": "0:59:37", "remaining_time": "1:06:08"} +{"current_steps": 1172, "total_steps": 2470, "loss": 1.6859, "lr": 6.309445669002787e-06, "epoch": 4.744939271255061, "percentage": 47.45, "elapsed_time": "0:59:40", "remaining_time": "1:06:05"} +{"current_steps": 1173, "total_steps": 2470, "loss": 1.5138, "lr": 6.302624874962563e-06, "epoch": 4.748987854251012, "percentage": 47.49, "elapsed_time": "0:59:43", "remaining_time": "1:06:02"} +{"current_steps": 1174, "total_steps": 2470, "loss": 1.4048, "lr": 6.295801479324788e-06, "epoch": 4.753036437246964, "percentage": 47.53, "elapsed_time": "0:59:46", "remaining_time": "1:05:59"} +{"current_steps": 1175, "total_steps": 2470, "loss": 1.5932, "lr": 6.288975495717124e-06, "epoch": 4.757085020242915, "percentage": 47.57, "elapsed_time": "0:59:49", "remaining_time": "1:05:56"} +{"current_steps": 1176, "total_steps": 2470, "loss": 2.3515, "lr": 6.282146937772399e-06, "epoch": 4.761133603238866, "percentage": 47.61, "elapsed_time": "0:59:52", "remaining_time": "1:05:52"} +{"current_steps": 1177, "total_steps": 2470, "loss": 2.1322, "lr": 6.2753158191285844e-06, "epoch": 4.765182186234818, "percentage": 47.65, "elapsed_time": "0:59:55", "remaining_time": "1:05:49"} +{"current_steps": 1178, "total_steps": 2470, "loss": 2.0072, "lr": 6.268482153428763e-06, "epoch": 4.769230769230769, "percentage": 47.69, "elapsed_time": "0:59:58", "remaining_time": "1:05:46"} +{"current_steps": 1179, "total_steps": 2470, "loss": 1.5127, "lr": 6.261645954321109e-06, "epoch": 4.77327935222672, "percentage": 47.73, "elapsed_time": "1:00:01", "remaining_time": "1:05:43"} +{"current_steps": 1180, "total_steps": 2470, "loss": 1.7728, "lr": 6.254807235458853e-06, "epoch": 4.777327935222672, "percentage": 47.77, "elapsed_time": "1:00:04", "remaining_time": "1:05:40"} +{"current_steps": 1181, "total_steps": 2470, "loss": 1.78, "lr": 6.247966010500258e-06, "epoch": 4.781376518218623, "percentage": 47.81, "elapsed_time": "1:00:07", "remaining_time": "1:05:37"} +{"current_steps": 1182, "total_steps": 2470, "loss": 1.6101, "lr": 6.241122293108594e-06, "epoch": 4.7854251012145745, "percentage": 47.85, "elapsed_time": "1:00:10", "remaining_time": "1:05:34"} +{"current_steps": 1183, "total_steps": 2470, "loss": 1.5326, "lr": 6.2342760969521085e-06, "epoch": 4.7894736842105265, "percentage": 47.89, "elapsed_time": "1:00:13", "remaining_time": "1:05:31"} +{"current_steps": 1184, "total_steps": 2470, "loss": 1.5671, "lr": 6.227427435703997e-06, "epoch": 4.793522267206478, "percentage": 47.94, "elapsed_time": "1:00:16", "remaining_time": "1:05:28"} +{"current_steps": 1185, "total_steps": 2470, "loss": 1.5746, "lr": 6.220576323042381e-06, "epoch": 4.797570850202429, "percentage": 47.98, "elapsed_time": "1:00:19", "remaining_time": "1:05:25"} +{"current_steps": 1186, 
"total_steps": 2470, "loss": 1.4246, "lr": 6.213722772650277e-06, "epoch": 4.801619433198381, "percentage": 48.02, "elapsed_time": "1:00:22", "remaining_time": "1:05:22"} +{"current_steps": 1187, "total_steps": 2470, "loss": 1.317, "lr": 6.206866798215571e-06, "epoch": 4.805668016194332, "percentage": 48.06, "elapsed_time": "1:00:25", "remaining_time": "1:05:19"} +{"current_steps": 1188, "total_steps": 2470, "loss": 1.6821, "lr": 6.2000084134309905e-06, "epoch": 4.809716599190283, "percentage": 48.1, "elapsed_time": "1:00:29", "remaining_time": "1:05:16"} +{"current_steps": 1189, "total_steps": 2470, "loss": 1.5786, "lr": 6.193147631994073e-06, "epoch": 4.813765182186235, "percentage": 48.14, "elapsed_time": "1:00:32", "remaining_time": "1:05:13"} +{"current_steps": 1190, "total_steps": 2470, "loss": 1.3971, "lr": 6.186284467607149e-06, "epoch": 4.817813765182186, "percentage": 48.18, "elapsed_time": "1:00:35", "remaining_time": "1:05:10"} +{"current_steps": 1191, "total_steps": 2470, "loss": 2.3347, "lr": 6.179418933977301e-06, "epoch": 4.821862348178137, "percentage": 48.22, "elapsed_time": "1:00:38", "remaining_time": "1:05:07"} +{"current_steps": 1192, "total_steps": 2470, "loss": 3.6222, "lr": 6.1725510448163516e-06, "epoch": 4.825910931174089, "percentage": 48.26, "elapsed_time": "1:00:41", "remaining_time": "1:05:04"} +{"current_steps": 1193, "total_steps": 2470, "loss": 1.4645, "lr": 6.165680813840822e-06, "epoch": 4.82995951417004, "percentage": 48.3, "elapsed_time": "1:00:44", "remaining_time": "1:05:01"} +{"current_steps": 1194, "total_steps": 2470, "loss": 1.3391, "lr": 6.1588082547719095e-06, "epoch": 4.834008097165992, "percentage": 48.34, "elapsed_time": "1:00:47", "remaining_time": "1:04:58"} +{"current_steps": 1195, "total_steps": 2470, "loss": 1.4313, "lr": 6.151933381335468e-06, "epoch": 4.838056680161944, "percentage": 48.38, "elapsed_time": "1:00:50", "remaining_time": "1:04:55"} +{"current_steps": 1196, "total_steps": 2470, "loss": 1.4611, "lr": 6.1450562072619635e-06, "epoch": 4.842105263157895, "percentage": 48.42, "elapsed_time": "1:00:54", "remaining_time": "1:04:52"} +{"current_steps": 1197, "total_steps": 2470, "loss": 1.3333, "lr": 6.138176746286468e-06, "epoch": 4.846153846153846, "percentage": 48.46, "elapsed_time": "1:00:57", "remaining_time": "1:04:49"} +{"current_steps": 1198, "total_steps": 2470, "loss": 1.4833, "lr": 6.131295012148613e-06, "epoch": 4.850202429149798, "percentage": 48.5, "elapsed_time": "1:01:00", "remaining_time": "1:04:46"} +{"current_steps": 1199, "total_steps": 2470, "loss": 1.5521, "lr": 6.124411018592568e-06, "epoch": 4.854251012145749, "percentage": 48.54, "elapsed_time": "1:01:03", "remaining_time": "1:04:43"} +{"current_steps": 1200, "total_steps": 2470, "loss": 1.4743, "lr": 6.117524779367027e-06, "epoch": 4.8582995951417, "percentage": 48.58, "elapsed_time": "1:01:06", "remaining_time": "1:04:40"} +{"current_steps": 1201, "total_steps": 2470, "loss": 1.4612, "lr": 6.110636308225157e-06, "epoch": 4.862348178137652, "percentage": 48.62, "elapsed_time": "1:01:09", "remaining_time": "1:04:37"} +{"current_steps": 1202, "total_steps": 2470, "loss": 1.4922, "lr": 6.103745618924587e-06, "epoch": 4.866396761133603, "percentage": 48.66, "elapsed_time": "1:01:12", "remaining_time": "1:04:34"} +{"current_steps": 1203, "total_steps": 2470, "loss": 1.9715, "lr": 6.096852725227378e-06, "epoch": 4.870445344129554, "percentage": 48.7, "elapsed_time": "1:01:15", "remaining_time": "1:04:31"} +{"current_steps": 1204, "total_steps": 2470, "loss": 
1.9107, "lr": 6.089957640899988e-06, "epoch": 4.874493927125506, "percentage": 48.74, "elapsed_time": "1:01:18", "remaining_time": "1:04:28"} +{"current_steps": 1205, "total_steps": 2470, "loss": 1.661, "lr": 6.0830603797132574e-06, "epoch": 4.8785425101214575, "percentage": 48.79, "elapsed_time": "1:01:21", "remaining_time": "1:04:25"} +{"current_steps": 1206, "total_steps": 2470, "loss": 1.5689, "lr": 6.076160955442369e-06, "epoch": 4.882591093117409, "percentage": 48.83, "elapsed_time": "1:01:24", "remaining_time": "1:04:22"} +{"current_steps": 1207, "total_steps": 2470, "loss": 1.1468, "lr": 6.069259381866827e-06, "epoch": 4.886639676113361, "percentage": 48.87, "elapsed_time": "1:01:28", "remaining_time": "1:04:19"} +{"current_steps": 1208, "total_steps": 2470, "loss": 1.6516, "lr": 6.0623556727704306e-06, "epoch": 4.890688259109312, "percentage": 48.91, "elapsed_time": "1:01:31", "remaining_time": "1:04:16"} +{"current_steps": 1209, "total_steps": 2470, "loss": 1.7215, "lr": 6.055449841941238e-06, "epoch": 4.894736842105263, "percentage": 48.95, "elapsed_time": "1:01:34", "remaining_time": "1:04:13"} +{"current_steps": 1210, "total_steps": 2470, "loss": 1.4413, "lr": 6.048541903171552e-06, "epoch": 4.898785425101215, "percentage": 48.99, "elapsed_time": "1:01:37", "remaining_time": "1:04:09"} +{"current_steps": 1211, "total_steps": 2470, "loss": 1.4725, "lr": 6.041631870257882e-06, "epoch": 4.902834008097166, "percentage": 49.03, "elapsed_time": "1:01:40", "remaining_time": "1:04:06"} +{"current_steps": 1212, "total_steps": 2470, "loss": 1.6069, "lr": 6.034719757000918e-06, "epoch": 4.906882591093117, "percentage": 49.07, "elapsed_time": "1:01:43", "remaining_time": "1:04:03"} +{"current_steps": 1213, "total_steps": 2470, "loss": 1.2312, "lr": 6.0278055772055075e-06, "epoch": 4.910931174089069, "percentage": 49.11, "elapsed_time": "1:01:46", "remaining_time": "1:04:00"} +{"current_steps": 1214, "total_steps": 2470, "loss": 1.3252, "lr": 6.020889344680627e-06, "epoch": 4.91497975708502, "percentage": 49.15, "elapsed_time": "1:01:49", "remaining_time": "1:03:57"} +{"current_steps": 1215, "total_steps": 2470, "loss": 1.3404, "lr": 6.013971073239346e-06, "epoch": 4.919028340080971, "percentage": 49.19, "elapsed_time": "1:01:52", "remaining_time": "1:03:54"} +{"current_steps": 1216, "total_steps": 2470, "loss": 1.6668, "lr": 6.007050776698816e-06, "epoch": 4.923076923076923, "percentage": 49.23, "elapsed_time": "1:01:55", "remaining_time": "1:03:51"} +{"current_steps": 1217, "total_steps": 2470, "loss": 1.5178, "lr": 6.000128468880223e-06, "epoch": 4.9271255060728745, "percentage": 49.27, "elapsed_time": "1:01:58", "remaining_time": "1:03:48"} +{"current_steps": 1218, "total_steps": 2470, "loss": 1.5313, "lr": 5.993204163608776e-06, "epoch": 4.931174089068826, "percentage": 49.31, "elapsed_time": "1:02:01", "remaining_time": "1:03:45"} +{"current_steps": 1219, "total_steps": 2470, "loss": 1.315, "lr": 5.986277874713672e-06, "epoch": 4.935222672064778, "percentage": 49.35, "elapsed_time": "1:02:04", "remaining_time": "1:03:42"} +{"current_steps": 1220, "total_steps": 2470, "loss": 1.2599, "lr": 5.979349616028067e-06, "epoch": 4.939271255060729, "percentage": 49.39, "elapsed_time": "1:02:07", "remaining_time": "1:03:39"} +{"current_steps": 1221, "total_steps": 2470, "loss": 1.5671, "lr": 5.972419401389058e-06, "epoch": 4.94331983805668, "percentage": 49.43, "elapsed_time": "1:02:10", "remaining_time": "1:03:36"} +{"current_steps": 1222, "total_steps": 2470, "loss": 1.3098, "lr": 
5.96548724463764e-06, "epoch": 4.947368421052632, "percentage": 49.47, "elapsed_time": "1:02:13", "remaining_time": "1:03:33"} +{"current_steps": 1223, "total_steps": 2470, "loss": 1.2627, "lr": 5.958553159618693e-06, "epoch": 4.951417004048583, "percentage": 49.51, "elapsed_time": "1:02:16", "remaining_time": "1:03:30"} +{"current_steps": 1224, "total_steps": 2470, "loss": 1.4866, "lr": 5.951617160180944e-06, "epoch": 4.955465587044534, "percentage": 49.55, "elapsed_time": "1:02:19", "remaining_time": "1:03:27"} +{"current_steps": 1225, "total_steps": 2470, "loss": 1.5416, "lr": 5.944679260176947e-06, "epoch": 4.959514170040486, "percentage": 49.6, "elapsed_time": "1:02:22", "remaining_time": "1:03:24"} +{"current_steps": 1226, "total_steps": 2470, "loss": 1.5505, "lr": 5.937739473463047e-06, "epoch": 4.963562753036437, "percentage": 49.64, "elapsed_time": "1:02:26", "remaining_time": "1:03:21"} +{"current_steps": 1227, "total_steps": 2470, "loss": 1.6869, "lr": 5.930797813899364e-06, "epoch": 4.967611336032388, "percentage": 49.68, "elapsed_time": "1:02:29", "remaining_time": "1:03:17"} +{"current_steps": 1228, "total_steps": 2470, "loss": 1.5989, "lr": 5.923854295349751e-06, "epoch": 4.97165991902834, "percentage": 49.72, "elapsed_time": "1:02:32", "remaining_time": "1:03:14"} +{"current_steps": 1229, "total_steps": 2470, "loss": 1.5245, "lr": 5.916908931681781e-06, "epoch": 4.9757085020242915, "percentage": 49.76, "elapsed_time": "1:02:35", "remaining_time": "1:03:11"} +{"current_steps": 1230, "total_steps": 2470, "loss": 1.6063, "lr": 5.9099617367667065e-06, "epoch": 4.979757085020243, "percentage": 49.8, "elapsed_time": "1:02:38", "remaining_time": "1:03:08"} +{"current_steps": 1231, "total_steps": 2470, "loss": 1.6715, "lr": 5.9030127244794385e-06, "epoch": 4.983805668016195, "percentage": 49.84, "elapsed_time": "1:02:41", "remaining_time": "1:03:05"} +{"current_steps": 1232, "total_steps": 2470, "loss": 1.4666, "lr": 5.896061908698521e-06, "epoch": 4.987854251012146, "percentage": 49.88, "elapsed_time": "1:02:44", "remaining_time": "1:03:02"} +{"current_steps": 1233, "total_steps": 2470, "loss": 1.4425, "lr": 5.8891093033060945e-06, "epoch": 4.991902834008097, "percentage": 49.92, "elapsed_time": "1:02:47", "remaining_time": "1:02:59"} +{"current_steps": 1234, "total_steps": 2470, "loss": 1.7597, "lr": 5.8821549221878795e-06, "epoch": 4.995951417004049, "percentage": 49.96, "elapsed_time": "1:02:50", "remaining_time": "1:02:56"} +{"current_steps": 1235, "total_steps": 2470, "loss": 1.4922, "lr": 5.8751987792331365e-06, "epoch": 5.0, "percentage": 50.0, "elapsed_time": "1:02:53", "remaining_time": "1:02:53"} +{"current_steps": 1236, "total_steps": 2470, "loss": 1.5315, "lr": 5.8682408883346535e-06, "epoch": 5.004048582995951, "percentage": 50.04, "elapsed_time": "1:02:56", "remaining_time": "1:02:50"} +{"current_steps": 1237, "total_steps": 2470, "loss": 1.6767, "lr": 5.861281263388699e-06, "epoch": 5.008097165991903, "percentage": 50.08, "elapsed_time": "1:02:59", "remaining_time": "1:02:47"} +{"current_steps": 1238, "total_steps": 2470, "loss": 1.5156, "lr": 5.854319918295012e-06, "epoch": 5.012145748987854, "percentage": 50.12, "elapsed_time": "1:03:03", "remaining_time": "1:02:44"} +{"current_steps": 1239, "total_steps": 2470, "loss": 1.7157, "lr": 5.8473568669567645e-06, "epoch": 5.016194331983805, "percentage": 50.16, "elapsed_time": "1:03:06", "remaining_time": "1:02:41"} +{"current_steps": 1240, "total_steps": 2470, "loss": 1.9457, "lr": 5.84039212328054e-06, "epoch": 
5.020242914979757, "percentage": 50.2, "elapsed_time": "1:03:09", "remaining_time": "1:02:38"} +{"current_steps": 1241, "total_steps": 2470, "loss": 1.8054, "lr": 5.833425701176294e-06, "epoch": 5.0242914979757085, "percentage": 50.24, "elapsed_time": "1:03:12", "remaining_time": "1:02:35"} +{"current_steps": 1242, "total_steps": 2470, "loss": 1.4846, "lr": 5.826457614557342e-06, "epoch": 5.02834008097166, "percentage": 50.28, "elapsed_time": "1:03:15", "remaining_time": "1:02:32"} +{"current_steps": 1243, "total_steps": 2470, "loss": 1.4864, "lr": 5.819487877340318e-06, "epoch": 5.032388663967612, "percentage": 50.32, "elapsed_time": "1:03:18", "remaining_time": "1:02:29"} +{"current_steps": 1244, "total_steps": 2470, "loss": 1.7235, "lr": 5.812516503445158e-06, "epoch": 5.036437246963563, "percentage": 50.36, "elapsed_time": "1:03:21", "remaining_time": "1:02:26"} +{"current_steps": 1245, "total_steps": 2470, "loss": 1.517, "lr": 5.805543506795063e-06, "epoch": 5.040485829959514, "percentage": 50.4, "elapsed_time": "1:03:24", "remaining_time": "1:02:23"} +{"current_steps": 1246, "total_steps": 2470, "loss": 1.5768, "lr": 5.798568901316475e-06, "epoch": 5.044534412955466, "percentage": 50.45, "elapsed_time": "1:03:27", "remaining_time": "1:02:20"} +{"current_steps": 1247, "total_steps": 2470, "loss": 1.5018, "lr": 5.79159270093905e-06, "epoch": 5.048582995951417, "percentage": 50.49, "elapsed_time": "1:03:30", "remaining_time": "1:02:17"} +{"current_steps": 1248, "total_steps": 2470, "loss": 1.5785, "lr": 5.784614919595631e-06, "epoch": 5.052631578947368, "percentage": 50.53, "elapsed_time": "1:03:34", "remaining_time": "1:02:14"} +{"current_steps": 1249, "total_steps": 2470, "loss": 1.4217, "lr": 5.7776355712222165e-06, "epoch": 5.05668016194332, "percentage": 50.57, "elapsed_time": "1:03:37", "remaining_time": "1:02:11"} +{"current_steps": 1250, "total_steps": 2470, "loss": 1.5864, "lr": 5.770654669757935e-06, "epoch": 5.060728744939271, "percentage": 50.61, "elapsed_time": "1:03:40", "remaining_time": "1:02:08"} +{"current_steps": 1251, "total_steps": 2470, "loss": 1.5406, "lr": 5.763672229145015e-06, "epoch": 5.064777327935222, "percentage": 50.65, "elapsed_time": "1:03:43", "remaining_time": "1:02:05"} +{"current_steps": 1252, "total_steps": 2470, "loss": 1.6808, "lr": 5.756688263328762e-06, "epoch": 5.068825910931174, "percentage": 50.69, "elapsed_time": "1:03:46", "remaining_time": "1:02:02"} +{"current_steps": 1253, "total_steps": 2470, "loss": 1.6199, "lr": 5.749702786257529e-06, "epoch": 5.0728744939271255, "percentage": 50.73, "elapsed_time": "1:03:49", "remaining_time": "1:01:59"} +{"current_steps": 1254, "total_steps": 2470, "loss": 1.5554, "lr": 5.742715811882682e-06, "epoch": 5.076923076923077, "percentage": 50.77, "elapsed_time": "1:03:52", "remaining_time": "1:01:56"} +{"current_steps": 1255, "total_steps": 2470, "loss": 1.5965, "lr": 5.735727354158581e-06, "epoch": 5.080971659919029, "percentage": 50.81, "elapsed_time": "1:03:55", "remaining_time": "1:01:53"} +{"current_steps": 1256, "total_steps": 2470, "loss": 1.5955, "lr": 5.7287374270425475e-06, "epoch": 5.08502024291498, "percentage": 50.85, "elapsed_time": "1:03:58", "remaining_time": "1:01:49"} +{"current_steps": 1257, "total_steps": 2470, "loss": 1.5594, "lr": 5.721746044494838e-06, "epoch": 5.089068825910931, "percentage": 50.89, "elapsed_time": "1:04:01", "remaining_time": "1:01:46"} +{"current_steps": 1258, "total_steps": 2470, "loss": 1.6374, "lr": 5.714753220478616e-06, "epoch": 5.093117408906883, 
"percentage": 50.93, "elapsed_time": "1:04:04", "remaining_time": "1:01:43"} +{"current_steps": 1259, "total_steps": 2470, "loss": 1.3947, "lr": 5.707758968959923e-06, "epoch": 5.097165991902834, "percentage": 50.97, "elapsed_time": "1:04:07", "remaining_time": "1:01:40"} +{"current_steps": 1260, "total_steps": 2470, "loss": 1.5641, "lr": 5.7007633039076535e-06, "epoch": 5.101214574898785, "percentage": 51.01, "elapsed_time": "1:04:10", "remaining_time": "1:01:37"} +{"current_steps": 1261, "total_steps": 2470, "loss": 1.5403, "lr": 5.693766239293522e-06, "epoch": 5.105263157894737, "percentage": 51.05, "elapsed_time": "1:04:13", "remaining_time": "1:01:34"} +{"current_steps": 1262, "total_steps": 2470, "loss": 1.4899, "lr": 5.686767789092041e-06, "epoch": 5.109311740890688, "percentage": 51.09, "elapsed_time": "1:04:16", "remaining_time": "1:01:31"} +{"current_steps": 1263, "total_steps": 2470, "loss": 1.4415, "lr": 5.67976796728049e-06, "epoch": 5.113360323886639, "percentage": 51.13, "elapsed_time": "1:04:19", "remaining_time": "1:01:28"} +{"current_steps": 1264, "total_steps": 2470, "loss": 1.349, "lr": 5.672766787838884e-06, "epoch": 5.117408906882591, "percentage": 51.17, "elapsed_time": "1:04:22", "remaining_time": "1:01:25"} +{"current_steps": 1265, "total_steps": 2470, "loss": 1.4005, "lr": 5.6657642647499545e-06, "epoch": 5.1214574898785425, "percentage": 51.21, "elapsed_time": "1:04:25", "remaining_time": "1:01:22"} +{"current_steps": 1266, "total_steps": 2470, "loss": 1.4047, "lr": 5.658760411999115e-06, "epoch": 5.125506072874494, "percentage": 51.26, "elapsed_time": "1:04:28", "remaining_time": "1:01:19"} +{"current_steps": 1267, "total_steps": 2470, "loss": 1.3041, "lr": 5.6517552435744325e-06, "epoch": 5.129554655870446, "percentage": 51.3, "elapsed_time": "1:04:31", "remaining_time": "1:01:16"} +{"current_steps": 1268, "total_steps": 2470, "loss": 1.6559, "lr": 5.644748773466606e-06, "epoch": 5.133603238866397, "percentage": 51.34, "elapsed_time": "1:04:34", "remaining_time": "1:01:13"} +{"current_steps": 1269, "total_steps": 2470, "loss": 1.4822, "lr": 5.637741015668929e-06, "epoch": 5.137651821862348, "percentage": 51.38, "elapsed_time": "1:04:37", "remaining_time": "1:01:10"} +{"current_steps": 1270, "total_steps": 2470, "loss": 1.2246, "lr": 5.630731984177269e-06, "epoch": 5.1417004048583, "percentage": 51.42, "elapsed_time": "1:04:40", "remaining_time": "1:01:06"} +{"current_steps": 1271, "total_steps": 2470, "loss": 1.5924, "lr": 5.62372169299004e-06, "epoch": 5.145748987854251, "percentage": 51.46, "elapsed_time": "1:04:43", "remaining_time": "1:01:03"} +{"current_steps": 1272, "total_steps": 2470, "loss": 1.4133, "lr": 5.616710156108167e-06, "epoch": 5.149797570850202, "percentage": 51.5, "elapsed_time": "1:04:46", "remaining_time": "1:01:00"} +{"current_steps": 1273, "total_steps": 2470, "loss": 1.621, "lr": 5.609697387535068e-06, "epoch": 5.153846153846154, "percentage": 51.54, "elapsed_time": "1:04:49", "remaining_time": "1:00:57"} +{"current_steps": 1274, "total_steps": 2470, "loss": 1.7158, "lr": 5.6026834012766155e-06, "epoch": 5.157894736842105, "percentage": 51.58, "elapsed_time": "1:04:53", "remaining_time": "1:00:54"} +{"current_steps": 1275, "total_steps": 2470, "loss": 1.4746, "lr": 5.5956682113411184e-06, "epoch": 5.161943319838056, "percentage": 51.62, "elapsed_time": "1:04:56", "remaining_time": "1:00:51"} +{"current_steps": 1276, "total_steps": 2470, "loss": 1.5543, "lr": 5.588651831739289e-06, "epoch": 5.165991902834008, "percentage": 51.66, 
"elapsed_time": "1:04:59", "remaining_time": "1:00:48"} +{"current_steps": 1277, "total_steps": 2470, "loss": 2.074, "lr": 5.581634276484211e-06, "epoch": 5.17004048582996, "percentage": 51.7, "elapsed_time": "1:05:02", "remaining_time": "1:00:45"} +{"current_steps": 1278, "total_steps": 2470, "loss": 1.3906, "lr": 5.574615559591323e-06, "epoch": 5.174089068825911, "percentage": 51.74, "elapsed_time": "1:05:05", "remaining_time": "1:00:42"} +{"current_steps": 1279, "total_steps": 2470, "loss": 1.5738, "lr": 5.567595695078379e-06, "epoch": 5.178137651821863, "percentage": 51.78, "elapsed_time": "1:05:08", "remaining_time": "1:00:39"} +{"current_steps": 1280, "total_steps": 2470, "loss": 1.3815, "lr": 5.560574696965425e-06, "epoch": 5.182186234817814, "percentage": 51.82, "elapsed_time": "1:05:11", "remaining_time": "1:00:36"} +{"current_steps": 1281, "total_steps": 2470, "loss": 1.5673, "lr": 5.553552579274775e-06, "epoch": 5.186234817813765, "percentage": 51.86, "elapsed_time": "1:05:14", "remaining_time": "1:00:33"} +{"current_steps": 1282, "total_steps": 2470, "loss": 1.5733, "lr": 5.546529356030974e-06, "epoch": 5.190283400809717, "percentage": 51.9, "elapsed_time": "1:05:17", "remaining_time": "1:00:30"} +{"current_steps": 1283, "total_steps": 2470, "loss": 1.757, "lr": 5.539505041260779e-06, "epoch": 5.194331983805668, "percentage": 51.94, "elapsed_time": "1:05:20", "remaining_time": "1:00:27"} +{"current_steps": 1284, "total_steps": 2470, "loss": 1.8081, "lr": 5.532479648993122e-06, "epoch": 5.198380566801619, "percentage": 51.98, "elapsed_time": "1:05:23", "remaining_time": "1:00:23"} +{"current_steps": 1285, "total_steps": 2470, "loss": 1.5116, "lr": 5.525453193259094e-06, "epoch": 5.202429149797571, "percentage": 52.02, "elapsed_time": "1:05:26", "remaining_time": "1:00:20"} +{"current_steps": 1286, "total_steps": 2470, "loss": 1.8506, "lr": 5.518425688091906e-06, "epoch": 5.206477732793522, "percentage": 52.06, "elapsed_time": "1:05:30", "remaining_time": "1:00:18"} +{"current_steps": 1287, "total_steps": 2470, "loss": 1.8682, "lr": 5.511397147526862e-06, "epoch": 5.2105263157894735, "percentage": 52.11, "elapsed_time": "1:05:33", "remaining_time": "1:00:15"} +{"current_steps": 1288, "total_steps": 2470, "loss": 1.6388, "lr": 5.504367585601342e-06, "epoch": 5.2145748987854255, "percentage": 52.15, "elapsed_time": "1:05:36", "remaining_time": "1:00:12"} +{"current_steps": 1289, "total_steps": 2470, "loss": 1.5266, "lr": 5.497337016354757e-06, "epoch": 5.218623481781377, "percentage": 52.19, "elapsed_time": "1:05:39", "remaining_time": "1:00:09"} +{"current_steps": 1290, "total_steps": 2470, "loss": 1.4274, "lr": 5.490305453828534e-06, "epoch": 5.222672064777328, "percentage": 52.23, "elapsed_time": "1:05:42", "remaining_time": "1:00:06"} +{"current_steps": 1291, "total_steps": 2470, "loss": 1.6117, "lr": 5.483272912066084e-06, "epoch": 5.22672064777328, "percentage": 52.27, "elapsed_time": "1:05:45", "remaining_time": "1:00:03"} +{"current_steps": 1292, "total_steps": 2470, "loss": 1.4265, "lr": 5.476239405112775e-06, "epoch": 5.230769230769231, "percentage": 52.31, "elapsed_time": "1:05:48", "remaining_time": "1:00:00"} +{"current_steps": 1293, "total_steps": 2470, "loss": 1.668, "lr": 5.469204947015897e-06, "epoch": 5.234817813765182, "percentage": 52.35, "elapsed_time": "1:05:51", "remaining_time": "0:59:57"} +{"current_steps": 1294, "total_steps": 2470, "loss": 1.6076, "lr": 5.462169551824648e-06, "epoch": 5.238866396761134, "percentage": 52.39, "elapsed_time": "1:05:55", 
"remaining_time": "0:59:54"} +{"current_steps": 1295, "total_steps": 2470, "loss": 1.6171, "lr": 5.45513323359009e-06, "epoch": 5.242914979757085, "percentage": 52.43, "elapsed_time": "1:05:58", "remaining_time": "0:59:51"} +{"current_steps": 1296, "total_steps": 2470, "loss": 1.4488, "lr": 5.448096006365132e-06, "epoch": 5.246963562753036, "percentage": 52.47, "elapsed_time": "1:06:01", "remaining_time": "0:59:48"} +{"current_steps": 1297, "total_steps": 2470, "loss": 1.5478, "lr": 5.4410578842045e-06, "epoch": 5.251012145748988, "percentage": 52.51, "elapsed_time": "1:06:04", "remaining_time": "0:59:45"} +{"current_steps": 1298, "total_steps": 2470, "loss": 1.523, "lr": 5.434018881164702e-06, "epoch": 5.255060728744939, "percentage": 52.55, "elapsed_time": "1:06:07", "remaining_time": "0:59:42"} +{"current_steps": 1299, "total_steps": 2470, "loss": 1.4463, "lr": 5.426979011304012e-06, "epoch": 5.2591093117408905, "percentage": 52.59, "elapsed_time": "1:06:10", "remaining_time": "0:59:39"} +{"current_steps": 1300, "total_steps": 2470, "loss": 1.2639, "lr": 5.41993828868243e-06, "epoch": 5.2631578947368425, "percentage": 52.63, "elapsed_time": "1:06:13", "remaining_time": "0:59:35"} +{"current_steps": 1301, "total_steps": 2470, "loss": 1.5401, "lr": 5.412896727361663e-06, "epoch": 5.267206477732794, "percentage": 52.67, "elapsed_time": "1:06:16", "remaining_time": "0:59:32"} +{"current_steps": 1302, "total_steps": 2470, "loss": 1.5594, "lr": 5.405854341405088e-06, "epoch": 5.271255060728745, "percentage": 52.71, "elapsed_time": "1:06:19", "remaining_time": "0:59:29"} +{"current_steps": 1303, "total_steps": 2470, "loss": 1.5997, "lr": 5.398811144877733e-06, "epoch": 5.275303643724697, "percentage": 52.75, "elapsed_time": "1:06:22", "remaining_time": "0:59:26"} +{"current_steps": 1304, "total_steps": 2470, "loss": 1.5551, "lr": 5.391767151846247e-06, "epoch": 5.279352226720648, "percentage": 52.79, "elapsed_time": "1:06:25", "remaining_time": "0:59:23"} +{"current_steps": 1305, "total_steps": 2470, "loss": 1.2388, "lr": 5.384722376378861e-06, "epoch": 5.283400809716599, "percentage": 52.83, "elapsed_time": "1:06:28", "remaining_time": "0:59:20"} +{"current_steps": 1306, "total_steps": 2470, "loss": 1.3926, "lr": 5.377676832545377e-06, "epoch": 5.287449392712551, "percentage": 52.87, "elapsed_time": "1:06:31", "remaining_time": "0:59:17"} +{"current_steps": 1307, "total_steps": 2470, "loss": 1.2335, "lr": 5.370630534417133e-06, "epoch": 5.291497975708502, "percentage": 52.91, "elapsed_time": "1:06:34", "remaining_time": "0:59:14"} +{"current_steps": 1308, "total_steps": 2470, "loss": 1.5097, "lr": 5.363583496066963e-06, "epoch": 5.295546558704453, "percentage": 52.96, "elapsed_time": "1:06:37", "remaining_time": "0:59:11"} +{"current_steps": 1309, "total_steps": 2470, "loss": 1.6798, "lr": 5.356535731569189e-06, "epoch": 5.299595141700405, "percentage": 53.0, "elapsed_time": "1:06:40", "remaining_time": "0:59:08"} +{"current_steps": 1310, "total_steps": 2470, "loss": 1.3501, "lr": 5.349487254999579e-06, "epoch": 5.303643724696356, "percentage": 53.04, "elapsed_time": "1:06:43", "remaining_time": "0:59:05"} +{"current_steps": 1311, "total_steps": 2470, "loss": 1.3823, "lr": 5.342438080435325e-06, "epoch": 5.3076923076923075, "percentage": 53.08, "elapsed_time": "1:06:46", "remaining_time": "0:59:02"} +{"current_steps": 1312, "total_steps": 2470, "loss": 1.4001, "lr": 5.335388221955012e-06, "epoch": 5.3117408906882595, "percentage": 53.12, "elapsed_time": "1:06:49", "remaining_time": "0:58:59"} 
+{"current_steps": 1313, "total_steps": 2470, "loss": 1.3433, "lr": 5.328337693638591e-06, "epoch": 5.315789473684211, "percentage": 53.16, "elapsed_time": "1:06:52", "remaining_time": "0:58:56"} +{"current_steps": 1314, "total_steps": 2470, "loss": 1.2701, "lr": 5.321286509567351e-06, "epoch": 5.319838056680162, "percentage": 53.2, "elapsed_time": "1:06:55", "remaining_time": "0:58:53"} +{"current_steps": 1315, "total_steps": 2470, "loss": 1.2979, "lr": 5.314234683823892e-06, "epoch": 5.323886639676114, "percentage": 53.24, "elapsed_time": "1:06:58", "remaining_time": "0:58:49"} +{"current_steps": 1316, "total_steps": 2470, "loss": 1.3284, "lr": 5.307182230492089e-06, "epoch": 5.327935222672065, "percentage": 53.28, "elapsed_time": "1:07:01", "remaining_time": "0:58:46"} +{"current_steps": 1317, "total_steps": 2470, "loss": 1.3376, "lr": 5.300129163657081e-06, "epoch": 5.331983805668016, "percentage": 53.32, "elapsed_time": "1:07:05", "remaining_time": "0:58:43"} +{"current_steps": 1318, "total_steps": 2470, "loss": 1.3976, "lr": 5.2930754974052245e-06, "epoch": 5.336032388663968, "percentage": 53.36, "elapsed_time": "1:07:08", "remaining_time": "0:58:40"} +{"current_steps": 1319, "total_steps": 2470, "loss": 1.3431, "lr": 5.286021245824075e-06, "epoch": 5.340080971659919, "percentage": 53.4, "elapsed_time": "1:07:11", "remaining_time": "0:58:37"} +{"current_steps": 1320, "total_steps": 2470, "loss": 1.295, "lr": 5.2789664230023595e-06, "epoch": 5.34412955465587, "percentage": 53.44, "elapsed_time": "1:07:14", "remaining_time": "0:58:34"} +{"current_steps": 1321, "total_steps": 2470, "loss": 1.4491, "lr": 5.2719110430299416e-06, "epoch": 5.348178137651822, "percentage": 53.48, "elapsed_time": "1:07:17", "remaining_time": "0:58:31"} +{"current_steps": 1322, "total_steps": 2470, "loss": 1.4354, "lr": 5.264855119997803e-06, "epoch": 5.352226720647773, "percentage": 53.52, "elapsed_time": "1:07:20", "remaining_time": "0:58:28"} +{"current_steps": 1323, "total_steps": 2470, "loss": 1.0844, "lr": 5.257798667998003e-06, "epoch": 5.3562753036437245, "percentage": 53.56, "elapsed_time": "1:07:23", "remaining_time": "0:58:25"} +{"current_steps": 1324, "total_steps": 2470, "loss": 1.4929, "lr": 5.2507417011236625e-06, "epoch": 5.3603238866396765, "percentage": 53.6, "elapsed_time": "1:07:26", "remaining_time": "0:58:22"} +{"current_steps": 1325, "total_steps": 2470, "loss": 1.5648, "lr": 5.243684233468933e-06, "epoch": 5.364372469635628, "percentage": 53.64, "elapsed_time": "1:07:29", "remaining_time": "0:58:19"} +{"current_steps": 1326, "total_steps": 2470, "loss": 1.473, "lr": 5.236626279128958e-06, "epoch": 5.368421052631579, "percentage": 53.68, "elapsed_time": "1:07:32", "remaining_time": "0:58:16"} +{"current_steps": 1327, "total_steps": 2470, "loss": 1.5456, "lr": 5.22956785219986e-06, "epoch": 5.372469635627531, "percentage": 53.72, "elapsed_time": "1:07:35", "remaining_time": "0:58:13"} +{"current_steps": 1328, "total_steps": 2470, "loss": 1.2098, "lr": 5.222508966778702e-06, "epoch": 5.376518218623482, "percentage": 53.77, "elapsed_time": "1:07:38", "remaining_time": "0:58:10"} +{"current_steps": 1329, "total_steps": 2470, "loss": 1.363, "lr": 5.2154496369634645e-06, "epoch": 5.380566801619433, "percentage": 53.81, "elapsed_time": "1:07:41", "remaining_time": "0:58:07"} +{"current_steps": 1330, "total_steps": 2470, "loss": 1.1592, "lr": 5.208389876853014e-06, "epoch": 5.384615384615385, "percentage": 53.85, "elapsed_time": "1:07:44", "remaining_time": "0:58:03"} +{"current_steps": 1331, 
"total_steps": 2470, "loss": 1.226, "lr": 5.201329700547077e-06, "epoch": 5.388663967611336, "percentage": 53.89, "elapsed_time": "1:07:47", "remaining_time": "0:58:00"} +{"current_steps": 1332, "total_steps": 2470, "loss": 1.4048, "lr": 5.194269122146211e-06, "epoch": 5.392712550607287, "percentage": 53.93, "elapsed_time": "1:07:50", "remaining_time": "0:57:57"} +{"current_steps": 1333, "total_steps": 2470, "loss": 1.2387, "lr": 5.187208155751779e-06, "epoch": 5.396761133603239, "percentage": 53.97, "elapsed_time": "1:07:53", "remaining_time": "0:57:54"} +{"current_steps": 1334, "total_steps": 2470, "loss": 1.2571, "lr": 5.180146815465915e-06, "epoch": 5.40080971659919, "percentage": 54.01, "elapsed_time": "1:07:57", "remaining_time": "0:57:52"} +{"current_steps": 1335, "total_steps": 2470, "loss": 1.3062, "lr": 5.173085115391502e-06, "epoch": 5.4048582995951415, "percentage": 54.05, "elapsed_time": "1:08:00", "remaining_time": "0:57:49"} +{"current_steps": 1336, "total_steps": 2470, "loss": 1.4154, "lr": 5.16602306963214e-06, "epoch": 5.4089068825910935, "percentage": 54.09, "elapsed_time": "1:08:03", "remaining_time": "0:57:46"} +{"current_steps": 1337, "total_steps": 2470, "loss": 1.2259, "lr": 5.158960692292122e-06, "epoch": 5.412955465587045, "percentage": 54.13, "elapsed_time": "1:08:06", "remaining_time": "0:57:43"} +{"current_steps": 1338, "total_steps": 2470, "loss": 1.5583, "lr": 5.151897997476403e-06, "epoch": 5.417004048582996, "percentage": 54.17, "elapsed_time": "1:08:09", "remaining_time": "0:57:39"} +{"current_steps": 1339, "total_steps": 2470, "loss": 1.598, "lr": 5.144834999290567e-06, "epoch": 5.421052631578947, "percentage": 54.21, "elapsed_time": "1:08:12", "remaining_time": "0:57:36"} +{"current_steps": 1340, "total_steps": 2470, "loss": 1.5379, "lr": 5.137771711840811e-06, "epoch": 5.425101214574899, "percentage": 54.25, "elapsed_time": "1:08:15", "remaining_time": "0:57:33"} +{"current_steps": 1341, "total_steps": 2470, "loss": 1.5569, "lr": 5.130708149233905e-06, "epoch": 5.42914979757085, "percentage": 54.29, "elapsed_time": "1:08:19", "remaining_time": "0:57:31"} +{"current_steps": 1342, "total_steps": 2470, "loss": 1.7237, "lr": 5.123644325577168e-06, "epoch": 5.433198380566802, "percentage": 54.33, "elapsed_time": "1:08:22", "remaining_time": "0:57:28"} +{"current_steps": 1343, "total_steps": 2470, "loss": 1.4932, "lr": 5.116580254978447e-06, "epoch": 5.437246963562753, "percentage": 54.37, "elapsed_time": "1:08:25", "remaining_time": "0:57:24"} +{"current_steps": 1344, "total_steps": 2470, "loss": 1.4349, "lr": 5.1095159515460736e-06, "epoch": 5.441295546558704, "percentage": 54.41, "elapsed_time": "1:08:28", "remaining_time": "0:57:21"} +{"current_steps": 1345, "total_steps": 2470, "loss": 1.6808, "lr": 5.10245142938885e-06, "epoch": 5.445344129554655, "percentage": 54.45, "elapsed_time": "1:08:31", "remaining_time": "0:57:18"} +{"current_steps": 1346, "total_steps": 2470, "loss": 1.4753, "lr": 5.095386702616012e-06, "epoch": 5.449392712550607, "percentage": 54.49, "elapsed_time": "1:08:34", "remaining_time": "0:57:15"} +{"current_steps": 1347, "total_steps": 2470, "loss": 1.4634, "lr": 5.088321785337207e-06, "epoch": 5.4534412955465585, "percentage": 54.53, "elapsed_time": "1:08:37", "remaining_time": "0:57:12"} +{"current_steps": 1348, "total_steps": 2470, "loss": 1.8175, "lr": 5.0812566916624624e-06, "epoch": 5.4574898785425106, "percentage": 54.57, "elapsed_time": "1:08:40", "remaining_time": "0:57:09"} +{"current_steps": 1349, "total_steps": 2470, 
"loss": 1.9684, "lr": 5.074191435702155e-06, "epoch": 5.461538461538462, "percentage": 54.62, "elapsed_time": "1:08:43", "remaining_time": "0:57:06"} +{"current_steps": 1350, "total_steps": 2470, "loss": 1.6405, "lr": 5.067126031566988e-06, "epoch": 5.465587044534413, "percentage": 54.66, "elapsed_time": "1:08:46", "remaining_time": "0:57:03"} +{"current_steps": 1351, "total_steps": 2470, "loss": 1.6486, "lr": 5.060060493367961e-06, "epoch": 5.469635627530364, "percentage": 54.7, "elapsed_time": "1:08:49", "remaining_time": "0:57:00"} +{"current_steps": 1352, "total_steps": 2470, "loss": 1.5872, "lr": 5.05299483521634e-06, "epoch": 5.473684210526316, "percentage": 54.74, "elapsed_time": "1:08:52", "remaining_time": "0:56:57"} +{"current_steps": 1353, "total_steps": 2470, "loss": 1.5976, "lr": 5.045929071223633e-06, "epoch": 5.477732793522267, "percentage": 54.78, "elapsed_time": "1:08:55", "remaining_time": "0:56:54"} +{"current_steps": 1354, "total_steps": 2470, "loss": 1.5156, "lr": 5.038863215501555e-06, "epoch": 5.481781376518219, "percentage": 54.82, "elapsed_time": "1:08:58", "remaining_time": "0:56:51"} +{"current_steps": 1355, "total_steps": 2470, "loss": 1.4631, "lr": 5.031797282162007e-06, "epoch": 5.48582995951417, "percentage": 54.86, "elapsed_time": "1:09:01", "remaining_time": "0:56:48"} +{"current_steps": 1356, "total_steps": 2470, "loss": 1.3972, "lr": 5.024731285317046e-06, "epoch": 5.489878542510121, "percentage": 54.9, "elapsed_time": "1:09:05", "remaining_time": "0:56:45"} +{"current_steps": 1357, "total_steps": 2470, "loss": 1.4267, "lr": 5.017665239078854e-06, "epoch": 5.493927125506072, "percentage": 54.94, "elapsed_time": "1:09:08", "remaining_time": "0:56:42"} +{"current_steps": 1358, "total_steps": 2470, "loss": 1.2966, "lr": 5.010599157559713e-06, "epoch": 5.497975708502024, "percentage": 54.98, "elapsed_time": "1:09:11", "remaining_time": "0:56:39"} +{"current_steps": 1359, "total_steps": 2470, "loss": 1.15, "lr": 5.003533054871973e-06, "epoch": 5.502024291497976, "percentage": 55.02, "elapsed_time": "1:09:14", "remaining_time": "0:56:36"} +{"current_steps": 1360, "total_steps": 2470, "loss": 1.5181, "lr": 4.996466945128029e-06, "epoch": 5.506072874493928, "percentage": 55.06, "elapsed_time": "1:09:17", "remaining_time": "0:56:32"} +{"current_steps": 1361, "total_steps": 2470, "loss": 1.4787, "lr": 4.98940084244029e-06, "epoch": 5.510121457489879, "percentage": 55.1, "elapsed_time": "1:09:20", "remaining_time": "0:56:29"} +{"current_steps": 1362, "total_steps": 2470, "loss": 1.4434, "lr": 4.982334760921149e-06, "epoch": 5.51417004048583, "percentage": 55.14, "elapsed_time": "1:09:23", "remaining_time": "0:56:26"} +{"current_steps": 1363, "total_steps": 2470, "loss": 1.4766, "lr": 4.975268714682956e-06, "epoch": 5.518218623481781, "percentage": 55.18, "elapsed_time": "1:09:26", "remaining_time": "0:56:23"} +{"current_steps": 1364, "total_steps": 2470, "loss": 1.4244, "lr": 4.968202717837996e-06, "epoch": 5.522267206477733, "percentage": 55.22, "elapsed_time": "1:09:29", "remaining_time": "0:56:20"} +{"current_steps": 1365, "total_steps": 2470, "loss": 1.2532, "lr": 4.961136784498448e-06, "epoch": 5.526315789473684, "percentage": 55.26, "elapsed_time": "1:09:32", "remaining_time": "0:56:17"} +{"current_steps": 1366, "total_steps": 2470, "loss": 1.3152, "lr": 4.9540709287763685e-06, "epoch": 5.530364372469636, "percentage": 55.3, "elapsed_time": "1:09:35", "remaining_time": "0:56:14"} +{"current_steps": 1367, "total_steps": 2470, "loss": 1.409, "lr": 
4.947005164783661e-06, "epoch": 5.534412955465587, "percentage": 55.34, "elapsed_time": "1:09:38", "remaining_time": "0:56:11"} +{"current_steps": 1368, "total_steps": 2470, "loss": 1.6652, "lr": 4.939939506632041e-06, "epoch": 5.538461538461538, "percentage": 55.38, "elapsed_time": "1:09:41", "remaining_time": "0:56:08"} +{"current_steps": 1369, "total_steps": 2470, "loss": 1.5821, "lr": 4.932873968433014e-06, "epoch": 5.5425101214574894, "percentage": 55.43, "elapsed_time": "1:09:44", "remaining_time": "0:56:05"} +{"current_steps": 1370, "total_steps": 2470, "loss": 2.0481, "lr": 4.925808564297847e-06, "epoch": 5.5465587044534415, "percentage": 55.47, "elapsed_time": "1:09:47", "remaining_time": "0:56:02"} +{"current_steps": 1371, "total_steps": 2470, "loss": 1.9382, "lr": 4.918743308337539e-06, "epoch": 5.550607287449393, "percentage": 55.51, "elapsed_time": "1:09:50", "remaining_time": "0:55:59"} +{"current_steps": 1372, "total_steps": 2470, "loss": 2.2234, "lr": 4.911678214662795e-06, "epoch": 5.554655870445345, "percentage": 55.55, "elapsed_time": "1:09:53", "remaining_time": "0:55:56"} +{"current_steps": 1373, "total_steps": 2470, "loss": 1.4514, "lr": 4.9046132973839895e-06, "epoch": 5.558704453441296, "percentage": 55.59, "elapsed_time": "1:09:56", "remaining_time": "0:55:53"} +{"current_steps": 1374, "total_steps": 2470, "loss": 1.3266, "lr": 4.897548570611153e-06, "epoch": 5.562753036437247, "percentage": 55.63, "elapsed_time": "1:09:59", "remaining_time": "0:55:50"} +{"current_steps": 1375, "total_steps": 2470, "loss": 1.704, "lr": 4.890484048453928e-06, "epoch": 5.566801619433198, "percentage": 55.67, "elapsed_time": "1:10:02", "remaining_time": "0:55:47"} +{"current_steps": 1376, "total_steps": 2470, "loss": 1.3432, "lr": 4.883419745021554e-06, "epoch": 5.57085020242915, "percentage": 55.71, "elapsed_time": "1:10:05", "remaining_time": "0:55:43"} +{"current_steps": 1377, "total_steps": 2470, "loss": 1.5548, "lr": 4.8763556744228324e-06, "epoch": 5.574898785425101, "percentage": 55.75, "elapsed_time": "1:10:09", "remaining_time": "0:55:40"} +{"current_steps": 1378, "total_steps": 2470, "loss": 1.3556, "lr": 4.869291850766097e-06, "epoch": 5.578947368421053, "percentage": 55.79, "elapsed_time": "1:10:12", "remaining_time": "0:55:37"} +{"current_steps": 1379, "total_steps": 2470, "loss": 1.4828, "lr": 4.862228288159191e-06, "epoch": 5.582995951417004, "percentage": 55.83, "elapsed_time": "1:10:15", "remaining_time": "0:55:34"} +{"current_steps": 1380, "total_steps": 2470, "loss": 1.4776, "lr": 4.855165000709434e-06, "epoch": 5.587044534412955, "percentage": 55.87, "elapsed_time": "1:10:18", "remaining_time": "0:55:31"} +{"current_steps": 1381, "total_steps": 2470, "loss": 1.4632, "lr": 4.848102002523597e-06, "epoch": 5.5910931174089065, "percentage": 55.91, "elapsed_time": "1:10:21", "remaining_time": "0:55:28"} +{"current_steps": 1382, "total_steps": 2470, "loss": 1.1957, "lr": 4.841039307707878e-06, "epoch": 5.5951417004048585, "percentage": 55.95, "elapsed_time": "1:10:24", "remaining_time": "0:55:26"} +{"current_steps": 1383, "total_steps": 2470, "loss": 1.2615, "lr": 4.833976930367859e-06, "epoch": 5.59919028340081, "percentage": 55.99, "elapsed_time": "1:10:27", "remaining_time": "0:55:23"} +{"current_steps": 1384, "total_steps": 2470, "loss": 1.3531, "lr": 4.8269148846085e-06, "epoch": 5.603238866396762, "percentage": 56.03, "elapsed_time": "1:10:31", "remaining_time": "0:55:20"} +{"current_steps": 1385, "total_steps": 2470, "loss": 1.1753, "lr": 4.819853184534085e-06, 
"epoch": 5.607287449392713, "percentage": 56.07, "elapsed_time": "1:10:34", "remaining_time": "0:55:16"} +{"current_steps": 1386, "total_steps": 2470, "loss": 1.4958, "lr": 4.812791844248223e-06, "epoch": 5.611336032388664, "percentage": 56.11, "elapsed_time": "1:10:37", "remaining_time": "0:55:13"} +{"current_steps": 1387, "total_steps": 2470, "loss": 1.4974, "lr": 4.80573087785379e-06, "epoch": 5.615384615384615, "percentage": 56.15, "elapsed_time": "1:10:40", "remaining_time": "0:55:10"} +{"current_steps": 1388, "total_steps": 2470, "loss": 1.2282, "lr": 4.798670299452926e-06, "epoch": 5.619433198380567, "percentage": 56.19, "elapsed_time": "1:10:43", "remaining_time": "0:55:07"} +{"current_steps": 1389, "total_steps": 2470, "loss": 1.6082, "lr": 4.7916101231469886e-06, "epoch": 5.623481781376518, "percentage": 56.23, "elapsed_time": "1:10:46", "remaining_time": "0:55:05"} +{"current_steps": 1390, "total_steps": 2470, "loss": 1.4134, "lr": 4.784550363036539e-06, "epoch": 5.62753036437247, "percentage": 56.28, "elapsed_time": "1:10:49", "remaining_time": "0:55:01"} +{"current_steps": 1391, "total_steps": 2470, "loss": 1.6983, "lr": 4.7774910332213005e-06, "epoch": 5.631578947368421, "percentage": 56.32, "elapsed_time": "1:10:52", "remaining_time": "0:54:58"} +{"current_steps": 1392, "total_steps": 2470, "loss": 1.2975, "lr": 4.770432147800141e-06, "epoch": 5.635627530364372, "percentage": 56.36, "elapsed_time": "1:10:55", "remaining_time": "0:54:55"} +{"current_steps": 1393, "total_steps": 2470, "loss": 1.5541, "lr": 4.763373720871044e-06, "epoch": 5.6396761133603235, "percentage": 56.4, "elapsed_time": "1:10:58", "remaining_time": "0:54:52"} +{"current_steps": 1394, "total_steps": 2470, "loss": 1.4461, "lr": 4.756315766531069e-06, "epoch": 5.6437246963562755, "percentage": 56.44, "elapsed_time": "1:11:01", "remaining_time": "0:54:49"} +{"current_steps": 1395, "total_steps": 2470, "loss": 1.5498, "lr": 4.749258298876338e-06, "epoch": 5.647773279352227, "percentage": 56.48, "elapsed_time": "1:11:04", "remaining_time": "0:54:46"} +{"current_steps": 1396, "total_steps": 2470, "loss": 1.333, "lr": 4.742201332001998e-06, "epoch": 5.651821862348179, "percentage": 56.52, "elapsed_time": "1:11:07", "remaining_time": "0:54:43"} +{"current_steps": 1397, "total_steps": 2470, "loss": 1.556, "lr": 4.735144880002199e-06, "epoch": 5.65587044534413, "percentage": 56.56, "elapsed_time": "1:11:11", "remaining_time": "0:54:40"} +{"current_steps": 1398, "total_steps": 2470, "loss": 1.3788, "lr": 4.728088956970059e-06, "epoch": 5.659919028340081, "percentage": 56.6, "elapsed_time": "1:11:14", "remaining_time": "0:54:37"} +{"current_steps": 1399, "total_steps": 2470, "loss": 1.5347, "lr": 4.721033576997641e-06, "epoch": 5.663967611336032, "percentage": 56.64, "elapsed_time": "1:11:17", "remaining_time": "0:54:34"} +{"current_steps": 1400, "total_steps": 2470, "loss": 1.5292, "lr": 4.713978754175926e-06, "epoch": 5.668016194331984, "percentage": 56.68, "elapsed_time": "1:11:20", "remaining_time": "0:54:31"} +{"current_steps": 1401, "total_steps": 2470, "loss": 1.6549, "lr": 4.706924502594777e-06, "epoch": 5.672064777327935, "percentage": 56.72, "elapsed_time": "1:11:23", "remaining_time": "0:54:28"} +{"current_steps": 1402, "total_steps": 2470, "loss": 1.5814, "lr": 4.69987083634292e-06, "epoch": 5.676113360323887, "percentage": 56.76, "elapsed_time": "1:11:26", "remaining_time": "0:54:25"} +{"current_steps": 1403, "total_steps": 2470, "loss": 1.4982, "lr": 4.692817769507912e-06, "epoch": 5.680161943319838, 
"percentage": 56.8, "elapsed_time": "1:11:29", "remaining_time": "0:54:22"} +{"current_steps": 1404, "total_steps": 2470, "loss": 1.3453, "lr": 4.685765316176111e-06, "epoch": 5.684210526315789, "percentage": 56.84, "elapsed_time": "1:11:32", "remaining_time": "0:54:18"} +{"current_steps": 1405, "total_steps": 2470, "loss": 1.4717, "lr": 4.67871349043265e-06, "epoch": 5.6882591093117405, "percentage": 56.88, "elapsed_time": "1:11:35", "remaining_time": "0:54:15"} +{"current_steps": 1406, "total_steps": 2470, "loss": 1.4891, "lr": 4.671662306361409e-06, "epoch": 5.6923076923076925, "percentage": 56.92, "elapsed_time": "1:11:38", "remaining_time": "0:54:12"} +{"current_steps": 1407, "total_steps": 2470, "loss": 1.3408, "lr": 4.664611778044988e-06, "epoch": 5.696356275303644, "percentage": 56.96, "elapsed_time": "1:11:41", "remaining_time": "0:54:09"} +{"current_steps": 1408, "total_steps": 2470, "loss": 1.8095, "lr": 4.657561919564675e-06, "epoch": 5.700404858299595, "percentage": 57.0, "elapsed_time": "1:11:44", "remaining_time": "0:54:06"} +{"current_steps": 1409, "total_steps": 2470, "loss": 1.6024, "lr": 4.6505127450004216e-06, "epoch": 5.704453441295547, "percentage": 57.04, "elapsed_time": "1:11:47", "remaining_time": "0:54:03"} +{"current_steps": 1410, "total_steps": 2470, "loss": 1.2021, "lr": 4.643464268430812e-06, "epoch": 5.708502024291498, "percentage": 57.09, "elapsed_time": "1:11:50", "remaining_time": "0:54:00"} +{"current_steps": 1411, "total_steps": 2470, "loss": 1.3472, "lr": 4.636416503933038e-06, "epoch": 5.712550607287449, "percentage": 57.13, "elapsed_time": "1:11:53", "remaining_time": "0:53:57"} +{"current_steps": 1412, "total_steps": 2470, "loss": 1.4523, "lr": 4.62936946558287e-06, "epoch": 5.716599190283401, "percentage": 57.17, "elapsed_time": "1:11:56", "remaining_time": "0:53:54"} +{"current_steps": 1413, "total_steps": 2470, "loss": 1.2302, "lr": 4.622323167454623e-06, "epoch": 5.720647773279352, "percentage": 57.21, "elapsed_time": "1:11:59", "remaining_time": "0:53:51"} +{"current_steps": 1414, "total_steps": 2470, "loss": 1.4256, "lr": 4.6152776236211415e-06, "epoch": 5.724696356275303, "percentage": 57.25, "elapsed_time": "1:12:02", "remaining_time": "0:53:48"} +{"current_steps": 1415, "total_steps": 2470, "loss": 1.6055, "lr": 4.608232848153757e-06, "epoch": 5.728744939271255, "percentage": 57.29, "elapsed_time": "1:12:05", "remaining_time": "0:53:45"} +{"current_steps": 1416, "total_steps": 2470, "loss": 1.3484, "lr": 4.601188855122269e-06, "epoch": 5.732793522267206, "percentage": 57.33, "elapsed_time": "1:12:08", "remaining_time": "0:53:42"} +{"current_steps": 1417, "total_steps": 2470, "loss": 1.4537, "lr": 4.594145658594914e-06, "epoch": 5.7368421052631575, "percentage": 57.37, "elapsed_time": "1:12:11", "remaining_time": "0:53:39"} +{"current_steps": 1418, "total_steps": 2470, "loss": 2.0785, "lr": 4.587103272638339e-06, "epoch": 5.7408906882591095, "percentage": 57.41, "elapsed_time": "1:12:14", "remaining_time": "0:53:35"} +{"current_steps": 1419, "total_steps": 2470, "loss": 1.5669, "lr": 4.580061711317571e-06, "epoch": 5.744939271255061, "percentage": 57.45, "elapsed_time": "1:12:17", "remaining_time": "0:53:32"} +{"current_steps": 1420, "total_steps": 2470, "loss": 1.3901, "lr": 4.57302098869599e-06, "epoch": 5.748987854251012, "percentage": 57.49, "elapsed_time": "1:12:20", "remaining_time": "0:53:29"} +{"current_steps": 1421, "total_steps": 2470, "loss": 1.291, "lr": 4.565981118835299e-06, "epoch": 5.753036437246964, "percentage": 57.53, 
"elapsed_time": "1:12:23", "remaining_time": "0:53:26"} +{"current_steps": 1422, "total_steps": 2470, "loss": 1.4406, "lr": 4.558942115795502e-06, "epoch": 5.757085020242915, "percentage": 57.57, "elapsed_time": "1:12:26", "remaining_time": "0:53:23"} +{"current_steps": 1423, "total_steps": 2470, "loss": 2.1851, "lr": 4.551903993634869e-06, "epoch": 5.761133603238866, "percentage": 57.61, "elapsed_time": "1:12:30", "remaining_time": "0:53:20"} +{"current_steps": 1424, "total_steps": 2470, "loss": 1.9602, "lr": 4.5448667664099125e-06, "epoch": 5.765182186234818, "percentage": 57.65, "elapsed_time": "1:12:33", "remaining_time": "0:53:17"} +{"current_steps": 1425, "total_steps": 2470, "loss": 1.8644, "lr": 4.537830448175354e-06, "epoch": 5.769230769230769, "percentage": 57.69, "elapsed_time": "1:12:36", "remaining_time": "0:53:14"} +{"current_steps": 1426, "total_steps": 2470, "loss": 1.3677, "lr": 4.530795052984104e-06, "epoch": 5.77327935222672, "percentage": 57.73, "elapsed_time": "1:12:39", "remaining_time": "0:53:11"} +{"current_steps": 1427, "total_steps": 2470, "loss": 1.6488, "lr": 4.523760594887228e-06, "epoch": 5.777327935222672, "percentage": 57.77, "elapsed_time": "1:12:42", "remaining_time": "0:53:08"} +{"current_steps": 1428, "total_steps": 2470, "loss": 1.6378, "lr": 4.5167270879339165e-06, "epoch": 5.781376518218623, "percentage": 57.81, "elapsed_time": "1:12:45", "remaining_time": "0:53:05"} +{"current_steps": 1429, "total_steps": 2470, "loss": 1.458, "lr": 4.509694546171468e-06, "epoch": 5.7854251012145745, "percentage": 57.85, "elapsed_time": "1:12:48", "remaining_time": "0:53:02"} +{"current_steps": 1430, "total_steps": 2470, "loss": 1.3863, "lr": 4.5026629836452445e-06, "epoch": 5.7894736842105265, "percentage": 57.89, "elapsed_time": "1:12:51", "remaining_time": "0:52:59"} +{"current_steps": 1431, "total_steps": 2470, "loss": 1.4133, "lr": 4.495632414398659e-06, "epoch": 5.793522267206478, "percentage": 57.94, "elapsed_time": "1:12:54", "remaining_time": "0:52:56"} +{"current_steps": 1432, "total_steps": 2470, "loss": 1.4313, "lr": 4.488602852473138e-06, "epoch": 5.797570850202429, "percentage": 57.98, "elapsed_time": "1:12:57", "remaining_time": "0:52:53"} +{"current_steps": 1433, "total_steps": 2470, "loss": 1.3065, "lr": 4.481574311908096e-06, "epoch": 5.801619433198381, "percentage": 58.02, "elapsed_time": "1:13:00", "remaining_time": "0:52:50"} +{"current_steps": 1434, "total_steps": 2470, "loss": 1.1997, "lr": 4.4745468067409055e-06, "epoch": 5.805668016194332, "percentage": 58.06, "elapsed_time": "1:13:03", "remaining_time": "0:52:47"} +{"current_steps": 1435, "total_steps": 2470, "loss": 1.5584, "lr": 4.467520351006878e-06, "epoch": 5.809716599190283, "percentage": 58.1, "elapsed_time": "1:13:07", "remaining_time": "0:52:44"} +{"current_steps": 1436, "total_steps": 2470, "loss": 1.4086, "lr": 4.460494958739223e-06, "epoch": 5.813765182186235, "percentage": 58.14, "elapsed_time": "1:13:10", "remaining_time": "0:52:41"} +{"current_steps": 1437, "total_steps": 2470, "loss": 1.2759, "lr": 4.453470643969027e-06, "epoch": 5.817813765182186, "percentage": 58.18, "elapsed_time": "1:13:13", "remaining_time": "0:52:38"} +{"current_steps": 1438, "total_steps": 2470, "loss": 2.2866, "lr": 4.446447420725227e-06, "epoch": 5.821862348178137, "percentage": 58.22, "elapsed_time": "1:13:16", "remaining_time": "0:52:35"} +{"current_steps": 1439, "total_steps": 2470, "loss": 3.4094, "lr": 4.439425303034576e-06, "epoch": 5.825910931174089, "percentage": 58.26, "elapsed_time": 
"1:13:19", "remaining_time": "0:52:32"} +{"current_steps": 1440, "total_steps": 2470, "loss": 1.3129, "lr": 4.432404304921624e-06, "epoch": 5.82995951417004, "percentage": 58.3, "elapsed_time": "1:13:22", "remaining_time": "0:52:29"} +{"current_steps": 1441, "total_steps": 2470, "loss": 1.2285, "lr": 4.4253844404086785e-06, "epoch": 5.834008097165992, "percentage": 58.34, "elapsed_time": "1:13:25", "remaining_time": "0:52:26"} +{"current_steps": 1442, "total_steps": 2470, "loss": 1.286, "lr": 4.418365723515791e-06, "epoch": 5.838056680161944, "percentage": 58.38, "elapsed_time": "1:13:28", "remaining_time": "0:52:23"} +{"current_steps": 1443, "total_steps": 2470, "loss": 1.3394, "lr": 4.411348168260713e-06, "epoch": 5.842105263157895, "percentage": 58.42, "elapsed_time": "1:13:31", "remaining_time": "0:52:20"} +{"current_steps": 1444, "total_steps": 2470, "loss": 1.1712, "lr": 4.404331788658882e-06, "epoch": 5.846153846153846, "percentage": 58.46, "elapsed_time": "1:13:34", "remaining_time": "0:52:16"} +{"current_steps": 1445, "total_steps": 2470, "loss": 1.3548, "lr": 4.397316598723385e-06, "epoch": 5.850202429149798, "percentage": 58.5, "elapsed_time": "1:13:38", "remaining_time": "0:52:13"} +{"current_steps": 1446, "total_steps": 2470, "loss": 1.4071, "lr": 4.390302612464934e-06, "epoch": 5.854251012145749, "percentage": 58.54, "elapsed_time": "1:13:41", "remaining_time": "0:52:10"} +{"current_steps": 1447, "total_steps": 2470, "loss": 1.3334, "lr": 4.383289843891835e-06, "epoch": 5.8582995951417, "percentage": 58.58, "elapsed_time": "1:13:44", "remaining_time": "0:52:07"} +{"current_steps": 1448, "total_steps": 2470, "loss": 1.332, "lr": 4.376278307009962e-06, "epoch": 5.862348178137652, "percentage": 58.62, "elapsed_time": "1:13:47", "remaining_time": "0:52:04"} +{"current_steps": 1449, "total_steps": 2470, "loss": 1.336, "lr": 4.369268015822733e-06, "epoch": 5.866396761133603, "percentage": 58.66, "elapsed_time": "1:13:50", "remaining_time": "0:52:01"} +{"current_steps": 1450, "total_steps": 2470, "loss": 1.7992, "lr": 4.362258984331074e-06, "epoch": 5.870445344129554, "percentage": 58.7, "elapsed_time": "1:13:53", "remaining_time": "0:51:58"} +{"current_steps": 1451, "total_steps": 2470, "loss": 1.7401, "lr": 4.355251226533396e-06, "epoch": 5.874493927125506, "percentage": 58.74, "elapsed_time": "1:13:56", "remaining_time": "0:51:55"} +{"current_steps": 1452, "total_steps": 2470, "loss": 1.4945, "lr": 4.348244756425569e-06, "epoch": 5.8785425101214575, "percentage": 58.79, "elapsed_time": "1:13:59", "remaining_time": "0:51:52"} +{"current_steps": 1453, "total_steps": 2470, "loss": 1.4193, "lr": 4.341239588000887e-06, "epoch": 5.882591093117409, "percentage": 58.83, "elapsed_time": "1:14:02", "remaining_time": "0:51:49"} +{"current_steps": 1454, "total_steps": 2470, "loss": 1.0274, "lr": 4.334235735250047e-06, "epoch": 5.886639676113361, "percentage": 58.87, "elapsed_time": "1:14:05", "remaining_time": "0:51:46"} +{"current_steps": 1455, "total_steps": 2470, "loss": 1.5401, "lr": 4.327233212161118e-06, "epoch": 5.890688259109312, "percentage": 58.91, "elapsed_time": "1:14:08", "remaining_time": "0:51:43"} +{"current_steps": 1456, "total_steps": 2470, "loss": 1.5831, "lr": 4.320232032719511e-06, "epoch": 5.894736842105263, "percentage": 58.95, "elapsed_time": "1:14:11", "remaining_time": "0:51:40"} +{"current_steps": 1457, "total_steps": 2470, "loss": 1.3268, "lr": 4.313232210907959e-06, "epoch": 5.898785425101215, "percentage": 58.99, "elapsed_time": "1:14:14", "remaining_time": 
"0:51:37"} +{"current_steps": 1458, "total_steps": 2470, "loss": 1.3389, "lr": 4.306233760706478e-06, "epoch": 5.902834008097166, "percentage": 59.03, "elapsed_time": "1:14:17", "remaining_time": "0:51:34"} +{"current_steps": 1459, "total_steps": 2470, "loss": 1.4306, "lr": 4.299236696092347e-06, "epoch": 5.906882591093117, "percentage": 59.07, "elapsed_time": "1:14:20", "remaining_time": "0:51:30"} +{"current_steps": 1460, "total_steps": 2470, "loss": 1.1163, "lr": 4.292241031040077e-06, "epoch": 5.910931174089069, "percentage": 59.11, "elapsed_time": "1:14:23", "remaining_time": "0:51:27"} +{"current_steps": 1461, "total_steps": 2470, "loss": 1.2052, "lr": 4.285246779521384e-06, "epoch": 5.91497975708502, "percentage": 59.15, "elapsed_time": "1:14:26", "remaining_time": "0:51:24"} +{"current_steps": 1462, "total_steps": 2470, "loss": 1.213, "lr": 4.278253955505163e-06, "epoch": 5.919028340080971, "percentage": 59.19, "elapsed_time": "1:14:29", "remaining_time": "0:51:21"} +{"current_steps": 1463, "total_steps": 2470, "loss": 1.5401, "lr": 4.271262572957453e-06, "epoch": 5.923076923076923, "percentage": 59.23, "elapsed_time": "1:14:32", "remaining_time": "0:51:18"} +{"current_steps": 1464, "total_steps": 2470, "loss": 1.3832, "lr": 4.264272645841419e-06, "epoch": 5.9271255060728745, "percentage": 59.27, "elapsed_time": "1:14:35", "remaining_time": "0:51:15"} +{"current_steps": 1465, "total_steps": 2470, "loss": 1.3896, "lr": 4.2572841881173205e-06, "epoch": 5.931174089068826, "percentage": 59.31, "elapsed_time": "1:14:38", "remaining_time": "0:51:12"} +{"current_steps": 1466, "total_steps": 2470, "loss": 1.173, "lr": 4.250297213742473e-06, "epoch": 5.935222672064778, "percentage": 59.35, "elapsed_time": "1:14:42", "remaining_time": "0:51:09"} +{"current_steps": 1467, "total_steps": 2470, "loss": 1.1544, "lr": 4.243311736671239e-06, "epoch": 5.939271255060729, "percentage": 59.39, "elapsed_time": "1:14:45", "remaining_time": "0:51:06"} +{"current_steps": 1468, "total_steps": 2470, "loss": 1.4593, "lr": 4.236327770854987e-06, "epoch": 5.94331983805668, "percentage": 59.43, "elapsed_time": "1:14:48", "remaining_time": "0:51:03"} +{"current_steps": 1469, "total_steps": 2470, "loss": 1.1935, "lr": 4.229345330242067e-06, "epoch": 5.947368421052632, "percentage": 59.47, "elapsed_time": "1:14:51", "remaining_time": "0:51:00"} +{"current_steps": 1470, "total_steps": 2470, "loss": 1.1325, "lr": 4.222364428777786e-06, "epoch": 5.951417004048583, "percentage": 59.51, "elapsed_time": "1:14:54", "remaining_time": "0:50:57"} +{"current_steps": 1471, "total_steps": 2470, "loss": 1.3971, "lr": 4.2153850804043706e-06, "epoch": 5.955465587044534, "percentage": 59.55, "elapsed_time": "1:14:57", "remaining_time": "0:50:54"} +{"current_steps": 1472, "total_steps": 2470, "loss": 1.4698, "lr": 4.2084072990609505e-06, "epoch": 5.959514170040486, "percentage": 59.6, "elapsed_time": "1:15:00", "remaining_time": "0:50:51"} +{"current_steps": 1473, "total_steps": 2470, "loss": 1.4382, "lr": 4.201431098683527e-06, "epoch": 5.963562753036437, "percentage": 59.64, "elapsed_time": "1:15:03", "remaining_time": "0:50:48"} +{"current_steps": 1474, "total_steps": 2470, "loss": 1.5175, "lr": 4.194456493204939e-06, "epoch": 5.967611336032388, "percentage": 59.68, "elapsed_time": "1:15:06", "remaining_time": "0:50:45"} +{"current_steps": 1475, "total_steps": 2470, "loss": 1.433, "lr": 4.187483496554844e-06, "epoch": 5.97165991902834, "percentage": 59.72, "elapsed_time": "1:15:09", "remaining_time": "0:50:41"} +{"current_steps": 
1476, "total_steps": 2470, "loss": 1.4114, "lr": 4.1805121226596826e-06, "epoch": 5.9757085020242915, "percentage": 59.76, "elapsed_time": "1:15:12", "remaining_time": "0:50:38"} +{"current_steps": 1477, "total_steps": 2470, "loss": 1.4847, "lr": 4.173542385442659e-06, "epoch": 5.979757085020243, "percentage": 59.8, "elapsed_time": "1:15:15", "remaining_time": "0:50:35"} +{"current_steps": 1478, "total_steps": 2470, "loss": 1.5417, "lr": 4.166574298823707e-06, "epoch": 5.983805668016195, "percentage": 59.84, "elapsed_time": "1:15:19", "remaining_time": "0:50:33"} +{"current_steps": 1479, "total_steps": 2470, "loss": 1.3383, "lr": 4.1596078767194615e-06, "epoch": 5.987854251012146, "percentage": 59.88, "elapsed_time": "1:15:22", "remaining_time": "0:50:30"} +{"current_steps": 1480, "total_steps": 2470, "loss": 1.3384, "lr": 4.152643133043236e-06, "epoch": 5.991902834008097, "percentage": 59.92, "elapsed_time": "1:15:25", "remaining_time": "0:50:26"} +{"current_steps": 1481, "total_steps": 2470, "loss": 1.6541, "lr": 4.145680081704989e-06, "epoch": 5.995951417004049, "percentage": 59.96, "elapsed_time": "1:15:28", "remaining_time": "0:50:23"} +{"current_steps": 1482, "total_steps": 2470, "loss": 1.3694, "lr": 4.138718736611302e-06, "epoch": 6.0, "percentage": 60.0, "elapsed_time": "1:15:31", "remaining_time": "0:50:20"} +{"current_steps": 1483, "total_steps": 2470, "loss": 1.4049, "lr": 4.131759111665349e-06, "epoch": 6.004048582995951, "percentage": 60.04, "elapsed_time": "1:15:34", "remaining_time": "0:50:17"} +{"current_steps": 1484, "total_steps": 2470, "loss": 1.5639, "lr": 4.1248012207668635e-06, "epoch": 6.008097165991903, "percentage": 60.08, "elapsed_time": "1:15:37", "remaining_time": "0:50:14"} +{"current_steps": 1485, "total_steps": 2470, "loss": 1.3693, "lr": 4.117845077812122e-06, "epoch": 6.012145748987854, "percentage": 60.12, "elapsed_time": "1:15:40", "remaining_time": "0:50:11"} +{"current_steps": 1486, "total_steps": 2470, "loss": 1.5831, "lr": 4.110890696693906e-06, "epoch": 6.016194331983805, "percentage": 60.16, "elapsed_time": "1:15:43", "remaining_time": "0:50:08"} +{"current_steps": 1487, "total_steps": 2470, "loss": 1.7881, "lr": 4.103938091301479e-06, "epoch": 6.020242914979757, "percentage": 60.2, "elapsed_time": "1:15:46", "remaining_time": "0:50:05"} +{"current_steps": 1488, "total_steps": 2470, "loss": 1.6668, "lr": 4.096987275520562e-06, "epoch": 6.0242914979757085, "percentage": 60.24, "elapsed_time": "1:15:49", "remaining_time": "0:50:02"} +{"current_steps": 1489, "total_steps": 2470, "loss": 1.3587, "lr": 4.090038263233294e-06, "epoch": 6.02834008097166, "percentage": 60.28, "elapsed_time": "1:15:52", "remaining_time": "0:49:59"} +{"current_steps": 1490, "total_steps": 2470, "loss": 1.3678, "lr": 4.08309106831822e-06, "epoch": 6.032388663967612, "percentage": 60.32, "elapsed_time": "1:15:55", "remaining_time": "0:49:56"} +{"current_steps": 1491, "total_steps": 2470, "loss": 1.5829, "lr": 4.0761457046502515e-06, "epoch": 6.036437246963563, "percentage": 60.36, "elapsed_time": "1:15:58", "remaining_time": "0:49:53"} +{"current_steps": 1492, "total_steps": 2470, "loss": 1.382, "lr": 4.0692021861006386e-06, "epoch": 6.040485829959514, "percentage": 60.4, "elapsed_time": "1:16:02", "remaining_time": "0:49:50"} +{"current_steps": 1493, "total_steps": 2470, "loss": 1.4891, "lr": 4.062260526536955e-06, "epoch": 6.044534412955466, "percentage": 60.45, "elapsed_time": "1:16:05", "remaining_time": "0:49:47"} +{"current_steps": 1494, "total_steps": 2470, "loss": 
1.3764, "lr": 4.055320739823057e-06, "epoch": 6.048582995951417, "percentage": 60.49, "elapsed_time": "1:16:08", "remaining_time": "0:49:44"} +{"current_steps": 1495, "total_steps": 2470, "loss": 1.4399, "lr": 4.048382839819058e-06, "epoch": 6.052631578947368, "percentage": 60.53, "elapsed_time": "1:16:11", "remaining_time": "0:49:41"} +{"current_steps": 1496, "total_steps": 2470, "loss": 1.2964, "lr": 4.041446840381309e-06, "epoch": 6.05668016194332, "percentage": 60.57, "elapsed_time": "1:16:14", "remaining_time": "0:49:38"} +{"current_steps": 1497, "total_steps": 2470, "loss": 1.4451, "lr": 4.034512755362361e-06, "epoch": 6.060728744939271, "percentage": 60.61, "elapsed_time": "1:16:17", "remaining_time": "0:49:35"} +{"current_steps": 1498, "total_steps": 2470, "loss": 1.3934, "lr": 4.027580598610943e-06, "epoch": 6.064777327935222, "percentage": 60.65, "elapsed_time": "1:16:20", "remaining_time": "0:49:31"} +{"current_steps": 1499, "total_steps": 2470, "loss": 1.5479, "lr": 4.0206503839719335e-06, "epoch": 6.068825910931174, "percentage": 60.69, "elapsed_time": "1:16:23", "remaining_time": "0:49:28"} +{"current_steps": 1500, "total_steps": 2470, "loss": 1.4704, "lr": 4.01372212528633e-06, "epoch": 6.0728744939271255, "percentage": 60.73, "elapsed_time": "1:16:26", "remaining_time": "0:49:25"} +{"current_steps": 1501, "total_steps": 2470, "loss": 1.4155, "lr": 4.006795836391226e-06, "epoch": 6.076923076923077, "percentage": 60.77, "elapsed_time": "1:16:29", "remaining_time": "0:49:22"} +{"current_steps": 1502, "total_steps": 2470, "loss": 1.4857, "lr": 3.999871531119779e-06, "epoch": 6.080971659919029, "percentage": 60.81, "elapsed_time": "1:16:32", "remaining_time": "0:49:19"} +{"current_steps": 1503, "total_steps": 2470, "loss": 1.4726, "lr": 3.992949223301185e-06, "epoch": 6.08502024291498, "percentage": 60.85, "elapsed_time": "1:16:35", "remaining_time": "0:49:16"} +{"current_steps": 1504, "total_steps": 2470, "loss": 1.4183, "lr": 3.986028926760655e-06, "epoch": 6.089068825910931, "percentage": 60.89, "elapsed_time": "1:16:38", "remaining_time": "0:49:13"} +{"current_steps": 1505, "total_steps": 2470, "loss": 1.497, "lr": 3.9791106553193746e-06, "epoch": 6.093117408906883, "percentage": 60.93, "elapsed_time": "1:16:41", "remaining_time": "0:49:10"} +{"current_steps": 1506, "total_steps": 2470, "loss": 1.2572, "lr": 3.972194422794493e-06, "epoch": 6.097165991902834, "percentage": 60.97, "elapsed_time": "1:16:44", "remaining_time": "0:49:07"} +{"current_steps": 1507, "total_steps": 2470, "loss": 1.4398, "lr": 3.965280242999083e-06, "epoch": 6.101214574898785, "percentage": 61.01, "elapsed_time": "1:16:47", "remaining_time": "0:49:04"} +{"current_steps": 1508, "total_steps": 2470, "loss": 1.3871, "lr": 3.9583681297421194e-06, "epoch": 6.105263157894737, "percentage": 61.05, "elapsed_time": "1:16:50", "remaining_time": "0:49:01"} +{"current_steps": 1509, "total_steps": 2470, "loss": 1.375, "lr": 3.951458096828449e-06, "epoch": 6.109311740890688, "percentage": 61.09, "elapsed_time": "1:16:53", "remaining_time": "0:48:58"} +{"current_steps": 1510, "total_steps": 2470, "loss": 1.3195, "lr": 3.944550158058762e-06, "epoch": 6.113360323886639, "percentage": 61.13, "elapsed_time": "1:16:56", "remaining_time": "0:48:55"} +{"current_steps": 1511, "total_steps": 2470, "loss": 1.2256, "lr": 3.937644327229572e-06, "epoch": 6.117408906882591, "percentage": 61.17, "elapsed_time": "1:16:59", "remaining_time": "0:48:52"} +{"current_steps": 1512, "total_steps": 2470, "loss": 1.2919, "lr": 
3.930740618133173e-06, "epoch": 6.1214574898785425, "percentage": 61.21, "elapsed_time": "1:17:02", "remaining_time": "0:48:48"} +{"current_steps": 1513, "total_steps": 2470, "loss": 1.3028, "lr": 3.923839044557632e-06, "epoch": 6.125506072874494, "percentage": 61.26, "elapsed_time": "1:17:05", "remaining_time": "0:48:45"} +{"current_steps": 1514, "total_steps": 2470, "loss": 1.1917, "lr": 3.916939620286743e-06, "epoch": 6.129554655870446, "percentage": 61.3, "elapsed_time": "1:17:08", "remaining_time": "0:48:42"} +{"current_steps": 1515, "total_steps": 2470, "loss": 1.54, "lr": 3.9100423591000124e-06, "epoch": 6.133603238866397, "percentage": 61.34, "elapsed_time": "1:17:11", "remaining_time": "0:48:39"} +{"current_steps": 1516, "total_steps": 2470, "loss": 1.3571, "lr": 3.903147274772624e-06, "epoch": 6.137651821862348, "percentage": 61.38, "elapsed_time": "1:17:14", "remaining_time": "0:48:36"} +{"current_steps": 1517, "total_steps": 2470, "loss": 1.1103, "lr": 3.896254381075416e-06, "epoch": 6.1417004048583, "percentage": 61.42, "elapsed_time": "1:17:17", "remaining_time": "0:48:33"} +{"current_steps": 1518, "total_steps": 2470, "loss": 1.4538, "lr": 3.8893636917748455e-06, "epoch": 6.145748987854251, "percentage": 61.46, "elapsed_time": "1:17:20", "remaining_time": "0:48:30"} +{"current_steps": 1519, "total_steps": 2470, "loss": 1.2834, "lr": 3.882475220632975e-06, "epoch": 6.149797570850202, "percentage": 61.5, "elapsed_time": "1:17:23", "remaining_time": "0:48:27"} +{"current_steps": 1520, "total_steps": 2470, "loss": 1.5023, "lr": 3.875588981407433e-06, "epoch": 6.153846153846154, "percentage": 61.54, "elapsed_time": "1:17:26", "remaining_time": "0:48:24"} +{"current_steps": 1521, "total_steps": 2470, "loss": 1.5494, "lr": 3.86870498785139e-06, "epoch": 6.157894736842105, "percentage": 61.58, "elapsed_time": "1:17:30", "remaining_time": "0:48:21"} +{"current_steps": 1522, "total_steps": 2470, "loss": 1.3442, "lr": 3.861823253713535e-06, "epoch": 6.161943319838056, "percentage": 61.62, "elapsed_time": "1:17:33", "remaining_time": "0:48:18"} +{"current_steps": 1523, "total_steps": 2470, "loss": 1.4306, "lr": 3.854943792738037e-06, "epoch": 6.165991902834008, "percentage": 61.66, "elapsed_time": "1:17:36", "remaining_time": "0:48:15"} +{"current_steps": 1524, "total_steps": 2470, "loss": 1.9855, "lr": 3.848066618664534e-06, "epoch": 6.17004048582996, "percentage": 61.7, "elapsed_time": "1:17:39", "remaining_time": "0:48:12"} +{"current_steps": 1525, "total_steps": 2470, "loss": 1.2562, "lr": 3.841191745228091e-06, "epoch": 6.174089068825911, "percentage": 61.74, "elapsed_time": "1:17:42", "remaining_time": "0:48:08"} +{"current_steps": 1526, "total_steps": 2470, "loss": 1.4532, "lr": 3.834319186159179e-06, "epoch": 6.178137651821863, "percentage": 61.78, "elapsed_time": "1:17:45", "remaining_time": "0:48:06"} +{"current_steps": 1527, "total_steps": 2470, "loss": 1.2517, "lr": 3.82744895518365e-06, "epoch": 6.182186234817814, "percentage": 61.82, "elapsed_time": "1:17:48", "remaining_time": "0:48:03"} +{"current_steps": 1528, "total_steps": 2470, "loss": 1.4395, "lr": 3.8205810660227e-06, "epoch": 6.186234817813765, "percentage": 61.86, "elapsed_time": "1:17:51", "remaining_time": "0:48:00"} +{"current_steps": 1529, "total_steps": 2470, "loss": 1.4579, "lr": 3.8137155323928526e-06, "epoch": 6.190283400809717, "percentage": 61.9, "elapsed_time": "1:17:54", "remaining_time": "0:47:57"} +{"current_steps": 1530, "total_steps": 2470, "loss": 1.6307, "lr": 3.8068523680059287e-06, "epoch": 
6.194331983805668, "percentage": 61.94, "elapsed_time": "1:17:57", "remaining_time": "0:47:54"} +{"current_steps": 1531, "total_steps": 2470, "loss": 1.6785, "lr": 3.799991586569012e-06, "epoch": 6.198380566801619, "percentage": 61.98, "elapsed_time": "1:18:00", "remaining_time": "0:47:50"} +{"current_steps": 1532, "total_steps": 2470, "loss": 1.3911, "lr": 3.7931332017844302e-06, "epoch": 6.202429149797571, "percentage": 62.02, "elapsed_time": "1:18:04", "remaining_time": "0:47:47"} +{"current_steps": 1533, "total_steps": 2470, "loss": 1.7226, "lr": 3.786277227349724e-06, "epoch": 6.206477732793522, "percentage": 62.06, "elapsed_time": "1:18:07", "remaining_time": "0:47:45"} +{"current_steps": 1534, "total_steps": 2470, "loss": 1.7276, "lr": 3.77942367695762e-06, "epoch": 6.2105263157894735, "percentage": 62.11, "elapsed_time": "1:18:10", "remaining_time": "0:47:42"} +{"current_steps": 1535, "total_steps": 2470, "loss": 1.4984, "lr": 3.7725725642960047e-06, "epoch": 6.2145748987854255, "percentage": 62.15, "elapsed_time": "1:18:13", "remaining_time": "0:47:38"} +{"current_steps": 1536, "total_steps": 2470, "loss": 1.3822, "lr": 3.7657239030478927e-06, "epoch": 6.218623481781377, "percentage": 62.19, "elapsed_time": "1:18:16", "remaining_time": "0:47:35"} +{"current_steps": 1537, "total_steps": 2470, "loss": 1.3005, "lr": 3.758877706891407e-06, "epoch": 6.222672064777328, "percentage": 62.23, "elapsed_time": "1:18:19", "remaining_time": "0:47:32"} +{"current_steps": 1538, "total_steps": 2470, "loss": 1.4995, "lr": 3.752033989499742e-06, "epoch": 6.22672064777328, "percentage": 62.27, "elapsed_time": "1:18:22", "remaining_time": "0:47:29"} +{"current_steps": 1539, "total_steps": 2470, "loss": 1.2958, "lr": 3.7451927645411466e-06, "epoch": 6.230769230769231, "percentage": 62.31, "elapsed_time": "1:18:25", "remaining_time": "0:47:26"} +{"current_steps": 1540, "total_steps": 2470, "loss": 1.5321, "lr": 3.7383540456788915e-06, "epoch": 6.234817813765182, "percentage": 62.35, "elapsed_time": "1:18:28", "remaining_time": "0:47:23"} +{"current_steps": 1541, "total_steps": 2470, "loss": 1.4701, "lr": 3.7315178465712364e-06, "epoch": 6.238866396761134, "percentage": 62.39, "elapsed_time": "1:18:31", "remaining_time": "0:47:20"} +{"current_steps": 1542, "total_steps": 2470, "loss": 1.4965, "lr": 3.7246841808714172e-06, "epoch": 6.242914979757085, "percentage": 62.43, "elapsed_time": "1:18:34", "remaining_time": "0:47:17"} +{"current_steps": 1543, "total_steps": 2470, "loss": 1.3376, "lr": 3.717853062227604e-06, "epoch": 6.246963562753036, "percentage": 62.47, "elapsed_time": "1:18:37", "remaining_time": "0:47:14"} +{"current_steps": 1544, "total_steps": 2470, "loss": 1.436, "lr": 3.7110245042828786e-06, "epoch": 6.251012145748988, "percentage": 62.51, "elapsed_time": "1:18:41", "remaining_time": "0:47:11"} +{"current_steps": 1545, "total_steps": 2470, "loss": 1.3922, "lr": 3.704198520675214e-06, "epoch": 6.255060728744939, "percentage": 62.55, "elapsed_time": "1:18:44", "remaining_time": "0:47:08"} +{"current_steps": 1546, "total_steps": 2470, "loss": 1.3391, "lr": 3.69737512503744e-06, "epoch": 6.2591093117408905, "percentage": 62.59, "elapsed_time": "1:18:47", "remaining_time": "0:47:05"} +{"current_steps": 1547, "total_steps": 2470, "loss": 1.1307, "lr": 3.690554330997215e-06, "epoch": 6.2631578947368425, "percentage": 62.63, "elapsed_time": "1:18:50", "remaining_time": "0:47:02"} +{"current_steps": 1548, "total_steps": 2470, "loss": 1.4205, "lr": 3.6837361521770056e-06, "epoch": 
6.267206477732794, "percentage": 62.67, "elapsed_time": "1:18:53", "remaining_time": "0:46:59"} +{"current_steps": 1549, "total_steps": 2470, "loss": 1.4284, "lr": 3.6769206021940505e-06, "epoch": 6.271255060728745, "percentage": 62.71, "elapsed_time": "1:18:56", "remaining_time": "0:46:56"} +{"current_steps": 1550, "total_steps": 2470, "loss": 1.4865, "lr": 3.670107694660343e-06, "epoch": 6.275303643724697, "percentage": 62.75, "elapsed_time": "1:18:59", "remaining_time": "0:46:53"} +{"current_steps": 1551, "total_steps": 2470, "loss": 1.4177, "lr": 3.6632974431825965e-06, "epoch": 6.279352226720648, "percentage": 62.79, "elapsed_time": "1:19:02", "remaining_time": "0:46:49"} +{"current_steps": 1552, "total_steps": 2470, "loss": 1.0975, "lr": 3.656489861362218e-06, "epoch": 6.283400809716599, "percentage": 62.83, "elapsed_time": "1:19:05", "remaining_time": "0:46:46"} +{"current_steps": 1553, "total_steps": 2470, "loss": 1.2607, "lr": 3.6496849627952875e-06, "epoch": 6.287449392712551, "percentage": 62.87, "elapsed_time": "1:19:08", "remaining_time": "0:46:43"} +{"current_steps": 1554, "total_steps": 2470, "loss": 1.113, "lr": 3.6428827610725203e-06, "epoch": 6.291497975708502, "percentage": 62.91, "elapsed_time": "1:19:11", "remaining_time": "0:46:40"} +{"current_steps": 1555, "total_steps": 2470, "loss": 1.3579, "lr": 3.636083269779249e-06, "epoch": 6.295546558704453, "percentage": 62.96, "elapsed_time": "1:19:14", "remaining_time": "0:46:37"} +{"current_steps": 1556, "total_steps": 2470, "loss": 1.5612, "lr": 3.6292865024953945e-06, "epoch": 6.299595141700405, "percentage": 63.0, "elapsed_time": "1:19:17", "remaining_time": "0:46:34"} +{"current_steps": 1557, "total_steps": 2470, "loss": 1.196, "lr": 3.622492472795432e-06, "epoch": 6.303643724696356, "percentage": 63.04, "elapsed_time": "1:19:20", "remaining_time": "0:46:31"} +{"current_steps": 1558, "total_steps": 2470, "loss": 1.2403, "lr": 3.615701194248375e-06, "epoch": 6.3076923076923075, "percentage": 63.08, "elapsed_time": "1:19:23", "remaining_time": "0:46:28"} +{"current_steps": 1559, "total_steps": 2470, "loss": 1.2748, "lr": 3.6089126804177373e-06, "epoch": 6.3117408906882595, "percentage": 63.12, "elapsed_time": "1:19:26", "remaining_time": "0:46:25"} +{"current_steps": 1560, "total_steps": 2470, "loss": 1.1801, "lr": 3.6021269448615148e-06, "epoch": 6.315789473684211, "percentage": 63.16, "elapsed_time": "1:19:29", "remaining_time": "0:46:22"} +{"current_steps": 1561, "total_steps": 2470, "loss": 1.1334, "lr": 3.595344001132154e-06, "epoch": 6.319838056680162, "percentage": 63.2, "elapsed_time": "1:19:32", "remaining_time": "0:46:19"} +{"current_steps": 1562, "total_steps": 2470, "loss": 1.1662, "lr": 3.5885638627765228e-06, "epoch": 6.323886639676114, "percentage": 63.24, "elapsed_time": "1:19:35", "remaining_time": "0:46:16"} +{"current_steps": 1563, "total_steps": 2470, "loss": 1.1897, "lr": 3.5817865433358902e-06, "epoch": 6.327935222672065, "percentage": 63.28, "elapsed_time": "1:19:38", "remaining_time": "0:46:13"} +{"current_steps": 1564, "total_steps": 2470, "loss": 1.2197, "lr": 3.5750120563458924e-06, "epoch": 6.331983805668016, "percentage": 63.32, "elapsed_time": "1:19:41", "remaining_time": "0:46:10"} +{"current_steps": 1565, "total_steps": 2470, "loss": 1.2979, "lr": 3.568240415336509e-06, "epoch": 6.336032388663968, "percentage": 63.36, "elapsed_time": "1:19:44", "remaining_time": "0:46:07"} +{"current_steps": 1566, "total_steps": 2470, "loss": 1.2379, "lr": 3.5614716338320384e-06, "epoch": 
6.340080971659919, "percentage": 63.4, "elapsed_time": "1:19:48", "remaining_time": "0:46:03"} +{"current_steps": 1567, "total_steps": 2470, "loss": 1.1656, "lr": 3.554705725351063e-06, "epoch": 6.34412955465587, "percentage": 63.44, "elapsed_time": "1:19:51", "remaining_time": "0:46:00"} +{"current_steps": 1568, "total_steps": 2470, "loss": 1.3082, "lr": 3.547942703406433e-06, "epoch": 6.348178137651822, "percentage": 63.48, "elapsed_time": "1:19:54", "remaining_time": "0:45:57"} +{"current_steps": 1569, "total_steps": 2470, "loss": 1.313, "lr": 3.5411825815052296e-06, "epoch": 6.352226720647773, "percentage": 63.52, "elapsed_time": "1:19:57", "remaining_time": "0:45:54"} +{"current_steps": 1570, "total_steps": 2470, "loss": 0.9762, "lr": 3.534425373148741e-06, "epoch": 6.3562753036437245, "percentage": 63.56, "elapsed_time": "1:20:00", "remaining_time": "0:45:51"} +{"current_steps": 1571, "total_steps": 2470, "loss": 1.373, "lr": 3.52767109183244e-06, "epoch": 6.3603238866396765, "percentage": 63.6, "elapsed_time": "1:20:03", "remaining_time": "0:45:48"} +{"current_steps": 1572, "total_steps": 2470, "loss": 1.448, "lr": 3.5209197510459526e-06, "epoch": 6.364372469635628, "percentage": 63.64, "elapsed_time": "1:20:06", "remaining_time": "0:45:45"} +{"current_steps": 1573, "total_steps": 2470, "loss": 1.3476, "lr": 3.5141713642730305e-06, "epoch": 6.368421052631579, "percentage": 63.68, "elapsed_time": "1:20:09", "remaining_time": "0:45:42"} +{"current_steps": 1574, "total_steps": 2470, "loss": 1.4072, "lr": 3.507425944991529e-06, "epoch": 6.372469635627531, "percentage": 63.72, "elapsed_time": "1:20:13", "remaining_time": "0:45:39"} +{"current_steps": 1575, "total_steps": 2470, "loss": 1.0987, "lr": 3.5006835066733707e-06, "epoch": 6.376518218623482, "percentage": 63.77, "elapsed_time": "1:20:16", "remaining_time": "0:45:36"} +{"current_steps": 1576, "total_steps": 2470, "loss": 1.2467, "lr": 3.4939440627845305e-06, "epoch": 6.380566801619433, "percentage": 63.81, "elapsed_time": "1:20:19", "remaining_time": "0:45:33"} +{"current_steps": 1577, "total_steps": 2470, "loss": 1.0512, "lr": 3.4872076267850015e-06, "epoch": 6.384615384615385, "percentage": 63.85, "elapsed_time": "1:20:22", "remaining_time": "0:45:30"} +{"current_steps": 1578, "total_steps": 2470, "loss": 1.1192, "lr": 3.480474212128766e-06, "epoch": 6.388663967611336, "percentage": 63.89, "elapsed_time": "1:20:25", "remaining_time": "0:45:27"} +{"current_steps": 1579, "total_steps": 2470, "loss": 1.2989, "lr": 3.473743832263778e-06, "epoch": 6.392712550607287, "percentage": 63.93, "elapsed_time": "1:20:28", "remaining_time": "0:45:24"} +{"current_steps": 1580, "total_steps": 2470, "loss": 1.1125, "lr": 3.4670165006319236e-06, "epoch": 6.396761133603239, "percentage": 63.97, "elapsed_time": "1:20:31", "remaining_time": "0:45:21"} +{"current_steps": 1581, "total_steps": 2470, "loss": 1.1461, "lr": 3.4602922306690062e-06, "epoch": 6.40080971659919, "percentage": 64.01, "elapsed_time": "1:20:34", "remaining_time": "0:45:18"} +{"current_steps": 1582, "total_steps": 2470, "loss": 1.1805, "lr": 3.453571035804714e-06, "epoch": 6.4048582995951415, "percentage": 64.05, "elapsed_time": "1:20:37", "remaining_time": "0:45:15"} +{"current_steps": 1583, "total_steps": 2470, "loss": 1.2865, "lr": 3.4468529294625895e-06, "epoch": 6.4089068825910935, "percentage": 64.09, "elapsed_time": "1:20:40", "remaining_time": "0:45:12"} +{"current_steps": 1584, "total_steps": 2470, "loss": 1.112, "lr": 3.4401379250600124e-06, "epoch": 6.412955465587045, 
"percentage": 64.13, "elapsed_time": "1:20:43", "remaining_time": "0:45:09"} +{"current_steps": 1585, "total_steps": 2470, "loss": 1.4222, "lr": 3.433426036008163e-06, "epoch": 6.417004048582996, "percentage": 64.17, "elapsed_time": "1:20:46", "remaining_time": "0:45:06"} +{"current_steps": 1586, "total_steps": 2470, "loss": 1.4558, "lr": 3.4267172757120005e-06, "epoch": 6.421052631578947, "percentage": 64.21, "elapsed_time": "1:20:49", "remaining_time": "0:45:03"} +{"current_steps": 1587, "total_steps": 2470, "loss": 1.4408, "lr": 3.420011657570238e-06, "epoch": 6.425101214574899, "percentage": 64.25, "elapsed_time": "1:20:53", "remaining_time": "0:45:00"} +{"current_steps": 1588, "total_steps": 2470, "loss": 1.4281, "lr": 3.413309194975309e-06, "epoch": 6.42914979757085, "percentage": 64.29, "elapsed_time": "1:20:56", "remaining_time": "0:44:57"} +{"current_steps": 1589, "total_steps": 2470, "loss": 1.6038, "lr": 3.406609901313349e-06, "epoch": 6.433198380566802, "percentage": 64.33, "elapsed_time": "1:20:59", "remaining_time": "0:44:54"} +{"current_steps": 1590, "total_steps": 2470, "loss": 1.3818, "lr": 3.39991378996416e-06, "epoch": 6.437246963562753, "percentage": 64.37, "elapsed_time": "1:21:02", "remaining_time": "0:44:50"} +{"current_steps": 1591, "total_steps": 2470, "loss": 1.324, "lr": 3.393220874301193e-06, "epoch": 6.441295546558704, "percentage": 64.41, "elapsed_time": "1:21:05", "remaining_time": "0:44:47"} +{"current_steps": 1592, "total_steps": 2470, "loss": 1.569, "lr": 3.386531167691512e-06, "epoch": 6.445344129554655, "percentage": 64.45, "elapsed_time": "1:21:08", "remaining_time": "0:44:44"} +{"current_steps": 1593, "total_steps": 2470, "loss": 1.3697, "lr": 3.379844683495775e-06, "epoch": 6.449392712550607, "percentage": 64.49, "elapsed_time": "1:21:11", "remaining_time": "0:44:41"} +{"current_steps": 1594, "total_steps": 2470, "loss": 1.3591, "lr": 3.3731614350682045e-06, "epoch": 6.4534412955465585, "percentage": 64.53, "elapsed_time": "1:21:14", "remaining_time": "0:44:38"} +{"current_steps": 1595, "total_steps": 2470, "loss": 1.7039, "lr": 3.36648143575656e-06, "epoch": 6.4574898785425106, "percentage": 64.57, "elapsed_time": "1:21:17", "remaining_time": "0:44:35"} +{"current_steps": 1596, "total_steps": 2470, "loss": 1.8161, "lr": 3.3598046989021073e-06, "epoch": 6.461538461538462, "percentage": 64.62, "elapsed_time": "1:21:20", "remaining_time": "0:44:32"} +{"current_steps": 1597, "total_steps": 2470, "loss": 1.506, "lr": 3.3531312378396026e-06, "epoch": 6.465587044534413, "percentage": 64.66, "elapsed_time": "1:21:23", "remaining_time": "0:44:29"} +{"current_steps": 1598, "total_steps": 2470, "loss": 1.5432, "lr": 3.3464610658972584e-06, "epoch": 6.469635627530364, "percentage": 64.7, "elapsed_time": "1:21:26", "remaining_time": "0:44:26"} +{"current_steps": 1599, "total_steps": 2470, "loss": 1.502, "lr": 3.3397941963967162e-06, "epoch": 6.473684210526316, "percentage": 64.74, "elapsed_time": "1:21:29", "remaining_time": "0:44:23"} +{"current_steps": 1600, "total_steps": 2470, "loss": 1.5104, "lr": 3.333130642653024e-06, "epoch": 6.477732793522267, "percentage": 64.78, "elapsed_time": "1:21:32", "remaining_time": "0:44:20"} +{"current_steps": 1601, "total_steps": 2470, "loss": 1.4218, "lr": 3.326470417974604e-06, "epoch": 6.481781376518219, "percentage": 64.82, "elapsed_time": "1:21:35", "remaining_time": "0:44:17"} +{"current_steps": 1602, "total_steps": 2470, "loss": 1.3685, "lr": 3.3198135356632353e-06, "epoch": 6.48582995951417, "percentage": 64.86, 
"elapsed_time": "1:21:38", "remaining_time": "0:44:14"} +{"current_steps": 1603, "total_steps": 2470, "loss": 1.3026, "lr": 3.313160009014017e-06, "epoch": 6.489878542510121, "percentage": 64.9, "elapsed_time": "1:21:41", "remaining_time": "0:44:11"} +{"current_steps": 1604, "total_steps": 2470, "loss": 1.2931, "lr": 3.3065098513153473e-06, "epoch": 6.493927125506072, "percentage": 64.94, "elapsed_time": "1:21:44", "remaining_time": "0:44:08"} +{"current_steps": 1605, "total_steps": 2470, "loss": 1.203, "lr": 3.299863075848898e-06, "epoch": 6.497975708502024, "percentage": 64.98, "elapsed_time": "1:21:47", "remaining_time": "0:44:05"} +{"current_steps": 1606, "total_steps": 2470, "loss": 1.0369, "lr": 3.2932196958895816e-06, "epoch": 6.502024291497976, "percentage": 65.02, "elapsed_time": "1:21:50", "remaining_time": "0:44:01"} +{"current_steps": 1607, "total_steps": 2470, "loss": 1.4057, "lr": 3.2865797247055354e-06, "epoch": 6.506072874493928, "percentage": 65.06, "elapsed_time": "1:21:53", "remaining_time": "0:43:58"} +{"current_steps": 1608, "total_steps": 2470, "loss": 1.3496, "lr": 3.2799431755580814e-06, "epoch": 6.510121457489879, "percentage": 65.1, "elapsed_time": "1:21:57", "remaining_time": "0:43:55"} +{"current_steps": 1609, "total_steps": 2470, "loss": 1.3227, "lr": 3.2733100617017126e-06, "epoch": 6.51417004048583, "percentage": 65.14, "elapsed_time": "1:22:00", "remaining_time": "0:43:52"} +{"current_steps": 1610, "total_steps": 2470, "loss": 1.3552, "lr": 3.266680396384061e-06, "epoch": 6.518218623481781, "percentage": 65.18, "elapsed_time": "1:22:03", "remaining_time": "0:43:49"} +{"current_steps": 1611, "total_steps": 2470, "loss": 1.2943, "lr": 3.2600541928458664e-06, "epoch": 6.522267206477733, "percentage": 65.22, "elapsed_time": "1:22:06", "remaining_time": "0:43:46"} +{"current_steps": 1612, "total_steps": 2470, "loss": 1.132, "lr": 3.2534314643209597e-06, "epoch": 6.526315789473684, "percentage": 65.26, "elapsed_time": "1:22:09", "remaining_time": "0:43:43"} +{"current_steps": 1613, "total_steps": 2470, "loss": 1.2075, "lr": 3.2468122240362287e-06, "epoch": 6.530364372469636, "percentage": 65.3, "elapsed_time": "1:22:12", "remaining_time": "0:43:40"} +{"current_steps": 1614, "total_steps": 2470, "loss": 1.2648, "lr": 3.2401964852115954e-06, "epoch": 6.534412955465587, "percentage": 65.34, "elapsed_time": "1:22:15", "remaining_time": "0:43:37"} +{"current_steps": 1615, "total_steps": 2470, "loss": 1.5484, "lr": 3.233584261059991e-06, "epoch": 6.538461538461538, "percentage": 65.38, "elapsed_time": "1:22:18", "remaining_time": "0:43:34"} +{"current_steps": 1616, "total_steps": 2470, "loss": 1.486, "lr": 3.226975564787322e-06, "epoch": 6.5425101214574894, "percentage": 65.43, "elapsed_time": "1:22:21", "remaining_time": "0:43:31"} +{"current_steps": 1617, "total_steps": 2470, "loss": 2.0005, "lr": 3.2203704095924536e-06, "epoch": 6.5465587044534415, "percentage": 65.47, "elapsed_time": "1:22:24", "remaining_time": "0:43:28"} +{"current_steps": 1618, "total_steps": 2470, "loss": 1.7957, "lr": 3.213768808667177e-06, "epoch": 6.550607287449393, "percentage": 65.51, "elapsed_time": "1:22:27", "remaining_time": "0:43:25"} +{"current_steps": 1619, "total_steps": 2470, "loss": 2.144, "lr": 3.2071707751961838e-06, "epoch": 6.554655870445345, "percentage": 65.55, "elapsed_time": "1:22:30", "remaining_time": "0:43:22"} +{"current_steps": 1620, "total_steps": 2470, "loss": 1.3436, "lr": 3.200576322357044e-06, "epoch": 6.558704453441296, "percentage": 65.59, "elapsed_time": 
"1:22:33", "remaining_time": "0:43:19"} +{"current_steps": 1621, "total_steps": 2470, "loss": 1.2129, "lr": 3.1939854633201727e-06, "epoch": 6.562753036437247, "percentage": 65.63, "elapsed_time": "1:22:36", "remaining_time": "0:43:16"} +{"current_steps": 1622, "total_steps": 2470, "loss": 1.5973, "lr": 3.187398211248811e-06, "epoch": 6.566801619433198, "percentage": 65.67, "elapsed_time": "1:22:40", "remaining_time": "0:43:13"} +{"current_steps": 1623, "total_steps": 2470, "loss": 1.2471, "lr": 3.1808145792989914e-06, "epoch": 6.57085020242915, "percentage": 65.71, "elapsed_time": "1:22:43", "remaining_time": "0:43:10"} +{"current_steps": 1624, "total_steps": 2470, "loss": 1.4285, "lr": 3.1742345806195196e-06, "epoch": 6.574898785425101, "percentage": 65.75, "elapsed_time": "1:22:46", "remaining_time": "0:43:07"} +{"current_steps": 1625, "total_steps": 2470, "loss": 1.2586, "lr": 3.1676582283519454e-06, "epoch": 6.578947368421053, "percentage": 65.79, "elapsed_time": "1:22:49", "remaining_time": "0:43:04"} +{"current_steps": 1626, "total_steps": 2470, "loss": 1.3673, "lr": 3.1610855356305354e-06, "epoch": 6.582995951417004, "percentage": 65.83, "elapsed_time": "1:22:52", "remaining_time": "0:43:01"} +{"current_steps": 1627, "total_steps": 2470, "loss": 1.3681, "lr": 3.1545165155822453e-06, "epoch": 6.587044534412955, "percentage": 65.87, "elapsed_time": "1:22:55", "remaining_time": "0:42:57"} +{"current_steps": 1628, "total_steps": 2470, "loss": 1.3636, "lr": 3.1479511813267006e-06, "epoch": 6.5910931174089065, "percentage": 65.91, "elapsed_time": "1:22:58", "remaining_time": "0:42:54"} +{"current_steps": 1629, "total_steps": 2470, "loss": 1.0862, "lr": 3.141389545976159e-06, "epoch": 6.5951417004048585, "percentage": 65.95, "elapsed_time": "1:23:02", "remaining_time": "0:42:52"} +{"current_steps": 1630, "total_steps": 2470, "loss": 1.1727, "lr": 3.134831622635496e-06, "epoch": 6.59919028340081, "percentage": 65.99, "elapsed_time": "1:23:05", "remaining_time": "0:42:48"} +{"current_steps": 1631, "total_steps": 2470, "loss": 1.2508, "lr": 3.1282774244021717e-06, "epoch": 6.603238866396762, "percentage": 66.03, "elapsed_time": "1:23:08", "remaining_time": "0:42:45"} +{"current_steps": 1632, "total_steps": 2470, "loss": 1.0497, "lr": 3.1217269643662063e-06, "epoch": 6.607287449392713, "percentage": 66.07, "elapsed_time": "1:23:11", "remaining_time": "0:42:42"} +{"current_steps": 1633, "total_steps": 2470, "loss": 1.352, "lr": 3.115180255610154e-06, "epoch": 6.611336032388664, "percentage": 66.11, "elapsed_time": "1:23:14", "remaining_time": "0:42:39"} +{"current_steps": 1634, "total_steps": 2470, "loss": 1.3803, "lr": 3.1086373112090762e-06, "epoch": 6.615384615384615, "percentage": 66.15, "elapsed_time": "1:23:17", "remaining_time": "0:42:36"} +{"current_steps": 1635, "total_steps": 2470, "loss": 1.1187, "lr": 3.1020981442305187e-06, "epoch": 6.619433198380567, "percentage": 66.19, "elapsed_time": "1:23:20", "remaining_time": "0:42:33"} +{"current_steps": 1636, "total_steps": 2470, "loss": 1.4805, "lr": 3.095562767734481e-06, "epoch": 6.623481781376518, "percentage": 66.23, "elapsed_time": "1:23:23", "remaining_time": "0:42:30"} +{"current_steps": 1637, "total_steps": 2470, "loss": 1.2999, "lr": 3.089031194773392e-06, "epoch": 6.62753036437247, "percentage": 66.28, "elapsed_time": "1:23:26", "remaining_time": "0:42:27"} +{"current_steps": 1638, "total_steps": 2470, "loss": 1.5812, "lr": 3.082503438392086e-06, "epoch": 6.631578947368421, "percentage": 66.32, "elapsed_time": "1:23:29", 
"remaining_time": "0:42:24"} +{"current_steps": 1639, "total_steps": 2470, "loss": 1.1799, "lr": 3.0759795116277723e-06, "epoch": 6.635627530364372, "percentage": 66.36, "elapsed_time": "1:23:32", "remaining_time": "0:42:21"} +{"current_steps": 1640, "total_steps": 2470, "loss": 1.4498, "lr": 3.069459427510014e-06, "epoch": 6.6396761133603235, "percentage": 66.4, "elapsed_time": "1:23:35", "remaining_time": "0:42:18"} +{"current_steps": 1641, "total_steps": 2470, "loss": 1.3417, "lr": 3.0629431990607e-06, "epoch": 6.6437246963562755, "percentage": 66.44, "elapsed_time": "1:23:38", "remaining_time": "0:42:15"} +{"current_steps": 1642, "total_steps": 2470, "loss": 1.45, "lr": 3.056430839294015e-06, "epoch": 6.647773279352227, "percentage": 66.48, "elapsed_time": "1:23:41", "remaining_time": "0:42:12"} +{"current_steps": 1643, "total_steps": 2470, "loss": 1.2275, "lr": 3.049922361216422e-06, "epoch": 6.651821862348179, "percentage": 66.52, "elapsed_time": "1:23:44", "remaining_time": "0:42:09"} +{"current_steps": 1644, "total_steps": 2470, "loss": 1.4383, "lr": 3.043417777826627e-06, "epoch": 6.65587044534413, "percentage": 66.56, "elapsed_time": "1:23:47", "remaining_time": "0:42:06"} +{"current_steps": 1645, "total_steps": 2470, "loss": 1.2502, "lr": 3.036917102115561e-06, "epoch": 6.659919028340081, "percentage": 66.6, "elapsed_time": "1:23:50", "remaining_time": "0:42:03"} +{"current_steps": 1646, "total_steps": 2470, "loss": 1.4135, "lr": 3.0304203470663507e-06, "epoch": 6.663967611336032, "percentage": 66.64, "elapsed_time": "1:23:53", "remaining_time": "0:42:00"} +{"current_steps": 1647, "total_steps": 2470, "loss": 1.4064, "lr": 3.023927525654288e-06, "epoch": 6.668016194331984, "percentage": 66.68, "elapsed_time": "1:23:56", "remaining_time": "0:41:56"} +{"current_steps": 1648, "total_steps": 2470, "loss": 1.5635, "lr": 3.017438650846815e-06, "epoch": 6.672064777327935, "percentage": 66.72, "elapsed_time": "1:24:00", "remaining_time": "0:41:53"} +{"current_steps": 1649, "total_steps": 2470, "loss": 1.5306, "lr": 3.0109537356034856e-06, "epoch": 6.676113360323887, "percentage": 66.76, "elapsed_time": "1:24:03", "remaining_time": "0:41:50"} +{"current_steps": 1650, "total_steps": 2470, "loss": 1.3876, "lr": 3.0044727928759487e-06, "epoch": 6.680161943319838, "percentage": 66.8, "elapsed_time": "1:24:06", "remaining_time": "0:41:47"} +{"current_steps": 1651, "total_steps": 2470, "loss": 1.2497, "lr": 2.9979958356079195e-06, "epoch": 6.684210526315789, "percentage": 66.84, "elapsed_time": "1:24:09", "remaining_time": "0:41:44"} +{"current_steps": 1652, "total_steps": 2470, "loss": 1.3506, "lr": 2.991522876735154e-06, "epoch": 6.6882591093117405, "percentage": 66.88, "elapsed_time": "1:24:12", "remaining_time": "0:41:41"} +{"current_steps": 1653, "total_steps": 2470, "loss": 1.3676, "lr": 2.98505392918542e-06, "epoch": 6.6923076923076925, "percentage": 66.92, "elapsed_time": "1:24:15", "remaining_time": "0:41:38"} +{"current_steps": 1654, "total_steps": 2470, "loss": 1.2348, "lr": 2.978589005878476e-06, "epoch": 6.696356275303644, "percentage": 66.96, "elapsed_time": "1:24:18", "remaining_time": "0:41:35"} +{"current_steps": 1655, "total_steps": 2470, "loss": 1.6916, "lr": 2.9721281197260427e-06, "epoch": 6.700404858299595, "percentage": 67.0, "elapsed_time": "1:24:21", "remaining_time": "0:41:32"} +{"current_steps": 1656, "total_steps": 2470, "loss": 1.4917, "lr": 2.965671283631778e-06, "epoch": 6.704453441295547, "percentage": 67.04, "elapsed_time": "1:24:24", "remaining_time": 
"0:41:29"} +{"current_steps": 1657, "total_steps": 2470, "loss": 1.1089, "lr": 2.959218510491252e-06, "epoch": 6.708502024291498, "percentage": 67.09, "elapsed_time": "1:24:27", "remaining_time": "0:41:26"} +{"current_steps": 1658, "total_steps": 2470, "loss": 1.2314, "lr": 2.9527698131919156e-06, "epoch": 6.712550607287449, "percentage": 67.13, "elapsed_time": "1:24:30", "remaining_time": "0:41:23"} +{"current_steps": 1659, "total_steps": 2470, "loss": 1.3488, "lr": 2.9463252046130884e-06, "epoch": 6.716599190283401, "percentage": 67.17, "elapsed_time": "1:24:33", "remaining_time": "0:41:20"} +{"current_steps": 1660, "total_steps": 2470, "loss": 1.1124, "lr": 2.9398846976259136e-06, "epoch": 6.720647773279352, "percentage": 67.21, "elapsed_time": "1:24:36", "remaining_time": "0:41:17"} +{"current_steps": 1661, "total_steps": 2470, "loss": 1.3305, "lr": 2.9334483050933506e-06, "epoch": 6.724696356275303, "percentage": 67.25, "elapsed_time": "1:24:39", "remaining_time": "0:41:14"} +{"current_steps": 1662, "total_steps": 2470, "loss": 1.4987, "lr": 2.9270160398701387e-06, "epoch": 6.728744939271255, "percentage": 67.29, "elapsed_time": "1:24:42", "remaining_time": "0:41:11"} +{"current_steps": 1663, "total_steps": 2470, "loss": 1.2143, "lr": 2.920587914802772e-06, "epoch": 6.732793522267206, "percentage": 67.33, "elapsed_time": "1:24:45", "remaining_time": "0:41:07"} +{"current_steps": 1664, "total_steps": 2470, "loss": 1.3239, "lr": 2.91416394272948e-06, "epoch": 6.7368421052631575, "percentage": 67.37, "elapsed_time": "1:24:48", "remaining_time": "0:41:04"} +{"current_steps": 1665, "total_steps": 2470, "loss": 1.9473, "lr": 2.907744136480194e-06, "epoch": 6.7408906882591095, "percentage": 67.41, "elapsed_time": "1:24:51", "remaining_time": "0:41:01"} +{"current_steps": 1666, "total_steps": 2470, "loss": 1.4691, "lr": 2.901328508876531e-06, "epoch": 6.744939271255061, "percentage": 67.45, "elapsed_time": "1:24:54", "remaining_time": "0:40:58"} +{"current_steps": 1667, "total_steps": 2470, "loss": 1.2826, "lr": 2.894917072731753e-06, "epoch": 6.748987854251012, "percentage": 67.49, "elapsed_time": "1:24:57", "remaining_time": "0:40:55"} +{"current_steps": 1668, "total_steps": 2470, "loss": 1.1948, "lr": 2.88850984085076e-06, "epoch": 6.753036437246964, "percentage": 67.53, "elapsed_time": "1:25:01", "remaining_time": "0:40:52"} +{"current_steps": 1669, "total_steps": 2470, "loss": 1.3159, "lr": 2.8821068260300505e-06, "epoch": 6.757085020242915, "percentage": 67.57, "elapsed_time": "1:25:04", "remaining_time": "0:40:49"} +{"current_steps": 1670, "total_steps": 2470, "loss": 2.064, "lr": 2.8757080410577042e-06, "epoch": 6.761133603238866, "percentage": 67.61, "elapsed_time": "1:25:07", "remaining_time": "0:40:46"} +{"current_steps": 1671, "total_steps": 2470, "loss": 1.8202, "lr": 2.8693134987133464e-06, "epoch": 6.765182186234818, "percentage": 67.65, "elapsed_time": "1:25:10", "remaining_time": "0:40:43"} +{"current_steps": 1672, "total_steps": 2470, "loss": 1.7417, "lr": 2.8629232117681354e-06, "epoch": 6.769230769230769, "percentage": 67.69, "elapsed_time": "1:25:13", "remaining_time": "0:40:40"} +{"current_steps": 1673, "total_steps": 2470, "loss": 1.2534, "lr": 2.8565371929847286e-06, "epoch": 6.77327935222672, "percentage": 67.73, "elapsed_time": "1:25:16", "remaining_time": "0:40:37"} +{"current_steps": 1674, "total_steps": 2470, "loss": 1.5421, "lr": 2.8501554551172613e-06, "epoch": 6.777327935222672, "percentage": 67.77, "elapsed_time": "1:25:19", "remaining_time": "0:40:34"} 
+{"current_steps": 1675, "total_steps": 2470, "loss": 1.5263, "lr": 2.843778010911311e-06, "epoch": 6.781376518218623, "percentage": 67.81, "elapsed_time": "1:25:22", "remaining_time": "0:40:31"} +{"current_steps": 1676, "total_steps": 2470, "loss": 1.3327, "lr": 2.83740487310389e-06, "epoch": 6.7854251012145745, "percentage": 67.85, "elapsed_time": "1:25:26", "remaining_time": "0:40:28"} +{"current_steps": 1677, "total_steps": 2470, "loss": 1.2674, "lr": 2.8310360544234057e-06, "epoch": 6.7894736842105265, "percentage": 67.89, "elapsed_time": "1:25:29", "remaining_time": "0:40:25"} +{"current_steps": 1678, "total_steps": 2470, "loss": 1.2836, "lr": 2.8246715675896354e-06, "epoch": 6.793522267206478, "percentage": 67.94, "elapsed_time": "1:25:32", "remaining_time": "0:40:22"} +{"current_steps": 1679, "total_steps": 2470, "loss": 1.3156, "lr": 2.81831142531371e-06, "epoch": 6.797570850202429, "percentage": 67.98, "elapsed_time": "1:25:35", "remaining_time": "0:40:19"} +{"current_steps": 1680, "total_steps": 2470, "loss": 1.2068, "lr": 2.811955640298083e-06, "epoch": 6.801619433198381, "percentage": 68.02, "elapsed_time": "1:25:38", "remaining_time": "0:40:16"} +{"current_steps": 1681, "total_steps": 2470, "loss": 1.0997, "lr": 2.8056042252365046e-06, "epoch": 6.805668016194332, "percentage": 68.06, "elapsed_time": "1:25:41", "remaining_time": "0:40:13"} +{"current_steps": 1682, "total_steps": 2470, "loss": 1.4471, "lr": 2.7992571928139984e-06, "epoch": 6.809716599190283, "percentage": 68.1, "elapsed_time": "1:25:44", "remaining_time": "0:40:10"} +{"current_steps": 1683, "total_steps": 2470, "loss": 1.2595, "lr": 2.7929145557068303e-06, "epoch": 6.813765182186235, "percentage": 68.14, "elapsed_time": "1:25:47", "remaining_time": "0:40:07"} +{"current_steps": 1684, "total_steps": 2470, "loss": 1.1699, "lr": 2.786576326582493e-06, "epoch": 6.817813765182186, "percentage": 68.18, "elapsed_time": "1:25:50", "remaining_time": "0:40:04"} +{"current_steps": 1685, "total_steps": 2470, "loss": 2.2106, "lr": 2.780242518099675e-06, "epoch": 6.821862348178137, "percentage": 68.22, "elapsed_time": "1:25:53", "remaining_time": "0:40:01"} +{"current_steps": 1686, "total_steps": 2470, "loss": 3.2586, "lr": 2.7739131429082373e-06, "epoch": 6.825910931174089, "percentage": 68.26, "elapsed_time": "1:25:56", "remaining_time": "0:39:57"} +{"current_steps": 1687, "total_steps": 2470, "loss": 1.1889, "lr": 2.7675882136491795e-06, "epoch": 6.82995951417004, "percentage": 68.3, "elapsed_time": "1:25:59", "remaining_time": "0:39:54"} +{"current_steps": 1688, "total_steps": 2470, "loss": 1.1408, "lr": 2.761267742954629e-06, "epoch": 6.834008097165992, "percentage": 68.34, "elapsed_time": "1:26:03", "remaining_time": "0:39:51"} +{"current_steps": 1689, "total_steps": 2470, "loss": 1.1687, "lr": 2.7549517434478063e-06, "epoch": 6.838056680161944, "percentage": 68.38, "elapsed_time": "1:26:06", "remaining_time": "0:39:48"} +{"current_steps": 1690, "total_steps": 2470, "loss": 1.2449, "lr": 2.7486402277430026e-06, "epoch": 6.842105263157895, "percentage": 68.42, "elapsed_time": "1:26:09", "remaining_time": "0:39:45"} +{"current_steps": 1691, "total_steps": 2470, "loss": 1.0478, "lr": 2.7423332084455543e-06, "epoch": 6.846153846153846, "percentage": 68.46, "elapsed_time": "1:26:12", "remaining_time": "0:39:42"} +{"current_steps": 1692, "total_steps": 2470, "loss": 1.2496, "lr": 2.736030698151815e-06, "epoch": 6.850202429149798, "percentage": 68.5, "elapsed_time": "1:26:15", "remaining_time": "0:39:39"} +{"current_steps": 
1693, "total_steps": 2470, "loss": 1.287, "lr": 2.7297327094491344e-06, "epoch": 6.854251012145749, "percentage": 68.54, "elapsed_time": "1:26:18", "remaining_time": "0:39:36"} +{"current_steps": 1694, "total_steps": 2470, "loss": 1.2266, "lr": 2.723439254915834e-06, "epoch": 6.8582995951417, "percentage": 68.58, "elapsed_time": "1:26:21", "remaining_time": "0:39:33"} +{"current_steps": 1695, "total_steps": 2470, "loss": 1.2273, "lr": 2.717150347121177e-06, "epoch": 6.862348178137652, "percentage": 68.62, "elapsed_time": "1:26:24", "remaining_time": "0:39:30"} +{"current_steps": 1696, "total_steps": 2470, "loss": 1.2081, "lr": 2.710865998625348e-06, "epoch": 6.866396761133603, "percentage": 68.66, "elapsed_time": "1:26:27", "remaining_time": "0:39:27"} +{"current_steps": 1697, "total_steps": 2470, "loss": 1.6486, "lr": 2.704586221979422e-06, "epoch": 6.870445344129554, "percentage": 68.7, "elapsed_time": "1:26:30", "remaining_time": "0:39:24"} +{"current_steps": 1698, "total_steps": 2470, "loss": 1.5976, "lr": 2.698311029725346e-06, "epoch": 6.874493927125506, "percentage": 68.74, "elapsed_time": "1:26:33", "remaining_time": "0:39:21"} +{"current_steps": 1699, "total_steps": 2470, "loss": 1.3605, "lr": 2.6920404343959106e-06, "epoch": 6.8785425101214575, "percentage": 68.79, "elapsed_time": "1:26:36", "remaining_time": "0:39:18"} +{"current_steps": 1700, "total_steps": 2470, "loss": 1.2964, "lr": 2.6857744485147286e-06, "epoch": 6.882591093117409, "percentage": 68.83, "elapsed_time": "1:26:39", "remaining_time": "0:39:15"} +{"current_steps": 1701, "total_steps": 2470, "loss": 0.9267, "lr": 2.6795130845961993e-06, "epoch": 6.886639676113361, "percentage": 68.87, "elapsed_time": "1:26:42", "remaining_time": "0:39:12"} +{"current_steps": 1702, "total_steps": 2470, "loss": 1.4449, "lr": 2.673256355145499e-06, "epoch": 6.890688259109312, "percentage": 68.91, "elapsed_time": "1:26:45", "remaining_time": "0:39:08"} +{"current_steps": 1703, "total_steps": 2470, "loss": 1.4657, "lr": 2.667004272658541e-06, "epoch": 6.894736842105263, "percentage": 68.95, "elapsed_time": "1:26:48", "remaining_time": "0:39:05"} +{"current_steps": 1704, "total_steps": 2470, "loss": 1.2369, "lr": 2.660756849621962e-06, "epoch": 6.898785425101215, "percentage": 68.99, "elapsed_time": "1:26:51", "remaining_time": "0:39:02"} +{"current_steps": 1705, "total_steps": 2470, "loss": 1.2244, "lr": 2.6545140985130934e-06, "epoch": 6.902834008097166, "percentage": 69.03, "elapsed_time": "1:26:54", "remaining_time": "0:38:59"} +{"current_steps": 1706, "total_steps": 2470, "loss": 1.2811, "lr": 2.6482760317999338e-06, "epoch": 6.906882591093117, "percentage": 69.07, "elapsed_time": "1:26:57", "remaining_time": "0:38:56"} +{"current_steps": 1707, "total_steps": 2470, "loss": 1.0198, "lr": 2.642042661941129e-06, "epoch": 6.910931174089069, "percentage": 69.11, "elapsed_time": "1:27:00", "remaining_time": "0:38:53"} +{"current_steps": 1708, "total_steps": 2470, "loss": 1.1012, "lr": 2.635814001385938e-06, "epoch": 6.91497975708502, "percentage": 69.15, "elapsed_time": "1:27:03", "remaining_time": "0:38:50"} +{"current_steps": 1709, "total_steps": 2470, "loss": 1.1085, "lr": 2.629590062574221e-06, "epoch": 6.919028340080971, "percentage": 69.19, "elapsed_time": "1:27:07", "remaining_time": "0:38:47"} +{"current_steps": 1710, "total_steps": 2470, "loss": 1.431, "lr": 2.623370857936404e-06, "epoch": 6.923076923076923, "percentage": 69.23, "elapsed_time": "1:27:10", "remaining_time": "0:38:44"} +{"current_steps": 1711, "total_steps": 
2470, "loss": 1.2774, "lr": 2.6171563998934605e-06, "epoch": 6.9271255060728745, "percentage": 69.27, "elapsed_time": "1:27:13", "remaining_time": "0:38:41"} +{"current_steps": 1712, "total_steps": 2470, "loss": 1.2618, "lr": 2.610946700856885e-06, "epoch": 6.931174089068826, "percentage": 69.31, "elapsed_time": "1:27:16", "remaining_time": "0:38:38"} +{"current_steps": 1713, "total_steps": 2470, "loss": 1.0577, "lr": 2.604741773228661e-06, "epoch": 6.935222672064778, "percentage": 69.35, "elapsed_time": "1:27:19", "remaining_time": "0:38:35"} +{"current_steps": 1714, "total_steps": 2470, "loss": 1.0688, "lr": 2.5985416294012487e-06, "epoch": 6.939271255060729, "percentage": 69.39, "elapsed_time": "1:27:22", "remaining_time": "0:38:32"} +{"current_steps": 1715, "total_steps": 2470, "loss": 1.3636, "lr": 2.592346281757552e-06, "epoch": 6.94331983805668, "percentage": 69.43, "elapsed_time": "1:27:25", "remaining_time": "0:38:29"} +{"current_steps": 1716, "total_steps": 2470, "loss": 1.0952, "lr": 2.586155742670897e-06, "epoch": 6.947368421052632, "percentage": 69.47, "elapsed_time": "1:27:28", "remaining_time": "0:38:26"} +{"current_steps": 1717, "total_steps": 2470, "loss": 1.0229, "lr": 2.5799700245050074e-06, "epoch": 6.951417004048583, "percentage": 69.51, "elapsed_time": "1:27:31", "remaining_time": "0:38:22"} +{"current_steps": 1718, "total_steps": 2470, "loss": 1.3201, "lr": 2.5737891396139713e-06, "epoch": 6.955465587044534, "percentage": 69.55, "elapsed_time": "1:27:34", "remaining_time": "0:38:20"} +{"current_steps": 1719, "total_steps": 2470, "loss": 1.3962, "lr": 2.5676131003422317e-06, "epoch": 6.959514170040486, "percentage": 69.6, "elapsed_time": "1:27:38", "remaining_time": "0:38:17"} +{"current_steps": 1720, "total_steps": 2470, "loss": 1.346, "lr": 2.561441919024551e-06, "epoch": 6.963562753036437, "percentage": 69.64, "elapsed_time": "1:27:41", "remaining_time": "0:38:14"} +{"current_steps": 1721, "total_steps": 2470, "loss": 1.3755, "lr": 2.5552756079859904e-06, "epoch": 6.967611336032388, "percentage": 69.68, "elapsed_time": "1:27:44", "remaining_time": "0:38:11"} +{"current_steps": 1722, "total_steps": 2470, "loss": 1.2917, "lr": 2.549114179541884e-06, "epoch": 6.97165991902834, "percentage": 69.72, "elapsed_time": "1:27:47", "remaining_time": "0:38:07"} +{"current_steps": 1723, "total_steps": 2470, "loss": 1.3178, "lr": 2.542957645997811e-06, "epoch": 6.9757085020242915, "percentage": 69.76, "elapsed_time": "1:27:50", "remaining_time": "0:38:04"} +{"current_steps": 1724, "total_steps": 2470, "loss": 1.3848, "lr": 2.5368060196495785e-06, "epoch": 6.979757085020243, "percentage": 69.8, "elapsed_time": "1:27:53", "remaining_time": "0:38:01"} +{"current_steps": 1725, "total_steps": 2470, "loss": 1.4391, "lr": 2.530659312783192e-06, "epoch": 6.983805668016195, "percentage": 69.84, "elapsed_time": "1:27:56", "remaining_time": "0:37:58"} +{"current_steps": 1726, "total_steps": 2470, "loss": 1.2329, "lr": 2.5245175376748334e-06, "epoch": 6.987854251012146, "percentage": 69.88, "elapsed_time": "1:27:59", "remaining_time": "0:37:55"} +{"current_steps": 1727, "total_steps": 2470, "loss": 1.2466, "lr": 2.5183807065908296e-06, "epoch": 6.991902834008097, "percentage": 69.92, "elapsed_time": "1:28:02", "remaining_time": "0:37:52"} +{"current_steps": 1728, "total_steps": 2470, "loss": 1.5637, "lr": 2.512248831787639e-06, "epoch": 6.995951417004049, "percentage": 69.96, "elapsed_time": "1:28:05", "remaining_time": "0:37:49"} +{"current_steps": 1729, "total_steps": 2470, "loss": 
1.2677, "lr": 2.5061219255118186e-06, "epoch": 7.0, "percentage": 70.0, "elapsed_time": "1:28:08", "remaining_time": "0:37:46"} +{"current_steps": 1730, "total_steps": 2470, "loss": 1.3023, "lr": 2.5000000000000015e-06, "epoch": 7.004048582995951, "percentage": 70.04, "elapsed_time": "1:28:11", "remaining_time": "0:37:43"} +{"current_steps": 1731, "total_steps": 2470, "loss": 1.4651, "lr": 2.4938830674788756e-06, "epoch": 7.008097165991903, "percentage": 70.08, "elapsed_time": "1:28:14", "remaining_time": "0:37:40"} +{"current_steps": 1732, "total_steps": 2470, "loss": 1.2554, "lr": 2.4877711401651562e-06, "epoch": 7.012145748987854, "percentage": 70.12, "elapsed_time": "1:28:17", "remaining_time": "0:37:37"} +{"current_steps": 1733, "total_steps": 2470, "loss": 1.479, "lr": 2.4816642302655634e-06, "epoch": 7.016194331983805, "percentage": 70.16, "elapsed_time": "1:28:21", "remaining_time": "0:37:34"} +{"current_steps": 1734, "total_steps": 2470, "loss": 1.656, "lr": 2.475562349976791e-06, "epoch": 7.020242914979757, "percentage": 70.2, "elapsed_time": "1:28:24", "remaining_time": "0:37:31"} +{"current_steps": 1735, "total_steps": 2470, "loss": 1.5592, "lr": 2.4694655114854936e-06, "epoch": 7.0242914979757085, "percentage": 70.24, "elapsed_time": "1:28:27", "remaining_time": "0:37:28"} +{"current_steps": 1736, "total_steps": 2470, "loss": 1.2619, "lr": 2.4633737269682546e-06, "epoch": 7.02834008097166, "percentage": 70.28, "elapsed_time": "1:28:30", "remaining_time": "0:37:25"} +{"current_steps": 1737, "total_steps": 2470, "loss": 1.2686, "lr": 2.4572870085915628e-06, "epoch": 7.032388663967612, "percentage": 70.32, "elapsed_time": "1:28:33", "remaining_time": "0:37:22"} +{"current_steps": 1738, "total_steps": 2470, "loss": 1.4711, "lr": 2.4512053685117916e-06, "epoch": 7.036437246963563, "percentage": 70.36, "elapsed_time": "1:28:36", "remaining_time": "0:37:19"} +{"current_steps": 1739, "total_steps": 2470, "loss": 1.2784, "lr": 2.445128818875166e-06, "epoch": 7.040485829959514, "percentage": 70.4, "elapsed_time": "1:28:39", "remaining_time": "0:37:16"} +{"current_steps": 1740, "total_steps": 2470, "loss": 1.4178, "lr": 2.4390573718177507e-06, "epoch": 7.044534412955466, "percentage": 70.45, "elapsed_time": "1:28:42", "remaining_time": "0:37:12"} +{"current_steps": 1741, "total_steps": 2470, "loss": 1.2819, "lr": 2.4329910394654167e-06, "epoch": 7.048582995951417, "percentage": 70.49, "elapsed_time": "1:28:45", "remaining_time": "0:37:09"} +{"current_steps": 1742, "total_steps": 2470, "loss": 1.3334, "lr": 2.4269298339338205e-06, "epoch": 7.052631578947368, "percentage": 70.53, "elapsed_time": "1:28:48", "remaining_time": "0:37:06"} +{"current_steps": 1743, "total_steps": 2470, "loss": 1.1932, "lr": 2.4208737673283818e-06, "epoch": 7.05668016194332, "percentage": 70.57, "elapsed_time": "1:28:51", "remaining_time": "0:37:03"} +{"current_steps": 1744, "total_steps": 2470, "loss": 1.3354, "lr": 2.414822851744249e-06, "epoch": 7.060728744939271, "percentage": 70.61, "elapsed_time": "1:28:54", "remaining_time": "0:37:00"} +{"current_steps": 1745, "total_steps": 2470, "loss": 1.2747, "lr": 2.408777099266291e-06, "epoch": 7.064777327935222, "percentage": 70.65, "elapsed_time": "1:28:57", "remaining_time": "0:36:57"} +{"current_steps": 1746, "total_steps": 2470, "loss": 1.444, "lr": 2.4027365219690617e-06, "epoch": 7.068825910931174, "percentage": 70.69, "elapsed_time": "1:29:00", "remaining_time": "0:36:54"} +{"current_steps": 1747, "total_steps": 2470, "loss": 1.3478, "lr": 
2.3967011319167804e-06, "epoch": 7.0728744939271255, "percentage": 70.73, "elapsed_time": "1:29:03", "remaining_time": "0:36:51"} +{"current_steps": 1748, "total_steps": 2470, "loss": 1.3069, "lr": 2.3906709411633073e-06, "epoch": 7.076923076923077, "percentage": 70.77, "elapsed_time": "1:29:06", "remaining_time": "0:36:48"} +{"current_steps": 1749, "total_steps": 2470, "loss": 1.4103, "lr": 2.384645961752113e-06, "epoch": 7.080971659919029, "percentage": 70.81, "elapsed_time": "1:29:09", "remaining_time": "0:36:45"} +{"current_steps": 1750, "total_steps": 2470, "loss": 1.3698, "lr": 2.378626205716265e-06, "epoch": 7.08502024291498, "percentage": 70.85, "elapsed_time": "1:29:12", "remaining_time": "0:36:42"} +{"current_steps": 1751, "total_steps": 2470, "loss": 1.3153, "lr": 2.3726116850783987e-06, "epoch": 7.089068825910931, "percentage": 70.89, "elapsed_time": "1:29:15", "remaining_time": "0:36:39"} +{"current_steps": 1752, "total_steps": 2470, "loss": 1.3918, "lr": 2.3666024118506937e-06, "epoch": 7.093117408906883, "percentage": 70.93, "elapsed_time": "1:29:18", "remaining_time": "0:36:36"} +{"current_steps": 1753, "total_steps": 2470, "loss": 1.1493, "lr": 2.3605983980348446e-06, "epoch": 7.097165991902834, "percentage": 70.97, "elapsed_time": "1:29:21", "remaining_time": "0:36:33"} +{"current_steps": 1754, "total_steps": 2470, "loss": 1.3419, "lr": 2.354599655622049e-06, "epoch": 7.101214574898785, "percentage": 71.01, "elapsed_time": "1:29:25", "remaining_time": "0:36:30"} +{"current_steps": 1755, "total_steps": 2470, "loss": 1.2658, "lr": 2.3486061965929695e-06, "epoch": 7.105263157894737, "percentage": 71.05, "elapsed_time": "1:29:28", "remaining_time": "0:36:26"} +{"current_steps": 1756, "total_steps": 2470, "loss": 1.2778, "lr": 2.3426180329177217e-06, "epoch": 7.109311740890688, "percentage": 71.09, "elapsed_time": "1:29:31", "remaining_time": "0:36:23"} +{"current_steps": 1757, "total_steps": 2470, "loss": 1.2168, "lr": 2.3366351765558437e-06, "epoch": 7.113360323886639, "percentage": 71.13, "elapsed_time": "1:29:34", "remaining_time": "0:36:20"} +{"current_steps": 1758, "total_steps": 2470, "loss": 1.1279, "lr": 2.3306576394562748e-06, "epoch": 7.117408906882591, "percentage": 71.17, "elapsed_time": "1:29:37", "remaining_time": "0:36:17"} +{"current_steps": 1759, "total_steps": 2470, "loss": 1.2, "lr": 2.3246854335573303e-06, "epoch": 7.1214574898785425, "percentage": 71.21, "elapsed_time": "1:29:40", "remaining_time": "0:36:14"} +{"current_steps": 1760, "total_steps": 2470, "loss": 1.2204, "lr": 2.318718570786675e-06, "epoch": 7.125506072874494, "percentage": 71.26, "elapsed_time": "1:29:43", "remaining_time": "0:36:11"} +{"current_steps": 1761, "total_steps": 2470, "loss": 1.0923, "lr": 2.3127570630613064e-06, "epoch": 7.129554655870446, "percentage": 71.3, "elapsed_time": "1:29:46", "remaining_time": "0:36:08"} +{"current_steps": 1762, "total_steps": 2470, "loss": 1.4491, "lr": 2.3068009222875256e-06, "epoch": 7.133603238866397, "percentage": 71.34, "elapsed_time": "1:29:49", "remaining_time": "0:36:05"} +{"current_steps": 1763, "total_steps": 2470, "loss": 1.2557, "lr": 2.3008501603609147e-06, "epoch": 7.137651821862348, "percentage": 71.38, "elapsed_time": "1:29:52", "remaining_time": "0:36:02"} +{"current_steps": 1764, "total_steps": 2470, "loss": 1.023, "lr": 2.294904789166315e-06, "epoch": 7.1417004048583, "percentage": 71.42, "elapsed_time": "1:29:55", "remaining_time": "0:35:59"} +{"current_steps": 1765, "total_steps": 2470, "loss": 1.3439, "lr": 
2.288964820577797e-06, "epoch": 7.145748987854251, "percentage": 71.46, "elapsed_time": "1:29:58", "remaining_time": "0:35:56"} +{"current_steps": 1766, "total_steps": 2470, "loss": 1.182, "lr": 2.283030266458644e-06, "epoch": 7.149797570850202, "percentage": 71.5, "elapsed_time": "1:30:02", "remaining_time": "0:35:53"} +{"current_steps": 1767, "total_steps": 2470, "loss": 1.4117, "lr": 2.2771011386613268e-06, "epoch": 7.153846153846154, "percentage": 71.54, "elapsed_time": "1:30:05", "remaining_time": "0:35:50"} +{"current_steps": 1768, "total_steps": 2470, "loss": 1.4173, "lr": 2.2711774490274767e-06, "epoch": 7.157894736842105, "percentage": 71.58, "elapsed_time": "1:30:08", "remaining_time": "0:35:47"} +{"current_steps": 1769, "total_steps": 2470, "loss": 1.2429, "lr": 2.265259209387867e-06, "epoch": 7.161943319838056, "percentage": 71.62, "elapsed_time": "1:30:11", "remaining_time": "0:35:44"} +{"current_steps": 1770, "total_steps": 2470, "loss": 1.3316, "lr": 2.259346431562379e-06, "epoch": 7.165991902834008, "percentage": 71.66, "elapsed_time": "1:30:14", "remaining_time": "0:35:41"} +{"current_steps": 1771, "total_steps": 2470, "loss": 1.9136, "lr": 2.2534391273599937e-06, "epoch": 7.17004048582996, "percentage": 71.7, "elapsed_time": "1:30:17", "remaining_time": "0:35:38"} +{"current_steps": 1772, "total_steps": 2470, "loss": 1.1497, "lr": 2.2475373085787568e-06, "epoch": 7.174089068825911, "percentage": 71.74, "elapsed_time": "1:30:20", "remaining_time": "0:35:35"} +{"current_steps": 1773, "total_steps": 2470, "loss": 1.353, "lr": 2.2416409870057577e-06, "epoch": 7.178137651821863, "percentage": 71.78, "elapsed_time": "1:30:24", "remaining_time": "0:35:32"} +{"current_steps": 1774, "total_steps": 2470, "loss": 1.1492, "lr": 2.2357501744171105e-06, "epoch": 7.182186234817814, "percentage": 71.82, "elapsed_time": "1:30:27", "remaining_time": "0:35:29"} +{"current_steps": 1775, "total_steps": 2470, "loss": 1.3322, "lr": 2.229864882577921e-06, "epoch": 7.186234817813765, "percentage": 71.86, "elapsed_time": "1:30:30", "remaining_time": "0:35:26"} +{"current_steps": 1776, "total_steps": 2470, "loss": 1.3631, "lr": 2.2239851232422736e-06, "epoch": 7.190283400809717, "percentage": 71.9, "elapsed_time": "1:30:33", "remaining_time": "0:35:23"} +{"current_steps": 1777, "total_steps": 2470, "loss": 1.5276, "lr": 2.218110908153202e-06, "epoch": 7.194331983805668, "percentage": 71.94, "elapsed_time": "1:30:36", "remaining_time": "0:35:20"} +{"current_steps": 1778, "total_steps": 2470, "loss": 1.5831, "lr": 2.2122422490426676e-06, "epoch": 7.198380566801619, "percentage": 71.98, "elapsed_time": "1:30:39", "remaining_time": "0:35:17"} +{"current_steps": 1779, "total_steps": 2470, "loss": 1.2908, "lr": 2.206379157631532e-06, "epoch": 7.202429149797571, "percentage": 72.02, "elapsed_time": "1:30:42", "remaining_time": "0:35:13"} +{"current_steps": 1780, "total_steps": 2470, "loss": 1.6171, "lr": 2.200521645629542e-06, "epoch": 7.206477732793522, "percentage": 72.06, "elapsed_time": "1:30:45", "remaining_time": "0:35:10"} +{"current_steps": 1781, "total_steps": 2470, "loss": 1.6111, "lr": 2.194669724735296e-06, "epoch": 7.2105263157894735, "percentage": 72.11, "elapsed_time": "1:30:48", "remaining_time": "0:35:07"} +{"current_steps": 1782, "total_steps": 2470, "loss": 1.3854, "lr": 2.1888234066362303e-06, "epoch": 7.2145748987854255, "percentage": 72.15, "elapsed_time": "1:30:51", "remaining_time": "0:35:04"} +{"current_steps": 1783, "total_steps": 2470, "loss": 1.2693, "lr": 2.18298270300859e-06, 
"epoch": 7.218623481781377, "percentage": 72.19, "elapsed_time": "1:30:54", "remaining_time": "0:35:01"} +{"current_steps": 1784, "total_steps": 2470, "loss": 1.2078, "lr": 2.1771476255174056e-06, "epoch": 7.222672064777328, "percentage": 72.23, "elapsed_time": "1:30:57", "remaining_time": "0:34:58"} +{"current_steps": 1785, "total_steps": 2470, "loss": 1.413, "lr": 2.1713181858164746e-06, "epoch": 7.22672064777328, "percentage": 72.27, "elapsed_time": "1:31:00", "remaining_time": "0:34:55"} +{"current_steps": 1786, "total_steps": 2470, "loss": 1.1968, "lr": 2.165494395548329e-06, "epoch": 7.230769230769231, "percentage": 72.31, "elapsed_time": "1:31:03", "remaining_time": "0:34:52"} +{"current_steps": 1787, "total_steps": 2470, "loss": 1.4229, "lr": 2.159676266344222e-06, "epoch": 7.234817813765182, "percentage": 72.35, "elapsed_time": "1:31:06", "remaining_time": "0:34:49"} +{"current_steps": 1788, "total_steps": 2470, "loss": 1.3623, "lr": 2.1538638098241e-06, "epoch": 7.238866396761134, "percentage": 72.39, "elapsed_time": "1:31:09", "remaining_time": "0:34:46"} +{"current_steps": 1789, "total_steps": 2470, "loss": 1.396, "lr": 2.14805703759658e-06, "epoch": 7.242914979757085, "percentage": 72.43, "elapsed_time": "1:31:13", "remaining_time": "0:34:43"} +{"current_steps": 1790, "total_steps": 2470, "loss": 1.252, "lr": 2.1422559612589266e-06, "epoch": 7.246963562753036, "percentage": 72.47, "elapsed_time": "1:31:16", "remaining_time": "0:34:40"} +{"current_steps": 1791, "total_steps": 2470, "loss": 1.344, "lr": 2.136460592397025e-06, "epoch": 7.251012145748988, "percentage": 72.51, "elapsed_time": "1:31:19", "remaining_time": "0:34:37"} +{"current_steps": 1792, "total_steps": 2470, "loss": 1.291, "lr": 2.1306709425853663e-06, "epoch": 7.255060728744939, "percentage": 72.55, "elapsed_time": "1:31:22", "remaining_time": "0:34:34"} +{"current_steps": 1793, "total_steps": 2470, "loss": 1.25, "lr": 2.124887023387017e-06, "epoch": 7.2591093117408905, "percentage": 72.59, "elapsed_time": "1:31:25", "remaining_time": "0:34:31"} +{"current_steps": 1794, "total_steps": 2470, "loss": 1.0352, "lr": 2.1191088463535997e-06, "epoch": 7.2631578947368425, "percentage": 72.63, "elapsed_time": "1:31:28", "remaining_time": "0:34:28"} +{"current_steps": 1795, "total_steps": 2470, "loss": 1.3293, "lr": 2.113336423025269e-06, "epoch": 7.267206477732794, "percentage": 72.67, "elapsed_time": "1:31:31", "remaining_time": "0:34:24"} +{"current_steps": 1796, "total_steps": 2470, "loss": 1.3279, "lr": 2.1075697649306838e-06, "epoch": 7.271255060728745, "percentage": 72.71, "elapsed_time": "1:31:34", "remaining_time": "0:34:21"} +{"current_steps": 1797, "total_steps": 2470, "loss": 1.4052, "lr": 2.1018088835869943e-06, "epoch": 7.275303643724697, "percentage": 72.75, "elapsed_time": "1:31:37", "remaining_time": "0:34:18"} +{"current_steps": 1798, "total_steps": 2470, "loss": 1.3052, "lr": 2.0960537904998113e-06, "epoch": 7.279352226720648, "percentage": 72.79, "elapsed_time": "1:31:40", "remaining_time": "0:34:15"} +{"current_steps": 1799, "total_steps": 2470, "loss": 0.9953, "lr": 2.0903044971631854e-06, "epoch": 7.283400809716599, "percentage": 72.83, "elapsed_time": "1:31:43", "remaining_time": "0:34:12"} +{"current_steps": 1800, "total_steps": 2470, "loss": 1.1524, "lr": 2.084561015059585e-06, "epoch": 7.287449392712551, "percentage": 72.87, "elapsed_time": "1:31:46", "remaining_time": "0:34:09"} +{"current_steps": 1801, "total_steps": 2470, "loss": 1.019, "lr": 2.0788233556598688e-06, "epoch": 
7.291497975708502, "percentage": 72.91, "elapsed_time": "1:31:49", "remaining_time": "0:34:06"} +{"current_steps": 1802, "total_steps": 2470, "loss": 1.2347, "lr": 2.0730915304232692e-06, "epoch": 7.295546558704453, "percentage": 72.96, "elapsed_time": "1:31:52", "remaining_time": "0:34:03"} +{"current_steps": 1803, "total_steps": 2470, "loss": 1.4674, "lr": 2.067365550797367e-06, "epoch": 7.299595141700405, "percentage": 73.0, "elapsed_time": "1:31:55", "remaining_time": "0:34:00"} +{"current_steps": 1804, "total_steps": 2470, "loss": 1.0762, "lr": 2.061645428218067e-06, "epoch": 7.303643724696356, "percentage": 73.04, "elapsed_time": "1:31:58", "remaining_time": "0:33:57"} +{"current_steps": 1805, "total_steps": 2470, "loss": 1.1289, "lr": 2.055931174109579e-06, "epoch": 7.3076923076923075, "percentage": 73.08, "elapsed_time": "1:32:01", "remaining_time": "0:33:54"} +{"current_steps": 1806, "total_steps": 2470, "loss": 1.1799, "lr": 2.050222799884387e-06, "epoch": 7.3117408906882595, "percentage": 73.12, "elapsed_time": "1:32:04", "remaining_time": "0:33:51"} +{"current_steps": 1807, "total_steps": 2470, "loss": 1.0631, "lr": 2.044520316943235e-06, "epoch": 7.315789473684211, "percentage": 73.16, "elapsed_time": "1:32:07", "remaining_time": "0:33:48"} +{"current_steps": 1808, "total_steps": 2470, "loss": 1.03, "lr": 2.0388237366751005e-06, "epoch": 7.319838056680162, "percentage": 73.2, "elapsed_time": "1:32:10", "remaining_time": "0:33:45"} +{"current_steps": 1809, "total_steps": 2470, "loss": 1.0775, "lr": 2.0331330704571746e-06, "epoch": 7.323886639676114, "percentage": 73.24, "elapsed_time": "1:32:13", "remaining_time": "0:33:42"} +{"current_steps": 1810, "total_steps": 2470, "loss": 1.0956, "lr": 2.027448329654832e-06, "epoch": 7.327935222672065, "percentage": 73.28, "elapsed_time": "1:32:17", "remaining_time": "0:33:39"} +{"current_steps": 1811, "total_steps": 2470, "loss": 1.132, "lr": 2.02176952562162e-06, "epoch": 7.331983805668016, "percentage": 73.32, "elapsed_time": "1:32:20", "remaining_time": "0:33:35"} +{"current_steps": 1812, "total_steps": 2470, "loss": 1.235, "lr": 2.0160966696992195e-06, "epoch": 7.336032388663968, "percentage": 73.36, "elapsed_time": "1:32:23", "remaining_time": "0:33:32"} +{"current_steps": 1813, "total_steps": 2470, "loss": 1.1607, "lr": 2.0104297732174403e-06, "epoch": 7.340080971659919, "percentage": 73.4, "elapsed_time": "1:32:26", "remaining_time": "0:33:29"} +{"current_steps": 1814, "total_steps": 2470, "loss": 1.069, "lr": 2.004768847494186e-06, "epoch": 7.34412955465587, "percentage": 73.44, "elapsed_time": "1:32:29", "remaining_time": "0:33:26"} +{"current_steps": 1815, "total_steps": 2470, "loss": 1.2088, "lr": 1.999113903835438e-06, "epoch": 7.348178137651822, "percentage": 73.48, "elapsed_time": "1:32:32", "remaining_time": "0:33:23"} +{"current_steps": 1816, "total_steps": 2470, "loss": 1.215, "lr": 1.9934649535352286e-06, "epoch": 7.352226720647773, "percentage": 73.52, "elapsed_time": "1:32:35", "remaining_time": "0:33:20"} +{"current_steps": 1817, "total_steps": 2470, "loss": 0.8957, "lr": 1.987822007875617e-06, "epoch": 7.3562753036437245, "percentage": 73.56, "elapsed_time": "1:32:39", "remaining_time": "0:33:17"} +{"current_steps": 1818, "total_steps": 2470, "loss": 1.2878, "lr": 1.982185078126676e-06, "epoch": 7.3603238866396765, "percentage": 73.6, "elapsed_time": "1:32:42", "remaining_time": "0:33:14"} +{"current_steps": 1819, "total_steps": 2470, "loss": 1.3594, "lr": 1.9765541755464605e-06, "epoch": 7.364372469635628, 
"percentage": 73.64, "elapsed_time": "1:32:45", "remaining_time": "0:33:11"} +{"current_steps": 1820, "total_steps": 2470, "loss": 1.2518, "lr": 1.9709293113809876e-06, "epoch": 7.368421052631579, "percentage": 73.68, "elapsed_time": "1:32:48", "remaining_time": "0:33:08"} +{"current_steps": 1821, "total_steps": 2470, "loss": 1.3044, "lr": 1.965310496864217e-06, "epoch": 7.372469635627531, "percentage": 73.72, "elapsed_time": "1:32:51", "remaining_time": "0:33:05"} +{"current_steps": 1822, "total_steps": 2470, "loss": 1.0096, "lr": 1.9596977432180212e-06, "epoch": 7.376518218623482, "percentage": 73.77, "elapsed_time": "1:32:54", "remaining_time": "0:33:02"} +{"current_steps": 1823, "total_steps": 2470, "loss": 1.1521, "lr": 1.954091061652172e-06, "epoch": 7.380566801619433, "percentage": 73.81, "elapsed_time": "1:32:57", "remaining_time": "0:32:59"} +{"current_steps": 1824, "total_steps": 2470, "loss": 0.9629, "lr": 1.948490463364313e-06, "epoch": 7.384615384615385, "percentage": 73.85, "elapsed_time": "1:33:00", "remaining_time": "0:32:56"} +{"current_steps": 1825, "total_steps": 2470, "loss": 1.0332, "lr": 1.942895959539939e-06, "epoch": 7.388663967611336, "percentage": 73.89, "elapsed_time": "1:33:03", "remaining_time": "0:32:53"} +{"current_steps": 1826, "total_steps": 2470, "loss": 1.219, "lr": 1.9373075613523728e-06, "epoch": 7.392712550607287, "percentage": 73.93, "elapsed_time": "1:33:06", "remaining_time": "0:32:50"} +{"current_steps": 1827, "total_steps": 2470, "loss": 1.0144, "lr": 1.9317252799627393e-06, "epoch": 7.396761133603239, "percentage": 73.97, "elapsed_time": "1:33:09", "remaining_time": "0:32:47"} +{"current_steps": 1828, "total_steps": 2470, "loss": 1.0604, "lr": 1.9261491265199526e-06, "epoch": 7.40080971659919, "percentage": 74.01, "elapsed_time": "1:33:12", "remaining_time": "0:32:44"} +{"current_steps": 1829, "total_steps": 2470, "loss": 1.0906, "lr": 1.920579112160685e-06, "epoch": 7.4048582995951415, "percentage": 74.05, "elapsed_time": "1:33:16", "remaining_time": "0:32:41"} +{"current_steps": 1830, "total_steps": 2470, "loss": 1.1866, "lr": 1.915015248009348e-06, "epoch": 7.4089068825910935, "percentage": 74.09, "elapsed_time": "1:33:19", "remaining_time": "0:32:38"} +{"current_steps": 1831, "total_steps": 2470, "loss": 1.0234, "lr": 1.9094575451780727e-06, "epoch": 7.412955465587045, "percentage": 74.13, "elapsed_time": "1:33:22", "remaining_time": "0:32:35"} +{"current_steps": 1832, "total_steps": 2470, "loss": 1.3152, "lr": 1.903906014766681e-06, "epoch": 7.417004048582996, "percentage": 74.17, "elapsed_time": "1:33:25", "remaining_time": "0:32:32"} +{"current_steps": 1833, "total_steps": 2470, "loss": 1.3466, "lr": 1.8983606678626665e-06, "epoch": 7.421052631578947, "percentage": 74.21, "elapsed_time": "1:33:28", "remaining_time": "0:32:28"} +{"current_steps": 1834, "total_steps": 2470, "loss": 1.3615, "lr": 1.8928215155411773e-06, "epoch": 7.425101214574899, "percentage": 74.25, "elapsed_time": "1:33:31", "remaining_time": "0:32:25"} +{"current_steps": 1835, "total_steps": 2470, "loss": 1.3325, "lr": 1.8872885688649879e-06, "epoch": 7.42914979757085, "percentage": 74.29, "elapsed_time": "1:33:34", "remaining_time": "0:32:22"} +{"current_steps": 1836, "total_steps": 2470, "loss": 1.5126, "lr": 1.8817618388844783e-06, "epoch": 7.433198380566802, "percentage": 74.33, "elapsed_time": "1:33:37", "remaining_time": "0:32:19"} +{"current_steps": 1837, "total_steps": 2470, "loss": 1.2967, "lr": 1.8762413366376159e-06, "epoch": 7.437246963562753, "percentage": 
74.37, "elapsed_time": "1:33:40", "remaining_time": "0:32:16"} +{"current_steps": 1838, "total_steps": 2470, "loss": 1.2391, "lr": 1.8707270731499223e-06, "epoch": 7.441295546558704, "percentage": 74.41, "elapsed_time": "1:33:43", "remaining_time": "0:32:13"} +{"current_steps": 1839, "total_steps": 2470, "loss": 1.4892, "lr": 1.865219059434467e-06, "epoch": 7.445344129554655, "percentage": 74.45, "elapsed_time": "1:33:46", "remaining_time": "0:32:10"} +{"current_steps": 1840, "total_steps": 2470, "loss": 1.2865, "lr": 1.8597173064918333e-06, "epoch": 7.449392712550607, "percentage": 74.49, "elapsed_time": "1:33:49", "remaining_time": "0:32:07"} +{"current_steps": 1841, "total_steps": 2470, "loss": 1.2753, "lr": 1.854221825310103e-06, "epoch": 7.4534412955465585, "percentage": 74.53, "elapsed_time": "1:33:52", "remaining_time": "0:32:04"} +{"current_steps": 1842, "total_steps": 2470, "loss": 1.6209, "lr": 1.8487326268648314e-06, "epoch": 7.4574898785425106, "percentage": 74.57, "elapsed_time": "1:33:55", "remaining_time": "0:32:01"} +{"current_steps": 1843, "total_steps": 2470, "loss": 1.7021, "lr": 1.8432497221190227e-06, "epoch": 7.461538461538462, "percentage": 74.62, "elapsed_time": "1:33:58", "remaining_time": "0:31:58"} +{"current_steps": 1844, "total_steps": 2470, "loss": 1.4113, "lr": 1.8377731220231144e-06, "epoch": 7.465587044534413, "percentage": 74.66, "elapsed_time": "1:34:01", "remaining_time": "0:31:55"} +{"current_steps": 1845, "total_steps": 2470, "loss": 1.4683, "lr": 1.832302837514952e-06, "epoch": 7.469635627530364, "percentage": 74.7, "elapsed_time": "1:34:04", "remaining_time": "0:31:52"} +{"current_steps": 1846, "total_steps": 2470, "loss": 1.4386, "lr": 1.8268388795197683e-06, "epoch": 7.473684210526316, "percentage": 74.74, "elapsed_time": "1:34:07", "remaining_time": "0:31:49"} +{"current_steps": 1847, "total_steps": 2470, "loss": 1.4409, "lr": 1.8213812589501611e-06, "epoch": 7.477732793522267, "percentage": 74.78, "elapsed_time": "1:34:10", "remaining_time": "0:31:46"} +{"current_steps": 1848, "total_steps": 2470, "loss": 1.357, "lr": 1.815929986706066e-06, "epoch": 7.481781376518219, "percentage": 74.82, "elapsed_time": "1:34:13", "remaining_time": "0:31:42"} +{"current_steps": 1849, "total_steps": 2470, "loss": 1.3014, "lr": 1.8104850736747458e-06, "epoch": 7.48582995951417, "percentage": 74.86, "elapsed_time": "1:34:17", "remaining_time": "0:31:39"} +{"current_steps": 1850, "total_steps": 2470, "loss": 1.2541, "lr": 1.8050465307307602e-06, "epoch": 7.489878542510121, "percentage": 74.9, "elapsed_time": "1:34:20", "remaining_time": "0:31:36"} +{"current_steps": 1851, "total_steps": 2470, "loss": 1.2069, "lr": 1.7996143687359475e-06, "epoch": 7.493927125506072, "percentage": 74.94, "elapsed_time": "1:34:23", "remaining_time": "0:31:33"} +{"current_steps": 1852, "total_steps": 2470, "loss": 1.1389, "lr": 1.7941885985394025e-06, "epoch": 7.497975708502024, "percentage": 74.98, "elapsed_time": "1:34:26", "remaining_time": "0:31:30"} +{"current_steps": 1853, "total_steps": 2470, "loss": 0.96, "lr": 1.78876923097745e-06, "epoch": 7.502024291497976, "percentage": 75.02, "elapsed_time": "1:34:29", "remaining_time": "0:31:27"} +{"current_steps": 1854, "total_steps": 2470, "loss": 1.3238, "lr": 1.783356276873633e-06, "epoch": 7.506072874493928, "percentage": 75.06, "elapsed_time": "1:34:32", "remaining_time": "0:31:24"} +{"current_steps": 1855, "total_steps": 2470, "loss": 1.2515, "lr": 1.7779497470386826e-06, "epoch": 7.510121457489879, "percentage": 75.1, 
"elapsed_time": "1:34:35", "remaining_time": "0:31:21"} +{"current_steps": 1856, "total_steps": 2470, "loss": 1.2487, "lr": 1.7725496522704998e-06, "epoch": 7.51417004048583, "percentage": 75.14, "elapsed_time": "1:34:38", "remaining_time": "0:31:18"} +{"current_steps": 1857, "total_steps": 2470, "loss": 1.2647, "lr": 1.7671560033541364e-06, "epoch": 7.518218623481781, "percentage": 75.18, "elapsed_time": "1:34:41", "remaining_time": "0:31:15"} +{"current_steps": 1858, "total_steps": 2470, "loss": 1.1983, "lr": 1.7617688110617653e-06, "epoch": 7.522267206477733, "percentage": 75.22, "elapsed_time": "1:34:44", "remaining_time": "0:31:12"} +{"current_steps": 1859, "total_steps": 2470, "loss": 1.037, "lr": 1.7563880861526656e-06, "epoch": 7.526315789473684, "percentage": 75.26, "elapsed_time": "1:34:47", "remaining_time": "0:31:09"} +{"current_steps": 1860, "total_steps": 2470, "loss": 1.125, "lr": 1.7510138393732029e-06, "epoch": 7.530364372469636, "percentage": 75.3, "elapsed_time": "1:34:50", "remaining_time": "0:31:06"} +{"current_steps": 1861, "total_steps": 2470, "loss": 1.1532, "lr": 1.7456460814568032e-06, "epoch": 7.534412955465587, "percentage": 75.34, "elapsed_time": "1:34:53", "remaining_time": "0:31:03"} +{"current_steps": 1862, "total_steps": 2470, "loss": 1.447, "lr": 1.7402848231239317e-06, "epoch": 7.538461538461538, "percentage": 75.38, "elapsed_time": "1:34:57", "remaining_time": "0:31:00"} +{"current_steps": 1863, "total_steps": 2470, "loss": 1.414, "lr": 1.7349300750820758e-06, "epoch": 7.5425101214574894, "percentage": 75.43, "elapsed_time": "1:35:00", "remaining_time": "0:30:57"} +{"current_steps": 1864, "total_steps": 2470, "loss": 1.9394, "lr": 1.7295818480257148e-06, "epoch": 7.5465587044534415, "percentage": 75.47, "elapsed_time": "1:35:03", "remaining_time": "0:30:54"} +{"current_steps": 1865, "total_steps": 2470, "loss": 1.6974, "lr": 1.7242401526363095e-06, "epoch": 7.550607287449393, "percentage": 75.51, "elapsed_time": "1:35:06", "remaining_time": "0:30:51"} +{"current_steps": 1866, "total_steps": 2470, "loss": 2.0666, "lr": 1.7189049995822748e-06, "epoch": 7.554655870445345, "percentage": 75.55, "elapsed_time": "1:35:09", "remaining_time": "0:30:48"} +{"current_steps": 1867, "total_steps": 2470, "loss": 1.2566, "lr": 1.7135763995189574e-06, "epoch": 7.558704453441296, "percentage": 75.59, "elapsed_time": "1:35:12", "remaining_time": "0:30:45"} +{"current_steps": 1868, "total_steps": 2470, "loss": 1.1258, "lr": 1.70825436308862e-06, "epoch": 7.562753036437247, "percentage": 75.63, "elapsed_time": "1:35:15", "remaining_time": "0:30:41"} +{"current_steps": 1869, "total_steps": 2470, "loss": 1.511, "lr": 1.70293890092041e-06, "epoch": 7.566801619433198, "percentage": 75.67, "elapsed_time": "1:35:19", "remaining_time": "0:30:39"} +{"current_steps": 1870, "total_steps": 2470, "loss": 1.1713, "lr": 1.6976300236303505e-06, "epoch": 7.57085020242915, "percentage": 75.71, "elapsed_time": "1:35:22", "remaining_time": "0:30:35"} +{"current_steps": 1871, "total_steps": 2470, "loss": 1.3418, "lr": 1.692327741821312e-06, "epoch": 7.574898785425101, "percentage": 75.75, "elapsed_time": "1:35:25", "remaining_time": "0:30:32"} +{"current_steps": 1872, "total_steps": 2470, "loss": 1.1787, "lr": 1.6870320660829908e-06, "epoch": 7.578947368421053, "percentage": 75.79, "elapsed_time": "1:35:28", "remaining_time": "0:30:29"} +{"current_steps": 1873, "total_steps": 2470, "loss": 1.2772, "lr": 1.6817430069918939e-06, "epoch": 7.582995951417004, "percentage": 75.83, "elapsed_time": 
"1:35:31", "remaining_time": "0:30:26"} +{"current_steps": 1874, "total_steps": 2470, "loss": 1.2858, "lr": 1.676460575111306e-06, "epoch": 7.587044534412955, "percentage": 75.87, "elapsed_time": "1:35:34", "remaining_time": "0:30:23"} +{"current_steps": 1875, "total_steps": 2470, "loss": 1.2792, "lr": 1.671184780991283e-06, "epoch": 7.5910931174089065, "percentage": 75.91, "elapsed_time": "1:35:37", "remaining_time": "0:30:20"} +{"current_steps": 1876, "total_steps": 2470, "loss": 0.9987, "lr": 1.6659156351686202e-06, "epoch": 7.5951417004048585, "percentage": 75.95, "elapsed_time": "1:35:40", "remaining_time": "0:30:17"} +{"current_steps": 1877, "total_steps": 2470, "loss": 1.1001, "lr": 1.6606531481668364e-06, "epoch": 7.59919028340081, "percentage": 75.99, "elapsed_time": "1:35:43", "remaining_time": "0:30:14"} +{"current_steps": 1878, "total_steps": 2470, "loss": 1.1799, "lr": 1.6553973304961528e-06, "epoch": 7.603238866396762, "percentage": 76.03, "elapsed_time": "1:35:46", "remaining_time": "0:30:11"} +{"current_steps": 1879, "total_steps": 2470, "loss": 0.9594, "lr": 1.6501481926534658e-06, "epoch": 7.607287449392713, "percentage": 76.07, "elapsed_time": "1:35:49", "remaining_time": "0:30:08"} +{"current_steps": 1880, "total_steps": 2470, "loss": 1.2521, "lr": 1.6449057451223354e-06, "epoch": 7.611336032388664, "percentage": 76.11, "elapsed_time": "1:35:52", "remaining_time": "0:30:05"} +{"current_steps": 1881, "total_steps": 2470, "loss": 1.2949, "lr": 1.639669998372958e-06, "epoch": 7.615384615384615, "percentage": 76.15, "elapsed_time": "1:35:55", "remaining_time": "0:30:02"} +{"current_steps": 1882, "total_steps": 2470, "loss": 1.0393, "lr": 1.6344409628621482e-06, "epoch": 7.619433198380567, "percentage": 76.19, "elapsed_time": "1:35:58", "remaining_time": "0:29:59"} +{"current_steps": 1883, "total_steps": 2470, "loss": 1.3907, "lr": 1.6292186490333172e-06, "epoch": 7.623481781376518, "percentage": 76.23, "elapsed_time": "1:36:01", "remaining_time": "0:29:56"} +{"current_steps": 1884, "total_steps": 2470, "loss": 1.2266, "lr": 1.6240030673164492e-06, "epoch": 7.62753036437247, "percentage": 76.28, "elapsed_time": "1:36:04", "remaining_time": "0:29:53"} +{"current_steps": 1885, "total_steps": 2470, "loss": 1.4968, "lr": 1.6187942281280838e-06, "epoch": 7.631578947368421, "percentage": 76.32, "elapsed_time": "1:36:07", "remaining_time": "0:29:49"} +{"current_steps": 1886, "total_steps": 2470, "loss": 1.0917, "lr": 1.6135921418712959e-06, "epoch": 7.635627530364372, "percentage": 76.36, "elapsed_time": "1:36:10", "remaining_time": "0:29:46"} +{"current_steps": 1887, "total_steps": 2470, "loss": 1.3789, "lr": 1.6083968189356724e-06, "epoch": 7.6396761133603235, "percentage": 76.4, "elapsed_time": "1:36:13", "remaining_time": "0:29:43"} +{"current_steps": 1888, "total_steps": 2470, "loss": 1.2638, "lr": 1.6032082696972945e-06, "epoch": 7.6437246963562755, "percentage": 76.44, "elapsed_time": "1:36:16", "remaining_time": "0:29:40"} +{"current_steps": 1889, "total_steps": 2470, "loss": 1.3732, "lr": 1.5980265045187139e-06, "epoch": 7.647773279352227, "percentage": 76.48, "elapsed_time": "1:36:19", "remaining_time": "0:29:37"} +{"current_steps": 1890, "total_steps": 2470, "loss": 1.1536, "lr": 1.5928515337489292e-06, "epoch": 7.651821862348179, "percentage": 76.52, "elapsed_time": "1:36:22", "remaining_time": "0:29:34"} +{"current_steps": 1891, "total_steps": 2470, "loss": 1.3585, "lr": 1.5876833677233754e-06, "epoch": 7.65587044534413, "percentage": 76.56, "elapsed_time": "1:36:25", 
"remaining_time": "0:29:31"} +{"current_steps": 1892, "total_steps": 2470, "loss": 1.1643, "lr": 1.5825220167638945e-06, "epoch": 7.659919028340081, "percentage": 76.6, "elapsed_time": "1:36:28", "remaining_time": "0:29:28"} +{"current_steps": 1893, "total_steps": 2470, "loss": 1.3335, "lr": 1.5773674911787157e-06, "epoch": 7.663967611336032, "percentage": 76.64, "elapsed_time": "1:36:31", "remaining_time": "0:29:25"} +{"current_steps": 1894, "total_steps": 2470, "loss": 1.3156, "lr": 1.5722198012624418e-06, "epoch": 7.668016194331984, "percentage": 76.68, "elapsed_time": "1:36:35", "remaining_time": "0:29:22"} +{"current_steps": 1895, "total_steps": 2470, "loss": 1.4919, "lr": 1.567078957296016e-06, "epoch": 7.672064777327935, "percentage": 76.72, "elapsed_time": "1:36:38", "remaining_time": "0:29:19"} +{"current_steps": 1896, "total_steps": 2470, "loss": 1.4698, "lr": 1.5619449695467142e-06, "epoch": 7.676113360323887, "percentage": 76.76, "elapsed_time": "1:36:41", "remaining_time": "0:29:16"} +{"current_steps": 1897, "total_steps": 2470, "loss": 1.3083, "lr": 1.556817848268118e-06, "epoch": 7.680161943319838, "percentage": 76.8, "elapsed_time": "1:36:44", "remaining_time": "0:29:13"} +{"current_steps": 1898, "total_steps": 2470, "loss": 1.1861, "lr": 1.5516976037000941e-06, "epoch": 7.684210526315789, "percentage": 76.84, "elapsed_time": "1:36:47", "remaining_time": "0:29:10"} +{"current_steps": 1899, "total_steps": 2470, "loss": 1.2721, "lr": 1.5465842460687786e-06, "epoch": 7.6882591093117405, "percentage": 76.88, "elapsed_time": "1:36:50", "remaining_time": "0:29:07"} +{"current_steps": 1900, "total_steps": 2470, "loss": 1.2911, "lr": 1.5414777855865466e-06, "epoch": 7.6923076923076925, "percentage": 76.92, "elapsed_time": "1:36:53", "remaining_time": "0:29:03"} +{"current_steps": 1901, "total_steps": 2470, "loss": 1.1648, "lr": 1.5363782324520033e-06, "epoch": 7.696356275303644, "percentage": 76.96, "elapsed_time": "1:36:56", "remaining_time": "0:29:00"} +{"current_steps": 1902, "total_steps": 2470, "loss": 1.6084, "lr": 1.5312855968499574e-06, "epoch": 7.700404858299595, "percentage": 77.0, "elapsed_time": "1:36:59", "remaining_time": "0:28:57"} +{"current_steps": 1903, "total_steps": 2470, "loss": 1.4184, "lr": 1.5261998889514017e-06, "epoch": 7.704453441295547, "percentage": 77.04, "elapsed_time": "1:37:02", "remaining_time": "0:28:54"} +{"current_steps": 1904, "total_steps": 2470, "loss": 1.0412, "lr": 1.5211211189134955e-06, "epoch": 7.708502024291498, "percentage": 77.09, "elapsed_time": "1:37:05", "remaining_time": "0:28:51"} +{"current_steps": 1905, "total_steps": 2470, "loss": 1.1573, "lr": 1.516049296879535e-06, "epoch": 7.712550607287449, "percentage": 77.13, "elapsed_time": "1:37:08", "remaining_time": "0:28:48"} +{"current_steps": 1906, "total_steps": 2470, "loss": 1.2783, "lr": 1.510984432978947e-06, "epoch": 7.716599190283401, "percentage": 77.17, "elapsed_time": "1:37:11", "remaining_time": "0:28:45"} +{"current_steps": 1907, "total_steps": 2470, "loss": 1.0288, "lr": 1.5059265373272574e-06, "epoch": 7.720647773279352, "percentage": 77.21, "elapsed_time": "1:37:14", "remaining_time": "0:28:42"} +{"current_steps": 1908, "total_steps": 2470, "loss": 1.2684, "lr": 1.5008756200260776e-06, "epoch": 7.724696356275303, "percentage": 77.25, "elapsed_time": "1:37:17", "remaining_time": "0:28:39"} +{"current_steps": 1909, "total_steps": 2470, "loss": 1.4278, "lr": 1.4958316911630827e-06, "epoch": 7.728744939271255, "percentage": 77.29, "elapsed_time": "1:37:20", 
"remaining_time": "0:28:36"} +{"current_steps": 1910, "total_steps": 2470, "loss": 1.1213, "lr": 1.4907947608119866e-06, "epoch": 7.732793522267206, "percentage": 77.33, "elapsed_time": "1:37:24", "remaining_time": "0:28:33"} +{"current_steps": 1911, "total_steps": 2470, "loss": 1.2309, "lr": 1.4857648390325257e-06, "epoch": 7.7368421052631575, "percentage": 77.37, "elapsed_time": "1:37:27", "remaining_time": "0:28:30"} +{"current_steps": 1912, "total_steps": 2470, "loss": 1.8603, "lr": 1.4807419358704433e-06, "epoch": 7.7408906882591095, "percentage": 77.41, "elapsed_time": "1:37:30", "remaining_time": "0:28:27"} +{"current_steps": 1913, "total_steps": 2470, "loss": 1.4053, "lr": 1.475726061357463e-06, "epoch": 7.744939271255061, "percentage": 77.45, "elapsed_time": "1:37:33", "remaining_time": "0:28:24"} +{"current_steps": 1914, "total_steps": 2470, "loss": 1.2025, "lr": 1.47071722551127e-06, "epoch": 7.748987854251012, "percentage": 77.49, "elapsed_time": "1:37:36", "remaining_time": "0:28:21"} +{"current_steps": 1915, "total_steps": 2470, "loss": 1.1287, "lr": 1.4657154383354948e-06, "epoch": 7.753036437246964, "percentage": 77.53, "elapsed_time": "1:37:39", "remaining_time": "0:28:18"} +{"current_steps": 1916, "total_steps": 2470, "loss": 1.2334, "lr": 1.4607207098196851e-06, "epoch": 7.757085020242915, "percentage": 77.57, "elapsed_time": "1:37:42", "remaining_time": "0:28:15"} +{"current_steps": 1917, "total_steps": 2470, "loss": 1.9826, "lr": 1.4557330499392952e-06, "epoch": 7.761133603238866, "percentage": 77.61, "elapsed_time": "1:37:45", "remaining_time": "0:28:12"} +{"current_steps": 1918, "total_steps": 2470, "loss": 1.721, "lr": 1.4507524686556612e-06, "epoch": 7.765182186234818, "percentage": 77.65, "elapsed_time": "1:37:48", "remaining_time": "0:28:09"} +{"current_steps": 1919, "total_steps": 2470, "loss": 1.6659, "lr": 1.4457789759159813e-06, "epoch": 7.769230769230769, "percentage": 77.69, "elapsed_time": "1:37:51", "remaining_time": "0:28:06"} +{"current_steps": 1920, "total_steps": 2470, "loss": 1.1808, "lr": 1.4408125816532981e-06, "epoch": 7.77327935222672, "percentage": 77.73, "elapsed_time": "1:37:55", "remaining_time": "0:28:02"} +{"current_steps": 1921, "total_steps": 2470, "loss": 1.4747, "lr": 1.435853295786473e-06, "epoch": 7.777327935222672, "percentage": 77.77, "elapsed_time": "1:37:58", "remaining_time": "0:27:59"} +{"current_steps": 1922, "total_steps": 2470, "loss": 1.4528, "lr": 1.430901128220174e-06, "epoch": 7.781376518218623, "percentage": 77.81, "elapsed_time": "1:38:01", "remaining_time": "0:27:56"} +{"current_steps": 1923, "total_steps": 2470, "loss": 1.2558, "lr": 1.4259560888448526e-06, "epoch": 7.7854251012145745, "percentage": 77.85, "elapsed_time": "1:38:04", "remaining_time": "0:27:53"} +{"current_steps": 1924, "total_steps": 2470, "loss": 1.1873, "lr": 1.4210181875367229e-06, "epoch": 7.7894736842105265, "percentage": 77.89, "elapsed_time": "1:38:07", "remaining_time": "0:27:50"} +{"current_steps": 1925, "total_steps": 2470, "loss": 1.1916, "lr": 1.4160874341577447e-06, "epoch": 7.793522267206478, "percentage": 77.94, "elapsed_time": "1:38:10", "remaining_time": "0:27:47"} +{"current_steps": 1926, "total_steps": 2470, "loss": 1.2401, "lr": 1.4111638385555965e-06, "epoch": 7.797570850202429, "percentage": 77.98, "elapsed_time": "1:38:13", "remaining_time": "0:27:44"} +{"current_steps": 1927, "total_steps": 2470, "loss": 1.1375, "lr": 1.406247410563667e-06, "epoch": 7.801619433198381, "percentage": 78.02, "elapsed_time": "1:38:16", 
"remaining_time": "0:27:41"} +{"current_steps": 1928, "total_steps": 2470, "loss": 1.0394, "lr": 1.4013381600010278e-06, "epoch": 7.805668016194332, "percentage": 78.06, "elapsed_time": "1:38:19", "remaining_time": "0:27:38"} +{"current_steps": 1929, "total_steps": 2470, "loss": 1.3717, "lr": 1.396436096672416e-06, "epoch": 7.809716599190283, "percentage": 78.1, "elapsed_time": "1:38:22", "remaining_time": "0:27:35"} +{"current_steps": 1930, "total_steps": 2470, "loss": 1.1632, "lr": 1.3915412303682162e-06, "epoch": 7.813765182186235, "percentage": 78.14, "elapsed_time": "1:38:25", "remaining_time": "0:27:32"} +{"current_steps": 1931, "total_steps": 2470, "loss": 1.095, "lr": 1.3866535708644335e-06, "epoch": 7.817813765182186, "percentage": 78.18, "elapsed_time": "1:38:28", "remaining_time": "0:27:29"} +{"current_steps": 1932, "total_steps": 2470, "loss": 2.1725, "lr": 1.3817731279226843e-06, "epoch": 7.821862348178137, "percentage": 78.22, "elapsed_time": "1:38:31", "remaining_time": "0:27:26"} +{"current_steps": 1933, "total_steps": 2470, "loss": 3.1191, "lr": 1.376899911290172e-06, "epoch": 7.825910931174089, "percentage": 78.26, "elapsed_time": "1:38:34", "remaining_time": "0:27:23"} +{"current_steps": 1934, "total_steps": 2470, "loss": 1.1065, "lr": 1.3720339306996666e-06, "epoch": 7.82995951417004, "percentage": 78.3, "elapsed_time": "1:38:37", "remaining_time": "0:27:20"} +{"current_steps": 1935, "total_steps": 2470, "loss": 1.076, "lr": 1.367175195869488e-06, "epoch": 7.834008097165992, "percentage": 78.34, "elapsed_time": "1:38:40", "remaining_time": "0:27:16"} +{"current_steps": 1936, "total_steps": 2470, "loss": 1.0877, "lr": 1.3623237165034807e-06, "epoch": 7.838056680161944, "percentage": 78.38, "elapsed_time": "1:38:43", "remaining_time": "0:27:13"} +{"current_steps": 1937, "total_steps": 2470, "loss": 1.181, "lr": 1.3574795022910014e-06, "epoch": 7.842105263157895, "percentage": 78.42, "elapsed_time": "1:38:46", "remaining_time": "0:27:10"} +{"current_steps": 1938, "total_steps": 2470, "loss": 0.9695, "lr": 1.3526425629068968e-06, "epoch": 7.846153846153846, "percentage": 78.46, "elapsed_time": "1:38:49", "remaining_time": "0:27:07"} +{"current_steps": 1939, "total_steps": 2470, "loss": 1.1728, "lr": 1.347812908011485e-06, "epoch": 7.850202429149798, "percentage": 78.5, "elapsed_time": "1:38:52", "remaining_time": "0:27:04"} +{"current_steps": 1940, "total_steps": 2470, "loss": 1.2049, "lr": 1.3429905472505344e-06, "epoch": 7.854251012145749, "percentage": 78.54, "elapsed_time": "1:38:55", "remaining_time": "0:27:01"} +{"current_steps": 1941, "total_steps": 2470, "loss": 1.1544, "lr": 1.3381754902552474e-06, "epoch": 7.8582995951417, "percentage": 78.58, "elapsed_time": "1:38:59", "remaining_time": "0:26:58"} +{"current_steps": 1942, "total_steps": 2470, "loss": 1.1535, "lr": 1.3333677466422357e-06, "epoch": 7.862348178137652, "percentage": 78.62, "elapsed_time": "1:39:02", "remaining_time": "0:26:55"} +{"current_steps": 1943, "total_steps": 2470, "loss": 1.1238, "lr": 1.3285673260135073e-06, "epoch": 7.866396761133603, "percentage": 78.66, "elapsed_time": "1:39:05", "remaining_time": "0:26:52"} +{"current_steps": 1944, "total_steps": 2470, "loss": 1.5443, "lr": 1.323774237956445e-06, "epoch": 7.870445344129554, "percentage": 78.7, "elapsed_time": "1:39:08", "remaining_time": "0:26:49"} +{"current_steps": 1945, "total_steps": 2470, "loss": 1.4939, "lr": 1.3189884920437867e-06, "epoch": 7.874493927125506, "percentage": 78.74, "elapsed_time": "1:39:11", "remaining_time": 
"0:26:46"} +{"current_steps": 1946, "total_steps": 2470, "loss": 1.2695, "lr": 1.314210097833607e-06, "epoch": 7.8785425101214575, "percentage": 78.79, "elapsed_time": "1:39:14", "remaining_time": "0:26:43"} +{"current_steps": 1947, "total_steps": 2470, "loss": 1.2076, "lr": 1.309439064869295e-06, "epoch": 7.882591093117409, "percentage": 78.83, "elapsed_time": "1:39:17", "remaining_time": "0:26:40"} +{"current_steps": 1948, "total_steps": 2470, "loss": 0.8564, "lr": 1.3046754026795406e-06, "epoch": 7.886639676113361, "percentage": 78.87, "elapsed_time": "1:39:20", "remaining_time": "0:26:37"} +{"current_steps": 1949, "total_steps": 2470, "loss": 1.3827, "lr": 1.2999191207783129e-06, "epoch": 7.890688259109312, "percentage": 78.91, "elapsed_time": "1:39:23", "remaining_time": "0:26:34"} +{"current_steps": 1950, "total_steps": 2470, "loss": 1.3867, "lr": 1.2951702286648399e-06, "epoch": 7.894736842105263, "percentage": 78.95, "elapsed_time": "1:39:26", "remaining_time": "0:26:31"} +{"current_steps": 1951, "total_steps": 2470, "loss": 1.1739, "lr": 1.290428735823593e-06, "epoch": 7.898785425101215, "percentage": 78.99, "elapsed_time": "1:39:29", "remaining_time": "0:26:28"} +{"current_steps": 1952, "total_steps": 2470, "loss": 1.1495, "lr": 1.2856946517242608e-06, "epoch": 7.902834008097166, "percentage": 79.03, "elapsed_time": "1:39:32", "remaining_time": "0:26:24"} +{"current_steps": 1953, "total_steps": 2470, "loss": 1.1842, "lr": 1.28096798582174e-06, "epoch": 7.906882591093117, "percentage": 79.07, "elapsed_time": "1:39:35", "remaining_time": "0:26:21"} +{"current_steps": 1954, "total_steps": 2470, "loss": 0.9544, "lr": 1.2762487475561109e-06, "epoch": 7.910931174089069, "percentage": 79.11, "elapsed_time": "1:39:38", "remaining_time": "0:26:18"} +{"current_steps": 1955, "total_steps": 2470, "loss": 1.0285, "lr": 1.2715369463526173e-06, "epoch": 7.91497975708502, "percentage": 79.15, "elapsed_time": "1:39:41", "remaining_time": "0:26:15"} +{"current_steps": 1956, "total_steps": 2470, "loss": 1.0359, "lr": 1.2668325916216534e-06, "epoch": 7.919028340080971, "percentage": 79.19, "elapsed_time": "1:39:44", "remaining_time": "0:26:12"} +{"current_steps": 1957, "total_steps": 2470, "loss": 1.3581, "lr": 1.2621356927587353e-06, "epoch": 7.923076923076923, "percentage": 79.23, "elapsed_time": "1:39:47", "remaining_time": "0:26:09"} +{"current_steps": 1958, "total_steps": 2470, "loss": 1.2012, "lr": 1.257446259144494e-06, "epoch": 7.9271255060728745, "percentage": 79.27, "elapsed_time": "1:39:51", "remaining_time": "0:26:06"} +{"current_steps": 1959, "total_steps": 2470, "loss": 1.181, "lr": 1.2527643001446493e-06, "epoch": 7.931174089068826, "percentage": 79.31, "elapsed_time": "1:39:54", "remaining_time": "0:26:03"} +{"current_steps": 1960, "total_steps": 2470, "loss": 0.9855, "lr": 1.248089825109991e-06, "epoch": 7.935222672064778, "percentage": 79.35, "elapsed_time": "1:39:57", "remaining_time": "0:26:00"} +{"current_steps": 1961, "total_steps": 2470, "loss": 1.0055, "lr": 1.2434228433763657e-06, "epoch": 7.939271255060729, "percentage": 79.39, "elapsed_time": "1:40:00", "remaining_time": "0:25:57"} +{"current_steps": 1962, "total_steps": 2470, "loss": 1.2977, "lr": 1.2387633642646501e-06, "epoch": 7.94331983805668, "percentage": 79.43, "elapsed_time": "1:40:03", "remaining_time": "0:25:54"} +{"current_steps": 1963, "total_steps": 2470, "loss": 1.0272, "lr": 1.2341113970807368e-06, "epoch": 7.947368421052632, "percentage": 79.47, "elapsed_time": "1:40:06", "remaining_time": "0:25:51"} 
+{"current_steps": 1964, "total_steps": 2470, "loss": 0.939, "lr": 1.2294669511155193e-06, "epoch": 7.951417004048583, "percentage": 79.51, "elapsed_time": "1:40:09", "remaining_time": "0:25:48"} +{"current_steps": 1965, "total_steps": 2470, "loss": 1.2616, "lr": 1.224830035644868e-06, "epoch": 7.955465587044534, "percentage": 79.55, "elapsed_time": "1:40:13", "remaining_time": "0:25:45"} +{"current_steps": 1966, "total_steps": 2470, "loss": 1.3384, "lr": 1.2202006599296122e-06, "epoch": 7.959514170040486, "percentage": 79.6, "elapsed_time": "1:40:16", "remaining_time": "0:25:42"} +{"current_steps": 1967, "total_steps": 2470, "loss": 1.2777, "lr": 1.215578833215526e-06, "epoch": 7.963562753036437, "percentage": 79.64, "elapsed_time": "1:40:19", "remaining_time": "0:25:39"} +{"current_steps": 1968, "total_steps": 2470, "loss": 1.2766, "lr": 1.2109645647333018e-06, "epoch": 7.967611336032388, "percentage": 79.68, "elapsed_time": "1:40:22", "remaining_time": "0:25:36"} +{"current_steps": 1969, "total_steps": 2470, "loss": 1.2, "lr": 1.2063578636985402e-06, "epoch": 7.97165991902834, "percentage": 79.72, "elapsed_time": "1:40:25", "remaining_time": "0:25:33"} +{"current_steps": 1970, "total_steps": 2470, "loss": 1.2542, "lr": 1.201758739311728e-06, "epoch": 7.9757085020242915, "percentage": 79.76, "elapsed_time": "1:40:28", "remaining_time": "0:25:30"} +{"current_steps": 1971, "total_steps": 2470, "loss": 1.3138, "lr": 1.1971672007582192e-06, "epoch": 7.979757085020243, "percentage": 79.8, "elapsed_time": "1:40:31", "remaining_time": "0:25:27"} +{"current_steps": 1972, "total_steps": 2470, "loss": 1.3645, "lr": 1.1925832572082184e-06, "epoch": 7.983805668016195, "percentage": 79.84, "elapsed_time": "1:40:34", "remaining_time": "0:25:24"} +{"current_steps": 1973, "total_steps": 2470, "loss": 1.1615, "lr": 1.1880069178167586e-06, "epoch": 7.987854251012146, "percentage": 79.88, "elapsed_time": "1:40:37", "remaining_time": "0:25:20"} +{"current_steps": 1974, "total_steps": 2470, "loss": 1.1793, "lr": 1.1834381917236881e-06, "epoch": 7.991902834008097, "percentage": 79.92, "elapsed_time": "1:40:40", "remaining_time": "0:25:17"} +{"current_steps": 1975, "total_steps": 2470, "loss": 1.5002, "lr": 1.178877088053651e-06, "epoch": 7.995951417004049, "percentage": 79.96, "elapsed_time": "1:40:43", "remaining_time": "0:25:14"} +{"current_steps": 1976, "total_steps": 2470, "loss": 1.2012, "lr": 1.1743236159160654e-06, "epoch": 8.0, "percentage": 80.0, "elapsed_time": "1:40:47", "remaining_time": "0:25:11"} +{"current_steps": 1977, "total_steps": 2470, "loss": 1.2312, "lr": 1.1697777844051105e-06, "epoch": 8.004048582995951, "percentage": 80.04, "elapsed_time": "1:41:55", "remaining_time": "0:25:25"} +{"current_steps": 1978, "total_steps": 2470, "loss": 1.4044, "lr": 1.165239602599702e-06, "epoch": 8.008097165991902, "percentage": 80.08, "elapsed_time": "1:41:58", "remaining_time": "0:25:21"} +{"current_steps": 1979, "total_steps": 2470, "loss": 1.179, "lr": 1.1607090795634802e-06, "epoch": 8.012145748987853, "percentage": 80.12, "elapsed_time": "1:42:01", "remaining_time": "0:25:18"} +{"current_steps": 1980, "total_steps": 2470, "loss": 1.4132, "lr": 1.156186224344789e-06, "epoch": 8.016194331983806, "percentage": 80.16, "elapsed_time": "1:42:04", "remaining_time": "0:25:15"} +{"current_steps": 1981, "total_steps": 2470, "loss": 1.5665, "lr": 1.1516710459766589e-06, "epoch": 8.020242914979757, "percentage": 80.2, "elapsed_time": "1:42:08", "remaining_time": "0:25:12"} +{"current_steps": 1982, 
"total_steps": 2470, "loss": 1.4869, "lr": 1.1471635534767877e-06, "epoch": 8.024291497975709, "percentage": 80.24, "elapsed_time": "1:42:11", "remaining_time": "0:25:09"} +{"current_steps": 1983, "total_steps": 2470, "loss": 1.1981, "lr": 1.1426637558475206e-06, "epoch": 8.02834008097166, "percentage": 80.28, "elapsed_time": "1:42:14", "remaining_time": "0:25:06"} +{"current_steps": 1984, "total_steps": 2470, "loss": 1.2025, "lr": 1.138171662075837e-06, "epoch": 8.03238866396761, "percentage": 80.32, "elapsed_time": "1:42:17", "remaining_time": "0:25:03"} +{"current_steps": 1985, "total_steps": 2470, "loss": 1.4043, "lr": 1.133687281133331e-06, "epoch": 8.036437246963562, "percentage": 80.36, "elapsed_time": "1:42:20", "remaining_time": "0:25:00"} +{"current_steps": 1986, "total_steps": 2470, "loss": 1.2134, "lr": 1.1292106219761928e-06, "epoch": 8.040485829959515, "percentage": 80.4, "elapsed_time": "1:42:23", "remaining_time": "0:24:57"} +{"current_steps": 1987, "total_steps": 2470, "loss": 1.3732, "lr": 1.1247416935451855e-06, "epoch": 8.044534412955466, "percentage": 80.45, "elapsed_time": "1:42:26", "remaining_time": "0:24:54"} +{"current_steps": 1988, "total_steps": 2470, "loss": 1.2149, "lr": 1.1202805047656406e-06, "epoch": 8.048582995951417, "percentage": 80.49, "elapsed_time": "1:42:29", "remaining_time": "0:24:50"} +{"current_steps": 1989, "total_steps": 2470, "loss": 1.2651, "lr": 1.1158270645474233e-06, "epoch": 8.052631578947368, "percentage": 80.53, "elapsed_time": "1:42:32", "remaining_time": "0:24:47"} +{"current_steps": 1990, "total_steps": 2470, "loss": 1.1235, "lr": 1.1113813817849312e-06, "epoch": 8.05668016194332, "percentage": 80.57, "elapsed_time": "1:42:35", "remaining_time": "0:24:44"} +{"current_steps": 1991, "total_steps": 2470, "loss": 1.2623, "lr": 1.1069434653570633e-06, "epoch": 8.06072874493927, "percentage": 80.61, "elapsed_time": "1:42:38", "remaining_time": "0:24:41"} +{"current_steps": 1992, "total_steps": 2470, "loss": 1.1959, "lr": 1.1025133241272113e-06, "epoch": 8.064777327935223, "percentage": 80.65, "elapsed_time": "1:42:41", "remaining_time": "0:24:38"} +{"current_steps": 1993, "total_steps": 2470, "loss": 1.3747, "lr": 1.0980909669432376e-06, "epoch": 8.068825910931174, "percentage": 80.69, "elapsed_time": "1:42:44", "remaining_time": "0:24:35"} +{"current_steps": 1994, "total_steps": 2470, "loss": 1.2673, "lr": 1.0936764026374547e-06, "epoch": 8.072874493927126, "percentage": 80.73, "elapsed_time": "1:42:48", "remaining_time": "0:24:32"} +{"current_steps": 1995, "total_steps": 2470, "loss": 1.2309, "lr": 1.0892696400266151e-06, "epoch": 8.076923076923077, "percentage": 80.77, "elapsed_time": "1:42:51", "remaining_time": "0:24:29"} +{"current_steps": 1996, "total_steps": 2470, "loss": 1.3544, "lr": 1.0848706879118893e-06, "epoch": 8.080971659919028, "percentage": 80.81, "elapsed_time": "1:42:54", "remaining_time": "0:24:26"} +{"current_steps": 1997, "total_steps": 2470, "loss": 1.3016, "lr": 1.0804795550788473e-06, "epoch": 8.085020242914979, "percentage": 80.85, "elapsed_time": "1:42:57", "remaining_time": "0:24:23"} +{"current_steps": 1998, "total_steps": 2470, "loss": 1.2539, "lr": 1.0760962502974453e-06, "epoch": 8.089068825910932, "percentage": 80.89, "elapsed_time": "1:43:00", "remaining_time": "0:24:20"} +{"current_steps": 1999, "total_steps": 2470, "loss": 1.3311, "lr": 1.0717207823220005e-06, "epoch": 8.093117408906883, "percentage": 80.93, "elapsed_time": "1:43:03", "remaining_time": "0:24:16"} +{"current_steps": 2000, "total_steps": 
2470, "loss": 1.0787, "lr": 1.0673531598911824e-06, "epoch": 8.097165991902834, "percentage": 80.97, "elapsed_time": "1:43:06", "remaining_time": "0:24:13"} +{"current_steps": 2001, "total_steps": 2470, "loss": 1.2767, "lr": 1.0629933917279906e-06, "epoch": 8.101214574898785, "percentage": 81.01, "elapsed_time": "1:43:09", "remaining_time": "0:24:10"} +{"current_steps": 2002, "total_steps": 2470, "loss": 1.1861, "lr": 1.0586414865397381e-06, "epoch": 8.105263157894736, "percentage": 81.05, "elapsed_time": "1:43:12", "remaining_time": "0:24:07"} +{"current_steps": 2003, "total_steps": 2470, "loss": 1.2172, "lr": 1.0542974530180327e-06, "epoch": 8.109311740890687, "percentage": 81.09, "elapsed_time": "1:43:15", "remaining_time": "0:24:04"} +{"current_steps": 2004, "total_steps": 2470, "loss": 1.1485, "lr": 1.0499612998387621e-06, "epoch": 8.11336032388664, "percentage": 81.13, "elapsed_time": "1:43:18", "remaining_time": "0:24:01"} +{"current_steps": 2005, "total_steps": 2470, "loss": 1.0672, "lr": 1.0456330356620758e-06, "epoch": 8.117408906882591, "percentage": 81.17, "elapsed_time": "1:43:21", "remaining_time": "0:23:58"} +{"current_steps": 2006, "total_steps": 2470, "loss": 1.1479, "lr": 1.0413126691323667e-06, "epoch": 8.121457489878543, "percentage": 81.21, "elapsed_time": "1:43:31", "remaining_time": "0:23:56"} +{"current_steps": 2007, "total_steps": 2470, "loss": 1.165, "lr": 1.0370002088782555e-06, "epoch": 8.125506072874494, "percentage": 81.26, "elapsed_time": "1:43:34", "remaining_time": "0:23:53"} +{"current_steps": 2008, "total_steps": 2470, "loss": 1.0247, "lr": 1.0326956635125707e-06, "epoch": 8.129554655870445, "percentage": 81.3, "elapsed_time": "1:43:38", "remaining_time": "0:23:50"} +{"current_steps": 2009, "total_steps": 2470, "loss": 1.3881, "lr": 1.0283990416323336e-06, "epoch": 8.133603238866396, "percentage": 81.34, "elapsed_time": "1:43:41", "remaining_time": "0:23:47"} +{"current_steps": 2010, "total_steps": 2470, "loss": 1.1919, "lr": 1.0241103518187433e-06, "epoch": 8.137651821862349, "percentage": 81.38, "elapsed_time": "1:43:44", "remaining_time": "0:23:44"} +{"current_steps": 2011, "total_steps": 2470, "loss": 0.9674, "lr": 1.019829602637154e-06, "epoch": 8.1417004048583, "percentage": 81.42, "elapsed_time": "1:43:47", "remaining_time": "0:23:41"} +{"current_steps": 2012, "total_steps": 2470, "loss": 1.2791, "lr": 1.0155568026370637e-06, "epoch": 8.145748987854251, "percentage": 81.46, "elapsed_time": "1:43:50", "remaining_time": "0:23:38"} +{"current_steps": 2013, "total_steps": 2470, "loss": 1.1158, "lr": 1.0112919603520898e-06, "epoch": 8.149797570850202, "percentage": 81.5, "elapsed_time": "1:43:53", "remaining_time": "0:23:35"} +{"current_steps": 2014, "total_steps": 2470, "loss": 1.357, "lr": 1.0070350842999622e-06, "epoch": 8.153846153846153, "percentage": 81.54, "elapsed_time": "1:43:56", "remaining_time": "0:23:32"} +{"current_steps": 2015, "total_steps": 2470, "loss": 1.3434, "lr": 1.0027861829824953e-06, "epoch": 8.157894736842104, "percentage": 81.58, "elapsed_time": "1:43:59", "remaining_time": "0:23:29"} +{"current_steps": 2016, "total_steps": 2470, "loss": 1.1787, "lr": 9.985452648855803e-07, "epoch": 8.161943319838057, "percentage": 81.62, "elapsed_time": "1:44:02", "remaining_time": "0:23:25"} +{"current_steps": 2017, "total_steps": 2470, "loss": 1.2719, "lr": 9.943123384791632e-07, "epoch": 8.165991902834008, "percentage": 81.66, "elapsed_time": "1:44:06", "remaining_time": "0:23:22"} +{"current_steps": 2018, "total_steps": 2470, "loss": 
1.8638, "lr": 9.900874122172294e-07, "epoch": 8.17004048582996, "percentage": 81.7, "elapsed_time": "1:44:09", "remaining_time": "0:23:19"} +{"current_steps": 2019, "total_steps": 2470, "loss": 1.0806, "lr": 9.85870494537784e-07, "epoch": 8.17408906882591, "percentage": 81.74, "elapsed_time": "1:44:12", "remaining_time": "0:23:16"} +{"current_steps": 2020, "total_steps": 2470, "loss": 1.2902, "lr": 9.816615938628409e-07, "epoch": 8.178137651821862, "percentage": 81.78, "elapsed_time": "1:44:15", "remaining_time": "0:23:13"} +{"current_steps": 2021, "total_steps": 2470, "loss": 1.0877, "lr": 9.774607185984004e-07, "epoch": 8.182186234817813, "percentage": 81.82, "elapsed_time": "1:44:18", "remaining_time": "0:23:10"} +{"current_steps": 2022, "total_steps": 2470, "loss": 1.2729, "lr": 9.732678771344344e-07, "epoch": 8.186234817813766, "percentage": 81.86, "elapsed_time": "1:44:21", "remaining_time": "0:23:07"} +{"current_steps": 2023, "total_steps": 2470, "loss": 1.2954, "lr": 9.690830778448723e-07, "epoch": 8.190283400809717, "percentage": 81.9, "elapsed_time": "1:44:24", "remaining_time": "0:23:04"} +{"current_steps": 2024, "total_steps": 2470, "loss": 1.4598, "lr": 9.649063290875771e-07, "epoch": 8.194331983805668, "percentage": 81.94, "elapsed_time": "1:44:27", "remaining_time": "0:23:01"} +{"current_steps": 2025, "total_steps": 2470, "loss": 1.5219, "lr": 9.607376392043366e-07, "epoch": 8.19838056680162, "percentage": 81.98, "elapsed_time": "1:44:30", "remaining_time": "0:22:57"} +{"current_steps": 2026, "total_steps": 2470, "loss": 1.2267, "lr": 9.565770165208432e-07, "epoch": 8.20242914979757, "percentage": 82.02, "elapsed_time": "1:44:33", "remaining_time": "0:22:54"} +{"current_steps": 2027, "total_steps": 2470, "loss": 1.5547, "lr": 9.524244693466773e-07, "epoch": 8.206477732793521, "percentage": 82.06, "elapsed_time": "1:44:36", "remaining_time": "0:22:51"} +{"current_steps": 2028, "total_steps": 2470, "loss": 1.5423, "lr": 9.482800059752911e-07, "epoch": 8.210526315789474, "percentage": 82.11, "elapsed_time": "1:44:39", "remaining_time": "0:22:48"} +{"current_steps": 2029, "total_steps": 2470, "loss": 1.3284, "lr": 9.441436346839894e-07, "epoch": 8.214574898785425, "percentage": 82.15, "elapsed_time": "1:44:42", "remaining_time": "0:22:45"} +{"current_steps": 2030, "total_steps": 2470, "loss": 1.2057, "lr": 9.400153637339182e-07, "epoch": 8.218623481781377, "percentage": 82.19, "elapsed_time": "1:44:45", "remaining_time": "0:22:42"} +{"current_steps": 2031, "total_steps": 2470, "loss": 1.1541, "lr": 9.358952013700462e-07, "epoch": 8.222672064777328, "percentage": 82.23, "elapsed_time": "1:44:48", "remaining_time": "0:22:39"} +{"current_steps": 2032, "total_steps": 2470, "loss": 1.3599, "lr": 9.317831558211449e-07, "epoch": 8.226720647773279, "percentage": 82.27, "elapsed_time": "1:44:51", "remaining_time": "0:22:36"} +{"current_steps": 2033, "total_steps": 2470, "loss": 1.1424, "lr": 9.276792352997782e-07, "epoch": 8.23076923076923, "percentage": 82.31, "elapsed_time": "1:44:54", "remaining_time": "0:22:33"} +{"current_steps": 2034, "total_steps": 2470, "loss": 1.361, "lr": 9.235834480022788e-07, "epoch": 8.234817813765183, "percentage": 82.35, "elapsed_time": "1:44:57", "remaining_time": "0:22:29"} +{"current_steps": 2035, "total_steps": 2470, "loss": 1.2944, "lr": 9.19495802108738e-07, "epoch": 8.238866396761134, "percentage": 82.39, "elapsed_time": "1:45:00", "remaining_time": "0:22:26"} +{"current_steps": 2036, "total_steps": 2470, "loss": 1.3301, "lr": 9.154163057829879e-07, 
"epoch": 8.242914979757085, "percentage": 82.43, "elapsed_time": "1:45:03", "remaining_time": "0:22:23"} +{"current_steps": 2037, "total_steps": 2470, "loss": 1.1986, "lr": 9.113449671725832e-07, "epoch": 8.246963562753036, "percentage": 82.47, "elapsed_time": "1:45:06", "remaining_time": "0:22:20"} +{"current_steps": 2038, "total_steps": 2470, "loss": 1.284, "lr": 9.072817944087875e-07, "epoch": 8.251012145748987, "percentage": 82.51, "elapsed_time": "1:45:10", "remaining_time": "0:22:17"} +{"current_steps": 2039, "total_steps": 2470, "loss": 1.2274, "lr": 9.032267956065516e-07, "epoch": 8.255060728744938, "percentage": 82.55, "elapsed_time": "1:45:13", "remaining_time": "0:22:14"} +{"current_steps": 2040, "total_steps": 2470, "loss": 1.1896, "lr": 8.991799788645067e-07, "epoch": 8.259109311740891, "percentage": 82.59, "elapsed_time": "1:45:16", "remaining_time": "0:22:11"} +{"current_steps": 2041, "total_steps": 2470, "loss": 0.9771, "lr": 8.951413522649372e-07, "epoch": 8.263157894736842, "percentage": 82.63, "elapsed_time": "1:45:19", "remaining_time": "0:22:08"} +{"current_steps": 2042, "total_steps": 2470, "loss": 1.2758, "lr": 8.911109238737748e-07, "epoch": 8.267206477732794, "percentage": 82.67, "elapsed_time": "1:45:22", "remaining_time": "0:22:05"} +{"current_steps": 2043, "total_steps": 2470, "loss": 1.273, "lr": 8.870887017405761e-07, "epoch": 8.271255060728745, "percentage": 82.71, "elapsed_time": "1:45:25", "remaining_time": "0:22:02"} +{"current_steps": 2044, "total_steps": 2470, "loss": 1.356, "lr": 8.830746938985091e-07, "epoch": 8.275303643724696, "percentage": 82.75, "elapsed_time": "1:45:28", "remaining_time": "0:21:58"} +{"current_steps": 2045, "total_steps": 2470, "loss": 1.2355, "lr": 8.790689083643328e-07, "epoch": 8.279352226720647, "percentage": 82.79, "elapsed_time": "1:45:31", "remaining_time": "0:21:55"} +{"current_steps": 2046, "total_steps": 2470, "loss": 0.9371, "lr": 8.750713531383886e-07, "epoch": 8.2834008097166, "percentage": 82.83, "elapsed_time": "1:45:34", "remaining_time": "0:21:52"} +{"current_steps": 2047, "total_steps": 2470, "loss": 1.0832, "lr": 8.710820362045791e-07, "epoch": 8.287449392712551, "percentage": 82.87, "elapsed_time": "1:45:37", "remaining_time": "0:21:49"} +{"current_steps": 2048, "total_steps": 2470, "loss": 0.9594, "lr": 8.671009655303531e-07, "epoch": 8.291497975708502, "percentage": 82.91, "elapsed_time": "1:45:40", "remaining_time": "0:21:46"} +{"current_steps": 2049, "total_steps": 2470, "loss": 1.1647, "lr": 8.631281490666915e-07, "epoch": 8.295546558704453, "percentage": 82.96, "elapsed_time": "1:45:43", "remaining_time": "0:21:43"} +{"current_steps": 2050, "total_steps": 2470, "loss": 1.4079, "lr": 8.591635947480854e-07, "epoch": 8.299595141700404, "percentage": 83.0, "elapsed_time": "1:45:46", "remaining_time": "0:21:40"} +{"current_steps": 2051, "total_steps": 2470, "loss": 1.0049, "lr": 8.552073104925296e-07, "epoch": 8.303643724696355, "percentage": 83.04, "elapsed_time": "1:45:49", "remaining_time": "0:21:37"} +{"current_steps": 2052, "total_steps": 2470, "loss": 1.0616, "lr": 8.512593042015005e-07, "epoch": 8.307692307692308, "percentage": 83.08, "elapsed_time": "1:45:52", "remaining_time": "0:21:34"} +{"current_steps": 2053, "total_steps": 2470, "loss": 1.1174, "lr": 8.473195837599419e-07, "epoch": 8.31174089068826, "percentage": 83.12, "elapsed_time": "1:45:55", "remaining_time": "0:21:30"} +{"current_steps": 2054, "total_steps": 2470, "loss": 0.9914, "lr": 8.433881570362484e-07, "epoch": 8.31578947368421, 
"percentage": 83.16, "elapsed_time": "1:45:58", "remaining_time": "0:21:27"} +{"current_steps": 2055, "total_steps": 2470, "loss": 0.9647, "lr": 8.3946503188225e-07, "epoch": 8.319838056680162, "percentage": 83.2, "elapsed_time": "1:46:01", "remaining_time": "0:21:24"} +{"current_steps": 2056, "total_steps": 2470, "loss": 1.0237, "lr": 8.355502161331985e-07, "epoch": 8.323886639676113, "percentage": 83.24, "elapsed_time": "1:46:04", "remaining_time": "0:21:21"} +{"current_steps": 2057, "total_steps": 2470, "loss": 1.0387, "lr": 8.316437176077491e-07, "epoch": 8.327935222672064, "percentage": 83.28, "elapsed_time": "1:46:07", "remaining_time": "0:21:18"} +{"current_steps": 2058, "total_steps": 2470, "loss": 1.0816, "lr": 8.277455441079463e-07, "epoch": 8.331983805668017, "percentage": 83.32, "elapsed_time": "1:46:11", "remaining_time": "0:21:15"} +{"current_steps": 2059, "total_steps": 2470, "loss": 1.189, "lr": 8.238557034192085e-07, "epoch": 8.336032388663968, "percentage": 83.36, "elapsed_time": "1:46:14", "remaining_time": "0:21:12"} +{"current_steps": 2060, "total_steps": 2470, "loss": 1.1119, "lr": 8.199742033103091e-07, "epoch": 8.34008097165992, "percentage": 83.4, "elapsed_time": "1:46:17", "remaining_time": "0:21:09"} +{"current_steps": 2061, "total_steps": 2470, "loss": 1.0109, "lr": 8.161010515333662e-07, "epoch": 8.34412955465587, "percentage": 83.44, "elapsed_time": "1:46:20", "remaining_time": "0:21:06"} +{"current_steps": 2062, "total_steps": 2470, "loss": 1.1502, "lr": 8.12236255823825e-07, "epoch": 8.348178137651821, "percentage": 83.48, "elapsed_time": "1:46:24", "remaining_time": "0:21:03"} +{"current_steps": 2063, "total_steps": 2470, "loss": 1.1497, "lr": 8.083798239004408e-07, "epoch": 8.352226720647772, "percentage": 83.52, "elapsed_time": "1:46:27", "remaining_time": "0:21:00"} +{"current_steps": 2064, "total_steps": 2470, "loss": 0.842, "lr": 8.045317634652661e-07, "epoch": 8.356275303643725, "percentage": 83.56, "elapsed_time": "1:46:30", "remaining_time": "0:20:57"} +{"current_steps": 2065, "total_steps": 2470, "loss": 1.2308, "lr": 8.006920822036307e-07, "epoch": 8.360323886639677, "percentage": 83.6, "elapsed_time": "1:46:33", "remaining_time": "0:20:53"} +{"current_steps": 2066, "total_steps": 2470, "loss": 1.3044, "lr": 7.968607877841333e-07, "epoch": 8.364372469635628, "percentage": 83.64, "elapsed_time": "1:46:36", "remaining_time": "0:20:50"} +{"current_steps": 2067, "total_steps": 2470, "loss": 1.1938, "lr": 7.930378878586198e-07, "epoch": 8.368421052631579, "percentage": 83.68, "elapsed_time": "1:46:39", "remaining_time": "0:20:47"} +{"current_steps": 2068, "total_steps": 2470, "loss": 1.2389, "lr": 7.89223390062172e-07, "epoch": 8.37246963562753, "percentage": 83.72, "elapsed_time": "1:46:42", "remaining_time": "0:20:44"} +{"current_steps": 2069, "total_steps": 2470, "loss": 0.9517, "lr": 7.854173020130906e-07, "epoch": 8.376518218623481, "percentage": 83.77, "elapsed_time": "1:46:45", "remaining_time": "0:20:41"} +{"current_steps": 2070, "total_steps": 2470, "loss": 1.0982, "lr": 7.816196313128821e-07, "epoch": 8.380566801619434, "percentage": 83.81, "elapsed_time": "1:46:48", "remaining_time": "0:20:38"} +{"current_steps": 2071, "total_steps": 2470, "loss": 0.913, "lr": 7.778303855462382e-07, "epoch": 8.384615384615385, "percentage": 83.85, "elapsed_time": "1:46:51", "remaining_time": "0:20:35"} +{"current_steps": 2072, "total_steps": 2470, "loss": 0.9799, "lr": 7.740495722810271e-07, "epoch": 8.388663967611336, "percentage": 83.89, "elapsed_time": 
"1:46:54", "remaining_time": "0:20:32"} +{"current_steps": 2073, "total_steps": 2470, "loss": 1.1741, "lr": 7.702771990682745e-07, "epoch": 8.392712550607287, "percentage": 83.93, "elapsed_time": "1:46:57", "remaining_time": "0:20:29"} +{"current_steps": 2074, "total_steps": 2470, "loss": 0.9586, "lr": 7.66513273442151e-07, "epoch": 8.396761133603238, "percentage": 83.97, "elapsed_time": "1:47:00", "remaining_time": "0:20:25"} +{"current_steps": 2075, "total_steps": 2470, "loss": 1.0087, "lr": 7.627578029199562e-07, "epoch": 8.40080971659919, "percentage": 84.01, "elapsed_time": "1:47:03", "remaining_time": "0:20:22"} +{"current_steps": 2076, "total_steps": 2470, "loss": 1.0385, "lr": 7.590107950020987e-07, "epoch": 8.404858299595142, "percentage": 84.05, "elapsed_time": "1:47:06", "remaining_time": "0:20:19"} +{"current_steps": 2077, "total_steps": 2470, "loss": 1.1273, "lr": 7.552722571720899e-07, "epoch": 8.408906882591094, "percentage": 84.09, "elapsed_time": "1:47:10", "remaining_time": "0:20:16"} +{"current_steps": 2078, "total_steps": 2470, "loss": 0.9676, "lr": 7.515421968965242e-07, "epoch": 8.412955465587045, "percentage": 84.13, "elapsed_time": "1:47:13", "remaining_time": "0:20:13"} +{"current_steps": 2079, "total_steps": 2470, "loss": 1.2442, "lr": 7.478206216250644e-07, "epoch": 8.417004048582996, "percentage": 84.17, "elapsed_time": "1:47:16", "remaining_time": "0:20:10"} +{"current_steps": 2080, "total_steps": 2470, "loss": 1.2719, "lr": 7.441075387904267e-07, "epoch": 8.421052631578947, "percentage": 84.21, "elapsed_time": "1:47:19", "remaining_time": "0:20:07"} +{"current_steps": 2081, "total_steps": 2470, "loss": 1.3106, "lr": 7.404029558083653e-07, "epoch": 8.425101214574898, "percentage": 84.25, "elapsed_time": "1:47:22", "remaining_time": "0:20:04"} +{"current_steps": 2082, "total_steps": 2470, "loss": 1.2708, "lr": 7.367068800776594e-07, "epoch": 8.429149797570851, "percentage": 84.29, "elapsed_time": "1:47:25", "remaining_time": "0:20:01"} +{"current_steps": 2083, "total_steps": 2470, "loss": 1.4544, "lr": 7.330193189800994e-07, "epoch": 8.433198380566802, "percentage": 84.33, "elapsed_time": "1:47:28", "remaining_time": "0:19:58"} +{"current_steps": 2084, "total_steps": 2470, "loss": 1.2466, "lr": 7.293402798804667e-07, "epoch": 8.437246963562753, "percentage": 84.37, "elapsed_time": "1:47:31", "remaining_time": "0:19:54"} +{"current_steps": 2085, "total_steps": 2470, "loss": 1.1822, "lr": 7.25669770126527e-07, "epoch": 8.441295546558704, "percentage": 84.41, "elapsed_time": "1:47:34", "remaining_time": "0:19:51"} +{"current_steps": 2086, "total_steps": 2470, "loss": 1.4383, "lr": 7.220077970490058e-07, "epoch": 8.445344129554655, "percentage": 84.45, "elapsed_time": "1:47:37", "remaining_time": "0:19:48"} +{"current_steps": 2087, "total_steps": 2470, "loss": 1.2326, "lr": 7.183543679615834e-07, "epoch": 8.449392712550607, "percentage": 84.49, "elapsed_time": "1:47:40", "remaining_time": "0:19:45"} +{"current_steps": 2088, "total_steps": 2470, "loss": 1.2273, "lr": 7.147094901608748e-07, "epoch": 8.45344129554656, "percentage": 84.53, "elapsed_time": "1:47:43", "remaining_time": "0:19:42"} +{"current_steps": 2089, "total_steps": 2470, "loss": 1.57, "lr": 7.110731709264163e-07, "epoch": 8.45748987854251, "percentage": 84.57, "elapsed_time": "1:47:46", "remaining_time": "0:19:39"} +{"current_steps": 2090, "total_steps": 2470, "loss": 1.6365, "lr": 7.074454175206524e-07, "epoch": 8.461538461538462, "percentage": 84.62, "elapsed_time": "1:47:49", "remaining_time": 
"0:19:36"} +{"current_steps": 2091, "total_steps": 2470, "loss": 1.3541, "lr": 7.03826237188916e-07, "epoch": 8.465587044534413, "percentage": 84.66, "elapsed_time": "1:47:53", "remaining_time": "0:19:33"} +{"current_steps": 2092, "total_steps": 2470, "loss": 1.4242, "lr": 7.002156371594237e-07, "epoch": 8.469635627530364, "percentage": 84.7, "elapsed_time": "1:47:56", "remaining_time": "0:19:30"} +{"current_steps": 2093, "total_steps": 2470, "loss": 1.3988, "lr": 6.966136246432492e-07, "epoch": 8.473684210526315, "percentage": 84.74, "elapsed_time": "1:47:59", "remaining_time": "0:19:27"} +{"current_steps": 2094, "total_steps": 2470, "loss": 1.387, "lr": 6.930202068343206e-07, "epoch": 8.477732793522268, "percentage": 84.78, "elapsed_time": "1:48:02", "remaining_time": "0:19:23"} +{"current_steps": 2095, "total_steps": 2470, "loss": 1.3236, "lr": 6.894353909093976e-07, "epoch": 8.481781376518219, "percentage": 84.82, "elapsed_time": "1:48:05", "remaining_time": "0:19:20"} +{"current_steps": 2096, "total_steps": 2470, "loss": 1.2652, "lr": 6.858591840280627e-07, "epoch": 8.48582995951417, "percentage": 84.86, "elapsed_time": "1:48:08", "remaining_time": "0:19:17"} +{"current_steps": 2097, "total_steps": 2470, "loss": 1.2337, "lr": 6.822915933327012e-07, "epoch": 8.489878542510121, "percentage": 84.9, "elapsed_time": "1:48:11", "remaining_time": "0:19:14"} +{"current_steps": 2098, "total_steps": 2470, "loss": 1.154, "lr": 6.787326259484922e-07, "epoch": 8.493927125506072, "percentage": 84.94, "elapsed_time": "1:48:14", "remaining_time": "0:19:11"} +{"current_steps": 2099, "total_steps": 2470, "loss": 1.0993, "lr": 6.751822889833926e-07, "epoch": 8.497975708502024, "percentage": 84.98, "elapsed_time": "1:48:17", "remaining_time": "0:19:08"} +{"current_steps": 2100, "total_steps": 2470, "loss": 0.9173, "lr": 6.716405895281225e-07, "epoch": 8.502024291497976, "percentage": 85.02, "elapsed_time": "1:48:20", "remaining_time": "0:19:05"} +{"current_steps": 2101, "total_steps": 2470, "loss": 1.2742, "lr": 6.681075346561517e-07, "epoch": 8.506072874493928, "percentage": 85.06, "elapsed_time": "1:48:23", "remaining_time": "0:19:02"} +{"current_steps": 2102, "total_steps": 2470, "loss": 1.2013, "lr": 6.645831314236817e-07, "epoch": 8.510121457489879, "percentage": 85.1, "elapsed_time": "1:48:26", "remaining_time": "0:18:59"} +{"current_steps": 2103, "total_steps": 2470, "loss": 1.2136, "lr": 6.610673868696387e-07, "epoch": 8.51417004048583, "percentage": 85.14, "elapsed_time": "1:48:29", "remaining_time": "0:18:55"} +{"current_steps": 2104, "total_steps": 2470, "loss": 1.2265, "lr": 6.57560308015655e-07, "epoch": 8.518218623481781, "percentage": 85.18, "elapsed_time": "1:48:32", "remaining_time": "0:18:52"} +{"current_steps": 2105, "total_steps": 2470, "loss": 1.15, "lr": 6.540619018660555e-07, "epoch": 8.522267206477732, "percentage": 85.22, "elapsed_time": "1:48:35", "remaining_time": "0:18:49"} +{"current_steps": 2106, "total_steps": 2470, "loss": 0.9784, "lr": 6.505721754078443e-07, "epoch": 8.526315789473685, "percentage": 85.26, "elapsed_time": "1:48:39", "remaining_time": "0:18:46"} +{"current_steps": 2107, "total_steps": 2470, "loss": 1.0741, "lr": 6.470911356106885e-07, "epoch": 8.530364372469636, "percentage": 85.3, "elapsed_time": "1:48:42", "remaining_time": "0:18:43"} +{"current_steps": 2108, "total_steps": 2470, "loss": 1.0919, "lr": 6.436187894269086e-07, "epoch": 8.534412955465587, "percentage": 85.34, "elapsed_time": "1:48:45", "remaining_time": "0:18:40"} +{"current_steps": 2109, 
"total_steps": 2470, "loss": 1.3919, "lr": 6.401551437914621e-07, "epoch": 8.538461538461538, "percentage": 85.38, "elapsed_time": "1:48:48", "remaining_time": "0:18:37"} +{"current_steps": 2110, "total_steps": 2470, "loss": 1.3732, "lr": 6.367002056219285e-07, "epoch": 8.54251012145749, "percentage": 85.43, "elapsed_time": "1:48:51", "remaining_time": "0:18:34"} +{"current_steps": 2111, "total_steps": 2470, "loss": 1.9685, "lr": 6.332539818184985e-07, "epoch": 8.54655870445344, "percentage": 85.47, "elapsed_time": "1:48:54", "remaining_time": "0:18:31"} +{"current_steps": 2112, "total_steps": 2470, "loss": 1.6408, "lr": 6.298164792639555e-07, "epoch": 8.550607287449393, "percentage": 85.51, "elapsed_time": "1:48:57", "remaining_time": "0:18:28"} +{"current_steps": 2113, "total_steps": 2470, "loss": 2.024, "lr": 6.263877048236683e-07, "epoch": 8.554655870445345, "percentage": 85.55, "elapsed_time": "1:49:00", "remaining_time": "0:18:25"} +{"current_steps": 2114, "total_steps": 2470, "loss": 1.2075, "lr": 6.229676653455719e-07, "epoch": 8.558704453441296, "percentage": 85.59, "elapsed_time": "1:49:03", "remaining_time": "0:18:21"} +{"current_steps": 2115, "total_steps": 2470, "loss": 1.0819, "lr": 6.195563676601563e-07, "epoch": 8.562753036437247, "percentage": 85.63, "elapsed_time": "1:49:06", "remaining_time": "0:18:18"} +{"current_steps": 2116, "total_steps": 2470, "loss": 1.4577, "lr": 6.161538185804544e-07, "epoch": 8.566801619433198, "percentage": 85.67, "elapsed_time": "1:49:09", "remaining_time": "0:18:15"} +{"current_steps": 2117, "total_steps": 2470, "loss": 1.1355, "lr": 6.127600249020216e-07, "epoch": 8.570850202429149, "percentage": 85.71, "elapsed_time": "1:49:12", "remaining_time": "0:18:12"} +{"current_steps": 2118, "total_steps": 2470, "loss": 1.2906, "lr": 6.09374993402932e-07, "epoch": 8.574898785425102, "percentage": 85.75, "elapsed_time": "1:49:15", "remaining_time": "0:18:09"} +{"current_steps": 2119, "total_steps": 2470, "loss": 1.1301, "lr": 6.059987308437565e-07, "epoch": 8.578947368421053, "percentage": 85.79, "elapsed_time": "1:49:18", "remaining_time": "0:18:06"} +{"current_steps": 2120, "total_steps": 2470, "loss": 1.2177, "lr": 6.026312439675553e-07, "epoch": 8.582995951417004, "percentage": 85.83, "elapsed_time": "1:49:21", "remaining_time": "0:18:03"} +{"current_steps": 2121, "total_steps": 2470, "loss": 1.2383, "lr": 5.992725394998594e-07, "epoch": 8.587044534412955, "percentage": 85.87, "elapsed_time": "1:49:25", "remaining_time": "0:18:00"} +{"current_steps": 2122, "total_steps": 2470, "loss": 1.2195, "lr": 5.959226241486632e-07, "epoch": 8.591093117408906, "percentage": 85.91, "elapsed_time": "1:49:28", "remaining_time": "0:17:57"} +{"current_steps": 2123, "total_steps": 2470, "loss": 0.9419, "lr": 5.925815046044026e-07, "epoch": 8.595141700404858, "percentage": 85.95, "elapsed_time": "1:49:31", "remaining_time": "0:17:54"} +{"current_steps": 2124, "total_steps": 2470, "loss": 1.0593, "lr": 5.892491875399503e-07, "epoch": 8.59919028340081, "percentage": 85.99, "elapsed_time": "1:49:34", "remaining_time": "0:17:50"} +{"current_steps": 2125, "total_steps": 2470, "loss": 1.141, "lr": 5.859256796105972e-07, "epoch": 8.603238866396762, "percentage": 86.03, "elapsed_time": "1:49:37", "remaining_time": "0:17:47"} +{"current_steps": 2126, "total_steps": 2470, "loss": 0.9086, "lr": 5.826109874540409e-07, "epoch": 8.607287449392713, "percentage": 86.07, "elapsed_time": "1:49:40", "remaining_time": "0:17:44"} +{"current_steps": 2127, "total_steps": 2470, "loss": 
1.1918, "lr": 5.793051176903736e-07, "epoch": 8.611336032388664, "percentage": 86.11, "elapsed_time": "1:49:43", "remaining_time": "0:17:41"} +{"current_steps": 2128, "total_steps": 2470, "loss": 1.2405, "lr": 5.760080769220644e-07, "epoch": 8.615384615384615, "percentage": 86.15, "elapsed_time": "1:49:46", "remaining_time": "0:17:38"} +{"current_steps": 2129, "total_steps": 2470, "loss": 0.9801, "lr": 5.727198717339511e-07, "epoch": 8.619433198380566, "percentage": 86.19, "elapsed_time": "1:49:49", "remaining_time": "0:17:35"} +{"current_steps": 2130, "total_steps": 2470, "loss": 1.3353, "lr": 5.694405086932248e-07, "epoch": 8.623481781376519, "percentage": 86.23, "elapsed_time": "1:49:52", "remaining_time": "0:17:32"} +{"current_steps": 2131, "total_steps": 2470, "loss": 1.1811, "lr": 5.661699943494181e-07, "epoch": 8.62753036437247, "percentage": 86.28, "elapsed_time": "1:49:55", "remaining_time": "0:17:29"} +{"current_steps": 2132, "total_steps": 2470, "loss": 1.4413, "lr": 5.6290833523439e-07, "epoch": 8.631578947368421, "percentage": 86.32, "elapsed_time": "1:49:58", "remaining_time": "0:17:26"} +{"current_steps": 2133, "total_steps": 2470, "loss": 1.0383, "lr": 5.596555378623126e-07, "epoch": 8.635627530364372, "percentage": 86.36, "elapsed_time": "1:50:01", "remaining_time": "0:17:23"} +{"current_steps": 2134, "total_steps": 2470, "loss": 1.3329, "lr": 5.564116087296618e-07, "epoch": 8.639676113360323, "percentage": 86.4, "elapsed_time": "1:50:04", "remaining_time": "0:17:19"} +{"current_steps": 2135, "total_steps": 2470, "loss": 1.2145, "lr": 5.531765543152002e-07, "epoch": 8.643724696356275, "percentage": 86.44, "elapsed_time": "1:50:07", "remaining_time": "0:17:16"} +{"current_steps": 2136, "total_steps": 2470, "loss": 1.3227, "lr": 5.499503810799667e-07, "epoch": 8.647773279352228, "percentage": 86.48, "elapsed_time": "1:50:10", "remaining_time": "0:17:13"} +{"current_steps": 2137, "total_steps": 2470, "loss": 1.1114, "lr": 5.467330954672639e-07, "epoch": 8.651821862348179, "percentage": 86.52, "elapsed_time": "1:50:13", "remaining_time": "0:17:10"} +{"current_steps": 2138, "total_steps": 2470, "loss": 1.3092, "lr": 5.435247039026398e-07, "epoch": 8.65587044534413, "percentage": 86.56, "elapsed_time": "1:50:16", "remaining_time": "0:17:07"} +{"current_steps": 2139, "total_steps": 2470, "loss": 1.1114, "lr": 5.403252127938841e-07, "epoch": 8.65991902834008, "percentage": 86.6, "elapsed_time": "1:50:20", "remaining_time": "0:17:04"} +{"current_steps": 2140, "total_steps": 2470, "loss": 1.2813, "lr": 5.371346285310075e-07, "epoch": 8.663967611336032, "percentage": 86.64, "elapsed_time": "1:50:23", "remaining_time": "0:17:01"} +{"current_steps": 2141, "total_steps": 2470, "loss": 1.2645, "lr": 5.33952957486234e-07, "epoch": 8.668016194331983, "percentage": 86.68, "elapsed_time": "1:50:26", "remaining_time": "0:16:58"} +{"current_steps": 2142, "total_steps": 2470, "loss": 1.4488, "lr": 5.30780206013985e-07, "epoch": 8.672064777327936, "percentage": 86.72, "elapsed_time": "1:50:29", "remaining_time": "0:16:55"} +{"current_steps": 2143, "total_steps": 2470, "loss": 1.436, "lr": 5.276163804508671e-07, "epoch": 8.676113360323887, "percentage": 86.76, "elapsed_time": "1:50:32", "remaining_time": "0:16:52"} +{"current_steps": 2144, "total_steps": 2470, "loss": 1.2596, "lr": 5.244614871156612e-07, "epoch": 8.680161943319838, "percentage": 86.8, "elapsed_time": "1:50:35", "remaining_time": "0:16:48"} +{"current_steps": 2145, "total_steps": 2470, "loss": 1.1446, "lr": 5.213155323093094e-07, 
"epoch": 8.68421052631579, "percentage": 86.84, "elapsed_time": "1:50:38", "remaining_time": "0:16:45"} +{"current_steps": 2146, "total_steps": 2470, "loss": 1.2253, "lr": 5.181785223148999e-07, "epoch": 8.68825910931174, "percentage": 86.88, "elapsed_time": "1:50:41", "remaining_time": "0:16:42"} +{"current_steps": 2147, "total_steps": 2470, "loss": 1.2426, "lr": 5.150504633976572e-07, "epoch": 8.692307692307692, "percentage": 86.92, "elapsed_time": "1:50:44", "remaining_time": "0:16:39"} +{"current_steps": 2148, "total_steps": 2470, "loss": 1.126, "lr": 5.119313618049309e-07, "epoch": 8.696356275303645, "percentage": 86.96, "elapsed_time": "1:50:47", "remaining_time": "0:16:36"} +{"current_steps": 2149, "total_steps": 2470, "loss": 1.5606, "lr": 5.088212237661766e-07, "epoch": 8.700404858299596, "percentage": 87.0, "elapsed_time": "1:50:50", "remaining_time": "0:16:33"} +{"current_steps": 2150, "total_steps": 2470, "loss": 1.3713, "lr": 5.057200554929509e-07, "epoch": 8.704453441295547, "percentage": 87.04, "elapsed_time": "1:50:53", "remaining_time": "0:16:30"} +{"current_steps": 2151, "total_steps": 2470, "loss": 1.001, "lr": 5.026278631788967e-07, "epoch": 8.708502024291498, "percentage": 87.09, "elapsed_time": "1:50:56", "remaining_time": "0:16:27"} +{"current_steps": 2152, "total_steps": 2470, "loss": 1.1171, "lr": 4.995446529997283e-07, "epoch": 8.712550607287449, "percentage": 87.13, "elapsed_time": "1:50:59", "remaining_time": "0:16:24"} +{"current_steps": 2153, "total_steps": 2470, "loss": 1.2367, "lr": 4.964704311132224e-07, "epoch": 8.7165991902834, "percentage": 87.17, "elapsed_time": "1:51:02", "remaining_time": "0:16:21"} +{"current_steps": 2154, "total_steps": 2470, "loss": 0.9859, "lr": 4.934052036592018e-07, "epoch": 8.720647773279353, "percentage": 87.21, "elapsed_time": "1:51:06", "remaining_time": "0:16:17"} +{"current_steps": 2155, "total_steps": 2470, "loss": 1.2321, "lr": 4.903489767595287e-07, "epoch": 8.724696356275304, "percentage": 87.25, "elapsed_time": "1:51:09", "remaining_time": "0:16:14"} +{"current_steps": 2156, "total_steps": 2470, "loss": 1.3846, "lr": 4.873017565180871e-07, "epoch": 8.728744939271255, "percentage": 87.29, "elapsed_time": "1:51:12", "remaining_time": "0:16:11"} +{"current_steps": 2157, "total_steps": 2470, "loss": 1.0707, "lr": 4.842635490207747e-07, "epoch": 8.732793522267206, "percentage": 87.33, "elapsed_time": "1:51:15", "remaining_time": "0:16:08"} +{"current_steps": 2158, "total_steps": 2470, "loss": 1.1865, "lr": 4.812343603354896e-07, "epoch": 8.736842105263158, "percentage": 87.37, "elapsed_time": "1:51:18", "remaining_time": "0:16:05"} +{"current_steps": 2159, "total_steps": 2470, "loss": 1.8135, "lr": 4.782141965121129e-07, "epoch": 8.740890688259109, "percentage": 87.41, "elapsed_time": "1:51:21", "remaining_time": "0:16:02"} +{"current_steps": 2160, "total_steps": 2470, "loss": 1.3722, "lr": 4.752030635825067e-07, "epoch": 8.744939271255062, "percentage": 87.45, "elapsed_time": "1:51:24", "remaining_time": "0:15:59"} +{"current_steps": 2161, "total_steps": 2470, "loss": 1.1621, "lr": 4.7220096756049384e-07, "epoch": 8.748987854251013, "percentage": 87.49, "elapsed_time": "1:51:27", "remaining_time": "0:15:56"} +{"current_steps": 2162, "total_steps": 2470, "loss": 1.0939, "lr": 4.6920791444184934e-07, "epoch": 8.753036437246964, "percentage": 87.53, "elapsed_time": "1:51:30", "remaining_time": "0:15:53"} +{"current_steps": 2163, "total_steps": 2470, "loss": 1.1937, "lr": 4.662239102042887e-07, "epoch": 8.757085020242915, 
"percentage": 87.57, "elapsed_time": "1:51:33", "remaining_time": "0:15:50"} +{"current_steps": 2164, "total_steps": 2470, "loss": 1.9407, "lr": 4.6324896080745254e-07, "epoch": 8.761133603238866, "percentage": 87.61, "elapsed_time": "1:51:36", "remaining_time": "0:15:46"} +{"current_steps": 2165, "total_steps": 2470, "loss": 1.6736, "lr": 4.602830721928997e-07, "epoch": 8.765182186234817, "percentage": 87.65, "elapsed_time": "1:51:39", "remaining_time": "0:15:43"} +{"current_steps": 2166, "total_steps": 2470, "loss": 1.6042, "lr": 4.573262502840914e-07, "epoch": 8.76923076923077, "percentage": 87.69, "elapsed_time": "1:51:42", "remaining_time": "0:15:40"} +{"current_steps": 2167, "total_steps": 2470, "loss": 1.144, "lr": 4.54378500986381e-07, "epoch": 8.773279352226721, "percentage": 87.73, "elapsed_time": "1:51:45", "remaining_time": "0:15:37"} +{"current_steps": 2168, "total_steps": 2470, "loss": 1.4426, "lr": 4.5143983018700485e-07, "epoch": 8.777327935222672, "percentage": 87.77, "elapsed_time": "1:51:48", "remaining_time": "0:15:34"} +{"current_steps": 2169, "total_steps": 2470, "loss": 1.4161, "lr": 4.48510243755062e-07, "epoch": 8.781376518218623, "percentage": 87.81, "elapsed_time": "1:51:51", "remaining_time": "0:15:31"} +{"current_steps": 2170, "total_steps": 2470, "loss": 1.2124, "lr": 4.455897475415133e-07, "epoch": 8.785425101214575, "percentage": 87.85, "elapsed_time": "1:51:54", "remaining_time": "0:15:28"} +{"current_steps": 2171, "total_steps": 2470, "loss": 1.1482, "lr": 4.4267834737916295e-07, "epoch": 8.789473684210526, "percentage": 87.89, "elapsed_time": "1:51:57", "remaining_time": "0:15:25"} +{"current_steps": 2172, "total_steps": 2470, "loss": 1.1481, "lr": 4.39776049082648e-07, "epoch": 8.793522267206479, "percentage": 87.94, "elapsed_time": "1:52:00", "remaining_time": "0:15:22"} +{"current_steps": 2173, "total_steps": 2470, "loss": 1.1969, "lr": 4.3688285844842747e-07, "epoch": 8.79757085020243, "percentage": 87.98, "elapsed_time": "1:52:03", "remaining_time": "0:15:18"} +{"current_steps": 2174, "total_steps": 2470, "loss": 1.101, "lr": 4.33998781254773e-07, "epoch": 8.80161943319838, "percentage": 88.02, "elapsed_time": "1:52:06", "remaining_time": "0:15:15"} +{"current_steps": 2175, "total_steps": 2470, "loss": 1.0094, "lr": 4.3112382326174987e-07, "epoch": 8.805668016194332, "percentage": 88.06, "elapsed_time": "1:52:09", "remaining_time": "0:15:12"} +{"current_steps": 2176, "total_steps": 2470, "loss": 1.3299, "lr": 4.2825799021121493e-07, "epoch": 8.809716599190283, "percentage": 88.1, "elapsed_time": "1:52:12", "remaining_time": "0:15:09"} +{"current_steps": 2177, "total_steps": 2470, "loss": 1.1159, "lr": 4.2540128782679934e-07, "epoch": 8.813765182186234, "percentage": 88.14, "elapsed_time": "1:52:15", "remaining_time": "0:15:06"} +{"current_steps": 2178, "total_steps": 2470, "loss": 1.0554, "lr": 4.225537218138981e-07, "epoch": 8.817813765182187, "percentage": 88.18, "elapsed_time": "1:52:18", "remaining_time": "0:15:03"} +{"current_steps": 2179, "total_steps": 2470, "loss": 2.1386, "lr": 4.197152978596608e-07, "epoch": 8.821862348178138, "percentage": 88.22, "elapsed_time": "1:52:21", "remaining_time": "0:15:00"} +{"current_steps": 2180, "total_steps": 2470, "loss": 3.054, "lr": 4.1688602163297564e-07, "epoch": 8.82591093117409, "percentage": 88.26, "elapsed_time": "1:52:24", "remaining_time": "0:14:57"} +{"current_steps": 2181, "total_steps": 2470, "loss": 1.0687, "lr": 4.1406589878446257e-07, "epoch": 8.82995951417004, "percentage": 88.3, 
"elapsed_time": "1:52:27", "remaining_time": "0:14:54"} +{"current_steps": 2182, "total_steps": 2470, "loss": 1.046, "lr": 4.112549349464606e-07, "epoch": 8.834008097165992, "percentage": 88.34, "elapsed_time": "1:52:30", "remaining_time": "0:14:51"} +{"current_steps": 2183, "total_steps": 2470, "loss": 1.0509, "lr": 4.0845313573301736e-07, "epoch": 8.838056680161943, "percentage": 88.38, "elapsed_time": "1:52:33", "remaining_time": "0:14:47"} +{"current_steps": 2184, "total_steps": 2470, "loss": 1.1568, "lr": 4.05660506739875e-07, "epoch": 8.842105263157894, "percentage": 88.42, "elapsed_time": "1:52:36", "remaining_time": "0:14:44"} +{"current_steps": 2185, "total_steps": 2470, "loss": 0.9318, "lr": 4.0287705354446147e-07, "epoch": 8.846153846153847, "percentage": 88.46, "elapsed_time": "1:52:39", "remaining_time": "0:14:41"} +{"current_steps": 2186, "total_steps": 2470, "loss": 1.1372, "lr": 4.001027817058789e-07, "epoch": 8.850202429149798, "percentage": 88.5, "elapsed_time": "1:52:42", "remaining_time": "0:14:38"} +{"current_steps": 2187, "total_steps": 2470, "loss": 1.1666, "lr": 3.973376967648934e-07, "epoch": 8.854251012145749, "percentage": 88.54, "elapsed_time": "1:52:45", "remaining_time": "0:14:35"} +{"current_steps": 2188, "total_steps": 2470, "loss": 1.126, "lr": 3.945818042439226e-07, "epoch": 8.8582995951417, "percentage": 88.58, "elapsed_time": "1:52:48", "remaining_time": "0:14:32"} +{"current_steps": 2189, "total_steps": 2470, "loss": 1.1207, "lr": 3.9183510964702463e-07, "epoch": 8.862348178137651, "percentage": 88.62, "elapsed_time": "1:52:51", "remaining_time": "0:14:29"} +{"current_steps": 2190, "total_steps": 2470, "loss": 1.0898, "lr": 3.890976184598866e-07, "epoch": 8.866396761133604, "percentage": 88.66, "elapsed_time": "1:52:54", "remaining_time": "0:14:26"} +{"current_steps": 2191, "total_steps": 2470, "loss": 1.4988, "lr": 3.863693361498161e-07, "epoch": 8.870445344129555, "percentage": 88.7, "elapsed_time": "1:52:57", "remaining_time": "0:14:23"} +{"current_steps": 2192, "total_steps": 2470, "loss": 1.4457, "lr": 3.836502681657289e-07, "epoch": 8.874493927125506, "percentage": 88.74, "elapsed_time": "1:53:00", "remaining_time": "0:14:19"} +{"current_steps": 2193, "total_steps": 2470, "loss": 1.2321, "lr": 3.809404199381378e-07, "epoch": 8.878542510121457, "percentage": 88.79, "elapsed_time": "1:53:03", "remaining_time": "0:14:16"} +{"current_steps": 2194, "total_steps": 2470, "loss": 1.1646, "lr": 3.7823979687914125e-07, "epoch": 8.882591093117409, "percentage": 88.83, "elapsed_time": "1:53:06", "remaining_time": "0:14:13"} +{"current_steps": 2195, "total_steps": 2470, "loss": 0.8228, "lr": 3.755484043824131e-07, "epoch": 8.88663967611336, "percentage": 88.87, "elapsed_time": "1:53:09", "remaining_time": "0:14:10"} +{"current_steps": 2196, "total_steps": 2470, "loss": 1.3459, "lr": 3.728662478231926e-07, "epoch": 8.89068825910931, "percentage": 88.91, "elapsed_time": "1:53:12", "remaining_time": "0:14:07"} +{"current_steps": 2197, "total_steps": 2470, "loss": 1.3481, "lr": 3.7019333255827404e-07, "epoch": 8.894736842105264, "percentage": 88.95, "elapsed_time": "1:53:15", "remaining_time": "0:14:04"} +{"current_steps": 2198, "total_steps": 2470, "loss": 1.1434, "lr": 3.675296639259912e-07, "epoch": 8.898785425101215, "percentage": 88.99, "elapsed_time": "1:53:18", "remaining_time": "0:14:01"} +{"current_steps": 2199, "total_steps": 2470, "loss": 1.1156, "lr": 3.6487524724621526e-07, "epoch": 8.902834008097166, "percentage": 89.03, "elapsed_time": "1:53:21", 
"remaining_time": "0:13:58"} +{"current_steps": 2200, "total_steps": 2470, "loss": 1.1401, "lr": 3.6223008782033773e-07, "epoch": 8.906882591093117, "percentage": 89.07, "elapsed_time": "1:53:24", "remaining_time": "0:13:55"} +{"current_steps": 2201, "total_steps": 2470, "loss": 0.9237, "lr": 3.595941909312595e-07, "epoch": 8.910931174089068, "percentage": 89.11, "elapsed_time": "1:53:27", "remaining_time": "0:13:52"} +{"current_steps": 2202, "total_steps": 2470, "loss": 0.9947, "lr": 3.569675618433849e-07, "epoch": 8.914979757085021, "percentage": 89.15, "elapsed_time": "1:53:31", "remaining_time": "0:13:48"} +{"current_steps": 2203, "total_steps": 2470, "loss": 0.9978, "lr": 3.543502058026071e-07, "epoch": 8.919028340080972, "percentage": 89.19, "elapsed_time": "1:53:34", "remaining_time": "0:13:45"} +{"current_steps": 2204, "total_steps": 2470, "loss": 1.324, "lr": 3.517421280363004e-07, "epoch": 8.923076923076923, "percentage": 89.23, "elapsed_time": "1:53:37", "remaining_time": "0:13:42"} +{"current_steps": 2205, "total_steps": 2470, "loss": 1.1632, "lr": 3.49143333753309e-07, "epoch": 8.927125506072874, "percentage": 89.27, "elapsed_time": "1:53:40", "remaining_time": "0:13:39"} +{"current_steps": 2206, "total_steps": 2470, "loss": 1.1421, "lr": 3.4655382814393346e-07, "epoch": 8.931174089068826, "percentage": 89.31, "elapsed_time": "1:53:43", "remaining_time": "0:13:36"} +{"current_steps": 2207, "total_steps": 2470, "loss": 0.9506, "lr": 3.439736163799251e-07, "epoch": 8.935222672064777, "percentage": 89.35, "elapsed_time": "1:53:46", "remaining_time": "0:13:33"} +{"current_steps": 2208, "total_steps": 2470, "loss": 0.9707, "lr": 3.4140270361447405e-07, "epoch": 8.939271255060728, "percentage": 89.39, "elapsed_time": "1:53:49", "remaining_time": "0:13:30"} +{"current_steps": 2209, "total_steps": 2470, "loss": 1.2587, "lr": 3.388410949821969e-07, "epoch": 8.94331983805668, "percentage": 89.43, "elapsed_time": "1:53:52", "remaining_time": "0:13:27"} +{"current_steps": 2210, "total_steps": 2470, "loss": 0.9956, "lr": 3.362887955991301e-07, "epoch": 8.947368421052632, "percentage": 89.47, "elapsed_time": "1:53:55", "remaining_time": "0:13:24"} +{"current_steps": 2211, "total_steps": 2470, "loss": 0.8958, "lr": 3.337458105627145e-07, "epoch": 8.951417004048583, "percentage": 89.51, "elapsed_time": "1:53:58", "remaining_time": "0:13:21"} +{"current_steps": 2212, "total_steps": 2470, "loss": 1.2205, "lr": 3.3121214495179187e-07, "epoch": 8.955465587044534, "percentage": 89.55, "elapsed_time": "1:54:01", "remaining_time": "0:13:17"} +{"current_steps": 2213, "total_steps": 2470, "loss": 1.306, "lr": 3.2868780382658895e-07, "epoch": 8.959514170040485, "percentage": 89.6, "elapsed_time": "1:54:04", "remaining_time": "0:13:14"} +{"current_steps": 2214, "total_steps": 2470, "loss": 1.237, "lr": 3.261727922287111e-07, "epoch": 8.963562753036438, "percentage": 89.64, "elapsed_time": "1:54:07", "remaining_time": "0:13:11"} +{"current_steps": 2215, "total_steps": 2470, "loss": 1.2228, "lr": 3.236671151811305e-07, "epoch": 8.96761133603239, "percentage": 89.68, "elapsed_time": "1:54:10", "remaining_time": "0:13:08"} +{"current_steps": 2216, "total_steps": 2470, "loss": 1.1567, "lr": 3.2117077768817395e-07, "epoch": 8.97165991902834, "percentage": 89.72, "elapsed_time": "1:54:13", "remaining_time": "0:13:05"} +{"current_steps": 2217, "total_steps": 2470, "loss": 1.2206, "lr": 3.1868378473551953e-07, "epoch": 8.975708502024291, "percentage": 89.76, "elapsed_time": "1:54:16", "remaining_time": 
"0:13:02"} +{"current_steps": 2218, "total_steps": 2470, "loss": 1.2698, "lr": 3.16206141290179e-07, "epoch": 8.979757085020243, "percentage": 89.8, "elapsed_time": "1:54:19", "remaining_time": "0:12:59"} +{"current_steps": 2219, "total_steps": 2470, "loss": 1.3288, "lr": 3.1373785230049356e-07, "epoch": 8.983805668016194, "percentage": 89.84, "elapsed_time": "1:54:22", "remaining_time": "0:12:56"} +{"current_steps": 2220, "total_steps": 2470, "loss": 1.1248, "lr": 3.1127892269612103e-07, "epoch": 8.987854251012145, "percentage": 89.88, "elapsed_time": "1:54:25", "remaining_time": "0:12:53"} +{"current_steps": 2221, "total_steps": 2470, "loss": 1.1403, "lr": 3.0882935738802467e-07, "epoch": 8.991902834008098, "percentage": 89.92, "elapsed_time": "1:54:28", "remaining_time": "0:12:50"} +{"current_steps": 2222, "total_steps": 2470, "loss": 1.4643, "lr": 3.0638916126846885e-07, "epoch": 8.995951417004049, "percentage": 89.96, "elapsed_time": "1:54:31", "remaining_time": "0:12:46"} +{"current_steps": 2223, "total_steps": 2470, "loss": 1.163, "lr": 3.039583392110046e-07, "epoch": 9.0, "percentage": 90.0, "elapsed_time": "1:54:34", "remaining_time": "0:12:43"} +{"current_steps": 2224, "total_steps": 2470, "loss": 1.192, "lr": 3.015368960704584e-07, "epoch": 9.004048582995951, "percentage": 90.04, "elapsed_time": "1:54:37", "remaining_time": "0:12:40"} +{"current_steps": 2225, "total_steps": 2470, "loss": 1.3682, "lr": 2.991248366829291e-07, "epoch": 9.008097165991902, "percentage": 90.08, "elapsed_time": "1:54:40", "remaining_time": "0:12:37"} +{"current_steps": 2226, "total_steps": 2470, "loss": 1.1359, "lr": 2.9672216586577317e-07, "epoch": 9.012145748987853, "percentage": 90.12, "elapsed_time": "1:54:43", "remaining_time": "0:12:34"} +{"current_steps": 2227, "total_steps": 2470, "loss": 1.3759, "lr": 2.9432888841759434e-07, "epoch": 9.016194331983806, "percentage": 90.16, "elapsed_time": "1:54:46", "remaining_time": "0:12:31"} +{"current_steps": 2228, "total_steps": 2470, "loss": 1.5184, "lr": 2.91945009118238e-07, "epoch": 9.020242914979757, "percentage": 90.2, "elapsed_time": "1:54:49", "remaining_time": "0:12:28"} +{"current_steps": 2229, "total_steps": 2470, "loss": 1.4498, "lr": 2.8957053272877957e-07, "epoch": 9.024291497975709, "percentage": 90.24, "elapsed_time": "1:54:52", "remaining_time": "0:12:25"} +{"current_steps": 2230, "total_steps": 2470, "loss": 1.1665, "lr": 2.8720546399151395e-07, "epoch": 9.02834008097166, "percentage": 90.28, "elapsed_time": "1:54:55", "remaining_time": "0:12:22"} +{"current_steps": 2231, "total_steps": 2470, "loss": 1.1768, "lr": 2.848498076299483e-07, "epoch": 9.03238866396761, "percentage": 90.32, "elapsed_time": "1:54:58", "remaining_time": "0:12:18"} +{"current_steps": 2232, "total_steps": 2470, "loss": 1.3754, "lr": 2.8250356834878924e-07, "epoch": 9.036437246963562, "percentage": 90.36, "elapsed_time": "1:55:01", "remaining_time": "0:12:15"} +{"current_steps": 2233, "total_steps": 2470, "loss": 1.1804, "lr": 2.801667508339384e-07, "epoch": 9.040485829959515, "percentage": 90.4, "elapsed_time": "1:55:04", "remaining_time": "0:12:12"} +{"current_steps": 2234, "total_steps": 2470, "loss": 1.3509, "lr": 2.7783935975247867e-07, "epoch": 9.044534412955466, "percentage": 90.45, "elapsed_time": "1:55:07", "remaining_time": "0:12:09"} +{"current_steps": 2235, "total_steps": 2470, "loss": 1.1764, "lr": 2.7552139975266677e-07, "epoch": 9.048582995951417, "percentage": 90.49, "elapsed_time": "1:55:10", "remaining_time": "0:12:06"} +{"current_steps": 2236, 
"total_steps": 2470, "loss": 1.2262, "lr": 2.732128754639246e-07, "epoch": 9.052631578947368, "percentage": 90.53, "elapsed_time": "1:55:13", "remaining_time": "0:12:03"} +{"current_steps": 2237, "total_steps": 2470, "loss": 1.0832, "lr": 2.7091379149682683e-07, "epoch": 9.05668016194332, "percentage": 90.57, "elapsed_time": "1:55:16", "remaining_time": "0:12:00"} +{"current_steps": 2238, "total_steps": 2470, "loss": 1.2281, "lr": 2.68624152443096e-07, "epoch": 9.06072874493927, "percentage": 90.61, "elapsed_time": "1:55:19", "remaining_time": "0:11:57"} +{"current_steps": 2239, "total_steps": 2470, "loss": 1.1544, "lr": 2.6634396287559094e-07, "epoch": 9.064777327935223, "percentage": 90.65, "elapsed_time": "1:55:22", "remaining_time": "0:11:54"} +{"current_steps": 2240, "total_steps": 2470, "loss": 1.3331, "lr": 2.6407322734829763e-07, "epoch": 9.068825910931174, "percentage": 90.69, "elapsed_time": "1:55:25", "remaining_time": "0:11:51"} +{"current_steps": 2241, "total_steps": 2470, "loss": 1.2182, "lr": 2.6181195039632123e-07, "epoch": 9.072874493927126, "percentage": 90.73, "elapsed_time": "1:55:28", "remaining_time": "0:11:47"} +{"current_steps": 2242, "total_steps": 2470, "loss": 1.1883, "lr": 2.5956013653587465e-07, "epoch": 9.076923076923077, "percentage": 90.77, "elapsed_time": "1:55:30", "remaining_time": "0:11:44"} +{"current_steps": 2243, "total_steps": 2470, "loss": 1.3277, "lr": 2.573177902642726e-07, "epoch": 9.080971659919028, "percentage": 90.81, "elapsed_time": "1:55:33", "remaining_time": "0:11:41"} +{"current_steps": 2244, "total_steps": 2470, "loss": 1.2689, "lr": 2.5508491605992003e-07, "epoch": 9.085020242914979, "percentage": 90.85, "elapsed_time": "1:55:36", "remaining_time": "0:11:38"} +{"current_steps": 2245, "total_steps": 2470, "loss": 1.2173, "lr": 2.528615183823058e-07, "epoch": 9.089068825910932, "percentage": 90.89, "elapsed_time": "1:55:39", "remaining_time": "0:11:35"} +{"current_steps": 2246, "total_steps": 2470, "loss": 1.3017, "lr": 2.506476016719922e-07, "epoch": 9.093117408906883, "percentage": 90.93, "elapsed_time": "1:55:42", "remaining_time": "0:11:32"} +{"current_steps": 2247, "total_steps": 2470, "loss": 1.0426, "lr": 2.4844317035060407e-07, "epoch": 9.097165991902834, "percentage": 90.97, "elapsed_time": "1:55:45", "remaining_time": "0:11:29"} +{"current_steps": 2248, "total_steps": 2470, "loss": 1.2441, "lr": 2.462482288208234e-07, "epoch": 9.101214574898785, "percentage": 91.01, "elapsed_time": "1:55:48", "remaining_time": "0:11:26"} +{"current_steps": 2249, "total_steps": 2470, "loss": 1.1408, "lr": 2.440627814663804e-07, "epoch": 9.105263157894736, "percentage": 91.05, "elapsed_time": "1:55:51", "remaining_time": "0:11:23"} +{"current_steps": 2250, "total_steps": 2470, "loss": 1.1815, "lr": 2.4188683265204125e-07, "epoch": 9.109311740890687, "percentage": 91.09, "elapsed_time": "1:55:55", "remaining_time": "0:11:20"} +{"current_steps": 2251, "total_steps": 2470, "loss": 1.1018, "lr": 2.397203867236031e-07, "epoch": 9.11336032388664, "percentage": 91.13, "elapsed_time": "1:55:58", "remaining_time": "0:11:16"} +{"current_steps": 2252, "total_steps": 2470, "loss": 1.0407, "lr": 2.3756344800788421e-07, "epoch": 9.117408906882591, "percentage": 91.17, "elapsed_time": "1:56:01", "remaining_time": "0:11:13"} +{"current_steps": 2253, "total_steps": 2470, "loss": 1.1121, "lr": 2.354160208127143e-07, "epoch": 9.121457489878543, "percentage": 91.21, "elapsed_time": "1:56:04", "remaining_time": "0:11:10"} +{"current_steps": 2254, "total_steps": 2470, 
"loss": 1.1386, "lr": 2.3327810942692653e-07, "epoch": 9.125506072874494, "percentage": 91.26, "elapsed_time": "1:56:07", "remaining_time": "0:11:07"} +{"current_steps": 2255, "total_steps": 2470, "loss": 0.9948, "lr": 2.3114971812034981e-07, "epoch": 9.129554655870445, "percentage": 91.3, "elapsed_time": "1:56:10", "remaining_time": "0:11:04"} +{"current_steps": 2256, "total_steps": 2470, "loss": 1.3591, "lr": 2.290308511437994e-07, "epoch": 9.133603238866396, "percentage": 91.34, "elapsed_time": "1:56:13", "remaining_time": "0:11:01"} +{"current_steps": 2257, "total_steps": 2470, "loss": 1.1552, "lr": 2.2692151272906916e-07, "epoch": 9.137651821862349, "percentage": 91.38, "elapsed_time": "1:56:16", "remaining_time": "0:10:58"} +{"current_steps": 2258, "total_steps": 2470, "loss": 0.9343, "lr": 2.2482170708892083e-07, "epoch": 9.1417004048583, "percentage": 91.42, "elapsed_time": "1:56:19", "remaining_time": "0:10:55"} +{"current_steps": 2259, "total_steps": 2470, "loss": 1.2456, "lr": 2.2273143841707922e-07, "epoch": 9.145748987854251, "percentage": 91.46, "elapsed_time": "1:56:22", "remaining_time": "0:10:52"} +{"current_steps": 2260, "total_steps": 2470, "loss": 1.0812, "lr": 2.2065071088822055e-07, "epoch": 9.149797570850202, "percentage": 91.5, "elapsed_time": "1:56:25", "remaining_time": "0:10:49"} +{"current_steps": 2261, "total_steps": 2470, "loss": 1.3311, "lr": 2.1857952865796616e-07, "epoch": 9.153846153846153, "percentage": 91.54, "elapsed_time": "1:56:28", "remaining_time": "0:10:46"} +{"current_steps": 2262, "total_steps": 2470, "loss": 1.3046, "lr": 2.1651789586287442e-07, "epoch": 9.157894736842104, "percentage": 91.58, "elapsed_time": "1:56:31", "remaining_time": "0:10:42"} +{"current_steps": 2263, "total_steps": 2470, "loss": 1.1448, "lr": 2.1446581662042943e-07, "epoch": 9.161943319838057, "percentage": 91.62, "elapsed_time": "1:56:34", "remaining_time": "0:10:39"} +{"current_steps": 2264, "total_steps": 2470, "loss": 1.2402, "lr": 2.124232950290367e-07, "epoch": 9.165991902834008, "percentage": 91.66, "elapsed_time": "1:56:37", "remaining_time": "0:10:36"} +{"current_steps": 2265, "total_steps": 2470, "loss": 1.8377, "lr": 2.1039033516801255e-07, "epoch": 9.17004048582996, "percentage": 91.7, "elapsed_time": "1:56:40", "remaining_time": "0:10:33"} +{"current_steps": 2266, "total_steps": 2470, "loss": 1.0402, "lr": 2.0836694109757748e-07, "epoch": 9.17408906882591, "percentage": 91.74, "elapsed_time": "1:56:43", "remaining_time": "0:10:30"} +{"current_steps": 2267, "total_steps": 2470, "loss": 1.2518, "lr": 2.0635311685884675e-07, "epoch": 9.178137651821862, "percentage": 91.78, "elapsed_time": "1:56:46", "remaining_time": "0:10:27"} +{"current_steps": 2268, "total_steps": 2470, "loss": 1.0571, "lr": 2.0434886647382135e-07, "epoch": 9.182186234817813, "percentage": 91.82, "elapsed_time": "1:56:49", "remaining_time": "0:10:24"} +{"current_steps": 2269, "total_steps": 2470, "loss": 1.2413, "lr": 2.0235419394538324e-07, "epoch": 9.186234817813766, "percentage": 91.86, "elapsed_time": "1:56:52", "remaining_time": "0:10:21"} +{"current_steps": 2270, "total_steps": 2470, "loss": 1.2594, "lr": 2.0036910325728521e-07, "epoch": 9.190283400809717, "percentage": 91.9, "elapsed_time": "1:56:55", "remaining_time": "0:10:18"} +{"current_steps": 2271, "total_steps": 2470, "loss": 1.4279, "lr": 1.9839359837414308e-07, "epoch": 9.194331983805668, "percentage": 91.94, "elapsed_time": "1:56:58", "remaining_time": "0:10:15"} +{"current_steps": 2272, "total_steps": 2470, "loss": 1.493, 
"lr": 1.9642768324142803e-07, "epoch": 9.19838056680162, "percentage": 91.98, "elapsed_time": "1:57:01", "remaining_time": "0:10:11"} +{"current_steps": 2273, "total_steps": 2470, "loss": 1.1961, "lr": 1.9447136178545766e-07, "epoch": 9.20242914979757, "percentage": 92.02, "elapsed_time": "1:57:04", "remaining_time": "0:10:08"} +{"current_steps": 2274, "total_steps": 2470, "loss": 1.5216, "lr": 1.9252463791339048e-07, "epoch": 9.206477732793521, "percentage": 92.06, "elapsed_time": "1:57:07", "remaining_time": "0:10:05"} +{"current_steps": 2275, "total_steps": 2470, "loss": 1.5102, "lr": 1.9058751551321642e-07, "epoch": 9.210526315789474, "percentage": 92.11, "elapsed_time": "1:57:10", "remaining_time": "0:10:02"} +{"current_steps": 2276, "total_steps": 2470, "loss": 1.2951, "lr": 1.8865999845374794e-07, "epoch": 9.214574898785425, "percentage": 92.15, "elapsed_time": "1:57:13", "remaining_time": "0:09:59"} +{"current_steps": 2277, "total_steps": 2470, "loss": 1.1742, "lr": 1.8674209058461624e-07, "epoch": 9.218623481781377, "percentage": 92.19, "elapsed_time": "1:57:16", "remaining_time": "0:09:56"} +{"current_steps": 2278, "total_steps": 2470, "loss": 1.1273, "lr": 1.8483379573625948e-07, "epoch": 9.222672064777328, "percentage": 92.23, "elapsed_time": "1:57:19", "remaining_time": "0:09:53"} +{"current_steps": 2279, "total_steps": 2470, "loss": 1.3395, "lr": 1.8293511771991624e-07, "epoch": 9.226720647773279, "percentage": 92.27, "elapsed_time": "1:57:22", "remaining_time": "0:09:50"} +{"current_steps": 2280, "total_steps": 2470, "loss": 1.1154, "lr": 1.8104606032761985e-07, "epoch": 9.23076923076923, "percentage": 92.31, "elapsed_time": "1:57:25", "remaining_time": "0:09:47"} +{"current_steps": 2281, "total_steps": 2470, "loss": 1.3337, "lr": 1.7916662733218848e-07, "epoch": 9.234817813765183, "percentage": 92.35, "elapsed_time": "1:57:28", "remaining_time": "0:09:44"} +{"current_steps": 2282, "total_steps": 2470, "loss": 1.2648, "lr": 1.7729682248721848e-07, "epoch": 9.238866396761134, "percentage": 92.39, "elapsed_time": "1:57:31", "remaining_time": "0:09:40"} +{"current_steps": 2283, "total_steps": 2470, "loss": 1.3006, "lr": 1.7543664952707817e-07, "epoch": 9.242914979757085, "percentage": 92.43, "elapsed_time": "1:57:34", "remaining_time": "0:09:37"} +{"current_steps": 2284, "total_steps": 2470, "loss": 1.1748, "lr": 1.7358611216689692e-07, "epoch": 9.246963562753036, "percentage": 92.47, "elapsed_time": "1:57:38", "remaining_time": "0:09:34"} +{"current_steps": 2285, "total_steps": 2470, "loss": 1.2565, "lr": 1.7174521410256162e-07, "epoch": 9.251012145748987, "percentage": 92.51, "elapsed_time": "1:57:41", "remaining_time": "0:09:31"} +{"current_steps": 2286, "total_steps": 2470, "loss": 1.196, "lr": 1.6991395901070685e-07, "epoch": 9.255060728744938, "percentage": 92.55, "elapsed_time": "1:57:44", "remaining_time": "0:09:28"} +{"current_steps": 2287, "total_steps": 2470, "loss": 1.1668, "lr": 1.6809235054870865e-07, "epoch": 9.259109311740891, "percentage": 92.59, "elapsed_time": "1:57:47", "remaining_time": "0:09:25"} +{"current_steps": 2288, "total_steps": 2470, "loss": 0.9512, "lr": 1.6628039235467686e-07, "epoch": 9.263157894736842, "percentage": 92.63, "elapsed_time": "1:57:50", "remaining_time": "0:09:22"} +{"current_steps": 2289, "total_steps": 2470, "loss": 1.251, "lr": 1.6447808804744668e-07, "epoch": 9.267206477732794, "percentage": 92.67, "elapsed_time": "1:57:53", "remaining_time": "0:09:19"} +{"current_steps": 2290, "total_steps": 2470, "loss": 1.2493, "lr": 
1.6268544122657437e-07, "epoch": 9.271255060728745, "percentage": 92.71, "elapsed_time": "1:57:56", "remaining_time": "0:09:16"} +{"current_steps": 2291, "total_steps": 2470, "loss": 1.3238, "lr": 1.6090245547232707e-07, "epoch": 9.275303643724696, "percentage": 92.75, "elapsed_time": "1:57:59", "remaining_time": "0:09:13"} +{"current_steps": 2292, "total_steps": 2470, "loss": 1.1986, "lr": 1.5912913434567746e-07, "epoch": 9.279352226720647, "percentage": 92.79, "elapsed_time": "1:58:02", "remaining_time": "0:09:10"} +{"current_steps": 2293, "total_steps": 2470, "loss": 0.9094, "lr": 1.5736548138829632e-07, "epoch": 9.2834008097166, "percentage": 92.83, "elapsed_time": "1:58:05", "remaining_time": "0:09:06"} +{"current_steps": 2294, "total_steps": 2470, "loss": 1.0554, "lr": 1.5561150012254446e-07, "epoch": 9.287449392712551, "percentage": 92.87, "elapsed_time": "1:58:08", "remaining_time": "0:09:03"} +{"current_steps": 2295, "total_steps": 2470, "loss": 0.9282, "lr": 1.5386719405146633e-07, "epoch": 9.291497975708502, "percentage": 92.91, "elapsed_time": "1:58:11", "remaining_time": "0:09:00"} +{"current_steps": 2296, "total_steps": 2470, "loss": 1.1336, "lr": 1.5213256665878362e-07, "epoch": 9.295546558704453, "percentage": 92.96, "elapsed_time": "1:58:14", "remaining_time": "0:08:57"} +{"current_steps": 2297, "total_steps": 2470, "loss": 1.3822, "lr": 1.5040762140888843e-07, "epoch": 9.299595141700404, "percentage": 93.0, "elapsed_time": "1:58:17", "remaining_time": "0:08:54"} +{"current_steps": 2298, "total_steps": 2470, "loss": 0.9789, "lr": 1.4869236174683443e-07, "epoch": 9.303643724696355, "percentage": 93.04, "elapsed_time": "1:58:20", "remaining_time": "0:08:51"} +{"current_steps": 2299, "total_steps": 2470, "loss": 1.034, "lr": 1.4698679109833192e-07, "epoch": 9.307692307692308, "percentage": 93.08, "elapsed_time": "1:58:23", "remaining_time": "0:08:48"} +{"current_steps": 2300, "total_steps": 2470, "loss": 1.0888, "lr": 1.4529091286973994e-07, "epoch": 9.31174089068826, "percentage": 93.12, "elapsed_time": "1:58:26", "remaining_time": "0:08:45"} +{"current_steps": 2301, "total_steps": 2470, "loss": 0.9574, "lr": 1.4360473044806033e-07, "epoch": 9.31578947368421, "percentage": 93.16, "elapsed_time": "1:58:29", "remaining_time": "0:08:42"} +{"current_steps": 2302, "total_steps": 2470, "loss": 0.9404, "lr": 1.419282472009309e-07, "epoch": 9.319838056680162, "percentage": 93.2, "elapsed_time": "1:58:32", "remaining_time": "0:08:39"} +{"current_steps": 2303, "total_steps": 2470, "loss": 0.9955, "lr": 1.402614664766172e-07, "epoch": 9.323886639676113, "percentage": 93.24, "elapsed_time": "1:58:35", "remaining_time": "0:08:35"} +{"current_steps": 2304, "total_steps": 2470, "loss": 1.0107, "lr": 1.3860439160400808e-07, "epoch": 9.327935222672064, "percentage": 93.28, "elapsed_time": "1:58:38", "remaining_time": "0:08:32"} +{"current_steps": 2305, "total_steps": 2470, "loss": 1.0625, "lr": 1.369570258926062e-07, "epoch": 9.331983805668017, "percentage": 93.32, "elapsed_time": "1:58:41", "remaining_time": "0:08:29"} +{"current_steps": 2306, "total_steps": 2470, "loss": 1.1625, "lr": 1.353193726325247e-07, "epoch": 9.336032388663968, "percentage": 93.36, "elapsed_time": "1:58:44", "remaining_time": "0:08:26"} +{"current_steps": 2307, "total_steps": 2470, "loss": 1.0921, "lr": 1.3369143509447903e-07, "epoch": 9.34008097165992, "percentage": 93.4, "elapsed_time": "1:58:47", "remaining_time": "0:08:23"} +{"current_steps": 2308, "total_steps": 2470, "loss": 0.9869, "lr": 
1.3207321652977944e-07, "epoch": 9.34412955465587, "percentage": 93.44, "elapsed_time": "1:58:50", "remaining_time": "0:08:20"} +{"current_steps": 2309, "total_steps": 2470, "loss": 1.1164, "lr": 1.3046472017032685e-07, "epoch": 9.348178137651821, "percentage": 93.48, "elapsed_time": "1:58:53", "remaining_time": "0:08:17"} +{"current_steps": 2310, "total_steps": 2470, "loss": 1.1228, "lr": 1.288659492286032e-07, "epoch": 9.352226720647772, "percentage": 93.52, "elapsed_time": "1:58:56", "remaining_time": "0:08:14"} +{"current_steps": 2311, "total_steps": 2470, "loss": 0.8228, "lr": 1.2727690689766814e-07, "epoch": 9.356275303643725, "percentage": 93.56, "elapsed_time": "1:58:59", "remaining_time": "0:08:11"} +{"current_steps": 2312, "total_steps": 2470, "loss": 1.2048, "lr": 1.2569759635115086e-07, "epoch": 9.360323886639677, "percentage": 93.6, "elapsed_time": "1:59:02", "remaining_time": "0:08:08"} +{"current_steps": 2313, "total_steps": 2470, "loss": 1.2833, "lr": 1.2412802074324548e-07, "epoch": 9.364372469635628, "percentage": 93.64, "elapsed_time": "1:59:05", "remaining_time": "0:08:05"} +{"current_steps": 2314, "total_steps": 2470, "loss": 1.1706, "lr": 1.2256818320870224e-07, "epoch": 9.368421052631579, "percentage": 93.68, "elapsed_time": "1:59:08", "remaining_time": "0:08:01"} +{"current_steps": 2315, "total_steps": 2470, "loss": 1.2185, "lr": 1.210180868628219e-07, "epoch": 9.37246963562753, "percentage": 93.72, "elapsed_time": "1:59:11", "remaining_time": "0:07:58"} +{"current_steps": 2316, "total_steps": 2470, "loss": 0.9325, "lr": 1.1947773480145198e-07, "epoch": 9.376518218623481, "percentage": 93.77, "elapsed_time": "1:59:14", "remaining_time": "0:07:55"} +{"current_steps": 2317, "total_steps": 2470, "loss": 1.0759, "lr": 1.179471301009777e-07, "epoch": 9.380566801619434, "percentage": 93.81, "elapsed_time": "1:59:17", "remaining_time": "0:07:52"} +{"current_steps": 2318, "total_steps": 2470, "loss": 0.8937, "lr": 1.1642627581831767e-07, "epoch": 9.384615384615385, "percentage": 93.85, "elapsed_time": "1:59:20", "remaining_time": "0:07:49"} +{"current_steps": 2319, "total_steps": 2470, "loss": 0.9544, "lr": 1.1491517499091498e-07, "epoch": 9.388663967611336, "percentage": 93.89, "elapsed_time": "1:59:23", "remaining_time": "0:07:46"} +{"current_steps": 2320, "total_steps": 2470, "loss": 1.1477, "lr": 1.134138306367355e-07, "epoch": 9.392712550607287, "percentage": 93.93, "elapsed_time": "1:59:26", "remaining_time": "0:07:43"} +{"current_steps": 2321, "total_steps": 2470, "loss": 0.9354, "lr": 1.1192224575425848e-07, "epoch": 9.396761133603238, "percentage": 93.97, "elapsed_time": "1:59:29", "remaining_time": "0:07:40"} +{"current_steps": 2322, "total_steps": 2470, "loss": 0.9898, "lr": 1.1044042332247152e-07, "epoch": 9.40080971659919, "percentage": 94.01, "elapsed_time": "1:59:32", "remaining_time": "0:07:37"} +{"current_steps": 2323, "total_steps": 2470, "loss": 1.0196, "lr": 1.089683663008656e-07, "epoch": 9.404858299595142, "percentage": 94.05, "elapsed_time": "1:59:35", "remaining_time": "0:07:34"} +{"current_steps": 2324, "total_steps": 2470, "loss": 1.0973, "lr": 1.0750607762942622e-07, "epoch": 9.408906882591094, "percentage": 94.09, "elapsed_time": "1:59:38", "remaining_time": "0:07:30"} +{"current_steps": 2325, "total_steps": 2470, "loss": 0.9423, "lr": 1.0605356022863167e-07, "epoch": 9.412955465587045, "percentage": 94.13, "elapsed_time": "1:59:41", "remaining_time": "0:07:27"} +{"current_steps": 2326, "total_steps": 2470, "loss": 1.2153, "lr": 
1.0461081699944475e-07, "epoch": 9.417004048582996, "percentage": 94.17, "elapsed_time": "1:59:44", "remaining_time": "0:07:24"} +{"current_steps": 2327, "total_steps": 2470, "loss": 1.2392, "lr": 1.0317785082330555e-07, "epoch": 9.421052631578947, "percentage": 94.21, "elapsed_time": "1:59:47", "remaining_time": "0:07:21"} +{"current_steps": 2328, "total_steps": 2470, "loss": 1.2855, "lr": 1.0175466456213034e-07, "epoch": 9.425101214574898, "percentage": 94.25, "elapsed_time": "1:59:50", "remaining_time": "0:07:18"} +{"current_steps": 2329, "total_steps": 2470, "loss": 1.2406, "lr": 1.0034126105830099e-07, "epoch": 9.429149797570851, "percentage": 94.29, "elapsed_time": "1:59:53", "remaining_time": "0:07:15"} +{"current_steps": 2330, "total_steps": 2470, "loss": 1.4292, "lr": 9.89376431346606e-08, "epoch": 9.433198380566802, "percentage": 94.33, "elapsed_time": "1:59:56", "remaining_time": "0:07:12"} +{"current_steps": 2331, "total_steps": 2470, "loss": 1.2237, "lr": 9.75438135945106e-08, "epoch": 9.437246963562753, "percentage": 94.37, "elapsed_time": "1:59:59", "remaining_time": "0:07:09"} +{"current_steps": 2332, "total_steps": 2470, "loss": 1.1524, "lr": 9.615977522160147e-08, "epoch": 9.441295546558704, "percentage": 94.41, "elapsed_time": "2:00:02", "remaining_time": "0:07:06"} +{"current_steps": 2333, "total_steps": 2470, "loss": 1.4128, "lr": 9.478553078013042e-08, "epoch": 9.445344129554655, "percentage": 94.45, "elapsed_time": "2:00:05", "remaining_time": "0:07:03"} +{"current_steps": 2334, "total_steps": 2470, "loss": 1.2044, "lr": 9.342108301473308e-08, "epoch": 9.449392712550607, "percentage": 94.49, "elapsed_time": "2:00:08", "remaining_time": "0:07:00"} +{"current_steps": 2335, "total_steps": 2470, "loss": 1.1982, "lr": 9.206643465047904e-08, "epoch": 9.45344129554656, "percentage": 94.53, "elapsed_time": "2:00:11", "remaining_time": "0:06:56"} +{"current_steps": 2336, "total_steps": 2470, "loss": 1.5372, "lr": 9.072158839286748e-08, "epoch": 9.45748987854251, "percentage": 94.57, "elapsed_time": "2:00:14", "remaining_time": "0:06:53"} +{"current_steps": 2337, "total_steps": 2470, "loss": 1.6061, "lr": 8.938654692781989e-08, "epoch": 9.461538461538462, "percentage": 94.62, "elapsed_time": "2:00:17", "remaining_time": "0:06:50"} +{"current_steps": 2338, "total_steps": 2470, "loss": 1.3296, "lr": 8.80613129216762e-08, "epoch": 9.465587044534413, "percentage": 94.66, "elapsed_time": "2:00:20", "remaining_time": "0:06:47"} +{"current_steps": 2339, "total_steps": 2470, "loss": 1.3934, "lr": 8.674588902118919e-08, "epoch": 9.469635627530364, "percentage": 94.7, "elapsed_time": "2:00:23", "remaining_time": "0:06:44"} +{"current_steps": 2340, "total_steps": 2470, "loss": 1.3763, "lr": 8.544027785351794e-08, "epoch": 9.473684210526315, "percentage": 94.74, "elapsed_time": "2:00:26", "remaining_time": "0:06:41"} +{"current_steps": 2341, "total_steps": 2470, "loss": 1.3484, "lr": 8.414448202622494e-08, "epoch": 9.477732793522268, "percentage": 94.78, "elapsed_time": "2:00:29", "remaining_time": "0:06:38"} +{"current_steps": 2342, "total_steps": 2470, "loss": 1.2994, "lr": 8.285850412726837e-08, "epoch": 9.481781376518219, "percentage": 94.82, "elapsed_time": "2:00:32", "remaining_time": "0:06:35"} +{"current_steps": 2343, "total_steps": 2470, "loss": 1.2402, "lr": 8.15823467249982e-08, "epoch": 9.48582995951417, "percentage": 94.86, "elapsed_time": "2:00:35", "remaining_time": "0:06:32"} +{"current_steps": 2344, "total_steps": 2470, "loss": 1.2015, "lr": 8.031601236815234e-08, 
"epoch": 9.489878542510121, "percentage": 94.9, "elapsed_time": "2:00:38", "remaining_time": "0:06:29"} +{"current_steps": 2345, "total_steps": 2470, "loss": 1.1243, "lr": 7.905950358584768e-08, "epoch": 9.493927125506072, "percentage": 94.94, "elapsed_time": "2:00:41", "remaining_time": "0:06:26"} +{"current_steps": 2346, "total_steps": 2470, "loss": 1.0819, "lr": 7.781282288757963e-08, "epoch": 9.497975708502024, "percentage": 94.98, "elapsed_time": "2:00:44", "remaining_time": "0:06:22"} +{"current_steps": 2347, "total_steps": 2470, "loss": 0.896, "lr": 7.657597276321427e-08, "epoch": 9.502024291497976, "percentage": 95.02, "elapsed_time": "2:00:47", "remaining_time": "0:06:19"} +{"current_steps": 2348, "total_steps": 2470, "loss": 1.2481, "lr": 7.534895568298395e-08, "epoch": 9.506072874493928, "percentage": 95.06, "elapsed_time": "2:00:50", "remaining_time": "0:06:16"} +{"current_steps": 2349, "total_steps": 2470, "loss": 1.1753, "lr": 7.413177409748284e-08, "epoch": 9.510121457489879, "percentage": 95.1, "elapsed_time": "2:00:53", "remaining_time": "0:06:13"} +{"current_steps": 2350, "total_steps": 2470, "loss": 1.1947, "lr": 7.292443043766085e-08, "epoch": 9.51417004048583, "percentage": 95.14, "elapsed_time": "2:00:56", "remaining_time": "0:06:10"} +{"current_steps": 2351, "total_steps": 2470, "loss": 1.2041, "lr": 7.172692711482022e-08, "epoch": 9.518218623481781, "percentage": 95.18, "elapsed_time": "2:00:59", "remaining_time": "0:06:07"} +{"current_steps": 2352, "total_steps": 2470, "loss": 1.1271, "lr": 7.053926652061116e-08, "epoch": 9.522267206477732, "percentage": 95.22, "elapsed_time": "2:01:02", "remaining_time": "0:06:04"} +{"current_steps": 2353, "total_steps": 2470, "loss": 0.9474, "lr": 6.936145102702407e-08, "epoch": 9.526315789473685, "percentage": 95.26, "elapsed_time": "2:01:06", "remaining_time": "0:06:01"} +{"current_steps": 2354, "total_steps": 2470, "loss": 1.0489, "lr": 6.819348298638839e-08, "epoch": 9.530364372469636, "percentage": 95.3, "elapsed_time": "2:01:09", "remaining_time": "0:05:58"} +{"current_steps": 2355, "total_steps": 2470, "loss": 1.0637, "lr": 6.703536473136486e-08, "epoch": 9.534412955465587, "percentage": 95.34, "elapsed_time": "2:01:12", "remaining_time": "0:05:55"} +{"current_steps": 2356, "total_steps": 2470, "loss": 1.3686, "lr": 6.588709857494324e-08, "epoch": 9.538461538461538, "percentage": 95.38, "elapsed_time": "2:01:15", "remaining_time": "0:05:52"} +{"current_steps": 2357, "total_steps": 2470, "loss": 1.3526, "lr": 6.474868681043578e-08, "epoch": 9.54251012145749, "percentage": 95.43, "elapsed_time": "2:01:18", "remaining_time": "0:05:48"} +{"current_steps": 2358, "total_steps": 2470, "loss": 1.9396, "lr": 6.36201317114754e-08, "epoch": 9.54655870445344, "percentage": 95.47, "elapsed_time": "2:01:21", "remaining_time": "0:05:45"} +{"current_steps": 2359, "total_steps": 2470, "loss": 1.622, "lr": 6.250143553200694e-08, "epoch": 9.550607287449393, "percentage": 95.51, "elapsed_time": "2:01:24", "remaining_time": "0:05:42"} +{"current_steps": 2360, "total_steps": 2470, "loss": 2.0152, "lr": 6.13926005062876e-08, "epoch": 9.554655870445345, "percentage": 95.55, "elapsed_time": "2:01:27", "remaining_time": "0:05:39"} +{"current_steps": 2361, "total_steps": 2470, "loss": 1.1873, "lr": 6.029362884887757e-08, "epoch": 9.558704453441296, "percentage": 95.59, "elapsed_time": "2:01:30", "remaining_time": "0:05:36"} +{"current_steps": 2362, "total_steps": 2470, "loss": 1.0601, "lr": 5.920452275463895e-08, "epoch": 9.562753036437247, 
"percentage": 95.63, "elapsed_time": "2:01:33", "remaining_time": "0:05:33"} +{"current_steps": 2363, "total_steps": 2470, "loss": 1.4362, "lr": 5.8125284398730666e-08, "epoch": 9.566801619433198, "percentage": 95.67, "elapsed_time": "2:01:36", "remaining_time": "0:05:30"} +{"current_steps": 2364, "total_steps": 2470, "loss": 1.1286, "lr": 5.705591593660353e-08, "epoch": 9.570850202429149, "percentage": 95.71, "elapsed_time": "2:01:39", "remaining_time": "0:05:27"} +{"current_steps": 2365, "total_steps": 2470, "loss": 1.2739, "lr": 5.5996419503996924e-08, "epoch": 9.574898785425102, "percentage": 95.75, "elapsed_time": "2:01:42", "remaining_time": "0:05:24"} +{"current_steps": 2366, "total_steps": 2470, "loss": 1.1107, "lr": 5.4946797216931524e-08, "epoch": 9.578947368421053, "percentage": 95.79, "elapsed_time": "2:01:45", "remaining_time": "0:05:21"} +{"current_steps": 2367, "total_steps": 2470, "loss": 1.1965, "lr": 5.390705117171047e-08, "epoch": 9.582995951417004, "percentage": 95.83, "elapsed_time": "2:01:48", "remaining_time": "0:05:18"} +{"current_steps": 2368, "total_steps": 2470, "loss": 1.2208, "lr": 5.2877183444909885e-08, "epoch": 9.587044534412955, "percentage": 95.87, "elapsed_time": "2:01:51", "remaining_time": "0:05:14"} +{"current_steps": 2369, "total_steps": 2470, "loss": 1.2005, "lr": 5.185719609337836e-08, "epoch": 9.591093117408906, "percentage": 95.91, "elapsed_time": "2:01:54", "remaining_time": "0:05:11"} +{"current_steps": 2370, "total_steps": 2470, "loss": 0.9196, "lr": 5.084709115423081e-08, "epoch": 9.595141700404858, "percentage": 95.95, "elapsed_time": "2:01:57", "remaining_time": "0:05:08"} +{"current_steps": 2371, "total_steps": 2470, "loss": 1.0393, "lr": 4.9846870644844616e-08, "epoch": 9.59919028340081, "percentage": 95.99, "elapsed_time": "2:02:00", "remaining_time": "0:05:05"} +{"current_steps": 2372, "total_steps": 2470, "loss": 1.1261, "lr": 4.885653656285627e-08, "epoch": 9.603238866396762, "percentage": 96.03, "elapsed_time": "2:02:03", "remaining_time": "0:05:02"} +{"current_steps": 2373, "total_steps": 2470, "loss": 0.8888, "lr": 4.7876090886158074e-08, "epoch": 9.607287449392713, "percentage": 96.07, "elapsed_time": "2:02:06", "remaining_time": "0:04:59"} +{"current_steps": 2374, "total_steps": 2470, "loss": 1.1768, "lr": 4.6905535572892015e-08, "epoch": 9.611336032388664, "percentage": 96.11, "elapsed_time": "2:02:09", "remaining_time": "0:04:56"} +{"current_steps": 2375, "total_steps": 2470, "loss": 1.2218, "lr": 4.5944872561448084e-08, "epoch": 9.615384615384615, "percentage": 96.15, "elapsed_time": "2:02:12", "remaining_time": "0:04:53"} +{"current_steps": 2376, "total_steps": 2470, "loss": 0.9572, "lr": 4.499410377045765e-08, "epoch": 9.619433198380566, "percentage": 96.19, "elapsed_time": "2:02:15", "remaining_time": "0:04:50"} +{"current_steps": 2377, "total_steps": 2470, "loss": 1.3123, "lr": 4.4053231098794e-08, "epoch": 9.623481781376519, "percentage": 96.23, "elapsed_time": "2:02:18", "remaining_time": "0:04:47"} +{"current_steps": 2378, "total_steps": 2470, "loss": 1.1711, "lr": 4.3122256425563444e-08, "epoch": 9.62753036437247, "percentage": 96.28, "elapsed_time": "2:02:21", "remaining_time": "0:04:44"} +{"current_steps": 2379, "total_steps": 2470, "loss": 1.4233, "lr": 4.220118161010589e-08, "epoch": 9.631578947368421, "percentage": 96.32, "elapsed_time": "2:02:24", "remaining_time": "0:04:40"} +{"current_steps": 2380, "total_steps": 2470, "loss": 1.0134, "lr": 4.129000849198872e-08, "epoch": 9.635627530364372, "percentage": 96.36, 
"elapsed_time": "2:02:27", "remaining_time": "0:04:37"} +{"current_steps": 2381, "total_steps": 2470, "loss": 1.3095, "lr": 4.038873889100237e-08, "epoch": 9.639676113360323, "percentage": 96.4, "elapsed_time": "2:02:30", "remaining_time": "0:04:34"} +{"current_steps": 2382, "total_steps": 2470, "loss": 1.196, "lr": 3.94973746071603e-08, "epoch": 9.643724696356275, "percentage": 96.44, "elapsed_time": "2:02:33", "remaining_time": "0:04:31"} +{"current_steps": 2383, "total_steps": 2470, "loss": 1.2979, "lr": 3.861591742069071e-08, "epoch": 9.647773279352228, "percentage": 96.48, "elapsed_time": "2:02:36", "remaining_time": "0:04:28"} +{"current_steps": 2384, "total_steps": 2470, "loss": 1.09, "lr": 3.77443690920376e-08, "epoch": 9.651821862348179, "percentage": 96.52, "elapsed_time": "2:02:39", "remaining_time": "0:04:25"} +{"current_steps": 2385, "total_steps": 2470, "loss": 1.2847, "lr": 3.688273136185416e-08, "epoch": 9.65587044534413, "percentage": 96.56, "elapsed_time": "2:02:42", "remaining_time": "0:04:22"} +{"current_steps": 2386, "total_steps": 2470, "loss": 1.0873, "lr": 3.60310059509994e-08, "epoch": 9.65991902834008, "percentage": 96.6, "elapsed_time": "2:02:45", "remaining_time": "0:04:19"} +{"current_steps": 2387, "total_steps": 2470, "loss": 1.2574, "lr": 3.518919456053649e-08, "epoch": 9.663967611336032, "percentage": 96.64, "elapsed_time": "2:02:48", "remaining_time": "0:04:16"} +{"current_steps": 2388, "total_steps": 2470, "loss": 1.2518, "lr": 3.4357298871727786e-08, "epoch": 9.668016194331983, "percentage": 96.68, "elapsed_time": "2:02:51", "remaining_time": "0:04:13"} +{"current_steps": 2389, "total_steps": 2470, "loss": 1.4336, "lr": 3.353532054603203e-08, "epoch": 9.672064777327936, "percentage": 96.72, "elapsed_time": "2:02:54", "remaining_time": "0:04:10"} +{"current_steps": 2390, "total_steps": 2470, "loss": 1.4305, "lr": 3.2723261225102164e-08, "epoch": 9.676113360323887, "percentage": 96.76, "elapsed_time": "2:02:57", "remaining_time": "0:04:06"} +{"current_steps": 2391, "total_steps": 2470, "loss": 1.2413, "lr": 3.192112253077973e-08, "epoch": 9.680161943319838, "percentage": 96.8, "elapsed_time": "2:03:00", "remaining_time": "0:04:03"} +{"current_steps": 2392, "total_steps": 2470, "loss": 1.1222, "lr": 3.1128906065092666e-08, "epoch": 9.68421052631579, "percentage": 96.84, "elapsed_time": "2:03:03", "remaining_time": "0:04:00"} +{"current_steps": 2393, "total_steps": 2470, "loss": 1.2007, "lr": 3.034661341025258e-08, "epoch": 9.68825910931174, "percentage": 96.88, "elapsed_time": "2:03:06", "remaining_time": "0:03:57"} +{"current_steps": 2394, "total_steps": 2470, "loss": 1.2133, "lr": 2.957424612865245e-08, "epoch": 9.692307692307692, "percentage": 96.92, "elapsed_time": "2:03:10", "remaining_time": "0:03:54"} +{"current_steps": 2395, "total_steps": 2470, "loss": 1.1056, "lr": 2.8811805762860578e-08, "epoch": 9.696356275303645, "percentage": 96.96, "elapsed_time": "2:03:13", "remaining_time": "0:03:51"} +{"current_steps": 2396, "total_steps": 2470, "loss": 1.5372, "lr": 2.8059293835620006e-08, "epoch": 9.700404858299596, "percentage": 97.0, "elapsed_time": "2:03:16", "remaining_time": "0:03:48"} +{"current_steps": 2397, "total_steps": 2470, "loss": 1.3512, "lr": 2.731671184984519e-08, "epoch": 9.704453441295547, "percentage": 97.04, "elapsed_time": "2:03:19", "remaining_time": "0:03:45"} +{"current_steps": 2398, "total_steps": 2470, "loss": 0.9806, "lr": 2.6584061288617568e-08, "epoch": 9.708502024291498, "percentage": 97.09, "elapsed_time": "2:03:22", 
"remaining_time": "0:03:42"} +{"current_steps": 2399, "total_steps": 2470, "loss": 1.1002, "lr": 2.5861343615184997e-08, "epoch": 9.712550607287449, "percentage": 97.13, "elapsed_time": "2:03:25", "remaining_time": "0:03:39"} +{"current_steps": 2400, "total_steps": 2470, "loss": 1.2214, "lr": 2.514856027295509e-08, "epoch": 9.7165991902834, "percentage": 97.17, "elapsed_time": "2:03:28", "remaining_time": "0:03:36"} +{"current_steps": 2401, "total_steps": 2470, "loss": 0.9652, "lr": 2.4445712685498e-08, "epoch": 9.720647773279353, "percentage": 97.21, "elapsed_time": "2:03:32", "remaining_time": "0:03:33"} +{"current_steps": 2402, "total_steps": 2470, "loss": 1.2102, "lr": 2.3752802256536423e-08, "epoch": 9.724696356275304, "percentage": 97.25, "elapsed_time": "2:03:35", "remaining_time": "0:03:29"} +{"current_steps": 2403, "total_steps": 2470, "loss": 1.3616, "lr": 2.3069830369949474e-08, "epoch": 9.728744939271255, "percentage": 97.29, "elapsed_time": "2:03:38", "remaining_time": "0:03:26"} +{"current_steps": 2404, "total_steps": 2470, "loss": 1.051, "lr": 2.239679838976605e-08, "epoch": 9.732793522267206, "percentage": 97.33, "elapsed_time": "2:03:41", "remaining_time": "0:03:23"} +{"current_steps": 2405, "total_steps": 2470, "loss": 1.1644, "lr": 2.173370766016314e-08, "epoch": 9.736842105263158, "percentage": 97.37, "elapsed_time": "2:03:44", "remaining_time": "0:03:20"} +{"current_steps": 2406, "total_steps": 2470, "loss": 1.7934, "lr": 2.1080559505462504e-08, "epoch": 9.740890688259109, "percentage": 97.41, "elapsed_time": "2:03:47", "remaining_time": "0:03:17"} +{"current_steps": 2407, "total_steps": 2470, "loss": 1.3521, "lr": 2.043735523013013e-08, "epoch": 9.744939271255062, "percentage": 97.45, "elapsed_time": "2:03:50", "remaining_time": "0:03:14"} +{"current_steps": 2408, "total_steps": 2470, "loss": 1.1426, "lr": 1.98040961187701e-08, "epoch": 9.748987854251013, "percentage": 97.49, "elapsed_time": "2:03:53", "remaining_time": "0:03:11"} +{"current_steps": 2409, "total_steps": 2470, "loss": 1.0778, "lr": 1.918078343612628e-08, "epoch": 9.753036437246964, "percentage": 97.53, "elapsed_time": "2:03:56", "remaining_time": "0:03:08"} +{"current_steps": 2410, "total_steps": 2470, "loss": 1.1728, "lr": 1.85674184270751e-08, "epoch": 9.757085020242915, "percentage": 97.57, "elapsed_time": "2:03:59", "remaining_time": "0:03:05"} +{"current_steps": 2411, "total_steps": 2470, "loss": 1.9261, "lr": 1.7964002316628316e-08, "epoch": 9.761133603238866, "percentage": 97.61, "elapsed_time": "2:04:02", "remaining_time": "0:03:02"} +{"current_steps": 2412, "total_steps": 2470, "loss": 1.6529, "lr": 1.73705363099258e-08, "epoch": 9.765182186234817, "percentage": 97.65, "elapsed_time": "2:04:05", "remaining_time": "0:02:59"} +{"current_steps": 2413, "total_steps": 2470, "loss": 1.5923, "lr": 1.6787021592234998e-08, "epoch": 9.76923076923077, "percentage": 97.69, "elapsed_time": "2:04:08", "remaining_time": "0:02:55"} +{"current_steps": 2414, "total_steps": 2470, "loss": 1.1256, "lr": 1.6213459328950355e-08, "epoch": 9.773279352226721, "percentage": 97.73, "elapsed_time": "2:04:11", "remaining_time": "0:02:52"} +{"current_steps": 2415, "total_steps": 2470, "loss": 1.4239, "lr": 1.5649850665587217e-08, "epoch": 9.777327935222672, "percentage": 97.77, "elapsed_time": "2:04:14", "remaining_time": "0:02:49"} +{"current_steps": 2416, "total_steps": 2470, "loss": 1.3997, "lr": 1.5096196727783508e-08, "epoch": 9.781376518218623, "percentage": 97.81, "elapsed_time": "2:04:17", "remaining_time": 
"0:02:46"} +{"current_steps": 2417, "total_steps": 2470, "loss": 1.1972, "lr": 1.4552498621295264e-08, "epoch": 9.785425101214575, "percentage": 97.85, "elapsed_time": "2:04:20", "remaining_time": "0:02:43"} +{"current_steps": 2418, "total_steps": 2470, "loss": 1.1337, "lr": 1.4018757431992769e-08, "epoch": 9.789473684210526, "percentage": 97.89, "elapsed_time": "2:04:23", "remaining_time": "0:02:40"} +{"current_steps": 2419, "total_steps": 2470, "loss": 1.1298, "lr": 1.3494974225863322e-08, "epoch": 9.793522267206479, "percentage": 97.94, "elapsed_time": "2:04:26", "remaining_time": "0:02:37"} +{"current_steps": 2420, "total_steps": 2470, "loss": 1.1853, "lr": 1.2981150049004021e-08, "epoch": 9.79757085020243, "percentage": 97.98, "elapsed_time": "2:04:29", "remaining_time": "0:02:34"} +{"current_steps": 2421, "total_steps": 2470, "loss": 1.0838, "lr": 1.2477285927622873e-08, "epoch": 9.80161943319838, "percentage": 98.02, "elapsed_time": "2:04:32", "remaining_time": "0:02:31"} +{"current_steps": 2422, "total_steps": 2470, "loss": 0.9978, "lr": 1.1983382868036019e-08, "epoch": 9.805668016194332, "percentage": 98.06, "elapsed_time": "2:04:35", "remaining_time": "0:02:28"} +{"current_steps": 2423, "total_steps": 2470, "loss": 1.3163, "lr": 1.1499441856663296e-08, "epoch": 9.809716599190283, "percentage": 98.1, "elapsed_time": "2:04:38", "remaining_time": "0:02:25"} +{"current_steps": 2424, "total_steps": 2470, "loss": 1.0923, "lr": 1.102546386003156e-08, "epoch": 9.813765182186234, "percentage": 98.14, "elapsed_time": "2:04:41", "remaining_time": "0:02:21"} +{"current_steps": 2425, "total_steps": 2470, "loss": 1.0377, "lr": 1.0561449824766367e-08, "epoch": 9.817813765182187, "percentage": 98.18, "elapsed_time": "2:04:44", "remaining_time": "0:02:18"} +{"current_steps": 2426, "total_steps": 2470, "loss": 2.149, "lr": 1.0107400677596413e-08, "epoch": 9.821862348178138, "percentage": 98.22, "elapsed_time": "2:04:47", "remaining_time": "0:02:15"} +{"current_steps": 2427, "total_steps": 2470, "loss": 3.0471, "lr": 9.663317325345756e-09, "epoch": 9.82591093117409, "percentage": 98.26, "elapsed_time": "2:04:50", "remaining_time": "0:02:12"} +{"current_steps": 2428, "total_steps": 2470, "loss": 1.0529, "lr": 9.229200654936599e-09, "epoch": 9.82995951417004, "percentage": 98.3, "elapsed_time": "2:04:53", "remaining_time": "0:02:09"} +{"current_steps": 2429, "total_steps": 2470, "loss": 1.0334, "lr": 8.805051533384846e-09, "epoch": 9.834008097165992, "percentage": 98.34, "elapsed_time": "2:04:56", "remaining_time": "0:02:06"} +{"current_steps": 2430, "total_steps": 2470, "loss": 1.0283, "lr": 8.390870807799545e-09, "epoch": 9.838056680161943, "percentage": 98.38, "elapsed_time": "2:04:59", "remaining_time": "0:02:03"} +{"current_steps": 2431, "total_steps": 2470, "loss": 1.1448, "lr": 7.986659305380672e-09, "epoch": 9.842105263157894, "percentage": 98.42, "elapsed_time": "2:05:02", "remaining_time": "0:02:00"} +{"current_steps": 2432, "total_steps": 2470, "loss": 0.9191, "lr": 7.59241783341913e-09, "epoch": 9.846153846153847, "percentage": 98.46, "elapsed_time": "2:05:05", "remaining_time": "0:01:57"} +{"current_steps": 2433, "total_steps": 2470, "loss": 1.1249, "lr": 7.2081471792911914e-09, "epoch": 9.850202429149798, "percentage": 98.5, "elapsed_time": "2:05:08", "remaining_time": "0:01:54"} +{"current_steps": 2434, "total_steps": 2470, "loss": 1.1522, "lr": 6.833848110461283e-09, "epoch": 9.854251012145749, "percentage": 98.54, "elapsed_time": "2:05:11", "remaining_time": "0:01:51"} 
+{"current_steps": 2435, "total_steps": 2470, "loss": 1.1116, "lr": 6.469521374477539e-09, "epoch": 9.8582995951417, "percentage": 98.58, "elapsed_time": "2:05:14", "remaining_time": "0:01:48"} +{"current_steps": 2436, "total_steps": 2470, "loss": 1.11, "lr": 6.115167698972912e-09, "epoch": 9.862348178137651, "percentage": 98.62, "elapsed_time": "2:05:17", "remaining_time": "0:01:44"} +{"current_steps": 2437, "total_steps": 2470, "loss": 1.0761, "lr": 5.770787791661292e-09, "epoch": 9.866396761133604, "percentage": 98.66, "elapsed_time": "2:05:20", "remaining_time": "0:01:41"} +{"current_steps": 2438, "total_steps": 2470, "loss": 1.4742, "lr": 5.436382340335833e-09, "epoch": 9.870445344129555, "percentage": 98.7, "elapsed_time": "2:05:23", "remaining_time": "0:01:38"} +{"current_steps": 2439, "total_steps": 2470, "loss": 1.4265, "lr": 5.111952012870624e-09, "epoch": 9.874493927125506, "percentage": 98.74, "elapsed_time": "2:05:26", "remaining_time": "0:01:35"} +{"current_steps": 2440, "total_steps": 2470, "loss": 1.2196, "lr": 4.797497457216804e-09, "epoch": 9.878542510121457, "percentage": 98.79, "elapsed_time": "2:05:30", "remaining_time": "0:01:32"} +{"current_steps": 2441, "total_steps": 2470, "loss": 1.1487, "lr": 4.493019301401447e-09, "epoch": 9.882591093117409, "percentage": 98.83, "elapsed_time": "2:05:33", "remaining_time": "0:01:29"} +{"current_steps": 2442, "total_steps": 2470, "loss": 0.8072, "lr": 4.198518153527009e-09, "epoch": 9.88663967611336, "percentage": 98.87, "elapsed_time": "2:05:36", "remaining_time": "0:01:26"} +{"current_steps": 2443, "total_steps": 2470, "loss": 1.3326, "lr": 3.9139946017713315e-09, "epoch": 9.89068825910931, "percentage": 98.91, "elapsed_time": "2:05:39", "remaining_time": "0:01:23"} +{"current_steps": 2444, "total_steps": 2470, "loss": 1.3361, "lr": 3.6394492143820847e-09, "epoch": 9.894736842105264, "percentage": 98.95, "elapsed_time": "2:05:42", "remaining_time": "0:01:20"} +{"current_steps": 2445, "total_steps": 2470, "loss": 1.1313, "lr": 3.3748825396817675e-09, "epoch": 9.898785425101215, "percentage": 98.99, "elapsed_time": "2:05:45", "remaining_time": "0:01:17"} +{"current_steps": 2446, "total_steps": 2470, "loss": 1.1024, "lr": 3.120295106060489e-09, "epoch": 9.902834008097166, "percentage": 99.03, "elapsed_time": "2:05:48", "remaining_time": "0:01:14"} +{"current_steps": 2447, "total_steps": 2470, "loss": 1.1277, "lr": 2.875687421980966e-09, "epoch": 9.906882591093117, "percentage": 99.07, "elapsed_time": "2:05:51", "remaining_time": "0:01:10"} +{"current_steps": 2448, "total_steps": 2470, "loss": 0.9121, "lr": 2.6410599759713052e-09, "epoch": 9.910931174089068, "percentage": 99.11, "elapsed_time": "2:05:54", "remaining_time": "0:01:07"} +{"current_steps": 2449, "total_steps": 2470, "loss": 0.985, "lr": 2.4164132366294444e-09, "epoch": 9.914979757085021, "percentage": 99.15, "elapsed_time": "2:05:58", "remaining_time": "0:01:04"} +{"current_steps": 2450, "total_steps": 2470, "loss": 0.9875, "lr": 2.201747652618713e-09, "epoch": 9.919028340080972, "percentage": 99.19, "elapsed_time": "2:06:01", "remaining_time": "0:01:01"} +{"current_steps": 2451, "total_steps": 2470, "loss": 1.3108, "lr": 1.997063652668385e-09, "epoch": 9.923076923076923, "percentage": 99.23, "elapsed_time": "2:06:04", "remaining_time": "0:00:58"} +{"current_steps": 2452, "total_steps": 2470, "loss": 1.1487, "lr": 1.8023616455731253e-09, "epoch": 9.927125506072874, "percentage": 99.27, "elapsed_time": "2:06:07", "remaining_time": "0:00:55"} +{"current_steps": 2453, 
"total_steps": 2470, "loss": 1.1327, "lr": 1.6176420201902132e-09, "epoch": 9.931174089068826, "percentage": 99.31, "elapsed_time": "2:06:09", "remaining_time": "0:00:52"} +{"current_steps": 2454, "total_steps": 2470, "loss": 0.9427, "lr": 1.4429051454412092e-09, "epoch": 9.935222672064777, "percentage": 99.35, "elapsed_time": "2:06:13", "remaining_time": "0:00:49"} +{"current_steps": 2455, "total_steps": 2470, "loss": 0.9617, "lr": 1.2781513703102877e-09, "epoch": 9.939271255060728, "percentage": 99.39, "elapsed_time": "2:06:15", "remaining_time": "0:00:46"} +{"current_steps": 2456, "total_steps": 2470, "loss": 1.2441, "lr": 1.1233810238425735e-09, "epoch": 9.94331983805668, "percentage": 99.43, "elapsed_time": "2:06:18", "remaining_time": "0:00:43"} +{"current_steps": 2457, "total_steps": 2470, "loss": 0.9852, "lr": 9.78594415145806e-10, "epoch": 9.947368421052632, "percentage": 99.47, "elapsed_time": "2:06:21", "remaining_time": "0:00:40"} +{"current_steps": 2458, "total_steps": 2470, "loss": 0.8827, "lr": 8.437918333864537e-10, "epoch": 9.951417004048583, "percentage": 99.51, "elapsed_time": "2:06:25", "remaining_time": "0:00:37"} +{"current_steps": 2459, "total_steps": 2470, "loss": 1.207, "lr": 7.189735477913795e-10, "epoch": 9.955465587044534, "percentage": 99.55, "elapsed_time": "2:06:28", "remaining_time": "0:00:33"} +{"current_steps": 2460, "total_steps": 2470, "loss": 1.2944, "lr": 6.041398076478411e-10, "epoch": 9.959514170040485, "percentage": 99.6, "elapsed_time": "2:06:31", "remaining_time": "0:00:30"} +{"current_steps": 2461, "total_steps": 2470, "loss": 1.2245, "lr": 4.99290842301825e-10, "epoch": 9.963562753036438, "percentage": 99.64, "elapsed_time": "2:06:34", "remaining_time": "0:00:27"} +{"current_steps": 2462, "total_steps": 2470, "loss": 1.2106, "lr": 4.0442686115582665e-10, "epoch": 9.96761133603239, "percentage": 99.68, "elapsed_time": "2:06:37", "remaining_time": "0:00:24"} +{"current_steps": 2463, "total_steps": 2470, "loss": 1.1455, "lr": 3.195480536732909e-10, "epoch": 9.97165991902834, "percentage": 99.72, "elapsed_time": "2:06:40", "remaining_time": "0:00:21"} +{"current_steps": 2464, "total_steps": 2470, "loss": 1.2116, "lr": 2.446545893730612e-10, "epoch": 9.975708502024291, "percentage": 99.76, "elapsed_time": "2:06:43", "remaining_time": "0:00:18"} +{"current_steps": 2465, "total_steps": 2470, "loss": 1.2553, "lr": 1.797466178327101e-10, "epoch": 9.979757085020243, "percentage": 99.8, "elapsed_time": "2:06:46", "remaining_time": "0:00:15"} +{"current_steps": 2466, "total_steps": 2470, "loss": 1.3211, "lr": 1.2482426868520858e-10, "epoch": 9.983805668016194, "percentage": 99.84, "elapsed_time": "2:06:49", "remaining_time": "0:00:12"} +{"current_steps": 2467, "total_steps": 2470, "loss": 1.1147, "lr": 7.988765162225687e-11, "epoch": 9.987854251012145, "percentage": 99.88, "elapsed_time": "2:06:52", "remaining_time": "0:00:09"} +{"current_steps": 2468, "total_steps": 2470, "loss": 1.1298, "lr": 4.4936856390398465e-11, "epoch": 9.991902834008098, "percentage": 99.92, "elapsed_time": "2:06:55", "remaining_time": "0:00:06"} +{"current_steps": 2469, "total_steps": 2470, "loss": 1.454, "lr": 1.9971952793240713e-11, "epoch": 9.995951417004049, "percentage": 99.96, "elapsed_time": "2:06:58", "remaining_time": "0:00:03"} +{"current_steps": 2470, "total_steps": 2470, "loss": 1.1531, "lr": 4.992990691454758e-12, "epoch": 10.0, "percentage": 100.0, "elapsed_time": "2:07:01", "remaining_time": "0:00:00"} +{"current_steps": 2470, "total_steps": 2470, "epoch": 10.0, 
"percentage": 100.0, "elapsed_time": "2:08:07", "remaining_time": "0:00:00"} diff --git a/trainer_state.json b/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..db6afb0f0eda26b9ff48dfe4cca145206a76e349 --- /dev/null +++ b/trainer_state.json @@ -0,0 +1,17333 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 10.0, + "eval_steps": 500, + "global_step": 2470, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.004048582995951417, + "grad_norm": 0.46473725687854356, + "learning_rate": 0.0, + "loss": 2.5926, + "step": 1 + }, + { + "epoch": 0.008097165991902834, + "grad_norm": 0.7862315968268553, + "learning_rate": 4.0485829959514176e-08, + "loss": 2.9114, + "step": 2 + }, + { + "epoch": 0.012145748987854251, + "grad_norm": 0.6677933506680473, + "learning_rate": 8.097165991902835e-08, + "loss": 2.7471, + "step": 3 + }, + { + "epoch": 0.016194331983805668, + "grad_norm": 0.8630518959378011, + "learning_rate": 1.2145748987854252e-07, + "loss": 2.8706, + "step": 4 + }, + { + "epoch": 0.020242914979757085, + "grad_norm": 0.5173190139924537, + "learning_rate": 1.619433198380567e-07, + "loss": 2.9912, + "step": 5 + }, + { + "epoch": 0.024291497975708502, + "grad_norm": 0.7759993718339214, + "learning_rate": 2.0242914979757086e-07, + "loss": 3.0072, + "step": 6 + }, + { + "epoch": 0.02834008097165992, + "grad_norm": 1.3755130452390263, + "learning_rate": 2.4291497975708504e-07, + "loss": 2.4721, + "step": 7 + }, + { + "epoch": 0.032388663967611336, + "grad_norm": 0.44121276912866286, + "learning_rate": 2.834008097165992e-07, + "loss": 2.843, + "step": 8 + }, + { + "epoch": 0.03643724696356275, + "grad_norm": 0.5559835506705462, + "learning_rate": 3.238866396761134e-07, + "loss": 2.9053, + "step": 9 + }, + { + "epoch": 0.04048582995951417, + "grad_norm": 0.6731704914870359, + "learning_rate": 3.6437246963562754e-07, + "loss": 2.7608, + "step": 10 + }, + { + "epoch": 0.044534412955465584, + "grad_norm": 0.43190024730085624, + "learning_rate": 4.048582995951417e-07, + "loss": 2.7074, + "step": 11 + }, + { + "epoch": 0.048582995951417005, + "grad_norm": 0.7594718614486027, + "learning_rate": 4.453441295546559e-07, + "loss": 2.7846, + "step": 12 + }, + { + "epoch": 0.05263157894736842, + "grad_norm": 0.4278958670654092, + "learning_rate": 4.858299595141701e-07, + "loss": 3.018, + "step": 13 + }, + { + "epoch": 0.05668016194331984, + "grad_norm": 0.48698492939265825, + "learning_rate": 5.263157894736843e-07, + "loss": 2.8131, + "step": 14 + }, + { + "epoch": 0.06072874493927125, + "grad_norm": 0.405274105300616, + "learning_rate": 5.668016194331984e-07, + "loss": 2.8777, + "step": 15 + }, + { + "epoch": 0.06477732793522267, + "grad_norm": 0.5554327831452092, + "learning_rate": 6.072874493927125e-07, + "loss": 2.9472, + "step": 16 + }, + { + "epoch": 0.06882591093117409, + "grad_norm": 0.44756530277540646, + "learning_rate": 6.477732793522268e-07, + "loss": 3.0157, + "step": 17 + }, + { + "epoch": 0.0728744939271255, + "grad_norm": 0.8072585997136504, + "learning_rate": 6.882591093117409e-07, + "loss": 2.7773, + "step": 18 + }, + { + "epoch": 0.07692307692307693, + "grad_norm": 0.5635933276885046, + "learning_rate": 7.287449392712551e-07, + "loss": 2.7169, + "step": 19 + }, + { + "epoch": 0.08097165991902834, + "grad_norm": 0.4673928500608582, + "learning_rate": 7.692307692307694e-07, + "loss": 2.7934, + "step": 20 + }, + { + "epoch": 
0.08502024291497975, + "grad_norm": 1.3664880257539318, + "learning_rate": 8.097165991902834e-07, + "loss": 2.713, + "step": 21 + }, + { + "epoch": 0.08906882591093117, + "grad_norm": 0.6438340318121762, + "learning_rate": 8.502024291497976e-07, + "loss": 2.8722, + "step": 22 + }, + { + "epoch": 0.0931174089068826, + "grad_norm": 0.512121787489251, + "learning_rate": 8.906882591093118e-07, + "loss": 2.722, + "step": 23 + }, + { + "epoch": 0.09716599190283401, + "grad_norm": 1.023552604444706, + "learning_rate": 9.31174089068826e-07, + "loss": 2.5291, + "step": 24 + }, + { + "epoch": 0.10121457489878542, + "grad_norm": 0.556430330792241, + "learning_rate": 9.716599190283402e-07, + "loss": 2.7028, + "step": 25 + }, + { + "epoch": 0.10526315789473684, + "grad_norm": 1.0165779263195185, + "learning_rate": 1.0121457489878542e-06, + "loss": 2.7946, + "step": 26 + }, + { + "epoch": 0.10931174089068826, + "grad_norm": 0.8434539164732048, + "learning_rate": 1.0526315789473685e-06, + "loss": 2.6139, + "step": 27 + }, + { + "epoch": 0.11336032388663968, + "grad_norm": 0.6252954896694622, + "learning_rate": 1.0931174089068828e-06, + "loss": 2.469, + "step": 28 + }, + { + "epoch": 0.11740890688259109, + "grad_norm": 0.8618444900481227, + "learning_rate": 1.133603238866397e-06, + "loss": 2.6452, + "step": 29 + }, + { + "epoch": 0.1214574898785425, + "grad_norm": 0.9066908581713439, + "learning_rate": 1.174089068825911e-06, + "loss": 2.4396, + "step": 30 + }, + { + "epoch": 0.12550607287449392, + "grad_norm": 0.528141325017682, + "learning_rate": 1.214574898785425e-06, + "loss": 2.469, + "step": 31 + }, + { + "epoch": 0.12955465587044535, + "grad_norm": 0.6378156052352336, + "learning_rate": 1.2550607287449393e-06, + "loss": 2.5795, + "step": 32 + }, + { + "epoch": 0.13360323886639677, + "grad_norm": 0.5624703100477139, + "learning_rate": 1.2955465587044536e-06, + "loss": 2.6768, + "step": 33 + }, + { + "epoch": 0.13765182186234817, + "grad_norm": 0.5821134471598685, + "learning_rate": 1.336032388663968e-06, + "loss": 2.8086, + "step": 34 + }, + { + "epoch": 0.1417004048582996, + "grad_norm": 0.6258194867082703, + "learning_rate": 1.3765182186234818e-06, + "loss": 2.3603, + "step": 35 + }, + { + "epoch": 0.145748987854251, + "grad_norm": 0.5477831289461287, + "learning_rate": 1.417004048582996e-06, + "loss": 2.7758, + "step": 36 + }, + { + "epoch": 0.14979757085020243, + "grad_norm": 0.5008051448479439, + "learning_rate": 1.4574898785425101e-06, + "loss": 2.7543, + "step": 37 + }, + { + "epoch": 0.15384615384615385, + "grad_norm": 0.5096264603702895, + "learning_rate": 1.4979757085020244e-06, + "loss": 2.7356, + "step": 38 + }, + { + "epoch": 0.15789473684210525, + "grad_norm": 0.6456644864025523, + "learning_rate": 1.5384615384615387e-06, + "loss": 3.0218, + "step": 39 + }, + { + "epoch": 0.16194331983805668, + "grad_norm": 0.5888424191973028, + "learning_rate": 1.5789473684210526e-06, + "loss": 2.6165, + "step": 40 + }, + { + "epoch": 0.1659919028340081, + "grad_norm": 0.7898553504446816, + "learning_rate": 1.6194331983805669e-06, + "loss": 2.6223, + "step": 41 + }, + { + "epoch": 0.1700404858299595, + "grad_norm": 0.6232472926548593, + "learning_rate": 1.6599190283400812e-06, + "loss": 2.7768, + "step": 42 + }, + { + "epoch": 0.17408906882591094, + "grad_norm": 0.6922764219271268, + "learning_rate": 1.7004048582995952e-06, + "loss": 2.479, + "step": 43 + }, + { + "epoch": 0.17813765182186234, + "grad_norm": 0.6679665416214551, + "learning_rate": 1.7408906882591095e-06, + "loss": 2.6842, + "step": 44 + 
}, + { + "epoch": 0.18218623481781376, + "grad_norm": 0.48868645690455986, + "learning_rate": 1.7813765182186236e-06, + "loss": 2.3611, + "step": 45 + }, + { + "epoch": 0.1862348178137652, + "grad_norm": 1.0959755351532565, + "learning_rate": 1.8218623481781379e-06, + "loss": 2.6644, + "step": 46 + }, + { + "epoch": 0.1902834008097166, + "grad_norm": 0.7403727047924632, + "learning_rate": 1.862348178137652e-06, + "loss": 2.7313, + "step": 47 + }, + { + "epoch": 0.19433198380566802, + "grad_norm": 0.5355809576361324, + "learning_rate": 1.902834008097166e-06, + "loss": 2.976, + "step": 48 + }, + { + "epoch": 0.19838056680161945, + "grad_norm": 0.6203117033335515, + "learning_rate": 1.9433198380566803e-06, + "loss": 2.8615, + "step": 49 + }, + { + "epoch": 0.20242914979757085, + "grad_norm": 0.6748602332749001, + "learning_rate": 1.9838056680161946e-06, + "loss": 2.7385, + "step": 50 + }, + { + "epoch": 0.20647773279352227, + "grad_norm": 0.6061522444778688, + "learning_rate": 2.0242914979757085e-06, + "loss": 2.7926, + "step": 51 + }, + { + "epoch": 0.21052631578947367, + "grad_norm": 0.5677094053210018, + "learning_rate": 2.0647773279352228e-06, + "loss": 2.8905, + "step": 52 + }, + { + "epoch": 0.2145748987854251, + "grad_norm": 0.7539663022721307, + "learning_rate": 2.105263157894737e-06, + "loss": 2.7044, + "step": 53 + }, + { + "epoch": 0.21862348178137653, + "grad_norm": 0.5511775427996539, + "learning_rate": 2.1457489878542513e-06, + "loss": 2.6044, + "step": 54 + }, + { + "epoch": 0.22267206477732793, + "grad_norm": 0.5001055873779205, + "learning_rate": 2.1862348178137656e-06, + "loss": 2.7154, + "step": 55 + }, + { + "epoch": 0.22672064777327935, + "grad_norm": 5.059433496293122, + "learning_rate": 2.2267206477732795e-06, + "loss": 2.6151, + "step": 56 + }, + { + "epoch": 0.23076923076923078, + "grad_norm": 0.5976992576491789, + "learning_rate": 2.267206477732794e-06, + "loss": 2.8561, + "step": 57 + }, + { + "epoch": 0.23481781376518218, + "grad_norm": 0.5650795458768608, + "learning_rate": 2.307692307692308e-06, + "loss": 2.994, + "step": 58 + }, + { + "epoch": 0.2388663967611336, + "grad_norm": 1.110043039226332, + "learning_rate": 2.348178137651822e-06, + "loss": 2.9581, + "step": 59 + }, + { + "epoch": 0.242914979757085, + "grad_norm": 0.8353821859752748, + "learning_rate": 2.3886639676113362e-06, + "loss": 2.9613, + "step": 60 + }, + { + "epoch": 0.24696356275303644, + "grad_norm": 0.7575324618871198, + "learning_rate": 2.42914979757085e-06, + "loss": 2.7295, + "step": 61 + }, + { + "epoch": 0.25101214574898784, + "grad_norm": 0.7791476828146748, + "learning_rate": 2.4696356275303644e-06, + "loss": 2.7126, + "step": 62 + }, + { + "epoch": 0.2550607287449393, + "grad_norm": 0.4809737260566304, + "learning_rate": 2.5101214574898787e-06, + "loss": 2.8892, + "step": 63 + }, + { + "epoch": 0.2591093117408907, + "grad_norm": 0.5968909877448142, + "learning_rate": 2.550607287449393e-06, + "loss": 2.6468, + "step": 64 + }, + { + "epoch": 0.2631578947368421, + "grad_norm": 0.7701935599652083, + "learning_rate": 2.5910931174089072e-06, + "loss": 2.5171, + "step": 65 + }, + { + "epoch": 0.26720647773279355, + "grad_norm": 0.49540617385936636, + "learning_rate": 2.631578947368421e-06, + "loss": 2.5617, + "step": 66 + }, + { + "epoch": 0.27125506072874495, + "grad_norm": 0.5880768265382437, + "learning_rate": 2.672064777327936e-06, + "loss": 2.6525, + "step": 67 + }, + { + "epoch": 0.27530364372469635, + "grad_norm": 0.8719044761766179, + "learning_rate": 2.7125506072874497e-06, + "loss": 
2.5136, + "step": 68 + }, + { + "epoch": 0.2793522267206478, + "grad_norm": 0.7508384152907464, + "learning_rate": 2.7530364372469636e-06, + "loss": 2.7136, + "step": 69 + }, + { + "epoch": 0.2834008097165992, + "grad_norm": 0.7593508374848729, + "learning_rate": 2.7935222672064783e-06, + "loss": 2.5836, + "step": 70 + }, + { + "epoch": 0.2874493927125506, + "grad_norm": 0.6236865711432193, + "learning_rate": 2.834008097165992e-06, + "loss": 2.6042, + "step": 71 + }, + { + "epoch": 0.291497975708502, + "grad_norm": 0.9207439340534006, + "learning_rate": 2.8744939271255064e-06, + "loss": 2.4534, + "step": 72 + }, + { + "epoch": 0.29554655870445345, + "grad_norm": 0.9048216657065745, + "learning_rate": 2.9149797570850203e-06, + "loss": 2.7732, + "step": 73 + }, + { + "epoch": 0.29959514170040485, + "grad_norm": 1.0531213295224573, + "learning_rate": 2.955465587044535e-06, + "loss": 2.6927, + "step": 74 + }, + { + "epoch": 0.30364372469635625, + "grad_norm": 0.8889664393499657, + "learning_rate": 2.995951417004049e-06, + "loss": 2.7532, + "step": 75 + }, + { + "epoch": 0.3076923076923077, + "grad_norm": 0.678148296266936, + "learning_rate": 3.0364372469635627e-06, + "loss": 2.4982, + "step": 76 + }, + { + "epoch": 0.3117408906882591, + "grad_norm": 0.9143989903488097, + "learning_rate": 3.0769230769230774e-06, + "loss": 2.4821, + "step": 77 + }, + { + "epoch": 0.3157894736842105, + "grad_norm": 0.7430526887934812, + "learning_rate": 3.1174089068825913e-06, + "loss": 2.8892, + "step": 78 + }, + { + "epoch": 0.31983805668016196, + "grad_norm": 1.0967354490931058, + "learning_rate": 3.157894736842105e-06, + "loss": 2.5355, + "step": 79 + }, + { + "epoch": 0.32388663967611336, + "grad_norm": 0.6474936013842225, + "learning_rate": 3.19838056680162e-06, + "loss": 2.4627, + "step": 80 + }, + { + "epoch": 0.32793522267206476, + "grad_norm": 0.8223317792104156, + "learning_rate": 3.2388663967611337e-06, + "loss": 2.5097, + "step": 81 + }, + { + "epoch": 0.3319838056680162, + "grad_norm": 0.8471027758590536, + "learning_rate": 3.279352226720648e-06, + "loss": 2.5888, + "step": 82 + }, + { + "epoch": 0.3360323886639676, + "grad_norm": 0.4892443825365843, + "learning_rate": 3.3198380566801623e-06, + "loss": 2.4857, + "step": 83 + }, + { + "epoch": 0.340080971659919, + "grad_norm": 0.6329419393193343, + "learning_rate": 3.3603238866396766e-06, + "loss": 2.3704, + "step": 84 + }, + { + "epoch": 0.3441295546558704, + "grad_norm": 0.7450745621264726, + "learning_rate": 3.4008097165991905e-06, + "loss": 2.4814, + "step": 85 + }, + { + "epoch": 0.3481781376518219, + "grad_norm": 0.7915890438013479, + "learning_rate": 3.4412955465587043e-06, + "loss": 2.7336, + "step": 86 + }, + { + "epoch": 0.3522267206477733, + "grad_norm": 0.8224002727747803, + "learning_rate": 3.481781376518219e-06, + "loss": 2.6197, + "step": 87 + }, + { + "epoch": 0.3562753036437247, + "grad_norm": 0.7379097347027997, + "learning_rate": 3.522267206477733e-06, + "loss": 2.3123, + "step": 88 + }, + { + "epoch": 0.3603238866396761, + "grad_norm": 0.63590140796502, + "learning_rate": 3.562753036437247e-06, + "loss": 2.659, + "step": 89 + }, + { + "epoch": 0.3643724696356275, + "grad_norm": 0.9402424866754966, + "learning_rate": 3.6032388663967615e-06, + "loss": 2.6324, + "step": 90 + }, + { + "epoch": 0.3684210526315789, + "grad_norm": 0.7757246306456501, + "learning_rate": 3.6437246963562758e-06, + "loss": 2.5935, + "step": 91 + }, + { + "epoch": 0.3724696356275304, + "grad_norm": 0.7001956828085119, + "learning_rate": 
3.6842105263157896e-06, + "loss": 2.8634, + "step": 92 + }, + { + "epoch": 0.3765182186234818, + "grad_norm": 0.6770880287428972, + "learning_rate": 3.724696356275304e-06, + "loss": 2.3526, + "step": 93 + }, + { + "epoch": 0.3805668016194332, + "grad_norm": 0.7469924696350099, + "learning_rate": 3.7651821862348182e-06, + "loss": 2.4551, + "step": 94 + }, + { + "epoch": 0.38461538461538464, + "grad_norm": 0.6156146016330529, + "learning_rate": 3.805668016194332e-06, + "loss": 2.441, + "step": 95 + }, + { + "epoch": 0.38866396761133604, + "grad_norm": 0.7142333380873401, + "learning_rate": 3.846153846153847e-06, + "loss": 2.5222, + "step": 96 + }, + { + "epoch": 0.39271255060728744, + "grad_norm": 0.6126483934481857, + "learning_rate": 3.886639676113361e-06, + "loss": 2.6018, + "step": 97 + }, + { + "epoch": 0.3967611336032389, + "grad_norm": 0.7531177478658849, + "learning_rate": 3.9271255060728745e-06, + "loss": 2.4227, + "step": 98 + }, + { + "epoch": 0.4008097165991903, + "grad_norm": 0.7172471080034739, + "learning_rate": 3.967611336032389e-06, + "loss": 2.4637, + "step": 99 + }, + { + "epoch": 0.4048582995951417, + "grad_norm": 0.7800438096349082, + "learning_rate": 4.008097165991903e-06, + "loss": 2.5228, + "step": 100 + }, + { + "epoch": 0.4089068825910931, + "grad_norm": 0.8009705607457139, + "learning_rate": 4.048582995951417e-06, + "loss": 2.6356, + "step": 101 + }, + { + "epoch": 0.41295546558704455, + "grad_norm": 0.9574889353775141, + "learning_rate": 4.089068825910931e-06, + "loss": 2.3874, + "step": 102 + }, + { + "epoch": 0.41700404858299595, + "grad_norm": 0.7824043116812712, + "learning_rate": 4.1295546558704455e-06, + "loss": 2.6671, + "step": 103 + }, + { + "epoch": 0.42105263157894735, + "grad_norm": 0.7116660818199502, + "learning_rate": 4.170040485829959e-06, + "loss": 2.6795, + "step": 104 + }, + { + "epoch": 0.4251012145748988, + "grad_norm": 0.6234909516086495, + "learning_rate": 4.210526315789474e-06, + "loss": 2.4891, + "step": 105 + }, + { + "epoch": 0.4291497975708502, + "grad_norm": 0.7507042701110958, + "learning_rate": 4.251012145748988e-06, + "loss": 2.5374, + "step": 106 + }, + { + "epoch": 0.4331983805668016, + "grad_norm": 0.5830775553501698, + "learning_rate": 4.291497975708503e-06, + "loss": 2.4393, + "step": 107 + }, + { + "epoch": 0.43724696356275305, + "grad_norm": 0.8561666711107475, + "learning_rate": 4.3319838056680166e-06, + "loss": 2.3122, + "step": 108 + }, + { + "epoch": 0.44129554655870445, + "grad_norm": 0.914997362840242, + "learning_rate": 4.372469635627531e-06, + "loss": 2.5436, + "step": 109 + }, + { + "epoch": 0.44534412955465585, + "grad_norm": 0.6732155905531092, + "learning_rate": 4.412955465587045e-06, + "loss": 2.5005, + "step": 110 + }, + { + "epoch": 0.4493927125506073, + "grad_norm": 0.7462341368666683, + "learning_rate": 4.453441295546559e-06, + "loss": 2.4483, + "step": 111 + }, + { + "epoch": 0.4534412955465587, + "grad_norm": 0.8245738963488927, + "learning_rate": 4.493927125506074e-06, + "loss": 2.5333, + "step": 112 + }, + { + "epoch": 0.4574898785425101, + "grad_norm": 0.7702932505386926, + "learning_rate": 4.534412955465588e-06, + "loss": 2.5613, + "step": 113 + }, + { + "epoch": 0.46153846153846156, + "grad_norm": 1.3101615300934801, + "learning_rate": 4.5748987854251014e-06, + "loss": 2.973, + "step": 114 + }, + { + "epoch": 0.46558704453441296, + "grad_norm": 0.7651586289456958, + "learning_rate": 4.615384615384616e-06, + "loss": 2.5947, + "step": 115 + }, + { + "epoch": 0.46963562753036436, + "grad_norm": 
0.8222224925704688, + "learning_rate": 4.65587044534413e-06, + "loss": 2.4581, + "step": 116 + }, + { + "epoch": 0.47368421052631576, + "grad_norm": 0.6556587501075568, + "learning_rate": 4.696356275303644e-06, + "loss": 2.4571, + "step": 117 + }, + { + "epoch": 0.4777327935222672, + "grad_norm": 0.821438637414972, + "learning_rate": 4.736842105263158e-06, + "loss": 2.6622, + "step": 118 + }, + { + "epoch": 0.4817813765182186, + "grad_norm": 0.6254867878515806, + "learning_rate": 4.7773279352226725e-06, + "loss": 2.3622, + "step": 119 + }, + { + "epoch": 0.48582995951417, + "grad_norm": 0.6606998242945233, + "learning_rate": 4.817813765182186e-06, + "loss": 2.4812, + "step": 120 + }, + { + "epoch": 0.4898785425101215, + "grad_norm": 0.9140647082414407, + "learning_rate": 4.8582995951417e-06, + "loss": 2.5297, + "step": 121 + }, + { + "epoch": 0.4939271255060729, + "grad_norm": 0.8543729933153993, + "learning_rate": 4.898785425101215e-06, + "loss": 2.5534, + "step": 122 + }, + { + "epoch": 0.4979757085020243, + "grad_norm": 0.9641287101724041, + "learning_rate": 4.939271255060729e-06, + "loss": 2.3909, + "step": 123 + }, + { + "epoch": 0.5020242914979757, + "grad_norm": 0.7562747998003689, + "learning_rate": 4.9797570850202435e-06, + "loss": 2.3104, + "step": 124 + }, + { + "epoch": 0.5060728744939271, + "grad_norm": 0.9684058066200523, + "learning_rate": 5.020242914979757e-06, + "loss": 2.5894, + "step": 125 + }, + { + "epoch": 0.5101214574898786, + "grad_norm": 1.0833146453760147, + "learning_rate": 5.060728744939272e-06, + "loss": 2.686, + "step": 126 + }, + { + "epoch": 0.5141700404858299, + "grad_norm": 0.7212110120886743, + "learning_rate": 5.101214574898786e-06, + "loss": 2.5203, + "step": 127 + }, + { + "epoch": 0.5182186234817814, + "grad_norm": 0.9848467525032204, + "learning_rate": 5.1417004048583e-06, + "loss": 2.66, + "step": 128 + }, + { + "epoch": 0.5222672064777328, + "grad_norm": 0.78315965526943, + "learning_rate": 5.1821862348178145e-06, + "loss": 2.5008, + "step": 129 + }, + { + "epoch": 0.5263157894736842, + "grad_norm": 0.8583112834837245, + "learning_rate": 5.222672064777329e-06, + "loss": 2.3134, + "step": 130 + }, + { + "epoch": 0.5303643724696356, + "grad_norm": 0.7581206885647646, + "learning_rate": 5.263157894736842e-06, + "loss": 2.4191, + "step": 131 + }, + { + "epoch": 0.5344129554655871, + "grad_norm": 0.9695513408717512, + "learning_rate": 5.303643724696357e-06, + "loss": 2.5499, + "step": 132 + }, + { + "epoch": 0.5384615384615384, + "grad_norm": 6.764939321667699, + "learning_rate": 5.344129554655872e-06, + "loss": 2.4736, + "step": 133 + }, + { + "epoch": 0.5425101214574899, + "grad_norm": 1.0247610500949114, + "learning_rate": 5.384615384615385e-06, + "loss": 2.3723, + "step": 134 + }, + { + "epoch": 0.5465587044534413, + "grad_norm": 15.672428379790873, + "learning_rate": 5.425101214574899e-06, + "loss": 3.4815, + "step": 135 + }, + { + "epoch": 0.5506072874493927, + "grad_norm": 2.249245731133667, + "learning_rate": 5.465587044534414e-06, + "loss": 3.4231, + "step": 136 + }, + { + "epoch": 0.5546558704453441, + "grad_norm": 3.797144058522148, + "learning_rate": 5.506072874493927e-06, + "loss": 4.4025, + "step": 137 + }, + { + "epoch": 0.5587044534412956, + "grad_norm": 0.8114215476851966, + "learning_rate": 5.546558704453442e-06, + "loss": 2.3958, + "step": 138 + }, + { + "epoch": 0.562753036437247, + "grad_norm": 0.7631595156767096, + "learning_rate": 5.5870445344129565e-06, + "loss": 2.1963, + "step": 139 + }, + { + "epoch": 0.5668016194331984, + 
"grad_norm": 0.8648024420211529, + "learning_rate": 5.6275303643724695e-06, + "loss": 2.4664, + "step": 140 + }, + { + "epoch": 0.5708502024291497, + "grad_norm": 1.1398946486999715, + "learning_rate": 5.668016194331984e-06, + "loss": 2.2672, + "step": 141 + }, + { + "epoch": 0.5748987854251012, + "grad_norm": 0.7035715089344788, + "learning_rate": 5.708502024291498e-06, + "loss": 2.4001, + "step": 142 + }, + { + "epoch": 0.5789473684210527, + "grad_norm": 0.7842465817250697, + "learning_rate": 5.748987854251013e-06, + "loss": 2.2186, + "step": 143 + }, + { + "epoch": 0.582995951417004, + "grad_norm": 0.8358191441707306, + "learning_rate": 5.789473684210527e-06, + "loss": 2.5692, + "step": 144 + }, + { + "epoch": 0.5870445344129555, + "grad_norm": 0.7027969455146362, + "learning_rate": 5.8299595141700406e-06, + "loss": 2.3088, + "step": 145 + }, + { + "epoch": 0.5910931174089069, + "grad_norm": 0.7026752876788243, + "learning_rate": 5.870445344129555e-06, + "loss": 2.4148, + "step": 146 + }, + { + "epoch": 0.5951417004048583, + "grad_norm": 0.9049685837714232, + "learning_rate": 5.91093117408907e-06, + "loss": 2.146, + "step": 147 + }, + { + "epoch": 0.5991902834008097, + "grad_norm": 0.8388567349727308, + "learning_rate": 5.951417004048583e-06, + "loss": 2.0989, + "step": 148 + }, + { + "epoch": 0.6032388663967612, + "grad_norm": 0.773577497225349, + "learning_rate": 5.991902834008098e-06, + "loss": 2.2379, + "step": 149 + }, + { + "epoch": 0.6072874493927125, + "grad_norm": 0.7826979729986758, + "learning_rate": 6.0323886639676124e-06, + "loss": 2.18, + "step": 150 + }, + { + "epoch": 0.611336032388664, + "grad_norm": 0.8592925674032668, + "learning_rate": 6.0728744939271254e-06, + "loss": 2.4302, + "step": 151 + }, + { + "epoch": 0.6153846153846154, + "grad_norm": 0.6169427006453612, + "learning_rate": 6.11336032388664e-06, + "loss": 2.2208, + "step": 152 + }, + { + "epoch": 0.6194331983805668, + "grad_norm": 0.8979145279675816, + "learning_rate": 6.153846153846155e-06, + "loss": 2.3089, + "step": 153 + }, + { + "epoch": 0.6234817813765182, + "grad_norm": 0.8069478254920203, + "learning_rate": 6.194331983805668e-06, + "loss": 2.5248, + "step": 154 + }, + { + "epoch": 0.6275303643724697, + "grad_norm": 0.702872317531758, + "learning_rate": 6.234817813765183e-06, + "loss": 2.2786, + "step": 155 + }, + { + "epoch": 0.631578947368421, + "grad_norm": 1.1902510486781737, + "learning_rate": 6.275303643724697e-06, + "loss": 2.564, + "step": 156 + }, + { + "epoch": 0.6356275303643725, + "grad_norm": 0.7322358696471963, + "learning_rate": 6.31578947368421e-06, + "loss": 2.2575, + "step": 157 + }, + { + "epoch": 0.6396761133603239, + "grad_norm": 0.827272619073328, + "learning_rate": 6.356275303643725e-06, + "loss": 2.4085, + "step": 158 + }, + { + "epoch": 0.6437246963562753, + "grad_norm": 0.844449245612401, + "learning_rate": 6.39676113360324e-06, + "loss": 2.3392, + "step": 159 + }, + { + "epoch": 0.6477732793522267, + "grad_norm": 0.6963954379010507, + "learning_rate": 6.437246963562754e-06, + "loss": 2.3474, + "step": 160 + }, + { + "epoch": 0.6518218623481782, + "grad_norm": 1.0062158283533227, + "learning_rate": 6.4777327935222675e-06, + "loss": 2.206, + "step": 161 + }, + { + "epoch": 0.6558704453441295, + "grad_norm": 0.7010434692271018, + "learning_rate": 6.518218623481782e-06, + "loss": 2.4407, + "step": 162 + }, + { + "epoch": 0.659919028340081, + "grad_norm": 0.8546299950775236, + "learning_rate": 6.558704453441296e-06, + "loss": 2.3308, + "step": 163 + }, + { + "epoch": 
0.6639676113360324, + "grad_norm": 0.9160069550133176, + "learning_rate": 6.599190283400811e-06, + "loss": 2.2799, + "step": 164 + }, + { + "epoch": 0.6680161943319838, + "grad_norm": 0.6991934828570997, + "learning_rate": 6.639676113360325e-06, + "loss": 2.3277, + "step": 165 + }, + { + "epoch": 0.6720647773279352, + "grad_norm": 2.441952914795693, + "learning_rate": 6.6801619433198385e-06, + "loss": 2.2357, + "step": 166 + }, + { + "epoch": 0.6761133603238867, + "grad_norm": 0.7134946099061733, + "learning_rate": 6.720647773279353e-06, + "loss": 2.1807, + "step": 167 + }, + { + "epoch": 0.680161943319838, + "grad_norm": 0.7920123504029117, + "learning_rate": 6.761133603238867e-06, + "loss": 2.4623, + "step": 168 + }, + { + "epoch": 0.6842105263157895, + "grad_norm": 0.7987244705898385, + "learning_rate": 6.801619433198381e-06, + "loss": 2.2289, + "step": 169 + }, + { + "epoch": 0.6882591093117408, + "grad_norm": 0.8092206406250949, + "learning_rate": 6.842105263157896e-06, + "loss": 2.3704, + "step": 170 + }, + { + "epoch": 0.6923076923076923, + "grad_norm": 0.7440145606342271, + "learning_rate": 6.882591093117409e-06, + "loss": 2.3322, + "step": 171 + }, + { + "epoch": 0.6963562753036437, + "grad_norm": 0.704685785309606, + "learning_rate": 6.923076923076923e-06, + "loss": 2.1067, + "step": 172 + }, + { + "epoch": 0.7004048582995951, + "grad_norm": 0.8716057180507851, + "learning_rate": 6.963562753036438e-06, + "loss": 2.6915, + "step": 173 + }, + { + "epoch": 0.7044534412955465, + "grad_norm": 0.8610302596466904, + "learning_rate": 7.004048582995951e-06, + "loss": 2.3607, + "step": 174 + }, + { + "epoch": 0.708502024291498, + "grad_norm": 0.7454341645101108, + "learning_rate": 7.044534412955466e-06, + "loss": 2.0946, + "step": 175 + }, + { + "epoch": 0.7125506072874493, + "grad_norm": 0.775526558923258, + "learning_rate": 7.0850202429149805e-06, + "loss": 2.2197, + "step": 176 + }, + { + "epoch": 0.7165991902834008, + "grad_norm": 0.7425363416700347, + "learning_rate": 7.125506072874494e-06, + "loss": 2.2515, + "step": 177 + }, + { + "epoch": 0.7206477732793523, + "grad_norm": 0.799480261879121, + "learning_rate": 7.165991902834008e-06, + "loss": 2.2984, + "step": 178 + }, + { + "epoch": 0.7246963562753036, + "grad_norm": 1.208911299168472, + "learning_rate": 7.206477732793523e-06, + "loss": 2.3498, + "step": 179 + }, + { + "epoch": 0.728744939271255, + "grad_norm": 0.8451843361875137, + "learning_rate": 7.246963562753037e-06, + "loss": 2.3922, + "step": 180 + }, + { + "epoch": 0.7327935222672065, + "grad_norm": 0.6688748588442022, + "learning_rate": 7.2874493927125516e-06, + "loss": 2.2572, + "step": 181 + }, + { + "epoch": 0.7368421052631579, + "grad_norm": 1.1693138233285796, + "learning_rate": 7.327935222672065e-06, + "loss": 2.327, + "step": 182 + }, + { + "epoch": 0.7408906882591093, + "grad_norm": 1.6904745941237547, + "learning_rate": 7.368421052631579e-06, + "loss": 2.8703, + "step": 183 + }, + { + "epoch": 0.7449392712550608, + "grad_norm": 0.8844949083017518, + "learning_rate": 7.408906882591094e-06, + "loss": 2.2888, + "step": 184 + }, + { + "epoch": 0.7489878542510121, + "grad_norm": 0.8858477106782153, + "learning_rate": 7.449392712550608e-06, + "loss": 2.2582, + "step": 185 + }, + { + "epoch": 0.7530364372469636, + "grad_norm": 0.7394352987608678, + "learning_rate": 7.489878542510122e-06, + "loss": 2.0775, + "step": 186 + }, + { + "epoch": 0.757085020242915, + "grad_norm": 0.8834206013583122, + "learning_rate": 7.5303643724696364e-06, + "loss": 2.2682, + "step": 187 + }, 
+ { + "epoch": 0.7611336032388664, + "grad_norm": 6.250751086281045, + "learning_rate": 7.570850202429151e-06, + "loss": 3.2512, + "step": 188 + }, + { + "epoch": 0.7651821862348178, + "grad_norm": 35.543626516502854, + "learning_rate": 7.611336032388664e-06, + "loss": 3.2673, + "step": 189 + }, + { + "epoch": 0.7692307692307693, + "grad_norm": 4.671464673421441, + "learning_rate": 7.651821862348178e-06, + "loss": 3.288, + "step": 190 + }, + { + "epoch": 0.7732793522267206, + "grad_norm": 0.8467043403003462, + "learning_rate": 7.692307692307694e-06, + "loss": 2.3525, + "step": 191 + }, + { + "epoch": 0.7773279352226721, + "grad_norm": 0.7553553742503454, + "learning_rate": 7.732793522267207e-06, + "loss": 2.4147, + "step": 192 + }, + { + "epoch": 0.7813765182186235, + "grad_norm": 0.6722184689731728, + "learning_rate": 7.773279352226721e-06, + "loss": 2.4408, + "step": 193 + }, + { + "epoch": 0.7854251012145749, + "grad_norm": 0.8742278117345931, + "learning_rate": 7.813765182186235e-06, + "loss": 2.2427, + "step": 194 + }, + { + "epoch": 0.7894736842105263, + "grad_norm": 0.7018298382516639, + "learning_rate": 7.854251012145749e-06, + "loss": 2.1401, + "step": 195 + }, + { + "epoch": 0.7935222672064778, + "grad_norm": 0.8441291024867053, + "learning_rate": 7.894736842105265e-06, + "loss": 2.417, + "step": 196 + }, + { + "epoch": 0.7975708502024291, + "grad_norm": 0.8440780587728888, + "learning_rate": 7.935222672064778e-06, + "loss": 2.343, + "step": 197 + }, + { + "epoch": 0.8016194331983806, + "grad_norm": 0.7817852912155946, + "learning_rate": 7.975708502024292e-06, + "loss": 2.0718, + "step": 198 + }, + { + "epoch": 0.805668016194332, + "grad_norm": 0.8173811480736421, + "learning_rate": 8.016194331983806e-06, + "loss": 1.9574, + "step": 199 + }, + { + "epoch": 0.8097165991902834, + "grad_norm": 0.9130733429115842, + "learning_rate": 8.056680161943322e-06, + "loss": 2.1815, + "step": 200 + }, + { + "epoch": 0.8137651821862348, + "grad_norm": 0.9847086103025836, + "learning_rate": 8.097165991902834e-06, + "loss": 2.3515, + "step": 201 + }, + { + "epoch": 0.8178137651821862, + "grad_norm": 0.8676876881551969, + "learning_rate": 8.13765182186235e-06, + "loss": 2.0846, + "step": 202 + }, + { + "epoch": 0.8218623481781376, + "grad_norm": 13.90144045255743, + "learning_rate": 8.178137651821862e-06, + "loss": 2.901, + "step": 203 + }, + { + "epoch": 0.8259109311740891, + "grad_norm": 26.964637613541246, + "learning_rate": 8.218623481781377e-06, + "loss": 4.9217, + "step": 204 + }, + { + "epoch": 0.8299595141700404, + "grad_norm": 0.9450475296548486, + "learning_rate": 8.259109311740891e-06, + "loss": 2.213, + "step": 205 + }, + { + "epoch": 0.8340080971659919, + "grad_norm": 0.8251626027353501, + "learning_rate": 8.299595141700405e-06, + "loss": 2.1265, + "step": 206 + }, + { + "epoch": 0.8380566801619433, + "grad_norm": 1.5637444134794973, + "learning_rate": 8.340080971659919e-06, + "loss": 2.1168, + "step": 207 + }, + { + "epoch": 0.8421052631578947, + "grad_norm": 0.8572609413241875, + "learning_rate": 8.380566801619434e-06, + "loss": 2.2021, + "step": 208 + }, + { + "epoch": 0.8461538461538461, + "grad_norm": 0.8829526183041908, + "learning_rate": 8.421052631578948e-06, + "loss": 2.1197, + "step": 209 + }, + { + "epoch": 0.8502024291497976, + "grad_norm": 0.8230040936414714, + "learning_rate": 8.461538461538462e-06, + "loss": 2.1389, + "step": 210 + }, + { + "epoch": 0.854251012145749, + "grad_norm": 1.0630722291016348, + "learning_rate": 8.502024291497976e-06, + "loss": 2.2071, + 
"step": 211 + }, + { + "epoch": 0.8582995951417004, + "grad_norm": 0.8285650816893187, + "learning_rate": 8.54251012145749e-06, + "loss": 2.1278, + "step": 212 + }, + { + "epoch": 0.8623481781376519, + "grad_norm": 0.9374104368567024, + "learning_rate": 8.582995951417005e-06, + "loss": 2.2602, + "step": 213 + }, + { + "epoch": 0.8663967611336032, + "grad_norm": 0.9292432454800617, + "learning_rate": 8.62348178137652e-06, + "loss": 2.2139, + "step": 214 + }, + { + "epoch": 0.8704453441295547, + "grad_norm": 1.102816596900189, + "learning_rate": 8.663967611336033e-06, + "loss": 2.6954, + "step": 215 + }, + { + "epoch": 0.8744939271255061, + "grad_norm": 1.0693734533760941, + "learning_rate": 8.704453441295547e-06, + "loss": 2.6307, + "step": 216 + }, + { + "epoch": 0.8785425101214575, + "grad_norm": 0.9576307746487195, + "learning_rate": 8.744939271255063e-06, + "loss": 2.3637, + "step": 217 + }, + { + "epoch": 0.8825910931174089, + "grad_norm": 0.9705930148144204, + "learning_rate": 8.785425101214575e-06, + "loss": 2.2346, + "step": 218 + }, + { + "epoch": 0.8866396761133604, + "grad_norm": 1.0504776994181708, + "learning_rate": 8.82591093117409e-06, + "loss": 1.8973, + "step": 219 + }, + { + "epoch": 0.8906882591093117, + "grad_norm": 0.8931928814405187, + "learning_rate": 8.866396761133604e-06, + "loss": 2.2742, + "step": 220 + }, + { + "epoch": 0.8947368421052632, + "grad_norm": 0.9688347208506803, + "learning_rate": 8.906882591093118e-06, + "loss": 2.2952, + "step": 221 + }, + { + "epoch": 0.8987854251012146, + "grad_norm": 0.978996274596435, + "learning_rate": 8.947368421052632e-06, + "loss": 2.0332, + "step": 222 + }, + { + "epoch": 0.902834008097166, + "grad_norm": 0.9073798024023706, + "learning_rate": 8.987854251012147e-06, + "loss": 2.0714, + "step": 223 + }, + { + "epoch": 0.9068825910931174, + "grad_norm": 1.1581613082581128, + "learning_rate": 9.02834008097166e-06, + "loss": 2.2157, + "step": 224 + }, + { + "epoch": 0.9109311740890689, + "grad_norm": 1.0884120135655109, + "learning_rate": 9.068825910931175e-06, + "loss": 1.7915, + "step": 225 + }, + { + "epoch": 0.9149797570850202, + "grad_norm": 0.9581672716343882, + "learning_rate": 9.109311740890689e-06, + "loss": 2.0722, + "step": 226 + }, + { + "epoch": 0.9190283400809717, + "grad_norm": 0.9523432975820123, + "learning_rate": 9.149797570850203e-06, + "loss": 2.0351, + "step": 227 + }, + { + "epoch": 0.9230769230769231, + "grad_norm": 0.9395262500295037, + "learning_rate": 9.190283400809717e-06, + "loss": 2.1823, + "step": 228 + }, + { + "epoch": 0.9271255060728745, + "grad_norm": 1.0734663585541728, + "learning_rate": 9.230769230769232e-06, + "loss": 2.2329, + "step": 229 + }, + { + "epoch": 0.9311740890688259, + "grad_norm": 5.915661456573777, + "learning_rate": 9.271255060728746e-06, + "loss": 2.142, + "step": 230 + }, + { + "epoch": 0.9352226720647774, + "grad_norm": 0.943964635554494, + "learning_rate": 9.31174089068826e-06, + "loss": 2.0151, + "step": 231 + }, + { + "epoch": 0.9392712550607287, + "grad_norm": 0.9400321772267921, + "learning_rate": 9.352226720647774e-06, + "loss": 1.9453, + "step": 232 + }, + { + "epoch": 0.9433198380566802, + "grad_norm": 1.0803744575815664, + "learning_rate": 9.392712550607288e-06, + "loss": 2.2879, + "step": 233 + }, + { + "epoch": 0.9473684210526315, + "grad_norm": 1.1375116889631114, + "learning_rate": 9.433198380566803e-06, + "loss": 1.997, + "step": 234 + }, + { + "epoch": 0.951417004048583, + "grad_norm": 1.0484948139162147, + "learning_rate": 9.473684210526315e-06, + "loss": 
2.0557, + "step": 235 + }, + { + "epoch": 0.9554655870445344, + "grad_norm": 1.9953282124950078, + "learning_rate": 9.514170040485831e-06, + "loss": 2.2939, + "step": 236 + }, + { + "epoch": 0.9595141700404858, + "grad_norm": 0.976191957030197, + "learning_rate": 9.554655870445345e-06, + "loss": 2.0733, + "step": 237 + }, + { + "epoch": 0.9635627530364372, + "grad_norm": 1.2563869839657487, + "learning_rate": 9.595141700404859e-06, + "loss": 2.0464, + "step": 238 + }, + { + "epoch": 0.9676113360323887, + "grad_norm": 1.5608940397030466, + "learning_rate": 9.635627530364373e-06, + "loss": 2.336, + "step": 239 + }, + { + "epoch": 0.97165991902834, + "grad_norm": 1.3591514491532213, + "learning_rate": 9.676113360323888e-06, + "loss": 2.3022, + "step": 240 + }, + { + "epoch": 0.9757085020242915, + "grad_norm": 0.9384697642414853, + "learning_rate": 9.7165991902834e-06, + "loss": 2.0917, + "step": 241 + }, + { + "epoch": 0.979757085020243, + "grad_norm": 1.0921517070072044, + "learning_rate": 9.757085020242916e-06, + "loss": 2.2454, + "step": 242 + }, + { + "epoch": 0.9838056680161943, + "grad_norm": 1.0952417249590038, + "learning_rate": 9.79757085020243e-06, + "loss": 2.2731, + "step": 243 + }, + { + "epoch": 0.9878542510121457, + "grad_norm": 1.004948368911197, + "learning_rate": 9.838056680161944e-06, + "loss": 2.0318, + "step": 244 + }, + { + "epoch": 0.9919028340080972, + "grad_norm": 0.9149897248279167, + "learning_rate": 9.878542510121458e-06, + "loss": 2.0005, + "step": 245 + }, + { + "epoch": 0.9959514170040485, + "grad_norm": 0.8508821706595309, + "learning_rate": 9.919028340080973e-06, + "loss": 2.2101, + "step": 246 + }, + { + "epoch": 1.0, + "grad_norm": 1.0244113302231659, + "learning_rate": 9.959514170040487e-06, + "loss": 2.0861, + "step": 247 + }, + { + "epoch": 1.0040485829959513, + "grad_norm": 0.9985250389875123, + "learning_rate": 1e-05, + "loss": 2.1654, + "step": 248 + }, + { + "epoch": 1.008097165991903, + "grad_norm": 1.5212147724237604, + "learning_rate": 9.999995007009308e-06, + "loss": 2.3841, + "step": 249 + }, + { + "epoch": 1.0121457489878543, + "grad_norm": 1.5612489351031709, + "learning_rate": 9.999980028047207e-06, + "loss": 2.2013, + "step": 250 + }, + { + "epoch": 1.0161943319838056, + "grad_norm": 1.3355032190827423, + "learning_rate": 9.99995506314361e-06, + "loss": 2.3109, + "step": 251 + }, + { + "epoch": 1.0202429149797572, + "grad_norm": 1.309995468445311, + "learning_rate": 9.999920112348379e-06, + "loss": 2.5018, + "step": 252 + }, + { + "epoch": 1.0242914979757085, + "grad_norm": 1.4582415698006528, + "learning_rate": 9.999875175731316e-06, + "loss": 2.4387, + "step": 253 + }, + { + "epoch": 1.0283400809716599, + "grad_norm": 1.2959671971401512, + "learning_rate": 9.99982025338217e-06, + "loss": 2.0271, + "step": 254 + }, + { + "epoch": 1.0323886639676114, + "grad_norm": 1.3702661061884107, + "learning_rate": 9.999755345410628e-06, + "loss": 2.1942, + "step": 255 + }, + { + "epoch": 1.0364372469635628, + "grad_norm": 1.2343807344186972, + "learning_rate": 9.999680451946327e-06, + "loss": 2.3802, + "step": 256 + }, + { + "epoch": 1.040485829959514, + "grad_norm": 1.2422842542141688, + "learning_rate": 9.999595573138845e-06, + "loss": 2.1737, + "step": 257 + }, + { + "epoch": 1.0445344129554657, + "grad_norm": 1.0535455017417064, + "learning_rate": 9.9995007091577e-06, + "loss": 2.1892, + "step": 258 + }, + { + "epoch": 1.048582995951417, + "grad_norm": 1.1326643708775719, + "learning_rate": 9.999395860192354e-06, + "loss": 2.165, + "step": 259 + }, 
+ { + "epoch": 1.0526315789473684, + "grad_norm": 1.1512147523566951, + "learning_rate": 9.99928102645221e-06, + "loss": 2.4136, + "step": 260 + }, + { + "epoch": 1.05668016194332, + "grad_norm": 1.161431041066393, + "learning_rate": 9.999156208166614e-06, + "loss": 2.2649, + "step": 261 + }, + { + "epoch": 1.0607287449392713, + "grad_norm": 1.0550067630684001, + "learning_rate": 9.999021405584855e-06, + "loss": 2.2776, + "step": 262 + }, + { + "epoch": 1.0647773279352226, + "grad_norm": 1.2456078968374804, + "learning_rate": 9.99887661897616e-06, + "loss": 2.2937, + "step": 263 + }, + { + "epoch": 1.0688259109311742, + "grad_norm": 2.6565909174287934, + "learning_rate": 9.998721848629691e-06, + "loss": 2.3373, + "step": 264 + }, + { + "epoch": 1.0728744939271255, + "grad_norm": 1.2585354952683687, + "learning_rate": 9.99855709485456e-06, + "loss": 2.1755, + "step": 265 + }, + { + "epoch": 1.0769230769230769, + "grad_norm": 1.0397633573741487, + "learning_rate": 9.99838235797981e-06, + "loss": 2.1224, + "step": 266 + }, + { + "epoch": 1.0809716599190284, + "grad_norm": 1.3490485543349722, + "learning_rate": 9.998197638354428e-06, + "loss": 2.162, + "step": 267 + }, + { + "epoch": 1.0850202429149798, + "grad_norm": 0.9779246835555004, + "learning_rate": 9.998002936347334e-06, + "loss": 2.0674, + "step": 268 + }, + { + "epoch": 1.0890688259109311, + "grad_norm": 1.326338728002689, + "learning_rate": 9.997798252347382e-06, + "loss": 2.1639, + "step": 269 + }, + { + "epoch": 1.0931174089068827, + "grad_norm": 1.0363012993300713, + "learning_rate": 9.99758358676337e-06, + "loss": 2.2088, + "step": 270 + }, + { + "epoch": 1.097165991902834, + "grad_norm": 1.0931184449284037, + "learning_rate": 9.99735894002403e-06, + "loss": 1.9417, + "step": 271 + }, + { + "epoch": 1.1012145748987854, + "grad_norm": 1.1142050270090365, + "learning_rate": 9.99712431257802e-06, + "loss": 2.1229, + "step": 272 + }, + { + "epoch": 1.1052631578947367, + "grad_norm": 1.1058458560003002, + "learning_rate": 9.99687970489394e-06, + "loss": 2.147, + "step": 273 + }, + { + "epoch": 1.1093117408906883, + "grad_norm": 1.1507827310584715, + "learning_rate": 9.996625117460319e-06, + "loss": 2.0305, + "step": 274 + }, + { + "epoch": 1.1133603238866396, + "grad_norm": 1.4399534822311415, + "learning_rate": 9.996360550785619e-06, + "loss": 1.993, + "step": 275 + }, + { + "epoch": 1.117408906882591, + "grad_norm": 1.3360646827911495, + "learning_rate": 9.996086005398228e-06, + "loss": 1.9789, + "step": 276 + }, + { + "epoch": 1.1214574898785425, + "grad_norm": 1.1287606232609018, + "learning_rate": 9.995801481846474e-06, + "loss": 1.9362, + "step": 277 + }, + { + "epoch": 1.125506072874494, + "grad_norm": 1.0926872380366626, + "learning_rate": 9.9955069806986e-06, + "loss": 1.8981, + "step": 278 + }, + { + "epoch": 1.1295546558704452, + "grad_norm": 1.225113996229143, + "learning_rate": 9.995202502542785e-06, + "loss": 1.877, + "step": 279 + }, + { + "epoch": 1.1336032388663968, + "grad_norm": 1.350566519940966, + "learning_rate": 9.99488804798713e-06, + "loss": 2.1812, + "step": 280 + }, + { + "epoch": 1.1376518218623481, + "grad_norm": 1.3946048118439773, + "learning_rate": 9.994563617659665e-06, + "loss": 2.0952, + "step": 281 + }, + { + "epoch": 1.1417004048582995, + "grad_norm": 1.016854167145539, + "learning_rate": 9.99422921220834e-06, + "loss": 1.7897, + "step": 282 + }, + { + "epoch": 1.145748987854251, + "grad_norm": 1.1675202565627227, + "learning_rate": 9.993884832301029e-06, + "loss": 2.1832, + "step": 283 + }, + { + 
"epoch": 1.1497975708502024, + "grad_norm": 1.1052537876752062, + "learning_rate": 9.993530478625524e-06, + "loss": 2.0419, + "step": 284 + }, + { + "epoch": 1.1538461538461537, + "grad_norm": 1.0339091939503424, + "learning_rate": 9.99316615188954e-06, + "loss": 2.1765, + "step": 285 + }, + { + "epoch": 1.1578947368421053, + "grad_norm": 1.224235640342616, + "learning_rate": 9.992791852820709e-06, + "loss": 2.414, + "step": 286 + }, + { + "epoch": 1.1619433198380567, + "grad_norm": 1.1077938277922803, + "learning_rate": 9.992407582166582e-06, + "loss": 2.0729, + "step": 287 + }, + { + "epoch": 1.165991902834008, + "grad_norm": 1.1047832453065312, + "learning_rate": 9.99201334069462e-06, + "loss": 2.0816, + "step": 288 + }, + { + "epoch": 1.1700404858299596, + "grad_norm": 1.020340791924455, + "learning_rate": 9.991609129192202e-06, + "loss": 2.4242, + "step": 289 + }, + { + "epoch": 1.174089068825911, + "grad_norm": 1.0597565636193305, + "learning_rate": 9.991194948466615e-06, + "loss": 1.9546, + "step": 290 + }, + { + "epoch": 1.1781376518218623, + "grad_norm": 2.733652108939615, + "learning_rate": 9.990770799345064e-06, + "loss": 2.0891, + "step": 291 + }, + { + "epoch": 1.1821862348178138, + "grad_norm": 1.06820787268932, + "learning_rate": 9.990336682674656e-06, + "loss": 1.8523, + "step": 292 + }, + { + "epoch": 1.1862348178137652, + "grad_norm": 2.087421429190754, + "learning_rate": 9.989892599322404e-06, + "loss": 2.0252, + "step": 293 + }, + { + "epoch": 1.1902834008097165, + "grad_norm": 1.0884298591172652, + "learning_rate": 9.989438550175235e-06, + "loss": 2.094, + "step": 294 + }, + { + "epoch": 1.194331983805668, + "grad_norm": 1.4465924376774404, + "learning_rate": 9.98897453613997e-06, + "loss": 2.2522, + "step": 295 + }, + { + "epoch": 1.1983805668016194, + "grad_norm": 1.2561153181877684, + "learning_rate": 9.988500558143337e-06, + "loss": 2.3174, + "step": 296 + }, + { + "epoch": 1.2024291497975708, + "grad_norm": 1.299592783957394, + "learning_rate": 9.988016617131966e-06, + "loss": 2.0626, + "step": 297 + }, + { + "epoch": 1.2064777327935223, + "grad_norm": 1.616312765069768, + "learning_rate": 9.987522714072377e-06, + "loss": 2.332, + "step": 298 + }, + { + "epoch": 1.2105263157894737, + "grad_norm": 1.1673730449379247, + "learning_rate": 9.987018849950996e-06, + "loss": 2.3944, + "step": 299 + }, + { + "epoch": 1.214574898785425, + "grad_norm": 1.143398053052611, + "learning_rate": 9.986505025774137e-06, + "loss": 2.1948, + "step": 300 + }, + { + "epoch": 1.2186234817813766, + "grad_norm": 1.097402992490867, + "learning_rate": 9.985981242568009e-06, + "loss": 2.0261, + "step": 301 + }, + { + "epoch": 1.222672064777328, + "grad_norm": 1.1862462194607237, + "learning_rate": 9.985447501378706e-06, + "loss": 2.0268, + "step": 302 + }, + { + "epoch": 1.2267206477732793, + "grad_norm": 1.1867953576661743, + "learning_rate": 9.984903803272216e-06, + "loss": 2.0609, + "step": 303 + }, + { + "epoch": 1.2307692307692308, + "grad_norm": 1.160233224133256, + "learning_rate": 9.984350149334415e-06, + "loss": 2.118, + "step": 304 + }, + { + "epoch": 1.2348178137651822, + "grad_norm": 1.1580496833430431, + "learning_rate": 9.983786540671052e-06, + "loss": 2.2939, + "step": 305 + }, + { + "epoch": 1.2388663967611335, + "grad_norm": 1.1904466983631679, + "learning_rate": 9.983212978407767e-06, + "loss": 2.2554, + "step": 306 + }, + { + "epoch": 1.242914979757085, + "grad_norm": 1.191066075711238, + "learning_rate": 9.982629463690075e-06, + "loss": 2.2252, + "step": 307 + }, + { + 
"epoch": 1.2469635627530364, + "grad_norm": 0.9748723838702108, + "learning_rate": 9.982035997683372e-06, + "loss": 2.0288, + "step": 308 + }, + { + "epoch": 1.2510121457489878, + "grad_norm": 1.0421752021046666, + "learning_rate": 9.981432581572925e-06, + "loss": 2.0528, + "step": 309 + }, + { + "epoch": 1.2550607287449393, + "grad_norm": 1.1354302953976132, + "learning_rate": 9.980819216563875e-06, + "loss": 2.1848, + "step": 310 + }, + { + "epoch": 1.2591093117408907, + "grad_norm": 1.1565556608606453, + "learning_rate": 9.980195903881231e-06, + "loss": 1.9964, + "step": 311 + }, + { + "epoch": 1.263157894736842, + "grad_norm": 1.0637756069428104, + "learning_rate": 9.979562644769871e-06, + "loss": 1.8735, + "step": 312 + }, + { + "epoch": 1.2672064777327936, + "grad_norm": 1.0699259387542537, + "learning_rate": 9.978919440494538e-06, + "loss": 2.0595, + "step": 313 + }, + { + "epoch": 1.271255060728745, + "grad_norm": 1.1179452169818913, + "learning_rate": 9.978266292339838e-06, + "loss": 2.1342, + "step": 314 + }, + { + "epoch": 1.2753036437246963, + "grad_norm": 0.9851906694579183, + "learning_rate": 9.977603201610236e-06, + "loss": 2.0658, + "step": 315 + }, + { + "epoch": 1.2793522267206479, + "grad_norm": 1.664317835506444, + "learning_rate": 9.976930169630052e-06, + "loss": 2.1478, + "step": 316 + }, + { + "epoch": 1.2834008097165992, + "grad_norm": 2.1052363417173012, + "learning_rate": 9.976247197743465e-06, + "loss": 1.8522, + "step": 317 + }, + { + "epoch": 1.2874493927125505, + "grad_norm": 1.1846256759923113, + "learning_rate": 9.975554287314505e-06, + "loss": 1.9432, + "step": 318 + }, + { + "epoch": 1.291497975708502, + "grad_norm": 1.138896431387234, + "learning_rate": 9.974851439727045e-06, + "loss": 1.8181, + "step": 319 + }, + { + "epoch": 1.2955465587044535, + "grad_norm": 1.153796269934686, + "learning_rate": 9.974138656384815e-06, + "loss": 2.1573, + "step": 320 + }, + { + "epoch": 1.2995951417004048, + "grad_norm": 1.703181471948063, + "learning_rate": 9.973415938711383e-06, + "loss": 2.1787, + "step": 321 + }, + { + "epoch": 1.3036437246963564, + "grad_norm": 1.7096036636558702, + "learning_rate": 9.972683288150155e-06, + "loss": 1.9479, + "step": 322 + }, + { + "epoch": 1.3076923076923077, + "grad_norm": 1.1866073546875906, + "learning_rate": 9.97194070616438e-06, + "loss": 1.9284, + "step": 323 + }, + { + "epoch": 1.311740890688259, + "grad_norm": 1.0952591943942271, + "learning_rate": 9.971188194237141e-06, + "loss": 1.9908, + "step": 324 + }, + { + "epoch": 1.3157894736842106, + "grad_norm": 1.5313235105110092, + "learning_rate": 9.97042575387135e-06, + "loss": 2.0365, + "step": 325 + }, + { + "epoch": 1.319838056680162, + "grad_norm": 1.2326037015549494, + "learning_rate": 9.969653386589749e-06, + "loss": 1.9016, + "step": 326 + }, + { + "epoch": 1.3238866396761133, + "grad_norm": 1.08612437072456, + "learning_rate": 9.968871093934908e-06, + "loss": 1.9295, + "step": 327 + }, + { + "epoch": 1.3279352226720649, + "grad_norm": 1.1765201682452633, + "learning_rate": 9.968078877469221e-06, + "loss": 1.9057, + "step": 328 + }, + { + "epoch": 1.3319838056680162, + "grad_norm": 1.1266840563836074, + "learning_rate": 9.967276738774897e-06, + "loss": 1.7933, + "step": 329 + }, + { + "epoch": 1.3360323886639676, + "grad_norm": 1.096241365913634, + "learning_rate": 9.966464679453969e-06, + "loss": 1.8225, + "step": 330 + }, + { + "epoch": 1.3400809716599191, + "grad_norm": 1.0190613068454424, + "learning_rate": 9.965642701128273e-06, + "loss": 1.7548, + "step": 331 + 
}, + { + "epoch": 1.3441295546558705, + "grad_norm": 1.045370042720153, + "learning_rate": 9.964810805439464e-06, + "loss": 1.8602, + "step": 332 + }, + { + "epoch": 1.3481781376518218, + "grad_norm": 1.2609434903119947, + "learning_rate": 9.963968994049e-06, + "loss": 2.0594, + "step": 333 + }, + { + "epoch": 1.3522267206477734, + "grad_norm": 2.6150970483606812, + "learning_rate": 9.963117268638147e-06, + "loss": 1.8496, + "step": 334 + }, + { + "epoch": 1.3562753036437247, + "grad_norm": 1.2099371136718209, + "learning_rate": 9.962255630907964e-06, + "loss": 1.6494, + "step": 335 + }, + { + "epoch": 1.360323886639676, + "grad_norm": 1.313765722576788, + "learning_rate": 9.961384082579311e-06, + "loss": 1.9562, + "step": 336 + }, + { + "epoch": 1.3643724696356276, + "grad_norm": 1.2172159882432991, + "learning_rate": 9.96050262539284e-06, + "loss": 2.0155, + "step": 337 + }, + { + "epoch": 1.368421052631579, + "grad_norm": 1.2586156100651915, + "learning_rate": 9.959611261108999e-06, + "loss": 1.9085, + "step": 338 + }, + { + "epoch": 1.3724696356275303, + "grad_norm": 1.5183212778349207, + "learning_rate": 9.958709991508013e-06, + "loss": 2.0875, + "step": 339 + }, + { + "epoch": 1.376518218623482, + "grad_norm": 1.1522560111562028, + "learning_rate": 9.957798818389894e-06, + "loss": 1.619, + "step": 340 + }, + { + "epoch": 1.3805668016194332, + "grad_norm": 1.1594845675041106, + "learning_rate": 9.956877743574437e-06, + "loss": 1.809, + "step": 341 + }, + { + "epoch": 1.3846153846153846, + "grad_norm": 1.1122066306670175, + "learning_rate": 9.955946768901207e-06, + "loss": 1.7047, + "step": 342 + }, + { + "epoch": 1.3886639676113361, + "grad_norm": 1.330314253280862, + "learning_rate": 9.955005896229543e-06, + "loss": 1.7574, + "step": 343 + }, + { + "epoch": 1.3927125506072875, + "grad_norm": 1.1715493987473338, + "learning_rate": 9.954055127438554e-06, + "loss": 1.903, + "step": 344 + }, + { + "epoch": 1.3967611336032388, + "grad_norm": 1.3791674988449036, + "learning_rate": 9.95309446442711e-06, + "loss": 1.7259, + "step": 345 + }, + { + "epoch": 1.4008097165991904, + "grad_norm": 1.1049829081327143, + "learning_rate": 9.952123909113842e-06, + "loss": 1.7903, + "step": 346 + }, + { + "epoch": 1.4048582995951417, + "grad_norm": 1.2032214776472194, + "learning_rate": 9.951143463437145e-06, + "loss": 1.8805, + "step": 347 + }, + { + "epoch": 1.408906882591093, + "grad_norm": 1.4430732870842997, + "learning_rate": 9.950153129355156e-06, + "loss": 1.963, + "step": 348 + }, + { + "epoch": 1.4129554655870447, + "grad_norm": 1.1510222292519288, + "learning_rate": 9.949152908845771e-06, + "loss": 1.8567, + "step": 349 + }, + { + "epoch": 1.417004048582996, + "grad_norm": 1.195578264117532, + "learning_rate": 9.948142803906623e-06, + "loss": 2.0649, + "step": 350 + }, + { + "epoch": 1.4210526315789473, + "grad_norm": 1.233691487377917, + "learning_rate": 9.947122816555091e-06, + "loss": 2.1272, + "step": 351 + }, + { + "epoch": 1.425101214574899, + "grad_norm": 1.1086448213277071, + "learning_rate": 9.94609294882829e-06, + "loss": 1.9559, + "step": 352 + }, + { + "epoch": 1.4291497975708503, + "grad_norm": 1.095236792272251, + "learning_rate": 9.94505320278307e-06, + "loss": 2.0925, + "step": 353 + }, + { + "epoch": 1.4331983805668016, + "grad_norm": 1.5358655904235856, + "learning_rate": 9.944003580496004e-06, + "loss": 2.1299, + "step": 354 + }, + { + "epoch": 1.4372469635627532, + "grad_norm": 4.618210545500014, + "learning_rate": 9.942944084063397e-06, + "loss": 1.906, + "step": 355 + }, 
+ { + "epoch": 1.4412955465587045, + "grad_norm": 1.2771853507714968, + "learning_rate": 9.94187471560127e-06, + "loss": 1.8895, + "step": 356 + }, + { + "epoch": 1.4453441295546559, + "grad_norm": 1.503260525653169, + "learning_rate": 9.940795477245362e-06, + "loss": 2.123, + "step": 357 + }, + { + "epoch": 1.4493927125506074, + "grad_norm": 1.1357577615662766, + "learning_rate": 9.939706371151124e-06, + "loss": 1.9087, + "step": 358 + }, + { + "epoch": 1.4534412955465588, + "grad_norm": 1.3448821103990194, + "learning_rate": 9.938607399493714e-06, + "loss": 1.8989, + "step": 359 + }, + { + "epoch": 1.45748987854251, + "grad_norm": 1.3913310219583304, + "learning_rate": 9.937498564467993e-06, + "loss": 2.2799, + "step": 360 + }, + { + "epoch": 1.4615384615384617, + "grad_norm": 1.9605641433764716, + "learning_rate": 9.936379868288525e-06, + "loss": 2.5915, + "step": 361 + }, + { + "epoch": 1.465587044534413, + "grad_norm": 1.2844543412275256, + "learning_rate": 9.935251313189564e-06, + "loss": 2.1301, + "step": 362 + }, + { + "epoch": 1.4696356275303644, + "grad_norm": 1.034982029315575, + "learning_rate": 9.934112901425058e-06, + "loss": 2.0549, + "step": 363 + }, + { + "epoch": 1.4736842105263157, + "grad_norm": 1.204999735063322, + "learning_rate": 9.932964635268637e-06, + "loss": 1.9596, + "step": 364 + }, + { + "epoch": 1.4777327935222673, + "grad_norm": 1.286601988495976, + "learning_rate": 9.931806517013612e-06, + "loss": 2.0348, + "step": 365 + }, + { + "epoch": 1.4817813765182186, + "grad_norm": 0.9482600934112612, + "learning_rate": 9.930638548972976e-06, + "loss": 1.9226, + "step": 366 + }, + { + "epoch": 1.48582995951417, + "grad_norm": 1.2527379198286719, + "learning_rate": 9.92946073347939e-06, + "loss": 1.9363, + "step": 367 + }, + { + "epoch": 1.4898785425101215, + "grad_norm": 1.416748811839403, + "learning_rate": 9.92827307288518e-06, + "loss": 1.8743, + "step": 368 + }, + { + "epoch": 1.4939271255060729, + "grad_norm": 1.4807677636442649, + "learning_rate": 9.927075569562342e-06, + "loss": 1.9204, + "step": 369 + }, + { + "epoch": 1.4979757085020242, + "grad_norm": 1.3869419977919077, + "learning_rate": 9.925868225902518e-06, + "loss": 1.8206, + "step": 370 + }, + { + "epoch": 1.5020242914979756, + "grad_norm": 1.1484019096824427, + "learning_rate": 9.924651044317017e-06, + "loss": 1.741, + "step": 371 + }, + { + "epoch": 1.5060728744939271, + "grad_norm": 1.33557569757452, + "learning_rate": 9.923424027236786e-06, + "loss": 2.0195, + "step": 372 + }, + { + "epoch": 1.5101214574898787, + "grad_norm": 1.3948710108814935, + "learning_rate": 9.922187177112422e-06, + "loss": 2.0682, + "step": 373 + }, + { + "epoch": 1.5141700404858298, + "grad_norm": 0.9670281862333157, + "learning_rate": 9.920940496414153e-06, + "loss": 2.0098, + "step": 374 + }, + { + "epoch": 1.5182186234817814, + "grad_norm": 1.1816940948972323, + "learning_rate": 9.919683987631849e-06, + "loss": 2.041, + "step": 375 + }, + { + "epoch": 1.522267206477733, + "grad_norm": 1.1912191018269882, + "learning_rate": 9.918417653275004e-06, + "loss": 1.9668, + "step": 376 + }, + { + "epoch": 1.526315789473684, + "grad_norm": 2.3568912806934783, + "learning_rate": 9.917141495872733e-06, + "loss": 1.737, + "step": 377 + }, + { + "epoch": 1.5303643724696356, + "grad_norm": 1.4730591126031292, + "learning_rate": 9.915855517973776e-06, + "loss": 1.8672, + "step": 378 + }, + { + "epoch": 1.5344129554655872, + "grad_norm": 1.5631199604094446, + "learning_rate": 9.914559722146483e-06, + "loss": 2.0038, + "step": 379 + }, 
+ { + "epoch": 1.5384615384615383, + "grad_norm": 2.5148949693335014, + "learning_rate": 9.913254110978812e-06, + "loss": 2.0916, + "step": 380 + }, + { + "epoch": 1.54251012145749, + "grad_norm": 1.0936340454215232, + "learning_rate": 9.911938687078324e-06, + "loss": 1.9959, + "step": 381 + }, + { + "epoch": 1.5465587044534415, + "grad_norm": 9.59805118170954, + "learning_rate": 9.91061345307218e-06, + "loss": 2.6669, + "step": 382 + }, + { + "epoch": 1.5506072874493926, + "grad_norm": 5.341110768663029, + "learning_rate": 9.909278411607134e-06, + "loss": 2.7524, + "step": 383 + }, + { + "epoch": 1.5546558704453441, + "grad_norm": 6.319523626825805, + "learning_rate": 9.90793356534952e-06, + "loss": 3.2784, + "step": 384 + }, + { + "epoch": 1.5587044534412957, + "grad_norm": 1.1632747156326964, + "learning_rate": 9.906578916985267e-06, + "loss": 1.9441, + "step": 385 + }, + { + "epoch": 1.5627530364372468, + "grad_norm": 1.129320861281679, + "learning_rate": 9.90521446921987e-06, + "loss": 1.84, + "step": 386 + }, + { + "epoch": 1.5668016194331984, + "grad_norm": 1.0396625767769134, + "learning_rate": 9.9038402247784e-06, + "loss": 2.0999, + "step": 387 + }, + { + "epoch": 1.5708502024291497, + "grad_norm": 1.1109350507878293, + "learning_rate": 9.90245618640549e-06, + "loss": 1.7455, + "step": 388 + }, + { + "epoch": 1.574898785425101, + "grad_norm": 1.1573410708340344, + "learning_rate": 9.90106235686534e-06, + "loss": 2.1349, + "step": 389 + }, + { + "epoch": 1.5789473684210527, + "grad_norm": 1.0084157125260218, + "learning_rate": 9.8996587389417e-06, + "loss": 1.8406, + "step": 390 + }, + { + "epoch": 1.582995951417004, + "grad_norm": 1.1571333441837306, + "learning_rate": 9.89824533543787e-06, + "loss": 2.1231, + "step": 391 + }, + { + "epoch": 1.5870445344129553, + "grad_norm": 1.0697948256338023, + "learning_rate": 9.896822149176695e-06, + "loss": 1.9727, + "step": 392 + }, + { + "epoch": 1.591093117408907, + "grad_norm": 1.1795302734430202, + "learning_rate": 9.895389183000557e-06, + "loss": 1.9829, + "step": 393 + }, + { + "epoch": 1.5951417004048583, + "grad_norm": 1.3378200533531102, + "learning_rate": 9.893946439771369e-06, + "loss": 1.648, + "step": 394 + }, + { + "epoch": 1.5991902834008096, + "grad_norm": 1.190232768067943, + "learning_rate": 9.892493922370575e-06, + "loss": 1.6858, + "step": 395 + }, + { + "epoch": 1.6032388663967612, + "grad_norm": 1.1458315074040415, + "learning_rate": 9.891031633699135e-06, + "loss": 1.8744, + "step": 396 + }, + { + "epoch": 1.6072874493927125, + "grad_norm": 1.1819017581575564, + "learning_rate": 9.88955957667753e-06, + "loss": 1.7732, + "step": 397 + }, + { + "epoch": 1.6113360323886639, + "grad_norm": 1.8565903989047288, + "learning_rate": 9.888077754245741e-06, + "loss": 2.0753, + "step": 398 + }, + { + "epoch": 1.6153846153846154, + "grad_norm": 1.0244971639990994, + "learning_rate": 9.886586169363267e-06, + "loss": 1.9333, + "step": 399 + }, + { + "epoch": 1.6194331983805668, + "grad_norm": 1.249918723327364, + "learning_rate": 9.885084825009085e-06, + "loss": 1.8167, + "step": 400 + }, + { + "epoch": 1.623481781376518, + "grad_norm": 1.379879581099796, + "learning_rate": 9.883573724181683e-06, + "loss": 2.1783, + "step": 401 + }, + { + "epoch": 1.6275303643724697, + "grad_norm": 1.0714251364756116, + "learning_rate": 9.882052869899024e-06, + "loss": 1.9676, + "step": 402 + }, + { + "epoch": 1.631578947368421, + "grad_norm": 1.2237579067545878, + "learning_rate": 9.880522265198548e-06, + "loss": 2.154, + "step": 403 + }, + { + 
"epoch": 1.6356275303643724, + "grad_norm": 1.0681493200255976, + "learning_rate": 9.878981913137178e-06, + "loss": 1.8629, + "step": 404 + }, + { + "epoch": 1.639676113360324, + "grad_norm": 1.213978261543208, + "learning_rate": 9.877431816791299e-06, + "loss": 2.0544, + "step": 405 + }, + { + "epoch": 1.6437246963562753, + "grad_norm": 1.0906406926843764, + "learning_rate": 9.875871979256754e-06, + "loss": 2.0126, + "step": 406 + }, + { + "epoch": 1.6477732793522266, + "grad_norm": 1.1548847276751324, + "learning_rate": 9.87430240364885e-06, + "loss": 1.9896, + "step": 407 + }, + { + "epoch": 1.6518218623481782, + "grad_norm": 1.1007484969249457, + "learning_rate": 9.872723093102332e-06, + "loss": 1.8537, + "step": 408 + }, + { + "epoch": 1.6558704453441295, + "grad_norm": 1.4626798707839297, + "learning_rate": 9.871134050771398e-06, + "loss": 2.0636, + "step": 409 + }, + { + "epoch": 1.6599190283400809, + "grad_norm": 1.4362925135326843, + "learning_rate": 9.869535279829674e-06, + "loss": 1.892, + "step": 410 + }, + { + "epoch": 1.6639676113360324, + "grad_norm": 1.1158035130218342, + "learning_rate": 9.867926783470221e-06, + "loss": 2.0106, + "step": 411 + }, + { + "epoch": 1.6680161943319838, + "grad_norm": 1.094342494438384, + "learning_rate": 9.866308564905523e-06, + "loss": 2.0453, + "step": 412 + }, + { + "epoch": 1.6720647773279351, + "grad_norm": 1.0432966613184569, + "learning_rate": 9.864680627367476e-06, + "loss": 1.9541, + "step": 413 + }, + { + "epoch": 1.6761133603238867, + "grad_norm": 1.2646590113938572, + "learning_rate": 9.863042974107395e-06, + "loss": 1.9078, + "step": 414 + }, + { + "epoch": 1.680161943319838, + "grad_norm": 1.4143613333940679, + "learning_rate": 9.861395608395993e-06, + "loss": 2.0498, + "step": 415 + }, + { + "epoch": 1.6842105263157894, + "grad_norm": 1.1227780591009553, + "learning_rate": 9.859738533523384e-06, + "loss": 1.8425, + "step": 416 + }, + { + "epoch": 1.688259109311741, + "grad_norm": 1.1478310296573677, + "learning_rate": 9.85807175279907e-06, + "loss": 1.9961, + "step": 417 + }, + { + "epoch": 1.6923076923076923, + "grad_norm": 1.1555612172711482, + "learning_rate": 9.856395269551941e-06, + "loss": 1.9982, + "step": 418 + }, + { + "epoch": 1.6963562753036436, + "grad_norm": 1.2453555718552303, + "learning_rate": 9.854709087130261e-06, + "loss": 1.8074, + "step": 419 + }, + { + "epoch": 1.7004048582995952, + "grad_norm": 1.3445248996792332, + "learning_rate": 9.85301320890167e-06, + "loss": 2.315, + "step": 420 + }, + { + "epoch": 1.7044534412955465, + "grad_norm": 1.37583724829167, + "learning_rate": 9.851307638253167e-06, + "loss": 2.0698, + "step": 421 + }, + { + "epoch": 1.708502024291498, + "grad_norm": 1.4100704184587762, + "learning_rate": 9.849592378591113e-06, + "loss": 1.7238, + "step": 422 + }, + { + "epoch": 1.7125506072874495, + "grad_norm": 1.2265807736330994, + "learning_rate": 9.847867433341218e-06, + "loss": 1.881, + "step": 423 + }, + { + "epoch": 1.7165991902834008, + "grad_norm": 1.192372006539784, + "learning_rate": 9.846132805948534e-06, + "loss": 1.9658, + "step": 424 + }, + { + "epoch": 1.7206477732793521, + "grad_norm": 1.307546713268623, + "learning_rate": 9.844388499877457e-06, + "loss": 1.873, + "step": 425 + }, + { + "epoch": 1.7246963562753037, + "grad_norm": 1.382722813051471, + "learning_rate": 9.842634518611705e-06, + "loss": 1.9664, + "step": 426 + }, + { + "epoch": 1.728744939271255, + "grad_norm": 1.4179302059943903, + "learning_rate": 9.840870865654323e-06, + "loss": 2.1073, + "step": 427 + }, + { 
+ "epoch": 1.7327935222672064, + "grad_norm": 1.0508460965436048, + "learning_rate": 9.839097544527674e-06, + "loss": 1.9957, + "step": 428 + }, + { + "epoch": 1.736842105263158, + "grad_norm": 1.239601761065164, + "learning_rate": 9.837314558773427e-06, + "loss": 2.0381, + "step": 429 + }, + { + "epoch": 1.7408906882591093, + "grad_norm": 2.1485433652175137, + "learning_rate": 9.835521911952554e-06, + "loss": 2.6976, + "step": 430 + }, + { + "epoch": 1.7449392712550607, + "grad_norm": 1.2416619753926275, + "learning_rate": 9.833719607645325e-06, + "loss": 2.0715, + "step": 431 + }, + { + "epoch": 1.7489878542510122, + "grad_norm": 1.2591779562696075, + "learning_rate": 9.831907649451291e-06, + "loss": 1.9002, + "step": 432 + }, + { + "epoch": 1.7530364372469636, + "grad_norm": 1.1535891547143164, + "learning_rate": 9.830086040989294e-06, + "loss": 1.7871, + "step": 433 + }, + { + "epoch": 1.757085020242915, + "grad_norm": 1.1923358702044, + "learning_rate": 9.82825478589744e-06, + "loss": 1.9962, + "step": 434 + }, + { + "epoch": 1.7611336032388665, + "grad_norm": 4.275347299758622, + "learning_rate": 9.826413887833103e-06, + "loss": 2.9222, + "step": 435 + }, + { + "epoch": 1.7651821862348178, + "grad_norm": 4.287598045967039, + "learning_rate": 9.824563350472922e-06, + "loss": 2.8461, + "step": 436 + }, + { + "epoch": 1.7692307692307692, + "grad_norm": 10.935868536450831, + "learning_rate": 9.822703177512783e-06, + "loss": 2.7384, + "step": 437 + }, + { + "epoch": 1.7732793522267207, + "grad_norm": 1.3409883266265459, + "learning_rate": 9.820833372667813e-06, + "loss": 1.9939, + "step": 438 + }, + { + "epoch": 1.777327935222672, + "grad_norm": 1.3613081112789813, + "learning_rate": 9.818953939672382e-06, + "loss": 2.1821, + "step": 439 + }, + { + "epoch": 1.7813765182186234, + "grad_norm": 1.2675875076339627, + "learning_rate": 9.817064882280085e-06, + "loss": 2.2096, + "step": 440 + }, + { + "epoch": 1.785425101214575, + "grad_norm": 1.1133761183439654, + "learning_rate": 9.815166204263743e-06, + "loss": 2.0038, + "step": 441 + }, + { + "epoch": 1.7894736842105263, + "grad_norm": 1.0606754044873359, + "learning_rate": 9.813257909415384e-06, + "loss": 1.887, + "step": 442 + }, + { + "epoch": 1.7935222672064777, + "grad_norm": 1.2526447757224037, + "learning_rate": 9.811340001546252e-06, + "loss": 2.0549, + "step": 443 + }, + { + "epoch": 1.7975708502024292, + "grad_norm": 1.1262042906691425, + "learning_rate": 9.809412484486785e-06, + "loss": 2.077, + "step": 444 + }, + { + "epoch": 1.8016194331983806, + "grad_norm": 1.155022921046038, + "learning_rate": 9.80747536208661e-06, + "loss": 1.8171, + "step": 445 + }, + { + "epoch": 1.805668016194332, + "grad_norm": 1.1470501457250857, + "learning_rate": 9.805528638214543e-06, + "loss": 1.709, + "step": 446 + }, + { + "epoch": 1.8097165991902835, + "grad_norm": 1.254871859778204, + "learning_rate": 9.803572316758573e-06, + "loss": 2.005, + "step": 447 + }, + { + "epoch": 1.8137651821862348, + "grad_norm": 1.4428684006978485, + "learning_rate": 9.801606401625857e-06, + "loss": 2.0437, + "step": 448 + }, + { + "epoch": 1.8178137651821862, + "grad_norm": 1.1372709832560302, + "learning_rate": 9.799630896742716e-06, + "loss": 1.8053, + "step": 449 + }, + { + "epoch": 1.8218623481781377, + "grad_norm": 7.867540851479705, + "learning_rate": 9.797645806054617e-06, + "loss": 2.6057, + "step": 450 + }, + { + "epoch": 1.825910931174089, + "grad_norm": 17.828898730946783, + "learning_rate": 9.79565113352618e-06, + "loss": 4.1742, + "step": 451 + }, + { 
+ "epoch": 1.8299595141700404, + "grad_norm": 1.3323533085958537, + "learning_rate": 9.793646883141155e-06, + "loss": 1.9001, + "step": 452 + }, + { + "epoch": 1.834008097165992, + "grad_norm": 1.2550944955882024, + "learning_rate": 9.791633058902424e-06, + "loss": 1.7789, + "step": 453 + }, + { + "epoch": 1.8380566801619433, + "grad_norm": 1.2515953723091495, + "learning_rate": 9.789609664831988e-06, + "loss": 1.8425, + "step": 454 + }, + { + "epoch": 1.8421052631578947, + "grad_norm": 1.1650016476570495, + "learning_rate": 9.787576704970965e-06, + "loss": 1.8701, + "step": 455 + }, + { + "epoch": 1.8461538461538463, + "grad_norm": 1.1568290050770706, + "learning_rate": 9.785534183379571e-06, + "loss": 1.8468, + "step": 456 + }, + { + "epoch": 1.8502024291497976, + "grad_norm": 1.1529373182216824, + "learning_rate": 9.783482104137127e-06, + "loss": 1.8772, + "step": 457 + }, + { + "epoch": 1.854251012145749, + "grad_norm": 1.3000516637827273, + "learning_rate": 9.781420471342035e-06, + "loss": 1.9477, + "step": 458 + }, + { + "epoch": 1.8582995951417005, + "grad_norm": 1.0258650008659411, + "learning_rate": 9.779349289111781e-06, + "loss": 1.8995, + "step": 459 + }, + { + "epoch": 1.8623481781376519, + "grad_norm": 1.2394575763975424, + "learning_rate": 9.777268561582921e-06, + "loss": 1.9406, + "step": 460 + }, + { + "epoch": 1.8663967611336032, + "grad_norm": 1.2541685708518606, + "learning_rate": 9.77517829291108e-06, + "loss": 1.9325, + "step": 461 + }, + { + "epoch": 1.8704453441295548, + "grad_norm": 1.5330647366042962, + "learning_rate": 9.773078487270932e-06, + "loss": 2.4038, + "step": 462 + }, + { + "epoch": 1.874493927125506, + "grad_norm": 1.5015880335176561, + "learning_rate": 9.770969148856202e-06, + "loss": 2.3187, + "step": 463 + }, + { + "epoch": 1.8785425101214575, + "grad_norm": 1.4834304636666527, + "learning_rate": 9.768850281879651e-06, + "loss": 2.1105, + "step": 464 + }, + { + "epoch": 1.882591093117409, + "grad_norm": 1.2140714457469706, + "learning_rate": 9.766721890573075e-06, + "loss": 1.9824, + "step": 465 + }, + { + "epoch": 1.8866396761133604, + "grad_norm": 1.3661085878272685, + "learning_rate": 9.764583979187288e-06, + "loss": 1.5205, + "step": 466 + }, + { + "epoch": 1.8906882591093117, + "grad_norm": 1.2317311840953222, + "learning_rate": 9.762436551992117e-06, + "loss": 1.9872, + "step": 467 + }, + { + "epoch": 1.8947368421052633, + "grad_norm": 1.3883104250103875, + "learning_rate": 9.760279613276397e-06, + "loss": 2.0814, + "step": 468 + }, + { + "epoch": 1.8987854251012146, + "grad_norm": 1.1681713845582538, + "learning_rate": 9.75811316734796e-06, + "loss": 1.7849, + "step": 469 + }, + { + "epoch": 1.902834008097166, + "grad_norm": 1.15545443174025, + "learning_rate": 9.755937218533622e-06, + "loss": 1.8179, + "step": 470 + }, + { + "epoch": 1.9068825910931175, + "grad_norm": 1.5408624758508003, + "learning_rate": 9.753751771179177e-06, + "loss": 2.0286, + "step": 471 + }, + { + "epoch": 1.9109311740890689, + "grad_norm": 1.3817398480348058, + "learning_rate": 9.751556829649398e-06, + "loss": 1.5547, + "step": 472 + }, + { + "epoch": 1.9149797570850202, + "grad_norm": 1.3351696061966247, + "learning_rate": 9.74935239832801e-06, + "loss": 1.733, + "step": 473 + }, + { + "epoch": 1.9190283400809718, + "grad_norm": 1.264760117783077, + "learning_rate": 9.747138481617695e-06, + "loss": 1.767, + "step": 474 + }, + { + "epoch": 1.9230769230769231, + "grad_norm": 1.2863761477462097, + "learning_rate": 9.74491508394008e-06, + "loss": 2.0018, + "step": 475 + 
}, + { + "epoch": 1.9271255060728745, + "grad_norm": 1.5310497493928237, + "learning_rate": 9.742682209735727e-06, + "loss": 1.8865, + "step": 476 + }, + { + "epoch": 1.931174089068826, + "grad_norm": 1.711973469366144, + "learning_rate": 9.740439863464127e-06, + "loss": 1.9105, + "step": 477 + }, + { + "epoch": 1.9352226720647774, + "grad_norm": 1.249933707627717, + "learning_rate": 9.738188049603679e-06, + "loss": 1.7676, + "step": 478 + }, + { + "epoch": 1.9392712550607287, + "grad_norm": 1.2902981801333298, + "learning_rate": 9.735926772651703e-06, + "loss": 1.6493, + "step": 479 + }, + { + "epoch": 1.9433198380566803, + "grad_norm": 1.4792877192638219, + "learning_rate": 9.73365603712441e-06, + "loss": 1.9464, + "step": 480 + }, + { + "epoch": 1.9473684210526314, + "grad_norm": 1.3282266924987296, + "learning_rate": 9.731375847556905e-06, + "loss": 1.6826, + "step": 481 + }, + { + "epoch": 1.951417004048583, + "grad_norm": 1.4677668638223476, + "learning_rate": 9.729086208503174e-06, + "loss": 1.7014, + "step": 482 + }, + { + "epoch": 1.9554655870445345, + "grad_norm": 2.3808599607342855, + "learning_rate": 9.726787124536077e-06, + "loss": 1.9583, + "step": 483 + }, + { + "epoch": 1.9595141700404857, + "grad_norm": 1.3600754750050374, + "learning_rate": 9.724478600247333e-06, + "loss": 1.7925, + "step": 484 + }, + { + "epoch": 1.9635627530364372, + "grad_norm": 1.1666914976637783, + "learning_rate": 9.722160640247523e-06, + "loss": 1.8932, + "step": 485 + }, + { + "epoch": 1.9676113360323888, + "grad_norm": 1.3451750453053897, + "learning_rate": 9.719833249166061e-06, + "loss": 2.1332, + "step": 486 + }, + { + "epoch": 1.97165991902834, + "grad_norm": 1.9010105722641066, + "learning_rate": 9.717496431651212e-06, + "loss": 2.0526, + "step": 487 + }, + { + "epoch": 1.9757085020242915, + "grad_norm": 1.1672390815512188, + "learning_rate": 9.715150192370054e-06, + "loss": 1.8783, + "step": 488 + }, + { + "epoch": 1.979757085020243, + "grad_norm": 1.384114220461852, + "learning_rate": 9.712794536008488e-06, + "loss": 1.9859, + "step": 489 + }, + { + "epoch": 1.9838056680161942, + "grad_norm": 1.2933526518975824, + "learning_rate": 9.710429467271221e-06, + "loss": 2.0382, + "step": 490 + }, + { + "epoch": 1.9878542510121457, + "grad_norm": 1.423570288241044, + "learning_rate": 9.708054990881763e-06, + "loss": 1.8377, + "step": 491 + }, + { + "epoch": 1.9919028340080973, + "grad_norm": 1.2866158830707874, + "learning_rate": 9.705671111582406e-06, + "loss": 1.7694, + "step": 492 + }, + { + "epoch": 1.9959514170040484, + "grad_norm": 1.0521519412024614, + "learning_rate": 9.703277834134227e-06, + "loss": 2.0757, + "step": 493 + }, + { + "epoch": 2.0, + "grad_norm": 1.2995506674782646, + "learning_rate": 9.700875163317072e-06, + "loss": 1.8875, + "step": 494 + }, + { + "epoch": 2.0040485829959516, + "grad_norm": 1.1352855274001465, + "learning_rate": 9.698463103929542e-06, + "loss": 1.9618, + "step": 495 + }, + { + "epoch": 2.0080971659919027, + "grad_norm": 1.542269208448278, + "learning_rate": 9.696041660788997e-06, + "loss": 2.0888, + "step": 496 + }, + { + "epoch": 2.0121457489878543, + "grad_norm": 1.6780350902786914, + "learning_rate": 9.693610838731532e-06, + "loss": 1.9408, + "step": 497 + }, + { + "epoch": 2.016194331983806, + "grad_norm": 1.6035230575875041, + "learning_rate": 9.691170642611975e-06, + "loss": 2.0771, + "step": 498 + }, + { + "epoch": 2.020242914979757, + "grad_norm": 1.4671035377471024, + "learning_rate": 9.68872107730388e-06, + "loss": 2.3311, + "step": 499 + }, + { 
+ "epoch": 2.0242914979757085, + "grad_norm": 1.5075955512152057, + "learning_rate": 9.686262147699507e-06, + "loss": 2.2077, + "step": 500 + }, + { + "epoch": 2.02834008097166, + "grad_norm": 1.5639916261560791, + "learning_rate": 9.683793858709821e-06, + "loss": 1.8546, + "step": 501 + }, + { + "epoch": 2.032388663967611, + "grad_norm": 1.5331421353363675, + "learning_rate": 9.681316215264481e-06, + "loss": 1.9004, + "step": 502 + }, + { + "epoch": 2.0364372469635628, + "grad_norm": 1.4656462364511347, + "learning_rate": 9.678829222311827e-06, + "loss": 2.1369, + "step": 503 + }, + { + "epoch": 2.0404858299595143, + "grad_norm": 1.7055289856989309, + "learning_rate": 9.67633288481887e-06, + "loss": 1.9294, + "step": 504 + }, + { + "epoch": 2.0445344129554655, + "grad_norm": 1.3320529357395552, + "learning_rate": 9.67382720777129e-06, + "loss": 1.9228, + "step": 505 + }, + { + "epoch": 2.048582995951417, + "grad_norm": 1.378485994628673, + "learning_rate": 9.671312196173413e-06, + "loss": 1.9005, + "step": 506 + }, + { + "epoch": 2.0526315789473686, + "grad_norm": 1.4519006220083899, + "learning_rate": 9.668787855048209e-06, + "loss": 2.0772, + "step": 507 + }, + { + "epoch": 2.0566801619433197, + "grad_norm": 1.46960243337033, + "learning_rate": 9.666254189437286e-06, + "loss": 1.9259, + "step": 508 + }, + { + "epoch": 2.0607287449392713, + "grad_norm": 1.3018755932293484, + "learning_rate": 9.663711204400872e-06, + "loss": 2.0637, + "step": 509 + }, + { + "epoch": 2.064777327935223, + "grad_norm": 1.4438151108336905, + "learning_rate": 9.661158905017804e-06, + "loss": 1.9998, + "step": 510 + }, + { + "epoch": 2.068825910931174, + "grad_norm": 1.5146888645164116, + "learning_rate": 9.658597296385527e-06, + "loss": 2.1032, + "step": 511 + }, + { + "epoch": 2.0728744939271255, + "grad_norm": 1.4173605487062464, + "learning_rate": 9.656026383620076e-06, + "loss": 1.9957, + "step": 512 + }, + { + "epoch": 2.076923076923077, + "grad_norm": 1.3186505882274318, + "learning_rate": 9.653446171856069e-06, + "loss": 1.9291, + "step": 513 + }, + { + "epoch": 2.080971659919028, + "grad_norm": 1.2929004725593367, + "learning_rate": 9.650856666246693e-06, + "loss": 1.9435, + "step": 514 + }, + { + "epoch": 2.08502024291498, + "grad_norm": 1.2511951269635655, + "learning_rate": 9.6482578719637e-06, + "loss": 1.9267, + "step": 515 + }, + { + "epoch": 2.0890688259109313, + "grad_norm": 1.9429673192553882, + "learning_rate": 9.645649794197394e-06, + "loss": 1.9435, + "step": 516 + }, + { + "epoch": 2.0931174089068825, + "grad_norm": 1.315419932054697, + "learning_rate": 9.643032438156616e-06, + "loss": 2.0396, + "step": 517 + }, + { + "epoch": 2.097165991902834, + "grad_norm": 1.3284199817957691, + "learning_rate": 9.640405809068743e-06, + "loss": 1.765, + "step": 518 + }, + { + "epoch": 2.1012145748987856, + "grad_norm": 1.4032585852247357, + "learning_rate": 9.637769912179664e-06, + "loss": 1.9292, + "step": 519 + }, + { + "epoch": 2.1052631578947367, + "grad_norm": 1.4202061741742247, + "learning_rate": 9.635124752753787e-06, + "loss": 1.9832, + "step": 520 + }, + { + "epoch": 2.1093117408906883, + "grad_norm": 1.4962037346644237, + "learning_rate": 9.632470336074009e-06, + "loss": 1.8461, + "step": 521 + }, + { + "epoch": 2.11336032388664, + "grad_norm": 1.829451958189404, + "learning_rate": 9.629806667441727e-06, + "loss": 1.7856, + "step": 522 + }, + { + "epoch": 2.117408906882591, + "grad_norm": 1.6374878381545, + "learning_rate": 9.627133752176809e-06, + "loss": 1.7441, + "step": 523 + }, + { + 
"epoch": 2.1214574898785425, + "grad_norm": 1.4010819404830996, + "learning_rate": 9.624451595617588e-06, + "loss": 1.7615, + "step": 524 + }, + { + "epoch": 2.125506072874494, + "grad_norm": 1.441999234959946, + "learning_rate": 9.62176020312086e-06, + "loss": 1.7378, + "step": 525 + }, + { + "epoch": 2.1295546558704452, + "grad_norm": 1.5770630911097265, + "learning_rate": 9.619059580061862e-06, + "loss": 1.7039, + "step": 526 + }, + { + "epoch": 2.133603238866397, + "grad_norm": 1.4591597594445938, + "learning_rate": 9.616349731834271e-06, + "loss": 2.0009, + "step": 527 + }, + { + "epoch": 2.1376518218623484, + "grad_norm": 1.6179185626843804, + "learning_rate": 9.613630663850184e-06, + "loss": 1.872, + "step": 528 + }, + { + "epoch": 2.1417004048582995, + "grad_norm": 1.3086175576058332, + "learning_rate": 9.610902381540115e-06, + "loss": 1.5977, + "step": 529 + }, + { + "epoch": 2.145748987854251, + "grad_norm": 1.444761778117532, + "learning_rate": 9.608164890352977e-06, + "loss": 2.0221, + "step": 530 + }, + { + "epoch": 2.1497975708502026, + "grad_norm": 1.4113693951603745, + "learning_rate": 9.605418195756077e-06, + "loss": 1.8497, + "step": 531 + }, + { + "epoch": 2.1538461538461537, + "grad_norm": 1.2987083078720463, + "learning_rate": 9.602662303235106e-06, + "loss": 1.9881, + "step": 532 + }, + { + "epoch": 2.1578947368421053, + "grad_norm": 1.5356679778352307, + "learning_rate": 9.599897218294122e-06, + "loss": 2.2169, + "step": 533 + }, + { + "epoch": 2.161943319838057, + "grad_norm": 1.2586253730389827, + "learning_rate": 9.597122946455539e-06, + "loss": 1.8884, + "step": 534 + }, + { + "epoch": 2.165991902834008, + "grad_norm": 1.3241548388576752, + "learning_rate": 9.594339493260127e-06, + "loss": 1.9169, + "step": 535 + }, + { + "epoch": 2.1700404858299596, + "grad_norm": 3.3161848122832627, + "learning_rate": 9.591546864266983e-06, + "loss": 2.3116, + "step": 536 + }, + { + "epoch": 2.174089068825911, + "grad_norm": 1.2785252284615238, + "learning_rate": 9.58874506505354e-06, + "loss": 1.7854, + "step": 537 + }, + { + "epoch": 2.1781376518218623, + "grad_norm": 1.4062987764786141, + "learning_rate": 9.58593410121554e-06, + "loss": 1.9564, + "step": 538 + }, + { + "epoch": 2.182186234817814, + "grad_norm": 1.1858759757574733, + "learning_rate": 9.583113978367026e-06, + "loss": 1.7449, + "step": 539 + }, + { + "epoch": 2.1862348178137654, + "grad_norm": 1.4958289357631562, + "learning_rate": 9.580284702140342e-06, + "loss": 1.8748, + "step": 540 + }, + { + "epoch": 2.1902834008097165, + "grad_norm": 1.271888181605562, + "learning_rate": 9.577446278186103e-06, + "loss": 1.944, + "step": 541 + }, + { + "epoch": 2.194331983805668, + "grad_norm": 1.6297569109832326, + "learning_rate": 9.574598712173202e-06, + "loss": 2.1136, + "step": 542 + }, + { + "epoch": 2.1983805668016196, + "grad_norm": 1.7294919253670684, + "learning_rate": 9.571742009788787e-06, + "loss": 2.1866, + "step": 543 + }, + { + "epoch": 2.2024291497975708, + "grad_norm": 1.5317790321439353, + "learning_rate": 9.568876176738251e-06, + "loss": 1.8859, + "step": 544 + }, + { + "epoch": 2.2064777327935223, + "grad_norm": 1.711554028884214, + "learning_rate": 9.56600121874523e-06, + "loss": 2.1936, + "step": 545 + }, + { + "epoch": 2.2105263157894735, + "grad_norm": 1.4435460877228636, + "learning_rate": 9.563117141551574e-06, + "loss": 2.2517, + "step": 546 + }, + { + "epoch": 2.214574898785425, + "grad_norm": 1.4961050962412457, + "learning_rate": 9.560223950917354e-06, + "loss": 2.041, + "step": 547 + }, + { 
+ "epoch": 2.2186234817813766, + "grad_norm": 1.3247670963766616, + "learning_rate": 9.557321652620839e-06, + "loss": 1.8986, + "step": 548 + }, + { + "epoch": 2.2226720647773277, + "grad_norm": 1.4724998096864195, + "learning_rate": 9.554410252458489e-06, + "loss": 1.8568, + "step": 549 + }, + { + "epoch": 2.2267206477732793, + "grad_norm": 3.7991275518186196, + "learning_rate": 9.551489756244939e-06, + "loss": 1.9347, + "step": 550 + }, + { + "epoch": 2.230769230769231, + "grad_norm": 1.4010848185779328, + "learning_rate": 9.548560169812997e-06, + "loss": 1.8809, + "step": 551 + }, + { + "epoch": 2.234817813765182, + "grad_norm": 1.6221348693259423, + "learning_rate": 9.54562149901362e-06, + "loss": 2.0865, + "step": 552 + }, + { + "epoch": 2.2388663967611335, + "grad_norm": 1.4196865192753882, + "learning_rate": 9.54267374971591e-06, + "loss": 2.0449, + "step": 553 + }, + { + "epoch": 2.242914979757085, + "grad_norm": 1.4599787722592332, + "learning_rate": 9.539716927807102e-06, + "loss": 2.0083, + "step": 554 + }, + { + "epoch": 2.246963562753036, + "grad_norm": 1.251605201082177, + "learning_rate": 9.536751039192549e-06, + "loss": 1.8576, + "step": 555 + }, + { + "epoch": 2.251012145748988, + "grad_norm": 1.30407928376828, + "learning_rate": 9.533776089795712e-06, + "loss": 1.8923, + "step": 556 + }, + { + "epoch": 2.2550607287449393, + "grad_norm": 1.4348421622864604, + "learning_rate": 9.530792085558151e-06, + "loss": 1.9873, + "step": 557 + }, + { + "epoch": 2.2591093117408905, + "grad_norm": 1.4429474918555736, + "learning_rate": 9.527799032439506e-06, + "loss": 1.8211, + "step": 558 + }, + { + "epoch": 2.263157894736842, + "grad_norm": 1.338584745094179, + "learning_rate": 9.524796936417495e-06, + "loss": 1.7082, + "step": 559 + }, + { + "epoch": 2.2672064777327936, + "grad_norm": 1.329824996124572, + "learning_rate": 9.521785803487888e-06, + "loss": 1.9216, + "step": 560 + }, + { + "epoch": 2.2712550607287447, + "grad_norm": 1.3374675078915148, + "learning_rate": 9.518765639664512e-06, + "loss": 1.9723, + "step": 561 + }, + { + "epoch": 2.2753036437246963, + "grad_norm": 1.4689345418902104, + "learning_rate": 9.515736450979224e-06, + "loss": 1.953, + "step": 562 + }, + { + "epoch": 2.279352226720648, + "grad_norm": 1.6439512327159642, + "learning_rate": 9.512698243481914e-06, + "loss": 1.991, + "step": 563 + }, + { + "epoch": 2.283400809716599, + "grad_norm": 1.5280266119657933, + "learning_rate": 9.509651023240472e-06, + "loss": 1.7088, + "step": 564 + }, + { + "epoch": 2.2874493927125505, + "grad_norm": 1.5234607385845351, + "learning_rate": 9.5065947963408e-06, + "loss": 1.7975, + "step": 565 + }, + { + "epoch": 2.291497975708502, + "grad_norm": 1.4898313464385229, + "learning_rate": 9.50352956888678e-06, + "loss": 1.6643, + "step": 566 + }, + { + "epoch": 2.2955465587044532, + "grad_norm": 1.5049004900957001, + "learning_rate": 9.500455347000273e-06, + "loss": 2.0078, + "step": 567 + }, + { + "epoch": 2.299595141700405, + "grad_norm": 1.5268023276941818, + "learning_rate": 9.497372136821103e-06, + "loss": 2.0653, + "step": 568 + }, + { + "epoch": 2.3036437246963564, + "grad_norm": 1.5293343920918272, + "learning_rate": 9.49427994450705e-06, + "loss": 1.8078, + "step": 569 + }, + { + "epoch": 2.3076923076923075, + "grad_norm": 1.504441993367853, + "learning_rate": 9.491178776233825e-06, + "loss": 1.8219, + "step": 570 + }, + { + "epoch": 2.311740890688259, + "grad_norm": 1.3604060927952581, + "learning_rate": 9.488068638195072e-06, + "loss": 1.8582, + "step": 571 + }, + { + 
"epoch": 2.3157894736842106, + "grad_norm": 1.7336288728624165, + "learning_rate": 9.484949536602343e-06, + "loss": 1.8562, + "step": 572 + }, + { + "epoch": 2.3198380566801617, + "grad_norm": 1.536212130823414, + "learning_rate": 9.481821477685102e-06, + "loss": 1.7431, + "step": 573 + }, + { + "epoch": 2.3238866396761133, + "grad_norm": 1.4120913757834546, + "learning_rate": 9.478684467690693e-06, + "loss": 1.7586, + "step": 574 + }, + { + "epoch": 2.327935222672065, + "grad_norm": 1.453958520209467, + "learning_rate": 9.47553851288434e-06, + "loss": 1.7694, + "step": 575 + }, + { + "epoch": 2.331983805668016, + "grad_norm": 1.3935000424019952, + "learning_rate": 9.472383619549133e-06, + "loss": 1.6545, + "step": 576 + }, + { + "epoch": 2.3360323886639676, + "grad_norm": 1.3589610652505588, + "learning_rate": 9.469219793986016e-06, + "loss": 1.6896, + "step": 577 + }, + { + "epoch": 2.340080971659919, + "grad_norm": 1.7566987829139051, + "learning_rate": 9.466047042513767e-06, + "loss": 1.6272, + "step": 578 + }, + { + "epoch": 2.3441295546558703, + "grad_norm": 1.3287178155779462, + "learning_rate": 9.462865371468994e-06, + "loss": 1.7176, + "step": 579 + }, + { + "epoch": 2.348178137651822, + "grad_norm": 1.8490808825118674, + "learning_rate": 9.459674787206117e-06, + "loss": 1.9005, + "step": 580 + }, + { + "epoch": 2.3522267206477734, + "grad_norm": 1.8200114285326863, + "learning_rate": 9.45647529609736e-06, + "loss": 1.7493, + "step": 581 + }, + { + "epoch": 2.3562753036437245, + "grad_norm": 1.7944997812037724, + "learning_rate": 9.453266904532737e-06, + "loss": 1.4856, + "step": 582 + }, + { + "epoch": 2.360323886639676, + "grad_norm": 1.6449884777915886, + "learning_rate": 9.450049618920034e-06, + "loss": 1.8312, + "step": 583 + }, + { + "epoch": 2.3643724696356276, + "grad_norm": 1.6009358010430617, + "learning_rate": 9.4468234456848e-06, + "loss": 1.9048, + "step": 584 + }, + { + "epoch": 2.3684210526315788, + "grad_norm": 1.519230320705593, + "learning_rate": 9.44358839127034e-06, + "loss": 1.8077, + "step": 585 + }, + { + "epoch": 2.3724696356275303, + "grad_norm": 1.8694258750708748, + "learning_rate": 9.44034446213769e-06, + "loss": 1.9556, + "step": 586 + }, + { + "epoch": 2.376518218623482, + "grad_norm": 1.4302907644008036, + "learning_rate": 9.437091664765611e-06, + "loss": 1.5064, + "step": 587 + }, + { + "epoch": 2.380566801619433, + "grad_norm": 1.5423881317930213, + "learning_rate": 9.433830005650582e-06, + "loss": 1.69, + "step": 588 + }, + { + "epoch": 2.3846153846153846, + "grad_norm": 1.4747017336722326, + "learning_rate": 9.430559491306777e-06, + "loss": 1.5552, + "step": 589 + }, + { + "epoch": 2.388663967611336, + "grad_norm": 1.600482934018078, + "learning_rate": 9.427280128266049e-06, + "loss": 1.6106, + "step": 590 + }, + { + "epoch": 2.3927125506072873, + "grad_norm": 1.5014148151060753, + "learning_rate": 9.423991923077938e-06, + "loss": 1.7636, + "step": 591 + }, + { + "epoch": 2.396761133603239, + "grad_norm": 1.7672182274084831, + "learning_rate": 9.420694882309628e-06, + "loss": 1.5786, + "step": 592 + }, + { + "epoch": 2.4008097165991904, + "grad_norm": 1.440572594457583, + "learning_rate": 9.41738901254596e-06, + "loss": 1.6426, + "step": 593 + }, + { + "epoch": 2.4048582995951415, + "grad_norm": 1.5625132261883155, + "learning_rate": 9.414074320389403e-06, + "loss": 1.7306, + "step": 594 + }, + { + "epoch": 2.408906882591093, + "grad_norm": 1.683823244071828, + "learning_rate": 9.41075081246005e-06, + "loss": 1.821, + "step": 595 + }, + { + 
"epoch": 2.4129554655870447, + "grad_norm": 1.4314599370281114, + "learning_rate": 9.4074184953956e-06, + "loss": 1.6872, + "step": 596 + }, + { + "epoch": 2.417004048582996, + "grad_norm": 1.5657957134872598, + "learning_rate": 9.404077375851338e-06, + "loss": 1.9362, + "step": 597 + }, + { + "epoch": 2.4210526315789473, + "grad_norm": 1.6198467768431548, + "learning_rate": 9.400727460500141e-06, + "loss": 2.0139, + "step": 598 + }, + { + "epoch": 2.425101214574899, + "grad_norm": 1.4103077055466628, + "learning_rate": 9.397368756032445e-06, + "loss": 1.8485, + "step": 599 + }, + { + "epoch": 2.42914979757085, + "grad_norm": 1.3471173889103276, + "learning_rate": 9.394001269156245e-06, + "loss": 1.9812, + "step": 600 + }, + { + "epoch": 2.4331983805668016, + "grad_norm": 1.4234064588511484, + "learning_rate": 9.39062500659707e-06, + "loss": 2.0496, + "step": 601 + }, + { + "epoch": 2.437246963562753, + "grad_norm": 1.4784926767119206, + "learning_rate": 9.38723997509798e-06, + "loss": 1.837, + "step": 602 + }, + { + "epoch": 2.4412955465587043, + "grad_norm": 1.5518065193263646, + "learning_rate": 9.383846181419547e-06, + "loss": 1.765, + "step": 603 + }, + { + "epoch": 2.445344129554656, + "grad_norm": 1.3196666479973478, + "learning_rate": 9.380443632339845e-06, + "loss": 2.0255, + "step": 604 + }, + { + "epoch": 2.4493927125506074, + "grad_norm": 1.440061740597458, + "learning_rate": 9.37703233465443e-06, + "loss": 1.7942, + "step": 605 + }, + { + "epoch": 2.4534412955465585, + "grad_norm": 1.5327759577164166, + "learning_rate": 9.373612295176333e-06, + "loss": 1.777, + "step": 606 + }, + { + "epoch": 2.45748987854251, + "grad_norm": 1.6814358499503075, + "learning_rate": 9.370183520736045e-06, + "loss": 2.185, + "step": 607 + }, + { + "epoch": 2.4615384615384617, + "grad_norm": 2.30393335895373, + "learning_rate": 9.366746018181503e-06, + "loss": 2.4563, + "step": 608 + }, + { + "epoch": 2.465587044534413, + "grad_norm": 1.8584859443814368, + "learning_rate": 9.363299794378072e-06, + "loss": 2.0155, + "step": 609 + }, + { + "epoch": 2.4696356275303644, + "grad_norm": 1.2803493212403667, + "learning_rate": 9.359844856208538e-06, + "loss": 1.9623, + "step": 610 + }, + { + "epoch": 2.473684210526316, + "grad_norm": 1.323092986933548, + "learning_rate": 9.356381210573092e-06, + "loss": 1.8725, + "step": 611 + }, + { + "epoch": 2.477732793522267, + "grad_norm": 1.716120944564361, + "learning_rate": 9.352908864389313e-06, + "loss": 1.9058, + "step": 612 + }, + { + "epoch": 2.4817813765182186, + "grad_norm": 1.1767574227433577, + "learning_rate": 9.349427824592157e-06, + "loss": 1.818, + "step": 613 + }, + { + "epoch": 2.48582995951417, + "grad_norm": 1.8646580879242294, + "learning_rate": 9.345938098133946e-06, + "loss": 1.8001, + "step": 614 + }, + { + "epoch": 2.4898785425101213, + "grad_norm": 1.7755724904128214, + "learning_rate": 9.342439691984346e-06, + "loss": 1.7282, + "step": 615 + }, + { + "epoch": 2.493927125506073, + "grad_norm": 1.7352293901651843, + "learning_rate": 9.338932613130363e-06, + "loss": 1.7961, + "step": 616 + }, + { + "epoch": 2.4979757085020244, + "grad_norm": 1.6153408388514847, + "learning_rate": 9.33541686857632e-06, + "loss": 1.662, + "step": 617 + }, + { + "epoch": 2.5020242914979756, + "grad_norm": 1.5099283023047843, + "learning_rate": 9.331892465343851e-06, + "loss": 1.588, + "step": 618 + }, + { + "epoch": 2.506072874493927, + "grad_norm": 1.730183741035281, + "learning_rate": 9.328359410471878e-06, + "loss": 1.8722, + "step": 619 + }, + { + "epoch": 
2.5101214574898787, + "grad_norm": 1.7321761047223487, + "learning_rate": 9.324817711016609e-06, + "loss": 1.9167, + "step": 620 + }, + { + "epoch": 2.51417004048583, + "grad_norm": 1.2095836589724516, + "learning_rate": 9.32126737405151e-06, + "loss": 1.8743, + "step": 621 + }, + { + "epoch": 2.5182186234817814, + "grad_norm": 1.5485434750214813, + "learning_rate": 9.3177084066673e-06, + "loss": 1.89, + "step": 622 + }, + { + "epoch": 2.522267206477733, + "grad_norm": 1.5145693598054688, + "learning_rate": 9.31414081597194e-06, + "loss": 1.8321, + "step": 623 + }, + { + "epoch": 2.526315789473684, + "grad_norm": 1.8660173525702701, + "learning_rate": 9.310564609090605e-06, + "loss": 1.6178, + "step": 624 + }, + { + "epoch": 2.5303643724696356, + "grad_norm": 1.9092894315915314, + "learning_rate": 9.306979793165682e-06, + "loss": 1.718, + "step": 625 + }, + { + "epoch": 2.534412955465587, + "grad_norm": 2.1574694273419817, + "learning_rate": 9.303386375356752e-06, + "loss": 1.8536, + "step": 626 + }, + { + "epoch": 2.5384615384615383, + "grad_norm": 1.5187220263251169, + "learning_rate": 9.299784362840578e-06, + "loss": 2.0088, + "step": 627 + }, + { + "epoch": 2.54251012145749, + "grad_norm": 1.3524410374053388, + "learning_rate": 9.296173762811084e-06, + "loss": 1.8993, + "step": 628 + }, + { + "epoch": 2.5465587044534415, + "grad_norm": 3.8294272400161993, + "learning_rate": 9.292554582479349e-06, + "loss": 2.3583, + "step": 629 + }, + { + "epoch": 2.5506072874493926, + "grad_norm": 6.070012543144345, + "learning_rate": 9.288926829073583e-06, + "loss": 2.4906, + "step": 630 + }, + { + "epoch": 2.554655870445344, + "grad_norm": 5.603752988478888, + "learning_rate": 9.285290509839126e-06, + "loss": 2.7822, + "step": 631 + }, + { + "epoch": 2.5587044534412957, + "grad_norm": 1.4481838054717586, + "learning_rate": 9.281645632038417e-06, + "loss": 1.8168, + "step": 632 + }, + { + "epoch": 2.562753036437247, + "grad_norm": 1.414449313894791, + "learning_rate": 9.277992202950996e-06, + "loss": 1.7136, + "step": 633 + }, + { + "epoch": 2.5668016194331984, + "grad_norm": 1.4634757861687506, + "learning_rate": 9.274330229873474e-06, + "loss": 2.0032, + "step": 634 + }, + { + "epoch": 2.57085020242915, + "grad_norm": 1.484422105707642, + "learning_rate": 9.270659720119533e-06, + "loss": 1.6359, + "step": 635 + }, + { + "epoch": 2.574898785425101, + "grad_norm": 1.4574650651898802, + "learning_rate": 9.266980681019902e-06, + "loss": 1.9962, + "step": 636 + }, + { + "epoch": 2.5789473684210527, + "grad_norm": 1.2408661225828688, + "learning_rate": 9.263293119922341e-06, + "loss": 1.7137, + "step": 637 + }, + { + "epoch": 2.582995951417004, + "grad_norm": 1.4397062187160998, + "learning_rate": 9.259597044191635e-06, + "loss": 1.9567, + "step": 638 + }, + { + "epoch": 2.5870445344129553, + "grad_norm": 1.3678454147168124, + "learning_rate": 9.255892461209574e-06, + "loss": 1.8607, + "step": 639 + }, + { + "epoch": 2.591093117408907, + "grad_norm": 1.51295578810032, + "learning_rate": 9.252179378374937e-06, + "loss": 1.8423, + "step": 640 + }, + { + "epoch": 2.5951417004048585, + "grad_norm": 1.493191888596024, + "learning_rate": 9.248457803103476e-06, + "loss": 1.5365, + "step": 641 + }, + { + "epoch": 2.5991902834008096, + "grad_norm": 1.4402174802959915, + "learning_rate": 9.24472774282791e-06, + "loss": 1.5837, + "step": 642 + }, + { + "epoch": 2.603238866396761, + "grad_norm": 1.3814570168249611, + "learning_rate": 9.240989204997903e-06, + "loss": 1.7433, + "step": 643 + }, + { + "epoch": 
2.6072874493927127, + "grad_norm": 1.4229224856881553, + "learning_rate": 9.237242197080045e-06, + "loss": 1.6373, + "step": 644 + }, + { + "epoch": 2.611336032388664, + "grad_norm": 1.529255344732051, + "learning_rate": 9.23348672655785e-06, + "loss": 1.9638, + "step": 645 + }, + { + "epoch": 2.6153846153846154, + "grad_norm": 1.2990811736528833, + "learning_rate": 9.229722800931727e-06, + "loss": 1.8372, + "step": 646 + }, + { + "epoch": 2.619433198380567, + "grad_norm": 1.7287958707975635, + "learning_rate": 9.225950427718974e-06, + "loss": 1.665, + "step": 647 + }, + { + "epoch": 2.623481781376518, + "grad_norm": 1.631936855970988, + "learning_rate": 9.222169614453765e-06, + "loss": 2.052, + "step": 648 + }, + { + "epoch": 2.6275303643724697, + "grad_norm": 1.384358037456477, + "learning_rate": 9.21838036868712e-06, + "loss": 1.8437, + "step": 649 + }, + { + "epoch": 2.6315789473684212, + "grad_norm": 1.57010881393224, + "learning_rate": 9.21458269798691e-06, + "loss": 2.0542, + "step": 650 + }, + { + "epoch": 2.6356275303643724, + "grad_norm": 1.4074541953077098, + "learning_rate": 9.21077660993783e-06, + "loss": 1.7342, + "step": 651 + }, + { + "epoch": 2.639676113360324, + "grad_norm": 1.6189308816605772, + "learning_rate": 9.206962112141382e-06, + "loss": 1.9321, + "step": 652 + }, + { + "epoch": 2.6437246963562755, + "grad_norm": 1.4090618348929758, + "learning_rate": 9.203139212215868e-06, + "loss": 1.871, + "step": 653 + }, + { + "epoch": 2.6477732793522266, + "grad_norm": 1.9494105407548425, + "learning_rate": 9.199307917796371e-06, + "loss": 1.8667, + "step": 654 + }, + { + "epoch": 2.651821862348178, + "grad_norm": 1.4331583331274316, + "learning_rate": 9.195468236534734e-06, + "loss": 1.7255, + "step": 655 + }, + { + "epoch": 2.6558704453441297, + "grad_norm": 1.5909315996217737, + "learning_rate": 9.191620176099559e-06, + "loss": 1.9444, + "step": 656 + }, + { + "epoch": 2.659919028340081, + "grad_norm": 1.7461445494408216, + "learning_rate": 9.187763744176175e-06, + "loss": 1.7728, + "step": 657 + }, + { + "epoch": 2.6639676113360324, + "grad_norm": 1.422126938114325, + "learning_rate": 9.183898948466633e-06, + "loss": 1.9077, + "step": 658 + }, + { + "epoch": 2.668016194331984, + "grad_norm": 1.4144043249974336, + "learning_rate": 9.180025796689692e-06, + "loss": 1.9331, + "step": 659 + }, + { + "epoch": 2.672064777327935, + "grad_norm": 2.7772861017132255, + "learning_rate": 9.176144296580794e-06, + "loss": 1.8667, + "step": 660 + }, + { + "epoch": 2.6761133603238867, + "grad_norm": 1.3064807850177453, + "learning_rate": 9.172254455892054e-06, + "loss": 1.8187, + "step": 661 + }, + { + "epoch": 2.6801619433198383, + "grad_norm": 1.7419083953095058, + "learning_rate": 9.168356282392253e-06, + "loss": 1.903, + "step": 662 + }, + { + "epoch": 2.6842105263157894, + "grad_norm": 1.4496863008780128, + "learning_rate": 9.164449783866802e-06, + "loss": 1.7048, + "step": 663 + }, + { + "epoch": 2.688259109311741, + "grad_norm": 1.491984655358695, + "learning_rate": 9.160534968117752e-06, + "loss": 1.8734, + "step": 664 + }, + { + "epoch": 2.6923076923076925, + "grad_norm": 1.5308194782439823, + "learning_rate": 9.156611842963753e-06, + "loss": 1.8788, + "step": 665 + }, + { + "epoch": 2.6963562753036436, + "grad_norm": 1.3476877228875297, + "learning_rate": 9.152680416240059e-06, + "loss": 1.7147, + "step": 666 + }, + { + "epoch": 2.700404858299595, + "grad_norm": 1.8151640153934792, + "learning_rate": 9.1487406957985e-06, + "loss": 2.2048, + "step": 667 + }, + { + "epoch": 
2.7044534412955468, + "grad_norm": 1.7628995278188238, + "learning_rate": 9.144792689507471e-06, + "loss": 1.9635, + "step": 668 + }, + { + "epoch": 2.708502024291498, + "grad_norm": 1.602921120835359, + "learning_rate": 9.140836405251917e-06, + "loss": 1.5744, + "step": 669 + }, + { + "epoch": 2.7125506072874495, + "grad_norm": 1.490856129715411, + "learning_rate": 9.136871850933312e-06, + "loss": 1.7612, + "step": 670 + }, + { + "epoch": 2.716599190283401, + "grad_norm": 1.4382592619602368, + "learning_rate": 9.132899034469648e-06, + "loss": 1.8414, + "step": 671 + }, + { + "epoch": 2.720647773279352, + "grad_norm": 1.8014041637984994, + "learning_rate": 9.128917963795422e-06, + "loss": 1.7066, + "step": 672 + }, + { + "epoch": 2.7246963562753037, + "grad_norm": 1.7582254633750898, + "learning_rate": 9.124928646861613e-06, + "loss": 1.7925, + "step": 673 + }, + { + "epoch": 2.7287449392712553, + "grad_norm": 1.6343159265633571, + "learning_rate": 9.120931091635669e-06, + "loss": 1.9923, + "step": 674 + }, + { + "epoch": 2.7327935222672064, + "grad_norm": 1.3849537338720197, + "learning_rate": 9.116925306101494e-06, + "loss": 1.858, + "step": 675 + }, + { + "epoch": 2.736842105263158, + "grad_norm": 1.5938145614524974, + "learning_rate": 9.112911298259426e-06, + "loss": 1.8935, + "step": 676 + }, + { + "epoch": 2.7408906882591095, + "grad_norm": 2.232137755564454, + "learning_rate": 9.108889076126226e-06, + "loss": 2.5611, + "step": 677 + }, + { + "epoch": 2.7449392712550607, + "grad_norm": 1.597451641610388, + "learning_rate": 9.104858647735065e-06, + "loss": 1.9346, + "step": 678 + }, + { + "epoch": 2.748987854251012, + "grad_norm": 1.734843462936045, + "learning_rate": 9.100820021135495e-06, + "loss": 1.7738, + "step": 679 + }, + { + "epoch": 2.753036437246964, + "grad_norm": 1.5432674907856907, + "learning_rate": 9.09677320439345e-06, + "loss": 1.6451, + "step": 680 + }, + { + "epoch": 2.757085020242915, + "grad_norm": 1.4375865005427824, + "learning_rate": 9.092718205591213e-06, + "loss": 1.8788, + "step": 681 + }, + { + "epoch": 2.7611336032388665, + "grad_norm": 3.7437865438416433, + "learning_rate": 9.088655032827418e-06, + "loss": 2.6938, + "step": 682 + }, + { + "epoch": 2.765182186234818, + "grad_norm": 6.350052687447943, + "learning_rate": 9.084583694217012e-06, + "loss": 2.5299, + "step": 683 + }, + { + "epoch": 2.769230769230769, + "grad_norm": 4.945671727596882, + "learning_rate": 9.080504197891262e-06, + "loss": 2.4088, + "step": 684 + }, + { + "epoch": 2.7732793522267207, + "grad_norm": 1.6795835965091561, + "learning_rate": 9.076416551997721e-06, + "loss": 1.824, + "step": 685 + }, + { + "epoch": 2.7773279352226723, + "grad_norm": 1.5949270953831338, + "learning_rate": 9.072320764700223e-06, + "loss": 2.0511, + "step": 686 + }, + { + "epoch": 2.7813765182186234, + "grad_norm": 1.4556536124547252, + "learning_rate": 9.068216844178857e-06, + "loss": 2.0932, + "step": 687 + }, + { + "epoch": 2.785425101214575, + "grad_norm": 1.6439876597132232, + "learning_rate": 9.064104798629955e-06, + "loss": 1.8796, + "step": 688 + }, + { + "epoch": 2.7894736842105265, + "grad_norm": 1.4368651555210203, + "learning_rate": 9.059984636266082e-06, + "loss": 1.7757, + "step": 689 + }, + { + "epoch": 2.7935222672064777, + "grad_norm": 1.6510465877279545, + "learning_rate": 9.055856365316012e-06, + "loss": 1.9039, + "step": 690 + }, + { + "epoch": 2.7975708502024292, + "grad_norm": 1.5313446048549542, + "learning_rate": 9.051719994024711e-06, + "loss": 1.9171, + "step": 691 + }, + { + 
"epoch": 2.801619433198381, + "grad_norm": 1.5880262025571767, + "learning_rate": 9.047575530653324e-06, + "loss": 1.6852, + "step": 692 + }, + { + "epoch": 2.805668016194332, + "grad_norm": 1.4675446257129918, + "learning_rate": 9.043422983479158e-06, + "loss": 1.5727, + "step": 693 + }, + { + "epoch": 2.8097165991902835, + "grad_norm": 1.6282110219820332, + "learning_rate": 9.039262360795664e-06, + "loss": 1.9079, + "step": 694 + }, + { + "epoch": 2.813765182186235, + "grad_norm": 1.9452631088170542, + "learning_rate": 9.035093670912424e-06, + "loss": 1.9093, + "step": 695 + }, + { + "epoch": 2.817813765182186, + "grad_norm": 1.6299011643761043, + "learning_rate": 9.03091692215513e-06, + "loss": 1.6569, + "step": 696 + }, + { + "epoch": 2.8218623481781377, + "grad_norm": 7.734091901664539, + "learning_rate": 9.026732122865567e-06, + "loss": 2.4758, + "step": 697 + }, + { + "epoch": 2.8259109311740893, + "grad_norm": 18.1486281089367, + "learning_rate": 9.022539281401601e-06, + "loss": 3.9379, + "step": 698 + }, + { + "epoch": 2.8299595141700404, + "grad_norm": 1.7406474445735873, + "learning_rate": 9.01833840613716e-06, + "loss": 1.7599, + "step": 699 + }, + { + "epoch": 2.834008097165992, + "grad_norm": 1.7079549569427872, + "learning_rate": 9.014129505462217e-06, + "loss": 1.6112, + "step": 700 + }, + { + "epoch": 2.8380566801619436, + "grad_norm": 1.5492178198371753, + "learning_rate": 9.009912587782772e-06, + "loss": 1.719, + "step": 701 + }, + { + "epoch": 2.8421052631578947, + "grad_norm": 1.5966963855692302, + "learning_rate": 9.005687661520838e-06, + "loss": 1.7237, + "step": 702 + }, + { + "epoch": 2.8461538461538463, + "grad_norm": 1.5738987901659376, + "learning_rate": 9.00145473511442e-06, + "loss": 1.6892, + "step": 703 + }, + { + "epoch": 2.850202429149798, + "grad_norm": 1.6008695127081995, + "learning_rate": 8.997213817017508e-06, + "loss": 1.7534, + "step": 704 + }, + { + "epoch": 2.854251012145749, + "grad_norm": 1.8027657159531043, + "learning_rate": 8.99296491570004e-06, + "loss": 1.8313, + "step": 705 + }, + { + "epoch": 2.8582995951417005, + "grad_norm": 1.388477920242152, + "learning_rate": 8.98870803964791e-06, + "loss": 1.7662, + "step": 706 + }, + { + "epoch": 2.862348178137652, + "grad_norm": 1.697508321391829, + "learning_rate": 8.984443197362938e-06, + "loss": 1.7739, + "step": 707 + }, + { + "epoch": 2.866396761133603, + "grad_norm": 1.7051210953826448, + "learning_rate": 8.980170397362846e-06, + "loss": 1.7885, + "step": 708 + }, + { + "epoch": 2.8704453441295548, + "grad_norm": 2.112476620801928, + "learning_rate": 8.975889648181258e-06, + "loss": 2.2786, + "step": 709 + }, + { + "epoch": 2.8744939271255063, + "grad_norm": 1.9686852205718806, + "learning_rate": 8.971600958367668e-06, + "loss": 2.2033, + "step": 710 + }, + { + "epoch": 2.8785425101214575, + "grad_norm": 1.8858645037099275, + "learning_rate": 8.96730433648743e-06, + "loss": 1.9747, + "step": 711 + }, + { + "epoch": 2.882591093117409, + "grad_norm": 1.629389176480098, + "learning_rate": 8.962999791121745e-06, + "loss": 1.8561, + "step": 712 + }, + { + "epoch": 2.8866396761133606, + "grad_norm": 1.7283481294339973, + "learning_rate": 8.958687330867634e-06, + "loss": 1.3887, + "step": 713 + }, + { + "epoch": 2.8906882591093117, + "grad_norm": 1.5884187879059617, + "learning_rate": 8.954366964337926e-06, + "loss": 1.8757, + "step": 714 + }, + { + "epoch": 2.8947368421052633, + "grad_norm": 1.5310621607610841, + "learning_rate": 8.950038700161239e-06, + "loss": 1.9746, + "step": 715 + }, + { + 
"epoch": 2.898785425101215, + "grad_norm": 1.4608377788624507, + "learning_rate": 8.94570254698197e-06, + "loss": 1.6592, + "step": 716 + }, + { + "epoch": 2.902834008097166, + "grad_norm": 1.5297317667519899, + "learning_rate": 8.941358513460264e-06, + "loss": 1.722, + "step": 717 + }, + { + "epoch": 2.9068825910931175, + "grad_norm": 1.847621037937598, + "learning_rate": 8.937006608272009e-06, + "loss": 1.9182, + "step": 718 + }, + { + "epoch": 2.910931174089069, + "grad_norm": 1.6585955176413567, + "learning_rate": 8.932646840108818e-06, + "loss": 1.4523, + "step": 719 + }, + { + "epoch": 2.91497975708502, + "grad_norm": 1.807939122311604, + "learning_rate": 8.928279217677999e-06, + "loss": 1.5928, + "step": 720 + }, + { + "epoch": 2.919028340080972, + "grad_norm": 1.6812175947881611, + "learning_rate": 8.923903749702556e-06, + "loss": 1.6197, + "step": 721 + }, + { + "epoch": 2.9230769230769234, + "grad_norm": 1.5868810975571848, + "learning_rate": 8.919520444921153e-06, + "loss": 1.9066, + "step": 722 + }, + { + "epoch": 2.9271255060728745, + "grad_norm": 2.008002647816905, + "learning_rate": 8.915129312088112e-06, + "loss": 1.7547, + "step": 723 + }, + { + "epoch": 2.931174089068826, + "grad_norm": 2.2074435698181185, + "learning_rate": 8.910730359973386e-06, + "loss": 1.7851, + "step": 724 + }, + { + "epoch": 2.9352226720647776, + "grad_norm": 1.6720121053555042, + "learning_rate": 8.906323597362547e-06, + "loss": 1.6173, + "step": 725 + }, + { + "epoch": 2.9392712550607287, + "grad_norm": 1.7840437064722243, + "learning_rate": 8.901909033056763e-06, + "loss": 1.5244, + "step": 726 + }, + { + "epoch": 2.9433198380566803, + "grad_norm": 2.087404813784654, + "learning_rate": 8.89748667587279e-06, + "loss": 1.8108, + "step": 727 + }, + { + "epoch": 2.9473684210526314, + "grad_norm": 1.7622420447448541, + "learning_rate": 8.893056534642938e-06, + "loss": 1.5553, + "step": 728 + }, + { + "epoch": 2.951417004048583, + "grad_norm": 1.9454050876073625, + "learning_rate": 8.88861861821507e-06, + "loss": 1.5518, + "step": 729 + }, + { + "epoch": 2.9554655870445345, + "grad_norm": 3.180217232768608, + "learning_rate": 8.88417293545258e-06, + "loss": 1.7772, + "step": 730 + }, + { + "epoch": 2.9595141700404857, + "grad_norm": 3.564301283270782, + "learning_rate": 8.879719495234363e-06, + "loss": 1.6766, + "step": 731 + }, + { + "epoch": 2.9635627530364372, + "grad_norm": 1.5385071245811799, + "learning_rate": 8.875258306454814e-06, + "loss": 1.7823, + "step": 732 + }, + { + "epoch": 2.967611336032389, + "grad_norm": 1.8013008659956586, + "learning_rate": 8.87078937802381e-06, + "loss": 2.0096, + "step": 733 + }, + { + "epoch": 2.97165991902834, + "grad_norm": 2.38933092267862, + "learning_rate": 8.866312718866669e-06, + "loss": 1.9226, + "step": 734 + }, + { + "epoch": 2.9757085020242915, + "grad_norm": 1.5349029688081202, + "learning_rate": 8.861828337924164e-06, + "loss": 1.7634, + "step": 735 + }, + { + "epoch": 2.979757085020243, + "grad_norm": 1.7807993217999074, + "learning_rate": 8.85733624415248e-06, + "loss": 1.862, + "step": 736 + }, + { + "epoch": 2.983805668016194, + "grad_norm": 1.6270967039867585, + "learning_rate": 8.852836446523213e-06, + "loss": 1.9281, + "step": 737 + }, + { + "epoch": 2.9878542510121457, + "grad_norm": 1.8692589473995715, + "learning_rate": 8.848328954023342e-06, + "loss": 1.7317, + "step": 738 + }, + { + "epoch": 2.9919028340080973, + "grad_norm": 1.5874083562158485, + "learning_rate": 8.843813775655211e-06, + "loss": 1.6635, + "step": 739 + }, + { + 
"epoch": 2.9959514170040484, + "grad_norm": 1.3707872942838146, + "learning_rate": 8.83929092043652e-06, + "loss": 1.9759, + "step": 740 + }, + { + "epoch": 3.0, + "grad_norm": 1.7529361765269527, + "learning_rate": 8.8347603974003e-06, + "loss": 1.7407, + "step": 741 + }, + { + "epoch": 3.0040485829959516, + "grad_norm": 1.4847998012230224, + "learning_rate": 8.83022221559489e-06, + "loss": 1.8183, + "step": 742 + }, + { + "epoch": 3.0080971659919027, + "grad_norm": 2.0727143325799453, + "learning_rate": 8.825676384083936e-06, + "loss": 1.9566, + "step": 743 + }, + { + "epoch": 3.0121457489878543, + "grad_norm": 2.1863226369459072, + "learning_rate": 8.82112291194635e-06, + "loss": 1.8211, + "step": 744 + }, + { + "epoch": 3.016194331983806, + "grad_norm": 2.194214751548881, + "learning_rate": 8.816561808276312e-06, + "loss": 1.9756, + "step": 745 + }, + { + "epoch": 3.020242914979757, + "grad_norm": 1.8746800584359844, + "learning_rate": 8.811993082183243e-06, + "loss": 2.2277, + "step": 746 + }, + { + "epoch": 3.0242914979757085, + "grad_norm": 2.0032700627210636, + "learning_rate": 8.807416742791784e-06, + "loss": 2.0822, + "step": 747 + }, + { + "epoch": 3.02834008097166, + "grad_norm": 1.6874624326476195, + "learning_rate": 8.80283279924178e-06, + "loss": 1.7544, + "step": 748 + }, + { + "epoch": 3.032388663967611, + "grad_norm": 1.981414959416955, + "learning_rate": 8.798241260688273e-06, + "loss": 1.7612, + "step": 749 + }, + { + "epoch": 3.0364372469635628, + "grad_norm": 1.85228853236934, + "learning_rate": 8.793642136301462e-06, + "loss": 2.0061, + "step": 750 + }, + { + "epoch": 3.0404858299595143, + "grad_norm": 1.839202167316395, + "learning_rate": 8.7890354352667e-06, + "loss": 1.8078, + "step": 751 + }, + { + "epoch": 3.0445344129554655, + "grad_norm": 1.664692242856933, + "learning_rate": 8.784421166784476e-06, + "loss": 1.7918, + "step": 752 + }, + { + "epoch": 3.048582995951417, + "grad_norm": 1.8125016947634567, + "learning_rate": 8.779799340070388e-06, + "loss": 1.7574, + "step": 753 + }, + { + "epoch": 3.0526315789473686, + "grad_norm": 1.922401307664431, + "learning_rate": 8.775169964355134e-06, + "loss": 1.8982, + "step": 754 + }, + { + "epoch": 3.0566801619433197, + "grad_norm": 1.893673085388173, + "learning_rate": 8.770533048884483e-06, + "loss": 1.7375, + "step": 755 + }, + { + "epoch": 3.0607287449392713, + "grad_norm": 1.7578051605078406, + "learning_rate": 8.765888602919266e-06, + "loss": 1.9075, + "step": 756 + }, + { + "epoch": 3.064777327935223, + "grad_norm": 1.8959640677324443, + "learning_rate": 8.761236635735353e-06, + "loss": 1.8378, + "step": 757 + }, + { + "epoch": 3.068825910931174, + "grad_norm": 1.9801599495189568, + "learning_rate": 8.756577156623636e-06, + "loss": 1.9702, + "step": 758 + }, + { + "epoch": 3.0728744939271255, + "grad_norm": 1.790845579793568, + "learning_rate": 8.751910174890009e-06, + "loss": 1.8932, + "step": 759 + }, + { + "epoch": 3.076923076923077, + "grad_norm": 1.8236903737287826, + "learning_rate": 8.74723569985535e-06, + "loss": 1.8215, + "step": 760 + }, + { + "epoch": 3.080971659919028, + "grad_norm": 1.7121510890543619, + "learning_rate": 8.742553740855507e-06, + "loss": 1.8237, + "step": 761 + }, + { + "epoch": 3.08502024291498, + "grad_norm": 1.6455567766467654, + "learning_rate": 8.737864307241266e-06, + "loss": 1.825, + "step": 762 + }, + { + "epoch": 3.0890688259109313, + "grad_norm": 2.004800789953328, + "learning_rate": 8.733167408378348e-06, + "loss": 1.83, + "step": 763 + }, + { + "epoch": 
3.0931174089068825, + "grad_norm": 1.761656112643498, + "learning_rate": 8.728463053647382e-06, + "loss": 1.9209, + "step": 764 + }, + { + "epoch": 3.097165991902834, + "grad_norm": 1.7248736206433866, + "learning_rate": 8.723751252443891e-06, + "loss": 1.6591, + "step": 765 + }, + { + "epoch": 3.1012145748987856, + "grad_norm": 1.8246435273625035, + "learning_rate": 8.71903201417826e-06, + "loss": 1.8214, + "step": 766 + }, + { + "epoch": 3.1052631578947367, + "grad_norm": 1.8468962560997435, + "learning_rate": 8.71430534827574e-06, + "loss": 1.854, + "step": 767 + }, + { + "epoch": 3.1093117408906883, + "grad_norm": 1.9312402322655278, + "learning_rate": 8.709571264176408e-06, + "loss": 1.7321, + "step": 768 + }, + { + "epoch": 3.11336032388664, + "grad_norm": 2.316632605973664, + "learning_rate": 8.70482977133516e-06, + "loss": 1.6709, + "step": 769 + }, + { + "epoch": 3.117408906882591, + "grad_norm": 1.9879535887114659, + "learning_rate": 8.700080879221689e-06, + "loss": 1.6082, + "step": 770 + }, + { + "epoch": 3.1214574898785425, + "grad_norm": 1.8223147298487212, + "learning_rate": 8.69532459732046e-06, + "loss": 1.6324, + "step": 771 + }, + { + "epoch": 3.125506072874494, + "grad_norm": 1.9254678274105181, + "learning_rate": 8.690560935130708e-06, + "loss": 1.626, + "step": 772 + }, + { + "epoch": 3.1295546558704452, + "grad_norm": 2.1237007524174683, + "learning_rate": 8.685789902166395e-06, + "loss": 1.5525, + "step": 773 + }, + { + "epoch": 3.133603238866397, + "grad_norm": 1.7727476948432017, + "learning_rate": 8.681011507956215e-06, + "loss": 1.8873, + "step": 774 + }, + { + "epoch": 3.1376518218623484, + "grad_norm": 2.049295618159139, + "learning_rate": 8.676225762043555e-06, + "loss": 1.7496, + "step": 775 + }, + { + "epoch": 3.1417004048582995, + "grad_norm": 1.5682714669220028, + "learning_rate": 8.671432673986493e-06, + "loss": 1.4753, + "step": 776 + }, + { + "epoch": 3.145748987854251, + "grad_norm": 1.8938048440408406, + "learning_rate": 8.666632253357767e-06, + "loss": 1.8963, + "step": 777 + }, + { + "epoch": 3.1497975708502026, + "grad_norm": 1.8936062118104038, + "learning_rate": 8.661824509744754e-06, + "loss": 1.7098, + "step": 778 + }, + { + "epoch": 3.1538461538461537, + "grad_norm": 1.6774875162585348, + "learning_rate": 8.657009452749466e-06, + "loss": 1.8596, + "step": 779 + }, + { + "epoch": 3.1578947368421053, + "grad_norm": 2.015957389549595, + "learning_rate": 8.652187091988516e-06, + "loss": 2.061, + "step": 780 + }, + { + "epoch": 3.161943319838057, + "grad_norm": 1.7186637319125118, + "learning_rate": 8.647357437093104e-06, + "loss": 1.7589, + "step": 781 + }, + { + "epoch": 3.165991902834008, + "grad_norm": 1.7941883707597104, + "learning_rate": 8.642520497709001e-06, + "loss": 1.8086, + "step": 782 + }, + { + "epoch": 3.1700404858299596, + "grad_norm": 1.774631391234699, + "learning_rate": 8.637676283496521e-06, + "loss": 2.2517, + "step": 783 + }, + { + "epoch": 3.174089068825911, + "grad_norm": 1.7904179919335834, + "learning_rate": 8.632824804130514e-06, + "loss": 1.6679, + "step": 784 + }, + { + "epoch": 3.1781376518218623, + "grad_norm": 1.972746622761643, + "learning_rate": 8.627966069300332e-06, + "loss": 1.8345, + "step": 785 + }, + { + "epoch": 3.182186234817814, + "grad_norm": 1.5336336477310177, + "learning_rate": 8.623100088709829e-06, + "loss": 1.6473, + "step": 786 + }, + { + "epoch": 3.1862348178137654, + "grad_norm": 1.9951657707171577, + "learning_rate": 8.618226872077315e-06, + "loss": 1.7821, + "step": 787 + }, + { + "epoch": 
3.1902834008097165, + "grad_norm": 1.7282375741642677, + "learning_rate": 8.613346429135567e-06, + "loss": 1.8289, + "step": 788 + }, + { + "epoch": 3.194331983805668, + "grad_norm": 2.1277631117336675, + "learning_rate": 8.608458769631785e-06, + "loss": 2.0076, + "step": 789 + }, + { + "epoch": 3.1983805668016196, + "grad_norm": 1.8372643674137712, + "learning_rate": 8.603563903327582e-06, + "loss": 2.0805, + "step": 790 + }, + { + "epoch": 3.2024291497975708, + "grad_norm": 1.8065321863693007, + "learning_rate": 8.598661839998972e-06, + "loss": 1.7669, + "step": 791 + }, + { + "epoch": 3.2064777327935223, + "grad_norm": 2.031336948957746, + "learning_rate": 8.593752589436334e-06, + "loss": 2.0858, + "step": 792 + }, + { + "epoch": 3.2105263157894735, + "grad_norm": 1.8889862063112353, + "learning_rate": 8.588836161444405e-06, + "loss": 2.1341, + "step": 793 + }, + { + "epoch": 3.214574898785425, + "grad_norm": 1.8426615628835388, + "learning_rate": 8.583912565842258e-06, + "loss": 1.9304, + "step": 794 + }, + { + "epoch": 3.2186234817813766, + "grad_norm": 1.7414893453963287, + "learning_rate": 8.578981812463278e-06, + "loss": 1.7942, + "step": 795 + }, + { + "epoch": 3.2226720647773277, + "grad_norm": 1.9096193735192637, + "learning_rate": 8.574043911155148e-06, + "loss": 1.72, + "step": 796 + }, + { + "epoch": 3.2267206477732793, + "grad_norm": 1.8025258377815987, + "learning_rate": 8.569098871779828e-06, + "loss": 1.8542, + "step": 797 + }, + { + "epoch": 3.230769230769231, + "grad_norm": 1.8460762696682704, + "learning_rate": 8.56414670421353e-06, + "loss": 1.7101, + "step": 798 + }, + { + "epoch": 3.234817813765182, + "grad_norm": 1.9398991434247146, + "learning_rate": 8.559187418346703e-06, + "loss": 1.95, + "step": 799 + }, + { + "epoch": 3.2388663967611335, + "grad_norm": 1.8632306612622278, + "learning_rate": 8.554221024084019e-06, + "loss": 1.8895, + "step": 800 + }, + { + "epoch": 3.242914979757085, + "grad_norm": 1.893700967064052, + "learning_rate": 8.54924753134434e-06, + "loss": 1.873, + "step": 801 + }, + { + "epoch": 3.246963562753036, + "grad_norm": 1.7151529599583697, + "learning_rate": 8.544266950060706e-06, + "loss": 1.7236, + "step": 802 + }, + { + "epoch": 3.251012145748988, + "grad_norm": 1.7251248112215953, + "learning_rate": 8.539279290180315e-06, + "loss": 1.7693, + "step": 803 + }, + { + "epoch": 3.2550607287449393, + "grad_norm": 1.9817743209184147, + "learning_rate": 8.534284561664508e-06, + "loss": 1.8365, + "step": 804 + }, + { + "epoch": 3.2591093117408905, + "grad_norm": 1.8362666024929137, + "learning_rate": 8.529282774488731e-06, + "loss": 1.6791, + "step": 805 + }, + { + "epoch": 3.263157894736842, + "grad_norm": 1.9144972025615734, + "learning_rate": 8.524273938642539e-06, + "loss": 1.5622, + "step": 806 + }, + { + "epoch": 3.2672064777327936, + "grad_norm": 1.8150113569889472, + "learning_rate": 8.519258064129559e-06, + "loss": 1.8107, + "step": 807 + }, + { + "epoch": 3.2712550607287447, + "grad_norm": 1.8132774105922835, + "learning_rate": 8.514235160967476e-06, + "loss": 1.8382, + "step": 808 + }, + { + "epoch": 3.2753036437246963, + "grad_norm": 1.7178012200999808, + "learning_rate": 8.509205239188017e-06, + "loss": 1.8519, + "step": 809 + }, + { + "epoch": 3.279352226720648, + "grad_norm": 2.2519702448886845, + "learning_rate": 8.504168308836918e-06, + "loss": 1.8559, + "step": 810 + }, + { + "epoch": 3.283400809716599, + "grad_norm": 2.1015013370666513, + "learning_rate": 8.499124379973922e-06, + "loss": 1.5602, + "step": 811 + }, + { + 
"epoch": 3.2874493927125505, + "grad_norm": 2.1456515647605365, + "learning_rate": 8.494073462672743e-06, + "loss": 1.6597, + "step": 812 + }, + { + "epoch": 3.291497975708502, + "grad_norm": 2.1425091129883613, + "learning_rate": 8.489015567021054e-06, + "loss": 1.5311, + "step": 813 + }, + { + "epoch": 3.2955465587044532, + "grad_norm": 2.1055979919937693, + "learning_rate": 8.483950703120466e-06, + "loss": 1.8547, + "step": 814 + }, + { + "epoch": 3.299595141700405, + "grad_norm": 1.9678625432719996, + "learning_rate": 8.478878881086505e-06, + "loss": 1.9357, + "step": 815 + }, + { + "epoch": 3.3036437246963564, + "grad_norm": 2.0317817207691538, + "learning_rate": 8.473800111048598e-06, + "loss": 1.6684, + "step": 816 + }, + { + "epoch": 3.3076923076923075, + "grad_norm": 2.0379814335298843, + "learning_rate": 8.468714403150043e-06, + "loss": 1.6929, + "step": 817 + }, + { + "epoch": 3.311740890688259, + "grad_norm": 1.9848650286398888, + "learning_rate": 8.463621767547998e-06, + "loss": 1.7112, + "step": 818 + }, + { + "epoch": 3.3157894736842106, + "grad_norm": 2.274800378339576, + "learning_rate": 8.458522214413455e-06, + "loss": 1.7005, + "step": 819 + }, + { + "epoch": 3.3198380566801617, + "grad_norm": 2.170751690325617, + "learning_rate": 8.453415753931223e-06, + "loss": 1.5995, + "step": 820 + }, + { + "epoch": 3.3238866396761133, + "grad_norm": 1.9913626012571344, + "learning_rate": 8.448302396299906e-06, + "loss": 1.6057, + "step": 821 + }, + { + "epoch": 3.327935222672065, + "grad_norm": 1.9395230430651595, + "learning_rate": 8.443182151731883e-06, + "loss": 1.6349, + "step": 822 + }, + { + "epoch": 3.331983805668016, + "grad_norm": 1.9091197381555691, + "learning_rate": 8.438055030453287e-06, + "loss": 1.5595, + "step": 823 + }, + { + "epoch": 3.3360323886639676, + "grad_norm": 1.8562911407114664, + "learning_rate": 8.432921042703985e-06, + "loss": 1.6019, + "step": 824 + }, + { + "epoch": 3.340080971659919, + "grad_norm": 1.7832079833064884, + "learning_rate": 8.42778019873756e-06, + "loss": 1.552, + "step": 825 + }, + { + "epoch": 3.3441295546558703, + "grad_norm": 1.8542638409385725, + "learning_rate": 8.422632508821284e-06, + "loss": 1.5851, + "step": 826 + }, + { + "epoch": 3.348178137651822, + "grad_norm": 2.1436195397021436, + "learning_rate": 8.417477983236107e-06, + "loss": 1.7666, + "step": 827 + }, + { + "epoch": 3.3522267206477734, + "grad_norm": 2.33071372223659, + "learning_rate": 8.412316632276627e-06, + "loss": 1.6497, + "step": 828 + }, + { + "epoch": 3.3562753036437245, + "grad_norm": 2.205436986044382, + "learning_rate": 8.407148466251072e-06, + "loss": 1.3523, + "step": 829 + }, + { + "epoch": 3.360323886639676, + "grad_norm": 2.2620315487409877, + "learning_rate": 8.401973495481289e-06, + "loss": 1.723, + "step": 830 + }, + { + "epoch": 3.3643724696356276, + "grad_norm": 2.180101120238927, + "learning_rate": 8.396791730302708e-06, + "loss": 1.8056, + "step": 831 + }, + { + "epoch": 3.3684210526315788, + "grad_norm": 1.990085418505961, + "learning_rate": 8.39160318106433e-06, + "loss": 1.7166, + "step": 832 + }, + { + "epoch": 3.3724696356275303, + "grad_norm": 2.40657553356096, + "learning_rate": 8.386407858128707e-06, + "loss": 1.8193, + "step": 833 + }, + { + "epoch": 3.376518218623482, + "grad_norm": 1.94489059367538, + "learning_rate": 8.381205771871918e-06, + "loss": 1.4172, + "step": 834 + }, + { + "epoch": 3.380566801619433, + "grad_norm": 2.150391672244522, + "learning_rate": 8.375996932683553e-06, + "loss": 1.5949, + "step": 835 + }, + { + 
"epoch": 3.3846153846153846, + "grad_norm": 2.0030590669894903, + "learning_rate": 8.370781350966683e-06, + "loss": 1.4156, + "step": 836 + }, + { + "epoch": 3.388663967611336, + "grad_norm": 2.197019034882382, + "learning_rate": 8.36555903713785e-06, + "loss": 1.4714, + "step": 837 + }, + { + "epoch": 3.3927125506072873, + "grad_norm": 2.078166195454461, + "learning_rate": 8.360330001627043e-06, + "loss": 1.6429, + "step": 838 + }, + { + "epoch": 3.396761133603239, + "grad_norm": 2.40629641977567, + "learning_rate": 8.355094254877665e-06, + "loss": 1.4713, + "step": 839 + }, + { + "epoch": 3.4008097165991904, + "grad_norm": 1.9645801904393803, + "learning_rate": 8.349851807346535e-06, + "loss": 1.5146, + "step": 840 + }, + { + "epoch": 3.4048582995951415, + "grad_norm": 1.9534289124567972, + "learning_rate": 8.344602669503849e-06, + "loss": 1.5871, + "step": 841 + }, + { + "epoch": 3.408906882591093, + "grad_norm": 2.3102884897188534, + "learning_rate": 8.339346851833163e-06, + "loss": 1.6862, + "step": 842 + }, + { + "epoch": 3.4129554655870447, + "grad_norm": 2.0401234182707406, + "learning_rate": 8.334084364831381e-06, + "loss": 1.5214, + "step": 843 + }, + { + "epoch": 3.417004048582996, + "grad_norm": 2.159768925630674, + "learning_rate": 8.328815219008719e-06, + "loss": 1.8219, + "step": 844 + }, + { + "epoch": 3.4210526315789473, + "grad_norm": 2.2204972461802757, + "learning_rate": 8.323539424888695e-06, + "loss": 1.8941, + "step": 845 + }, + { + "epoch": 3.425101214574899, + "grad_norm": 1.9873340221710971, + "learning_rate": 8.318256993008108e-06, + "loss": 1.7539, + "step": 846 + }, + { + "epoch": 3.42914979757085, + "grad_norm": 1.975202455896719, + "learning_rate": 8.31296793391701e-06, + "loss": 1.8598, + "step": 847 + }, + { + "epoch": 3.4331983805668016, + "grad_norm": 1.8415081642607933, + "learning_rate": 8.30767225817869e-06, + "loss": 1.9574, + "step": 848 + }, + { + "epoch": 3.437246963562753, + "grad_norm": 2.047274050267817, + "learning_rate": 8.302369976369651e-06, + "loss": 1.736, + "step": 849 + }, + { + "epoch": 3.4412955465587043, + "grad_norm": 2.1457366433830454, + "learning_rate": 8.297061099079592e-06, + "loss": 1.6581, + "step": 850 + }, + { + "epoch": 3.445344129554656, + "grad_norm": 1.8891113266245207, + "learning_rate": 8.291745636911382e-06, + "loss": 1.9183, + "step": 851 + }, + { + "epoch": 3.4493927125506074, + "grad_norm": 2.05347009046486, + "learning_rate": 8.286423600481044e-06, + "loss": 1.6869, + "step": 852 + }, + { + "epoch": 3.4534412955465585, + "grad_norm": 2.1578470259791795, + "learning_rate": 8.281095000417725e-06, + "loss": 1.6709, + "step": 853 + }, + { + "epoch": 3.45748987854251, + "grad_norm": 2.2158190833608606, + "learning_rate": 8.27575984736369e-06, + "loss": 2.079, + "step": 854 + }, + { + "epoch": 3.4615384615384617, + "grad_norm": 2.9226191862145265, + "learning_rate": 8.270418151974286e-06, + "loss": 2.3146, + "step": 855 + }, + { + "epoch": 3.465587044534413, + "grad_norm": 2.1657050143675205, + "learning_rate": 8.265069924917925e-06, + "loss": 1.9175, + "step": 856 + }, + { + "epoch": 3.4696356275303644, + "grad_norm": 1.7932680376129573, + "learning_rate": 8.259715176876069e-06, + "loss": 1.8725, + "step": 857 + }, + { + "epoch": 3.473684210526316, + "grad_norm": 1.8709685644083165, + "learning_rate": 8.254353918543199e-06, + "loss": 1.7809, + "step": 858 + }, + { + "epoch": 3.477732793522267, + "grad_norm": 2.4167400582718694, + "learning_rate": 8.2489861606268e-06, + "loss": 1.8016, + "step": 859 + }, + { + "epoch": 
3.4817813765182186, + "grad_norm": 1.659768741074137, + "learning_rate": 8.243611913847337e-06, + "loss": 1.7188, + "step": 860 + }, + { + "epoch": 3.48582995951417, + "grad_norm": 2.1480568234600668, + "learning_rate": 8.238231188938237e-06, + "loss": 1.6913, + "step": 861 + }, + { + "epoch": 3.4898785425101213, + "grad_norm": 2.461283879827119, + "learning_rate": 8.232843996645865e-06, + "loss": 1.6242, + "step": 862 + }, + { + "epoch": 3.493927125506073, + "grad_norm": 2.3643514071925056, + "learning_rate": 8.2274503477295e-06, + "loss": 1.6881, + "step": 863 + }, + { + "epoch": 3.4979757085020244, + "grad_norm": 3.087293785042021, + "learning_rate": 8.222050252961318e-06, + "loss": 1.5087, + "step": 864 + }, + { + "epoch": 3.5020242914979756, + "grad_norm": 2.105684160210004, + "learning_rate": 8.216643723126367e-06, + "loss": 1.4331, + "step": 865 + }, + { + "epoch": 3.506072874493927, + "grad_norm": 2.420952436641065, + "learning_rate": 8.211230769022552e-06, + "loss": 1.7553, + "step": 866 + }, + { + "epoch": 3.5101214574898787, + "grad_norm": 2.2746665377354116, + "learning_rate": 8.2058114014606e-06, + "loss": 1.782, + "step": 867 + }, + { + "epoch": 3.51417004048583, + "grad_norm": 1.6776374980476494, + "learning_rate": 8.200385631264051e-06, + "loss": 1.7357, + "step": 868 + }, + { + "epoch": 3.5182186234817814, + "grad_norm": 2.130957958265717, + "learning_rate": 8.19495346926924e-06, + "loss": 1.7569, + "step": 869 + }, + { + "epoch": 3.522267206477733, + "grad_norm": 2.1241420175580386, + "learning_rate": 8.189514926325255e-06, + "loss": 1.7036, + "step": 870 + }, + { + "epoch": 3.526315789473684, + "grad_norm": 2.397883462392177, + "learning_rate": 8.184070013293936e-06, + "loss": 1.4984, + "step": 871 + }, + { + "epoch": 3.5303643724696356, + "grad_norm": 2.676554915114245, + "learning_rate": 8.178618741049841e-06, + "loss": 1.5719, + "step": 872 + }, + { + "epoch": 3.534412955465587, + "grad_norm": 2.641036334787177, + "learning_rate": 8.173161120480232e-06, + "loss": 1.7235, + "step": 873 + }, + { + "epoch": 3.5384615384615383, + "grad_norm": 2.4283908813712127, + "learning_rate": 8.16769716248505e-06, + "loss": 1.8976, + "step": 874 + }, + { + "epoch": 3.54251012145749, + "grad_norm": 1.9109389793413394, + "learning_rate": 8.162226877976886e-06, + "loss": 1.797, + "step": 875 + }, + { + "epoch": 3.5465587044534415, + "grad_norm": 3.1765952449893073, + "learning_rate": 8.156750277880979e-06, + "loss": 2.2212, + "step": 876 + }, + { + "epoch": 3.5506072874493926, + "grad_norm": 6.740978753214387, + "learning_rate": 8.15126737313517e-06, + "loss": 2.2759, + "step": 877 + }, + { + "epoch": 3.554655870445344, + "grad_norm": 6.646199027432937, + "learning_rate": 8.145778174689897e-06, + "loss": 2.5045, + "step": 878 + }, + { + "epoch": 3.5587044534412957, + "grad_norm": 1.9732928727215509, + "learning_rate": 8.140282693508168e-06, + "loss": 1.702, + "step": 879 + }, + { + "epoch": 3.562753036437247, + "grad_norm": 1.923113895215325, + "learning_rate": 8.134780940565535e-06, + "loss": 1.5859, + "step": 880 + }, + { + "epoch": 3.5668016194331984, + "grad_norm": 1.888490124882663, + "learning_rate": 8.129272926850079e-06, + "loss": 1.9019, + "step": 881 + }, + { + "epoch": 3.57085020242915, + "grad_norm": 2.0879599313529247, + "learning_rate": 8.123758663362386e-06, + "loss": 1.5424, + "step": 882 + }, + { + "epoch": 3.574898785425101, + "grad_norm": 2.1113301524020778, + "learning_rate": 8.118238161115523e-06, + "loss": 1.8581, + "step": 883 + }, + { + "epoch": 
3.5789473684210527, + "grad_norm": 1.7105898329062328, + "learning_rate": 8.112711431135014e-06, + "loss": 1.5914, + "step": 884 + }, + { + "epoch": 3.582995951417004, + "grad_norm": 1.9358089378047225, + "learning_rate": 8.107178484458825e-06, + "loss": 1.7957, + "step": 885 + }, + { + "epoch": 3.5870445344129553, + "grad_norm": 1.9092777164097747, + "learning_rate": 8.101639332137337e-06, + "loss": 1.7404, + "step": 886 + }, + { + "epoch": 3.591093117408907, + "grad_norm": 2.098080272876577, + "learning_rate": 8.096093985233323e-06, + "loss": 1.7127, + "step": 887 + }, + { + "epoch": 3.5951417004048585, + "grad_norm": 2.4907144738421065, + "learning_rate": 8.090542454821929e-06, + "loss": 1.4308, + "step": 888 + }, + { + "epoch": 3.5991902834008096, + "grad_norm": 1.8678109793168913, + "learning_rate": 8.084984751990652e-06, + "loss": 1.4797, + "step": 889 + }, + { + "epoch": 3.603238866396761, + "grad_norm": 1.8961480105884363, + "learning_rate": 8.079420887839316e-06, + "loss": 1.6173, + "step": 890 + }, + { + "epoch": 3.6072874493927127, + "grad_norm": 1.9539785870788862, + "learning_rate": 8.073850873480047e-06, + "loss": 1.4952, + "step": 891 + }, + { + "epoch": 3.611336032388664, + "grad_norm": 2.31450202449626, + "learning_rate": 8.068274720037261e-06, + "loss": 1.813, + "step": 892 + }, + { + "epoch": 3.6153846153846154, + "grad_norm": 1.8087093273790038, + "learning_rate": 8.062692438647628e-06, + "loss": 1.7376, + "step": 893 + }, + { + "epoch": 3.619433198380567, + "grad_norm": 2.408589476299181, + "learning_rate": 8.057104040460062e-06, + "loss": 1.505, + "step": 894 + }, + { + "epoch": 3.623481781376518, + "grad_norm": 2.3231639351842035, + "learning_rate": 8.051509536635686e-06, + "loss": 1.9039, + "step": 895 + }, + { + "epoch": 3.6275303643724697, + "grad_norm": 1.9849491847712974, + "learning_rate": 8.045908938347828e-06, + "loss": 1.7125, + "step": 896 + }, + { + "epoch": 3.6315789473684212, + "grad_norm": 2.2249483352664026, + "learning_rate": 8.04030225678198e-06, + "loss": 1.9514, + "step": 897 + }, + { + "epoch": 3.6356275303643724, + "grad_norm": 2.005047614111562, + "learning_rate": 8.034689503135785e-06, + "loss": 1.597, + "step": 898 + }, + { + "epoch": 3.639676113360324, + "grad_norm": 2.2925145752574854, + "learning_rate": 8.029070688619013e-06, + "loss": 1.8072, + "step": 899 + }, + { + "epoch": 3.6437246963562755, + "grad_norm": 1.9475842419850795, + "learning_rate": 8.023445824453539e-06, + "loss": 1.7289, + "step": 900 + }, + { + "epoch": 3.6477732793522266, + "grad_norm": 2.071154449190338, + "learning_rate": 8.017814921873326e-06, + "loss": 1.7658, + "step": 901 + }, + { + "epoch": 3.651821862348178, + "grad_norm": 1.9935193669759015, + "learning_rate": 8.012177992124385e-06, + "loss": 1.6002, + "step": 902 + }, + { + "epoch": 3.6558704453441297, + "grad_norm": 2.2483209235168737, + "learning_rate": 8.006535046464774e-06, + "loss": 1.8275, + "step": 903 + }, + { + "epoch": 3.659919028340081, + "grad_norm": 2.5274264683222425, + "learning_rate": 8.000886096164564e-06, + "loss": 1.6502, + "step": 904 + }, + { + "epoch": 3.6639676113360324, + "grad_norm": 2.0119741262052195, + "learning_rate": 7.995231152505815e-06, + "loss": 1.8017, + "step": 905 + }, + { + "epoch": 3.668016194331984, + "grad_norm": 2.1027093845450233, + "learning_rate": 7.989570226782562e-06, + "loss": 1.8138, + "step": 906 + }, + { + "epoch": 3.672064777327935, + "grad_norm": 3.056649771146675, + "learning_rate": 7.983903330300782e-06, + "loss": 1.8128, + "step": 907 + }, + { + "epoch": 
3.6761133603238867, + "grad_norm": 1.9139807090551522, + "learning_rate": 7.978230474378383e-06, + "loss": 1.7148, + "step": 908 + }, + { + "epoch": 3.6801619433198383, + "grad_norm": 2.416490627923619, + "learning_rate": 7.97255167034517e-06, + "loss": 1.7726, + "step": 909 + }, + { + "epoch": 3.6842105263157894, + "grad_norm": 2.053612332583323, + "learning_rate": 7.966866929542827e-06, + "loss": 1.5779, + "step": 910 + }, + { + "epoch": 3.688259109311741, + "grad_norm": 2.0666037215601505, + "learning_rate": 7.961176263324902e-06, + "loss": 1.7465, + "step": 911 + }, + { + "epoch": 3.6923076923076925, + "grad_norm": 2.1463137742100327, + "learning_rate": 7.955479683056767e-06, + "loss": 1.7608, + "step": 912 + }, + { + "epoch": 3.6963562753036436, + "grad_norm": 1.9232481327470194, + "learning_rate": 7.949777200115617e-06, + "loss": 1.5992, + "step": 913 + }, + { + "epoch": 3.700404858299595, + "grad_norm": 2.5029604743639515, + "learning_rate": 7.944068825890424e-06, + "loss": 2.089, + "step": 914 + }, + { + "epoch": 3.7044534412955468, + "grad_norm": 2.425403056999352, + "learning_rate": 7.938354571781933e-06, + "loss": 1.8514, + "step": 915 + }, + { + "epoch": 3.708502024291498, + "grad_norm": 2.2889869162476315, + "learning_rate": 7.932634449202635e-06, + "loss": 1.4493, + "step": 916 + }, + { + "epoch": 3.7125506072874495, + "grad_norm": 2.0245599708625988, + "learning_rate": 7.92690846957673e-06, + "loss": 1.6351, + "step": 917 + }, + { + "epoch": 3.716599190283401, + "grad_norm": 1.997997696536965, + "learning_rate": 7.921176644340132e-06, + "loss": 1.7253, + "step": 918 + }, + { + "epoch": 3.720647773279352, + "grad_norm": 2.344635708570945, + "learning_rate": 7.915438984940415e-06, + "loss": 1.5384, + "step": 919 + }, + { + "epoch": 3.7246963562753037, + "grad_norm": 2.399788568220564, + "learning_rate": 7.909695502836814e-06, + "loss": 1.6518, + "step": 920 + }, + { + "epoch": 3.7287449392712553, + "grad_norm": 2.258204100694036, + "learning_rate": 7.903946209500189e-06, + "loss": 1.8741, + "step": 921 + }, + { + "epoch": 3.7327935222672064, + "grad_norm": 1.9355255173187593, + "learning_rate": 7.898191116413007e-06, + "loss": 1.6996, + "step": 922 + }, + { + "epoch": 3.736842105263158, + "grad_norm": 2.1474241115417425, + "learning_rate": 7.892430235069317e-06, + "loss": 1.7427, + "step": 923 + }, + { + "epoch": 3.7408906882591095, + "grad_norm": 3.071687208613463, + "learning_rate": 7.886663576974733e-06, + "loss": 2.4106, + "step": 924 + }, + { + "epoch": 3.7449392712550607, + "grad_norm": 2.0799708188253465, + "learning_rate": 7.880891153646401e-06, + "loss": 1.808, + "step": 925 + }, + { + "epoch": 3.748987854251012, + "grad_norm": 2.4353787137639453, + "learning_rate": 7.875112976612984e-06, + "loss": 1.6368, + "step": 926 + }, + { + "epoch": 3.753036437246964, + "grad_norm": 2.159792334487355, + "learning_rate": 7.869329057414635e-06, + "loss": 1.5175, + "step": 927 + }, + { + "epoch": 3.757085020242915, + "grad_norm": 2.0548605804443274, + "learning_rate": 7.863539407602976e-06, + "loss": 1.7423, + "step": 928 + }, + { + "epoch": 3.7611336032388665, + "grad_norm": 3.9628857560933324, + "learning_rate": 7.857744038741076e-06, + "loss": 2.5332, + "step": 929 + }, + { + "epoch": 3.765182186234818, + "grad_norm": 4.514218437938051, + "learning_rate": 7.85194296240342e-06, + "loss": 2.3287, + "step": 930 + }, + { + "epoch": 3.769230769230769, + "grad_norm": 5.356074790215057, + "learning_rate": 7.846136190175901e-06, + "loss": 2.1714, + "step": 931 + }, + { + "epoch": 
3.7732793522267207, + "grad_norm": 2.238703863406207, + "learning_rate": 7.84032373365578e-06, + "loss": 1.671, + "step": 932 + }, + { + "epoch": 3.7773279352226723, + "grad_norm": 2.194562792441507, + "learning_rate": 7.834505604451672e-06, + "loss": 1.9108, + "step": 933 + }, + { + "epoch": 3.7813765182186234, + "grad_norm": 2.085928113902739, + "learning_rate": 7.828681814183527e-06, + "loss": 1.9396, + "step": 934 + }, + { + "epoch": 3.785425101214575, + "grad_norm": 2.215253557008417, + "learning_rate": 7.822852374482597e-06, + "loss": 1.7587, + "step": 935 + }, + { + "epoch": 3.7894736842105265, + "grad_norm": 3.010107826761077, + "learning_rate": 7.817017296991411e-06, + "loss": 1.6507, + "step": 936 + }, + { + "epoch": 3.7935222672064777, + "grad_norm": 2.25886892537205, + "learning_rate": 7.811176593363771e-06, + "loss": 1.7372, + "step": 937 + }, + { + "epoch": 3.7975708502024292, + "grad_norm": 2.2130344020805297, + "learning_rate": 7.805330275264707e-06, + "loss": 1.7485, + "step": 938 + }, + { + "epoch": 3.801619433198381, + "grad_norm": 2.0367189537336907, + "learning_rate": 7.79947835437046e-06, + "loss": 1.5515, + "step": 939 + }, + { + "epoch": 3.805668016194332, + "grad_norm": 2.070856690389127, + "learning_rate": 7.79362084236847e-06, + "loss": 1.4447, + "step": 940 + }, + { + "epoch": 3.8097165991902835, + "grad_norm": 2.1857926637124794, + "learning_rate": 7.787757750957335e-06, + "loss": 1.8015, + "step": 941 + }, + { + "epoch": 3.813765182186235, + "grad_norm": 2.6872149719652305, + "learning_rate": 7.781889091846799e-06, + "loss": 1.7528, + "step": 942 + }, + { + "epoch": 3.817813765182186, + "grad_norm": 2.3048135110635264, + "learning_rate": 7.776014876757727e-06, + "loss": 1.5226, + "step": 943 + }, + { + "epoch": 3.8218623481781377, + "grad_norm": 8.991127581731243, + "learning_rate": 7.77013511742208e-06, + "loss": 2.3966, + "step": 944 + }, + { + "epoch": 3.8259109311740893, + "grad_norm": 19.276037930316928, + "learning_rate": 7.76424982558289e-06, + "loss": 3.7738, + "step": 945 + }, + { + "epoch": 3.8299595141700404, + "grad_norm": 2.4583074183525677, + "learning_rate": 7.758359012994242e-06, + "loss": 1.6137, + "step": 946 + }, + { + "epoch": 3.834008097165992, + "grad_norm": 2.405931055156567, + "learning_rate": 7.752462691421245e-06, + "loss": 1.4666, + "step": 947 + }, + { + "epoch": 3.8380566801619436, + "grad_norm": 2.114379083785604, + "learning_rate": 7.746560872640007e-06, + "loss": 1.5791, + "step": 948 + }, + { + "epoch": 3.8421052631578947, + "grad_norm": 2.1946059502111845, + "learning_rate": 7.740653568437623e-06, + "loss": 1.5937, + "step": 949 + }, + { + "epoch": 3.8461538461538463, + "grad_norm": 2.3168344745949456, + "learning_rate": 7.734740790612137e-06, + "loss": 1.5169, + "step": 950 + }, + { + "epoch": 3.850202429149798, + "grad_norm": 2.3139829718351197, + "learning_rate": 7.728822550972523e-06, + "loss": 1.6162, + "step": 951 + }, + { + "epoch": 3.854251012145749, + "grad_norm": 2.5483408296020764, + "learning_rate": 7.722898861338674e-06, + "loss": 1.7001, + "step": 952 + }, + { + "epoch": 3.8582995951417005, + "grad_norm": 1.917540396918308, + "learning_rate": 7.716969733541357e-06, + "loss": 1.6257, + "step": 953 + }, + { + "epoch": 3.862348178137652, + "grad_norm": 2.4091479518780177, + "learning_rate": 7.711035179422205e-06, + "loss": 1.6058, + "step": 954 + }, + { + "epoch": 3.866396761133603, + "grad_norm": 2.4390857592479183, + "learning_rate": 7.705095210833687e-06, + "loss": 1.6468, + "step": 955 + }, + { + "epoch": 
3.8704453441295548, + "grad_norm": 3.01025731676863, + "learning_rate": 7.699149839639086e-06, + "loss": 2.1392, + "step": 956 + }, + { + "epoch": 3.8744939271255063, + "grad_norm": 2.6957364897623473, + "learning_rate": 7.693199077712476e-06, + "loss": 2.0741, + "step": 957 + }, + { + "epoch": 3.8785425101214575, + "grad_norm": 2.6726767004932395, + "learning_rate": 7.687242936938694e-06, + "loss": 1.8205, + "step": 958 + }, + { + "epoch": 3.882591093117409, + "grad_norm": 2.3223231672079727, + "learning_rate": 7.681281429213328e-06, + "loss": 1.7239, + "step": 959 + }, + { + "epoch": 3.8866396761133606, + "grad_norm": 2.4223424195591505, + "learning_rate": 7.675314566442673e-06, + "loss": 1.2702, + "step": 960 + }, + { + "epoch": 3.8906882591093117, + "grad_norm": 2.1111739790928024, + "learning_rate": 7.669342360543727e-06, + "loss": 1.7654, + "step": 961 + }, + { + "epoch": 3.8947368421052633, + "grad_norm": 2.0865715931939968, + "learning_rate": 7.663364823444157e-06, + "loss": 1.8567, + "step": 962 + }, + { + "epoch": 3.898785425101215, + "grad_norm": 1.9521945713254736, + "learning_rate": 7.65738196708228e-06, + "loss": 1.5513, + "step": 963 + }, + { + "epoch": 3.902834008097166, + "grad_norm": 2.252893420029499, + "learning_rate": 7.651393803407032e-06, + "loss": 1.6101, + "step": 964 + }, + { + "epoch": 3.9068825910931175, + "grad_norm": 2.445627287506017, + "learning_rate": 7.645400344377953e-06, + "loss": 1.7802, + "step": 965 + }, + { + "epoch": 3.910931174089069, + "grad_norm": 2.206311718559999, + "learning_rate": 7.639401601965158e-06, + "loss": 1.3433, + "step": 966 + }, + { + "epoch": 3.91497975708502, + "grad_norm": 2.5126306064577935, + "learning_rate": 7.63339758814931e-06, + "loss": 1.4571, + "step": 967 + }, + { + "epoch": 3.919028340080972, + "grad_norm": 2.301201962037062, + "learning_rate": 7.627388314921602e-06, + "loss": 1.4798, + "step": 968 + }, + { + "epoch": 3.9230769230769234, + "grad_norm": 2.0505587515987265, + "learning_rate": 7.621373794283735e-06, + "loss": 1.7924, + "step": 969 + }, + { + "epoch": 3.9271255060728745, + "grad_norm": 2.716118255543476, + "learning_rate": 7.615354038247889e-06, + "loss": 1.6337, + "step": 970 + }, + { + "epoch": 3.931174089068826, + "grad_norm": 2.636209282969381, + "learning_rate": 7.609329058836694e-06, + "loss": 1.6699, + "step": 971 + }, + { + "epoch": 3.9352226720647776, + "grad_norm": 2.3802398786409107, + "learning_rate": 7.6032988680832195e-06, + "loss": 1.4692, + "step": 972 + }, + { + "epoch": 3.9392712550607287, + "grad_norm": 2.5735078826994844, + "learning_rate": 7.597263478030939e-06, + "loss": 1.3909, + "step": 973 + }, + { + "epoch": 3.9433198380566803, + "grad_norm": 2.986329351018389, + "learning_rate": 7.59122290073371e-06, + "loss": 1.6787, + "step": 974 + }, + { + "epoch": 3.9473684210526314, + "grad_norm": 2.4407323865905015, + "learning_rate": 7.5851771482557535e-06, + "loss": 1.4349, + "step": 975 + }, + { + "epoch": 3.951417004048583, + "grad_norm": 2.8570555841909657, + "learning_rate": 7.579126232671621e-06, + "loss": 1.4016, + "step": 976 + }, + { + "epoch": 3.9554655870445345, + "grad_norm": 3.322338952206099, + "learning_rate": 7.5730701660661795e-06, + "loss": 1.6104, + "step": 977 + }, + { + "epoch": 3.9595141700404857, + "grad_norm": 2.5182830088343082, + "learning_rate": 7.567008960534585e-06, + "loss": 1.6231, + "step": 978 + }, + { + "epoch": 3.9635627530364372, + "grad_norm": 2.1739951186703923, + "learning_rate": 7.560942628182251e-06, + "loss": 1.6679, + "step": 979 + }, + { + 
"epoch": 3.967611336032389, + "grad_norm": 2.5756124639646982, + "learning_rate": 7.554871181124836e-06, + "loss": 1.8633, + "step": 980 + }, + { + "epoch": 3.97165991902834, + "grad_norm": 3.073388081199716, + "learning_rate": 7.548794631488211e-06, + "loss": 1.768, + "step": 981 + }, + { + "epoch": 3.9757085020242915, + "grad_norm": 2.1012291254049797, + "learning_rate": 7.5427129914084385e-06, + "loss": 1.6442, + "step": 982 + }, + { + "epoch": 3.979757085020243, + "grad_norm": 2.351295674425286, + "learning_rate": 7.536626273031747e-06, + "loss": 1.7358, + "step": 983 + }, + { + "epoch": 3.983805668016194, + "grad_norm": 2.115853749649768, + "learning_rate": 7.530534488514507e-06, + "loss": 1.8024, + "step": 984 + }, + { + "epoch": 3.9878542510121457, + "grad_norm": 2.454948116388734, + "learning_rate": 7.524437650023211e-06, + "loss": 1.6063, + "step": 985 + }, + { + "epoch": 3.9919028340080973, + "grad_norm": 2.043008387794743, + "learning_rate": 7.5183357697344395e-06, + "loss": 1.5544, + "step": 986 + }, + { + "epoch": 3.9959514170040484, + "grad_norm": 1.8968397388893163, + "learning_rate": 7.512228859834845e-06, + "loss": 1.8733, + "step": 987 + }, + { + "epoch": 4.0, + "grad_norm": 2.2142162316932255, + "learning_rate": 7.506116932521127e-06, + "loss": 1.6136, + "step": 988 + }, + { + "epoch": 4.004048582995951, + "grad_norm": 2.080064737878757, + "learning_rate": 7.500000000000001e-06, + "loss": 1.6735, + "step": 989 + }, + { + "epoch": 4.008097165991903, + "grad_norm": 2.8195577020771863, + "learning_rate": 7.493878074488184e-06, + "loss": 1.8144, + "step": 990 + }, + { + "epoch": 4.012145748987854, + "grad_norm": 2.861434123319288, + "learning_rate": 7.4877511682123635e-06, + "loss": 1.6734, + "step": 991 + }, + { + "epoch": 4.016194331983805, + "grad_norm": 3.0695960191225247, + "learning_rate": 7.481619293409173e-06, + "loss": 1.8495, + "step": 992 + }, + { + "epoch": 4.020242914979757, + "grad_norm": 2.580474309033628, + "learning_rate": 7.475482462325169e-06, + "loss": 2.099, + "step": 993 + }, + { + "epoch": 4.0242914979757085, + "grad_norm": 2.721243409721488, + "learning_rate": 7.469340687216809e-06, + "loss": 1.9446, + "step": 994 + }, + { + "epoch": 4.02834008097166, + "grad_norm": 2.3410049191202074, + "learning_rate": 7.4631939803504215e-06, + "loss": 1.6196, + "step": 995 + }, + { + "epoch": 4.032388663967612, + "grad_norm": 2.720885518023577, + "learning_rate": 7.4570423540021905e-06, + "loss": 1.6221, + "step": 996 + }, + { + "epoch": 4.036437246963563, + "grad_norm": 2.5413861683291996, + "learning_rate": 7.450885820458117e-06, + "loss": 1.8749, + "step": 997 + }, + { + "epoch": 4.040485829959514, + "grad_norm": 2.5863690862096957, + "learning_rate": 7.44472439201401e-06, + "loss": 1.6649, + "step": 998 + }, + { + "epoch": 4.044534412955466, + "grad_norm": 2.371552718771952, + "learning_rate": 7.438558080975449e-06, + "loss": 1.6799, + "step": 999 + }, + { + "epoch": 4.048582995951417, + "grad_norm": 2.5691951258164063, + "learning_rate": 7.4323868996577696e-06, + "loss": 1.63, + "step": 1000 + }, + { + "epoch": 4.052631578947368, + "grad_norm": 2.675468998968646, + "learning_rate": 7.426210860386032e-06, + "loss": 1.7354, + "step": 1001 + }, + { + "epoch": 4.05668016194332, + "grad_norm": 2.58607973493479, + "learning_rate": 7.420029975494996e-06, + "loss": 1.5703, + "step": 1002 + }, + { + "epoch": 4.060728744939271, + "grad_norm": 2.475852723612659, + "learning_rate": 7.413844257329104e-06, + "loss": 1.749, + "step": 1003 + }, + { + "epoch": 
4.064777327935222, + "grad_norm": 2.625704853477589, + "learning_rate": 7.407653718242449e-06, + "loss": 1.6948, + "step": 1004 + }, + { + "epoch": 4.068825910931174, + "grad_norm": 2.7272435081151283, + "learning_rate": 7.401458370598753e-06, + "loss": 1.8281, + "step": 1005 + }, + { + "epoch": 4.0728744939271255, + "grad_norm": 2.507953052399452, + "learning_rate": 7.395258226771341e-06, + "loss": 1.7673, + "step": 1006 + }, + { + "epoch": 4.076923076923077, + "grad_norm": 2.5085283118904074, + "learning_rate": 7.3890532991431174e-06, + "loss": 1.6958, + "step": 1007 + }, + { + "epoch": 4.080971659919029, + "grad_norm": 2.388953051348741, + "learning_rate": 7.382843600106539e-06, + "loss": 1.7112, + "step": 1008 + }, + { + "epoch": 4.08502024291498, + "grad_norm": 2.2236808085380644, + "learning_rate": 7.376629142063597e-06, + "loss": 1.7162, + "step": 1009 + }, + { + "epoch": 4.089068825910931, + "grad_norm": 2.7412048035286505, + "learning_rate": 7.370409937425781e-06, + "loss": 1.7045, + "step": 1010 + }, + { + "epoch": 4.093117408906883, + "grad_norm": 2.3839251838504367, + "learning_rate": 7.364185998614064e-06, + "loss": 1.7854, + "step": 1011 + }, + { + "epoch": 4.097165991902834, + "grad_norm": 2.383572557144146, + "learning_rate": 7.357957338058873e-06, + "loss": 1.534, + "step": 1012 + }, + { + "epoch": 4.101214574898785, + "grad_norm": 2.7483936941368996, + "learning_rate": 7.3517239682000675e-06, + "loss": 1.7001, + "step": 1013 + }, + { + "epoch": 4.105263157894737, + "grad_norm": 2.6910416116843257, + "learning_rate": 7.345485901486908e-06, + "loss": 1.7037, + "step": 1014 + }, + { + "epoch": 4.109311740890688, + "grad_norm": 2.677750230508956, + "learning_rate": 7.33924315037804e-06, + "loss": 1.6197, + "step": 1015 + }, + { + "epoch": 4.113360323886639, + "grad_norm": 3.1184294482443717, + "learning_rate": 7.332995727341462e-06, + "loss": 1.5587, + "step": 1016 + }, + { + "epoch": 4.117408906882591, + "grad_norm": 2.697817221643411, + "learning_rate": 7.326743644854504e-06, + "loss": 1.4804, + "step": 1017 + }, + { + "epoch": 4.1214574898785425, + "grad_norm": 2.5533427892436364, + "learning_rate": 7.3204869154038015e-06, + "loss": 1.5149, + "step": 1018 + }, + { + "epoch": 4.125506072874494, + "grad_norm": 2.7058477331519604, + "learning_rate": 7.314225551485273e-06, + "loss": 1.5156, + "step": 1019 + }, + { + "epoch": 4.129554655870446, + "grad_norm": 2.8633359493766384, + "learning_rate": 7.30795956560409e-06, + "loss": 1.4187, + "step": 1020 + }, + { + "epoch": 4.133603238866397, + "grad_norm": 2.346585899707522, + "learning_rate": 7.301688970274655e-06, + "loss": 1.7718, + "step": 1021 + }, + { + "epoch": 4.137651821862348, + "grad_norm": 2.8346595314782568, + "learning_rate": 7.295413778020579e-06, + "loss": 1.6181, + "step": 1022 + }, + { + "epoch": 4.1417004048583, + "grad_norm": 2.1328033209542046, + "learning_rate": 7.289134001374654e-06, + "loss": 1.3513, + "step": 1023 + }, + { + "epoch": 4.145748987854251, + "grad_norm": 2.723527413205223, + "learning_rate": 7.282849652878824e-06, + "loss": 1.7449, + "step": 1024 + }, + { + "epoch": 4.149797570850202, + "grad_norm": 2.6296530406635648, + "learning_rate": 7.276560745084167e-06, + "loss": 1.56, + "step": 1025 + }, + { + "epoch": 4.153846153846154, + "grad_norm": 2.3607444563571645, + "learning_rate": 7.2702672905508656e-06, + "loss": 1.7373, + "step": 1026 + }, + { + "epoch": 4.157894736842105, + "grad_norm": 2.857459652562985, + "learning_rate": 7.263969301848188e-06, + "loss": 1.8929, + "step": 1027 + }, + { 
+ "epoch": 4.161943319838056, + "grad_norm": 2.416479591453608, + "learning_rate": 7.257666791554448e-06, + "loss": 1.6155, + "step": 1028 + }, + { + "epoch": 4.165991902834008, + "grad_norm": 2.485932817739182, + "learning_rate": 7.251359772256998e-06, + "loss": 1.6856, + "step": 1029 + }, + { + "epoch": 4.17004048582996, + "grad_norm": 2.2601305066652664, + "learning_rate": 7.245048256552195e-06, + "loss": 2.1658, + "step": 1030 + }, + { + "epoch": 4.174089068825911, + "grad_norm": 2.4736185296097566, + "learning_rate": 7.2387322570453724e-06, + "loss": 1.5329, + "step": 1031 + }, + { + "epoch": 4.178137651821863, + "grad_norm": 2.902522379367228, + "learning_rate": 7.232411786350824e-06, + "loss": 1.7115, + "step": 1032 + }, + { + "epoch": 4.182186234817814, + "grad_norm": 2.1213589715944594, + "learning_rate": 7.226086857091765e-06, + "loss": 1.5227, + "step": 1033 + }, + { + "epoch": 4.186234817813765, + "grad_norm": 2.8619121355527968, + "learning_rate": 7.219757481900325e-06, + "loss": 1.6826, + "step": 1034 + }, + { + "epoch": 4.190283400809717, + "grad_norm": 2.5322052891357867, + "learning_rate": 7.213423673417508e-06, + "loss": 1.7019, + "step": 1035 + }, + { + "epoch": 4.194331983805668, + "grad_norm": 2.868097930235534, + "learning_rate": 7.207085444293172e-06, + "loss": 1.8899, + "step": 1036 + }, + { + "epoch": 4.198380566801619, + "grad_norm": 2.5521158066560288, + "learning_rate": 7.2007428071860045e-06, + "loss": 1.9495, + "step": 1037 + }, + { + "epoch": 4.202429149797571, + "grad_norm": 2.63283746068705, + "learning_rate": 7.194395774763496e-06, + "loss": 1.6451, + "step": 1038 + }, + { + "epoch": 4.206477732793522, + "grad_norm": 3.020988257996165, + "learning_rate": 7.188044359701917e-06, + "loss": 1.9686, + "step": 1039 + }, + { + "epoch": 4.2105263157894735, + "grad_norm": 2.7497468285413267, + "learning_rate": 7.181688574686292e-06, + "loss": 2.0078, + "step": 1040 + }, + { + "epoch": 4.2145748987854255, + "grad_norm": 2.4897799224246873, + "learning_rate": 7.175328432410367e-06, + "loss": 1.7921, + "step": 1041 + }, + { + "epoch": 4.218623481781377, + "grad_norm": 2.470322521256254, + "learning_rate": 7.168963945576597e-06, + "loss": 1.6719, + "step": 1042 + }, + { + "epoch": 4.222672064777328, + "grad_norm": 2.6592137837660266, + "learning_rate": 7.162595126896111e-06, + "loss": 1.5749, + "step": 1043 + }, + { + "epoch": 4.22672064777328, + "grad_norm": 2.533296478811204, + "learning_rate": 7.15622198908869e-06, + "loss": 1.7352, + "step": 1044 + }, + { + "epoch": 4.230769230769231, + "grad_norm": 2.5992050846283354, + "learning_rate": 7.149844544882742e-06, + "loss": 1.5639, + "step": 1045 + }, + { + "epoch": 4.234817813765182, + "grad_norm": 2.7675121593200367, + "learning_rate": 7.143462807015271e-06, + "loss": 1.8108, + "step": 1046 + }, + { + "epoch": 4.238866396761134, + "grad_norm": 2.658793190465704, + "learning_rate": 7.137076788231865e-06, + "loss": 1.7457, + "step": 1047 + }, + { + "epoch": 4.242914979757085, + "grad_norm": 2.604959217646965, + "learning_rate": 7.130686501286655e-06, + "loss": 1.7451, + "step": 1048 + }, + { + "epoch": 4.246963562753036, + "grad_norm": 2.5111072223063897, + "learning_rate": 7.1242919589422974e-06, + "loss": 1.5808, + "step": 1049 + }, + { + "epoch": 4.251012145748988, + "grad_norm": 2.4705422975939775, + "learning_rate": 7.11789317396995e-06, + "loss": 1.6597, + "step": 1050 + }, + { + "epoch": 4.255060728744939, + "grad_norm": 2.8012872307046726, + "learning_rate": 7.1114901591492404e-06, + "loss": 1.6728, + "step": 
1051 + }, + { + "epoch": 4.2591093117408905, + "grad_norm": 2.376781495157912, + "learning_rate": 7.105082927268247e-06, + "loss": 1.561, + "step": 1052 + }, + { + "epoch": 4.2631578947368425, + "grad_norm": 2.5702431118604423, + "learning_rate": 7.0986714911234715e-06, + "loss": 1.4172, + "step": 1053 + }, + { + "epoch": 4.267206477732794, + "grad_norm": 2.508325280537679, + "learning_rate": 7.092255863519806e-06, + "loss": 1.6779, + "step": 1054 + }, + { + "epoch": 4.271255060728745, + "grad_norm": 2.540012700506, + "learning_rate": 7.085836057270521e-06, + "loss": 1.6985, + "step": 1055 + }, + { + "epoch": 4.275303643724697, + "grad_norm": 2.471796434580062, + "learning_rate": 7.079412085197229e-06, + "loss": 1.7301, + "step": 1056 + }, + { + "epoch": 4.279352226720648, + "grad_norm": 3.3244889584848107, + "learning_rate": 7.072983960129862e-06, + "loss": 1.7094, + "step": 1057 + }, + { + "epoch": 4.283400809716599, + "grad_norm": 2.983349503659567, + "learning_rate": 7.066551694906651e-06, + "loss": 1.3989, + "step": 1058 + }, + { + "epoch": 4.287449392712551, + "grad_norm": 3.036520426590972, + "learning_rate": 7.060115302374087e-06, + "loss": 1.5257, + "step": 1059 + }, + { + "epoch": 4.291497975708502, + "grad_norm": 3.2696461082092068, + "learning_rate": 7.053674795386914e-06, + "loss": 1.3769, + "step": 1060 + }, + { + "epoch": 4.295546558704453, + "grad_norm": 3.066097380387373, + "learning_rate": 7.047230186808085e-06, + "loss": 1.6842, + "step": 1061 + }, + { + "epoch": 4.299595141700405, + "grad_norm": 2.6903089198270855, + "learning_rate": 7.04078148950875e-06, + "loss": 1.8088, + "step": 1062 + }, + { + "epoch": 4.303643724696356, + "grad_norm": 2.8258995708159773, + "learning_rate": 7.034328716368224e-06, + "loss": 1.5156, + "step": 1063 + }, + { + "epoch": 4.3076923076923075, + "grad_norm": 2.858420747113862, + "learning_rate": 7.027871880273959e-06, + "loss": 1.5394, + "step": 1064 + }, + { + "epoch": 4.3117408906882595, + "grad_norm": 2.7740108493498323, + "learning_rate": 7.021410994121525e-06, + "loss": 1.549, + "step": 1065 + }, + { + "epoch": 4.315789473684211, + "grad_norm": 3.219790325593576, + "learning_rate": 7.014946070814583e-06, + "loss": 1.5296, + "step": 1066 + }, + { + "epoch": 4.319838056680162, + "grad_norm": 3.0526696821998094, + "learning_rate": 7.008477123264849e-06, + "loss": 1.4361, + "step": 1067 + }, + { + "epoch": 4.323886639676114, + "grad_norm": 2.9571662763160136, + "learning_rate": 7.0020041643920826e-06, + "loss": 1.4498, + "step": 1068 + }, + { + "epoch": 4.327935222672065, + "grad_norm": 2.819893094328226, + "learning_rate": 6.995527207124053e-06, + "loss": 1.4853, + "step": 1069 + }, + { + "epoch": 4.331983805668016, + "grad_norm": 2.7252255526223625, + "learning_rate": 6.989046264396516e-06, + "loss": 1.4535, + "step": 1070 + }, + { + "epoch": 4.336032388663968, + "grad_norm": 2.6189552228263753, + "learning_rate": 6.982561349153188e-06, + "loss": 1.5022, + "step": 1071 + }, + { + "epoch": 4.340080971659919, + "grad_norm": 2.568082005220546, + "learning_rate": 6.976072474345713e-06, + "loss": 1.4532, + "step": 1072 + }, + { + "epoch": 4.34412955465587, + "grad_norm": 2.623502257576312, + "learning_rate": 6.96957965293365e-06, + "loss": 1.4399, + "step": 1073 + }, + { + "epoch": 4.348178137651822, + "grad_norm": 3.1483597392827045, + "learning_rate": 6.963082897884439e-06, + "loss": 1.615, + "step": 1074 + }, + { + "epoch": 4.352226720647773, + "grad_norm": 3.8022601065423123, + "learning_rate": 6.956582222173374e-06, + "loss": 1.5412, + 
"step": 1075 + }, + { + "epoch": 4.3562753036437245, + "grad_norm": 3.177062030751366, + "learning_rate": 6.9500776387835785e-06, + "loss": 1.2047, + "step": 1076 + }, + { + "epoch": 4.3603238866396765, + "grad_norm": 3.185748452470112, + "learning_rate": 6.943569160705985e-06, + "loss": 1.6101, + "step": 1077 + }, + { + "epoch": 4.364372469635628, + "grad_norm": 2.9943825828047954, + "learning_rate": 6.9370568009393e-06, + "loss": 1.6897, + "step": 1078 + }, + { + "epoch": 4.368421052631579, + "grad_norm": 2.8396585705303297, + "learning_rate": 6.9305405724899876e-06, + "loss": 1.6066, + "step": 1079 + }, + { + "epoch": 4.372469635627531, + "grad_norm": 3.4103100269352504, + "learning_rate": 6.924020488372229e-06, + "loss": 1.6845, + "step": 1080 + }, + { + "epoch": 4.376518218623482, + "grad_norm": 2.8184107943036323, + "learning_rate": 6.917496561607915e-06, + "loss": 1.3205, + "step": 1081 + }, + { + "epoch": 4.380566801619433, + "grad_norm": 3.152451887221124, + "learning_rate": 6.91096880522661e-06, + "loss": 1.4827, + "step": 1082 + }, + { + "epoch": 4.384615384615385, + "grad_norm": 2.8506198416780317, + "learning_rate": 6.904437232265521e-06, + "loss": 1.2814, + "step": 1083 + }, + { + "epoch": 4.388663967611336, + "grad_norm": 3.2465586785242033, + "learning_rate": 6.897901855769483e-06, + "loss": 1.3431, + "step": 1084 + }, + { + "epoch": 4.392712550607287, + "grad_norm": 3.077940405612511, + "learning_rate": 6.891362688790925e-06, + "loss": 1.5208, + "step": 1085 + }, + { + "epoch": 4.396761133603239, + "grad_norm": 3.4135560109047005, + "learning_rate": 6.884819744389848e-06, + "loss": 1.3629, + "step": 1086 + }, + { + "epoch": 4.40080971659919, + "grad_norm": 2.6507174805524727, + "learning_rate": 6.878273035633795e-06, + "loss": 1.3853, + "step": 1087 + }, + { + "epoch": 4.4048582995951415, + "grad_norm": 2.5895703393651637, + "learning_rate": 6.871722575597829e-06, + "loss": 1.4423, + "step": 1088 + }, + { + "epoch": 4.4089068825910935, + "grad_norm": 3.2322118670425777, + "learning_rate": 6.865168377364506e-06, + "loss": 1.5468, + "step": 1089 + }, + { + "epoch": 4.412955465587045, + "grad_norm": 2.942042054251793, + "learning_rate": 6.858610454023842e-06, + "loss": 1.36, + "step": 1090 + }, + { + "epoch": 4.417004048582996, + "grad_norm": 3.122031784641475, + "learning_rate": 6.8520488186733e-06, + "loss": 1.6917, + "step": 1091 + }, + { + "epoch": 4.421052631578947, + "grad_norm": 3.2313772685904847, + "learning_rate": 6.845483484417756e-06, + "loss": 1.7526, + "step": 1092 + }, + { + "epoch": 4.425101214574899, + "grad_norm": 2.8735793462178023, + "learning_rate": 6.838914464369467e-06, + "loss": 1.6487, + "step": 1093 + }, + { + "epoch": 4.42914979757085, + "grad_norm": 2.954566180150772, + "learning_rate": 6.832341771648057e-06, + "loss": 1.7096, + "step": 1094 + }, + { + "epoch": 4.433198380566802, + "grad_norm": 2.587188799407319, + "learning_rate": 6.825765419380484e-06, + "loss": 1.8456, + "step": 1095 + }, + { + "epoch": 4.437246963562753, + "grad_norm": 3.0518891038101925, + "learning_rate": 6.819185420701011e-06, + "loss": 1.6224, + "step": 1096 + }, + { + "epoch": 4.441295546558704, + "grad_norm": 3.118348281802091, + "learning_rate": 6.812601788751192e-06, + "loss": 1.5498, + "step": 1097 + }, + { + "epoch": 4.445344129554655, + "grad_norm": 2.894711350660116, + "learning_rate": 6.806014536679828e-06, + "loss": 1.8041, + "step": 1098 + }, + { + "epoch": 4.449392712550607, + "grad_norm": 3.062471930595446, + "learning_rate": 6.7994236776429555e-06, + "loss": 
1.5815, + "step": 1099 + }, + { + "epoch": 4.4534412955465585, + "grad_norm": 3.0993288240233263, + "learning_rate": 6.792829224803816e-06, + "loss": 1.5695, + "step": 1100 + }, + { + "epoch": 4.4574898785425106, + "grad_norm": 3.149585012325393, + "learning_rate": 6.7862311913328235e-06, + "loss": 1.9487, + "step": 1101 + }, + { + "epoch": 4.461538461538462, + "grad_norm": 4.120477147155456, + "learning_rate": 6.779629590407547e-06, + "loss": 2.1517, + "step": 1102 + }, + { + "epoch": 4.465587044534413, + "grad_norm": 3.1988261301020855, + "learning_rate": 6.773024435212678e-06, + "loss": 1.79, + "step": 1103 + }, + { + "epoch": 4.469635627530364, + "grad_norm": 2.6369221757485457, + "learning_rate": 6.7664157389400095e-06, + "loss": 1.7651, + "step": 1104 + }, + { + "epoch": 4.473684210526316, + "grad_norm": 2.7091701203884364, + "learning_rate": 6.7598035147884055e-06, + "loss": 1.6839, + "step": 1105 + }, + { + "epoch": 4.477732793522267, + "grad_norm": 3.4306069422759005, + "learning_rate": 6.753187775963773e-06, + "loss": 1.692, + "step": 1106 + }, + { + "epoch": 4.481781376518219, + "grad_norm": 2.386072562379964, + "learning_rate": 6.746568535679041e-06, + "loss": 1.6155, + "step": 1107 + }, + { + "epoch": 4.48582995951417, + "grad_norm": 2.851423578739297, + "learning_rate": 6.739945807154136e-06, + "loss": 1.5755, + "step": 1108 + }, + { + "epoch": 4.489878542510121, + "grad_norm": 3.3510139502859206, + "learning_rate": 6.733319603615941e-06, + "loss": 1.5105, + "step": 1109 + }, + { + "epoch": 4.493927125506072, + "grad_norm": 3.329100996808692, + "learning_rate": 6.726689938298289e-06, + "loss": 1.568, + "step": 1110 + }, + { + "epoch": 4.497975708502024, + "grad_norm": 2.7974205212393057, + "learning_rate": 6.72005682444192e-06, + "loss": 1.4162, + "step": 1111 + }, + { + "epoch": 4.502024291497976, + "grad_norm": 2.9991024909175676, + "learning_rate": 6.713420275294467e-06, + "loss": 1.2872, + "step": 1112 + }, + { + "epoch": 4.506072874493928, + "grad_norm": 3.341853790054196, + "learning_rate": 6.70678030411042e-06, + "loss": 1.6404, + "step": 1113 + }, + { + "epoch": 4.510121457489879, + "grad_norm": 3.2032309023708687, + "learning_rate": 6.700136924151104e-06, + "loss": 1.6321, + "step": 1114 + }, + { + "epoch": 4.51417004048583, + "grad_norm": 2.446695841899921, + "learning_rate": 6.693490148684654e-06, + "loss": 1.5906, + "step": 1115 + }, + { + "epoch": 4.518218623481781, + "grad_norm": 3.030284559367058, + "learning_rate": 6.686839990985984e-06, + "loss": 1.6148, + "step": 1116 + }, + { + "epoch": 4.522267206477733, + "grad_norm": 3.0612075992794665, + "learning_rate": 6.680186464336767e-06, + "loss": 1.5678, + "step": 1117 + }, + { + "epoch": 4.526315789473684, + "grad_norm": 3.4922710550140685, + "learning_rate": 6.673529582025398e-06, + "loss": 1.3788, + "step": 1118 + }, + { + "epoch": 4.530364372469636, + "grad_norm": 3.4134796811660166, + "learning_rate": 6.666869357346979e-06, + "loss": 1.4428, + "step": 1119 + }, + { + "epoch": 4.534412955465587, + "grad_norm": 3.6649442008937383, + "learning_rate": 6.660205803603286e-06, + "loss": 1.5671, + "step": 1120 + }, + { + "epoch": 4.538461538461538, + "grad_norm": 3.108830354735827, + "learning_rate": 6.653538934102743e-06, + "loss": 1.7903, + "step": 1121 + }, + { + "epoch": 4.5425101214574894, + "grad_norm": 2.719205109719932, + "learning_rate": 6.646868762160399e-06, + "loss": 1.6907, + "step": 1122 + }, + { + "epoch": 4.5465587044534415, + "grad_norm": 15.861026319110369, + "learning_rate": 6.640195301097896e-06, 
+ "loss": 2.0735, + "step": 1123 + }, + { + "epoch": 4.550607287449393, + "grad_norm": 7.357015627613091, + "learning_rate": 6.633518564243442e-06, + "loss": 2.1046, + "step": 1124 + }, + { + "epoch": 4.554655870445345, + "grad_norm": 6.67996402988713, + "learning_rate": 6.626838564931797e-06, + "loss": 2.3423, + "step": 1125 + }, + { + "epoch": 4.558704453441296, + "grad_norm": 2.790707731153053, + "learning_rate": 6.620155316504225e-06, + "loss": 1.5771, + "step": 1126 + }, + { + "epoch": 4.562753036437247, + "grad_norm": 2.6424764643365544, + "learning_rate": 6.6134688323084884e-06, + "loss": 1.4544, + "step": 1127 + }, + { + "epoch": 4.566801619433198, + "grad_norm": 4.460650672408528, + "learning_rate": 6.606779125698808e-06, + "loss": 1.7848, + "step": 1128 + }, + { + "epoch": 4.57085020242915, + "grad_norm": 2.81766092171609, + "learning_rate": 6.600086210035841e-06, + "loss": 1.4465, + "step": 1129 + }, + { + "epoch": 4.574898785425101, + "grad_norm": 2.7934258737790794, + "learning_rate": 6.593390098686653e-06, + "loss": 1.7079, + "step": 1130 + }, + { + "epoch": 4.578947368421053, + "grad_norm": 2.357159807197533, + "learning_rate": 6.586690805024692e-06, + "loss": 1.4715, + "step": 1131 + }, + { + "epoch": 4.582995951417004, + "grad_norm": 2.8201575354409876, + "learning_rate": 6.579988342429764e-06, + "loss": 1.6256, + "step": 1132 + }, + { + "epoch": 4.587044534412955, + "grad_norm": 2.748728982741463, + "learning_rate": 6.573282724288001e-06, + "loss": 1.6067, + "step": 1133 + }, + { + "epoch": 4.5910931174089065, + "grad_norm": 3.0721591492986526, + "learning_rate": 6.566573963991839e-06, + "loss": 1.5832, + "step": 1134 + }, + { + "epoch": 4.5951417004048585, + "grad_norm": 2.8487748202828924, + "learning_rate": 6.559862074939989e-06, + "loss": 1.3233, + "step": 1135 + }, + { + "epoch": 4.59919028340081, + "grad_norm": 2.590591556134, + "learning_rate": 6.553147070537413e-06, + "loss": 1.3674, + "step": 1136 + }, + { + "epoch": 4.603238866396762, + "grad_norm": 2.6607589757127186, + "learning_rate": 6.546428964195289e-06, + "loss": 1.4813, + "step": 1137 + }, + { + "epoch": 4.607287449392713, + "grad_norm": 2.936419659787077, + "learning_rate": 6.539707769330995e-06, + "loss": 1.3335, + "step": 1138 + }, + { + "epoch": 4.611336032388664, + "grad_norm": 5.647454932081391, + "learning_rate": 6.532983499368078e-06, + "loss": 1.631, + "step": 1139 + }, + { + "epoch": 4.615384615384615, + "grad_norm": 2.672027285236729, + "learning_rate": 6.526256167736224e-06, + "loss": 1.6247, + "step": 1140 + }, + { + "epoch": 4.619433198380567, + "grad_norm": 3.585540725187652, + "learning_rate": 6.519525787871235e-06, + "loss": 1.365, + "step": 1141 + }, + { + "epoch": 4.623481781376518, + "grad_norm": 3.509608711468321, + "learning_rate": 6.512792373215e-06, + "loss": 1.7573, + "step": 1142 + }, + { + "epoch": 4.62753036437247, + "grad_norm": 2.971185622782078, + "learning_rate": 6.506055937215471e-06, + "loss": 1.561, + "step": 1143 + }, + { + "epoch": 4.631578947368421, + "grad_norm": 3.2949915313334035, + "learning_rate": 6.499316493326631e-06, + "loss": 1.836, + "step": 1144 + }, + { + "epoch": 4.635627530364372, + "grad_norm": 2.861710933431733, + "learning_rate": 6.492574055008474e-06, + "loss": 1.4458, + "step": 1145 + }, + { + "epoch": 4.6396761133603235, + "grad_norm": 3.3593193695088828, + "learning_rate": 6.4858286357269716e-06, + "loss": 1.6806, + "step": 1146 + }, + { + "epoch": 4.6437246963562755, + "grad_norm": 2.7995829454110317, + "learning_rate": 6.4790802489540495e-06, + 
"loss": 1.5849, + "step": 1147 + }, + { + "epoch": 4.647773279352227, + "grad_norm": 2.9650473845995617, + "learning_rate": 6.472328908167562e-06, + "loss": 1.6598, + "step": 1148 + }, + { + "epoch": 4.651821862348179, + "grad_norm": 2.7905940219323475, + "learning_rate": 6.465574626851262e-06, + "loss": 1.4666, + "step": 1149 + }, + { + "epoch": 4.65587044534413, + "grad_norm": 3.2553490418837323, + "learning_rate": 6.4588174184947725e-06, + "loss": 1.6918, + "step": 1150 + }, + { + "epoch": 4.659919028340081, + "grad_norm": 3.55927475882226, + "learning_rate": 6.452057296593568e-06, + "loss": 1.5207, + "step": 1151 + }, + { + "epoch": 4.663967611336032, + "grad_norm": 2.9162925097777954, + "learning_rate": 6.445294274648937e-06, + "loss": 1.6745, + "step": 1152 + }, + { + "epoch": 4.668016194331984, + "grad_norm": 2.987151078867793, + "learning_rate": 6.4385283661679624e-06, + "loss": 1.6752, + "step": 1153 + }, + { + "epoch": 4.672064777327935, + "grad_norm": 3.186333717498487, + "learning_rate": 6.431759584663492e-06, + "loss": 1.753, + "step": 1154 + }, + { + "epoch": 4.676113360323887, + "grad_norm": 9.509020769435434, + "learning_rate": 6.424987943654109e-06, + "loss": 1.6195, + "step": 1155 + }, + { + "epoch": 4.680161943319838, + "grad_norm": 3.356709601234609, + "learning_rate": 6.418213456664111e-06, + "loss": 1.6311, + "step": 1156 + }, + { + "epoch": 4.684210526315789, + "grad_norm": 2.921816366789115, + "learning_rate": 6.411436137223479e-06, + "loss": 1.4584, + "step": 1157 + }, + { + "epoch": 4.6882591093117405, + "grad_norm": 2.8660981524508338, + "learning_rate": 6.4046559988678485e-06, + "loss": 1.6084, + "step": 1158 + }, + { + "epoch": 4.6923076923076925, + "grad_norm": 3.0730207415431954, + "learning_rate": 6.397873055138487e-06, + "loss": 1.6274, + "step": 1159 + }, + { + "epoch": 4.696356275303644, + "grad_norm": 2.766004464269283, + "learning_rate": 6.391087319582264e-06, + "loss": 1.4697, + "step": 1160 + }, + { + "epoch": 4.700404858299595, + "grad_norm": 3.6099089118584136, + "learning_rate": 6.384298805751626e-06, + "loss": 1.9489, + "step": 1161 + }, + { + "epoch": 4.704453441295547, + "grad_norm": 3.442626114825173, + "learning_rate": 6.37750752720457e-06, + "loss": 1.727, + "step": 1162 + }, + { + "epoch": 4.708502024291498, + "grad_norm": 3.341066779383342, + "learning_rate": 6.370713497504607e-06, + "loss": 1.3178, + "step": 1163 + }, + { + "epoch": 4.712550607287449, + "grad_norm": 2.8791145178147386, + "learning_rate": 6.363916730220752e-06, + "loss": 1.4908, + "step": 1164 + }, + { + "epoch": 4.716599190283401, + "grad_norm": 2.8558993301680076, + "learning_rate": 6.357117238927481e-06, + "loss": 1.588, + "step": 1165 + }, + { + "epoch": 4.720647773279352, + "grad_norm": 3.403507251743757, + "learning_rate": 6.350315037204714e-06, + "loss": 1.3794, + "step": 1166 + }, + { + "epoch": 4.724696356275303, + "grad_norm": 3.28937405397847, + "learning_rate": 6.343510138637783e-06, + "loss": 1.535, + "step": 1167 + }, + { + "epoch": 4.728744939271255, + "grad_norm": 3.182353899970667, + "learning_rate": 6.336702556817405e-06, + "loss": 1.7416, + "step": 1168 + }, + { + "epoch": 4.732793522267206, + "grad_norm": 2.8393068837004285, + "learning_rate": 6.329892305339659e-06, + "loss": 1.521, + "step": 1169 + }, + { + "epoch": 4.7368421052631575, + "grad_norm": 3.0526645906441585, + "learning_rate": 6.323079397805951e-06, + "loss": 1.6001, + "step": 1170 + }, + { + "epoch": 4.7408906882591095, + "grad_norm": 3.453365846818349, + "learning_rate": 
6.3162638478229965e-06, + "loss": 2.244, + "step": 1171 + }, + { + "epoch": 4.744939271255061, + "grad_norm": 2.930549437132931, + "learning_rate": 6.309445669002787e-06, + "loss": 1.6859, + "step": 1172 + }, + { + "epoch": 4.748987854251012, + "grad_norm": 3.513459131175886, + "learning_rate": 6.302624874962563e-06, + "loss": 1.5138, + "step": 1173 + }, + { + "epoch": 4.753036437246964, + "grad_norm": 3.101847130962305, + "learning_rate": 6.295801479324788e-06, + "loss": 1.4048, + "step": 1174 + }, + { + "epoch": 4.757085020242915, + "grad_norm": 2.9351108422638625, + "learning_rate": 6.288975495717124e-06, + "loss": 1.5932, + "step": 1175 + }, + { + "epoch": 4.761133603238866, + "grad_norm": 4.674100976432621, + "learning_rate": 6.282146937772399e-06, + "loss": 2.3515, + "step": 1176 + }, + { + "epoch": 4.765182186234818, + "grad_norm": 5.182394350357637, + "learning_rate": 6.2753158191285844e-06, + "loss": 2.1322, + "step": 1177 + }, + { + "epoch": 4.769230769230769, + "grad_norm": 6.057045402676707, + "learning_rate": 6.268482153428763e-06, + "loss": 2.0072, + "step": 1178 + }, + { + "epoch": 4.77327935222672, + "grad_norm": 3.1068830892655726, + "learning_rate": 6.261645954321109e-06, + "loss": 1.5127, + "step": 1179 + }, + { + "epoch": 4.777327935222672, + "grad_norm": 3.0244265678427213, + "learning_rate": 6.254807235458853e-06, + "loss": 1.7728, + "step": 1180 + }, + { + "epoch": 4.781376518218623, + "grad_norm": 2.949903538067424, + "learning_rate": 6.247966010500258e-06, + "loss": 1.78, + "step": 1181 + }, + { + "epoch": 4.7854251012145745, + "grad_norm": 3.1823383170218946, + "learning_rate": 6.241122293108594e-06, + "loss": 1.6101, + "step": 1182 + }, + { + "epoch": 4.7894736842105265, + "grad_norm": 3.0390422214285975, + "learning_rate": 6.2342760969521085e-06, + "loss": 1.5326, + "step": 1183 + }, + { + "epoch": 4.793522267206478, + "grad_norm": 3.136764973756456, + "learning_rate": 6.227427435703997e-06, + "loss": 1.5671, + "step": 1184 + }, + { + "epoch": 4.797570850202429, + "grad_norm": 3.358208559803108, + "learning_rate": 6.220576323042381e-06, + "loss": 1.5746, + "step": 1185 + }, + { + "epoch": 4.801619433198381, + "grad_norm": 2.8750507177466305, + "learning_rate": 6.213722772650277e-06, + "loss": 1.4246, + "step": 1186 + }, + { + "epoch": 4.805668016194332, + "grad_norm": 3.028809163189934, + "learning_rate": 6.206866798215571e-06, + "loss": 1.317, + "step": 1187 + }, + { + "epoch": 4.809716599190283, + "grad_norm": 3.126804073645922, + "learning_rate": 6.2000084134309905e-06, + "loss": 1.6821, + "step": 1188 + }, + { + "epoch": 4.813765182186235, + "grad_norm": 3.71033178556479, + "learning_rate": 6.193147631994073e-06, + "loss": 1.5786, + "step": 1189 + }, + { + "epoch": 4.817813765182186, + "grad_norm": 3.2129146658285346, + "learning_rate": 6.186284467607149e-06, + "loss": 1.3971, + "step": 1190 + }, + { + "epoch": 4.821862348178137, + "grad_norm": 10.210146232119035, + "learning_rate": 6.179418933977301e-06, + "loss": 2.3347, + "step": 1191 + }, + { + "epoch": 4.825910931174089, + "grad_norm": 21.275577852601224, + "learning_rate": 6.1725510448163516e-06, + "loss": 3.6222, + "step": 1192 + }, + { + "epoch": 4.82995951417004, + "grad_norm": 3.4666551476237584, + "learning_rate": 6.165680813840822e-06, + "loss": 1.4645, + "step": 1193 + }, + { + "epoch": 4.834008097165992, + "grad_norm": 3.4458166986644443, + "learning_rate": 6.1588082547719095e-06, + "loss": 1.3391, + "step": 1194 + }, + { + "epoch": 4.838056680161944, + "grad_norm": 2.919273388343095, + 
"learning_rate": 6.151933381335468e-06, + "loss": 1.4313, + "step": 1195 + }, + { + "epoch": 4.842105263157895, + "grad_norm": 3.0732467672720736, + "learning_rate": 6.1450562072619635e-06, + "loss": 1.4611, + "step": 1196 + }, + { + "epoch": 4.846153846153846, + "grad_norm": 3.2736024865252493, + "learning_rate": 6.138176746286468e-06, + "loss": 1.3333, + "step": 1197 + }, + { + "epoch": 4.850202429149798, + "grad_norm": 3.3437325068102486, + "learning_rate": 6.131295012148613e-06, + "loss": 1.4833, + "step": 1198 + }, + { + "epoch": 4.854251012145749, + "grad_norm": 3.6058736308138766, + "learning_rate": 6.124411018592568e-06, + "loss": 1.5521, + "step": 1199 + }, + { + "epoch": 4.8582995951417, + "grad_norm": 2.6980859324752267, + "learning_rate": 6.117524779367027e-06, + "loss": 1.4743, + "step": 1200 + }, + { + "epoch": 4.862348178137652, + "grad_norm": 3.4307422256171947, + "learning_rate": 6.110636308225157e-06, + "loss": 1.4612, + "step": 1201 + }, + { + "epoch": 4.866396761133603, + "grad_norm": 3.4665359414620625, + "learning_rate": 6.103745618924587e-06, + "loss": 1.4922, + "step": 1202 + }, + { + "epoch": 4.870445344129554, + "grad_norm": 4.034402333282032, + "learning_rate": 6.096852725227378e-06, + "loss": 1.9715, + "step": 1203 + }, + { + "epoch": 4.874493927125506, + "grad_norm": 3.6881022424154097, + "learning_rate": 6.089957640899988e-06, + "loss": 1.9107, + "step": 1204 + }, + { + "epoch": 4.8785425101214575, + "grad_norm": 3.862338875685726, + "learning_rate": 6.0830603797132574e-06, + "loss": 1.661, + "step": 1205 + }, + { + "epoch": 4.882591093117409, + "grad_norm": 3.384483266395071, + "learning_rate": 6.076160955442369e-06, + "loss": 1.5689, + "step": 1206 + }, + { + "epoch": 4.886639676113361, + "grad_norm": 3.345513039253192, + "learning_rate": 6.069259381866827e-06, + "loss": 1.1468, + "step": 1207 + }, + { + "epoch": 4.890688259109312, + "grad_norm": 2.8964038452697847, + "learning_rate": 6.0623556727704306e-06, + "loss": 1.6516, + "step": 1208 + }, + { + "epoch": 4.894736842105263, + "grad_norm": 2.9136386786268895, + "learning_rate": 6.055449841941238e-06, + "loss": 1.7215, + "step": 1209 + }, + { + "epoch": 4.898785425101215, + "grad_norm": 2.7655346557671248, + "learning_rate": 6.048541903171552e-06, + "loss": 1.4413, + "step": 1210 + }, + { + "epoch": 4.902834008097166, + "grad_norm": 3.2433937012234715, + "learning_rate": 6.041631870257882e-06, + "loss": 1.4725, + "step": 1211 + }, + { + "epoch": 4.906882591093117, + "grad_norm": 3.4688660789200325, + "learning_rate": 6.034719757000918e-06, + "loss": 1.6069, + "step": 1212 + }, + { + "epoch": 4.910931174089069, + "grad_norm": 3.106070985660449, + "learning_rate": 6.0278055772055075e-06, + "loss": 1.2312, + "step": 1213 + }, + { + "epoch": 4.91497975708502, + "grad_norm": 3.4926777350408664, + "learning_rate": 6.020889344680627e-06, + "loss": 1.3252, + "step": 1214 + }, + { + "epoch": 4.919028340080971, + "grad_norm": 3.31474250904695, + "learning_rate": 6.013971073239346e-06, + "loss": 1.3404, + "step": 1215 + }, + { + "epoch": 4.923076923076923, + "grad_norm": 2.7200582966885953, + "learning_rate": 6.007050776698816e-06, + "loss": 1.6668, + "step": 1216 + }, + { + "epoch": 4.9271255060728745, + "grad_norm": 4.194613418220712, + "learning_rate": 6.000128468880223e-06, + "loss": 1.5178, + "step": 1217 + }, + { + "epoch": 4.931174089068826, + "grad_norm": 3.6956716885492047, + "learning_rate": 5.993204163608776e-06, + "loss": 1.5313, + "step": 1218 + }, + { + "epoch": 4.935222672064778, + "grad_norm": 
3.42386071095716, + "learning_rate": 5.986277874713672e-06, + "loss": 1.315, + "step": 1219 + }, + { + "epoch": 4.939271255060729, + "grad_norm": 3.4411238008448497, + "learning_rate": 5.979349616028067e-06, + "loss": 1.2599, + "step": 1220 + }, + { + "epoch": 4.94331983805668, + "grad_norm": 4.136849869910849, + "learning_rate": 5.972419401389058e-06, + "loss": 1.5671, + "step": 1221 + }, + { + "epoch": 4.947368421052632, + "grad_norm": 3.3509710910402344, + "learning_rate": 5.96548724463764e-06, + "loss": 1.3098, + "step": 1222 + }, + { + "epoch": 4.951417004048583, + "grad_norm": 3.826301738234217, + "learning_rate": 5.958553159618693e-06, + "loss": 1.2627, + "step": 1223 + }, + { + "epoch": 4.955465587044534, + "grad_norm": 4.211383102056784, + "learning_rate": 5.951617160180944e-06, + "loss": 1.4866, + "step": 1224 + }, + { + "epoch": 4.959514170040486, + "grad_norm": 3.9784296755787043, + "learning_rate": 5.944679260176947e-06, + "loss": 1.5416, + "step": 1225 + }, + { + "epoch": 4.963562753036437, + "grad_norm": 3.121952186318371, + "learning_rate": 5.937739473463047e-06, + "loss": 1.5505, + "step": 1226 + }, + { + "epoch": 4.967611336032388, + "grad_norm": 3.717226187124744, + "learning_rate": 5.930797813899364e-06, + "loss": 1.6869, + "step": 1227 + }, + { + "epoch": 4.97165991902834, + "grad_norm": 4.139266573612088, + "learning_rate": 5.923854295349751e-06, + "loss": 1.5989, + "step": 1228 + }, + { + "epoch": 4.9757085020242915, + "grad_norm": 2.8954471867608937, + "learning_rate": 5.916908931681781e-06, + "loss": 1.5245, + "step": 1229 + }, + { + "epoch": 4.979757085020243, + "grad_norm": 3.153595083245072, + "learning_rate": 5.9099617367667065e-06, + "loss": 1.6063, + "step": 1230 + }, + { + "epoch": 4.983805668016195, + "grad_norm": 2.8400997626861173, + "learning_rate": 5.9030127244794385e-06, + "loss": 1.6715, + "step": 1231 + }, + { + "epoch": 4.987854251012146, + "grad_norm": 3.2491090209153874, + "learning_rate": 5.896061908698521e-06, + "loss": 1.4666, + "step": 1232 + }, + { + "epoch": 4.991902834008097, + "grad_norm": 2.6679775725786286, + "learning_rate": 5.8891093033060945e-06, + "loss": 1.4425, + "step": 1233 + }, + { + "epoch": 4.995951417004049, + "grad_norm": 2.6288454727168067, + "learning_rate": 5.8821549221878795e-06, + "loss": 1.7597, + "step": 1234 + }, + { + "epoch": 5.0, + "grad_norm": 2.885385124366649, + "learning_rate": 5.8751987792331365e-06, + "loss": 1.4922, + "step": 1235 + }, + { + "epoch": 5.004048582995951, + "grad_norm": 2.87961175357714, + "learning_rate": 5.8682408883346535e-06, + "loss": 1.5315, + "step": 1236 + }, + { + "epoch": 5.008097165991903, + "grad_norm": 3.895617299101059, + "learning_rate": 5.861281263388699e-06, + "loss": 1.6767, + "step": 1237 + }, + { + "epoch": 5.012145748987854, + "grad_norm": 3.762686290641399, + "learning_rate": 5.854319918295012e-06, + "loss": 1.5156, + "step": 1238 + }, + { + "epoch": 5.016194331983805, + "grad_norm": 4.177708865223027, + "learning_rate": 5.8473568669567645e-06, + "loss": 1.7157, + "step": 1239 + }, + { + "epoch": 5.020242914979757, + "grad_norm": 3.5866973777228996, + "learning_rate": 5.84039212328054e-06, + "loss": 1.9457, + "step": 1240 + }, + { + "epoch": 5.0242914979757085, + "grad_norm": 3.7038579253911434, + "learning_rate": 5.833425701176294e-06, + "loss": 1.8054, + "step": 1241 + }, + { + "epoch": 5.02834008097166, + "grad_norm": 3.053021737504678, + "learning_rate": 5.826457614557342e-06, + "loss": 1.4846, + "step": 1242 + }, + { + "epoch": 5.032388663967612, + "grad_norm": 
3.7131269515944236, + "learning_rate": 5.819487877340318e-06, + "loss": 1.4864, + "step": 1243 + }, + { + "epoch": 5.036437246963563, + "grad_norm": 3.47442806634264, + "learning_rate": 5.812516503445158e-06, + "loss": 1.7235, + "step": 1244 + }, + { + "epoch": 5.040485829959514, + "grad_norm": 3.509517402822926, + "learning_rate": 5.805543506795063e-06, + "loss": 1.517, + "step": 1245 + }, + { + "epoch": 5.044534412955466, + "grad_norm": 3.3619188629392305, + "learning_rate": 5.798568901316475e-06, + "loss": 1.5768, + "step": 1246 + }, + { + "epoch": 5.048582995951417, + "grad_norm": 3.557428062968091, + "learning_rate": 5.79159270093905e-06, + "loss": 1.5018, + "step": 1247 + }, + { + "epoch": 5.052631578947368, + "grad_norm": 3.7281770232445295, + "learning_rate": 5.784614919595631e-06, + "loss": 1.5785, + "step": 1248 + }, + { + "epoch": 5.05668016194332, + "grad_norm": 3.517681869861109, + "learning_rate": 5.7776355712222165e-06, + "loss": 1.4217, + "step": 1249 + }, + { + "epoch": 5.060728744939271, + "grad_norm": 3.487707428141539, + "learning_rate": 5.770654669757935e-06, + "loss": 1.5864, + "step": 1250 + }, + { + "epoch": 5.064777327935222, + "grad_norm": 3.79463286822166, + "learning_rate": 5.763672229145015e-06, + "loss": 1.5406, + "step": 1251 + }, + { + "epoch": 5.068825910931174, + "grad_norm": 3.9587280022782623, + "learning_rate": 5.756688263328762e-06, + "loss": 1.6808, + "step": 1252 + }, + { + "epoch": 5.0728744939271255, + "grad_norm": 3.574038459442136, + "learning_rate": 5.749702786257529e-06, + "loss": 1.6199, + "step": 1253 + }, + { + "epoch": 5.076923076923077, + "grad_norm": 3.9239619763747666, + "learning_rate": 5.742715811882682e-06, + "loss": 1.5554, + "step": 1254 + }, + { + "epoch": 5.080971659919029, + "grad_norm": 3.3525677000904435, + "learning_rate": 5.735727354158581e-06, + "loss": 1.5965, + "step": 1255 + }, + { + "epoch": 5.08502024291498, + "grad_norm": 3.14038896931749, + "learning_rate": 5.7287374270425475e-06, + "loss": 1.5955, + "step": 1256 + }, + { + "epoch": 5.089068825910931, + "grad_norm": 3.800313028867603, + "learning_rate": 5.721746044494838e-06, + "loss": 1.5594, + "step": 1257 + }, + { + "epoch": 5.093117408906883, + "grad_norm": 3.5079921931841707, + "learning_rate": 5.714753220478616e-06, + "loss": 1.6374, + "step": 1258 + }, + { + "epoch": 5.097165991902834, + "grad_norm": 3.3722158742610033, + "learning_rate": 5.707758968959923e-06, + "loss": 1.3947, + "step": 1259 + }, + { + "epoch": 5.101214574898785, + "grad_norm": 3.690572058964337, + "learning_rate": 5.7007633039076535e-06, + "loss": 1.5641, + "step": 1260 + }, + { + "epoch": 5.105263157894737, + "grad_norm": 3.868480542932687, + "learning_rate": 5.693766239293522e-06, + "loss": 1.5403, + "step": 1261 + }, + { + "epoch": 5.109311740890688, + "grad_norm": 3.642440736287873, + "learning_rate": 5.686767789092041e-06, + "loss": 1.4899, + "step": 1262 + }, + { + "epoch": 5.113360323886639, + "grad_norm": 4.407879993174004, + "learning_rate": 5.67976796728049e-06, + "loss": 1.4415, + "step": 1263 + }, + { + "epoch": 5.117408906882591, + "grad_norm": 3.9268283691257166, + "learning_rate": 5.672766787838884e-06, + "loss": 1.349, + "step": 1264 + }, + { + "epoch": 5.1214574898785425, + "grad_norm": 3.5424496240381282, + "learning_rate": 5.6657642647499545e-06, + "loss": 1.4005, + "step": 1265 + }, + { + "epoch": 5.125506072874494, + "grad_norm": 3.714267182183359, + "learning_rate": 5.658760411999115e-06, + "loss": 1.4047, + "step": 1266 + }, + { + "epoch": 5.129554655870446, + 
"grad_norm": 4.1352520308511425, + "learning_rate": 5.6517552435744325e-06, + "loss": 1.3041, + "step": 1267 + }, + { + "epoch": 5.133603238866397, + "grad_norm": 3.1992855070868185, + "learning_rate": 5.644748773466606e-06, + "loss": 1.6559, + "step": 1268 + }, + { + "epoch": 5.137651821862348, + "grad_norm": 3.852499540993822, + "learning_rate": 5.637741015668929e-06, + "loss": 1.4822, + "step": 1269 + }, + { + "epoch": 5.1417004048583, + "grad_norm": 3.0057363516680513, + "learning_rate": 5.630731984177269e-06, + "loss": 1.2246, + "step": 1270 + }, + { + "epoch": 5.145748987854251, + "grad_norm": 3.8748912975587544, + "learning_rate": 5.62372169299004e-06, + "loss": 1.5924, + "step": 1271 + }, + { + "epoch": 5.149797570850202, + "grad_norm": 3.5771984578664875, + "learning_rate": 5.616710156108167e-06, + "loss": 1.4133, + "step": 1272 + }, + { + "epoch": 5.153846153846154, + "grad_norm": 3.2086974588686576, + "learning_rate": 5.609697387535068e-06, + "loss": 1.621, + "step": 1273 + }, + { + "epoch": 5.157894736842105, + "grad_norm": 3.984819835501151, + "learning_rate": 5.6026834012766155e-06, + "loss": 1.7158, + "step": 1274 + }, + { + "epoch": 5.161943319838056, + "grad_norm": 3.2013860532982337, + "learning_rate": 5.5956682113411184e-06, + "loss": 1.4746, + "step": 1275 + }, + { + "epoch": 5.165991902834008, + "grad_norm": 3.450642934981606, + "learning_rate": 5.588651831739289e-06, + "loss": 1.5543, + "step": 1276 + }, + { + "epoch": 5.17004048582996, + "grad_norm": 3.093776549631426, + "learning_rate": 5.581634276484211e-06, + "loss": 2.074, + "step": 1277 + }, + { + "epoch": 5.174089068825911, + "grad_norm": 3.545758099078526, + "learning_rate": 5.574615559591323e-06, + "loss": 1.3906, + "step": 1278 + }, + { + "epoch": 5.178137651821863, + "grad_norm": 4.14672203994261, + "learning_rate": 5.567595695078379e-06, + "loss": 1.5738, + "step": 1279 + }, + { + "epoch": 5.182186234817814, + "grad_norm": 2.9347838837502294, + "learning_rate": 5.560574696965425e-06, + "loss": 1.3815, + "step": 1280 + }, + { + "epoch": 5.186234817813765, + "grad_norm": 3.90774860265149, + "learning_rate": 5.553552579274775e-06, + "loss": 1.5673, + "step": 1281 + }, + { + "epoch": 5.190283400809717, + "grad_norm": 3.578616704951525, + "learning_rate": 5.546529356030974e-06, + "loss": 1.5733, + "step": 1282 + }, + { + "epoch": 5.194331983805668, + "grad_norm": 4.0010401720998185, + "learning_rate": 5.539505041260779e-06, + "loss": 1.757, + "step": 1283 + }, + { + "epoch": 5.198380566801619, + "grad_norm": 3.509112575984563, + "learning_rate": 5.532479648993122e-06, + "loss": 1.8081, + "step": 1284 + }, + { + "epoch": 5.202429149797571, + "grad_norm": 3.5347317901565556, + "learning_rate": 5.525453193259094e-06, + "loss": 1.5116, + "step": 1285 + }, + { + "epoch": 5.206477732793522, + "grad_norm": 3.4675375372116184, + "learning_rate": 5.518425688091906e-06, + "loss": 1.8506, + "step": 1286 + }, + { + "epoch": 5.2105263157894735, + "grad_norm": 3.6323230014040306, + "learning_rate": 5.511397147526862e-06, + "loss": 1.8682, + "step": 1287 + }, + { + "epoch": 5.2145748987854255, + "grad_norm": 3.5536336190454048, + "learning_rate": 5.504367585601342e-06, + "loss": 1.6388, + "step": 1288 + }, + { + "epoch": 5.218623481781377, + "grad_norm": 3.6273876631462905, + "learning_rate": 5.497337016354757e-06, + "loss": 1.5266, + "step": 1289 + }, + { + "epoch": 5.222672064777328, + "grad_norm": 3.605955542328613, + "learning_rate": 5.490305453828534e-06, + "loss": 1.4274, + "step": 1290 + }, + { + "epoch": 
5.22672064777328, + "grad_norm": 3.594834856645006, + "learning_rate": 5.483272912066084e-06, + "loss": 1.6117, + "step": 1291 + }, + { + "epoch": 5.230769230769231, + "grad_norm": 3.6817183177194295, + "learning_rate": 5.476239405112775e-06, + "loss": 1.4265, + "step": 1292 + }, + { + "epoch": 5.234817813765182, + "grad_norm": 4.022022675891982, + "learning_rate": 5.469204947015897e-06, + "loss": 1.668, + "step": 1293 + }, + { + "epoch": 5.238866396761134, + "grad_norm": 3.889168025126557, + "learning_rate": 5.462169551824648e-06, + "loss": 1.6076, + "step": 1294 + }, + { + "epoch": 5.242914979757085, + "grad_norm": 3.6700082316334273, + "learning_rate": 5.45513323359009e-06, + "loss": 1.6171, + "step": 1295 + }, + { + "epoch": 5.246963562753036, + "grad_norm": 3.6748741609947855, + "learning_rate": 5.448096006365132e-06, + "loss": 1.4488, + "step": 1296 + }, + { + "epoch": 5.251012145748988, + "grad_norm": 3.6290737200114993, + "learning_rate": 5.4410578842045e-06, + "loss": 1.5478, + "step": 1297 + }, + { + "epoch": 5.255060728744939, + "grad_norm": 3.8478048256636357, + "learning_rate": 5.434018881164702e-06, + "loss": 1.523, + "step": 1298 + }, + { + "epoch": 5.2591093117408905, + "grad_norm": 3.312410066611835, + "learning_rate": 5.426979011304012e-06, + "loss": 1.4463, + "step": 1299 + }, + { + "epoch": 5.2631578947368425, + "grad_norm": 3.647621711678499, + "learning_rate": 5.41993828868243e-06, + "loss": 1.2639, + "step": 1300 + }, + { + "epoch": 5.267206477732794, + "grad_norm": 3.5536727878739205, + "learning_rate": 5.412896727361663e-06, + "loss": 1.5401, + "step": 1301 + }, + { + "epoch": 5.271255060728745, + "grad_norm": 3.539451611896165, + "learning_rate": 5.405854341405088e-06, + "loss": 1.5594, + "step": 1302 + }, + { + "epoch": 5.275303643724697, + "grad_norm": 3.4030202032336394, + "learning_rate": 5.398811144877733e-06, + "loss": 1.5997, + "step": 1303 + }, + { + "epoch": 5.279352226720648, + "grad_norm": 4.605755727643003, + "learning_rate": 5.391767151846247e-06, + "loss": 1.5551, + "step": 1304 + }, + { + "epoch": 5.283400809716599, + "grad_norm": 4.210060420659593, + "learning_rate": 5.384722376378861e-06, + "loss": 1.2388, + "step": 1305 + }, + { + "epoch": 5.287449392712551, + "grad_norm": 4.288644203676987, + "learning_rate": 5.377676832545377e-06, + "loss": 1.3926, + "step": 1306 + }, + { + "epoch": 5.291497975708502, + "grad_norm": 4.344641505323721, + "learning_rate": 5.370630534417133e-06, + "loss": 1.2335, + "step": 1307 + }, + { + "epoch": 5.295546558704453, + "grad_norm": 4.293842456125265, + "learning_rate": 5.363583496066963e-06, + "loss": 1.5097, + "step": 1308 + }, + { + "epoch": 5.299595141700405, + "grad_norm": 3.5889617840380956, + "learning_rate": 5.356535731569189e-06, + "loss": 1.6798, + "step": 1309 + }, + { + "epoch": 5.303643724696356, + "grad_norm": 3.8949744261018844, + "learning_rate": 5.349487254999579e-06, + "loss": 1.3501, + "step": 1310 + }, + { + "epoch": 5.3076923076923075, + "grad_norm": 3.8938141628185394, + "learning_rate": 5.342438080435325e-06, + "loss": 1.3823, + "step": 1311 + }, + { + "epoch": 5.3117408906882595, + "grad_norm": 3.7811284620632146, + "learning_rate": 5.335388221955012e-06, + "loss": 1.4001, + "step": 1312 + }, + { + "epoch": 5.315789473684211, + "grad_norm": 4.504485300390198, + "learning_rate": 5.328337693638591e-06, + "loss": 1.3433, + "step": 1313 + }, + { + "epoch": 5.319838056680162, + "grad_norm": 3.9863561932252, + "learning_rate": 5.321286509567351e-06, + "loss": 1.2701, + "step": 1314 + }, + { + 
"epoch": 5.323886639676114, + "grad_norm": 4.103946070839009, + "learning_rate": 5.314234683823892e-06, + "loss": 1.2979, + "step": 1315 + }, + { + "epoch": 5.327935222672065, + "grad_norm": 3.9048810862002896, + "learning_rate": 5.307182230492089e-06, + "loss": 1.3284, + "step": 1316 + }, + { + "epoch": 5.331983805668016, + "grad_norm": 3.802962634621348, + "learning_rate": 5.300129163657081e-06, + "loss": 1.3376, + "step": 1317 + }, + { + "epoch": 5.336032388663968, + "grad_norm": 3.6151941699291696, + "learning_rate": 5.2930754974052245e-06, + "loss": 1.3976, + "step": 1318 + }, + { + "epoch": 5.340080971659919, + "grad_norm": 3.4851660754400124, + "learning_rate": 5.286021245824075e-06, + "loss": 1.3431, + "step": 1319 + }, + { + "epoch": 5.34412955465587, + "grad_norm": 3.7167755157754008, + "learning_rate": 5.2789664230023595e-06, + "loss": 1.295, + "step": 1320 + }, + { + "epoch": 5.348178137651822, + "grad_norm": 4.41974802384744, + "learning_rate": 5.2719110430299416e-06, + "loss": 1.4491, + "step": 1321 + }, + { + "epoch": 5.352226720647773, + "grad_norm": 4.277030621050548, + "learning_rate": 5.264855119997803e-06, + "loss": 1.4354, + "step": 1322 + }, + { + "epoch": 5.3562753036437245, + "grad_norm": 4.194929698692418, + "learning_rate": 5.257798667998003e-06, + "loss": 1.0844, + "step": 1323 + }, + { + "epoch": 5.3603238866396765, + "grad_norm": 4.472113694740598, + "learning_rate": 5.2507417011236625e-06, + "loss": 1.4929, + "step": 1324 + }, + { + "epoch": 5.364372469635628, + "grad_norm": 3.9849001434928866, + "learning_rate": 5.243684233468933e-06, + "loss": 1.5648, + "step": 1325 + }, + { + "epoch": 5.368421052631579, + "grad_norm": 3.864302824850682, + "learning_rate": 5.236626279128958e-06, + "loss": 1.473, + "step": 1326 + }, + { + "epoch": 5.372469635627531, + "grad_norm": 4.810968253503194, + "learning_rate": 5.22956785219986e-06, + "loss": 1.5456, + "step": 1327 + }, + { + "epoch": 5.376518218623482, + "grad_norm": 4.111208820335583, + "learning_rate": 5.222508966778702e-06, + "loss": 1.2098, + "step": 1328 + }, + { + "epoch": 5.380566801619433, + "grad_norm": 4.534807999665865, + "learning_rate": 5.2154496369634645e-06, + "loss": 1.363, + "step": 1329 + }, + { + "epoch": 5.384615384615385, + "grad_norm": 4.048755500092308, + "learning_rate": 5.208389876853014e-06, + "loss": 1.1592, + "step": 1330 + }, + { + "epoch": 5.388663967611336, + "grad_norm": 4.75370785314969, + "learning_rate": 5.201329700547077e-06, + "loss": 1.226, + "step": 1331 + }, + { + "epoch": 5.392712550607287, + "grad_norm": 4.367024994722068, + "learning_rate": 5.194269122146211e-06, + "loss": 1.4048, + "step": 1332 + }, + { + "epoch": 5.396761133603239, + "grad_norm": 4.918852915006795, + "learning_rate": 5.187208155751779e-06, + "loss": 1.2387, + "step": 1333 + }, + { + "epoch": 5.40080971659919, + "grad_norm": 3.6289200014371894, + "learning_rate": 5.180146815465915e-06, + "loss": 1.2571, + "step": 1334 + }, + { + "epoch": 5.4048582995951415, + "grad_norm": 3.7443218122005266, + "learning_rate": 5.173085115391502e-06, + "loss": 1.3062, + "step": 1335 + }, + { + "epoch": 5.4089068825910935, + "grad_norm": 4.7017026873802426, + "learning_rate": 5.16602306963214e-06, + "loss": 1.4154, + "step": 1336 + }, + { + "epoch": 5.412955465587045, + "grad_norm": 4.150505086067103, + "learning_rate": 5.158960692292122e-06, + "loss": 1.2259, + "step": 1337 + }, + { + "epoch": 5.417004048582996, + "grad_norm": 4.482184582986182, + "learning_rate": 5.151897997476403e-06, + "loss": 1.5583, + "step": 1338 + }, + 
{ + "epoch": 5.421052631578947, + "grad_norm": 4.682227327595727, + "learning_rate": 5.144834999290567e-06, + "loss": 1.598, + "step": 1339 + }, + { + "epoch": 5.425101214574899, + "grad_norm": 4.008926002575055, + "learning_rate": 5.137771711840811e-06, + "loss": 1.5379, + "step": 1340 + }, + { + "epoch": 5.42914979757085, + "grad_norm": 4.302820633137393, + "learning_rate": 5.130708149233905e-06, + "loss": 1.5569, + "step": 1341 + }, + { + "epoch": 5.433198380566802, + "grad_norm": 3.5969352441824007, + "learning_rate": 5.123644325577168e-06, + "loss": 1.7237, + "step": 1342 + }, + { + "epoch": 5.437246963562753, + "grad_norm": 4.1865532032949035, + "learning_rate": 5.116580254978447e-06, + "loss": 1.4932, + "step": 1343 + }, + { + "epoch": 5.441295546558704, + "grad_norm": 4.443537220527738, + "learning_rate": 5.1095159515460736e-06, + "loss": 1.4349, + "step": 1344 + }, + { + "epoch": 5.445344129554655, + "grad_norm": 3.8400638359623653, + "learning_rate": 5.10245142938885e-06, + "loss": 1.6808, + "step": 1345 + }, + { + "epoch": 5.449392712550607, + "grad_norm": 4.456713357432363, + "learning_rate": 5.095386702616012e-06, + "loss": 1.4753, + "step": 1346 + }, + { + "epoch": 5.4534412955465585, + "grad_norm": 4.371248488578587, + "learning_rate": 5.088321785337207e-06, + "loss": 1.4634, + "step": 1347 + }, + { + "epoch": 5.4574898785425106, + "grad_norm": 4.503939177016205, + "learning_rate": 5.0812566916624624e-06, + "loss": 1.8175, + "step": 1348 + }, + { + "epoch": 5.461538461538462, + "grad_norm": 5.8661687643019444, + "learning_rate": 5.074191435702155e-06, + "loss": 1.9684, + "step": 1349 + }, + { + "epoch": 5.465587044534413, + "grad_norm": 4.324067092257868, + "learning_rate": 5.067126031566988e-06, + "loss": 1.6405, + "step": 1350 + }, + { + "epoch": 5.469635627530364, + "grad_norm": 3.796039870689883, + "learning_rate": 5.060060493367961e-06, + "loss": 1.6486, + "step": 1351 + }, + { + "epoch": 5.473684210526316, + "grad_norm": 3.738600525398421, + "learning_rate": 5.05299483521634e-06, + "loss": 1.5872, + "step": 1352 + }, + { + "epoch": 5.477732793522267, + "grad_norm": 4.6006758703016, + "learning_rate": 5.045929071223633e-06, + "loss": 1.5976, + "step": 1353 + }, + { + "epoch": 5.481781376518219, + "grad_norm": 3.3463637296184854, + "learning_rate": 5.038863215501555e-06, + "loss": 1.5156, + "step": 1354 + }, + { + "epoch": 5.48582995951417, + "grad_norm": 3.8425032487043813, + "learning_rate": 5.031797282162007e-06, + "loss": 1.4631, + "step": 1355 + }, + { + "epoch": 5.489878542510121, + "grad_norm": 4.548619092337232, + "learning_rate": 5.024731285317046e-06, + "loss": 1.3972, + "step": 1356 + }, + { + "epoch": 5.493927125506072, + "grad_norm": 4.814717659012562, + "learning_rate": 5.017665239078854e-06, + "loss": 1.4267, + "step": 1357 + }, + { + "epoch": 5.497975708502024, + "grad_norm": 3.6552584947768096, + "learning_rate": 5.010599157559713e-06, + "loss": 1.2966, + "step": 1358 + }, + { + "epoch": 5.502024291497976, + "grad_norm": 4.204585823006649, + "learning_rate": 5.003533054871973e-06, + "loss": 1.15, + "step": 1359 + }, + { + "epoch": 5.506072874493928, + "grad_norm": 4.634653281785678, + "learning_rate": 4.996466945128029e-06, + "loss": 1.5181, + "step": 1360 + }, + { + "epoch": 5.510121457489879, + "grad_norm": 4.3188079424314, + "learning_rate": 4.98940084244029e-06, + "loss": 1.4787, + "step": 1361 + }, + { + "epoch": 5.51417004048583, + "grad_norm": 3.332377152961891, + "learning_rate": 4.982334760921149e-06, + "loss": 1.4434, + "step": 1362 + }, + { + 
"epoch": 5.518218623481781, + "grad_norm": 4.271374565670683, + "learning_rate": 4.975268714682956e-06, + "loss": 1.4766, + "step": 1363 + }, + { + "epoch": 5.522267206477733, + "grad_norm": 4.388046491535482, + "learning_rate": 4.968202717837996e-06, + "loss": 1.4244, + "step": 1364 + }, + { + "epoch": 5.526315789473684, + "grad_norm": 4.81529396324836, + "learning_rate": 4.961136784498448e-06, + "loss": 1.2532, + "step": 1365 + }, + { + "epoch": 5.530364372469636, + "grad_norm": 4.589391225576633, + "learning_rate": 4.9540709287763685e-06, + "loss": 1.3152, + "step": 1366 + }, + { + "epoch": 5.534412955465587, + "grad_norm": 5.101062956149816, + "learning_rate": 4.947005164783661e-06, + "loss": 1.409, + "step": 1367 + }, + { + "epoch": 5.538461538461538, + "grad_norm": 4.286443288173012, + "learning_rate": 4.939939506632041e-06, + "loss": 1.6652, + "step": 1368 + }, + { + "epoch": 5.5425101214574894, + "grad_norm": 3.857994197551904, + "learning_rate": 4.932873968433014e-06, + "loss": 1.5821, + "step": 1369 + }, + { + "epoch": 5.5465587044534415, + "grad_norm": 82.8177825176114, + "learning_rate": 4.925808564297847e-06, + "loss": 2.0481, + "step": 1370 + }, + { + "epoch": 5.550607287449393, + "grad_norm": 8.294269069115597, + "learning_rate": 4.918743308337539e-06, + "loss": 1.9382, + "step": 1371 + }, + { + "epoch": 5.554655870445345, + "grad_norm": 8.675625865701205, + "learning_rate": 4.911678214662795e-06, + "loss": 2.2234, + "step": 1372 + }, + { + "epoch": 5.558704453441296, + "grad_norm": 3.9912695390847595, + "learning_rate": 4.9046132973839895e-06, + "loss": 1.4514, + "step": 1373 + }, + { + "epoch": 5.562753036437247, + "grad_norm": 3.603893380101875, + "learning_rate": 4.897548570611153e-06, + "loss": 1.3266, + "step": 1374 + }, + { + "epoch": 5.566801619433198, + "grad_norm": 3.6938504736682054, + "learning_rate": 4.890484048453928e-06, + "loss": 1.704, + "step": 1375 + }, + { + "epoch": 5.57085020242915, + "grad_norm": 4.1771900748802135, + "learning_rate": 4.883419745021554e-06, + "loss": 1.3432, + "step": 1376 + }, + { + "epoch": 5.574898785425101, + "grad_norm": 4.029068029464602, + "learning_rate": 4.8763556744228324e-06, + "loss": 1.5548, + "step": 1377 + }, + { + "epoch": 5.578947368421053, + "grad_norm": 3.1723858445451776, + "learning_rate": 4.869291850766097e-06, + "loss": 1.3556, + "step": 1378 + }, + { + "epoch": 5.582995951417004, + "grad_norm": 3.9383901181787118, + "learning_rate": 4.862228288159191e-06, + "loss": 1.4828, + "step": 1379 + }, + { + "epoch": 5.587044534412955, + "grad_norm": 3.8742071296776883, + "learning_rate": 4.855165000709434e-06, + "loss": 1.4776, + "step": 1380 + }, + { + "epoch": 5.5910931174089065, + "grad_norm": 4.320505162169018, + "learning_rate": 4.848102002523597e-06, + "loss": 1.4632, + "step": 1381 + }, + { + "epoch": 5.5951417004048585, + "grad_norm": 3.8728016571115496, + "learning_rate": 4.841039307707878e-06, + "loss": 1.1957, + "step": 1382 + }, + { + "epoch": 5.59919028340081, + "grad_norm": 3.492753062395854, + "learning_rate": 4.833976930367859e-06, + "loss": 1.2615, + "step": 1383 + }, + { + "epoch": 5.603238866396762, + "grad_norm": 3.5488104026542513, + "learning_rate": 4.8269148846085e-06, + "loss": 1.3531, + "step": 1384 + }, + { + "epoch": 5.607287449392713, + "grad_norm": 4.068763646311401, + "learning_rate": 4.819853184534085e-06, + "loss": 1.1753, + "step": 1385 + }, + { + "epoch": 5.611336032388664, + "grad_norm": 4.377905274086795, + "learning_rate": 4.812791844248223e-06, + "loss": 1.4958, + "step": 1386 + }, + 
{ + "epoch": 5.615384615384615, + "grad_norm": 3.6007003800569386, + "learning_rate": 4.80573087785379e-06, + "loss": 1.4974, + "step": 1387 + }, + { + "epoch": 5.619433198380567, + "grad_norm": 4.802311568406072, + "learning_rate": 4.798670299452926e-06, + "loss": 1.2282, + "step": 1388 + }, + { + "epoch": 5.623481781376518, + "grad_norm": 4.7745139328350135, + "learning_rate": 4.7916101231469886e-06, + "loss": 1.6082, + "step": 1389 + }, + { + "epoch": 5.62753036437247, + "grad_norm": 4.123643145041474, + "learning_rate": 4.784550363036539e-06, + "loss": 1.4134, + "step": 1390 + }, + { + "epoch": 5.631578947368421, + "grad_norm": 4.402507798104486, + "learning_rate": 4.7774910332213005e-06, + "loss": 1.6983, + "step": 1391 + }, + { + "epoch": 5.635627530364372, + "grad_norm": 3.8264895380697355, + "learning_rate": 4.770432147800141e-06, + "loss": 1.2975, + "step": 1392 + }, + { + "epoch": 5.6396761133603235, + "grad_norm": 4.517127158006528, + "learning_rate": 4.763373720871044e-06, + "loss": 1.5541, + "step": 1393 + }, + { + "epoch": 5.6437246963562755, + "grad_norm": 3.773516174749104, + "learning_rate": 4.756315766531069e-06, + "loss": 1.4461, + "step": 1394 + }, + { + "epoch": 5.647773279352227, + "grad_norm": 4.115306809751942, + "learning_rate": 4.749258298876338e-06, + "loss": 1.5498, + "step": 1395 + }, + { + "epoch": 5.651821862348179, + "grad_norm": 3.6874924730709413, + "learning_rate": 4.742201332001998e-06, + "loss": 1.333, + "step": 1396 + }, + { + "epoch": 5.65587044534413, + "grad_norm": 4.445009061040838, + "learning_rate": 4.735144880002199e-06, + "loss": 1.556, + "step": 1397 + }, + { + "epoch": 5.659919028340081, + "grad_norm": 4.819457563644938, + "learning_rate": 4.728088956970059e-06, + "loss": 1.3788, + "step": 1398 + }, + { + "epoch": 5.663967611336032, + "grad_norm": 3.9520027905188275, + "learning_rate": 4.721033576997641e-06, + "loss": 1.5347, + "step": 1399 + }, + { + "epoch": 5.668016194331984, + "grad_norm": 4.124422632263573, + "learning_rate": 4.713978754175926e-06, + "loss": 1.5292, + "step": 1400 + }, + { + "epoch": 5.672064777327935, + "grad_norm": 4.475410908220464, + "learning_rate": 4.706924502594777e-06, + "loss": 1.6549, + "step": 1401 + }, + { + "epoch": 5.676113360323887, + "grad_norm": 9.027913146446028, + "learning_rate": 4.69987083634292e-06, + "loss": 1.5814, + "step": 1402 + }, + { + "epoch": 5.680161943319838, + "grad_norm": 4.584849302385236, + "learning_rate": 4.692817769507912e-06, + "loss": 1.4982, + "step": 1403 + }, + { + "epoch": 5.684210526315789, + "grad_norm": 4.088441988479735, + "learning_rate": 4.685765316176111e-06, + "loss": 1.3453, + "step": 1404 + }, + { + "epoch": 5.6882591093117405, + "grad_norm": 3.94840157844417, + "learning_rate": 4.67871349043265e-06, + "loss": 1.4717, + "step": 1405 + }, + { + "epoch": 5.6923076923076925, + "grad_norm": 4.252654676588602, + "learning_rate": 4.671662306361409e-06, + "loss": 1.4891, + "step": 1406 + }, + { + "epoch": 5.696356275303644, + "grad_norm": 3.784433251453805, + "learning_rate": 4.664611778044988e-06, + "loss": 1.3408, + "step": 1407 + }, + { + "epoch": 5.700404858299595, + "grad_norm": 4.988371722598511, + "learning_rate": 4.657561919564675e-06, + "loss": 1.8095, + "step": 1408 + }, + { + "epoch": 5.704453441295547, + "grad_norm": 4.664322457086443, + "learning_rate": 4.6505127450004216e-06, + "loss": 1.6024, + "step": 1409 + }, + { + "epoch": 5.708502024291498, + "grad_norm": 4.600715197938257, + "learning_rate": 4.643464268430812e-06, + "loss": 1.2021, + "step": 1410 + }, 
+ { + "epoch": 5.712550607287449, + "grad_norm": 3.9099782560794503, + "learning_rate": 4.636416503933038e-06, + "loss": 1.3472, + "step": 1411 + }, + { + "epoch": 5.716599190283401, + "grad_norm": 3.9111543599245757, + "learning_rate": 4.62936946558287e-06, + "loss": 1.4523, + "step": 1412 + }, + { + "epoch": 5.720647773279352, + "grad_norm": 4.6487019160659, + "learning_rate": 4.622323167454623e-06, + "loss": 1.2302, + "step": 1413 + }, + { + "epoch": 5.724696356275303, + "grad_norm": 4.4548900152472815, + "learning_rate": 4.6152776236211415e-06, + "loss": 1.4256, + "step": 1414 + }, + { + "epoch": 5.728744939271255, + "grad_norm": 4.058092491633072, + "learning_rate": 4.608232848153757e-06, + "loss": 1.6055, + "step": 1415 + }, + { + "epoch": 5.732793522267206, + "grad_norm": 4.025502584936106, + "learning_rate": 4.601188855122269e-06, + "loss": 1.3484, + "step": 1416 + }, + { + "epoch": 5.7368421052631575, + "grad_norm": 4.1244592308665275, + "learning_rate": 4.594145658594914e-06, + "loss": 1.4537, + "step": 1417 + }, + { + "epoch": 5.7408906882591095, + "grad_norm": 4.167306098888644, + "learning_rate": 4.587103272638339e-06, + "loss": 2.0785, + "step": 1418 + }, + { + "epoch": 5.744939271255061, + "grad_norm": 3.858307172453616, + "learning_rate": 4.580061711317571e-06, + "loss": 1.5669, + "step": 1419 + }, + { + "epoch": 5.748987854251012, + "grad_norm": 4.76966444820156, + "learning_rate": 4.57302098869599e-06, + "loss": 1.3901, + "step": 1420 + }, + { + "epoch": 5.753036437246964, + "grad_norm": 4.3778097624694166, + "learning_rate": 4.565981118835299e-06, + "loss": 1.291, + "step": 1421 + }, + { + "epoch": 5.757085020242915, + "grad_norm": 4.090411706131635, + "learning_rate": 4.558942115795502e-06, + "loss": 1.4406, + "step": 1422 + }, + { + "epoch": 5.761133603238866, + "grad_norm": 5.337161250566187, + "learning_rate": 4.551903993634869e-06, + "loss": 2.1851, + "step": 1423 + }, + { + "epoch": 5.765182186234818, + "grad_norm": 6.286779559937267, + "learning_rate": 4.5448667664099125e-06, + "loss": 1.9602, + "step": 1424 + }, + { + "epoch": 5.769230769230769, + "grad_norm": 6.765386541677961, + "learning_rate": 4.537830448175354e-06, + "loss": 1.8644, + "step": 1425 + }, + { + "epoch": 5.77327935222672, + "grad_norm": 4.009998051124011, + "learning_rate": 4.530795052984104e-06, + "loss": 1.3677, + "step": 1426 + }, + { + "epoch": 5.777327935222672, + "grad_norm": 4.067144464386327, + "learning_rate": 4.523760594887228e-06, + "loss": 1.6488, + "step": 1427 + }, + { + "epoch": 5.781376518218623, + "grad_norm": 3.900176884022236, + "learning_rate": 4.5167270879339165e-06, + "loss": 1.6378, + "step": 1428 + }, + { + "epoch": 5.7854251012145745, + "grad_norm": 4.307053870196715, + "learning_rate": 4.509694546171468e-06, + "loss": 1.458, + "step": 1429 + }, + { + "epoch": 5.7894736842105265, + "grad_norm": 4.202185719713703, + "learning_rate": 4.5026629836452445e-06, + "loss": 1.3863, + "step": 1430 + }, + { + "epoch": 5.793522267206478, + "grad_norm": 4.276979157413732, + "learning_rate": 4.495632414398659e-06, + "loss": 1.4133, + "step": 1431 + }, + { + "epoch": 5.797570850202429, + "grad_norm": 4.560387387278901, + "learning_rate": 4.488602852473138e-06, + "loss": 1.4313, + "step": 1432 + }, + { + "epoch": 5.801619433198381, + "grad_norm": 3.900998231009241, + "learning_rate": 4.481574311908096e-06, + "loss": 1.3065, + "step": 1433 + }, + { + "epoch": 5.805668016194332, + "grad_norm": 3.971785106076469, + "learning_rate": 4.4745468067409055e-06, + "loss": 1.1997, + "step": 1434 + 
}, + { + "epoch": 5.809716599190283, + "grad_norm": 4.230506562739517, + "learning_rate": 4.467520351006878e-06, + "loss": 1.5584, + "step": 1435 + }, + { + "epoch": 5.813765182186235, + "grad_norm": 5.12301466025395, + "learning_rate": 4.460494958739223e-06, + "loss": 1.4086, + "step": 1436 + }, + { + "epoch": 5.817813765182186, + "grad_norm": 4.360480527706543, + "learning_rate": 4.453470643969027e-06, + "loss": 1.2759, + "step": 1437 + }, + { + "epoch": 5.821862348178137, + "grad_norm": 11.774868013423882, + "learning_rate": 4.446447420725227e-06, + "loss": 2.2866, + "step": 1438 + }, + { + "epoch": 5.825910931174089, + "grad_norm": 23.795049320685568, + "learning_rate": 4.439425303034576e-06, + "loss": 3.4094, + "step": 1439 + }, + { + "epoch": 5.82995951417004, + "grad_norm": 4.607383270222987, + "learning_rate": 4.432404304921624e-06, + "loss": 1.3129, + "step": 1440 + }, + { + "epoch": 5.834008097165992, + "grad_norm": 4.67077067966415, + "learning_rate": 4.4253844404086785e-06, + "loss": 1.2285, + "step": 1441 + }, + { + "epoch": 5.838056680161944, + "grad_norm": 3.9312338569636394, + "learning_rate": 4.418365723515791e-06, + "loss": 1.286, + "step": 1442 + }, + { + "epoch": 5.842105263157895, + "grad_norm": 4.003272377775398, + "learning_rate": 4.411348168260713e-06, + "loss": 1.3394, + "step": 1443 + }, + { + "epoch": 5.846153846153846, + "grad_norm": 4.140441268173913, + "learning_rate": 4.404331788658882e-06, + "loss": 1.1712, + "step": 1444 + }, + { + "epoch": 5.850202429149798, + "grad_norm": 4.57761440040013, + "learning_rate": 4.397316598723385e-06, + "loss": 1.3548, + "step": 1445 + }, + { + "epoch": 5.854251012145749, + "grad_norm": 4.860966996025116, + "learning_rate": 4.390302612464934e-06, + "loss": 1.4071, + "step": 1446 + }, + { + "epoch": 5.8582995951417, + "grad_norm": 3.557234324926702, + "learning_rate": 4.383289843891835e-06, + "loss": 1.3334, + "step": 1447 + }, + { + "epoch": 5.862348178137652, + "grad_norm": 4.6167043083990515, + "learning_rate": 4.376278307009962e-06, + "loss": 1.332, + "step": 1448 + }, + { + "epoch": 5.866396761133603, + "grad_norm": 4.529476800833651, + "learning_rate": 4.369268015822733e-06, + "loss": 1.336, + "step": 1449 + }, + { + "epoch": 5.870445344129554, + "grad_norm": 5.460345634297291, + "learning_rate": 4.362258984331074e-06, + "loss": 1.7992, + "step": 1450 + }, + { + "epoch": 5.874493927125506, + "grad_norm": 4.852544977047948, + "learning_rate": 4.355251226533396e-06, + "loss": 1.7401, + "step": 1451 + }, + { + "epoch": 5.8785425101214575, + "grad_norm": 5.091561572959863, + "learning_rate": 4.348244756425569e-06, + "loss": 1.4945, + "step": 1452 + }, + { + "epoch": 5.882591093117409, + "grad_norm": 4.66519342749034, + "learning_rate": 4.341239588000887e-06, + "loss": 1.4193, + "step": 1453 + }, + { + "epoch": 5.886639676113361, + "grad_norm": 4.442060928034546, + "learning_rate": 4.334235735250047e-06, + "loss": 1.0274, + "step": 1454 + }, + { + "epoch": 5.890688259109312, + "grad_norm": 3.911256400148853, + "learning_rate": 4.327233212161118e-06, + "loss": 1.5401, + "step": 1455 + }, + { + "epoch": 5.894736842105263, + "grad_norm": 3.8807011184816846, + "learning_rate": 4.320232032719511e-06, + "loss": 1.5831, + "step": 1456 + }, + { + "epoch": 5.898785425101215, + "grad_norm": 3.58685678874274, + "learning_rate": 4.313232210907959e-06, + "loss": 1.3268, + "step": 1457 + }, + { + "epoch": 5.902834008097166, + "grad_norm": 4.318238652473736, + "learning_rate": 4.306233760706478e-06, + "loss": 1.3389, + "step": 1458 + }, + { 
+ "epoch": 5.906882591093117, + "grad_norm": 4.611379978717958, + "learning_rate": 4.299236696092347e-06, + "loss": 1.4306, + "step": 1459 + }, + { + "epoch": 5.910931174089069, + "grad_norm": 3.900073554354451, + "learning_rate": 4.292241031040077e-06, + "loss": 1.1163, + "step": 1460 + }, + { + "epoch": 5.91497975708502, + "grad_norm": 4.550673982692945, + "learning_rate": 4.285246779521384e-06, + "loss": 1.2052, + "step": 1461 + }, + { + "epoch": 5.919028340080971, + "grad_norm": 4.574548958146505, + "learning_rate": 4.278253955505163e-06, + "loss": 1.213, + "step": 1462 + }, + { + "epoch": 5.923076923076923, + "grad_norm": 3.5603964829525725, + "learning_rate": 4.271262572957453e-06, + "loss": 1.5401, + "step": 1463 + }, + { + "epoch": 5.9271255060728745, + "grad_norm": 4.899646738920418, + "learning_rate": 4.264272645841419e-06, + "loss": 1.3832, + "step": 1464 + }, + { + "epoch": 5.931174089068826, + "grad_norm": 4.936217075017478, + "learning_rate": 4.2572841881173205e-06, + "loss": 1.3896, + "step": 1465 + }, + { + "epoch": 5.935222672064778, + "grad_norm": 4.841906645627207, + "learning_rate": 4.250297213742473e-06, + "loss": 1.173, + "step": 1466 + }, + { + "epoch": 5.939271255060729, + "grad_norm": 4.652957613099752, + "learning_rate": 4.243311736671239e-06, + "loss": 1.1544, + "step": 1467 + }, + { + "epoch": 5.94331983805668, + "grad_norm": 5.5395351930289864, + "learning_rate": 4.236327770854987e-06, + "loss": 1.4593, + "step": 1468 + }, + { + "epoch": 5.947368421052632, + "grad_norm": 4.423876597754868, + "learning_rate": 4.229345330242067e-06, + "loss": 1.1935, + "step": 1469 + }, + { + "epoch": 5.951417004048583, + "grad_norm": 5.270192860869612, + "learning_rate": 4.222364428777786e-06, + "loss": 1.1325, + "step": 1470 + }, + { + "epoch": 5.955465587044534, + "grad_norm": 5.410786507887627, + "learning_rate": 4.2153850804043706e-06, + "loss": 1.3971, + "step": 1471 + }, + { + "epoch": 5.959514170040486, + "grad_norm": 4.884826922400209, + "learning_rate": 4.2084072990609505e-06, + "loss": 1.4698, + "step": 1472 + }, + { + "epoch": 5.963562753036437, + "grad_norm": 4.313211329480648, + "learning_rate": 4.201431098683527e-06, + "loss": 1.4382, + "step": 1473 + }, + { + "epoch": 5.967611336032388, + "grad_norm": 5.213303398147368, + "learning_rate": 4.194456493204939e-06, + "loss": 1.5175, + "step": 1474 + }, + { + "epoch": 5.97165991902834, + "grad_norm": 5.448304606946485, + "learning_rate": 4.187483496554844e-06, + "loss": 1.433, + "step": 1475 + }, + { + "epoch": 5.9757085020242915, + "grad_norm": 3.801193566372591, + "learning_rate": 4.1805121226596826e-06, + "loss": 1.4114, + "step": 1476 + }, + { + "epoch": 5.979757085020243, + "grad_norm": 4.17077172984551, + "learning_rate": 4.173542385442659e-06, + "loss": 1.4847, + "step": 1477 + }, + { + "epoch": 5.983805668016195, + "grad_norm": 3.8042786020089285, + "learning_rate": 4.166574298823707e-06, + "loss": 1.5417, + "step": 1478 + }, + { + "epoch": 5.987854251012146, + "grad_norm": 4.0974559638165795, + "learning_rate": 4.1596078767194615e-06, + "loss": 1.3383, + "step": 1479 + }, + { + "epoch": 5.991902834008097, + "grad_norm": 3.4327656830127844, + "learning_rate": 4.152643133043236e-06, + "loss": 1.3384, + "step": 1480 + }, + { + "epoch": 5.995951417004049, + "grad_norm": 3.615327810634163, + "learning_rate": 4.145680081704989e-06, + "loss": 1.6541, + "step": 1481 + }, + { + "epoch": 6.0, + "grad_norm": 3.8329106879075594, + "learning_rate": 4.138718736611302e-06, + "loss": 1.3694, + "step": 1482 + }, + { + "epoch": 
6.004048582995951, + "grad_norm": 3.830450157141594, + "learning_rate": 4.131759111665349e-06, + "loss": 1.4049, + "step": 1483 + }, + { + "epoch": 6.008097165991903, + "grad_norm": 5.1111426342190684, + "learning_rate": 4.1248012207668635e-06, + "loss": 1.5639, + "step": 1484 + }, + { + "epoch": 6.012145748987854, + "grad_norm": 4.83681122900061, + "learning_rate": 4.117845077812122e-06, + "loss": 1.3693, + "step": 1485 + }, + { + "epoch": 6.016194331983805, + "grad_norm": 5.4329470747052255, + "learning_rate": 4.110890696693906e-06, + "loss": 1.5831, + "step": 1486 + }, + { + "epoch": 6.020242914979757, + "grad_norm": 4.6500916905003535, + "learning_rate": 4.103938091301479e-06, + "loss": 1.7881, + "step": 1487 + }, + { + "epoch": 6.0242914979757085, + "grad_norm": 4.885048703930011, + "learning_rate": 4.096987275520562e-06, + "loss": 1.6668, + "step": 1488 + }, + { + "epoch": 6.02834008097166, + "grad_norm": 4.13626291343727, + "learning_rate": 4.090038263233294e-06, + "loss": 1.3587, + "step": 1489 + }, + { + "epoch": 6.032388663967612, + "grad_norm": 4.904165295750069, + "learning_rate": 4.08309106831822e-06, + "loss": 1.3678, + "step": 1490 + }, + { + "epoch": 6.036437246963563, + "grad_norm": 4.636168977638758, + "learning_rate": 4.0761457046502515e-06, + "loss": 1.5829, + "step": 1491 + }, + { + "epoch": 6.040485829959514, + "grad_norm": 4.665143753358694, + "learning_rate": 4.0692021861006386e-06, + "loss": 1.382, + "step": 1492 + }, + { + "epoch": 6.044534412955466, + "grad_norm": 4.58626969694099, + "learning_rate": 4.062260526536955e-06, + "loss": 1.4891, + "step": 1493 + }, + { + "epoch": 6.048582995951417, + "grad_norm": 4.689483058767236, + "learning_rate": 4.055320739823057e-06, + "loss": 1.3764, + "step": 1494 + }, + { + "epoch": 6.052631578947368, + "grad_norm": 5.0699840890954535, + "learning_rate": 4.048382839819058e-06, + "loss": 1.4399, + "step": 1495 + }, + { + "epoch": 6.05668016194332, + "grad_norm": 4.582891853100069, + "learning_rate": 4.041446840381309e-06, + "loss": 1.2964, + "step": 1496 + }, + { + "epoch": 6.060728744939271, + "grad_norm": 4.596209939663152, + "learning_rate": 4.034512755362361e-06, + "loss": 1.4451, + "step": 1497 + }, + { + "epoch": 6.064777327935222, + "grad_norm": 5.077809534848778, + "learning_rate": 4.027580598610943e-06, + "loss": 1.3934, + "step": 1498 + }, + { + "epoch": 6.068825910931174, + "grad_norm": 5.121648526362897, + "learning_rate": 4.0206503839719335e-06, + "loss": 1.5479, + "step": 1499 + }, + { + "epoch": 6.0728744939271255, + "grad_norm": 4.611548299373776, + "learning_rate": 4.01372212528633e-06, + "loss": 1.4704, + "step": 1500 + }, + { + "epoch": 6.076923076923077, + "grad_norm": 5.312277841332635, + "learning_rate": 4.006795836391226e-06, + "loss": 1.4155, + "step": 1501 + }, + { + "epoch": 6.080971659919029, + "grad_norm": 4.964246172799465, + "learning_rate": 3.999871531119779e-06, + "loss": 1.4857, + "step": 1502 + }, + { + "epoch": 6.08502024291498, + "grad_norm": 4.070954622733409, + "learning_rate": 3.992949223301185e-06, + "loss": 1.4726, + "step": 1503 + }, + { + "epoch": 6.089068825910931, + "grad_norm": 4.91594481744365, + "learning_rate": 3.986028926760655e-06, + "loss": 1.4183, + "step": 1504 + }, + { + "epoch": 6.093117408906883, + "grad_norm": 4.691943755517188, + "learning_rate": 3.9791106553193746e-06, + "loss": 1.497, + "step": 1505 + }, + { + "epoch": 6.097165991902834, + "grad_norm": 4.475695489598384, + "learning_rate": 3.972194422794493e-06, + "loss": 1.2572, + "step": 1506 + }, + { + "epoch": 
6.101214574898785, + "grad_norm": 4.947241370368582, + "learning_rate": 3.965280242999083e-06, + "loss": 1.4398, + "step": 1507 + }, + { + "epoch": 6.105263157894737, + "grad_norm": 5.319805507480639, + "learning_rate": 3.9583681297421194e-06, + "loss": 1.3871, + "step": 1508 + }, + { + "epoch": 6.109311740890688, + "grad_norm": 4.749559720069604, + "learning_rate": 3.951458096828449e-06, + "loss": 1.375, + "step": 1509 + }, + { + "epoch": 6.113360323886639, + "grad_norm": 5.727885976477068, + "learning_rate": 3.944550158058762e-06, + "loss": 1.3195, + "step": 1510 + }, + { + "epoch": 6.117408906882591, + "grad_norm": 5.227063382939529, + "learning_rate": 3.937644327229572e-06, + "loss": 1.2256, + "step": 1511 + }, + { + "epoch": 6.1214574898785425, + "grad_norm": 4.738297898420654, + "learning_rate": 3.930740618133173e-06, + "loss": 1.2919, + "step": 1512 + }, + { + "epoch": 6.125506072874494, + "grad_norm": 4.796528713602936, + "learning_rate": 3.923839044557632e-06, + "loss": 1.3028, + "step": 1513 + }, + { + "epoch": 6.129554655870446, + "grad_norm": 5.590663766511934, + "learning_rate": 3.916939620286743e-06, + "loss": 1.1917, + "step": 1514 + }, + { + "epoch": 6.133603238866397, + "grad_norm": 4.16713103068686, + "learning_rate": 3.9100423591000124e-06, + "loss": 1.54, + "step": 1515 + }, + { + "epoch": 6.137651821862348, + "grad_norm": 5.035939317822777, + "learning_rate": 3.903147274772624e-06, + "loss": 1.3571, + "step": 1516 + }, + { + "epoch": 6.1417004048583, + "grad_norm": 4.0009552855543955, + "learning_rate": 3.896254381075416e-06, + "loss": 1.1103, + "step": 1517 + }, + { + "epoch": 6.145748987854251, + "grad_norm": 5.217383616489112, + "learning_rate": 3.8893636917748455e-06, + "loss": 1.4538, + "step": 1518 + }, + { + "epoch": 6.149797570850202, + "grad_norm": 4.709807039436491, + "learning_rate": 3.882475220632975e-06, + "loss": 1.2834, + "step": 1519 + }, + { + "epoch": 6.153846153846154, + "grad_norm": 4.179087376153956, + "learning_rate": 3.875588981407433e-06, + "loss": 1.5023, + "step": 1520 + }, + { + "epoch": 6.157894736842105, + "grad_norm": 5.387448869675948, + "learning_rate": 3.86870498785139e-06, + "loss": 1.5494, + "step": 1521 + }, + { + "epoch": 6.161943319838056, + "grad_norm": 4.138048095732358, + "learning_rate": 3.861823253713535e-06, + "loss": 1.3442, + "step": 1522 + }, + { + "epoch": 6.165991902834008, + "grad_norm": 4.522673016609398, + "learning_rate": 3.854943792738037e-06, + "loss": 1.4306, + "step": 1523 + }, + { + "epoch": 6.17004048582996, + "grad_norm": 4.04807524846957, + "learning_rate": 3.848066618664534e-06, + "loss": 1.9855, + "step": 1524 + }, + { + "epoch": 6.174089068825911, + "grad_norm": 4.797553089745047, + "learning_rate": 3.841191745228091e-06, + "loss": 1.2562, + "step": 1525 + }, + { + "epoch": 6.178137651821863, + "grad_norm": 5.562886515767805, + "learning_rate": 3.834319186159179e-06, + "loss": 1.4532, + "step": 1526 + }, + { + "epoch": 6.182186234817814, + "grad_norm": 3.8582598799938315, + "learning_rate": 3.82744895518365e-06, + "loss": 1.2517, + "step": 1527 + }, + { + "epoch": 6.186234817813765, + "grad_norm": 4.976499846840885, + "learning_rate": 3.8205810660227e-06, + "loss": 1.4395, + "step": 1528 + }, + { + "epoch": 6.190283400809717, + "grad_norm": 5.013759086459238, + "learning_rate": 3.8137155323928526e-06, + "loss": 1.4579, + "step": 1529 + }, + { + "epoch": 6.194331983805668, + "grad_norm": 5.210004353191725, + "learning_rate": 3.8068523680059287e-06, + "loss": 1.6307, + "step": 1530 + }, + { + "epoch": 
6.198380566801619, + "grad_norm": 4.444756027075356, + "learning_rate": 3.799991586569012e-06, + "loss": 1.6785, + "step": 1531 + }, + { + "epoch": 6.202429149797571, + "grad_norm": 4.581599022941181, + "learning_rate": 3.7931332017844302e-06, + "loss": 1.3911, + "step": 1532 + }, + { + "epoch": 6.206477732793522, + "grad_norm": 4.426732929526946, + "learning_rate": 3.786277227349724e-06, + "loss": 1.7226, + "step": 1533 + }, + { + "epoch": 6.2105263157894735, + "grad_norm": 4.573503321332187, + "learning_rate": 3.77942367695762e-06, + "loss": 1.7276, + "step": 1534 + }, + { + "epoch": 6.2145748987854255, + "grad_norm": 4.632474175205992, + "learning_rate": 3.7725725642960047e-06, + "loss": 1.4984, + "step": 1535 + }, + { + "epoch": 6.218623481781377, + "grad_norm": 5.004422527391663, + "learning_rate": 3.7657239030478927e-06, + "loss": 1.3822, + "step": 1536 + }, + { + "epoch": 6.222672064777328, + "grad_norm": 4.730329238431976, + "learning_rate": 3.758877706891407e-06, + "loss": 1.3005, + "step": 1537 + }, + { + "epoch": 6.22672064777328, + "grad_norm": 4.696618081800561, + "learning_rate": 3.752033989499742e-06, + "loss": 1.4995, + "step": 1538 + }, + { + "epoch": 6.230769230769231, + "grad_norm": 4.819216438393582, + "learning_rate": 3.7451927645411466e-06, + "loss": 1.2958, + "step": 1539 + }, + { + "epoch": 6.234817813765182, + "grad_norm": 5.4741629869641315, + "learning_rate": 3.7383540456788915e-06, + "loss": 1.5321, + "step": 1540 + }, + { + "epoch": 6.238866396761134, + "grad_norm": 5.271140694357475, + "learning_rate": 3.7315178465712364e-06, + "loss": 1.4701, + "step": 1541 + }, + { + "epoch": 6.242914979757085, + "grad_norm": 4.870369052928556, + "learning_rate": 3.7246841808714172e-06, + "loss": 1.4965, + "step": 1542 + }, + { + "epoch": 6.246963562753036, + "grad_norm": 4.627274116359122, + "learning_rate": 3.717853062227604e-06, + "loss": 1.3376, + "step": 1543 + }, + { + "epoch": 6.251012145748988, + "grad_norm": 4.862725711210235, + "learning_rate": 3.7110245042828786e-06, + "loss": 1.436, + "step": 1544 + }, + { + "epoch": 6.255060728744939, + "grad_norm": 4.948809530195508, + "learning_rate": 3.704198520675214e-06, + "loss": 1.3922, + "step": 1545 + }, + { + "epoch": 6.2591093117408905, + "grad_norm": 4.36897138423846, + "learning_rate": 3.69737512503744e-06, + "loss": 1.3391, + "step": 1546 + }, + { + "epoch": 6.2631578947368425, + "grad_norm": 4.774874457232701, + "learning_rate": 3.690554330997215e-06, + "loss": 1.1307, + "step": 1547 + }, + { + "epoch": 6.267206477732794, + "grad_norm": 4.560395256546156, + "learning_rate": 3.6837361521770056e-06, + "loss": 1.4205, + "step": 1548 + }, + { + "epoch": 6.271255060728745, + "grad_norm": 4.657377226532245, + "learning_rate": 3.6769206021940505e-06, + "loss": 1.4284, + "step": 1549 + }, + { + "epoch": 6.275303643724697, + "grad_norm": 4.523918352960143, + "learning_rate": 3.670107694660343e-06, + "loss": 1.4865, + "step": 1550 + }, + { + "epoch": 6.279352226720648, + "grad_norm": 6.060799013063325, + "learning_rate": 3.6632974431825965e-06, + "loss": 1.4177, + "step": 1551 + }, + { + "epoch": 6.283400809716599, + "grad_norm": 5.508975855268233, + "learning_rate": 3.656489861362218e-06, + "loss": 1.0975, + "step": 1552 + }, + { + "epoch": 6.287449392712551, + "grad_norm": 5.591620230854365, + "learning_rate": 3.6496849627952875e-06, + "loss": 1.2607, + "step": 1553 + }, + { + "epoch": 6.291497975708502, + "grad_norm": 5.501342695470275, + "learning_rate": 3.6428827610725203e-06, + "loss": 1.113, + "step": 1554 + }, + { + 
"epoch": 6.295546558704453, + "grad_norm": 5.371568603468503, + "learning_rate": 3.636083269779249e-06, + "loss": 1.3579, + "step": 1555 + }, + { + "epoch": 6.299595141700405, + "grad_norm": 4.658495618502483, + "learning_rate": 3.6292865024953945e-06, + "loss": 1.5612, + "step": 1556 + }, + { + "epoch": 6.303643724696356, + "grad_norm": 5.171922327948163, + "learning_rate": 3.622492472795432e-06, + "loss": 1.196, + "step": 1557 + }, + { + "epoch": 6.3076923076923075, + "grad_norm": 5.187630245267101, + "learning_rate": 3.615701194248375e-06, + "loss": 1.2403, + "step": 1558 + }, + { + "epoch": 6.3117408906882595, + "grad_norm": 4.739560149771274, + "learning_rate": 3.6089126804177373e-06, + "loss": 1.2748, + "step": 1559 + }, + { + "epoch": 6.315789473684211, + "grad_norm": 5.8421200692609405, + "learning_rate": 3.6021269448615148e-06, + "loss": 1.1801, + "step": 1560 + }, + { + "epoch": 6.319838056680162, + "grad_norm": 5.003939683781086, + "learning_rate": 3.595344001132154e-06, + "loss": 1.1334, + "step": 1561 + }, + { + "epoch": 6.323886639676114, + "grad_norm": 5.213320704625486, + "learning_rate": 3.5885638627765228e-06, + "loss": 1.1662, + "step": 1562 + }, + { + "epoch": 6.327935222672065, + "grad_norm": 5.12672208334294, + "learning_rate": 3.5817865433358902e-06, + "loss": 1.1897, + "step": 1563 + }, + { + "epoch": 6.331983805668016, + "grad_norm": 4.990310131147776, + "learning_rate": 3.5750120563458924e-06, + "loss": 1.2197, + "step": 1564 + }, + { + "epoch": 6.336032388663968, + "grad_norm": 5.404582388895142, + "learning_rate": 3.568240415336509e-06, + "loss": 1.2979, + "step": 1565 + }, + { + "epoch": 6.340080971659919, + "grad_norm": 4.459387759024826, + "learning_rate": 3.5614716338320384e-06, + "loss": 1.2379, + "step": 1566 + }, + { + "epoch": 6.34412955465587, + "grad_norm": 4.906670384808422, + "learning_rate": 3.554705725351063e-06, + "loss": 1.1656, + "step": 1567 + }, + { + "epoch": 6.348178137651822, + "grad_norm": 5.788345645390745, + "learning_rate": 3.547942703406433e-06, + "loss": 1.3082, + "step": 1568 + }, + { + "epoch": 6.352226720647773, + "grad_norm": 5.367912057539721, + "learning_rate": 3.5411825815052296e-06, + "loss": 1.313, + "step": 1569 + }, + { + "epoch": 6.3562753036437245, + "grad_norm": 5.326205519895874, + "learning_rate": 3.534425373148741e-06, + "loss": 0.9762, + "step": 1570 + }, + { + "epoch": 6.3603238866396765, + "grad_norm": 5.708844505808687, + "learning_rate": 3.52767109183244e-06, + "loss": 1.373, + "step": 1571 + }, + { + "epoch": 6.364372469635628, + "grad_norm": 4.876273122171325, + "learning_rate": 3.5209197510459526e-06, + "loss": 1.448, + "step": 1572 + }, + { + "epoch": 6.368421052631579, + "grad_norm": 4.935122614604545, + "learning_rate": 3.5141713642730305e-06, + "loss": 1.3476, + "step": 1573 + }, + { + "epoch": 6.372469635627531, + "grad_norm": 6.109929961302762, + "learning_rate": 3.507425944991529e-06, + "loss": 1.4072, + "step": 1574 + }, + { + "epoch": 6.376518218623482, + "grad_norm": 5.409803828147351, + "learning_rate": 3.5006835066733707e-06, + "loss": 1.0987, + "step": 1575 + }, + { + "epoch": 6.380566801619433, + "grad_norm": 5.907878971006872, + "learning_rate": 3.4939440627845305e-06, + "loss": 1.2467, + "step": 1576 + }, + { + "epoch": 6.384615384615385, + "grad_norm": 5.060588652380501, + "learning_rate": 3.4872076267850015e-06, + "loss": 1.0512, + "step": 1577 + }, + { + "epoch": 6.388663967611336, + "grad_norm": 6.199263715395586, + "learning_rate": 3.480474212128766e-06, + "loss": 1.1192, + "step": 1578 + 
}, + { + "epoch": 6.392712550607287, + "grad_norm": 5.68773960369221, + "learning_rate": 3.473743832263778e-06, + "loss": 1.2989, + "step": 1579 + }, + { + "epoch": 6.396761133603239, + "grad_norm": 6.5411566006758886, + "learning_rate": 3.4670165006319236e-06, + "loss": 1.1125, + "step": 1580 + }, + { + "epoch": 6.40080971659919, + "grad_norm": 4.779266992013558, + "learning_rate": 3.4602922306690062e-06, + "loss": 1.1461, + "step": 1581 + }, + { + "epoch": 6.4048582995951415, + "grad_norm": 4.983422698218311, + "learning_rate": 3.453571035804714e-06, + "loss": 1.1805, + "step": 1582 + }, + { + "epoch": 6.4089068825910935, + "grad_norm": 6.281439869347411, + "learning_rate": 3.4468529294625895e-06, + "loss": 1.2865, + "step": 1583 + }, + { + "epoch": 6.412955465587045, + "grad_norm": 5.447638251945489, + "learning_rate": 3.4401379250600124e-06, + "loss": 1.112, + "step": 1584 + }, + { + "epoch": 6.417004048582996, + "grad_norm": 6.031371603465583, + "learning_rate": 3.433426036008163e-06, + "loss": 1.4222, + "step": 1585 + }, + { + "epoch": 6.421052631578947, + "grad_norm": 6.344172383462025, + "learning_rate": 3.4267172757120005e-06, + "loss": 1.4558, + "step": 1586 + }, + { + "epoch": 6.425101214574899, + "grad_norm": 5.253990555737164, + "learning_rate": 3.420011657570238e-06, + "loss": 1.4408, + "step": 1587 + }, + { + "epoch": 6.42914979757085, + "grad_norm": 5.944240629250275, + "learning_rate": 3.413309194975309e-06, + "loss": 1.4281, + "step": 1588 + }, + { + "epoch": 6.433198380566802, + "grad_norm": 4.690048614883703, + "learning_rate": 3.406609901313349e-06, + "loss": 1.6038, + "step": 1589 + }, + { + "epoch": 6.437246963562753, + "grad_norm": 5.538761343018897, + "learning_rate": 3.39991378996416e-06, + "loss": 1.3818, + "step": 1590 + }, + { + "epoch": 6.441295546558704, + "grad_norm": 5.904913245197766, + "learning_rate": 3.393220874301193e-06, + "loss": 1.324, + "step": 1591 + }, + { + "epoch": 6.445344129554655, + "grad_norm": 4.935839021246995, + "learning_rate": 3.386531167691512e-06, + "loss": 1.569, + "step": 1592 + }, + { + "epoch": 6.449392712550607, + "grad_norm": 5.96200793571726, + "learning_rate": 3.379844683495775e-06, + "loss": 1.3697, + "step": 1593 + }, + { + "epoch": 6.4534412955465585, + "grad_norm": 5.74218375449931, + "learning_rate": 3.3731614350682045e-06, + "loss": 1.3591, + "step": 1594 + }, + { + "epoch": 6.4574898785425106, + "grad_norm": 5.819819829923634, + "learning_rate": 3.36648143575656e-06, + "loss": 1.7039, + "step": 1595 + }, + { + "epoch": 6.461538461538462, + "grad_norm": 7.530849687169004, + "learning_rate": 3.3598046989021073e-06, + "loss": 1.8161, + "step": 1596 + }, + { + "epoch": 6.465587044534413, + "grad_norm": 5.773184926893142, + "learning_rate": 3.3531312378396026e-06, + "loss": 1.506, + "step": 1597 + }, + { + "epoch": 6.469635627530364, + "grad_norm": 5.095389257052112, + "learning_rate": 3.3464610658972584e-06, + "loss": 1.5432, + "step": 1598 + }, + { + "epoch": 6.473684210526316, + "grad_norm": 4.864855264853332, + "learning_rate": 3.3397941963967162e-06, + "loss": 1.502, + "step": 1599 + }, + { + "epoch": 6.477732793522267, + "grad_norm": 6.57365780985993, + "learning_rate": 3.333130642653024e-06, + "loss": 1.5104, + "step": 1600 + }, + { + "epoch": 6.481781376518219, + "grad_norm": 4.515682901106996, + "learning_rate": 3.326470417974604e-06, + "loss": 1.4218, + "step": 1601 + }, + { + "epoch": 6.48582995951417, + "grad_norm": 5.044572956084713, + "learning_rate": 3.3198135356632353e-06, + "loss": 1.3685, + "step": 1602 + 
}, + { + "epoch": 6.489878542510121, + "grad_norm": 6.114856919793026, + "learning_rate": 3.313160009014017e-06, + "loss": 1.3026, + "step": 1603 + }, + { + "epoch": 6.493927125506072, + "grad_norm": 6.169486015477941, + "learning_rate": 3.3065098513153473e-06, + "loss": 1.2931, + "step": 1604 + }, + { + "epoch": 6.497975708502024, + "grad_norm": 4.671907121620305, + "learning_rate": 3.299863075848898e-06, + "loss": 1.203, + "step": 1605 + }, + { + "epoch": 6.502024291497976, + "grad_norm": 5.556963177721959, + "learning_rate": 3.2932196958895816e-06, + "loss": 1.0369, + "step": 1606 + }, + { + "epoch": 6.506072874493928, + "grad_norm": 6.041668515369977, + "learning_rate": 3.2865797247055354e-06, + "loss": 1.4057, + "step": 1607 + }, + { + "epoch": 6.510121457489879, + "grad_norm": 5.622532023329238, + "learning_rate": 3.2799431755580814e-06, + "loss": 1.3496, + "step": 1608 + }, + { + "epoch": 6.51417004048583, + "grad_norm": 4.164381858883872, + "learning_rate": 3.2733100617017126e-06, + "loss": 1.3227, + "step": 1609 + }, + { + "epoch": 6.518218623481781, + "grad_norm": 5.565945707547888, + "learning_rate": 3.266680396384061e-06, + "loss": 1.3552, + "step": 1610 + }, + { + "epoch": 6.522267206477733, + "grad_norm": 6.1834705735871855, + "learning_rate": 3.2600541928458664e-06, + "loss": 1.2943, + "step": 1611 + }, + { + "epoch": 6.526315789473684, + "grad_norm": 6.088692550743796, + "learning_rate": 3.2534314643209597e-06, + "loss": 1.132, + "step": 1612 + }, + { + "epoch": 6.530364372469636, + "grad_norm": 5.618439646445004, + "learning_rate": 3.2468122240362287e-06, + "loss": 1.2075, + "step": 1613 + }, + { + "epoch": 6.534412955465587, + "grad_norm": 6.117262117177891, + "learning_rate": 3.2401964852115954e-06, + "loss": 1.2648, + "step": 1614 + }, + { + "epoch": 6.538461538461538, + "grad_norm": 5.488938699999532, + "learning_rate": 3.233584261059991e-06, + "loss": 1.5484, + "step": 1615 + }, + { + "epoch": 6.5425101214574894, + "grad_norm": 4.965386729846099, + "learning_rate": 3.226975564787322e-06, + "loss": 1.486, + "step": 1616 + }, + { + "epoch": 6.5465587044534415, + "grad_norm": 18.62707478890267, + "learning_rate": 3.2203704095924536e-06, + "loss": 2.0005, + "step": 1617 + }, + { + "epoch": 6.550607287449393, + "grad_norm": 9.55782070389464, + "learning_rate": 3.213768808667177e-06, + "loss": 1.7957, + "step": 1618 + }, + { + "epoch": 6.554655870445345, + "grad_norm": 9.720812117855125, + "learning_rate": 3.2071707751961838e-06, + "loss": 2.144, + "step": 1619 + }, + { + "epoch": 6.558704453441296, + "grad_norm": 5.342719089296339, + "learning_rate": 3.200576322357044e-06, + "loss": 1.3436, + "step": 1620 + }, + { + "epoch": 6.562753036437247, + "grad_norm": 4.64296304030207, + "learning_rate": 3.1939854633201727e-06, + "loss": 1.2129, + "step": 1621 + }, + { + "epoch": 6.566801619433198, + "grad_norm": 4.806685098084674, + "learning_rate": 3.187398211248811e-06, + "loss": 1.5973, + "step": 1622 + }, + { + "epoch": 6.57085020242915, + "grad_norm": 5.159929877257071, + "learning_rate": 3.1808145792989914e-06, + "loss": 1.2471, + "step": 1623 + }, + { + "epoch": 6.574898785425101, + "grad_norm": 4.881818219879603, + "learning_rate": 3.1742345806195196e-06, + "loss": 1.4285, + "step": 1624 + }, + { + "epoch": 6.578947368421053, + "grad_norm": 4.079931587528226, + "learning_rate": 3.1676582283519454e-06, + "loss": 1.2586, + "step": 1625 + }, + { + "epoch": 6.582995951417004, + "grad_norm": 5.067504014062879, + "learning_rate": 3.1610855356305354e-06, + "loss": 1.3673, + 
"step": 1626 + }, + { + "epoch": 6.587044534412955, + "grad_norm": 4.954367681109359, + "learning_rate": 3.1545165155822453e-06, + "loss": 1.3681, + "step": 1627 + }, + { + "epoch": 6.5910931174089065, + "grad_norm": 5.605429782413848, + "learning_rate": 3.1479511813267006e-06, + "loss": 1.3636, + "step": 1628 + }, + { + "epoch": 6.5951417004048585, + "grad_norm": 4.958815188693233, + "learning_rate": 3.141389545976159e-06, + "loss": 1.0862, + "step": 1629 + }, + { + "epoch": 6.59919028340081, + "grad_norm": 4.427052082332069, + "learning_rate": 3.134831622635496e-06, + "loss": 1.1727, + "step": 1630 + }, + { + "epoch": 6.603238866396762, + "grad_norm": 4.453414798921641, + "learning_rate": 3.1282774244021717e-06, + "loss": 1.2508, + "step": 1631 + }, + { + "epoch": 6.607287449392713, + "grad_norm": 5.086142474437995, + "learning_rate": 3.1217269643662063e-06, + "loss": 1.0497, + "step": 1632 + }, + { + "epoch": 6.611336032388664, + "grad_norm": 5.252726223787453, + "learning_rate": 3.115180255610154e-06, + "loss": 1.352, + "step": 1633 + }, + { + "epoch": 6.615384615384615, + "grad_norm": 4.618158368136601, + "learning_rate": 3.1086373112090762e-06, + "loss": 1.3803, + "step": 1634 + }, + { + "epoch": 6.619433198380567, + "grad_norm": 5.797639722448207, + "learning_rate": 3.1020981442305187e-06, + "loss": 1.1187, + "step": 1635 + }, + { + "epoch": 6.623481781376518, + "grad_norm": 5.892627204449989, + "learning_rate": 3.095562767734481e-06, + "loss": 1.4805, + "step": 1636 + }, + { + "epoch": 6.62753036437247, + "grad_norm": 4.995284041826363, + "learning_rate": 3.089031194773392e-06, + "loss": 1.2999, + "step": 1637 + }, + { + "epoch": 6.631578947368421, + "grad_norm": 5.424221812925032, + "learning_rate": 3.082503438392086e-06, + "loss": 1.5812, + "step": 1638 + }, + { + "epoch": 6.635627530364372, + "grad_norm": 4.773802128035484, + "learning_rate": 3.0759795116277723e-06, + "loss": 1.1799, + "step": 1639 + }, + { + "epoch": 6.6396761133603235, + "grad_norm": 5.573651737656804, + "learning_rate": 3.069459427510014e-06, + "loss": 1.4498, + "step": 1640 + }, + { + "epoch": 6.6437246963562755, + "grad_norm": 4.742522853775909, + "learning_rate": 3.0629431990607e-06, + "loss": 1.3417, + "step": 1641 + }, + { + "epoch": 6.647773279352227, + "grad_norm": 5.292712065001537, + "learning_rate": 3.056430839294015e-06, + "loss": 1.45, + "step": 1642 + }, + { + "epoch": 6.651821862348179, + "grad_norm": 4.5550435224065335, + "learning_rate": 3.049922361216422e-06, + "loss": 1.2275, + "step": 1643 + }, + { + "epoch": 6.65587044534413, + "grad_norm": 5.633966620000232, + "learning_rate": 3.043417777826627e-06, + "loss": 1.4383, + "step": 1644 + }, + { + "epoch": 6.659919028340081, + "grad_norm": 5.977264180838899, + "learning_rate": 3.036917102115561e-06, + "loss": 1.2502, + "step": 1645 + }, + { + "epoch": 6.663967611336032, + "grad_norm": 5.050359221231472, + "learning_rate": 3.0304203470663507e-06, + "loss": 1.4135, + "step": 1646 + }, + { + "epoch": 6.668016194331984, + "grad_norm": 5.3518078778159435, + "learning_rate": 3.023927525654288e-06, + "loss": 1.4064, + "step": 1647 + }, + { + "epoch": 6.672064777327935, + "grad_norm": 5.575471681679863, + "learning_rate": 3.017438650846815e-06, + "loss": 1.5635, + "step": 1648 + }, + { + "epoch": 6.676113360323887, + "grad_norm": 4.758858070207382, + "learning_rate": 3.0109537356034856e-06, + "loss": 1.5306, + "step": 1649 + }, + { + "epoch": 6.680161943319838, + "grad_norm": 5.646630068141117, + "learning_rate": 3.0044727928759487e-06, + "loss": 
1.3876, + "step": 1650 + }, + { + "epoch": 6.684210526315789, + "grad_norm": 5.245224305674558, + "learning_rate": 2.9979958356079195e-06, + "loss": 1.2497, + "step": 1651 + }, + { + "epoch": 6.6882591093117405, + "grad_norm": 4.976281468525487, + "learning_rate": 2.991522876735154e-06, + "loss": 1.3506, + "step": 1652 + }, + { + "epoch": 6.6923076923076925, + "grad_norm": 5.375432065764104, + "learning_rate": 2.98505392918542e-06, + "loss": 1.3676, + "step": 1653 + }, + { + "epoch": 6.696356275303644, + "grad_norm": 4.849539565202561, + "learning_rate": 2.978589005878476e-06, + "loss": 1.2348, + "step": 1654 + }, + { + "epoch": 6.700404858299595, + "grad_norm": 6.373782199327902, + "learning_rate": 2.9721281197260427e-06, + "loss": 1.6916, + "step": 1655 + }, + { + "epoch": 6.704453441295547, + "grad_norm": 5.797065404713431, + "learning_rate": 2.965671283631778e-06, + "loss": 1.4917, + "step": 1656 + }, + { + "epoch": 6.708502024291498, + "grad_norm": 5.561054188837486, + "learning_rate": 2.959218510491252e-06, + "loss": 1.1089, + "step": 1657 + }, + { + "epoch": 6.712550607287449, + "grad_norm": 4.841361841602314, + "learning_rate": 2.9527698131919156e-06, + "loss": 1.2314, + "step": 1658 + }, + { + "epoch": 6.716599190283401, + "grad_norm": 4.961647413029597, + "learning_rate": 2.9463252046130884e-06, + "loss": 1.3488, + "step": 1659 + }, + { + "epoch": 6.720647773279352, + "grad_norm": 6.030520417168003, + "learning_rate": 2.9398846976259136e-06, + "loss": 1.1124, + "step": 1660 + }, + { + "epoch": 6.724696356275303, + "grad_norm": 5.376150681226648, + "learning_rate": 2.9334483050933506e-06, + "loss": 1.3305, + "step": 1661 + }, + { + "epoch": 6.728744939271255, + "grad_norm": 4.997899902629033, + "learning_rate": 2.9270160398701387e-06, + "loss": 1.4987, + "step": 1662 + }, + { + "epoch": 6.732793522267206, + "grad_norm": 5.003930672267123, + "learning_rate": 2.920587914802772e-06, + "loss": 1.2143, + "step": 1663 + }, + { + "epoch": 6.7368421052631575, + "grad_norm": 5.099065318842715, + "learning_rate": 2.91416394272948e-06, + "loss": 1.3239, + "step": 1664 + }, + { + "epoch": 6.7408906882591095, + "grad_norm": 5.065783888856437, + "learning_rate": 2.907744136480194e-06, + "loss": 1.9473, + "step": 1665 + }, + { + "epoch": 6.744939271255061, + "grad_norm": 4.828636889161134, + "learning_rate": 2.901328508876531e-06, + "loss": 1.4691, + "step": 1666 + }, + { + "epoch": 6.748987854251012, + "grad_norm": 5.887659634670204, + "learning_rate": 2.894917072731753e-06, + "loss": 1.2826, + "step": 1667 + }, + { + "epoch": 6.753036437246964, + "grad_norm": 5.421606621102472, + "learning_rate": 2.88850984085076e-06, + "loss": 1.1948, + "step": 1668 + }, + { + "epoch": 6.757085020242915, + "grad_norm": 5.2144985221753615, + "learning_rate": 2.8821068260300505e-06, + "loss": 1.3159, + "step": 1669 + }, + { + "epoch": 6.761133603238866, + "grad_norm": 6.35388499196324, + "learning_rate": 2.8757080410577042e-06, + "loss": 2.064, + "step": 1670 + }, + { + "epoch": 6.765182186234818, + "grad_norm": 6.533956411029131, + "learning_rate": 2.8693134987133464e-06, + "loss": 1.8202, + "step": 1671 + }, + { + "epoch": 6.769230769230769, + "grad_norm": 7.388143224357747, + "learning_rate": 2.8629232117681354e-06, + "loss": 1.7417, + "step": 1672 + }, + { + "epoch": 6.77327935222672, + "grad_norm": 4.928577825497661, + "learning_rate": 2.8565371929847286e-06, + "loss": 1.2534, + "step": 1673 + }, + { + "epoch": 6.777327935222672, + "grad_norm": 5.033866214652084, + "learning_rate": 2.8501554551172613e-06, 
+ "loss": 1.5421, + "step": 1674 + }, + { + "epoch": 6.781376518218623, + "grad_norm": 4.739685237811317, + "learning_rate": 2.843778010911311e-06, + "loss": 1.5263, + "step": 1675 + }, + { + "epoch": 6.7854251012145745, + "grad_norm": 5.136372890884333, + "learning_rate": 2.83740487310389e-06, + "loss": 1.3327, + "step": 1676 + }, + { + "epoch": 6.7894736842105265, + "grad_norm": 4.941908173697463, + "learning_rate": 2.8310360544234057e-06, + "loss": 1.2674, + "step": 1677 + }, + { + "epoch": 6.793522267206478, + "grad_norm": 5.393271110505753, + "learning_rate": 2.8246715675896354e-06, + "loss": 1.2836, + "step": 1678 + }, + { + "epoch": 6.797570850202429, + "grad_norm": 5.454849249006355, + "learning_rate": 2.81831142531371e-06, + "loss": 1.3156, + "step": 1679 + }, + { + "epoch": 6.801619433198381, + "grad_norm": 4.939088394387297, + "learning_rate": 2.811955640298083e-06, + "loss": 1.2068, + "step": 1680 + }, + { + "epoch": 6.805668016194332, + "grad_norm": 4.809916773128364, + "learning_rate": 2.8056042252365046e-06, + "loss": 1.0997, + "step": 1681 + }, + { + "epoch": 6.809716599190283, + "grad_norm": 5.329896547784682, + "learning_rate": 2.7992571928139984e-06, + "loss": 1.4471, + "step": 1682 + }, + { + "epoch": 6.813765182186235, + "grad_norm": 6.511906878209839, + "learning_rate": 2.7929145557068303e-06, + "loss": 1.2595, + "step": 1683 + }, + { + "epoch": 6.817813765182186, + "grad_norm": 5.372364570471038, + "learning_rate": 2.786576326582493e-06, + "loss": 1.1699, + "step": 1684 + }, + { + "epoch": 6.821862348178137, + "grad_norm": 13.8652581579135, + "learning_rate": 2.780242518099675e-06, + "loss": 2.2106, + "step": 1685 + }, + { + "epoch": 6.825910931174089, + "grad_norm": 25.171093577196388, + "learning_rate": 2.7739131429082373e-06, + "loss": 3.2586, + "step": 1686 + }, + { + "epoch": 6.82995951417004, + "grad_norm": 5.726221697590718, + "learning_rate": 2.7675882136491795e-06, + "loss": 1.1889, + "step": 1687 + }, + { + "epoch": 6.834008097165992, + "grad_norm": 5.969801910273205, + "learning_rate": 2.761267742954629e-06, + "loss": 1.1408, + "step": 1688 + }, + { + "epoch": 6.838056680161944, + "grad_norm": 5.061214863990714, + "learning_rate": 2.7549517434478063e-06, + "loss": 1.1687, + "step": 1689 + }, + { + "epoch": 6.842105263157895, + "grad_norm": 4.867474293725249, + "learning_rate": 2.7486402277430026e-06, + "loss": 1.2449, + "step": 1690 + }, + { + "epoch": 6.846153846153846, + "grad_norm": 5.1018055774076645, + "learning_rate": 2.7423332084455543e-06, + "loss": 1.0478, + "step": 1691 + }, + { + "epoch": 6.850202429149798, + "grad_norm": 6.018705752891283, + "learning_rate": 2.736030698151815e-06, + "loss": 1.2496, + "step": 1692 + }, + { + "epoch": 6.854251012145749, + "grad_norm": 6.104939352615399, + "learning_rate": 2.7297327094491344e-06, + "loss": 1.287, + "step": 1693 + }, + { + "epoch": 6.8582995951417, + "grad_norm": 4.340656711987505, + "learning_rate": 2.723439254915834e-06, + "loss": 1.2266, + "step": 1694 + }, + { + "epoch": 6.862348178137652, + "grad_norm": 5.698807470646283, + "learning_rate": 2.717150347121177e-06, + "loss": 1.2273, + "step": 1695 + }, + { + "epoch": 6.866396761133603, + "grad_norm": 5.5042411488110154, + "learning_rate": 2.710865998625348e-06, + "loss": 1.2081, + "step": 1696 + }, + { + "epoch": 6.870445344129554, + "grad_norm": 6.8240067723829405, + "learning_rate": 2.704586221979422e-06, + "loss": 1.6486, + "step": 1697 + }, + { + "epoch": 6.874493927125506, + "grad_norm": 5.905111755452213, + "learning_rate": 
2.698311029725346e-06, + "loss": 1.5976, + "step": 1698 + }, + { + "epoch": 6.8785425101214575, + "grad_norm": 6.1571466759316, + "learning_rate": 2.6920404343959106e-06, + "loss": 1.3605, + "step": 1699 + }, + { + "epoch": 6.882591093117409, + "grad_norm": 5.716713309024074, + "learning_rate": 2.6857744485147286e-06, + "loss": 1.2964, + "step": 1700 + }, + { + "epoch": 6.886639676113361, + "grad_norm": 5.42925803199323, + "learning_rate": 2.6795130845961993e-06, + "loss": 0.9267, + "step": 1701 + }, + { + "epoch": 6.890688259109312, + "grad_norm": 4.919365319165041, + "learning_rate": 2.673256355145499e-06, + "loss": 1.4449, + "step": 1702 + }, + { + "epoch": 6.894736842105263, + "grad_norm": 4.863542774795551, + "learning_rate": 2.667004272658541e-06, + "loss": 1.4657, + "step": 1703 + }, + { + "epoch": 6.898785425101215, + "grad_norm": 4.299136007306504, + "learning_rate": 2.660756849621962e-06, + "loss": 1.2369, + "step": 1704 + }, + { + "epoch": 6.902834008097166, + "grad_norm": 5.213129071990759, + "learning_rate": 2.6545140985130934e-06, + "loss": 1.2244, + "step": 1705 + }, + { + "epoch": 6.906882591093117, + "grad_norm": 5.578872418777055, + "learning_rate": 2.6482760317999338e-06, + "loss": 1.2811, + "step": 1706 + }, + { + "epoch": 6.910931174089069, + "grad_norm": 4.626194423109011, + "learning_rate": 2.642042661941129e-06, + "loss": 1.0198, + "step": 1707 + }, + { + "epoch": 6.91497975708502, + "grad_norm": 5.352887557319016, + "learning_rate": 2.635814001385938e-06, + "loss": 1.1012, + "step": 1708 + }, + { + "epoch": 6.919028340080971, + "grad_norm": 5.579613506703107, + "learning_rate": 2.629590062574221e-06, + "loss": 1.1085, + "step": 1709 + }, + { + "epoch": 6.923076923076923, + "grad_norm": 4.252011072382573, + "learning_rate": 2.623370857936404e-06, + "loss": 1.431, + "step": 1710 + }, + { + "epoch": 6.9271255060728745, + "grad_norm": 5.916388957924838, + "learning_rate": 2.6171563998934605e-06, + "loss": 1.2774, + "step": 1711 + }, + { + "epoch": 6.931174089068826, + "grad_norm": 5.953432162823518, + "learning_rate": 2.610946700856885e-06, + "loss": 1.2618, + "step": 1712 + }, + { + "epoch": 6.935222672064778, + "grad_norm": 6.19929364838639, + "learning_rate": 2.604741773228661e-06, + "loss": 1.0577, + "step": 1713 + }, + { + "epoch": 6.939271255060729, + "grad_norm": 5.789164804068839, + "learning_rate": 2.5985416294012487e-06, + "loss": 1.0688, + "step": 1714 + }, + { + "epoch": 6.94331983805668, + "grad_norm": 6.659571736165462, + "learning_rate": 2.592346281757552e-06, + "loss": 1.3636, + "step": 1715 + }, + { + "epoch": 6.947368421052632, + "grad_norm": 5.314697446259228, + "learning_rate": 2.586155742670897e-06, + "loss": 1.0952, + "step": 1716 + }, + { + "epoch": 6.951417004048583, + "grad_norm": 6.659337503952005, + "learning_rate": 2.5799700245050074e-06, + "loss": 1.0229, + "step": 1717 + }, + { + "epoch": 6.955465587044534, + "grad_norm": 6.65312440022192, + "learning_rate": 2.5737891396139713e-06, + "loss": 1.3201, + "step": 1718 + }, + { + "epoch": 6.959514170040486, + "grad_norm": 5.938881485697329, + "learning_rate": 2.5676131003422317e-06, + "loss": 1.3962, + "step": 1719 + }, + { + "epoch": 6.963562753036437, + "grad_norm": 5.4389936951171025, + "learning_rate": 2.561441919024551e-06, + "loss": 1.346, + "step": 1720 + }, + { + "epoch": 6.967611336032388, + "grad_norm": 6.814603646499591, + "learning_rate": 2.5552756079859904e-06, + "loss": 1.3755, + "step": 1721 + }, + { + "epoch": 6.97165991902834, + "grad_norm": 6.557034047725967, + 
"learning_rate": 2.549114179541884e-06, + "loss": 1.2917, + "step": 1722 + }, + { + "epoch": 6.9757085020242915, + "grad_norm": 4.666089006915814, + "learning_rate": 2.542957645997811e-06, + "loss": 1.3178, + "step": 1723 + }, + { + "epoch": 6.979757085020243, + "grad_norm": 5.4101007526641, + "learning_rate": 2.5368060196495785e-06, + "loss": 1.3848, + "step": 1724 + }, + { + "epoch": 6.983805668016195, + "grad_norm": 5.003638917729553, + "learning_rate": 2.530659312783192e-06, + "loss": 1.4391, + "step": 1725 + }, + { + "epoch": 6.987854251012146, + "grad_norm": 4.982884862825928, + "learning_rate": 2.5245175376748334e-06, + "loss": 1.2329, + "step": 1726 + }, + { + "epoch": 6.991902834008097, + "grad_norm": 4.383040697186735, + "learning_rate": 2.5183807065908296e-06, + "loss": 1.2466, + "step": 1727 + }, + { + "epoch": 6.995951417004049, + "grad_norm": 4.833585025134396, + "learning_rate": 2.512248831787639e-06, + "loss": 1.5637, + "step": 1728 + }, + { + "epoch": 7.0, + "grad_norm": 4.848560799578388, + "learning_rate": 2.5061219255118186e-06, + "loss": 1.2677, + "step": 1729 + }, + { + "epoch": 7.004048582995951, + "grad_norm": 4.901375359150507, + "learning_rate": 2.5000000000000015e-06, + "loss": 1.3023, + "step": 1730 + }, + { + "epoch": 7.008097165991903, + "grad_norm": 6.545083705424055, + "learning_rate": 2.4938830674788756e-06, + "loss": 1.4651, + "step": 1731 + }, + { + "epoch": 7.012145748987854, + "grad_norm": 6.141277943301318, + "learning_rate": 2.4877711401651562e-06, + "loss": 1.2554, + "step": 1732 + }, + { + "epoch": 7.016194331983805, + "grad_norm": 6.544269798324027, + "learning_rate": 2.4816642302655634e-06, + "loss": 1.479, + "step": 1733 + }, + { + "epoch": 7.020242914979757, + "grad_norm": 5.746379418360751, + "learning_rate": 2.475562349976791e-06, + "loss": 1.656, + "step": 1734 + }, + { + "epoch": 7.0242914979757085, + "grad_norm": 6.035436258524213, + "learning_rate": 2.4694655114854936e-06, + "loss": 1.5592, + "step": 1735 + }, + { + "epoch": 7.02834008097166, + "grad_norm": 5.223633858026752, + "learning_rate": 2.4633737269682546e-06, + "loss": 1.2619, + "step": 1736 + }, + { + "epoch": 7.032388663967612, + "grad_norm": 5.890887028411126, + "learning_rate": 2.4572870085915628e-06, + "loss": 1.2686, + "step": 1737 + }, + { + "epoch": 7.036437246963563, + "grad_norm": 5.4867419263331785, + "learning_rate": 2.4512053685117916e-06, + "loss": 1.4711, + "step": 1738 + }, + { + "epoch": 7.040485829959514, + "grad_norm": 5.856066296731616, + "learning_rate": 2.445128818875166e-06, + "loss": 1.2784, + "step": 1739 + }, + { + "epoch": 7.044534412955466, + "grad_norm": 5.685747261263775, + "learning_rate": 2.4390573718177507e-06, + "loss": 1.4178, + "step": 1740 + }, + { + "epoch": 7.048582995951417, + "grad_norm": 5.580589694434444, + "learning_rate": 2.4329910394654167e-06, + "loss": 1.2819, + "step": 1741 + }, + { + "epoch": 7.052631578947368, + "grad_norm": 6.1734653161832345, + "learning_rate": 2.4269298339338205e-06, + "loss": 1.3334, + "step": 1742 + }, + { + "epoch": 7.05668016194332, + "grad_norm": 5.647156467107709, + "learning_rate": 2.4208737673283818e-06, + "loss": 1.1932, + "step": 1743 + }, + { + "epoch": 7.060728744939271, + "grad_norm": 5.571147412614646, + "learning_rate": 2.414822851744249e-06, + "loss": 1.3354, + "step": 1744 + }, + { + "epoch": 7.064777327935222, + "grad_norm": 6.222421117643815, + "learning_rate": 2.408777099266291e-06, + "loss": 1.2747, + "step": 1745 + }, + { + "epoch": 7.068825910931174, + "grad_norm": 6.251859136759403, + 
"learning_rate": 2.4027365219690617e-06, + "loss": 1.444, + "step": 1746 + }, + { + "epoch": 7.0728744939271255, + "grad_norm": 5.555376265690771, + "learning_rate": 2.3967011319167804e-06, + "loss": 1.3478, + "step": 1747 + }, + { + "epoch": 7.076923076923077, + "grad_norm": 6.222350987405198, + "learning_rate": 2.3906709411633073e-06, + "loss": 1.3069, + "step": 1748 + }, + { + "epoch": 7.080971659919029, + "grad_norm": 5.290175219718593, + "learning_rate": 2.384645961752113e-06, + "loss": 1.4103, + "step": 1749 + }, + { + "epoch": 7.08502024291498, + "grad_norm": 4.882921637643386, + "learning_rate": 2.378626205716265e-06, + "loss": 1.3698, + "step": 1750 + }, + { + "epoch": 7.089068825910931, + "grad_norm": 5.893035167375215, + "learning_rate": 2.3726116850783987e-06, + "loss": 1.3153, + "step": 1751 + }, + { + "epoch": 7.093117408906883, + "grad_norm": 5.440462022348463, + "learning_rate": 2.3666024118506937e-06, + "loss": 1.3918, + "step": 1752 + }, + { + "epoch": 7.097165991902834, + "grad_norm": 5.298541554798929, + "learning_rate": 2.3605983980348446e-06, + "loss": 1.1493, + "step": 1753 + }, + { + "epoch": 7.101214574898785, + "grad_norm": 5.873912109321258, + "learning_rate": 2.354599655622049e-06, + "loss": 1.3419, + "step": 1754 + }, + { + "epoch": 7.105263157894737, + "grad_norm": 6.515086572176515, + "learning_rate": 2.3486061965929695e-06, + "loss": 1.2658, + "step": 1755 + }, + { + "epoch": 7.109311740890688, + "grad_norm": 5.640239544492155, + "learning_rate": 2.3426180329177217e-06, + "loss": 1.2778, + "step": 1756 + }, + { + "epoch": 7.113360323886639, + "grad_norm": 6.602620889096045, + "learning_rate": 2.3366351765558437e-06, + "loss": 1.2168, + "step": 1757 + }, + { + "epoch": 7.117408906882591, + "grad_norm": 6.23335605433251, + "learning_rate": 2.3306576394562748e-06, + "loss": 1.1279, + "step": 1758 + }, + { + "epoch": 7.1214574898785425, + "grad_norm": 5.812741962332591, + "learning_rate": 2.3246854335573303e-06, + "loss": 1.2, + "step": 1759 + }, + { + "epoch": 7.125506072874494, + "grad_norm": 5.7653076766991465, + "learning_rate": 2.318718570786675e-06, + "loss": 1.2204, + "step": 1760 + }, + { + "epoch": 7.129554655870446, + "grad_norm": 6.592268657435819, + "learning_rate": 2.3127570630613064e-06, + "loss": 1.0923, + "step": 1761 + }, + { + "epoch": 7.133603238866397, + "grad_norm": 5.105109462079527, + "learning_rate": 2.3068009222875256e-06, + "loss": 1.4491, + "step": 1762 + }, + { + "epoch": 7.137651821862348, + "grad_norm": 6.139171319338175, + "learning_rate": 2.3008501603609147e-06, + "loss": 1.2557, + "step": 1763 + }, + { + "epoch": 7.1417004048583, + "grad_norm": 4.871725004057816, + "learning_rate": 2.294904789166315e-06, + "loss": 1.023, + "step": 1764 + }, + { + "epoch": 7.145748987854251, + "grad_norm": 6.491293356249618, + "learning_rate": 2.288964820577797e-06, + "loss": 1.3439, + "step": 1765 + }, + { + "epoch": 7.149797570850202, + "grad_norm": 5.837952957007555, + "learning_rate": 2.283030266458644e-06, + "loss": 1.182, + "step": 1766 + }, + { + "epoch": 7.153846153846154, + "grad_norm": 5.104308775866129, + "learning_rate": 2.2771011386613268e-06, + "loss": 1.4117, + "step": 1767 + }, + { + "epoch": 7.157894736842105, + "grad_norm": 6.518827958790034, + "learning_rate": 2.2711774490274767e-06, + "loss": 1.4173, + "step": 1768 + }, + { + "epoch": 7.161943319838056, + "grad_norm": 4.94266123667569, + "learning_rate": 2.265259209387867e-06, + "loss": 1.2429, + "step": 1769 + }, + { + "epoch": 7.165991902834008, + "grad_norm": 
5.473631523594278, + "learning_rate": 2.259346431562379e-06, + "loss": 1.3316, + "step": 1770 + }, + { + "epoch": 7.17004048582996, + "grad_norm": 5.001369544056481, + "learning_rate": 2.2534391273599937e-06, + "loss": 1.9136, + "step": 1771 + }, + { + "epoch": 7.174089068825911, + "grad_norm": 5.913295650699435, + "learning_rate": 2.2475373085787568e-06, + "loss": 1.1497, + "step": 1772 + }, + { + "epoch": 7.178137651821863, + "grad_norm": 6.952533318275522, + "learning_rate": 2.2416409870057577e-06, + "loss": 1.353, + "step": 1773 + }, + { + "epoch": 7.182186234817814, + "grad_norm": 4.723432595191292, + "learning_rate": 2.2357501744171105e-06, + "loss": 1.1492, + "step": 1774 + }, + { + "epoch": 7.186234817813765, + "grad_norm": 6.058020017509188, + "learning_rate": 2.229864882577921e-06, + "loss": 1.3322, + "step": 1775 + }, + { + "epoch": 7.190283400809717, + "grad_norm": 5.788151410477542, + "learning_rate": 2.2239851232422736e-06, + "loss": 1.3631, + "step": 1776 + }, + { + "epoch": 7.194331983805668, + "grad_norm": 6.262252651618726, + "learning_rate": 2.218110908153202e-06, + "loss": 1.5276, + "step": 1777 + }, + { + "epoch": 7.198380566801619, + "grad_norm": 5.208163192867401, + "learning_rate": 2.2122422490426676e-06, + "loss": 1.5831, + "step": 1778 + }, + { + "epoch": 7.202429149797571, + "grad_norm": 5.390523496529594, + "learning_rate": 2.206379157631532e-06, + "loss": 1.2908, + "step": 1779 + }, + { + "epoch": 7.206477732793522, + "grad_norm": 5.162249120166779, + "learning_rate": 2.200521645629542e-06, + "loss": 1.6171, + "step": 1780 + }, + { + "epoch": 7.2105263157894735, + "grad_norm": 5.391588507251084, + "learning_rate": 2.194669724735296e-06, + "loss": 1.6111, + "step": 1781 + }, + { + "epoch": 7.2145748987854255, + "grad_norm": 6.1034967557731665, + "learning_rate": 2.1888234066362303e-06, + "loss": 1.3854, + "step": 1782 + }, + { + "epoch": 7.218623481781377, + "grad_norm": 6.167454760308808, + "learning_rate": 2.18298270300859e-06, + "loss": 1.2693, + "step": 1783 + }, + { + "epoch": 7.222672064777328, + "grad_norm": 5.69770152013801, + "learning_rate": 2.1771476255174056e-06, + "loss": 1.2078, + "step": 1784 + }, + { + "epoch": 7.22672064777328, + "grad_norm": 5.460410860926906, + "learning_rate": 2.1713181858164746e-06, + "loss": 1.413, + "step": 1785 + }, + { + "epoch": 7.230769230769231, + "grad_norm": 5.566118830424516, + "learning_rate": 2.165494395548329e-06, + "loss": 1.1968, + "step": 1786 + }, + { + "epoch": 7.234817813765182, + "grad_norm": 6.43649848295101, + "learning_rate": 2.159676266344222e-06, + "loss": 1.4229, + "step": 1787 + }, + { + "epoch": 7.238866396761134, + "grad_norm": 6.290508191897902, + "learning_rate": 2.1538638098241e-06, + "loss": 1.3623, + "step": 1788 + }, + { + "epoch": 7.242914979757085, + "grad_norm": 5.730502481155649, + "learning_rate": 2.14805703759658e-06, + "loss": 1.396, + "step": 1789 + }, + { + "epoch": 7.246963562753036, + "grad_norm": 5.437978852325137, + "learning_rate": 2.1422559612589266e-06, + "loss": 1.252, + "step": 1790 + }, + { + "epoch": 7.251012145748988, + "grad_norm": 5.7552412936402435, + "learning_rate": 2.136460592397025e-06, + "loss": 1.344, + "step": 1791 + }, + { + "epoch": 7.255060728744939, + "grad_norm": 5.804592913810575, + "learning_rate": 2.1306709425853663e-06, + "loss": 1.291, + "step": 1792 + }, + { + "epoch": 7.2591093117408905, + "grad_norm": 5.304611515686778, + "learning_rate": 2.124887023387017e-06, + "loss": 1.25, + "step": 1793 + }, + { + "epoch": 7.2631578947368425, + "grad_norm": 
5.579310956319717, + "learning_rate": 2.1191088463535997e-06, + "loss": 1.0352, + "step": 1794 + }, + { + "epoch": 7.267206477732794, + "grad_norm": 5.280713442914896, + "learning_rate": 2.113336423025269e-06, + "loss": 1.3293, + "step": 1795 + }, + { + "epoch": 7.271255060728745, + "grad_norm": 5.695843923044428, + "learning_rate": 2.1075697649306838e-06, + "loss": 1.3279, + "step": 1796 + }, + { + "epoch": 7.275303643724697, + "grad_norm": 5.537225853611836, + "learning_rate": 2.1018088835869943e-06, + "loss": 1.4052, + "step": 1797 + }, + { + "epoch": 7.279352226720648, + "grad_norm": 7.310804417037736, + "learning_rate": 2.0960537904998113e-06, + "loss": 1.3052, + "step": 1798 + }, + { + "epoch": 7.283400809716599, + "grad_norm": 6.5207473345683455, + "learning_rate": 2.0903044971631854e-06, + "loss": 0.9953, + "step": 1799 + }, + { + "epoch": 7.287449392712551, + "grad_norm": 6.891390925467454, + "learning_rate": 2.084561015059585e-06, + "loss": 1.1524, + "step": 1800 + }, + { + "epoch": 7.291497975708502, + "grad_norm": 6.511458265596788, + "learning_rate": 2.0788233556598688e-06, + "loss": 1.019, + "step": 1801 + }, + { + "epoch": 7.295546558704453, + "grad_norm": 6.525945460785431, + "learning_rate": 2.0730915304232692e-06, + "loss": 1.2347, + "step": 1802 + }, + { + "epoch": 7.299595141700405, + "grad_norm": 5.806148576127675, + "learning_rate": 2.067365550797367e-06, + "loss": 1.4674, + "step": 1803 + }, + { + "epoch": 7.303643724696356, + "grad_norm": 6.6525694728213685, + "learning_rate": 2.061645428218067e-06, + "loss": 1.0762, + "step": 1804 + }, + { + "epoch": 7.3076923076923075, + "grad_norm": 6.212203279710177, + "learning_rate": 2.055931174109579e-06, + "loss": 1.1289, + "step": 1805 + }, + { + "epoch": 7.3117408906882595, + "grad_norm": 5.666269345071883, + "learning_rate": 2.050222799884387e-06, + "loss": 1.1799, + "step": 1806 + }, + { + "epoch": 7.315789473684211, + "grad_norm": 7.0629439288873, + "learning_rate": 2.044520316943235e-06, + "loss": 1.0631, + "step": 1807 + }, + { + "epoch": 7.319838056680162, + "grad_norm": 6.059126520843265, + "learning_rate": 2.0388237366751005e-06, + "loss": 1.03, + "step": 1808 + }, + { + "epoch": 7.323886639676114, + "grad_norm": 6.3174918869462635, + "learning_rate": 2.0331330704571746e-06, + "loss": 1.0775, + "step": 1809 + }, + { + "epoch": 7.327935222672065, + "grad_norm": 6.098595972628923, + "learning_rate": 2.027448329654832e-06, + "loss": 1.0956, + "step": 1810 + }, + { + "epoch": 7.331983805668016, + "grad_norm": 6.07010789176819, + "learning_rate": 2.02176952562162e-06, + "loss": 1.132, + "step": 1811 + }, + { + "epoch": 7.336032388663968, + "grad_norm": 5.673793373139681, + "learning_rate": 2.0160966696992195e-06, + "loss": 1.235, + "step": 1812 + }, + { + "epoch": 7.340080971659919, + "grad_norm": 5.42325757234182, + "learning_rate": 2.0104297732174403e-06, + "loss": 1.1607, + "step": 1813 + }, + { + "epoch": 7.34412955465587, + "grad_norm": 5.845384796389491, + "learning_rate": 2.004768847494186e-06, + "loss": 1.069, + "step": 1814 + }, + { + "epoch": 7.348178137651822, + "grad_norm": 6.716611305618001, + "learning_rate": 1.999113903835438e-06, + "loss": 1.2088, + "step": 1815 + }, + { + "epoch": 7.352226720647773, + "grad_norm": 6.335024142337415, + "learning_rate": 1.9934649535352286e-06, + "loss": 1.215, + "step": 1816 + }, + { + "epoch": 7.3562753036437245, + "grad_norm": 6.074016020941024, + "learning_rate": 1.987822007875617e-06, + "loss": 0.8957, + "step": 1817 + }, + { + "epoch": 7.3603238866396765, + 
"grad_norm": 6.669356187358129, + "learning_rate": 1.982185078126676e-06, + "loss": 1.2878, + "step": 1818 + }, + { + "epoch": 7.364372469635628, + "grad_norm": 5.5205879930863055, + "learning_rate": 1.9765541755464605e-06, + "loss": 1.3594, + "step": 1819 + }, + { + "epoch": 7.368421052631579, + "grad_norm": 5.791173021479898, + "learning_rate": 1.9709293113809876e-06, + "loss": 1.2518, + "step": 1820 + }, + { + "epoch": 7.372469635627531, + "grad_norm": 7.085668027134953, + "learning_rate": 1.965310496864217e-06, + "loss": 1.3044, + "step": 1821 + }, + { + "epoch": 7.376518218623482, + "grad_norm": 6.30070905341863, + "learning_rate": 1.9596977432180212e-06, + "loss": 1.0096, + "step": 1822 + }, + { + "epoch": 7.380566801619433, + "grad_norm": 6.668544077573982, + "learning_rate": 1.954091061652172e-06, + "loss": 1.1521, + "step": 1823 + }, + { + "epoch": 7.384615384615385, + "grad_norm": 5.685627571377497, + "learning_rate": 1.948490463364313e-06, + "loss": 0.9629, + "step": 1824 + }, + { + "epoch": 7.388663967611336, + "grad_norm": 7.099232364097355, + "learning_rate": 1.942895959539939e-06, + "loss": 1.0332, + "step": 1825 + }, + { + "epoch": 7.392712550607287, + "grad_norm": 6.449023103797025, + "learning_rate": 1.9373075613523728e-06, + "loss": 1.219, + "step": 1826 + }, + { + "epoch": 7.396761133603239, + "grad_norm": 7.603243728006548, + "learning_rate": 1.9317252799627393e-06, + "loss": 1.0144, + "step": 1827 + }, + { + "epoch": 7.40080971659919, + "grad_norm": 5.630823437903324, + "learning_rate": 1.9261491265199526e-06, + "loss": 1.0604, + "step": 1828 + }, + { + "epoch": 7.4048582995951415, + "grad_norm": 5.804060941623419, + "learning_rate": 1.920579112160685e-06, + "loss": 1.0906, + "step": 1829 + }, + { + "epoch": 7.4089068825910935, + "grad_norm": 7.107387654645546, + "learning_rate": 1.915015248009348e-06, + "loss": 1.1866, + "step": 1830 + }, + { + "epoch": 7.412955465587045, + "grad_norm": 6.216151169357513, + "learning_rate": 1.9094575451780727e-06, + "loss": 1.0234, + "step": 1831 + }, + { + "epoch": 7.417004048582996, + "grad_norm": 7.173346243896998, + "learning_rate": 1.903906014766681e-06, + "loss": 1.3152, + "step": 1832 + }, + { + "epoch": 7.421052631578947, + "grad_norm": 7.353654026214847, + "learning_rate": 1.8983606678626665e-06, + "loss": 1.3466, + "step": 1833 + }, + { + "epoch": 7.425101214574899, + "grad_norm": 6.168388032585026, + "learning_rate": 1.8928215155411773e-06, + "loss": 1.3615, + "step": 1834 + }, + { + "epoch": 7.42914979757085, + "grad_norm": 7.177909922740221, + "learning_rate": 1.8872885688649879e-06, + "loss": 1.3325, + "step": 1835 + }, + { + "epoch": 7.433198380566802, + "grad_norm": 5.5067246147195315, + "learning_rate": 1.8817618388844783e-06, + "loss": 1.5126, + "step": 1836 + }, + { + "epoch": 7.437246963562753, + "grad_norm": 6.480398605143195, + "learning_rate": 1.8762413366376159e-06, + "loss": 1.2967, + "step": 1837 + }, + { + "epoch": 7.441295546558704, + "grad_norm": 7.239184730466869, + "learning_rate": 1.8707270731499223e-06, + "loss": 1.2391, + "step": 1838 + }, + { + "epoch": 7.445344129554655, + "grad_norm": 5.881764731806458, + "learning_rate": 1.865219059434467e-06, + "loss": 1.4892, + "step": 1839 + }, + { + "epoch": 7.449392712550607, + "grad_norm": 7.287338664223354, + "learning_rate": 1.8597173064918333e-06, + "loss": 1.2865, + "step": 1840 + }, + { + "epoch": 7.4534412955465585, + "grad_norm": 6.989877908949274, + "learning_rate": 1.854221825310103e-06, + "loss": 1.2753, + "step": 1841 + }, + { + "epoch": 
7.4574898785425106, + "grad_norm": 6.967142936381031, + "learning_rate": 1.8487326268648314e-06, + "loss": 1.6209, + "step": 1842 + }, + { + "epoch": 7.461538461538462, + "grad_norm": 9.165493801033026, + "learning_rate": 1.8432497221190227e-06, + "loss": 1.7021, + "step": 1843 + }, + { + "epoch": 7.465587044534413, + "grad_norm": 7.201939055537971, + "learning_rate": 1.8377731220231144e-06, + "loss": 1.4113, + "step": 1844 + }, + { + "epoch": 7.469635627530364, + "grad_norm": 6.447673122675899, + "learning_rate": 1.832302837514952e-06, + "loss": 1.4683, + "step": 1845 + }, + { + "epoch": 7.473684210526316, + "grad_norm": 5.915439909033562, + "learning_rate": 1.8268388795197683e-06, + "loss": 1.4386, + "step": 1846 + }, + { + "epoch": 7.477732793522267, + "grad_norm": 7.791713816072655, + "learning_rate": 1.8213812589501611e-06, + "loss": 1.4409, + "step": 1847 + }, + { + "epoch": 7.481781376518219, + "grad_norm": 5.76907536016399, + "learning_rate": 1.815929986706066e-06, + "loss": 1.357, + "step": 1848 + }, + { + "epoch": 7.48582995951417, + "grad_norm": 6.324576322221301, + "learning_rate": 1.8104850736747458e-06, + "loss": 1.3014, + "step": 1849 + }, + { + "epoch": 7.489878542510121, + "grad_norm": 7.955436278806627, + "learning_rate": 1.8050465307307602e-06, + "loss": 1.2541, + "step": 1850 + }, + { + "epoch": 7.493927125506072, + "grad_norm": 8.3800061367103, + "learning_rate": 1.7996143687359475e-06, + "loss": 1.2069, + "step": 1851 + }, + { + "epoch": 7.497975708502024, + "grad_norm": 5.859852613078974, + "learning_rate": 1.7941885985394025e-06, + "loss": 1.1389, + "step": 1852 + }, + { + "epoch": 7.502024291497976, + "grad_norm": 6.714230939191411, + "learning_rate": 1.78876923097745e-06, + "loss": 0.96, + "step": 1853 + }, + { + "epoch": 7.506072874493928, + "grad_norm": 7.478771265211495, + "learning_rate": 1.783356276873633e-06, + "loss": 1.3238, + "step": 1854 + }, + { + "epoch": 7.510121457489879, + "grad_norm": 6.964602737040841, + "learning_rate": 1.7779497470386826e-06, + "loss": 1.2515, + "step": 1855 + }, + { + "epoch": 7.51417004048583, + "grad_norm": 5.135869484791375, + "learning_rate": 1.7725496522704998e-06, + "loss": 1.2487, + "step": 1856 + }, + { + "epoch": 7.518218623481781, + "grad_norm": 6.736233605627823, + "learning_rate": 1.7671560033541364e-06, + "loss": 1.2647, + "step": 1857 + }, + { + "epoch": 7.522267206477733, + "grad_norm": 7.4340596808517585, + "learning_rate": 1.7617688110617653e-06, + "loss": 1.1983, + "step": 1858 + }, + { + "epoch": 7.526315789473684, + "grad_norm": 7.142575001524021, + "learning_rate": 1.7563880861526656e-06, + "loss": 1.037, + "step": 1859 + }, + { + "epoch": 7.530364372469636, + "grad_norm": 6.461217060280809, + "learning_rate": 1.7510138393732029e-06, + "loss": 1.125, + "step": 1860 + }, + { + "epoch": 7.534412955465587, + "grad_norm": 7.120411669751328, + "learning_rate": 1.7456460814568032e-06, + "loss": 1.1532, + "step": 1861 + }, + { + "epoch": 7.538461538461538, + "grad_norm": 6.677578923600314, + "learning_rate": 1.7402848231239317e-06, + "loss": 1.447, + "step": 1862 + }, + { + "epoch": 7.5425101214574894, + "grad_norm": 5.995680414752151, + "learning_rate": 1.7349300750820758e-06, + "loss": 1.414, + "step": 1863 + }, + { + "epoch": 7.5465587044534415, + "grad_norm": 70.49787838581857, + "learning_rate": 1.7295818480257148e-06, + "loss": 1.9394, + "step": 1864 + }, + { + "epoch": 7.550607287449393, + "grad_norm": 11.227616663799225, + "learning_rate": 1.7242401526363095e-06, + "loss": 1.6974, + "step": 1865 + }, + { + 
"epoch": 7.554655870445345, + "grad_norm": 15.917128296917474, + "learning_rate": 1.7189049995822748e-06, + "loss": 2.0666, + "step": 1866 + }, + { + "epoch": 7.558704453441296, + "grad_norm": 6.5545578057982254, + "learning_rate": 1.7135763995189574e-06, + "loss": 1.2566, + "step": 1867 + }, + { + "epoch": 7.562753036437247, + "grad_norm": 5.608919892200609, + "learning_rate": 1.70825436308862e-06, + "loss": 1.1258, + "step": 1868 + }, + { + "epoch": 7.566801619433198, + "grad_norm": 5.78898827199352, + "learning_rate": 1.70293890092041e-06, + "loss": 1.511, + "step": 1869 + }, + { + "epoch": 7.57085020242915, + "grad_norm": 6.1957471468572605, + "learning_rate": 1.6976300236303505e-06, + "loss": 1.1713, + "step": 1870 + }, + { + "epoch": 7.574898785425101, + "grad_norm": 5.919353556112893, + "learning_rate": 1.692327741821312e-06, + "loss": 1.3418, + "step": 1871 + }, + { + "epoch": 7.578947368421053, + "grad_norm": 4.818508692645506, + "learning_rate": 1.6870320660829908e-06, + "loss": 1.1787, + "step": 1872 + }, + { + "epoch": 7.582995951417004, + "grad_norm": 6.074378707133634, + "learning_rate": 1.6817430069918939e-06, + "loss": 1.2772, + "step": 1873 + }, + { + "epoch": 7.587044534412955, + "grad_norm": 6.043486629250494, + "learning_rate": 1.676460575111306e-06, + "loss": 1.2858, + "step": 1874 + }, + { + "epoch": 7.5910931174089065, + "grad_norm": 6.824574202718084, + "learning_rate": 1.671184780991283e-06, + "loss": 1.2792, + "step": 1875 + }, + { + "epoch": 7.5951417004048585, + "grad_norm": 6.003146333113679, + "learning_rate": 1.6659156351686202e-06, + "loss": 0.9987, + "step": 1876 + }, + { + "epoch": 7.59919028340081, + "grad_norm": 5.257435712843031, + "learning_rate": 1.6606531481668364e-06, + "loss": 1.1001, + "step": 1877 + }, + { + "epoch": 7.603238866396762, + "grad_norm": 5.19698994619142, + "learning_rate": 1.6553973304961528e-06, + "loss": 1.1799, + "step": 1878 + }, + { + "epoch": 7.607287449392713, + "grad_norm": 5.841701091792967, + "learning_rate": 1.6501481926534658e-06, + "loss": 0.9594, + "step": 1879 + }, + { + "epoch": 7.611336032388664, + "grad_norm": 6.19240531240544, + "learning_rate": 1.6449057451223354e-06, + "loss": 1.2521, + "step": 1880 + }, + { + "epoch": 7.615384615384615, + "grad_norm": 5.549994801931837, + "learning_rate": 1.639669998372958e-06, + "loss": 1.2949, + "step": 1881 + }, + { + "epoch": 7.619433198380567, + "grad_norm": 6.675501333896787, + "learning_rate": 1.6344409628621482e-06, + "loss": 1.0393, + "step": 1882 + }, + { + "epoch": 7.623481781376518, + "grad_norm": 6.8185578077235025, + "learning_rate": 1.6292186490333172e-06, + "loss": 1.3907, + "step": 1883 + }, + { + "epoch": 7.62753036437247, + "grad_norm": 5.788785194808056, + "learning_rate": 1.6240030673164492e-06, + "loss": 1.2266, + "step": 1884 + }, + { + "epoch": 7.631578947368421, + "grad_norm": 6.240532210004539, + "learning_rate": 1.6187942281280838e-06, + "loss": 1.4968, + "step": 1885 + }, + { + "epoch": 7.635627530364372, + "grad_norm": 5.438972394942183, + "learning_rate": 1.6135921418712959e-06, + "loss": 1.0917, + "step": 1886 + }, + { + "epoch": 7.6396761133603235, + "grad_norm": 6.412673367253676, + "learning_rate": 1.6083968189356724e-06, + "loss": 1.3789, + "step": 1887 + }, + { + "epoch": 7.6437246963562755, + "grad_norm": 5.536347657482411, + "learning_rate": 1.6032082696972945e-06, + "loss": 1.2638, + "step": 1888 + }, + { + "epoch": 7.647773279352227, + "grad_norm": 6.127206089252584, + "learning_rate": 1.5980265045187139e-06, + "loss": 1.3732, + "step": 
1889 + }, + { + "epoch": 7.651821862348179, + "grad_norm": 5.193216915475832, + "learning_rate": 1.5928515337489292e-06, + "loss": 1.1536, + "step": 1890 + }, + { + "epoch": 7.65587044534413, + "grad_norm": 6.4405008029321635, + "learning_rate": 1.5876833677233754e-06, + "loss": 1.3585, + "step": 1891 + }, + { + "epoch": 7.659919028340081, + "grad_norm": 6.735596126416384, + "learning_rate": 1.5825220167638945e-06, + "loss": 1.1643, + "step": 1892 + }, + { + "epoch": 7.663967611336032, + "grad_norm": 5.578067115309463, + "learning_rate": 1.5773674911787157e-06, + "loss": 1.3335, + "step": 1893 + }, + { + "epoch": 7.668016194331984, + "grad_norm": 5.847753238206834, + "learning_rate": 1.5722198012624418e-06, + "loss": 1.3156, + "step": 1894 + }, + { + "epoch": 7.672064777327935, + "grad_norm": 6.167981268598202, + "learning_rate": 1.567078957296016e-06, + "loss": 1.4919, + "step": 1895 + }, + { + "epoch": 7.676113360323887, + "grad_norm": 5.209386411212645, + "learning_rate": 1.5619449695467142e-06, + "loss": 1.4698, + "step": 1896 + }, + { + "epoch": 7.680161943319838, + "grad_norm": 6.423491328339259, + "learning_rate": 1.556817848268118e-06, + "loss": 1.3083, + "step": 1897 + }, + { + "epoch": 7.684210526315789, + "grad_norm": 6.099826757015211, + "learning_rate": 1.5516976037000941e-06, + "loss": 1.1861, + "step": 1898 + }, + { + "epoch": 7.6882591093117405, + "grad_norm": 5.753586753644626, + "learning_rate": 1.5465842460687786e-06, + "loss": 1.2721, + "step": 1899 + }, + { + "epoch": 7.6923076923076925, + "grad_norm": 6.272583592648715, + "learning_rate": 1.5414777855865466e-06, + "loss": 1.2911, + "step": 1900 + }, + { + "epoch": 7.696356275303644, + "grad_norm": 5.68165710538138, + "learning_rate": 1.5363782324520033e-06, + "loss": 1.1648, + "step": 1901 + }, + { + "epoch": 7.700404858299595, + "grad_norm": 7.460829794563436, + "learning_rate": 1.5312855968499574e-06, + "loss": 1.6084, + "step": 1902 + }, + { + "epoch": 7.704453441295547, + "grad_norm": 6.5692354666682276, + "learning_rate": 1.5261998889514017e-06, + "loss": 1.4184, + "step": 1903 + }, + { + "epoch": 7.708502024291498, + "grad_norm": 6.3186571601325525, + "learning_rate": 1.5211211189134955e-06, + "loss": 1.0412, + "step": 1904 + }, + { + "epoch": 7.712550607287449, + "grad_norm": 5.682537504028156, + "learning_rate": 1.516049296879535e-06, + "loss": 1.1573, + "step": 1905 + }, + { + "epoch": 7.716599190283401, + "grad_norm": 5.812434487226451, + "learning_rate": 1.510984432978947e-06, + "loss": 1.2783, + "step": 1906 + }, + { + "epoch": 7.720647773279352, + "grad_norm": 7.075156192084278, + "learning_rate": 1.5059265373272574e-06, + "loss": 1.0288, + "step": 1907 + }, + { + "epoch": 7.724696356275303, + "grad_norm": 6.467523066478314, + "learning_rate": 1.5008756200260776e-06, + "loss": 1.2684, + "step": 1908 + }, + { + "epoch": 7.728744939271255, + "grad_norm": 5.838154690826828, + "learning_rate": 1.4958316911630827e-06, + "loss": 1.4278, + "step": 1909 + }, + { + "epoch": 7.732793522267206, + "grad_norm": 5.866932075199195, + "learning_rate": 1.4907947608119866e-06, + "loss": 1.1213, + "step": 1910 + }, + { + "epoch": 7.7368421052631575, + "grad_norm": 6.005636196644713, + "learning_rate": 1.4857648390325257e-06, + "loss": 1.2309, + "step": 1911 + }, + { + "epoch": 7.7408906882591095, + "grad_norm": 5.736349178634425, + "learning_rate": 1.4807419358704433e-06, + "loss": 1.8603, + "step": 1912 + }, + { + "epoch": 7.744939271255061, + "grad_norm": 5.608575893991077, + "learning_rate": 1.475726061357463e-06, + 
"loss": 1.4053, + "step": 1913 + }, + { + "epoch": 7.748987854251012, + "grad_norm": 6.949290018272913, + "learning_rate": 1.47071722551127e-06, + "loss": 1.2025, + "step": 1914 + }, + { + "epoch": 7.753036437246964, + "grad_norm": 6.470859543707123, + "learning_rate": 1.4657154383354948e-06, + "loss": 1.1287, + "step": 1915 + }, + { + "epoch": 7.757085020242915, + "grad_norm": 6.10955142295277, + "learning_rate": 1.4607207098196851e-06, + "loss": 1.2334, + "step": 1916 + }, + { + "epoch": 7.761133603238866, + "grad_norm": 6.5763762413068045, + "learning_rate": 1.4557330499392952e-06, + "loss": 1.9826, + "step": 1917 + }, + { + "epoch": 7.765182186234818, + "grad_norm": 7.723579817578996, + "learning_rate": 1.4507524686556612e-06, + "loss": 1.721, + "step": 1918 + }, + { + "epoch": 7.769230769230769, + "grad_norm": 8.397235796894286, + "learning_rate": 1.4457789759159813e-06, + "loss": 1.6659, + "step": 1919 + }, + { + "epoch": 7.77327935222672, + "grad_norm": 5.642365455166119, + "learning_rate": 1.4408125816532981e-06, + "loss": 1.1808, + "step": 1920 + }, + { + "epoch": 7.777327935222672, + "grad_norm": 5.725043241965928, + "learning_rate": 1.435853295786473e-06, + "loss": 1.4747, + "step": 1921 + }, + { + "epoch": 7.781376518218623, + "grad_norm": 5.394430714546486, + "learning_rate": 1.430901128220174e-06, + "loss": 1.4528, + "step": 1922 + }, + { + "epoch": 7.7854251012145745, + "grad_norm": 5.930712388463373, + "learning_rate": 1.4259560888448526e-06, + "loss": 1.2558, + "step": 1923 + }, + { + "epoch": 7.7894736842105265, + "grad_norm": 5.519869867138563, + "learning_rate": 1.4210181875367229e-06, + "loss": 1.1873, + "step": 1924 + }, + { + "epoch": 7.793522267206478, + "grad_norm": 6.265126307081154, + "learning_rate": 1.4160874341577447e-06, + "loss": 1.1916, + "step": 1925 + }, + { + "epoch": 7.797570850202429, + "grad_norm": 6.13894194733797, + "learning_rate": 1.4111638385555965e-06, + "loss": 1.2401, + "step": 1926 + }, + { + "epoch": 7.801619433198381, + "grad_norm": 5.721727948891365, + "learning_rate": 1.406247410563667e-06, + "loss": 1.1375, + "step": 1927 + }, + { + "epoch": 7.805668016194332, + "grad_norm": 5.409329610323807, + "learning_rate": 1.4013381600010278e-06, + "loss": 1.0394, + "step": 1928 + }, + { + "epoch": 7.809716599190283, + "grad_norm": 5.946216975378077, + "learning_rate": 1.396436096672416e-06, + "loss": 1.3717, + "step": 1929 + }, + { + "epoch": 7.813765182186235, + "grad_norm": 7.501336587253134, + "learning_rate": 1.3915412303682162e-06, + "loss": 1.1632, + "step": 1930 + }, + { + "epoch": 7.817813765182186, + "grad_norm": 6.192994323170135, + "learning_rate": 1.3866535708644335e-06, + "loss": 1.095, + "step": 1931 + }, + { + "epoch": 7.821862348178137, + "grad_norm": 14.576419437798382, + "learning_rate": 1.3817731279226843e-06, + "loss": 2.1725, + "step": 1932 + }, + { + "epoch": 7.825910931174089, + "grad_norm": 25.425127776950244, + "learning_rate": 1.376899911290172e-06, + "loss": 3.1191, + "step": 1933 + }, + { + "epoch": 7.82995951417004, + "grad_norm": 6.5130908283906574, + "learning_rate": 1.3720339306996666e-06, + "loss": 1.1065, + "step": 1934 + }, + { + "epoch": 7.834008097165992, + "grad_norm": 6.8625067545378755, + "learning_rate": 1.367175195869488e-06, + "loss": 1.076, + "step": 1935 + }, + { + "epoch": 7.838056680161944, + "grad_norm": 5.862839226770468, + "learning_rate": 1.3623237165034807e-06, + "loss": 1.0877, + "step": 1936 + }, + { + "epoch": 7.842105263157895, + "grad_norm": 5.587464620521552, + "learning_rate": 
1.3574795022910014e-06, + "loss": 1.181, + "step": 1937 + }, + { + "epoch": 7.846153846153846, + "grad_norm": 5.741544735607096, + "learning_rate": 1.3526425629068968e-06, + "loss": 0.9695, + "step": 1938 + }, + { + "epoch": 7.850202429149798, + "grad_norm": 7.078793165923023, + "learning_rate": 1.347812908011485e-06, + "loss": 1.1728, + "step": 1939 + }, + { + "epoch": 7.854251012145749, + "grad_norm": 7.029454395604512, + "learning_rate": 1.3429905472505344e-06, + "loss": 1.2049, + "step": 1940 + }, + { + "epoch": 7.8582995951417, + "grad_norm": 4.858460051035453, + "learning_rate": 1.3381754902552474e-06, + "loss": 1.1544, + "step": 1941 + }, + { + "epoch": 7.862348178137652, + "grad_norm": 6.543690353473279, + "learning_rate": 1.3333677466422357e-06, + "loss": 1.1535, + "step": 1942 + }, + { + "epoch": 7.866396761133603, + "grad_norm": 6.2618770897927165, + "learning_rate": 1.3285673260135073e-06, + "loss": 1.1238, + "step": 1943 + }, + { + "epoch": 7.870445344129554, + "grad_norm": 7.787458993836756, + "learning_rate": 1.323774237956445e-06, + "loss": 1.5443, + "step": 1944 + }, + { + "epoch": 7.874493927125506, + "grad_norm": 6.60339760790844, + "learning_rate": 1.3189884920437867e-06, + "loss": 1.4939, + "step": 1945 + }, + { + "epoch": 7.8785425101214575, + "grad_norm": 6.952377816462855, + "learning_rate": 1.314210097833607e-06, + "loss": 1.2695, + "step": 1946 + }, + { + "epoch": 7.882591093117409, + "grad_norm": 6.440482664289205, + "learning_rate": 1.309439064869295e-06, + "loss": 1.2076, + "step": 1947 + }, + { + "epoch": 7.886639676113361, + "grad_norm": 5.96904543777947, + "learning_rate": 1.3046754026795406e-06, + "loss": 0.8564, + "step": 1948 + }, + { + "epoch": 7.890688259109312, + "grad_norm": 5.611903455141828, + "learning_rate": 1.2999191207783129e-06, + "loss": 1.3827, + "step": 1949 + }, + { + "epoch": 7.894736842105263, + "grad_norm": 5.50366242354655, + "learning_rate": 1.2951702286648399e-06, + "loss": 1.3867, + "step": 1950 + }, + { + "epoch": 7.898785425101215, + "grad_norm": 4.771234777762805, + "learning_rate": 1.290428735823593e-06, + "loss": 1.1739, + "step": 1951 + }, + { + "epoch": 7.902834008097166, + "grad_norm": 5.7833279202719075, + "learning_rate": 1.2856946517242608e-06, + "loss": 1.1495, + "step": 1952 + }, + { + "epoch": 7.906882591093117, + "grad_norm": 6.107712126684077, + "learning_rate": 1.28096798582174e-06, + "loss": 1.1842, + "step": 1953 + }, + { + "epoch": 7.910931174089069, + "grad_norm": 5.059953747053966, + "learning_rate": 1.2762487475561109e-06, + "loss": 0.9544, + "step": 1954 + }, + { + "epoch": 7.91497975708502, + "grad_norm": 5.819489630730656, + "learning_rate": 1.2715369463526173e-06, + "loss": 1.0285, + "step": 1955 + }, + { + "epoch": 7.919028340080971, + "grad_norm": 6.14238425845007, + "learning_rate": 1.2668325916216534e-06, + "loss": 1.0359, + "step": 1956 + }, + { + "epoch": 7.923076923076923, + "grad_norm": 4.708687979766823, + "learning_rate": 1.2621356927587353e-06, + "loss": 1.3581, + "step": 1957 + }, + { + "epoch": 7.9271255060728745, + "grad_norm": 6.6570477016899074, + "learning_rate": 1.257446259144494e-06, + "loss": 1.2012, + "step": 1958 + }, + { + "epoch": 7.931174089068826, + "grad_norm": 6.636474405464404, + "learning_rate": 1.2527643001446493e-06, + "loss": 1.181, + "step": 1959 + }, + { + "epoch": 7.935222672064778, + "grad_norm": 6.89647738144804, + "learning_rate": 1.248089825109991e-06, + "loss": 0.9855, + "step": 1960 + }, + { + "epoch": 7.939271255060729, + "grad_norm": 6.54652294560363, + 
"learning_rate": 1.2434228433763657e-06, + "loss": 1.0055, + "step": 1961 + }, + { + "epoch": 7.94331983805668, + "grad_norm": 7.466794850354919, + "learning_rate": 1.2387633642646501e-06, + "loss": 1.2977, + "step": 1962 + }, + { + "epoch": 7.947368421052632, + "grad_norm": 5.859347969468438, + "learning_rate": 1.2341113970807368e-06, + "loss": 1.0272, + "step": 1963 + }, + { + "epoch": 7.951417004048583, + "grad_norm": 7.526875704374519, + "learning_rate": 1.2294669511155193e-06, + "loss": 0.939, + "step": 1964 + }, + { + "epoch": 7.955465587044534, + "grad_norm": 7.225249295703587, + "learning_rate": 1.224830035644868e-06, + "loss": 1.2616, + "step": 1965 + }, + { + "epoch": 7.959514170040486, + "grad_norm": 6.683599476135708, + "learning_rate": 1.2202006599296122e-06, + "loss": 1.3384, + "step": 1966 + }, + { + "epoch": 7.963562753036437, + "grad_norm": 6.087314726468543, + "learning_rate": 1.215578833215526e-06, + "loss": 1.2777, + "step": 1967 + }, + { + "epoch": 7.967611336032388, + "grad_norm": 7.6203305950770766, + "learning_rate": 1.2109645647333018e-06, + "loss": 1.2766, + "step": 1968 + }, + { + "epoch": 7.97165991902834, + "grad_norm": 7.4075603041461155, + "learning_rate": 1.2063578636985402e-06, + "loss": 1.2, + "step": 1969 + }, + { + "epoch": 7.9757085020242915, + "grad_norm": 5.356896060806783, + "learning_rate": 1.201758739311728e-06, + "loss": 1.2542, + "step": 1970 + }, + { + "epoch": 7.979757085020243, + "grad_norm": 6.6184401008685, + "learning_rate": 1.1971672007582192e-06, + "loss": 1.3138, + "step": 1971 + }, + { + "epoch": 7.983805668016195, + "grad_norm": 5.952389025814739, + "learning_rate": 1.1925832572082184e-06, + "loss": 1.3645, + "step": 1972 + }, + { + "epoch": 7.987854251012146, + "grad_norm": 5.869009321326924, + "learning_rate": 1.1880069178167586e-06, + "loss": 1.1615, + "step": 1973 + }, + { + "epoch": 7.991902834008097, + "grad_norm": 5.240716232576427, + "learning_rate": 1.1834381917236881e-06, + "loss": 1.1793, + "step": 1974 + }, + { + "epoch": 7.995951417004049, + "grad_norm": 6.017014067933477, + "learning_rate": 1.178877088053651e-06, + "loss": 1.5002, + "step": 1975 + }, + { + "epoch": 8.0, + "grad_norm": 5.843845057775898, + "learning_rate": 1.1743236159160654e-06, + "loss": 1.2012, + "step": 1976 + }, + { + "epoch": 8.004048582995951, + "grad_norm": 5.731134271109451, + "learning_rate": 1.1697777844051105e-06, + "loss": 1.2312, + "step": 1977 + }, + { + "epoch": 8.008097165991902, + "grad_norm": 8.123089091980212, + "learning_rate": 1.165239602599702e-06, + "loss": 1.4044, + "step": 1978 + }, + { + "epoch": 8.012145748987853, + "grad_norm": 7.3997346838307045, + "learning_rate": 1.1607090795634802e-06, + "loss": 1.179, + "step": 1979 + }, + { + "epoch": 8.016194331983806, + "grad_norm": 7.893381080795837, + "learning_rate": 1.156186224344789e-06, + "loss": 1.4132, + "step": 1980 + }, + { + "epoch": 8.020242914979757, + "grad_norm": 6.767976836554466, + "learning_rate": 1.1516710459766589e-06, + "loss": 1.5665, + "step": 1981 + }, + { + "epoch": 8.024291497975709, + "grad_norm": 7.315990265277637, + "learning_rate": 1.1471635534767877e-06, + "loss": 1.4869, + "step": 1982 + }, + { + "epoch": 8.02834008097166, + "grad_norm": 6.313789360006903, + "learning_rate": 1.1426637558475206e-06, + "loss": 1.1981, + "step": 1983 + }, + { + "epoch": 8.03238866396761, + "grad_norm": 7.201178679428242, + "learning_rate": 1.138171662075837e-06, + "loss": 1.2025, + "step": 1984 + }, + { + "epoch": 8.036437246963562, + "grad_norm": 6.52677540701035, + 
"learning_rate": 1.133687281133331e-06, + "loss": 1.4043, + "step": 1985 + }, + { + "epoch": 8.040485829959515, + "grad_norm": 6.870989950025807, + "learning_rate": 1.1292106219761928e-06, + "loss": 1.2134, + "step": 1986 + }, + { + "epoch": 8.044534412955466, + "grad_norm": 6.875288304164971, + "learning_rate": 1.1247416935451855e-06, + "loss": 1.3732, + "step": 1987 + }, + { + "epoch": 8.048582995951417, + "grad_norm": 6.61190406665116, + "learning_rate": 1.1202805047656406e-06, + "loss": 1.2149, + "step": 1988 + }, + { + "epoch": 8.052631578947368, + "grad_norm": 7.378314345746476, + "learning_rate": 1.1158270645474233e-06, + "loss": 1.2651, + "step": 1989 + }, + { + "epoch": 8.05668016194332, + "grad_norm": 6.525622834951594, + "learning_rate": 1.1113813817849312e-06, + "loss": 1.1235, + "step": 1990 + }, + { + "epoch": 8.06072874493927, + "grad_norm": 6.596016576904695, + "learning_rate": 1.1069434653570633e-06, + "loss": 1.2623, + "step": 1991 + }, + { + "epoch": 8.064777327935223, + "grad_norm": 7.280600264284795, + "learning_rate": 1.1025133241272113e-06, + "loss": 1.1959, + "step": 1992 + }, + { + "epoch": 8.068825910931174, + "grad_norm": 7.346457081658032, + "learning_rate": 1.0980909669432376e-06, + "loss": 1.3747, + "step": 1993 + }, + { + "epoch": 8.072874493927126, + "grad_norm": 6.3969953968688, + "learning_rate": 1.0936764026374547e-06, + "loss": 1.2673, + "step": 1994 + }, + { + "epoch": 8.076923076923077, + "grad_norm": 7.087695501441698, + "learning_rate": 1.0892696400266151e-06, + "loss": 1.2309, + "step": 1995 + }, + { + "epoch": 8.080971659919028, + "grad_norm": 6.045859729929738, + "learning_rate": 1.0848706879118893e-06, + "loss": 1.3544, + "step": 1996 + }, + { + "epoch": 8.085020242914979, + "grad_norm": 5.591583983778709, + "learning_rate": 1.0804795550788473e-06, + "loss": 1.3016, + "step": 1997 + }, + { + "epoch": 8.089068825910932, + "grad_norm": 6.782167877710207, + "learning_rate": 1.0760962502974453e-06, + "loss": 1.2539, + "step": 1998 + }, + { + "epoch": 8.093117408906883, + "grad_norm": 6.256971136931457, + "learning_rate": 1.0717207823220005e-06, + "loss": 1.3311, + "step": 1999 + }, + { + "epoch": 8.097165991902834, + "grad_norm": 5.902238719165329, + "learning_rate": 1.0673531598911824e-06, + "loss": 1.0787, + "step": 2000 + }, + { + "epoch": 8.101214574898785, + "grad_norm": 6.625744512089742, + "learning_rate": 1.0629933917279906e-06, + "loss": 1.2767, + "step": 2001 + }, + { + "epoch": 8.105263157894736, + "grad_norm": 7.073772146380111, + "learning_rate": 1.0586414865397381e-06, + "loss": 1.1861, + "step": 2002 + }, + { + "epoch": 8.109311740890687, + "grad_norm": 6.262732530690249, + "learning_rate": 1.0542974530180327e-06, + "loss": 1.2172, + "step": 2003 + }, + { + "epoch": 8.11336032388664, + "grad_norm": 7.393380584551558, + "learning_rate": 1.0499612998387621e-06, + "loss": 1.1485, + "step": 2004 + }, + { + "epoch": 8.117408906882591, + "grad_norm": 6.857359399326426, + "learning_rate": 1.0456330356620758e-06, + "loss": 1.0672, + "step": 2005 + }, + { + "epoch": 8.121457489878543, + "grad_norm": 6.5740346675087205, + "learning_rate": 1.0413126691323667e-06, + "loss": 1.1479, + "step": 2006 + }, + { + "epoch": 8.125506072874494, + "grad_norm": 6.267695688330783, + "learning_rate": 1.0370002088782555e-06, + "loss": 1.165, + "step": 2007 + }, + { + "epoch": 8.129554655870445, + "grad_norm": 7.133762320289656, + "learning_rate": 1.0326956635125707e-06, + "loss": 1.0247, + "step": 2008 + }, + { + "epoch": 8.133603238866396, + "grad_norm": 
5.586702351654256, + "learning_rate": 1.0283990416323336e-06, + "loss": 1.3881, + "step": 2009 + }, + { + "epoch": 8.137651821862349, + "grad_norm": 6.806616706670472, + "learning_rate": 1.0241103518187433e-06, + "loss": 1.1919, + "step": 2010 + }, + { + "epoch": 8.1417004048583, + "grad_norm": 5.430435103612442, + "learning_rate": 1.019829602637154e-06, + "loss": 0.9674, + "step": 2011 + }, + { + "epoch": 8.145748987854251, + "grad_norm": 7.14447897307659, + "learning_rate": 1.0155568026370637e-06, + "loss": 1.2791, + "step": 2012 + }, + { + "epoch": 8.149797570850202, + "grad_norm": 6.3472462119415525, + "learning_rate": 1.0112919603520898e-06, + "loss": 1.1158, + "step": 2013 + }, + { + "epoch": 8.153846153846153, + "grad_norm": 5.6608952411216125, + "learning_rate": 1.0070350842999622e-06, + "loss": 1.357, + "step": 2014 + }, + { + "epoch": 8.157894736842104, + "grad_norm": 7.080132640290096, + "learning_rate": 1.0027861829824953e-06, + "loss": 1.3434, + "step": 2015 + }, + { + "epoch": 8.161943319838057, + "grad_norm": 5.51924122267234, + "learning_rate": 9.985452648855803e-07, + "loss": 1.1787, + "step": 2016 + }, + { + "epoch": 8.165991902834008, + "grad_norm": 6.025963555073775, + "learning_rate": 9.943123384791632e-07, + "loss": 1.2719, + "step": 2017 + }, + { + "epoch": 8.17004048582996, + "grad_norm": 5.336299411323149, + "learning_rate": 9.900874122172294e-07, + "loss": 1.8638, + "step": 2018 + }, + { + "epoch": 8.17408906882591, + "grad_norm": 6.492484439551155, + "learning_rate": 9.85870494537784e-07, + "loss": 1.0806, + "step": 2019 + }, + { + "epoch": 8.178137651821862, + "grad_norm": 7.726948183355687, + "learning_rate": 9.816615938628409e-07, + "loss": 1.2902, + "step": 2020 + }, + { + "epoch": 8.182186234817813, + "grad_norm": 5.250851031576059, + "learning_rate": 9.774607185984004e-07, + "loss": 1.0877, + "step": 2021 + }, + { + "epoch": 8.186234817813766, + "grad_norm": 6.904468404911272, + "learning_rate": 9.732678771344344e-07, + "loss": 1.2729, + "step": 2022 + }, + { + "epoch": 8.190283400809717, + "grad_norm": 6.477961038997859, + "learning_rate": 9.690830778448723e-07, + "loss": 1.2954, + "step": 2023 + }, + { + "epoch": 8.194331983805668, + "grad_norm": 6.901383952123393, + "learning_rate": 9.649063290875771e-07, + "loss": 1.4598, + "step": 2024 + }, + { + "epoch": 8.19838056680162, + "grad_norm": 5.697192396337908, + "learning_rate": 9.607376392043366e-07, + "loss": 1.5219, + "step": 2025 + }, + { + "epoch": 8.20242914979757, + "grad_norm": 5.828533006715791, + "learning_rate": 9.565770165208432e-07, + "loss": 1.2267, + "step": 2026 + }, + { + "epoch": 8.206477732793521, + "grad_norm": 5.737519140655703, + "learning_rate": 9.524244693466773e-07, + "loss": 1.5547, + "step": 2027 + }, + { + "epoch": 8.210526315789474, + "grad_norm": 5.906148707150362, + "learning_rate": 9.482800059752911e-07, + "loss": 1.5423, + "step": 2028 + }, + { + "epoch": 8.214574898785425, + "grad_norm": 6.246342403732729, + "learning_rate": 9.441436346839894e-07, + "loss": 1.3284, + "step": 2029 + }, + { + "epoch": 8.218623481781377, + "grad_norm": 6.542108201095842, + "learning_rate": 9.400153637339182e-07, + "loss": 1.2057, + "step": 2030 + }, + { + "epoch": 8.222672064777328, + "grad_norm": 6.355801787175163, + "learning_rate": 9.358952013700462e-07, + "loss": 1.1541, + "step": 2031 + }, + { + "epoch": 8.226720647773279, + "grad_norm": 6.0083830127963465, + "learning_rate": 9.317831558211449e-07, + "loss": 1.3599, + "step": 2032 + }, + { + "epoch": 8.23076923076923, + "grad_norm": 
6.143312349563429, + "learning_rate": 9.276792352997782e-07, + "loss": 1.1424, + "step": 2033 + }, + { + "epoch": 8.234817813765183, + "grad_norm": 7.026565648122738, + "learning_rate": 9.235834480022788e-07, + "loss": 1.361, + "step": 2034 + }, + { + "epoch": 8.238866396761134, + "grad_norm": 6.79010834147561, + "learning_rate": 9.19495802108738e-07, + "loss": 1.2944, + "step": 2035 + }, + { + "epoch": 8.242914979757085, + "grad_norm": 6.262899466718926, + "learning_rate": 9.154163057829879e-07, + "loss": 1.3301, + "step": 2036 + }, + { + "epoch": 8.246963562753036, + "grad_norm": 5.784122192100412, + "learning_rate": 9.113449671725832e-07, + "loss": 1.1986, + "step": 2037 + }, + { + "epoch": 8.251012145748987, + "grad_norm": 6.13085712005476, + "learning_rate": 9.072817944087875e-07, + "loss": 1.284, + "step": 2038 + }, + { + "epoch": 8.255060728744938, + "grad_norm": 6.317294175666071, + "learning_rate": 9.032267956065516e-07, + "loss": 1.2274, + "step": 2039 + }, + { + "epoch": 8.259109311740891, + "grad_norm": 5.586217657876971, + "learning_rate": 8.991799788645067e-07, + "loss": 1.1896, + "step": 2040 + }, + { + "epoch": 8.263157894736842, + "grad_norm": 6.088327462827803, + "learning_rate": 8.951413522649372e-07, + "loss": 0.9771, + "step": 2041 + }, + { + "epoch": 8.267206477732794, + "grad_norm": 5.76590382121624, + "learning_rate": 8.911109238737748e-07, + "loss": 1.2758, + "step": 2042 + }, + { + "epoch": 8.271255060728745, + "grad_norm": 6.211464855564121, + "learning_rate": 8.870887017405761e-07, + "loss": 1.273, + "step": 2043 + }, + { + "epoch": 8.275303643724696, + "grad_norm": 6.06402110401488, + "learning_rate": 8.830746938985091e-07, + "loss": 1.356, + "step": 2044 + }, + { + "epoch": 8.279352226720647, + "grad_norm": 7.891296000946273, + "learning_rate": 8.790689083643328e-07, + "loss": 1.2355, + "step": 2045 + }, + { + "epoch": 8.2834008097166, + "grad_norm": 6.919823315708994, + "learning_rate": 8.750713531383886e-07, + "loss": 0.9371, + "step": 2046 + }, + { + "epoch": 8.287449392712551, + "grad_norm": 7.599246572003176, + "learning_rate": 8.710820362045791e-07, + "loss": 1.0832, + "step": 2047 + }, + { + "epoch": 8.291497975708502, + "grad_norm": 7.084253293886639, + "learning_rate": 8.671009655303531e-07, + "loss": 0.9594, + "step": 2048 + }, + { + "epoch": 8.295546558704453, + "grad_norm": 7.266404675494076, + "learning_rate": 8.631281490666915e-07, + "loss": 1.1647, + "step": 2049 + }, + { + "epoch": 8.299595141700404, + "grad_norm": 6.465250431675959, + "learning_rate": 8.591635947480854e-07, + "loss": 1.4079, + "step": 2050 + }, + { + "epoch": 8.303643724696355, + "grad_norm": 7.279071790902037, + "learning_rate": 8.552073104925296e-07, + "loss": 1.0049, + "step": 2051 + }, + { + "epoch": 8.307692307692308, + "grad_norm": 6.756555724272831, + "learning_rate": 8.512593042015005e-07, + "loss": 1.0616, + "step": 2052 + }, + { + "epoch": 8.31174089068826, + "grad_norm": 6.254507577162332, + "learning_rate": 8.473195837599419e-07, + "loss": 1.1174, + "step": 2053 + }, + { + "epoch": 8.31578947368421, + "grad_norm": 7.727840849711051, + "learning_rate": 8.433881570362484e-07, + "loss": 0.9914, + "step": 2054 + }, + { + "epoch": 8.319838056680162, + "grad_norm": 6.756642529850463, + "learning_rate": 8.3946503188225e-07, + "loss": 0.9647, + "step": 2055 + }, + { + "epoch": 8.323886639676113, + "grad_norm": 6.963802700325999, + "learning_rate": 8.355502161331985e-07, + "loss": 1.0237, + "step": 2056 + }, + { + "epoch": 8.327935222672064, + "grad_norm": 
6.731503726472556, + "learning_rate": 8.316437176077491e-07, + "loss": 1.0387, + "step": 2057 + }, + { + "epoch": 8.331983805668017, + "grad_norm": 6.8290712150235375, + "learning_rate": 8.277455441079463e-07, + "loss": 1.0816, + "step": 2058 + }, + { + "epoch": 8.336032388663968, + "grad_norm": 6.2806024635481625, + "learning_rate": 8.238557034192085e-07, + "loss": 1.189, + "step": 2059 + }, + { + "epoch": 8.34008097165992, + "grad_norm": 6.184192289516359, + "learning_rate": 8.199742033103091e-07, + "loss": 1.1119, + "step": 2060 + }, + { + "epoch": 8.34412955465587, + "grad_norm": 6.540385862485887, + "learning_rate": 8.161010515333662e-07, + "loss": 1.0109, + "step": 2061 + }, + { + "epoch": 8.348178137651821, + "grad_norm": 7.727191651616888, + "learning_rate": 8.12236255823825e-07, + "loss": 1.1502, + "step": 2062 + }, + { + "epoch": 8.352226720647772, + "grad_norm": 6.9096677414157535, + "learning_rate": 8.083798239004408e-07, + "loss": 1.1497, + "step": 2063 + }, + { + "epoch": 8.356275303643725, + "grad_norm": 6.414815740722037, + "learning_rate": 8.045317634652661e-07, + "loss": 0.842, + "step": 2064 + }, + { + "epoch": 8.360323886639677, + "grad_norm": 7.487469807700361, + "learning_rate": 8.006920822036307e-07, + "loss": 1.2308, + "step": 2065 + }, + { + "epoch": 8.364372469635628, + "grad_norm": 6.046114352668178, + "learning_rate": 7.968607877841333e-07, + "loss": 1.3044, + "step": 2066 + }, + { + "epoch": 8.368421052631579, + "grad_norm": 6.435316234936995, + "learning_rate": 7.930378878586198e-07, + "loss": 1.1938, + "step": 2067 + }, + { + "epoch": 8.37246963562753, + "grad_norm": 7.906119816359948, + "learning_rate": 7.89223390062172e-07, + "loss": 1.2389, + "step": 2068 + }, + { + "epoch": 8.376518218623481, + "grad_norm": 6.803177488562893, + "learning_rate": 7.854173020130906e-07, + "loss": 0.9517, + "step": 2069 + }, + { + "epoch": 8.380566801619434, + "grad_norm": 7.234612181909552, + "learning_rate": 7.816196313128821e-07, + "loss": 1.0982, + "step": 2070 + }, + { + "epoch": 8.384615384615385, + "grad_norm": 6.204452258594293, + "learning_rate": 7.778303855462382e-07, + "loss": 0.913, + "step": 2071 + }, + { + "epoch": 8.388663967611336, + "grad_norm": 7.6652434424714375, + "learning_rate": 7.740495722810271e-07, + "loss": 0.9799, + "step": 2072 + }, + { + "epoch": 8.392712550607287, + "grad_norm": 7.028162715790928, + "learning_rate": 7.702771990682745e-07, + "loss": 1.1741, + "step": 2073 + }, + { + "epoch": 8.396761133603238, + "grad_norm": 8.20693379504055, + "learning_rate": 7.66513273442151e-07, + "loss": 0.9586, + "step": 2074 + }, + { + "epoch": 8.40080971659919, + "grad_norm": 6.1595198687647255, + "learning_rate": 7.627578029199562e-07, + "loss": 1.0087, + "step": 2075 + }, + { + "epoch": 8.404858299595142, + "grad_norm": 6.186971527710178, + "learning_rate": 7.590107950020987e-07, + "loss": 1.0385, + "step": 2076 + }, + { + "epoch": 8.408906882591094, + "grad_norm": 7.634025112115446, + "learning_rate": 7.552722571720899e-07, + "loss": 1.1273, + "step": 2077 + }, + { + "epoch": 8.412955465587045, + "grad_norm": 6.509921676918103, + "learning_rate": 7.515421968965242e-07, + "loss": 0.9676, + "step": 2078 + }, + { + "epoch": 8.417004048582996, + "grad_norm": 7.549987787462475, + "learning_rate": 7.478206216250644e-07, + "loss": 1.2442, + "step": 2079 + }, + { + "epoch": 8.421052631578947, + "grad_norm": 7.555658247599405, + "learning_rate": 7.441075387904267e-07, + "loss": 1.2719, + "step": 2080 + }, + { + "epoch": 8.425101214574898, + "grad_norm": 
6.5153535549196215, + "learning_rate": 7.404029558083653e-07, + "loss": 1.3106, + "step": 2081 + }, + { + "epoch": 8.429149797570851, + "grad_norm": 7.532432754634663, + "learning_rate": 7.367068800776594e-07, + "loss": 1.2708, + "step": 2082 + }, + { + "epoch": 8.433198380566802, + "grad_norm": 5.774787588818044, + "learning_rate": 7.330193189800994e-07, + "loss": 1.4544, + "step": 2083 + }, + { + "epoch": 8.437246963562753, + "grad_norm": 6.8245265398524495, + "learning_rate": 7.293402798804667e-07, + "loss": 1.2466, + "step": 2084 + }, + { + "epoch": 8.441295546558704, + "grad_norm": 7.77462252770274, + "learning_rate": 7.25669770126527e-07, + "loss": 1.1822, + "step": 2085 + }, + { + "epoch": 8.445344129554655, + "grad_norm": 6.148198383672424, + "learning_rate": 7.220077970490058e-07, + "loss": 1.4383, + "step": 2086 + }, + { + "epoch": 8.449392712550607, + "grad_norm": 7.866867275378799, + "learning_rate": 7.183543679615834e-07, + "loss": 1.2326, + "step": 2087 + }, + { + "epoch": 8.45344129554656, + "grad_norm": 7.546182288687263, + "learning_rate": 7.147094901608748e-07, + "loss": 1.2273, + "step": 2088 + }, + { + "epoch": 8.45748987854251, + "grad_norm": 7.579378068549671, + "learning_rate": 7.110731709264163e-07, + "loss": 1.57, + "step": 2089 + }, + { + "epoch": 8.461538461538462, + "grad_norm": 10.015081237740555, + "learning_rate": 7.074454175206524e-07, + "loss": 1.6365, + "step": 2090 + }, + { + "epoch": 8.465587044534413, + "grad_norm": 8.048721435929364, + "learning_rate": 7.03826237188916e-07, + "loss": 1.3541, + "step": 2091 + }, + { + "epoch": 8.469635627530364, + "grad_norm": 7.174284170679616, + "learning_rate": 7.002156371594237e-07, + "loss": 1.4242, + "step": 2092 + }, + { + "epoch": 8.473684210526315, + "grad_norm": 6.580944869519028, + "learning_rate": 6.966136246432492e-07, + "loss": 1.3988, + "step": 2093 + }, + { + "epoch": 8.477732793522268, + "grad_norm": 8.860518049025139, + "learning_rate": 6.930202068343206e-07, + "loss": 1.387, + "step": 2094 + }, + { + "epoch": 8.481781376518219, + "grad_norm": 6.5840932268783785, + "learning_rate": 6.894353909093976e-07, + "loss": 1.3236, + "step": 2095 + }, + { + "epoch": 8.48582995951417, + "grad_norm": 7.278031801942817, + "learning_rate": 6.858591840280627e-07, + "loss": 1.2652, + "step": 2096 + }, + { + "epoch": 8.489878542510121, + "grad_norm": 9.369040288696132, + "learning_rate": 6.822915933327012e-07, + "loss": 1.2337, + "step": 2097 + }, + { + "epoch": 8.493927125506072, + "grad_norm": 9.575953105863487, + "learning_rate": 6.787326259484922e-07, + "loss": 1.154, + "step": 2098 + }, + { + "epoch": 8.497975708502024, + "grad_norm": 6.611344155786181, + "learning_rate": 6.751822889833926e-07, + "loss": 1.0993, + "step": 2099 + }, + { + "epoch": 8.502024291497976, + "grad_norm": 7.493254499061418, + "learning_rate": 6.716405895281225e-07, + "loss": 0.9173, + "step": 2100 + }, + { + "epoch": 8.506072874493928, + "grad_norm": 8.70918178876987, + "learning_rate": 6.681075346561517e-07, + "loss": 1.2742, + "step": 2101 + }, + { + "epoch": 8.510121457489879, + "grad_norm": 8.138470526559217, + "learning_rate": 6.645831314236817e-07, + "loss": 1.2013, + "step": 2102 + }, + { + "epoch": 8.51417004048583, + "grad_norm": 6.09672586428882, + "learning_rate": 6.610673868696387e-07, + "loss": 1.2136, + "step": 2103 + }, + { + "epoch": 8.518218623481781, + "grad_norm": 7.785839558420341, + "learning_rate": 6.57560308015655e-07, + "loss": 1.2265, + "step": 2104 + }, + { + "epoch": 8.522267206477732, + "grad_norm": 
8.953731258403018, + "learning_rate": 6.540619018660555e-07, + "loss": 1.15, + "step": 2105 + }, + { + "epoch": 8.526315789473685, + "grad_norm": 8.20742839178438, + "learning_rate": 6.505721754078443e-07, + "loss": 0.9784, + "step": 2106 + }, + { + "epoch": 8.530364372469636, + "grad_norm": 7.275127493366391, + "learning_rate": 6.470911356106885e-07, + "loss": 1.0741, + "step": 2107 + }, + { + "epoch": 8.534412955465587, + "grad_norm": 8.35403564959096, + "learning_rate": 6.436187894269086e-07, + "loss": 1.0919, + "step": 2108 + }, + { + "epoch": 8.538461538461538, + "grad_norm": 7.970907113068403, + "learning_rate": 6.401551437914621e-07, + "loss": 1.3919, + "step": 2109 + }, + { + "epoch": 8.54251012145749, + "grad_norm": 7.075118724352275, + "learning_rate": 6.367002056219285e-07, + "loss": 1.3732, + "step": 2110 + }, + { + "epoch": 8.54655870445344, + "grad_norm": 41.2546881055469, + "learning_rate": 6.332539818184985e-07, + "loss": 1.9685, + "step": 2111 + }, + { + "epoch": 8.550607287449393, + "grad_norm": 12.366409360208545, + "learning_rate": 6.298164792639555e-07, + "loss": 1.6408, + "step": 2112 + }, + { + "epoch": 8.554655870445345, + "grad_norm": 11.261791719061787, + "learning_rate": 6.263877048236683e-07, + "loss": 2.024, + "step": 2113 + }, + { + "epoch": 8.558704453441296, + "grad_norm": 7.759335659536259, + "learning_rate": 6.229676653455719e-07, + "loss": 1.2075, + "step": 2114 + }, + { + "epoch": 8.562753036437247, + "grad_norm": 6.714063146216, + "learning_rate": 6.195563676601563e-07, + "loss": 1.0819, + "step": 2115 + }, + { + "epoch": 8.566801619433198, + "grad_norm": 6.884464537310286, + "learning_rate": 6.161538185804544e-07, + "loss": 1.4577, + "step": 2116 + }, + { + "epoch": 8.570850202429149, + "grad_norm": 7.270354106890159, + "learning_rate": 6.127600249020216e-07, + "loss": 1.1355, + "step": 2117 + }, + { + "epoch": 8.574898785425102, + "grad_norm": 7.112708598170683, + "learning_rate": 6.09374993402932e-07, + "loss": 1.2906, + "step": 2118 + }, + { + "epoch": 8.578947368421053, + "grad_norm": 5.581752289480177, + "learning_rate": 6.059987308437565e-07, + "loss": 1.1301, + "step": 2119 + }, + { + "epoch": 8.582995951417004, + "grad_norm": 7.0619445923701205, + "learning_rate": 6.026312439675553e-07, + "loss": 1.2177, + "step": 2120 + }, + { + "epoch": 8.587044534412955, + "grad_norm": 7.241742089371334, + "learning_rate": 5.992725394998594e-07, + "loss": 1.2383, + "step": 2121 + }, + { + "epoch": 8.591093117408906, + "grad_norm": 7.799009343624035, + "learning_rate": 5.959226241486632e-07, + "loss": 1.2195, + "step": 2122 + }, + { + "epoch": 8.595141700404858, + "grad_norm": 6.849477991699955, + "learning_rate": 5.925815046044026e-07, + "loss": 0.9419, + "step": 2123 + }, + { + "epoch": 8.59919028340081, + "grad_norm": 5.8814800267073, + "learning_rate": 5.892491875399503e-07, + "loss": 1.0593, + "step": 2124 + }, + { + "epoch": 8.603238866396762, + "grad_norm": 6.030196066850317, + "learning_rate": 5.859256796105972e-07, + "loss": 1.141, + "step": 2125 + }, + { + "epoch": 8.607287449392713, + "grad_norm": 6.614777125339002, + "learning_rate": 5.826109874540409e-07, + "loss": 0.9086, + "step": 2126 + }, + { + "epoch": 8.611336032388664, + "grad_norm": 6.948469573714514, + "learning_rate": 5.793051176903736e-07, + "loss": 1.1918, + "step": 2127 + }, + { + "epoch": 8.615384615384615, + "grad_norm": 6.3016655419950425, + "learning_rate": 5.760080769220644e-07, + "loss": 1.2405, + "step": 2128 + }, + { + "epoch": 8.619433198380566, + "grad_norm": 
7.302956053566254, + "learning_rate": 5.727198717339511e-07, + "loss": 0.9801, + "step": 2129 + }, + { + "epoch": 8.623481781376519, + "grad_norm": 7.80182069701434, + "learning_rate": 5.694405086932248e-07, + "loss": 1.3353, + "step": 2130 + }, + { + "epoch": 8.62753036437247, + "grad_norm": 6.639707488466264, + "learning_rate": 5.661699943494181e-07, + "loss": 1.1811, + "step": 2131 + }, + { + "epoch": 8.631578947368421, + "grad_norm": 6.899791220581512, + "learning_rate": 5.6290833523439e-07, + "loss": 1.4413, + "step": 2132 + }, + { + "epoch": 8.635627530364372, + "grad_norm": 5.939504101979065, + "learning_rate": 5.596555378623126e-07, + "loss": 1.0383, + "step": 2133 + }, + { + "epoch": 8.639676113360323, + "grad_norm": 6.991218523438056, + "learning_rate": 5.564116087296618e-07, + "loss": 1.3329, + "step": 2134 + }, + { + "epoch": 8.643724696356275, + "grad_norm": 6.159504500789582, + "learning_rate": 5.531765543152002e-07, + "loss": 1.2145, + "step": 2135 + }, + { + "epoch": 8.647773279352228, + "grad_norm": 6.654163633145904, + "learning_rate": 5.499503810799667e-07, + "loss": 1.3227, + "step": 2136 + }, + { + "epoch": 8.651821862348179, + "grad_norm": 5.691830923037683, + "learning_rate": 5.467330954672639e-07, + "loss": 1.1114, + "step": 2137 + }, + { + "epoch": 8.65587044534413, + "grad_norm": 6.9549489463225775, + "learning_rate": 5.435247039026398e-07, + "loss": 1.3092, + "step": 2138 + }, + { + "epoch": 8.65991902834008, + "grad_norm": 7.2060016669462295, + "learning_rate": 5.403252127938841e-07, + "loss": 1.1114, + "step": 2139 + }, + { + "epoch": 8.663967611336032, + "grad_norm": 5.860230883158528, + "learning_rate": 5.371346285310075e-07, + "loss": 1.2813, + "step": 2140 + }, + { + "epoch": 8.668016194331983, + "grad_norm": 6.007997589736958, + "learning_rate": 5.33952957486234e-07, + "loss": 1.2645, + "step": 2141 + }, + { + "epoch": 8.672064777327936, + "grad_norm": 6.380138387786136, + "learning_rate": 5.30780206013985e-07, + "loss": 1.4488, + "step": 2142 + }, + { + "epoch": 8.676113360323887, + "grad_norm": 5.389279452380071, + "learning_rate": 5.276163804508671e-07, + "loss": 1.436, + "step": 2143 + }, + { + "epoch": 8.680161943319838, + "grad_norm": 6.700282421943868, + "learning_rate": 5.244614871156612e-07, + "loss": 1.2596, + "step": 2144 + }, + { + "epoch": 8.68421052631579, + "grad_norm": 6.474231798689995, + "learning_rate": 5.213155323093094e-07, + "loss": 1.1446, + "step": 2145 + }, + { + "epoch": 8.68825910931174, + "grad_norm": 6.124423836263308, + "learning_rate": 5.181785223148999e-07, + "loss": 1.2253, + "step": 2146 + }, + { + "epoch": 8.692307692307692, + "grad_norm": 6.653036436702035, + "learning_rate": 5.150504633976572e-07, + "loss": 1.2426, + "step": 2147 + }, + { + "epoch": 8.696356275303645, + "grad_norm": 5.972447093623402, + "learning_rate": 5.119313618049309e-07, + "loss": 1.126, + "step": 2148 + }, + { + "epoch": 8.700404858299596, + "grad_norm": 7.753561648240651, + "learning_rate": 5.088212237661766e-07, + "loss": 1.5606, + "step": 2149 + }, + { + "epoch": 8.704453441295547, + "grad_norm": 6.583094267044065, + "learning_rate": 5.057200554929509e-07, + "loss": 1.3713, + "step": 2150 + }, + { + "epoch": 8.708502024291498, + "grad_norm": 6.552630239242199, + "learning_rate": 5.026278631788967e-07, + "loss": 1.001, + "step": 2151 + }, + { + "epoch": 8.712550607287449, + "grad_norm": 6.025634072841221, + "learning_rate": 4.995446529997283e-07, + "loss": 1.1171, + "step": 2152 + }, + { + "epoch": 8.7165991902834, + "grad_norm": 
6.005875203639586, + "learning_rate": 4.964704311132224e-07, + "loss": 1.2367, + "step": 2153 + }, + { + "epoch": 8.720647773279353, + "grad_norm": 7.40464404684989, + "learning_rate": 4.934052036592018e-07, + "loss": 0.9859, + "step": 2154 + }, + { + "epoch": 8.724696356275304, + "grad_norm": 6.8264153711344155, + "learning_rate": 4.903489767595287e-07, + "loss": 1.2321, + "step": 2155 + }, + { + "epoch": 8.728744939271255, + "grad_norm": 6.019858796316467, + "learning_rate": 4.873017565180871e-07, + "loss": 1.3846, + "step": 2156 + }, + { + "epoch": 8.732793522267206, + "grad_norm": 6.09572311582674, + "learning_rate": 4.842635490207747e-07, + "loss": 1.0707, + "step": 2157 + }, + { + "epoch": 8.736842105263158, + "grad_norm": 6.364401455402685, + "learning_rate": 4.812343603354896e-07, + "loss": 1.1865, + "step": 2158 + }, + { + "epoch": 8.740890688259109, + "grad_norm": 5.987439148552965, + "learning_rate": 4.782141965121129e-07, + "loss": 1.8135, + "step": 2159 + }, + { + "epoch": 8.744939271255062, + "grad_norm": 5.829819359945272, + "learning_rate": 4.752030635825067e-07, + "loss": 1.3722, + "step": 2160 + }, + { + "epoch": 8.748987854251013, + "grad_norm": 7.34437549498405, + "learning_rate": 4.7220096756049384e-07, + "loss": 1.1621, + "step": 2161 + }, + { + "epoch": 8.753036437246964, + "grad_norm": 6.883954168990045, + "learning_rate": 4.6920791444184934e-07, + "loss": 1.0939, + "step": 2162 + }, + { + "epoch": 8.757085020242915, + "grad_norm": 6.597939900390535, + "learning_rate": 4.662239102042887e-07, + "loss": 1.1937, + "step": 2163 + }, + { + "epoch": 8.761133603238866, + "grad_norm": 7.598759077469902, + "learning_rate": 4.6324896080745254e-07, + "loss": 1.9407, + "step": 2164 + }, + { + "epoch": 8.765182186234817, + "grad_norm": 8.104460939111814, + "learning_rate": 4.602830721928997e-07, + "loss": 1.6736, + "step": 2165 + }, + { + "epoch": 8.76923076923077, + "grad_norm": 8.243030121308468, + "learning_rate": 4.573262502840914e-07, + "loss": 1.6042, + "step": 2166 + }, + { + "epoch": 8.773279352226721, + "grad_norm": 5.94145259338888, + "learning_rate": 4.54378500986381e-07, + "loss": 1.144, + "step": 2167 + }, + { + "epoch": 8.777327935222672, + "grad_norm": 6.040011156691804, + "learning_rate": 4.5143983018700485e-07, + "loss": 1.4426, + "step": 2168 + }, + { + "epoch": 8.781376518218623, + "grad_norm": 5.588028348172228, + "learning_rate": 4.48510243755062e-07, + "loss": 1.4161, + "step": 2169 + }, + { + "epoch": 8.785425101214575, + "grad_norm": 6.060131286333249, + "learning_rate": 4.455897475415133e-07, + "loss": 1.2124, + "step": 2170 + }, + { + "epoch": 8.789473684210526, + "grad_norm": 5.82973877308885, + "learning_rate": 4.4267834737916295e-07, + "loss": 1.1482, + "step": 2171 + }, + { + "epoch": 8.793522267206479, + "grad_norm": 6.657446024745296, + "learning_rate": 4.39776049082648e-07, + "loss": 1.1481, + "step": 2172 + }, + { + "epoch": 8.79757085020243, + "grad_norm": 6.3615249922064825, + "learning_rate": 4.3688285844842747e-07, + "loss": 1.1969, + "step": 2173 + }, + { + "epoch": 8.80161943319838, + "grad_norm": 6.112293290660962, + "learning_rate": 4.33998781254773e-07, + "loss": 1.101, + "step": 2174 + }, + { + "epoch": 8.805668016194332, + "grad_norm": 5.743019279125439, + "learning_rate": 4.3112382326174987e-07, + "loss": 1.0094, + "step": 2175 + }, + { + "epoch": 8.809716599190283, + "grad_norm": 6.20487988377178, + "learning_rate": 4.2825799021121493e-07, + "loss": 1.3299, + "step": 2176 + }, + { + "epoch": 8.813765182186234, + "grad_norm": 
8.142675422332248, + "learning_rate": 4.2540128782679934e-07, + "loss": 1.1159, + "step": 2177 + }, + { + "epoch": 8.817813765182187, + "grad_norm": 6.620797295984167, + "learning_rate": 4.225537218138981e-07, + "loss": 1.0554, + "step": 2178 + }, + { + "epoch": 8.821862348178138, + "grad_norm": 15.187486943860346, + "learning_rate": 4.197152978596608e-07, + "loss": 2.1386, + "step": 2179 + }, + { + "epoch": 8.82591093117409, + "grad_norm": 25.220825856193247, + "learning_rate": 4.1688602163297564e-07, + "loss": 3.054, + "step": 2180 + }, + { + "epoch": 8.82995951417004, + "grad_norm": 6.966275914312463, + "learning_rate": 4.1406589878446257e-07, + "loss": 1.0687, + "step": 2181 + }, + { + "epoch": 8.834008097165992, + "grad_norm": 7.297199095572435, + "learning_rate": 4.112549349464606e-07, + "loss": 1.046, + "step": 2182 + }, + { + "epoch": 8.838056680161943, + "grad_norm": 6.240254431183654, + "learning_rate": 4.0845313573301736e-07, + "loss": 1.0509, + "step": 2183 + }, + { + "epoch": 8.842105263157894, + "grad_norm": 6.003677827425577, + "learning_rate": 4.05660506739875e-07, + "loss": 1.1568, + "step": 2184 + }, + { + "epoch": 8.846153846153847, + "grad_norm": 6.079105511128487, + "learning_rate": 4.0287705354446147e-07, + "loss": 0.9318, + "step": 2185 + }, + { + "epoch": 8.850202429149798, + "grad_norm": 7.515196540628712, + "learning_rate": 4.001027817058789e-07, + "loss": 1.1372, + "step": 2186 + }, + { + "epoch": 8.854251012145749, + "grad_norm": 7.448967050580141, + "learning_rate": 3.973376967648934e-07, + "loss": 1.1666, + "step": 2187 + }, + { + "epoch": 8.8582995951417, + "grad_norm": 5.169545843583342, + "learning_rate": 3.945818042439226e-07, + "loss": 1.126, + "step": 2188 + }, + { + "epoch": 8.862348178137651, + "grad_norm": 6.900120551493389, + "learning_rate": 3.9183510964702463e-07, + "loss": 1.1207, + "step": 2189 + }, + { + "epoch": 8.866396761133604, + "grad_norm": 6.665292349596718, + "learning_rate": 3.890976184598866e-07, + "loss": 1.0898, + "step": 2190 + }, + { + "epoch": 8.870445344129555, + "grad_norm": 8.446843694582483, + "learning_rate": 3.863693361498161e-07, + "loss": 1.4988, + "step": 2191 + }, + { + "epoch": 8.874493927125506, + "grad_norm": 6.954595155730788, + "learning_rate": 3.836502681657289e-07, + "loss": 1.4457, + "step": 2192 + }, + { + "epoch": 8.878542510121457, + "grad_norm": 7.3364795797424405, + "learning_rate": 3.809404199381378e-07, + "loss": 1.2321, + "step": 2193 + }, + { + "epoch": 8.882591093117409, + "grad_norm": 6.790035951291051, + "learning_rate": 3.7823979687914125e-07, + "loss": 1.1646, + "step": 2194 + }, + { + "epoch": 8.88663967611336, + "grad_norm": 6.034067768687113, + "learning_rate": 3.755484043824131e-07, + "loss": 0.8228, + "step": 2195 + }, + { + "epoch": 8.89068825910931, + "grad_norm": 5.919062923007496, + "learning_rate": 3.728662478231926e-07, + "loss": 1.3459, + "step": 2196 + }, + { + "epoch": 8.894736842105264, + "grad_norm": 5.697673544149653, + "learning_rate": 3.7019333255827404e-07, + "loss": 1.3481, + "step": 2197 + }, + { + "epoch": 8.898785425101215, + "grad_norm": 4.97146657305071, + "learning_rate": 3.675296639259912e-07, + "loss": 1.1434, + "step": 2198 + }, + { + "epoch": 8.902834008097166, + "grad_norm": 5.924967675956248, + "learning_rate": 3.6487524724621526e-07, + "loss": 1.1156, + "step": 2199 + }, + { + "epoch": 8.906882591093117, + "grad_norm": 6.321158341478768, + "learning_rate": 3.6223008782033773e-07, + "loss": 1.1401, + "step": 2200 + }, + { + "epoch": 8.910931174089068, + "grad_norm": 
5.228841709292987, + "learning_rate": 3.595941909312595e-07, + "loss": 0.9237, + "step": 2201 + }, + { + "epoch": 8.914979757085021, + "grad_norm": 5.870938886560629, + "learning_rate": 3.569675618433849e-07, + "loss": 0.9947, + "step": 2202 + }, + { + "epoch": 8.919028340080972, + "grad_norm": 6.134128784345386, + "learning_rate": 3.543502058026071e-07, + "loss": 0.9978, + "step": 2203 + }, + { + "epoch": 8.923076923076923, + "grad_norm": 4.8433258387597204, + "learning_rate": 3.517421280363004e-07, + "loss": 1.324, + "step": 2204 + }, + { + "epoch": 8.927125506072874, + "grad_norm": 6.975094443340041, + "learning_rate": 3.49143333753309e-07, + "loss": 1.1632, + "step": 2205 + }, + { + "epoch": 8.931174089068826, + "grad_norm": 6.7182527407351875, + "learning_rate": 3.4655382814393346e-07, + "loss": 1.1421, + "step": 2206 + }, + { + "epoch": 8.935222672064777, + "grad_norm": 6.849744331194626, + "learning_rate": 3.439736163799251e-07, + "loss": 0.9506, + "step": 2207 + }, + { + "epoch": 8.939271255060728, + "grad_norm": 6.491862758103337, + "learning_rate": 3.4140270361447405e-07, + "loss": 0.9707, + "step": 2208 + }, + { + "epoch": 8.94331983805668, + "grad_norm": 7.477773915394873, + "learning_rate": 3.388410949821969e-07, + "loss": 1.2587, + "step": 2209 + }, + { + "epoch": 8.947368421052632, + "grad_norm": 5.85145672480451, + "learning_rate": 3.362887955991301e-07, + "loss": 0.9956, + "step": 2210 + }, + { + "epoch": 8.951417004048583, + "grad_norm": 7.534940191759425, + "learning_rate": 3.337458105627145e-07, + "loss": 0.8958, + "step": 2211 + }, + { + "epoch": 8.955465587044534, + "grad_norm": 6.907958693442255, + "learning_rate": 3.3121214495179187e-07, + "loss": 1.2205, + "step": 2212 + }, + { + "epoch": 8.959514170040485, + "grad_norm": 6.615135062562839, + "learning_rate": 3.2868780382658895e-07, + "loss": 1.306, + "step": 2213 + }, + { + "epoch": 8.963562753036438, + "grad_norm": 5.859430885728825, + "learning_rate": 3.261727922287111e-07, + "loss": 1.237, + "step": 2214 + }, + { + "epoch": 8.96761133603239, + "grad_norm": 7.375138881839548, + "learning_rate": 3.236671151811305e-07, + "loss": 1.2228, + "step": 2215 + }, + { + "epoch": 8.97165991902834, + "grad_norm": 7.316291202886882, + "learning_rate": 3.2117077768817395e-07, + "loss": 1.1567, + "step": 2216 + }, + { + "epoch": 8.975708502024291, + "grad_norm": 5.4215558137774424, + "learning_rate": 3.1868378473551953e-07, + "loss": 1.2206, + "step": 2217 + }, + { + "epoch": 8.979757085020243, + "grad_norm": 6.8474837571580975, + "learning_rate": 3.16206141290179e-07, + "loss": 1.2698, + "step": 2218 + }, + { + "epoch": 8.983805668016194, + "grad_norm": 6.26046466508966, + "learning_rate": 3.1373785230049356e-07, + "loss": 1.3288, + "step": 2219 + }, + { + "epoch": 8.987854251012145, + "grad_norm": 6.114392604475578, + "learning_rate": 3.1127892269612103e-07, + "loss": 1.1248, + "step": 2220 + }, + { + "epoch": 8.991902834008098, + "grad_norm": 5.409545428136062, + "learning_rate": 3.0882935738802467e-07, + "loss": 1.1403, + "step": 2221 + }, + { + "epoch": 8.995951417004049, + "grad_norm": 6.090885634530899, + "learning_rate": 3.0638916126846885e-07, + "loss": 1.4643, + "step": 2222 + }, + { + "epoch": 9.0, + "grad_norm": 6.00173008385886, + "learning_rate": 3.039583392110046e-07, + "loss": 1.163, + "step": 2223 + }, + { + "epoch": 9.004048582995951, + "grad_norm": 5.810667276177312, + "learning_rate": 3.015368960704584e-07, + "loss": 1.192, + "step": 2224 + }, + { + "epoch": 9.008097165991902, + "grad_norm": 
8.416776164216913, + "learning_rate": 2.991248366829291e-07, + "loss": 1.3682, + "step": 2225 + }, + { + "epoch": 9.012145748987853, + "grad_norm": 7.726907247606117, + "learning_rate": 2.9672216586577317e-07, + "loss": 1.1359, + "step": 2226 + }, + { + "epoch": 9.016194331983806, + "grad_norm": 8.263363535299403, + "learning_rate": 2.9432888841759434e-07, + "loss": 1.3759, + "step": 2227 + }, + { + "epoch": 9.020242914979757, + "grad_norm": 7.120837123754598, + "learning_rate": 2.91945009118238e-07, + "loss": 1.5184, + "step": 2228 + }, + { + "epoch": 9.024291497975709, + "grad_norm": 7.716424876152555, + "learning_rate": 2.8957053272877957e-07, + "loss": 1.4498, + "step": 2229 + }, + { + "epoch": 9.02834008097166, + "grad_norm": 6.703211372735481, + "learning_rate": 2.8720546399151395e-07, + "loss": 1.1665, + "step": 2230 + }, + { + "epoch": 9.03238866396761, + "grad_norm": 8.03492140351389, + "learning_rate": 2.848498076299483e-07, + "loss": 1.1768, + "step": 2231 + }, + { + "epoch": 9.036437246963562, + "grad_norm": 7.280365321268138, + "learning_rate": 2.8250356834878924e-07, + "loss": 1.3754, + "step": 2232 + }, + { + "epoch": 9.040485829959515, + "grad_norm": 7.806791539178109, + "learning_rate": 2.801667508339384e-07, + "loss": 1.1804, + "step": 2233 + }, + { + "epoch": 9.044534412955466, + "grad_norm": 7.769785235769093, + "learning_rate": 2.7783935975247867e-07, + "loss": 1.3509, + "step": 2234 + }, + { + "epoch": 9.048582995951417, + "grad_norm": 7.356545707849527, + "learning_rate": 2.7552139975266677e-07, + "loss": 1.1764, + "step": 2235 + }, + { + "epoch": 9.052631578947368, + "grad_norm": 8.282177365997146, + "learning_rate": 2.732128754639246e-07, + "loss": 1.2262, + "step": 2236 + }, + { + "epoch": 9.05668016194332, + "grad_norm": 7.26327016277894, + "learning_rate": 2.7091379149682683e-07, + "loss": 1.0832, + "step": 2237 + }, + { + "epoch": 9.06072874493927, + "grad_norm": 7.62051448590103, + "learning_rate": 2.68624152443096e-07, + "loss": 1.2281, + "step": 2238 + }, + { + "epoch": 9.064777327935223, + "grad_norm": 8.277202550605427, + "learning_rate": 2.6634396287559094e-07, + "loss": 1.1544, + "step": 2239 + }, + { + "epoch": 9.068825910931174, + "grad_norm": 8.275018851850179, + "learning_rate": 2.6407322734829763e-07, + "loss": 1.3331, + "step": 2240 + }, + { + "epoch": 9.072874493927126, + "grad_norm": 7.051944979723943, + "learning_rate": 2.6181195039632123e-07, + "loss": 1.2182, + "step": 2241 + }, + { + "epoch": 9.076923076923077, + "grad_norm": 8.156861245995605, + "learning_rate": 2.5956013653587465e-07, + "loss": 1.1883, + "step": 2242 + }, + { + "epoch": 9.080971659919028, + "grad_norm": 6.8935448308674925, + "learning_rate": 2.573177902642726e-07, + "loss": 1.3277, + "step": 2243 + }, + { + "epoch": 9.085020242914979, + "grad_norm": 6.261215775969084, + "learning_rate": 2.5508491605992003e-07, + "loss": 1.2689, + "step": 2244 + }, + { + "epoch": 9.089068825910932, + "grad_norm": 7.558850743447256, + "learning_rate": 2.528615183823058e-07, + "loss": 1.2173, + "step": 2245 + }, + { + "epoch": 9.093117408906883, + "grad_norm": 7.25014149962062, + "learning_rate": 2.506476016719922e-07, + "loss": 1.3017, + "step": 2246 + }, + { + "epoch": 9.097165991902834, + "grad_norm": 6.612210773327197, + "learning_rate": 2.4844317035060407e-07, + "loss": 1.0426, + "step": 2247 + }, + { + "epoch": 9.101214574898785, + "grad_norm": 7.546815676413145, + "learning_rate": 2.462482288208234e-07, + "loss": 1.2441, + "step": 2248 + }, + { + "epoch": 9.105263157894736, + 
"grad_norm": 7.807941903755463, + "learning_rate": 2.440627814663804e-07, + "loss": 1.1408, + "step": 2249 + }, + { + "epoch": 9.109311740890687, + "grad_norm": 7.10859572976939, + "learning_rate": 2.4188683265204125e-07, + "loss": 1.1815, + "step": 2250 + }, + { + "epoch": 9.11336032388664, + "grad_norm": 8.441440064069806, + "learning_rate": 2.397203867236031e-07, + "loss": 1.1018, + "step": 2251 + }, + { + "epoch": 9.117408906882591, + "grad_norm": 8.011527442183498, + "learning_rate": 2.3756344800788421e-07, + "loss": 1.0407, + "step": 2252 + }, + { + "epoch": 9.121457489878543, + "grad_norm": 7.2749461666868465, + "learning_rate": 2.354160208127143e-07, + "loss": 1.1121, + "step": 2253 + }, + { + "epoch": 9.125506072874494, + "grad_norm": 7.0804097957451235, + "learning_rate": 2.3327810942692653e-07, + "loss": 1.1386, + "step": 2254 + }, + { + "epoch": 9.129554655870445, + "grad_norm": 8.144395859046373, + "learning_rate": 2.3114971812034981e-07, + "loss": 0.9948, + "step": 2255 + }, + { + "epoch": 9.133603238866396, + "grad_norm": 6.168572056202311, + "learning_rate": 2.290308511437994e-07, + "loss": 1.3591, + "step": 2256 + }, + { + "epoch": 9.137651821862349, + "grad_norm": 7.533575763657583, + "learning_rate": 2.2692151272906916e-07, + "loss": 1.1552, + "step": 2257 + }, + { + "epoch": 9.1417004048583, + "grad_norm": 6.008485334299444, + "learning_rate": 2.2482170708892083e-07, + "loss": 0.9343, + "step": 2258 + }, + { + "epoch": 9.145748987854251, + "grad_norm": 7.964276904785252, + "learning_rate": 2.2273143841707922e-07, + "loss": 1.2456, + "step": 2259 + }, + { + "epoch": 9.149797570850202, + "grad_norm": 6.9266949606644035, + "learning_rate": 2.2065071088822055e-07, + "loss": 1.0812, + "step": 2260 + }, + { + "epoch": 9.153846153846153, + "grad_norm": 6.230772654822159, + "learning_rate": 2.1857952865796616e-07, + "loss": 1.3311, + "step": 2261 + }, + { + "epoch": 9.157894736842104, + "grad_norm": 7.819811201692324, + "learning_rate": 2.1651789586287442e-07, + "loss": 1.3046, + "step": 2262 + }, + { + "epoch": 9.161943319838057, + "grad_norm": 5.846834886948424, + "learning_rate": 2.1446581662042943e-07, + "loss": 1.1448, + "step": 2263 + }, + { + "epoch": 9.165991902834008, + "grad_norm": 6.59622872926416, + "learning_rate": 2.124232950290367e-07, + "loss": 1.2402, + "step": 2264 + }, + { + "epoch": 9.17004048582996, + "grad_norm": 5.697102047320491, + "learning_rate": 2.1039033516801255e-07, + "loss": 1.8377, + "step": 2265 + }, + { + "epoch": 9.17408906882591, + "grad_norm": 6.9465123167698755, + "learning_rate": 2.0836694109757748e-07, + "loss": 1.0402, + "step": 2266 + }, + { + "epoch": 9.178137651821862, + "grad_norm": 8.09589282023309, + "learning_rate": 2.0635311685884675e-07, + "loss": 1.2518, + "step": 2267 + }, + { + "epoch": 9.182186234817813, + "grad_norm": 5.634877933410695, + "learning_rate": 2.0434886647382135e-07, + "loss": 1.0571, + "step": 2268 + }, + { + "epoch": 9.186234817813766, + "grad_norm": 7.37849880722757, + "learning_rate": 2.0235419394538324e-07, + "loss": 1.2413, + "step": 2269 + }, + { + "epoch": 9.190283400809717, + "grad_norm": 6.864858377563145, + "learning_rate": 2.0036910325728521e-07, + "loss": 1.2594, + "step": 2270 + }, + { + "epoch": 9.194331983805668, + "grad_norm": 7.401494019521915, + "learning_rate": 1.9839359837414308e-07, + "loss": 1.4279, + "step": 2271 + }, + { + "epoch": 9.19838056680162, + "grad_norm": 5.926231403832184, + "learning_rate": 1.9642768324142803e-07, + "loss": 1.493, + "step": 2272 + }, + { + "epoch": 
9.20242914979757, + "grad_norm": 6.068250986585262, + "learning_rate": 1.9447136178545766e-07, + "loss": 1.1961, + "step": 2273 + }, + { + "epoch": 9.206477732793521, + "grad_norm": 5.98199665934451, + "learning_rate": 1.9252463791339048e-07, + "loss": 1.5216, + "step": 2274 + }, + { + "epoch": 9.210526315789474, + "grad_norm": 6.111430931522919, + "learning_rate": 1.9058751551321642e-07, + "loss": 1.5102, + "step": 2275 + }, + { + "epoch": 9.214574898785425, + "grad_norm": 6.500698071562023, + "learning_rate": 1.8865999845374794e-07, + "loss": 1.2951, + "step": 2276 + }, + { + "epoch": 9.218623481781377, + "grad_norm": 6.682267491506086, + "learning_rate": 1.8674209058461624e-07, + "loss": 1.1742, + "step": 2277 + }, + { + "epoch": 9.222672064777328, + "grad_norm": 6.627947152950901, + "learning_rate": 1.8483379573625948e-07, + "loss": 1.1273, + "step": 2278 + }, + { + "epoch": 9.226720647773279, + "grad_norm": 6.274948615311735, + "learning_rate": 1.8293511771991624e-07, + "loss": 1.3395, + "step": 2279 + }, + { + "epoch": 9.23076923076923, + "grad_norm": 6.379156536924719, + "learning_rate": 1.8104606032761985e-07, + "loss": 1.1154, + "step": 2280 + }, + { + "epoch": 9.234817813765183, + "grad_norm": 7.295750767606294, + "learning_rate": 1.7916662733218848e-07, + "loss": 1.3337, + "step": 2281 + }, + { + "epoch": 9.238866396761134, + "grad_norm": 6.984077825794193, + "learning_rate": 1.7729682248721848e-07, + "loss": 1.2648, + "step": 2282 + }, + { + "epoch": 9.242914979757085, + "grad_norm": 6.47007551460488, + "learning_rate": 1.7543664952707817e-07, + "loss": 1.3006, + "step": 2283 + }, + { + "epoch": 9.246963562753036, + "grad_norm": 5.809021360652588, + "learning_rate": 1.7358611216689692e-07, + "loss": 1.1748, + "step": 2284 + }, + { + "epoch": 9.251012145748987, + "grad_norm": 6.188595429945637, + "learning_rate": 1.7174521410256162e-07, + "loss": 1.2565, + "step": 2285 + }, + { + "epoch": 9.255060728744938, + "grad_norm": 6.458346476599245, + "learning_rate": 1.6991395901070685e-07, + "loss": 1.196, + "step": 2286 + }, + { + "epoch": 9.259109311740891, + "grad_norm": 5.698951444763946, + "learning_rate": 1.6809235054870865e-07, + "loss": 1.1668, + "step": 2287 + }, + { + "epoch": 9.263157894736842, + "grad_norm": 6.160438664948591, + "learning_rate": 1.6628039235467686e-07, + "loss": 0.9512, + "step": 2288 + }, + { + "epoch": 9.267206477732794, + "grad_norm": 5.767381827035687, + "learning_rate": 1.6447808804744668e-07, + "loss": 1.251, + "step": 2289 + }, + { + "epoch": 9.271255060728745, + "grad_norm": 6.25287181158909, + "learning_rate": 1.6268544122657437e-07, + "loss": 1.2493, + "step": 2290 + }, + { + "epoch": 9.275303643724696, + "grad_norm": 5.97072781413541, + "learning_rate": 1.6090245547232707e-07, + "loss": 1.3238, + "step": 2291 + }, + { + "epoch": 9.279352226720647, + "grad_norm": 7.620403877713194, + "learning_rate": 1.5912913434567746e-07, + "loss": 1.1986, + "step": 2292 + }, + { + "epoch": 9.2834008097166, + "grad_norm": 6.770502426251539, + "learning_rate": 1.5736548138829632e-07, + "loss": 0.9094, + "step": 2293 + }, + { + "epoch": 9.287449392712551, + "grad_norm": 7.538773236194581, + "learning_rate": 1.5561150012254446e-07, + "loss": 1.0554, + "step": 2294 + }, + { + "epoch": 9.291497975708502, + "grad_norm": 6.916240771142258, + "learning_rate": 1.5386719405146633e-07, + "loss": 0.9282, + "step": 2295 + }, + { + "epoch": 9.295546558704453, + "grad_norm": 7.325465270488264, + "learning_rate": 1.5213256665878362e-07, + "loss": 1.1336, + "step": 2296 + }, + { 
+ "epoch": 9.299595141700404, + "grad_norm": 6.427562714147759, + "learning_rate": 1.5040762140888843e-07, + "loss": 1.3822, + "step": 2297 + }, + { + "epoch": 9.303643724696355, + "grad_norm": 7.247038384844516, + "learning_rate": 1.4869236174683443e-07, + "loss": 0.9789, + "step": 2298 + }, + { + "epoch": 9.307692307692308, + "grad_norm": 6.708599062077499, + "learning_rate": 1.4698679109833192e-07, + "loss": 1.034, + "step": 2299 + }, + { + "epoch": 9.31174089068826, + "grad_norm": 6.139085096565993, + "learning_rate": 1.4529091286973994e-07, + "loss": 1.0888, + "step": 2300 + }, + { + "epoch": 9.31578947368421, + "grad_norm": 8.100679017208485, + "learning_rate": 1.4360473044806033e-07, + "loss": 0.9574, + "step": 2301 + }, + { + "epoch": 9.319838056680162, + "grad_norm": 6.847505127280567, + "learning_rate": 1.419282472009309e-07, + "loss": 0.9404, + "step": 2302 + }, + { + "epoch": 9.323886639676113, + "grad_norm": 6.82409548432895, + "learning_rate": 1.402614664766172e-07, + "loss": 0.9955, + "step": 2303 + }, + { + "epoch": 9.327935222672064, + "grad_norm": 6.583985934044893, + "learning_rate": 1.3860439160400808e-07, + "loss": 1.0107, + "step": 2304 + }, + { + "epoch": 9.331983805668017, + "grad_norm": 6.894705234751225, + "learning_rate": 1.369570258926062e-07, + "loss": 1.0625, + "step": 2305 + }, + { + "epoch": 9.336032388663968, + "grad_norm": 6.246760039026441, + "learning_rate": 1.353193726325247e-07, + "loss": 1.1625, + "step": 2306 + }, + { + "epoch": 9.34008097165992, + "grad_norm": 6.209505844080315, + "learning_rate": 1.3369143509447903e-07, + "loss": 1.0921, + "step": 2307 + }, + { + "epoch": 9.34412955465587, + "grad_norm": 6.524439333514229, + "learning_rate": 1.3207321652977944e-07, + "loss": 0.9869, + "step": 2308 + }, + { + "epoch": 9.348178137651821, + "grad_norm": 7.626966177840634, + "learning_rate": 1.3046472017032685e-07, + "loss": 1.1164, + "step": 2309 + }, + { + "epoch": 9.352226720647772, + "grad_norm": 6.971248132841646, + "learning_rate": 1.288659492286032e-07, + "loss": 1.1228, + "step": 2310 + }, + { + "epoch": 9.356275303643725, + "grad_norm": 6.400589915934499, + "learning_rate": 1.2727690689766814e-07, + "loss": 0.8228, + "step": 2311 + }, + { + "epoch": 9.360323886639677, + "grad_norm": 7.5084173636270295, + "learning_rate": 1.2569759635115086e-07, + "loss": 1.2048, + "step": 2312 + }, + { + "epoch": 9.364372469635628, + "grad_norm": 6.0650715287124894, + "learning_rate": 1.2412802074324548e-07, + "loss": 1.2833, + "step": 2313 + }, + { + "epoch": 9.368421052631579, + "grad_norm": 6.326873844433714, + "learning_rate": 1.2256818320870224e-07, + "loss": 1.1706, + "step": 2314 + }, + { + "epoch": 9.37246963562753, + "grad_norm": 7.9973063918471805, + "learning_rate": 1.210180868628219e-07, + "loss": 1.2185, + "step": 2315 + }, + { + "epoch": 9.376518218623481, + "grad_norm": 6.867236440906254, + "learning_rate": 1.1947773480145198e-07, + "loss": 0.9325, + "step": 2316 + }, + { + "epoch": 9.380566801619434, + "grad_norm": 7.266878939706752, + "learning_rate": 1.179471301009777e-07, + "loss": 1.0759, + "step": 2317 + }, + { + "epoch": 9.384615384615385, + "grad_norm": 6.264264519092, + "learning_rate": 1.1642627581831767e-07, + "loss": 0.8937, + "step": 2318 + }, + { + "epoch": 9.388663967611336, + "grad_norm": 7.548189919698031, + "learning_rate": 1.1491517499091498e-07, + "loss": 0.9544, + "step": 2319 + }, + { + "epoch": 9.392712550607287, + "grad_norm": 6.995200625215076, + "learning_rate": 1.134138306367355e-07, + "loss": 1.1477, + "step": 2320 + 
}, + { + "epoch": 9.396761133603238, + "grad_norm": 8.233516260798435, + "learning_rate": 1.1192224575425848e-07, + "loss": 0.9354, + "step": 2321 + }, + { + "epoch": 9.40080971659919, + "grad_norm": 6.111526040757406, + "learning_rate": 1.1044042332247152e-07, + "loss": 0.9898, + "step": 2322 + }, + { + "epoch": 9.404858299595142, + "grad_norm": 6.132400329765274, + "learning_rate": 1.089683663008656e-07, + "loss": 1.0196, + "step": 2323 + }, + { + "epoch": 9.408906882591094, + "grad_norm": 7.514103964897205, + "learning_rate": 1.0750607762942622e-07, + "loss": 1.0973, + "step": 2324 + }, + { + "epoch": 9.412955465587045, + "grad_norm": 6.336921580711171, + "learning_rate": 1.0605356022863167e-07, + "loss": 0.9423, + "step": 2325 + }, + { + "epoch": 9.417004048582996, + "grad_norm": 7.452160362644236, + "learning_rate": 1.0461081699944475e-07, + "loss": 1.2153, + "step": 2326 + }, + { + "epoch": 9.421052631578947, + "grad_norm": 7.267471861431043, + "learning_rate": 1.0317785082330555e-07, + "loss": 1.2392, + "step": 2327 + }, + { + "epoch": 9.425101214574898, + "grad_norm": 6.361292365622441, + "learning_rate": 1.0175466456213034e-07, + "loss": 1.2855, + "step": 2328 + }, + { + "epoch": 9.429149797570851, + "grad_norm": 7.344161066683579, + "learning_rate": 1.0034126105830099e-07, + "loss": 1.2406, + "step": 2329 + }, + { + "epoch": 9.433198380566802, + "grad_norm": 5.605300392098185, + "learning_rate": 9.89376431346606e-08, + "loss": 1.4292, + "step": 2330 + }, + { + "epoch": 9.437246963562753, + "grad_norm": 6.679019603222645, + "learning_rate": 9.75438135945106e-08, + "loss": 1.2237, + "step": 2331 + }, + { + "epoch": 9.441295546558704, + "grad_norm": 7.5278062818077265, + "learning_rate": 9.615977522160147e-08, + "loss": 1.1524, + "step": 2332 + }, + { + "epoch": 9.445344129554655, + "grad_norm": 5.917194701522402, + "learning_rate": 9.478553078013042e-08, + "loss": 1.4128, + "step": 2333 + }, + { + "epoch": 9.449392712550607, + "grad_norm": 7.670080650819539, + "learning_rate": 9.342108301473308e-08, + "loss": 1.2044, + "step": 2334 + }, + { + "epoch": 9.45344129554656, + "grad_norm": 7.247183570995367, + "learning_rate": 9.206643465047904e-08, + "loss": 1.1982, + "step": 2335 + }, + { + "epoch": 9.45748987854251, + "grad_norm": 7.097006981905352, + "learning_rate": 9.072158839286748e-08, + "loss": 1.5372, + "step": 2336 + }, + { + "epoch": 9.461538461538462, + "grad_norm": 9.659107122936096, + "learning_rate": 8.938654692781989e-08, + "loss": 1.6061, + "step": 2337 + }, + { + "epoch": 9.465587044534413, + "grad_norm": 7.6683985186556916, + "learning_rate": 8.80613129216762e-08, + "loss": 1.3296, + "step": 2338 + }, + { + "epoch": 9.469635627530364, + "grad_norm": 6.895670007789144, + "learning_rate": 8.674588902118919e-08, + "loss": 1.3934, + "step": 2339 + }, + { + "epoch": 9.473684210526315, + "grad_norm": 6.372186547142852, + "learning_rate": 8.544027785351794e-08, + "loss": 1.3763, + "step": 2340 + }, + { + "epoch": 9.477732793522268, + "grad_norm": 8.632382677132219, + "learning_rate": 8.414448202622494e-08, + "loss": 1.3484, + "step": 2341 + }, + { + "epoch": 9.481781376518219, + "grad_norm": 6.30259565246508, + "learning_rate": 8.285850412726837e-08, + "loss": 1.2994, + "step": 2342 + }, + { + "epoch": 9.48582995951417, + "grad_norm": 7.059555129137264, + "learning_rate": 8.15823467249982e-08, + "loss": 1.2402, + "step": 2343 + }, + { + "epoch": 9.489878542510121, + "grad_norm": 9.014694685087083, + "learning_rate": 8.031601236815234e-08, + "loss": 1.2015, + "step": 2344 + 
}, + { + "epoch": 9.493927125506072, + "grad_norm": 9.570550816859026, + "learning_rate": 7.905950358584768e-08, + "loss": 1.1243, + "step": 2345 + }, + { + "epoch": 9.497975708502024, + "grad_norm": 6.594135266471429, + "learning_rate": 7.781282288757963e-08, + "loss": 1.0819, + "step": 2346 + }, + { + "epoch": 9.502024291497976, + "grad_norm": 7.353743236682782, + "learning_rate": 7.657597276321427e-08, + "loss": 0.896, + "step": 2347 + }, + { + "epoch": 9.506072874493928, + "grad_norm": 8.664718298165978, + "learning_rate": 7.534895568298395e-08, + "loss": 1.2481, + "step": 2348 + }, + { + "epoch": 9.510121457489879, + "grad_norm": 7.984423053875158, + "learning_rate": 7.413177409748284e-08, + "loss": 1.1753, + "step": 2349 + }, + { + "epoch": 9.51417004048583, + "grad_norm": 6.100693202631448, + "learning_rate": 7.292443043766085e-08, + "loss": 1.1947, + "step": 2350 + }, + { + "epoch": 9.518218623481781, + "grad_norm": 7.891833225357146, + "learning_rate": 7.172692711482022e-08, + "loss": 1.2041, + "step": 2351 + }, + { + "epoch": 9.522267206477732, + "grad_norm": 8.957858980675372, + "learning_rate": 7.053926652061116e-08, + "loss": 1.1271, + "step": 2352 + }, + { + "epoch": 9.526315789473685, + "grad_norm": 8.198107387088085, + "learning_rate": 6.936145102702407e-08, + "loss": 0.9474, + "step": 2353 + }, + { + "epoch": 9.530364372469636, + "grad_norm": 7.311121090177834, + "learning_rate": 6.819348298638839e-08, + "loss": 1.0489, + "step": 2354 + }, + { + "epoch": 9.534412955465587, + "grad_norm": 8.543656183569171, + "learning_rate": 6.703536473136486e-08, + "loss": 1.0637, + "step": 2355 + }, + { + "epoch": 9.538461538461538, + "grad_norm": 8.23797152083669, + "learning_rate": 6.588709857494324e-08, + "loss": 1.3686, + "step": 2356 + }, + { + "epoch": 9.54251012145749, + "grad_norm": 7.299808933974355, + "learning_rate": 6.474868681043578e-08, + "loss": 1.3526, + "step": 2357 + }, + { + "epoch": 9.54655870445344, + "grad_norm": 18.439147602111802, + "learning_rate": 6.36201317114754e-08, + "loss": 1.9396, + "step": 2358 + }, + { + "epoch": 9.550607287449393, + "grad_norm": 12.741322699860357, + "learning_rate": 6.250143553200694e-08, + "loss": 1.622, + "step": 2359 + }, + { + "epoch": 9.554655870445345, + "grad_norm": 13.74018979539314, + "learning_rate": 6.13926005062876e-08, + "loss": 2.0152, + "step": 2360 + }, + { + "epoch": 9.558704453441296, + "grad_norm": 8.35774310624565, + "learning_rate": 6.029362884887757e-08, + "loss": 1.1873, + "step": 2361 + }, + { + "epoch": 9.562753036437247, + "grad_norm": 7.0374249328430425, + "learning_rate": 5.920452275463895e-08, + "loss": 1.0601, + "step": 2362 + }, + { + "epoch": 9.566801619433198, + "grad_norm": 7.424773261082573, + "learning_rate": 5.8125284398730666e-08, + "loss": 1.4362, + "step": 2363 + }, + { + "epoch": 9.570850202429149, + "grad_norm": 7.8574653437082995, + "learning_rate": 5.705591593660353e-08, + "loss": 1.1286, + "step": 2364 + }, + { + "epoch": 9.574898785425102, + "grad_norm": 7.670640887580393, + "learning_rate": 5.5996419503996924e-08, + "loss": 1.2739, + "step": 2365 + }, + { + "epoch": 9.578947368421053, + "grad_norm": 6.048412628961693, + "learning_rate": 5.4946797216931524e-08, + "loss": 1.1107, + "step": 2366 + }, + { + "epoch": 9.582995951417004, + "grad_norm": 7.759659089797845, + "learning_rate": 5.390705117171047e-08, + "loss": 1.1965, + "step": 2367 + }, + { + "epoch": 9.587044534412955, + "grad_norm": 8.079375188718355, + "learning_rate": 5.2877183444909885e-08, + "loss": 1.2208, + "step": 2368 + }, 
+ { + "epoch": 9.591093117408906, + "grad_norm": 8.586695205665634, + "learning_rate": 5.185719609337836e-08, + "loss": 1.2005, + "step": 2369 + }, + { + "epoch": 9.595141700404858, + "grad_norm": 7.467079614176582, + "learning_rate": 5.084709115423081e-08, + "loss": 0.9196, + "step": 2370 + }, + { + "epoch": 9.59919028340081, + "grad_norm": 6.378703085070227, + "learning_rate": 4.9846870644844616e-08, + "loss": 1.0393, + "step": 2371 + }, + { + "epoch": 9.603238866396762, + "grad_norm": 6.619001766419402, + "learning_rate": 4.885653656285627e-08, + "loss": 1.1261, + "step": 2372 + }, + { + "epoch": 9.607287449392713, + "grad_norm": 7.375768177311922, + "learning_rate": 4.7876090886158074e-08, + "loss": 0.8888, + "step": 2373 + }, + { + "epoch": 9.611336032388664, + "grad_norm": 7.765928006199822, + "learning_rate": 4.6905535572892015e-08, + "loss": 1.1768, + "step": 2374 + }, + { + "epoch": 9.615384615384615, + "grad_norm": 7.136769572282673, + "learning_rate": 4.5944872561448084e-08, + "loss": 1.2218, + "step": 2375 + }, + { + "epoch": 9.619433198380566, + "grad_norm": 8.252293420134516, + "learning_rate": 4.499410377045765e-08, + "loss": 0.9572, + "step": 2376 + }, + { + "epoch": 9.623481781376519, + "grad_norm": 9.183057155873392, + "learning_rate": 4.4053231098794e-08, + "loss": 1.3123, + "step": 2377 + }, + { + "epoch": 9.62753036437247, + "grad_norm": 7.870175412604239, + "learning_rate": 4.3122256425563444e-08, + "loss": 1.1711, + "step": 2378 + }, + { + "epoch": 9.631578947368421, + "grad_norm": 7.805673333471194, + "learning_rate": 4.220118161010589e-08, + "loss": 1.4233, + "step": 2379 + }, + { + "epoch": 9.635627530364372, + "grad_norm": 6.513022878132265, + "learning_rate": 4.129000849198872e-08, + "loss": 1.0134, + "step": 2380 + }, + { + "epoch": 9.639676113360323, + "grad_norm": 7.767006697052738, + "learning_rate": 4.038873889100237e-08, + "loss": 1.3095, + "step": 2381 + }, + { + "epoch": 9.643724696356275, + "grad_norm": 6.987879134939129, + "learning_rate": 3.94973746071603e-08, + "loss": 1.196, + "step": 2382 + }, + { + "epoch": 9.647773279352228, + "grad_norm": 7.371753491139924, + "learning_rate": 3.861591742069071e-08, + "loss": 1.2979, + "step": 2383 + }, + { + "epoch": 9.651821862348179, + "grad_norm": 6.3171055856339065, + "learning_rate": 3.77443690920376e-08, + "loss": 1.09, + "step": 2384 + }, + { + "epoch": 9.65587044534413, + "grad_norm": 7.731889028234551, + "learning_rate": 3.688273136185416e-08, + "loss": 1.2847, + "step": 2385 + }, + { + "epoch": 9.65991902834008, + "grad_norm": 7.905740698031348, + "learning_rate": 3.60310059509994e-08, + "loss": 1.0873, + "step": 2386 + }, + { + "epoch": 9.663967611336032, + "grad_norm": 6.39452809376175, + "learning_rate": 3.518919456053649e-08, + "loss": 1.2574, + "step": 2387 + }, + { + "epoch": 9.668016194331983, + "grad_norm": 6.930382161581412, + "learning_rate": 3.4357298871727786e-08, + "loss": 1.2518, + "step": 2388 + }, + { + "epoch": 9.672064777327936, + "grad_norm": 7.172023867992301, + "learning_rate": 3.353532054603203e-08, + "loss": 1.4336, + "step": 2389 + }, + { + "epoch": 9.676113360323887, + "grad_norm": 6.17036083047538, + "learning_rate": 3.2723261225102164e-08, + "loss": 1.4305, + "step": 2390 + }, + { + "epoch": 9.680161943319838, + "grad_norm": 7.2582494975076814, + "learning_rate": 3.192112253077973e-08, + "loss": 1.2413, + "step": 2391 + }, + { + "epoch": 9.68421052631579, + "grad_norm": 7.1326480428223125, + "learning_rate": 3.1128906065092666e-08, + "loss": 1.1222, + "step": 2392 + }, + { + 
"epoch": 9.68825910931174, + "grad_norm": 6.710146846106117, + "learning_rate": 3.034661341025258e-08, + "loss": 1.2007, + "step": 2393 + }, + { + "epoch": 9.692307692307692, + "grad_norm": 7.167152573789711, + "learning_rate": 2.957424612865245e-08, + "loss": 1.2133, + "step": 2394 + }, + { + "epoch": 9.696356275303645, + "grad_norm": 6.4961462238366074, + "learning_rate": 2.8811805762860578e-08, + "loss": 1.1056, + "step": 2395 + }, + { + "epoch": 9.700404858299596, + "grad_norm": 8.566659166804367, + "learning_rate": 2.8059293835620006e-08, + "loss": 1.5372, + "step": 2396 + }, + { + "epoch": 9.704453441295547, + "grad_norm": 7.003050839114324, + "learning_rate": 2.731671184984519e-08, + "loss": 1.3512, + "step": 2397 + }, + { + "epoch": 9.708502024291498, + "grad_norm": 7.022553930422816, + "learning_rate": 2.6584061288617568e-08, + "loss": 0.9806, + "step": 2398 + }, + { + "epoch": 9.712550607287449, + "grad_norm": 6.5247192423891365, + "learning_rate": 2.5861343615184997e-08, + "loss": 1.1002, + "step": 2399 + }, + { + "epoch": 9.7165991902834, + "grad_norm": 6.583205928052207, + "learning_rate": 2.514856027295509e-08, + "loss": 1.2214, + "step": 2400 + }, + { + "epoch": 9.720647773279353, + "grad_norm": 8.023483107929737, + "learning_rate": 2.4445712685498e-08, + "loss": 0.9652, + "step": 2401 + }, + { + "epoch": 9.724696356275304, + "grad_norm": 7.31774753878339, + "learning_rate": 2.3752802256536423e-08, + "loss": 1.2102, + "step": 2402 + }, + { + "epoch": 9.728744939271255, + "grad_norm": 6.611110640043861, + "learning_rate": 2.3069830369949474e-08, + "loss": 1.3616, + "step": 2403 + }, + { + "epoch": 9.732793522267206, + "grad_norm": 6.752098669828748, + "learning_rate": 2.239679838976605e-08, + "loss": 1.051, + "step": 2404 + }, + { + "epoch": 9.736842105263158, + "grad_norm": 6.8895429841274884, + "learning_rate": 2.173370766016314e-08, + "loss": 1.1644, + "step": 2405 + }, + { + "epoch": 9.740890688259109, + "grad_norm": 12.580788775503475, + "learning_rate": 2.1080559505462504e-08, + "loss": 1.7934, + "step": 2406 + }, + { + "epoch": 9.744939271255062, + "grad_norm": 6.238998597279842, + "learning_rate": 2.043735523013013e-08, + "loss": 1.3521, + "step": 2407 + }, + { + "epoch": 9.748987854251013, + "grad_norm": 7.960224187441654, + "learning_rate": 1.98040961187701e-08, + "loss": 1.1426, + "step": 2408 + }, + { + "epoch": 9.753036437246964, + "grad_norm": 7.433530949559287, + "learning_rate": 1.918078343612628e-08, + "loss": 1.0778, + "step": 2409 + }, + { + "epoch": 9.757085020242915, + "grad_norm": 7.048067805939395, + "learning_rate": 1.85674184270751e-08, + "loss": 1.1728, + "step": 2410 + }, + { + "epoch": 9.761133603238866, + "grad_norm": 7.803352024996819, + "learning_rate": 1.7964002316628316e-08, + "loss": 1.9261, + "step": 2411 + }, + { + "epoch": 9.765182186234817, + "grad_norm": 8.772283334117857, + "learning_rate": 1.73705363099258e-08, + "loss": 1.6529, + "step": 2412 + }, + { + "epoch": 9.76923076923077, + "grad_norm": 8.84391299820265, + "learning_rate": 1.6787021592234998e-08, + "loss": 1.5923, + "step": 2413 + }, + { + "epoch": 9.773279352226721, + "grad_norm": 6.27175422673062, + "learning_rate": 1.6213459328950355e-08, + "loss": 1.1256, + "step": 2414 + }, + { + "epoch": 9.777327935222672, + "grad_norm": 6.32330852422478, + "learning_rate": 1.5649850665587217e-08, + "loss": 1.4239, + "step": 2415 + }, + { + "epoch": 9.781376518218623, + "grad_norm": 5.832326997964023, + "learning_rate": 1.5096196727783508e-08, + "loss": 1.3997, + "step": 2416 + }, + { + 
"epoch": 9.785425101214575, + "grad_norm": 6.464815391686391, + "learning_rate": 1.4552498621295264e-08, + "loss": 1.1972, + "step": 2417 + }, + { + "epoch": 9.789473684210526, + "grad_norm": 6.243813602787722, + "learning_rate": 1.4018757431992769e-08, + "loss": 1.1337, + "step": 2418 + }, + { + "epoch": 9.793522267206479, + "grad_norm": 7.081278097894398, + "learning_rate": 1.3494974225863322e-08, + "loss": 1.1298, + "step": 2419 + }, + { + "epoch": 9.79757085020243, + "grad_norm": 6.774603048357779, + "learning_rate": 1.2981150049004021e-08, + "loss": 1.1853, + "step": 2420 + }, + { + "epoch": 9.80161943319838, + "grad_norm": 6.425581343136603, + "learning_rate": 1.2477285927622873e-08, + "loss": 1.0838, + "step": 2421 + }, + { + "epoch": 9.805668016194332, + "grad_norm": 6.08994981446175, + "learning_rate": 1.1983382868036019e-08, + "loss": 0.9978, + "step": 2422 + }, + { + "epoch": 9.809716599190283, + "grad_norm": 6.5758131521979095, + "learning_rate": 1.1499441856663296e-08, + "loss": 1.3163, + "step": 2423 + }, + { + "epoch": 9.813765182186234, + "grad_norm": 8.400761856632263, + "learning_rate": 1.102546386003156e-08, + "loss": 1.0923, + "step": 2424 + }, + { + "epoch": 9.817813765182187, + "grad_norm": 6.878983386606489, + "learning_rate": 1.0561449824766367e-08, + "loss": 1.0377, + "step": 2425 + }, + { + "epoch": 9.821862348178138, + "grad_norm": 15.301731593931011, + "learning_rate": 1.0107400677596413e-08, + "loss": 2.149, + "step": 2426 + }, + { + "epoch": 9.82591093117409, + "grad_norm": 25.109744644655137, + "learning_rate": 9.663317325345756e-09, + "loss": 3.0471, + "step": 2427 + }, + { + "epoch": 9.82995951417004, + "grad_norm": 7.260711360623296, + "learning_rate": 9.229200654936599e-09, + "loss": 1.0529, + "step": 2428 + }, + { + "epoch": 9.834008097165992, + "grad_norm": 7.562582026280667, + "learning_rate": 8.805051533384846e-09, + "loss": 1.0334, + "step": 2429 + }, + { + "epoch": 9.838056680161943, + "grad_norm": 6.362275189041613, + "learning_rate": 8.390870807799545e-09, + "loss": 1.0283, + "step": 2430 + }, + { + "epoch": 9.842105263157894, + "grad_norm": 6.219234617919187, + "learning_rate": 7.986659305380672e-09, + "loss": 1.1448, + "step": 2431 + }, + { + "epoch": 9.846153846153847, + "grad_norm": 6.2936591308815215, + "learning_rate": 7.59241783341913e-09, + "loss": 0.9191, + "step": 2432 + }, + { + "epoch": 9.850202429149798, + "grad_norm": 7.885631371578948, + "learning_rate": 7.2081471792911914e-09, + "loss": 1.1249, + "step": 2433 + }, + { + "epoch": 9.854251012145749, + "grad_norm": 7.734452586270721, + "learning_rate": 6.833848110461283e-09, + "loss": 1.1522, + "step": 2434 + }, + { + "epoch": 9.8582995951417, + "grad_norm": 5.289881410244237, + "learning_rate": 6.469521374477539e-09, + "loss": 1.1116, + "step": 2435 + }, + { + "epoch": 9.862348178137651, + "grad_norm": 7.185956641577389, + "learning_rate": 6.115167698972912e-09, + "loss": 1.11, + "step": 2436 + }, + { + "epoch": 9.866396761133604, + "grad_norm": 6.862756557846077, + "learning_rate": 5.770787791661292e-09, + "loss": 1.0761, + "step": 2437 + }, + { + "epoch": 9.870445344129555, + "grad_norm": 8.513662562199336, + "learning_rate": 5.436382340335833e-09, + "loss": 1.4742, + "step": 2438 + }, + { + "epoch": 9.874493927125506, + "grad_norm": 7.096665938831381, + "learning_rate": 5.111952012870624e-09, + "loss": 1.4265, + "step": 2439 + }, + { + "epoch": 9.878542510121457, + "grad_norm": 7.574301958544817, + "learning_rate": 4.797497457216804e-09, + "loss": 1.2196, + "step": 2440 + }, + { + 
"epoch": 9.882591093117409, + "grad_norm": 6.916168526679012, + "learning_rate": 4.493019301401447e-09, + "loss": 1.1487, + "step": 2441 + }, + { + "epoch": 9.88663967611336, + "grad_norm": 6.12111410450182, + "learning_rate": 4.198518153527009e-09, + "loss": 0.8072, + "step": 2442 + }, + { + "epoch": 9.89068825910931, + "grad_norm": 5.942490477342795, + "learning_rate": 3.9139946017713315e-09, + "loss": 1.3326, + "step": 2443 + }, + { + "epoch": 9.894736842105264, + "grad_norm": 5.858437003056172, + "learning_rate": 3.6394492143820847e-09, + "loss": 1.3361, + "step": 2444 + }, + { + "epoch": 9.898785425101215, + "grad_norm": 5.054070594148292, + "learning_rate": 3.3748825396817675e-09, + "loss": 1.1313, + "step": 2445 + }, + { + "epoch": 9.902834008097166, + "grad_norm": 5.963132718905433, + "learning_rate": 3.120295106060489e-09, + "loss": 1.1024, + "step": 2446 + }, + { + "epoch": 9.906882591093117, + "grad_norm": 6.438564124074531, + "learning_rate": 2.875687421980966e-09, + "loss": 1.1277, + "step": 2447 + }, + { + "epoch": 9.910931174089068, + "grad_norm": 5.278764051706532, + "learning_rate": 2.6410599759713052e-09, + "loss": 0.9121, + "step": 2448 + }, + { + "epoch": 9.914979757085021, + "grad_norm": 5.930913466658232, + "learning_rate": 2.4164132366294444e-09, + "loss": 0.985, + "step": 2449 + }, + { + "epoch": 9.919028340080972, + "grad_norm": 6.134886702370438, + "learning_rate": 2.201747652618713e-09, + "loss": 0.9875, + "step": 2450 + }, + { + "epoch": 9.923076923076923, + "grad_norm": 4.919043912938238, + "learning_rate": 1.997063652668385e-09, + "loss": 1.3108, + "step": 2451 + }, + { + "epoch": 9.927125506072874, + "grad_norm": 6.59710619456491, + "learning_rate": 1.8023616455731253e-09, + "loss": 1.1487, + "step": 2452 + }, + { + "epoch": 9.931174089068826, + "grad_norm": 6.7268729919729955, + "learning_rate": 1.6176420201902132e-09, + "loss": 1.1327, + "step": 2453 + }, + { + "epoch": 9.935222672064777, + "grad_norm": 6.801182722691009, + "learning_rate": 1.4429051454412092e-09, + "loss": 0.9427, + "step": 2454 + }, + { + "epoch": 9.939271255060728, + "grad_norm": 6.409116476021893, + "learning_rate": 1.2781513703102877e-09, + "loss": 0.9617, + "step": 2455 + }, + { + "epoch": 9.94331983805668, + "grad_norm": 7.345877062431902, + "learning_rate": 1.1233810238425735e-09, + "loss": 1.2441, + "step": 2456 + }, + { + "epoch": 9.947368421052632, + "grad_norm": 5.764298273469222, + "learning_rate": 9.78594415145806e-10, + "loss": 0.9852, + "step": 2457 + }, + { + "epoch": 9.951417004048583, + "grad_norm": 7.41040208403738, + "learning_rate": 8.437918333864537e-10, + "loss": 0.8827, + "step": 2458 + }, + { + "epoch": 9.955465587044534, + "grad_norm": 6.690719160118015, + "learning_rate": 7.189735477913795e-10, + "loss": 1.207, + "step": 2459 + }, + { + "epoch": 9.959514170040485, + "grad_norm": 6.426110798176273, + "learning_rate": 6.041398076478411e-10, + "loss": 1.2944, + "step": 2460 + }, + { + "epoch": 9.963562753036438, + "grad_norm": 5.67612330623261, + "learning_rate": 4.99290842301825e-10, + "loss": 1.2245, + "step": 2461 + }, + { + "epoch": 9.96761133603239, + "grad_norm": 7.12228691332672, + "learning_rate": 4.0442686115582665e-10, + "loss": 1.2106, + "step": 2462 + }, + { + "epoch": 9.97165991902834, + "grad_norm": 7.171942174513077, + "learning_rate": 3.195480536732909e-10, + "loss": 1.1455, + "step": 2463 + }, + { + "epoch": 9.975708502024291, + "grad_norm": 5.323715551715951, + "learning_rate": 2.446545893730612e-10, + "loss": 1.2116, + "step": 2464 + }, + { + 
"epoch": 9.979757085020243, + "grad_norm": 6.645046974916862, + "learning_rate": 1.797466178327101e-10, + "loss": 1.2553, + "step": 2465 + }, + { + "epoch": 9.983805668016194, + "grad_norm": 6.179117377649965, + "learning_rate": 1.2482426868520858e-10, + "loss": 1.3211, + "step": 2466 + }, + { + "epoch": 9.987854251012145, + "grad_norm": 5.9598947487901075, + "learning_rate": 7.988765162225687e-11, + "loss": 1.1147, + "step": 2467 + }, + { + "epoch": 9.991902834008098, + "grad_norm": 5.288287586968503, + "learning_rate": 4.4936856390398465e-11, + "loss": 1.1298, + "step": 2468 + }, + { + "epoch": 9.995951417004049, + "grad_norm": 5.809768480823839, + "learning_rate": 1.9971952793240713e-11, + "loss": 1.454, + "step": 2469 + }, + { + "epoch": 10.0, + "grad_norm": 5.8240784074284795, + "learning_rate": 4.992990691454758e-12, + "loss": 1.1531, + "step": 2470 + }, + { + "epoch": 10.0, + "step": 2470, + "total_flos": 750948524032000.0, + "train_loss": 1.639819175053222, + "train_runtime": 7687.531, + "train_samples_per_second": 2.57, + "train_steps_per_second": 0.321 + } + ], + "logging_steps": 1, + "max_steps": 2470, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 1976, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 750948524032000.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/training_args.bin b/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..78bb788b48fdaeefa100fcca732cd4ad5de338f1 --- /dev/null +++ b/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db617e3c3ae788b627938f09c1b4708215392619dbc3a2b63a88ab23d37b875b +size 7608 diff --git a/training_loss.png b/training_loss.png new file mode 100644 index 0000000000000000000000000000000000000000..b726123e33dd1390382a6a679ce48247ee002686 Binary files /dev/null and b/training_loss.png differ