{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.0,
  "eval_steps": 720,
  "global_step": 10797,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.13892747985551543,
      "grad_norm": 0.6735790967941284,
      "learning_rate": 0.0007907937390015746,
      "loss": 0.3742,
      "step": 500
    },
    {
      "epoch": 0.2000555709919422,
      "eval_loss": 0.2844131588935852,
      "eval_runtime": 5.091,
      "eval_samples_per_second": 98.212,
      "eval_steps_per_second": 6.286,
      "step": 720
    },
    {
      "epoch": 0.27785495971103086,
      "grad_norm": 0.6214504241943359,
      "learning_rate": 0.0007815319070112068,
      "loss": 0.2859,
      "step": 1000
    },
    {
      "epoch": 0.4001111419838844,
      "eval_loss": 0.2708466649055481,
      "eval_runtime": 4.8349,
      "eval_samples_per_second": 103.416,
      "eval_steps_per_second": 6.619,
      "step": 1440
    },
    {
      "epoch": 0.41678243956654626,
      "grad_norm": 0.5057302117347717,
      "learning_rate": 0.0007722700750208392,
      "loss": 0.2679,
      "step": 1500
    },
    {
      "epoch": 0.5557099194220617,
      "grad_norm": 0.623817503452301,
      "learning_rate": 0.0007630082430304714,
      "loss": 0.2527,
      "step": 2000
    },
    {
      "epoch": 0.6001667129758266,
      "eval_loss": 0.24877774715423584,
      "eval_runtime": 5.9656,
      "eval_samples_per_second": 83.813,
      "eval_steps_per_second": 5.364,
      "step": 2160
    },
    {
      "epoch": 0.6946373992775771,
      "grad_norm": 1.3911620378494263,
      "learning_rate": 0.0007537649347040845,
      "loss": 0.2441,
      "step": 2500
    },
    {
      "epoch": 0.8002222839677688,
      "eval_loss": 0.2391277253627777,
      "eval_runtime": 5.9872,
      "eval_samples_per_second": 83.512,
      "eval_steps_per_second": 5.345,
      "step": 2880
    },
    {
      "epoch": 0.8335648791330925,
      "grad_norm": 0.7922923564910889,
      "learning_rate": 0.0007445216263776975,
      "loss": 0.235,
      "step": 3000
    },
    {
      "epoch": 0.972492358988608,
      "grad_norm": 0.8539218902587891,
      "learning_rate": 0.0007352597943873298,
      "loss": 0.2331,
      "step": 3500
    },
    {
      "epoch": 1.000277854959711,
      "eval_loss": 0.23887279629707336,
      "eval_runtime": 5.6004,
      "eval_samples_per_second": 89.279,
      "eval_steps_per_second": 5.714,
      "step": 3600
    },
    {
      "epoch": 1.1114198388441234,
      "grad_norm": 0.6812190413475037,
      "learning_rate": 0.0007259979623969622,
      "loss": 0.2201,
      "step": 4000
    },
    {
      "epoch": 1.2003334259516532,
      "eval_loss": 0.2331564724445343,
      "eval_runtime": 5.2457,
      "eval_samples_per_second": 95.317,
      "eval_steps_per_second": 6.1,
      "step": 4320
    },
    {
      "epoch": 1.2503473186996388,
      "grad_norm": 0.6926279067993164,
      "learning_rate": 0.0007167361304065945,
      "loss": 0.2199,
      "step": 4500
    },
    {
      "epoch": 1.3892747985551543,
      "grad_norm": 0.857128918170929,
      "learning_rate": 0.0007074742984162268,
      "loss": 0.2157,
      "step": 5000
    },
    {
      "epoch": 1.4003889969435954,
      "eval_loss": 0.22209732234477997,
      "eval_runtime": 4.9003,
      "eval_samples_per_second": 102.035,
      "eval_steps_per_second": 6.53,
      "step": 5040
    },
    {
      "epoch": 1.5282022784106695,
      "grad_norm": 0.8350584506988525,
      "learning_rate": 0.000698212466425859,
      "loss": 0.2122,
      "step": 5500
    },
    {
      "epoch": 1.6004445679355377,
      "eval_loss": 0.22165100276470184,
      "eval_runtime": 4.8879,
      "eval_samples_per_second": 102.294,
      "eval_steps_per_second": 6.547,
      "step": 5760
    },
    {
      "epoch": 1.667129758266185,
      "grad_norm": 0.6862344145774841,
      "learning_rate": 0.0006889691580994721,
      "loss": 0.2158,
      "step": 6000
    },
    {
      "epoch": 1.8005001389274797,
      "eval_loss": 0.21265414357185364,
      "eval_runtime": 5.1036,
      "eval_samples_per_second": 97.97,
      "eval_steps_per_second": 6.27,
      "step": 6480
    },
    {
      "epoch": 1.8060572381217006,
      "grad_norm": 0.64631587266922,
      "learning_rate": 0.0006797073261091044,
      "loss": 0.2069,
      "step": 6500
    },
    {
      "epoch": 1.9449847179772157,
      "grad_norm": 0.6139649748802185,
      "learning_rate": 0.0006704454941187368,
      "loss": 0.2063,
      "step": 7000
    },
    {
      "epoch": 2.000555709919422,
      "eval_loss": 0.2054276168346405,
      "eval_runtime": 5.6047,
      "eval_samples_per_second": 89.211,
      "eval_steps_per_second": 5.71,
      "step": 7200
    },
    {
      "epoch": 2.0839121978327313,
      "grad_norm": 0.5134121179580688,
      "learning_rate": 0.000661183662128369,
      "loss": 0.1961,
      "step": 7500
    },
    {
      "epoch": 2.2006112809113643,
      "eval_loss": 0.20968343317508698,
      "eval_runtime": 4.9427,
      "eval_samples_per_second": 101.16,
      "eval_steps_per_second": 6.474,
      "step": 7920
    },
    {
      "epoch": 2.222839677688247,
      "grad_norm": 0.667497456073761,
      "learning_rate": 0.0006519218301380013,
      "loss": 0.198,
      "step": 8000
    },
    {
      "epoch": 2.361767157543762,
      "grad_norm": 0.7320435643196106,
      "learning_rate": 0.0006426599981476336,
      "loss": 0.1942,
      "step": 8500
    },
    {
      "epoch": 2.4006668519033063,
      "eval_loss": 0.201798677444458,
      "eval_runtime": 5.0966,
      "eval_samples_per_second": 98.105,
      "eval_steps_per_second": 6.279,
      "step": 8640
    },
    {
      "epoch": 2.5006946373992776,
      "grad_norm": 0.5457369685173035,
      "learning_rate": 0.000633398166157266,
      "loss": 0.1904,
      "step": 9000
    },
    {
      "epoch": 2.600722422895249,
      "eval_loss": 0.19970637559890747,
      "eval_runtime": 4.9095,
      "eval_samples_per_second": 101.843,
      "eval_steps_per_second": 6.518,
      "step": 9360
    },
    {
      "epoch": 2.639622117254793,
      "grad_norm": null,
      "learning_rate": 0.0006241363341668982,
      "loss": 0.1928,
      "step": 9500
    },
    {
      "epoch": 2.7785495971103087,
      "grad_norm": 0.5789324045181274,
      "learning_rate": 0.0006148930258405113,
      "loss": 0.1914,
      "step": 10000
    },
    {
      "epoch": 2.800777993887191,
      "eval_loss": 0.20011071860790253,
      "eval_runtime": 4.9182,
      "eval_samples_per_second": 101.663,
      "eval_steps_per_second": 6.506,
      "step": 10080
    },
    {
      "epoch": 2.917477076965824,
      "grad_norm": 0.5479323863983154,
      "learning_rate": 0.0006056311938501436,
      "loss": 0.193,
      "step": 10500
    }
  ],
  "logging_steps": 500,
  "max_steps": 43188,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 12,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 3.036449305620644e+17,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}