{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 6.0,
  "eval_steps": 720,
  "global_step": 21594,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.13892747985551543,
      "grad_norm": 0.595678985118866,
      "learning_rate": 0.00039537834583680656,
      "loss": 0.376,
      "step": 500
    },
    {
      "epoch": 0.2000555709919422,
      "eval_loss": 0.30611157417297363,
      "eval_runtime": 21.4927,
      "eval_samples_per_second": 23.264,
      "eval_steps_per_second": 2.931,
      "step": 720
    },
    {
      "epoch": 0.27785495971103086,
      "grad_norm": 0.7439117431640625,
      "learning_rate": 0.0003907474298416227,
      "loss": 0.2956,
      "step": 1000
    },
    {
      "epoch": 0.4001111419838844,
      "eval_loss": 0.271937757730484,
      "eval_runtime": 21.3424,
      "eval_samples_per_second": 23.428,
      "eval_steps_per_second": 2.952,
      "step": 1440
    },
    {
      "epoch": 0.41678243956654626,
      "grad_norm": 0.5364288091659546,
      "learning_rate": 0.00038611651384643886,
      "loss": 0.2658,
      "step": 1500
    },
    {
      "epoch": 0.5557099194220617,
      "grad_norm": 0.5661336779594421,
      "learning_rate": 0.000381485597851255,
      "loss": 0.2461,
      "step": 2000
    },
    {
      "epoch": 0.6001667129758266,
      "eval_loss": 0.2515988349914551,
      "eval_runtime": 21.3216,
      "eval_samples_per_second": 23.45,
      "eval_steps_per_second": 2.955,
      "step": 2160
    },
    {
      "epoch": 0.6946373992775771,
      "grad_norm": 0.4893731474876404,
      "learning_rate": 0.00037685468185607115,
      "loss": 0.2307,
      "step": 2500
    },
    {
      "epoch": 0.8002222839677688,
      "eval_loss": 0.22781750559806824,
      "eval_runtime": 21.3004,
      "eval_samples_per_second": 23.474,
      "eval_steps_per_second": 2.958,
      "step": 2880
    },
    {
      "epoch": 0.8335648791330925,
      "grad_norm": 0.5245664119720459,
      "learning_rate": 0.0003722237658608873,
      "loss": 0.2205,
      "step": 3000
    },
    {
      "epoch": 0.972492358988608,
      "grad_norm": 0.7262536287307739,
      "learning_rate": 0.00036759284986570345,
      "loss": 0.2154,
      "step": 3500
    },
    {
      "epoch": 1.000277854959711,
      "eval_loss": 0.21747416257858276,
      "eval_runtime": 21.4198,
      "eval_samples_per_second": 23.343,
      "eval_steps_per_second": 2.941,
      "step": 3600
    },
    {
      "epoch": 1.1114198388441234,
      "grad_norm": 0.5650269389152527,
      "learning_rate": 0.0003629619338705196,
      "loss": 0.1815,
      "step": 4000
    },
    {
      "epoch": 1.2003334259516532,
      "eval_loss": 0.2159462422132492,
      "eval_runtime": 21.4142,
      "eval_samples_per_second": 23.349,
      "eval_steps_per_second": 2.942,
      "step": 4320
    },
    {
      "epoch": 1.2503473186996388,
      "grad_norm": 0.6974398493766785,
      "learning_rate": 0.00035833101787533574,
      "loss": 0.1813,
      "step": 4500
    },
    {
      "epoch": 1.3892747985551543,
      "grad_norm": 0.6206701993942261,
      "learning_rate": 0.0003537001018801519,
      "loss": 0.1785,
      "step": 5000
    },
    {
      "epoch": 1.4003889969435954,
      "eval_loss": 0.20631669461727142,
      "eval_runtime": 21.3451,
      "eval_samples_per_second": 23.425,
      "eval_steps_per_second": 2.952,
      "step": 5040
    },
    {
      "epoch": 1.5282022784106695,
      "grad_norm": 0.6988590359687805,
      "learning_rate": 0.00034906918588496804,
      "loss": 0.1748,
      "step": 5500
    },
    {
      "epoch": 1.6004445679355377,
      "eval_loss": 0.20530453324317932,
      "eval_runtime": 21.3207,
      "eval_samples_per_second": 23.451,
      "eval_steps_per_second": 2.955,
      "step": 5760
    },
    {
      "epoch": 1.667129758266185,
      "grad_norm": 0.5506817698478699,
      "learning_rate": 0.0003444382698897842,
      "loss": 0.1715,
      "step": 6000
    },
    {
      "epoch": 1.8005001389274797,
      "eval_loss": 0.19710968434810638,
      "eval_runtime": 21.2884,
      "eval_samples_per_second": 23.487,
      "eval_steps_per_second": 2.959,
      "step": 6480
    },
    {
      "epoch": 1.8060572381217006,
      "grad_norm": 0.464895099401474,
      "learning_rate": 0.0003398073538946004,
      "loss": 0.1678,
      "step": 6500
    },
    {
      "epoch": 1.9449847179772157,
      "grad_norm": 0.6984584331512451,
      "learning_rate": 0.0003351764378994165,
      "loss": 0.1664,
      "step": 7000
    },
    {
      "epoch": 2.000555709919422,
      "eval_loss": 0.18991900980472565,
      "eval_runtime": 21.3167,
      "eval_samples_per_second": 23.456,
      "eval_steps_per_second": 2.955,
      "step": 7200
    },
    {
      "epoch": 2.0839121978327313,
      "grad_norm": 0.6013078093528748,
      "learning_rate": 0.0003305455219042327,
      "loss": 0.1452,
      "step": 7500
    },
    {
      "epoch": 2.2006112809113643,
      "eval_loss": 0.1970534473657608,
      "eval_runtime": 21.6809,
      "eval_samples_per_second": 23.062,
      "eval_steps_per_second": 2.906,
      "step": 7920
    },
    {
      "epoch": 2.222839677688247,
      "grad_norm": 0.516161322593689,
      "learning_rate": 0.0003259146059090488,
      "loss": 0.1352,
      "step": 8000
    },
    {
      "epoch": 2.361767157543762,
      "grad_norm": 0.6993707418441772,
      "learning_rate": 0.000321283689913865,
      "loss": 0.1363,
      "step": 8500
    },
    {
      "epoch": 2.4006668519033063,
      "eval_loss": 0.18779733777046204,
      "eval_runtime": 21.3417,
      "eval_samples_per_second": 23.428,
      "eval_steps_per_second": 2.952,
      "step": 8640
    },
    {
      "epoch": 2.5006946373992776,
      "grad_norm": 0.542614758014679,
      "learning_rate": 0.00031665277391868115,
      "loss": 0.1345,
      "step": 9000
    },
    {
      "epoch": 2.600722422895249,
      "eval_loss": 0.18519769608974457,
      "eval_runtime": 21.5664,
      "eval_samples_per_second": 23.184,
      "eval_steps_per_second": 2.921,
      "step": 9360
    },
    {
      "epoch": 2.639622117254793,
      "grad_norm": 0.691017746925354,
      "learning_rate": 0.00031202185792349727,
      "loss": 0.1353,
      "step": 9500
    },
    {
      "epoch": 2.7785495971103087,
      "grad_norm": 0.4563291370868683,
      "learning_rate": 0.0003073909419283134,
      "loss": 0.1345,
      "step": 10000
    },
    {
      "epoch": 2.800777993887191,
      "eval_loss": 0.18035078048706055,
      "eval_runtime": 21.2975,
      "eval_samples_per_second": 23.477,
      "eval_steps_per_second": 2.958,
      "step": 10080
    },
    {
      "epoch": 2.917477076965824,
      "grad_norm": 0.6514193415641785,
      "learning_rate": 0.00030276002593312957,
      "loss": 0.1355,
      "step": 10500
    },
    {
      "epoch": 3.000833564879133,
      "eval_loss": 0.17824167013168335,
      "eval_runtime": 21.3358,
      "eval_samples_per_second": 23.435,
      "eval_steps_per_second": 2.953,
      "step": 10800
    },
    {
      "epoch": 3.0564045568213394,
      "grad_norm": 0.7063578367233276,
      "learning_rate": 0.00029812910993794574,
      "loss": 0.1222,
      "step": 11000
    },
    {
      "epoch": 3.1953320366768545,
      "grad_norm": 0.7858085632324219,
      "learning_rate": 0.0002934981939427619,
      "loss": 0.1072,
      "step": 11500
    },
    {
      "epoch": 3.2008891358710754,
      "eval_loss": 0.18112412095069885,
      "eval_runtime": 21.3193,
      "eval_samples_per_second": 23.453,
      "eval_steps_per_second": 2.955,
      "step": 11520
    },
    {
      "epoch": 3.33425951653237,
      "grad_norm": 0.962510883808136,
      "learning_rate": 0.00028886727794757804,
      "loss": 0.1086,
      "step": 12000
    },
    {
      "epoch": 3.4009447068630174,
      "eval_loss": 0.17723555862903595,
      "eval_runtime": 21.2949,
      "eval_samples_per_second": 23.48,
      "eval_steps_per_second": 2.958,
      "step": 12240
    },
    {
      "epoch": 3.4731869963878856,
      "grad_norm": 0.47604429721832275,
      "learning_rate": 0.00028423636195239416,
      "loss": 0.109,
      "step": 12500
    },
    {
      "epoch": 3.6010002778549595,
      "eval_loss": 0.1781689077615738,
      "eval_runtime": 21.2531,
      "eval_samples_per_second": 23.526,
      "eval_steps_per_second": 2.964,
      "step": 12960
    },
    {
      "epoch": 3.612114476243401,
      "grad_norm": 0.49760594964027405,
      "learning_rate": 0.00027960544595721033,
      "loss": 0.1099,
      "step": 13000
    },
    {
      "epoch": 3.7510419560989163,
      "grad_norm": 0.6786069869995117,
      "learning_rate": 0.0002749745299620265,
      "loss": 0.1084,
      "step": 13500
    },
    {
      "epoch": 3.801055848846902,
      "eval_loss": 0.17475004494190216,
      "eval_runtime": 21.2523,
      "eval_samples_per_second": 23.527,
      "eval_steps_per_second": 2.964,
      "step": 13680
    },
    {
      "epoch": 3.889969435954432,
      "grad_norm": 0.8729720115661621,
      "learning_rate": 0.0002703436139668427,
      "loss": 0.1114,
      "step": 14000
    },
    {
      "epoch": 4.001111419838844,
      "eval_loss": 0.16653864085674286,
      "eval_runtime": 21.3451,
      "eval_samples_per_second": 23.425,
      "eval_steps_per_second": 2.951,
      "step": 14400
    },
    {
      "epoch": 4.0288969158099475,
      "grad_norm": 0.7045803070068359,
      "learning_rate": 0.0002657126979716588,
      "loss": 0.1038,
      "step": 14500
    },
    {
      "epoch": 4.167824395665463,
      "grad_norm": 0.9094932079315186,
      "learning_rate": 0.0002610817819764749,
      "loss": 0.0839,
      "step": 15000
    },
    {
      "epoch": 4.201166990830786,
      "eval_loss": 0.1762482225894928,
      "eval_runtime": 21.3199,
      "eval_samples_per_second": 23.452,
      "eval_steps_per_second": 2.955,
      "step": 15120
    },
    {
      "epoch": 4.306751875520978,
      "grad_norm": 0.6102758646011353,
      "learning_rate": 0.0002564508659812911,
      "loss": 0.0878,
      "step": 15500
    },
    {
      "epoch": 4.4012225618227285,
      "eval_loss": 0.17415344715118408,
      "eval_runtime": 21.3432,
      "eval_samples_per_second": 23.427,
      "eval_steps_per_second": 2.952,
      "step": 15840
    },
    {
      "epoch": 4.445679355376494,
      "grad_norm": 0.46317267417907715,
      "learning_rate": 0.00025181994998610727,
      "loss": 0.0883,
      "step": 16000
    },
    {
      "epoch": 4.584606835232009,
      "grad_norm": 0.6532848477363586,
      "learning_rate": 0.00024718903399092345,
      "loss": 0.0887,
      "step": 16500
    },
    {
      "epoch": 4.601278132814671,
      "eval_loss": 0.18139471113681793,
      "eval_runtime": 21.5324,
      "eval_samples_per_second": 23.221,
      "eval_steps_per_second": 2.926,
      "step": 16560
    },
    {
      "epoch": 4.723534315087524,
      "grad_norm": 0.8044198155403137,
      "learning_rate": 0.00024255811799573954,
      "loss": 0.0928,
      "step": 17000
    },
    {
      "epoch": 4.801333703806613,
      "eval_loss": 0.16956336796283722,
      "eval_runtime": 22.0239,
      "eval_samples_per_second": 22.703,
      "eval_steps_per_second": 2.861,
      "step": 17280
    },
    {
      "epoch": 4.86246179494304,
      "grad_norm": 0.6072468161582947,
      "learning_rate": 0.00023792720200055571,
      "loss": 0.0915,
      "step": 17500
    },
    {
      "epoch": 5.001389274798555,
      "grad_norm": 0.7845910787582397,
      "learning_rate": 0.0002332962860053719,
      "loss": 0.0888,
      "step": 18000
    },
    {
      "epoch": 5.001389274798555,
      "eval_loss": 0.17226029932498932,
      "eval_runtime": 21.2768,
      "eval_samples_per_second": 23.5,
      "eval_steps_per_second": 2.961,
      "step": 18000
    },
    {
      "epoch": 5.14031675465407,
      "grad_norm": 0.4631412923336029,
      "learning_rate": 0.00022866537001018804,
      "loss": 0.067,
      "step": 18500
    },
    {
      "epoch": 5.201444845790498,
      "eval_loss": 0.18070575594902039,
      "eval_runtime": 21.433,
      "eval_samples_per_second": 23.328,
      "eval_steps_per_second": 2.939,
      "step": 18720
    },
    {
      "epoch": 5.279244234509586,
      "grad_norm": 0.5497326850891113,
      "learning_rate": 0.00022403445401500416,
      "loss": 0.0696,
      "step": 19000
    },
    {
      "epoch": 5.401500416782439,
      "eval_loss": 0.17943890392780304,
      "eval_runtime": 21.3084,
      "eval_samples_per_second": 23.465,
      "eval_steps_per_second": 2.957,
      "step": 19440
    },
    {
      "epoch": 5.418171714365101,
      "grad_norm": 0.455014705657959,
      "learning_rate": 0.00021940353801982033,
      "loss": 0.0717,
      "step": 19500
    },
    {
      "epoch": 5.5570991942206165,
      "grad_norm": 0.4856937527656555,
      "learning_rate": 0.00021477262202463648,
      "loss": 0.0724,
      "step": 20000
    },
    {
      "epoch": 5.601555987774382,
      "eval_loss": 0.1766962856054306,
      "eval_runtime": 21.2925,
      "eval_samples_per_second": 23.482,
      "eval_steps_per_second": 2.959,
      "step": 20160
    },
    {
      "epoch": 5.6960266740761325,
      "grad_norm": 0.9877897500991821,
      "learning_rate": 0.00021014170602945265,
      "loss": 0.0748,
      "step": 20500
    },
    {
      "epoch": 5.801611558766324,
      "eval_loss": 0.17774970829486847,
      "eval_runtime": 21.463,
      "eval_samples_per_second": 23.296,
      "eval_steps_per_second": 2.935,
      "step": 20880
    },
    {
      "epoch": 5.834954153931648,
      "grad_norm": 0.5059243440628052,
      "learning_rate": 0.0002055107900342688,
      "loss": 0.0732,
      "step": 21000
    },
    {
      "epoch": 5.973881633787163,
      "grad_norm": 0.6043376326560974,
      "learning_rate": 0.00020087987403908492,
      "loss": 0.0731,
      "step": 21500
    }
  ],
  "logging_steps": 500,
  "max_steps": 43188,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 12,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 8.186199114359243e+18,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}