{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 6.0, "eval_steps": 720, "global_step": 21594, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.13892747985551543, "grad_norm": 0.595678985118866, "learning_rate": 0.00039537834583680656, "loss": 0.376, "step": 500 }, { "epoch": 0.2000555709919422, "eval_loss": 0.30611157417297363, "eval_runtime": 21.4927, "eval_samples_per_second": 23.264, "eval_steps_per_second": 2.931, "step": 720 }, { "epoch": 0.27785495971103086, "grad_norm": 0.7439117431640625, "learning_rate": 0.0003907474298416227, "loss": 0.2956, "step": 1000 }, { "epoch": 0.4001111419838844, "eval_loss": 0.271937757730484, "eval_runtime": 21.3424, "eval_samples_per_second": 23.428, "eval_steps_per_second": 2.952, "step": 1440 }, { "epoch": 0.41678243956654626, "grad_norm": 0.5364288091659546, "learning_rate": 0.00038611651384643886, "loss": 0.2658, "step": 1500 }, { "epoch": 0.5557099194220617, "grad_norm": 0.5661336779594421, "learning_rate": 0.000381485597851255, "loss": 0.2461, "step": 2000 }, { "epoch": 0.6001667129758266, "eval_loss": 0.2515988349914551, "eval_runtime": 21.3216, "eval_samples_per_second": 23.45, "eval_steps_per_second": 2.955, "step": 2160 }, { "epoch": 0.6946373992775771, "grad_norm": 0.4893731474876404, "learning_rate": 0.00037685468185607115, "loss": 0.2307, "step": 2500 }, { "epoch": 0.8002222839677688, "eval_loss": 0.22781750559806824, "eval_runtime": 21.3004, "eval_samples_per_second": 23.474, "eval_steps_per_second": 2.958, "step": 2880 }, { "epoch": 0.8335648791330925, "grad_norm": 0.5245664119720459, "learning_rate": 0.0003722237658608873, "loss": 0.2205, "step": 3000 }, { "epoch": 0.972492358988608, "grad_norm": 0.7262536287307739, "learning_rate": 0.00036759284986570345, "loss": 0.2154, "step": 3500 }, { "epoch": 1.000277854959711, "eval_loss": 0.21747416257858276, "eval_runtime": 21.4198, "eval_samples_per_second": 23.343, "eval_steps_per_second": 2.941, "step": 3600 }, { "epoch": 1.1114198388441234, "grad_norm": 0.5650269389152527, "learning_rate": 0.0003629619338705196, "loss": 0.1815, "step": 4000 }, { "epoch": 1.2003334259516532, "eval_loss": 0.2159462422132492, "eval_runtime": 21.4142, "eval_samples_per_second": 23.349, "eval_steps_per_second": 2.942, "step": 4320 }, { "epoch": 1.2503473186996388, "grad_norm": 0.6974398493766785, "learning_rate": 0.00035833101787533574, "loss": 0.1813, "step": 4500 }, { "epoch": 1.3892747985551543, "grad_norm": 0.6206701993942261, "learning_rate": 0.0003537001018801519, "loss": 0.1785, "step": 5000 }, { "epoch": 1.4003889969435954, "eval_loss": 0.20631669461727142, "eval_runtime": 21.3451, "eval_samples_per_second": 23.425, "eval_steps_per_second": 2.952, "step": 5040 }, { "epoch": 1.5282022784106695, "grad_norm": 0.6988590359687805, "learning_rate": 0.00034906918588496804, "loss": 0.1748, "step": 5500 }, { "epoch": 1.6004445679355377, "eval_loss": 0.20530453324317932, "eval_runtime": 21.3207, "eval_samples_per_second": 23.451, "eval_steps_per_second": 2.955, "step": 5760 }, { "epoch": 1.667129758266185, "grad_norm": 0.5506817698478699, "learning_rate": 0.0003444382698897842, "loss": 0.1715, "step": 6000 }, { "epoch": 1.8005001389274797, "eval_loss": 0.19710968434810638, "eval_runtime": 21.2884, "eval_samples_per_second": 23.487, "eval_steps_per_second": 2.959, "step": 6480 }, { "epoch": 1.8060572381217006, "grad_norm": 0.464895099401474, "learning_rate": 0.0003398073538946004, "loss": 0.1678, "step": 6500 }, { "epoch": 1.9449847179772157, "grad_norm": 0.6984584331512451, "learning_rate": 0.0003351764378994165, "loss": 0.1664, "step": 7000 }, { "epoch": 2.000555709919422, "eval_loss": 0.18991900980472565, "eval_runtime": 21.3167, "eval_samples_per_second": 23.456, "eval_steps_per_second": 2.955, "step": 7200 }, { "epoch": 2.0839121978327313, "grad_norm": 0.6013078093528748, "learning_rate": 0.0003305455219042327, "loss": 0.1452, "step": 7500 }, { "epoch": 2.2006112809113643, "eval_loss": 0.1970534473657608, "eval_runtime": 21.6809, "eval_samples_per_second": 23.062, "eval_steps_per_second": 2.906, "step": 7920 }, { "epoch": 2.222839677688247, "grad_norm": 0.516161322593689, "learning_rate": 0.0003259146059090488, "loss": 0.1352, "step": 8000 }, { "epoch": 2.361767157543762, "grad_norm": 0.6993707418441772, "learning_rate": 0.000321283689913865, "loss": 0.1363, "step": 8500 }, { "epoch": 2.4006668519033063, "eval_loss": 0.18779733777046204, "eval_runtime": 21.3417, "eval_samples_per_second": 23.428, "eval_steps_per_second": 2.952, "step": 8640 }, { "epoch": 2.5006946373992776, "grad_norm": 0.542614758014679, "learning_rate": 0.00031665277391868115, "loss": 0.1345, "step": 9000 }, { "epoch": 2.600722422895249, "eval_loss": 0.18519769608974457, "eval_runtime": 21.5664, "eval_samples_per_second": 23.184, "eval_steps_per_second": 2.921, "step": 9360 }, { "epoch": 2.639622117254793, "grad_norm": 0.691017746925354, "learning_rate": 0.00031202185792349727, "loss": 0.1353, "step": 9500 }, { "epoch": 2.7785495971103087, "grad_norm": 0.4563291370868683, "learning_rate": 0.0003073909419283134, "loss": 0.1345, "step": 10000 }, { "epoch": 2.800777993887191, "eval_loss": 0.18035078048706055, "eval_runtime": 21.2975, "eval_samples_per_second": 23.477, "eval_steps_per_second": 2.958, "step": 10080 }, { "epoch": 2.917477076965824, "grad_norm": 0.6514193415641785, "learning_rate": 0.00030276002593312957, "loss": 0.1355, "step": 10500 }, { "epoch": 3.000833564879133, "eval_loss": 0.17824167013168335, "eval_runtime": 21.3358, "eval_samples_per_second": 23.435, "eval_steps_per_second": 2.953, "step": 10800 }, { "epoch": 3.0564045568213394, "grad_norm": 0.7063578367233276, "learning_rate": 0.00029812910993794574, "loss": 0.1222, "step": 11000 }, { "epoch": 3.1953320366768545, "grad_norm": 0.7858085632324219, "learning_rate": 0.0002934981939427619, "loss": 0.1072, "step": 11500 }, { "epoch": 3.2008891358710754, "eval_loss": 0.18112412095069885, "eval_runtime": 21.3193, "eval_samples_per_second": 23.453, "eval_steps_per_second": 2.955, "step": 11520 }, { "epoch": 3.33425951653237, "grad_norm": 0.962510883808136, "learning_rate": 0.00028886727794757804, "loss": 0.1086, "step": 12000 }, { "epoch": 3.4009447068630174, "eval_loss": 0.17723555862903595, "eval_runtime": 21.2949, "eval_samples_per_second": 23.48, "eval_steps_per_second": 2.958, "step": 12240 }, { "epoch": 3.4731869963878856, "grad_norm": 0.47604429721832275, "learning_rate": 0.00028423636195239416, "loss": 0.109, "step": 12500 }, { "epoch": 3.6010002778549595, "eval_loss": 0.1781689077615738, "eval_runtime": 21.2531, "eval_samples_per_second": 23.526, "eval_steps_per_second": 2.964, "step": 12960 }, { "epoch": 3.612114476243401, "grad_norm": 0.49760594964027405, "learning_rate": 0.00027960544595721033, "loss": 0.1099, "step": 13000 }, { "epoch": 3.7510419560989163, "grad_norm": 0.6786069869995117, "learning_rate": 0.0002749745299620265, "loss": 0.1084, "step": 13500 }, { "epoch": 3.801055848846902, "eval_loss": 0.17475004494190216, "eval_runtime": 21.2523, "eval_samples_per_second": 23.527, "eval_steps_per_second": 2.964, "step": 13680 }, { "epoch": 3.889969435954432, "grad_norm": 0.8729720115661621, "learning_rate": 0.0002703436139668427, "loss": 0.1114, "step": 14000 }, { "epoch": 4.001111419838844, "eval_loss": 0.16653864085674286, "eval_runtime": 21.3451, "eval_samples_per_second": 23.425, "eval_steps_per_second": 2.951, "step": 14400 }, { "epoch": 4.0288969158099475, "grad_norm": 0.7045803070068359, "learning_rate": 0.0002657126979716588, "loss": 0.1038, "step": 14500 }, { "epoch": 4.167824395665463, "grad_norm": 0.9094932079315186, "learning_rate": 0.0002610817819764749, "loss": 0.0839, "step": 15000 }, { "epoch": 4.201166990830786, "eval_loss": 0.1762482225894928, "eval_runtime": 21.3199, "eval_samples_per_second": 23.452, "eval_steps_per_second": 2.955, "step": 15120 }, { "epoch": 4.306751875520978, "grad_norm": 0.6102758646011353, "learning_rate": 0.0002564508659812911, "loss": 0.0878, "step": 15500 }, { "epoch": 4.4012225618227285, "eval_loss": 0.17415344715118408, "eval_runtime": 21.3432, "eval_samples_per_second": 23.427, "eval_steps_per_second": 2.952, "step": 15840 }, { "epoch": 4.445679355376494, "grad_norm": 0.46317267417907715, "learning_rate": 0.00025181994998610727, "loss": 0.0883, "step": 16000 }, { "epoch": 4.584606835232009, "grad_norm": 0.6532848477363586, "learning_rate": 0.00024718903399092345, "loss": 0.0887, "step": 16500 }, { "epoch": 4.601278132814671, "eval_loss": 0.18139471113681793, "eval_runtime": 21.5324, "eval_samples_per_second": 23.221, "eval_steps_per_second": 2.926, "step": 16560 }, { "epoch": 4.723534315087524, "grad_norm": 0.8044198155403137, "learning_rate": 0.00024255811799573954, "loss": 0.0928, "step": 17000 }, { "epoch": 4.801333703806613, "eval_loss": 0.16956336796283722, "eval_runtime": 22.0239, "eval_samples_per_second": 22.703, "eval_steps_per_second": 2.861, "step": 17280 }, { "epoch": 4.86246179494304, "grad_norm": 0.6072468161582947, "learning_rate": 0.00023792720200055571, "loss": 0.0915, "step": 17500 }, { "epoch": 5.001389274798555, "grad_norm": 0.7845910787582397, "learning_rate": 0.0002332962860053719, "loss": 0.0888, "step": 18000 }, { "epoch": 5.001389274798555, "eval_loss": 0.17226029932498932, "eval_runtime": 21.2768, "eval_samples_per_second": 23.5, "eval_steps_per_second": 2.961, "step": 18000 }, { "epoch": 5.14031675465407, "grad_norm": 0.4631412923336029, "learning_rate": 0.00022866537001018804, "loss": 0.067, "step": 18500 }, { "epoch": 5.201444845790498, "eval_loss": 0.18070575594902039, "eval_runtime": 21.433, "eval_samples_per_second": 23.328, "eval_steps_per_second": 2.939, "step": 18720 }, { "epoch": 5.279244234509586, "grad_norm": 0.5497326850891113, "learning_rate": 0.00022403445401500416, "loss": 0.0696, "step": 19000 }, { "epoch": 5.401500416782439, "eval_loss": 0.17943890392780304, "eval_runtime": 21.3084, "eval_samples_per_second": 23.465, "eval_steps_per_second": 2.957, "step": 19440 }, { "epoch": 5.418171714365101, "grad_norm": 0.455014705657959, "learning_rate": 0.00021940353801982033, "loss": 0.0717, "step": 19500 }, { "epoch": 5.5570991942206165, "grad_norm": 0.4856937527656555, "learning_rate": 0.00021477262202463648, "loss": 0.0724, "step": 20000 }, { "epoch": 5.601555987774382, "eval_loss": 0.1766962856054306, "eval_runtime": 21.2925, "eval_samples_per_second": 23.482, "eval_steps_per_second": 2.959, "step": 20160 }, { "epoch": 5.6960266740761325, "grad_norm": 0.9877897500991821, "learning_rate": 0.00021014170602945265, "loss": 0.0748, "step": 20500 }, { "epoch": 5.801611558766324, "eval_loss": 0.17774970829486847, "eval_runtime": 21.463, "eval_samples_per_second": 23.296, "eval_steps_per_second": 2.935, "step": 20880 }, { "epoch": 5.834954153931648, "grad_norm": 0.5059243440628052, "learning_rate": 0.0002055107900342688, "loss": 0.0732, "step": 21000 }, { "epoch": 5.973881633787163, "grad_norm": 0.6043376326560974, "learning_rate": 0.00020087987403908492, "loss": 0.0731, "step": 21500 } ], "logging_steps": 500, "max_steps": 43188, "num_input_tokens_seen": 0, "num_train_epochs": 12, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 8.186199114359243e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }