{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 9.0, "eval_steps": 720, "global_step": 32391, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.13892747985551543, "grad_norm": 0.595678985118866, "learning_rate": 0.00039537834583680656, "loss": 0.376, "step": 500 }, { "epoch": 0.2000555709919422, "eval_loss": 0.30611157417297363, "eval_runtime": 21.4927, "eval_samples_per_second": 23.264, "eval_steps_per_second": 2.931, "step": 720 }, { "epoch": 0.27785495971103086, "grad_norm": 0.7439117431640625, "learning_rate": 0.0003907474298416227, "loss": 0.2956, "step": 1000 }, { "epoch": 0.4001111419838844, "eval_loss": 0.271937757730484, "eval_runtime": 21.3424, "eval_samples_per_second": 23.428, "eval_steps_per_second": 2.952, "step": 1440 }, { "epoch": 0.41678243956654626, "grad_norm": 0.5364288091659546, "learning_rate": 0.00038611651384643886, "loss": 0.2658, "step": 1500 }, { "epoch": 0.5557099194220617, "grad_norm": 0.5661336779594421, "learning_rate": 0.000381485597851255, "loss": 0.2461, "step": 2000 }, { "epoch": 0.6001667129758266, "eval_loss": 0.2515988349914551, "eval_runtime": 21.3216, "eval_samples_per_second": 23.45, "eval_steps_per_second": 2.955, "step": 2160 }, { "epoch": 0.6946373992775771, "grad_norm": 0.4893731474876404, "learning_rate": 0.00037685468185607115, "loss": 0.2307, "step": 2500 }, { "epoch": 0.8002222839677688, "eval_loss": 0.22781750559806824, "eval_runtime": 21.3004, "eval_samples_per_second": 23.474, "eval_steps_per_second": 2.958, "step": 2880 }, { "epoch": 0.8335648791330925, "grad_norm": 0.5245664119720459, "learning_rate": 0.0003722237658608873, "loss": 0.2205, "step": 3000 }, { "epoch": 0.972492358988608, "grad_norm": 0.7262536287307739, "learning_rate": 0.00036759284986570345, "loss": 0.2154, "step": 3500 }, { "epoch": 1.000277854959711, "eval_loss": 0.21747416257858276, "eval_runtime": 21.4198, "eval_samples_per_second": 23.343, "eval_steps_per_second": 2.941, "step": 3600 }, { "epoch": 1.1114198388441234, "grad_norm": 0.5650269389152527, "learning_rate": 0.0003629619338705196, "loss": 0.1815, "step": 4000 }, { "epoch": 1.2003334259516532, "eval_loss": 0.2159462422132492, "eval_runtime": 21.4142, "eval_samples_per_second": 23.349, "eval_steps_per_second": 2.942, "step": 4320 }, { "epoch": 1.2503473186996388, "grad_norm": 0.6974398493766785, "learning_rate": 0.00035833101787533574, "loss": 0.1813, "step": 4500 }, { "epoch": 1.3892747985551543, "grad_norm": 0.6206701993942261, "learning_rate": 0.0003537001018801519, "loss": 0.1785, "step": 5000 }, { "epoch": 1.4003889969435954, "eval_loss": 0.20631669461727142, "eval_runtime": 21.3451, "eval_samples_per_second": 23.425, "eval_steps_per_second": 2.952, "step": 5040 }, { "epoch": 1.5282022784106695, "grad_norm": 0.6988590359687805, "learning_rate": 0.00034906918588496804, "loss": 0.1748, "step": 5500 }, { "epoch": 1.6004445679355377, "eval_loss": 0.20530453324317932, "eval_runtime": 21.3207, "eval_samples_per_second": 23.451, "eval_steps_per_second": 2.955, "step": 5760 }, { "epoch": 1.667129758266185, "grad_norm": 0.5506817698478699, "learning_rate": 0.0003444382698897842, "loss": 0.1715, "step": 6000 }, { "epoch": 1.8005001389274797, "eval_loss": 0.19710968434810638, "eval_runtime": 21.2884, "eval_samples_per_second": 23.487, "eval_steps_per_second": 2.959, "step": 6480 }, { "epoch": 1.8060572381217006, "grad_norm": 0.464895099401474, "learning_rate": 0.0003398073538946004, "loss": 0.1678, "step": 6500 }, { "epoch": 1.9449847179772157, "grad_norm": 0.6984584331512451, "learning_rate": 0.0003351764378994165, "loss": 0.1664, "step": 7000 }, { "epoch": 2.000555709919422, "eval_loss": 0.18991900980472565, "eval_runtime": 21.3167, "eval_samples_per_second": 23.456, "eval_steps_per_second": 2.955, "step": 7200 }, { "epoch": 2.0839121978327313, "grad_norm": 0.6013078093528748, "learning_rate": 0.0003305455219042327, "loss": 0.1452, "step": 7500 }, { "epoch": 2.2006112809113643, "eval_loss": 0.1970534473657608, "eval_runtime": 21.6809, "eval_samples_per_second": 23.062, "eval_steps_per_second": 2.906, "step": 7920 }, { "epoch": 2.222839677688247, "grad_norm": 0.516161322593689, "learning_rate": 0.0003259146059090488, "loss": 0.1352, "step": 8000 }, { "epoch": 2.361767157543762, "grad_norm": 0.6993707418441772, "learning_rate": 0.000321283689913865, "loss": 0.1363, "step": 8500 }, { "epoch": 2.4006668519033063, "eval_loss": 0.18779733777046204, "eval_runtime": 21.3417, "eval_samples_per_second": 23.428, "eval_steps_per_second": 2.952, "step": 8640 }, { "epoch": 2.5006946373992776, "grad_norm": 0.542614758014679, "learning_rate": 0.00031665277391868115, "loss": 0.1345, "step": 9000 }, { "epoch": 2.600722422895249, "eval_loss": 0.18519769608974457, "eval_runtime": 21.5664, "eval_samples_per_second": 23.184, "eval_steps_per_second": 2.921, "step": 9360 }, { "epoch": 2.639622117254793, "grad_norm": 0.691017746925354, "learning_rate": 0.00031202185792349727, "loss": 0.1353, "step": 9500 }, { "epoch": 2.7785495971103087, "grad_norm": 0.4563291370868683, "learning_rate": 0.0003073909419283134, "loss": 0.1345, "step": 10000 }, { "epoch": 2.800777993887191, "eval_loss": 0.18035078048706055, "eval_runtime": 21.2975, "eval_samples_per_second": 23.477, "eval_steps_per_second": 2.958, "step": 10080 }, { "epoch": 2.917477076965824, "grad_norm": 0.6514193415641785, "learning_rate": 0.00030276002593312957, "loss": 0.1355, "step": 10500 }, { "epoch": 3.000833564879133, "eval_loss": 0.17824167013168335, "eval_runtime": 21.3358, "eval_samples_per_second": 23.435, "eval_steps_per_second": 2.953, "step": 10800 }, { "epoch": 3.0564045568213394, "grad_norm": 0.7063578367233276, "learning_rate": 0.00029812910993794574, "loss": 0.1222, "step": 11000 }, { "epoch": 3.1953320366768545, "grad_norm": 0.7858085632324219, "learning_rate": 0.0002934981939427619, "loss": 0.1072, "step": 11500 }, { "epoch": 3.2008891358710754, "eval_loss": 0.18112412095069885, "eval_runtime": 21.3193, "eval_samples_per_second": 23.453, "eval_steps_per_second": 2.955, "step": 11520 }, { "epoch": 3.33425951653237, "grad_norm": 0.962510883808136, "learning_rate": 0.00028886727794757804, "loss": 0.1086, "step": 12000 }, { "epoch": 3.4009447068630174, "eval_loss": 0.17723555862903595, "eval_runtime": 21.2949, "eval_samples_per_second": 23.48, "eval_steps_per_second": 2.958, "step": 12240 }, { "epoch": 3.4731869963878856, "grad_norm": 0.47604429721832275, "learning_rate": 0.00028423636195239416, "loss": 0.109, "step": 12500 }, { "epoch": 3.6010002778549595, "eval_loss": 0.1781689077615738, "eval_runtime": 21.2531, "eval_samples_per_second": 23.526, "eval_steps_per_second": 2.964, "step": 12960 }, { "epoch": 3.612114476243401, "grad_norm": 0.49760594964027405, "learning_rate": 0.00027960544595721033, "loss": 0.1099, "step": 13000 }, { "epoch": 3.7510419560989163, "grad_norm": 0.6786069869995117, "learning_rate": 0.0002749745299620265, "loss": 0.1084, "step": 13500 }, { "epoch": 3.801055848846902, "eval_loss": 0.17475004494190216, "eval_runtime": 21.2523, "eval_samples_per_second": 23.527, "eval_steps_per_second": 2.964, "step": 13680 }, { "epoch": 3.889969435954432, "grad_norm": 0.8729720115661621, "learning_rate": 0.0002703436139668427, "loss": 0.1114, "step": 14000 }, { "epoch": 4.001111419838844, "eval_loss": 0.16653864085674286, "eval_runtime": 21.3451, "eval_samples_per_second": 23.425, "eval_steps_per_second": 2.951, "step": 14400 }, { "epoch": 4.0288969158099475, "grad_norm": 0.7045803070068359, "learning_rate": 0.0002657126979716588, "loss": 0.1038, "step": 14500 }, { "epoch": 4.167824395665463, "grad_norm": 0.9094932079315186, "learning_rate": 0.0002610817819764749, "loss": 0.0839, "step": 15000 }, { "epoch": 4.201166990830786, "eval_loss": 0.1762482225894928, "eval_runtime": 21.3199, "eval_samples_per_second": 23.452, "eval_steps_per_second": 2.955, "step": 15120 }, { "epoch": 4.306751875520978, "grad_norm": 0.6102758646011353, "learning_rate": 0.0002564508659812911, "loss": 0.0878, "step": 15500 }, { "epoch": 4.4012225618227285, "eval_loss": 0.17415344715118408, "eval_runtime": 21.3432, "eval_samples_per_second": 23.427, "eval_steps_per_second": 2.952, "step": 15840 }, { "epoch": 4.445679355376494, "grad_norm": 0.46317267417907715, "learning_rate": 0.00025181994998610727, "loss": 0.0883, "step": 16000 }, { "epoch": 4.584606835232009, "grad_norm": 0.6532848477363586, "learning_rate": 0.00024718903399092345, "loss": 0.0887, "step": 16500 }, { "epoch": 4.601278132814671, "eval_loss": 0.18139471113681793, "eval_runtime": 21.5324, "eval_samples_per_second": 23.221, "eval_steps_per_second": 2.926, "step": 16560 }, { "epoch": 4.723534315087524, "grad_norm": 0.8044198155403137, "learning_rate": 0.00024255811799573954, "loss": 0.0928, "step": 17000 }, { "epoch": 4.801333703806613, "eval_loss": 0.16956336796283722, "eval_runtime": 22.0239, "eval_samples_per_second": 22.703, "eval_steps_per_second": 2.861, "step": 17280 }, { "epoch": 4.86246179494304, "grad_norm": 0.6072468161582947, "learning_rate": 0.00023792720200055571, "loss": 0.0915, "step": 17500 }, { "epoch": 5.001389274798555, "grad_norm": 0.7845910787582397, "learning_rate": 0.0002332962860053719, "loss": 0.0888, "step": 18000 }, { "epoch": 5.001389274798555, "eval_loss": 0.17226029932498932, "eval_runtime": 21.2768, "eval_samples_per_second": 23.5, "eval_steps_per_second": 2.961, "step": 18000 }, { "epoch": 5.14031675465407, "grad_norm": 0.4631412923336029, "learning_rate": 0.00022866537001018804, "loss": 0.067, "step": 18500 }, { "epoch": 5.201444845790498, "eval_loss": 0.18070575594902039, "eval_runtime": 21.433, "eval_samples_per_second": 23.328, "eval_steps_per_second": 2.939, "step": 18720 }, { "epoch": 5.279244234509586, "grad_norm": 0.5497326850891113, "learning_rate": 0.00022403445401500416, "loss": 0.0696, "step": 19000 }, { "epoch": 5.401500416782439, "eval_loss": 0.17943890392780304, "eval_runtime": 21.3084, "eval_samples_per_second": 23.465, "eval_steps_per_second": 2.957, "step": 19440 }, { "epoch": 5.418171714365101, "grad_norm": 0.455014705657959, "learning_rate": 0.00021940353801982033, "loss": 0.0717, "step": 19500 }, { "epoch": 5.5570991942206165, "grad_norm": 0.4856937527656555, "learning_rate": 0.00021477262202463648, "loss": 0.0724, "step": 20000 }, { "epoch": 5.601555987774382, "eval_loss": 0.1766962856054306, "eval_runtime": 21.2925, "eval_samples_per_second": 23.482, "eval_steps_per_second": 2.959, "step": 20160 }, { "epoch": 5.6960266740761325, "grad_norm": 0.9877897500991821, "learning_rate": 0.00021014170602945265, "loss": 0.0748, "step": 20500 }, { "epoch": 5.801611558766324, "eval_loss": 0.17774970829486847, "eval_runtime": 21.463, "eval_samples_per_second": 23.296, "eval_steps_per_second": 2.935, "step": 20880 }, { "epoch": 5.834954153931648, "grad_norm": 0.5059243440628052, "learning_rate": 0.0002055107900342688, "loss": 0.0732, "step": 21000 }, { "epoch": 5.973881633787163, "grad_norm": 0.6043376326560974, "learning_rate": 0.00020087987403908492, "loss": 0.0731, "step": 21500 }, { "epoch": 6.001667129758266, "eval_loss": 0.1752805858850479, "eval_runtime": 21.3362, "eval_samples_per_second": 23.434, "eval_steps_per_second": 2.953, "step": 21600 }, { "epoch": 6.112809113642679, "grad_norm": 0.49161791801452637, "learning_rate": 0.0001962489580439011, "loss": 0.0579, "step": 22000 }, { "epoch": 6.201722700750208, "eval_loss": 0.18562264740467072, "eval_runtime": 21.3241, "eval_samples_per_second": 23.448, "eval_steps_per_second": 2.954, "step": 22320 }, { "epoch": 6.251736593498194, "grad_norm": 0.5732259154319763, "learning_rate": 0.00019161804204871724, "loss": 0.0548, "step": 22500 }, { "epoch": 6.390664073353709, "grad_norm": 0.807499885559082, "learning_rate": 0.0001869871260535334, "loss": 0.0582, "step": 23000 }, { "epoch": 6.401778271742151, "eval_loss": 0.17600271105766296, "eval_runtime": 21.3547, "eval_samples_per_second": 23.414, "eval_steps_per_second": 2.95, "step": 23040 }, { "epoch": 6.529591553209225, "grad_norm": 0.5904980897903442, "learning_rate": 0.00018235621005834954, "loss": 0.0597, "step": 23500 }, { "epoch": 6.601833842734093, "eval_loss": 0.17754560708999634, "eval_runtime": 21.3447, "eval_samples_per_second": 23.425, "eval_steps_per_second": 2.952, "step": 23760 }, { "epoch": 6.66851903306474, "grad_norm": 0.721977174282074, "learning_rate": 0.0001777252940631657, "loss": 0.0591, "step": 24000 }, { "epoch": 6.801889413726035, "eval_loss": 0.18270494043827057, "eval_runtime": 21.3444, "eval_samples_per_second": 23.425, "eval_steps_per_second": 2.952, "step": 24480 }, { "epoch": 6.807446512920255, "grad_norm": 0.3316296339035034, "learning_rate": 0.00017309437806798186, "loss": 0.06, "step": 24500 }, { "epoch": 6.946373992775771, "grad_norm": 0.64435213804245, "learning_rate": 0.000168463462072798, "loss": 0.0591, "step": 25000 }, { "epoch": 7.001944984717977, "eval_loss": 0.17571093142032623, "eval_runtime": 21.3307, "eval_samples_per_second": 23.44, "eval_steps_per_second": 2.953, "step": 25200 }, { "epoch": 7.085301472631286, "grad_norm": 0.743349552154541, "learning_rate": 0.00016383254607761416, "loss": 0.0479, "step": 25500 }, { "epoch": 7.20200055570992, "eval_loss": 0.19375747442245483, "eval_runtime": 21.2935, "eval_samples_per_second": 23.481, "eval_steps_per_second": 2.959, "step": 25920 }, { "epoch": 7.2242289524868015, "grad_norm": 0.6206533908843994, "learning_rate": 0.00015920163008243033, "loss": 0.0431, "step": 26000 }, { "epoch": 7.3631564323423175, "grad_norm": 0.5083501935005188, "learning_rate": 0.00015457071408724645, "loss": 0.0457, "step": 26500 }, { "epoch": 7.402056126701861, "eval_loss": 0.18405260145664215, "eval_runtime": 21.2846, "eval_samples_per_second": 23.491, "eval_steps_per_second": 2.96, "step": 26640 }, { "epoch": 7.502083912197833, "grad_norm": 0.5129767060279846, "learning_rate": 0.00014993979809206263, "loss": 0.0463, "step": 27000 }, { "epoch": 7.602111697693804, "eval_loss": 0.18968620896339417, "eval_runtime": 21.3052, "eval_samples_per_second": 23.468, "eval_steps_per_second": 2.957, "step": 27360 }, { "epoch": 7.641011392053348, "grad_norm": 0.5119397044181824, "learning_rate": 0.00014530888209687877, "loss": 0.0463, "step": 27500 }, { "epoch": 7.779938871908864, "grad_norm": 0.6329432725906372, "learning_rate": 0.00014067796610169492, "loss": 0.0482, "step": 28000 }, { "epoch": 7.802167268685746, "eval_loss": 0.18267524242401123, "eval_runtime": 21.2984, "eval_samples_per_second": 23.476, "eval_steps_per_second": 2.958, "step": 28080 }, { "epoch": 7.918866351764379, "grad_norm": 0.9381836652755737, "learning_rate": 0.00013604705010651107, "loss": 0.0483, "step": 28500 }, { "epoch": 8.002222839677689, "eval_loss": 0.18136519193649292, "eval_runtime": 21.3111, "eval_samples_per_second": 23.462, "eval_steps_per_second": 2.956, "step": 28800 }, { "epoch": 8.057793831619895, "grad_norm": 0.8777905106544495, "learning_rate": 0.00013141613411132722, "loss": 0.0408, "step": 29000 }, { "epoch": 8.19672131147541, "grad_norm": 0.6021227240562439, "learning_rate": 0.0001267852181161434, "loss": 0.0353, "step": 29500 }, { "epoch": 8.20227841066963, "eval_loss": 0.19863204658031464, "eval_runtime": 21.3525, "eval_samples_per_second": 23.416, "eval_steps_per_second": 2.95, "step": 29520 }, { "epoch": 8.335648791330925, "grad_norm": 0.37129077315330505, "learning_rate": 0.00012215430212095954, "loss": 0.0346, "step": 30000 }, { "epoch": 8.402333981661572, "eval_loss": 0.2027880847454071, "eval_runtime": 21.3394, "eval_samples_per_second": 23.431, "eval_steps_per_second": 2.952, "step": 30240 }, { "epoch": 8.474576271186441, "grad_norm": 0.5072668194770813, "learning_rate": 0.00011752338612577569, "loss": 0.0358, "step": 30500 }, { "epoch": 8.602389552653515, "eval_loss": 0.1992003470659256, "eval_runtime": 21.3327, "eval_samples_per_second": 23.438, "eval_steps_per_second": 2.953, "step": 30960 }, { "epoch": 8.613503751041955, "grad_norm": 0.7495487332344055, "learning_rate": 0.00011289247013059183, "loss": 0.0369, "step": 31000 }, { "epoch": 8.752431230897471, "grad_norm": 0.618772566318512, "learning_rate": 0.000108261554135408, "loss": 0.0374, "step": 31500 }, { "epoch": 8.802445123645457, "eval_loss": 0.20048412680625916, "eval_runtime": 21.4267, "eval_samples_per_second": 23.335, "eval_steps_per_second": 2.94, "step": 31680 }, { "epoch": 8.891358710752987, "grad_norm": 0.839154064655304, "learning_rate": 0.00010363063814022413, "loss": 0.0373, "step": 32000 } ], "logging_steps": 500, "max_steps": 43188, "num_input_tokens_seen": 0, "num_train_epochs": 12, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.2282623257814761e+19, "train_batch_size": 4, "trial_name": null, "trial_params": null }