| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 4.178226438081977, | |
| "eval_steps": 100, | |
| "global_step": 12000, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.017404925593943087, | |
| "grad_norm": 4.2433902126025425, | |
| "learning_rate": 9.8e-05, | |
| "loss": 3.6204, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.034809851187886175, | |
| "grad_norm": 3.0786203091123565, | |
| "learning_rate": 9.999928647255986e-05, | |
| "loss": 2.5602, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.034809851187886175, | |
| "eval_loss": 2.363542079925537, | |
| "eval_runtime": 14.092, | |
| "eval_samples_per_second": 70.962, | |
| "eval_steps_per_second": 2.271, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.05221477678182926, | |
| "grad_norm": 2.489798826862787, | |
| "learning_rate": 9.999708736748881e-05, | |
| "loss": 2.2577, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.06961970237577235, | |
| "grad_norm": 4.227397206704295, | |
| "learning_rate": 9.999340245361986e-05, | |
| "loss": 2.065, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.06961970237577235, | |
| "eval_loss": 2.0204052925109863, | |
| "eval_runtime": 14.0707, | |
| "eval_samples_per_second": 71.07, | |
| "eval_steps_per_second": 2.274, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.08702462796971543, | |
| "grad_norm": 1.5627603609182088, | |
| "learning_rate": 9.998823184156712e-05, | |
| "loss": 1.9504, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.10442955356365852, | |
| "grad_norm": 2.0849274528531834, | |
| "learning_rate": 9.998157568654259e-05, | |
| "loss": 1.9106, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.10442955356365852, | |
| "eval_loss": 1.8868601322174072, | |
| "eval_runtime": 14.0214, | |
| "eval_samples_per_second": 71.319, | |
| "eval_steps_per_second": 2.282, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.12183447915760161, | |
| "grad_norm": 1.2760571472974125, | |
| "learning_rate": 9.997343418835142e-05, | |
| "loss": 1.8861, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.1392394047515447, | |
| "grad_norm": 3.7430921365305005, | |
| "learning_rate": 9.996380759138595e-05, | |
| "loss": 1.8622, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.1392394047515447, | |
| "eval_loss": 1.8383088111877441, | |
| "eval_runtime": 14.0403, | |
| "eval_samples_per_second": 71.224, | |
| "eval_steps_per_second": 2.279, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.15664433034548778, | |
| "grad_norm": 0.9586199143769504, | |
| "learning_rate": 9.995269618461844e-05, | |
| "loss": 1.8478, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.17404925593943085, | |
| "grad_norm": 1.5061363604288809, | |
| "learning_rate": 9.99401003015922e-05, | |
| "loss": 1.8117, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.17404925593943085, | |
| "eval_loss": 1.7942754030227661, | |
| "eval_runtime": 14.0551, | |
| "eval_samples_per_second": 71.149, | |
| "eval_steps_per_second": 2.277, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.19145418153337396, | |
| "grad_norm": 2.433096036143497, | |
| "learning_rate": 9.992602032041181e-05, | |
| "loss": 1.8071, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.20885910712731703, | |
| "grad_norm": 3.0865325369733796, | |
| "learning_rate": 9.991045666373163e-05, | |
| "loss": 1.7895, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.20885910712731703, | |
| "eval_loss": 1.7960834503173828, | |
| "eval_runtime": 14.0575, | |
| "eval_samples_per_second": 71.136, | |
| "eval_steps_per_second": 2.276, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.2262640327212601, | |
| "grad_norm": 1.0844936845986464, | |
| "learning_rate": 9.989340979874317e-05, | |
| "loss": 1.7954, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.24366895831520322, | |
| "grad_norm": 1.5841141135500758, | |
| "learning_rate": 9.987488023716102e-05, | |
| "loss": 1.7827, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.24366895831520322, | |
| "eval_loss": 1.7635940313339233, | |
| "eval_runtime": 14.0192, | |
| "eval_samples_per_second": 71.331, | |
| "eval_steps_per_second": 2.283, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.26107388390914626, | |
| "grad_norm": 1.0358659820491174, | |
| "learning_rate": 9.985486853520748e-05, | |
| "loss": 1.7755, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.2784788095030894, | |
| "grad_norm": 2.921637768400008, | |
| "learning_rate": 9.983337529359597e-05, | |
| "loss": 1.7689, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.2784788095030894, | |
| "eval_loss": 1.7601885795593262, | |
| "eval_runtime": 14.0539, | |
| "eval_samples_per_second": 71.155, | |
| "eval_steps_per_second": 2.277, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.2958837350970325, | |
| "grad_norm": 1.2109225200496343, | |
| "learning_rate": 9.981040115751287e-05, | |
| "loss": 1.7642, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.31328866069097555, | |
| "grad_norm": 0.9168156411457995, | |
| "learning_rate": 9.978594681659822e-05, | |
| "loss": 1.7584, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.31328866069097555, | |
| "eval_loss": 1.733883023262024, | |
| "eval_runtime": 14.0773, | |
| "eval_samples_per_second": 71.036, | |
| "eval_steps_per_second": 2.273, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.33069358628491863, | |
| "grad_norm": 1.231514474527335, | |
| "learning_rate": 9.976001300492505e-05, | |
| "loss": 1.7476, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.3480985118788617, | |
| "grad_norm": 1.3345795373547282, | |
| "learning_rate": 9.97326005009772e-05, | |
| "loss": 1.7529, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.3480985118788617, | |
| "eval_loss": 1.7277562618255615, | |
| "eval_runtime": 14.0558, | |
| "eval_samples_per_second": 71.145, | |
| "eval_steps_per_second": 2.277, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.3655034374728048, | |
| "grad_norm": 1.3848562250830305, | |
| "learning_rate": 9.970371012762615e-05, | |
| "loss": 1.7383, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 0.3829083630667479, | |
| "grad_norm": 0.9102870237217918, | |
| "learning_rate": 9.967334275210616e-05, | |
| "loss": 1.7312, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.3829083630667479, | |
| "eval_loss": 1.7197346687316895, | |
| "eval_runtime": 14.1392, | |
| "eval_samples_per_second": 70.725, | |
| "eval_steps_per_second": 2.263, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.400313288660691, | |
| "grad_norm": 0.9403631790293067, | |
| "learning_rate": 9.964149928598834e-05, | |
| "loss": 1.7354, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 0.41771821425463407, | |
| "grad_norm": 1.5743784967989016, | |
| "learning_rate": 9.96081806851532e-05, | |
| "loss": 1.7384, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.41771821425463407, | |
| "eval_loss": 1.7184182405471802, | |
| "eval_runtime": 14.0812, | |
| "eval_samples_per_second": 71.017, | |
| "eval_steps_per_second": 2.273, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.43512313984857715, | |
| "grad_norm": 2.0859941497191743, | |
| "learning_rate": 9.957338794976201e-05, | |
| "loss": 1.7389, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 0.4525280654425202, | |
| "grad_norm": 1.347628827751819, | |
| "learning_rate": 9.953712212422681e-05, | |
| "loss": 1.7267, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.4525280654425202, | |
| "eval_loss": 1.7116867303848267, | |
| "eval_runtime": 14.0496, | |
| "eval_samples_per_second": 71.177, | |
| "eval_steps_per_second": 2.278, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.4699329910364633, | |
| "grad_norm": 0.7775683989797394, | |
| "learning_rate": 9.949938429717895e-05, | |
| "loss": 1.7136, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 0.48733791663040643, | |
| "grad_norm": 0.7955090270012505, | |
| "learning_rate": 9.946017560143651e-05, | |
| "loss": 1.7188, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.48733791663040643, | |
| "eval_loss": 1.703679084777832, | |
| "eval_runtime": 14.077, | |
| "eval_samples_per_second": 71.038, | |
| "eval_steps_per_second": 2.273, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.5047428422243495, | |
| "grad_norm": 1.2332760111187542, | |
| "learning_rate": 9.941949721397028e-05, | |
| "loss": 1.7169, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 0.5221477678182925, | |
| "grad_norm": 1.81355607393607, | |
| "learning_rate": 9.93773503558684e-05, | |
| "loss": 1.7157, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.5221477678182925, | |
| "eval_loss": 1.7078830003738403, | |
| "eval_runtime": 14.09, | |
| "eval_samples_per_second": 70.972, | |
| "eval_steps_per_second": 2.271, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.5395526934122357, | |
| "grad_norm": 0.9053542478773059, | |
| "learning_rate": 9.933373629229969e-05, | |
| "loss": 1.7102, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 0.5569576190061788, | |
| "grad_norm": 0.6503277295238644, | |
| "learning_rate": 9.928865633247573e-05, | |
| "loss": 1.7033, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.5569576190061788, | |
| "eval_loss": 1.6917779445648193, | |
| "eval_runtime": 14.0698, | |
| "eval_samples_per_second": 71.074, | |
| "eval_steps_per_second": 2.274, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.5743625446001218, | |
| "grad_norm": 1.4224490096345375, | |
| "learning_rate": 9.92421118296115e-05, | |
| "loss": 1.6997, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 0.591767470194065, | |
| "grad_norm": 0.7864420926166752, | |
| "learning_rate": 9.919410418088481e-05, | |
| "loss": 1.7102, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.591767470194065, | |
| "eval_loss": 1.690305233001709, | |
| "eval_runtime": 14.1062, | |
| "eval_samples_per_second": 70.891, | |
| "eval_steps_per_second": 2.269, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.609172395788008, | |
| "grad_norm": 0.5663590518834145, | |
| "learning_rate": 9.914463482739435e-05, | |
| "loss": 1.7046, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 0.6265773213819511, | |
| "grad_norm": 1.1145025421986445, | |
| "learning_rate": 9.909370525411637e-05, | |
| "loss": 1.6905, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.6265773213819511, | |
| "eval_loss": 1.6856919527053833, | |
| "eval_runtime": 14.0345, | |
| "eval_samples_per_second": 71.253, | |
| "eval_steps_per_second": 2.28, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.6439822469758941, | |
| "grad_norm": 1.079593642429848, | |
| "learning_rate": 9.90413169898602e-05, | |
| "loss": 1.6973, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 0.6613871725698373, | |
| "grad_norm": 0.8794305699903086, | |
| "learning_rate": 9.898747160722229e-05, | |
| "loss": 1.6821, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.6613871725698373, | |
| "eval_loss": 1.680002212524414, | |
| "eval_runtime": 14.0923, | |
| "eval_samples_per_second": 70.961, | |
| "eval_steps_per_second": 2.271, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.6787920981637804, | |
| "grad_norm": 1.3664190261530837, | |
| "learning_rate": 9.893217072253903e-05, | |
| "loss": 1.6909, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 0.6961970237577234, | |
| "grad_norm": 0.9268231360918758, | |
| "learning_rate": 9.88754159958382e-05, | |
| "loss": 1.6901, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.6961970237577234, | |
| "eval_loss": 1.6765377521514893, | |
| "eval_runtime": 14.0942, | |
| "eval_samples_per_second": 70.951, | |
| "eval_steps_per_second": 2.27, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.7136019493516665, | |
| "grad_norm": 0.9864416812238661, | |
| "learning_rate": 9.881720913078921e-05, | |
| "loss": 1.6911, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 0.7310068749456096, | |
| "grad_norm": 0.8706035984933645, | |
| "learning_rate": 9.875755187465186e-05, | |
| "loss": 1.6866, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 0.7310068749456096, | |
| "eval_loss": 1.675471305847168, | |
| "eval_runtime": 14.0392, | |
| "eval_samples_per_second": 71.229, | |
| "eval_steps_per_second": 2.279, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 0.7484118005395527, | |
| "grad_norm": 0.9954026204157976, | |
| "learning_rate": 9.869644601822396e-05, | |
| "loss": 1.6764, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 0.7658167261334958, | |
| "grad_norm": 0.9859776473729975, | |
| "learning_rate": 9.863389339578761e-05, | |
| "loss": 1.6772, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 0.7658167261334958, | |
| "eval_loss": 1.6698520183563232, | |
| "eval_runtime": 14.0605, | |
| "eval_samples_per_second": 71.121, | |
| "eval_steps_per_second": 2.276, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 0.7832216517274389, | |
| "grad_norm": 0.9106273220771831, | |
| "learning_rate": 9.856989588505399e-05, | |
| "loss": 1.6796, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 0.800626577321382, | |
| "grad_norm": 1.1219788313484198, | |
| "learning_rate": 9.850445540710714e-05, | |
| "loss": 1.6742, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 0.800626577321382, | |
| "eval_loss": 1.663262963294983, | |
| "eval_runtime": 14.0171, | |
| "eval_samples_per_second": 71.341, | |
| "eval_steps_per_second": 2.283, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 0.818031502915325, | |
| "grad_norm": 0.7584108888049894, | |
| "learning_rate": 9.843757392634629e-05, | |
| "loss": 1.6773, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 0.8354364285092681, | |
| "grad_norm": 0.8524680066268957, | |
| "learning_rate": 9.836925345042675e-05, | |
| "loss": 1.6802, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 0.8354364285092681, | |
| "eval_loss": 1.6637836694717407, | |
| "eval_runtime": 14.0523, | |
| "eval_samples_per_second": 71.163, | |
| "eval_steps_per_second": 2.277, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 0.8528413541032112, | |
| "grad_norm": 0.5830225213698085, | |
| "learning_rate": 9.82994960301998e-05, | |
| "loss": 1.6774, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 0.8702462796971543, | |
| "grad_norm": 0.8294538008262156, | |
| "learning_rate": 9.822830375965103e-05, | |
| "loss": 1.6702, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.8702462796971543, | |
| "eval_loss": 1.6600449085235596, | |
| "eval_runtime": 14.0902, | |
| "eval_samples_per_second": 70.972, | |
| "eval_steps_per_second": 2.271, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.8876512052910974, | |
| "grad_norm": 1.0515224700476833, | |
| "learning_rate": 9.815567877583758e-05, | |
| "loss": 1.6758, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 0.9050561308850404, | |
| "grad_norm": 0.7866141842693181, | |
| "learning_rate": 9.808162325882385e-05, | |
| "loss": 1.6645, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 0.9050561308850404, | |
| "eval_loss": 1.657778263092041, | |
| "eval_runtime": 14.0817, | |
| "eval_samples_per_second": 71.014, | |
| "eval_steps_per_second": 2.272, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 0.9224610564789836, | |
| "grad_norm": 0.9909101251943951, | |
| "learning_rate": 9.800613943161619e-05, | |
| "loss": 1.6629, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 0.9398659820729266, | |
| "grad_norm": 0.9534991636209588, | |
| "learning_rate": 9.79292295600961e-05, | |
| "loss": 1.6523, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 0.9398659820729266, | |
| "eval_loss": 1.6576528549194336, | |
| "eval_runtime": 14.0341, | |
| "eval_samples_per_second": 71.255, | |
| "eval_steps_per_second": 2.28, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 0.9572709076668697, | |
| "grad_norm": 0.7620610436511178, | |
| "learning_rate": 9.785089595295222e-05, | |
| "loss": 1.6573, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 0.9746758332608129, | |
| "grad_norm": 1.5752171211110084, | |
| "learning_rate": 9.777114096161105e-05, | |
| "loss": 1.6583, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 0.9746758332608129, | |
| "eval_loss": 1.6622099876403809, | |
| "eval_runtime": 14.0927, | |
| "eval_samples_per_second": 70.959, | |
| "eval_steps_per_second": 2.271, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 0.9920807588547559, | |
| "grad_norm": 0.5970970379963504, | |
| "learning_rate": 9.768996698016636e-05, | |
| "loss": 1.6625, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 1.009746758332608, | |
| "grad_norm": 0.7470976369983713, | |
| "learning_rate": 9.760737644530726e-05, | |
| "loss": 1.6597, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 1.009746758332608, | |
| "eval_loss": 1.647603988647461, | |
| "eval_runtime": 14.12, | |
| "eval_samples_per_second": 70.822, | |
| "eval_steps_per_second": 2.266, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 1.0271516839265513, | |
| "grad_norm": 0.962160586071795, | |
| "learning_rate": 9.75233718362452e-05, | |
| "loss": 1.611, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 1.0445566095204943, | |
| "grad_norm": 0.6386050774526276, | |
| "learning_rate": 9.74379556746394e-05, | |
| "loss": 1.619, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 1.0445566095204943, | |
| "eval_loss": 1.6434565782546997, | |
| "eval_runtime": 14.0132, | |
| "eval_samples_per_second": 71.361, | |
| "eval_steps_per_second": 2.284, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 1.0619615351144374, | |
| "grad_norm": 1.5569952795665942, | |
| "learning_rate": 9.735113052452119e-05, | |
| "loss": 1.6108, | |
| "step": 3050 | |
| }, | |
| { | |
| "epoch": 1.0793664607083804, | |
| "grad_norm": 1.223444554102184, | |
| "learning_rate": 9.726289899221713e-05, | |
| "loss": 1.6242, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 1.0793664607083804, | |
| "eval_loss": 1.6534233093261719, | |
| "eval_runtime": 14.0914, | |
| "eval_samples_per_second": 70.965, | |
| "eval_steps_per_second": 2.271, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 1.0967713863023236, | |
| "grad_norm": 0.6055563672851731, | |
| "learning_rate": 9.717326372627065e-05, | |
| "loss": 1.6165, | |
| "step": 3150 | |
| }, | |
| { | |
| "epoch": 1.1141763118962666, | |
| "grad_norm": 0.7125630072846985, | |
| "learning_rate": 9.708222741736268e-05, | |
| "loss": 1.6137, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 1.1141763118962666, | |
| "eval_loss": 1.6405473947525024, | |
| "eval_runtime": 14.0433, | |
| "eval_samples_per_second": 71.208, | |
| "eval_steps_per_second": 2.279, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 1.1315812374902097, | |
| "grad_norm": 0.6828372843368237, | |
| "learning_rate": 9.698979279823071e-05, | |
| "loss": 1.6178, | |
| "step": 3250 | |
| }, | |
| { | |
| "epoch": 1.148986163084153, | |
| "grad_norm": 0.6458088716811551, | |
| "learning_rate": 9.689596264358694e-05, | |
| "loss": 1.6057, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 1.148986163084153, | |
| "eval_loss": 1.6405302286148071, | |
| "eval_runtime": 14.0715, | |
| "eval_samples_per_second": 71.065, | |
| "eval_steps_per_second": 2.274, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 1.166391088678096, | |
| "grad_norm": 0.7900271544609745, | |
| "learning_rate": 9.680073977003483e-05, | |
| "loss": 1.6031, | |
| "step": 3350 | |
| }, | |
| { | |
| "epoch": 1.183796014272039, | |
| "grad_norm": 1.0152045530917768, | |
| "learning_rate": 9.670412703598469e-05, | |
| "loss": 1.6117, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 1.183796014272039, | |
| "eval_loss": 1.639701247215271, | |
| "eval_runtime": 14.0316, | |
| "eval_samples_per_second": 71.268, | |
| "eval_steps_per_second": 2.281, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 1.201200939865982, | |
| "grad_norm": 0.7302281809291777, | |
| "learning_rate": 9.660612734156777e-05, | |
| "loss": 1.6027, | |
| "step": 3450 | |
| }, | |
| { | |
| "epoch": 1.2186058654599252, | |
| "grad_norm": 0.6755389511951054, | |
| "learning_rate": 9.650674362854923e-05, | |
| "loss": 1.6227, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 1.2186058654599252, | |
| "eval_loss": 1.633447289466858, | |
| "eval_runtime": 14.0553, | |
| "eval_samples_per_second": 71.148, | |
| "eval_steps_per_second": 2.277, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 1.2360107910538682, | |
| "grad_norm": 0.4958097178118167, | |
| "learning_rate": 9.640597888023988e-05, | |
| "loss": 1.6039, | |
| "step": 3550 | |
| }, | |
| { | |
| "epoch": 1.2534157166478113, | |
| "grad_norm": 0.842473221384133, | |
| "learning_rate": 9.630383612140661e-05, | |
| "loss": 1.6105, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 1.2534157166478113, | |
| "eval_loss": 1.6299790143966675, | |
| "eval_runtime": 14.0661, | |
| "eval_samples_per_second": 71.093, | |
| "eval_steps_per_second": 2.275, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 1.2708206422417545, | |
| "grad_norm": 0.5547865266670734, | |
| "learning_rate": 9.62003184181815e-05, | |
| "loss": 1.608, | |
| "step": 3650 | |
| }, | |
| { | |
| "epoch": 1.2882255678356975, | |
| "grad_norm": 0.5625620088465835, | |
| "learning_rate": 9.609542887796993e-05, | |
| "loss": 1.6141, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 1.2882255678356975, | |
| "eval_loss": 1.6269824504852295, | |
| "eval_runtime": 14.0355, | |
| "eval_samples_per_second": 71.248, | |
| "eval_steps_per_second": 2.28, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 1.3056304934296405, | |
| "grad_norm": 0.9870933875943105, | |
| "learning_rate": 9.598917064935719e-05, | |
| "loss": 1.6045, | |
| "step": 3750 | |
| }, | |
| { | |
| "epoch": 1.3230354190235838, | |
| "grad_norm": 0.6538637454488698, | |
| "learning_rate": 9.5881546922014e-05, | |
| "loss": 1.601, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 1.3230354190235838, | |
| "eval_loss": 1.626142144203186, | |
| "eval_runtime": 14.1089, | |
| "eval_samples_per_second": 70.877, | |
| "eval_steps_per_second": 2.268, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 1.3404403446175268, | |
| "grad_norm": 0.6547060258929419, | |
| "learning_rate": 9.57725609266008e-05, | |
| "loss": 1.6066, | |
| "step": 3850 | |
| }, | |
| { | |
| "epoch": 1.3578452702114698, | |
| "grad_norm": 0.9358458562600437, | |
| "learning_rate": 9.566221593467069e-05, | |
| "loss": 1.6221, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 1.3578452702114698, | |
| "eval_loss": 1.627410888671875, | |
| "eval_runtime": 14.088, | |
| "eval_samples_per_second": 70.982, | |
| "eval_steps_per_second": 2.271, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 1.3752501958054129, | |
| "grad_norm": 0.8129191474694835, | |
| "learning_rate": 9.555051525857134e-05, | |
| "loss": 1.5996, | |
| "step": 3950 | |
| }, | |
| { | |
| "epoch": 1.392655121399356, | |
| "grad_norm": 0.6824919031119797, | |
| "learning_rate": 9.54374622513454e-05, | |
| "loss": 1.6101, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 1.392655121399356, | |
| "eval_loss": 1.6165417432785034, | |
| "eval_runtime": 14.0492, | |
| "eval_samples_per_second": 71.179, | |
| "eval_steps_per_second": 2.278, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 1.4100600469932991, | |
| "grad_norm": 0.9330542502271321, | |
| "learning_rate": 9.532306030663e-05, | |
| "loss": 1.5958, | |
| "step": 4050 | |
| }, | |
| { | |
| "epoch": 1.4274649725872421, | |
| "grad_norm": 0.6438330837104954, | |
| "learning_rate": 9.520731285855482e-05, | |
| "loss": 1.599, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 1.4274649725872421, | |
| "eval_loss": 1.6210800409317017, | |
| "eval_runtime": 14.0932, | |
| "eval_samples_per_second": 70.956, | |
| "eval_steps_per_second": 2.271, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 1.4448698981811852, | |
| "grad_norm": 0.9636631744898069, | |
| "learning_rate": 9.509022338163896e-05, | |
| "loss": 1.5955, | |
| "step": 4150 | |
| }, | |
| { | |
| "epoch": 1.4622748237751284, | |
| "grad_norm": 0.5569273625801461, | |
| "learning_rate": 9.497179539068673e-05, | |
| "loss": 1.6007, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 1.4622748237751284, | |
| "eval_loss": 1.6149400472640991, | |
| "eval_runtime": 14.0717, | |
| "eval_samples_per_second": 71.064, | |
| "eval_steps_per_second": 2.274, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 1.4796797493690714, | |
| "grad_norm": 0.5160141243848255, | |
| "learning_rate": 9.485203244068202e-05, | |
| "loss": 1.5926, | |
| "step": 4250 | |
| }, | |
| { | |
| "epoch": 1.4970846749630145, | |
| "grad_norm": 0.48151772986247815, | |
| "learning_rate": 9.473093812668182e-05, | |
| "loss": 1.5936, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 1.4970846749630145, | |
| "eval_loss": 1.6123466491699219, | |
| "eval_runtime": 14.0881, | |
| "eval_samples_per_second": 70.982, | |
| "eval_steps_per_second": 2.271, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 1.5144896005569577, | |
| "grad_norm": 1.1271863223922003, | |
| "learning_rate": 9.460851608370794e-05, | |
| "loss": 1.6012, | |
| "step": 4350 | |
| }, | |
| { | |
| "epoch": 1.5318945261509007, | |
| "grad_norm": 0.8558669669849335, | |
| "learning_rate": 9.448476998663825e-05, | |
| "loss": 1.605, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 1.5318945261509007, | |
| "eval_loss": 1.6140981912612915, | |
| "eval_runtime": 14.1256, | |
| "eval_samples_per_second": 70.793, | |
| "eval_steps_per_second": 2.265, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 1.5492994517448437, | |
| "grad_norm": 0.7276127450869437, | |
| "learning_rate": 9.435970355009615e-05, | |
| "loss": 1.5938, | |
| "step": 4450 | |
| }, | |
| { | |
| "epoch": 1.566704377338787, | |
| "grad_norm": 0.6065688198096086, | |
| "learning_rate": 9.423332052833916e-05, | |
| "loss": 1.5946, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 1.566704377338787, | |
| "eval_loss": 1.611683964729309, | |
| "eval_runtime": 14.0436, | |
| "eval_samples_per_second": 71.207, | |
| "eval_steps_per_second": 2.279, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 1.58410930293273, | |
| "grad_norm": 0.7748024258482299, | |
| "learning_rate": 9.410562471514616e-05, | |
| "loss": 1.5894, | |
| "step": 4550 | |
| }, | |
| { | |
| "epoch": 1.601514228526673, | |
| "grad_norm": 0.48917881847751543, | |
| "learning_rate": 9.397661994370357e-05, | |
| "loss": 1.5877, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 1.601514228526673, | |
| "eval_loss": 1.6069624423980713, | |
| "eval_runtime": 14.0735, | |
| "eval_samples_per_second": 71.056, | |
| "eval_steps_per_second": 2.274, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 1.6189191541206163, | |
| "grad_norm": 0.8166564830453485, | |
| "learning_rate": 9.384631008649027e-05, | |
| "loss": 1.5875, | |
| "step": 4650 | |
| }, | |
| { | |
| "epoch": 1.636324079714559, | |
| "grad_norm": 0.9485787011897893, | |
| "learning_rate": 9.371469905516128e-05, | |
| "loss": 1.5926, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 1.636324079714559, | |
| "eval_loss": 1.6103551387786865, | |
| "eval_runtime": 14.0489, | |
| "eval_samples_per_second": 71.18, | |
| "eval_steps_per_second": 2.278, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 1.6537290053085023, | |
| "grad_norm": 0.6608190035209371, | |
| "learning_rate": 9.358179080043047e-05, | |
| "loss": 1.5852, | |
| "step": 4750 | |
| }, | |
| { | |
| "epoch": 1.6711339309024456, | |
| "grad_norm": 0.5091041850584289, | |
| "learning_rate": 9.344758931195186e-05, | |
| "loss": 1.5818, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 1.6711339309024456, | |
| "eval_loss": 1.6055699586868286, | |
| "eval_runtime": 14.0386, | |
| "eval_samples_per_second": 71.232, | |
| "eval_steps_per_second": 2.279, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 1.6885388564963884, | |
| "grad_norm": 0.4809752811498165, | |
| "learning_rate": 9.331209861819991e-05, | |
| "loss": 1.5945, | |
| "step": 4850 | |
| }, | |
| { | |
| "epoch": 1.7059437820903316, | |
| "grad_norm": 1.16696044120828, | |
| "learning_rate": 9.31753227863486e-05, | |
| "loss": 1.5906, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 1.7059437820903316, | |
| "eval_loss": 1.602495551109314, | |
| "eval_runtime": 14.0638, | |
| "eval_samples_per_second": 71.104, | |
| "eval_steps_per_second": 2.275, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 1.7233487076842746, | |
| "grad_norm": 0.7703478252526429, | |
| "learning_rate": 9.303726592214927e-05, | |
| "loss": 1.5759, | |
| "step": 4950 | |
| }, | |
| { | |
| "epoch": 1.7407536332782176, | |
| "grad_norm": 0.4326591794595183, | |
| "learning_rate": 9.289793216980748e-05, | |
| "loss": 1.589, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 1.7407536332782176, | |
| "eval_loss": 1.598211646080017, | |
| "eval_runtime": 14.05, | |
| "eval_samples_per_second": 71.174, | |
| "eval_steps_per_second": 2.278, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 1.7581585588721609, | |
| "grad_norm": 0.9150661442715593, | |
| "learning_rate": 9.275732571185852e-05, | |
| "loss": 1.5925, | |
| "step": 5050 | |
| }, | |
| { | |
| "epoch": 1.775563484466104, | |
| "grad_norm": 0.4835138015080412, | |
| "learning_rate": 9.261545076904189e-05, | |
| "loss": 1.587, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 1.775563484466104, | |
| "eval_loss": 1.5962464809417725, | |
| "eval_runtime": 14.0435, | |
| "eval_samples_per_second": 71.207, | |
| "eval_steps_per_second": 2.279, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 1.792968410060047, | |
| "grad_norm": 0.8246740616874354, | |
| "learning_rate": 9.247231160017462e-05, | |
| "loss": 1.5845, | |
| "step": 5150 | |
| }, | |
| { | |
| "epoch": 1.8103733356539902, | |
| "grad_norm": 0.7636936218440887, | |
| "learning_rate": 9.232791250202342e-05, | |
| "loss": 1.5789, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 1.8103733356539902, | |
| "eval_loss": 1.5962697267532349, | |
| "eval_runtime": 14.0724, | |
| "eval_samples_per_second": 71.061, | |
| "eval_steps_per_second": 2.274, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 1.8277782612479332, | |
| "grad_norm": 0.5278061111679693, | |
| "learning_rate": 9.218225780917564e-05, | |
| "loss": 1.5784, | |
| "step": 5250 | |
| }, | |
| { | |
| "epoch": 1.8451831868418762, | |
| "grad_norm": 0.5521436007234811, | |
| "learning_rate": 9.203535189390927e-05, | |
| "loss": 1.5859, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 1.8451831868418762, | |
| "eval_loss": 1.589383840560913, | |
| "eval_runtime": 14.0972, | |
| "eval_samples_per_second": 70.936, | |
| "eval_steps_per_second": 2.27, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 1.8625881124358195, | |
| "grad_norm": 0.9153838912238841, | |
| "learning_rate": 9.188719916606157e-05, | |
| "loss": 1.5767, | |
| "step": 5350 | |
| }, | |
| { | |
| "epoch": 1.8799930380297625, | |
| "grad_norm": 0.5869179835862129, | |
| "learning_rate": 9.17378040728968e-05, | |
| "loss": 1.5771, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 1.8799930380297625, | |
| "eval_loss": 1.5878838300704956, | |
| "eval_runtime": 14.0586, | |
| "eval_samples_per_second": 71.131, | |
| "eval_steps_per_second": 2.276, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 1.8973979636237055, | |
| "grad_norm": 0.8157168714834181, | |
| "learning_rate": 9.158717109897263e-05, | |
| "loss": 1.5626, | |
| "step": 5450 | |
| }, | |
| { | |
| "epoch": 1.9148028892176487, | |
| "grad_norm": 0.7455391308200009, | |
| "learning_rate": 9.14353047660056e-05, | |
| "loss": 1.5651, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 1.9148028892176487, | |
| "eval_loss": 1.5843595266342163, | |
| "eval_runtime": 14.0141, | |
| "eval_samples_per_second": 71.356, | |
| "eval_steps_per_second": 2.283, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 1.9322078148115915, | |
| "grad_norm": 0.48742202866618534, | |
| "learning_rate": 9.128220963273532e-05, | |
| "loss": 1.5806, | |
| "step": 5550 | |
| }, | |
| { | |
| "epoch": 1.9496127404055348, | |
| "grad_norm": 0.49018002201797567, | |
| "learning_rate": 9.112789029478769e-05, | |
| "loss": 1.5715, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 1.9496127404055348, | |
| "eval_loss": 1.583487868309021, | |
| "eval_runtime": 14.076, | |
| "eval_samples_per_second": 71.043, | |
| "eval_steps_per_second": 2.273, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 1.9670176659994778, | |
| "grad_norm": 0.7730233396950769, | |
| "learning_rate": 9.097235138453689e-05, | |
| "loss": 1.5762, | |
| "step": 5650 | |
| }, | |
| { | |
| "epoch": 1.9844225915934208, | |
| "grad_norm": 0.5303157923715942, | |
| "learning_rate": 9.081559757096637e-05, | |
| "loss": 1.5656, | |
| "step": 5700 | |
| }, | |
| { | |
| "epoch": 1.9844225915934208, | |
| "eval_loss": 1.5835527181625366, | |
| "eval_runtime": 14.0959, | |
| "eval_samples_per_second": 70.942, | |
| "eval_steps_per_second": 2.27, | |
| "step": 5700 | |
| }, | |
| { | |
| "epoch": 2.0020885910712733, | |
| "grad_norm": 1.0260303687016545, | |
| "learning_rate": 9.065763355952868e-05, | |
| "loss": 1.5804, | |
| "step": 5750 | |
| }, | |
| { | |
| "epoch": 2.019493516665216, | |
| "grad_norm": 0.618811524236402, | |
| "learning_rate": 9.049846409200417e-05, | |
| "loss": 1.4968, | |
| "step": 5800 | |
| }, | |
| { | |
| "epoch": 2.019493516665216, | |
| "eval_loss": 1.5831753015518188, | |
| "eval_runtime": 14.0889, | |
| "eval_samples_per_second": 70.978, | |
| "eval_steps_per_second": 2.271, | |
| "step": 5800 | |
| }, | |
| { | |
| "epoch": 2.0368984422591594, | |
| "grad_norm": 0.613529897165403, | |
| "learning_rate": 9.033809394635874e-05, | |
| "loss": 1.5022, | |
| "step": 5850 | |
| }, | |
| { | |
| "epoch": 2.0543033678531026, | |
| "grad_norm": 0.5015341058830712, | |
| "learning_rate": 9.017652793660039e-05, | |
| "loss": 1.4978, | |
| "step": 5900 | |
| }, | |
| { | |
| "epoch": 2.0543033678531026, | |
| "eval_loss": 1.5814894437789917, | |
| "eval_runtime": 14.0786, | |
| "eval_samples_per_second": 71.03, | |
| "eval_steps_per_second": 2.273, | |
| "step": 5900 | |
| }, | |
| { | |
| "epoch": 2.0717082934470454, | |
| "grad_norm": 0.6230434882603811, | |
| "learning_rate": 9.001377091263465e-05, | |
| "loss": 1.4918, | |
| "step": 5950 | |
| }, | |
| { | |
| "epoch": 2.0891132190409887, | |
| "grad_norm": 0.5236681791053263, | |
| "learning_rate": 8.984982776011906e-05, | |
| "loss": 1.4916, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 2.0891132190409887, | |
| "eval_loss": 1.5761847496032715, | |
| "eval_runtime": 14.0503, | |
| "eval_samples_per_second": 71.173, | |
| "eval_steps_per_second": 2.278, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 2.1065181446349315, | |
| "grad_norm": 0.5349428126602861, | |
| "learning_rate": 8.96847034003165e-05, | |
| "loss": 1.4917, | |
| "step": 6050 | |
| }, | |
| { | |
| "epoch": 2.1239230702288747, | |
| "grad_norm": 0.6861287366848919, | |
| "learning_rate": 8.951840278994747e-05, | |
| "loss": 1.4866, | |
| "step": 6100 | |
| }, | |
| { | |
| "epoch": 2.1239230702288747, | |
| "eval_loss": 1.5749881267547607, | |
| "eval_runtime": 14.056, | |
| "eval_samples_per_second": 71.144, | |
| "eval_steps_per_second": 2.277, | |
| "step": 6100 | |
| }, | |
| { | |
| "epoch": 2.141327995822818, | |
| "grad_norm": 0.6483285157830946, | |
| "learning_rate": 8.935093092104121e-05, | |
| "loss": 1.4962, | |
| "step": 6150 | |
| }, | |
| { | |
| "epoch": 2.1587329214167608, | |
| "grad_norm": 0.8142430709313933, | |
| "learning_rate": 8.9182292820786e-05, | |
| "loss": 1.4854, | |
| "step": 6200 | |
| }, | |
| { | |
| "epoch": 2.1587329214167608, | |
| "eval_loss": 1.5689362287521362, | |
| "eval_runtime": 14.1096, | |
| "eval_samples_per_second": 70.874, | |
| "eval_steps_per_second": 2.268, | |
| "step": 6200 | |
| }, | |
| { | |
| "epoch": 2.176137847010704, | |
| "grad_norm": 1.1670624800116283, | |
| "learning_rate": 8.901249355137816e-05, | |
| "loss": 1.486, | |
| "step": 6250 | |
| }, | |
| { | |
| "epoch": 2.1935427726046473, | |
| "grad_norm": 0.5523979614193149, | |
| "learning_rate": 8.884153820987008e-05, | |
| "loss": 1.4975, | |
| "step": 6300 | |
| }, | |
| { | |
| "epoch": 2.1935427726046473, | |
| "eval_loss": 1.5690404176712036, | |
| "eval_runtime": 14.0346, | |
| "eval_samples_per_second": 71.252, | |
| "eval_steps_per_second": 2.28, | |
| "step": 6300 | |
| }, | |
| { | |
| "epoch": 2.21094769819859, | |
| "grad_norm": 0.5960284645659393, | |
| "learning_rate": 8.866943192801729e-05, | |
| "loss": 1.5085, | |
| "step": 6350 | |
| }, | |
| { | |
| "epoch": 2.2283526237925333, | |
| "grad_norm": 0.5896589247735479, | |
| "learning_rate": 8.84961798721243e-05, | |
| "loss": 1.4815, | |
| "step": 6400 | |
| }, | |
| { | |
| "epoch": 2.2283526237925333, | |
| "eval_loss": 1.5669183731079102, | |
| "eval_runtime": 14.0717, | |
| "eval_samples_per_second": 71.065, | |
| "eval_steps_per_second": 2.274, | |
| "step": 6400 | |
| }, | |
| { | |
| "epoch": 2.2457575493864765, | |
| "grad_norm": 0.6944971656499748, | |
| "learning_rate": 8.832178724288966e-05, | |
| "loss": 1.4901, | |
| "step": 6450 | |
| }, | |
| { | |
| "epoch": 2.2631624749804193, | |
| "grad_norm": 0.48925376877273175, | |
| "learning_rate": 8.814625927524973e-05, | |
| "loss": 1.486, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 2.2631624749804193, | |
| "eval_loss": 1.5606794357299805, | |
| "eval_runtime": 14.1014, | |
| "eval_samples_per_second": 70.915, | |
| "eval_steps_per_second": 2.269, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 2.2805674005743626, | |
| "grad_norm": 0.7017723269145159, | |
| "learning_rate": 8.79696012382216e-05, | |
| "loss": 1.4921, | |
| "step": 6550 | |
| }, | |
| { | |
| "epoch": 2.297972326168306, | |
| "grad_norm": 0.5451878167122939, | |
| "learning_rate": 8.779181843474488e-05, | |
| "loss": 1.4952, | |
| "step": 6600 | |
| }, | |
| { | |
| "epoch": 2.297972326168306, | |
| "eval_loss": 1.561612606048584, | |
| "eval_runtime": 14.0563, | |
| "eval_samples_per_second": 71.143, | |
| "eval_steps_per_second": 2.277, | |
| "step": 6600 | |
| }, | |
| { | |
| "epoch": 2.3153772517622486, | |
| "grad_norm": 0.7029034750125012, | |
| "learning_rate": 8.761291620152251e-05, | |
| "loss": 1.49, | |
| "step": 6650 | |
| }, | |
| { | |
| "epoch": 2.332782177356192, | |
| "grad_norm": 0.6320953342648652, | |
| "learning_rate": 8.743289990886069e-05, | |
| "loss": 1.4965, | |
| "step": 6700 | |
| }, | |
| { | |
| "epoch": 2.332782177356192, | |
| "eval_loss": 1.5618507862091064, | |
| "eval_runtime": 14.3775, | |
| "eval_samples_per_second": 69.553, | |
| "eval_steps_per_second": 2.226, | |
| "step": 6700 | |
| }, | |
| { | |
| "epoch": 2.3501871029501347, | |
| "grad_norm": 0.44962999706711637, | |
| "learning_rate": 8.725177496050746e-05, | |
| "loss": 1.4956, | |
| "step": 6750 | |
| }, | |
| { | |
| "epoch": 2.367592028544078, | |
| "grad_norm": 0.5296736885871586, | |
| "learning_rate": 8.706954679349071e-05, | |
| "loss": 1.4836, | |
| "step": 6800 | |
| }, | |
| { | |
| "epoch": 2.367592028544078, | |
| "eval_loss": 1.5586892366409302, | |
| "eval_runtime": 14.0268, | |
| "eval_samples_per_second": 71.292, | |
| "eval_steps_per_second": 2.281, | |
| "step": 6800 | |
| }, | |
| { | |
| "epoch": 2.384996954138021, | |
| "grad_norm": 0.5506512514989426, | |
| "learning_rate": 8.688622087795476e-05, | |
| "loss": 1.4795, | |
| "step": 6850 | |
| }, | |
| { | |
| "epoch": 2.402401879731964, | |
| "grad_norm": 0.6654665155879538, | |
| "learning_rate": 8.670180271699632e-05, | |
| "loss": 1.4741, | |
| "step": 6900 | |
| }, | |
| { | |
| "epoch": 2.402401879731964, | |
| "eval_loss": 1.5594490766525269, | |
| "eval_runtime": 14.0756, | |
| "eval_samples_per_second": 71.045, | |
| "eval_steps_per_second": 2.273, | |
| "step": 6900 | |
| }, | |
| { | |
| "epoch": 2.419806805325907, | |
| "grad_norm": 0.6887037066063108, | |
| "learning_rate": 8.651629784649924e-05, | |
| "loss": 1.483, | |
| "step": 6950 | |
| }, | |
| { | |
| "epoch": 2.4372117309198504, | |
| "grad_norm": 0.689800498648978, | |
| "learning_rate": 8.632971183496832e-05, | |
| "loss": 1.4901, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 2.4372117309198504, | |
| "eval_loss": 1.556670904159546, | |
| "eval_runtime": 14.0999, | |
| "eval_samples_per_second": 70.923, | |
| "eval_steps_per_second": 2.27, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 2.4546166565137932, | |
| "grad_norm": 0.5661309135294365, | |
| "learning_rate": 8.614205028336217e-05, | |
| "loss": 1.4741, | |
| "step": 7050 | |
| }, | |
| { | |
| "epoch": 2.4720215821077365, | |
| "grad_norm": 0.5266042181833929, | |
| "learning_rate": 8.595331882492506e-05, | |
| "loss": 1.4808, | |
| "step": 7100 | |
| }, | |
| { | |
| "epoch": 2.4720215821077365, | |
| "eval_loss": 1.5564885139465332, | |
| "eval_runtime": 14.0633, | |
| "eval_samples_per_second": 71.107, | |
| "eval_steps_per_second": 2.275, | |
| "step": 7100 | |
| }, | |
| { | |
| "epoch": 2.4894265077016797, | |
| "grad_norm": 0.5952862014072801, | |
| "learning_rate": 8.576352312501787e-05, | |
| "loss": 1.4746, | |
| "step": 7150 | |
| }, | |
| { | |
| "epoch": 2.5068314332956225, | |
| "grad_norm": 0.4624470123124944, | |
| "learning_rate": 8.557266888094794e-05, | |
| "loss": 1.4946, | |
| "step": 7200 | |
| }, | |
| { | |
| "epoch": 2.5068314332956225, | |
| "eval_loss": 1.552463412284851, | |
| "eval_runtime": 14.0629, | |
| "eval_samples_per_second": 71.109, | |
| "eval_steps_per_second": 2.275, | |
| "step": 7200 | |
| }, | |
| { | |
| "epoch": 2.5242363588895658, | |
| "grad_norm": 0.45145611389782175, | |
| "learning_rate": 8.538076182179816e-05, | |
| "loss": 1.4961, | |
| "step": 7250 | |
| }, | |
| { | |
| "epoch": 2.541641284483509, | |
| "grad_norm": 0.90012189462666, | |
| "learning_rate": 8.518780770825489e-05, | |
| "loss": 1.4783, | |
| "step": 7300 | |
| }, | |
| { | |
| "epoch": 2.541641284483509, | |
| "eval_loss": 1.5499927997589111, | |
| "eval_runtime": 14.0891, | |
| "eval_samples_per_second": 70.977, | |
| "eval_steps_per_second": 2.271, | |
| "step": 7300 | |
| }, | |
| { | |
| "epoch": 2.559046210077452, | |
| "grad_norm": 0.48065365017717593, | |
| "learning_rate": 8.499381233243513e-05, | |
| "loss": 1.4769, | |
| "step": 7350 | |
| }, | |
| { | |
| "epoch": 2.576451135671395, | |
| "grad_norm": 0.6888035432484004, | |
| "learning_rate": 8.479878151771251e-05, | |
| "loss": 1.4789, | |
| "step": 7400 | |
| }, | |
| { | |
| "epoch": 2.576451135671395, | |
| "eval_loss": 1.5486310720443726, | |
| "eval_runtime": 13.9982, | |
| "eval_samples_per_second": 71.438, | |
| "eval_steps_per_second": 2.286, | |
| "step": 7400 | |
| }, | |
| { | |
| "epoch": 2.5938560612653383, | |
| "grad_norm": 0.48711196772194026, | |
| "learning_rate": 8.460272111854266e-05, | |
| "loss": 1.4847, | |
| "step": 7450 | |
| }, | |
| { | |
| "epoch": 2.611260986859281, | |
| "grad_norm": 0.824707610556562, | |
| "learning_rate": 8.440563702028738e-05, | |
| "loss": 1.4828, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 2.611260986859281, | |
| "eval_loss": 1.551180362701416, | |
| "eval_runtime": 14.0431, | |
| "eval_samples_per_second": 71.209, | |
| "eval_steps_per_second": 2.279, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 2.6286659124532243, | |
| "grad_norm": 0.852389610981971, | |
| "learning_rate": 8.42075351390379e-05, | |
| "loss": 1.4826, | |
| "step": 7550 | |
| }, | |
| { | |
| "epoch": 2.6460708380471676, | |
| "grad_norm": 0.6660183574150006, | |
| "learning_rate": 8.400842142143747e-05, | |
| "loss": 1.4845, | |
| "step": 7600 | |
| }, | |
| { | |
| "epoch": 2.6460708380471676, | |
| "eval_loss": 1.544368028640747, | |
| "eval_runtime": 14.0389, | |
| "eval_samples_per_second": 71.231, | |
| "eval_steps_per_second": 2.279, | |
| "step": 7600 | |
| }, | |
| { | |
| "epoch": 2.6634757636411104, | |
| "grad_norm": 0.5371708490732019, | |
| "learning_rate": 8.380830184450267e-05, | |
| "loss": 1.4793, | |
| "step": 7650 | |
| }, | |
| { | |
| "epoch": 2.6808806892350536, | |
| "grad_norm": 0.5364646266893728, | |
| "learning_rate": 8.360718241544412e-05, | |
| "loss": 1.4785, | |
| "step": 7700 | |
| }, | |
| { | |
| "epoch": 2.6808806892350536, | |
| "eval_loss": 1.543105959892273, | |
| "eval_runtime": 14.0605, | |
| "eval_samples_per_second": 71.121, | |
| "eval_steps_per_second": 2.276, | |
| "step": 7700 | |
| }, | |
| { | |
| "epoch": 2.698285614828997, | |
| "grad_norm": 0.46957227565425785, | |
| "learning_rate": 8.340506917148608e-05, | |
| "loss": 1.4742, | |
| "step": 7750 | |
| }, | |
| { | |
| "epoch": 2.7156905404229397, | |
| "grad_norm": 0.48372971397935693, | |
| "learning_rate": 8.320196817968525e-05, | |
| "loss": 1.4866, | |
| "step": 7800 | |
| }, | |
| { | |
| "epoch": 2.7156905404229397, | |
| "eval_loss": 1.5416640043258667, | |
| "eval_runtime": 14.0946, | |
| "eval_samples_per_second": 70.949, | |
| "eval_steps_per_second": 2.27, | |
| "step": 7800 | |
| }, | |
| { | |
| "epoch": 2.733095466016883, | |
| "grad_norm": 0.62992004899943, | |
| "learning_rate": 8.29978855367487e-05, | |
| "loss": 1.4805, | |
| "step": 7850 | |
| }, | |
| { | |
| "epoch": 2.7505003916108257, | |
| "grad_norm": 0.43123069122543717, | |
| "learning_rate": 8.279282736885072e-05, | |
| "loss": 1.4658, | |
| "step": 7900 | |
| }, | |
| { | |
| "epoch": 2.7505003916108257, | |
| "eval_loss": 1.535282850265503, | |
| "eval_runtime": 14.0868, | |
| "eval_samples_per_second": 70.988, | |
| "eval_steps_per_second": 2.272, | |
| "step": 7900 | |
| }, | |
| { | |
| "epoch": 2.767905317204769, | |
| "grad_norm": 0.6368197221637776, | |
| "learning_rate": 8.258679983144908e-05, | |
| "loss": 1.4758, | |
| "step": 7950 | |
| }, | |
| { | |
| "epoch": 2.785310242798712, | |
| "grad_norm": 0.6657566023644987, | |
| "learning_rate": 8.237980910910019e-05, | |
| "loss": 1.4745, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 2.785310242798712, | |
| "eval_loss": 1.5360466241836548, | |
| "eval_runtime": 14.0555, | |
| "eval_samples_per_second": 71.146, | |
| "eval_steps_per_second": 2.277, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 2.802715168392655, | |
| "grad_norm": 0.7182835079667393, | |
| "learning_rate": 8.217186141527335e-05, | |
| "loss": 1.4641, | |
| "step": 8050 | |
| }, | |
| { | |
| "epoch": 2.8201200939865982, | |
| "grad_norm": 0.7121132283989695, | |
| "learning_rate": 8.196296299216446e-05, | |
| "loss": 1.4759, | |
| "step": 8100 | |
| }, | |
| { | |
| "epoch": 2.8201200939865982, | |
| "eval_loss": 1.532562494277954, | |
| "eval_runtime": 14.0783, | |
| "eval_samples_per_second": 71.031, | |
| "eval_steps_per_second": 2.273, | |
| "step": 8100 | |
| }, | |
| { | |
| "epoch": 2.837525019580541, | |
| "grad_norm": 0.5023845726717355, | |
| "learning_rate": 8.175312011050845e-05, | |
| "loss": 1.4683, | |
| "step": 8150 | |
| }, | |
| { | |
| "epoch": 2.8549299451744843, | |
| "grad_norm": 0.5881687310427127, | |
| "learning_rate": 8.154233906939112e-05, | |
| "loss": 1.4663, | |
| "step": 8200 | |
| }, | |
| { | |
| "epoch": 2.8549299451744843, | |
| "eval_loss": 1.528754711151123, | |
| "eval_runtime": 14.0709, | |
| "eval_samples_per_second": 71.069, | |
| "eval_steps_per_second": 2.274, | |
| "step": 8200 | |
| }, | |
| { | |
| "epoch": 2.8723348707684275, | |
| "grad_norm": 0.8694427423730182, | |
| "learning_rate": 8.133062619605998e-05, | |
| "loss": 1.4652, | |
| "step": 8250 | |
| }, | |
| { | |
| "epoch": 2.8897397963623703, | |
| "grad_norm": 0.5428973347025838, | |
| "learning_rate": 8.111798784573448e-05, | |
| "loss": 1.4654, | |
| "step": 8300 | |
| }, | |
| { | |
| "epoch": 2.8897397963623703, | |
| "eval_loss": 1.5267043113708496, | |
| "eval_runtime": 14.0573, | |
| "eval_samples_per_second": 71.138, | |
| "eval_steps_per_second": 2.276, | |
| "step": 8300 | |
| }, | |
| { | |
| "epoch": 2.9071447219563136, | |
| "grad_norm": 0.5347673639983305, | |
| "learning_rate": 8.090443040141507e-05, | |
| "loss": 1.4686, | |
| "step": 8350 | |
| }, | |
| { | |
| "epoch": 2.924549647550257, | |
| "grad_norm": 0.3699648780599154, | |
| "learning_rate": 8.068996027369164e-05, | |
| "loss": 1.4609, | |
| "step": 8400 | |
| }, | |
| { | |
| "epoch": 2.924549647550257, | |
| "eval_loss": 1.5217338800430298, | |
| "eval_runtime": 14.0237, | |
| "eval_samples_per_second": 71.308, | |
| "eval_steps_per_second": 2.282, | |
| "step": 8400 | |
| }, | |
| { | |
| "epoch": 2.9419545731441996, | |
| "grad_norm": 0.48586968192782287, | |
| "learning_rate": 8.047458390055122e-05, | |
| "loss": 1.4612, | |
| "step": 8450 | |
| }, | |
| { | |
| "epoch": 2.959359498738143, | |
| "grad_norm": 0.5435332038220189, | |
| "learning_rate": 8.025830774718446e-05, | |
| "loss": 1.4692, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 2.959359498738143, | |
| "eval_loss": 1.5231417417526245, | |
| "eval_runtime": 14.0466, | |
| "eval_samples_per_second": 71.191, | |
| "eval_steps_per_second": 2.278, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 2.976764424332086, | |
| "grad_norm": 0.7096020132731112, | |
| "learning_rate": 8.004113830579183e-05, | |
| "loss": 1.471, | |
| "step": 8550 | |
| }, | |
| { | |
| "epoch": 2.994169349926029, | |
| "grad_norm": 0.539895931380371, | |
| "learning_rate": 7.982308209538854e-05, | |
| "loss": 1.4669, | |
| "step": 8600 | |
| }, | |
| { | |
| "epoch": 2.994169349926029, | |
| "eval_loss": 1.5212860107421875, | |
| "eval_runtime": 14.0526, | |
| "eval_samples_per_second": 71.161, | |
| "eval_steps_per_second": 2.277, | |
| "step": 8600 | |
| }, | |
| { | |
| "epoch": 3.0118353494038814, | |
| "grad_norm": 0.4864871708607549, | |
| "learning_rate": 7.960414566160895e-05, | |
| "loss": 1.416, | |
| "step": 8650 | |
| }, | |
| { | |
| "epoch": 3.0292402749978242, | |
| "grad_norm": 0.690002277872972, | |
| "learning_rate": 7.938433557651007e-05, | |
| "loss": 1.366, | |
| "step": 8700 | |
| }, | |
| { | |
| "epoch": 3.0292402749978242, | |
| "eval_loss": 1.529784917831421, | |
| "eval_runtime": 14.067, | |
| "eval_samples_per_second": 71.088, | |
| "eval_steps_per_second": 2.275, | |
| "step": 8700 | |
| }, | |
| { | |
| "epoch": 3.0466452005917675, | |
| "grad_norm": 0.5289813539417939, | |
| "learning_rate": 7.916365843837427e-05, | |
| "loss": 1.3613, | |
| "step": 8750 | |
| }, | |
| { | |
| "epoch": 3.0640501261857107, | |
| "grad_norm": 0.6657125938020144, | |
| "learning_rate": 7.894212087151115e-05, | |
| "loss": 1.3688, | |
| "step": 8800 | |
| }, | |
| { | |
| "epoch": 3.0640501261857107, | |
| "eval_loss": 1.5215730667114258, | |
| "eval_runtime": 14.0034, | |
| "eval_samples_per_second": 71.411, | |
| "eval_steps_per_second": 2.285, | |
| "step": 8800 | |
| }, | |
| { | |
| "epoch": 3.0814550517796535, | |
| "grad_norm": 0.7390029016004258, | |
| "learning_rate": 7.871972952605883e-05, | |
| "loss": 1.3683, | |
| "step": 8850 | |
| }, | |
| { | |
| "epoch": 3.0988599773735968, | |
| "grad_norm": 0.5057259460715381, | |
| "learning_rate": 7.849649107778423e-05, | |
| "loss": 1.3728, | |
| "step": 8900 | |
| }, | |
| { | |
| "epoch": 3.0988599773735968, | |
| "eval_loss": 1.5253978967666626, | |
| "eval_runtime": 14.1332, | |
| "eval_samples_per_second": 70.756, | |
| "eval_steps_per_second": 2.264, | |
| "step": 8900 | |
| }, | |
| { | |
| "epoch": 3.11626490296754, | |
| "grad_norm": 0.4295555287715609, | |
| "learning_rate": 7.827241222788265e-05, | |
| "loss": 1.3712, | |
| "step": 8950 | |
| }, | |
| { | |
| "epoch": 3.133669828561483, | |
| "grad_norm": 0.5114244593630245, | |
| "learning_rate": 7.804749970277668e-05, | |
| "loss": 1.3687, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 3.133669828561483, | |
| "eval_loss": 1.5244981050491333, | |
| "eval_runtime": 14.0709, | |
| "eval_samples_per_second": 71.069, | |
| "eval_steps_per_second": 2.274, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 3.151074754155426, | |
| "grad_norm": 0.5924217403505255, | |
| "learning_rate": 7.782176025391429e-05, | |
| "loss": 1.3599, | |
| "step": 9050 | |
| }, | |
| { | |
| "epoch": 3.1684796797493693, | |
| "grad_norm": 0.40820014979092695, | |
| "learning_rate": 7.759520065756606e-05, | |
| "loss": 1.3861, | |
| "step": 9100 | |
| }, | |
| { | |
| "epoch": 3.1684796797493693, | |
| "eval_loss": 1.511974573135376, | |
| "eval_runtime": 14.0345, | |
| "eval_samples_per_second": 71.253, | |
| "eval_steps_per_second": 2.28, | |
| "step": 9100 | |
| }, | |
| { | |
| "epoch": 3.185884605343312, | |
| "grad_norm": 0.4377700488526699, | |
| "learning_rate": 7.736782771462192e-05, | |
| "loss": 1.371, | |
| "step": 9150 | |
| }, | |
| { | |
| "epoch": 3.2032895309372553, | |
| "grad_norm": 0.6320656559166914, | |
| "learning_rate": 7.713964825038689e-05, | |
| "loss": 1.3686, | |
| "step": 9200 | |
| }, | |
| { | |
| "epoch": 3.2032895309372553, | |
| "eval_loss": 1.5161738395690918, | |
| "eval_runtime": 14.061, | |
| "eval_samples_per_second": 71.119, | |
| "eval_steps_per_second": 2.276, | |
| "step": 9200 | |
| }, | |
| { | |
| "epoch": 3.220694456531198, | |
| "grad_norm": 0.5547647399512429, | |
| "learning_rate": 7.69106691143762e-05, | |
| "loss": 1.3701, | |
| "step": 9250 | |
| }, | |
| { | |
| "epoch": 3.2380993821251414, | |
| "grad_norm": 0.5363716061147621, | |
| "learning_rate": 7.66808971801098e-05, | |
| "loss": 1.3661, | |
| "step": 9300 | |
| }, | |
| { | |
| "epoch": 3.2380993821251414, | |
| "eval_loss": 1.5094687938690186, | |
| "eval_runtime": 14.0539, | |
| "eval_samples_per_second": 71.155, | |
| "eval_steps_per_second": 2.277, | |
| "step": 9300 | |
| }, | |
| { | |
| "epoch": 3.2555043077190846, | |
| "grad_norm": 0.5599230381831105, | |
| "learning_rate": 7.645033934490586e-05, | |
| "loss": 1.3603, | |
| "step": 9350 | |
| }, | |
| { | |
| "epoch": 3.2729092333130274, | |
| "grad_norm": 0.8327764629351396, | |
| "learning_rate": 7.621900252967383e-05, | |
| "loss": 1.3735, | |
| "step": 9400 | |
| }, | |
| { | |
| "epoch": 3.2729092333130274, | |
| "eval_loss": 1.5098674297332764, | |
| "eval_runtime": 14.0398, | |
| "eval_samples_per_second": 71.226, | |
| "eval_steps_per_second": 2.279, | |
| "step": 9400 | |
| }, | |
| { | |
| "epoch": 3.2903141589069707, | |
| "grad_norm": 0.48878316002707184, | |
| "learning_rate": 7.59868936787067e-05, | |
| "loss": 1.3784, | |
| "step": 9450 | |
| }, | |
| { | |
| "epoch": 3.307719084500914, | |
| "grad_norm": 0.43212527062833306, | |
| "learning_rate": 7.575401975947243e-05, | |
| "loss": 1.3898, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 3.307719084500914, | |
| "eval_loss": 1.5034927129745483, | |
| "eval_runtime": 14.069, | |
| "eval_samples_per_second": 71.078, | |
| "eval_steps_per_second": 2.275, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 3.3251240100948567, | |
| "grad_norm": 0.5587972491626451, | |
| "learning_rate": 7.552038776240496e-05, | |
| "loss": 1.3756, | |
| "step": 9550 | |
| }, | |
| { | |
| "epoch": 3.3425289356888, | |
| "grad_norm": 0.48413494641810556, | |
| "learning_rate": 7.528600470069427e-05, | |
| "loss": 1.3766, | |
| "step": 9600 | |
| }, | |
| { | |
| "epoch": 3.3425289356888, | |
| "eval_loss": 1.5048415660858154, | |
| "eval_runtime": 14.0409, | |
| "eval_samples_per_second": 71.221, | |
| "eval_steps_per_second": 2.279, | |
| "step": 9600 | |
| }, | |
| { | |
| "epoch": 3.359933861282743, | |
| "grad_norm": 0.5249216781891894, | |
| "learning_rate": 7.505087761007585e-05, | |
| "loss": 1.3683, | |
| "step": 9650 | |
| }, | |
| { | |
| "epoch": 3.377338786876686, | |
| "grad_norm": 0.5021867457296348, | |
| "learning_rate": 7.481501354861958e-05, | |
| "loss": 1.3628, | |
| "step": 9700 | |
| }, | |
| { | |
| "epoch": 3.377338786876686, | |
| "eval_loss": 1.5056378841400146, | |
| "eval_runtime": 14.1165, | |
| "eval_samples_per_second": 70.839, | |
| "eval_steps_per_second": 2.267, | |
| "step": 9700 | |
| }, | |
| { | |
| "epoch": 3.3947437124706292, | |
| "grad_norm": 0.4319516918800896, | |
| "learning_rate": 7.457841959651772e-05, | |
| "loss": 1.3757, | |
| "step": 9750 | |
| }, | |
| { | |
| "epoch": 3.412148638064572, | |
| "grad_norm": 0.5272349233779042, | |
| "learning_rate": 7.434110285587257e-05, | |
| "loss": 1.3772, | |
| "step": 9800 | |
| }, | |
| { | |
| "epoch": 3.412148638064572, | |
| "eval_loss": 1.4979031085968018, | |
| "eval_runtime": 14.0583, | |
| "eval_samples_per_second": 71.132, | |
| "eval_steps_per_second": 2.276, | |
| "step": 9800 | |
| }, | |
| { | |
| "epoch": 3.4295535636585153, | |
| "grad_norm": 0.4170853105657618, | |
| "learning_rate": 7.410307045048309e-05, | |
| "loss": 1.3738, | |
| "step": 9850 | |
| }, | |
| { | |
| "epoch": 3.4469584892524585, | |
| "grad_norm": 0.48485775358670463, | |
| "learning_rate": 7.38643295256312e-05, | |
| "loss": 1.3724, | |
| "step": 9900 | |
| }, | |
| { | |
| "epoch": 3.4469584892524585, | |
| "eval_loss": 1.494421362876892, | |
| "eval_runtime": 14.0287, | |
| "eval_samples_per_second": 71.282, | |
| "eval_steps_per_second": 2.281, | |
| "step": 9900 | |
| }, | |
| { | |
| "epoch": 3.4643634148464013, | |
| "grad_norm": 0.4371734710144595, | |
| "learning_rate": 7.362488724786717e-05, | |
| "loss": 1.3744, | |
| "step": 9950 | |
| }, | |
| { | |
| "epoch": 3.4817683404403446, | |
| "grad_norm": 0.40923025134892577, | |
| "learning_rate": 7.338475080479464e-05, | |
| "loss": 1.3607, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 3.4817683404403446, | |
| "eval_loss": 1.4906189441680908, | |
| "eval_runtime": 14.13, | |
| "eval_samples_per_second": 70.772, | |
| "eval_steps_per_second": 2.265, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 3.499173266034288, | |
| "grad_norm": 0.566881072168113, | |
| "learning_rate": 7.31439274048547e-05, | |
| "loss": 1.3724, | |
| "step": 10050 | |
| }, | |
| { | |
| "epoch": 3.5165781916282306, | |
| "grad_norm": 0.6296518713524268, | |
| "learning_rate": 7.290242427710961e-05, | |
| "loss": 1.3727, | |
| "step": 10100 | |
| }, | |
| { | |
| "epoch": 3.5165781916282306, | |
| "eval_loss": 1.4904612302780151, | |
| "eval_runtime": 14.0139, | |
| "eval_samples_per_second": 71.358, | |
| "eval_steps_per_second": 2.283, | |
| "step": 10100 | |
| }, | |
| { | |
| "epoch": 3.533983117222174, | |
| "grad_norm": 0.5206499240433969, | |
| "learning_rate": 7.266024867102576e-05, | |
| "loss": 1.3692, | |
| "step": 10150 | |
| }, | |
| { | |
| "epoch": 3.551388042816117, | |
| "grad_norm": 0.40529224737208835, | |
| "learning_rate": 7.241740785625611e-05, | |
| "loss": 1.3806, | |
| "step": 10200 | |
| }, | |
| { | |
| "epoch": 3.551388042816117, | |
| "eval_loss": 1.4856830835342407, | |
| "eval_runtime": 14.0448, | |
| "eval_samples_per_second": 71.201, | |
| "eval_steps_per_second": 2.278, | |
| "step": 10200 | |
| }, | |
| { | |
| "epoch": 3.56879296841006, | |
| "grad_norm": 0.4300144130087216, | |
| "learning_rate": 7.217390912242188e-05, | |
| "loss": 1.3744, | |
| "step": 10250 | |
| }, | |
| { | |
| "epoch": 3.586197894004003, | |
| "grad_norm": 1.5706940836443246, | |
| "learning_rate": 7.19297597788938e-05, | |
| "loss": 1.3585, | |
| "step": 10300 | |
| }, | |
| { | |
| "epoch": 3.586197894004003, | |
| "eval_loss": 1.4865046739578247, | |
| "eval_runtime": 14.0585, | |
| "eval_samples_per_second": 71.131, | |
| "eval_steps_per_second": 2.276, | |
| "step": 10300 | |
| }, | |
| { | |
| "epoch": 3.6036028195979464, | |
| "grad_norm": 0.42832118387479046, | |
| "learning_rate": 7.168496715457262e-05, | |
| "loss": 1.3498, | |
| "step": 10350 | |
| }, | |
| { | |
| "epoch": 3.621007745191889, | |
| "grad_norm": 0.32937050038811627, | |
| "learning_rate": 7.143953859766922e-05, | |
| "loss": 1.3668, | |
| "step": 10400 | |
| }, | |
| { | |
| "epoch": 3.621007745191889, | |
| "eval_loss": 1.4809393882751465, | |
| "eval_runtime": 14.0762, | |
| "eval_samples_per_second": 71.042, | |
| "eval_steps_per_second": 2.273, | |
| "step": 10400 | |
| }, | |
| { | |
| "epoch": 3.6384126707858324, | |
| "grad_norm": 0.46661759551944226, | |
| "learning_rate": 7.119348147548397e-05, | |
| "loss": 1.3713, | |
| "step": 10450 | |
| }, | |
| { | |
| "epoch": 3.6558175963797757, | |
| "grad_norm": 0.37949075462180737, | |
| "learning_rate": 7.094680317418553e-05, | |
| "loss": 1.3738, | |
| "step": 10500 | |
| }, | |
| { | |
| "epoch": 3.6558175963797757, | |
| "eval_loss": 1.479648470878601, | |
| "eval_runtime": 14.0453, | |
| "eval_samples_per_second": 71.198, | |
| "eval_steps_per_second": 2.278, | |
| "step": 10500 | |
| }, | |
| { | |
| "epoch": 3.6732225219737185, | |
| "grad_norm": 0.3545653210995024, | |
| "learning_rate": 7.069951109858924e-05, | |
| "loss": 1.3778, | |
| "step": 10550 | |
| }, | |
| { | |
| "epoch": 3.6906274475676617, | |
| "grad_norm": 0.4800710308825892, | |
| "learning_rate": 7.045161267193473e-05, | |
| "loss": 1.3714, | |
| "step": 10600 | |
| }, | |
| { | |
| "epoch": 3.6906274475676617, | |
| "eval_loss": 1.478054165840149, | |
| "eval_runtime": 14.0495, | |
| "eval_samples_per_second": 71.177, | |
| "eval_steps_per_second": 2.278, | |
| "step": 10600 | |
| }, | |
| { | |
| "epoch": 3.708032373161605, | |
| "grad_norm": 0.4518327093756271, | |
| "learning_rate": 7.020311533566316e-05, | |
| "loss": 1.3603, | |
| "step": 10650 | |
| }, | |
| { | |
| "epoch": 3.7254372987555477, | |
| "grad_norm": 0.5598559026409418, | |
| "learning_rate": 6.995402654919383e-05, | |
| "loss": 1.3751, | |
| "step": 10700 | |
| }, | |
| { | |
| "epoch": 3.7254372987555477, | |
| "eval_loss": 1.4786157608032227, | |
| "eval_runtime": 14.0459, | |
| "eval_samples_per_second": 71.195, | |
| "eval_steps_per_second": 2.278, | |
| "step": 10700 | |
| }, | |
| { | |
| "epoch": 3.742842224349491, | |
| "grad_norm": 0.397449015844996, | |
| "learning_rate": 6.970435378970025e-05, | |
| "loss": 1.3696, | |
| "step": 10750 | |
| }, | |
| { | |
| "epoch": 3.7602471499434342, | |
| "grad_norm": 0.460203408949936, | |
| "learning_rate": 6.94541045518857e-05, | |
| "loss": 1.3662, | |
| "step": 10800 | |
| }, | |
| { | |
| "epoch": 3.7602471499434342, | |
| "eval_loss": 1.4727766513824463, | |
| "eval_runtime": 14.0456, | |
| "eval_samples_per_second": 71.197, | |
| "eval_steps_per_second": 2.278, | |
| "step": 10800 | |
| }, | |
| { | |
| "epoch": 3.777652075537377, | |
| "grad_norm": 0.5514873377132529, | |
| "learning_rate": 6.920328634775823e-05, | |
| "loss": 1.3547, | |
| "step": 10850 | |
| }, | |
| { | |
| "epoch": 3.7950570011313203, | |
| "grad_norm": 0.4046190880252359, | |
| "learning_rate": 6.895190670640517e-05, | |
| "loss": 1.3702, | |
| "step": 10900 | |
| }, | |
| { | |
| "epoch": 3.7950570011313203, | |
| "eval_loss": 1.4765475988388062, | |
| "eval_runtime": 14.044, | |
| "eval_samples_per_second": 71.205, | |
| "eval_steps_per_second": 2.279, | |
| "step": 10900 | |
| }, | |
| { | |
| "epoch": 3.8124619267252635, | |
| "grad_norm": 0.39733647374433373, | |
| "learning_rate": 6.86999731737672e-05, | |
| "loss": 1.3576, | |
| "step": 10950 | |
| }, | |
| { | |
| "epoch": 3.8298668523192063, | |
| "grad_norm": 0.4968531287288758, | |
| "learning_rate": 6.844749331241166e-05, | |
| "loss": 1.3683, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 3.8298668523192063, | |
| "eval_loss": 1.4669009447097778, | |
| "eval_runtime": 14.0594, | |
| "eval_samples_per_second": 71.127, | |
| "eval_steps_per_second": 2.276, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 3.8472717779131496, | |
| "grad_norm": 0.39798180208193673, | |
| "learning_rate": 6.819447470130576e-05, | |
| "loss": 1.3599, | |
| "step": 11050 | |
| }, | |
| { | |
| "epoch": 3.8646767035070924, | |
| "grad_norm": 0.45458727194541854, | |
| "learning_rate": 6.794092493558886e-05, | |
| "loss": 1.369, | |
| "step": 11100 | |
| }, | |
| { | |
| "epoch": 3.8646767035070924, | |
| "eval_loss": 1.4670898914337158, | |
| "eval_runtime": 14.0764, | |
| "eval_samples_per_second": 71.041, | |
| "eval_steps_per_second": 2.273, | |
| "step": 11100 | |
| }, | |
| { | |
| "epoch": 3.8820816291010356, | |
| "grad_norm": 0.3836811429180259, | |
| "learning_rate": 6.768685162634463e-05, | |
| "loss": 1.358, | |
| "step": 11150 | |
| }, | |
| { | |
| "epoch": 3.8994865546949784, | |
| "grad_norm": 0.38598388694396, | |
| "learning_rate": 6.743226240037251e-05, | |
| "loss": 1.3583, | |
| "step": 11200 | |
| }, | |
| { | |
| "epoch": 3.8994865546949784, | |
| "eval_loss": 1.4623597860336304, | |
| "eval_runtime": 14.0625, | |
| "eval_samples_per_second": 71.111, | |
| "eval_steps_per_second": 2.276, | |
| "step": 11200 | |
| }, | |
| { | |
| "epoch": 3.9168914802889216, | |
| "grad_norm": 0.44825069145730173, | |
| "learning_rate": 6.717716489995878e-05, | |
| "loss": 1.3502, | |
| "step": 11250 | |
| }, | |
| { | |
| "epoch": 3.934296405882865, | |
| "grad_norm": 0.38830830286241574, | |
| "learning_rate": 6.692156678264715e-05, | |
| "loss": 1.3532, | |
| "step": 11300 | |
| }, | |
| { | |
| "epoch": 3.934296405882865, | |
| "eval_loss": 1.4605158567428589, | |
| "eval_runtime": 14.0694, | |
| "eval_samples_per_second": 71.076, | |
| "eval_steps_per_second": 2.274, | |
| "step": 11300 | |
| }, | |
| { | |
| "epoch": 3.9517013314768077, | |
| "grad_norm": 0.411836793573614, | |
| "learning_rate": 6.666547572100892e-05, | |
| "loss": 1.36, | |
| "step": 11350 | |
| }, | |
| { | |
| "epoch": 3.969106257070751, | |
| "grad_norm": 0.5070202220012539, | |
| "learning_rate": 6.640889940241265e-05, | |
| "loss": 1.3621, | |
| "step": 11400 | |
| }, | |
| { | |
| "epoch": 3.969106257070751, | |
| "eval_loss": 1.4586904048919678, | |
| "eval_runtime": 14.0282, | |
| "eval_samples_per_second": 71.285, | |
| "eval_steps_per_second": 2.281, | |
| "step": 11400 | |
| }, | |
| { | |
| "epoch": 3.986511182664694, | |
| "grad_norm": 0.45972048455728964, | |
| "learning_rate": 6.615184552879333e-05, | |
| "loss": 1.3569, | |
| "step": 11450 | |
| }, | |
| { | |
| "epoch": 4.004177182142547, | |
| "grad_norm": 0.4844090667680897, | |
| "learning_rate": 6.589432181642133e-05, | |
| "loss": 1.356, | |
| "step": 11500 | |
| }, | |
| { | |
| "epoch": 4.004177182142547, | |
| "eval_loss": 1.474735140800476, | |
| "eval_runtime": 14.1044, | |
| "eval_samples_per_second": 70.9, | |
| "eval_steps_per_second": 2.269, | |
| "step": 11500 | |
| }, | |
| { | |
| "epoch": 4.0215821077364895, | |
| "grad_norm": 0.5126230881524994, | |
| "learning_rate": 6.563633599567065e-05, | |
| "loss": 1.2523, | |
| "step": 11550 | |
| }, | |
| { | |
| "epoch": 4.038987033330432, | |
| "grad_norm": 0.5098806194453889, | |
| "learning_rate": 6.537789581078693e-05, | |
| "loss": 1.2622, | |
| "step": 11600 | |
| }, | |
| { | |
| "epoch": 4.038987033330432, | |
| "eval_loss": 1.4786348342895508, | |
| "eval_runtime": 14.0611, | |
| "eval_samples_per_second": 71.118, | |
| "eval_steps_per_second": 2.276, | |
| "step": 11600 | |
| }, | |
| { | |
| "epoch": 4.056391958924376, | |
| "grad_norm": 0.45319295070868365, | |
| "learning_rate": 6.511900901965492e-05, | |
| "loss": 1.246, | |
| "step": 11650 | |
| }, | |
| { | |
| "epoch": 4.073796884518319, | |
| "grad_norm": 0.5020371370617683, | |
| "learning_rate": 6.485968339356566e-05, | |
| "loss": 1.263, | |
| "step": 11700 | |
| }, | |
| { | |
| "epoch": 4.073796884518319, | |
| "eval_loss": 1.4689204692840576, | |
| "eval_runtime": 14.0995, | |
| "eval_samples_per_second": 70.924, | |
| "eval_steps_per_second": 2.27, | |
| "step": 11700 | |
| }, | |
| { | |
| "epoch": 4.091201810112262, | |
| "grad_norm": 0.4891550610909729, | |
| "learning_rate": 6.459992671698323e-05, | |
| "loss": 1.2468, | |
| "step": 11750 | |
| }, | |
| { | |
| "epoch": 4.108606735706205, | |
| "grad_norm": 0.41320498769703473, | |
| "learning_rate": 6.433974678731097e-05, | |
| "loss": 1.2727, | |
| "step": 11800 | |
| }, | |
| { | |
| "epoch": 4.108606735706205, | |
| "eval_loss": 1.4705528020858765, | |
| "eval_runtime": 14.0263, | |
| "eval_samples_per_second": 71.295, | |
| "eval_steps_per_second": 2.281, | |
| "step": 11800 | |
| }, | |
| { | |
| "epoch": 4.126011661300148, | |
| "grad_norm": 0.42567451569194076, | |
| "learning_rate": 6.407915141465746e-05, | |
| "loss": 1.2496, | |
| "step": 11850 | |
| }, | |
| { | |
| "epoch": 4.143416586894091, | |
| "grad_norm": 0.4143204113738695, | |
| "learning_rate": 6.381814842160219e-05, | |
| "loss": 1.255, | |
| "step": 11900 | |
| }, | |
| { | |
| "epoch": 4.143416586894091, | |
| "eval_loss": 1.4660860300064087, | |
| "eval_runtime": 14.0759, | |
| "eval_samples_per_second": 71.043, | |
| "eval_steps_per_second": 2.273, | |
| "step": 11900 | |
| }, | |
| { | |
| "epoch": 4.160821512488035, | |
| "grad_norm": 0.467101344292312, | |
| "learning_rate": 6.355674564296053e-05, | |
| "loss": 1.2513, | |
| "step": 11950 | |
| }, | |
| { | |
| "epoch": 4.178226438081977, | |
| "grad_norm": 0.41772403175228634, | |
| "learning_rate": 6.329495092554872e-05, | |
| "loss": 1.2602, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 4.178226438081977, | |
| "eval_loss": 1.4664828777313232, | |
| "eval_runtime": 13.9906, | |
| "eval_samples_per_second": 71.477, | |
| "eval_steps_per_second": 2.287, | |
| "step": 12000 | |
| } | |
| ], | |
| "logging_steps": 50, | |
| "max_steps": 28720, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 10, | |
| "save_steps": 800, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.1642006019244032e+16, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |