rabiulawal's picture
Add files using upload-large-folder tool
2c66546 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 4.178226438081977,
"eval_steps": 100,
"global_step": 12000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.017404925593943087,
"grad_norm": 4.2433902126025425,
"learning_rate": 9.8e-05,
"loss": 3.6204,
"step": 50
},
{
"epoch": 0.034809851187886175,
"grad_norm": 3.0786203091123565,
"learning_rate": 9.999928647255986e-05,
"loss": 2.5602,
"step": 100
},
{
"epoch": 0.034809851187886175,
"eval_loss": 2.363542079925537,
"eval_runtime": 14.092,
"eval_samples_per_second": 70.962,
"eval_steps_per_second": 2.271,
"step": 100
},
{
"epoch": 0.05221477678182926,
"grad_norm": 2.489798826862787,
"learning_rate": 9.999708736748881e-05,
"loss": 2.2577,
"step": 150
},
{
"epoch": 0.06961970237577235,
"grad_norm": 4.227397206704295,
"learning_rate": 9.999340245361986e-05,
"loss": 2.065,
"step": 200
},
{
"epoch": 0.06961970237577235,
"eval_loss": 2.0204052925109863,
"eval_runtime": 14.0707,
"eval_samples_per_second": 71.07,
"eval_steps_per_second": 2.274,
"step": 200
},
{
"epoch": 0.08702462796971543,
"grad_norm": 1.5627603609182088,
"learning_rate": 9.998823184156712e-05,
"loss": 1.9504,
"step": 250
},
{
"epoch": 0.10442955356365852,
"grad_norm": 2.0849274528531834,
"learning_rate": 9.998157568654259e-05,
"loss": 1.9106,
"step": 300
},
{
"epoch": 0.10442955356365852,
"eval_loss": 1.8868601322174072,
"eval_runtime": 14.0214,
"eval_samples_per_second": 71.319,
"eval_steps_per_second": 2.282,
"step": 300
},
{
"epoch": 0.12183447915760161,
"grad_norm": 1.2760571472974125,
"learning_rate": 9.997343418835142e-05,
"loss": 1.8861,
"step": 350
},
{
"epoch": 0.1392394047515447,
"grad_norm": 3.7430921365305005,
"learning_rate": 9.996380759138595e-05,
"loss": 1.8622,
"step": 400
},
{
"epoch": 0.1392394047515447,
"eval_loss": 1.8383088111877441,
"eval_runtime": 14.0403,
"eval_samples_per_second": 71.224,
"eval_steps_per_second": 2.279,
"step": 400
},
{
"epoch": 0.15664433034548778,
"grad_norm": 0.9586199143769504,
"learning_rate": 9.995269618461844e-05,
"loss": 1.8478,
"step": 450
},
{
"epoch": 0.17404925593943085,
"grad_norm": 1.5061363604288809,
"learning_rate": 9.99401003015922e-05,
"loss": 1.8117,
"step": 500
},
{
"epoch": 0.17404925593943085,
"eval_loss": 1.7942754030227661,
"eval_runtime": 14.0551,
"eval_samples_per_second": 71.149,
"eval_steps_per_second": 2.277,
"step": 500
},
{
"epoch": 0.19145418153337396,
"grad_norm": 2.433096036143497,
"learning_rate": 9.992602032041181e-05,
"loss": 1.8071,
"step": 550
},
{
"epoch": 0.20885910712731703,
"grad_norm": 3.0865325369733796,
"learning_rate": 9.991045666373163e-05,
"loss": 1.7895,
"step": 600
},
{
"epoch": 0.20885910712731703,
"eval_loss": 1.7960834503173828,
"eval_runtime": 14.0575,
"eval_samples_per_second": 71.136,
"eval_steps_per_second": 2.276,
"step": 600
},
{
"epoch": 0.2262640327212601,
"grad_norm": 1.0844936845986464,
"learning_rate": 9.989340979874317e-05,
"loss": 1.7954,
"step": 650
},
{
"epoch": 0.24366895831520322,
"grad_norm": 1.5841141135500758,
"learning_rate": 9.987488023716102e-05,
"loss": 1.7827,
"step": 700
},
{
"epoch": 0.24366895831520322,
"eval_loss": 1.7635940313339233,
"eval_runtime": 14.0192,
"eval_samples_per_second": 71.331,
"eval_steps_per_second": 2.283,
"step": 700
},
{
"epoch": 0.26107388390914626,
"grad_norm": 1.0358659820491174,
"learning_rate": 9.985486853520748e-05,
"loss": 1.7755,
"step": 750
},
{
"epoch": 0.2784788095030894,
"grad_norm": 2.921637768400008,
"learning_rate": 9.983337529359597e-05,
"loss": 1.7689,
"step": 800
},
{
"epoch": 0.2784788095030894,
"eval_loss": 1.7601885795593262,
"eval_runtime": 14.0539,
"eval_samples_per_second": 71.155,
"eval_steps_per_second": 2.277,
"step": 800
},
{
"epoch": 0.2958837350970325,
"grad_norm": 1.2109225200496343,
"learning_rate": 9.981040115751287e-05,
"loss": 1.7642,
"step": 850
},
{
"epoch": 0.31328866069097555,
"grad_norm": 0.9168156411457995,
"learning_rate": 9.978594681659822e-05,
"loss": 1.7584,
"step": 900
},
{
"epoch": 0.31328866069097555,
"eval_loss": 1.733883023262024,
"eval_runtime": 14.0773,
"eval_samples_per_second": 71.036,
"eval_steps_per_second": 2.273,
"step": 900
},
{
"epoch": 0.33069358628491863,
"grad_norm": 1.231514474527335,
"learning_rate": 9.976001300492505e-05,
"loss": 1.7476,
"step": 950
},
{
"epoch": 0.3480985118788617,
"grad_norm": 1.3345795373547282,
"learning_rate": 9.97326005009772e-05,
"loss": 1.7529,
"step": 1000
},
{
"epoch": 0.3480985118788617,
"eval_loss": 1.7277562618255615,
"eval_runtime": 14.0558,
"eval_samples_per_second": 71.145,
"eval_steps_per_second": 2.277,
"step": 1000
},
{
"epoch": 0.3655034374728048,
"grad_norm": 1.3848562250830305,
"learning_rate": 9.970371012762615e-05,
"loss": 1.7383,
"step": 1050
},
{
"epoch": 0.3829083630667479,
"grad_norm": 0.9102870237217918,
"learning_rate": 9.967334275210616e-05,
"loss": 1.7312,
"step": 1100
},
{
"epoch": 0.3829083630667479,
"eval_loss": 1.7197346687316895,
"eval_runtime": 14.1392,
"eval_samples_per_second": 70.725,
"eval_steps_per_second": 2.263,
"step": 1100
},
{
"epoch": 0.400313288660691,
"grad_norm": 0.9403631790293067,
"learning_rate": 9.964149928598834e-05,
"loss": 1.7354,
"step": 1150
},
{
"epoch": 0.41771821425463407,
"grad_norm": 1.5743784967989016,
"learning_rate": 9.96081806851532e-05,
"loss": 1.7384,
"step": 1200
},
{
"epoch": 0.41771821425463407,
"eval_loss": 1.7184182405471802,
"eval_runtime": 14.0812,
"eval_samples_per_second": 71.017,
"eval_steps_per_second": 2.273,
"step": 1200
},
{
"epoch": 0.43512313984857715,
"grad_norm": 2.0859941497191743,
"learning_rate": 9.957338794976201e-05,
"loss": 1.7389,
"step": 1250
},
{
"epoch": 0.4525280654425202,
"grad_norm": 1.347628827751819,
"learning_rate": 9.953712212422681e-05,
"loss": 1.7267,
"step": 1300
},
{
"epoch": 0.4525280654425202,
"eval_loss": 1.7116867303848267,
"eval_runtime": 14.0496,
"eval_samples_per_second": 71.177,
"eval_steps_per_second": 2.278,
"step": 1300
},
{
"epoch": 0.4699329910364633,
"grad_norm": 0.7775683989797394,
"learning_rate": 9.949938429717895e-05,
"loss": 1.7136,
"step": 1350
},
{
"epoch": 0.48733791663040643,
"grad_norm": 0.7955090270012505,
"learning_rate": 9.946017560143651e-05,
"loss": 1.7188,
"step": 1400
},
{
"epoch": 0.48733791663040643,
"eval_loss": 1.703679084777832,
"eval_runtime": 14.077,
"eval_samples_per_second": 71.038,
"eval_steps_per_second": 2.273,
"step": 1400
},
{
"epoch": 0.5047428422243495,
"grad_norm": 1.2332760111187542,
"learning_rate": 9.941949721397028e-05,
"loss": 1.7169,
"step": 1450
},
{
"epoch": 0.5221477678182925,
"grad_norm": 1.81355607393607,
"learning_rate": 9.93773503558684e-05,
"loss": 1.7157,
"step": 1500
},
{
"epoch": 0.5221477678182925,
"eval_loss": 1.7078830003738403,
"eval_runtime": 14.09,
"eval_samples_per_second": 70.972,
"eval_steps_per_second": 2.271,
"step": 1500
},
{
"epoch": 0.5395526934122357,
"grad_norm": 0.9053542478773059,
"learning_rate": 9.933373629229969e-05,
"loss": 1.7102,
"step": 1550
},
{
"epoch": 0.5569576190061788,
"grad_norm": 0.6503277295238644,
"learning_rate": 9.928865633247573e-05,
"loss": 1.7033,
"step": 1600
},
{
"epoch": 0.5569576190061788,
"eval_loss": 1.6917779445648193,
"eval_runtime": 14.0698,
"eval_samples_per_second": 71.074,
"eval_steps_per_second": 2.274,
"step": 1600
},
{
"epoch": 0.5743625446001218,
"grad_norm": 1.4224490096345375,
"learning_rate": 9.92421118296115e-05,
"loss": 1.6997,
"step": 1650
},
{
"epoch": 0.591767470194065,
"grad_norm": 0.7864420926166752,
"learning_rate": 9.919410418088481e-05,
"loss": 1.7102,
"step": 1700
},
{
"epoch": 0.591767470194065,
"eval_loss": 1.690305233001709,
"eval_runtime": 14.1062,
"eval_samples_per_second": 70.891,
"eval_steps_per_second": 2.269,
"step": 1700
},
{
"epoch": 0.609172395788008,
"grad_norm": 0.5663590518834145,
"learning_rate": 9.914463482739435e-05,
"loss": 1.7046,
"step": 1750
},
{
"epoch": 0.6265773213819511,
"grad_norm": 1.1145025421986445,
"learning_rate": 9.909370525411637e-05,
"loss": 1.6905,
"step": 1800
},
{
"epoch": 0.6265773213819511,
"eval_loss": 1.6856919527053833,
"eval_runtime": 14.0345,
"eval_samples_per_second": 71.253,
"eval_steps_per_second": 2.28,
"step": 1800
},
{
"epoch": 0.6439822469758941,
"grad_norm": 1.079593642429848,
"learning_rate": 9.90413169898602e-05,
"loss": 1.6973,
"step": 1850
},
{
"epoch": 0.6613871725698373,
"grad_norm": 0.8794305699903086,
"learning_rate": 9.898747160722229e-05,
"loss": 1.6821,
"step": 1900
},
{
"epoch": 0.6613871725698373,
"eval_loss": 1.680002212524414,
"eval_runtime": 14.0923,
"eval_samples_per_second": 70.961,
"eval_steps_per_second": 2.271,
"step": 1900
},
{
"epoch": 0.6787920981637804,
"grad_norm": 1.3664190261530837,
"learning_rate": 9.893217072253903e-05,
"loss": 1.6909,
"step": 1950
},
{
"epoch": 0.6961970237577234,
"grad_norm": 0.9268231360918758,
"learning_rate": 9.88754159958382e-05,
"loss": 1.6901,
"step": 2000
},
{
"epoch": 0.6961970237577234,
"eval_loss": 1.6765377521514893,
"eval_runtime": 14.0942,
"eval_samples_per_second": 70.951,
"eval_steps_per_second": 2.27,
"step": 2000
},
{
"epoch": 0.7136019493516665,
"grad_norm": 0.9864416812238661,
"learning_rate": 9.881720913078921e-05,
"loss": 1.6911,
"step": 2050
},
{
"epoch": 0.7310068749456096,
"grad_norm": 0.8706035984933645,
"learning_rate": 9.875755187465186e-05,
"loss": 1.6866,
"step": 2100
},
{
"epoch": 0.7310068749456096,
"eval_loss": 1.675471305847168,
"eval_runtime": 14.0392,
"eval_samples_per_second": 71.229,
"eval_steps_per_second": 2.279,
"step": 2100
},
{
"epoch": 0.7484118005395527,
"grad_norm": 0.9954026204157976,
"learning_rate": 9.869644601822396e-05,
"loss": 1.6764,
"step": 2150
},
{
"epoch": 0.7658167261334958,
"grad_norm": 0.9859776473729975,
"learning_rate": 9.863389339578761e-05,
"loss": 1.6772,
"step": 2200
},
{
"epoch": 0.7658167261334958,
"eval_loss": 1.6698520183563232,
"eval_runtime": 14.0605,
"eval_samples_per_second": 71.121,
"eval_steps_per_second": 2.276,
"step": 2200
},
{
"epoch": 0.7832216517274389,
"grad_norm": 0.9106273220771831,
"learning_rate": 9.856989588505399e-05,
"loss": 1.6796,
"step": 2250
},
{
"epoch": 0.800626577321382,
"grad_norm": 1.1219788313484198,
"learning_rate": 9.850445540710714e-05,
"loss": 1.6742,
"step": 2300
},
{
"epoch": 0.800626577321382,
"eval_loss": 1.663262963294983,
"eval_runtime": 14.0171,
"eval_samples_per_second": 71.341,
"eval_steps_per_second": 2.283,
"step": 2300
},
{
"epoch": 0.818031502915325,
"grad_norm": 0.7584108888049894,
"learning_rate": 9.843757392634629e-05,
"loss": 1.6773,
"step": 2350
},
{
"epoch": 0.8354364285092681,
"grad_norm": 0.8524680066268957,
"learning_rate": 9.836925345042675e-05,
"loss": 1.6802,
"step": 2400
},
{
"epoch": 0.8354364285092681,
"eval_loss": 1.6637836694717407,
"eval_runtime": 14.0523,
"eval_samples_per_second": 71.163,
"eval_steps_per_second": 2.277,
"step": 2400
},
{
"epoch": 0.8528413541032112,
"grad_norm": 0.5830225213698085,
"learning_rate": 9.82994960301998e-05,
"loss": 1.6774,
"step": 2450
},
{
"epoch": 0.8702462796971543,
"grad_norm": 0.8294538008262156,
"learning_rate": 9.822830375965103e-05,
"loss": 1.6702,
"step": 2500
},
{
"epoch": 0.8702462796971543,
"eval_loss": 1.6600449085235596,
"eval_runtime": 14.0902,
"eval_samples_per_second": 70.972,
"eval_steps_per_second": 2.271,
"step": 2500
},
{
"epoch": 0.8876512052910974,
"grad_norm": 1.0515224700476833,
"learning_rate": 9.815567877583758e-05,
"loss": 1.6758,
"step": 2550
},
{
"epoch": 0.9050561308850404,
"grad_norm": 0.7866141842693181,
"learning_rate": 9.808162325882385e-05,
"loss": 1.6645,
"step": 2600
},
{
"epoch": 0.9050561308850404,
"eval_loss": 1.657778263092041,
"eval_runtime": 14.0817,
"eval_samples_per_second": 71.014,
"eval_steps_per_second": 2.272,
"step": 2600
},
{
"epoch": 0.9224610564789836,
"grad_norm": 0.9909101251943951,
"learning_rate": 9.800613943161619e-05,
"loss": 1.6629,
"step": 2650
},
{
"epoch": 0.9398659820729266,
"grad_norm": 0.9534991636209588,
"learning_rate": 9.79292295600961e-05,
"loss": 1.6523,
"step": 2700
},
{
"epoch": 0.9398659820729266,
"eval_loss": 1.6576528549194336,
"eval_runtime": 14.0341,
"eval_samples_per_second": 71.255,
"eval_steps_per_second": 2.28,
"step": 2700
},
{
"epoch": 0.9572709076668697,
"grad_norm": 0.7620610436511178,
"learning_rate": 9.785089595295222e-05,
"loss": 1.6573,
"step": 2750
},
{
"epoch": 0.9746758332608129,
"grad_norm": 1.5752171211110084,
"learning_rate": 9.777114096161105e-05,
"loss": 1.6583,
"step": 2800
},
{
"epoch": 0.9746758332608129,
"eval_loss": 1.6622099876403809,
"eval_runtime": 14.0927,
"eval_samples_per_second": 70.959,
"eval_steps_per_second": 2.271,
"step": 2800
},
{
"epoch": 0.9920807588547559,
"grad_norm": 0.5970970379963504,
"learning_rate": 9.768996698016636e-05,
"loss": 1.6625,
"step": 2850
},
{
"epoch": 1.009746758332608,
"grad_norm": 0.7470976369983713,
"learning_rate": 9.760737644530726e-05,
"loss": 1.6597,
"step": 2900
},
{
"epoch": 1.009746758332608,
"eval_loss": 1.647603988647461,
"eval_runtime": 14.12,
"eval_samples_per_second": 70.822,
"eval_steps_per_second": 2.266,
"step": 2900
},
{
"epoch": 1.0271516839265513,
"grad_norm": 0.962160586071795,
"learning_rate": 9.75233718362452e-05,
"loss": 1.611,
"step": 2950
},
{
"epoch": 1.0445566095204943,
"grad_norm": 0.6386050774526276,
"learning_rate": 9.74379556746394e-05,
"loss": 1.619,
"step": 3000
},
{
"epoch": 1.0445566095204943,
"eval_loss": 1.6434565782546997,
"eval_runtime": 14.0132,
"eval_samples_per_second": 71.361,
"eval_steps_per_second": 2.284,
"step": 3000
},
{
"epoch": 1.0619615351144374,
"grad_norm": 1.5569952795665942,
"learning_rate": 9.735113052452119e-05,
"loss": 1.6108,
"step": 3050
},
{
"epoch": 1.0793664607083804,
"grad_norm": 1.223444554102184,
"learning_rate": 9.726289899221713e-05,
"loss": 1.6242,
"step": 3100
},
{
"epoch": 1.0793664607083804,
"eval_loss": 1.6534233093261719,
"eval_runtime": 14.0914,
"eval_samples_per_second": 70.965,
"eval_steps_per_second": 2.271,
"step": 3100
},
{
"epoch": 1.0967713863023236,
"grad_norm": 0.6055563672851731,
"learning_rate": 9.717326372627065e-05,
"loss": 1.6165,
"step": 3150
},
{
"epoch": 1.1141763118962666,
"grad_norm": 0.7125630072846985,
"learning_rate": 9.708222741736268e-05,
"loss": 1.6137,
"step": 3200
},
{
"epoch": 1.1141763118962666,
"eval_loss": 1.6405473947525024,
"eval_runtime": 14.0433,
"eval_samples_per_second": 71.208,
"eval_steps_per_second": 2.279,
"step": 3200
},
{
"epoch": 1.1315812374902097,
"grad_norm": 0.6828372843368237,
"learning_rate": 9.698979279823071e-05,
"loss": 1.6178,
"step": 3250
},
{
"epoch": 1.148986163084153,
"grad_norm": 0.6458088716811551,
"learning_rate": 9.689596264358694e-05,
"loss": 1.6057,
"step": 3300
},
{
"epoch": 1.148986163084153,
"eval_loss": 1.6405302286148071,
"eval_runtime": 14.0715,
"eval_samples_per_second": 71.065,
"eval_steps_per_second": 2.274,
"step": 3300
},
{
"epoch": 1.166391088678096,
"grad_norm": 0.7900271544609745,
"learning_rate": 9.680073977003483e-05,
"loss": 1.6031,
"step": 3350
},
{
"epoch": 1.183796014272039,
"grad_norm": 1.0152045530917768,
"learning_rate": 9.670412703598469e-05,
"loss": 1.6117,
"step": 3400
},
{
"epoch": 1.183796014272039,
"eval_loss": 1.639701247215271,
"eval_runtime": 14.0316,
"eval_samples_per_second": 71.268,
"eval_steps_per_second": 2.281,
"step": 3400
},
{
"epoch": 1.201200939865982,
"grad_norm": 0.7302281809291777,
"learning_rate": 9.660612734156777e-05,
"loss": 1.6027,
"step": 3450
},
{
"epoch": 1.2186058654599252,
"grad_norm": 0.6755389511951054,
"learning_rate": 9.650674362854923e-05,
"loss": 1.6227,
"step": 3500
},
{
"epoch": 1.2186058654599252,
"eval_loss": 1.633447289466858,
"eval_runtime": 14.0553,
"eval_samples_per_second": 71.148,
"eval_steps_per_second": 2.277,
"step": 3500
},
{
"epoch": 1.2360107910538682,
"grad_norm": 0.4958097178118167,
"learning_rate": 9.640597888023988e-05,
"loss": 1.6039,
"step": 3550
},
{
"epoch": 1.2534157166478113,
"grad_norm": 0.842473221384133,
"learning_rate": 9.630383612140661e-05,
"loss": 1.6105,
"step": 3600
},
{
"epoch": 1.2534157166478113,
"eval_loss": 1.6299790143966675,
"eval_runtime": 14.0661,
"eval_samples_per_second": 71.093,
"eval_steps_per_second": 2.275,
"step": 3600
},
{
"epoch": 1.2708206422417545,
"grad_norm": 0.5547865266670734,
"learning_rate": 9.62003184181815e-05,
"loss": 1.608,
"step": 3650
},
{
"epoch": 1.2882255678356975,
"grad_norm": 0.5625620088465835,
"learning_rate": 9.609542887796993e-05,
"loss": 1.6141,
"step": 3700
},
{
"epoch": 1.2882255678356975,
"eval_loss": 1.6269824504852295,
"eval_runtime": 14.0355,
"eval_samples_per_second": 71.248,
"eval_steps_per_second": 2.28,
"step": 3700
},
{
"epoch": 1.3056304934296405,
"grad_norm": 0.9870933875943105,
"learning_rate": 9.598917064935719e-05,
"loss": 1.6045,
"step": 3750
},
{
"epoch": 1.3230354190235838,
"grad_norm": 0.6538637454488698,
"learning_rate": 9.5881546922014e-05,
"loss": 1.601,
"step": 3800
},
{
"epoch": 1.3230354190235838,
"eval_loss": 1.626142144203186,
"eval_runtime": 14.1089,
"eval_samples_per_second": 70.877,
"eval_steps_per_second": 2.268,
"step": 3800
},
{
"epoch": 1.3404403446175268,
"grad_norm": 0.6547060258929419,
"learning_rate": 9.57725609266008e-05,
"loss": 1.6066,
"step": 3850
},
{
"epoch": 1.3578452702114698,
"grad_norm": 0.9358458562600437,
"learning_rate": 9.566221593467069e-05,
"loss": 1.6221,
"step": 3900
},
{
"epoch": 1.3578452702114698,
"eval_loss": 1.627410888671875,
"eval_runtime": 14.088,
"eval_samples_per_second": 70.982,
"eval_steps_per_second": 2.271,
"step": 3900
},
{
"epoch": 1.3752501958054129,
"grad_norm": 0.8129191474694835,
"learning_rate": 9.555051525857134e-05,
"loss": 1.5996,
"step": 3950
},
{
"epoch": 1.392655121399356,
"grad_norm": 0.6824919031119797,
"learning_rate": 9.54374622513454e-05,
"loss": 1.6101,
"step": 4000
},
{
"epoch": 1.392655121399356,
"eval_loss": 1.6165417432785034,
"eval_runtime": 14.0492,
"eval_samples_per_second": 71.179,
"eval_steps_per_second": 2.278,
"step": 4000
},
{
"epoch": 1.4100600469932991,
"grad_norm": 0.9330542502271321,
"learning_rate": 9.532306030663e-05,
"loss": 1.5958,
"step": 4050
},
{
"epoch": 1.4274649725872421,
"grad_norm": 0.6438330837104954,
"learning_rate": 9.520731285855482e-05,
"loss": 1.599,
"step": 4100
},
{
"epoch": 1.4274649725872421,
"eval_loss": 1.6210800409317017,
"eval_runtime": 14.0932,
"eval_samples_per_second": 70.956,
"eval_steps_per_second": 2.271,
"step": 4100
},
{
"epoch": 1.4448698981811852,
"grad_norm": 0.9636631744898069,
"learning_rate": 9.509022338163896e-05,
"loss": 1.5955,
"step": 4150
},
{
"epoch": 1.4622748237751284,
"grad_norm": 0.5569273625801461,
"learning_rate": 9.497179539068673e-05,
"loss": 1.6007,
"step": 4200
},
{
"epoch": 1.4622748237751284,
"eval_loss": 1.6149400472640991,
"eval_runtime": 14.0717,
"eval_samples_per_second": 71.064,
"eval_steps_per_second": 2.274,
"step": 4200
},
{
"epoch": 1.4796797493690714,
"grad_norm": 0.5160141243848255,
"learning_rate": 9.485203244068202e-05,
"loss": 1.5926,
"step": 4250
},
{
"epoch": 1.4970846749630145,
"grad_norm": 0.48151772986247815,
"learning_rate": 9.473093812668182e-05,
"loss": 1.5936,
"step": 4300
},
{
"epoch": 1.4970846749630145,
"eval_loss": 1.6123466491699219,
"eval_runtime": 14.0881,
"eval_samples_per_second": 70.982,
"eval_steps_per_second": 2.271,
"step": 4300
},
{
"epoch": 1.5144896005569577,
"grad_norm": 1.1271863223922003,
"learning_rate": 9.460851608370794e-05,
"loss": 1.6012,
"step": 4350
},
{
"epoch": 1.5318945261509007,
"grad_norm": 0.8558669669849335,
"learning_rate": 9.448476998663825e-05,
"loss": 1.605,
"step": 4400
},
{
"epoch": 1.5318945261509007,
"eval_loss": 1.6140981912612915,
"eval_runtime": 14.1256,
"eval_samples_per_second": 70.793,
"eval_steps_per_second": 2.265,
"step": 4400
},
{
"epoch": 1.5492994517448437,
"grad_norm": 0.7276127450869437,
"learning_rate": 9.435970355009615e-05,
"loss": 1.5938,
"step": 4450
},
{
"epoch": 1.566704377338787,
"grad_norm": 0.6065688198096086,
"learning_rate": 9.423332052833916e-05,
"loss": 1.5946,
"step": 4500
},
{
"epoch": 1.566704377338787,
"eval_loss": 1.611683964729309,
"eval_runtime": 14.0436,
"eval_samples_per_second": 71.207,
"eval_steps_per_second": 2.279,
"step": 4500
},
{
"epoch": 1.58410930293273,
"grad_norm": 0.7748024258482299,
"learning_rate": 9.410562471514616e-05,
"loss": 1.5894,
"step": 4550
},
{
"epoch": 1.601514228526673,
"grad_norm": 0.48917881847751543,
"learning_rate": 9.397661994370357e-05,
"loss": 1.5877,
"step": 4600
},
{
"epoch": 1.601514228526673,
"eval_loss": 1.6069624423980713,
"eval_runtime": 14.0735,
"eval_samples_per_second": 71.056,
"eval_steps_per_second": 2.274,
"step": 4600
},
{
"epoch": 1.6189191541206163,
"grad_norm": 0.8166564830453485,
"learning_rate": 9.384631008649027e-05,
"loss": 1.5875,
"step": 4650
},
{
"epoch": 1.636324079714559,
"grad_norm": 0.9485787011897893,
"learning_rate": 9.371469905516128e-05,
"loss": 1.5926,
"step": 4700
},
{
"epoch": 1.636324079714559,
"eval_loss": 1.6103551387786865,
"eval_runtime": 14.0489,
"eval_samples_per_second": 71.18,
"eval_steps_per_second": 2.278,
"step": 4700
},
{
"epoch": 1.6537290053085023,
"grad_norm": 0.6608190035209371,
"learning_rate": 9.358179080043047e-05,
"loss": 1.5852,
"step": 4750
},
{
"epoch": 1.6711339309024456,
"grad_norm": 0.5091041850584289,
"learning_rate": 9.344758931195186e-05,
"loss": 1.5818,
"step": 4800
},
{
"epoch": 1.6711339309024456,
"eval_loss": 1.6055699586868286,
"eval_runtime": 14.0386,
"eval_samples_per_second": 71.232,
"eval_steps_per_second": 2.279,
"step": 4800
},
{
"epoch": 1.6885388564963884,
"grad_norm": 0.4809752811498165,
"learning_rate": 9.331209861819991e-05,
"loss": 1.5945,
"step": 4850
},
{
"epoch": 1.7059437820903316,
"grad_norm": 1.16696044120828,
"learning_rate": 9.31753227863486e-05,
"loss": 1.5906,
"step": 4900
},
{
"epoch": 1.7059437820903316,
"eval_loss": 1.602495551109314,
"eval_runtime": 14.0638,
"eval_samples_per_second": 71.104,
"eval_steps_per_second": 2.275,
"step": 4900
},
{
"epoch": 1.7233487076842746,
"grad_norm": 0.7703478252526429,
"learning_rate": 9.303726592214927e-05,
"loss": 1.5759,
"step": 4950
},
{
"epoch": 1.7407536332782176,
"grad_norm": 0.4326591794595183,
"learning_rate": 9.289793216980748e-05,
"loss": 1.589,
"step": 5000
},
{
"epoch": 1.7407536332782176,
"eval_loss": 1.598211646080017,
"eval_runtime": 14.05,
"eval_samples_per_second": 71.174,
"eval_steps_per_second": 2.278,
"step": 5000
},
{
"epoch": 1.7581585588721609,
"grad_norm": 0.9150661442715593,
"learning_rate": 9.275732571185852e-05,
"loss": 1.5925,
"step": 5050
},
{
"epoch": 1.775563484466104,
"grad_norm": 0.4835138015080412,
"learning_rate": 9.261545076904189e-05,
"loss": 1.587,
"step": 5100
},
{
"epoch": 1.775563484466104,
"eval_loss": 1.5962464809417725,
"eval_runtime": 14.0435,
"eval_samples_per_second": 71.207,
"eval_steps_per_second": 2.279,
"step": 5100
},
{
"epoch": 1.792968410060047,
"grad_norm": 0.8246740616874354,
"learning_rate": 9.247231160017462e-05,
"loss": 1.5845,
"step": 5150
},
{
"epoch": 1.8103733356539902,
"grad_norm": 0.7636936218440887,
"learning_rate": 9.232791250202342e-05,
"loss": 1.5789,
"step": 5200
},
{
"epoch": 1.8103733356539902,
"eval_loss": 1.5962697267532349,
"eval_runtime": 14.0724,
"eval_samples_per_second": 71.061,
"eval_steps_per_second": 2.274,
"step": 5200
},
{
"epoch": 1.8277782612479332,
"grad_norm": 0.5278061111679693,
"learning_rate": 9.218225780917564e-05,
"loss": 1.5784,
"step": 5250
},
{
"epoch": 1.8451831868418762,
"grad_norm": 0.5521436007234811,
"learning_rate": 9.203535189390927e-05,
"loss": 1.5859,
"step": 5300
},
{
"epoch": 1.8451831868418762,
"eval_loss": 1.589383840560913,
"eval_runtime": 14.0972,
"eval_samples_per_second": 70.936,
"eval_steps_per_second": 2.27,
"step": 5300
},
{
"epoch": 1.8625881124358195,
"grad_norm": 0.9153838912238841,
"learning_rate": 9.188719916606157e-05,
"loss": 1.5767,
"step": 5350
},
{
"epoch": 1.8799930380297625,
"grad_norm": 0.5869179835862129,
"learning_rate": 9.17378040728968e-05,
"loss": 1.5771,
"step": 5400
},
{
"epoch": 1.8799930380297625,
"eval_loss": 1.5878838300704956,
"eval_runtime": 14.0586,
"eval_samples_per_second": 71.131,
"eval_steps_per_second": 2.276,
"step": 5400
},
{
"epoch": 1.8973979636237055,
"grad_norm": 0.8157168714834181,
"learning_rate": 9.158717109897263e-05,
"loss": 1.5626,
"step": 5450
},
{
"epoch": 1.9148028892176487,
"grad_norm": 0.7455391308200009,
"learning_rate": 9.14353047660056e-05,
"loss": 1.5651,
"step": 5500
},
{
"epoch": 1.9148028892176487,
"eval_loss": 1.5843595266342163,
"eval_runtime": 14.0141,
"eval_samples_per_second": 71.356,
"eval_steps_per_second": 2.283,
"step": 5500
},
{
"epoch": 1.9322078148115915,
"grad_norm": 0.48742202866618534,
"learning_rate": 9.128220963273532e-05,
"loss": 1.5806,
"step": 5550
},
{
"epoch": 1.9496127404055348,
"grad_norm": 0.49018002201797567,
"learning_rate": 9.112789029478769e-05,
"loss": 1.5715,
"step": 5600
},
{
"epoch": 1.9496127404055348,
"eval_loss": 1.583487868309021,
"eval_runtime": 14.076,
"eval_samples_per_second": 71.043,
"eval_steps_per_second": 2.273,
"step": 5600
},
{
"epoch": 1.9670176659994778,
"grad_norm": 0.7730233396950769,
"learning_rate": 9.097235138453689e-05,
"loss": 1.5762,
"step": 5650
},
{
"epoch": 1.9844225915934208,
"grad_norm": 0.5303157923715942,
"learning_rate": 9.081559757096637e-05,
"loss": 1.5656,
"step": 5700
},
{
"epoch": 1.9844225915934208,
"eval_loss": 1.5835527181625366,
"eval_runtime": 14.0959,
"eval_samples_per_second": 70.942,
"eval_steps_per_second": 2.27,
"step": 5700
},
{
"epoch": 2.0020885910712733,
"grad_norm": 1.0260303687016545,
"learning_rate": 9.065763355952868e-05,
"loss": 1.5804,
"step": 5750
},
{
"epoch": 2.019493516665216,
"grad_norm": 0.618811524236402,
"learning_rate": 9.049846409200417e-05,
"loss": 1.4968,
"step": 5800
},
{
"epoch": 2.019493516665216,
"eval_loss": 1.5831753015518188,
"eval_runtime": 14.0889,
"eval_samples_per_second": 70.978,
"eval_steps_per_second": 2.271,
"step": 5800
},
{
"epoch": 2.0368984422591594,
"grad_norm": 0.613529897165403,
"learning_rate": 9.033809394635874e-05,
"loss": 1.5022,
"step": 5850
},
{
"epoch": 2.0543033678531026,
"grad_norm": 0.5015341058830712,
"learning_rate": 9.017652793660039e-05,
"loss": 1.4978,
"step": 5900
},
{
"epoch": 2.0543033678531026,
"eval_loss": 1.5814894437789917,
"eval_runtime": 14.0786,
"eval_samples_per_second": 71.03,
"eval_steps_per_second": 2.273,
"step": 5900
},
{
"epoch": 2.0717082934470454,
"grad_norm": 0.6230434882603811,
"learning_rate": 9.001377091263465e-05,
"loss": 1.4918,
"step": 5950
},
{
"epoch": 2.0891132190409887,
"grad_norm": 0.5236681791053263,
"learning_rate": 8.984982776011906e-05,
"loss": 1.4916,
"step": 6000
},
{
"epoch": 2.0891132190409887,
"eval_loss": 1.5761847496032715,
"eval_runtime": 14.0503,
"eval_samples_per_second": 71.173,
"eval_steps_per_second": 2.278,
"step": 6000
},
{
"epoch": 2.1065181446349315,
"grad_norm": 0.5349428126602861,
"learning_rate": 8.96847034003165e-05,
"loss": 1.4917,
"step": 6050
},
{
"epoch": 2.1239230702288747,
"grad_norm": 0.6861287366848919,
"learning_rate": 8.951840278994747e-05,
"loss": 1.4866,
"step": 6100
},
{
"epoch": 2.1239230702288747,
"eval_loss": 1.5749881267547607,
"eval_runtime": 14.056,
"eval_samples_per_second": 71.144,
"eval_steps_per_second": 2.277,
"step": 6100
},
{
"epoch": 2.141327995822818,
"grad_norm": 0.6483285157830946,
"learning_rate": 8.935093092104121e-05,
"loss": 1.4962,
"step": 6150
},
{
"epoch": 2.1587329214167608,
"grad_norm": 0.8142430709313933,
"learning_rate": 8.9182292820786e-05,
"loss": 1.4854,
"step": 6200
},
{
"epoch": 2.1587329214167608,
"eval_loss": 1.5689362287521362,
"eval_runtime": 14.1096,
"eval_samples_per_second": 70.874,
"eval_steps_per_second": 2.268,
"step": 6200
},
{
"epoch": 2.176137847010704,
"grad_norm": 1.1670624800116283,
"learning_rate": 8.901249355137816e-05,
"loss": 1.486,
"step": 6250
},
{
"epoch": 2.1935427726046473,
"grad_norm": 0.5523979614193149,
"learning_rate": 8.884153820987008e-05,
"loss": 1.4975,
"step": 6300
},
{
"epoch": 2.1935427726046473,
"eval_loss": 1.5690404176712036,
"eval_runtime": 14.0346,
"eval_samples_per_second": 71.252,
"eval_steps_per_second": 2.28,
"step": 6300
},
{
"epoch": 2.21094769819859,
"grad_norm": 0.5960284645659393,
"learning_rate": 8.866943192801729e-05,
"loss": 1.5085,
"step": 6350
},
{
"epoch": 2.2283526237925333,
"grad_norm": 0.5896589247735479,
"learning_rate": 8.84961798721243e-05,
"loss": 1.4815,
"step": 6400
},
{
"epoch": 2.2283526237925333,
"eval_loss": 1.5669183731079102,
"eval_runtime": 14.0717,
"eval_samples_per_second": 71.065,
"eval_steps_per_second": 2.274,
"step": 6400
},
{
"epoch": 2.2457575493864765,
"grad_norm": 0.6944971656499748,
"learning_rate": 8.832178724288966e-05,
"loss": 1.4901,
"step": 6450
},
{
"epoch": 2.2631624749804193,
"grad_norm": 0.48925376877273175,
"learning_rate": 8.814625927524973e-05,
"loss": 1.486,
"step": 6500
},
{
"epoch": 2.2631624749804193,
"eval_loss": 1.5606794357299805,
"eval_runtime": 14.1014,
"eval_samples_per_second": 70.915,
"eval_steps_per_second": 2.269,
"step": 6500
},
{
"epoch": 2.2805674005743626,
"grad_norm": 0.7017723269145159,
"learning_rate": 8.79696012382216e-05,
"loss": 1.4921,
"step": 6550
},
{
"epoch": 2.297972326168306,
"grad_norm": 0.5451878167122939,
"learning_rate": 8.779181843474488e-05,
"loss": 1.4952,
"step": 6600
},
{
"epoch": 2.297972326168306,
"eval_loss": 1.561612606048584,
"eval_runtime": 14.0563,
"eval_samples_per_second": 71.143,
"eval_steps_per_second": 2.277,
"step": 6600
},
{
"epoch": 2.3153772517622486,
"grad_norm": 0.7029034750125012,
"learning_rate": 8.761291620152251e-05,
"loss": 1.49,
"step": 6650
},
{
"epoch": 2.332782177356192,
"grad_norm": 0.6320953342648652,
"learning_rate": 8.743289990886069e-05,
"loss": 1.4965,
"step": 6700
},
{
"epoch": 2.332782177356192,
"eval_loss": 1.5618507862091064,
"eval_runtime": 14.3775,
"eval_samples_per_second": 69.553,
"eval_steps_per_second": 2.226,
"step": 6700
},
{
"epoch": 2.3501871029501347,
"grad_norm": 0.44962999706711637,
"learning_rate": 8.725177496050746e-05,
"loss": 1.4956,
"step": 6750
},
{
"epoch": 2.367592028544078,
"grad_norm": 0.5296736885871586,
"learning_rate": 8.706954679349071e-05,
"loss": 1.4836,
"step": 6800
},
{
"epoch": 2.367592028544078,
"eval_loss": 1.5586892366409302,
"eval_runtime": 14.0268,
"eval_samples_per_second": 71.292,
"eval_steps_per_second": 2.281,
"step": 6800
},
{
"epoch": 2.384996954138021,
"grad_norm": 0.5506512514989426,
"learning_rate": 8.688622087795476e-05,
"loss": 1.4795,
"step": 6850
},
{
"epoch": 2.402401879731964,
"grad_norm": 0.6654665155879538,
"learning_rate": 8.670180271699632e-05,
"loss": 1.4741,
"step": 6900
},
{
"epoch": 2.402401879731964,
"eval_loss": 1.5594490766525269,
"eval_runtime": 14.0756,
"eval_samples_per_second": 71.045,
"eval_steps_per_second": 2.273,
"step": 6900
},
{
"epoch": 2.419806805325907,
"grad_norm": 0.6887037066063108,
"learning_rate": 8.651629784649924e-05,
"loss": 1.483,
"step": 6950
},
{
"epoch": 2.4372117309198504,
"grad_norm": 0.689800498648978,
"learning_rate": 8.632971183496832e-05,
"loss": 1.4901,
"step": 7000
},
{
"epoch": 2.4372117309198504,
"eval_loss": 1.556670904159546,
"eval_runtime": 14.0999,
"eval_samples_per_second": 70.923,
"eval_steps_per_second": 2.27,
"step": 7000
},
{
"epoch": 2.4546166565137932,
"grad_norm": 0.5661309135294365,
"learning_rate": 8.614205028336217e-05,
"loss": 1.4741,
"step": 7050
},
{
"epoch": 2.4720215821077365,
"grad_norm": 0.5266042181833929,
"learning_rate": 8.595331882492506e-05,
"loss": 1.4808,
"step": 7100
},
{
"epoch": 2.4720215821077365,
"eval_loss": 1.5564885139465332,
"eval_runtime": 14.0633,
"eval_samples_per_second": 71.107,
"eval_steps_per_second": 2.275,
"step": 7100
},
{
"epoch": 2.4894265077016797,
"grad_norm": 0.5952862014072801,
"learning_rate": 8.576352312501787e-05,
"loss": 1.4746,
"step": 7150
},
{
"epoch": 2.5068314332956225,
"grad_norm": 0.4624470123124944,
"learning_rate": 8.557266888094794e-05,
"loss": 1.4946,
"step": 7200
},
{
"epoch": 2.5068314332956225,
"eval_loss": 1.552463412284851,
"eval_runtime": 14.0629,
"eval_samples_per_second": 71.109,
"eval_steps_per_second": 2.275,
"step": 7200
},
{
"epoch": 2.5242363588895658,
"grad_norm": 0.45145611389782175,
"learning_rate": 8.538076182179816e-05,
"loss": 1.4961,
"step": 7250
},
{
"epoch": 2.541641284483509,
"grad_norm": 0.90012189462666,
"learning_rate": 8.518780770825489e-05,
"loss": 1.4783,
"step": 7300
},
{
"epoch": 2.541641284483509,
"eval_loss": 1.5499927997589111,
"eval_runtime": 14.0891,
"eval_samples_per_second": 70.977,
"eval_steps_per_second": 2.271,
"step": 7300
},
{
"epoch": 2.559046210077452,
"grad_norm": 0.48065365017717593,
"learning_rate": 8.499381233243513e-05,
"loss": 1.4769,
"step": 7350
},
{
"epoch": 2.576451135671395,
"grad_norm": 0.6888035432484004,
"learning_rate": 8.479878151771251e-05,
"loss": 1.4789,
"step": 7400
},
{
"epoch": 2.576451135671395,
"eval_loss": 1.5486310720443726,
"eval_runtime": 13.9982,
"eval_samples_per_second": 71.438,
"eval_steps_per_second": 2.286,
"step": 7400
},
{
"epoch": 2.5938560612653383,
"grad_norm": 0.48711196772194026,
"learning_rate": 8.460272111854266e-05,
"loss": 1.4847,
"step": 7450
},
{
"epoch": 2.611260986859281,
"grad_norm": 0.824707610556562,
"learning_rate": 8.440563702028738e-05,
"loss": 1.4828,
"step": 7500
},
{
"epoch": 2.611260986859281,
"eval_loss": 1.551180362701416,
"eval_runtime": 14.0431,
"eval_samples_per_second": 71.209,
"eval_steps_per_second": 2.279,
"step": 7500
},
{
"epoch": 2.6286659124532243,
"grad_norm": 0.852389610981971,
"learning_rate": 8.42075351390379e-05,
"loss": 1.4826,
"step": 7550
},
{
"epoch": 2.6460708380471676,
"grad_norm": 0.6660183574150006,
"learning_rate": 8.400842142143747e-05,
"loss": 1.4845,
"step": 7600
},
{
"epoch": 2.6460708380471676,
"eval_loss": 1.544368028640747,
"eval_runtime": 14.0389,
"eval_samples_per_second": 71.231,
"eval_steps_per_second": 2.279,
"step": 7600
},
{
"epoch": 2.6634757636411104,
"grad_norm": 0.5371708490732019,
"learning_rate": 8.380830184450267e-05,
"loss": 1.4793,
"step": 7650
},
{
"epoch": 2.6808806892350536,
"grad_norm": 0.5364646266893728,
"learning_rate": 8.360718241544412e-05,
"loss": 1.4785,
"step": 7700
},
{
"epoch": 2.6808806892350536,
"eval_loss": 1.543105959892273,
"eval_runtime": 14.0605,
"eval_samples_per_second": 71.121,
"eval_steps_per_second": 2.276,
"step": 7700
},
{
"epoch": 2.698285614828997,
"grad_norm": 0.46957227565425785,
"learning_rate": 8.340506917148608e-05,
"loss": 1.4742,
"step": 7750
},
{
"epoch": 2.7156905404229397,
"grad_norm": 0.48372971397935693,
"learning_rate": 8.320196817968525e-05,
"loss": 1.4866,
"step": 7800
},
{
"epoch": 2.7156905404229397,
"eval_loss": 1.5416640043258667,
"eval_runtime": 14.0946,
"eval_samples_per_second": 70.949,
"eval_steps_per_second": 2.27,
"step": 7800
},
{
"epoch": 2.733095466016883,
"grad_norm": 0.62992004899943,
"learning_rate": 8.29978855367487e-05,
"loss": 1.4805,
"step": 7850
},
{
"epoch": 2.7505003916108257,
"grad_norm": 0.43123069122543717,
"learning_rate": 8.279282736885072e-05,
"loss": 1.4658,
"step": 7900
},
{
"epoch": 2.7505003916108257,
"eval_loss": 1.535282850265503,
"eval_runtime": 14.0868,
"eval_samples_per_second": 70.988,
"eval_steps_per_second": 2.272,
"step": 7900
},
{
"epoch": 2.767905317204769,
"grad_norm": 0.6368197221637776,
"learning_rate": 8.258679983144908e-05,
"loss": 1.4758,
"step": 7950
},
{
"epoch": 2.785310242798712,
"grad_norm": 0.6657566023644987,
"learning_rate": 8.237980910910019e-05,
"loss": 1.4745,
"step": 8000
},
{
"epoch": 2.785310242798712,
"eval_loss": 1.5360466241836548,
"eval_runtime": 14.0555,
"eval_samples_per_second": 71.146,
"eval_steps_per_second": 2.277,
"step": 8000
},
{
"epoch": 2.802715168392655,
"grad_norm": 0.7182835079667393,
"learning_rate": 8.217186141527335e-05,
"loss": 1.4641,
"step": 8050
},
{
"epoch": 2.8201200939865982,
"grad_norm": 0.7121132283989695,
"learning_rate": 8.196296299216446e-05,
"loss": 1.4759,
"step": 8100
},
{
"epoch": 2.8201200939865982,
"eval_loss": 1.532562494277954,
"eval_runtime": 14.0783,
"eval_samples_per_second": 71.031,
"eval_steps_per_second": 2.273,
"step": 8100
},
{
"epoch": 2.837525019580541,
"grad_norm": 0.5023845726717355,
"learning_rate": 8.175312011050845e-05,
"loss": 1.4683,
"step": 8150
},
{
"epoch": 2.8549299451744843,
"grad_norm": 0.5881687310427127,
"learning_rate": 8.154233906939112e-05,
"loss": 1.4663,
"step": 8200
},
{
"epoch": 2.8549299451744843,
"eval_loss": 1.528754711151123,
"eval_runtime": 14.0709,
"eval_samples_per_second": 71.069,
"eval_steps_per_second": 2.274,
"step": 8200
},
{
"epoch": 2.8723348707684275,
"grad_norm": 0.8694427423730182,
"learning_rate": 8.133062619605998e-05,
"loss": 1.4652,
"step": 8250
},
{
"epoch": 2.8897397963623703,
"grad_norm": 0.5428973347025838,
"learning_rate": 8.111798784573448e-05,
"loss": 1.4654,
"step": 8300
},
{
"epoch": 2.8897397963623703,
"eval_loss": 1.5267043113708496,
"eval_runtime": 14.0573,
"eval_samples_per_second": 71.138,
"eval_steps_per_second": 2.276,
"step": 8300
},
{
"epoch": 2.9071447219563136,
"grad_norm": 0.5347673639983305,
"learning_rate": 8.090443040141507e-05,
"loss": 1.4686,
"step": 8350
},
{
"epoch": 2.924549647550257,
"grad_norm": 0.3699648780599154,
"learning_rate": 8.068996027369164e-05,
"loss": 1.4609,
"step": 8400
},
{
"epoch": 2.924549647550257,
"eval_loss": 1.5217338800430298,
"eval_runtime": 14.0237,
"eval_samples_per_second": 71.308,
"eval_steps_per_second": 2.282,
"step": 8400
},
{
"epoch": 2.9419545731441996,
"grad_norm": 0.48586968192782287,
"learning_rate": 8.047458390055122e-05,
"loss": 1.4612,
"step": 8450
},
{
"epoch": 2.959359498738143,
"grad_norm": 0.5435332038220189,
"learning_rate": 8.025830774718446e-05,
"loss": 1.4692,
"step": 8500
},
{
"epoch": 2.959359498738143,
"eval_loss": 1.5231417417526245,
"eval_runtime": 14.0466,
"eval_samples_per_second": 71.191,
"eval_steps_per_second": 2.278,
"step": 8500
},
{
"epoch": 2.976764424332086,
"grad_norm": 0.7096020132731112,
"learning_rate": 8.004113830579183e-05,
"loss": 1.471,
"step": 8550
},
{
"epoch": 2.994169349926029,
"grad_norm": 0.539895931380371,
"learning_rate": 7.982308209538854e-05,
"loss": 1.4669,
"step": 8600
},
{
"epoch": 2.994169349926029,
"eval_loss": 1.5212860107421875,
"eval_runtime": 14.0526,
"eval_samples_per_second": 71.161,
"eval_steps_per_second": 2.277,
"step": 8600
},
{
"epoch": 3.0118353494038814,
"grad_norm": 0.4864871708607549,
"learning_rate": 7.960414566160895e-05,
"loss": 1.416,
"step": 8650
},
{
"epoch": 3.0292402749978242,
"grad_norm": 0.690002277872972,
"learning_rate": 7.938433557651007e-05,
"loss": 1.366,
"step": 8700
},
{
"epoch": 3.0292402749978242,
"eval_loss": 1.529784917831421,
"eval_runtime": 14.067,
"eval_samples_per_second": 71.088,
"eval_steps_per_second": 2.275,
"step": 8700
},
{
"epoch": 3.0466452005917675,
"grad_norm": 0.5289813539417939,
"learning_rate": 7.916365843837427e-05,
"loss": 1.3613,
"step": 8750
},
{
"epoch": 3.0640501261857107,
"grad_norm": 0.6657125938020144,
"learning_rate": 7.894212087151115e-05,
"loss": 1.3688,
"step": 8800
},
{
"epoch": 3.0640501261857107,
"eval_loss": 1.5215730667114258,
"eval_runtime": 14.0034,
"eval_samples_per_second": 71.411,
"eval_steps_per_second": 2.285,
"step": 8800
},
{
"epoch": 3.0814550517796535,
"grad_norm": 0.7390029016004258,
"learning_rate": 7.871972952605883e-05,
"loss": 1.3683,
"step": 8850
},
{
"epoch": 3.0988599773735968,
"grad_norm": 0.5057259460715381,
"learning_rate": 7.849649107778423e-05,
"loss": 1.3728,
"step": 8900
},
{
"epoch": 3.0988599773735968,
"eval_loss": 1.5253978967666626,
"eval_runtime": 14.1332,
"eval_samples_per_second": 70.756,
"eval_steps_per_second": 2.264,
"step": 8900
},
{
"epoch": 3.11626490296754,
"grad_norm": 0.4295555287715609,
"learning_rate": 7.827241222788265e-05,
"loss": 1.3712,
"step": 8950
},
{
"epoch": 3.133669828561483,
"grad_norm": 0.5114244593630245,
"learning_rate": 7.804749970277668e-05,
"loss": 1.3687,
"step": 9000
},
{
"epoch": 3.133669828561483,
"eval_loss": 1.5244981050491333,
"eval_runtime": 14.0709,
"eval_samples_per_second": 71.069,
"eval_steps_per_second": 2.274,
"step": 9000
},
{
"epoch": 3.151074754155426,
"grad_norm": 0.5924217403505255,
"learning_rate": 7.782176025391429e-05,
"loss": 1.3599,
"step": 9050
},
{
"epoch": 3.1684796797493693,
"grad_norm": 0.40820014979092695,
"learning_rate": 7.759520065756606e-05,
"loss": 1.3861,
"step": 9100
},
{
"epoch": 3.1684796797493693,
"eval_loss": 1.511974573135376,
"eval_runtime": 14.0345,
"eval_samples_per_second": 71.253,
"eval_steps_per_second": 2.28,
"step": 9100
},
{
"epoch": 3.185884605343312,
"grad_norm": 0.4377700488526699,
"learning_rate": 7.736782771462192e-05,
"loss": 1.371,
"step": 9150
},
{
"epoch": 3.2032895309372553,
"grad_norm": 0.6320656559166914,
"learning_rate": 7.713964825038689e-05,
"loss": 1.3686,
"step": 9200
},
{
"epoch": 3.2032895309372553,
"eval_loss": 1.5161738395690918,
"eval_runtime": 14.061,
"eval_samples_per_second": 71.119,
"eval_steps_per_second": 2.276,
"step": 9200
},
{
"epoch": 3.220694456531198,
"grad_norm": 0.5547647399512429,
"learning_rate": 7.69106691143762e-05,
"loss": 1.3701,
"step": 9250
},
{
"epoch": 3.2380993821251414,
"grad_norm": 0.5363716061147621,
"learning_rate": 7.66808971801098e-05,
"loss": 1.3661,
"step": 9300
},
{
"epoch": 3.2380993821251414,
"eval_loss": 1.5094687938690186,
"eval_runtime": 14.0539,
"eval_samples_per_second": 71.155,
"eval_steps_per_second": 2.277,
"step": 9300
},
{
"epoch": 3.2555043077190846,
"grad_norm": 0.5599230381831105,
"learning_rate": 7.645033934490586e-05,
"loss": 1.3603,
"step": 9350
},
{
"epoch": 3.2729092333130274,
"grad_norm": 0.8327764629351396,
"learning_rate": 7.621900252967383e-05,
"loss": 1.3735,
"step": 9400
},
{
"epoch": 3.2729092333130274,
"eval_loss": 1.5098674297332764,
"eval_runtime": 14.0398,
"eval_samples_per_second": 71.226,
"eval_steps_per_second": 2.279,
"step": 9400
},
{
"epoch": 3.2903141589069707,
"grad_norm": 0.48878316002707184,
"learning_rate": 7.59868936787067e-05,
"loss": 1.3784,
"step": 9450
},
{
"epoch": 3.307719084500914,
"grad_norm": 0.43212527062833306,
"learning_rate": 7.575401975947243e-05,
"loss": 1.3898,
"step": 9500
},
{
"epoch": 3.307719084500914,
"eval_loss": 1.5034927129745483,
"eval_runtime": 14.069,
"eval_samples_per_second": 71.078,
"eval_steps_per_second": 2.275,
"step": 9500
},
{
"epoch": 3.3251240100948567,
"grad_norm": 0.5587972491626451,
"learning_rate": 7.552038776240496e-05,
"loss": 1.3756,
"step": 9550
},
{
"epoch": 3.3425289356888,
"grad_norm": 0.48413494641810556,
"learning_rate": 7.528600470069427e-05,
"loss": 1.3766,
"step": 9600
},
{
"epoch": 3.3425289356888,
"eval_loss": 1.5048415660858154,
"eval_runtime": 14.0409,
"eval_samples_per_second": 71.221,
"eval_steps_per_second": 2.279,
"step": 9600
},
{
"epoch": 3.359933861282743,
"grad_norm": 0.5249216781891894,
"learning_rate": 7.505087761007585e-05,
"loss": 1.3683,
"step": 9650
},
{
"epoch": 3.377338786876686,
"grad_norm": 0.5021867457296348,
"learning_rate": 7.481501354861958e-05,
"loss": 1.3628,
"step": 9700
},
{
"epoch": 3.377338786876686,
"eval_loss": 1.5056378841400146,
"eval_runtime": 14.1165,
"eval_samples_per_second": 70.839,
"eval_steps_per_second": 2.267,
"step": 9700
},
{
"epoch": 3.3947437124706292,
"grad_norm": 0.4319516918800896,
"learning_rate": 7.457841959651772e-05,
"loss": 1.3757,
"step": 9750
},
{
"epoch": 3.412148638064572,
"grad_norm": 0.5272349233779042,
"learning_rate": 7.434110285587257e-05,
"loss": 1.3772,
"step": 9800
},
{
"epoch": 3.412148638064572,
"eval_loss": 1.4979031085968018,
"eval_runtime": 14.0583,
"eval_samples_per_second": 71.132,
"eval_steps_per_second": 2.276,
"step": 9800
},
{
"epoch": 3.4295535636585153,
"grad_norm": 0.4170853105657618,
"learning_rate": 7.410307045048309e-05,
"loss": 1.3738,
"step": 9850
},
{
"epoch": 3.4469584892524585,
"grad_norm": 0.48485775358670463,
"learning_rate": 7.38643295256312e-05,
"loss": 1.3724,
"step": 9900
},
{
"epoch": 3.4469584892524585,
"eval_loss": 1.494421362876892,
"eval_runtime": 14.0287,
"eval_samples_per_second": 71.282,
"eval_steps_per_second": 2.281,
"step": 9900
},
{
"epoch": 3.4643634148464013,
"grad_norm": 0.4371734710144595,
"learning_rate": 7.362488724786717e-05,
"loss": 1.3744,
"step": 9950
},
{
"epoch": 3.4817683404403446,
"grad_norm": 0.40923025134892577,
"learning_rate": 7.338475080479464e-05,
"loss": 1.3607,
"step": 10000
},
{
"epoch": 3.4817683404403446,
"eval_loss": 1.4906189441680908,
"eval_runtime": 14.13,
"eval_samples_per_second": 70.772,
"eval_steps_per_second": 2.265,
"step": 10000
},
{
"epoch": 3.499173266034288,
"grad_norm": 0.566881072168113,
"learning_rate": 7.31439274048547e-05,
"loss": 1.3724,
"step": 10050
},
{
"epoch": 3.5165781916282306,
"grad_norm": 0.6296518713524268,
"learning_rate": 7.290242427710961e-05,
"loss": 1.3727,
"step": 10100
},
{
"epoch": 3.5165781916282306,
"eval_loss": 1.4904612302780151,
"eval_runtime": 14.0139,
"eval_samples_per_second": 71.358,
"eval_steps_per_second": 2.283,
"step": 10100
},
{
"epoch": 3.533983117222174,
"grad_norm": 0.5206499240433969,
"learning_rate": 7.266024867102576e-05,
"loss": 1.3692,
"step": 10150
},
{
"epoch": 3.551388042816117,
"grad_norm": 0.40529224737208835,
"learning_rate": 7.241740785625611e-05,
"loss": 1.3806,
"step": 10200
},
{
"epoch": 3.551388042816117,
"eval_loss": 1.4856830835342407,
"eval_runtime": 14.0448,
"eval_samples_per_second": 71.201,
"eval_steps_per_second": 2.278,
"step": 10200
},
{
"epoch": 3.56879296841006,
"grad_norm": 0.4300144130087216,
"learning_rate": 7.217390912242188e-05,
"loss": 1.3744,
"step": 10250
},
{
"epoch": 3.586197894004003,
"grad_norm": 1.5706940836443246,
"learning_rate": 7.19297597788938e-05,
"loss": 1.3585,
"step": 10300
},
{
"epoch": 3.586197894004003,
"eval_loss": 1.4865046739578247,
"eval_runtime": 14.0585,
"eval_samples_per_second": 71.131,
"eval_steps_per_second": 2.276,
"step": 10300
},
{
"epoch": 3.6036028195979464,
"grad_norm": 0.42832118387479046,
"learning_rate": 7.168496715457262e-05,
"loss": 1.3498,
"step": 10350
},
{
"epoch": 3.621007745191889,
"grad_norm": 0.32937050038811627,
"learning_rate": 7.143953859766922e-05,
"loss": 1.3668,
"step": 10400
},
{
"epoch": 3.621007745191889,
"eval_loss": 1.4809393882751465,
"eval_runtime": 14.0762,
"eval_samples_per_second": 71.042,
"eval_steps_per_second": 2.273,
"step": 10400
},
{
"epoch": 3.6384126707858324,
"grad_norm": 0.46661759551944226,
"learning_rate": 7.119348147548397e-05,
"loss": 1.3713,
"step": 10450
},
{
"epoch": 3.6558175963797757,
"grad_norm": 0.37949075462180737,
"learning_rate": 7.094680317418553e-05,
"loss": 1.3738,
"step": 10500
},
{
"epoch": 3.6558175963797757,
"eval_loss": 1.479648470878601,
"eval_runtime": 14.0453,
"eval_samples_per_second": 71.198,
"eval_steps_per_second": 2.278,
"step": 10500
},
{
"epoch": 3.6732225219737185,
"grad_norm": 0.3545653210995024,
"learning_rate": 7.069951109858924e-05,
"loss": 1.3778,
"step": 10550
},
{
"epoch": 3.6906274475676617,
"grad_norm": 0.4800710308825892,
"learning_rate": 7.045161267193473e-05,
"loss": 1.3714,
"step": 10600
},
{
"epoch": 3.6906274475676617,
"eval_loss": 1.478054165840149,
"eval_runtime": 14.0495,
"eval_samples_per_second": 71.177,
"eval_steps_per_second": 2.278,
"step": 10600
},
{
"epoch": 3.708032373161605,
"grad_norm": 0.4518327093756271,
"learning_rate": 7.020311533566316e-05,
"loss": 1.3603,
"step": 10650
},
{
"epoch": 3.7254372987555477,
"grad_norm": 0.5598559026409418,
"learning_rate": 6.995402654919383e-05,
"loss": 1.3751,
"step": 10700
},
{
"epoch": 3.7254372987555477,
"eval_loss": 1.4786157608032227,
"eval_runtime": 14.0459,
"eval_samples_per_second": 71.195,
"eval_steps_per_second": 2.278,
"step": 10700
},
{
"epoch": 3.742842224349491,
"grad_norm": 0.397449015844996,
"learning_rate": 6.970435378970025e-05,
"loss": 1.3696,
"step": 10750
},
{
"epoch": 3.7602471499434342,
"grad_norm": 0.460203408949936,
"learning_rate": 6.94541045518857e-05,
"loss": 1.3662,
"step": 10800
},
{
"epoch": 3.7602471499434342,
"eval_loss": 1.4727766513824463,
"eval_runtime": 14.0456,
"eval_samples_per_second": 71.197,
"eval_steps_per_second": 2.278,
"step": 10800
},
{
"epoch": 3.777652075537377,
"grad_norm": 0.5514873377132529,
"learning_rate": 6.920328634775823e-05,
"loss": 1.3547,
"step": 10850
},
{
"epoch": 3.7950570011313203,
"grad_norm": 0.4046190880252359,
"learning_rate": 6.895190670640517e-05,
"loss": 1.3702,
"step": 10900
},
{
"epoch": 3.7950570011313203,
"eval_loss": 1.4765475988388062,
"eval_runtime": 14.044,
"eval_samples_per_second": 71.205,
"eval_steps_per_second": 2.279,
"step": 10900
},
{
"epoch": 3.8124619267252635,
"grad_norm": 0.39733647374433373,
"learning_rate": 6.86999731737672e-05,
"loss": 1.3576,
"step": 10950
},
{
"epoch": 3.8298668523192063,
"grad_norm": 0.4968531287288758,
"learning_rate": 6.844749331241166e-05,
"loss": 1.3683,
"step": 11000
},
{
"epoch": 3.8298668523192063,
"eval_loss": 1.4669009447097778,
"eval_runtime": 14.0594,
"eval_samples_per_second": 71.127,
"eval_steps_per_second": 2.276,
"step": 11000
},
{
"epoch": 3.8472717779131496,
"grad_norm": 0.39798180208193673,
"learning_rate": 6.819447470130576e-05,
"loss": 1.3599,
"step": 11050
},
{
"epoch": 3.8646767035070924,
"grad_norm": 0.45458727194541854,
"learning_rate": 6.794092493558886e-05,
"loss": 1.369,
"step": 11100
},
{
"epoch": 3.8646767035070924,
"eval_loss": 1.4670898914337158,
"eval_runtime": 14.0764,
"eval_samples_per_second": 71.041,
"eval_steps_per_second": 2.273,
"step": 11100
},
{
"epoch": 3.8820816291010356,
"grad_norm": 0.3836811429180259,
"learning_rate": 6.768685162634463e-05,
"loss": 1.358,
"step": 11150
},
{
"epoch": 3.8994865546949784,
"grad_norm": 0.38598388694396,
"learning_rate": 6.743226240037251e-05,
"loss": 1.3583,
"step": 11200
},
{
"epoch": 3.8994865546949784,
"eval_loss": 1.4623597860336304,
"eval_runtime": 14.0625,
"eval_samples_per_second": 71.111,
"eval_steps_per_second": 2.276,
"step": 11200
},
{
"epoch": 3.9168914802889216,
"grad_norm": 0.44825069145730173,
"learning_rate": 6.717716489995878e-05,
"loss": 1.3502,
"step": 11250
},
{
"epoch": 3.934296405882865,
"grad_norm": 0.38830830286241574,
"learning_rate": 6.692156678264715e-05,
"loss": 1.3532,
"step": 11300
},
{
"epoch": 3.934296405882865,
"eval_loss": 1.4605158567428589,
"eval_runtime": 14.0694,
"eval_samples_per_second": 71.076,
"eval_steps_per_second": 2.274,
"step": 11300
},
{
"epoch": 3.9517013314768077,
"grad_norm": 0.411836793573614,
"learning_rate": 6.666547572100892e-05,
"loss": 1.36,
"step": 11350
},
{
"epoch": 3.969106257070751,
"grad_norm": 0.5070202220012539,
"learning_rate": 6.640889940241265e-05,
"loss": 1.3621,
"step": 11400
},
{
"epoch": 3.969106257070751,
"eval_loss": 1.4586904048919678,
"eval_runtime": 14.0282,
"eval_samples_per_second": 71.285,
"eval_steps_per_second": 2.281,
"step": 11400
},
{
"epoch": 3.986511182664694,
"grad_norm": 0.45972048455728964,
"learning_rate": 6.615184552879333e-05,
"loss": 1.3569,
"step": 11450
},
{
"epoch": 4.004177182142547,
"grad_norm": 0.4844090667680897,
"learning_rate": 6.589432181642133e-05,
"loss": 1.356,
"step": 11500
},
{
"epoch": 4.004177182142547,
"eval_loss": 1.474735140800476,
"eval_runtime": 14.1044,
"eval_samples_per_second": 70.9,
"eval_steps_per_second": 2.269,
"step": 11500
},
{
"epoch": 4.0215821077364895,
"grad_norm": 0.5126230881524994,
"learning_rate": 6.563633599567065e-05,
"loss": 1.2523,
"step": 11550
},
{
"epoch": 4.038987033330432,
"grad_norm": 0.5098806194453889,
"learning_rate": 6.537789581078693e-05,
"loss": 1.2622,
"step": 11600
},
{
"epoch": 4.038987033330432,
"eval_loss": 1.4786348342895508,
"eval_runtime": 14.0611,
"eval_samples_per_second": 71.118,
"eval_steps_per_second": 2.276,
"step": 11600
},
{
"epoch": 4.056391958924376,
"grad_norm": 0.45319295070868365,
"learning_rate": 6.511900901965492e-05,
"loss": 1.246,
"step": 11650
},
{
"epoch": 4.073796884518319,
"grad_norm": 0.5020371370617683,
"learning_rate": 6.485968339356566e-05,
"loss": 1.263,
"step": 11700
},
{
"epoch": 4.073796884518319,
"eval_loss": 1.4689204692840576,
"eval_runtime": 14.0995,
"eval_samples_per_second": 70.924,
"eval_steps_per_second": 2.27,
"step": 11700
},
{
"epoch": 4.091201810112262,
"grad_norm": 0.4891550610909729,
"learning_rate": 6.459992671698323e-05,
"loss": 1.2468,
"step": 11750
},
{
"epoch": 4.108606735706205,
"grad_norm": 0.41320498769703473,
"learning_rate": 6.433974678731097e-05,
"loss": 1.2727,
"step": 11800
},
{
"epoch": 4.108606735706205,
"eval_loss": 1.4705528020858765,
"eval_runtime": 14.0263,
"eval_samples_per_second": 71.295,
"eval_steps_per_second": 2.281,
"step": 11800
},
{
"epoch": 4.126011661300148,
"grad_norm": 0.42567451569194076,
"learning_rate": 6.407915141465746e-05,
"loss": 1.2496,
"step": 11850
},
{
"epoch": 4.143416586894091,
"grad_norm": 0.4143204113738695,
"learning_rate": 6.381814842160219e-05,
"loss": 1.255,
"step": 11900
},
{
"epoch": 4.143416586894091,
"eval_loss": 1.4660860300064087,
"eval_runtime": 14.0759,
"eval_samples_per_second": 71.043,
"eval_steps_per_second": 2.273,
"step": 11900
},
{
"epoch": 4.160821512488035,
"grad_norm": 0.467101344292312,
"learning_rate": 6.355674564296053e-05,
"loss": 1.2513,
"step": 11950
},
{
"epoch": 4.178226438081977,
"grad_norm": 0.41772403175228634,
"learning_rate": 6.329495092554872e-05,
"loss": 1.2602,
"step": 12000
},
{
"epoch": 4.178226438081977,
"eval_loss": 1.4664828777313232,
"eval_runtime": 13.9906,
"eval_samples_per_second": 71.477,
"eval_steps_per_second": 2.287,
"step": 12000
}
],
"logging_steps": 50,
"max_steps": 28720,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 800,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.1642006019244032e+16,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}