flan-t5-small-questionizer / trainer_state.json
agentlans's picture
Upload 10 files
1c0537e verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 20.0,
"eval_steps": 500,
"global_step": 383360,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.026085141903171953,
"grad_norm": 3.252858877182007,
"learning_rate": 4.978305856983862e-05,
"loss": 1.9218,
"num_input_tokens_seen": 283536,
"step": 500,
"train_runtime": 18.7004,
"train_tokens_per_second": 15162.068
},
{
"epoch": 0.052170283806343906,
"grad_norm": 3.500765085220337,
"learning_rate": 4.956568238731219e-05,
"loss": 1.7605,
"num_input_tokens_seen": 574552,
"step": 1000,
"train_runtime": 38.0018,
"train_tokens_per_second": 15119.085
},
{
"epoch": 0.07825542570951587,
"grad_norm": 3.4590671062469482,
"learning_rate": 4.934830620478575e-05,
"loss": 1.6898,
"num_input_tokens_seen": 859976,
"step": 1500,
"train_runtime": 57.2593,
"train_tokens_per_second": 15018.978
},
{
"epoch": 0.10434056761268781,
"grad_norm": 3.67798113822937,
"learning_rate": 4.9130930022259324e-05,
"loss": 1.6968,
"num_input_tokens_seen": 1151232,
"step": 2000,
"train_runtime": 76.5735,
"train_tokens_per_second": 15034.338
},
{
"epoch": 0.13042570951585977,
"grad_norm": 3.009059190750122,
"learning_rate": 4.891355383973289e-05,
"loss": 1.6838,
"num_input_tokens_seen": 1439432,
"step": 2500,
"train_runtime": 95.8962,
"train_tokens_per_second": 15010.309
},
{
"epoch": 0.15651085141903173,
"grad_norm": 3.1467044353485107,
"learning_rate": 4.869617765720646e-05,
"loss": 1.6861,
"num_input_tokens_seen": 1727728,
"step": 3000,
"train_runtime": 114.9793,
"train_tokens_per_second": 15026.424
},
{
"epoch": 0.18259599332220366,
"grad_norm": 2.8238844871520996,
"learning_rate": 4.8478801474680025e-05,
"loss": 1.6343,
"num_input_tokens_seen": 2016488,
"step": 3500,
"train_runtime": 134.048,
"train_tokens_per_second": 15043.024
},
{
"epoch": 0.20868113522537562,
"grad_norm": 2.7848801612854004,
"learning_rate": 4.826142529215359e-05,
"loss": 1.6482,
"num_input_tokens_seen": 2310136,
"step": 4000,
"train_runtime": 153.775,
"train_tokens_per_second": 15022.828
},
{
"epoch": 0.23476627712854758,
"grad_norm": 3.402919054031372,
"learning_rate": 4.804404910962716e-05,
"loss": 1.6326,
"num_input_tokens_seen": 2601800,
"step": 4500,
"train_runtime": 173.1573,
"train_tokens_per_second": 15025.64
},
{
"epoch": 0.26085141903171954,
"grad_norm": 4.777134418487549,
"learning_rate": 4.7826672927100726e-05,
"loss": 1.6236,
"num_input_tokens_seen": 2889448,
"step": 5000,
"train_runtime": 192.4563,
"train_tokens_per_second": 15013.531
},
{
"epoch": 0.2869365609348915,
"grad_norm": 2.45479416847229,
"learning_rate": 4.760929674457429e-05,
"loss": 1.5949,
"num_input_tokens_seen": 3180128,
"step": 5500,
"train_runtime": 211.2052,
"train_tokens_per_second": 15057.053
},
{
"epoch": 0.31302170283806346,
"grad_norm": 2.6998794078826904,
"learning_rate": 4.7391920562047856e-05,
"loss": 1.6117,
"num_input_tokens_seen": 3470912,
"step": 6000,
"train_runtime": 230.9915,
"train_tokens_per_second": 15026.144
},
{
"epoch": 0.33910684474123537,
"grad_norm": 2.838428258895874,
"learning_rate": 4.717454437952143e-05,
"loss": 1.6056,
"num_input_tokens_seen": 3764848,
"step": 6500,
"train_runtime": 251.0138,
"train_tokens_per_second": 14998.572
},
{
"epoch": 0.36519198664440733,
"grad_norm": 2.8896422386169434,
"learning_rate": 4.695716819699499e-05,
"loss": 1.6002,
"num_input_tokens_seen": 4049200,
"step": 7000,
"train_runtime": 270.653,
"train_tokens_per_second": 14960.855
},
{
"epoch": 0.3912771285475793,
"grad_norm": 2.878220558166504,
"learning_rate": 4.673979201446856e-05,
"loss": 1.5839,
"num_input_tokens_seen": 4340488,
"step": 7500,
"train_runtime": 290.1843,
"train_tokens_per_second": 14957.693
},
{
"epoch": 0.41736227045075125,
"grad_norm": 2.7241406440734863,
"learning_rate": 4.652241583194213e-05,
"loss": 1.5844,
"num_input_tokens_seen": 4631904,
"step": 8000,
"train_runtime": 309.2754,
"train_tokens_per_second": 14976.633
},
{
"epoch": 0.4434474123539232,
"grad_norm": 2.727529287338257,
"learning_rate": 4.630503964941569e-05,
"loss": 1.5936,
"num_input_tokens_seen": 4919576,
"step": 8500,
"train_runtime": 328.4961,
"train_tokens_per_second": 14976.057
},
{
"epoch": 0.46953255425709517,
"grad_norm": 3.117870330810547,
"learning_rate": 4.6087663466889265e-05,
"loss": 1.5695,
"num_input_tokens_seen": 5211016,
"step": 9000,
"train_runtime": 348.3435,
"train_tokens_per_second": 14959.417
},
{
"epoch": 0.49561769616026713,
"grad_norm": 2.490983724594116,
"learning_rate": 4.587028728436283e-05,
"loss": 1.5802,
"num_input_tokens_seen": 5507568,
"step": 9500,
"train_runtime": 368.0383,
"train_tokens_per_second": 14964.661
},
{
"epoch": 0.5217028380634391,
"grad_norm": 2.392632246017456,
"learning_rate": 4.56529111018364e-05,
"loss": 1.5806,
"num_input_tokens_seen": 5798840,
"step": 10000,
"train_runtime": 387.6945,
"train_tokens_per_second": 14957.241
},
{
"epoch": 0.547787979966611,
"grad_norm": 2.6862573623657227,
"learning_rate": 4.5435534919309966e-05,
"loss": 1.5801,
"num_input_tokens_seen": 6085768,
"step": 10500,
"train_runtime": 407.4294,
"train_tokens_per_second": 14936.988
},
{
"epoch": 0.573873121869783,
"grad_norm": 3.164522647857666,
"learning_rate": 4.521815873678353e-05,
"loss": 1.5636,
"num_input_tokens_seen": 6371672,
"step": 11000,
"train_runtime": 426.5237,
"train_tokens_per_second": 14938.61
},
{
"epoch": 0.5999582637729549,
"grad_norm": 2.5483455657958984,
"learning_rate": 4.5000782554257095e-05,
"loss": 1.5541,
"num_input_tokens_seen": 6659744,
"step": 11500,
"train_runtime": 445.61,
"train_tokens_per_second": 14945.23
},
{
"epoch": 0.6260434056761269,
"grad_norm": 2.6326119899749756,
"learning_rate": 4.478340637173066e-05,
"loss": 1.5801,
"num_input_tokens_seen": 6947616,
"step": 12000,
"train_runtime": 465.2155,
"train_tokens_per_second": 14934.188
},
{
"epoch": 0.6521285475792988,
"grad_norm": 2.5993449687957764,
"learning_rate": 4.456603018920423e-05,
"loss": 1.5497,
"num_input_tokens_seen": 7236800,
"step": 12500,
"train_runtime": 484.6648,
"train_tokens_per_second": 14931.556
},
{
"epoch": 0.6782136894824707,
"grad_norm": 2.419832468032837,
"learning_rate": 4.4348654006677796e-05,
"loss": 1.5692,
"num_input_tokens_seen": 7525160,
"step": 13000,
"train_runtime": 504.4097,
"train_tokens_per_second": 14918.745
},
{
"epoch": 0.7042988313856428,
"grad_norm": 2.346853017807007,
"learning_rate": 4.413127782415137e-05,
"loss": 1.568,
"num_input_tokens_seen": 7815704,
"step": 13500,
"train_runtime": 523.0681,
"train_tokens_per_second": 14942.039
},
{
"epoch": 0.7303839732888147,
"grad_norm": 2.47847580909729,
"learning_rate": 4.391390164162493e-05,
"loss": 1.5597,
"num_input_tokens_seen": 8107760,
"step": 14000,
"train_runtime": 542.052,
"train_tokens_per_second": 14957.533
},
{
"epoch": 0.7564691151919867,
"grad_norm": 2.5489418506622314,
"learning_rate": 4.36965254590985e-05,
"loss": 1.5588,
"num_input_tokens_seen": 8400096,
"step": 14500,
"train_runtime": 562.4429,
"train_tokens_per_second": 14935.019
},
{
"epoch": 0.7825542570951586,
"grad_norm": 3.1929831504821777,
"learning_rate": 4.347914927657207e-05,
"loss": 1.5409,
"num_input_tokens_seen": 8679112,
"step": 15000,
"train_runtime": 581.6704,
"train_tokens_per_second": 14921.014
},
{
"epoch": 0.8086393989983306,
"grad_norm": 2.6714396476745605,
"learning_rate": 4.3261773094045634e-05,
"loss": 1.5494,
"num_input_tokens_seen": 8969456,
"step": 15500,
"train_runtime": 600.81,
"train_tokens_per_second": 14928.94
},
{
"epoch": 0.8347245409015025,
"grad_norm": 2.379903554916382,
"learning_rate": 4.3044396911519205e-05,
"loss": 1.5589,
"num_input_tokens_seen": 9261064,
"step": 16000,
"train_runtime": 619.9911,
"train_tokens_per_second": 14937.414
},
{
"epoch": 0.8608096828046744,
"grad_norm": 2.5801916122436523,
"learning_rate": 4.282702072899277e-05,
"loss": 1.5594,
"num_input_tokens_seen": 9550752,
"step": 16500,
"train_runtime": 639.7359,
"train_tokens_per_second": 14929.21
},
{
"epoch": 0.8868948247078464,
"grad_norm": 2.8763697147369385,
"learning_rate": 4.2609644546466335e-05,
"loss": 1.5768,
"num_input_tokens_seen": 9839304,
"step": 17000,
"train_runtime": 659.5206,
"train_tokens_per_second": 14918.873
},
{
"epoch": 0.9129799666110183,
"grad_norm": 3.0146758556365967,
"learning_rate": 4.23922683639399e-05,
"loss": 1.5499,
"num_input_tokens_seen": 10132792,
"step": 17500,
"train_runtime": 679.4357,
"train_tokens_per_second": 14913.541
},
{
"epoch": 0.9390651085141903,
"grad_norm": 2.629668951034546,
"learning_rate": 4.2174892181413464e-05,
"loss": 1.5469,
"num_input_tokens_seen": 10417368,
"step": 18000,
"train_runtime": 699.3592,
"train_tokens_per_second": 14895.589
},
{
"epoch": 0.9651502504173622,
"grad_norm": 2.527364492416382,
"learning_rate": 4.1957515998887036e-05,
"loss": 1.5396,
"num_input_tokens_seen": 10711800,
"step": 18500,
"train_runtime": 719.3221,
"train_tokens_per_second": 14891.522
},
{
"epoch": 0.9912353923205343,
"grad_norm": 2.3071608543395996,
"learning_rate": 4.17401398163606e-05,
"loss": 1.5194,
"num_input_tokens_seen": 10994712,
"step": 19000,
"train_runtime": 738.6768,
"train_tokens_per_second": 14884.334
},
{
"epoch": 1.0,
"eval_loss": 1.3842333555221558,
"eval_runtime": 47.3762,
"eval_samples_per_second": 809.161,
"eval_steps_per_second": 101.148,
"num_input_tokens_seen": 11091734,
"step": 19168
},
{
"epoch": 1.0173205342237062,
"grad_norm": 2.8192083835601807,
"learning_rate": 4.152276363383417e-05,
"loss": 1.4963,
"num_input_tokens_seen": 11281086,
"step": 19500,
"train_runtime": 806.5637,
"train_tokens_per_second": 13986.603
},
{
"epoch": 1.0434056761268782,
"grad_norm": 3.121436595916748,
"learning_rate": 4.130538745130774e-05,
"loss": 1.5117,
"num_input_tokens_seen": 11574638,
"step": 20000,
"train_runtime": 825.9512,
"train_tokens_per_second": 14013.707
},
{
"epoch": 1.06949081803005,
"grad_norm": 2.0136849880218506,
"learning_rate": 4.108801126878131e-05,
"loss": 1.5143,
"num_input_tokens_seen": 11864494,
"step": 20500,
"train_runtime": 845.2133,
"train_tokens_per_second": 14037.278
},
{
"epoch": 1.095575959933222,
"grad_norm": 2.6219029426574707,
"learning_rate": 4.087063508625487e-05,
"loss": 1.5055,
"num_input_tokens_seen": 12158550,
"step": 21000,
"train_runtime": 864.7079,
"train_tokens_per_second": 14060.876
},
{
"epoch": 1.121661101836394,
"grad_norm": 3.265441656112671,
"learning_rate": 4.065325890372844e-05,
"loss": 1.4973,
"num_input_tokens_seen": 12445726,
"step": 21500,
"train_runtime": 885.5348,
"train_tokens_per_second": 14054.474
},
{
"epoch": 1.147746243739566,
"grad_norm": 2.6268465518951416,
"learning_rate": 4.043588272120201e-05,
"loss": 1.5264,
"num_input_tokens_seen": 12733878,
"step": 22000,
"train_runtime": 905.2864,
"train_tokens_per_second": 14066.131
},
{
"epoch": 1.1738313856427378,
"grad_norm": 4.112071990966797,
"learning_rate": 4.0218506538675574e-05,
"loss": 1.4786,
"num_input_tokens_seen": 13017478,
"step": 22500,
"train_runtime": 924.8642,
"train_tokens_per_second": 14075.016
},
{
"epoch": 1.1999165275459098,
"grad_norm": 3.13775897026062,
"learning_rate": 4.000113035614914e-05,
"loss": 1.4809,
"num_input_tokens_seen": 13308726,
"step": 23000,
"train_runtime": 944.4145,
"train_tokens_per_second": 14092.038
},
{
"epoch": 1.2260016694490818,
"grad_norm": 2.7305409908294678,
"learning_rate": 3.9783754173622704e-05,
"loss": 1.5037,
"num_input_tokens_seen": 13600462,
"step": 23500,
"train_runtime": 964.343,
"train_tokens_per_second": 14103.346
},
{
"epoch": 1.2520868113522536,
"grad_norm": 3.8625481128692627,
"learning_rate": 3.9566377991096275e-05,
"loss": 1.4744,
"num_input_tokens_seen": 13886382,
"step": 24000,
"train_runtime": 983.6134,
"train_tokens_per_second": 14117.723
},
{
"epoch": 1.2781719532554257,
"grad_norm": 3.4027693271636963,
"learning_rate": 3.934900180856984e-05,
"loss": 1.4796,
"num_input_tokens_seen": 14171390,
"step": 24500,
"train_runtime": 1003.0211,
"train_tokens_per_second": 14128.706
},
{
"epoch": 1.3042570951585977,
"grad_norm": 2.1200718879699707,
"learning_rate": 3.9131625626043405e-05,
"loss": 1.5107,
"num_input_tokens_seen": 14461470,
"step": 25000,
"train_runtime": 1022.6959,
"train_tokens_per_second": 14140.538
},
{
"epoch": 1.3303422370617697,
"grad_norm": 2.7789530754089355,
"learning_rate": 3.8914249443516976e-05,
"loss": 1.4596,
"num_input_tokens_seen": 14747598,
"step": 25500,
"train_runtime": 1042.1868,
"train_tokens_per_second": 14150.628
},
{
"epoch": 1.3564273789649417,
"grad_norm": 2.1225244998931885,
"learning_rate": 3.869687326099054e-05,
"loss": 1.4669,
"num_input_tokens_seen": 15036278,
"step": 26000,
"train_runtime": 1061.7955,
"train_tokens_per_second": 14161.181
},
{
"epoch": 1.3825125208681135,
"grad_norm": 2.9342072010040283,
"learning_rate": 3.847949707846411e-05,
"loss": 1.4947,
"num_input_tokens_seen": 15322110,
"step": 26500,
"train_runtime": 1081.7408,
"train_tokens_per_second": 14164.308
},
{
"epoch": 1.4085976627712855,
"grad_norm": 2.25174880027771,
"learning_rate": 3.826212089593768e-05,
"loss": 1.472,
"num_input_tokens_seen": 15619830,
"step": 27000,
"train_runtime": 1101.4139,
"train_tokens_per_second": 14181.616
},
{
"epoch": 1.4346828046744573,
"grad_norm": 2.1327219009399414,
"learning_rate": 3.804474471341124e-05,
"loss": 1.4745,
"num_input_tokens_seen": 15910494,
"step": 27500,
"train_runtime": 1120.8296,
"train_tokens_per_second": 14195.283
},
{
"epoch": 1.4607679465776293,
"grad_norm": 2.2169244289398193,
"learning_rate": 3.782736853088481e-05,
"loss": 1.4961,
"num_input_tokens_seen": 16202854,
"step": 28000,
"train_runtime": 1140.1942,
"train_tokens_per_second": 14210.609
},
{
"epoch": 1.4868530884808013,
"grad_norm": 2.7171308994293213,
"learning_rate": 3.760999234835837e-05,
"loss": 1.4707,
"num_input_tokens_seen": 16491582,
"step": 28500,
"train_runtime": 1160.3313,
"train_tokens_per_second": 14212.822
},
{
"epoch": 1.5129382303839733,
"grad_norm": 2.9756038188934326,
"learning_rate": 3.739261616583194e-05,
"loss": 1.4584,
"num_input_tokens_seen": 16778886,
"step": 29000,
"train_runtime": 1180.225,
"train_tokens_per_second": 14216.684
},
{
"epoch": 1.5390233722871454,
"grad_norm": 2.1410768032073975,
"learning_rate": 3.717523998330551e-05,
"loss": 1.4856,
"num_input_tokens_seen": 17072582,
"step": 29500,
"train_runtime": 1199.0906,
"train_tokens_per_second": 14237.942
},
{
"epoch": 1.5651085141903172,
"grad_norm": 2.650392532348633,
"learning_rate": 3.695786380077908e-05,
"loss": 1.4821,
"num_input_tokens_seen": 17362110,
"step": 30000,
"train_runtime": 1218.8129,
"train_tokens_per_second": 14245.098
},
{
"epoch": 1.5911936560934892,
"grad_norm": 2.675250291824341,
"learning_rate": 3.6740487618252644e-05,
"loss": 1.4694,
"num_input_tokens_seen": 17647902,
"step": 30500,
"train_runtime": 1238.9908,
"train_tokens_per_second": 14243.772
},
{
"epoch": 1.617278797996661,
"grad_norm": 2.670755386352539,
"learning_rate": 3.652311143572621e-05,
"loss": 1.5342,
"num_input_tokens_seen": 17943398,
"step": 31000,
"train_runtime": 1259.7818,
"train_tokens_per_second": 14243.259
},
{
"epoch": 1.643363939899833,
"grad_norm": 2.637608051300049,
"learning_rate": 3.630573525319978e-05,
"loss": 1.4575,
"num_input_tokens_seen": 18231966,
"step": 31500,
"train_runtime": 1279.0356,
"train_tokens_per_second": 14254.464
},
{
"epoch": 1.669449081803005,
"grad_norm": 2.5078988075256348,
"learning_rate": 3.6088359070673345e-05,
"loss": 1.4518,
"num_input_tokens_seen": 18525670,
"step": 32000,
"train_runtime": 1297.7662,
"train_tokens_per_second": 14275.044
},
{
"epoch": 1.695534223706177,
"grad_norm": 2.266803503036499,
"learning_rate": 3.587098288814692e-05,
"loss": 1.5014,
"num_input_tokens_seen": 18815526,
"step": 32500,
"train_runtime": 1316.4234,
"train_tokens_per_second": 14292.914
},
{
"epoch": 1.721619365609349,
"grad_norm": 3.0197086334228516,
"learning_rate": 3.565360670562048e-05,
"loss": 1.4843,
"num_input_tokens_seen": 19112486,
"step": 33000,
"train_runtime": 1335.2332,
"train_tokens_per_second": 14313.968
},
{
"epoch": 1.7477045075125208,
"grad_norm": 2.791066884994507,
"learning_rate": 3.5436230523094046e-05,
"loss": 1.4878,
"num_input_tokens_seen": 19396846,
"step": 33500,
"train_runtime": 1353.9271,
"train_tokens_per_second": 14326.359
},
{
"epoch": 1.7737896494156928,
"grad_norm": 2.995617628097534,
"learning_rate": 3.521885434056761e-05,
"loss": 1.4606,
"num_input_tokens_seen": 19683174,
"step": 34000,
"train_runtime": 1372.6447,
"train_tokens_per_second": 14339.599
},
{
"epoch": 1.7998747913188646,
"grad_norm": 2.561185836791992,
"learning_rate": 3.5001478158041176e-05,
"loss": 1.4802,
"num_input_tokens_seen": 19973646,
"step": 34500,
"train_runtime": 1391.2808,
"train_tokens_per_second": 14356.301
},
{
"epoch": 1.8259599332220366,
"grad_norm": 3.1782171726226807,
"learning_rate": 3.478410197551475e-05,
"loss": 1.4588,
"num_input_tokens_seen": 20264526,
"step": 35000,
"train_runtime": 1409.9676,
"train_tokens_per_second": 14372.334
},
{
"epoch": 1.8520450751252087,
"grad_norm": 5.561634063720703,
"learning_rate": 3.456672579298831e-05,
"loss": 1.4609,
"num_input_tokens_seen": 20553006,
"step": 35500,
"train_runtime": 1428.6129,
"train_tokens_per_second": 14386.686
},
{
"epoch": 1.8781302170283807,
"grad_norm": 2.784186363220215,
"learning_rate": 3.4349349610461884e-05,
"loss": 1.4682,
"num_input_tokens_seen": 20844014,
"step": 36000,
"train_runtime": 1447.2777,
"train_tokens_per_second": 14402.221
},
{
"epoch": 1.9042153589315527,
"grad_norm": 2.59779691696167,
"learning_rate": 3.413197342793545e-05,
"loss": 1.5035,
"num_input_tokens_seen": 21130910,
"step": 36500,
"train_runtime": 1465.9615,
"train_tokens_per_second": 14414.369
},
{
"epoch": 1.9303005008347245,
"grad_norm": 2.6355996131896973,
"learning_rate": 3.391459724540902e-05,
"loss": 1.4815,
"num_input_tokens_seen": 21419886,
"step": 37000,
"train_runtime": 1484.6953,
"train_tokens_per_second": 14427.126
},
{
"epoch": 1.9563856427378965,
"grad_norm": 2.1540422439575195,
"learning_rate": 3.3697221062882585e-05,
"loss": 1.4686,
"num_input_tokens_seen": 21706222,
"step": 37500,
"train_runtime": 1503.3619,
"train_tokens_per_second": 14438.454
},
{
"epoch": 1.9824707846410683,
"grad_norm": 2.1270930767059326,
"learning_rate": 3.347984488035615e-05,
"loss": 1.4853,
"num_input_tokens_seen": 21997414,
"step": 38000,
"train_runtime": 1522.056,
"train_tokens_per_second": 14452.434
},
{
"epoch": 2.0,
"eval_loss": 1.347296118736267,
"eval_runtime": 45.0902,
"eval_samples_per_second": 850.185,
"eval_steps_per_second": 106.276,
"num_input_tokens_seen": 22196446,
"step": 38336
},
{
"epoch": 2.0085559265442403,
"grad_norm": 2.812293767929077,
"learning_rate": 3.326246869782972e-05,
"loss": 1.4672,
"num_input_tokens_seen": 22289118,
"step": 38500,
"train_runtime": 1586.651,
"train_tokens_per_second": 14047.902
},
{
"epoch": 2.0346410684474123,
"grad_norm": 3.67232346534729,
"learning_rate": 3.3045092515303286e-05,
"loss": 1.4381,
"num_input_tokens_seen": 22577710,
"step": 39000,
"train_runtime": 1605.3175,
"train_tokens_per_second": 14064.327
},
{
"epoch": 2.0607262103505843,
"grad_norm": 2.2775866985321045,
"learning_rate": 3.282771633277685e-05,
"loss": 1.4397,
"num_input_tokens_seen": 22866142,
"step": 39500,
"train_runtime": 1623.9658,
"train_tokens_per_second": 14080.434
},
{
"epoch": 2.0868113522537564,
"grad_norm": 3.0156877040863037,
"learning_rate": 3.2610340150250415e-05,
"loss": 1.4657,
"num_input_tokens_seen": 23163734,
"step": 40000,
"train_runtime": 1642.6646,
"train_tokens_per_second": 14101.317
},
{
"epoch": 2.1128964941569284,
"grad_norm": 3.8104028701782227,
"learning_rate": 3.239296396772399e-05,
"loss": 1.4687,
"num_input_tokens_seen": 23451982,
"step": 40500,
"train_runtime": 1661.3261,
"train_tokens_per_second": 14116.423
},
{
"epoch": 2.1389816360601,
"grad_norm": 1.780987024307251,
"learning_rate": 3.217558778519755e-05,
"loss": 1.4432,
"num_input_tokens_seen": 23743406,
"step": 41000,
"train_runtime": 1679.966,
"train_tokens_per_second": 14133.266
},
{
"epoch": 2.165066777963272,
"grad_norm": 2.234935998916626,
"learning_rate": 3.1958211602671117e-05,
"loss": 1.447,
"num_input_tokens_seen": 24037990,
"step": 41500,
"train_runtime": 1698.6679,
"train_tokens_per_second": 14151.082
},
{
"epoch": 2.191151919866444,
"grad_norm": 2.599027395248413,
"learning_rate": 3.174083542014469e-05,
"loss": 1.4307,
"num_input_tokens_seen": 24333206,
"step": 42000,
"train_runtime": 1717.3337,
"train_tokens_per_second": 14169.177
},
{
"epoch": 2.217237061769616,
"grad_norm": 3.104538917541504,
"learning_rate": 3.152345923761825e-05,
"loss": 1.4165,
"num_input_tokens_seen": 24623262,
"step": 42500,
"train_runtime": 1735.9704,
"train_tokens_per_second": 14184.149
},
{
"epoch": 2.243322203672788,
"grad_norm": 2.5183098316192627,
"learning_rate": 3.1306083055091824e-05,
"loss": 1.4251,
"num_input_tokens_seen": 24910790,
"step": 43000,
"train_runtime": 1754.6301,
"train_tokens_per_second": 14197.175
},
{
"epoch": 2.26940734557596,
"grad_norm": 3.010117530822754,
"learning_rate": 3.108870687256539e-05,
"loss": 1.4719,
"num_input_tokens_seen": 25200606,
"step": 43500,
"train_runtime": 1773.3028,
"train_tokens_per_second": 14211.112
},
{
"epoch": 2.295492487479132,
"grad_norm": 3.781156063079834,
"learning_rate": 3.087133069003896e-05,
"loss": 1.44,
"num_input_tokens_seen": 25494558,
"step": 44000,
"train_runtime": 1791.9661,
"train_tokens_per_second": 14227.143
},
{
"epoch": 2.321577629382304,
"grad_norm": 2.3171684741973877,
"learning_rate": 3.0653954507512525e-05,
"loss": 1.4048,
"num_input_tokens_seen": 25783878,
"step": 44500,
"train_runtime": 1810.6406,
"train_tokens_per_second": 14240.196
},
{
"epoch": 2.3476627712854756,
"grad_norm": 2.785936117172241,
"learning_rate": 3.0436578324986087e-05,
"loss": 1.4333,
"num_input_tokens_seen": 26074006,
"step": 45000,
"train_runtime": 1829.2827,
"train_tokens_per_second": 14253.677
},
{
"epoch": 2.3737479131886476,
"grad_norm": 3.067204475402832,
"learning_rate": 3.021920214245966e-05,
"loss": 1.412,
"num_input_tokens_seen": 26362862,
"step": 45500,
"train_runtime": 1847.9255,
"train_tokens_per_second": 14266.193
},
{
"epoch": 2.3998330550918197,
"grad_norm": 3.440131902694702,
"learning_rate": 3.0001825959933223e-05,
"loss": 1.4343,
"num_input_tokens_seen": 26659222,
"step": 46000,
"train_runtime": 1866.6572,
"train_tokens_per_second": 14281.799
},
{
"epoch": 2.4259181969949917,
"grad_norm": 4.180527210235596,
"learning_rate": 2.978444977740679e-05,
"loss": 1.4231,
"num_input_tokens_seen": 26945814,
"step": 46500,
"train_runtime": 1885.3282,
"train_tokens_per_second": 14292.373
},
{
"epoch": 2.4520033388981637,
"grad_norm": 4.318091869354248,
"learning_rate": 2.9567073594880356e-05,
"loss": 1.4234,
"num_input_tokens_seen": 27240518,
"step": 47000,
"train_runtime": 1904.0251,
"train_tokens_per_second": 14306.806
},
{
"epoch": 2.4780884808013357,
"grad_norm": 2.4914376735687256,
"learning_rate": 2.9349697412353928e-05,
"loss": 1.4466,
"num_input_tokens_seen": 27523134,
"step": 47500,
"train_runtime": 1922.7393,
"train_tokens_per_second": 14314.543
},
{
"epoch": 2.5041736227045073,
"grad_norm": 2.4933414459228516,
"learning_rate": 2.9132321229827492e-05,
"loss": 1.4219,
"num_input_tokens_seen": 27811630,
"step": 48000,
"train_runtime": 1941.4401,
"train_tokens_per_second": 14325.258
},
{
"epoch": 2.5302587646076793,
"grad_norm": 3.3003621101379395,
"learning_rate": 2.8914945047301057e-05,
"loss": 1.4167,
"num_input_tokens_seen": 28103582,
"step": 48500,
"train_runtime": 1960.1495,
"train_tokens_per_second": 14337.469
},
{
"epoch": 2.5563439065108513,
"grad_norm": 2.9343557357788086,
"learning_rate": 2.8697568864774625e-05,
"loss": 1.4343,
"num_input_tokens_seen": 28395062,
"step": 49000,
"train_runtime": 1978.7726,
"train_tokens_per_second": 14349.836
},
{
"epoch": 2.5824290484140233,
"grad_norm": 2.247775077819824,
"learning_rate": 2.848019268224819e-05,
"loss": 1.44,
"num_input_tokens_seen": 28682022,
"step": 49500,
"train_runtime": 1997.425,
"train_tokens_per_second": 14359.499
},
{
"epoch": 2.6085141903171953,
"grad_norm": 3.329780101776123,
"learning_rate": 2.826281649972176e-05,
"loss": 1.4366,
"num_input_tokens_seen": 28966702,
"step": 50000,
"train_runtime": 2016.0551,
"train_tokens_per_second": 14368.011
},
{
"epoch": 2.6345993322203674,
"grad_norm": 2.639854907989502,
"learning_rate": 2.8045440317195326e-05,
"loss": 1.4175,
"num_input_tokens_seen": 29256878,
"step": 50500,
"train_runtime": 2034.718,
"train_tokens_per_second": 14378.837
},
{
"epoch": 2.6606844741235394,
"grad_norm": 4.10645055770874,
"learning_rate": 2.7828064134668898e-05,
"loss": 1.4229,
"num_input_tokens_seen": 29545014,
"step": 51000,
"train_runtime": 2053.4349,
"train_tokens_per_second": 14388.094
},
{
"epoch": 2.6867696160267114,
"grad_norm": 3.233084201812744,
"learning_rate": 2.7610687952142463e-05,
"loss": 1.4396,
"num_input_tokens_seen": 29832302,
"step": 51500,
"train_runtime": 2072.1004,
"train_tokens_per_second": 14397.132
},
{
"epoch": 2.7128547579298834,
"grad_norm": 3.0811736583709717,
"learning_rate": 2.7393311769616027e-05,
"loss": 1.4417,
"num_input_tokens_seen": 30124678,
"step": 52000,
"train_runtime": 2090.765,
"train_tokens_per_second": 14408.448
},
{
"epoch": 2.738939899833055,
"grad_norm": 3.9066579341888428,
"learning_rate": 2.7175935587089595e-05,
"loss": 1.42,
"num_input_tokens_seen": 30411006,
"step": 52500,
"train_runtime": 2109.4596,
"train_tokens_per_second": 14416.492
},
{
"epoch": 2.765025041736227,
"grad_norm": 3.752941131591797,
"learning_rate": 2.695855940456316e-05,
"loss": 1.4416,
"num_input_tokens_seen": 30697118,
"step": 53000,
"train_runtime": 2128.1961,
"train_tokens_per_second": 14424.008
},
{
"epoch": 2.791110183639399,
"grad_norm": 2.2906174659729004,
"learning_rate": 2.6741183222036732e-05,
"loss": 1.434,
"num_input_tokens_seen": 30985038,
"step": 53500,
"train_runtime": 2146.9172,
"train_tokens_per_second": 14432.339
},
{
"epoch": 2.817195325542571,
"grad_norm": 4.612029075622559,
"learning_rate": 2.6523807039510297e-05,
"loss": 1.4167,
"num_input_tokens_seen": 31273350,
"step": 54000,
"train_runtime": 2165.6016,
"train_tokens_per_second": 14440.952
},
{
"epoch": 2.843280467445743,
"grad_norm": 2.9580113887786865,
"learning_rate": 2.6306430856983865e-05,
"loss": 1.4059,
"num_input_tokens_seen": 31560206,
"step": 54500,
"train_runtime": 2184.355,
"train_tokens_per_second": 14448.295
},
{
"epoch": 2.8693656093489146,
"grad_norm": 3.1787197589874268,
"learning_rate": 2.608905467445743e-05,
"loss": 1.4472,
"num_input_tokens_seen": 31852006,
"step": 55000,
"train_runtime": 2203.0469,
"train_tokens_per_second": 14458.161
},
{
"epoch": 2.8954507512520866,
"grad_norm": 2.0112416744232178,
"learning_rate": 2.5871678491930994e-05,
"loss": 1.4311,
"num_input_tokens_seen": 32138366,
"step": 55500,
"train_runtime": 2221.6719,
"train_tokens_per_second": 14465.847
},
{
"epoch": 2.9215358931552586,
"grad_norm": 1.9806029796600342,
"learning_rate": 2.5654302309404566e-05,
"loss": 1.4348,
"num_input_tokens_seen": 32427294,
"step": 56000,
"train_runtime": 2240.3821,
"train_tokens_per_second": 14474.002
},
{
"epoch": 2.9476210350584306,
"grad_norm": 1.9818835258483887,
"learning_rate": 2.543692612687813e-05,
"loss": 1.4442,
"num_input_tokens_seen": 32714750,
"step": 56500,
"train_runtime": 2259.0685,
"train_tokens_per_second": 14481.522
},
{
"epoch": 2.9737061769616027,
"grad_norm": 2.794255256652832,
"learning_rate": 2.52195499443517e-05,
"loss": 1.4452,
"num_input_tokens_seen": 33004950,
"step": 57000,
"train_runtime": 2277.7337,
"train_tokens_per_second": 14490.258
},
{
"epoch": 2.9997913188647747,
"grad_norm": 3.825054407119751,
"learning_rate": 2.5002173761825263e-05,
"loss": 1.4031,
"num_input_tokens_seen": 33292886,
"step": 57500,
"train_runtime": 2296.3777,
"train_tokens_per_second": 14498.001
},
{
"epoch": 3.0,
"eval_loss": 1.3332206010818481,
"eval_runtime": 45.0681,
"eval_samples_per_second": 850.602,
"eval_steps_per_second": 106.328,
"num_input_tokens_seen": 33294704,
"step": 57504
},
{
"epoch": 3.0258764607679467,
"grad_norm": 3.42480731010437,
"learning_rate": 2.478479757929883e-05,
"loss": 1.3848,
"num_input_tokens_seen": 33584784,
"step": 58000,
"train_runtime": 2361.2516,
"train_tokens_per_second": 14223.298
},
{
"epoch": 3.0519616026711187,
"grad_norm": 2.5299935340881348,
"learning_rate": 2.45674213967724e-05,
"loss": 1.3964,
"num_input_tokens_seen": 33871192,
"step": 58500,
"train_runtime": 2379.8401,
"train_tokens_per_second": 14232.55
},
{
"epoch": 3.0780467445742903,
"grad_norm": 2.3154349327087402,
"learning_rate": 2.4350045214245968e-05,
"loss": 1.4092,
"num_input_tokens_seen": 34162736,
"step": 59000,
"train_runtime": 2398.5047,
"train_tokens_per_second": 14243.348
},
{
"epoch": 3.1041318864774623,
"grad_norm": 3.183199167251587,
"learning_rate": 2.4132669031719536e-05,
"loss": 1.4007,
"num_input_tokens_seen": 34452880,
"step": 59500,
"train_runtime": 2417.223,
"train_tokens_per_second": 14253.083
},
{
"epoch": 3.1302170283806343,
"grad_norm": 2.856942892074585,
"learning_rate": 2.39152928491931e-05,
"loss": 1.407,
"num_input_tokens_seen": 34740064,
"step": 60000,
"train_runtime": 2435.9312,
"train_tokens_per_second": 14261.513
},
{
"epoch": 3.1563021702838063,
"grad_norm": 3.0104143619537354,
"learning_rate": 2.3697916666666666e-05,
"loss": 1.3869,
"num_input_tokens_seen": 35033296,
"step": 60500,
"train_runtime": 2454.6106,
"train_tokens_per_second": 14272.446
},
{
"epoch": 3.1823873121869783,
"grad_norm": 2.1120755672454834,
"learning_rate": 2.3480540484140234e-05,
"loss": 1.4128,
"num_input_tokens_seen": 35326400,
"step": 61000,
"train_runtime": 2473.3018,
"train_tokens_per_second": 14283.093
},
{
"epoch": 3.2084724540901504,
"grad_norm": 2.3867533206939697,
"learning_rate": 2.3263164301613802e-05,
"loss": 1.421,
"num_input_tokens_seen": 35610096,
"step": 61500,
"train_runtime": 2491.98,
"train_tokens_per_second": 14289.88
},
{
"epoch": 3.2345575959933224,
"grad_norm": 2.934441566467285,
"learning_rate": 2.304578811908737e-05,
"loss": 1.4507,
"num_input_tokens_seen": 35899736,
"step": 62000,
"train_runtime": 2510.6844,
"train_tokens_per_second": 14298.785
},
{
"epoch": 3.260642737896494,
"grad_norm": 1.9727118015289307,
"learning_rate": 2.2828411936560938e-05,
"loss": 1.4167,
"num_input_tokens_seen": 36185200,
"step": 62500,
"train_runtime": 2529.3663,
"train_tokens_per_second": 14306.034
},
{
"epoch": 3.286727879799666,
"grad_norm": 2.6939632892608643,
"learning_rate": 2.2611035754034503e-05,
"loss": 1.4152,
"num_input_tokens_seen": 36476040,
"step": 63000,
"train_runtime": 2548.104,
"train_tokens_per_second": 14314.973
},
{
"epoch": 3.312813021702838,
"grad_norm": 2.878223180770874,
"learning_rate": 2.2393659571508068e-05,
"loss": 1.4027,
"num_input_tokens_seen": 36776288,
"step": 63500,
"train_runtime": 2566.9571,
"train_tokens_per_second": 14326.803
},
{
"epoch": 3.33889816360601,
"grad_norm": 2.485452175140381,
"learning_rate": 2.2176283388981636e-05,
"loss": 1.3992,
"num_input_tokens_seen": 37063960,
"step": 64000,
"train_runtime": 2585.6586,
"train_tokens_per_second": 14334.437
},
{
"epoch": 3.364983305509182,
"grad_norm": 3.862046241760254,
"learning_rate": 2.1958907206455204e-05,
"loss": 1.3968,
"num_input_tokens_seen": 37353184,
"step": 64500,
"train_runtime": 2604.3949,
"train_tokens_per_second": 14342.366
},
{
"epoch": 3.391068447412354,
"grad_norm": 2.4618258476257324,
"learning_rate": 2.1741531023928772e-05,
"loss": 1.4059,
"num_input_tokens_seen": 37648648,
"step": 65000,
"train_runtime": 2623.1097,
"train_tokens_per_second": 14352.678
},
{
"epoch": 3.417153589315526,
"grad_norm": 2.7443792819976807,
"learning_rate": 2.152415484140234e-05,
"loss": 1.3809,
"num_input_tokens_seen": 37936072,
"step": 65500,
"train_runtime": 2641.8438,
"train_tokens_per_second": 14359.695
},
{
"epoch": 3.443238731218698,
"grad_norm": 2.808088541030884,
"learning_rate": 2.1306778658875905e-05,
"loss": 1.4118,
"num_input_tokens_seen": 38225568,
"step": 66000,
"train_runtime": 2660.549,
"train_tokens_per_second": 14367.549
},
{
"epoch": 3.4693238731218696,
"grad_norm": 2.7997331619262695,
"learning_rate": 2.1089402476349473e-05,
"loss": 1.404,
"num_input_tokens_seen": 38512144,
"step": 66500,
"train_runtime": 2679.274,
"train_tokens_per_second": 14374.097
},
{
"epoch": 3.4954090150250416,
"grad_norm": 2.4735493659973145,
"learning_rate": 2.0872026293823038e-05,
"loss": 1.4271,
"num_input_tokens_seen": 38797344,
"step": 67000,
"train_runtime": 2697.9506,
"train_tokens_per_second": 14380.302
},
{
"epoch": 3.5214941569282137,
"grad_norm": 4.414172172546387,
"learning_rate": 2.0654650111296606e-05,
"loss": 1.3969,
"num_input_tokens_seen": 39085088,
"step": 67500,
"train_runtime": 2716.6451,
"train_tokens_per_second": 14387.263
},
{
"epoch": 3.5475792988313857,
"grad_norm": 2.165419340133667,
"learning_rate": 2.0437273928770174e-05,
"loss": 1.4137,
"num_input_tokens_seen": 39369904,
"step": 68000,
"train_runtime": 2735.364,
"train_tokens_per_second": 14392.93
},
{
"epoch": 3.5736644407345577,
"grad_norm": 2.251249074935913,
"learning_rate": 2.021989774624374e-05,
"loss": 1.4066,
"num_input_tokens_seen": 39661008,
"step": 68500,
"train_runtime": 2754.1198,
"train_tokens_per_second": 14400.611
},
{
"epoch": 3.5997495826377297,
"grad_norm": 2.874959945678711,
"learning_rate": 2.0002521563717307e-05,
"loss": 1.3949,
"num_input_tokens_seen": 39953968,
"step": 69000,
"train_runtime": 2772.8706,
"train_tokens_per_second": 14408.883
},
{
"epoch": 3.6258347245409013,
"grad_norm": 2.662647008895874,
"learning_rate": 1.9785145381190875e-05,
"loss": 1.4054,
"num_input_tokens_seen": 40240768,
"step": 69500,
"train_runtime": 2791.6372,
"train_tokens_per_second": 14414.756
},
{
"epoch": 3.6519198664440733,
"grad_norm": 2.5272815227508545,
"learning_rate": 1.9567769198664444e-05,
"loss": 1.4323,
"num_input_tokens_seen": 40533416,
"step": 70000,
"train_runtime": 2810.3654,
"train_tokens_per_second": 14422.827
},
{
"epoch": 3.6780050083472453,
"grad_norm": 2.721334457397461,
"learning_rate": 1.9350393016138008e-05,
"loss": 1.3872,
"num_input_tokens_seen": 40825024,
"step": 70500,
"train_runtime": 2829.08,
"train_tokens_per_second": 14430.495
},
{
"epoch": 3.7040901502504173,
"grad_norm": 2.5722897052764893,
"learning_rate": 1.9133016833611576e-05,
"loss": 1.372,
"num_input_tokens_seen": 41113376,
"step": 71000,
"train_runtime": 2847.8223,
"train_tokens_per_second": 14436.777
},
{
"epoch": 3.7301752921535893,
"grad_norm": 2.262794256210327,
"learning_rate": 1.891564065108514e-05,
"loss": 1.3728,
"num_input_tokens_seen": 41401936,
"step": 71500,
"train_runtime": 2866.4955,
"train_tokens_per_second": 14443.398
},
{
"epoch": 3.7562604340567614,
"grad_norm": 2.6011643409729004,
"learning_rate": 1.869826446855871e-05,
"loss": 1.3901,
"num_input_tokens_seen": 41689120,
"step": 72000,
"train_runtime": 2885.163,
"train_tokens_per_second": 14449.485
},
{
"epoch": 3.7823455759599334,
"grad_norm": 2.6435554027557373,
"learning_rate": 1.8480888286032277e-05,
"loss": 1.4071,
"num_input_tokens_seen": 41974720,
"step": 72500,
"train_runtime": 2903.8827,
"train_tokens_per_second": 14454.689
},
{
"epoch": 3.8084307178631054,
"grad_norm": 2.489372730255127,
"learning_rate": 1.8263512103505846e-05,
"loss": 1.4023,
"num_input_tokens_seen": 42264016,
"step": 73000,
"train_runtime": 2922.5501,
"train_tokens_per_second": 14461.349
},
{
"epoch": 3.8345158597662774,
"grad_norm": 2.4132964611053467,
"learning_rate": 1.8046135920979414e-05,
"loss": 1.4153,
"num_input_tokens_seen": 42558416,
"step": 73500,
"train_runtime": 2941.2299,
"train_tokens_per_second": 14469.599
},
{
"epoch": 3.860601001669449,
"grad_norm": 3.1832597255706787,
"learning_rate": 1.782875973845298e-05,
"loss": 1.4076,
"num_input_tokens_seen": 42847504,
"step": 74000,
"train_runtime": 2959.9571,
"train_tokens_per_second": 14475.718
},
{
"epoch": 3.886686143572621,
"grad_norm": 2.246975898742676,
"learning_rate": 1.7611383555926543e-05,
"loss": 1.3755,
"num_input_tokens_seen": 43137392,
"step": 74500,
"train_runtime": 2978.6745,
"train_tokens_per_second": 14482.077
},
{
"epoch": 3.912771285475793,
"grad_norm": 3.47536039352417,
"learning_rate": 1.739400737340011e-05,
"loss": 1.3837,
"num_input_tokens_seen": 43421200,
"step": 75000,
"train_runtime": 2997.3314,
"train_tokens_per_second": 14486.62
},
{
"epoch": 3.938856427378965,
"grad_norm": 2.817647695541382,
"learning_rate": 1.717663119087368e-05,
"loss": 1.3869,
"num_input_tokens_seen": 43714432,
"step": 75500,
"train_runtime": 3015.9535,
"train_tokens_per_second": 14494.399
},
{
"epoch": 3.964941569282137,
"grad_norm": 2.670565366744995,
"learning_rate": 1.6959255008347248e-05,
"loss": 1.3875,
"num_input_tokens_seen": 44005040,
"step": 76000,
"train_runtime": 3034.653,
"train_tokens_per_second": 14500.847
},
{
"epoch": 3.9910267111853086,
"grad_norm": 3.01701021194458,
"learning_rate": 1.6741878825820816e-05,
"loss": 1.3875,
"num_input_tokens_seen": 44295304,
"step": 76500,
"train_runtime": 3053.3496,
"train_tokens_per_second": 14507.118
},
{
"epoch": 4.0,
"eval_loss": 1.3256505727767944,
"eval_runtime": 45.046,
"eval_samples_per_second": 851.018,
"eval_steps_per_second": 106.38,
"num_input_tokens_seen": 44395724,
"step": 76672
},
{
"epoch": 4.017111853088481,
"grad_norm": 2.520019292831421,
"learning_rate": 1.652450264329438e-05,
"loss": 1.3838,
"num_input_tokens_seen": 44585564,
"step": 77000,
"train_runtime": 3118.0069,
"train_tokens_per_second": 14299.379
},
{
"epoch": 4.043196994991653,
"grad_norm": 4.146509170532227,
"learning_rate": 1.6307126460767945e-05,
"loss": 1.3596,
"num_input_tokens_seen": 44870940,
"step": 77500,
"train_runtime": 3136.5879,
"train_tokens_per_second": 14305.654
},
{
"epoch": 4.069282136894825,
"grad_norm": 2.3407187461853027,
"learning_rate": 1.6089750278241514e-05,
"loss": 1.3979,
"num_input_tokens_seen": 45165140,
"step": 78000,
"train_runtime": 3155.153,
"train_tokens_per_second": 14314.723
},
{
"epoch": 4.095367278797997,
"grad_norm": 2.992572069168091,
"learning_rate": 1.5872374095715082e-05,
"loss": 1.4121,
"num_input_tokens_seen": 45458076,
"step": 78500,
"train_runtime": 3173.7885,
"train_tokens_per_second": 14322.97
},
{
"epoch": 4.121452420701169,
"grad_norm": 3.490511655807495,
"learning_rate": 1.565499791318865e-05,
"loss": 1.37,
"num_input_tokens_seen": 45746588,
"step": 79000,
"train_runtime": 3192.4179,
"train_tokens_per_second": 14329.762
},
{
"epoch": 4.147537562604341,
"grad_norm": 3.6620404720306396,
"learning_rate": 1.5437621730662215e-05,
"loss": 1.398,
"num_input_tokens_seen": 46037020,
"step": 79500,
"train_runtime": 3212.1684,
"train_tokens_per_second": 14332.069
},
{
"epoch": 4.173622704507513,
"grad_norm": 2.709702253341675,
"learning_rate": 1.5220245548135783e-05,
"loss": 1.3714,
"num_input_tokens_seen": 46327764,
"step": 80000,
"train_runtime": 3232.3645,
"train_tokens_per_second": 14332.469
},
{
"epoch": 4.199707846410685,
"grad_norm": 3.0171260833740234,
"learning_rate": 1.5002869365609348e-05,
"loss": 1.3777,
"num_input_tokens_seen": 46608924,
"step": 80500,
"train_runtime": 3252.0642,
"train_tokens_per_second": 14332.104
},
{
"epoch": 4.225792988313857,
"grad_norm": 2.588928461074829,
"learning_rate": 1.4785493183082916e-05,
"loss": 1.3768,
"num_input_tokens_seen": 46898436,
"step": 81000,
"train_runtime": 3271.9745,
"train_tokens_per_second": 14333.375
},
{
"epoch": 4.251878130217029,
"grad_norm": 2.5653598308563232,
"learning_rate": 1.4568117000556484e-05,
"loss": 1.3753,
"num_input_tokens_seen": 47187548,
"step": 81500,
"train_runtime": 3291.5411,
"train_tokens_per_second": 14336.005
},
{
"epoch": 4.2779632721202,
"grad_norm": 3.236936330795288,
"learning_rate": 1.435074081803005e-05,
"loss": 1.3987,
"num_input_tokens_seen": 47475276,
"step": 82000,
"train_runtime": 3311.1953,
"train_tokens_per_second": 14337.806
},
{
"epoch": 4.304048414023372,
"grad_norm": 2.4497241973876953,
"learning_rate": 1.4133364635503618e-05,
"loss": 1.36,
"num_input_tokens_seen": 47768556,
"step": 82500,
"train_runtime": 3330.633,
"train_tokens_per_second": 14342.185
},
{
"epoch": 4.330133555926544,
"grad_norm": 3.381693124771118,
"learning_rate": 1.3915988452977185e-05,
"loss": 1.4122,
"num_input_tokens_seen": 48056012,
"step": 83000,
"train_runtime": 3350.0565,
"train_tokens_per_second": 14344.836
},
{
"epoch": 4.356218697829716,
"grad_norm": 2.8100342750549316,
"learning_rate": 1.3698612270450753e-05,
"loss": 1.3836,
"num_input_tokens_seen": 48341348,
"step": 83500,
"train_runtime": 3369.3072,
"train_tokens_per_second": 14347.563
},
{
"epoch": 4.382303839732888,
"grad_norm": 3.380335569381714,
"learning_rate": 1.3481236087924318e-05,
"loss": 1.3726,
"num_input_tokens_seen": 48631420,
"step": 84000,
"train_runtime": 3389.3206,
"train_tokens_per_second": 14348.427
},
{
"epoch": 4.40838898163606,
"grad_norm": 2.434285879135132,
"learning_rate": 1.3263859905397884e-05,
"loss": 1.3937,
"num_input_tokens_seen": 48915972,
"step": 84500,
"train_runtime": 3409.284,
"train_tokens_per_second": 14347.873
},
{
"epoch": 4.434474123539232,
"grad_norm": 2.8802988529205322,
"learning_rate": 1.3046483722871452e-05,
"loss": 1.3761,
"num_input_tokens_seen": 49203916,
"step": 85000,
"train_runtime": 3428.2362,
"train_tokens_per_second": 14352.545
},
{
"epoch": 4.460559265442404,
"grad_norm": 3.350780963897705,
"learning_rate": 1.282910754034502e-05,
"loss": 1.3766,
"num_input_tokens_seen": 49493860,
"step": 85500,
"train_runtime": 3447.3803,
"train_tokens_per_second": 14356.948
},
{
"epoch": 4.486644407345576,
"grad_norm": 2.4271440505981445,
"learning_rate": 1.2611731357818587e-05,
"loss": 1.3672,
"num_input_tokens_seen": 49778012,
"step": 86000,
"train_runtime": 3466.8453,
"train_tokens_per_second": 14358.302
},
{
"epoch": 4.512729549248748,
"grad_norm": 2.5384743213653564,
"learning_rate": 1.2394355175292154e-05,
"loss": 1.3701,
"num_input_tokens_seen": 50065764,
"step": 86500,
"train_runtime": 3486.5719,
"train_tokens_per_second": 14359.596
},
{
"epoch": 4.53881469115192,
"grad_norm": 3.011307716369629,
"learning_rate": 1.2176978992765722e-05,
"loss": 1.3884,
"num_input_tokens_seen": 50349860,
"step": 87000,
"train_runtime": 3505.9535,
"train_tokens_per_second": 14361.246
},
{
"epoch": 4.564899833055092,
"grad_norm": 2.5870578289031982,
"learning_rate": 1.1959602810239288e-05,
"loss": 1.3991,
"num_input_tokens_seen": 50643260,
"step": 87500,
"train_runtime": 3525.1982,
"train_tokens_per_second": 14366.074
},
{
"epoch": 4.590984974958264,
"grad_norm": 3.0917413234710693,
"learning_rate": 1.1742226627712856e-05,
"loss": 1.3876,
"num_input_tokens_seen": 50934732,
"step": 88000,
"train_runtime": 3544.4536,
"train_tokens_per_second": 14370.264
},
{
"epoch": 4.617070116861436,
"grad_norm": 2.181250810623169,
"learning_rate": 1.1524850445186423e-05,
"loss": 1.3801,
"num_input_tokens_seen": 51225644,
"step": 88500,
"train_runtime": 3563.7836,
"train_tokens_per_second": 14373.949
},
{
"epoch": 4.643155258764608,
"grad_norm": 3.146324872970581,
"learning_rate": 1.130747426265999e-05,
"loss": 1.3451,
"num_input_tokens_seen": 51515932,
"step": 89000,
"train_runtime": 3583.4863,
"train_tokens_per_second": 14375.925
},
{
"epoch": 4.66924040066778,
"grad_norm": 2.4125654697418213,
"learning_rate": 1.1090098080133557e-05,
"loss": 1.3759,
"num_input_tokens_seen": 51803372,
"step": 89500,
"train_runtime": 3602.6645,
"train_tokens_per_second": 14379.183
},
{
"epoch": 4.695325542570951,
"grad_norm": 3.1065971851348877,
"learning_rate": 1.0872721897607122e-05,
"loss": 1.3846,
"num_input_tokens_seen": 52096660,
"step": 90000,
"train_runtime": 3621.3864,
"train_tokens_per_second": 14385.833
},
{
"epoch": 4.721410684474123,
"grad_norm": 2.9472384452819824,
"learning_rate": 1.065534571508069e-05,
"loss": 1.3826,
"num_input_tokens_seen": 52385124,
"step": 90500,
"train_runtime": 3640.3069,
"train_tokens_per_second": 14390.304
},
{
"epoch": 4.747495826377295,
"grad_norm": 3.2821028232574463,
"learning_rate": 1.0437969532554258e-05,
"loss": 1.3913,
"num_input_tokens_seen": 52675284,
"step": 91000,
"train_runtime": 3659.1435,
"train_tokens_per_second": 14395.523
},
{
"epoch": 4.773580968280467,
"grad_norm": 2.897390604019165,
"learning_rate": 1.0220593350027825e-05,
"loss": 1.3745,
"num_input_tokens_seen": 52966012,
"step": 91500,
"train_runtime": 3677.8728,
"train_tokens_per_second": 14401.263
},
{
"epoch": 4.799666110183639,
"grad_norm": 2.4328722953796387,
"learning_rate": 1.0003217167501391e-05,
"loss": 1.3675,
"num_input_tokens_seen": 53260060,
"step": 92000,
"train_runtime": 3696.7483,
"train_tokens_per_second": 14407.272
},
{
"epoch": 4.825751252086811,
"grad_norm": 2.3648526668548584,
"learning_rate": 9.78584098497496e-06,
"loss": 1.348,
"num_input_tokens_seen": 53549900,
"step": 92500,
"train_runtime": 3715.4001,
"train_tokens_per_second": 14412.956
},
{
"epoch": 4.851836393989983,
"grad_norm": 2.3531742095947266,
"learning_rate": 9.568464802448526e-06,
"loss": 1.3779,
"num_input_tokens_seen": 53844180,
"step": 93000,
"train_runtime": 3734.2446,
"train_tokens_per_second": 14419.029
},
{
"epoch": 4.877921535893155,
"grad_norm": 2.4701406955718994,
"learning_rate": 9.351088619922092e-06,
"loss": 1.3688,
"num_input_tokens_seen": 54132452,
"step": 93500,
"train_runtime": 3752.9114,
"train_tokens_per_second": 14424.122
},
{
"epoch": 4.904006677796327,
"grad_norm": 3.4860074520111084,
"learning_rate": 9.13371243739566e-06,
"loss": 1.3786,
"num_input_tokens_seen": 54424212,
"step": 94000,
"train_runtime": 3771.7803,
"train_tokens_per_second": 14429.316
},
{
"epoch": 4.930091819699499,
"grad_norm": 2.331005811691284,
"learning_rate": 8.916336254869227e-06,
"loss": 1.3582,
"num_input_tokens_seen": 54719684,
"step": 94500,
"train_runtime": 3790.6832,
"train_tokens_per_second": 14435.309
},
{
"epoch": 4.956176961602671,
"grad_norm": 2.379862070083618,
"learning_rate": 8.698960072342793e-06,
"loss": 1.3838,
"num_input_tokens_seen": 55006740,
"step": 95000,
"train_runtime": 3809.755,
"train_tokens_per_second": 14438.393
},
{
"epoch": 4.982262103505843,
"grad_norm": 3.527317523956299,
"learning_rate": 8.481583889816362e-06,
"loss": 1.3944,
"num_input_tokens_seen": 55294876,
"step": 95500,
"train_runtime": 3829.1057,
"train_tokens_per_second": 14440.676
},
{
"epoch": 5.0,
"eval_loss": 1.3229724168777466,
"eval_runtime": 46.7304,
"eval_samples_per_second": 820.343,
"eval_steps_per_second": 102.546,
"num_input_tokens_seen": 55492754,
"step": 95840
},
{
"epoch": 5.008347245409015,
"grad_norm": 2.8223490715026855,
"learning_rate": 8.264207707289928e-06,
"loss": 1.3501,
"num_input_tokens_seen": 55585722,
"step": 96000,
"train_runtime": 3896.8789,
"train_tokens_per_second": 14264.165
},
{
"epoch": 5.034432387312187,
"grad_norm": 3.312976360321045,
"learning_rate": 8.046831524763496e-06,
"loss": 1.364,
"num_input_tokens_seen": 55873162,
"step": 96500,
"train_runtime": 3916.5275,
"train_tokens_per_second": 14265.995
},
{
"epoch": 5.060517529215359,
"grad_norm": 4.365355491638184,
"learning_rate": 7.829455342237061e-06,
"loss": 1.3657,
"num_input_tokens_seen": 56159210,
"step": 97000,
"train_runtime": 3935.5771,
"train_tokens_per_second": 14269.625
},
{
"epoch": 5.086602671118531,
"grad_norm": 2.77451753616333,
"learning_rate": 7.612079159710629e-06,
"loss": 1.3722,
"num_input_tokens_seen": 56450234,
"step": 97500,
"train_runtime": 3954.8081,
"train_tokens_per_second": 14273.824
},
{
"epoch": 5.112687813021703,
"grad_norm": 2.028353214263916,
"learning_rate": 7.3947029771841964e-06,
"loss": 1.3778,
"num_input_tokens_seen": 56740002,
"step": 98000,
"train_runtime": 3973.4854,
"train_tokens_per_second": 14279.655
},
{
"epoch": 5.138772954924875,
"grad_norm": 2.0676374435424805,
"learning_rate": 7.177326794657763e-06,
"loss": 1.3462,
"num_input_tokens_seen": 57027226,
"step": 98500,
"train_runtime": 3992.3304,
"train_tokens_per_second": 14284.195
},
{
"epoch": 5.164858096828047,
"grad_norm": 2.0867531299591064,
"learning_rate": 6.95995061213133e-06,
"loss": 1.3739,
"num_input_tokens_seen": 57316978,
"step": 99000,
"train_runtime": 4012.1011,
"train_tokens_per_second": 14286.025
},
{
"epoch": 5.190943238731219,
"grad_norm": 2.3995723724365234,
"learning_rate": 6.7425744296048975e-06,
"loss": 1.3821,
"num_input_tokens_seen": 57607834,
"step": 99500,
"train_runtime": 4031.6912,
"train_tokens_per_second": 14288.752
},
{
"epoch": 5.217028380634391,
"grad_norm": 3.466399669647217,
"learning_rate": 6.525198247078465e-06,
"loss": 1.3499,
"num_input_tokens_seen": 57896786,
"step": 100000,
"train_runtime": 4051.2038,
"train_tokens_per_second": 14291.255
},
{
"epoch": 5.243113522537563,
"grad_norm": 2.673947811126709,
"learning_rate": 6.307822064552031e-06,
"loss": 1.3703,
"num_input_tokens_seen": 58184506,
"step": 100500,
"train_runtime": 4070.3919,
"train_tokens_per_second": 14294.571
},
{
"epoch": 5.269198664440735,
"grad_norm": 2.0675642490386963,
"learning_rate": 6.0904458820255986e-06,
"loss": 1.3759,
"num_input_tokens_seen": 58473186,
"step": 101000,
"train_runtime": 4090.425,
"train_tokens_per_second": 14295.137
},
{
"epoch": 5.295283806343907,
"grad_norm": 2.8680272102355957,
"learning_rate": 5.873069699499165e-06,
"loss": 1.3811,
"num_input_tokens_seen": 58764498,
"step": 101500,
"train_runtime": 4109.7435,
"train_tokens_per_second": 14298.824
},
{
"epoch": 5.321368948247079,
"grad_norm": 3.1335153579711914,
"learning_rate": 5.655693516972733e-06,
"loss": 1.3914,
"num_input_tokens_seen": 59053762,
"step": 102000,
"train_runtime": 4129.2443,
"train_tokens_per_second": 14301.348
},
{
"epoch": 5.347454090150251,
"grad_norm": 4.179940223693848,
"learning_rate": 5.4383173344463e-06,
"loss": 1.3353,
"num_input_tokens_seen": 59346138,
"step": 102500,
"train_runtime": 4148.9629,
"train_tokens_per_second": 14303.849
},
{
"epoch": 5.373539232053423,
"grad_norm": 2.837871551513672,
"learning_rate": 5.220941151919867e-06,
"loss": 1.3592,
"num_input_tokens_seen": 59634050,
"step": 103000,
"train_runtime": 4172.193,
"train_tokens_per_second": 14293.215
},
{
"epoch": 5.399624373956595,
"grad_norm": 2.620933771133423,
"learning_rate": 5.003564969393433e-06,
"loss": 1.3438,
"num_input_tokens_seen": 59920002,
"step": 103500,
"train_runtime": 4191.8017,
"train_tokens_per_second": 14294.57
},
{
"epoch": 5.425709515859766,
"grad_norm": 2.974597454071045,
"learning_rate": 4.786188786867001e-06,
"loss": 1.3848,
"num_input_tokens_seen": 60208490,
"step": 104000,
"train_runtime": 4210.5451,
"train_tokens_per_second": 14299.453
},
{
"epoch": 5.451794657762938,
"grad_norm": 2.7892649173736572,
"learning_rate": 4.568812604340568e-06,
"loss": 1.3947,
"num_input_tokens_seen": 60497570,
"step": 104500,
"train_runtime": 4229.6543,
"train_tokens_per_second": 14303.195
},
{
"epoch": 5.47787979966611,
"grad_norm": 2.9217751026153564,
"learning_rate": 4.3514364218141344e-06,
"loss": 1.3637,
"num_input_tokens_seen": 60791682,
"step": 105000,
"train_runtime": 4249.1377,
"train_tokens_per_second": 14306.828
},
{
"epoch": 5.503964941569282,
"grad_norm": 2.3021788597106934,
"learning_rate": 4.134060239287702e-06,
"loss": 1.3772,
"num_input_tokens_seen": 61081546,
"step": 105500,
"train_runtime": 4268.0879,
"train_tokens_per_second": 14311.22
},
{
"epoch": 5.530050083472454,
"grad_norm": 2.520854949951172,
"learning_rate": 3.916684056761269e-06,
"loss": 1.3595,
"num_input_tokens_seen": 61376714,
"step": 106000,
"train_runtime": 4287.1193,
"train_tokens_per_second": 14316.54
},
{
"epoch": 5.556135225375626,
"grad_norm": 2.5124387741088867,
"learning_rate": 3.6993078742348355e-06,
"loss": 1.3755,
"num_input_tokens_seen": 61670282,
"step": 106500,
"train_runtime": 4306.7613,
"train_tokens_per_second": 14319.41
},
{
"epoch": 5.582220367278798,
"grad_norm": 3.6542813777923584,
"learning_rate": 3.4819316917084032e-06,
"loss": 1.3299,
"num_input_tokens_seen": 61959530,
"step": 107000,
"train_runtime": 4325.9495,
"train_tokens_per_second": 14322.758
},
{
"epoch": 5.60830550918197,
"grad_norm": 2.480987787246704,
"learning_rate": 3.2645555091819697e-06,
"loss": 1.3488,
"num_input_tokens_seen": 62248610,
"step": 107500,
"train_runtime": 4344.6789,
"train_tokens_per_second": 14327.551
},
{
"epoch": 5.634390651085142,
"grad_norm": 3.620051383972168,
"learning_rate": 3.047179326655537e-06,
"loss": 1.3663,
"num_input_tokens_seen": 62535434,
"step": 108000,
"train_runtime": 4363.4204,
"train_tokens_per_second": 14331.746
},
{
"epoch": 5.660475792988314,
"grad_norm": 2.9154930114746094,
"learning_rate": 2.8298031441291043e-06,
"loss": 1.3719,
"num_input_tokens_seen": 62824930,
"step": 108500,
"train_runtime": 4382.1169,
"train_tokens_per_second": 14336.662
},
{
"epoch": 5.686560934891486,
"grad_norm": 2.5228476524353027,
"learning_rate": 2.612426961602671e-06,
"loss": 1.3476,
"num_input_tokens_seen": 63114954,
"step": 109000,
"train_runtime": 4400.737,
"train_tokens_per_second": 14341.905
},
{
"epoch": 5.712646076794658,
"grad_norm": 2.6546239852905273,
"learning_rate": 2.3950507790762385e-06,
"loss": 1.3474,
"num_input_tokens_seen": 63403826,
"step": 109500,
"train_runtime": 4419.5486,
"train_tokens_per_second": 14346.222
},
{
"epoch": 5.73873121869783,
"grad_norm": 3.8582890033721924,
"learning_rate": 2.1776745965498054e-06,
"loss": 1.3451,
"num_input_tokens_seen": 63689762,
"step": 110000,
"train_runtime": 4438.8474,
"train_tokens_per_second": 14348.266
},
{
"epoch": 5.764816360601001,
"grad_norm": 3.4054343700408936,
"learning_rate": 1.9602984140233727e-06,
"loss": 1.3811,
"num_input_tokens_seen": 63978794,
"step": 110500,
"train_runtime": 4458.4476,
"train_tokens_per_second": 14350.016
},
{
"epoch": 5.790901502504173,
"grad_norm": 2.907578468322754,
"learning_rate": 1.7429222314969393e-06,
"loss": 1.3843,
"num_input_tokens_seen": 64270234,
"step": 111000,
"train_runtime": 4478.4361,
"train_tokens_per_second": 14351.044
},
{
"epoch": 5.816986644407345,
"grad_norm": 2.72294020652771,
"learning_rate": 1.5255460489705064e-06,
"loss": 1.3511,
"num_input_tokens_seen": 64557130,
"step": 111500,
"train_runtime": 4498.1115,
"train_tokens_per_second": 14352.052
},
{
"epoch": 5.843071786310517,
"grad_norm": 2.910423755645752,
"learning_rate": 1.3081698664440735e-06,
"loss": 1.355,
"num_input_tokens_seen": 64847634,
"step": 112000,
"train_runtime": 4517.4916,
"train_tokens_per_second": 14354.788
},
{
"epoch": 5.869156928213689,
"grad_norm": 2.3920516967773438,
"learning_rate": 1.0907936839176406e-06,
"loss": 1.3696,
"num_input_tokens_seen": 65135722,
"step": 112500,
"train_runtime": 4536.2006,
"train_tokens_per_second": 14359.092
},
{
"epoch": 5.895242070116861,
"grad_norm": 2.619903087615967,
"learning_rate": 8.734175013912075e-07,
"loss": 1.3515,
"num_input_tokens_seen": 65423234,
"step": 113000,
"train_runtime": 4554.8848,
"train_tokens_per_second": 14363.312
},
{
"epoch": 5.921327212020033,
"grad_norm": 2.61676025390625,
"learning_rate": 6.560413188647746e-07,
"loss": 1.3784,
"num_input_tokens_seen": 65718338,
"step": 113500,
"train_runtime": 4573.7838,
"train_tokens_per_second": 14368.484
},
{
"epoch": 5.947412353923205,
"grad_norm": 2.6655712127685547,
"learning_rate": 4.3866513633834173e-07,
"loss": 1.3672,
"num_input_tokens_seen": 66007642,
"step": 114000,
"train_runtime": 4592.9685,
"train_tokens_per_second": 14371.456
},
{
"epoch": 5.973497495826377,
"grad_norm": 2.606362819671631,
"learning_rate": 2.2128895381190875e-07,
"loss": 1.3579,
"num_input_tokens_seen": 66290722,
"step": 114500,
"train_runtime": 4612.1986,
"train_tokens_per_second": 14372.911
},
{
"epoch": 5.999582637729549,
"grad_norm": 2.8683297634124756,
"learning_rate": 3.912771285475793e-09,
"loss": 1.3687,
"num_input_tokens_seen": 66581138,
"step": 115000,
"train_runtime": 4632.4758,
"train_tokens_per_second": 14372.69
},
{
"epoch": 6.0,
"eval_loss": 1.319564938545227,
"eval_runtime": 45.0275,
"eval_samples_per_second": 851.369,
"eval_steps_per_second": 106.424,
"num_input_tokens_seen": 66585670,
"step": 115008
},
{
"epoch": 6.025667779632721,
"grad_norm": 3.327254295349121,
"learning_rate": 1.9871921953255425e-05,
"loss": 1.3775,
"num_input_tokens_seen": 66874998,
"step": 115500,
"train_runtime": 18.7889,
"train_tokens_per_second": 3559284.107
},
{
"epoch": 6.051752921535893,
"grad_norm": 2.0363502502441406,
"learning_rate": 1.9741496243739565e-05,
"loss": 1.3598,
"num_input_tokens_seen": 67165902,
"step": 116000,
"train_runtime": 38.0722,
"train_tokens_per_second": 1764173.697
},
{
"epoch": 6.077838063439065,
"grad_norm": 3.2186789512634277,
"learning_rate": 1.9611070534223708e-05,
"loss": 1.3582,
"num_input_tokens_seen": 67454310,
"step": 116500,
"train_runtime": 56.9892,
"train_tokens_per_second": 1183632.851
},
{
"epoch": 6.103923205342237,
"grad_norm": 3.1102960109710693,
"learning_rate": 1.9480644824707847e-05,
"loss": 1.342,
"num_input_tokens_seen": 67741886,
"step": 117000,
"train_runtime": 76.1489,
"train_tokens_per_second": 889597.261
},
{
"epoch": 6.130008347245409,
"grad_norm": 2.1836190223693848,
"learning_rate": 1.9350219115191987e-05,
"loss": 1.3578,
"num_input_tokens_seen": 68030070,
"step": 117500,
"train_runtime": 95.3289,
"train_tokens_per_second": 713635.053
},
{
"epoch": 6.156093489148581,
"grad_norm": 2.637117624282837,
"learning_rate": 1.921979340567613e-05,
"loss": 1.3561,
"num_input_tokens_seen": 68313278,
"step": 118000,
"train_runtime": 114.4954,
"train_tokens_per_second": 596646.246
},
{
"epoch": 6.182178631051753,
"grad_norm": 2.454594612121582,
"learning_rate": 1.908936769616027e-05,
"loss": 1.3897,
"num_input_tokens_seen": 68603790,
"step": 118500,
"train_runtime": 133.1929,
"train_tokens_per_second": 515071.035
},
{
"epoch": 6.208263772954925,
"grad_norm": 2.6059861183166504,
"learning_rate": 1.895894198664441e-05,
"loss": 1.3662,
"num_input_tokens_seen": 68897534,
"step": 119000,
"train_runtime": 152.3637,
"train_tokens_per_second": 452191.312
},
{
"epoch": 6.234348914858097,
"grad_norm": 2.963710308074951,
"learning_rate": 1.8828516277128548e-05,
"loss": 1.3688,
"num_input_tokens_seen": 69185822,
"step": 119500,
"train_runtime": 171.3295,
"train_tokens_per_second": 403817.306
},
{
"epoch": 6.260434056761269,
"grad_norm": 2.3006739616394043,
"learning_rate": 1.8698090567612688e-05,
"loss": 1.3867,
"num_input_tokens_seen": 69477766,
"step": 120000,
"train_runtime": 189.6964,
"train_tokens_per_second": 366257.718
},
{
"epoch": 6.286519198664441,
"grad_norm": 2.4806406497955322,
"learning_rate": 1.8567664858096827e-05,
"loss": 1.349,
"num_input_tokens_seen": 69770974,
"step": 120500,
"train_runtime": 208.9904,
"train_tokens_per_second": 333847.728
},
{
"epoch": 6.312604340567613,
"grad_norm": 2.4395639896392822,
"learning_rate": 1.843723914858097e-05,
"loss": 1.3733,
"num_input_tokens_seen": 70062350,
"step": 121000,
"train_runtime": 228.9771,
"train_tokens_per_second": 305979.777
},
{
"epoch": 6.338689482470785,
"grad_norm": 2.7110908031463623,
"learning_rate": 1.830681343906511e-05,
"loss": 1.3708,
"num_input_tokens_seen": 70351870,
"step": 121500,
"train_runtime": 248.7026,
"train_tokens_per_second": 282875.484
},
{
"epoch": 6.364774624373957,
"grad_norm": 2.789796829223633,
"learning_rate": 1.817638772954925e-05,
"loss": 1.3688,
"num_input_tokens_seen": 70642750,
"step": 122000,
"train_runtime": 268.6462,
"train_tokens_per_second": 262958.28
},
{
"epoch": 6.390859766277129,
"grad_norm": 2.9111709594726562,
"learning_rate": 1.8045962020033392e-05,
"loss": 1.3518,
"num_input_tokens_seen": 70931190,
"step": 122500,
"train_runtime": 288.3677,
"train_tokens_per_second": 245974.799
},
{
"epoch": 6.416944908180301,
"grad_norm": 2.4599456787109375,
"learning_rate": 1.791553631051753e-05,
"loss": 1.3431,
"num_input_tokens_seen": 71224646,
"step": 123000,
"train_runtime": 307.3647,
"train_tokens_per_second": 231726.811
},
{
"epoch": 6.443030050083473,
"grad_norm": 2.365891456604004,
"learning_rate": 1.778511060100167e-05,
"loss": 1.3865,
"num_input_tokens_seen": 71511326,
"step": 123500,
"train_runtime": 326.1759,
"train_tokens_per_second": 219241.597
},
{
"epoch": 6.469115191986645,
"grad_norm": 2.6345105171203613,
"learning_rate": 1.765468489148581e-05,
"loss": 1.3734,
"num_input_tokens_seen": 71797622,
"step": 124000,
"train_runtime": 344.951,
"train_tokens_per_second": 208138.626
},
{
"epoch": 6.495200333889817,
"grad_norm": 3.2426106929779053,
"learning_rate": 1.752425918196995e-05,
"loss": 1.3628,
"num_input_tokens_seen": 72088862,
"step": 124500,
"train_runtime": 363.8685,
"train_tokens_per_second": 198117.913
},
{
"epoch": 6.521285475792988,
"grad_norm": 2.608137845993042,
"learning_rate": 1.739383347245409e-05,
"loss": 1.3723,
"num_input_tokens_seen": 72378534,
"step": 125000,
"train_runtime": 383.5577,
"train_tokens_per_second": 188703.107
},
{
"epoch": 6.54737061769616,
"grad_norm": 4.101028919219971,
"learning_rate": 1.726340776293823e-05,
"loss": 1.3776,
"num_input_tokens_seen": 72669942,
"step": 125500,
"train_runtime": 402.8471,
"train_tokens_per_second": 180390.889
},
{
"epoch": 6.573455759599332,
"grad_norm": 2.356037139892578,
"learning_rate": 1.7132982053422372e-05,
"loss": 1.376,
"num_input_tokens_seen": 72956998,
"step": 126000,
"train_runtime": 422.9625,
"train_tokens_per_second": 172490.455
},
{
"epoch": 6.599540901502504,
"grad_norm": 2.768091917037964,
"learning_rate": 1.7002556343906512e-05,
"loss": 1.3849,
"num_input_tokens_seen": 73246278,
"step": 126500,
"train_runtime": 442.5677,
"train_tokens_per_second": 165503.005
},
{
"epoch": 6.625626043405676,
"grad_norm": 2.1557633876800537,
"learning_rate": 1.687213063439065e-05,
"loss": 1.3692,
"num_input_tokens_seen": 73532518,
"step": 127000,
"train_runtime": 461.2902,
"train_tokens_per_second": 159406.192
},
{
"epoch": 6.651711185308848,
"grad_norm": 2.739330768585205,
"learning_rate": 1.6741704924874794e-05,
"loss": 1.3853,
"num_input_tokens_seen": 73816374,
"step": 127500,
"train_runtime": 480.2569,
"train_tokens_per_second": 153701.835
},
{
"epoch": 6.67779632721202,
"grad_norm": 2.28963303565979,
"learning_rate": 1.6611279215358934e-05,
"loss": 1.3539,
"num_input_tokens_seen": 74103334,
"step": 128000,
"train_runtime": 499.0026,
"train_tokens_per_second": 148502.901
},
{
"epoch": 6.703881469115192,
"grad_norm": 3.2728097438812256,
"learning_rate": 1.6480853505843073e-05,
"loss": 1.3519,
"num_input_tokens_seen": 74392214,
"step": 128500,
"train_runtime": 517.9355,
"train_tokens_per_second": 143632.196
},
{
"epoch": 6.729966611018364,
"grad_norm": 3.280041217803955,
"learning_rate": 1.6350427796327213e-05,
"loss": 1.3064,
"num_input_tokens_seen": 74677654,
"step": 129000,
"train_runtime": 536.8375,
"train_tokens_per_second": 139106.624
},
{
"epoch": 6.756051752921536,
"grad_norm": 3.9127538204193115,
"learning_rate": 1.6220002086811352e-05,
"loss": 1.3779,
"num_input_tokens_seen": 74968646,
"step": 129500,
"train_runtime": 555.72,
"train_tokens_per_second": 134903.621
},
{
"epoch": 6.782136894824708,
"grad_norm": 2.7960000038146973,
"learning_rate": 1.6089576377295492e-05,
"loss": 1.3327,
"num_input_tokens_seen": 75257286,
"step": 130000,
"train_runtime": 574.6797,
"train_tokens_per_second": 130955.186
},
{
"epoch": 6.80822203672788,
"grad_norm": 2.997286796569824,
"learning_rate": 1.5959150667779635e-05,
"loss": 1.3684,
"num_input_tokens_seen": 75546398,
"step": 130500,
"train_runtime": 593.4532,
"train_tokens_per_second": 127299.662
},
{
"epoch": 6.834307178631052,
"grad_norm": 2.6267356872558594,
"learning_rate": 1.5828724958263774e-05,
"loss": 1.3416,
"num_input_tokens_seen": 75840662,
"step": 131000,
"train_runtime": 612.3615,
"train_tokens_per_second": 123849.503
},
{
"epoch": 6.860392320534224,
"grad_norm": 2.1126062870025635,
"learning_rate": 1.5698299248747914e-05,
"loss": 1.3606,
"num_input_tokens_seen": 76125694,
"step": 131500,
"train_runtime": 631.2618,
"train_tokens_per_second": 120592.897
},
{
"epoch": 6.886477462437396,
"grad_norm": 2.9131317138671875,
"learning_rate": 1.5567873539232053e-05,
"loss": 1.3813,
"num_input_tokens_seen": 76417118,
"step": 132000,
"train_runtime": 650.1892,
"train_tokens_per_second": 117530.578
},
{
"epoch": 6.912562604340567,
"grad_norm": 3.5298712253570557,
"learning_rate": 1.5437447829716196e-05,
"loss": 1.3617,
"num_input_tokens_seen": 76703430,
"step": 132500,
"train_runtime": 669.1223,
"train_tokens_per_second": 114632.907
},
{
"epoch": 6.938647746243739,
"grad_norm": 2.850775718688965,
"learning_rate": 1.5307022120200336e-05,
"loss": 1.3672,
"num_input_tokens_seen": 76992342,
"step": 133000,
"train_runtime": 687.9389,
"train_tokens_per_second": 111917.419
},
{
"epoch": 6.964732888146911,
"grad_norm": 3.314821481704712,
"learning_rate": 1.5176596410684474e-05,
"loss": 1.3715,
"num_input_tokens_seen": 77284374,
"step": 133500,
"train_runtime": 706.8708,
"train_tokens_per_second": 109333.091
},
{
"epoch": 6.990818030050083,
"grad_norm": 3.3693618774414062,
"learning_rate": 1.5046170701168617e-05,
"loss": 1.3858,
"num_input_tokens_seen": 77571966,
"step": 134000,
"train_runtime": 725.7267,
"train_tokens_per_second": 106888.674
},
{
"epoch": 7.0,
"eval_loss": 1.3148815631866455,
"eval_runtime": 45.8848,
"eval_samples_per_second": 835.462,
"eval_steps_per_second": 104.435,
"num_input_tokens_seen": 77673096,
"step": 134176
},
{
"epoch": 7.016903171953255,
"grad_norm": 2.7694716453552246,
"learning_rate": 1.4915744991652755e-05,
"loss": 1.3419,
"num_input_tokens_seen": 77861608,
"step": 134500,
"train_runtime": 791.8621,
"train_tokens_per_second": 98327.231
},
{
"epoch": 7.042988313856427,
"grad_norm": 2.7334187030792236,
"learning_rate": 1.4785319282136894e-05,
"loss": 1.3308,
"num_input_tokens_seen": 78149784,
"step": 135000,
"train_runtime": 810.7343,
"train_tokens_per_second": 96393.825
},
{
"epoch": 7.069073455759599,
"grad_norm": 2.9365265369415283,
"learning_rate": 1.4654893572621037e-05,
"loss": 1.3525,
"num_input_tokens_seen": 78438792,
"step": 135500,
"train_runtime": 829.7324,
"train_tokens_per_second": 94535.049
},
{
"epoch": 7.095158597662771,
"grad_norm": 4.147580146789551,
"learning_rate": 1.4524467863105177e-05,
"loss": 1.3465,
"num_input_tokens_seen": 78732384,
"step": 136000,
"train_runtime": 848.5615,
"train_tokens_per_second": 92783.357
},
{
"epoch": 7.121243739565943,
"grad_norm": 2.915922164916992,
"learning_rate": 1.4394042153589316e-05,
"loss": 1.3614,
"num_input_tokens_seen": 79016208,
"step": 136500,
"train_runtime": 867.5653,
"train_tokens_per_second": 91078.111
},
{
"epoch": 7.147328881469115,
"grad_norm": 2.549786329269409,
"learning_rate": 1.4263616444073457e-05,
"loss": 1.318,
"num_input_tokens_seen": 79301784,
"step": 137000,
"train_runtime": 886.659,
"train_tokens_per_second": 89438.871
},
{
"epoch": 7.173414023372287,
"grad_norm": 2.5047004222869873,
"learning_rate": 1.4133190734557597e-05,
"loss": 1.368,
"num_input_tokens_seen": 79590400,
"step": 137500,
"train_runtime": 905.5133,
"train_tokens_per_second": 87895.338
},
{
"epoch": 7.199499165275459,
"grad_norm": 3.0781052112579346,
"learning_rate": 1.4002765025041736e-05,
"loss": 1.3653,
"num_input_tokens_seen": 79879504,
"step": 138000,
"train_runtime": 924.4454,
"train_tokens_per_second": 86408.029
},
{
"epoch": 7.225584307178631,
"grad_norm": 3.6476972103118896,
"learning_rate": 1.387233931552588e-05,
"loss": 1.3514,
"num_input_tokens_seen": 80167640,
"step": 138500,
"train_runtime": 943.229,
"train_tokens_per_second": 84992.766
},
{
"epoch": 7.2516694490818026,
"grad_norm": 5.114116191864014,
"learning_rate": 1.3741913606010017e-05,
"loss": 1.3413,
"num_input_tokens_seen": 80456216,
"step": 139000,
"train_runtime": 962.1141,
"train_tokens_per_second": 83624.399
},
{
"epoch": 7.277754590984975,
"grad_norm": 2.5727877616882324,
"learning_rate": 1.3611487896494157e-05,
"loss": 1.3414,
"num_input_tokens_seen": 80747832,
"step": 139500,
"train_runtime": 980.9921,
"train_tokens_per_second": 82312.418
},
{
"epoch": 7.303839732888147,
"grad_norm": 2.9491872787475586,
"learning_rate": 1.3481062186978296e-05,
"loss": 1.3412,
"num_input_tokens_seen": 81043216,
"step": 140000,
"train_runtime": 999.9989,
"train_tokens_per_second": 81043.309
},
{
"epoch": 7.329924874791319,
"grad_norm": 2.045164108276367,
"learning_rate": 1.3350636477462439e-05,
"loss": 1.3729,
"num_input_tokens_seen": 81333232,
"step": 140500,
"train_runtime": 1018.8484,
"train_tokens_per_second": 79828.588
},
{
"epoch": 7.356010016694491,
"grad_norm": 3.922563314437866,
"learning_rate": 1.3220210767946579e-05,
"loss": 1.3443,
"num_input_tokens_seen": 81622416,
"step": 141000,
"train_runtime": 1037.8422,
"train_tokens_per_second": 78646.268
},
{
"epoch": 7.382095158597663,
"grad_norm": 2.426223039627075,
"learning_rate": 1.3089785058430718e-05,
"loss": 1.3544,
"num_input_tokens_seen": 81911608,
"step": 141500,
"train_runtime": 1056.8045,
"train_tokens_per_second": 77508.763
},
{
"epoch": 7.408180300500835,
"grad_norm": 2.67075514793396,
"learning_rate": 1.295935934891486e-05,
"loss": 1.3246,
"num_input_tokens_seen": 82202544,
"step": 142000,
"train_runtime": 1075.733,
"train_tokens_per_second": 76415.38
},
{
"epoch": 7.434265442404007,
"grad_norm": 2.5923829078674316,
"learning_rate": 1.2828933639398999e-05,
"loss": 1.3388,
"num_input_tokens_seen": 82493944,
"step": 142500,
"train_runtime": 1094.8433,
"train_tokens_per_second": 75347.716
},
{
"epoch": 7.460350584307179,
"grad_norm": 2.602835178375244,
"learning_rate": 1.2698507929883138e-05,
"loss": 1.3423,
"num_input_tokens_seen": 82784656,
"step": 143000,
"train_runtime": 1113.7237,
"train_tokens_per_second": 74331.413
},
{
"epoch": 7.486435726210351,
"grad_norm": 3.1531965732574463,
"learning_rate": 1.256808222036728e-05,
"loss": 1.3452,
"num_input_tokens_seen": 83068624,
"step": 143500,
"train_runtime": 1132.6794,
"train_tokens_per_second": 73338.162
},
{
"epoch": 7.512520868113523,
"grad_norm": 2.2403712272644043,
"learning_rate": 1.243765651085142e-05,
"loss": 1.3618,
"num_input_tokens_seen": 83351920,
"step": 144000,
"train_runtime": 1151.592,
"train_tokens_per_second": 72379.733
},
{
"epoch": 7.538606010016695,
"grad_norm": 3.465223550796509,
"learning_rate": 1.2307230801335559e-05,
"loss": 1.3632,
"num_input_tokens_seen": 83638888,
"step": 144500,
"train_runtime": 1170.6851,
"train_tokens_per_second": 71444.392
},
{
"epoch": 7.564691151919867,
"grad_norm": 2.3392977714538574,
"learning_rate": 1.21768050918197e-05,
"loss": 1.3318,
"num_input_tokens_seen": 83931992,
"step": 145000,
"train_runtime": 1189.5906,
"train_tokens_per_second": 70555.362
},
{
"epoch": 7.590776293823039,
"grad_norm": 3.0218007564544678,
"learning_rate": 1.2046379382303841e-05,
"loss": 1.3636,
"num_input_tokens_seen": 84220168,
"step": 145500,
"train_runtime": 1208.5932,
"train_tokens_per_second": 69684.461
},
{
"epoch": 7.616861435726211,
"grad_norm": 3.329549789428711,
"learning_rate": 1.191595367278798e-05,
"loss": 1.355,
"num_input_tokens_seen": 84509512,
"step": 146000,
"train_runtime": 1227.4823,
"train_tokens_per_second": 68847.845
},
{
"epoch": 7.642946577629383,
"grad_norm": 6.515806198120117,
"learning_rate": 1.178552796327212e-05,
"loss": 1.3414,
"num_input_tokens_seen": 84808104,
"step": 146500,
"train_runtime": 1246.4918,
"train_tokens_per_second": 68037.434
},
{
"epoch": 7.669031719532554,
"grad_norm": 3.5463063716888428,
"learning_rate": 1.1655102253756262e-05,
"loss": 1.3617,
"num_input_tokens_seen": 85099704,
"step": 147000,
"train_runtime": 1265.5528,
"train_tokens_per_second": 67243.109
},
{
"epoch": 7.695116861435726,
"grad_norm": 2.877112627029419,
"learning_rate": 1.1524676544240401e-05,
"loss": 1.3524,
"num_input_tokens_seen": 85387272,
"step": 147500,
"train_runtime": 1284.3298,
"train_tokens_per_second": 66483.913
},
{
"epoch": 7.721202003338898,
"grad_norm": 2.8873534202575684,
"learning_rate": 1.1394250834724542e-05,
"loss": 1.3442,
"num_input_tokens_seen": 85671272,
"step": 148000,
"train_runtime": 1303.2108,
"train_tokens_per_second": 65738.615
},
{
"epoch": 7.74728714524207,
"grad_norm": 3.5610382556915283,
"learning_rate": 1.126382512520868e-05,
"loss": 1.3505,
"num_input_tokens_seen": 85959168,
"step": 148500,
"train_runtime": 1322.1598,
"train_tokens_per_second": 65014.207
},
{
"epoch": 7.773372287145242,
"grad_norm": 2.6103343963623047,
"learning_rate": 1.1133399415692821e-05,
"loss": 1.3616,
"num_input_tokens_seen": 86255128,
"step": 149000,
"train_runtime": 1341.3928,
"train_tokens_per_second": 64302.661
},
{
"epoch": 7.799457429048414,
"grad_norm": 2.5157065391540527,
"learning_rate": 1.1002973706176963e-05,
"loss": 1.3422,
"num_input_tokens_seen": 86546848,
"step": 149500,
"train_runtime": 1360.2395,
"train_tokens_per_second": 63626.184
},
{
"epoch": 7.825542570951586,
"grad_norm": 2.315091371536255,
"learning_rate": 1.0872547996661102e-05,
"loss": 1.3511,
"num_input_tokens_seen": 86837440,
"step": 150000,
"train_runtime": 1379.3034,
"train_tokens_per_second": 62957.46
},
{
"epoch": 7.851627712854758,
"grad_norm": 2.2483925819396973,
"learning_rate": 1.0742122287145243e-05,
"loss": 1.3355,
"num_input_tokens_seen": 87120032,
"step": 150500,
"train_runtime": 1398.2422,
"train_tokens_per_second": 62306.824
},
{
"epoch": 7.87771285475793,
"grad_norm": 2.340362071990967,
"learning_rate": 1.0611696577629383e-05,
"loss": 1.3537,
"num_input_tokens_seen": 87415824,
"step": 151000,
"train_runtime": 1417.1731,
"train_tokens_per_second": 61683.236
},
{
"epoch": 7.903797996661102,
"grad_norm": 2.813960552215576,
"learning_rate": 1.0481270868113522e-05,
"loss": 1.3479,
"num_input_tokens_seen": 87701680,
"step": 151500,
"train_runtime": 1436.1799,
"train_tokens_per_second": 61065.945
},
{
"epoch": 7.929883138564274,
"grad_norm": 2.2960751056671143,
"learning_rate": 1.0350845158597664e-05,
"loss": 1.3475,
"num_input_tokens_seen": 87992448,
"step": 152000,
"train_runtime": 1455.0801,
"train_tokens_per_second": 60472.578
},
{
"epoch": 7.955968280467446,
"grad_norm": 3.048780918121338,
"learning_rate": 1.0220419449081803e-05,
"loss": 1.3619,
"num_input_tokens_seen": 88281416,
"step": 152500,
"train_runtime": 1474.0153,
"train_tokens_per_second": 59891.791
},
{
"epoch": 7.982053422370617,
"grad_norm": 2.816805362701416,
"learning_rate": 1.0089993739565943e-05,
"loss": 1.357,
"num_input_tokens_seen": 88572368,
"step": 153000,
"train_runtime": 1492.8602,
"train_tokens_per_second": 59330.65
},
{
"epoch": 8.0,
"eval_loss": 1.310753345489502,
"eval_runtime": 45.8622,
"eval_samples_per_second": 835.874,
"eval_steps_per_second": 104.487,
"num_input_tokens_seen": 88772850,
"step": 153344
},
{
"epoch": 8.00813856427379,
"grad_norm": 2.93835186958313,
"learning_rate": 9.959568030050084e-06,
"loss": 1.3378,
"num_input_tokens_seen": 88861818,
"step": 153500,
"train_runtime": 1558.6187,
"train_tokens_per_second": 57013.187
},
{
"epoch": 8.034223706176961,
"grad_norm": 3.2679965496063232,
"learning_rate": 9.829142320534224e-06,
"loss": 1.3403,
"num_input_tokens_seen": 89148626,
"step": 154000,
"train_runtime": 1577.6034,
"train_tokens_per_second": 56508.897
},
{
"epoch": 8.060308848080133,
"grad_norm": 1.7137473821640015,
"learning_rate": 9.698716611018365e-06,
"loss": 1.3357,
"num_input_tokens_seen": 89432242,
"step": 154500,
"train_runtime": 1596.5679,
"train_tokens_per_second": 56015.306
},
{
"epoch": 8.086393989983305,
"grad_norm": 2.5696284770965576,
"learning_rate": 9.568290901502506e-06,
"loss": 1.3465,
"num_input_tokens_seen": 89721890,
"step": 155000,
"train_runtime": 1615.4358,
"train_tokens_per_second": 55540.364
},
{
"epoch": 8.112479131886477,
"grad_norm": 3.715364694595337,
"learning_rate": 9.437865191986644e-06,
"loss": 1.3407,
"num_input_tokens_seen": 90009618,
"step": 155500,
"train_runtime": 1634.2407,
"train_tokens_per_second": 55077.332
},
{
"epoch": 8.13856427378965,
"grad_norm": 2.7199196815490723,
"learning_rate": 9.307439482470785e-06,
"loss": 1.3444,
"num_input_tokens_seen": 90299538,
"step": 156000,
"train_runtime": 1653.1786,
"train_tokens_per_second": 54621.767
},
{
"epoch": 8.164649415692821,
"grad_norm": 2.546076774597168,
"learning_rate": 9.177013772954925e-06,
"loss": 1.3201,
"num_input_tokens_seen": 90585634,
"step": 156500,
"train_runtime": 1671.9571,
"train_tokens_per_second": 54179.401
},
{
"epoch": 8.190734557595993,
"grad_norm": 2.7355287075042725,
"learning_rate": 9.046588063439066e-06,
"loss": 1.3286,
"num_input_tokens_seen": 90875986,
"step": 157000,
"train_runtime": 1690.8444,
"train_tokens_per_second": 53745.919
},
{
"epoch": 8.216819699499165,
"grad_norm": 2.610476016998291,
"learning_rate": 8.916162353923205e-06,
"loss": 1.3624,
"num_input_tokens_seen": 91165682,
"step": 157500,
"train_runtime": 1709.7962,
"train_tokens_per_second": 53319.619
},
{
"epoch": 8.242904841402337,
"grad_norm": 3.424274444580078,
"learning_rate": 8.785736644407345e-06,
"loss": 1.3615,
"num_input_tokens_seen": 91458162,
"step": 158000,
"train_runtime": 1728.5683,
"train_tokens_per_second": 52909.776
},
{
"epoch": 8.26898998330551,
"grad_norm": 2.9222910404205322,
"learning_rate": 8.655310934891486e-06,
"loss": 1.3359,
"num_input_tokens_seen": 91748050,
"step": 158500,
"train_runtime": 1747.5127,
"train_tokens_per_second": 52502.078
},
{
"epoch": 8.295075125208681,
"grad_norm": 3.5217490196228027,
"learning_rate": 8.524885225375627e-06,
"loss": 1.3414,
"num_input_tokens_seen": 92035050,
"step": 159000,
"train_runtime": 1766.4856,
"train_tokens_per_second": 52100.651
},
{
"epoch": 8.321160267111853,
"grad_norm": 2.656613826751709,
"learning_rate": 8.394459515859767e-06,
"loss": 1.3436,
"num_input_tokens_seen": 92326378,
"step": 159500,
"train_runtime": 1785.3077,
"train_tokens_per_second": 51714.547
},
{
"epoch": 8.347245409015025,
"grad_norm": 2.8764595985412598,
"learning_rate": 8.264033806343906e-06,
"loss": 1.316,
"num_input_tokens_seen": 92617586,
"step": 160000,
"train_runtime": 1804.1264,
"train_tokens_per_second": 51336.529
},
{
"epoch": 8.373330550918197,
"grad_norm": 2.635450839996338,
"learning_rate": 8.133608096828046e-06,
"loss": 1.3745,
"num_input_tokens_seen": 92904010,
"step": 160500,
"train_runtime": 1823.1613,
"train_tokens_per_second": 50957.647
},
{
"epoch": 8.39941569282137,
"grad_norm": 3.4129796028137207,
"learning_rate": 8.003182387312187e-06,
"loss": 1.3278,
"num_input_tokens_seen": 93189170,
"step": 161000,
"train_runtime": 1842.0126,
"train_tokens_per_second": 50590.953
},
{
"epoch": 8.425500834724541,
"grad_norm": 3.2952401638031006,
"learning_rate": 7.872756677796328e-06,
"loss": 1.337,
"num_input_tokens_seen": 93475210,
"step": 161500,
"train_runtime": 1861.0469,
"train_tokens_per_second": 50227.218
},
{
"epoch": 8.451585976627713,
"grad_norm": 2.8078572750091553,
"learning_rate": 7.742330968280468e-06,
"loss": 1.3511,
"num_input_tokens_seen": 93764458,
"step": 162000,
"train_runtime": 1880.0164,
"train_tokens_per_second": 49874.278
},
{
"epoch": 8.477671118530886,
"grad_norm": 3.6334028244018555,
"learning_rate": 7.611905258764608e-06,
"loss": 1.3214,
"num_input_tokens_seen": 94054690,
"step": 162500,
"train_runtime": 1898.9183,
"train_tokens_per_second": 49530.666
},
{
"epoch": 8.503756260434058,
"grad_norm": 2.255051851272583,
"learning_rate": 7.481479549248749e-06,
"loss": 1.3181,
"num_input_tokens_seen": 94342986,
"step": 163000,
"train_runtime": 1917.8935,
"train_tokens_per_second": 49190.941
},
{
"epoch": 8.52984140233723,
"grad_norm": 2.2999086380004883,
"learning_rate": 7.351053839732888e-06,
"loss": 1.3468,
"num_input_tokens_seen": 94628458,
"step": 163500,
"train_runtime": 1936.8254,
"train_tokens_per_second": 48857.505
},
{
"epoch": 8.5559265442404,
"grad_norm": 2.8126626014709473,
"learning_rate": 7.220628130217029e-06,
"loss": 1.3442,
"num_input_tokens_seen": 94916450,
"step": 164000,
"train_runtime": 1955.7716,
"train_tokens_per_second": 48531.459
},
{
"epoch": 8.582011686143572,
"grad_norm": 3.6833460330963135,
"learning_rate": 7.090202420701168e-06,
"loss": 1.3097,
"num_input_tokens_seen": 95209610,
"step": 164500,
"train_runtime": 1974.6708,
"train_tokens_per_second": 48215.434
},
{
"epoch": 8.608096828046744,
"grad_norm": 2.2948975563049316,
"learning_rate": 6.959776711185309e-06,
"loss": 1.3158,
"num_input_tokens_seen": 95500162,
"step": 165000,
"train_runtime": 1993.7313,
"train_tokens_per_second": 47900.216
},
{
"epoch": 8.634181969949916,
"grad_norm": 2.677102565765381,
"learning_rate": 6.82935100166945e-06,
"loss": 1.3492,
"num_input_tokens_seen": 95791218,
"step": 165500,
"train_runtime": 2012.6562,
"train_tokens_per_second": 47594.428
},
{
"epoch": 8.660267111853088,
"grad_norm": 2.8302109241485596,
"learning_rate": 6.698925292153589e-06,
"loss": 1.3176,
"num_input_tokens_seen": 96078250,
"step": 166000,
"train_runtime": 2031.6856,
"train_tokens_per_second": 47289.919
},
{
"epoch": 8.68635225375626,
"grad_norm": 2.7552695274353027,
"learning_rate": 6.56849958263773e-06,
"loss": 1.3259,
"num_input_tokens_seen": 96363322,
"step": 166500,
"train_runtime": 2050.6873,
"train_tokens_per_second": 46990.744
},
{
"epoch": 8.712437395659432,
"grad_norm": 2.76167368888855,
"learning_rate": 6.438073873121871e-06,
"loss": 1.341,
"num_input_tokens_seen": 96655826,
"step": 167000,
"train_runtime": 2069.5519,
"train_tokens_per_second": 46703.746
},
{
"epoch": 8.738522537562604,
"grad_norm": 2.799135208129883,
"learning_rate": 6.3076481636060104e-06,
"loss": 1.3516,
"num_input_tokens_seen": 96941474,
"step": 167500,
"train_runtime": 2088.6051,
"train_tokens_per_second": 46414.458
},
{
"epoch": 8.764607679465776,
"grad_norm": 2.185119390487671,
"learning_rate": 6.177222454090151e-06,
"loss": 1.3495,
"num_input_tokens_seen": 97236010,
"step": 168000,
"train_runtime": 2107.5825,
"train_tokens_per_second": 46136.277
},
{
"epoch": 8.790692821368948,
"grad_norm": 2.787100315093994,
"learning_rate": 6.046796744574291e-06,
"loss": 1.3059,
"num_input_tokens_seen": 97526826,
"step": 168500,
"train_runtime": 2126.4823,
"train_tokens_per_second": 45862.984
},
{
"epoch": 8.81677796327212,
"grad_norm": 2.6303234100341797,
"learning_rate": 5.916371035058431e-06,
"loss": 1.3463,
"num_input_tokens_seen": 97816378,
"step": 169000,
"train_runtime": 2145.3741,
"train_tokens_per_second": 45594.088
},
{
"epoch": 8.842863105175292,
"grad_norm": 2.5196168422698975,
"learning_rate": 5.785945325542571e-06,
"loss": 1.3462,
"num_input_tokens_seen": 98111226,
"step": 169500,
"train_runtime": 2164.4052,
"train_tokens_per_second": 45329.417
},
{
"epoch": 8.868948247078464,
"grad_norm": 3.008777141571045,
"learning_rate": 5.6555196160267115e-06,
"loss": 1.3463,
"num_input_tokens_seen": 98404994,
"step": 170000,
"train_runtime": 2183.2406,
"train_tokens_per_second": 45072.904
},
{
"epoch": 8.895033388981636,
"grad_norm": 2.664883613586426,
"learning_rate": 5.525093906510852e-06,
"loss": 1.3505,
"num_input_tokens_seen": 98691458,
"step": 170500,
"train_runtime": 2202.2373,
"train_tokens_per_second": 44814.179
},
{
"epoch": 8.921118530884808,
"grad_norm": 3.8976974487304688,
"learning_rate": 5.3946681969949914e-06,
"loss": 1.3325,
"num_input_tokens_seen": 98980730,
"step": 171000,
"train_runtime": 2221.2328,
"train_tokens_per_second": 44561.169
},
{
"epoch": 8.94720367278798,
"grad_norm": 2.5917086601257324,
"learning_rate": 5.264242487479132e-06,
"loss": 1.333,
"num_input_tokens_seen": 99265698,
"step": 171500,
"train_runtime": 2240.1093,
"train_tokens_per_second": 44312.882
},
{
"epoch": 8.973288814691152,
"grad_norm": 3.012345314025879,
"learning_rate": 5.133816777963272e-06,
"loss": 1.3493,
"num_input_tokens_seen": 99562818,
"step": 172000,
"train_runtime": 2259.2484,
"train_tokens_per_second": 44069.001
},
{
"epoch": 8.999373956594324,
"grad_norm": 1.994488000869751,
"learning_rate": 5.0033910684474126e-06,
"loss": 1.3704,
"num_input_tokens_seen": 99855026,
"step": 172500,
"train_runtime": 2278.2393,
"train_tokens_per_second": 43829.912
},
{
"epoch": 9.0,
"eval_loss": 1.3092994689941406,
"eval_runtime": 45.6876,
"eval_samples_per_second": 839.069,
"eval_steps_per_second": 104.886,
"num_input_tokens_seen": 99861888,
"step": 172512
},
{
"epoch": 9.025459098497496,
"grad_norm": 3.0312609672546387,
"learning_rate": 4.872965358931553e-06,
"loss": 1.3012,
"num_input_tokens_seen": 100137840,
"step": 173000,
"train_runtime": 2343.9004,
"train_tokens_per_second": 42722.738
},
{
"epoch": 9.051544240400668,
"grad_norm": 2.9846737384796143,
"learning_rate": 4.7425396494156925e-06,
"loss": 1.3416,
"num_input_tokens_seen": 100428752,
"step": 173500,
"train_runtime": 2362.9108,
"train_tokens_per_second": 42502.134
},
{
"epoch": 9.07762938230384,
"grad_norm": 2.700178623199463,
"learning_rate": 4.612113939899834e-06,
"loss": 1.3509,
"num_input_tokens_seen": 100714360,
"step": 174000,
"train_runtime": 2381.7919,
"train_tokens_per_second": 42285.122
},
{
"epoch": 9.103714524207012,
"grad_norm": 2.5982463359832764,
"learning_rate": 4.481688230383973e-06,
"loss": 1.33,
"num_input_tokens_seen": 101010096,
"step": 174500,
"train_runtime": 2400.7332,
"train_tokens_per_second": 42074.685
},
{
"epoch": 9.129799666110184,
"grad_norm": 3.2345430850982666,
"learning_rate": 4.351262520868114e-06,
"loss": 1.3127,
"num_input_tokens_seen": 101301448,
"step": 175000,
"train_runtime": 2419.7,
"train_tokens_per_second": 41865.292
},
{
"epoch": 9.155884808013356,
"grad_norm": 2.8651511669158936,
"learning_rate": 4.220836811352254e-06,
"loss": 1.3198,
"num_input_tokens_seen": 101583952,
"step": 175500,
"train_runtime": 2438.5641,
"train_tokens_per_second": 41657.282
},
{
"epoch": 9.181969949916528,
"grad_norm": 2.723923921585083,
"learning_rate": 4.090411101836394e-06,
"loss": 1.3486,
"num_input_tokens_seen": 101879904,
"step": 176000,
"train_runtime": 2457.6414,
"train_tokens_per_second": 41454.341
},
{
"epoch": 9.2080550918197,
"grad_norm": 2.9765188694000244,
"learning_rate": 3.959985392320535e-06,
"loss": 1.3247,
"num_input_tokens_seen": 102169192,
"step": 176500,
"train_runtime": 2476.4667,
"train_tokens_per_second": 41256.033
},
{
"epoch": 9.234140233722872,
"grad_norm": 2.14411997795105,
"learning_rate": 3.829559682804674e-06,
"loss": 1.3542,
"num_input_tokens_seen": 102454992,
"step": 177000,
"train_runtime": 2495.3856,
"train_tokens_per_second": 41057.779
},
{
"epoch": 9.260225375626044,
"grad_norm": 2.7752788066864014,
"learning_rate": 3.6991339732888147e-06,
"loss": 1.3469,
"num_input_tokens_seen": 102739160,
"step": 177500,
"train_runtime": 2514.175,
"train_tokens_per_second": 40863.966
},
{
"epoch": 9.286310517529216,
"grad_norm": 2.3828213214874268,
"learning_rate": 3.5687082637729555e-06,
"loss": 1.3267,
"num_input_tokens_seen": 103027896,
"step": 178000,
"train_runtime": 2533.1537,
"train_tokens_per_second": 40671.791
},
{
"epoch": 9.312395659432386,
"grad_norm": 2.554948329925537,
"learning_rate": 3.4382825542570955e-06,
"loss": 1.3218,
"num_input_tokens_seen": 103314672,
"step": 178500,
"train_runtime": 2552.0027,
"train_tokens_per_second": 40483.762
},
{
"epoch": 9.338480801335558,
"grad_norm": 2.6806468963623047,
"learning_rate": 3.3078568447412354e-06,
"loss": 1.3384,
"num_input_tokens_seen": 103602648,
"step": 179000,
"train_runtime": 2571.0574,
"train_tokens_per_second": 40295.735
},
{
"epoch": 9.36456594323873,
"grad_norm": 3.95470929145813,
"learning_rate": 3.1774311352253754e-06,
"loss": 1.3187,
"num_input_tokens_seen": 103892480,
"step": 179500,
"train_runtime": 2590.0953,
"train_tokens_per_second": 40111.45
},
{
"epoch": 9.390651085141902,
"grad_norm": 2.708707332611084,
"learning_rate": 3.0470054257095158e-06,
"loss": 1.335,
"num_input_tokens_seen": 104178104,
"step": 180000,
"train_runtime": 2608.9847,
"train_tokens_per_second": 39930.515
},
{
"epoch": 9.416736227045075,
"grad_norm": 3.4441354274749756,
"learning_rate": 2.916579716193656e-06,
"loss": 1.3204,
"num_input_tokens_seen": 104470488,
"step": 180500,
"train_runtime": 2627.985,
"train_tokens_per_second": 39753.076
},
{
"epoch": 9.442821368948247,
"grad_norm": 3.5723414421081543,
"learning_rate": 2.7861540066777965e-06,
"loss": 1.3457,
"num_input_tokens_seen": 104759104,
"step": 181000,
"train_runtime": 2646.9218,
"train_tokens_per_second": 39577.71
},
{
"epoch": 9.468906510851419,
"grad_norm": 3.956160068511963,
"learning_rate": 2.655728297161937e-06,
"loss": 1.3717,
"num_input_tokens_seen": 105044408,
"step": 181500,
"train_runtime": 2665.9278,
"train_tokens_per_second": 39402.571
},
{
"epoch": 9.49499165275459,
"grad_norm": 2.565819025039673,
"learning_rate": 2.525302587646077e-06,
"loss": 1.3413,
"num_input_tokens_seen": 105327088,
"step": 182000,
"train_runtime": 2684.8097,
"train_tokens_per_second": 39230.746
},
{
"epoch": 9.521076794657763,
"grad_norm": 3.5526235103607178,
"learning_rate": 2.3948768781302173e-06,
"loss": 1.2786,
"num_input_tokens_seen": 105615560,
"step": 182500,
"train_runtime": 2703.8493,
"train_tokens_per_second": 39061.185
},
{
"epoch": 9.547161936560935,
"grad_norm": 2.816168785095215,
"learning_rate": 2.264451168614357e-06,
"loss": 1.3268,
"num_input_tokens_seen": 105904984,
"step": 183000,
"train_runtime": 2722.6828,
"train_tokens_per_second": 38897.29
},
{
"epoch": 9.573247078464107,
"grad_norm": 3.1430675983428955,
"learning_rate": 2.1340254590984976e-06,
"loss": 1.3181,
"num_input_tokens_seen": 106197728,
"step": 183500,
"train_runtime": 2741.6799,
"train_tokens_per_second": 38734.547
},
{
"epoch": 9.599332220367279,
"grad_norm": 3.099498748779297,
"learning_rate": 2.0035997495826376e-06,
"loss": 1.321,
"num_input_tokens_seen": 106489536,
"step": 184000,
"train_runtime": 2760.6349,
"train_tokens_per_second": 38574.292
},
{
"epoch": 9.62541736227045,
"grad_norm": 3.0963542461395264,
"learning_rate": 1.8731740400667781e-06,
"loss": 1.3177,
"num_input_tokens_seen": 106779640,
"step": 184500,
"train_runtime": 2779.5118,
"train_tokens_per_second": 38416.688
},
{
"epoch": 9.651502504173623,
"grad_norm": 2.6030497550964355,
"learning_rate": 1.742748330550918e-06,
"loss": 1.3052,
"num_input_tokens_seen": 107073888,
"step": 185000,
"train_runtime": 2798.5054,
"train_tokens_per_second": 38261.097
},
{
"epoch": 9.677587646076795,
"grad_norm": 3.022160768508911,
"learning_rate": 1.6123226210350585e-06,
"loss": 1.3436,
"num_input_tokens_seen": 107364848,
"step": 185500,
"train_runtime": 2817.3417,
"train_tokens_per_second": 38108.565
},
{
"epoch": 9.703672787979967,
"grad_norm": 2.626763105392456,
"learning_rate": 1.4818969115191989e-06,
"loss": 1.3352,
"num_input_tokens_seen": 107659488,
"step": 186000,
"train_runtime": 2836.337,
"train_tokens_per_second": 37957.227
},
{
"epoch": 9.729757929883139,
"grad_norm": 3.0171899795532227,
"learning_rate": 1.351471202003339e-06,
"loss": 1.317,
"num_input_tokens_seen": 107949608,
"step": 186500,
"train_runtime": 2855.247,
"train_tokens_per_second": 37807.45
},
{
"epoch": 9.75584307178631,
"grad_norm": 2.22269868850708,
"learning_rate": 1.2210454924874792e-06,
"loss": 1.3193,
"num_input_tokens_seen": 108245936,
"step": 187000,
"train_runtime": 2874.2964,
"train_tokens_per_second": 37659.977
},
{
"epoch": 9.781928213689483,
"grad_norm": 2.8673713207244873,
"learning_rate": 1.0906197829716196e-06,
"loss": 1.3392,
"num_input_tokens_seen": 108539552,
"step": 187500,
"train_runtime": 2893.1693,
"train_tokens_per_second": 37515.798
},
{
"epoch": 9.808013355592655,
"grad_norm": 2.645888566970825,
"learning_rate": 9.601940734557598e-07,
"loss": 1.3395,
"num_input_tokens_seen": 108827736,
"step": 188000,
"train_runtime": 2912.2618,
"train_tokens_per_second": 37368.802
},
{
"epoch": 9.834098497495827,
"grad_norm": 3.0480117797851562,
"learning_rate": 8.297683639398999e-07,
"loss": 1.3325,
"num_input_tokens_seen": 109119720,
"step": 188500,
"train_runtime": 2931.2115,
"train_tokens_per_second": 37226.832
},
{
"epoch": 9.860183639398999,
"grad_norm": 3.1074326038360596,
"learning_rate": 6.993426544240401e-07,
"loss": 1.3365,
"num_input_tokens_seen": 109406600,
"step": 189000,
"train_runtime": 2950.1291,
"train_tokens_per_second": 37085.361
},
{
"epoch": 9.88626878130217,
"grad_norm": 2.7331807613372803,
"learning_rate": 5.689169449081803e-07,
"loss": 1.346,
"num_input_tokens_seen": 109694976,
"step": 189500,
"train_runtime": 2969.1526,
"train_tokens_per_second": 36944.877
},
{
"epoch": 9.912353923205343,
"grad_norm": 2.5716543197631836,
"learning_rate": 4.3849123539232055e-07,
"loss": 1.3331,
"num_input_tokens_seen": 109985584,
"step": 190000,
"train_runtime": 2988.0525,
"train_tokens_per_second": 36808.451
},
{
"epoch": 9.938439065108515,
"grad_norm": 2.6166512966156006,
"learning_rate": 3.080655258764608e-07,
"loss": 1.3292,
"num_input_tokens_seen": 110272368,
"step": 190500,
"train_runtime": 3007.0207,
"train_tokens_per_second": 36671.636
},
{
"epoch": 9.964524207011687,
"grad_norm": 2.8893744945526123,
"learning_rate": 1.77639816360601e-07,
"loss": 1.3166,
"num_input_tokens_seen": 110557664,
"step": 191000,
"train_runtime": 3025.9068,
"train_tokens_per_second": 36537.035
},
{
"epoch": 9.990609348914859,
"grad_norm": 2.441220998764038,
"learning_rate": 4.721410684474124e-08,
"loss": 1.3429,
"num_input_tokens_seen": 110851304,
"step": 191500,
"train_runtime": 3044.8849,
"train_tokens_per_second": 36405.745
},
{
"epoch": 10.0,
"eval_loss": 1.3094313144683838,
"eval_runtime": 45.924,
"eval_samples_per_second": 834.748,
"eval_steps_per_second": 104.346,
"num_input_tokens_seen": 110955972,
"step": 191680
},
{
"epoch": 10.01669449081803,
"grad_norm": 3.2865073680877686,
"learning_rate": 2.495839419866444e-05,
"loss": 1.3149,
"num_input_tokens_seen": 111137740,
"step": 192000,
"train_runtime": 12.5405,
"train_tokens_per_second": 8862327.118
},
{
"epoch": 10.042779632721203,
"grad_norm": 1.945192813873291,
"learning_rate": 2.4893181343906512e-05,
"loss": 1.3105,
"num_input_tokens_seen": 111433348,
"step": 192500,
"train_runtime": 31.7519,
"train_tokens_per_second": 3509506.821
},
{
"epoch": 10.068864774624373,
"grad_norm": 2.5163190364837646,
"learning_rate": 2.482796848914858e-05,
"loss": 1.3183,
"num_input_tokens_seen": 111720916,
"step": 193000,
"train_runtime": 52.2905,
"train_tokens_per_second": 2136543.334
},
{
"epoch": 10.094949916527545,
"grad_norm": 2.6350646018981934,
"learning_rate": 2.4762755634390652e-05,
"loss": 1.3066,
"num_input_tokens_seen": 112012948,
"step": 193500,
"train_runtime": 72.2858,
"train_tokens_per_second": 1549584.477
},
{
"epoch": 10.121035058430717,
"grad_norm": 2.0416669845581055,
"learning_rate": 2.4697542779632723e-05,
"loss": 1.3383,
"num_input_tokens_seen": 112299028,
"step": 194000,
"train_runtime": 92.8402,
"train_tokens_per_second": 1209594.915
},
{
"epoch": 10.14712020033389,
"grad_norm": 2.219244956970215,
"learning_rate": 2.463232992487479e-05,
"loss": 1.3436,
"num_input_tokens_seen": 112590044,
"step": 194500,
"train_runtime": 113.0832,
"train_tokens_per_second": 995639.026
},
{
"epoch": 10.173205342237061,
"grad_norm": 3.015204429626465,
"learning_rate": 2.4567117070116863e-05,
"loss": 1.3393,
"num_input_tokens_seen": 112885940,
"step": 195000,
"train_runtime": 133.1713,
"train_tokens_per_second": 847674.91
},
{
"epoch": 10.199290484140233,
"grad_norm": 2.1486213207244873,
"learning_rate": 2.4501904215358934e-05,
"loss": 1.3463,
"num_input_tokens_seen": 113169852,
"step": 195500,
"train_runtime": 153.2143,
"train_tokens_per_second": 738637.549
},
{
"epoch": 10.225375626043405,
"grad_norm": 2.8701765537261963,
"learning_rate": 2.4436691360601002e-05,
"loss": 1.3125,
"num_input_tokens_seen": 113459500,
"step": 196000,
"train_runtime": 173.4235,
"train_tokens_per_second": 654233.54
},
{
"epoch": 10.251460767946577,
"grad_norm": 2.4410154819488525,
"learning_rate": 2.4371478505843074e-05,
"loss": 1.3423,
"num_input_tokens_seen": 113754868,
"step": 196500,
"train_runtime": 193.1046,
"train_tokens_per_second": 589084.328
},
{
"epoch": 10.27754590984975,
"grad_norm": 2.3649730682373047,
"learning_rate": 2.4306265651085145e-05,
"loss": 1.3583,
"num_input_tokens_seen": 114041052,
"step": 197000,
"train_runtime": 211.6778,
"train_tokens_per_second": 538748.365
},
{
"epoch": 10.303631051752921,
"grad_norm": 2.661882162094116,
"learning_rate": 2.4241052796327213e-05,
"loss": 1.3226,
"num_input_tokens_seen": 114327300,
"step": 197500,
"train_runtime": 230.1549,
"train_tokens_per_second": 496740.752
},
{
"epoch": 10.329716193656093,
"grad_norm": 3.2307496070861816,
"learning_rate": 2.4175839941569285e-05,
"loss": 1.3317,
"num_input_tokens_seen": 114614836,
"step": 198000,
"train_runtime": 248.6123,
"train_tokens_per_second": 461018.428
},
{
"epoch": 10.355801335559265,
"grad_norm": 2.0446155071258545,
"learning_rate": 2.4110627086811353e-05,
"loss": 1.3289,
"num_input_tokens_seen": 114898460,
"step": 198500,
"train_runtime": 267.0794,
"train_tokens_per_second": 430203.456
},
{
"epoch": 10.381886477462437,
"grad_norm": 2.149264335632324,
"learning_rate": 2.4045414232053424e-05,
"loss": 1.3479,
"num_input_tokens_seen": 115190612,
"step": 199000,
"train_runtime": 285.5801,
"train_tokens_per_second": 403356.593
},
{
"epoch": 10.40797161936561,
"grad_norm": 2.5007822513580322,
"learning_rate": 2.3980201377295496e-05,
"loss": 1.3398,
"num_input_tokens_seen": 115480604,
"step": 199500,
"train_runtime": 304.0559,
"train_tokens_per_second": 379800.589
},
{
"epoch": 10.434056761268781,
"grad_norm": 2.485358238220215,
"learning_rate": 2.3914988522537564e-05,
"loss": 1.3471,
"num_input_tokens_seen": 115772396,
"step": 200000,
"train_runtime": 322.5534,
"train_tokens_per_second": 358924.703
},
{
"epoch": 10.460141903171953,
"grad_norm": 3.0661306381225586,
"learning_rate": 2.3849775667779635e-05,
"loss": 1.3459,
"num_input_tokens_seen": 116055028,
"step": 200500,
"train_runtime": 341.0251,
"train_tokens_per_second": 340312.297
},
{
"epoch": 10.486227045075125,
"grad_norm": 3.0374038219451904,
"learning_rate": 2.3784562813021703e-05,
"loss": 1.3294,
"num_input_tokens_seen": 116342956,
"step": 201000,
"train_runtime": 359.5564,
"train_tokens_per_second": 323573.635
},
{
"epoch": 10.512312186978297,
"grad_norm": 2.4844298362731934,
"learning_rate": 2.371934995826377e-05,
"loss": 1.347,
"num_input_tokens_seen": 116629444,
"step": 201500,
"train_runtime": 378.1517,
"train_tokens_per_second": 308419.724
},
{
"epoch": 10.53839732888147,
"grad_norm": 3.5257129669189453,
"learning_rate": 2.3654137103505843e-05,
"loss": 1.3621,
"num_input_tokens_seen": 116918476,
"step": 202000,
"train_runtime": 396.7415,
"train_tokens_per_second": 294696.879
},
{
"epoch": 10.564482470784641,
"grad_norm": 2.989980936050415,
"learning_rate": 2.3588924248747914e-05,
"loss": 1.3474,
"num_input_tokens_seen": 117203300,
"step": 202500,
"train_runtime": 415.3093,
"train_tokens_per_second": 282207.249
},
{
"epoch": 10.590567612687813,
"grad_norm": 2.9134278297424316,
"learning_rate": 2.3523711393989982e-05,
"loss": 1.3293,
"num_input_tokens_seen": 117490356,
"step": 203000,
"train_runtime": 433.8487,
"train_tokens_per_second": 270809.506
},
{
"epoch": 10.616652754590985,
"grad_norm": 3.4408249855041504,
"learning_rate": 2.3458498539232054e-05,
"loss": 1.3323,
"num_input_tokens_seen": 117778116,
"step": 203500,
"train_runtime": 452.4151,
"train_tokens_per_second": 260331.947
},
{
"epoch": 10.642737896494157,
"grad_norm": 2.5976977348327637,
"learning_rate": 2.3393285684474125e-05,
"loss": 1.326,
"num_input_tokens_seen": 118066028,
"step": 204000,
"train_runtime": 471.0506,
"train_tokens_per_second": 250644.038
},
{
"epoch": 10.66882303839733,
"grad_norm": 2.8414862155914307,
"learning_rate": 2.3328072829716193e-05,
"loss": 1.3268,
"num_input_tokens_seen": 118349812,
"step": 204500,
"train_runtime": 489.6127,
"train_tokens_per_second": 241721.304
},
{
"epoch": 10.694908180300501,
"grad_norm": 2.9611923694610596,
"learning_rate": 2.3262859974958265e-05,
"loss": 1.3183,
"num_input_tokens_seen": 118641012,
"step": 205000,
"train_runtime": 508.2873,
"train_tokens_per_second": 233413.297
},
{
"epoch": 10.720993322203674,
"grad_norm": 3.3537490367889404,
"learning_rate": 2.3197647120200336e-05,
"loss": 1.3344,
"num_input_tokens_seen": 118928020,
"step": 205500,
"train_runtime": 526.9474,
"train_tokens_per_second": 225692.395
},
{
"epoch": 10.747078464106846,
"grad_norm": 2.557131290435791,
"learning_rate": 2.3132434265442404e-05,
"loss": 1.341,
"num_input_tokens_seen": 119221628,
"step": 206000,
"train_runtime": 545.6837,
"train_tokens_per_second": 218481.209
},
{
"epoch": 10.773163606010016,
"grad_norm": 3.0086355209350586,
"learning_rate": 2.3067221410684476e-05,
"loss": 1.3298,
"num_input_tokens_seen": 119513436,
"step": 206500,
"train_runtime": 564.4783,
"train_tokens_per_second": 211723.717
},
{
"epoch": 10.79924874791319,
"grad_norm": 3.600940227508545,
"learning_rate": 2.3002008555926547e-05,
"loss": 1.3572,
"num_input_tokens_seen": 119801196,
"step": 207000,
"train_runtime": 583.2037,
"train_tokens_per_second": 205419.144
},
{
"epoch": 10.82533388981636,
"grad_norm": 2.5225415229797363,
"learning_rate": 2.2936795701168615e-05,
"loss": 1.3173,
"num_input_tokens_seen": 120090740,
"step": 207500,
"train_runtime": 601.9046,
"train_tokens_per_second": 199517.884
},
{
"epoch": 10.851419031719532,
"grad_norm": 2.092555046081543,
"learning_rate": 2.2871582846410687e-05,
"loss": 1.3557,
"num_input_tokens_seen": 120377796,
"step": 208000,
"train_runtime": 620.6064,
"train_tokens_per_second": 193968.023
},
{
"epoch": 10.877504173622704,
"grad_norm": 2.5600435733795166,
"learning_rate": 2.2806369991652758e-05,
"loss": 1.3432,
"num_input_tokens_seen": 120669548,
"step": 208500,
"train_runtime": 639.2536,
"train_tokens_per_second": 188766.325
},
{
"epoch": 10.903589315525876,
"grad_norm": 2.583836793899536,
"learning_rate": 2.2741157136894826e-05,
"loss": 1.3376,
"num_input_tokens_seen": 120961348,
"step": 209000,
"train_runtime": 657.9319,
"train_tokens_per_second": 183850.86
},
{
"epoch": 10.929674457429048,
"grad_norm": 3.099386692047119,
"learning_rate": 2.2675944282136898e-05,
"loss": 1.3296,
"num_input_tokens_seen": 121257580,
"step": 209500,
"train_runtime": 676.7145,
"train_tokens_per_second": 179185.735
},
{
"epoch": 10.95575959933222,
"grad_norm": 3.329822063446045,
"learning_rate": 2.2610731427378966e-05,
"loss": 1.3424,
"num_input_tokens_seen": 121550684,
"step": 210000,
"train_runtime": 695.457,
"train_tokens_per_second": 174778.155
},
{
"epoch": 10.981844741235392,
"grad_norm": 2.160890817642212,
"learning_rate": 2.2545518572621034e-05,
"loss": 1.3391,
"num_input_tokens_seen": 121840244,
"step": 210500,
"train_runtime": 714.1173,
"train_tokens_per_second": 170616.572
},
{
"epoch": 11.0,
"eval_loss": 1.3036798238754272,
"eval_runtime": 45.5874,
"eval_samples_per_second": 840.912,
"eval_steps_per_second": 105.117,
"num_input_tokens_seen": 122042976,
"step": 210848
},
{
"epoch": 11.007929883138564,
"grad_norm": 2.8093433380126953,
"learning_rate": 2.2480305717863105e-05,
"loss": 1.3252,
"num_input_tokens_seen": 122133808,
"step": 211000,
"train_runtime": 779.5622,
"train_tokens_per_second": 156669.744
},
{
"epoch": 11.034015025041736,
"grad_norm": 2.5687525272369385,
"learning_rate": 2.2415092863105177e-05,
"loss": 1.3285,
"num_input_tokens_seen": 122424408,
"step": 211500,
"train_runtime": 798.366,
"train_tokens_per_second": 153343.713
},
{
"epoch": 11.060100166944908,
"grad_norm": 2.920220136642456,
"learning_rate": 2.2349880008347245e-05,
"loss": 1.2892,
"num_input_tokens_seen": 122706872,
"step": 212000,
"train_runtime": 817.1043,
"train_tokens_per_second": 150172.829
},
{
"epoch": 11.08618530884808,
"grad_norm": 2.7014081478118896,
"learning_rate": 2.2284667153589316e-05,
"loss": 1.3207,
"num_input_tokens_seen": 122993992,
"step": 212500,
"train_runtime": 835.8914,
"train_tokens_per_second": 147141.106
},
{
"epoch": 11.112270450751252,
"grad_norm": 2.6697499752044678,
"learning_rate": 2.2219454298831388e-05,
"loss": 1.3299,
"num_input_tokens_seen": 123284616,
"step": 213000,
"train_runtime": 854.6172,
"train_tokens_per_second": 144257.114
},
{
"epoch": 11.138355592654424,
"grad_norm": 3.0389206409454346,
"learning_rate": 2.2154241444073456e-05,
"loss": 1.3267,
"num_input_tokens_seen": 123574760,
"step": 213500,
"train_runtime": 873.3482,
"train_tokens_per_second": 141495.405
},
{
"epoch": 11.164440734557596,
"grad_norm": 2.5090649127960205,
"learning_rate": 2.2089028589315527e-05,
"loss": 1.3173,
"num_input_tokens_seen": 123863512,
"step": 214000,
"train_runtime": 892.119,
"train_tokens_per_second": 138841.92
},
{
"epoch": 11.190525876460768,
"grad_norm": 2.458717107772827,
"learning_rate": 2.2023815734557595e-05,
"loss": 1.3488,
"num_input_tokens_seen": 124153280,
"step": 214500,
"train_runtime": 910.8704,
"train_tokens_per_second": 136301.807
},
{
"epoch": 11.21661101836394,
"grad_norm": 2.2780613899230957,
"learning_rate": 2.1958602879799667e-05,
"loss": 1.3227,
"num_input_tokens_seen": 124441304,
"step": 215000,
"train_runtime": 929.5347,
"train_tokens_per_second": 133874.841
},
{
"epoch": 11.242696160267112,
"grad_norm": 2.2592554092407227,
"learning_rate": 2.189339002504174e-05,
"loss": 1.3417,
"num_input_tokens_seen": 124732192,
"step": 215500,
"train_runtime": 948.4081,
"train_tokens_per_second": 131517.428
},
{
"epoch": 11.268781302170284,
"grad_norm": 1.9092062711715698,
"learning_rate": 2.1828177170283806e-05,
"loss": 1.3168,
"num_input_tokens_seen": 125026840,
"step": 216000,
"train_runtime": 967.1853,
"train_tokens_per_second": 129268.756
},
{
"epoch": 11.294866444073456,
"grad_norm": 2.6668968200683594,
"learning_rate": 2.1762964315525878e-05,
"loss": 1.3158,
"num_input_tokens_seen": 125322792,
"step": 216500,
"train_runtime": 985.9404,
"train_tokens_per_second": 127109.902
},
{
"epoch": 11.320951585976628,
"grad_norm": 2.6406455039978027,
"learning_rate": 2.169775146076795e-05,
"loss": 1.3155,
"num_input_tokens_seen": 125610912,
"step": 217000,
"train_runtime": 1004.7846,
"train_tokens_per_second": 125012.78
},
{
"epoch": 11.3470367278798,
"grad_norm": 3.033663272857666,
"learning_rate": 2.1632538606010017e-05,
"loss": 1.3048,
"num_input_tokens_seen": 125899904,
"step": 217500,
"train_runtime": 1023.5588,
"train_tokens_per_second": 123002.125
},
{
"epoch": 11.373121869782972,
"grad_norm": 2.4079842567443848,
"learning_rate": 2.156732575125209e-05,
"loss": 1.3217,
"num_input_tokens_seen": 126190608,
"step": 218000,
"train_runtime": 1042.2822,
"train_tokens_per_second": 121071.437
},
{
"epoch": 11.399207011686144,
"grad_norm": 2.4821534156799316,
"learning_rate": 2.150211289649416e-05,
"loss": 1.3127,
"num_input_tokens_seen": 126477736,
"step": 218500,
"train_runtime": 1060.9849,
"train_tokens_per_second": 119207.852
},
{
"epoch": 11.425292153589316,
"grad_norm": 3.1184568405151367,
"learning_rate": 2.143690004173623e-05,
"loss": 1.3191,
"num_input_tokens_seen": 126768304,
"step": 219000,
"train_runtime": 1079.744,
"train_tokens_per_second": 117405.884
},
{
"epoch": 11.451377295492488,
"grad_norm": 2.4726860523223877,
"learning_rate": 2.1371687186978297e-05,
"loss": 1.3,
"num_input_tokens_seen": 127057344,
"step": 219500,
"train_runtime": 1098.4724,
"train_tokens_per_second": 115667.311
},
{
"epoch": 11.47746243739566,
"grad_norm": 2.8745577335357666,
"learning_rate": 2.1306474332220368e-05,
"loss": 1.3066,
"num_input_tokens_seen": 127342264,
"step": 220000,
"train_runtime": 1117.2372,
"train_tokens_per_second": 113979.609
},
{
"epoch": 11.503547579298832,
"grad_norm": 2.5106630325317383,
"learning_rate": 2.1241261477462436e-05,
"loss": 1.3081,
"num_input_tokens_seen": 127636384,
"step": 220500,
"train_runtime": 1136.0017,
"train_tokens_per_second": 112355.806
},
{
"epoch": 11.529632721202002,
"grad_norm": 2.9184515476226807,
"learning_rate": 2.1176048622704508e-05,
"loss": 1.3162,
"num_input_tokens_seen": 127929168,
"step": 221000,
"train_runtime": 1154.8123,
"train_tokens_per_second": 110779.183
},
{
"epoch": 11.555717863105176,
"grad_norm": 2.631758689880371,
"learning_rate": 2.111083576794658e-05,
"loss": 1.3154,
"num_input_tokens_seen": 128214768,
"step": 221500,
"train_runtime": 1173.5738,
"train_tokens_per_second": 109251.562
},
{
"epoch": 11.581803005008346,
"grad_norm": 3.0632224082946777,
"learning_rate": 2.1045622913188647e-05,
"loss": 1.3265,
"num_input_tokens_seen": 128502040,
"step": 222000,
"train_runtime": 1192.3765,
"train_tokens_per_second": 107769.681
},
{
"epoch": 11.607888146911518,
"grad_norm": 3.1149165630340576,
"learning_rate": 2.098041005843072e-05,
"loss": 1.321,
"num_input_tokens_seen": 128788576,
"step": 222500,
"train_runtime": 1211.1873,
"train_tokens_per_second": 106332.503
},
{
"epoch": 11.63397328881469,
"grad_norm": 3.4126601219177246,
"learning_rate": 2.091519720367279e-05,
"loss": 1.3089,
"num_input_tokens_seen": 129075456,
"step": 223000,
"train_runtime": 1229.9696,
"train_tokens_per_second": 104941.986
},
{
"epoch": 11.660058430717863,
"grad_norm": 2.5633208751678467,
"learning_rate": 2.0849984348914858e-05,
"loss": 1.3354,
"num_input_tokens_seen": 129363864,
"step": 223500,
"train_runtime": 1248.7371,
"train_tokens_per_second": 103595.756
},
{
"epoch": 11.686143572621035,
"grad_norm": 2.816091775894165,
"learning_rate": 2.078477149415693e-05,
"loss": 1.3338,
"num_input_tokens_seen": 129649336,
"step": 224000,
"train_runtime": 1267.5029,
"train_tokens_per_second": 102287.208
},
{
"epoch": 11.712228714524207,
"grad_norm": 3.5613439083099365,
"learning_rate": 2.0719558639399e-05,
"loss": 1.3199,
"num_input_tokens_seen": 129942320,
"step": 224500,
"train_runtime": 1286.259,
"train_tokens_per_second": 101023.451
},
{
"epoch": 11.738313856427379,
"grad_norm": 2.822772741317749,
"learning_rate": 2.065434578464107e-05,
"loss": 1.3044,
"num_input_tokens_seen": 130232704,
"step": 225000,
"train_runtime": 1305.0245,
"train_tokens_per_second": 99793.304
},
{
"epoch": 11.76439899833055,
"grad_norm": 2.610865592956543,
"learning_rate": 2.058913292988314e-05,
"loss": 1.3334,
"num_input_tokens_seen": 130524424,
"step": 225500,
"train_runtime": 1323.7569,
"train_tokens_per_second": 98601.505
},
{
"epoch": 11.790484140233723,
"grad_norm": 2.68410325050354,
"learning_rate": 2.0523920075125212e-05,
"loss": 1.3042,
"num_input_tokens_seen": 130811008,
"step": 226000,
"train_runtime": 1342.504,
"train_tokens_per_second": 97438.079
},
{
"epoch": 11.816569282136895,
"grad_norm": 2.4882125854492188,
"learning_rate": 2.045870722036728e-05,
"loss": 1.365,
"num_input_tokens_seen": 131095640,
"step": 226500,
"train_runtime": 1361.2815,
"train_tokens_per_second": 96303.109
},
{
"epoch": 11.842654424040067,
"grad_norm": 2.4496724605560303,
"learning_rate": 2.039349436560935e-05,
"loss": 1.3053,
"num_input_tokens_seen": 131380824,
"step": 227000,
"train_runtime": 1380.0428,
"train_tokens_per_second": 95200.546
},
{
"epoch": 11.868739565943239,
"grad_norm": 2.1208622455596924,
"learning_rate": 2.032828151085142e-05,
"loss": 1.3387,
"num_input_tokens_seen": 131669800,
"step": 227500,
"train_runtime": 1398.7962,
"train_tokens_per_second": 94130.797
},
{
"epoch": 11.89482470784641,
"grad_norm": 2.5656790733337402,
"learning_rate": 2.026306865609349e-05,
"loss": 1.3109,
"num_input_tokens_seen": 131956504,
"step": 228000,
"train_runtime": 1417.5824,
"train_tokens_per_second": 93085.598
},
{
"epoch": 11.920909849749583,
"grad_norm": 2.894057035446167,
"learning_rate": 2.019785580133556e-05,
"loss": 1.3385,
"num_input_tokens_seen": 132249552,
"step": 228500,
"train_runtime": 1436.3884,
"train_tokens_per_second": 92070.886
},
{
"epoch": 11.946994991652755,
"grad_norm": 4.0213446617126465,
"learning_rate": 2.013264294657763e-05,
"loss": 1.3252,
"num_input_tokens_seen": 132541072,
"step": 229000,
"train_runtime": 1455.1937,
"train_tokens_per_second": 91081.394
},
{
"epoch": 11.973080133555927,
"grad_norm": 2.279191255569458,
"learning_rate": 2.00674300918197e-05,
"loss": 1.3362,
"num_input_tokens_seen": 132831104,
"step": 229500,
"train_runtime": 1473.9285,
"train_tokens_per_second": 90120.453
},
{
"epoch": 11.999165275459099,
"grad_norm": 2.1568970680236816,
"learning_rate": 2.000221723706177e-05,
"loss": 1.293,
"num_input_tokens_seen": 133123320,
"step": 230000,
"train_runtime": 1492.6888,
"train_tokens_per_second": 89183.575
},
{
"epoch": 12.0,
"eval_loss": 1.303634762763977,
"eval_runtime": 45.533,
"eval_samples_per_second": 841.917,
"eval_steps_per_second": 105.242,
"num_input_tokens_seen": 133131832,
"step": 230016
},
{
"epoch": 12.02525041736227,
"grad_norm": 2.564668655395508,
"learning_rate": 1.9937004382303838e-05,
"loss": 1.2803,
"num_input_tokens_seen": 133411856,
"step": 230500,
"train_runtime": 1558.1778,
"train_tokens_per_second": 85620.431
},
{
"epoch": 12.051335559265443,
"grad_norm": 1.8836562633514404,
"learning_rate": 1.987179152754591e-05,
"loss": 1.3323,
"num_input_tokens_seen": 133703544,
"step": 231000,
"train_runtime": 1576.9538,
"train_tokens_per_second": 84785.959
},
{
"epoch": 12.077420701168615,
"grad_norm": 3.665679693222046,
"learning_rate": 1.980657867278798e-05,
"loss": 1.3101,
"num_input_tokens_seen": 133990048,
"step": 231500,
"train_runtime": 1595.7021,
"train_tokens_per_second": 83969.336
},
{
"epoch": 12.103505843071787,
"grad_norm": 2.481233596801758,
"learning_rate": 1.974136581803005e-05,
"loss": 1.3122,
"num_input_tokens_seen": 134279720,
"step": 232000,
"train_runtime": 1614.4866,
"train_tokens_per_second": 83171.778
},
{
"epoch": 12.129590984974959,
"grad_norm": 2.0712811946868896,
"learning_rate": 1.967615296327212e-05,
"loss": 1.3191,
"num_input_tokens_seen": 134570152,
"step": 232500,
"train_runtime": 1633.2616,
"train_tokens_per_second": 82393.51
},
{
"epoch": 12.15567612687813,
"grad_norm": 2.377253293991089,
"learning_rate": 1.9610940108514192e-05,
"loss": 1.303,
"num_input_tokens_seen": 134859336,
"step": 233000,
"train_runtime": 1652.0277,
"train_tokens_per_second": 81632.612
},
{
"epoch": 12.181761268781303,
"grad_norm": 2.749286651611328,
"learning_rate": 1.954572725375626e-05,
"loss": 1.3219,
"num_input_tokens_seen": 135151088,
"step": 233500,
"train_runtime": 1670.9562,
"train_tokens_per_second": 80882.482
},
{
"epoch": 12.207846410684475,
"grad_norm": 1.9715009927749634,
"learning_rate": 1.948051439899833e-05,
"loss": 1.3164,
"num_input_tokens_seen": 135441304,
"step": 234000,
"train_runtime": 1689.8315,
"train_tokens_per_second": 80150.776
},
{
"epoch": 12.233931552587647,
"grad_norm": 2.8835082054138184,
"learning_rate": 1.9415301544240403e-05,
"loss": 1.3164,
"num_input_tokens_seen": 135728888,
"step": 234500,
"train_runtime": 1708.6414,
"train_tokens_per_second": 79436.731
},
{
"epoch": 12.260016694490819,
"grad_norm": 2.7887117862701416,
"learning_rate": 1.935008868948247e-05,
"loss": 1.3003,
"num_input_tokens_seen": 136016392,
"step": 235000,
"train_runtime": 1727.3834,
"train_tokens_per_second": 78741.287
},
{
"epoch": 12.28610183639399,
"grad_norm": 2.219428777694702,
"learning_rate": 1.9284875834724543e-05,
"loss": 1.2853,
"num_input_tokens_seen": 136304528,
"step": 235500,
"train_runtime": 1746.1346,
"train_tokens_per_second": 78060.723
},
{
"epoch": 12.312186978297161,
"grad_norm": 2.7682409286499023,
"learning_rate": 1.9219662979966614e-05,
"loss": 1.3175,
"num_input_tokens_seen": 136593504,
"step": 236000,
"train_runtime": 1764.8652,
"train_tokens_per_second": 77395.999
},
{
"epoch": 12.338272120200333,
"grad_norm": 4.289463520050049,
"learning_rate": 1.9154450125208682e-05,
"loss": 1.2741,
"num_input_tokens_seen": 136885144,
"step": 236500,
"train_runtime": 1783.6052,
"train_tokens_per_second": 76746.323
},
{
"epoch": 12.364357262103505,
"grad_norm": 3.1798133850097656,
"learning_rate": 1.9089237270450754e-05,
"loss": 1.2896,
"num_input_tokens_seen": 137168736,
"step": 237000,
"train_runtime": 1802.3604,
"train_tokens_per_second": 76105.055
},
{
"epoch": 12.390442404006677,
"grad_norm": 3.9631903171539307,
"learning_rate": 1.9024024415692822e-05,
"loss": 1.3425,
"num_input_tokens_seen": 137463960,
"step": 237500,
"train_runtime": 1821.2214,
"train_tokens_per_second": 75478.997
},
{
"epoch": 12.41652754590985,
"grad_norm": 3.6029210090637207,
"learning_rate": 1.8958811560934893e-05,
"loss": 1.3134,
"num_input_tokens_seen": 137751968,
"step": 238000,
"train_runtime": 1839.9397,
"train_tokens_per_second": 74867.655
},
{
"epoch": 12.442612687813021,
"grad_norm": 2.178394317626953,
"learning_rate": 1.889359870617696e-05,
"loss": 1.2797,
"num_input_tokens_seen": 138044520,
"step": 238500,
"train_runtime": 1858.64,
"train_tokens_per_second": 74271.788
},
{
"epoch": 12.468697829716193,
"grad_norm": 2.5995266437530518,
"learning_rate": 1.8828385851419033e-05,
"loss": 1.3029,
"num_input_tokens_seen": 138334136,
"step": 239000,
"train_runtime": 1877.3231,
"train_tokens_per_second": 73686.909
},
{
"epoch": 12.494782971619365,
"grad_norm": 2.1378602981567383,
"learning_rate": 1.87631729966611e-05,
"loss": 1.3092,
"num_input_tokens_seen": 138621760,
"step": 239500,
"train_runtime": 1895.9609,
"train_tokens_per_second": 73114.252
},
{
"epoch": 12.520868113522537,
"grad_norm": 2.3101305961608887,
"learning_rate": 1.8697960141903172e-05,
"loss": 1.3457,
"num_input_tokens_seen": 138914632,
"step": 240000,
"train_runtime": 1914.6876,
"train_tokens_per_second": 72552.113
},
{
"epoch": 12.54695325542571,
"grad_norm": 2.8269946575164795,
"learning_rate": 1.8632747287145244e-05,
"loss": 1.3055,
"num_input_tokens_seen": 139199064,
"step": 240500,
"train_runtime": 1933.403,
"train_tokens_per_second": 71996.923
},
{
"epoch": 12.573038397328881,
"grad_norm": 4.536306858062744,
"learning_rate": 1.8567534432387312e-05,
"loss": 1.3104,
"num_input_tokens_seen": 139488888,
"step": 241000,
"train_runtime": 1952.205,
"train_tokens_per_second": 71451.969
},
{
"epoch": 12.599123539232053,
"grad_norm": 2.898843765258789,
"learning_rate": 1.8502321577629383e-05,
"loss": 1.2751,
"num_input_tokens_seen": 139777560,
"step": 241500,
"train_runtime": 1970.9694,
"train_tokens_per_second": 70918.18
},
{
"epoch": 12.625208681135225,
"grad_norm": 2.233572006225586,
"learning_rate": 1.8437108722871455e-05,
"loss": 1.2931,
"num_input_tokens_seen": 140065240,
"step": 242000,
"train_runtime": 1989.7056,
"train_tokens_per_second": 70394.956
},
{
"epoch": 12.651293823038397,
"grad_norm": 4.327518939971924,
"learning_rate": 1.8371895868113523e-05,
"loss": 1.2964,
"num_input_tokens_seen": 140353912,
"step": 242500,
"train_runtime": 2008.5433,
"train_tokens_per_second": 69878.458
},
{
"epoch": 12.67737896494157,
"grad_norm": 2.5169992446899414,
"learning_rate": 1.8306683013355594e-05,
"loss": 1.3056,
"num_input_tokens_seen": 140643424,
"step": 243000,
"train_runtime": 2027.3392,
"train_tokens_per_second": 69373.405
},
{
"epoch": 12.703464106844741,
"grad_norm": 2.1607372760772705,
"learning_rate": 1.8241470158597666e-05,
"loss": 1.2978,
"num_input_tokens_seen": 140936888,
"step": 243500,
"train_runtime": 2046.1461,
"train_tokens_per_second": 68879.19
},
{
"epoch": 12.729549248747913,
"grad_norm": 3.104569673538208,
"learning_rate": 1.8176257303839734e-05,
"loss": 1.3203,
"num_input_tokens_seen": 141227736,
"step": 244000,
"train_runtime": 2064.9328,
"train_tokens_per_second": 68393.38
},
{
"epoch": 12.755634390651085,
"grad_norm": 2.6793630123138428,
"learning_rate": 1.8111044449081805e-05,
"loss": 1.2928,
"num_input_tokens_seen": 141513976,
"step": 244500,
"train_runtime": 2083.7183,
"train_tokens_per_second": 67914.159
},
{
"epoch": 12.781719532554257,
"grad_norm": 2.779440402984619,
"learning_rate": 1.8045831594323873e-05,
"loss": 1.2963,
"num_input_tokens_seen": 141801952,
"step": 245000,
"train_runtime": 2102.5169,
"train_tokens_per_second": 67443.905
},
{
"epoch": 12.80780467445743,
"grad_norm": 2.685547351837158,
"learning_rate": 1.7980618739565945e-05,
"loss": 1.3113,
"num_input_tokens_seen": 142087288,
"step": 245500,
"train_runtime": 2121.2928,
"train_tokens_per_second": 66981.458
},
{
"epoch": 12.833889816360601,
"grad_norm": 3.5041792392730713,
"learning_rate": 1.7915405884808016e-05,
"loss": 1.3062,
"num_input_tokens_seen": 142379312,
"step": 246000,
"train_runtime": 2140.0752,
"train_tokens_per_second": 66530.051
},
{
"epoch": 12.859974958263773,
"grad_norm": 3.0701446533203125,
"learning_rate": 1.7850193030050084e-05,
"loss": 1.3036,
"num_input_tokens_seen": 142666568,
"step": 246500,
"train_runtime": 2158.8062,
"train_tokens_per_second": 66085.862
},
{
"epoch": 12.886060100166945,
"grad_norm": 1.8722320795059204,
"learning_rate": 1.7784980175292152e-05,
"loss": 1.3004,
"num_input_tokens_seen": 142954624,
"step": 247000,
"train_runtime": 2177.6361,
"train_tokens_per_second": 65646.701
},
{
"epoch": 12.912145242070117,
"grad_norm": 3.499333381652832,
"learning_rate": 1.7719767320534224e-05,
"loss": 1.3213,
"num_input_tokens_seen": 143246680,
"step": 247500,
"train_runtime": 2196.4459,
"train_tokens_per_second": 65217.486
},
{
"epoch": 12.93823038397329,
"grad_norm": 4.5629353523254395,
"learning_rate": 1.7654554465776292e-05,
"loss": 1.3231,
"num_input_tokens_seen": 143537736,
"step": 248000,
"train_runtime": 2215.2759,
"train_tokens_per_second": 64794.52
},
{
"epoch": 12.964315525876462,
"grad_norm": 3.0510342121124268,
"learning_rate": 1.7589341611018363e-05,
"loss": 1.2966,
"num_input_tokens_seen": 143823008,
"step": 248500,
"train_runtime": 2233.9986,
"train_tokens_per_second": 64379.186
},
{
"epoch": 12.990400667779634,
"grad_norm": 3.152311325073242,
"learning_rate": 1.7524128756260435e-05,
"loss": 1.2741,
"num_input_tokens_seen": 144116976,
"step": 249000,
"train_runtime": 2252.7592,
"train_tokens_per_second": 63973.537
},
{
"epoch": 13.0,
"eval_loss": 1.3037497997283936,
"eval_runtime": 45.363,
"eval_samples_per_second": 845.072,
"eval_steps_per_second": 105.637,
"num_input_tokens_seen": 144224222,
"step": 249184
},
{
"epoch": 13.016485809682806,
"grad_norm": 2.950641632080078,
"learning_rate": 1.7458915901502503e-05,
"loss": 1.2892,
"num_input_tokens_seen": 144404846,
"step": 249500,
"train_runtime": 2317.872,
"train_tokens_per_second": 62300.612
},
{
"epoch": 13.042570951585976,
"grad_norm": 3.1258602142333984,
"learning_rate": 1.7393703046744574e-05,
"loss": 1.279,
"num_input_tokens_seen": 144697406,
"step": 250000,
"train_runtime": 2336.6661,
"train_tokens_per_second": 61924.725
},
{
"epoch": 13.068656093489148,
"grad_norm": 2.8600733280181885,
"learning_rate": 1.7328490191986646e-05,
"loss": 1.2856,
"num_input_tokens_seen": 144992526,
"step": 250500,
"train_runtime": 2355.4549,
"train_tokens_per_second": 61556.062
},
{
"epoch": 13.09474123539232,
"grad_norm": 2.740837335586548,
"learning_rate": 1.7263277337228714e-05,
"loss": 1.2793,
"num_input_tokens_seen": 145286206,
"step": 251000,
"train_runtime": 2374.3019,
"train_tokens_per_second": 61191.125
},
{
"epoch": 13.120826377295492,
"grad_norm": 2.514106035232544,
"learning_rate": 1.7198064482470785e-05,
"loss": 1.2966,
"num_input_tokens_seen": 145576638,
"step": 251500,
"train_runtime": 2393.1024,
"train_tokens_per_second": 60831.763
},
{
"epoch": 13.146911519198664,
"grad_norm": 2.3407087326049805,
"learning_rate": 1.7132851627712857e-05,
"loss": 1.288,
"num_input_tokens_seen": 145861950,
"step": 252000,
"train_runtime": 2411.8629,
"train_tokens_per_second": 60476.884
},
{
"epoch": 13.172996661101836,
"grad_norm": 2.940520763397217,
"learning_rate": 1.7067638772954925e-05,
"loss": 1.2828,
"num_input_tokens_seen": 146153318,
"step": 252500,
"train_runtime": 2430.6861,
"train_tokens_per_second": 60128.423
},
{
"epoch": 13.199081803005008,
"grad_norm": 2.352440595626831,
"learning_rate": 1.7002425918196996e-05,
"loss": 1.3483,
"num_input_tokens_seen": 146442846,
"step": 253000,
"train_runtime": 2449.4406,
"train_tokens_per_second": 59786.24
},
{
"epoch": 13.22516694490818,
"grad_norm": 3.5476200580596924,
"learning_rate": 1.6937213063439068e-05,
"loss": 1.286,
"num_input_tokens_seen": 146729830,
"step": 253500,
"train_runtime": 2468.227,
"train_tokens_per_second": 59447.462
},
{
"epoch": 13.251252086811352,
"grad_norm": 3.1068811416625977,
"learning_rate": 1.6872000208681136e-05,
"loss": 1.2873,
"num_input_tokens_seen": 147026030,
"step": 254000,
"train_runtime": 2486.9722,
"train_tokens_per_second": 59118.484
},
{
"epoch": 13.277337228714524,
"grad_norm": 3.000011920928955,
"learning_rate": 1.6806787353923207e-05,
"loss": 1.2832,
"num_input_tokens_seen": 147309830,
"step": 254500,
"train_runtime": 2505.7198,
"train_tokens_per_second": 58789.428
},
{
"epoch": 13.303422370617696,
"grad_norm": 3.2478373050689697,
"learning_rate": 1.674157449916528e-05,
"loss": 1.3025,
"num_input_tokens_seen": 147604054,
"step": 255000,
"train_runtime": 2524.5534,
"train_tokens_per_second": 58467.393
},
{
"epoch": 13.329507512520868,
"grad_norm": 2.5078775882720947,
"learning_rate": 1.6676361644407347e-05,
"loss": 1.2669,
"num_input_tokens_seen": 147894782,
"step": 255500,
"train_runtime": 2543.3336,
"train_tokens_per_second": 58149.974
},
{
"epoch": 13.35559265442404,
"grad_norm": 2.6515934467315674,
"learning_rate": 1.6611148789649415e-05,
"loss": 1.2827,
"num_input_tokens_seen": 148189078,
"step": 256000,
"train_runtime": 2562.0637,
"train_tokens_per_second": 57839.731
},
{
"epoch": 13.381677796327212,
"grad_norm": 3.669487237930298,
"learning_rate": 1.6545935934891486e-05,
"loss": 1.3063,
"num_input_tokens_seen": 148477710,
"step": 256500,
"train_runtime": 2580.8969,
"train_tokens_per_second": 57529.5
},
{
"epoch": 13.407762938230384,
"grad_norm": 2.5362067222595215,
"learning_rate": 1.6480723080133555e-05,
"loss": 1.311,
"num_input_tokens_seen": 148771438,
"step": 257000,
"train_runtime": 2599.7745,
"train_tokens_per_second": 57224.747
},
{
"epoch": 13.433848080133556,
"grad_norm": 1.743450403213501,
"learning_rate": 1.6415510225375626e-05,
"loss": 1.2843,
"num_input_tokens_seen": 149060526,
"step": 257500,
"train_runtime": 2618.5247,
"train_tokens_per_second": 56925.386
},
{
"epoch": 13.459933222036728,
"grad_norm": 2.875257968902588,
"learning_rate": 1.6350297370617697e-05,
"loss": 1.2692,
"num_input_tokens_seen": 149346974,
"step": 258000,
"train_runtime": 2637.3123,
"train_tokens_per_second": 56628.474
},
{
"epoch": 13.4860183639399,
"grad_norm": 3.3050479888916016,
"learning_rate": 1.6285084515859766e-05,
"loss": 1.2869,
"num_input_tokens_seen": 149633070,
"step": 258500,
"train_runtime": 2656.0943,
"train_tokens_per_second": 56335.751
},
{
"epoch": 13.512103505843072,
"grad_norm": 2.2370221614837646,
"learning_rate": 1.6219871661101837e-05,
"loss": 1.3004,
"num_input_tokens_seen": 149926758,
"step": 259000,
"train_runtime": 2674.8246,
"train_tokens_per_second": 56051.06
},
{
"epoch": 13.538188647746244,
"grad_norm": 4.20009183883667,
"learning_rate": 1.615465880634391e-05,
"loss": 1.2629,
"num_input_tokens_seen": 150212054,
"step": 259500,
"train_runtime": 2693.5708,
"train_tokens_per_second": 55766.885
},
{
"epoch": 13.564273789649416,
"grad_norm": 2.247492551803589,
"learning_rate": 1.6089445951585977e-05,
"loss": 1.3251,
"num_input_tokens_seen": 150502366,
"step": 260000,
"train_runtime": 2712.3292,
"train_tokens_per_second": 55488.237
},
{
"epoch": 13.590358931552588,
"grad_norm": 2.1950037479400635,
"learning_rate": 1.6024233096828048e-05,
"loss": 1.2798,
"num_input_tokens_seen": 150787110,
"step": 260500,
"train_runtime": 2731.1083,
"train_tokens_per_second": 55210.959
},
{
"epoch": 13.61644407345576,
"grad_norm": 2.5948126316070557,
"learning_rate": 1.5959020242070116e-05,
"loss": 1.2685,
"num_input_tokens_seen": 151072982,
"step": 261000,
"train_runtime": 2749.822,
"train_tokens_per_second": 54939.185
},
{
"epoch": 13.642529215358932,
"grad_norm": 3.1042332649230957,
"learning_rate": 1.5893807387312188e-05,
"loss": 1.2917,
"num_input_tokens_seen": 151366958,
"step": 261500,
"train_runtime": 2768.6641,
"train_tokens_per_second": 54671.478
},
{
"epoch": 13.668614357262104,
"grad_norm": 2.2142746448516846,
"learning_rate": 1.582859453255426e-05,
"loss": 1.2928,
"num_input_tokens_seen": 151651278,
"step": 262000,
"train_runtime": 2787.4176,
"train_tokens_per_second": 54405.653
},
{
"epoch": 13.694699499165276,
"grad_norm": 2.406888008117676,
"learning_rate": 1.5763381677796327e-05,
"loss": 1.2888,
"num_input_tokens_seen": 151940174,
"step": 262500,
"train_runtime": 2806.1465,
"train_tokens_per_second": 54145.488
},
{
"epoch": 13.720784641068448,
"grad_norm": 2.989021062850952,
"learning_rate": 1.56981688230384e-05,
"loss": 1.3058,
"num_input_tokens_seen": 152226926,
"step": 263000,
"train_runtime": 2824.9214,
"train_tokens_per_second": 53887.137
},
{
"epoch": 13.746869782971618,
"grad_norm": 2.4519472122192383,
"learning_rate": 1.563295596828047e-05,
"loss": 1.3242,
"num_input_tokens_seen": 152519390,
"step": 263500,
"train_runtime": 2843.6976,
"train_tokens_per_second": 53634.181
},
{
"epoch": 13.772954924874792,
"grad_norm": 3.375582456588745,
"learning_rate": 1.5567743113522538e-05,
"loss": 1.2878,
"num_input_tokens_seen": 152810446,
"step": 264000,
"train_runtime": 2862.4801,
"train_tokens_per_second": 53383.932
},
{
"epoch": 13.799040066777962,
"grad_norm": 2.5288329124450684,
"learning_rate": 1.550253025876461e-05,
"loss": 1.279,
"num_input_tokens_seen": 153100030,
"step": 264500,
"train_runtime": 2881.2536,
"train_tokens_per_second": 53136.604
},
{
"epoch": 13.825125208681134,
"grad_norm": 2.273123025894165,
"learning_rate": 1.5437317404006678e-05,
"loss": 1.2912,
"num_input_tokens_seen": 153385646,
"step": 265000,
"train_runtime": 2900.0148,
"train_tokens_per_second": 52891.332
},
{
"epoch": 13.851210350584306,
"grad_norm": 5.488306522369385,
"learning_rate": 1.537210454924875e-05,
"loss": 1.3079,
"num_input_tokens_seen": 153672086,
"step": 265500,
"train_runtime": 2918.7985,
"train_tokens_per_second": 52649.091
},
{
"epoch": 13.877295492487479,
"grad_norm": 2.2071919441223145,
"learning_rate": 1.5306891694490817e-05,
"loss": 1.3046,
"num_input_tokens_seen": 153960638,
"step": 266000,
"train_runtime": 2937.5127,
"train_tokens_per_second": 52411.906
},
{
"epoch": 13.90338063439065,
"grad_norm": 3.046309471130371,
"learning_rate": 1.524167883973289e-05,
"loss": 1.2832,
"num_input_tokens_seen": 154246150,
"step": 266500,
"train_runtime": 2956.2532,
"train_tokens_per_second": 52176.231
},
{
"epoch": 13.929465776293823,
"grad_norm": 2.4747865200042725,
"learning_rate": 1.5176465984974958e-05,
"loss": 1.2976,
"num_input_tokens_seen": 154534870,
"step": 267000,
"train_runtime": 2975.0146,
"train_tokens_per_second": 51944.238
},
{
"epoch": 13.955550918196995,
"grad_norm": 2.148017168045044,
"learning_rate": 1.511125313021703e-05,
"loss": 1.3016,
"num_input_tokens_seen": 154821518,
"step": 267500,
"train_runtime": 2993.7975,
"train_tokens_per_second": 51714.091
},
{
"epoch": 13.981636060100167,
"grad_norm": 2.248180389404297,
"learning_rate": 1.50460402754591e-05,
"loss": 1.2983,
"num_input_tokens_seen": 155115046,
"step": 268000,
"train_runtime": 3012.5625,
"train_tokens_per_second": 51489.403
},
{
"epoch": 14.0,
"eval_loss": 1.2995389699935913,
"eval_runtime": 45.4147,
"eval_samples_per_second": 844.109,
"eval_steps_per_second": 105.516,
"num_input_tokens_seen": 155319448,
"step": 268352
},
{
"epoch": 14.007721202003339,
"grad_norm": 3.0312399864196777,
"learning_rate": 1.4980827420701168e-05,
"loss": 1.3027,
"num_input_tokens_seen": 155408024,
"step": 268500,
"train_runtime": 3077.7818,
"train_tokens_per_second": 50493.516
},
{
"epoch": 14.03380634390651,
"grad_norm": 4.309081077575684,
"learning_rate": 1.4915614565943239e-05,
"loss": 1.2652,
"num_input_tokens_seen": 155690152,
"step": 269000,
"train_runtime": 3096.5771,
"train_tokens_per_second": 50278.144
},
{
"epoch": 14.059891485809683,
"grad_norm": 2.96939754486084,
"learning_rate": 1.485040171118531e-05,
"loss": 1.271,
"num_input_tokens_seen": 155981000,
"step": 269500,
"train_runtime": 3115.3826,
"train_tokens_per_second": 50068.008
},
{
"epoch": 14.085976627712855,
"grad_norm": 2.4417145252227783,
"learning_rate": 1.4785188856427379e-05,
"loss": 1.2753,
"num_input_tokens_seen": 156272536,
"step": 270000,
"train_runtime": 3134.181,
"train_tokens_per_second": 49860.724
},
{
"epoch": 14.112061769616027,
"grad_norm": 3.6525328159332275,
"learning_rate": 1.471997600166945e-05,
"loss": 1.2708,
"num_input_tokens_seen": 156564232,
"step": 270500,
"train_runtime": 3152.9933,
"train_tokens_per_second": 49655.746
},
{
"epoch": 14.138146911519199,
"grad_norm": 2.702702045440674,
"learning_rate": 1.4654763146911522e-05,
"loss": 1.2644,
"num_input_tokens_seen": 156847192,
"step": 271000,
"train_runtime": 3171.767,
"train_tokens_per_second": 49451.045
},
{
"epoch": 14.16423205342237,
"grad_norm": 2.738504648208618,
"learning_rate": 1.458955029215359e-05,
"loss": 1.2735,
"num_input_tokens_seen": 157138056,
"step": 271500,
"train_runtime": 3190.5858,
"train_tokens_per_second": 49250.534
},
{
"epoch": 14.190317195325543,
"grad_norm": 2.680459976196289,
"learning_rate": 1.4524337437395661e-05,
"loss": 1.2923,
"num_input_tokens_seen": 157427656,
"step": 272000,
"train_runtime": 3209.3923,
"train_tokens_per_second": 49052.17
},
{
"epoch": 14.216402337228715,
"grad_norm": 2.5472817420959473,
"learning_rate": 1.4459124582637731e-05,
"loss": 1.2812,
"num_input_tokens_seen": 157714904,
"step": 272500,
"train_runtime": 3228.1634,
"train_tokens_per_second": 48855.924
},
{
"epoch": 14.242487479131887,
"grad_norm": 2.909809112548828,
"learning_rate": 1.4393911727879799e-05,
"loss": 1.3002,
"num_input_tokens_seen": 158004216,
"step": 273000,
"train_runtime": 3246.9319,
"train_tokens_per_second": 48662.621
},
{
"epoch": 14.268572621035059,
"grad_norm": 3.222720146179199,
"learning_rate": 1.432869887312187e-05,
"loss": 1.2887,
"num_input_tokens_seen": 158292352,
"step": 273500,
"train_runtime": 3265.647,
"train_tokens_per_second": 48471.973
},
{
"epoch": 14.29465776293823,
"grad_norm": 1.991113543510437,
"learning_rate": 1.4263486018363942e-05,
"loss": 1.2627,
"num_input_tokens_seen": 158587024,
"step": 274000,
"train_runtime": 3284.4013,
"train_tokens_per_second": 48284.91
},
{
"epoch": 14.320742904841403,
"grad_norm": 2.8505282402038574,
"learning_rate": 1.419827316360601e-05,
"loss": 1.2836,
"num_input_tokens_seen": 158886520,
"step": 274500,
"train_runtime": 3303.3083,
"train_tokens_per_second": 48099.209
},
{
"epoch": 14.346828046744575,
"grad_norm": 2.9469573497772217,
"learning_rate": 1.4133060308848081e-05,
"loss": 1.2749,
"num_input_tokens_seen": 159177696,
"step": 275000,
"train_runtime": 3322.098,
"train_tokens_per_second": 47914.811
},
{
"epoch": 14.372913188647747,
"grad_norm": 4.244631767272949,
"learning_rate": 1.4067847454090153e-05,
"loss": 1.2695,
"num_input_tokens_seen": 159460280,
"step": 275500,
"train_runtime": 3340.8943,
"train_tokens_per_second": 47729.819
},
{
"epoch": 14.398998330550919,
"grad_norm": 3.174166440963745,
"learning_rate": 1.4002634599332221e-05,
"loss": 1.2888,
"num_input_tokens_seen": 159745000,
"step": 276000,
"train_runtime": 3359.6609,
"train_tokens_per_second": 47547.953
},
{
"epoch": 14.42508347245409,
"grad_norm": 2.760267496109009,
"learning_rate": 1.3937421744574292e-05,
"loss": 1.2714,
"num_input_tokens_seen": 160037624,
"step": 276500,
"train_runtime": 3378.4646,
"train_tokens_per_second": 47369.928
},
{
"epoch": 14.451168614357263,
"grad_norm": 3.1717495918273926,
"learning_rate": 1.387220888981636e-05,
"loss": 1.2967,
"num_input_tokens_seen": 160328736,
"step": 277000,
"train_runtime": 3397.3414,
"train_tokens_per_second": 47192.412
},
{
"epoch": 14.477253756260435,
"grad_norm": 2.68973708152771,
"learning_rate": 1.380699603505843e-05,
"loss": 1.2688,
"num_input_tokens_seen": 160619656,
"step": 277500,
"train_runtime": 3416.1542,
"train_tokens_per_second": 47017.683
},
{
"epoch": 14.503338898163605,
"grad_norm": 2.4333648681640625,
"learning_rate": 1.3741783180300502e-05,
"loss": 1.2797,
"num_input_tokens_seen": 160908592,
"step": 278000,
"train_runtime": 3434.8918,
"train_tokens_per_second": 46845.316
},
{
"epoch": 14.529424040066779,
"grad_norm": 2.4637181758880615,
"learning_rate": 1.367657032554257e-05,
"loss": 1.2733,
"num_input_tokens_seen": 161202600,
"step": 278500,
"train_runtime": 3453.6295,
"train_tokens_per_second": 46676.287
},
{
"epoch": 14.55550918196995,
"grad_norm": 2.199878215789795,
"learning_rate": 1.3611357470784641e-05,
"loss": 1.2812,
"num_input_tokens_seen": 161493960,
"step": 279000,
"train_runtime": 3472.3475,
"train_tokens_per_second": 46508.583
},
{
"epoch": 14.581594323873121,
"grad_norm": 2.7561452388763428,
"learning_rate": 1.3546144616026713e-05,
"loss": 1.2981,
"num_input_tokens_seen": 161780984,
"step": 279500,
"train_runtime": 3491.0873,
"train_tokens_per_second": 46341.146
},
{
"epoch": 14.607679465776293,
"grad_norm": 2.5802223682403564,
"learning_rate": 1.348093176126878e-05,
"loss": 1.2772,
"num_input_tokens_seen": 162067272,
"step": 280000,
"train_runtime": 3509.8281,
"train_tokens_per_second": 46175.274
},
{
"epoch": 14.633764607679465,
"grad_norm": 2.8847203254699707,
"learning_rate": 1.3415718906510852e-05,
"loss": 1.2868,
"num_input_tokens_seen": 162356640,
"step": 280500,
"train_runtime": 3528.574,
"train_tokens_per_second": 46011.97
},
{
"epoch": 14.659849749582637,
"grad_norm": 2.8300564289093018,
"learning_rate": 1.3350506051752924e-05,
"loss": 1.3286,
"num_input_tokens_seen": 162645952,
"step": 281000,
"train_runtime": 3547.3388,
"train_tokens_per_second": 45850.132
},
{
"epoch": 14.68593489148581,
"grad_norm": 2.2055959701538086,
"learning_rate": 1.3285293196994992e-05,
"loss": 1.2874,
"num_input_tokens_seen": 162937608,
"step": 281500,
"train_runtime": 3566.1498,
"train_tokens_per_second": 45690.063
},
{
"epoch": 14.712020033388981,
"grad_norm": 2.794443368911743,
"learning_rate": 1.3220080342237062e-05,
"loss": 1.2976,
"num_input_tokens_seen": 163226160,
"step": 282000,
"train_runtime": 3584.9392,
"train_tokens_per_second": 45531.081
},
{
"epoch": 14.738105175292153,
"grad_norm": 2.3322718143463135,
"learning_rate": 1.3154867487479133e-05,
"loss": 1.3031,
"num_input_tokens_seen": 163520392,
"step": 282500,
"train_runtime": 3603.7244,
"train_tokens_per_second": 45375.388
},
{
"epoch": 14.764190317195325,
"grad_norm": 2.4972341060638428,
"learning_rate": 1.3089654632721201e-05,
"loss": 1.2688,
"num_input_tokens_seen": 163814080,
"step": 283000,
"train_runtime": 3622.5289,
"train_tokens_per_second": 45220.917
},
{
"epoch": 14.790275459098497,
"grad_norm": 2.5767734050750732,
"learning_rate": 1.3024441777963273e-05,
"loss": 1.2623,
"num_input_tokens_seen": 164098944,
"step": 283500,
"train_runtime": 3641.3406,
"train_tokens_per_second": 45065.531
},
{
"epoch": 14.81636060100167,
"grad_norm": 2.557332992553711,
"learning_rate": 1.2959228923205344e-05,
"loss": 1.2782,
"num_input_tokens_seen": 164388472,
"step": 284000,
"train_runtime": 3660.0837,
"train_tokens_per_second": 44913.856
},
{
"epoch": 14.842445742904841,
"grad_norm": 2.9156086444854736,
"learning_rate": 1.2894016068447412e-05,
"loss": 1.2929,
"num_input_tokens_seen": 164678824,
"step": 284500,
"train_runtime": 3678.8815,
"train_tokens_per_second": 44763.286
},
{
"epoch": 14.868530884808013,
"grad_norm": 2.550926685333252,
"learning_rate": 1.2828803213689484e-05,
"loss": 1.2843,
"num_input_tokens_seen": 164964520,
"step": 285000,
"train_runtime": 3697.6895,
"train_tokens_per_second": 44612.864
},
{
"epoch": 14.894616026711185,
"grad_norm": 3.0715761184692383,
"learning_rate": 1.2763590358931555e-05,
"loss": 1.2791,
"num_input_tokens_seen": 165252424,
"step": 285500,
"train_runtime": 3716.4903,
"train_tokens_per_second": 44464.646
},
{
"epoch": 14.920701168614357,
"grad_norm": 3.2298481464385986,
"learning_rate": 1.2698377504173623e-05,
"loss": 1.286,
"num_input_tokens_seen": 165546752,
"step": 286000,
"train_runtime": 3735.2292,
"train_tokens_per_second": 44320.373
},
{
"epoch": 14.94678631051753,
"grad_norm": 2.6789731979370117,
"learning_rate": 1.2633164649415693e-05,
"loss": 1.2922,
"num_input_tokens_seen": 165831800,
"step": 286500,
"train_runtime": 3754.0295,
"train_tokens_per_second": 44174.346
},
{
"epoch": 14.972871452420701,
"grad_norm": 2.6322739124298096,
"learning_rate": 1.2567951794657764e-05,
"loss": 1.2873,
"num_input_tokens_seen": 166125192,
"step": 287000,
"train_runtime": 3772.8414,
"train_tokens_per_second": 44031.852
},
{
"epoch": 14.998956594323873,
"grad_norm": 2.762434244155884,
"learning_rate": 1.2502738939899832e-05,
"loss": 1.2715,
"num_input_tokens_seen": 166410264,
"step": 287500,
"train_runtime": 3791.6838,
"train_tokens_per_second": 43888.222
},
{
"epoch": 15.0,
"eval_loss": 1.2970120906829834,
"eval_runtime": 45.5176,
"eval_samples_per_second": 842.201,
"eval_steps_per_second": 105.278,
"num_input_tokens_seen": 166422516,
"step": 287520
},
{
"epoch": 15.025041736227045,
"grad_norm": 2.177825927734375,
"learning_rate": 1.2437526085141904e-05,
"loss": 1.2801,
"num_input_tokens_seen": 166697628,
"step": 288000,
"train_runtime": 3857.3769,
"train_tokens_per_second": 43215.282
},
{
"epoch": 15.051126878130217,
"grad_norm": 3.206347703933716,
"learning_rate": 1.2372313230383974e-05,
"loss": 1.2709,
"num_input_tokens_seen": 166992924,
"step": 288500,
"train_runtime": 3876.1711,
"train_tokens_per_second": 43081.927
},
{
"epoch": 15.07721202003339,
"grad_norm": 2.4079601764678955,
"learning_rate": 1.2307100375626043e-05,
"loss": 1.2744,
"num_input_tokens_seen": 167286132,
"step": 289000,
"train_runtime": 3895.0066,
"train_tokens_per_second": 42948.869
},
{
"epoch": 15.103297161936561,
"grad_norm": 1.9692761898040771,
"learning_rate": 1.2241887520868115e-05,
"loss": 1.2559,
"num_input_tokens_seen": 167572372,
"step": 289500,
"train_runtime": 3913.7506,
"train_tokens_per_second": 42816.313
},
{
"epoch": 15.129382303839733,
"grad_norm": 2.694408416748047,
"learning_rate": 1.2176674666110185e-05,
"loss": 1.2661,
"num_input_tokens_seen": 167863284,
"step": 290000,
"train_runtime": 3932.5501,
"train_tokens_per_second": 42685.606
},
{
"epoch": 15.155467445742905,
"grad_norm": 2.9768283367156982,
"learning_rate": 1.2111461811352254e-05,
"loss": 1.2868,
"num_input_tokens_seen": 168153292,
"step": 290500,
"train_runtime": 3951.2884,
"train_tokens_per_second": 42556.573
},
{
"epoch": 15.181552587646078,
"grad_norm": 3.165743112564087,
"learning_rate": 1.2046248956594324e-05,
"loss": 1.2598,
"num_input_tokens_seen": 168442780,
"step": 291000,
"train_runtime": 3970.108,
"train_tokens_per_second": 42427.758
},
{
"epoch": 15.20763772954925,
"grad_norm": 2.1122047901153564,
"learning_rate": 1.1981036101836394e-05,
"loss": 1.2777,
"num_input_tokens_seen": 168730764,
"step": 291500,
"train_runtime": 3989.0323,
"train_tokens_per_second": 42298.671
},
{
"epoch": 15.233722871452422,
"grad_norm": 2.8908307552337646,
"learning_rate": 1.1915823247078464e-05,
"loss": 1.2524,
"num_input_tokens_seen": 169023804,
"step": 292000,
"train_runtime": 4008.0188,
"train_tokens_per_second": 42171.41
},
{
"epoch": 15.259808013355592,
"grad_norm": 5.693580627441406,
"learning_rate": 1.1850610392320535e-05,
"loss": 1.2636,
"num_input_tokens_seen": 169313124,
"step": 292500,
"train_runtime": 4028.3264,
"train_tokens_per_second": 42030.637
},
{
"epoch": 15.285893155258764,
"grad_norm": 2.3008134365081787,
"learning_rate": 1.1785397537562605e-05,
"loss": 1.2828,
"num_input_tokens_seen": 169601124,
"step": 293000,
"train_runtime": 4048.6666,
"train_tokens_per_second": 41890.613
},
{
"epoch": 15.311978297161936,
"grad_norm": 2.8285107612609863,
"learning_rate": 1.1720184682804675e-05,
"loss": 1.2528,
"num_input_tokens_seen": 169887028,
"step": 293500,
"train_runtime": 4068.1864,
"train_tokens_per_second": 41759.893
},
{
"epoch": 15.338063439065108,
"grad_norm": 2.4193263053894043,
"learning_rate": 1.1654971828046746e-05,
"loss": 1.272,
"num_input_tokens_seen": 170171812,
"step": 294000,
"train_runtime": 4087.6299,
"train_tokens_per_second": 41630.925
},
{
"epoch": 15.36414858096828,
"grad_norm": 2.8411006927490234,
"learning_rate": 1.1589758973288816e-05,
"loss": 1.2846,
"num_input_tokens_seen": 170459652,
"step": 294500,
"train_runtime": 4106.8845,
"train_tokens_per_second": 41505.83
},
{
"epoch": 15.390233722871452,
"grad_norm": 3.2765908241271973,
"learning_rate": 1.1524546118530886e-05,
"loss": 1.283,
"num_input_tokens_seen": 170746052,
"step": 295000,
"train_runtime": 4125.6554,
"train_tokens_per_second": 41386.407
},
{
"epoch": 15.416318864774624,
"grad_norm": 4.315444469451904,
"learning_rate": 1.1459333263772955e-05,
"loss": 1.2499,
"num_input_tokens_seen": 171039820,
"step": 295500,
"train_runtime": 4144.6159,
"train_tokens_per_second": 41267.954
},
{
"epoch": 15.442404006677796,
"grad_norm": 2.635226249694824,
"learning_rate": 1.1394120409015025e-05,
"loss": 1.271,
"num_input_tokens_seen": 171325612,
"step": 296000,
"train_runtime": 4164.0018,
"train_tokens_per_second": 41144.461
},
{
"epoch": 15.468489148580968,
"grad_norm": 2.699335813522339,
"learning_rate": 1.1328907554257095e-05,
"loss": 1.276,
"num_input_tokens_seen": 171612740,
"step": 296500,
"train_runtime": 4184.2714,
"train_tokens_per_second": 41013.768
},
{
"epoch": 15.49457429048414,
"grad_norm": 2.0063083171844482,
"learning_rate": 1.1263694699499165e-05,
"loss": 1.2596,
"num_input_tokens_seen": 171906348,
"step": 297000,
"train_runtime": 4203.2579,
"train_tokens_per_second": 40898.358
},
{
"epoch": 15.520659432387312,
"grad_norm": 2.836402654647827,
"learning_rate": 1.1198481844741236e-05,
"loss": 1.2578,
"num_input_tokens_seen": 172189356,
"step": 297500,
"train_runtime": 4222.1833,
"train_tokens_per_second": 40782.066
},
{
"epoch": 15.546744574290484,
"grad_norm": 3.0927999019622803,
"learning_rate": 1.1133268989983306e-05,
"loss": 1.2973,
"num_input_tokens_seen": 172482468,
"step": 298000,
"train_runtime": 4241.2002,
"train_tokens_per_second": 40668.316
},
{
"epoch": 15.572829716193656,
"grad_norm": 3.955559492111206,
"learning_rate": 1.1068056135225376e-05,
"loss": 1.272,
"num_input_tokens_seen": 172775212,
"step": 298500,
"train_runtime": 4260.8077,
"train_tokens_per_second": 40549.873
},
{
"epoch": 15.598914858096828,
"grad_norm": 2.954066753387451,
"learning_rate": 1.1002843280467447e-05,
"loss": 1.2696,
"num_input_tokens_seen": 173066468,
"step": 299000,
"train_runtime": 4279.6208,
"train_tokens_per_second": 40439.674
},
{
"epoch": 15.625,
"grad_norm": 2.927549362182617,
"learning_rate": 1.0937630425709517e-05,
"loss": 1.2947,
"num_input_tokens_seen": 173362372,
"step": 299500,
"train_runtime": 4298.4621,
"train_tokens_per_second": 40331.255
},
{
"epoch": 15.651085141903172,
"grad_norm": 3.2571945190429688,
"learning_rate": 1.0872417570951587e-05,
"loss": 1.2612,
"num_input_tokens_seen": 173657292,
"step": 300000,
"train_runtime": 4317.6857,
"train_tokens_per_second": 40219.994
},
{
"epoch": 15.677170283806344,
"grad_norm": 4.016629695892334,
"learning_rate": 1.0807204716193657e-05,
"loss": 1.2903,
"num_input_tokens_seen": 173953028,
"step": 300500,
"train_runtime": 4337.5188,
"train_tokens_per_second": 40104.27
},
{
"epoch": 15.703255425709516,
"grad_norm": 3.677175998687744,
"learning_rate": 1.0741991861435726e-05,
"loss": 1.2654,
"num_input_tokens_seen": 174243612,
"step": 301000,
"train_runtime": 4357.6686,
"train_tokens_per_second": 39985.512
},
{
"epoch": 15.729340567612688,
"grad_norm": 2.5401861667633057,
"learning_rate": 1.0676779006677796e-05,
"loss": 1.2785,
"num_input_tokens_seen": 174528492,
"step": 301500,
"train_runtime": 4377.8182,
"train_tokens_per_second": 39866.546
},
{
"epoch": 15.75542570951586,
"grad_norm": 3.0386669635772705,
"learning_rate": 1.0611566151919868e-05,
"loss": 1.2672,
"num_input_tokens_seen": 174824740,
"step": 302000,
"train_runtime": 4397.7063,
"train_tokens_per_second": 39753.619
},
{
"epoch": 15.781510851419032,
"grad_norm": 2.869920253753662,
"learning_rate": 1.0546353297161937e-05,
"loss": 1.2971,
"num_input_tokens_seen": 175115884,
"step": 302500,
"train_runtime": 4417.5927,
"train_tokens_per_second": 39640.568
},
{
"epoch": 15.807595993322204,
"grad_norm": 2.551456928253174,
"learning_rate": 1.0481140442404007e-05,
"loss": 1.2603,
"num_input_tokens_seen": 175404964,
"step": 303000,
"train_runtime": 4437.075,
"train_tokens_per_second": 39531.665
},
{
"epoch": 15.833681135225376,
"grad_norm": 2.8451788425445557,
"learning_rate": 1.0415927587646079e-05,
"loss": 1.3059,
"num_input_tokens_seen": 175694332,
"step": 303500,
"train_runtime": 4456.4315,
"train_tokens_per_second": 39424.893
},
{
"epoch": 15.859766277128548,
"grad_norm": 3.364713668823242,
"learning_rate": 1.0350714732888148e-05,
"loss": 1.2669,
"num_input_tokens_seen": 175983324,
"step": 304000,
"train_runtime": 4475.8992,
"train_tokens_per_second": 39317.982
},
{
"epoch": 15.88585141903172,
"grad_norm": 3.5180881023406982,
"learning_rate": 1.0285501878130218e-05,
"loss": 1.2704,
"num_input_tokens_seen": 176271988,
"step": 304500,
"train_runtime": 4494.9616,
"train_tokens_per_second": 39215.46
},
{
"epoch": 15.911936560934892,
"grad_norm": 3.1893362998962402,
"learning_rate": 1.0220289023372288e-05,
"loss": 1.2689,
"num_input_tokens_seen": 176565276,
"step": 305000,
"train_runtime": 4513.98,
"train_tokens_per_second": 39115.21
},
{
"epoch": 15.938021702838064,
"grad_norm": 3.272306442260742,
"learning_rate": 1.0155076168614358e-05,
"loss": 1.27,
"num_input_tokens_seen": 176847788,
"step": 305500,
"train_runtime": 4533.1414,
"train_tokens_per_second": 39012.193
},
{
"epoch": 15.964106844741236,
"grad_norm": 2.6090383529663086,
"learning_rate": 1.0089863313856427e-05,
"loss": 1.2684,
"num_input_tokens_seen": 177132460,
"step": 306000,
"train_runtime": 4551.9653,
"train_tokens_per_second": 38913.403
},
{
"epoch": 15.990191986644408,
"grad_norm": 2.874281644821167,
"learning_rate": 1.0024650459098497e-05,
"loss": 1.2839,
"num_input_tokens_seen": 177417428,
"step": 306500,
"train_runtime": 4571.3016,
"train_tokens_per_second": 38811.141
},
{
"epoch": 16.0,
"eval_loss": 1.2972913980484009,
"eval_runtime": 46.7515,
"eval_samples_per_second": 819.974,
"eval_steps_per_second": 102.499,
"num_input_tokens_seen": 177522072,
"step": 306688
},
{
"epoch": 16.01627712854758,
"grad_norm": 2.4503226280212402,
"learning_rate": 9.959437604340569e-06,
"loss": 1.2666,
"num_input_tokens_seen": 177704312,
"step": 307000,
"train_runtime": 4639.2676,
"train_tokens_per_second": 38304.389
},
{
"epoch": 16.042362270450752,
"grad_norm": 2.57148814201355,
"learning_rate": 9.894224749582638e-06,
"loss": 1.2827,
"num_input_tokens_seen": 177987984,
"step": 307500,
"train_runtime": 4658.9243,
"train_tokens_per_second": 38203.665
},
{
"epoch": 16.068447412353922,
"grad_norm": 2.241555690765381,
"learning_rate": 9.829011894824708e-06,
"loss": 1.2417,
"num_input_tokens_seen": 178276096,
"step": 308000,
"train_runtime": 4678.6926,
"train_tokens_per_second": 38103.828
},
{
"epoch": 16.094532554257096,
"grad_norm": 3.140139579772949,
"learning_rate": 9.76379904006678e-06,
"loss": 1.2696,
"num_input_tokens_seen": 178568312,
"step": 308500,
"train_runtime": 4698.0151,
"train_tokens_per_second": 38009.31
},
{
"epoch": 16.120617696160267,
"grad_norm": 2.9327456951141357,
"learning_rate": 9.69858618530885e-06,
"loss": 1.2835,
"num_input_tokens_seen": 178856160,
"step": 309000,
"train_runtime": 4717.2135,
"train_tokens_per_second": 37915.638
},
{
"epoch": 16.14670283806344,
"grad_norm": 3.2067556381225586,
"learning_rate": 9.633373330550919e-06,
"loss": 1.2688,
"num_input_tokens_seen": 179143944,
"step": 309500,
"train_runtime": 4736.2008,
"train_tokens_per_second": 37824.398
},
{
"epoch": 16.17278797996661,
"grad_norm": 2.4767651557922363,
"learning_rate": 9.568160475792989e-06,
"loss": 1.2721,
"num_input_tokens_seen": 179434664,
"step": 310000,
"train_runtime": 4755.3615,
"train_tokens_per_second": 37733.128
},
{
"epoch": 16.198873121869784,
"grad_norm": 2.9996862411499023,
"learning_rate": 9.502947621035059e-06,
"loss": 1.2569,
"num_input_tokens_seen": 179724792,
"step": 310500,
"train_runtime": 4774.7367,
"train_tokens_per_second": 37640.776
},
{
"epoch": 16.224958263772955,
"grad_norm": 2.587339162826538,
"learning_rate": 9.437734766277128e-06,
"loss": 1.2562,
"num_input_tokens_seen": 180020736,
"step": 311000,
"train_runtime": 4794.2064,
"train_tokens_per_second": 37549.642
},
{
"epoch": 16.25104340567613,
"grad_norm": 2.425332546234131,
"learning_rate": 9.3725219115192e-06,
"loss": 1.2859,
"num_input_tokens_seen": 180308088,
"step": 311500,
"train_runtime": 4813.4723,
"train_tokens_per_second": 37459.048
},
{
"epoch": 16.2771285475793,
"grad_norm": 3.213170289993286,
"learning_rate": 9.30730905676127e-06,
"loss": 1.2648,
"num_input_tokens_seen": 180593256,
"step": 312000,
"train_runtime": 4832.7472,
"train_tokens_per_second": 37368.653
},
{
"epoch": 16.303213689482472,
"grad_norm": 2.971393346786499,
"learning_rate": 9.24209620200334e-06,
"loss": 1.2565,
"num_input_tokens_seen": 180883912,
"step": 312500,
"train_runtime": 4853.0289,
"train_tokens_per_second": 37272.375
},
{
"epoch": 16.329298831385643,
"grad_norm": 3.2865586280822754,
"learning_rate": 9.17688334724541e-06,
"loss": 1.2695,
"num_input_tokens_seen": 181172920,
"step": 313000,
"train_runtime": 4872.3486,
"train_tokens_per_second": 37183.899
},
{
"epoch": 16.355383973288816,
"grad_norm": 2.691861867904663,
"learning_rate": 9.11167049248748e-06,
"loss": 1.2742,
"num_input_tokens_seen": 181457952,
"step": 313500,
"train_runtime": 4891.6907,
"train_tokens_per_second": 37095.14
},
{
"epoch": 16.381469115191987,
"grad_norm": 3.302048444747925,
"learning_rate": 9.04645763772955e-06,
"loss": 1.261,
"num_input_tokens_seen": 181746184,
"step": 314000,
"train_runtime": 4911.0159,
"train_tokens_per_second": 37007.859
},
{
"epoch": 16.407554257095157,
"grad_norm": 3.427002191543579,
"learning_rate": 8.981244782971618e-06,
"loss": 1.2763,
"num_input_tokens_seen": 182036728,
"step": 314500,
"train_runtime": 4930.339,
"train_tokens_per_second": 36921.747
},
{
"epoch": 16.43363939899833,
"grad_norm": 2.194302558898926,
"learning_rate": 8.91603192821369e-06,
"loss": 1.2347,
"num_input_tokens_seen": 182327360,
"step": 315000,
"train_runtime": 4949.6263,
"train_tokens_per_second": 36836.591
},
{
"epoch": 16.4597245409015,
"grad_norm": 2.6108365058898926,
"learning_rate": 8.85081907345576e-06,
"loss": 1.3033,
"num_input_tokens_seen": 182614776,
"step": 315500,
"train_runtime": 4968.861,
"train_tokens_per_second": 36751.839
},
{
"epoch": 16.485809682804675,
"grad_norm": 3.398846387863159,
"learning_rate": 8.78560621869783e-06,
"loss": 1.231,
"num_input_tokens_seen": 182898920,
"step": 316000,
"train_runtime": 4988.2986,
"train_tokens_per_second": 36665.592
},
{
"epoch": 16.511894824707845,
"grad_norm": 3.175825357437134,
"learning_rate": 8.720393363939901e-06,
"loss": 1.2653,
"num_input_tokens_seen": 183194016,
"step": 316500,
"train_runtime": 5007.4717,
"train_tokens_per_second": 36584.134
},
{
"epoch": 16.53797996661102,
"grad_norm": 3.3755290508270264,
"learning_rate": 8.65518050918197e-06,
"loss": 1.2382,
"num_input_tokens_seen": 183486192,
"step": 317000,
"train_runtime": 5026.7596,
"train_tokens_per_second": 36501.883
},
{
"epoch": 16.56406510851419,
"grad_norm": 3.120741128921509,
"learning_rate": 8.58996765442404e-06,
"loss": 1.2661,
"num_input_tokens_seen": 183774000,
"step": 317500,
"train_runtime": 5045.8839,
"train_tokens_per_second": 36420.577
},
{
"epoch": 16.590150250417363,
"grad_norm": 4.2182440757751465,
"learning_rate": 8.524754799666112e-06,
"loss": 1.254,
"num_input_tokens_seen": 184064816,
"step": 318000,
"train_runtime": 5065.2521,
"train_tokens_per_second": 36338.727
},
{
"epoch": 16.616235392320533,
"grad_norm": 3.3010435104370117,
"learning_rate": 8.459541944908182e-06,
"loss": 1.2621,
"num_input_tokens_seen": 184350480,
"step": 318500,
"train_runtime": 5084.6874,
"train_tokens_per_second": 36256.011
},
{
"epoch": 16.642320534223707,
"grad_norm": 3.2120778560638428,
"learning_rate": 8.39432909015025e-06,
"loss": 1.2563,
"num_input_tokens_seen": 184642440,
"step": 319000,
"train_runtime": 5103.9372,
"train_tokens_per_second": 36176.472
},
{
"epoch": 16.668405676126877,
"grad_norm": 2.9939897060394287,
"learning_rate": 8.329116235392321e-06,
"loss": 1.2594,
"num_input_tokens_seen": 184928112,
"step": 319500,
"train_runtime": 5123.0191,
"train_tokens_per_second": 36097.486
},
{
"epoch": 16.69449081803005,
"grad_norm": 3.710550308227539,
"learning_rate": 8.263903380634391e-06,
"loss": 1.2634,
"num_input_tokens_seen": 185211440,
"step": 320000,
"train_runtime": 5142.5889,
"train_tokens_per_second": 36015.214
},
{
"epoch": 16.72057595993322,
"grad_norm": 2.5137531757354736,
"learning_rate": 8.19869052587646e-06,
"loss": 1.2601,
"num_input_tokens_seen": 185506864,
"step": 320500,
"train_runtime": 5162.7072,
"train_tokens_per_second": 35932.091
},
{
"epoch": 16.746661101836395,
"grad_norm": 4.654266834259033,
"learning_rate": 8.13347767111853e-06,
"loss": 1.282,
"num_input_tokens_seen": 185792944,
"step": 321000,
"train_runtime": 5181.946,
"train_tokens_per_second": 35853.894
},
{
"epoch": 16.772746243739565,
"grad_norm": 2.9473636150360107,
"learning_rate": 8.068264816360602e-06,
"loss": 1.2839,
"num_input_tokens_seen": 186086024,
"step": 321500,
"train_runtime": 5201.2933,
"train_tokens_per_second": 35776.876
},
{
"epoch": 16.79883138564274,
"grad_norm": 2.2345118522644043,
"learning_rate": 8.003051961602672e-06,
"loss": 1.249,
"num_input_tokens_seen": 186378104,
"step": 322000,
"train_runtime": 5221.502,
"train_tokens_per_second": 35694.347
},
{
"epoch": 16.82491652754591,
"grad_norm": 2.1228227615356445,
"learning_rate": 7.937839106844742e-06,
"loss": 1.2856,
"num_input_tokens_seen": 186672776,
"step": 322500,
"train_runtime": 5242.0802,
"train_tokens_per_second": 35610.438
},
{
"epoch": 16.851001669449083,
"grad_norm": 3.548326253890991,
"learning_rate": 7.872626252086811e-06,
"loss": 1.2777,
"num_input_tokens_seen": 186964952,
"step": 323000,
"train_runtime": 5262.7553,
"train_tokens_per_second": 35526.058
},
{
"epoch": 16.877086811352253,
"grad_norm": 3.222048044204712,
"learning_rate": 7.807413397328881e-06,
"loss": 1.288,
"num_input_tokens_seen": 187250864,
"step": 323500,
"train_runtime": 5283.1207,
"train_tokens_per_second": 35443.23
},
{
"epoch": 16.903171953255427,
"grad_norm": 3.267969846725464,
"learning_rate": 7.74220054257095e-06,
"loss": 1.2746,
"num_input_tokens_seen": 187543856,
"step": 324000,
"train_runtime": 5303.6214,
"train_tokens_per_second": 35361.471
},
{
"epoch": 16.929257095158597,
"grad_norm": 2.1591436862945557,
"learning_rate": 7.676987687813022e-06,
"loss": 1.2524,
"num_input_tokens_seen": 187833368,
"step": 324500,
"train_runtime": 5324.0933,
"train_tokens_per_second": 35279.879
},
{
"epoch": 16.95534223706177,
"grad_norm": 5.07979154586792,
"learning_rate": 7.611774833055092e-06,
"loss": 1.2888,
"num_input_tokens_seen": 188120672,
"step": 325000,
"train_runtime": 5344.7171,
"train_tokens_per_second": 35197.499
},
{
"epoch": 16.98142737896494,
"grad_norm": 3.134291410446167,
"learning_rate": 7.546561978297162e-06,
"loss": 1.2575,
"num_input_tokens_seen": 188407336,
"step": 325500,
"train_runtime": 5365.0524,
"train_tokens_per_second": 35117.52
},
{
"epoch": 17.0,
"eval_loss": 1.2976926565170288,
"eval_runtime": 49.7121,
"eval_samples_per_second": 771.141,
"eval_steps_per_second": 96.395,
"num_input_tokens_seen": 188612114,
"step": 325856
},
{
"epoch": 17.007512520868115,
"grad_norm": 2.3629326820373535,
"learning_rate": 7.481349123539233e-06,
"loss": 1.2388,
"num_input_tokens_seen": 188700266,
"step": 326000,
"train_runtime": 5436.6369,
"train_tokens_per_second": 34709.007
},
{
"epoch": 17.033597662771285,
"grad_norm": 2.8408102989196777,
"learning_rate": 7.416136268781303e-06,
"loss": 1.2502,
"num_input_tokens_seen": 188990786,
"step": 326500,
"train_runtime": 5458.7072,
"train_tokens_per_second": 34621.894
},
{
"epoch": 17.05968280467446,
"grad_norm": 3.5564496517181396,
"learning_rate": 7.350923414023372e-06,
"loss": 1.2586,
"num_input_tokens_seen": 189289114,
"step": 327000,
"train_runtime": 5481.0865,
"train_tokens_per_second": 34534.962
},
{
"epoch": 17.08576794657763,
"grad_norm": 2.573309898376465,
"learning_rate": 7.2857105592654434e-06,
"loss": 1.255,
"num_input_tokens_seen": 189582338,
"step": 327500,
"train_runtime": 5502.6097,
"train_tokens_per_second": 34453.168
},
{
"epoch": 17.1118530884808,
"grad_norm": 2.900810718536377,
"learning_rate": 7.220497704507513e-06,
"loss": 1.2625,
"num_input_tokens_seen": 189873506,
"step": 328000,
"train_runtime": 5523.9124,
"train_tokens_per_second": 34373.012
},
{
"epoch": 17.137938230383973,
"grad_norm": 2.80328106880188,
"learning_rate": 7.155284849749583e-06,
"loss": 1.2621,
"num_input_tokens_seen": 190163986,
"step": 328500,
"train_runtime": 5545.2522,
"train_tokens_per_second": 34293.117
},
{
"epoch": 17.164023372287144,
"grad_norm": 2.8359973430633545,
"learning_rate": 7.090071994991653e-06,
"loss": 1.2276,
"num_input_tokens_seen": 190454602,
"step": 329000,
"train_runtime": 5566.6958,
"train_tokens_per_second": 34213.223
},
{
"epoch": 17.190108514190317,
"grad_norm": 2.6880123615264893,
"learning_rate": 7.024859140233723e-06,
"loss": 1.2414,
"num_input_tokens_seen": 190749178,
"step": 329500,
"train_runtime": 5587.661,
"train_tokens_per_second": 34137.572
},
{
"epoch": 17.216193656093488,
"grad_norm": 2.2190914154052734,
"learning_rate": 6.959646285475793e-06,
"loss": 1.2697,
"num_input_tokens_seen": 191041514,
"step": 330000,
"train_runtime": 5608.8953,
"train_tokens_per_second": 34060.453
},
{
"epoch": 17.24227879799666,
"grad_norm": 2.855161428451538,
"learning_rate": 6.894433430717863e-06,
"loss": 1.2656,
"num_input_tokens_seen": 191333666,
"step": 330500,
"train_runtime": 5629.9424,
"train_tokens_per_second": 33985.013
},
{
"epoch": 17.26836393989983,
"grad_norm": 2.8625779151916504,
"learning_rate": 6.829220575959934e-06,
"loss": 1.2595,
"num_input_tokens_seen": 191622570,
"step": 331000,
"train_runtime": 5650.9098,
"train_tokens_per_second": 33910.039
},
{
"epoch": 17.294449081803005,
"grad_norm": 2.630918502807617,
"learning_rate": 6.764007721202003e-06,
"loss": 1.2521,
"num_input_tokens_seen": 191911738,
"step": 331500,
"train_runtime": 5671.7949,
"train_tokens_per_second": 33836.156
},
{
"epoch": 17.320534223706176,
"grad_norm": 2.7609314918518066,
"learning_rate": 6.698794866444073e-06,
"loss": 1.2586,
"num_input_tokens_seen": 192200466,
"step": 332000,
"train_runtime": 5692.6673,
"train_tokens_per_second": 33762.814
},
{
"epoch": 17.34661936560935,
"grad_norm": 2.250659465789795,
"learning_rate": 6.6335820116861445e-06,
"loss": 1.2388,
"num_input_tokens_seen": 192489178,
"step": 332500,
"train_runtime": 5713.6569,
"train_tokens_per_second": 33689.313
},
{
"epoch": 17.37270450751252,
"grad_norm": 3.1896932125091553,
"learning_rate": 6.568369156928214e-06,
"loss": 1.2559,
"num_input_tokens_seen": 192778922,
"step": 333000,
"train_runtime": 5734.7257,
"train_tokens_per_second": 33616.067
},
{
"epoch": 17.398789649415694,
"grad_norm": 3.3856568336486816,
"learning_rate": 6.503156302170284e-06,
"loss": 1.267,
"num_input_tokens_seen": 193066674,
"step": 333500,
"train_runtime": 5755.6678,
"train_tokens_per_second": 33543.748
},
{
"epoch": 17.424874791318864,
"grad_norm": 2.031611919403076,
"learning_rate": 6.437943447412355e-06,
"loss": 1.2624,
"num_input_tokens_seen": 193346250,
"step": 334000,
"train_runtime": 5776.7111,
"train_tokens_per_second": 33469.953
},
{
"epoch": 17.450959933222038,
"grad_norm": 6.999661922454834,
"learning_rate": 6.3727305926544244e-06,
"loss": 1.25,
"num_input_tokens_seen": 193636658,
"step": 334500,
"train_runtime": 5797.5543,
"train_tokens_per_second": 33399.715
},
{
"epoch": 17.477045075125208,
"grad_norm": 3.335151433944702,
"learning_rate": 6.307517737896494e-06,
"loss": 1.2646,
"num_input_tokens_seen": 193927418,
"step": 335000,
"train_runtime": 5818.5663,
"train_tokens_per_second": 33329.072
},
{
"epoch": 17.50313021702838,
"grad_norm": 3.0118470191955566,
"learning_rate": 6.242304883138565e-06,
"loss": 1.2595,
"num_input_tokens_seen": 194220626,
"step": 335500,
"train_runtime": 5839.6078,
"train_tokens_per_second": 33259.19
},
{
"epoch": 17.529215358931552,
"grad_norm": 2.819512128829956,
"learning_rate": 6.177092028380635e-06,
"loss": 1.2604,
"num_input_tokens_seen": 194507882,
"step": 336000,
"train_runtime": 5860.9533,
"train_tokens_per_second": 33187.072
},
{
"epoch": 17.555300500834726,
"grad_norm": 2.87508225440979,
"learning_rate": 6.111879173622704e-06,
"loss": 1.2855,
"num_input_tokens_seen": 194796762,
"step": 336500,
"train_runtime": 5882.4096,
"train_tokens_per_second": 33115.13
},
{
"epoch": 17.581385642737896,
"grad_norm": 2.2459728717803955,
"learning_rate": 6.046666318864775e-06,
"loss": 1.2522,
"num_input_tokens_seen": 195084282,
"step": 337000,
"train_runtime": 5904.0114,
"train_tokens_per_second": 33042.667
},
{
"epoch": 17.60747078464107,
"grad_norm": 2.935845375061035,
"learning_rate": 5.981453464106846e-06,
"loss": 1.2545,
"num_input_tokens_seen": 195375162,
"step": 337500,
"train_runtime": 5925.5481,
"train_tokens_per_second": 32971.661
},
{
"epoch": 17.63355592654424,
"grad_norm": 3.0520784854888916,
"learning_rate": 5.916240609348915e-06,
"loss": 1.2587,
"num_input_tokens_seen": 195666498,
"step": 338000,
"train_runtime": 5946.9563,
"train_tokens_per_second": 32901.957
},
{
"epoch": 17.659641068447414,
"grad_norm": 1.9762933254241943,
"learning_rate": 5.851027754590985e-06,
"loss": 1.2714,
"num_input_tokens_seen": 195952418,
"step": 338500,
"train_runtime": 5968.684,
"train_tokens_per_second": 32830.087
},
{
"epoch": 17.685726210350584,
"grad_norm": 3.0459036827087402,
"learning_rate": 5.785814899833055e-06,
"loss": 1.2819,
"num_input_tokens_seen": 196243738,
"step": 339000,
"train_runtime": 5990.4534,
"train_tokens_per_second": 32759.413
},
{
"epoch": 17.711811352253758,
"grad_norm": 2.7781834602355957,
"learning_rate": 5.7206020450751255e-06,
"loss": 1.253,
"num_input_tokens_seen": 196532034,
"step": 339500,
"train_runtime": 6011.9799,
"train_tokens_per_second": 32690.068
},
{
"epoch": 17.737896494156928,
"grad_norm": 3.383931875228882,
"learning_rate": 5.655389190317196e-06,
"loss": 1.2521,
"num_input_tokens_seen": 196822202,
"step": 340000,
"train_runtime": 6033.5216,
"train_tokens_per_second": 32621.446
},
{
"epoch": 17.7639816360601,
"grad_norm": 2.72835373878479,
"learning_rate": 5.590176335559266e-06,
"loss": 1.2494,
"num_input_tokens_seen": 197110802,
"step": 340500,
"train_runtime": 6054.9604,
"train_tokens_per_second": 32553.607
},
{
"epoch": 17.790066777963272,
"grad_norm": 2.868680000305176,
"learning_rate": 5.524963480801336e-06,
"loss": 1.2436,
"num_input_tokens_seen": 197396914,
"step": 341000,
"train_runtime": 6076.2211,
"train_tokens_per_second": 32486.789
},
{
"epoch": 17.816151919866446,
"grad_norm": 2.985006809234619,
"learning_rate": 5.459750626043405e-06,
"loss": 1.269,
"num_input_tokens_seen": 197687178,
"step": 341500,
"train_runtime": 6097.2778,
"train_tokens_per_second": 32422.203
},
{
"epoch": 17.842237061769616,
"grad_norm": 2.457155704498291,
"learning_rate": 5.394537771285476e-06,
"loss": 1.2725,
"num_input_tokens_seen": 197978106,
"step": 342000,
"train_runtime": 6118.3065,
"train_tokens_per_second": 32358.318
},
{
"epoch": 17.86832220367279,
"grad_norm": 2.6323978900909424,
"learning_rate": 5.329324916527547e-06,
"loss": 1.2691,
"num_input_tokens_seen": 198267826,
"step": 342500,
"train_runtime": 6138.9907,
"train_tokens_per_second": 32296.486
},
{
"epoch": 17.89440734557596,
"grad_norm": 2.9683570861816406,
"learning_rate": 5.264112061769616e-06,
"loss": 1.2606,
"num_input_tokens_seen": 198555794,
"step": 343000,
"train_runtime": 6159.8347,
"train_tokens_per_second": 32233.948
},
{
"epoch": 17.92049248747913,
"grad_norm": 2.6426734924316406,
"learning_rate": 5.198899207011686e-06,
"loss": 1.2572,
"num_input_tokens_seen": 198837802,
"step": 343500,
"train_runtime": 6180.7096,
"train_tokens_per_second": 32170.708
},
{
"epoch": 17.946577629382304,
"grad_norm": 2.743959426879883,
"learning_rate": 5.133686352253757e-06,
"loss": 1.2584,
"num_input_tokens_seen": 199125674,
"step": 344000,
"train_runtime": 6201.5223,
"train_tokens_per_second": 32109.16
},
{
"epoch": 17.972662771285474,
"grad_norm": 2.5115082263946533,
"learning_rate": 5.0684734974958266e-06,
"loss": 1.2496,
"num_input_tokens_seen": 199418034,
"step": 344500,
"train_runtime": 6222.8136,
"train_tokens_per_second": 32046.281
},
{
"epoch": 17.998747913188648,
"grad_norm": 2.3742177486419678,
"learning_rate": 5.003260642737897e-06,
"loss": 1.2601,
"num_input_tokens_seen": 199702842,
"step": 345000,
"train_runtime": 6244.2476,
"train_tokens_per_second": 31981.89
},
{
"epoch": 18.0,
"eval_loss": 1.296248197555542,
"eval_runtime": 50.1469,
"eval_samples_per_second": 764.454,
"eval_steps_per_second": 95.559,
"num_input_tokens_seen": 199715854,
"step": 345024
},
{
"epoch": 18.02483305509182,
"grad_norm": 2.2281575202941895,
"learning_rate": 4.938047787979966e-06,
"loss": 1.2228,
"num_input_tokens_seen": 199990102,
"step": 345500,
"train_runtime": 6316.4242,
"train_tokens_per_second": 31661.917
},
{
"epoch": 18.050918196994992,
"grad_norm": 2.840803384780884,
"learning_rate": 4.872834933222037e-06,
"loss": 1.2581,
"num_input_tokens_seen": 200279302,
"step": 346000,
"train_runtime": 6337.2304,
"train_tokens_per_second": 31603.601
},
{
"epoch": 18.077003338898162,
"grad_norm": 2.4082562923431396,
"learning_rate": 4.807622078464107e-06,
"loss": 1.2566,
"num_input_tokens_seen": 200566038,
"step": 346500,
"train_runtime": 6358.142,
"train_tokens_per_second": 31544.756
},
{
"epoch": 18.103088480801336,
"grad_norm": 3.136262893676758,
"learning_rate": 4.742409223706177e-06,
"loss": 1.2631,
"num_input_tokens_seen": 200854406,
"step": 347000,
"train_runtime": 6379.2543,
"train_tokens_per_second": 31485.562
},
{
"epoch": 18.129173622704506,
"grad_norm": 2.251553535461426,
"learning_rate": 4.677196368948248e-06,
"loss": 1.2434,
"num_input_tokens_seen": 201141734,
"step": 347500,
"train_runtime": 6400.5038,
"train_tokens_per_second": 31425.922
},
{
"epoch": 18.15525876460768,
"grad_norm": 2.587162971496582,
"learning_rate": 4.6119835141903175e-06,
"loss": 1.2481,
"num_input_tokens_seen": 201429926,
"step": 348000,
"train_runtime": 6421.5455,
"train_tokens_per_second": 31367.827
},
{
"epoch": 18.18134390651085,
"grad_norm": 2.8229830265045166,
"learning_rate": 4.546770659432387e-06,
"loss": 1.2536,
"num_input_tokens_seen": 201720902,
"step": 348500,
"train_runtime": 6442.673,
"train_tokens_per_second": 31310.126
},
{
"epoch": 18.207429048414024,
"grad_norm": 2.943593740463257,
"learning_rate": 4.481557804674458e-06,
"loss": 1.2687,
"num_input_tokens_seen": 202015494,
"step": 349000,
"train_runtime": 6463.8115,
"train_tokens_per_second": 31253.308
},
{
"epoch": 18.233514190317194,
"grad_norm": 2.8468620777130127,
"learning_rate": 4.416344949916528e-06,
"loss": 1.2475,
"num_input_tokens_seen": 202301734,
"step": 349500,
"train_runtime": 6484.7729,
"train_tokens_per_second": 31196.426
},
{
"epoch": 18.25959933222037,
"grad_norm": 2.5584495067596436,
"learning_rate": 4.351132095158597e-06,
"loss": 1.2464,
"num_input_tokens_seen": 202582798,
"step": 350000,
"train_runtime": 6505.8233,
"train_tokens_per_second": 31138.688
},
{
"epoch": 18.28568447412354,
"grad_norm": 3.42409348487854,
"learning_rate": 4.285919240400668e-06,
"loss": 1.2696,
"num_input_tokens_seen": 202872662,
"step": 350500,
"train_runtime": 6526.8475,
"train_tokens_per_second": 31082.795
},
{
"epoch": 18.311769616026712,
"grad_norm": 2.7311031818389893,
"learning_rate": 4.220706385642738e-06,
"loss": 1.249,
"num_input_tokens_seen": 203159246,
"step": 351000,
"train_runtime": 6547.7257,
"train_tokens_per_second": 31027.452
},
{
"epoch": 18.337854757929883,
"grad_norm": 3.2200024127960205,
"learning_rate": 4.155493530884808e-06,
"loss": 1.2766,
"num_input_tokens_seen": 203449598,
"step": 351500,
"train_runtime": 6568.6802,
"train_tokens_per_second": 30972.675
},
{
"epoch": 18.363939899833056,
"grad_norm": 3.4853382110595703,
"learning_rate": 4.090280676126879e-06,
"loss": 1.2478,
"num_input_tokens_seen": 203737350,
"step": 352000,
"train_runtime": 6589.7847,
"train_tokens_per_second": 30917.148
},
{
"epoch": 18.390025041736227,
"grad_norm": 2.6248600482940674,
"learning_rate": 4.025067821368948e-06,
"loss": 1.2461,
"num_input_tokens_seen": 204033470,
"step": 352500,
"train_runtime": 6610.8585,
"train_tokens_per_second": 30863.385
},
{
"epoch": 18.4161101836394,
"grad_norm": 3.1528148651123047,
"learning_rate": 3.9598549666110185e-06,
"loss": 1.2487,
"num_input_tokens_seen": 204320822,
"step": 353000,
"train_runtime": 6632.008,
"train_tokens_per_second": 30808.289
},
{
"epoch": 18.44219532554257,
"grad_norm": 2.4708855152130127,
"learning_rate": 3.894642111853088e-06,
"loss": 1.2493,
"num_input_tokens_seen": 204615126,
"step": 353500,
"train_runtime": 6653.2853,
"train_tokens_per_second": 30753.998
},
{
"epoch": 18.468280467445744,
"grad_norm": 2.8539340496063232,
"learning_rate": 3.829429257095159e-06,
"loss": 1.2469,
"num_input_tokens_seen": 204908238,
"step": 354000,
"train_runtime": 6674.3465,
"train_tokens_per_second": 30700.869
},
{
"epoch": 18.494365609348915,
"grad_norm": 3.047869920730591,
"learning_rate": 3.764216402337229e-06,
"loss": 1.2571,
"num_input_tokens_seen": 205200078,
"step": 354500,
"train_runtime": 6695.2164,
"train_tokens_per_second": 30648.759
},
{
"epoch": 18.52045075125209,
"grad_norm": 3.70831298828125,
"learning_rate": 3.699003547579299e-06,
"loss": 1.2544,
"num_input_tokens_seen": 205493198,
"step": 355000,
"train_runtime": 6716.2607,
"train_tokens_per_second": 30596.37
},
{
"epoch": 18.54653589315526,
"grad_norm": 2.9419515132904053,
"learning_rate": 3.633790692821369e-06,
"loss": 1.2406,
"num_input_tokens_seen": 205782654,
"step": 355500,
"train_runtime": 6737.4279,
"train_tokens_per_second": 30543.207
},
{
"epoch": 18.572621035058432,
"grad_norm": 3.3979151248931885,
"learning_rate": 3.5685778380634397e-06,
"loss": 1.2387,
"num_input_tokens_seen": 206078310,
"step": 356000,
"train_runtime": 6758.4267,
"train_tokens_per_second": 30492.054
},
{
"epoch": 18.598706176961603,
"grad_norm": 2.5537753105163574,
"learning_rate": 3.503364983305509e-06,
"loss": 1.2454,
"num_input_tokens_seen": 206364678,
"step": 356500,
"train_runtime": 6779.3225,
"train_tokens_per_second": 30440.31
},
{
"epoch": 18.624791318864773,
"grad_norm": 3.0519020557403564,
"learning_rate": 3.4381521285475796e-06,
"loss": 1.2617,
"num_input_tokens_seen": 206651694,
"step": 357000,
"train_runtime": 6800.2161,
"train_tokens_per_second": 30388.989
},
{
"epoch": 18.650876460767947,
"grad_norm": 2.832632541656494,
"learning_rate": 3.3729392737896494e-06,
"loss": 1.2594,
"num_input_tokens_seen": 206935862,
"step": 357500,
"train_runtime": 6821.3364,
"train_tokens_per_second": 30336.557
},
{
"epoch": 18.676961602671117,
"grad_norm": 3.5510575771331787,
"learning_rate": 3.3077264190317196e-06,
"loss": 1.2576,
"num_input_tokens_seen": 207225006,
"step": 358000,
"train_runtime": 6842.5612,
"train_tokens_per_second": 30284.713
},
{
"epoch": 18.70304674457429,
"grad_norm": 2.7018370628356934,
"learning_rate": 3.24251356427379e-06,
"loss": 1.2524,
"num_input_tokens_seen": 207518494,
"step": 358500,
"train_runtime": 6863.6965,
"train_tokens_per_second": 30234.218
},
{
"epoch": 18.72913188647746,
"grad_norm": 2.3896238803863525,
"learning_rate": 3.1773007095158596e-06,
"loss": 1.2787,
"num_input_tokens_seen": 207806854,
"step": 359000,
"train_runtime": 6884.7293,
"train_tokens_per_second": 30183.736
},
{
"epoch": 18.755217028380635,
"grad_norm": 2.3457329273223877,
"learning_rate": 3.11208785475793e-06,
"loss": 1.2612,
"num_input_tokens_seen": 208104358,
"step": 359500,
"train_runtime": 6906.0499,
"train_tokens_per_second": 30133.631
},
{
"epoch": 18.781302170283805,
"grad_norm": 3.7799017429351807,
"learning_rate": 3.046875e-06,
"loss": 1.2278,
"num_input_tokens_seen": 208395230,
"step": 360000,
"train_runtime": 6927.2417,
"train_tokens_per_second": 30083.436
},
{
"epoch": 18.80738731218698,
"grad_norm": 2.9162731170654297,
"learning_rate": 2.98166214524207e-06,
"loss": 1.2495,
"num_input_tokens_seen": 208684190,
"step": 360500,
"train_runtime": 6948.3051,
"train_tokens_per_second": 30033.826
},
{
"epoch": 18.83347245409015,
"grad_norm": 3.2956576347351074,
"learning_rate": 2.9164492904841403e-06,
"loss": 1.2556,
"num_input_tokens_seen": 208972206,
"step": 361000,
"train_runtime": 6969.2518,
"train_tokens_per_second": 29984.884
},
{
"epoch": 18.859557595993323,
"grad_norm": 2.974874496459961,
"learning_rate": 2.8512364357262105e-06,
"loss": 1.2433,
"num_input_tokens_seen": 209260382,
"step": 361500,
"train_runtime": 6990.1944,
"train_tokens_per_second": 29936.275
},
{
"epoch": 18.885642737896493,
"grad_norm": 2.385434150695801,
"learning_rate": 2.7860235809682807e-06,
"loss": 1.2529,
"num_input_tokens_seen": 209544430,
"step": 362000,
"train_runtime": 7011.1006,
"train_tokens_per_second": 29887.523
},
{
"epoch": 18.911727879799667,
"grad_norm": 2.289966344833374,
"learning_rate": 2.7208107262103505e-06,
"loss": 1.262,
"num_input_tokens_seen": 209834774,
"step": 362500,
"train_runtime": 7032.3451,
"train_tokens_per_second": 29838.521
},
{
"epoch": 18.937813021702837,
"grad_norm": 2.8906939029693604,
"learning_rate": 2.655597871452421e-06,
"loss": 1.2716,
"num_input_tokens_seen": 210123054,
"step": 363000,
"train_runtime": 7053.702,
"train_tokens_per_second": 29789.046
},
{
"epoch": 18.96389816360601,
"grad_norm": 3.4153401851654053,
"learning_rate": 2.590385016694491e-06,
"loss": 1.2774,
"num_input_tokens_seen": 210412382,
"step": 363500,
"train_runtime": 7075.1001,
"train_tokens_per_second": 29739.845
},
{
"epoch": 18.98998330550918,
"grad_norm": 3.0862789154052734,
"learning_rate": 2.525172161936561e-06,
"loss": 1.2665,
"num_input_tokens_seen": 210705166,
"step": 364000,
"train_runtime": 7096.4466,
"train_tokens_per_second": 29691.644
},
{
"epoch": 19.0,
"eval_loss": 1.296281337738037,
"eval_runtime": 51.4225,
"eval_samples_per_second": 745.49,
"eval_steps_per_second": 93.189,
"num_input_tokens_seen": 210813428,
"step": 364192
},
{
"epoch": 19.016068447412355,
"grad_norm": 2.282921314239502,
"learning_rate": 2.459959307178631e-06,
"loss": 1.2246,
"num_input_tokens_seen": 210992604,
"step": 364500,
"train_runtime": 7170.5972,
"train_tokens_per_second": 29424.69
},
{
"epoch": 19.042153589315525,
"grad_norm": 2.1377789974212646,
"learning_rate": 2.3947464524207014e-06,
"loss": 1.2377,
"num_input_tokens_seen": 211280204,
"step": 365000,
"train_runtime": 7192.2041,
"train_tokens_per_second": 29376.28
},
{
"epoch": 19.0682387312187,
"grad_norm": 3.454662799835205,
"learning_rate": 2.3295335976627716e-06,
"loss": 1.2658,
"num_input_tokens_seen": 211569500,
"step": 365500,
"train_runtime": 7213.663,
"train_tokens_per_second": 29328.997
},
{
"epoch": 19.09432387312187,
"grad_norm": 2.45365309715271,
"learning_rate": 2.2643207429048414e-06,
"loss": 1.2296,
"num_input_tokens_seen": 211851156,
"step": 366000,
"train_runtime": 7235.0182,
"train_tokens_per_second": 29281.358
},
{
"epoch": 19.120409015025043,
"grad_norm": 2.841344118118286,
"learning_rate": 2.1991078881469116e-06,
"loss": 1.2817,
"num_input_tokens_seen": 212137660,
"step": 366500,
"train_runtime": 7256.1778,
"train_tokens_per_second": 29235.455
},
{
"epoch": 19.146494156928213,
"grad_norm": 2.386323928833008,
"learning_rate": 2.1338950333889818e-06,
"loss": 1.2336,
"num_input_tokens_seen": 212425948,
"step": 367000,
"train_runtime": 7277.2221,
"train_tokens_per_second": 29190.527
},
{
"epoch": 19.172579298831387,
"grad_norm": 3.1663670539855957,
"learning_rate": 2.068682178631052e-06,
"loss": 1.2755,
"num_input_tokens_seen": 212713028,
"step": 367500,
"train_runtime": 7298.2567,
"train_tokens_per_second": 29145.731
},
{
"epoch": 19.198664440734557,
"grad_norm": 2.1720612049102783,
"learning_rate": 2.0034693238731217e-06,
"loss": 1.2636,
"num_input_tokens_seen": 213002716,
"step": 368000,
"train_runtime": 7318.2656,
"train_tokens_per_second": 29105.628
},
{
"epoch": 19.22474958263773,
"grad_norm": 2.9212682247161865,
"learning_rate": 1.938256469115192e-06,
"loss": 1.2423,
"num_input_tokens_seen": 213288196,
"step": 368500,
"train_runtime": 7337.9518,
"train_tokens_per_second": 29066.448
},
{
"epoch": 19.2508347245409,
"grad_norm": 2.7475364208221436,
"learning_rate": 1.8730436143572623e-06,
"loss": 1.2443,
"num_input_tokens_seen": 213574692,
"step": 369000,
"train_runtime": 7356.9693,
"train_tokens_per_second": 29030.255
},
{
"epoch": 19.276919866444075,
"grad_norm": 2.422600030899048,
"learning_rate": 1.8078307595993323e-06,
"loss": 1.2201,
"num_input_tokens_seen": 213864116,
"step": 369500,
"train_runtime": 7375.605,
"train_tokens_per_second": 28996.146
},
{
"epoch": 19.303005008347245,
"grad_norm": 2.7195160388946533,
"learning_rate": 1.7426179048414023e-06,
"loss": 1.2481,
"num_input_tokens_seen": 214150676,
"step": 370000,
"train_runtime": 7396.4144,
"train_tokens_per_second": 28953.31
},
{
"epoch": 19.32909015025042,
"grad_norm": 2.50443172454834,
"learning_rate": 1.6774050500834725e-06,
"loss": 1.2302,
"num_input_tokens_seen": 214440244,
"step": 370500,
"train_runtime": 7416.7831,
"train_tokens_per_second": 28912.837
},
{
"epoch": 19.35517529215359,
"grad_norm": 2.887474775314331,
"learning_rate": 1.6121921953255427e-06,
"loss": 1.2449,
"num_input_tokens_seen": 214730132,
"step": 371000,
"train_runtime": 7437.2553,
"train_tokens_per_second": 28872.228
},
{
"epoch": 19.38126043405676,
"grad_norm": 2.5884950160980225,
"learning_rate": 1.5469793405676129e-06,
"loss": 1.2521,
"num_input_tokens_seen": 215019220,
"step": 371500,
"train_runtime": 7457.6502,
"train_tokens_per_second": 28832.033
},
{
"epoch": 19.407345575959933,
"grad_norm": 2.357685089111328,
"learning_rate": 1.4817664858096828e-06,
"loss": 1.2443,
"num_input_tokens_seen": 215310132,
"step": 372000,
"train_runtime": 7478.2575,
"train_tokens_per_second": 28791.484
},
{
"epoch": 19.433430717863104,
"grad_norm": 2.3335018157958984,
"learning_rate": 1.416553631051753e-06,
"loss": 1.2469,
"num_input_tokens_seen": 215600084,
"step": 372500,
"train_runtime": 7498.6623,
"train_tokens_per_second": 28751.806
},
{
"epoch": 19.459515859766277,
"grad_norm": 2.7641124725341797,
"learning_rate": 1.351340776293823e-06,
"loss": 1.228,
"num_input_tokens_seen": 215888340,
"step": 373000,
"train_runtime": 7519.0798,
"train_tokens_per_second": 28712.069
},
{
"epoch": 19.485601001669448,
"grad_norm": 2.7597529888153076,
"learning_rate": 1.2861279215358932e-06,
"loss": 1.2499,
"num_input_tokens_seen": 216178932,
"step": 373500,
"train_runtime": 7539.4463,
"train_tokens_per_second": 28673.051
},
{
"epoch": 19.51168614357262,
"grad_norm": 2.3733975887298584,
"learning_rate": 1.2209150667779632e-06,
"loss": 1.2484,
"num_input_tokens_seen": 216470580,
"step": 374000,
"train_runtime": 7559.9641,
"train_tokens_per_second": 28633.811
},
{
"epoch": 19.53777128547579,
"grad_norm": 2.3238165378570557,
"learning_rate": 1.1557022120200334e-06,
"loss": 1.2364,
"num_input_tokens_seen": 216763740,
"step": 374500,
"train_runtime": 7579.7748,
"train_tokens_per_second": 28597.649
},
{
"epoch": 19.563856427378965,
"grad_norm": 2.8229446411132812,
"learning_rate": 1.0904893572621036e-06,
"loss": 1.2358,
"num_input_tokens_seen": 217053292,
"step": 375000,
"train_runtime": 7598.7817,
"train_tokens_per_second": 28564.223
},
{
"epoch": 19.589941569282136,
"grad_norm": 2.4836158752441406,
"learning_rate": 1.0252765025041738e-06,
"loss": 1.2606,
"num_input_tokens_seen": 217344428,
"step": 375500,
"train_runtime": 7618.304,
"train_tokens_per_second": 28529.241
},
{
"epoch": 19.61602671118531,
"grad_norm": 2.7675931453704834,
"learning_rate": 9.600636477462437e-07,
"loss": 1.2629,
"num_input_tokens_seen": 217634524,
"step": 376000,
"train_runtime": 7637.288,
"train_tokens_per_second": 28496.31
},
{
"epoch": 19.64211185308848,
"grad_norm": 2.331380844116211,
"learning_rate": 8.948507929883139e-07,
"loss": 1.2521,
"num_input_tokens_seen": 217924508,
"step": 376500,
"train_runtime": 7656.4955,
"train_tokens_per_second": 28462.697
},
{
"epoch": 19.668196994991654,
"grad_norm": 3.3577489852905273,
"learning_rate": 8.29637938230384e-07,
"loss": 1.2571,
"num_input_tokens_seen": 218217084,
"step": 377000,
"train_runtime": 7675.4197,
"train_tokens_per_second": 28430.639
},
{
"epoch": 19.694282136894824,
"grad_norm": 2.872344970703125,
"learning_rate": 7.644250834724542e-07,
"loss": 1.271,
"num_input_tokens_seen": 218508180,
"step": 377500,
"train_runtime": 7694.4779,
"train_tokens_per_second": 28398.052
},
{
"epoch": 19.720367278797998,
"grad_norm": 2.9395909309387207,
"learning_rate": 6.992122287145243e-07,
"loss": 1.25,
"num_input_tokens_seen": 218798076,
"step": 378000,
"train_runtime": 7712.7627,
"train_tokens_per_second": 28368.314
},
{
"epoch": 19.746452420701168,
"grad_norm": 2.5424513816833496,
"learning_rate": 6.339993739565944e-07,
"loss": 1.2817,
"num_input_tokens_seen": 219089308,
"step": 378500,
"train_runtime": 7731.9549,
"train_tokens_per_second": 28335.565
},
{
"epoch": 19.77253756260434,
"grad_norm": 2.9725682735443115,
"learning_rate": 5.687865191986645e-07,
"loss": 1.2418,
"num_input_tokens_seen": 219383604,
"step": 379000,
"train_runtime": 7751.6984,
"train_tokens_per_second": 28301.36
},
{
"epoch": 19.798622704507512,
"grad_norm": 3.3688950538635254,
"learning_rate": 5.035736644407346e-07,
"loss": 1.2449,
"num_input_tokens_seen": 219679124,
"step": 379500,
"train_runtime": 7771.8118,
"train_tokens_per_second": 28266.14
},
{
"epoch": 19.824707846410686,
"grad_norm": 2.398789882659912,
"learning_rate": 4.3836080968280473e-07,
"loss": 1.2362,
"num_input_tokens_seen": 219963660,
"step": 380000,
"train_runtime": 7790.6642,
"train_tokens_per_second": 28234.263
},
{
"epoch": 19.850792988313856,
"grad_norm": 2.845128059387207,
"learning_rate": 3.731479549248748e-07,
"loss": 1.2803,
"num_input_tokens_seen": 220255900,
"step": 380500,
"train_runtime": 7809.3731,
"train_tokens_per_second": 28204.044
},
{
"epoch": 19.87687813021703,
"grad_norm": 2.6180248260498047,
"learning_rate": 3.079351001669449e-07,
"loss": 1.2634,
"num_input_tokens_seen": 220547100,
"step": 381000,
"train_runtime": 7827.8518,
"train_tokens_per_second": 28174.665
},
{
"epoch": 19.9029632721202,
"grad_norm": 2.5833303928375244,
"learning_rate": 2.4272224540901504e-07,
"loss": 1.2482,
"num_input_tokens_seen": 220835100,
"step": 381500,
"train_runtime": 7847.7813,
"train_tokens_per_second": 28139.813
},
{
"epoch": 19.929048414023374,
"grad_norm": 2.800402879714966,
"learning_rate": 1.7750939065108515e-07,
"loss": 1.2335,
"num_input_tokens_seen": 221122004,
"step": 382000,
"train_runtime": 7866.365,
"train_tokens_per_second": 28109.807
},
{
"epoch": 19.955133555926544,
"grad_norm": 2.8612380027770996,
"learning_rate": 1.1229653589315525e-07,
"loss": 1.2251,
"num_input_tokens_seen": 221409964,
"step": 382500,
"train_runtime": 7884.8107,
"train_tokens_per_second": 28080.568
},
{
"epoch": 19.981218697829718,
"grad_norm": 3.3842055797576904,
"learning_rate": 4.7083681135225376e-08,
"loss": 1.2888,
"num_input_tokens_seen": 221700476,
"step": 383000,
"train_runtime": 7903.5539,
"train_tokens_per_second": 28050.732
},
{
"epoch": 20.0,
"eval_loss": 1.2961275577545166,
"eval_runtime": 46.2863,
"eval_samples_per_second": 828.215,
"eval_steps_per_second": 103.53,
"num_input_tokens_seen": 221910640,
"step": 383360
},
{
"epoch": 20.0,
"num_input_tokens_seen": 221910640,
"step": 383360,
"total_flos": 8.056851732185088e+16,
"train_loss": 0.641815161904031,
"train_runtime": 7964.4512,
"train_samples_per_second": 385.056,
"train_steps_per_second": 48.134,
"train_tokens_per_second": 27853.103
}
],
"logging_steps": 500,
"max_steps": 383360,
"num_input_tokens_seen": 221910640,
"num_train_epochs": 20,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 8.056851732185088e+16,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}