mini-Llama-70M-SFT / checkpoint-5000 /trainer_state.json
rootxhacker's picture
Upload folder using huggingface_hub
d7bfc08 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.545595054095827,
"eval_steps": 500,
"global_step": 5000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0030911901081916537,
"grad_norm": 3.058769941329956,
"learning_rate": 1.9972179289026277e-05,
"loss": 5.0209,
"mean_token_accuracy": 0.21833103336393833,
"num_tokens": 28941.0,
"step": 10
},
{
"epoch": 0.0061823802163833074,
"grad_norm": 2.485980749130249,
"learning_rate": 1.994126738794436e-05,
"loss": 4.7631,
"mean_token_accuracy": 0.23807235918939113,
"num_tokens": 60990.0,
"step": 20
},
{
"epoch": 0.00927357032457496,
"grad_norm": 5.288544178009033,
"learning_rate": 1.9910355486862444e-05,
"loss": 4.6799,
"mean_token_accuracy": 0.24858475103974342,
"num_tokens": 90602.0,
"step": 30
},
{
"epoch": 0.012364760432766615,
"grad_norm": 3.226719379425049,
"learning_rate": 1.9879443585780528e-05,
"loss": 4.5883,
"mean_token_accuracy": 0.2555678006261587,
"num_tokens": 121132.0,
"step": 40
},
{
"epoch": 0.015455950540958269,
"grad_norm": 2.027597188949585,
"learning_rate": 1.984853168469861e-05,
"loss": 4.5125,
"mean_token_accuracy": 0.26637452803552153,
"num_tokens": 154244.0,
"step": 50
},
{
"epoch": 0.01854714064914992,
"grad_norm": 1.6715962886810303,
"learning_rate": 1.9817619783616695e-05,
"loss": 4.4224,
"mean_token_accuracy": 0.27198897041380404,
"num_tokens": 185618.0,
"step": 60
},
{
"epoch": 0.021638330757341576,
"grad_norm": 3.0472540855407715,
"learning_rate": 1.9786707882534775e-05,
"loss": 4.3464,
"mean_token_accuracy": 0.2801041007041931,
"num_tokens": 220138.0,
"step": 70
},
{
"epoch": 0.02472952086553323,
"grad_norm": 2.74045729637146,
"learning_rate": 1.9755795981452862e-05,
"loss": 4.4569,
"mean_token_accuracy": 0.2733523309230804,
"num_tokens": 252919.0,
"step": 80
},
{
"epoch": 0.027820710973724884,
"grad_norm": 1.9274535179138184,
"learning_rate": 1.9724884080370946e-05,
"loss": 4.2569,
"mean_token_accuracy": 0.2858310595154762,
"num_tokens": 284739.0,
"step": 90
},
{
"epoch": 0.030911901081916538,
"grad_norm": 1.906964898109436,
"learning_rate": 1.9693972179289026e-05,
"loss": 4.2144,
"mean_token_accuracy": 0.2953581381589174,
"num_tokens": 315186.0,
"step": 100
},
{
"epoch": 0.03400309119010819,
"grad_norm": 2.042823314666748,
"learning_rate": 1.966306027820711e-05,
"loss": 4.2595,
"mean_token_accuracy": 0.287172843888402,
"num_tokens": 347507.0,
"step": 110
},
{
"epoch": 0.03709428129829984,
"grad_norm": 1.9889057874679565,
"learning_rate": 1.9632148377125197e-05,
"loss": 4.1643,
"mean_token_accuracy": 0.30306958928704264,
"num_tokens": 380331.0,
"step": 120
},
{
"epoch": 0.0401854714064915,
"grad_norm": 1.7952933311462402,
"learning_rate": 1.9601236476043277e-05,
"loss": 4.1334,
"mean_token_accuracy": 0.3080880597233772,
"num_tokens": 410947.0,
"step": 130
},
{
"epoch": 0.04327666151468315,
"grad_norm": 3.8508658409118652,
"learning_rate": 1.957032457496136e-05,
"loss": 4.0262,
"mean_token_accuracy": 0.3118089348077774,
"num_tokens": 444130.0,
"step": 140
},
{
"epoch": 0.04636785162287481,
"grad_norm": 31.280546188354492,
"learning_rate": 1.9539412673879444e-05,
"loss": 3.9889,
"mean_token_accuracy": 0.32304659858345985,
"num_tokens": 474570.0,
"step": 150
},
{
"epoch": 0.04945904173106646,
"grad_norm": 2.1569416522979736,
"learning_rate": 1.950850077279753e-05,
"loss": 4.1053,
"mean_token_accuracy": 0.31258094161748884,
"num_tokens": 505047.0,
"step": 160
},
{
"epoch": 0.05255023183925812,
"grad_norm": 1.808377981185913,
"learning_rate": 1.947758887171561e-05,
"loss": 3.9711,
"mean_token_accuracy": 0.33312112018465995,
"num_tokens": 533187.0,
"step": 170
},
{
"epoch": 0.05564142194744977,
"grad_norm": 2.8038811683654785,
"learning_rate": 1.9446676970633695e-05,
"loss": 3.961,
"mean_token_accuracy": 0.33416116759181025,
"num_tokens": 565598.0,
"step": 180
},
{
"epoch": 0.05873261205564142,
"grad_norm": 8.930831909179688,
"learning_rate": 1.941576506955178e-05,
"loss": 4.0107,
"mean_token_accuracy": 0.3261258576065302,
"num_tokens": 598505.0,
"step": 190
},
{
"epoch": 0.061823802163833076,
"grad_norm": 4.706038475036621,
"learning_rate": 1.9384853168469862e-05,
"loss": 3.9231,
"mean_token_accuracy": 0.33582728281617164,
"num_tokens": 630284.0,
"step": 200
},
{
"epoch": 0.06491499227202473,
"grad_norm": 2.9916040897369385,
"learning_rate": 1.9353941267387946e-05,
"loss": 3.9407,
"mean_token_accuracy": 0.32844844460487366,
"num_tokens": 660513.0,
"step": 210
},
{
"epoch": 0.06800618238021638,
"grad_norm": 2.763737678527832,
"learning_rate": 1.932302936630603e-05,
"loss": 4.0479,
"mean_token_accuracy": 0.3248734712600708,
"num_tokens": 693154.0,
"step": 220
},
{
"epoch": 0.07109737248840804,
"grad_norm": 3.656487464904785,
"learning_rate": 1.9292117465224113e-05,
"loss": 3.7843,
"mean_token_accuracy": 0.34686593189835546,
"num_tokens": 724686.0,
"step": 230
},
{
"epoch": 0.07418856259659969,
"grad_norm": 2.6140244007110596,
"learning_rate": 1.9261205564142196e-05,
"loss": 3.9261,
"mean_token_accuracy": 0.3310456670820713,
"num_tokens": 755625.0,
"step": 240
},
{
"epoch": 0.07727975270479134,
"grad_norm": 1.595627784729004,
"learning_rate": 1.923029366306028e-05,
"loss": 3.841,
"mean_token_accuracy": 0.34514380544424056,
"num_tokens": 785478.0,
"step": 250
},
{
"epoch": 0.080370942812983,
"grad_norm": 2.0485758781433105,
"learning_rate": 1.9199381761978363e-05,
"loss": 3.7596,
"mean_token_accuracy": 0.35417362824082377,
"num_tokens": 816351.0,
"step": 260
},
{
"epoch": 0.08346213292117466,
"grad_norm": 1.7564281225204468,
"learning_rate": 1.9168469860896447e-05,
"loss": 3.7927,
"mean_token_accuracy": 0.3487237967550755,
"num_tokens": 849774.0,
"step": 270
},
{
"epoch": 0.0865533230293663,
"grad_norm": 1.5662060976028442,
"learning_rate": 1.913755795981453e-05,
"loss": 3.7821,
"mean_token_accuracy": 0.3511029303073883,
"num_tokens": 881474.0,
"step": 280
},
{
"epoch": 0.08964451313755796,
"grad_norm": 2.539433002471924,
"learning_rate": 1.9106646058732614e-05,
"loss": 3.8613,
"mean_token_accuracy": 0.3471171148121357,
"num_tokens": 913459.0,
"step": 290
},
{
"epoch": 0.09273570324574962,
"grad_norm": 4.281046390533447,
"learning_rate": 1.9075734157650694e-05,
"loss": 3.8436,
"mean_token_accuracy": 0.3475985363125801,
"num_tokens": 947091.0,
"step": 300
},
{
"epoch": 0.09582689335394126,
"grad_norm": 2.379791736602783,
"learning_rate": 1.904482225656878e-05,
"loss": 3.8309,
"mean_token_accuracy": 0.34195478409528735,
"num_tokens": 983471.0,
"step": 310
},
{
"epoch": 0.09891808346213292,
"grad_norm": 2.4176697731018066,
"learning_rate": 1.9013910355486865e-05,
"loss": 3.75,
"mean_token_accuracy": 0.3633588753640652,
"num_tokens": 1011575.0,
"step": 320
},
{
"epoch": 0.10200927357032458,
"grad_norm": 3.375523328781128,
"learning_rate": 1.898299845440495e-05,
"loss": 3.7556,
"mean_token_accuracy": 0.36073010191321375,
"num_tokens": 1042031.0,
"step": 330
},
{
"epoch": 0.10510046367851623,
"grad_norm": 5.099122524261475,
"learning_rate": 1.895208655332303e-05,
"loss": 3.6861,
"mean_token_accuracy": 0.36301063373684883,
"num_tokens": 1074689.0,
"step": 340
},
{
"epoch": 0.10819165378670788,
"grad_norm": 1.4775930643081665,
"learning_rate": 1.8921174652241116e-05,
"loss": 3.6744,
"mean_token_accuracy": 0.3681299857795238,
"num_tokens": 1105719.0,
"step": 350
},
{
"epoch": 0.11128284389489954,
"grad_norm": 3.931447744369507,
"learning_rate": 1.88902627511592e-05,
"loss": 3.6248,
"mean_token_accuracy": 0.37062914595007895,
"num_tokens": 1136185.0,
"step": 360
},
{
"epoch": 0.1143740340030912,
"grad_norm": 2.6153130531311035,
"learning_rate": 1.885935085007728e-05,
"loss": 3.6971,
"mean_token_accuracy": 0.35686987787485125,
"num_tokens": 1166461.0,
"step": 370
},
{
"epoch": 0.11746522411128284,
"grad_norm": 3.0943849086761475,
"learning_rate": 1.8828438948995363e-05,
"loss": 3.7057,
"mean_token_accuracy": 0.36670113652944564,
"num_tokens": 1194661.0,
"step": 380
},
{
"epoch": 0.1205564142194745,
"grad_norm": 1.760920524597168,
"learning_rate": 1.879752704791345e-05,
"loss": 3.6806,
"mean_token_accuracy": 0.366318603605032,
"num_tokens": 1224670.0,
"step": 390
},
{
"epoch": 0.12364760432766615,
"grad_norm": 1.8976062536239624,
"learning_rate": 1.8766615146831534e-05,
"loss": 3.7318,
"mean_token_accuracy": 0.36500929966568946,
"num_tokens": 1252830.0,
"step": 400
},
{
"epoch": 0.1267387944358578,
"grad_norm": 1.950358510017395,
"learning_rate": 1.8735703245749614e-05,
"loss": 3.7106,
"mean_token_accuracy": 0.3675060346722603,
"num_tokens": 1285546.0,
"step": 410
},
{
"epoch": 0.12982998454404945,
"grad_norm": 2.707167148590088,
"learning_rate": 1.8704791344667697e-05,
"loss": 3.6688,
"mean_token_accuracy": 0.37156677842140196,
"num_tokens": 1316466.0,
"step": 420
},
{
"epoch": 0.13292117465224113,
"grad_norm": 2.084510564804077,
"learning_rate": 1.8673879443585784e-05,
"loss": 3.6758,
"mean_token_accuracy": 0.3684880450367928,
"num_tokens": 1349121.0,
"step": 430
},
{
"epoch": 0.13601236476043277,
"grad_norm": 2.2626636028289795,
"learning_rate": 1.8642967542503865e-05,
"loss": 3.7214,
"mean_token_accuracy": 0.35720510333776473,
"num_tokens": 1384366.0,
"step": 440
},
{
"epoch": 0.1391035548686244,
"grad_norm": 2.4145290851593018,
"learning_rate": 1.8612055641421948e-05,
"loss": 3.7261,
"mean_token_accuracy": 0.36429562568664553,
"num_tokens": 1415357.0,
"step": 450
},
{
"epoch": 0.14219474497681608,
"grad_norm": 2.7409212589263916,
"learning_rate": 1.8581143740340032e-05,
"loss": 3.7368,
"mean_token_accuracy": 0.3619597226381302,
"num_tokens": 1445641.0,
"step": 460
},
{
"epoch": 0.14528593508500773,
"grad_norm": 4.5937275886535645,
"learning_rate": 1.8550231839258115e-05,
"loss": 3.6911,
"mean_token_accuracy": 0.3683665543794632,
"num_tokens": 1475360.0,
"step": 470
},
{
"epoch": 0.14837712519319937,
"grad_norm": 12.00837516784668,
"learning_rate": 1.85193199381762e-05,
"loss": 3.6899,
"mean_token_accuracy": 0.3648961283266544,
"num_tokens": 1506048.0,
"step": 480
},
{
"epoch": 0.15146831530139104,
"grad_norm": 2.5790181159973145,
"learning_rate": 1.8488408037094283e-05,
"loss": 3.6948,
"mean_token_accuracy": 0.36360194012522695,
"num_tokens": 1539998.0,
"step": 490
},
{
"epoch": 0.1545595054095827,
"grad_norm": 1.8515182733535767,
"learning_rate": 1.8457496136012366e-05,
"loss": 3.65,
"mean_token_accuracy": 0.36655392646789553,
"num_tokens": 1573641.0,
"step": 500
},
{
"epoch": 0.15765069551777433,
"grad_norm": 2.3960251808166504,
"learning_rate": 1.842658423493045e-05,
"loss": 3.58,
"mean_token_accuracy": 0.373273029923439,
"num_tokens": 1605367.0,
"step": 510
},
{
"epoch": 0.160741885625966,
"grad_norm": 2.578730821609497,
"learning_rate": 1.8395672333848533e-05,
"loss": 3.6695,
"mean_token_accuracy": 0.368588350713253,
"num_tokens": 1638052.0,
"step": 520
},
{
"epoch": 0.16383307573415765,
"grad_norm": 1.9368691444396973,
"learning_rate": 1.8364760432766617e-05,
"loss": 3.6363,
"mean_token_accuracy": 0.3681382529437542,
"num_tokens": 1672043.0,
"step": 530
},
{
"epoch": 0.16692426584234932,
"grad_norm": 3.562593698501587,
"learning_rate": 1.83338485316847e-05,
"loss": 3.5386,
"mean_token_accuracy": 0.3801154658198357,
"num_tokens": 1704817.0,
"step": 540
},
{
"epoch": 0.17001545595054096,
"grad_norm": 1.564929723739624,
"learning_rate": 1.8302936630602784e-05,
"loss": 3.6191,
"mean_token_accuracy": 0.36746986880898475,
"num_tokens": 1737297.0,
"step": 550
},
{
"epoch": 0.1731066460587326,
"grad_norm": 1.4315626621246338,
"learning_rate": 1.8272024729520868e-05,
"loss": 3.592,
"mean_token_accuracy": 0.3701712526381016,
"num_tokens": 1770204.0,
"step": 560
},
{
"epoch": 0.17619783616692428,
"grad_norm": 6.3183746337890625,
"learning_rate": 1.824111282843895e-05,
"loss": 3.6281,
"mean_token_accuracy": 0.37336429879069327,
"num_tokens": 1800581.0,
"step": 570
},
{
"epoch": 0.17928902627511592,
"grad_norm": 1.3608078956604004,
"learning_rate": 1.8210200927357035e-05,
"loss": 3.5644,
"mean_token_accuracy": 0.37816725075244906,
"num_tokens": 1834564.0,
"step": 580
},
{
"epoch": 0.18238021638330756,
"grad_norm": 11.443370819091797,
"learning_rate": 1.817928902627512e-05,
"loss": 3.6134,
"mean_token_accuracy": 0.37460487633943557,
"num_tokens": 1866113.0,
"step": 590
},
{
"epoch": 0.18547140649149924,
"grad_norm": 2.1869001388549805,
"learning_rate": 1.8148377125193202e-05,
"loss": 3.6331,
"mean_token_accuracy": 0.3722210742533207,
"num_tokens": 1900627.0,
"step": 600
},
{
"epoch": 0.18856259659969088,
"grad_norm": 1.8551387786865234,
"learning_rate": 1.8117465224111282e-05,
"loss": 3.5707,
"mean_token_accuracy": 0.3772866874933243,
"num_tokens": 1932988.0,
"step": 610
},
{
"epoch": 0.19165378670788252,
"grad_norm": 5.528620719909668,
"learning_rate": 1.808655332302937e-05,
"loss": 3.6475,
"mean_token_accuracy": 0.3652547873556614,
"num_tokens": 1962031.0,
"step": 620
},
{
"epoch": 0.1947449768160742,
"grad_norm": 3.566514253616333,
"learning_rate": 1.8055641421947453e-05,
"loss": 3.5441,
"mean_token_accuracy": 0.3836688004434109,
"num_tokens": 1995070.0,
"step": 630
},
{
"epoch": 0.19783616692426584,
"grad_norm": 1.9582892656326294,
"learning_rate": 1.8024729520865533e-05,
"loss": 3.5473,
"mean_token_accuracy": 0.3757788948714733,
"num_tokens": 2024801.0,
"step": 640
},
{
"epoch": 0.2009273570324575,
"grad_norm": 1.7483699321746826,
"learning_rate": 1.7993817619783616e-05,
"loss": 3.6133,
"mean_token_accuracy": 0.37615733668208123,
"num_tokens": 2054131.0,
"step": 650
},
{
"epoch": 0.20401854714064915,
"grad_norm": 1.9855698347091675,
"learning_rate": 1.7962905718701703e-05,
"loss": 3.5724,
"mean_token_accuracy": 0.37552602738142016,
"num_tokens": 2086218.0,
"step": 660
},
{
"epoch": 0.2071097372488408,
"grad_norm": 2.380608558654785,
"learning_rate": 1.7931993817619787e-05,
"loss": 3.5476,
"mean_token_accuracy": 0.37423405200243,
"num_tokens": 2121389.0,
"step": 670
},
{
"epoch": 0.21020092735703247,
"grad_norm": 1.5300630331039429,
"learning_rate": 1.7901081916537867e-05,
"loss": 3.668,
"mean_token_accuracy": 0.3656138554215431,
"num_tokens": 2152513.0,
"step": 680
},
{
"epoch": 0.2132921174652241,
"grad_norm": 1.6176111698150635,
"learning_rate": 1.787017001545595e-05,
"loss": 3.6087,
"mean_token_accuracy": 0.37140627652406694,
"num_tokens": 2184920.0,
"step": 690
},
{
"epoch": 0.21638330757341576,
"grad_norm": 1.401524305343628,
"learning_rate": 1.7839258114374038e-05,
"loss": 3.4881,
"mean_token_accuracy": 0.3919842541217804,
"num_tokens": 2214174.0,
"step": 700
},
{
"epoch": 0.21947449768160743,
"grad_norm": 1.4391794204711914,
"learning_rate": 1.7808346213292118e-05,
"loss": 3.5622,
"mean_token_accuracy": 0.37740132212638855,
"num_tokens": 2247593.0,
"step": 710
},
{
"epoch": 0.22256568778979907,
"grad_norm": 5.364869117736816,
"learning_rate": 1.77774343122102e-05,
"loss": 3.5855,
"mean_token_accuracy": 0.37872459217905996,
"num_tokens": 2277810.0,
"step": 720
},
{
"epoch": 0.22565687789799072,
"grad_norm": 1.3683693408966064,
"learning_rate": 1.7746522411128285e-05,
"loss": 3.4653,
"mean_token_accuracy": 0.38120819330215455,
"num_tokens": 2309457.0,
"step": 730
},
{
"epoch": 0.2287480680061824,
"grad_norm": 1.3637374639511108,
"learning_rate": 1.771561051004637e-05,
"loss": 3.5131,
"mean_token_accuracy": 0.3864184685051441,
"num_tokens": 2339852.0,
"step": 740
},
{
"epoch": 0.23183925811437403,
"grad_norm": 2.7388429641723633,
"learning_rate": 1.7684698608964452e-05,
"loss": 3.573,
"mean_token_accuracy": 0.3795412413775921,
"num_tokens": 2371444.0,
"step": 750
},
{
"epoch": 0.23493044822256567,
"grad_norm": 9.798398971557617,
"learning_rate": 1.7653786707882536e-05,
"loss": 3.5441,
"mean_token_accuracy": 0.38080229982733727,
"num_tokens": 2401769.0,
"step": 760
},
{
"epoch": 0.23802163833075735,
"grad_norm": 2.1416878700256348,
"learning_rate": 1.762287480680062e-05,
"loss": 3.5043,
"mean_token_accuracy": 0.3838089659810066,
"num_tokens": 2432410.0,
"step": 770
},
{
"epoch": 0.241112828438949,
"grad_norm": 2.211545467376709,
"learning_rate": 1.7591962905718703e-05,
"loss": 3.5341,
"mean_token_accuracy": 0.388753118366003,
"num_tokens": 2462641.0,
"step": 780
},
{
"epoch": 0.24420401854714066,
"grad_norm": 5.351387023925781,
"learning_rate": 1.7561051004636787e-05,
"loss": 3.5031,
"mean_token_accuracy": 0.3821559719741344,
"num_tokens": 2496459.0,
"step": 790
},
{
"epoch": 0.2472952086553323,
"grad_norm": 2.3877508640289307,
"learning_rate": 1.753013910355487e-05,
"loss": 3.5589,
"mean_token_accuracy": 0.3789012677967548,
"num_tokens": 2526708.0,
"step": 800
},
{
"epoch": 0.250386398763524,
"grad_norm": 1.2962738275527954,
"learning_rate": 1.7499227202472954e-05,
"loss": 3.505,
"mean_token_accuracy": 0.3823570780456066,
"num_tokens": 2561229.0,
"step": 810
},
{
"epoch": 0.2534775888717156,
"grad_norm": 1.8147461414337158,
"learning_rate": 1.7468315301391037e-05,
"loss": 3.4861,
"mean_token_accuracy": 0.3802278622984886,
"num_tokens": 2592914.0,
"step": 820
},
{
"epoch": 0.25656877897990726,
"grad_norm": 3.4690439701080322,
"learning_rate": 1.743740340030912e-05,
"loss": 3.5202,
"mean_token_accuracy": 0.38112854287028314,
"num_tokens": 2627185.0,
"step": 830
},
{
"epoch": 0.2596599690880989,
"grad_norm": 4.353795051574707,
"learning_rate": 1.7406491499227205e-05,
"loss": 3.5667,
"mean_token_accuracy": 0.37200469225645066,
"num_tokens": 2660839.0,
"step": 840
},
{
"epoch": 0.26275115919629055,
"grad_norm": 2.6319100856781006,
"learning_rate": 1.7375579598145288e-05,
"loss": 3.464,
"mean_token_accuracy": 0.38627258986234664,
"num_tokens": 2690042.0,
"step": 850
},
{
"epoch": 0.26584234930448225,
"grad_norm": 1.6525287628173828,
"learning_rate": 1.7344667697063372e-05,
"loss": 3.5616,
"mean_token_accuracy": 0.3763453342020512,
"num_tokens": 2724354.0,
"step": 860
},
{
"epoch": 0.2689335394126739,
"grad_norm": 1.9737221002578735,
"learning_rate": 1.7313755795981455e-05,
"loss": 3.5147,
"mean_token_accuracy": 0.38340551406145096,
"num_tokens": 2755591.0,
"step": 870
},
{
"epoch": 0.27202472952086554,
"grad_norm": 2.5324320793151855,
"learning_rate": 1.7282843894899536e-05,
"loss": 3.4227,
"mean_token_accuracy": 0.3909419260919094,
"num_tokens": 2786566.0,
"step": 880
},
{
"epoch": 0.2751159196290572,
"grad_norm": 3.3844659328460693,
"learning_rate": 1.7251931993817623e-05,
"loss": 3.5561,
"mean_token_accuracy": 0.37579271346330645,
"num_tokens": 2814171.0,
"step": 890
},
{
"epoch": 0.2782071097372488,
"grad_norm": 2.7620153427124023,
"learning_rate": 1.7221020092735706e-05,
"loss": 3.5042,
"mean_token_accuracy": 0.3822973191738129,
"num_tokens": 2844402.0,
"step": 900
},
{
"epoch": 0.28129829984544047,
"grad_norm": 2.089118719100952,
"learning_rate": 1.7190108191653786e-05,
"loss": 3.4583,
"mean_token_accuracy": 0.39270223304629326,
"num_tokens": 2873728.0,
"step": 910
},
{
"epoch": 0.28438948995363217,
"grad_norm": 2.496480941772461,
"learning_rate": 1.715919629057187e-05,
"loss": 3.4466,
"mean_token_accuracy": 0.3897694177925587,
"num_tokens": 2905000.0,
"step": 920
},
{
"epoch": 0.2874806800618238,
"grad_norm": 2.162785053253174,
"learning_rate": 1.7128284389489957e-05,
"loss": 3.4075,
"mean_token_accuracy": 0.39374888986349105,
"num_tokens": 2935706.0,
"step": 930
},
{
"epoch": 0.29057187017001546,
"grad_norm": 1.5756185054779053,
"learning_rate": 1.709737248840804e-05,
"loss": 3.4396,
"mean_token_accuracy": 0.38780966177582743,
"num_tokens": 2966477.0,
"step": 940
},
{
"epoch": 0.2936630602782071,
"grad_norm": 5.708625316619873,
"learning_rate": 1.706646058732612e-05,
"loss": 3.4185,
"mean_token_accuracy": 0.39326486811041833,
"num_tokens": 2996439.0,
"step": 950
},
{
"epoch": 0.29675425038639874,
"grad_norm": 2.8689119815826416,
"learning_rate": 1.7035548686244204e-05,
"loss": 3.468,
"mean_token_accuracy": 0.3818407289683819,
"num_tokens": 3026332.0,
"step": 960
},
{
"epoch": 0.29984544049459044,
"grad_norm": 4.020405292510986,
"learning_rate": 1.700463678516229e-05,
"loss": 3.4943,
"mean_token_accuracy": 0.3888055384159088,
"num_tokens": 3057606.0,
"step": 970
},
{
"epoch": 0.3029366306027821,
"grad_norm": 1.8991873264312744,
"learning_rate": 1.697372488408037e-05,
"loss": 3.4939,
"mean_token_accuracy": 0.38414665684103966,
"num_tokens": 3089449.0,
"step": 980
},
{
"epoch": 0.30602782071097373,
"grad_norm": 13.78397274017334,
"learning_rate": 1.6942812982998455e-05,
"loss": 3.5487,
"mean_token_accuracy": 0.3769320294260979,
"num_tokens": 3119537.0,
"step": 990
},
{
"epoch": 0.3091190108191654,
"grad_norm": 1.599820613861084,
"learning_rate": 1.691190108191654e-05,
"loss": 3.465,
"mean_token_accuracy": 0.3884730890393257,
"num_tokens": 3151905.0,
"step": 1000
},
{
"epoch": 0.312210200927357,
"grad_norm": 6.015178680419922,
"learning_rate": 1.6880989180834622e-05,
"loss": 3.5043,
"mean_token_accuracy": 0.38035417571663854,
"num_tokens": 3184277.0,
"step": 1010
},
{
"epoch": 0.31530139103554866,
"grad_norm": 1.854093313217163,
"learning_rate": 1.6850077279752706e-05,
"loss": 3.4233,
"mean_token_accuracy": 0.389019088447094,
"num_tokens": 3217455.0,
"step": 1020
},
{
"epoch": 0.31839258114374036,
"grad_norm": 3.024531126022339,
"learning_rate": 1.681916537867079e-05,
"loss": 3.4768,
"mean_token_accuracy": 0.39151332527399063,
"num_tokens": 3251326.0,
"step": 1030
},
{
"epoch": 0.321483771251932,
"grad_norm": 1.8012628555297852,
"learning_rate": 1.6788253477588873e-05,
"loss": 3.4955,
"mean_token_accuracy": 0.3855120025575161,
"num_tokens": 3283488.0,
"step": 1040
},
{
"epoch": 0.32457496136012365,
"grad_norm": 1.4446407556533813,
"learning_rate": 1.6757341576506957e-05,
"loss": 3.4023,
"mean_token_accuracy": 0.3883494645357132,
"num_tokens": 3315970.0,
"step": 1050
},
{
"epoch": 0.3276661514683153,
"grad_norm": 2.132194995880127,
"learning_rate": 1.672642967542504e-05,
"loss": 3.5195,
"mean_token_accuracy": 0.38440208286046984,
"num_tokens": 3348645.0,
"step": 1060
},
{
"epoch": 0.33075734157650694,
"grad_norm": 2.436528444290161,
"learning_rate": 1.6695517774343124e-05,
"loss": 3.4595,
"mean_token_accuracy": 0.3885770753026009,
"num_tokens": 3378707.0,
"step": 1070
},
{
"epoch": 0.33384853168469864,
"grad_norm": 2.0806350708007812,
"learning_rate": 1.6664605873261207e-05,
"loss": 3.4576,
"mean_token_accuracy": 0.38848345205187795,
"num_tokens": 3410231.0,
"step": 1080
},
{
"epoch": 0.3369397217928903,
"grad_norm": 2.142319679260254,
"learning_rate": 1.663369397217929e-05,
"loss": 3.4644,
"mean_token_accuracy": 0.3861194223165512,
"num_tokens": 3444065.0,
"step": 1090
},
{
"epoch": 0.3400309119010819,
"grad_norm": 2.380552053451538,
"learning_rate": 1.6602782071097374e-05,
"loss": 3.3916,
"mean_token_accuracy": 0.39232398346066477,
"num_tokens": 3474974.0,
"step": 1100
},
{
"epoch": 0.34312210200927357,
"grad_norm": 1.6462984085083008,
"learning_rate": 1.6571870170015458e-05,
"loss": 3.3687,
"mean_token_accuracy": 0.3944365203380585,
"num_tokens": 3507101.0,
"step": 1110
},
{
"epoch": 0.3462132921174652,
"grad_norm": 4.342376232147217,
"learning_rate": 1.654095826893354e-05,
"loss": 3.4589,
"mean_token_accuracy": 0.3856883034110069,
"num_tokens": 3538893.0,
"step": 1120
},
{
"epoch": 0.34930448222565685,
"grad_norm": 1.8314056396484375,
"learning_rate": 1.6510046367851625e-05,
"loss": 3.6277,
"mean_token_accuracy": 0.3779118649661541,
"num_tokens": 3567776.0,
"step": 1130
},
{
"epoch": 0.35239567233384855,
"grad_norm": 1.7731289863586426,
"learning_rate": 1.647913446676971e-05,
"loss": 3.4375,
"mean_token_accuracy": 0.38362068235874175,
"num_tokens": 3600096.0,
"step": 1140
},
{
"epoch": 0.3554868624420402,
"grad_norm": 1.4724918603897095,
"learning_rate": 1.644822256568779e-05,
"loss": 3.4787,
"mean_token_accuracy": 0.39140490964055064,
"num_tokens": 3629645.0,
"step": 1150
},
{
"epoch": 0.35857805255023184,
"grad_norm": 2.0459768772125244,
"learning_rate": 1.6417310664605876e-05,
"loss": 3.4346,
"mean_token_accuracy": 0.38677491843700407,
"num_tokens": 3663559.0,
"step": 1160
},
{
"epoch": 0.3616692426584235,
"grad_norm": 3.235039710998535,
"learning_rate": 1.638639876352396e-05,
"loss": 3.4915,
"mean_token_accuracy": 0.38628031834959986,
"num_tokens": 3695457.0,
"step": 1170
},
{
"epoch": 0.36476043276661513,
"grad_norm": 1.2474194765090942,
"learning_rate": 1.635548686244204e-05,
"loss": 3.3999,
"mean_token_accuracy": 0.39414023533463477,
"num_tokens": 3725294.0,
"step": 1180
},
{
"epoch": 0.3678516228748068,
"grad_norm": 1.3599259853363037,
"learning_rate": 1.6324574961360123e-05,
"loss": 3.3712,
"mean_token_accuracy": 0.39129213988780975,
"num_tokens": 3756722.0,
"step": 1190
},
{
"epoch": 0.37094281298299847,
"grad_norm": 1.4785717725753784,
"learning_rate": 1.629366306027821e-05,
"loss": 3.3837,
"mean_token_accuracy": 0.39773035794496536,
"num_tokens": 3786396.0,
"step": 1200
},
{
"epoch": 0.3740340030911901,
"grad_norm": 1.1180949211120605,
"learning_rate": 1.6262751159196294e-05,
"loss": 3.374,
"mean_token_accuracy": 0.3942295677959919,
"num_tokens": 3816940.0,
"step": 1210
},
{
"epoch": 0.37712519319938176,
"grad_norm": 3.58443546295166,
"learning_rate": 1.6231839258114374e-05,
"loss": 3.3892,
"mean_token_accuracy": 0.3918235659599304,
"num_tokens": 3845728.0,
"step": 1220
},
{
"epoch": 0.3802163833075734,
"grad_norm": 7.910126686096191,
"learning_rate": 1.6200927357032458e-05,
"loss": 3.4141,
"mean_token_accuracy": 0.39633639603853227,
"num_tokens": 3876251.0,
"step": 1230
},
{
"epoch": 0.38330757341576505,
"grad_norm": 1.4737247228622437,
"learning_rate": 1.617001545595054e-05,
"loss": 3.3891,
"mean_token_accuracy": 0.38747691363096237,
"num_tokens": 3907979.0,
"step": 1240
},
{
"epoch": 0.38639876352395675,
"grad_norm": 2.543823003768921,
"learning_rate": 1.6139103554868625e-05,
"loss": 3.3857,
"mean_token_accuracy": 0.3897668160498142,
"num_tokens": 3939556.0,
"step": 1250
},
{
"epoch": 0.3894899536321484,
"grad_norm": 1.8837332725524902,
"learning_rate": 1.610819165378671e-05,
"loss": 3.3795,
"mean_token_accuracy": 0.39769657924771307,
"num_tokens": 3970481.0,
"step": 1260
},
{
"epoch": 0.39258114374034003,
"grad_norm": 3.573788642883301,
"learning_rate": 1.6077279752704792e-05,
"loss": 3.3105,
"mean_token_accuracy": 0.4041719429194927,
"num_tokens": 3999666.0,
"step": 1270
},
{
"epoch": 0.3956723338485317,
"grad_norm": 1.7824413776397705,
"learning_rate": 1.6046367851622876e-05,
"loss": 3.4468,
"mean_token_accuracy": 0.3911862142384052,
"num_tokens": 4031876.0,
"step": 1280
},
{
"epoch": 0.3987635239567233,
"grad_norm": 1.6730329990386963,
"learning_rate": 1.601545595054096e-05,
"loss": 3.4193,
"mean_token_accuracy": 0.39065720662474634,
"num_tokens": 4064132.0,
"step": 1290
},
{
"epoch": 0.401854714064915,
"grad_norm": 1.421411395072937,
"learning_rate": 1.5984544049459043e-05,
"loss": 3.4292,
"mean_token_accuracy": 0.3871223643422127,
"num_tokens": 4095074.0,
"step": 1300
},
{
"epoch": 0.40494590417310666,
"grad_norm": 1.7248343229293823,
"learning_rate": 1.5953632148377126e-05,
"loss": 3.379,
"mean_token_accuracy": 0.39316892698407174,
"num_tokens": 4130858.0,
"step": 1310
},
{
"epoch": 0.4080370942812983,
"grad_norm": 2.6556711196899414,
"learning_rate": 1.592272024729521e-05,
"loss": 3.4088,
"mean_token_accuracy": 0.38396543338894845,
"num_tokens": 4163277.0,
"step": 1320
},
{
"epoch": 0.41112828438948995,
"grad_norm": 2.2201597690582275,
"learning_rate": 1.5891808346213294e-05,
"loss": 3.446,
"mean_token_accuracy": 0.38794904500246047,
"num_tokens": 4195806.0,
"step": 1330
},
{
"epoch": 0.4142194744976816,
"grad_norm": 1.4001938104629517,
"learning_rate": 1.5860896445131377e-05,
"loss": 3.4074,
"mean_token_accuracy": 0.3959184519946575,
"num_tokens": 4225811.0,
"step": 1340
},
{
"epoch": 0.41731066460587324,
"grad_norm": 1.9553899765014648,
"learning_rate": 1.582998454404946e-05,
"loss": 3.353,
"mean_token_accuracy": 0.392233844101429,
"num_tokens": 4259418.0,
"step": 1350
},
{
"epoch": 0.42040185471406494,
"grad_norm": 2.1650876998901367,
"learning_rate": 1.5799072642967544e-05,
"loss": 3.3014,
"mean_token_accuracy": 0.40291827023029325,
"num_tokens": 4292593.0,
"step": 1360
},
{
"epoch": 0.4234930448222566,
"grad_norm": 1.6802010536193848,
"learning_rate": 1.5768160741885628e-05,
"loss": 3.3615,
"mean_token_accuracy": 0.3953541323542595,
"num_tokens": 4325628.0,
"step": 1370
},
{
"epoch": 0.4265842349304482,
"grad_norm": 2.8798279762268066,
"learning_rate": 1.573724884080371e-05,
"loss": 3.4365,
"mean_token_accuracy": 0.39370308369398116,
"num_tokens": 4358459.0,
"step": 1380
},
{
"epoch": 0.42967542503863987,
"grad_norm": 1.523694634437561,
"learning_rate": 1.5706336939721795e-05,
"loss": 3.3153,
"mean_token_accuracy": 0.3944700941443443,
"num_tokens": 4390529.0,
"step": 1390
},
{
"epoch": 0.4327666151468315,
"grad_norm": 2.0799732208251953,
"learning_rate": 1.567542503863988e-05,
"loss": 3.4242,
"mean_token_accuracy": 0.389276672154665,
"num_tokens": 4424045.0,
"step": 1400
},
{
"epoch": 0.43585780525502316,
"grad_norm": 1.3916538953781128,
"learning_rate": 1.5644513137557962e-05,
"loss": 3.444,
"mean_token_accuracy": 0.388773063570261,
"num_tokens": 4456115.0,
"step": 1410
},
{
"epoch": 0.43894899536321486,
"grad_norm": 11.072097778320312,
"learning_rate": 1.5613601236476042e-05,
"loss": 3.2926,
"mean_token_accuracy": 0.40686351582407954,
"num_tokens": 4490990.0,
"step": 1420
},
{
"epoch": 0.4420401854714065,
"grad_norm": 1.8008073568344116,
"learning_rate": 1.558268933539413e-05,
"loss": 3.4062,
"mean_token_accuracy": 0.3938455879688263,
"num_tokens": 4519847.0,
"step": 1430
},
{
"epoch": 0.44513137557959814,
"grad_norm": 4.065845012664795,
"learning_rate": 1.5551777434312213e-05,
"loss": 3.3776,
"mean_token_accuracy": 0.4004744917154312,
"num_tokens": 4551679.0,
"step": 1440
},
{
"epoch": 0.4482225656877898,
"grad_norm": 2.4614624977111816,
"learning_rate": 1.5520865533230297e-05,
"loss": 3.324,
"mean_token_accuracy": 0.4031891174614429,
"num_tokens": 4582622.0,
"step": 1450
},
{
"epoch": 0.45131375579598143,
"grad_norm": 2.6623904705047607,
"learning_rate": 1.5489953632148377e-05,
"loss": 3.414,
"mean_token_accuracy": 0.39484291821718215,
"num_tokens": 4613478.0,
"step": 1460
},
{
"epoch": 0.45440494590417313,
"grad_norm": 2.342698574066162,
"learning_rate": 1.545904173106646e-05,
"loss": 3.4348,
"mean_token_accuracy": 0.38681296780705454,
"num_tokens": 4647495.0,
"step": 1470
},
{
"epoch": 0.4574961360123648,
"grad_norm": 2.6113431453704834,
"learning_rate": 1.5428129829984547e-05,
"loss": 3.39,
"mean_token_accuracy": 0.391095020622015,
"num_tokens": 4681809.0,
"step": 1480
},
{
"epoch": 0.4605873261205564,
"grad_norm": 1.3145941495895386,
"learning_rate": 1.5397217928902627e-05,
"loss": 3.3677,
"mean_token_accuracy": 0.38954789489507674,
"num_tokens": 4716308.0,
"step": 1490
},
{
"epoch": 0.46367851622874806,
"grad_norm": 1.8231385946273804,
"learning_rate": 1.536630602782071e-05,
"loss": 3.4199,
"mean_token_accuracy": 0.38780914843082426,
"num_tokens": 4747835.0,
"step": 1500
},
{
"epoch": 0.4667697063369397,
"grad_norm": 2.2594892978668213,
"learning_rate": 1.5335394126738795e-05,
"loss": 3.3937,
"mean_token_accuracy": 0.3959015667438507,
"num_tokens": 4776486.0,
"step": 1510
},
{
"epoch": 0.46986089644513135,
"grad_norm": 3.840742349624634,
"learning_rate": 1.5304482225656878e-05,
"loss": 3.3087,
"mean_token_accuracy": 0.407059845328331,
"num_tokens": 4805943.0,
"step": 1520
},
{
"epoch": 0.47295208655332305,
"grad_norm": 4.683995246887207,
"learning_rate": 1.5273570324574962e-05,
"loss": 3.3373,
"mean_token_accuracy": 0.39871800169348715,
"num_tokens": 4838440.0,
"step": 1530
},
{
"epoch": 0.4760432766615147,
"grad_norm": 1.2700860500335693,
"learning_rate": 1.5242658423493047e-05,
"loss": 3.3339,
"mean_token_accuracy": 0.39955019652843476,
"num_tokens": 4870809.0,
"step": 1540
},
{
"epoch": 0.47913446676970634,
"grad_norm": 2.15336537361145,
"learning_rate": 1.521174652241113e-05,
"loss": 3.3699,
"mean_token_accuracy": 0.39904908165335656,
"num_tokens": 4903332.0,
"step": 1550
},
{
"epoch": 0.482225656877898,
"grad_norm": 1.299379587173462,
"learning_rate": 1.5180834621329213e-05,
"loss": 3.4024,
"mean_token_accuracy": 0.3913417667150497,
"num_tokens": 4933852.0,
"step": 1560
},
{
"epoch": 0.4853168469860896,
"grad_norm": 1.1829091310501099,
"learning_rate": 1.5149922720247296e-05,
"loss": 3.3253,
"mean_token_accuracy": 0.39646707102656364,
"num_tokens": 4966530.0,
"step": 1570
},
{
"epoch": 0.4884080370942813,
"grad_norm": 1.2553237676620483,
"learning_rate": 1.511901081916538e-05,
"loss": 3.2771,
"mean_token_accuracy": 0.405765625834465,
"num_tokens": 4998456.0,
"step": 1580
},
{
"epoch": 0.49149922720247297,
"grad_norm": 2.037930488586426,
"learning_rate": 1.5088098918083462e-05,
"loss": 3.3754,
"mean_token_accuracy": 0.3952351205050945,
"num_tokens": 5028995.0,
"step": 1590
},
{
"epoch": 0.4945904173106646,
"grad_norm": 1.090571403503418,
"learning_rate": 1.5057187017001547e-05,
"loss": 3.2832,
"mean_token_accuracy": 0.4088763400912285,
"num_tokens": 5057407.0,
"step": 1600
},
{
"epoch": 0.49768160741885625,
"grad_norm": 2.4721672534942627,
"learning_rate": 1.502627511591963e-05,
"loss": 3.3535,
"mean_token_accuracy": 0.3982778422534466,
"num_tokens": 5088330.0,
"step": 1610
},
{
"epoch": 0.500772797527048,
"grad_norm": 1.3971518278121948,
"learning_rate": 1.4995363214837714e-05,
"loss": 3.359,
"mean_token_accuracy": 0.3978785939514637,
"num_tokens": 5115511.0,
"step": 1620
},
{
"epoch": 0.5038639876352395,
"grad_norm": 1.766021490097046,
"learning_rate": 1.4964451313755796e-05,
"loss": 3.3388,
"mean_token_accuracy": 0.4023651979863644,
"num_tokens": 5145560.0,
"step": 1630
},
{
"epoch": 0.5069551777434312,
"grad_norm": 0.9848290681838989,
"learning_rate": 1.4933539412673881e-05,
"loss": 3.3312,
"mean_token_accuracy": 0.4001339070498943,
"num_tokens": 5178491.0,
"step": 1640
},
{
"epoch": 0.5100463678516228,
"grad_norm": 5.008669376373291,
"learning_rate": 1.4902627511591965e-05,
"loss": 3.3395,
"mean_token_accuracy": 0.39800570756196973,
"num_tokens": 5209376.0,
"step": 1650
},
{
"epoch": 0.5131375579598145,
"grad_norm": 3.3218369483947754,
"learning_rate": 1.4871715610510047e-05,
"loss": 3.398,
"mean_token_accuracy": 0.3898357510566711,
"num_tokens": 5240240.0,
"step": 1660
},
{
"epoch": 0.5162287480680062,
"grad_norm": 1.3590713739395142,
"learning_rate": 1.484080370942813e-05,
"loss": 3.3246,
"mean_token_accuracy": 0.39972665831446647,
"num_tokens": 5269658.0,
"step": 1670
},
{
"epoch": 0.5193199381761978,
"grad_norm": 1.647360920906067,
"learning_rate": 1.4809891808346216e-05,
"loss": 3.3101,
"mean_token_accuracy": 0.4039941616356373,
"num_tokens": 5301959.0,
"step": 1680
},
{
"epoch": 0.5224111282843895,
"grad_norm": 1.3231589794158936,
"learning_rate": 1.4778979907264298e-05,
"loss": 3.2715,
"mean_token_accuracy": 0.409288527816534,
"num_tokens": 5332939.0,
"step": 1690
},
{
"epoch": 0.5255023183925811,
"grad_norm": 1.8494716882705688,
"learning_rate": 1.4748068006182381e-05,
"loss": 3.3472,
"mean_token_accuracy": 0.4054514840245247,
"num_tokens": 5361660.0,
"step": 1700
},
{
"epoch": 0.5285935085007728,
"grad_norm": 1.1796019077301025,
"learning_rate": 1.4717156105100465e-05,
"loss": 3.3374,
"mean_token_accuracy": 0.40053225085139277,
"num_tokens": 5390818.0,
"step": 1710
},
{
"epoch": 0.5316846986089645,
"grad_norm": 1.4589906930923462,
"learning_rate": 1.468624420401855e-05,
"loss": 3.3532,
"mean_token_accuracy": 0.3908236466348171,
"num_tokens": 5423948.0,
"step": 1720
},
{
"epoch": 0.5347758887171561,
"grad_norm": 2.48760986328125,
"learning_rate": 1.4655332302936632e-05,
"loss": 3.3916,
"mean_token_accuracy": 0.39238951057195665,
"num_tokens": 5456721.0,
"step": 1730
},
{
"epoch": 0.5378670788253478,
"grad_norm": 1.3151116371154785,
"learning_rate": 1.4624420401854715e-05,
"loss": 3.3755,
"mean_token_accuracy": 0.39836090207099917,
"num_tokens": 5485487.0,
"step": 1740
},
{
"epoch": 0.5409582689335394,
"grad_norm": 1.3862218856811523,
"learning_rate": 1.4593508500772799e-05,
"loss": 3.3972,
"mean_token_accuracy": 0.39552291929721833,
"num_tokens": 5517812.0,
"step": 1750
},
{
"epoch": 0.5440494590417311,
"grad_norm": 1.4198209047317505,
"learning_rate": 1.4562596599690881e-05,
"loss": 3.323,
"mean_token_accuracy": 0.4038604758679867,
"num_tokens": 5549085.0,
"step": 1760
},
{
"epoch": 0.5471406491499228,
"grad_norm": 2.063263416290283,
"learning_rate": 1.4531684698608966e-05,
"loss": 3.2286,
"mean_token_accuracy": 0.40857273861765864,
"num_tokens": 5580826.0,
"step": 1770
},
{
"epoch": 0.5502318392581144,
"grad_norm": 1.5246375799179077,
"learning_rate": 1.450077279752705e-05,
"loss": 3.3077,
"mean_token_accuracy": 0.3984913781285286,
"num_tokens": 5611654.0,
"step": 1780
},
{
"epoch": 0.5533230293663061,
"grad_norm": 3.1407299041748047,
"learning_rate": 1.4469860896445132e-05,
"loss": 3.3631,
"mean_token_accuracy": 0.4005904957652092,
"num_tokens": 5643972.0,
"step": 1790
},
{
"epoch": 0.5564142194744977,
"grad_norm": 1.1959761381149292,
"learning_rate": 1.4438948995363215e-05,
"loss": 3.3863,
"mean_token_accuracy": 0.39492699652910235,
"num_tokens": 5674824.0,
"step": 1800
},
{
"epoch": 0.5595054095826894,
"grad_norm": 1.6108070611953735,
"learning_rate": 1.44080370942813e-05,
"loss": 3.3524,
"mean_token_accuracy": 0.3943831264972687,
"num_tokens": 5710374.0,
"step": 1810
},
{
"epoch": 0.5625965996908809,
"grad_norm": 2.3074188232421875,
"learning_rate": 1.4377125193199384e-05,
"loss": 3.4149,
"mean_token_accuracy": 0.3902184680104256,
"num_tokens": 5740450.0,
"step": 1820
},
{
"epoch": 0.5656877897990726,
"grad_norm": 1.2210988998413086,
"learning_rate": 1.4346213292117466e-05,
"loss": 3.368,
"mean_token_accuracy": 0.3951147675514221,
"num_tokens": 5769865.0,
"step": 1830
},
{
"epoch": 0.5687789799072643,
"grad_norm": 1.301492691040039,
"learning_rate": 1.431530139103555e-05,
"loss": 3.4011,
"mean_token_accuracy": 0.39827719777822496,
"num_tokens": 5798647.0,
"step": 1840
},
{
"epoch": 0.5718701700154559,
"grad_norm": 1.8520913124084473,
"learning_rate": 1.4284389489953633e-05,
"loss": 3.3297,
"mean_token_accuracy": 0.401262603700161,
"num_tokens": 5830176.0,
"step": 1850
},
{
"epoch": 0.5749613601236476,
"grad_norm": 1.7829524278640747,
"learning_rate": 1.4253477588871715e-05,
"loss": 3.2177,
"mean_token_accuracy": 0.4113538973033428,
"num_tokens": 5860769.0,
"step": 1860
},
{
"epoch": 0.5780525502318392,
"grad_norm": 1.5205345153808594,
"learning_rate": 1.42225656877898e-05,
"loss": 3.4594,
"mean_token_accuracy": 0.3904502220451832,
"num_tokens": 5896213.0,
"step": 1870
},
{
"epoch": 0.5811437403400309,
"grad_norm": 1.492475986480713,
"learning_rate": 1.4191653786707884e-05,
"loss": 3.2639,
"mean_token_accuracy": 0.40337538048624993,
"num_tokens": 5928539.0,
"step": 1880
},
{
"epoch": 0.5842349304482226,
"grad_norm": 2.07590651512146,
"learning_rate": 1.4160741885625968e-05,
"loss": 3.3918,
"mean_token_accuracy": 0.39563274309039115,
"num_tokens": 5960442.0,
"step": 1890
},
{
"epoch": 0.5873261205564142,
"grad_norm": 2.6959567070007324,
"learning_rate": 1.412982998454405e-05,
"loss": 3.3494,
"mean_token_accuracy": 0.40076613500714303,
"num_tokens": 5990404.0,
"step": 1900
},
{
"epoch": 0.5904173106646059,
"grad_norm": 1.291227102279663,
"learning_rate": 1.4098918083462135e-05,
"loss": 3.3794,
"mean_token_accuracy": 0.3960807867348194,
"num_tokens": 6023418.0,
"step": 1910
},
{
"epoch": 0.5935085007727975,
"grad_norm": 3.709761381149292,
"learning_rate": 1.4068006182380218e-05,
"loss": 3.3441,
"mean_token_accuracy": 0.3981980659067631,
"num_tokens": 6051604.0,
"step": 1920
},
{
"epoch": 0.5965996908809892,
"grad_norm": 1.5294419527053833,
"learning_rate": 1.40370942812983e-05,
"loss": 3.3821,
"mean_token_accuracy": 0.40094061717391016,
"num_tokens": 6081609.0,
"step": 1930
},
{
"epoch": 0.5996908809891809,
"grad_norm": 1.792324185371399,
"learning_rate": 1.4006182380216384e-05,
"loss": 3.3228,
"mean_token_accuracy": 0.40257288739085195,
"num_tokens": 6113487.0,
"step": 1940
},
{
"epoch": 0.6027820710973725,
"grad_norm": 1.3564989566802979,
"learning_rate": 1.3975270479134469e-05,
"loss": 3.3603,
"mean_token_accuracy": 0.3971732698380947,
"num_tokens": 6143546.0,
"step": 1950
},
{
"epoch": 0.6058732612055642,
"grad_norm": 1.8166826963424683,
"learning_rate": 1.3944358578052551e-05,
"loss": 3.3511,
"mean_token_accuracy": 0.3947428591549397,
"num_tokens": 6177302.0,
"step": 1960
},
{
"epoch": 0.6089644513137558,
"grad_norm": 2.3519530296325684,
"learning_rate": 1.3913446676970635e-05,
"loss": 3.3258,
"mean_token_accuracy": 0.40014824345707894,
"num_tokens": 6209913.0,
"step": 1970
},
{
"epoch": 0.6120556414219475,
"grad_norm": 1.9432330131530762,
"learning_rate": 1.3882534775888718e-05,
"loss": 3.2683,
"mean_token_accuracy": 0.40405927672982217,
"num_tokens": 6241773.0,
"step": 1980
},
{
"epoch": 0.615146831530139,
"grad_norm": 1.2190839052200317,
"learning_rate": 1.3851622874806803e-05,
"loss": 3.3476,
"mean_token_accuracy": 0.4006108805537224,
"num_tokens": 6274930.0,
"step": 1990
},
{
"epoch": 0.6182380216383307,
"grad_norm": 1.395822525024414,
"learning_rate": 1.3820710973724885e-05,
"loss": 3.2706,
"mean_token_accuracy": 0.40902155488729475,
"num_tokens": 6306366.0,
"step": 2000
},
{
"epoch": 0.6213292117465224,
"grad_norm": 3.3072211742401123,
"learning_rate": 1.3789799072642969e-05,
"loss": 3.2842,
"mean_token_accuracy": 0.4068531468510628,
"num_tokens": 6336620.0,
"step": 2010
},
{
"epoch": 0.624420401854714,
"grad_norm": 6.775637626647949,
"learning_rate": 1.3758887171561052e-05,
"loss": 3.3359,
"mean_token_accuracy": 0.399787887185812,
"num_tokens": 6368783.0,
"step": 2020
},
{
"epoch": 0.6275115919629057,
"grad_norm": 2.236809253692627,
"learning_rate": 1.3727975270479134e-05,
"loss": 3.1501,
"mean_token_accuracy": 0.4164234817028046,
"num_tokens": 6396050.0,
"step": 2030
},
{
"epoch": 0.6306027820710973,
"grad_norm": 1.864715814590454,
"learning_rate": 1.369706336939722e-05,
"loss": 3.2679,
"mean_token_accuracy": 0.40604618191719055,
"num_tokens": 6428401.0,
"step": 2040
},
{
"epoch": 0.633693972179289,
"grad_norm": 13.025823593139648,
"learning_rate": 1.3666151468315303e-05,
"loss": 3.2663,
"mean_token_accuracy": 0.39983370155096054,
"num_tokens": 6463617.0,
"step": 2050
},
{
"epoch": 0.6367851622874807,
"grad_norm": 1.4827322959899902,
"learning_rate": 1.3635239567233387e-05,
"loss": 3.3267,
"mean_token_accuracy": 0.4056967757642269,
"num_tokens": 6494545.0,
"step": 2060
},
{
"epoch": 0.6398763523956723,
"grad_norm": 2.8298745155334473,
"learning_rate": 1.3604327666151469e-05,
"loss": 3.2855,
"mean_token_accuracy": 0.4048807807266712,
"num_tokens": 6526119.0,
"step": 2070
},
{
"epoch": 0.642967542503864,
"grad_norm": 2.294051170349121,
"learning_rate": 1.3573415765069552e-05,
"loss": 3.3445,
"mean_token_accuracy": 0.3997214540839195,
"num_tokens": 6559408.0,
"step": 2080
},
{
"epoch": 0.6460587326120556,
"grad_norm": 6.12084436416626,
"learning_rate": 1.3542503863987638e-05,
"loss": 3.2882,
"mean_token_accuracy": 0.40451241433620455,
"num_tokens": 6588923.0,
"step": 2090
},
{
"epoch": 0.6491499227202473,
"grad_norm": 1.5177021026611328,
"learning_rate": 1.351159196290572e-05,
"loss": 3.2833,
"mean_token_accuracy": 0.39674848690629005,
"num_tokens": 6623193.0,
"step": 2100
},
{
"epoch": 0.652241112828439,
"grad_norm": 1.6658954620361328,
"learning_rate": 1.3480680061823803e-05,
"loss": 3.4093,
"mean_token_accuracy": 0.39299999698996546,
"num_tokens": 6657638.0,
"step": 2110
},
{
"epoch": 0.6553323029366306,
"grad_norm": 5.038951396942139,
"learning_rate": 1.3449768160741887e-05,
"loss": 3.3359,
"mean_token_accuracy": 0.4020788729190826,
"num_tokens": 6686552.0,
"step": 2120
},
{
"epoch": 0.6584234930448223,
"grad_norm": 1.931733250617981,
"learning_rate": 1.3418856259659968e-05,
"loss": 3.2761,
"mean_token_accuracy": 0.4039643190801144,
"num_tokens": 6719036.0,
"step": 2130
},
{
"epoch": 0.6615146831530139,
"grad_norm": 1.7501503229141235,
"learning_rate": 1.3387944358578054e-05,
"loss": 3.2999,
"mean_token_accuracy": 0.40199958309531214,
"num_tokens": 6752457.0,
"step": 2140
},
{
"epoch": 0.6646058732612056,
"grad_norm": 1.330138921737671,
"learning_rate": 1.3357032457496137e-05,
"loss": 3.2815,
"mean_token_accuracy": 0.4096176542341709,
"num_tokens": 6780937.0,
"step": 2150
},
{
"epoch": 0.6676970633693973,
"grad_norm": 3.2849926948547363,
"learning_rate": 1.3326120556414221e-05,
"loss": 3.3212,
"mean_token_accuracy": 0.39906698688864706,
"num_tokens": 6813861.0,
"step": 2160
},
{
"epoch": 0.6707882534775889,
"grad_norm": 1.3946915864944458,
"learning_rate": 1.3295208655332303e-05,
"loss": 3.3151,
"mean_token_accuracy": 0.40061264783143996,
"num_tokens": 6843256.0,
"step": 2170
},
{
"epoch": 0.6738794435857806,
"grad_norm": 1.5147260427474976,
"learning_rate": 1.3264296754250388e-05,
"loss": 3.2324,
"mean_token_accuracy": 0.4085581362247467,
"num_tokens": 6881169.0,
"step": 2180
},
{
"epoch": 0.6769706336939721,
"grad_norm": 1.5191727876663208,
"learning_rate": 1.3233384853168472e-05,
"loss": 3.2258,
"mean_token_accuracy": 0.4069525547325611,
"num_tokens": 6914374.0,
"step": 2190
},
{
"epoch": 0.6800618238021638,
"grad_norm": 11.71318531036377,
"learning_rate": 1.3202472952086554e-05,
"loss": 3.2411,
"mean_token_accuracy": 0.40940716192126275,
"num_tokens": 6944810.0,
"step": 2200
},
{
"epoch": 0.6831530139103554,
"grad_norm": 0.9575105309486389,
"learning_rate": 1.3171561051004637e-05,
"loss": 3.2721,
"mean_token_accuracy": 0.40148399621248243,
"num_tokens": 6975417.0,
"step": 2210
},
{
"epoch": 0.6862442040185471,
"grad_norm": 1.6727248430252075,
"learning_rate": 1.3140649149922722e-05,
"loss": 3.3072,
"mean_token_accuracy": 0.40622576996684073,
"num_tokens": 7005387.0,
"step": 2220
},
{
"epoch": 0.6893353941267388,
"grad_norm": 1.9732425212860107,
"learning_rate": 1.3109737248840804e-05,
"loss": 3.3486,
"mean_token_accuracy": 0.4046866536140442,
"num_tokens": 7036641.0,
"step": 2230
},
{
"epoch": 0.6924265842349304,
"grad_norm": 3.0926458835601807,
"learning_rate": 1.3078825347758888e-05,
"loss": 3.2027,
"mean_token_accuracy": 0.41145659387111666,
"num_tokens": 7065613.0,
"step": 2240
},
{
"epoch": 0.6955177743431221,
"grad_norm": 1.2291103601455688,
"learning_rate": 1.3047913446676972e-05,
"loss": 3.2273,
"mean_token_accuracy": 0.41015600264072416,
"num_tokens": 7098541.0,
"step": 2250
},
{
"epoch": 0.6986089644513137,
"grad_norm": 1.393871784210205,
"learning_rate": 1.3017001545595057e-05,
"loss": 3.3534,
"mean_token_accuracy": 0.398224713653326,
"num_tokens": 7131991.0,
"step": 2260
},
{
"epoch": 0.7017001545595054,
"grad_norm": 2.056251287460327,
"learning_rate": 1.2986089644513139e-05,
"loss": 3.2873,
"mean_token_accuracy": 0.4067487485706806,
"num_tokens": 7163563.0,
"step": 2270
},
{
"epoch": 0.7047913446676971,
"grad_norm": 3.4474806785583496,
"learning_rate": 1.2955177743431222e-05,
"loss": 3.3688,
"mean_token_accuracy": 0.39887151271104815,
"num_tokens": 7196235.0,
"step": 2280
},
{
"epoch": 0.7078825347758887,
"grad_norm": 4.399552345275879,
"learning_rate": 1.2924265842349306e-05,
"loss": 3.2107,
"mean_token_accuracy": 0.4083859778940678,
"num_tokens": 7229484.0,
"step": 2290
},
{
"epoch": 0.7109737248840804,
"grad_norm": 1.0608441829681396,
"learning_rate": 1.2893353941267388e-05,
"loss": 3.3511,
"mean_token_accuracy": 0.395867995172739,
"num_tokens": 7263929.0,
"step": 2300
},
{
"epoch": 0.714064914992272,
"grad_norm": 1.6351842880249023,
"learning_rate": 1.2862442040185471e-05,
"loss": 3.2312,
"mean_token_accuracy": 0.40859498232603075,
"num_tokens": 7297276.0,
"step": 2310
},
{
"epoch": 0.7171561051004637,
"grad_norm": 1.5289595127105713,
"learning_rate": 1.2831530139103557e-05,
"loss": 3.325,
"mean_token_accuracy": 0.40636713430285454,
"num_tokens": 7329069.0,
"step": 2320
},
{
"epoch": 0.7202472952086554,
"grad_norm": 1.5375980138778687,
"learning_rate": 1.280061823802164e-05,
"loss": 3.3243,
"mean_token_accuracy": 0.40893488600850103,
"num_tokens": 7362905.0,
"step": 2330
},
{
"epoch": 0.723338485316847,
"grad_norm": 1.2137787342071533,
"learning_rate": 1.2769706336939722e-05,
"loss": 3.2944,
"mean_token_accuracy": 0.4049150198698044,
"num_tokens": 7392845.0,
"step": 2340
},
{
"epoch": 0.7264296754250387,
"grad_norm": 3.618687152862549,
"learning_rate": 1.2738794435857806e-05,
"loss": 3.3114,
"mean_token_accuracy": 0.398918454349041,
"num_tokens": 7426273.0,
"step": 2350
},
{
"epoch": 0.7295208655332303,
"grad_norm": 1.3081494569778442,
"learning_rate": 1.2707882534775891e-05,
"loss": 3.2284,
"mean_token_accuracy": 0.4098616696894169,
"num_tokens": 7457128.0,
"step": 2360
},
{
"epoch": 0.732612055641422,
"grad_norm": 1.2867871522903442,
"learning_rate": 1.2676970633693973e-05,
"loss": 3.2734,
"mean_token_accuracy": 0.40420192629098894,
"num_tokens": 7490613.0,
"step": 2370
},
{
"epoch": 0.7357032457496137,
"grad_norm": 6.16511869430542,
"learning_rate": 1.2646058732612056e-05,
"loss": 3.2275,
"mean_token_accuracy": 0.4089609131217003,
"num_tokens": 7524066.0,
"step": 2380
},
{
"epoch": 0.7387944358578052,
"grad_norm": 3.3427209854125977,
"learning_rate": 1.261514683153014e-05,
"loss": 3.3428,
"mean_token_accuracy": 0.3993862606585026,
"num_tokens": 7554388.0,
"step": 2390
},
{
"epoch": 0.7418856259659969,
"grad_norm": 1.759032964706421,
"learning_rate": 1.2584234930448222e-05,
"loss": 3.2427,
"mean_token_accuracy": 0.411670895665884,
"num_tokens": 7586805.0,
"step": 2400
},
{
"epoch": 0.7449768160741885,
"grad_norm": 6.3605523109436035,
"learning_rate": 1.2553323029366307e-05,
"loss": 3.2989,
"mean_token_accuracy": 0.4035753831267357,
"num_tokens": 7621026.0,
"step": 2410
},
{
"epoch": 0.7480680061823802,
"grad_norm": 1.3226512670516968,
"learning_rate": 1.252241112828439e-05,
"loss": 3.2676,
"mean_token_accuracy": 0.4091234177350998,
"num_tokens": 7653588.0,
"step": 2420
},
{
"epoch": 0.7511591962905718,
"grad_norm": 1.2271887063980103,
"learning_rate": 1.2491499227202474e-05,
"loss": 3.1859,
"mean_token_accuracy": 0.4189001992344856,
"num_tokens": 7682672.0,
"step": 2430
},
{
"epoch": 0.7542503863987635,
"grad_norm": 1.1789538860321045,
"learning_rate": 1.2460587326120556e-05,
"loss": 3.2707,
"mean_token_accuracy": 0.41208377107977867,
"num_tokens": 7711661.0,
"step": 2440
},
{
"epoch": 0.7573415765069552,
"grad_norm": 1.4042397737503052,
"learning_rate": 1.2429675425038642e-05,
"loss": 3.3216,
"mean_token_accuracy": 0.40250647664070127,
"num_tokens": 7743853.0,
"step": 2450
},
{
"epoch": 0.7604327666151468,
"grad_norm": 1.9280140399932861,
"learning_rate": 1.2398763523956725e-05,
"loss": 3.2948,
"mean_token_accuracy": 0.4117195881903172,
"num_tokens": 7776380.0,
"step": 2460
},
{
"epoch": 0.7635239567233385,
"grad_norm": 1.244311809539795,
"learning_rate": 1.2367851622874807e-05,
"loss": 3.2397,
"mean_token_accuracy": 0.41137626469135286,
"num_tokens": 7808412.0,
"step": 2470
},
{
"epoch": 0.7666151468315301,
"grad_norm": 1.2338584661483765,
"learning_rate": 1.233693972179289e-05,
"loss": 3.2887,
"mean_token_accuracy": 0.41127968281507493,
"num_tokens": 7837345.0,
"step": 2480
},
{
"epoch": 0.7697063369397218,
"grad_norm": 1.0948349237442017,
"learning_rate": 1.2306027820710976e-05,
"loss": 3.2336,
"mean_token_accuracy": 0.4080419853329659,
"num_tokens": 7869621.0,
"step": 2490
},
{
"epoch": 0.7727975270479135,
"grad_norm": 3.051591157913208,
"learning_rate": 1.227511591962906e-05,
"loss": 3.3946,
"mean_token_accuracy": 0.39375910386443136,
"num_tokens": 7900668.0,
"step": 2500
},
{
"epoch": 0.7758887171561051,
"grad_norm": 1.2603470087051392,
"learning_rate": 1.2244204018547141e-05,
"loss": 3.2936,
"mean_token_accuracy": 0.40462088733911517,
"num_tokens": 7931473.0,
"step": 2510
},
{
"epoch": 0.7789799072642968,
"grad_norm": 1.3495979309082031,
"learning_rate": 1.2213292117465225e-05,
"loss": 3.2914,
"mean_token_accuracy": 0.40598206520080565,
"num_tokens": 7962649.0,
"step": 2520
},
{
"epoch": 0.7820710973724884,
"grad_norm": 1.4672921895980835,
"learning_rate": 1.218238021638331e-05,
"loss": 3.2363,
"mean_token_accuracy": 0.41317210271954535,
"num_tokens": 7994607.0,
"step": 2530
},
{
"epoch": 0.7851622874806801,
"grad_norm": 1.618283748626709,
"learning_rate": 1.2151468315301392e-05,
"loss": 3.2778,
"mean_token_accuracy": 0.40982001796364786,
"num_tokens": 8023977.0,
"step": 2540
},
{
"epoch": 0.7882534775888718,
"grad_norm": 1.4595403671264648,
"learning_rate": 1.2120556414219476e-05,
"loss": 3.306,
"mean_token_accuracy": 0.4041280455887318,
"num_tokens": 8053965.0,
"step": 2550
},
{
"epoch": 0.7913446676970634,
"grad_norm": 2.1894686222076416,
"learning_rate": 1.208964451313756e-05,
"loss": 3.2235,
"mean_token_accuracy": 0.4136913321912289,
"num_tokens": 8085818.0,
"step": 2560
},
{
"epoch": 0.794435857805255,
"grad_norm": 1.4496268033981323,
"learning_rate": 1.2058732612055641e-05,
"loss": 3.3197,
"mean_token_accuracy": 0.39820486679673195,
"num_tokens": 8118503.0,
"step": 2570
},
{
"epoch": 0.7975270479134466,
"grad_norm": 1.4080617427825928,
"learning_rate": 1.2027820710973725e-05,
"loss": 3.2565,
"mean_token_accuracy": 0.4059484012424946,
"num_tokens": 8151137.0,
"step": 2580
},
{
"epoch": 0.8006182380216383,
"grad_norm": 1.0931172370910645,
"learning_rate": 1.199690880989181e-05,
"loss": 3.2123,
"mean_token_accuracy": 0.40805021226406096,
"num_tokens": 8183728.0,
"step": 2590
},
{
"epoch": 0.80370942812983,
"grad_norm": 1.512048602104187,
"learning_rate": 1.1965996908809894e-05,
"loss": 3.2074,
"mean_token_accuracy": 0.40843924283981325,
"num_tokens": 8214541.0,
"step": 2600
},
{
"epoch": 0.8068006182380216,
"grad_norm": 1.5905691385269165,
"learning_rate": 1.1935085007727975e-05,
"loss": 3.3711,
"mean_token_accuracy": 0.40454950705170634,
"num_tokens": 8245713.0,
"step": 2610
},
{
"epoch": 0.8098918083462133,
"grad_norm": 1.6936638355255127,
"learning_rate": 1.1904173106646059e-05,
"loss": 3.2563,
"mean_token_accuracy": 0.40816242843866346,
"num_tokens": 8279816.0,
"step": 2620
},
{
"epoch": 0.8129829984544049,
"grad_norm": 1.3420774936676025,
"learning_rate": 1.1873261205564144e-05,
"loss": 3.226,
"mean_token_accuracy": 0.41122067645192145,
"num_tokens": 8313266.0,
"step": 2630
},
{
"epoch": 0.8160741885625966,
"grad_norm": 1.5639405250549316,
"learning_rate": 1.1842349304482226e-05,
"loss": 3.2754,
"mean_token_accuracy": 0.40760004371404646,
"num_tokens": 8344438.0,
"step": 2640
},
{
"epoch": 0.8191653786707882,
"grad_norm": 1.428361415863037,
"learning_rate": 1.181143740340031e-05,
"loss": 3.2856,
"mean_token_accuracy": 0.40872066244482996,
"num_tokens": 8375560.0,
"step": 2650
},
{
"epoch": 0.8222565687789799,
"grad_norm": 1.565278172492981,
"learning_rate": 1.1780525502318393e-05,
"loss": 3.2155,
"mean_token_accuracy": 0.40958060398697854,
"num_tokens": 8409286.0,
"step": 2660
},
{
"epoch": 0.8253477588871716,
"grad_norm": 1.5016591548919678,
"learning_rate": 1.1749613601236479e-05,
"loss": 3.2018,
"mean_token_accuracy": 0.4215152218937874,
"num_tokens": 8441409.0,
"step": 2670
},
{
"epoch": 0.8284389489953632,
"grad_norm": 1.3982653617858887,
"learning_rate": 1.171870170015456e-05,
"loss": 3.2591,
"mean_token_accuracy": 0.40580501705408095,
"num_tokens": 8473244.0,
"step": 2680
},
{
"epoch": 0.8315301391035549,
"grad_norm": 2.566338539123535,
"learning_rate": 1.1687789799072644e-05,
"loss": 3.2374,
"mean_token_accuracy": 0.41476899906992915,
"num_tokens": 8503342.0,
"step": 2690
},
{
"epoch": 0.8346213292117465,
"grad_norm": 1.7439295053482056,
"learning_rate": 1.1656877897990728e-05,
"loss": 3.2027,
"mean_token_accuracy": 0.4182712368667126,
"num_tokens": 8535344.0,
"step": 2700
},
{
"epoch": 0.8377125193199382,
"grad_norm": 3.3643314838409424,
"learning_rate": 1.162596599690881e-05,
"loss": 3.2049,
"mean_token_accuracy": 0.4144292987883091,
"num_tokens": 8566017.0,
"step": 2710
},
{
"epoch": 0.8408037094281299,
"grad_norm": 1.662192463874817,
"learning_rate": 1.1595054095826895e-05,
"loss": 3.2681,
"mean_token_accuracy": 0.3985265463590622,
"num_tokens": 8599866.0,
"step": 2720
},
{
"epoch": 0.8438948995363215,
"grad_norm": 1.715958833694458,
"learning_rate": 1.1564142194744979e-05,
"loss": 3.3068,
"mean_token_accuracy": 0.39729173853993416,
"num_tokens": 8630951.0,
"step": 2730
},
{
"epoch": 0.8469860896445132,
"grad_norm": 1.5541205406188965,
"learning_rate": 1.153323029366306e-05,
"loss": 3.2624,
"mean_token_accuracy": 0.4051229901611805,
"num_tokens": 8662309.0,
"step": 2740
},
{
"epoch": 0.8500772797527048,
"grad_norm": 1.596884846687317,
"learning_rate": 1.1502318392581144e-05,
"loss": 3.2699,
"mean_token_accuracy": 0.40654050633311273,
"num_tokens": 8697126.0,
"step": 2750
},
{
"epoch": 0.8531684698608965,
"grad_norm": 3.2481422424316406,
"learning_rate": 1.147140649149923e-05,
"loss": 3.148,
"mean_token_accuracy": 0.42360129579901695,
"num_tokens": 8725676.0,
"step": 2760
},
{
"epoch": 0.8562596599690881,
"grad_norm": 1.5681166648864746,
"learning_rate": 1.1440494590417313e-05,
"loss": 3.2573,
"mean_token_accuracy": 0.406087576597929,
"num_tokens": 8758924.0,
"step": 2770
},
{
"epoch": 0.8593508500772797,
"grad_norm": 1.4387476444244385,
"learning_rate": 1.1409582689335395e-05,
"loss": 3.2587,
"mean_token_accuracy": 0.4091624394059181,
"num_tokens": 8790688.0,
"step": 2780
},
{
"epoch": 0.8624420401854714,
"grad_norm": 2.345499038696289,
"learning_rate": 1.1378670788253478e-05,
"loss": 3.2515,
"mean_token_accuracy": 0.406622239202261,
"num_tokens": 8821597.0,
"step": 2790
},
{
"epoch": 0.865533230293663,
"grad_norm": 1.172705888748169,
"learning_rate": 1.1347758887171562e-05,
"loss": 3.1511,
"mean_token_accuracy": 0.42104474529623986,
"num_tokens": 8852087.0,
"step": 2800
},
{
"epoch": 0.8686244204018547,
"grad_norm": 2.773860216140747,
"learning_rate": 1.1316846986089644e-05,
"loss": 3.2913,
"mean_token_accuracy": 0.40134228840470315,
"num_tokens": 8885781.0,
"step": 2810
},
{
"epoch": 0.8717156105100463,
"grad_norm": 2.000077962875366,
"learning_rate": 1.1285935085007729e-05,
"loss": 3.1904,
"mean_token_accuracy": 0.4104874156415462,
"num_tokens": 8917898.0,
"step": 2820
},
{
"epoch": 0.874806800618238,
"grad_norm": 1.8527454137802124,
"learning_rate": 1.1255023183925813e-05,
"loss": 3.2801,
"mean_token_accuracy": 0.40994274243712425,
"num_tokens": 8949837.0,
"step": 2830
},
{
"epoch": 0.8778979907264297,
"grad_norm": 1.6382179260253906,
"learning_rate": 1.1224111282843895e-05,
"loss": 3.2767,
"mean_token_accuracy": 0.4028928212821484,
"num_tokens": 8984243.0,
"step": 2840
},
{
"epoch": 0.8809891808346213,
"grad_norm": 0.9455551505088806,
"learning_rate": 1.1193199381761978e-05,
"loss": 3.3033,
"mean_token_accuracy": 0.40776450037956236,
"num_tokens": 9017527.0,
"step": 2850
},
{
"epoch": 0.884080370942813,
"grad_norm": 1.8901394605636597,
"learning_rate": 1.1162287480680063e-05,
"loss": 3.1427,
"mean_token_accuracy": 0.4225546672940254,
"num_tokens": 9048607.0,
"step": 2860
},
{
"epoch": 0.8871715610510046,
"grad_norm": 3.34131121635437,
"learning_rate": 1.1131375579598147e-05,
"loss": 3.2084,
"mean_token_accuracy": 0.4111571215093136,
"num_tokens": 9083312.0,
"step": 2870
},
{
"epoch": 0.8902627511591963,
"grad_norm": 1.8717923164367676,
"learning_rate": 1.1100463678516229e-05,
"loss": 3.2198,
"mean_token_accuracy": 0.4138743795454502,
"num_tokens": 9114070.0,
"step": 2880
},
{
"epoch": 0.893353941267388,
"grad_norm": 3.7949368953704834,
"learning_rate": 1.1069551777434312e-05,
"loss": 3.2532,
"mean_token_accuracy": 0.4119734108448029,
"num_tokens": 9143686.0,
"step": 2890
},
{
"epoch": 0.8964451313755796,
"grad_norm": 1.2137411832809448,
"learning_rate": 1.1038639876352398e-05,
"loss": 3.2034,
"mean_token_accuracy": 0.41725371927022936,
"num_tokens": 9174271.0,
"step": 2900
},
{
"epoch": 0.8995363214837713,
"grad_norm": 1.1149791479110718,
"learning_rate": 1.100772797527048e-05,
"loss": 3.2835,
"mean_token_accuracy": 0.4074005588889122,
"num_tokens": 9208354.0,
"step": 2910
},
{
"epoch": 0.9026275115919629,
"grad_norm": 1.3814709186553955,
"learning_rate": 1.0976816074188563e-05,
"loss": 3.2484,
"mean_token_accuracy": 0.4074396938085556,
"num_tokens": 9238547.0,
"step": 2920
},
{
"epoch": 0.9057187017001546,
"grad_norm": 1.3796359300613403,
"learning_rate": 1.0945904173106647e-05,
"loss": 3.1762,
"mean_token_accuracy": 0.4161891110241413,
"num_tokens": 9272451.0,
"step": 2930
},
{
"epoch": 0.9088098918083463,
"grad_norm": 3.8864829540252686,
"learning_rate": 1.0914992272024732e-05,
"loss": 3.1503,
"mean_token_accuracy": 0.42186372056603433,
"num_tokens": 9303885.0,
"step": 2940
},
{
"epoch": 0.9119010819165378,
"grad_norm": 1.6369949579238892,
"learning_rate": 1.0884080370942814e-05,
"loss": 3.2353,
"mean_token_accuracy": 0.4133276604115963,
"num_tokens": 9333763.0,
"step": 2950
},
{
"epoch": 0.9149922720247295,
"grad_norm": 1.507304310798645,
"learning_rate": 1.0853168469860898e-05,
"loss": 3.2037,
"mean_token_accuracy": 0.41107733473181723,
"num_tokens": 9367354.0,
"step": 2960
},
{
"epoch": 0.9180834621329211,
"grad_norm": 4.891152858734131,
"learning_rate": 1.0822256568778981e-05,
"loss": 3.2332,
"mean_token_accuracy": 0.4125970214605331,
"num_tokens": 9397706.0,
"step": 2970
},
{
"epoch": 0.9211746522411128,
"grad_norm": 1.820520043373108,
"learning_rate": 1.0791344667697063e-05,
"loss": 3.2168,
"mean_token_accuracy": 0.40903283953666686,
"num_tokens": 9428549.0,
"step": 2980
},
{
"epoch": 0.9242658423493045,
"grad_norm": 1.2856605052947998,
"learning_rate": 1.0760432766615148e-05,
"loss": 3.2704,
"mean_token_accuracy": 0.4016518287360668,
"num_tokens": 9462364.0,
"step": 2990
},
{
"epoch": 0.9273570324574961,
"grad_norm": 1.1914364099502563,
"learning_rate": 1.0729520865533232e-05,
"loss": 3.2313,
"mean_token_accuracy": 0.4183121621608734,
"num_tokens": 9490314.0,
"step": 3000
},
{
"epoch": 0.9304482225656878,
"grad_norm": 2.964503526687622,
"learning_rate": 1.0698608964451314e-05,
"loss": 3.2828,
"mean_token_accuracy": 0.4028220146894455,
"num_tokens": 9520119.0,
"step": 3010
},
{
"epoch": 0.9335394126738794,
"grad_norm": 1.8764747381210327,
"learning_rate": 1.0667697063369397e-05,
"loss": 3.3497,
"mean_token_accuracy": 0.40479681119322775,
"num_tokens": 9549206.0,
"step": 3020
},
{
"epoch": 0.9366306027820711,
"grad_norm": 2.1000001430511475,
"learning_rate": 1.0636785162287481e-05,
"loss": 3.2438,
"mean_token_accuracy": 0.41448465660214423,
"num_tokens": 9578662.0,
"step": 3030
},
{
"epoch": 0.9397217928902627,
"grad_norm": 1.7962746620178223,
"learning_rate": 1.0605873261205566e-05,
"loss": 3.2039,
"mean_token_accuracy": 0.41432305723428725,
"num_tokens": 9611638.0,
"step": 3040
},
{
"epoch": 0.9428129829984544,
"grad_norm": 1.2367525100708008,
"learning_rate": 1.0574961360123648e-05,
"loss": 3.2756,
"mean_token_accuracy": 0.4112587310373783,
"num_tokens": 9645086.0,
"step": 3050
},
{
"epoch": 0.9459041731066461,
"grad_norm": 1.2295643091201782,
"learning_rate": 1.0544049459041732e-05,
"loss": 3.2597,
"mean_token_accuracy": 0.41084068119525907,
"num_tokens": 9676540.0,
"step": 3060
},
{
"epoch": 0.9489953632148377,
"grad_norm": 1.836288571357727,
"learning_rate": 1.0513137557959815e-05,
"loss": 3.1906,
"mean_token_accuracy": 0.4185685083270073,
"num_tokens": 9708023.0,
"step": 3070
},
{
"epoch": 0.9520865533230294,
"grad_norm": 1.4680696725845337,
"learning_rate": 1.0482225656877897e-05,
"loss": 3.2044,
"mean_token_accuracy": 0.4141043916344643,
"num_tokens": 9741153.0,
"step": 3080
},
{
"epoch": 0.955177743431221,
"grad_norm": 1.3278076648712158,
"learning_rate": 1.0451313755795983e-05,
"loss": 3.1744,
"mean_token_accuracy": 0.41462502256035805,
"num_tokens": 9772481.0,
"step": 3090
},
{
"epoch": 0.9582689335394127,
"grad_norm": 2.042060136795044,
"learning_rate": 1.0420401854714066e-05,
"loss": 3.2346,
"mean_token_accuracy": 0.4115324914455414,
"num_tokens": 9802122.0,
"step": 3100
},
{
"epoch": 0.9613601236476044,
"grad_norm": 1.6663880348205566,
"learning_rate": 1.038948995363215e-05,
"loss": 3.2437,
"mean_token_accuracy": 0.411440496891737,
"num_tokens": 9832001.0,
"step": 3110
},
{
"epoch": 0.964451313755796,
"grad_norm": 1.2543443441390991,
"learning_rate": 1.0358578052550232e-05,
"loss": 3.2323,
"mean_token_accuracy": 0.40699815154075625,
"num_tokens": 9862830.0,
"step": 3120
},
{
"epoch": 0.9675425038639877,
"grad_norm": 10.45780086517334,
"learning_rate": 1.0327666151468317e-05,
"loss": 3.2624,
"mean_token_accuracy": 0.4110853001475334,
"num_tokens": 9897540.0,
"step": 3130
},
{
"epoch": 0.9706336939721792,
"grad_norm": 1.6870477199554443,
"learning_rate": 1.02967542503864e-05,
"loss": 3.2788,
"mean_token_accuracy": 0.4078416295349598,
"num_tokens": 9929002.0,
"step": 3140
},
{
"epoch": 0.973724884080371,
"grad_norm": 1.3050179481506348,
"learning_rate": 1.0265842349304482e-05,
"loss": 3.1642,
"mean_token_accuracy": 0.41117783561348914,
"num_tokens": 9961618.0,
"step": 3150
},
{
"epoch": 0.9768160741885626,
"grad_norm": 1.570807695388794,
"learning_rate": 1.0234930448222566e-05,
"loss": 3.2458,
"mean_token_accuracy": 0.4177241921424866,
"num_tokens": 9991255.0,
"step": 3160
},
{
"epoch": 0.9799072642967542,
"grad_norm": 3.6920664310455322,
"learning_rate": 1.0204018547140651e-05,
"loss": 3.2963,
"mean_token_accuracy": 0.4077574260532856,
"num_tokens": 10024237.0,
"step": 3170
},
{
"epoch": 0.9829984544049459,
"grad_norm": 4.246991157531738,
"learning_rate": 1.0173106646058733e-05,
"loss": 3.2326,
"mean_token_accuracy": 0.41490627601742747,
"num_tokens": 10057244.0,
"step": 3180
},
{
"epoch": 0.9860896445131375,
"grad_norm": 1.615694522857666,
"learning_rate": 1.0142194744976817e-05,
"loss": 3.2114,
"mean_token_accuracy": 0.4113995648920536,
"num_tokens": 10091258.0,
"step": 3190
},
{
"epoch": 0.9891808346213292,
"grad_norm": 1.5819542407989502,
"learning_rate": 1.01112828438949e-05,
"loss": 3.0528,
"mean_token_accuracy": 0.4291278474032879,
"num_tokens": 10118722.0,
"step": 3200
},
{
"epoch": 0.9922720247295209,
"grad_norm": 4.877267360687256,
"learning_rate": 1.0080370942812986e-05,
"loss": 3.3058,
"mean_token_accuracy": 0.40104425325989723,
"num_tokens": 10149898.0,
"step": 3210
},
{
"epoch": 0.9953632148377125,
"grad_norm": 1.5499932765960693,
"learning_rate": 1.0049459041731067e-05,
"loss": 3.2483,
"mean_token_accuracy": 0.41600828766822817,
"num_tokens": 10180542.0,
"step": 3220
},
{
"epoch": 0.9984544049459042,
"grad_norm": 3.9266202449798584,
"learning_rate": 1.0018547140649151e-05,
"loss": 3.2672,
"mean_token_accuracy": 0.41103068739175797,
"num_tokens": 10211504.0,
"step": 3230
},
{
"epoch": 1.001545595054096,
"grad_norm": 2.134188413619995,
"learning_rate": 9.987635239567235e-06,
"loss": 3.1624,
"mean_token_accuracy": 0.4230015531182289,
"num_tokens": 10239942.0,
"step": 3240
},
{
"epoch": 1.0046367851622875,
"grad_norm": 1.5933483839035034,
"learning_rate": 9.956723338485318e-06,
"loss": 3.1625,
"mean_token_accuracy": 0.41645141169428823,
"num_tokens": 10274555.0,
"step": 3250
},
{
"epoch": 1.007727975270479,
"grad_norm": 1.4850564002990723,
"learning_rate": 9.925811437403402e-06,
"loss": 3.2296,
"mean_token_accuracy": 0.41107223033905027,
"num_tokens": 10308953.0,
"step": 3260
},
{
"epoch": 1.010819165378671,
"grad_norm": 1.2016632556915283,
"learning_rate": 9.894899536321485e-06,
"loss": 3.1902,
"mean_token_accuracy": 0.41538654640316963,
"num_tokens": 10343548.0,
"step": 3270
},
{
"epoch": 1.0139103554868625,
"grad_norm": 1.4952160120010376,
"learning_rate": 9.863987635239567e-06,
"loss": 3.2712,
"mean_token_accuracy": 0.4115736290812492,
"num_tokens": 10376226.0,
"step": 3280
},
{
"epoch": 1.017001545595054,
"grad_norm": 1.2473570108413696,
"learning_rate": 9.83307573415765e-06,
"loss": 3.1719,
"mean_token_accuracy": 0.4191995531320572,
"num_tokens": 10407408.0,
"step": 3290
},
{
"epoch": 1.0200927357032457,
"grad_norm": 1.5981847047805786,
"learning_rate": 9.802163833075734e-06,
"loss": 3.1756,
"mean_token_accuracy": 0.4126800112426281,
"num_tokens": 10439332.0,
"step": 3300
},
{
"epoch": 1.0231839258114375,
"grad_norm": 1.6504404544830322,
"learning_rate": 9.771251931993818e-06,
"loss": 3.2747,
"mean_token_accuracy": 0.4050762981176376,
"num_tokens": 10468931.0,
"step": 3310
},
{
"epoch": 1.026275115919629,
"grad_norm": 1.2191509008407593,
"learning_rate": 9.740340030911902e-06,
"loss": 3.1862,
"mean_token_accuracy": 0.4132268287241459,
"num_tokens": 10501965.0,
"step": 3320
},
{
"epoch": 1.0293663060278206,
"grad_norm": 1.6509348154067993,
"learning_rate": 9.709428129829985e-06,
"loss": 3.2574,
"mean_token_accuracy": 0.40723603740334513,
"num_tokens": 10532613.0,
"step": 3330
},
{
"epoch": 1.0324574961360125,
"grad_norm": 1.8046737909317017,
"learning_rate": 9.678516228748069e-06,
"loss": 3.2028,
"mean_token_accuracy": 0.4177181996405125,
"num_tokens": 10562323.0,
"step": 3340
},
{
"epoch": 1.035548686244204,
"grad_norm": 1.156565546989441,
"learning_rate": 9.647604327666152e-06,
"loss": 3.2189,
"mean_token_accuracy": 0.4168844804167747,
"num_tokens": 10592216.0,
"step": 3350
},
{
"epoch": 1.0386398763523956,
"grad_norm": 1.5646038055419922,
"learning_rate": 9.616692426584236e-06,
"loss": 3.239,
"mean_token_accuracy": 0.40751678571105004,
"num_tokens": 10624766.0,
"step": 3360
},
{
"epoch": 1.0417310664605872,
"grad_norm": 1.1149756908416748,
"learning_rate": 9.58578052550232e-06,
"loss": 3.0939,
"mean_token_accuracy": 0.42675758227705957,
"num_tokens": 10653981.0,
"step": 3370
},
{
"epoch": 1.044822256568779,
"grad_norm": 1.9262531995773315,
"learning_rate": 9.554868624420403e-06,
"loss": 3.1622,
"mean_token_accuracy": 0.41861816495656967,
"num_tokens": 10685402.0,
"step": 3380
},
{
"epoch": 1.0479134466769706,
"grad_norm": 1.5397542715072632,
"learning_rate": 9.523956723338487e-06,
"loss": 3.2075,
"mean_token_accuracy": 0.4133962944149971,
"num_tokens": 10719468.0,
"step": 3390
},
{
"epoch": 1.0510046367851622,
"grad_norm": 2.140308141708374,
"learning_rate": 9.49304482225657e-06,
"loss": 3.2143,
"mean_token_accuracy": 0.41401648372411726,
"num_tokens": 10750774.0,
"step": 3400
},
{
"epoch": 1.054095826893354,
"grad_norm": 1.6291162967681885,
"learning_rate": 9.462132921174652e-06,
"loss": 3.2932,
"mean_token_accuracy": 0.4043385870754719,
"num_tokens": 10782891.0,
"step": 3410
},
{
"epoch": 1.0571870170015456,
"grad_norm": 1.2068161964416504,
"learning_rate": 9.431221020092737e-06,
"loss": 3.2038,
"mean_token_accuracy": 0.4151684492826462,
"num_tokens": 10814759.0,
"step": 3420
},
{
"epoch": 1.0602782071097372,
"grad_norm": 1.3435413837432861,
"learning_rate": 9.40030911901082e-06,
"loss": 3.2655,
"mean_token_accuracy": 0.41177373975515363,
"num_tokens": 10846021.0,
"step": 3430
},
{
"epoch": 1.063369397217929,
"grad_norm": 1.1493477821350098,
"learning_rate": 9.369397217928905e-06,
"loss": 3.186,
"mean_token_accuracy": 0.4216080687940121,
"num_tokens": 10877787.0,
"step": 3440
},
{
"epoch": 1.0664605873261206,
"grad_norm": 1.7987961769104004,
"learning_rate": 9.338485316846986e-06,
"loss": 3.2626,
"mean_token_accuracy": 0.4110354706645012,
"num_tokens": 10906159.0,
"step": 3450
},
{
"epoch": 1.0695517774343122,
"grad_norm": 1.6236494779586792,
"learning_rate": 9.30757341576507e-06,
"loss": 3.2564,
"mean_token_accuracy": 0.4106232084333897,
"num_tokens": 10934948.0,
"step": 3460
},
{
"epoch": 1.0726429675425038,
"grad_norm": 2.3455023765563965,
"learning_rate": 9.276661514683154e-06,
"loss": 3.133,
"mean_token_accuracy": 0.4222550518810749,
"num_tokens": 10967516.0,
"step": 3470
},
{
"epoch": 1.0757341576506956,
"grad_norm": 1.278497576713562,
"learning_rate": 9.245749613601237e-06,
"loss": 3.2399,
"mean_token_accuracy": 0.4149567700922489,
"num_tokens": 10999132.0,
"step": 3480
},
{
"epoch": 1.0788253477588872,
"grad_norm": 2.0221869945526123,
"learning_rate": 9.21483771251932e-06,
"loss": 3.2568,
"mean_token_accuracy": 0.4081104606389999,
"num_tokens": 11031589.0,
"step": 3490
},
{
"epoch": 1.0819165378670788,
"grad_norm": 37.65148162841797,
"learning_rate": 9.183925811437404e-06,
"loss": 3.2473,
"mean_token_accuracy": 0.4095830604434013,
"num_tokens": 11059575.0,
"step": 3500
},
{
"epoch": 1.0850077279752706,
"grad_norm": 4.20313835144043,
"learning_rate": 9.153013910355486e-06,
"loss": 3.143,
"mean_token_accuracy": 0.426883215457201,
"num_tokens": 11091120.0,
"step": 3510
},
{
"epoch": 1.0880989180834622,
"grad_norm": 3.1230990886688232,
"learning_rate": 9.122102009273572e-06,
"loss": 3.3042,
"mean_token_accuracy": 0.4000666797161102,
"num_tokens": 11123837.0,
"step": 3520
},
{
"epoch": 1.0911901081916537,
"grad_norm": 0.9987094402313232,
"learning_rate": 9.091190108191653e-06,
"loss": 3.1809,
"mean_token_accuracy": 0.4136147178709507,
"num_tokens": 11154747.0,
"step": 3530
},
{
"epoch": 1.0942812982998453,
"grad_norm": 2.0009162425994873,
"learning_rate": 9.060278207109739e-06,
"loss": 3.2353,
"mean_token_accuracy": 0.40693147107958794,
"num_tokens": 11190794.0,
"step": 3540
},
{
"epoch": 1.0973724884080371,
"grad_norm": 2.4473884105682373,
"learning_rate": 9.02936630602782e-06,
"loss": 3.1614,
"mean_token_accuracy": 0.42429070770740507,
"num_tokens": 11219554.0,
"step": 3550
},
{
"epoch": 1.1004636785162287,
"grad_norm": 1.1736706495285034,
"learning_rate": 8.998454404945904e-06,
"loss": 3.2173,
"mean_token_accuracy": 0.41124544814229014,
"num_tokens": 11252636.0,
"step": 3560
},
{
"epoch": 1.1035548686244203,
"grad_norm": 1.163642168045044,
"learning_rate": 8.967542503863988e-06,
"loss": 3.1291,
"mean_token_accuracy": 0.4211157590150833,
"num_tokens": 11282896.0,
"step": 3570
},
{
"epoch": 1.1066460587326121,
"grad_norm": 1.362874984741211,
"learning_rate": 8.936630602782071e-06,
"loss": 3.2384,
"mean_token_accuracy": 0.41210982352495196,
"num_tokens": 11315528.0,
"step": 3580
},
{
"epoch": 1.1097372488408037,
"grad_norm": 1.640885829925537,
"learning_rate": 8.905718701700155e-06,
"loss": 3.2508,
"mean_token_accuracy": 0.40889245420694353,
"num_tokens": 11345228.0,
"step": 3590
},
{
"epoch": 1.1128284389489953,
"grad_norm": 1.803788661956787,
"learning_rate": 8.874806800618239e-06,
"loss": 3.1479,
"mean_token_accuracy": 0.42345268800854685,
"num_tokens": 11376476.0,
"step": 3600
},
{
"epoch": 1.1159196290571871,
"grad_norm": 3.247323751449585,
"learning_rate": 8.843894899536322e-06,
"loss": 3.1825,
"mean_token_accuracy": 0.4189397856593132,
"num_tokens": 11406589.0,
"step": 3610
},
{
"epoch": 1.1190108191653787,
"grad_norm": 1.748858094215393,
"learning_rate": 8.812982998454406e-06,
"loss": 3.2348,
"mean_token_accuracy": 0.41790731623768806,
"num_tokens": 11435694.0,
"step": 3620
},
{
"epoch": 1.1221020092735703,
"grad_norm": 1.1326861381530762,
"learning_rate": 8.78207109737249e-06,
"loss": 3.2358,
"mean_token_accuracy": 0.4107509456574917,
"num_tokens": 11465245.0,
"step": 3630
},
{
"epoch": 1.125193199381762,
"grad_norm": 1.4564932584762573,
"learning_rate": 8.751159196290573e-06,
"loss": 3.2268,
"mean_token_accuracy": 0.41108732894062994,
"num_tokens": 11498154.0,
"step": 3640
},
{
"epoch": 1.1282843894899537,
"grad_norm": 0.9113560914993286,
"learning_rate": 8.720247295208657e-06,
"loss": 3.1874,
"mean_token_accuracy": 0.41293532848358155,
"num_tokens": 11531872.0,
"step": 3650
},
{
"epoch": 1.1313755795981453,
"grad_norm": 1.6753915548324585,
"learning_rate": 8.68933539412674e-06,
"loss": 3.1935,
"mean_token_accuracy": 0.4139715678989887,
"num_tokens": 11565247.0,
"step": 3660
},
{
"epoch": 1.1344667697063369,
"grad_norm": 1.6150254011154175,
"learning_rate": 8.658423493044824e-06,
"loss": 3.2335,
"mean_token_accuracy": 0.41677759736776354,
"num_tokens": 11594001.0,
"step": 3670
},
{
"epoch": 1.1375579598145287,
"grad_norm": 1.0996955633163452,
"learning_rate": 8.627511591962906e-06,
"loss": 3.2328,
"mean_token_accuracy": 0.40990992560982703,
"num_tokens": 11626151.0,
"step": 3680
},
{
"epoch": 1.1406491499227203,
"grad_norm": 2.7414052486419678,
"learning_rate": 8.59659969088099e-06,
"loss": 3.2059,
"mean_token_accuracy": 0.41417448669672013,
"num_tokens": 11655585.0,
"step": 3690
},
{
"epoch": 1.1437403400309119,
"grad_norm": 1.7157678604125977,
"learning_rate": 8.565687789799073e-06,
"loss": 3.2184,
"mean_token_accuracy": 0.4183143936097622,
"num_tokens": 11688319.0,
"step": 3700
},
{
"epoch": 1.1468315301391034,
"grad_norm": 1.458961009979248,
"learning_rate": 8.534775888717158e-06,
"loss": 3.2694,
"mean_token_accuracy": 0.4085992857813835,
"num_tokens": 11718421.0,
"step": 3710
},
{
"epoch": 1.1499227202472952,
"grad_norm": 1.2802034616470337,
"learning_rate": 8.50386398763524e-06,
"loss": 3.2114,
"mean_token_accuracy": 0.41477348655462265,
"num_tokens": 11750217.0,
"step": 3720
},
{
"epoch": 1.1530139103554868,
"grad_norm": 1.8998056650161743,
"learning_rate": 8.472952086553323e-06,
"loss": 3.1341,
"mean_token_accuracy": 0.42046748399734496,
"num_tokens": 11783326.0,
"step": 3730
},
{
"epoch": 1.1561051004636784,
"grad_norm": 1.2350112199783325,
"learning_rate": 8.442040185471407e-06,
"loss": 3.2182,
"mean_token_accuracy": 0.41114275753498075,
"num_tokens": 11814954.0,
"step": 3740
},
{
"epoch": 1.1591962905718702,
"grad_norm": 1.5678590536117554,
"learning_rate": 8.41112828438949e-06,
"loss": 3.3188,
"mean_token_accuracy": 0.40804709047079085,
"num_tokens": 11848958.0,
"step": 3750
},
{
"epoch": 1.1622874806800618,
"grad_norm": 1.5195876359939575,
"learning_rate": 8.380216383307574e-06,
"loss": 3.2479,
"mean_token_accuracy": 0.40802566707134247,
"num_tokens": 11880748.0,
"step": 3760
},
{
"epoch": 1.1653786707882534,
"grad_norm": 2.050419807434082,
"learning_rate": 8.349304482225658e-06,
"loss": 3.1546,
"mean_token_accuracy": 0.42246685177087784,
"num_tokens": 11910387.0,
"step": 3770
},
{
"epoch": 1.1684698608964452,
"grad_norm": 1.7234852313995361,
"learning_rate": 8.31839258114374e-06,
"loss": 3.221,
"mean_token_accuracy": 0.4118661187589169,
"num_tokens": 11939685.0,
"step": 3780
},
{
"epoch": 1.1715610510046368,
"grad_norm": 1.4194177389144897,
"learning_rate": 8.287480680061825e-06,
"loss": 3.2305,
"mean_token_accuracy": 0.4091548278927803,
"num_tokens": 11971214.0,
"step": 3790
},
{
"epoch": 1.1746522411128284,
"grad_norm": 1.0113506317138672,
"learning_rate": 8.256568778979907e-06,
"loss": 3.2137,
"mean_token_accuracy": 0.41168191134929655,
"num_tokens": 12003031.0,
"step": 3800
},
{
"epoch": 1.1777434312210202,
"grad_norm": 2.5737476348876953,
"learning_rate": 8.225656877897992e-06,
"loss": 3.1441,
"mean_token_accuracy": 0.42300955280661584,
"num_tokens": 12033250.0,
"step": 3810
},
{
"epoch": 1.1808346213292118,
"grad_norm": 2.5177934169769287,
"learning_rate": 8.194744976816074e-06,
"loss": 3.2089,
"mean_token_accuracy": 0.41499723494052887,
"num_tokens": 12064537.0,
"step": 3820
},
{
"epoch": 1.1839258114374034,
"grad_norm": 1.1910068988800049,
"learning_rate": 8.16383307573416e-06,
"loss": 3.2179,
"mean_token_accuracy": 0.41548130139708517,
"num_tokens": 12096056.0,
"step": 3830
},
{
"epoch": 1.187017001545595,
"grad_norm": 1.5878336429595947,
"learning_rate": 8.132921174652241e-06,
"loss": 3.144,
"mean_token_accuracy": 0.40882509499788283,
"num_tokens": 12131664.0,
"step": 3840
},
{
"epoch": 1.1901081916537868,
"grad_norm": 1.3621925115585327,
"learning_rate": 8.102009273570325e-06,
"loss": 3.1344,
"mean_token_accuracy": 0.4220642536878586,
"num_tokens": 12162170.0,
"step": 3850
},
{
"epoch": 1.1931993817619784,
"grad_norm": 3.2442736625671387,
"learning_rate": 8.071097372488408e-06,
"loss": 3.1784,
"mean_token_accuracy": 0.41677999347448347,
"num_tokens": 12192536.0,
"step": 3860
},
{
"epoch": 1.19629057187017,
"grad_norm": 1.6372841596603394,
"learning_rate": 8.040185471406492e-06,
"loss": 3.1393,
"mean_token_accuracy": 0.42606005966663363,
"num_tokens": 12222253.0,
"step": 3870
},
{
"epoch": 1.1993817619783615,
"grad_norm": 6.679258823394775,
"learning_rate": 8.009273570324576e-06,
"loss": 3.1493,
"mean_token_accuracy": 0.4194886885583401,
"num_tokens": 12253757.0,
"step": 3880
},
{
"epoch": 1.2024729520865534,
"grad_norm": 2.9235403537750244,
"learning_rate": 7.97836166924266e-06,
"loss": 3.1981,
"mean_token_accuracy": 0.41685143783688544,
"num_tokens": 12285504.0,
"step": 3890
},
{
"epoch": 1.205564142194745,
"grad_norm": 3.680112838745117,
"learning_rate": 7.947449768160743e-06,
"loss": 3.2044,
"mean_token_accuracy": 0.41487403139472007,
"num_tokens": 12317649.0,
"step": 3900
},
{
"epoch": 1.2086553323029365,
"grad_norm": 4.2484283447265625,
"learning_rate": 7.916537867078826e-06,
"loss": 3.2147,
"mean_token_accuracy": 0.4154091864824295,
"num_tokens": 12349158.0,
"step": 3910
},
{
"epoch": 1.2117465224111283,
"grad_norm": 1.0454447269439697,
"learning_rate": 7.88562596599691e-06,
"loss": 3.2005,
"mean_token_accuracy": 0.4116973325610161,
"num_tokens": 12381883.0,
"step": 3920
},
{
"epoch": 1.21483771251932,
"grad_norm": 1.5970138311386108,
"learning_rate": 7.854714064914994e-06,
"loss": 3.2037,
"mean_token_accuracy": 0.41303489953279493,
"num_tokens": 12417101.0,
"step": 3930
},
{
"epoch": 1.2179289026275115,
"grad_norm": 1.4790903329849243,
"learning_rate": 7.823802163833077e-06,
"loss": 3.1911,
"mean_token_accuracy": 0.41875759288668635,
"num_tokens": 12446858.0,
"step": 3940
},
{
"epoch": 1.2210200927357033,
"grad_norm": 2.267620801925659,
"learning_rate": 7.792890262751159e-06,
"loss": 3.128,
"mean_token_accuracy": 0.4224107012152672,
"num_tokens": 12477693.0,
"step": 3950
},
{
"epoch": 1.224111282843895,
"grad_norm": 13.008003234863281,
"learning_rate": 7.761978361669244e-06,
"loss": 3.1414,
"mean_token_accuracy": 0.4242011792957783,
"num_tokens": 12508183.0,
"step": 3960
},
{
"epoch": 1.2272024729520865,
"grad_norm": 2.0274596214294434,
"learning_rate": 7.731066460587326e-06,
"loss": 3.1405,
"mean_token_accuracy": 0.41859717667102814,
"num_tokens": 12540160.0,
"step": 3970
},
{
"epoch": 1.2302936630602783,
"grad_norm": 1.9864860773086548,
"learning_rate": 7.70015455950541e-06,
"loss": 3.1902,
"mean_token_accuracy": 0.42011781483888627,
"num_tokens": 12573537.0,
"step": 3980
},
{
"epoch": 1.23338485316847,
"grad_norm": 2.4452314376831055,
"learning_rate": 7.669242658423493e-06,
"loss": 3.2631,
"mean_token_accuracy": 0.40838914439082147,
"num_tokens": 12605953.0,
"step": 3990
},
{
"epoch": 1.2364760432766615,
"grad_norm": 0.847280740737915,
"learning_rate": 7.638330757341577e-06,
"loss": 3.1697,
"mean_token_accuracy": 0.4186425693333149,
"num_tokens": 12639106.0,
"step": 4000
},
{
"epoch": 1.239567233384853,
"grad_norm": 1.1845167875289917,
"learning_rate": 7.6074188562596605e-06,
"loss": 3.2143,
"mean_token_accuracy": 0.41409589275717734,
"num_tokens": 12669010.0,
"step": 4010
},
{
"epoch": 1.242658423493045,
"grad_norm": 17.06900405883789,
"learning_rate": 7.576506955177744e-06,
"loss": 3.122,
"mean_token_accuracy": 0.4226062521338463,
"num_tokens": 12699833.0,
"step": 4020
},
{
"epoch": 1.2457496136012365,
"grad_norm": 1.4105889797210693,
"learning_rate": 7.545595054095828e-06,
"loss": 3.2082,
"mean_token_accuracy": 0.408999927341938,
"num_tokens": 12732842.0,
"step": 4030
},
{
"epoch": 1.248840803709428,
"grad_norm": 2.478212356567383,
"learning_rate": 7.51468315301391e-06,
"loss": 3.3335,
"mean_token_accuracy": 0.39411759525537493,
"num_tokens": 12764506.0,
"step": 4040
},
{
"epoch": 1.2519319938176197,
"grad_norm": 1.3297427892684937,
"learning_rate": 7.483771251931995e-06,
"loss": 3.1968,
"mean_token_accuracy": 0.409882578253746,
"num_tokens": 12797376.0,
"step": 4050
},
{
"epoch": 1.2550231839258115,
"grad_norm": 2.156684637069702,
"learning_rate": 7.4528593508500776e-06,
"loss": 3.2096,
"mean_token_accuracy": 0.4163349486887455,
"num_tokens": 12831070.0,
"step": 4060
},
{
"epoch": 1.258114374034003,
"grad_norm": 0.8902810215950012,
"learning_rate": 7.421947449768161e-06,
"loss": 3.0776,
"mean_token_accuracy": 0.42983465269207954,
"num_tokens": 12860548.0,
"step": 4070
},
{
"epoch": 1.2612055641421946,
"grad_norm": 1.5070351362228394,
"learning_rate": 7.391035548686245e-06,
"loss": 3.2621,
"mean_token_accuracy": 0.41044663786888125,
"num_tokens": 12895303.0,
"step": 4080
},
{
"epoch": 1.2642967542503865,
"grad_norm": 1.4987707138061523,
"learning_rate": 7.360123647604328e-06,
"loss": 3.27,
"mean_token_accuracy": 0.40910629704594614,
"num_tokens": 12926442.0,
"step": 4090
},
{
"epoch": 1.267387944358578,
"grad_norm": 1.4421848058700562,
"learning_rate": 7.329211746522412e-06,
"loss": 3.2123,
"mean_token_accuracy": 0.4119173936545849,
"num_tokens": 12958744.0,
"step": 4100
},
{
"epoch": 1.2704791344667696,
"grad_norm": 1.8334916830062866,
"learning_rate": 7.2982998454404955e-06,
"loss": 3.2069,
"mean_token_accuracy": 0.4157085955142975,
"num_tokens": 12990543.0,
"step": 4110
},
{
"epoch": 1.2735703245749614,
"grad_norm": 1.4132410287857056,
"learning_rate": 7.267387944358578e-06,
"loss": 3.2062,
"mean_token_accuracy": 0.4211029835045338,
"num_tokens": 13020509.0,
"step": 4120
},
{
"epoch": 1.276661514683153,
"grad_norm": 1.6009533405303955,
"learning_rate": 7.236476043276663e-06,
"loss": 3.1636,
"mean_token_accuracy": 0.41871346086263656,
"num_tokens": 13052226.0,
"step": 4130
},
{
"epoch": 1.2797527047913446,
"grad_norm": 2.6064114570617676,
"learning_rate": 7.205564142194745e-06,
"loss": 3.1769,
"mean_token_accuracy": 0.4199406482279301,
"num_tokens": 13084155.0,
"step": 4140
},
{
"epoch": 1.2828438948995364,
"grad_norm": 1.20345938205719,
"learning_rate": 7.17465224111283e-06,
"loss": 3.2276,
"mean_token_accuracy": 0.41344398483633993,
"num_tokens": 13117148.0,
"step": 4150
},
{
"epoch": 1.285935085007728,
"grad_norm": 3.0620017051696777,
"learning_rate": 7.143740340030913e-06,
"loss": 3.219,
"mean_token_accuracy": 0.4165691465139389,
"num_tokens": 13148216.0,
"step": 4160
},
{
"epoch": 1.2890262751159196,
"grad_norm": 1.6213176250457764,
"learning_rate": 7.112828438948995e-06,
"loss": 3.183,
"mean_token_accuracy": 0.41394164860248567,
"num_tokens": 13178115.0,
"step": 4170
},
{
"epoch": 1.2921174652241114,
"grad_norm": 2.890545129776001,
"learning_rate": 7.08191653786708e-06,
"loss": 3.2399,
"mean_token_accuracy": 0.41289833262562753,
"num_tokens": 13209452.0,
"step": 4180
},
{
"epoch": 1.295208655332303,
"grad_norm": 1.3374779224395752,
"learning_rate": 7.0510046367851625e-06,
"loss": 3.1967,
"mean_token_accuracy": 0.4149262882769108,
"num_tokens": 13240399.0,
"step": 4190
},
{
"epoch": 1.2982998454404946,
"grad_norm": 5.228854656219482,
"learning_rate": 7.020092735703247e-06,
"loss": 3.2154,
"mean_token_accuracy": 0.4165704995393753,
"num_tokens": 13273212.0,
"step": 4200
},
{
"epoch": 1.3013910355486862,
"grad_norm": 1.3633702993392944,
"learning_rate": 6.98918083462133e-06,
"loss": 3.1915,
"mean_token_accuracy": 0.41528667509555817,
"num_tokens": 13306852.0,
"step": 4210
},
{
"epoch": 1.3044822256568778,
"grad_norm": 1.2878068685531616,
"learning_rate": 6.958268933539414e-06,
"loss": 3.1694,
"mean_token_accuracy": 0.4169937312602997,
"num_tokens": 13340676.0,
"step": 4220
},
{
"epoch": 1.3075734157650696,
"grad_norm": 1.3162099123001099,
"learning_rate": 6.927357032457497e-06,
"loss": 3.142,
"mean_token_accuracy": 0.4265909008681774,
"num_tokens": 13371414.0,
"step": 4230
},
{
"epoch": 1.3106646058732612,
"grad_norm": 3.4883854389190674,
"learning_rate": 6.8964451313755796e-06,
"loss": 3.1395,
"mean_token_accuracy": 0.42550636306405065,
"num_tokens": 13403328.0,
"step": 4240
},
{
"epoch": 1.3137557959814528,
"grad_norm": 2.3180267810821533,
"learning_rate": 6.865533230293664e-06,
"loss": 3.2024,
"mean_token_accuracy": 0.4202399365603924,
"num_tokens": 13434597.0,
"step": 4250
},
{
"epoch": 1.3168469860896446,
"grad_norm": 1.3776673078536987,
"learning_rate": 6.834621329211747e-06,
"loss": 3.1493,
"mean_token_accuracy": 0.4209834337234497,
"num_tokens": 13463779.0,
"step": 4260
},
{
"epoch": 1.3199381761978362,
"grad_norm": 1.028826355934143,
"learning_rate": 6.803709428129831e-06,
"loss": 3.2035,
"mean_token_accuracy": 0.41408500224351885,
"num_tokens": 13494776.0,
"step": 4270
},
{
"epoch": 1.3230293663060277,
"grad_norm": 1.686012625694275,
"learning_rate": 6.772797527047914e-06,
"loss": 3.2745,
"mean_token_accuracy": 0.41270035356283186,
"num_tokens": 13529851.0,
"step": 4280
},
{
"epoch": 1.3261205564142196,
"grad_norm": 1.3231185674667358,
"learning_rate": 6.741885625965997e-06,
"loss": 3.2056,
"mean_token_accuracy": 0.4130039505660534,
"num_tokens": 13564424.0,
"step": 4290
},
{
"epoch": 1.3292117465224111,
"grad_norm": 1.856342077255249,
"learning_rate": 6.710973724884081e-06,
"loss": 3.1407,
"mean_token_accuracy": 0.4156997807323933,
"num_tokens": 13595883.0,
"step": 4300
},
{
"epoch": 1.3323029366306027,
"grad_norm": 1.5125768184661865,
"learning_rate": 6.680061823802164e-06,
"loss": 3.1297,
"mean_token_accuracy": 0.4260141022503376,
"num_tokens": 13628882.0,
"step": 4310
},
{
"epoch": 1.3353941267387945,
"grad_norm": 3.0248773097991943,
"learning_rate": 6.649149922720248e-06,
"loss": 3.2258,
"mean_token_accuracy": 0.4115126602351665,
"num_tokens": 13660324.0,
"step": 4320
},
{
"epoch": 1.3384853168469861,
"grad_norm": 2.414133310317993,
"learning_rate": 6.618238021638331e-06,
"loss": 3.1616,
"mean_token_accuracy": 0.4194211043417454,
"num_tokens": 13692530.0,
"step": 4330
},
{
"epoch": 1.3415765069551777,
"grad_norm": 1.5425348281860352,
"learning_rate": 6.5873261205564146e-06,
"loss": 3.2394,
"mean_token_accuracy": 0.4154033727943897,
"num_tokens": 13725306.0,
"step": 4340
},
{
"epoch": 1.3446676970633695,
"grad_norm": 1.8113696575164795,
"learning_rate": 6.556414219474498e-06,
"loss": 3.2223,
"mean_token_accuracy": 0.4164779372513294,
"num_tokens": 13758945.0,
"step": 4350
},
{
"epoch": 1.3477588871715611,
"grad_norm": 1.9818004369735718,
"learning_rate": 6.525502318392582e-06,
"loss": 3.1813,
"mean_token_accuracy": 0.41521124318242075,
"num_tokens": 13791607.0,
"step": 4360
},
{
"epoch": 1.3508500772797527,
"grad_norm": 1.7219854593276978,
"learning_rate": 6.494590417310665e-06,
"loss": 3.1954,
"mean_token_accuracy": 0.40949844419956205,
"num_tokens": 13822651.0,
"step": 4370
},
{
"epoch": 1.3539412673879443,
"grad_norm": 2.069145441055298,
"learning_rate": 6.463678516228749e-06,
"loss": 3.1196,
"mean_token_accuracy": 0.42854725793004034,
"num_tokens": 13852169.0,
"step": 4380
},
{
"epoch": 1.3570324574961359,
"grad_norm": 1.8879189491271973,
"learning_rate": 6.432766615146832e-06,
"loss": 3.1188,
"mean_token_accuracy": 0.4243581973016262,
"num_tokens": 13883678.0,
"step": 4390
},
{
"epoch": 1.3601236476043277,
"grad_norm": 1.1765724420547485,
"learning_rate": 6.401854714064915e-06,
"loss": 3.1485,
"mean_token_accuracy": 0.41584895700216296,
"num_tokens": 13918991.0,
"step": 4400
},
{
"epoch": 1.3632148377125193,
"grad_norm": 1.3428053855895996,
"learning_rate": 6.370942812982999e-06,
"loss": 3.1697,
"mean_token_accuracy": 0.41857780367136,
"num_tokens": 13953094.0,
"step": 4410
},
{
"epoch": 1.3663060278207109,
"grad_norm": 1.49298894405365,
"learning_rate": 6.340030911901082e-06,
"loss": 3.1357,
"mean_token_accuracy": 0.4209234081208706,
"num_tokens": 13982945.0,
"step": 4420
},
{
"epoch": 1.3693972179289027,
"grad_norm": 1.0709565877914429,
"learning_rate": 6.309119010819166e-06,
"loss": 3.2582,
"mean_token_accuracy": 0.4108918808400631,
"num_tokens": 14014071.0,
"step": 4430
},
{
"epoch": 1.3724884080370943,
"grad_norm": 2.4061436653137207,
"learning_rate": 6.27820710973725e-06,
"loss": 3.1506,
"mean_token_accuracy": 0.4154247589409351,
"num_tokens": 14042486.0,
"step": 4440
},
{
"epoch": 1.3755795981452859,
"grad_norm": 1.2331550121307373,
"learning_rate": 6.247295208655333e-06,
"loss": 3.1823,
"mean_token_accuracy": 0.4121369063854218,
"num_tokens": 14073272.0,
"step": 4450
},
{
"epoch": 1.3786707882534777,
"grad_norm": 1.5143426656723022,
"learning_rate": 6.216383307573416e-06,
"loss": 3.1427,
"mean_token_accuracy": 0.41784567162394526,
"num_tokens": 14103341.0,
"step": 4460
},
{
"epoch": 1.3817619783616693,
"grad_norm": 1.448833703994751,
"learning_rate": 6.1854714064915e-06,
"loss": 3.2622,
"mean_token_accuracy": 0.40576266273856165,
"num_tokens": 14134972.0,
"step": 4470
},
{
"epoch": 1.3848531684698608,
"grad_norm": 0.988325834274292,
"learning_rate": 6.154559505409583e-06,
"loss": 3.1114,
"mean_token_accuracy": 0.428489201515913,
"num_tokens": 14166647.0,
"step": 4480
},
{
"epoch": 1.3879443585780527,
"grad_norm": 1.2479734420776367,
"learning_rate": 6.1236476043276675e-06,
"loss": 3.1902,
"mean_token_accuracy": 0.4144682168960571,
"num_tokens": 14199779.0,
"step": 4490
},
{
"epoch": 1.3910355486862442,
"grad_norm": 4.43798303604126,
"learning_rate": 6.09273570324575e-06,
"loss": 3.1645,
"mean_token_accuracy": 0.4205217458307743,
"num_tokens": 14229188.0,
"step": 4500
},
{
"epoch": 1.3941267387944358,
"grad_norm": 2.4214413166046143,
"learning_rate": 6.061823802163833e-06,
"loss": 3.2059,
"mean_token_accuracy": 0.4073712095618248,
"num_tokens": 14263707.0,
"step": 4510
},
{
"epoch": 1.3972179289026276,
"grad_norm": 1.08336341381073,
"learning_rate": 6.030911901081917e-06,
"loss": 3.1831,
"mean_token_accuracy": 0.42334684580564497,
"num_tokens": 14294877.0,
"step": 4520
},
{
"epoch": 1.4003091190108192,
"grad_norm": 1.0653190612792969,
"learning_rate": 6e-06,
"loss": 3.1047,
"mean_token_accuracy": 0.42504297345876696,
"num_tokens": 14325959.0,
"step": 4530
},
{
"epoch": 1.4034003091190108,
"grad_norm": 1.7035560607910156,
"learning_rate": 5.969088098918085e-06,
"loss": 3.3124,
"mean_token_accuracy": 0.4030896335840225,
"num_tokens": 14358216.0,
"step": 4540
},
{
"epoch": 1.4064914992272024,
"grad_norm": 1.1060764789581299,
"learning_rate": 5.938176197836167e-06,
"loss": 3.2152,
"mean_token_accuracy": 0.4193869881331921,
"num_tokens": 14391366.0,
"step": 4550
},
{
"epoch": 1.409582689335394,
"grad_norm": 1.7701034545898438,
"learning_rate": 5.90726429675425e-06,
"loss": 3.2466,
"mean_token_accuracy": 0.4122362986207008,
"num_tokens": 14421925.0,
"step": 4560
},
{
"epoch": 1.4126738794435858,
"grad_norm": 1.8544549942016602,
"learning_rate": 5.8763523956723345e-06,
"loss": 3.1179,
"mean_token_accuracy": 0.42258076667785643,
"num_tokens": 14451856.0,
"step": 4570
},
{
"epoch": 1.4157650695517774,
"grad_norm": 2.212045431137085,
"learning_rate": 5.845440494590417e-06,
"loss": 3.364,
"mean_token_accuracy": 0.4010948471724987,
"num_tokens": 14483118.0,
"step": 4580
},
{
"epoch": 1.418856259659969,
"grad_norm": 1.5026146173477173,
"learning_rate": 5.814528593508502e-06,
"loss": 3.19,
"mean_token_accuracy": 0.4188863389194012,
"num_tokens": 14513476.0,
"step": 4590
},
{
"epoch": 1.4219474497681608,
"grad_norm": 1.5973678827285767,
"learning_rate": 5.783616692426584e-06,
"loss": 3.0433,
"mean_token_accuracy": 0.44073015078902245,
"num_tokens": 14539496.0,
"step": 4600
},
{
"epoch": 1.4250386398763524,
"grad_norm": 1.7628804445266724,
"learning_rate": 5.752704791344668e-06,
"loss": 3.2062,
"mean_token_accuracy": 0.4122942849993706,
"num_tokens": 14569765.0,
"step": 4610
},
{
"epoch": 1.428129829984544,
"grad_norm": 1.5680344104766846,
"learning_rate": 5.7217928902627516e-06,
"loss": 3.1586,
"mean_token_accuracy": 0.42099211886525156,
"num_tokens": 14600216.0,
"step": 4620
},
{
"epoch": 1.4312210200927358,
"grad_norm": 1.244256615638733,
"learning_rate": 5.690880989180835e-06,
"loss": 3.1328,
"mean_token_accuracy": 0.42461210340261457,
"num_tokens": 14634376.0,
"step": 4630
},
{
"epoch": 1.4343122102009274,
"grad_norm": 1.2207525968551636,
"learning_rate": 5.659969088098919e-06,
"loss": 3.1971,
"mean_token_accuracy": 0.4105593167245388,
"num_tokens": 14669227.0,
"step": 4640
},
{
"epoch": 1.437403400309119,
"grad_norm": 1.2037540674209595,
"learning_rate": 5.6290571870170015e-06,
"loss": 3.1483,
"mean_token_accuracy": 0.4223948784172535,
"num_tokens": 14699394.0,
"step": 4650
},
{
"epoch": 1.4404945904173108,
"grad_norm": 1.5292669534683228,
"learning_rate": 5.598145285935086e-06,
"loss": 3.2663,
"mean_token_accuracy": 0.4052347682416439,
"num_tokens": 14732089.0,
"step": 4660
},
{
"epoch": 1.4435857805255023,
"grad_norm": 1.7726776599884033,
"learning_rate": 5.567233384853169e-06,
"loss": 3.2756,
"mean_token_accuracy": 0.4116224706172943,
"num_tokens": 14761533.0,
"step": 4670
},
{
"epoch": 1.446676970633694,
"grad_norm": 1.3190699815750122,
"learning_rate": 5.536321483771252e-06,
"loss": 3.0779,
"mean_token_accuracy": 0.43598859906196596,
"num_tokens": 14789931.0,
"step": 4680
},
{
"epoch": 1.4497681607418857,
"grad_norm": 1.1200242042541504,
"learning_rate": 5.505409582689336e-06,
"loss": 3.1462,
"mean_token_accuracy": 0.42209191918373107,
"num_tokens": 14819843.0,
"step": 4690
},
{
"epoch": 1.4528593508500773,
"grad_norm": 1.8637281656265259,
"learning_rate": 5.474497681607419e-06,
"loss": 3.2485,
"mean_token_accuracy": 0.40834289118647576,
"num_tokens": 14852186.0,
"step": 4700
},
{
"epoch": 1.455950540958269,
"grad_norm": 1.0149264335632324,
"learning_rate": 5.443585780525503e-06,
"loss": 3.1762,
"mean_token_accuracy": 0.4144597060978413,
"num_tokens": 14885812.0,
"step": 4710
},
{
"epoch": 1.4590417310664605,
"grad_norm": 1.3329026699066162,
"learning_rate": 5.412673879443587e-06,
"loss": 3.1562,
"mean_token_accuracy": 0.42381680980324743,
"num_tokens": 14915638.0,
"step": 4720
},
{
"epoch": 1.4621329211746523,
"grad_norm": 1.2801613807678223,
"learning_rate": 5.381761978361669e-06,
"loss": 3.2291,
"mean_token_accuracy": 0.4082433968782425,
"num_tokens": 14948608.0,
"step": 4730
},
{
"epoch": 1.465224111282844,
"grad_norm": 1.522076964378357,
"learning_rate": 5.350850077279754e-06,
"loss": 3.1562,
"mean_token_accuracy": 0.42298023998737333,
"num_tokens": 14979493.0,
"step": 4740
},
{
"epoch": 1.4683153013910355,
"grad_norm": 1.266901969909668,
"learning_rate": 5.3199381761978365e-06,
"loss": 3.0935,
"mean_token_accuracy": 0.42193435728549955,
"num_tokens": 15009695.0,
"step": 4750
},
{
"epoch": 1.471406491499227,
"grad_norm": 2.0726027488708496,
"learning_rate": 5.28902627511592e-06,
"loss": 3.1655,
"mean_token_accuracy": 0.41975434496998787,
"num_tokens": 15042123.0,
"step": 4760
},
{
"epoch": 1.474497681607419,
"grad_norm": 1.504747986793518,
"learning_rate": 5.258114374034004e-06,
"loss": 3.1425,
"mean_token_accuracy": 0.42110041007399557,
"num_tokens": 15074722.0,
"step": 4770
},
{
"epoch": 1.4775888717156105,
"grad_norm": 1.510971188545227,
"learning_rate": 5.227202472952086e-06,
"loss": 3.1586,
"mean_token_accuracy": 0.4183110870420933,
"num_tokens": 15103293.0,
"step": 4780
},
{
"epoch": 1.480680061823802,
"grad_norm": 3.7153401374816895,
"learning_rate": 5.196290571870171e-06,
"loss": 3.1719,
"mean_token_accuracy": 0.4164193421602249,
"num_tokens": 15136915.0,
"step": 4790
},
{
"epoch": 1.4837712519319939,
"grad_norm": 1.9305976629257202,
"learning_rate": 5.1653786707882536e-06,
"loss": 3.1606,
"mean_token_accuracy": 0.4086958207190037,
"num_tokens": 15170496.0,
"step": 4800
},
{
"epoch": 1.4868624420401855,
"grad_norm": 1.2468324899673462,
"learning_rate": 5.134466769706338e-06,
"loss": 3.157,
"mean_token_accuracy": 0.4179227910935879,
"num_tokens": 15201560.0,
"step": 4810
},
{
"epoch": 1.489953632148377,
"grad_norm": 1.8711183071136475,
"learning_rate": 5.103554868624421e-06,
"loss": 3.1446,
"mean_token_accuracy": 0.42116508409380915,
"num_tokens": 15233195.0,
"step": 4820
},
{
"epoch": 1.4930448222565689,
"grad_norm": 1.30099618434906,
"learning_rate": 5.0726429675425035e-06,
"loss": 3.2033,
"mean_token_accuracy": 0.4153247632086277,
"num_tokens": 15266312.0,
"step": 4830
},
{
"epoch": 1.4961360123647605,
"grad_norm": 2.5389606952667236,
"learning_rate": 5.041731066460588e-06,
"loss": 3.1667,
"mean_token_accuracy": 0.4175982415676117,
"num_tokens": 15296476.0,
"step": 4840
},
{
"epoch": 1.499227202472952,
"grad_norm": 1.4658409357070923,
"learning_rate": 5.010819165378671e-06,
"loss": 3.1887,
"mean_token_accuracy": 0.41349576637148855,
"num_tokens": 15329787.0,
"step": 4850
},
{
"epoch": 1.5023183925811439,
"grad_norm": 1.282163143157959,
"learning_rate": 4.979907264296754e-06,
"loss": 3.1846,
"mean_token_accuracy": 0.4133185692131519,
"num_tokens": 15364093.0,
"step": 4860
},
{
"epoch": 1.5054095826893354,
"grad_norm": 1.9515365362167358,
"learning_rate": 4.948995363214838e-06,
"loss": 3.1212,
"mean_token_accuracy": 0.424532825499773,
"num_tokens": 15394102.0,
"step": 4870
},
{
"epoch": 1.508500772797527,
"grad_norm": 1.0463203191757202,
"learning_rate": 4.918083462132921e-06,
"loss": 3.1278,
"mean_token_accuracy": 0.42214716374874117,
"num_tokens": 15425672.0,
"step": 4880
},
{
"epoch": 1.5115919629057188,
"grad_norm": 2.7366933822631836,
"learning_rate": 4.887171561051005e-06,
"loss": 3.1837,
"mean_token_accuracy": 0.41872839331626893,
"num_tokens": 15457839.0,
"step": 4890
},
{
"epoch": 1.5146831530139102,
"grad_norm": 1.0603899955749512,
"learning_rate": 4.8562596599690886e-06,
"loss": 3.1954,
"mean_token_accuracy": 0.41243630051612856,
"num_tokens": 15492367.0,
"step": 4900
},
{
"epoch": 1.517774343122102,
"grad_norm": 1.2490918636322021,
"learning_rate": 4.825347758887172e-06,
"loss": 3.1728,
"mean_token_accuracy": 0.4147431656718254,
"num_tokens": 15524871.0,
"step": 4910
},
{
"epoch": 1.5208655332302936,
"grad_norm": 1.474180817604065,
"learning_rate": 4.794435857805255e-06,
"loss": 3.0814,
"mean_token_accuracy": 0.4267166741192341,
"num_tokens": 15556521.0,
"step": 4920
},
{
"epoch": 1.5239567233384852,
"grad_norm": 0.972766637802124,
"learning_rate": 4.7635239567233385e-06,
"loss": 3.0783,
"mean_token_accuracy": 0.42358799651265144,
"num_tokens": 15592437.0,
"step": 4930
},
{
"epoch": 1.527047913446677,
"grad_norm": 1.0824300050735474,
"learning_rate": 4.732612055641422e-06,
"loss": 3.2082,
"mean_token_accuracy": 0.4090407736599445,
"num_tokens": 15625545.0,
"step": 4940
},
{
"epoch": 1.5301391035548686,
"grad_norm": 1.0225831270217896,
"learning_rate": 4.701700154559506e-06,
"loss": 3.1231,
"mean_token_accuracy": 0.423577306419611,
"num_tokens": 15657233.0,
"step": 4950
},
{
"epoch": 1.5332302936630602,
"grad_norm": 1.2960033416748047,
"learning_rate": 4.670788253477589e-06,
"loss": 3.2722,
"mean_token_accuracy": 0.4066340148448944,
"num_tokens": 15689926.0,
"step": 4960
},
{
"epoch": 1.536321483771252,
"grad_norm": 13.400914192199707,
"learning_rate": 4.639876352395673e-06,
"loss": 3.1884,
"mean_token_accuracy": 0.41838386580348014,
"num_tokens": 15721424.0,
"step": 4970
},
{
"epoch": 1.5394126738794436,
"grad_norm": 6.94038200378418,
"learning_rate": 4.608964451313756e-06,
"loss": 3.1619,
"mean_token_accuracy": 0.41957569122314453,
"num_tokens": 15749894.0,
"step": 4980
},
{
"epoch": 1.5425038639876352,
"grad_norm": 2.4333202838897705,
"learning_rate": 4.57805255023184e-06,
"loss": 3.1165,
"mean_token_accuracy": 0.42476601898670197,
"num_tokens": 15782492.0,
"step": 4990
},
{
"epoch": 1.545595054095827,
"grad_norm": 1.1471242904663086,
"learning_rate": 4.547140649149923e-06,
"loss": 3.2202,
"mean_token_accuracy": 0.411971789598465,
"num_tokens": 15816554.0,
"step": 5000
}
],
"logging_steps": 10,
"max_steps": 6470,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 9099448107669504.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}