Qwen-code-7B-SFT-100k-v2-cots / trainer_state.json
zhuangxialie
Model save
78c7f5f verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.992481203007519,
"eval_steps": 500,
"global_step": 1860,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.010741138560687433,
"grad_norm": 2.6824158480436027,
"learning_rate": 1.3440860215053765e-06,
"loss": 0.8294,
"mean_token_accuracy": 0.8010891914367676,
"step": 5
},
{
"epoch": 0.021482277121374866,
"grad_norm": 1.0834186154450132,
"learning_rate": 2.688172043010753e-06,
"loss": 0.7976,
"mean_token_accuracy": 0.8042729198932648,
"step": 10
},
{
"epoch": 0.0322234156820623,
"grad_norm": 0.9912101287518572,
"learning_rate": 4.032258064516129e-06,
"loss": 0.7318,
"mean_token_accuracy": 0.8116350173950195,
"step": 15
},
{
"epoch": 0.04296455424274973,
"grad_norm": 0.6161547535500218,
"learning_rate": 5.376344086021506e-06,
"loss": 0.6796,
"mean_token_accuracy": 0.8214974880218506,
"step": 20
},
{
"epoch": 0.05370569280343716,
"grad_norm": 0.4711983922639431,
"learning_rate": 6.720430107526882e-06,
"loss": 0.6403,
"mean_token_accuracy": 0.8289329469203949,
"step": 25
},
{
"epoch": 0.0644468313641246,
"grad_norm": 0.3514340436445771,
"learning_rate": 8.064516129032258e-06,
"loss": 0.6101,
"mean_token_accuracy": 0.8344561219215393,
"step": 30
},
{
"epoch": 0.07518796992481203,
"grad_norm": 0.2900558861500113,
"learning_rate": 9.408602150537635e-06,
"loss": 0.5849,
"mean_token_accuracy": 0.8396502792835235,
"step": 35
},
{
"epoch": 0.08592910848549946,
"grad_norm": 0.2722947727971047,
"learning_rate": 1.0752688172043012e-05,
"loss": 0.5701,
"mean_token_accuracy": 0.8420377433300018,
"step": 40
},
{
"epoch": 0.0966702470461869,
"grad_norm": 0.25248070544882645,
"learning_rate": 1.2096774193548388e-05,
"loss": 0.561,
"mean_token_accuracy": 0.8443691551685333,
"step": 45
},
{
"epoch": 0.10741138560687433,
"grad_norm": 0.2504332745775819,
"learning_rate": 1.3440860215053763e-05,
"loss": 0.5601,
"mean_token_accuracy": 0.8441641569137573,
"step": 50
},
{
"epoch": 0.11815252416756176,
"grad_norm": 0.21685484456472007,
"learning_rate": 1.4784946236559142e-05,
"loss": 0.5455,
"mean_token_accuracy": 0.8471231937408448,
"step": 55
},
{
"epoch": 0.1288936627282492,
"grad_norm": 0.23513981149675298,
"learning_rate": 1.6129032258064517e-05,
"loss": 0.5486,
"mean_token_accuracy": 0.8462919056415558,
"step": 60
},
{
"epoch": 0.13963480128893663,
"grad_norm": 0.21971215723488632,
"learning_rate": 1.7473118279569895e-05,
"loss": 0.5372,
"mean_token_accuracy": 0.8488749146461487,
"step": 65
},
{
"epoch": 0.15037593984962405,
"grad_norm": 0.22582010917696982,
"learning_rate": 1.881720430107527e-05,
"loss": 0.5341,
"mean_token_accuracy": 0.8489724159240722,
"step": 70
},
{
"epoch": 0.1611170784103115,
"grad_norm": 0.2505238494065726,
"learning_rate": 2.0161290322580645e-05,
"loss": 0.5288,
"mean_token_accuracy": 0.8500843226909638,
"step": 75
},
{
"epoch": 0.17185821697099893,
"grad_norm": 0.2485546682065235,
"learning_rate": 2.1505376344086024e-05,
"loss": 0.5265,
"mean_token_accuracy": 0.8504622042179107,
"step": 80
},
{
"epoch": 0.18259935553168635,
"grad_norm": 0.25134861732181085,
"learning_rate": 2.28494623655914e-05,
"loss": 0.5245,
"mean_token_accuracy": 0.8512703776359558,
"step": 85
},
{
"epoch": 0.1933404940923738,
"grad_norm": 0.2607421207193637,
"learning_rate": 2.4193548387096777e-05,
"loss": 0.5225,
"mean_token_accuracy": 0.8512581944465637,
"step": 90
},
{
"epoch": 0.20408163265306123,
"grad_norm": 0.2571937237076843,
"learning_rate": 2.5537634408602152e-05,
"loss": 0.5169,
"mean_token_accuracy": 0.8526618123054505,
"step": 95
},
{
"epoch": 0.21482277121374865,
"grad_norm": 0.2559454741361629,
"learning_rate": 2.6881720430107527e-05,
"loss": 0.5087,
"mean_token_accuracy": 0.8544329702854156,
"step": 100
},
{
"epoch": 0.22556390977443608,
"grad_norm": 0.25657620243689094,
"learning_rate": 2.822580645161291e-05,
"loss": 0.5069,
"mean_token_accuracy": 0.8545464932918548,
"step": 105
},
{
"epoch": 0.23630504833512353,
"grad_norm": 0.3084326429216429,
"learning_rate": 2.9569892473118284e-05,
"loss": 0.5109,
"mean_token_accuracy": 0.8538104116916656,
"step": 110
},
{
"epoch": 0.24704618689581095,
"grad_norm": 0.2964885334930525,
"learning_rate": 3.091397849462366e-05,
"loss": 0.5026,
"mean_token_accuracy": 0.8555706679821015,
"step": 115
},
{
"epoch": 0.2577873254564984,
"grad_norm": 0.2640055744535602,
"learning_rate": 3.2258064516129034e-05,
"loss": 0.4952,
"mean_token_accuracy": 0.8576966226100922,
"step": 120
},
{
"epoch": 0.26852846401718583,
"grad_norm": 0.28061492437295604,
"learning_rate": 3.360215053763441e-05,
"loss": 0.4983,
"mean_token_accuracy": 0.8568866074085235,
"step": 125
},
{
"epoch": 0.27926960257787325,
"grad_norm": 0.3222080670739919,
"learning_rate": 3.494623655913979e-05,
"loss": 0.4919,
"mean_token_accuracy": 0.8582496762275695,
"step": 130
},
{
"epoch": 0.2900107411385607,
"grad_norm": 0.3018861867966521,
"learning_rate": 3.6290322580645165e-05,
"loss": 0.4921,
"mean_token_accuracy": 0.858267605304718,
"step": 135
},
{
"epoch": 0.3007518796992481,
"grad_norm": 0.27298497353963225,
"learning_rate": 3.763440860215054e-05,
"loss": 0.4897,
"mean_token_accuracy": 0.858799421787262,
"step": 140
},
{
"epoch": 0.31149301825993553,
"grad_norm": 0.29189277480966186,
"learning_rate": 3.8978494623655915e-05,
"loss": 0.4831,
"mean_token_accuracy": 0.8604558348655701,
"step": 145
},
{
"epoch": 0.322234156820623,
"grad_norm": 0.28012276855965057,
"learning_rate": 4.032258064516129e-05,
"loss": 0.4834,
"mean_token_accuracy": 0.8607946753501892,
"step": 150
},
{
"epoch": 0.33297529538131043,
"grad_norm": 0.2822021421564993,
"learning_rate": 4.166666666666667e-05,
"loss": 0.4822,
"mean_token_accuracy": 0.8607180714607239,
"step": 155
},
{
"epoch": 0.34371643394199786,
"grad_norm": 0.2669043120039336,
"learning_rate": 4.301075268817205e-05,
"loss": 0.4709,
"mean_token_accuracy": 0.8635617375373841,
"step": 160
},
{
"epoch": 0.3544575725026853,
"grad_norm": 0.26430063130872034,
"learning_rate": 4.435483870967742e-05,
"loss": 0.4759,
"mean_token_accuracy": 0.8624868154525757,
"step": 165
},
{
"epoch": 0.3651987110633727,
"grad_norm": 0.2768300795347462,
"learning_rate": 4.56989247311828e-05,
"loss": 0.4698,
"mean_token_accuracy": 0.863774424791336,
"step": 170
},
{
"epoch": 0.37593984962406013,
"grad_norm": 0.27300710251352905,
"learning_rate": 4.704301075268818e-05,
"loss": 0.4688,
"mean_token_accuracy": 0.8640853643417359,
"step": 175
},
{
"epoch": 0.3866809881847476,
"grad_norm": 0.28130219154214986,
"learning_rate": 4.8387096774193554e-05,
"loss": 0.4616,
"mean_token_accuracy": 0.8659515857696534,
"step": 180
},
{
"epoch": 0.39742212674543503,
"grad_norm": 0.28040903261236555,
"learning_rate": 4.973118279569893e-05,
"loss": 0.4652,
"mean_token_accuracy": 0.8656746566295623,
"step": 185
},
{
"epoch": 0.40816326530612246,
"grad_norm": 0.32637783754316196,
"learning_rate": 4.999936604372673e-05,
"loss": 0.4584,
"mean_token_accuracy": 0.8662971913814544,
"step": 190
},
{
"epoch": 0.4189044038668099,
"grad_norm": 0.3235247316768069,
"learning_rate": 4.9996790657593474e-05,
"loss": 0.4652,
"mean_token_accuracy": 0.865262484550476,
"step": 195
},
{
"epoch": 0.4296455424274973,
"grad_norm": 0.2756975255703871,
"learning_rate": 4.999223444591954e-05,
"loss": 0.4533,
"mean_token_accuracy": 0.8687061607837677,
"step": 200
},
{
"epoch": 0.44038668098818473,
"grad_norm": 0.26466440633632593,
"learning_rate": 4.998569780987594e-05,
"loss": 0.4521,
"mean_token_accuracy": 0.8684524893760681,
"step": 205
},
{
"epoch": 0.45112781954887216,
"grad_norm": 0.25138863961089425,
"learning_rate": 4.997718132500857e-05,
"loss": 0.4456,
"mean_token_accuracy": 0.8701819539070129,
"step": 210
},
{
"epoch": 0.46186895810955964,
"grad_norm": 0.3025611470224811,
"learning_rate": 4.9966685741187544e-05,
"loss": 0.447,
"mean_token_accuracy": 0.8699068784713745,
"step": 215
},
{
"epoch": 0.47261009667024706,
"grad_norm": 0.24615962175136596,
"learning_rate": 4.995421198254114e-05,
"loss": 0.4445,
"mean_token_accuracy": 0.8706246316432953,
"step": 220
},
{
"epoch": 0.4833512352309345,
"grad_norm": 0.23780094613136366,
"learning_rate": 4.9939761147374455e-05,
"loss": 0.444,
"mean_token_accuracy": 0.8709352612495422,
"step": 225
},
{
"epoch": 0.4940923737916219,
"grad_norm": 0.26418243428675386,
"learning_rate": 4.992333450807268e-05,
"loss": 0.4428,
"mean_token_accuracy": 0.8712534010410309,
"step": 230
},
{
"epoch": 0.5048335123523093,
"grad_norm": 0.2452687330812135,
"learning_rate": 4.990493351098908e-05,
"loss": 0.4375,
"mean_token_accuracy": 0.8728318750858307,
"step": 235
},
{
"epoch": 0.5155746509129968,
"grad_norm": 0.2688160648750715,
"learning_rate": 4.9884559776317644e-05,
"loss": 0.4353,
"mean_token_accuracy": 0.8730437099933624,
"step": 240
},
{
"epoch": 0.5263157894736842,
"grad_norm": 0.25960118051112435,
"learning_rate": 4.986221509795043e-05,
"loss": 0.4317,
"mean_token_accuracy": 0.8739780306816101,
"step": 245
},
{
"epoch": 0.5370569280343717,
"grad_norm": 0.23341024093650933,
"learning_rate": 4.98379014433196e-05,
"loss": 0.4352,
"mean_token_accuracy": 0.8733076274394989,
"step": 250
},
{
"epoch": 0.547798066595059,
"grad_norm": 0.25741008352215955,
"learning_rate": 4.981162095322421e-05,
"loss": 0.4324,
"mean_token_accuracy": 0.8738310694694519,
"step": 255
},
{
"epoch": 0.5585392051557465,
"grad_norm": 0.23274342659284017,
"learning_rate": 4.9783375941641696e-05,
"loss": 0.4321,
"mean_token_accuracy": 0.8742413520812988,
"step": 260
},
{
"epoch": 0.569280343716434,
"grad_norm": 0.2451922230157493,
"learning_rate": 4.9753168895524136e-05,
"loss": 0.4202,
"mean_token_accuracy": 0.8772394955158234,
"step": 265
},
{
"epoch": 0.5800214822771214,
"grad_norm": 0.2681975618828881,
"learning_rate": 4.9721002474579285e-05,
"loss": 0.4265,
"mean_token_accuracy": 0.8758379638195037,
"step": 270
},
{
"epoch": 0.5907626208378088,
"grad_norm": 0.22840035689897775,
"learning_rate": 4.968687951103638e-05,
"loss": 0.4209,
"mean_token_accuracy": 0.8775071561336517,
"step": 275
},
{
"epoch": 0.6015037593984962,
"grad_norm": 0.22300755601220718,
"learning_rate": 4.965080300939675e-05,
"loss": 0.4153,
"mean_token_accuracy": 0.8784702062606812,
"step": 280
},
{
"epoch": 0.6122448979591837,
"grad_norm": 0.22676783176605783,
"learning_rate": 4.961277614616931e-05,
"loss": 0.4168,
"mean_token_accuracy": 0.8779775381088257,
"step": 285
},
{
"epoch": 0.6229860365198711,
"grad_norm": 0.24574274186354764,
"learning_rate": 4.957280226959083e-05,
"loss": 0.4119,
"mean_token_accuracy": 0.8798301517963409,
"step": 290
},
{
"epoch": 0.6337271750805585,
"grad_norm": 0.2281072685520932,
"learning_rate": 4.953088489933117e-05,
"loss": 0.4176,
"mean_token_accuracy": 0.878108823299408,
"step": 295
},
{
"epoch": 0.644468313641246,
"grad_norm": 0.2606268344040068,
"learning_rate": 4.948702772618332e-05,
"loss": 0.4114,
"mean_token_accuracy": 0.879868882894516,
"step": 300
},
{
"epoch": 0.6552094522019334,
"grad_norm": 0.2192902541038699,
"learning_rate": 4.944123461173849e-05,
"loss": 0.4141,
"mean_token_accuracy": 0.879179573059082,
"step": 305
},
{
"epoch": 0.6659505907626209,
"grad_norm": 0.21550855803478997,
"learning_rate": 4.9393509588046036e-05,
"loss": 0.4053,
"mean_token_accuracy": 0.8814833164215088,
"step": 310
},
{
"epoch": 0.6766917293233082,
"grad_norm": 0.23830421980148422,
"learning_rate": 4.934385685725851e-05,
"loss": 0.4068,
"mean_token_accuracy": 0.8807245373725892,
"step": 315
},
{
"epoch": 0.6874328678839957,
"grad_norm": 0.22141238716961,
"learning_rate": 4.9292280791261595e-05,
"loss": 0.4023,
"mean_token_accuracy": 0.8820916056632996,
"step": 320
},
{
"epoch": 0.6981740064446831,
"grad_norm": 0.23798938808653466,
"learning_rate": 4.9238785931289225e-05,
"loss": 0.4042,
"mean_token_accuracy": 0.882178908586502,
"step": 325
},
{
"epoch": 0.7089151450053706,
"grad_norm": 0.22152782163874513,
"learning_rate": 4.918337698752367e-05,
"loss": 0.4038,
"mean_token_accuracy": 0.8820820569992065,
"step": 330
},
{
"epoch": 0.719656283566058,
"grad_norm": 0.2238393672437065,
"learning_rate": 4.912605883868088e-05,
"loss": 0.4094,
"mean_token_accuracy": 0.8803297877311707,
"step": 335
},
{
"epoch": 0.7303974221267454,
"grad_norm": 0.2251835579056735,
"learning_rate": 4.906683653158086e-05,
"loss": 0.4022,
"mean_token_accuracy": 0.8820242047309875,
"step": 340
},
{
"epoch": 0.7411385606874329,
"grad_norm": 0.21096516273893903,
"learning_rate": 4.9005715280703295e-05,
"loss": 0.3963,
"mean_token_accuracy": 0.8838990330696106,
"step": 345
},
{
"epoch": 0.7518796992481203,
"grad_norm": 0.20550443098708907,
"learning_rate": 4.8942700467728505e-05,
"loss": 0.3955,
"mean_token_accuracy": 0.8842245638370514,
"step": 350
},
{
"epoch": 0.7626208378088077,
"grad_norm": 0.2058867389466749,
"learning_rate": 4.88777976410635e-05,
"loss": 0.3995,
"mean_token_accuracy": 0.8830176711082458,
"step": 355
},
{
"epoch": 0.7733619763694952,
"grad_norm": 0.20958669116131587,
"learning_rate": 4.8811012515353456e-05,
"loss": 0.3911,
"mean_token_accuracy": 0.8853914678096771,
"step": 360
},
{
"epoch": 0.7841031149301826,
"grad_norm": 0.20397609182823062,
"learning_rate": 4.874235097097861e-05,
"loss": 0.393,
"mean_token_accuracy": 0.8846873760223388,
"step": 365
},
{
"epoch": 0.7948442534908701,
"grad_norm": 0.21645535614809533,
"learning_rate": 4.8671819053536415e-05,
"loss": 0.3922,
"mean_token_accuracy": 0.8847495734691619,
"step": 370
},
{
"epoch": 0.8055853920515574,
"grad_norm": 0.22258952481615085,
"learning_rate": 4.859942297330932e-05,
"loss": 0.3982,
"mean_token_accuracy": 0.8832435965538025,
"step": 375
},
{
"epoch": 0.8163265306122449,
"grad_norm": 0.2024612867389681,
"learning_rate": 4.8525169104717846e-05,
"loss": 0.3903,
"mean_token_accuracy": 0.8853883922100068,
"step": 380
},
{
"epoch": 0.8270676691729323,
"grad_norm": 0.20556087856635372,
"learning_rate": 4.844906398575944e-05,
"loss": 0.3964,
"mean_token_accuracy": 0.8837718069553375,
"step": 385
},
{
"epoch": 0.8378088077336198,
"grad_norm": 0.20809549331239957,
"learning_rate": 4.8371114317432726e-05,
"loss": 0.3941,
"mean_token_accuracy": 0.8842520952224732,
"step": 390
},
{
"epoch": 0.8485499462943072,
"grad_norm": 0.21820552680801697,
"learning_rate": 4.8291326963147524e-05,
"loss": 0.3891,
"mean_token_accuracy": 0.8858624398708344,
"step": 395
},
{
"epoch": 0.8592910848549946,
"grad_norm": 0.20709264624327767,
"learning_rate": 4.820970894812053e-05,
"loss": 0.3845,
"mean_token_accuracy": 0.886957323551178,
"step": 400
},
{
"epoch": 0.8700322234156821,
"grad_norm": 0.21155796049345174,
"learning_rate": 4.812626745875673e-05,
"loss": 0.3909,
"mean_token_accuracy": 0.8852347731590271,
"step": 405
},
{
"epoch": 0.8807733619763695,
"grad_norm": 0.20230194258239817,
"learning_rate": 4.804100984201667e-05,
"loss": 0.3888,
"mean_token_accuracy": 0.8856496810913086,
"step": 410
},
{
"epoch": 0.8915145005370569,
"grad_norm": 0.1914371442320018,
"learning_rate": 4.795394360476955e-05,
"loss": 0.3927,
"mean_token_accuracy": 0.885220056772232,
"step": 415
},
{
"epoch": 0.9022556390977443,
"grad_norm": 0.21955921021321853,
"learning_rate": 4.7865076413132234e-05,
"loss": 0.3862,
"mean_token_accuracy": 0.8869829177856445,
"step": 420
},
{
"epoch": 0.9129967776584318,
"grad_norm": 0.19993088700133185,
"learning_rate": 4.777441609179428e-05,
"loss": 0.389,
"mean_token_accuracy": 0.8861649572849274,
"step": 425
},
{
"epoch": 0.9237379162191193,
"grad_norm": 0.20214442771764315,
"learning_rate": 4.768197062332898e-05,
"loss": 0.3805,
"mean_token_accuracy": 0.8884122192859649,
"step": 430
},
{
"epoch": 0.9344790547798066,
"grad_norm": 0.1936799045011743,
"learning_rate": 4.758774814749046e-05,
"loss": 0.3825,
"mean_token_accuracy": 0.8876857936382294,
"step": 435
},
{
"epoch": 0.9452201933404941,
"grad_norm": 0.19325903425845148,
"learning_rate": 4.749175696049706e-05,
"loss": 0.3826,
"mean_token_accuracy": 0.8881516516208648,
"step": 440
},
{
"epoch": 0.9559613319011815,
"grad_norm": 0.19255187762230458,
"learning_rate": 4.739400551430077e-05,
"loss": 0.3811,
"mean_token_accuracy": 0.8880790531635284,
"step": 445
},
{
"epoch": 0.966702470461869,
"grad_norm": 0.19450067956842618,
"learning_rate": 4.7294502415843105e-05,
"loss": 0.3783,
"mean_token_accuracy": 0.8890111207962036,
"step": 450
},
{
"epoch": 0.9774436090225563,
"grad_norm": 0.20174438790639918,
"learning_rate": 4.719325642629722e-05,
"loss": 0.378,
"mean_token_accuracy": 0.8890378654003144,
"step": 455
},
{
"epoch": 0.9881847475832438,
"grad_norm": 0.17832896478111976,
"learning_rate": 4.7090276460296555e-05,
"loss": 0.3843,
"mean_token_accuracy": 0.8872815728187561,
"step": 460
},
{
"epoch": 0.9989258861439313,
"grad_norm": 0.1913931630832869,
"learning_rate": 4.6985571585149876e-05,
"loss": 0.3796,
"mean_token_accuracy": 0.8887166023254395,
"step": 465
},
{
"epoch": 1.0085929108485499,
"grad_norm": 0.20263869484120534,
"learning_rate": 4.687915102004286e-05,
"loss": 0.3614,
"mean_token_accuracy": 0.8926012317339579,
"step": 470
},
{
"epoch": 1.0193340494092373,
"grad_norm": 0.19678722825673817,
"learning_rate": 4.677102413522645e-05,
"loss": 0.3495,
"mean_token_accuracy": 0.8955722391605377,
"step": 475
},
{
"epoch": 1.0300751879699248,
"grad_norm": 0.20376503491728473,
"learning_rate": 4.666120045119174e-05,
"loss": 0.3507,
"mean_token_accuracy": 0.8951772391796112,
"step": 480
},
{
"epoch": 1.0408163265306123,
"grad_norm": 0.2019062903436488,
"learning_rate": 4.654968963783171e-05,
"loss": 0.3531,
"mean_token_accuracy": 0.8947476446628571,
"step": 485
},
{
"epoch": 1.0515574650912998,
"grad_norm": 0.18722603018624961,
"learning_rate": 4.643650151358983e-05,
"loss": 0.3526,
"mean_token_accuracy": 0.894485878944397,
"step": 490
},
{
"epoch": 1.062298603651987,
"grad_norm": 0.19481656873843595,
"learning_rate": 4.632164604459553e-05,
"loss": 0.3468,
"mean_token_accuracy": 0.8964617013931274,
"step": 495
},
{
"epoch": 1.0730397422126745,
"grad_norm": 0.18585853331072713,
"learning_rate": 4.620513334378669e-05,
"loss": 0.3512,
"mean_token_accuracy": 0.8950131058692932,
"step": 500
},
{
"epoch": 1.083780880773362,
"grad_norm": 0.1930388596228489,
"learning_rate": 4.608697367001921e-05,
"loss": 0.3479,
"mean_token_accuracy": 0.895933198928833,
"step": 505
},
{
"epoch": 1.0945220193340495,
"grad_norm": 0.1978189680563173,
"learning_rate": 4.596717742716372e-05,
"loss": 0.3532,
"mean_token_accuracy": 0.8942179441452026,
"step": 510
},
{
"epoch": 1.1052631578947367,
"grad_norm": 0.2198969141563894,
"learning_rate": 4.584575516318954e-05,
"loss": 0.3492,
"mean_token_accuracy": 0.8957188785076141,
"step": 515
},
{
"epoch": 1.1160042964554242,
"grad_norm": 0.19175977623621587,
"learning_rate": 4.5722717569235924e-05,
"loss": 0.3553,
"mean_token_accuracy": 0.8938140749931336,
"step": 520
},
{
"epoch": 1.1267454350161117,
"grad_norm": 0.1995625771811619,
"learning_rate": 4.559807547867071e-05,
"loss": 0.3493,
"mean_token_accuracy": 0.8954446971416473,
"step": 525
},
{
"epoch": 1.1374865735767992,
"grad_norm": 0.1915734911527379,
"learning_rate": 4.5471839866136475e-05,
"loss": 0.3491,
"mean_token_accuracy": 0.8957653522491456,
"step": 530
},
{
"epoch": 1.1482277121374866,
"grad_norm": 0.19836797519712018,
"learning_rate": 4.5344021846584205e-05,
"loss": 0.3539,
"mean_token_accuracy": 0.8943828701972961,
"step": 535
},
{
"epoch": 1.158968850698174,
"grad_norm": 0.18808462761740152,
"learning_rate": 4.521463267429464e-05,
"loss": 0.3497,
"mean_token_accuracy": 0.8953365862369538,
"step": 540
},
{
"epoch": 1.1697099892588614,
"grad_norm": 0.19280122016496182,
"learning_rate": 4.508368374188731e-05,
"loss": 0.3496,
"mean_token_accuracy": 0.8953313529491425,
"step": 545
},
{
"epoch": 1.1804511278195489,
"grad_norm": 0.19677371481260625,
"learning_rate": 4.4951186579317504e-05,
"loss": 0.3528,
"mean_token_accuracy": 0.8949146151542664,
"step": 550
},
{
"epoch": 1.1911922663802363,
"grad_norm": 0.18538032977972374,
"learning_rate": 4.481715285286098e-05,
"loss": 0.3541,
"mean_token_accuracy": 0.8939870595932007,
"step": 555
},
{
"epoch": 1.2019334049409238,
"grad_norm": 0.18481539602601102,
"learning_rate": 4.46815943640868e-05,
"loss": 0.3553,
"mean_token_accuracy": 0.8940768420696259,
"step": 560
},
{
"epoch": 1.212674543501611,
"grad_norm": 0.1861386211911988,
"learning_rate": 4.454452304881821e-05,
"loss": 0.3468,
"mean_token_accuracy": 0.8959418594837188,
"step": 565
},
{
"epoch": 1.2234156820622986,
"grad_norm": 0.18228266310501318,
"learning_rate": 4.440595097608168e-05,
"loss": 0.3467,
"mean_token_accuracy": 0.8962770164012909,
"step": 570
},
{
"epoch": 1.234156820622986,
"grad_norm": 0.1841361210717962,
"learning_rate": 4.426589034704428e-05,
"loss": 0.3536,
"mean_token_accuracy": 0.8943024933338165,
"step": 575
},
{
"epoch": 1.2448979591836735,
"grad_norm": 0.17281724579297167,
"learning_rate": 4.412435349393931e-05,
"loss": 0.3509,
"mean_token_accuracy": 0.8950875043869019,
"step": 580
},
{
"epoch": 1.255639097744361,
"grad_norm": 0.1772300668593227,
"learning_rate": 4.398135287898052e-05,
"loss": 0.3485,
"mean_token_accuracy": 0.8955003321170807,
"step": 585
},
{
"epoch": 1.2663802363050483,
"grad_norm": 0.17772581177798846,
"learning_rate": 4.383690109326477e-05,
"loss": 0.3459,
"mean_token_accuracy": 0.8965889751911164,
"step": 590
},
{
"epoch": 1.2771213748657357,
"grad_norm": 0.18596059716645308,
"learning_rate": 4.369101085566342e-05,
"loss": 0.3496,
"mean_token_accuracy": 0.8954894125461579,
"step": 595
},
{
"epoch": 1.2878625134264232,
"grad_norm": 0.17598132780016223,
"learning_rate": 4.354369501170246e-05,
"loss": 0.3479,
"mean_token_accuracy": 0.8960169315338135,
"step": 600
},
{
"epoch": 1.2986036519871107,
"grad_norm": 0.1804871594490513,
"learning_rate": 4.3394966532431433e-05,
"loss": 0.352,
"mean_token_accuracy": 0.8948932409286499,
"step": 605
},
{
"epoch": 1.3093447905477982,
"grad_norm": 0.1865297212423964,
"learning_rate": 4.3244838513281367e-05,
"loss": 0.3515,
"mean_token_accuracy": 0.8949047923088074,
"step": 610
},
{
"epoch": 1.3200859291084854,
"grad_norm": 0.18053270547327416,
"learning_rate": 4.309332417291172e-05,
"loss": 0.3505,
"mean_token_accuracy": 0.8953122675418854,
"step": 615
},
{
"epoch": 1.330827067669173,
"grad_norm": 0.1744036148367508,
"learning_rate": 4.294043685204651e-05,
"loss": 0.3474,
"mean_token_accuracy": 0.8960575997829437,
"step": 620
},
{
"epoch": 1.3415682062298604,
"grad_norm": 0.16842924897825143,
"learning_rate": 4.278619001229962e-05,
"loss": 0.3474,
"mean_token_accuracy": 0.8961166024208069,
"step": 625
},
{
"epoch": 1.3523093447905479,
"grad_norm": 0.17741079904542595,
"learning_rate": 4.263059723498961e-05,
"loss": 0.3474,
"mean_token_accuracy": 0.8962021231651306,
"step": 630
},
{
"epoch": 1.3630504833512354,
"grad_norm": 0.17634563486082044,
"learning_rate": 4.247367221994377e-05,
"loss": 0.352,
"mean_token_accuracy": 0.8948638260364532,
"step": 635
},
{
"epoch": 1.3737916219119226,
"grad_norm": 0.16514936818638581,
"learning_rate": 4.2315428784291965e-05,
"loss": 0.348,
"mean_token_accuracy": 0.8962691247463226,
"step": 640
},
{
"epoch": 1.38453276047261,
"grad_norm": 0.18156198450594868,
"learning_rate": 4.215588086125001e-05,
"loss": 0.3473,
"mean_token_accuracy": 0.8962475776672363,
"step": 645
},
{
"epoch": 1.3952738990332976,
"grad_norm": 0.17302374962454448,
"learning_rate": 4.199504249889279e-05,
"loss": 0.3499,
"mean_token_accuracy": 0.8956164479255676,
"step": 650
},
{
"epoch": 1.4060150375939848,
"grad_norm": 0.17009271559786848,
"learning_rate": 4.18329278589175e-05,
"loss": 0.3481,
"mean_token_accuracy": 0.8962275862693787,
"step": 655
},
{
"epoch": 1.4167561761546725,
"grad_norm": 0.17232579890547844,
"learning_rate": 4.166955121539656e-05,
"loss": 0.3452,
"mean_token_accuracy": 0.8966892838478089,
"step": 660
},
{
"epoch": 1.4274973147153598,
"grad_norm": 0.18931912307479049,
"learning_rate": 4.150492695352086e-05,
"loss": 0.3476,
"mean_token_accuracy": 0.8961862683296203,
"step": 665
},
{
"epoch": 1.4382384532760473,
"grad_norm": 0.1812257587896816,
"learning_rate": 4.133906956833316e-05,
"loss": 0.3451,
"mean_token_accuracy": 0.8965191125869751,
"step": 670
},
{
"epoch": 1.4489795918367347,
"grad_norm": 0.18448866093949617,
"learning_rate": 4.1171993663451816e-05,
"loss": 0.3453,
"mean_token_accuracy": 0.8967220306396484,
"step": 675
},
{
"epoch": 1.459720730397422,
"grad_norm": 0.16318177527247005,
"learning_rate": 4.1003713949784905e-05,
"loss": 0.3491,
"mean_token_accuracy": 0.8957133948802948,
"step": 680
},
{
"epoch": 1.4704618689581095,
"grad_norm": 0.19223128076002124,
"learning_rate": 4.083424524423498e-05,
"loss": 0.3475,
"mean_token_accuracy": 0.8962952673435212,
"step": 685
},
{
"epoch": 1.481203007518797,
"grad_norm": 0.17065645296533696,
"learning_rate": 4.066360246839442e-05,
"loss": 0.3495,
"mean_token_accuracy": 0.8956079244613647,
"step": 690
},
{
"epoch": 1.4919441460794844,
"grad_norm": 0.1613801844631258,
"learning_rate": 4.049180064723164e-05,
"loss": 0.3491,
"mean_token_accuracy": 0.8964253485202789,
"step": 695
},
{
"epoch": 1.502685284640172,
"grad_norm": 0.17729165960730092,
"learning_rate": 4.031885490776811e-05,
"loss": 0.3461,
"mean_token_accuracy": 0.8965683281421661,
"step": 700
},
{
"epoch": 1.5134264232008592,
"grad_norm": 0.16772417608227957,
"learning_rate": 4.014478047774644e-05,
"loss": 0.3486,
"mean_token_accuracy": 0.8959019482135773,
"step": 705
},
{
"epoch": 1.5241675617615469,
"grad_norm": 0.1654092742061062,
"learning_rate": 3.99695926842896e-05,
"loss": 0.3452,
"mean_token_accuracy": 0.8970151007175445,
"step": 710
},
{
"epoch": 1.5349087003222341,
"grad_norm": 0.1770663143483711,
"learning_rate": 3.979330695255139e-05,
"loss": 0.3504,
"mean_token_accuracy": 0.8954713106155395,
"step": 715
},
{
"epoch": 1.5456498388829216,
"grad_norm": 0.16250407421180885,
"learning_rate": 3.9615938804358254e-05,
"loss": 0.3403,
"mean_token_accuracy": 0.8980903148651123,
"step": 720
},
{
"epoch": 1.556390977443609,
"grad_norm": 0.1739734421973896,
"learning_rate": 3.943750385684257e-05,
"loss": 0.3452,
"mean_token_accuracy": 0.8973391890525818,
"step": 725
},
{
"epoch": 1.5671321160042964,
"grad_norm": 0.17020682906702797,
"learning_rate": 3.9258017821067595e-05,
"loss": 0.341,
"mean_token_accuracy": 0.8981746196746826,
"step": 730
},
{
"epoch": 1.5778732545649838,
"grad_norm": 0.17090518777542177,
"learning_rate": 3.907749650064416e-05,
"loss": 0.3475,
"mean_token_accuracy": 0.8964370787143707,
"step": 735
},
{
"epoch": 1.5886143931256713,
"grad_norm": 0.18226436070710383,
"learning_rate": 3.889595579033907e-05,
"loss": 0.3548,
"mean_token_accuracy": 0.8943204343318939,
"step": 740
},
{
"epoch": 1.5993555316863588,
"grad_norm": 0.16867971152976394,
"learning_rate": 3.8713411674675706e-05,
"loss": 0.3468,
"mean_token_accuracy": 0.8964660108089447,
"step": 745
},
{
"epoch": 1.6100966702470463,
"grad_norm": 0.1634124661472663,
"learning_rate": 3.8529880226526504e-05,
"loss": 0.3419,
"mean_token_accuracy": 0.897741311788559,
"step": 750
},
{
"epoch": 1.6208378088077335,
"grad_norm": 0.16728119897984747,
"learning_rate": 3.834537760569779e-05,
"loss": 0.3477,
"mean_token_accuracy": 0.8964338660240173,
"step": 755
},
{
"epoch": 1.631578947368421,
"grad_norm": 0.16636899767836238,
"learning_rate": 3.815992005750691e-05,
"loss": 0.3454,
"mean_token_accuracy": 0.897176194190979,
"step": 760
},
{
"epoch": 1.6423200859291085,
"grad_norm": 0.17370655470517776,
"learning_rate": 3.7973523911351873e-05,
"loss": 0.3457,
"mean_token_accuracy": 0.8967864811420441,
"step": 765
},
{
"epoch": 1.6530612244897958,
"grad_norm": 0.17387140846382934,
"learning_rate": 3.7786205579273494e-05,
"loss": 0.3461,
"mean_token_accuracy": 0.896539443731308,
"step": 770
},
{
"epoch": 1.6638023630504835,
"grad_norm": 0.17312244395133694,
"learning_rate": 3.75979815545104e-05,
"loss": 0.3469,
"mean_token_accuracy": 0.8965823531150818,
"step": 775
},
{
"epoch": 1.6745435016111707,
"grad_norm": 0.17134683681288093,
"learning_rate": 3.740886841004678e-05,
"loss": 0.3437,
"mean_token_accuracy": 0.8972635090351104,
"step": 780
},
{
"epoch": 1.6852846401718582,
"grad_norm": 0.1703220892784228,
"learning_rate": 3.72188827971531e-05,
"loss": 0.349,
"mean_token_accuracy": 0.8958061695098877,
"step": 785
},
{
"epoch": 1.6960257787325457,
"grad_norm": 0.15629690421483755,
"learning_rate": 3.7028041443920106e-05,
"loss": 0.345,
"mean_token_accuracy": 0.8972305715084076,
"step": 790
},
{
"epoch": 1.706766917293233,
"grad_norm": 0.16968855316404596,
"learning_rate": 3.6836361153785735e-05,
"loss": 0.3391,
"mean_token_accuracy": 0.8984034955501556,
"step": 795
},
{
"epoch": 1.7175080558539206,
"grad_norm": 0.1613956545932139,
"learning_rate": 3.6643858804055764e-05,
"loss": 0.3418,
"mean_token_accuracy": 0.8975095868110656,
"step": 800
},
{
"epoch": 1.728249194414608,
"grad_norm": 0.16488649273144998,
"learning_rate": 3.6450551344417656e-05,
"loss": 0.347,
"mean_token_accuracy": 0.8963462889194489,
"step": 805
},
{
"epoch": 1.7389903329752954,
"grad_norm": 0.18336562912600562,
"learning_rate": 3.625645579544824e-05,
"loss": 0.3417,
"mean_token_accuracy": 0.8978760004043579,
"step": 810
},
{
"epoch": 1.7497314715359829,
"grad_norm": 0.16442030655020706,
"learning_rate": 3.606158924711498e-05,
"loss": 0.3418,
"mean_token_accuracy": 0.8984208166599273,
"step": 815
},
{
"epoch": 1.76047261009667,
"grad_norm": 0.1648466060868627,
"learning_rate": 3.586596885727126e-05,
"loss": 0.346,
"mean_token_accuracy": 0.8967172205448151,
"step": 820
},
{
"epoch": 1.7712137486573578,
"grad_norm": 0.16380950472689287,
"learning_rate": 3.5669611850145676e-05,
"loss": 0.3404,
"mean_token_accuracy": 0.8981300175189972,
"step": 825
},
{
"epoch": 1.781954887218045,
"grad_norm": 0.16476649720519732,
"learning_rate": 3.54725355148254e-05,
"loss": 0.3417,
"mean_token_accuracy": 0.8978650271892548,
"step": 830
},
{
"epoch": 1.7926960257787325,
"grad_norm": 0.16250342083791575,
"learning_rate": 3.5274757203733906e-05,
"loss": 0.3429,
"mean_token_accuracy": 0.8977679431438446,
"step": 835
},
{
"epoch": 1.80343716433942,
"grad_norm": 0.1666333005283665,
"learning_rate": 3.507629433110311e-05,
"loss": 0.3437,
"mean_token_accuracy": 0.8972832322120666,
"step": 840
},
{
"epoch": 1.8141783029001073,
"grad_norm": 0.1615387362712691,
"learning_rate": 3.4877164371440075e-05,
"loss": 0.3453,
"mean_token_accuracy": 0.8970289349555969,
"step": 845
},
{
"epoch": 1.824919441460795,
"grad_norm": 0.16676447906725542,
"learning_rate": 3.467738485798836e-05,
"loss": 0.3451,
"mean_token_accuracy": 0.8969220995903016,
"step": 850
},
{
"epoch": 1.8356605800214822,
"grad_norm": 0.16168843045380168,
"learning_rate": 3.447697338118425e-05,
"loss": 0.3395,
"mean_token_accuracy": 0.898131811618805,
"step": 855
},
{
"epoch": 1.8464017185821697,
"grad_norm": 0.15334942056157058,
"learning_rate": 3.427594758710794e-05,
"loss": 0.3422,
"mean_token_accuracy": 0.8975472927093506,
"step": 860
},
{
"epoch": 1.8571428571428572,
"grad_norm": 0.1672358555124429,
"learning_rate": 3.407432517592979e-05,
"loss": 0.3403,
"mean_token_accuracy": 0.8983366131782532,
"step": 865
},
{
"epoch": 1.8678839957035445,
"grad_norm": 0.161941088262071,
"learning_rate": 3.3872123900351835e-05,
"loss": 0.3408,
"mean_token_accuracy": 0.8978644967079162,
"step": 870
},
{
"epoch": 1.8786251342642322,
"grad_norm": 0.1519842470665007,
"learning_rate": 3.3669361564044735e-05,
"loss": 0.3396,
"mean_token_accuracy": 0.898490047454834,
"step": 875
},
{
"epoch": 1.8893662728249194,
"grad_norm": 0.16037110333088753,
"learning_rate": 3.346605602008007e-05,
"loss": 0.3417,
"mean_token_accuracy": 0.8977841079235077,
"step": 880
},
{
"epoch": 1.900107411385607,
"grad_norm": 0.16442639618093918,
"learning_rate": 3.326222516935847e-05,
"loss": 0.3437,
"mean_token_accuracy": 0.8971070289611817,
"step": 885
},
{
"epoch": 1.9108485499462944,
"grad_norm": 0.15289173675825762,
"learning_rate": 3.3057886959033426e-05,
"loss": 0.3416,
"mean_token_accuracy": 0.8984978437423706,
"step": 890
},
{
"epoch": 1.9215896885069816,
"grad_norm": 0.14450841113047458,
"learning_rate": 3.285305938093108e-05,
"loss": 0.3392,
"mean_token_accuracy": 0.8983058393001556,
"step": 895
},
{
"epoch": 1.9323308270676691,
"grad_norm": 0.15549384924856993,
"learning_rate": 3.264776046996602e-05,
"loss": 0.3394,
"mean_token_accuracy": 0.8985956251621247,
"step": 900
},
{
"epoch": 1.9430719656283566,
"grad_norm": 0.162459823198956,
"learning_rate": 3.2442008302553346e-05,
"loss": 0.34,
"mean_token_accuracy": 0.8984286248683929,
"step": 905
},
{
"epoch": 1.953813104189044,
"grad_norm": 0.15039221824995944,
"learning_rate": 3.223582099501704e-05,
"loss": 0.3374,
"mean_token_accuracy": 0.8987222969532013,
"step": 910
},
{
"epoch": 1.9645542427497316,
"grad_norm": 0.1564002589458454,
"learning_rate": 3.202921670199485e-05,
"loss": 0.3369,
"mean_token_accuracy": 0.8994980156421661,
"step": 915
},
{
"epoch": 1.9752953813104188,
"grad_norm": 0.17459425481905663,
"learning_rate": 3.182221361483981e-05,
"loss": 0.3426,
"mean_token_accuracy": 0.8977073311805726,
"step": 920
},
{
"epoch": 1.9860365198711063,
"grad_norm": 0.15953782868809285,
"learning_rate": 3.161482996001842e-05,
"loss": 0.3406,
"mean_token_accuracy": 0.8983509004116058,
"step": 925
},
{
"epoch": 1.9967776584317938,
"grad_norm": 0.15713432539772912,
"learning_rate": 3.140708399750594e-05,
"loss": 0.3421,
"mean_token_accuracy": 0.8979579448699951,
"step": 930
},
{
"epoch": 2.0064446831364124,
"grad_norm": 0.16209947632099436,
"learning_rate": 3.11989940191785e-05,
"loss": 0.3137,
"mean_token_accuracy": 0.9049130148357816,
"step": 935
},
{
"epoch": 2.0171858216970997,
"grad_norm": 0.18807228831939848,
"learning_rate": 3.09905783472026e-05,
"loss": 0.305,
"mean_token_accuracy": 0.9070174276828766,
"step": 940
},
{
"epoch": 2.0279269602577874,
"grad_norm": 0.1647631068534088,
"learning_rate": 3.07818553324218e-05,
"loss": 0.3039,
"mean_token_accuracy": 0.9071334481239319,
"step": 945
},
{
"epoch": 2.0386680988184747,
"grad_norm": 0.16628057896853762,
"learning_rate": 3.057284335274097e-05,
"loss": 0.3026,
"mean_token_accuracy": 0.9071128606796265,
"step": 950
},
{
"epoch": 2.0494092373791624,
"grad_norm": 0.16953299184244167,
"learning_rate": 3.036356081150813e-05,
"loss": 0.3034,
"mean_token_accuracy": 0.9072185814380646,
"step": 955
},
{
"epoch": 2.0601503759398496,
"grad_norm": 0.16119678084859076,
"learning_rate": 3.0154026135894043e-05,
"loss": 0.2994,
"mean_token_accuracy": 0.9083474159240723,
"step": 960
},
{
"epoch": 2.070891514500537,
"grad_norm": 0.16680753647576305,
"learning_rate": 2.9944257775269686e-05,
"loss": 0.3046,
"mean_token_accuracy": 0.9070303261280059,
"step": 965
},
{
"epoch": 2.0816326530612246,
"grad_norm": 0.1557469947598615,
"learning_rate": 2.9734274199581857e-05,
"loss": 0.3028,
"mean_token_accuracy": 0.9075248777866364,
"step": 970
},
{
"epoch": 2.092373791621912,
"grad_norm": 0.15821336281763043,
"learning_rate": 2.9524093897726875e-05,
"loss": 0.2992,
"mean_token_accuracy": 0.9085965514183044,
"step": 975
},
{
"epoch": 2.1031149301825995,
"grad_norm": 0.16912179860419502,
"learning_rate": 2.931373537592264e-05,
"loss": 0.3059,
"mean_token_accuracy": 0.9063934266567231,
"step": 980
},
{
"epoch": 2.113856068743287,
"grad_norm": 0.1568909903521791,
"learning_rate": 2.9103217156079183e-05,
"loss": 0.3017,
"mean_token_accuracy": 0.9079225361347198,
"step": 985
},
{
"epoch": 2.124597207303974,
"grad_norm": 0.17149311680209844,
"learning_rate": 2.8892557774167843e-05,
"loss": 0.3023,
"mean_token_accuracy": 0.9075566232204437,
"step": 990
},
{
"epoch": 2.1353383458646618,
"grad_norm": 0.1730679539636109,
"learning_rate": 2.8681775778589164e-05,
"loss": 0.3031,
"mean_token_accuracy": 0.9074501514434814,
"step": 995
},
{
"epoch": 2.146079484425349,
"grad_norm": 0.168662599711155,
"learning_rate": 2.8470889728539725e-05,
"loss": 0.302,
"mean_token_accuracy": 0.9077127814292908,
"step": 1000
},
{
"epoch": 2.1568206229860367,
"grad_norm": 0.16226284047590997,
"learning_rate": 2.8259918192378038e-05,
"loss": 0.3041,
"mean_token_accuracy": 0.9070930540561676,
"step": 1005
},
{
"epoch": 2.167561761546724,
"grad_norm": 0.1576781128963043,
"learning_rate": 2.804887974598959e-05,
"loss": 0.3022,
"mean_token_accuracy": 0.907502681016922,
"step": 1010
},
{
"epoch": 2.1783029001074112,
"grad_norm": 0.15997962819428427,
"learning_rate": 2.7837792971151268e-05,
"loss": 0.3018,
"mean_token_accuracy": 0.9079727530479431,
"step": 1015
},
{
"epoch": 2.189044038668099,
"grad_norm": 0.16962861365112525,
"learning_rate": 2.7626676453895238e-05,
"loss": 0.3031,
"mean_token_accuracy": 0.9071884095668793,
"step": 1020
},
{
"epoch": 2.199785177228786,
"grad_norm": 0.16322576238996814,
"learning_rate": 2.7415548782872468e-05,
"loss": 0.3057,
"mean_token_accuracy": 0.9065694689750672,
"step": 1025
},
{
"epoch": 2.2105263157894735,
"grad_norm": 0.16909277271966566,
"learning_rate": 2.7204428547716027e-05,
"loss": 0.3052,
"mean_token_accuracy": 0.9069810092449189,
"step": 1030
},
{
"epoch": 2.221267454350161,
"grad_norm": 0.16098166127750824,
"learning_rate": 2.699333433740422e-05,
"loss": 0.3034,
"mean_token_accuracy": 0.907333254814148,
"step": 1035
},
{
"epoch": 2.2320085929108484,
"grad_norm": 0.17075220096927826,
"learning_rate": 2.678228473862391e-05,
"loss": 0.3059,
"mean_token_accuracy": 0.9066526055335998,
"step": 1040
},
{
"epoch": 2.242749731471536,
"grad_norm": 0.16370207033646628,
"learning_rate": 2.6571298334133947e-05,
"loss": 0.3049,
"mean_token_accuracy": 0.9068757057189941,
"step": 1045
},
{
"epoch": 2.2534908700322234,
"grad_norm": 0.1611010495321633,
"learning_rate": 2.6360393701128968e-05,
"loss": 0.3058,
"mean_token_accuracy": 0.9067712783813476,
"step": 1050
},
{
"epoch": 2.264232008592911,
"grad_norm": 0.16970228504955862,
"learning_rate": 2.614958940960369e-05,
"loss": 0.3052,
"mean_token_accuracy": 0.9068210601806641,
"step": 1055
},
{
"epoch": 2.2749731471535983,
"grad_norm": 0.1677663409783765,
"learning_rate": 2.593890402071784e-05,
"loss": 0.303,
"mean_token_accuracy": 0.9071888148784637,
"step": 1060
},
{
"epoch": 2.2857142857142856,
"grad_norm": 0.1594126722501793,
"learning_rate": 2.5728356085161864e-05,
"loss": 0.2979,
"mean_token_accuracy": 0.9088397026062012,
"step": 1065
},
{
"epoch": 2.2964554242749733,
"grad_norm": 0.15755295908932457,
"learning_rate": 2.5517964141523525e-05,
"loss": 0.3009,
"mean_token_accuracy": 0.9078912615776062,
"step": 1070
},
{
"epoch": 2.3071965628356605,
"grad_norm": 0.15824119025266686,
"learning_rate": 2.5307746714655634e-05,
"loss": 0.3065,
"mean_token_accuracy": 0.9067668735980987,
"step": 1075
},
{
"epoch": 2.317937701396348,
"grad_norm": 0.1593424773763769,
"learning_rate": 2.509772231404493e-05,
"loss": 0.3072,
"mean_token_accuracy": 0.9063262104988098,
"step": 1080
},
{
"epoch": 2.3286788399570355,
"grad_norm": 0.16745585583895234,
"learning_rate": 2.4887909432182316e-05,
"loss": 0.3205,
"mean_token_accuracy": 0.9050490736961365,
"step": 1085
},
{
"epoch": 2.3394199785177228,
"grad_norm": 0.18108073198198416,
"learning_rate": 2.4678326542934667e-05,
"loss": 0.3048,
"mean_token_accuracy": 0.9068881213665009,
"step": 1090
},
{
"epoch": 2.3501611170784105,
"grad_norm": 0.17241262713318053,
"learning_rate": 2.4468992099918138e-05,
"loss": 0.3032,
"mean_token_accuracy": 0.9073716223239898,
"step": 1095
},
{
"epoch": 2.3609022556390977,
"grad_norm": 0.16397300617763141,
"learning_rate": 2.4259924534873385e-05,
"loss": 0.3061,
"mean_token_accuracy": 0.9062675356864929,
"step": 1100
},
{
"epoch": 2.3716433941997854,
"grad_norm": 0.1700811614554712,
"learning_rate": 2.4051142256042697e-05,
"loss": 0.3011,
"mean_token_accuracy": 0.90796759724617,
"step": 1105
},
{
"epoch": 2.3823845327604727,
"grad_norm": 0.16924471517889025,
"learning_rate": 2.3842663646549085e-05,
"loss": 0.3025,
"mean_token_accuracy": 0.9076179921627044,
"step": 1110
},
{
"epoch": 2.39312567132116,
"grad_norm": 0.582746886765867,
"learning_rate": 2.3634507062777726e-05,
"loss": 0.3036,
"mean_token_accuracy": 0.9076011419296265,
"step": 1115
},
{
"epoch": 2.4038668098818476,
"grad_norm": 0.15789580559295846,
"learning_rate": 2.3426690832759652e-05,
"loss": 0.2997,
"mean_token_accuracy": 0.9084276914596557,
"step": 1120
},
{
"epoch": 2.414607948442535,
"grad_norm": 0.15924353242995867,
"learning_rate": 2.3219233254558025e-05,
"loss": 0.3029,
"mean_token_accuracy": 0.9074055433273316,
"step": 1125
},
{
"epoch": 2.425349087003222,
"grad_norm": 0.16646800963930639,
"learning_rate": 2.3012152594656982e-05,
"loss": 0.3043,
"mean_token_accuracy": 0.9070705771446228,
"step": 1130
},
{
"epoch": 2.43609022556391,
"grad_norm": 0.16197886055551655,
"learning_rate": 2.2805467086353268e-05,
"loss": 0.2983,
"mean_token_accuracy": 0.9087878286838531,
"step": 1135
},
{
"epoch": 2.446831364124597,
"grad_norm": 0.16381004501438137,
"learning_rate": 2.2599194928150842e-05,
"loss": 0.3037,
"mean_token_accuracy": 0.9073452115058899,
"step": 1140
},
{
"epoch": 2.457572502685285,
"grad_norm": 0.16540282102993875,
"learning_rate": 2.239335428215849e-05,
"loss": 0.3042,
"mean_token_accuracy": 0.9071446895599365,
"step": 1145
},
{
"epoch": 2.468313641245972,
"grad_norm": 0.16037824203377551,
"learning_rate": 2.2187963272490676e-05,
"loss": 0.3022,
"mean_token_accuracy": 0.9079298913478852,
"step": 1150
},
{
"epoch": 2.4790547798066593,
"grad_norm": 0.15882572997154093,
"learning_rate": 2.198303998367171e-05,
"loss": 0.3067,
"mean_token_accuracy": 0.9064932882785797,
"step": 1155
},
{
"epoch": 2.489795918367347,
"grad_norm": 0.15831447424850761,
"learning_rate": 2.1778602459043452e-05,
"loss": 0.3039,
"mean_token_accuracy": 0.9070046961307525,
"step": 1160
},
{
"epoch": 2.5005370569280343,
"grad_norm": 0.16081532493077333,
"learning_rate": 2.157466869917658e-05,
"loss": 0.3041,
"mean_token_accuracy": 0.9073209702968598,
"step": 1165
},
{
"epoch": 2.511278195488722,
"grad_norm": 0.15516248272553126,
"learning_rate": 2.1371256660285655e-05,
"loss": 0.3044,
"mean_token_accuracy": 0.9070526838302613,
"step": 1170
},
{
"epoch": 2.5220193340494093,
"grad_norm": 0.1587382733948704,
"learning_rate": 2.1168384252648117e-05,
"loss": 0.2999,
"mean_token_accuracy": 0.9086295425891876,
"step": 1175
},
{
"epoch": 2.5327604726100965,
"grad_norm": 0.15919430172381277,
"learning_rate": 2.0966069339027256e-05,
"loss": 0.3017,
"mean_token_accuracy": 0.9076282560825348,
"step": 1180
},
{
"epoch": 2.543501611170784,
"grad_norm": 0.1602383119084914,
"learning_rate": 2.0764329733099446e-05,
"loss": 0.2998,
"mean_token_accuracy": 0.9084926426410675,
"step": 1185
},
{
"epoch": 2.5542427497314715,
"grad_norm": 0.16156220155082493,
"learning_rate": 2.0563183197885653e-05,
"loss": 0.3068,
"mean_token_accuracy": 0.9063272118568421,
"step": 1190
},
{
"epoch": 2.5649838882921587,
"grad_norm": 0.15676424327787444,
"learning_rate": 2.03626474441874e-05,
"loss": 0.304,
"mean_token_accuracy": 0.9073390066623688,
"step": 1195
},
{
"epoch": 2.5757250268528464,
"grad_norm": 0.16064943066993936,
"learning_rate": 2.016274012902737e-05,
"loss": 0.3031,
"mean_token_accuracy": 0.9080215394496918,
"step": 1200
},
{
"epoch": 2.5864661654135337,
"grad_norm": 0.15163324815906554,
"learning_rate": 1.996347885409468e-05,
"loss": 0.2995,
"mean_token_accuracy": 0.9081439912319184,
"step": 1205
},
{
"epoch": 2.5972073039742214,
"grad_norm": 0.16245754277077917,
"learning_rate": 1.9764881164195113e-05,
"loss": 0.3015,
"mean_token_accuracy": 0.907852166891098,
"step": 1210
},
{
"epoch": 2.6079484425349087,
"grad_norm": 0.16043196872565563,
"learning_rate": 1.956696454570629e-05,
"loss": 0.3038,
"mean_token_accuracy": 0.9070708453655243,
"step": 1215
},
{
"epoch": 2.6186895810955964,
"grad_norm": 0.1518503511295408,
"learning_rate": 1.9369746425037983e-05,
"loss": 0.3031,
"mean_token_accuracy": 0.9073640763759613,
"step": 1220
},
{
"epoch": 2.6294307196562836,
"grad_norm": 0.16579054364092405,
"learning_rate": 1.9173244167097766e-05,
"loss": 0.3021,
"mean_token_accuracy": 0.9075863361358643,
"step": 1225
},
{
"epoch": 2.640171858216971,
"grad_norm": 0.16096483480946194,
"learning_rate": 1.8977475073762042e-05,
"loss": 0.3024,
"mean_token_accuracy": 0.907714718580246,
"step": 1230
},
{
"epoch": 2.6509129967776586,
"grad_norm": 0.16586554619371632,
"learning_rate": 1.878245638235262e-05,
"loss": 0.3032,
"mean_token_accuracy": 0.9077441573143006,
"step": 1235
},
{
"epoch": 2.661654135338346,
"grad_norm": 0.17145727431540336,
"learning_rate": 1.8588205264118974e-05,
"loss": 0.3007,
"mean_token_accuracy": 0.9080956459045411,
"step": 1240
},
{
"epoch": 2.672395273899033,
"grad_norm": 0.16247484247551466,
"learning_rate": 1.8394738822726337e-05,
"loss": 0.3078,
"mean_token_accuracy": 0.9063467800617218,
"step": 1245
},
{
"epoch": 2.683136412459721,
"grad_norm": 0.16303109945042918,
"learning_rate": 1.8202074092749754e-05,
"loss": 0.305,
"mean_token_accuracy": 0.9077015459537506,
"step": 1250
},
{
"epoch": 2.693877551020408,
"grad_norm": 0.15810829618004768,
"learning_rate": 1.8010228038174154e-05,
"loss": 0.3052,
"mean_token_accuracy": 0.9069934606552124,
"step": 1255
},
{
"epoch": 2.7046186895810957,
"grad_norm": 0.1572557171403785,
"learning_rate": 1.781921755090072e-05,
"loss": 0.3029,
"mean_token_accuracy": 0.9075438380241394,
"step": 1260
},
{
"epoch": 2.715359828141783,
"grad_norm": 0.15752257331645983,
"learning_rate": 1.7629059449259565e-05,
"loss": 0.2978,
"mean_token_accuracy": 0.9092587411403656,
"step": 1265
},
{
"epoch": 2.7261009667024707,
"grad_norm": 0.155952159894427,
"learning_rate": 1.7439770476528894e-05,
"loss": 0.3025,
"mean_token_accuracy": 0.9076742231845856,
"step": 1270
},
{
"epoch": 2.736842105263158,
"grad_norm": 0.1578844927904049,
"learning_rate": 1.7251367299460735e-05,
"loss": 0.3043,
"mean_token_accuracy": 0.9071321785449982,
"step": 1275
},
{
"epoch": 2.7475832438238452,
"grad_norm": 0.15643506287974016,
"learning_rate": 1.7063866506813515e-05,
"loss": 0.3014,
"mean_token_accuracy": 0.9080881893634796,
"step": 1280
},
{
"epoch": 2.758324382384533,
"grad_norm": 0.16188588270959753,
"learning_rate": 1.687728460789136e-05,
"loss": 0.3029,
"mean_token_accuracy": 0.9077995300292969,
"step": 1285
},
{
"epoch": 2.76906552094522,
"grad_norm": 0.15914290923730717,
"learning_rate": 1.669163803109049e-05,
"loss": 0.3039,
"mean_token_accuracy": 0.9069546043872834,
"step": 1290
},
{
"epoch": 2.7798066595059074,
"grad_norm": 0.1531939594797534,
"learning_rate": 1.650694312245272e-05,
"loss": 0.301,
"mean_token_accuracy": 0.9082088112831116,
"step": 1295
},
{
"epoch": 2.790547798066595,
"grad_norm": 0.14781879067353518,
"learning_rate": 1.6323216144226218e-05,
"loss": 0.3006,
"mean_token_accuracy": 0.9082107961177825,
"step": 1300
},
{
"epoch": 2.8012889366272824,
"grad_norm": 0.15796491533044651,
"learning_rate": 1.614047327343358e-05,
"loss": 0.3037,
"mean_token_accuracy": 0.9073608994483948,
"step": 1305
},
{
"epoch": 2.8120300751879697,
"grad_norm": 0.15342589995319128,
"learning_rate": 1.5958730600447483e-05,
"loss": 0.2982,
"mean_token_accuracy": 0.9089851617813111,
"step": 1310
},
{
"epoch": 2.8227712137486574,
"grad_norm": 0.15213716012041018,
"learning_rate": 1.5778004127573954e-05,
"loss": 0.3018,
"mean_token_accuracy": 0.9082035005092621,
"step": 1315
},
{
"epoch": 2.833512352309345,
"grad_norm": 0.15689344716817114,
"learning_rate": 1.5598309767643355e-05,
"loss": 0.3015,
"mean_token_accuracy": 0.9079676389694213,
"step": 1320
},
{
"epoch": 2.8442534908700323,
"grad_norm": 0.15560793520372218,
"learning_rate": 1.5419663342609245e-05,
"loss": 0.301,
"mean_token_accuracy": 0.9079644203186035,
"step": 1325
},
{
"epoch": 2.8549946294307196,
"grad_norm": 0.15762229912652725,
"learning_rate": 1.524208058215536e-05,
"loss": 0.3004,
"mean_token_accuracy": 0.9081010043621063,
"step": 1330
},
{
"epoch": 2.8657357679914073,
"grad_norm": 0.1492296564674764,
"learning_rate": 1.5065577122310532e-05,
"loss": 0.3038,
"mean_token_accuracy": 0.9071996510028839,
"step": 1335
},
{
"epoch": 2.8764769065520945,
"grad_norm": 0.15341782949091415,
"learning_rate": 1.4890168504071986e-05,
"loss": 0.3013,
"mean_token_accuracy": 0.9081071972846985,
"step": 1340
},
{
"epoch": 2.887218045112782,
"grad_norm": 0.15319646472290932,
"learning_rate": 1.4715870172036961e-05,
"loss": 0.2985,
"mean_token_accuracy": 0.9089631140232086,
"step": 1345
},
{
"epoch": 2.8979591836734695,
"grad_norm": 0.155104806503441,
"learning_rate": 1.4542697473042855e-05,
"loss": 0.3015,
"mean_token_accuracy": 0.9081062614917755,
"step": 1350
},
{
"epoch": 2.9087003222341568,
"grad_norm": 0.14997293337059112,
"learning_rate": 1.4370665654815896e-05,
"loss": 0.3016,
"mean_token_accuracy": 0.9077993631362915,
"step": 1355
},
{
"epoch": 2.919441460794844,
"grad_norm": 0.15836235770159765,
"learning_rate": 1.4199789864628612e-05,
"loss": 0.3025,
"mean_token_accuracy": 0.9076350510120392,
"step": 1360
},
{
"epoch": 2.9301825993555317,
"grad_norm": 0.15239559171871817,
"learning_rate": 1.403008514796616e-05,
"loss": 0.3002,
"mean_token_accuracy": 0.9083379149436951,
"step": 1365
},
{
"epoch": 2.940923737916219,
"grad_norm": 0.15596273472793287,
"learning_rate": 1.3861566447201524e-05,
"loss": 0.2989,
"mean_token_accuracy": 0.9084150791168213,
"step": 1370
},
{
"epoch": 2.9516648764769067,
"grad_norm": 0.15225411451673648,
"learning_rate": 1.3694248600279886e-05,
"loss": 0.3002,
"mean_token_accuracy": 0.9083608329296112,
"step": 1375
},
{
"epoch": 2.962406015037594,
"grad_norm": 0.15301962057571455,
"learning_rate": 1.3528146339412146e-05,
"loss": 0.3021,
"mean_token_accuracy": 0.9078640341758728,
"step": 1380
},
{
"epoch": 2.9731471535982816,
"grad_norm": 0.15353042988029672,
"learning_rate": 1.3363274289777773e-05,
"loss": 0.2992,
"mean_token_accuracy": 0.9084159135818481,
"step": 1385
},
{
"epoch": 2.983888292158969,
"grad_norm": 0.1565397591962354,
"learning_rate": 1.3199646968237039e-05,
"loss": 0.3019,
"mean_token_accuracy": 0.9077640831470489,
"step": 1390
},
{
"epoch": 2.994629430719656,
"grad_norm": 0.15512948456888964,
"learning_rate": 1.3037278782052863e-05,
"loss": 0.301,
"mean_token_accuracy": 0.908068060874939,
"step": 1395
},
{
"epoch": 3.004296455424275,
"grad_norm": 0.17611687143689977,
"learning_rate": 1.2876184027622246e-05,
"loss": 0.2837,
"mean_token_accuracy": 0.9126578701866997,
"step": 1400
},
{
"epoch": 3.0150375939849625,
"grad_norm": 0.23111560237426948,
"learning_rate": 1.2716376889217446e-05,
"loss": 0.2617,
"mean_token_accuracy": 0.9192156255245209,
"step": 1405
},
{
"epoch": 3.0257787325456498,
"grad_norm": 0.18975174760198046,
"learning_rate": 1.2557871437737118e-05,
"loss": 0.2613,
"mean_token_accuracy": 0.9190598428249359,
"step": 1410
},
{
"epoch": 3.0365198711063375,
"grad_norm": 0.17890147872689252,
"learning_rate": 1.240068162946737e-05,
"loss": 0.2584,
"mean_token_accuracy": 0.91984983086586,
"step": 1415
},
{
"epoch": 3.0472610096670247,
"grad_norm": 0.17315801700410546,
"learning_rate": 1.2244821304852888e-05,
"loss": 0.2557,
"mean_token_accuracy": 0.9208986639976502,
"step": 1420
},
{
"epoch": 3.058002148227712,
"grad_norm": 0.18517285000872677,
"learning_rate": 1.2090304187278333e-05,
"loss": 0.2604,
"mean_token_accuracy": 0.9195366144180298,
"step": 1425
},
{
"epoch": 3.0687432867883997,
"grad_norm": 0.16562595080311196,
"learning_rate": 1.1937143881859981e-05,
"loss": 0.2577,
"mean_token_accuracy": 0.9203976690769196,
"step": 1430
},
{
"epoch": 3.079484425349087,
"grad_norm": 0.17393143558685065,
"learning_rate": 1.178535387424785e-05,
"loss": 0.2574,
"mean_token_accuracy": 0.9199799060821533,
"step": 1435
},
{
"epoch": 3.090225563909774,
"grad_norm": 0.1645998735975408,
"learning_rate": 1.163494752943822e-05,
"loss": 0.2568,
"mean_token_accuracy": 0.9204827189445496,
"step": 1440
},
{
"epoch": 3.100966702470462,
"grad_norm": 0.16887936249293273,
"learning_rate": 1.1485938090596918e-05,
"loss": 0.2586,
"mean_token_accuracy": 0.9197791635990142,
"step": 1445
},
{
"epoch": 3.111707841031149,
"grad_norm": 0.17416795475633623,
"learning_rate": 1.1338338677893261e-05,
"loss": 0.2584,
"mean_token_accuracy": 0.9200873076915741,
"step": 1450
},
{
"epoch": 3.122448979591837,
"grad_norm": 0.1751550798568952,
"learning_rate": 1.1192162287344806e-05,
"loss": 0.2584,
"mean_token_accuracy": 0.919762271642685,
"step": 1455
},
{
"epoch": 3.133190118152524,
"grad_norm": 0.17592907174451083,
"learning_rate": 1.1047421789673082e-05,
"loss": 0.2597,
"mean_token_accuracy": 0.9195389747619629,
"step": 1460
},
{
"epoch": 3.143931256713212,
"grad_norm": 0.17327426676281532,
"learning_rate": 1.0904129929170317e-05,
"loss": 0.2556,
"mean_token_accuracy": 0.9207349836826324,
"step": 1465
},
{
"epoch": 3.154672395273899,
"grad_norm": 0.17320030271762202,
"learning_rate": 1.0762299322577352e-05,
"loss": 0.2573,
"mean_token_accuracy": 0.9203036367893219,
"step": 1470
},
{
"epoch": 3.1654135338345863,
"grad_norm": 0.1722311431748818,
"learning_rate": 1.0621942457972692e-05,
"loss": 0.26,
"mean_token_accuracy": 0.9195259928703308,
"step": 1475
},
{
"epoch": 3.176154672395274,
"grad_norm": 0.17238717747260024,
"learning_rate": 1.0483071693672959e-05,
"loss": 0.2556,
"mean_token_accuracy": 0.9209478557109833,
"step": 1480
},
{
"epoch": 3.1868958109559613,
"grad_norm": 0.17188960001484813,
"learning_rate": 1.0345699257144787e-05,
"loss": 0.2599,
"mean_token_accuracy": 0.9196560025215149,
"step": 1485
},
{
"epoch": 3.1976369495166486,
"grad_norm": 0.16939046145995434,
"learning_rate": 1.0209837243928163e-05,
"loss": 0.2569,
"mean_token_accuracy": 0.9202696919441223,
"step": 1490
},
{
"epoch": 3.2083780880773363,
"grad_norm": 0.1643698296522669,
"learning_rate": 1.0075497616571402e-05,
"loss": 0.2613,
"mean_token_accuracy": 0.9193197846412658,
"step": 1495
},
{
"epoch": 3.2191192266380235,
"grad_norm": 0.17523553700537306,
"learning_rate": 9.942692203577937e-06,
"loss": 0.2617,
"mean_token_accuracy": 0.9192265450954438,
"step": 1500
},
{
"epoch": 3.2298603651987112,
"grad_norm": 0.17674127090736955,
"learning_rate": 9.811432698364748e-06,
"loss": 0.2611,
"mean_token_accuracy": 0.9191824972629548,
"step": 1505
},
{
"epoch": 3.2406015037593985,
"grad_norm": 0.17789280108349984,
"learning_rate": 9.681730658232796e-06,
"loss": 0.2631,
"mean_token_accuracy": 0.9186322450637817,
"step": 1510
},
{
"epoch": 3.2513426423200857,
"grad_norm": 0.17266428476273013,
"learning_rate": 9.553597503349415e-06,
"loss": 0.2582,
"mean_token_accuracy": 0.9197676658630372,
"step": 1515
},
{
"epoch": 3.2620837808807734,
"grad_norm": 0.1756023449894313,
"learning_rate": 9.427044515742773e-06,
"loss": 0.2583,
"mean_token_accuracy": 0.9203043103218078,
"step": 1520
},
{
"epoch": 3.2728249194414607,
"grad_norm": 0.1705185261901335,
"learning_rate": 9.302082838308494e-06,
"loss": 0.2588,
"mean_token_accuracy": 0.9197465479373932,
"step": 1525
},
{
"epoch": 3.2835660580021484,
"grad_norm": 0.1863220207081355,
"learning_rate": 9.178723473828517e-06,
"loss": 0.2592,
"mean_token_accuracy": 0.919755893945694,
"step": 1530
},
{
"epoch": 3.2943071965628357,
"grad_norm": 0.18144578655920904,
"learning_rate": 9.05697728400236e-06,
"loss": 0.2588,
"mean_token_accuracy": 0.9201307475566864,
"step": 1535
},
{
"epoch": 3.305048335123523,
"grad_norm": 0.17313846247861978,
"learning_rate": 8.936854988490695e-06,
"loss": 0.2627,
"mean_token_accuracy": 0.9188291728496552,
"step": 1540
},
{
"epoch": 3.3157894736842106,
"grad_norm": 0.1801914802446693,
"learning_rate": 8.818367163971535e-06,
"loss": 0.2557,
"mean_token_accuracy": 0.9207710027694702,
"step": 1545
},
{
"epoch": 3.326530612244898,
"grad_norm": 0.16994847146506772,
"learning_rate": 8.701524243208935e-06,
"loss": 0.2598,
"mean_token_accuracy": 0.9194996774196624,
"step": 1550
},
{
"epoch": 3.3372717508055856,
"grad_norm": 0.16955583517854705,
"learning_rate": 8.586336514134416e-06,
"loss": 0.2566,
"mean_token_accuracy": 0.9205721557140351,
"step": 1555
},
{
"epoch": 3.348012889366273,
"grad_norm": 0.17107585176009693,
"learning_rate": 8.472814118941111e-06,
"loss": 0.2594,
"mean_token_accuracy": 0.9197823405265808,
"step": 1560
},
{
"epoch": 3.35875402792696,
"grad_norm": 0.17753792836827956,
"learning_rate": 8.360967053190748e-06,
"loss": 0.2595,
"mean_token_accuracy": 0.9195821940898895,
"step": 1565
},
{
"epoch": 3.369495166487648,
"grad_norm": 0.1663276449550015,
"learning_rate": 8.250805164933576e-06,
"loss": 0.2576,
"mean_token_accuracy": 0.9204757869243622,
"step": 1570
},
{
"epoch": 3.380236305048335,
"grad_norm": 0.1727926922684143,
"learning_rate": 8.142338153841204e-06,
"loss": 0.2613,
"mean_token_accuracy": 0.9192953467369079,
"step": 1575
},
{
"epoch": 3.3909774436090228,
"grad_norm": 0.16245992891648223,
"learning_rate": 8.035575570352586e-06,
"loss": 0.2603,
"mean_token_accuracy": 0.9196378767490387,
"step": 1580
},
{
"epoch": 3.40171858216971,
"grad_norm": 0.1728382431801045,
"learning_rate": 7.930526814833114e-06,
"loss": 0.2642,
"mean_token_accuracy": 0.9182481050491333,
"step": 1585
},
{
"epoch": 3.4124597207303973,
"grad_norm": 0.17059237401574356,
"learning_rate": 7.827201136746903e-06,
"loss": 0.2608,
"mean_token_accuracy": 0.9196362137794495,
"step": 1590
},
{
"epoch": 3.423200859291085,
"grad_norm": 0.17006814998266018,
"learning_rate": 7.725607633842397e-06,
"loss": 0.262,
"mean_token_accuracy": 0.9188037991523743,
"step": 1595
},
{
"epoch": 3.4339419978517722,
"grad_norm": 0.17763939677962118,
"learning_rate": 7.625755251351302e-06,
"loss": 0.2571,
"mean_token_accuracy": 0.92064950466156,
"step": 1600
},
{
"epoch": 3.4446831364124595,
"grad_norm": 0.16880550111530884,
"learning_rate": 7.52765278120101e-06,
"loss": 0.2619,
"mean_token_accuracy": 0.919091010093689,
"step": 1605
},
{
"epoch": 3.455424274973147,
"grad_norm": 0.17470127038229266,
"learning_rate": 7.431308861240405e-06,
"loss": 0.2611,
"mean_token_accuracy": 0.9194313704967498,
"step": 1610
},
{
"epoch": 3.4661654135338344,
"grad_norm": 0.18361814009538877,
"learning_rate": 7.336731974479366e-06,
"loss": 0.2606,
"mean_token_accuracy": 0.9194453060626984,
"step": 1615
},
{
"epoch": 3.476906552094522,
"grad_norm": 0.16896194278522544,
"learning_rate": 7.2439304483418275e-06,
"loss": 0.2567,
"mean_token_accuracy": 0.9206092417240143,
"step": 1620
},
{
"epoch": 3.4876476906552094,
"grad_norm": 0.16668518571688956,
"learning_rate": 7.152912453932546e-06,
"loss": 0.2595,
"mean_token_accuracy": 0.9194850385189056,
"step": 1625
},
{
"epoch": 3.498388829215897,
"grad_norm": 0.17386165770379072,
"learning_rate": 7.063686005317651e-06,
"loss": 0.2579,
"mean_token_accuracy": 0.9201728105545044,
"step": 1630
},
{
"epoch": 3.5091299677765844,
"grad_norm": 0.17090370338380814,
"learning_rate": 6.976258958819e-06,
"loss": 0.2583,
"mean_token_accuracy": 0.9202900052070617,
"step": 1635
},
{
"epoch": 3.5198711063372716,
"grad_norm": 0.1670190265056932,
"learning_rate": 6.890639012322459e-06,
"loss": 0.2547,
"mean_token_accuracy": 0.9211665093898773,
"step": 1640
},
{
"epoch": 3.5306122448979593,
"grad_norm": 0.17315381341418587,
"learning_rate": 6.806833704600082e-06,
"loss": 0.2561,
"mean_token_accuracy": 0.9206245243549347,
"step": 1645
},
{
"epoch": 3.5413533834586466,
"grad_norm": 0.17367639326439366,
"learning_rate": 6.724850414646344e-06,
"loss": 0.2554,
"mean_token_accuracy": 0.9209690392017365,
"step": 1650
},
{
"epoch": 3.552094522019334,
"grad_norm": 0.18356634723924625,
"learning_rate": 6.644696361028427e-06,
"loss": 0.2546,
"mean_token_accuracy": 0.9211890578269959,
"step": 1655
},
{
"epoch": 3.5628356605800215,
"grad_norm": 0.1686096868472299,
"learning_rate": 6.566378601250625e-06,
"loss": 0.258,
"mean_token_accuracy": 0.9201010644435883,
"step": 1660
},
{
"epoch": 3.573576799140709,
"grad_norm": 0.17097492830249045,
"learning_rate": 6.489904031132919e-06,
"loss": 0.2573,
"mean_token_accuracy": 0.9203424453735352,
"step": 1665
},
{
"epoch": 3.5843179377013965,
"grad_norm": 0.1708922574820426,
"learning_rate": 6.415279384203853e-06,
"loss": 0.2573,
"mean_token_accuracy": 0.9202109038829803,
"step": 1670
},
{
"epoch": 3.5950590762620838,
"grad_norm": 0.1772280034240442,
"learning_rate": 6.3425112311075965e-06,
"loss": 0.2563,
"mean_token_accuracy": 0.9204185366630554,
"step": 1675
},
{
"epoch": 3.6058002148227715,
"grad_norm": 0.17186880847864094,
"learning_rate": 6.271605979025448e-06,
"loss": 0.2555,
"mean_token_accuracy": 0.9206036269664765,
"step": 1680
},
{
"epoch": 3.6165413533834587,
"grad_norm": 0.16731807378864566,
"learning_rate": 6.2025698711116535e-06,
"loss": 0.2565,
"mean_token_accuracy": 0.9205489337444306,
"step": 1685
},
{
"epoch": 3.627282491944146,
"grad_norm": 0.17180713091530317,
"learning_rate": 6.135408985943734e-06,
"loss": 0.2573,
"mean_token_accuracy": 0.9204003512859344,
"step": 1690
},
{
"epoch": 3.6380236305048337,
"grad_norm": 0.1761977177776313,
"learning_rate": 6.07012923698724e-06,
"loss": 0.2587,
"mean_token_accuracy": 0.9196424603462219,
"step": 1695
},
{
"epoch": 3.648764769065521,
"grad_norm": 0.17221380858566646,
"learning_rate": 6.006736372075093e-06,
"loss": 0.2579,
"mean_token_accuracy": 0.9200917899608612,
"step": 1700
},
{
"epoch": 3.659505907626208,
"grad_norm": 0.16805608384415285,
"learning_rate": 5.9452359729015004e-06,
"loss": 0.2573,
"mean_token_accuracy": 0.9203401625156402,
"step": 1705
},
{
"epoch": 3.670247046186896,
"grad_norm": 0.1736765217184823,
"learning_rate": 5.8856334545304676e-06,
"loss": 0.2574,
"mean_token_accuracy": 0.9203644514083862,
"step": 1710
},
{
"epoch": 3.680988184747583,
"grad_norm": 0.1726788133620247,
"learning_rate": 5.8279340649190244e-06,
"loss": 0.2611,
"mean_token_accuracy": 0.9194235980510712,
"step": 1715
},
{
"epoch": 3.6917293233082704,
"grad_norm": 0.16707078529197217,
"learning_rate": 5.7721428844551425e-06,
"loss": 0.2611,
"mean_token_accuracy": 0.9193582713603974,
"step": 1720
},
{
"epoch": 3.702470461868958,
"grad_norm": 0.17182290992101512,
"learning_rate": 5.7182648255104065e-06,
"loss": 0.2596,
"mean_token_accuracy": 0.9196705460548401,
"step": 1725
},
{
"epoch": 3.7132116004296454,
"grad_norm": 0.17419790279430714,
"learning_rate": 5.666304632007487e-06,
"loss": 0.2595,
"mean_token_accuracy": 0.9197326540946961,
"step": 1730
},
{
"epoch": 3.723952738990333,
"grad_norm": 0.18041100180688655,
"learning_rate": 5.616266879002444e-06,
"loss": 0.2575,
"mean_token_accuracy": 0.9202880382537841,
"step": 1735
},
{
"epoch": 3.7346938775510203,
"grad_norm": 0.16636878690891047,
"learning_rate": 5.568155972281892e-06,
"loss": 0.2582,
"mean_token_accuracy": 0.9199542105197906,
"step": 1740
},
{
"epoch": 3.745435016111708,
"grad_norm": 0.17005943549418737,
"learning_rate": 5.521976147975078e-06,
"loss": 0.2575,
"mean_token_accuracy": 0.9207047700881958,
"step": 1745
},
{
"epoch": 3.7561761546723953,
"grad_norm": 0.17142683208534373,
"learning_rate": 5.477731472180884e-06,
"loss": 0.2578,
"mean_token_accuracy": 0.9200609147548675,
"step": 1750
},
{
"epoch": 3.7669172932330826,
"grad_norm": 0.19597039412044637,
"learning_rate": 5.4354258406098275e-06,
"loss": 0.2605,
"mean_token_accuracy": 0.9196163058280945,
"step": 1755
},
{
"epoch": 3.7776584317937703,
"grad_norm": 0.1891144335762954,
"learning_rate": 5.395062978241028e-06,
"loss": 0.256,
"mean_token_accuracy": 0.9203970789909363,
"step": 1760
},
{
"epoch": 3.7883995703544575,
"grad_norm": 0.1734382570098929,
"learning_rate": 5.356646438994236e-06,
"loss": 0.2562,
"mean_token_accuracy": 0.9206745564937592,
"step": 1765
},
{
"epoch": 3.7991407089151448,
"grad_norm": 0.167509733493585,
"learning_rate": 5.3201796054169155e-06,
"loss": 0.2587,
"mean_token_accuracy": 0.919745409488678,
"step": 1770
},
{
"epoch": 3.8098818474758325,
"grad_norm": 0.1758205628466223,
"learning_rate": 5.285665688386408e-06,
"loss": 0.2554,
"mean_token_accuracy": 0.9208223819732666,
"step": 1775
},
{
"epoch": 3.8206229860365197,
"grad_norm": 0.16934855068248722,
"learning_rate": 5.253107726827213e-06,
"loss": 0.2553,
"mean_token_accuracy": 0.9208275616168976,
"step": 1780
},
{
"epoch": 3.8313641245972074,
"grad_norm": 0.17212203700590173,
"learning_rate": 5.222508587443419e-06,
"loss": 0.2558,
"mean_token_accuracy": 0.9208298087120056,
"step": 1785
},
{
"epoch": 3.8421052631578947,
"grad_norm": 0.17351309384632746,
"learning_rate": 5.193870964466299e-06,
"loss": 0.2572,
"mean_token_accuracy": 0.9206307530403137,
"step": 1790
},
{
"epoch": 3.8528464017185824,
"grad_norm": 0.17423994454268188,
"learning_rate": 5.167197379417072e-06,
"loss": 0.2563,
"mean_token_accuracy": 0.9204454243183136,
"step": 1795
},
{
"epoch": 3.8635875402792696,
"grad_norm": 0.17091404042612268,
"learning_rate": 5.142490180884889e-06,
"loss": 0.2566,
"mean_token_accuracy": 0.920625650882721,
"step": 1800
},
{
"epoch": 3.874328678839957,
"grad_norm": 0.17402338382213903,
"learning_rate": 5.119751544320045e-06,
"loss": 0.2548,
"mean_token_accuracy": 0.9212319254875183,
"step": 1805
},
{
"epoch": 3.8850698174006446,
"grad_norm": 0.17785847377734187,
"learning_rate": 5.098983471842435e-06,
"loss": 0.2582,
"mean_token_accuracy": 0.9204130828380584,
"step": 1810
},
{
"epoch": 3.895810955961332,
"grad_norm": 0.17476387276762337,
"learning_rate": 5.080187792065258e-06,
"loss": 0.2576,
"mean_token_accuracy": 0.9203925788402557,
"step": 1815
},
{
"epoch": 3.906552094522019,
"grad_norm": 0.17401606856867693,
"learning_rate": 5.063366159934019e-06,
"loss": 0.257,
"mean_token_accuracy": 0.9207073092460633,
"step": 1820
},
{
"epoch": 3.917293233082707,
"grad_norm": 0.1709751716211779,
"learning_rate": 5.04852005658081e-06,
"loss": 0.2567,
"mean_token_accuracy": 0.9206726491451264,
"step": 1825
},
{
"epoch": 3.928034371643394,
"grad_norm": 0.17944667291264363,
"learning_rate": 5.035650789193893e-06,
"loss": 0.2583,
"mean_token_accuracy": 0.919947350025177,
"step": 1830
},
{
"epoch": 3.938775510204082,
"grad_norm": 0.17075839857976619,
"learning_rate": 5.024759490902604e-06,
"loss": 0.2606,
"mean_token_accuracy": 0.9192731857299805,
"step": 1835
},
{
"epoch": 3.949516648764769,
"grad_norm": 0.1725574446830871,
"learning_rate": 5.015847120677588e-06,
"loss": 0.2585,
"mean_token_accuracy": 0.9199050843715668,
"step": 1840
},
{
"epoch": 3.9602577873254567,
"grad_norm": 0.17546758649223276,
"learning_rate": 5.008914463246362e-06,
"loss": 0.2586,
"mean_token_accuracy": 0.920122253894806,
"step": 1845
},
{
"epoch": 3.970998925886144,
"grad_norm": 0.16820021081330186,
"learning_rate": 5.0039621290242065e-06,
"loss": 0.2583,
"mean_token_accuracy": 0.9200729191303253,
"step": 1850
},
{
"epoch": 3.9817400644468313,
"grad_norm": 0.17517771341096255,
"learning_rate": 5.000990554060436e-06,
"loss": 0.2604,
"mean_token_accuracy": 0.9193271338939667,
"step": 1855
},
{
"epoch": 3.992481203007519,
"grad_norm": 0.17294557291581655,
"learning_rate": 5e-06,
"loss": 0.2556,
"mean_token_accuracy": 0.920825207233429,
"step": 1860
},
{
"epoch": 3.992481203007519,
"step": 1860,
"total_flos": 966947082862592.0,
"train_loss": 0.34282420668550717,
"train_runtime": 10626.5662,
"train_samples_per_second": 2.802,
"train_steps_per_second": 0.175
}
],
"logging_steps": 5,
"max_steps": 1860,
"num_input_tokens_seen": 0,
"num_train_epochs": 4,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 966947082862592.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}