ReForm-SFT-1.5B / trainer_state.json
SiniShell1's picture
Upload folder using huggingface_hub
42d8ea3 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 4.0,
"eval_steps": 500,
"global_step": 620,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0064516129032258064,
"grad_norm": 2.3284332752227783,
"learning_rate": 4.032258064516129e-07,
"loss": 0.1831,
"step": 1
},
{
"epoch": 0.012903225806451613,
"grad_norm": 3.1032278537750244,
"learning_rate": 8.064516129032258e-07,
"loss": 0.2496,
"step": 2
},
{
"epoch": 0.01935483870967742,
"grad_norm": 2.7308666706085205,
"learning_rate": 1.2096774193548388e-06,
"loss": 0.2497,
"step": 3
},
{
"epoch": 0.025806451612903226,
"grad_norm": 2.6942598819732666,
"learning_rate": 1.6129032258064516e-06,
"loss": 0.2513,
"step": 4
},
{
"epoch": 0.03225806451612903,
"grad_norm": 2.281903028488159,
"learning_rate": 2.0161290322580646e-06,
"loss": 0.2021,
"step": 5
},
{
"epoch": 0.03870967741935484,
"grad_norm": 2.2116780281066895,
"learning_rate": 2.4193548387096776e-06,
"loss": 0.2472,
"step": 6
},
{
"epoch": 0.04516129032258064,
"grad_norm": 2.3709909915924072,
"learning_rate": 2.82258064516129e-06,
"loss": 0.2133,
"step": 7
},
{
"epoch": 0.05161290322580645,
"grad_norm": 1.6222234964370728,
"learning_rate": 3.225806451612903e-06,
"loss": 0.2072,
"step": 8
},
{
"epoch": 0.05806451612903226,
"grad_norm": 1.6226286888122559,
"learning_rate": 3.6290322580645166e-06,
"loss": 0.1732,
"step": 9
},
{
"epoch": 0.06451612903225806,
"grad_norm": 1.514697551727295,
"learning_rate": 4.032258064516129e-06,
"loss": 0.2088,
"step": 10
},
{
"epoch": 0.07096774193548387,
"grad_norm": 1.6407102346420288,
"learning_rate": 4.435483870967742e-06,
"loss": 0.1733,
"step": 11
},
{
"epoch": 0.07741935483870968,
"grad_norm": 1.5498034954071045,
"learning_rate": 4.838709677419355e-06,
"loss": 0.2087,
"step": 12
},
{
"epoch": 0.08387096774193549,
"grad_norm": 1.9465513229370117,
"learning_rate": 5.241935483870968e-06,
"loss": 0.2046,
"step": 13
},
{
"epoch": 0.09032258064516129,
"grad_norm": 1.3483728170394897,
"learning_rate": 5.64516129032258e-06,
"loss": 0.1783,
"step": 14
},
{
"epoch": 0.0967741935483871,
"grad_norm": 1.4068559408187866,
"learning_rate": 6.048387096774194e-06,
"loss": 0.1519,
"step": 15
},
{
"epoch": 0.1032258064516129,
"grad_norm": 1.3083986043930054,
"learning_rate": 6.451612903225806e-06,
"loss": 0.1586,
"step": 16
},
{
"epoch": 0.10967741935483871,
"grad_norm": 1.6559300422668457,
"learning_rate": 6.854838709677419e-06,
"loss": 0.1476,
"step": 17
},
{
"epoch": 0.11612903225806452,
"grad_norm": 1.0691255331039429,
"learning_rate": 7.258064516129033e-06,
"loss": 0.1478,
"step": 18
},
{
"epoch": 0.12258064516129032,
"grad_norm": 1.1155110597610474,
"learning_rate": 7.661290322580646e-06,
"loss": 0.1382,
"step": 19
},
{
"epoch": 0.12903225806451613,
"grad_norm": 0.9816218018531799,
"learning_rate": 8.064516129032258e-06,
"loss": 0.1204,
"step": 20
},
{
"epoch": 0.13548387096774195,
"grad_norm": 1.2463096380233765,
"learning_rate": 8.46774193548387e-06,
"loss": 0.1503,
"step": 21
},
{
"epoch": 0.14193548387096774,
"grad_norm": 1.3447906970977783,
"learning_rate": 8.870967741935484e-06,
"loss": 0.1224,
"step": 22
},
{
"epoch": 0.14838709677419354,
"grad_norm": 1.1465381383895874,
"learning_rate": 9.274193548387097e-06,
"loss": 0.1374,
"step": 23
},
{
"epoch": 0.15483870967741936,
"grad_norm": 1.0978549718856812,
"learning_rate": 9.67741935483871e-06,
"loss": 0.1303,
"step": 24
},
{
"epoch": 0.16129032258064516,
"grad_norm": 1.1053048372268677,
"learning_rate": 1.0080645161290323e-05,
"loss": 0.1423,
"step": 25
},
{
"epoch": 0.16774193548387098,
"grad_norm": 1.0212026834487915,
"learning_rate": 1.0483870967741936e-05,
"loss": 0.1281,
"step": 26
},
{
"epoch": 0.17419354838709677,
"grad_norm": 0.9742250442504883,
"learning_rate": 1.0887096774193549e-05,
"loss": 0.1228,
"step": 27
},
{
"epoch": 0.18064516129032257,
"grad_norm": 1.1676782369613647,
"learning_rate": 1.129032258064516e-05,
"loss": 0.1429,
"step": 28
},
{
"epoch": 0.1870967741935484,
"grad_norm": 1.088600516319275,
"learning_rate": 1.1693548387096775e-05,
"loss": 0.1069,
"step": 29
},
{
"epoch": 0.1935483870967742,
"grad_norm": 1.3947196006774902,
"learning_rate": 1.2096774193548388e-05,
"loss": 0.1316,
"step": 30
},
{
"epoch": 0.2,
"grad_norm": 0.9059141874313354,
"learning_rate": 1.25e-05,
"loss": 0.1121,
"step": 31
},
{
"epoch": 0.2064516129032258,
"grad_norm": 1.0918734073638916,
"learning_rate": 1.2903225806451613e-05,
"loss": 0.1521,
"step": 32
},
{
"epoch": 0.2129032258064516,
"grad_norm": 1.1550475358963013,
"learning_rate": 1.3306451612903225e-05,
"loss": 0.1251,
"step": 33
},
{
"epoch": 0.21935483870967742,
"grad_norm": 1.2110551595687866,
"learning_rate": 1.3709677419354839e-05,
"loss": 0.1325,
"step": 34
},
{
"epoch": 0.22580645161290322,
"grad_norm": 1.0340098142623901,
"learning_rate": 1.4112903225806454e-05,
"loss": 0.1115,
"step": 35
},
{
"epoch": 0.23225806451612904,
"grad_norm": 0.9789180159568787,
"learning_rate": 1.4516129032258066e-05,
"loss": 0.1154,
"step": 36
},
{
"epoch": 0.23870967741935484,
"grad_norm": 0.8362810015678406,
"learning_rate": 1.4919354838709679e-05,
"loss": 0.1041,
"step": 37
},
{
"epoch": 0.24516129032258063,
"grad_norm": 1.0805575847625732,
"learning_rate": 1.5322580645161292e-05,
"loss": 0.1201,
"step": 38
},
{
"epoch": 0.25161290322580643,
"grad_norm": 1.0794912576675415,
"learning_rate": 1.5725806451612903e-05,
"loss": 0.1387,
"step": 39
},
{
"epoch": 0.25806451612903225,
"grad_norm": 1.0303066968917847,
"learning_rate": 1.6129032258064517e-05,
"loss": 0.1381,
"step": 40
},
{
"epoch": 0.2645161290322581,
"grad_norm": 0.8959848284721375,
"learning_rate": 1.653225806451613e-05,
"loss": 0.1251,
"step": 41
},
{
"epoch": 0.2709677419354839,
"grad_norm": 1.0856695175170898,
"learning_rate": 1.693548387096774e-05,
"loss": 0.1363,
"step": 42
},
{
"epoch": 0.27741935483870966,
"grad_norm": 0.8375802636146545,
"learning_rate": 1.733870967741936e-05,
"loss": 0.1009,
"step": 43
},
{
"epoch": 0.2838709677419355,
"grad_norm": 0.9029824733734131,
"learning_rate": 1.774193548387097e-05,
"loss": 0.1006,
"step": 44
},
{
"epoch": 0.2903225806451613,
"grad_norm": 0.8736345767974854,
"learning_rate": 1.8145161290322583e-05,
"loss": 0.1212,
"step": 45
},
{
"epoch": 0.2967741935483871,
"grad_norm": 1.165887713432312,
"learning_rate": 1.8548387096774193e-05,
"loss": 0.1241,
"step": 46
},
{
"epoch": 0.3032258064516129,
"grad_norm": 0.8511247634887695,
"learning_rate": 1.8951612903225807e-05,
"loss": 0.0942,
"step": 47
},
{
"epoch": 0.3096774193548387,
"grad_norm": 1.0182602405548096,
"learning_rate": 1.935483870967742e-05,
"loss": 0.116,
"step": 48
},
{
"epoch": 0.3161290322580645,
"grad_norm": 0.8452662825584412,
"learning_rate": 1.975806451612903e-05,
"loss": 0.1023,
"step": 49
},
{
"epoch": 0.3225806451612903,
"grad_norm": 0.9583229422569275,
"learning_rate": 2.0161290322580645e-05,
"loss": 0.0956,
"step": 50
},
{
"epoch": 0.32903225806451614,
"grad_norm": 0.935484766960144,
"learning_rate": 2.056451612903226e-05,
"loss": 0.1185,
"step": 51
},
{
"epoch": 0.33548387096774196,
"grad_norm": 0.9844627380371094,
"learning_rate": 2.0967741935483873e-05,
"loss": 0.0998,
"step": 52
},
{
"epoch": 0.3419354838709677,
"grad_norm": 1.0139315128326416,
"learning_rate": 2.1370967741935487e-05,
"loss": 0.0901,
"step": 53
},
{
"epoch": 0.34838709677419355,
"grad_norm": 0.844688892364502,
"learning_rate": 2.1774193548387097e-05,
"loss": 0.1158,
"step": 54
},
{
"epoch": 0.3548387096774194,
"grad_norm": 0.778408408164978,
"learning_rate": 2.217741935483871e-05,
"loss": 0.0884,
"step": 55
},
{
"epoch": 0.36129032258064514,
"grad_norm": 0.7307286858558655,
"learning_rate": 2.258064516129032e-05,
"loss": 0.1099,
"step": 56
},
{
"epoch": 0.36774193548387096,
"grad_norm": 0.681089460849762,
"learning_rate": 2.2983870967741935e-05,
"loss": 0.0965,
"step": 57
},
{
"epoch": 0.3741935483870968,
"grad_norm": 0.7206712365150452,
"learning_rate": 2.338709677419355e-05,
"loss": 0.0898,
"step": 58
},
{
"epoch": 0.38064516129032255,
"grad_norm": 0.6326794624328613,
"learning_rate": 2.3790322580645163e-05,
"loss": 0.0896,
"step": 59
},
{
"epoch": 0.3870967741935484,
"grad_norm": 0.6684013605117798,
"learning_rate": 2.4193548387096777e-05,
"loss": 0.0859,
"step": 60
},
{
"epoch": 0.3935483870967742,
"grad_norm": 0.7839128971099854,
"learning_rate": 2.4596774193548387e-05,
"loss": 0.0918,
"step": 61
},
{
"epoch": 0.4,
"grad_norm": 0.7025837302207947,
"learning_rate": 2.5e-05,
"loss": 0.0933,
"step": 62
},
{
"epoch": 0.4064516129032258,
"grad_norm": 0.7583072185516357,
"learning_rate": 2.4999801888257584e-05,
"loss": 0.0916,
"step": 63
},
{
"epoch": 0.4129032258064516,
"grad_norm": 0.8116795420646667,
"learning_rate": 2.499920755931005e-05,
"loss": 0.0934,
"step": 64
},
{
"epoch": 0.41935483870967744,
"grad_norm": 0.9053534865379333,
"learning_rate": 2.4998217031996375e-05,
"loss": 0.1116,
"step": 65
},
{
"epoch": 0.4258064516129032,
"grad_norm": 0.773985743522644,
"learning_rate": 2.4996830337714163e-05,
"loss": 0.0874,
"step": 66
},
{
"epoch": 0.432258064516129,
"grad_norm": 0.8468173146247864,
"learning_rate": 2.4995047520418692e-05,
"loss": 0.0954,
"step": 67
},
{
"epoch": 0.43870967741935485,
"grad_norm": 0.7126619815826416,
"learning_rate": 2.4992868636621474e-05,
"loss": 0.1017,
"step": 68
},
{
"epoch": 0.44516129032258067,
"grad_norm": 0.7975043654441833,
"learning_rate": 2.4990293755388524e-05,
"loss": 0.1086,
"step": 69
},
{
"epoch": 0.45161290322580644,
"grad_norm": 0.8055579662322998,
"learning_rate": 2.4987322958338095e-05,
"loss": 0.0836,
"step": 70
},
{
"epoch": 0.45806451612903226,
"grad_norm": 0.6494209170341492,
"learning_rate": 2.4983956339638158e-05,
"loss": 0.0883,
"step": 71
},
{
"epoch": 0.4645161290322581,
"grad_norm": 0.6997829675674438,
"learning_rate": 2.4980194006003392e-05,
"loss": 0.0763,
"step": 72
},
{
"epoch": 0.47096774193548385,
"grad_norm": 0.596174418926239,
"learning_rate": 2.4976036076691787e-05,
"loss": 0.0871,
"step": 73
},
{
"epoch": 0.4774193548387097,
"grad_norm": 0.6535652279853821,
"learning_rate": 2.4971482683500884e-05,
"loss": 0.0869,
"step": 74
},
{
"epoch": 0.4838709677419355,
"grad_norm": 0.8003737926483154,
"learning_rate": 2.4966533970763586e-05,
"loss": 0.1086,
"step": 75
},
{
"epoch": 0.49032258064516127,
"grad_norm": 0.6992926001548767,
"learning_rate": 2.496119009534359e-05,
"loss": 0.0822,
"step": 76
},
{
"epoch": 0.4967741935483871,
"grad_norm": 0.6500689387321472,
"learning_rate": 2.4955451226630412e-05,
"loss": 0.0876,
"step": 77
},
{
"epoch": 0.5032258064516129,
"grad_norm": 0.7626132369041443,
"learning_rate": 2.4949317546534018e-05,
"loss": 0.0911,
"step": 78
},
{
"epoch": 0.5096774193548387,
"grad_norm": 0.6485949158668518,
"learning_rate": 2.4942789249479054e-05,
"loss": 0.0914,
"step": 79
},
{
"epoch": 0.5161290322580645,
"grad_norm": 0.692364513874054,
"learning_rate": 2.493586654239869e-05,
"loss": 0.1074,
"step": 80
},
{
"epoch": 0.5225806451612903,
"grad_norm": 0.7383131980895996,
"learning_rate": 2.4928549644728057e-05,
"loss": 0.085,
"step": 81
},
{
"epoch": 0.5290322580645161,
"grad_norm": 0.6585950255393982,
"learning_rate": 2.492083878839729e-05,
"loss": 0.0795,
"step": 82
},
{
"epoch": 0.535483870967742,
"grad_norm": 0.7683681845664978,
"learning_rate": 2.491273421782417e-05,
"loss": 0.073,
"step": 83
},
{
"epoch": 0.5419354838709678,
"grad_norm": 0.5386450290679932,
"learning_rate": 2.4904236189906406e-05,
"loss": 0.0814,
"step": 84
},
{
"epoch": 0.5483870967741935,
"grad_norm": 0.725712239742279,
"learning_rate": 2.489534497401345e-05,
"loss": 0.0896,
"step": 85
},
{
"epoch": 0.5548387096774193,
"grad_norm": 0.8596577644348145,
"learning_rate": 2.488606085197799e-05,
"loss": 0.0816,
"step": 86
},
{
"epoch": 0.5612903225806452,
"grad_norm": 0.7653164863586426,
"learning_rate": 2.4876384118086992e-05,
"loss": 0.1078,
"step": 87
},
{
"epoch": 0.567741935483871,
"grad_norm": 0.713628351688385,
"learning_rate": 2.48663150790724e-05,
"loss": 0.0887,
"step": 88
},
{
"epoch": 0.5741935483870968,
"grad_norm": 0.5724640488624573,
"learning_rate": 2.4855854054101395e-05,
"loss": 0.0849,
"step": 89
},
{
"epoch": 0.5806451612903226,
"grad_norm": 0.6235289573669434,
"learning_rate": 2.484500137476627e-05,
"loss": 0.0875,
"step": 90
},
{
"epoch": 0.5870967741935483,
"grad_norm": 0.785372793674469,
"learning_rate": 2.483375738507395e-05,
"loss": 0.1225,
"step": 91
},
{
"epoch": 0.5935483870967742,
"grad_norm": 0.6431748867034912,
"learning_rate": 2.4822122441435047e-05,
"loss": 0.0913,
"step": 92
},
{
"epoch": 0.6,
"grad_norm": 0.8031719923019409,
"learning_rate": 2.4810096912652604e-05,
"loss": 0.102,
"step": 93
},
{
"epoch": 0.6064516129032258,
"grad_norm": 0.5750744938850403,
"learning_rate": 2.4797681179910363e-05,
"loss": 0.0754,
"step": 94
},
{
"epoch": 0.6129032258064516,
"grad_norm": 0.7892565727233887,
"learning_rate": 2.4784875636760727e-05,
"loss": 0.0825,
"step": 95
},
{
"epoch": 0.6193548387096774,
"grad_norm": 0.7932739853858948,
"learning_rate": 2.4771680689112244e-05,
"loss": 0.1262,
"step": 96
},
{
"epoch": 0.6258064516129033,
"grad_norm": 0.7647889852523804,
"learning_rate": 2.4758096755216763e-05,
"loss": 0.1083,
"step": 97
},
{
"epoch": 0.632258064516129,
"grad_norm": 0.9550963640213013,
"learning_rate": 2.474412426565618e-05,
"loss": 0.0828,
"step": 98
},
{
"epoch": 0.6387096774193548,
"grad_norm": 0.6981013417243958,
"learning_rate": 2.4729763663328774e-05,
"loss": 0.0943,
"step": 99
},
{
"epoch": 0.6451612903225806,
"grad_norm": 0.8088532090187073,
"learning_rate": 2.4715015403435176e-05,
"loss": 0.0954,
"step": 100
},
{
"epoch": 0.6516129032258065,
"grad_norm": 0.9130911231040955,
"learning_rate": 2.4699879953463945e-05,
"loss": 0.0973,
"step": 101
},
{
"epoch": 0.6580645161290323,
"grad_norm": 0.791867196559906,
"learning_rate": 2.468435779317673e-05,
"loss": 0.0946,
"step": 102
},
{
"epoch": 0.6645161290322581,
"grad_norm": 0.6049063205718994,
"learning_rate": 2.466844941459309e-05,
"loss": 0.0797,
"step": 103
},
{
"epoch": 0.6709677419354839,
"grad_norm": 0.6488558053970337,
"learning_rate": 2.4652155321974883e-05,
"loss": 0.1004,
"step": 104
},
{
"epoch": 0.6774193548387096,
"grad_norm": 0.7218672633171082,
"learning_rate": 2.4635476031810284e-05,
"loss": 0.0943,
"step": 105
},
{
"epoch": 0.6838709677419355,
"grad_norm": 0.7997153997421265,
"learning_rate": 2.4618412072797407e-05,
"loss": 0.0831,
"step": 106
},
{
"epoch": 0.6903225806451613,
"grad_norm": 0.8165119886398315,
"learning_rate": 2.4600963985827555e-05,
"loss": 0.0919,
"step": 107
},
{
"epoch": 0.6967741935483871,
"grad_norm": 0.704238772392273,
"learning_rate": 2.458313232396808e-05,
"loss": 0.0778,
"step": 108
},
{
"epoch": 0.7032258064516129,
"grad_norm": 0.6857476234436035,
"learning_rate": 2.456491765244483e-05,
"loss": 0.0914,
"step": 109
},
{
"epoch": 0.7096774193548387,
"grad_norm": 0.7254015803337097,
"learning_rate": 2.4546320548624264e-05,
"loss": 0.1102,
"step": 110
},
{
"epoch": 0.7161290322580646,
"grad_norm": 0.6534197330474854,
"learning_rate": 2.4527341601995115e-05,
"loss": 0.0841,
"step": 111
},
{
"epoch": 0.7225806451612903,
"grad_norm": 0.6944810152053833,
"learning_rate": 2.450798141414974e-05,
"loss": 0.1067,
"step": 112
},
{
"epoch": 0.7290322580645161,
"grad_norm": 0.7583324909210205,
"learning_rate": 2.448824059876503e-05,
"loss": 0.0979,
"step": 113
},
{
"epoch": 0.7354838709677419,
"grad_norm": 0.5010597705841064,
"learning_rate": 2.4468119781582948e-05,
"loss": 0.069,
"step": 114
},
{
"epoch": 0.7419354838709677,
"grad_norm": 0.5694583058357239,
"learning_rate": 2.444761960039072e-05,
"loss": 0.0687,
"step": 115
},
{
"epoch": 0.7483870967741936,
"grad_norm": 0.803371787071228,
"learning_rate": 2.442674070500061e-05,
"loss": 0.123,
"step": 116
},
{
"epoch": 0.7548387096774194,
"grad_norm": 0.6523027420043945,
"learning_rate": 2.4405483757229314e-05,
"loss": 0.0917,
"step": 117
},
{
"epoch": 0.7612903225806451,
"grad_norm": 0.6718930006027222,
"learning_rate": 2.438384943087698e-05,
"loss": 0.0854,
"step": 118
},
{
"epoch": 0.7677419354838709,
"grad_norm": 0.5987946391105652,
"learning_rate": 2.4361838411705865e-05,
"loss": 0.0941,
"step": 119
},
{
"epoch": 0.7741935483870968,
"grad_norm": 0.6336897015571594,
"learning_rate": 2.4339451397418584e-05,
"loss": 0.0885,
"step": 120
},
{
"epoch": 0.7806451612903226,
"grad_norm": 0.7484766840934753,
"learning_rate": 2.4316689097636008e-05,
"loss": 0.0966,
"step": 121
},
{
"epoch": 0.7870967741935484,
"grad_norm": 0.7096850275993347,
"learning_rate": 2.4293552233874754e-05,
"loss": 0.0843,
"step": 122
},
{
"epoch": 0.7935483870967742,
"grad_norm": 0.6953093409538269,
"learning_rate": 2.4270041539524322e-05,
"loss": 0.079,
"step": 123
},
{
"epoch": 0.8,
"grad_norm": 0.6068540215492249,
"learning_rate": 2.4246157759823855e-05,
"loss": 0.0846,
"step": 124
},
{
"epoch": 0.8064516129032258,
"grad_norm": 0.5982446670532227,
"learning_rate": 2.4221901651838506e-05,
"loss": 0.0864,
"step": 125
},
{
"epoch": 0.8129032258064516,
"grad_norm": 0.6706437468528748,
"learning_rate": 2.419727398443545e-05,
"loss": 0.0796,
"step": 126
},
{
"epoch": 0.8193548387096774,
"grad_norm": 0.6994534730911255,
"learning_rate": 2.417227553825949e-05,
"loss": 0.0775,
"step": 127
},
{
"epoch": 0.8258064516129032,
"grad_norm": 0.6935513615608215,
"learning_rate": 2.4146907105708357e-05,
"loss": 0.1003,
"step": 128
},
{
"epoch": 0.832258064516129,
"grad_norm": 0.6945312023162842,
"learning_rate": 2.4121169490907544e-05,
"loss": 0.0901,
"step": 129
},
{
"epoch": 0.8387096774193549,
"grad_norm": 0.6928992867469788,
"learning_rate": 2.409506350968485e-05,
"loss": 0.0991,
"step": 130
},
{
"epoch": 0.8451612903225807,
"grad_norm": 0.6358478665351868,
"learning_rate": 2.4068589989544498e-05,
"loss": 0.0877,
"step": 131
},
{
"epoch": 0.8516129032258064,
"grad_norm": 0.6835708022117615,
"learning_rate": 2.404174976964092e-05,
"loss": 0.1058,
"step": 132
},
{
"epoch": 0.8580645161290322,
"grad_norm": 0.6372717022895813,
"learning_rate": 2.4014543700752156e-05,
"loss": 0.0899,
"step": 133
},
{
"epoch": 0.864516129032258,
"grad_norm": 0.671310544013977,
"learning_rate": 2.3986972645252883e-05,
"loss": 0.0744,
"step": 134
},
{
"epoch": 0.8709677419354839,
"grad_norm": 0.5800638794898987,
"learning_rate": 2.395903747708707e-05,
"loss": 0.0818,
"step": 135
},
{
"epoch": 0.8774193548387097,
"grad_norm": 0.5142645835876465,
"learning_rate": 2.39307390817403e-05,
"loss": 0.0811,
"step": 136
},
{
"epoch": 0.8838709677419355,
"grad_norm": 0.7107434868812561,
"learning_rate": 2.390207835621167e-05,
"loss": 0.0876,
"step": 137
},
{
"epoch": 0.8903225806451613,
"grad_norm": 0.6197046637535095,
"learning_rate": 2.3873056208985383e-05,
"loss": 0.0907,
"step": 138
},
{
"epoch": 0.896774193548387,
"grad_norm": 0.8946641087532043,
"learning_rate": 2.384367356000195e-05,
"loss": 0.0867,
"step": 139
},
{
"epoch": 0.9032258064516129,
"grad_norm": 0.6002138257026672,
"learning_rate": 2.3813931340629018e-05,
"loss": 0.0766,
"step": 140
},
{
"epoch": 0.9096774193548387,
"grad_norm": 0.4771173298358917,
"learning_rate": 2.378383049363184e-05,
"loss": 0.074,
"step": 141
},
{
"epoch": 0.9161290322580645,
"grad_norm": 0.6188220381736755,
"learning_rate": 2.3753371973143433e-05,
"loss": 0.0823,
"step": 142
},
{
"epoch": 0.9225806451612903,
"grad_norm": 0.509564995765686,
"learning_rate": 2.3722556744634272e-05,
"loss": 0.069,
"step": 143
},
{
"epoch": 0.9290322580645162,
"grad_norm": 0.5153804421424866,
"learning_rate": 2.3691385784881743e-05,
"loss": 0.064,
"step": 144
},
{
"epoch": 0.9354838709677419,
"grad_norm": 0.5935696363449097,
"learning_rate": 2.3659860081939146e-05,
"loss": 0.0827,
"step": 145
},
{
"epoch": 0.9419354838709677,
"grad_norm": 0.4910190999507904,
"learning_rate": 2.3627980635104396e-05,
"loss": 0.0804,
"step": 146
},
{
"epoch": 0.9483870967741935,
"grad_norm": 0.6524127721786499,
"learning_rate": 2.359574845488833e-05,
"loss": 0.0956,
"step": 147
},
{
"epoch": 0.9548387096774194,
"grad_norm": 0.6664571762084961,
"learning_rate": 2.356316456298269e-05,
"loss": 0.0937,
"step": 148
},
{
"epoch": 0.9612903225806452,
"grad_norm": 0.579138994216919,
"learning_rate": 2.353022999222774e-05,
"loss": 0.0936,
"step": 149
},
{
"epoch": 0.967741935483871,
"grad_norm": 0.6929976940155029,
"learning_rate": 2.3496945786579503e-05,
"loss": 0.1197,
"step": 150
},
{
"epoch": 0.9741935483870968,
"grad_norm": 0.6236998438835144,
"learning_rate": 2.3463313001076696e-05,
"loss": 0.0958,
"step": 151
},
{
"epoch": 0.9806451612903225,
"grad_norm": 0.4676724672317505,
"learning_rate": 2.342933270180728e-05,
"loss": 0.0713,
"step": 152
},
{
"epoch": 0.9870967741935484,
"grad_norm": 0.4456840753555298,
"learning_rate": 2.3395005965874657e-05,
"loss": 0.0784,
"step": 153
},
{
"epoch": 0.9935483870967742,
"grad_norm": 0.5712344646453857,
"learning_rate": 2.336033388136355e-05,
"loss": 0.0935,
"step": 154
},
{
"epoch": 1.0,
"grad_norm": 0.4726645350456238,
"learning_rate": 2.3325317547305485e-05,
"loss": 0.0712,
"step": 155
},
{
"epoch": 1.0064516129032257,
"grad_norm": 0.48649105429649353,
"learning_rate": 2.3289958073643976e-05,
"loss": 0.0584,
"step": 156
},
{
"epoch": 1.0129032258064516,
"grad_norm": 0.5159472823143005,
"learning_rate": 2.3254256581199336e-05,
"loss": 0.0579,
"step": 157
},
{
"epoch": 1.0193548387096774,
"grad_norm": 0.5775710344314575,
"learning_rate": 2.3218214201633136e-05,
"loss": 0.0676,
"step": 158
},
{
"epoch": 1.0258064516129033,
"grad_norm": 0.5070593357086182,
"learning_rate": 2.318183207741237e-05,
"loss": 0.0794,
"step": 159
},
{
"epoch": 1.032258064516129,
"grad_norm": 0.38065212965011597,
"learning_rate": 2.3145111361773186e-05,
"loss": 0.051,
"step": 160
},
{
"epoch": 1.038709677419355,
"grad_norm": 0.562282383441925,
"learning_rate": 2.310805321868439e-05,
"loss": 0.0753,
"step": 161
},
{
"epoch": 1.0451612903225806,
"grad_norm": 0.49883219599723816,
"learning_rate": 2.30706588228105e-05,
"loss": 0.0554,
"step": 162
},
{
"epoch": 1.0516129032258064,
"grad_norm": 0.5298740863800049,
"learning_rate": 2.303292935947455e-05,
"loss": 0.0602,
"step": 163
},
{
"epoch": 1.0580645161290323,
"grad_norm": 0.5768100619316101,
"learning_rate": 2.2994866024620486e-05,
"loss": 0.0585,
"step": 164
},
{
"epoch": 1.064516129032258,
"grad_norm": 0.5079744458198547,
"learning_rate": 2.2956470024775294e-05,
"loss": 0.0459,
"step": 165
},
{
"epoch": 1.070967741935484,
"grad_norm": 0.5212790966033936,
"learning_rate": 2.291774257701072e-05,
"loss": 0.0619,
"step": 166
},
{
"epoch": 1.0774193548387097,
"grad_norm": 0.5063428282737732,
"learning_rate": 2.2878684908904707e-05,
"loss": 0.0609,
"step": 167
},
{
"epoch": 1.0838709677419356,
"grad_norm": 0.6523650288581848,
"learning_rate": 2.2839298258502483e-05,
"loss": 0.067,
"step": 168
},
{
"epoch": 1.0903225806451613,
"grad_norm": 0.57984459400177,
"learning_rate": 2.279958387427732e-05,
"loss": 0.0703,
"step": 169
},
{
"epoch": 1.096774193548387,
"grad_norm": 0.6002654433250427,
"learning_rate": 2.2759543015090955e-05,
"loss": 0.074,
"step": 170
},
{
"epoch": 1.103225806451613,
"grad_norm": 0.3899862766265869,
"learning_rate": 2.2719176950153688e-05,
"loss": 0.0461,
"step": 171
},
{
"epoch": 1.1096774193548387,
"grad_norm": 0.5003259778022766,
"learning_rate": 2.267848695898416e-05,
"loss": 0.0613,
"step": 172
},
{
"epoch": 1.1161290322580646,
"grad_norm": 0.558653712272644,
"learning_rate": 2.2637474331368766e-05,
"loss": 0.0658,
"step": 173
},
{
"epoch": 1.1225806451612903,
"grad_norm": 0.5032625794410706,
"learning_rate": 2.2596140367320813e-05,
"loss": 0.0564,
"step": 174
},
{
"epoch": 1.129032258064516,
"grad_norm": 0.5199857950210571,
"learning_rate": 2.2554486377039282e-05,
"loss": 0.0587,
"step": 175
},
{
"epoch": 1.135483870967742,
"grad_norm": 0.6159687042236328,
"learning_rate": 2.251251368086731e-05,
"loss": 0.0585,
"step": 176
},
{
"epoch": 1.1419354838709677,
"grad_norm": 0.5216447114944458,
"learning_rate": 2.2470223609250328e-05,
"loss": 0.0501,
"step": 177
},
{
"epoch": 1.1483870967741936,
"grad_norm": 0.49131321907043457,
"learning_rate": 2.24276175026939e-05,
"loss": 0.053,
"step": 178
},
{
"epoch": 1.1548387096774193,
"grad_norm": 0.8894760608673096,
"learning_rate": 2.238469671172123e-05,
"loss": 0.0854,
"step": 179
},
{
"epoch": 1.1612903225806452,
"grad_norm": 0.6628456711769104,
"learning_rate": 2.2341462596830354e-05,
"loss": 0.064,
"step": 180
},
{
"epoch": 1.167741935483871,
"grad_norm": 0.4577731788158417,
"learning_rate": 2.229791652845099e-05,
"loss": 0.0543,
"step": 181
},
{
"epoch": 1.1741935483870969,
"grad_norm": 0.49301421642303467,
"learning_rate": 2.225405988690115e-05,
"loss": 0.0598,
"step": 182
},
{
"epoch": 1.1806451612903226,
"grad_norm": 0.523009717464447,
"learning_rate": 2.220989406234333e-05,
"loss": 0.0752,
"step": 183
},
{
"epoch": 1.1870967741935483,
"grad_norm": 0.7591210007667542,
"learning_rate": 2.2165420454740494e-05,
"loss": 0.0643,
"step": 184
},
{
"epoch": 1.1935483870967742,
"grad_norm": 0.532319962978363,
"learning_rate": 2.2120640473811656e-05,
"loss": 0.0464,
"step": 185
},
{
"epoch": 1.2,
"grad_norm": 0.4334715008735657,
"learning_rate": 2.2075555538987227e-05,
"loss": 0.0669,
"step": 186
},
{
"epoch": 1.206451612903226,
"grad_norm": 0.4052492380142212,
"learning_rate": 2.2030167079364007e-05,
"loss": 0.0506,
"step": 187
},
{
"epoch": 1.2129032258064516,
"grad_norm": 0.763782799243927,
"learning_rate": 2.1984476533659888e-05,
"loss": 0.0477,
"step": 188
},
{
"epoch": 1.2193548387096773,
"grad_norm": 0.46810972690582275,
"learning_rate": 2.1938485350168248e-05,
"loss": 0.055,
"step": 189
},
{
"epoch": 1.2258064516129032,
"grad_norm": 0.4722144901752472,
"learning_rate": 2.1892194986712045e-05,
"loss": 0.053,
"step": 190
},
{
"epoch": 1.232258064516129,
"grad_norm": 0.5537333488464355,
"learning_rate": 2.1845606910597616e-05,
"loss": 0.0686,
"step": 191
},
{
"epoch": 1.238709677419355,
"grad_norm": 0.5123704671859741,
"learning_rate": 2.179872259856814e-05,
"loss": 0.0627,
"step": 192
},
{
"epoch": 1.2451612903225806,
"grad_norm": 0.5691571831703186,
"learning_rate": 2.175154353675686e-05,
"loss": 0.0601,
"step": 193
},
{
"epoch": 1.2516129032258063,
"grad_norm": 0.4747653007507324,
"learning_rate": 2.1704071220639965e-05,
"loss": 0.0551,
"step": 194
},
{
"epoch": 1.2580645161290323,
"grad_norm": 0.5692989826202393,
"learning_rate": 2.1656307154989174e-05,
"loss": 0.0482,
"step": 195
},
{
"epoch": 1.2645161290322582,
"grad_norm": 0.7472412586212158,
"learning_rate": 2.1608252853824047e-05,
"loss": 0.0609,
"step": 196
},
{
"epoch": 1.270967741935484,
"grad_norm": 0.568708062171936,
"learning_rate": 2.1559909840364e-05,
"loss": 0.0572,
"step": 197
},
{
"epoch": 1.2774193548387096,
"grad_norm": 0.6601235866546631,
"learning_rate": 2.1511279646980016e-05,
"loss": 0.0777,
"step": 198
},
{
"epoch": 1.2838709677419355,
"grad_norm": 0.429850697517395,
"learning_rate": 2.1462363815146065e-05,
"loss": 0.0454,
"step": 199
},
{
"epoch": 1.2903225806451613,
"grad_norm": 0.7702894806861877,
"learning_rate": 2.1413163895390254e-05,
"loss": 0.0655,
"step": 200
},
{
"epoch": 1.2967741935483872,
"grad_norm": 0.4497153162956238,
"learning_rate": 2.1363681447245686e-05,
"loss": 0.0512,
"step": 201
},
{
"epoch": 1.303225806451613,
"grad_norm": 0.5631290674209595,
"learning_rate": 2.1313918039200995e-05,
"loss": 0.0645,
"step": 202
},
{
"epoch": 1.3096774193548386,
"grad_norm": 0.7414901852607727,
"learning_rate": 2.1263875248650662e-05,
"loss": 0.0561,
"step": 203
},
{
"epoch": 1.3161290322580645,
"grad_norm": 0.5053102970123291,
"learning_rate": 2.121355466184499e-05,
"loss": 0.0608,
"step": 204
},
{
"epoch": 1.3225806451612903,
"grad_norm": 10.00545597076416,
"learning_rate": 2.116295787383985e-05,
"loss": 0.0826,
"step": 205
},
{
"epoch": 1.3290322580645162,
"grad_norm": 0.6418637037277222,
"learning_rate": 2.1112086488446085e-05,
"loss": 0.0743,
"step": 206
},
{
"epoch": 1.335483870967742,
"grad_norm": 0.4627211391925812,
"learning_rate": 2.1060942118178706e-05,
"loss": 0.0476,
"step": 207
},
{
"epoch": 1.3419354838709676,
"grad_norm": 0.5375849604606628,
"learning_rate": 2.1009526384205767e-05,
"loss": 0.048,
"step": 208
},
{
"epoch": 1.3483870967741935,
"grad_norm": 0.606073796749115,
"learning_rate": 2.095784091629697e-05,
"loss": 0.0704,
"step": 209
},
{
"epoch": 1.3548387096774195,
"grad_norm": 0.44339242577552795,
"learning_rate": 2.0905887352772004e-05,
"loss": 0.0516,
"step": 210
},
{
"epoch": 1.3612903225806452,
"grad_norm": 0.6248610019683838,
"learning_rate": 2.085366734044864e-05,
"loss": 0.066,
"step": 211
},
{
"epoch": 1.367741935483871,
"grad_norm": 0.5914815664291382,
"learning_rate": 2.080118253459049e-05,
"loss": 0.0611,
"step": 212
},
{
"epoch": 1.3741935483870968,
"grad_norm": 0.45894381403923035,
"learning_rate": 2.0748434598854573e-05,
"loss": 0.0501,
"step": 213
},
{
"epoch": 1.3806451612903226,
"grad_norm": 0.49100032448768616,
"learning_rate": 2.0695425205238557e-05,
"loss": 0.0552,
"step": 214
},
{
"epoch": 1.3870967741935485,
"grad_norm": 0.528611958026886,
"learning_rate": 2.0642156034027783e-05,
"loss": 0.0639,
"step": 215
},
{
"epoch": 1.3935483870967742,
"grad_norm": 0.4487656056880951,
"learning_rate": 2.0588628773741973e-05,
"loss": 0.0435,
"step": 216
},
{
"epoch": 1.4,
"grad_norm": 0.547444224357605,
"learning_rate": 2.0534845121081742e-05,
"loss": 0.0547,
"step": 217
},
{
"epoch": 1.4064516129032258,
"grad_norm": 0.5207445621490479,
"learning_rate": 2.0480806780874794e-05,
"loss": 0.0574,
"step": 218
},
{
"epoch": 1.4129032258064516,
"grad_norm": 0.5784499049186707,
"learning_rate": 2.0426515466021887e-05,
"loss": 0.0608,
"step": 219
},
{
"epoch": 1.4193548387096775,
"grad_norm": 0.7198527455329895,
"learning_rate": 2.0371972897442532e-05,
"loss": 0.0639,
"step": 220
},
{
"epoch": 1.4258064516129032,
"grad_norm": 0.4550151228904724,
"learning_rate": 2.031718080402046e-05,
"loss": 0.0547,
"step": 221
},
{
"epoch": 1.432258064516129,
"grad_norm": 0.48588842153549194,
"learning_rate": 2.026214092254881e-05,
"loss": 0.0603,
"step": 222
},
{
"epoch": 1.4387096774193548,
"grad_norm": 0.5426737666130066,
"learning_rate": 2.0206854997675072e-05,
"loss": 0.0616,
"step": 223
},
{
"epoch": 1.4451612903225808,
"grad_norm": 0.5034387707710266,
"learning_rate": 2.0151324781845787e-05,
"loss": 0.0644,
"step": 224
},
{
"epoch": 1.4516129032258065,
"grad_norm": 0.5200063586235046,
"learning_rate": 2.0095552035251007e-05,
"loss": 0.0596,
"step": 225
},
{
"epoch": 1.4580645161290322,
"grad_norm": 0.4462428390979767,
"learning_rate": 2.0039538525768496e-05,
"loss": 0.0523,
"step": 226
},
{
"epoch": 1.4645161290322581,
"grad_norm": 0.5513397455215454,
"learning_rate": 1.9983286028907687e-05,
"loss": 0.0528,
"step": 227
},
{
"epoch": 1.4709677419354839,
"grad_norm": 0.44743800163269043,
"learning_rate": 1.992679632775341e-05,
"loss": 0.0649,
"step": 228
},
{
"epoch": 1.4774193548387098,
"grad_norm": 0.4505648910999298,
"learning_rate": 1.9870071212909357e-05,
"loss": 0.0453,
"step": 229
},
{
"epoch": 1.4838709677419355,
"grad_norm": 0.48718520998954773,
"learning_rate": 1.9813112482441345e-05,
"loss": 0.0664,
"step": 230
},
{
"epoch": 1.4903225806451612,
"grad_norm": 0.4392196834087372,
"learning_rate": 1.9755921941820314e-05,
"loss": 0.0504,
"step": 231
},
{
"epoch": 1.4967741935483871,
"grad_norm": 0.5312716364860535,
"learning_rate": 1.9698501403865083e-05,
"loss": 0.0699,
"step": 232
},
{
"epoch": 1.5032258064516129,
"grad_norm": 0.5387852787971497,
"learning_rate": 1.9640852688684904e-05,
"loss": 0.071,
"step": 233
},
{
"epoch": 1.5096774193548388,
"grad_norm": 0.4734801650047302,
"learning_rate": 1.9582977623621766e-05,
"loss": 0.0561,
"step": 234
},
{
"epoch": 1.5161290322580645,
"grad_norm": 0.4738084375858307,
"learning_rate": 1.9524878043192463e-05,
"loss": 0.0545,
"step": 235
},
{
"epoch": 1.5225806451612902,
"grad_norm": 0.5166822671890259,
"learning_rate": 1.9466555789030456e-05,
"loss": 0.0708,
"step": 236
},
{
"epoch": 1.5290322580645161,
"grad_norm": 0.5719185471534729,
"learning_rate": 1.9408012709827485e-05,
"loss": 0.073,
"step": 237
},
{
"epoch": 1.535483870967742,
"grad_norm": 0.5363075733184814,
"learning_rate": 1.934925066127498e-05,
"loss": 0.0581,
"step": 238
},
{
"epoch": 1.5419354838709678,
"grad_norm": 0.551699697971344,
"learning_rate": 1.9290271506005236e-05,
"loss": 0.0598,
"step": 239
},
{
"epoch": 1.5483870967741935,
"grad_norm": 0.5568850636482239,
"learning_rate": 1.9231077113532363e-05,
"loss": 0.0471,
"step": 240
},
{
"epoch": 1.5548387096774192,
"grad_norm": 0.5048776268959045,
"learning_rate": 1.917166936019304e-05,
"loss": 0.0613,
"step": 241
},
{
"epoch": 1.5612903225806452,
"grad_norm": 0.516986608505249,
"learning_rate": 1.911205012908703e-05,
"loss": 0.0678,
"step": 242
},
{
"epoch": 1.567741935483871,
"grad_norm": 0.48142287135124207,
"learning_rate": 1.90522213100175e-05,
"loss": 0.0557,
"step": 243
},
{
"epoch": 1.5741935483870968,
"grad_norm": 0.4997798800468445,
"learning_rate": 1.8992184799431095e-05,
"loss": 0.042,
"step": 244
},
{
"epoch": 1.5806451612903225,
"grad_norm": 0.5074776411056519,
"learning_rate": 1.893194250035786e-05,
"loss": 0.073,
"step": 245
},
{
"epoch": 1.5870967741935482,
"grad_norm": 0.5136696696281433,
"learning_rate": 1.8871496322350883e-05,
"loss": 0.0547,
"step": 246
},
{
"epoch": 1.5935483870967742,
"grad_norm": 0.6183574795722961,
"learning_rate": 1.881084818142579e-05,
"loss": 0.0708,
"step": 247
},
{
"epoch": 1.6,
"grad_norm": 0.5576770901679993,
"learning_rate": 1.8750000000000002e-05,
"loss": 0.0629,
"step": 248
},
{
"epoch": 1.6064516129032258,
"grad_norm": 0.4211249351501465,
"learning_rate": 1.868895370683179e-05,
"loss": 0.0544,
"step": 249
},
{
"epoch": 1.6129032258064515,
"grad_norm": 0.43533676862716675,
"learning_rate": 1.862771123695917e-05,
"loss": 0.0568,
"step": 250
},
{
"epoch": 1.6193548387096774,
"grad_norm": 0.48171842098236084,
"learning_rate": 1.8566274531638516e-05,
"loss": 0.0524,
"step": 251
},
{
"epoch": 1.6258064516129034,
"grad_norm": 0.459471195936203,
"learning_rate": 1.850464553828307e-05,
"loss": 0.0471,
"step": 252
},
{
"epoch": 1.632258064516129,
"grad_norm": 0.5311537384986877,
"learning_rate": 1.844282621040119e-05,
"loss": 0.0766,
"step": 253
},
{
"epoch": 1.6387096774193548,
"grad_norm": 0.5022658109664917,
"learning_rate": 1.838081850753445e-05,
"loss": 0.0579,
"step": 254
},
{
"epoch": 1.6451612903225805,
"grad_norm": 0.5516560077667236,
"learning_rate": 1.8318624395195483e-05,
"loss": 0.0616,
"step": 255
},
{
"epoch": 1.6516129032258065,
"grad_norm": 0.4552045166492462,
"learning_rate": 1.825624584480573e-05,
"loss": 0.0512,
"step": 256
},
{
"epoch": 1.6580645161290324,
"grad_norm": 0.5871717929840088,
"learning_rate": 1.8193684833632925e-05,
"loss": 0.0641,
"step": 257
},
{
"epoch": 1.664516129032258,
"grad_norm": 0.46038615703582764,
"learning_rate": 1.8130943344728414e-05,
"loss": 0.0459,
"step": 258
},
{
"epoch": 1.6709677419354838,
"grad_norm": 0.5282014608383179,
"learning_rate": 1.8068023366864305e-05,
"loss": 0.0569,
"step": 259
},
{
"epoch": 1.6774193548387095,
"grad_norm": 0.3797317147254944,
"learning_rate": 1.800492689447043e-05,
"loss": 0.0459,
"step": 260
},
{
"epoch": 1.6838709677419355,
"grad_norm": 0.5863360166549683,
"learning_rate": 1.7941655927571125e-05,
"loss": 0.0695,
"step": 261
},
{
"epoch": 1.6903225806451614,
"grad_norm": 0.5562090873718262,
"learning_rate": 1.7878212471721824e-05,
"loss": 0.0554,
"step": 262
},
{
"epoch": 1.696774193548387,
"grad_norm": 0.5164937973022461,
"learning_rate": 1.781459853794551e-05,
"loss": 0.0542,
"step": 263
},
{
"epoch": 1.7032258064516128,
"grad_norm": 0.5710752010345459,
"learning_rate": 1.7750816142668937e-05,
"loss": 0.0641,
"step": 264
},
{
"epoch": 1.7096774193548387,
"grad_norm": 0.43633976578712463,
"learning_rate": 1.7686867307658743e-05,
"loss": 0.0498,
"step": 265
},
{
"epoch": 1.7161290322580647,
"grad_norm": 0.5214335322380066,
"learning_rate": 1.7622754059957343e-05,
"loss": 0.054,
"step": 266
},
{
"epoch": 1.7225806451612904,
"grad_norm": 0.6009476780891418,
"learning_rate": 1.7558478431818702e-05,
"loss": 0.0538,
"step": 267
},
{
"epoch": 1.729032258064516,
"grad_norm": 0.5809276700019836,
"learning_rate": 1.749404246064388e-05,
"loss": 0.0751,
"step": 268
},
{
"epoch": 1.7354838709677418,
"grad_norm": 0.5733875632286072,
"learning_rate": 1.7429448188916483e-05,
"loss": 0.0685,
"step": 269
},
{
"epoch": 1.7419354838709677,
"grad_norm": 0.3861143887042999,
"learning_rate": 1.7364697664137912e-05,
"loss": 0.044,
"step": 270
},
{
"epoch": 1.7483870967741937,
"grad_norm": 0.8718386292457581,
"learning_rate": 1.7299792938762443e-05,
"loss": 0.0807,
"step": 271
},
{
"epoch": 1.7548387096774194,
"grad_norm": 0.6809967160224915,
"learning_rate": 1.72347360701322e-05,
"loss": 0.0698,
"step": 272
},
{
"epoch": 1.761290322580645,
"grad_norm": 0.45045140385627747,
"learning_rate": 1.7169529120411922e-05,
"loss": 0.0552,
"step": 273
},
{
"epoch": 1.7677419354838708,
"grad_norm": 0.46889108419418335,
"learning_rate": 1.710417415652359e-05,
"loss": 0.0576,
"step": 274
},
{
"epoch": 1.7741935483870968,
"grad_norm": 0.5053566098213196,
"learning_rate": 1.7038673250080934e-05,
"loss": 0.0535,
"step": 275
},
{
"epoch": 1.7806451612903227,
"grad_norm": 0.4330599904060364,
"learning_rate": 1.6973028477323742e-05,
"loss": 0.0518,
"step": 276
},
{
"epoch": 1.7870967741935484,
"grad_norm": 0.4866834580898285,
"learning_rate": 1.6907241919052068e-05,
"loss": 0.051,
"step": 277
},
{
"epoch": 1.793548387096774,
"grad_norm": 0.6048698425292969,
"learning_rate": 1.6841315660560252e-05,
"loss": 0.0683,
"step": 278
},
{
"epoch": 1.8,
"grad_norm": 0.39490604400634766,
"learning_rate": 1.677525179157086e-05,
"loss": 0.0515,
"step": 279
},
{
"epoch": 1.8064516129032258,
"grad_norm": 0.4544709324836731,
"learning_rate": 1.6709052406168393e-05,
"loss": 0.0624,
"step": 280
},
{
"epoch": 1.8129032258064517,
"grad_norm": 0.5158767700195312,
"learning_rate": 1.664271960273295e-05,
"loss": 0.0575,
"step": 281
},
{
"epoch": 1.8193548387096774,
"grad_norm": 0.5172263979911804,
"learning_rate": 1.6576255483873686e-05,
"loss": 0.0578,
"step": 282
},
{
"epoch": 1.8258064516129031,
"grad_norm": 0.4233238995075226,
"learning_rate": 1.6509662156362196e-05,
"loss": 0.0547,
"step": 283
},
{
"epoch": 1.832258064516129,
"grad_norm": 0.45361143350601196,
"learning_rate": 1.6442941731065697e-05,
"loss": 0.0512,
"step": 284
},
{
"epoch": 1.838709677419355,
"grad_norm": 0.5802233219146729,
"learning_rate": 1.637609632288014e-05,
"loss": 0.0596,
"step": 285
},
{
"epoch": 1.8451612903225807,
"grad_norm": 0.5369323492050171,
"learning_rate": 1.630912805066317e-05,
"loss": 0.0646,
"step": 286
},
{
"epoch": 1.8516129032258064,
"grad_norm": 0.45122525095939636,
"learning_rate": 1.6242039037166977e-05,
"loss": 0.0517,
"step": 287
},
{
"epoch": 1.8580645161290321,
"grad_norm": 0.39205196499824524,
"learning_rate": 1.6174831408970964e-05,
"loss": 0.0491,
"step": 288
},
{
"epoch": 1.864516129032258,
"grad_norm": 0.4472959637641907,
"learning_rate": 1.6107507296414383e-05,
"loss": 0.049,
"step": 289
},
{
"epoch": 1.870967741935484,
"grad_norm": 0.41624531149864197,
"learning_rate": 1.6040068833528797e-05,
"loss": 0.0483,
"step": 290
},
{
"epoch": 1.8774193548387097,
"grad_norm": 0.43875551223754883,
"learning_rate": 1.597251815797044e-05,
"loss": 0.0487,
"step": 291
},
{
"epoch": 1.8838709677419354,
"grad_norm": 0.4978736937046051,
"learning_rate": 1.5904857410952417e-05,
"loss": 0.0573,
"step": 292
},
{
"epoch": 1.8903225806451613,
"grad_norm": 0.5798497796058655,
"learning_rate": 1.5837088737176896e-05,
"loss": 0.0683,
"step": 293
},
{
"epoch": 1.896774193548387,
"grad_norm": 0.7377052903175354,
"learning_rate": 1.5769214284767086e-05,
"loss": 0.0583,
"step": 294
},
{
"epoch": 1.903225806451613,
"grad_norm": 0.4153827428817749,
"learning_rate": 1.570123620519915e-05,
"loss": 0.0543,
"step": 295
},
{
"epoch": 1.9096774193548387,
"grad_norm": 0.4852810800075531,
"learning_rate": 1.563315665323401e-05,
"loss": 0.0636,
"step": 296
},
{
"epoch": 1.9161290322580644,
"grad_norm": 0.5545767545700073,
"learning_rate": 1.5564977786849055e-05,
"loss": 0.062,
"step": 297
},
{
"epoch": 1.9225806451612903,
"grad_norm": 0.4363822937011719,
"learning_rate": 1.549670176716973e-05,
"loss": 0.0516,
"step": 298
},
{
"epoch": 1.9290322580645163,
"grad_norm": 0.5309383273124695,
"learning_rate": 1.5428330758401027e-05,
"loss": 0.0647,
"step": 299
},
{
"epoch": 1.935483870967742,
"grad_norm": 0.6617056131362915,
"learning_rate": 1.53598669277589e-05,
"loss": 0.0641,
"step": 300
},
{
"epoch": 1.9419354838709677,
"grad_norm": 0.49968254566192627,
"learning_rate": 1.529131244540155e-05,
"loss": 0.0585,
"step": 301
},
{
"epoch": 1.9483870967741934,
"grad_norm": 0.40158751606941223,
"learning_rate": 1.5222669484360644e-05,
"loss": 0.0537,
"step": 302
},
{
"epoch": 1.9548387096774194,
"grad_norm": 0.4537198543548584,
"learning_rate": 1.5153940220472451e-05,
"loss": 0.0511,
"step": 303
},
{
"epoch": 1.9612903225806453,
"grad_norm": 0.47163766622543335,
"learning_rate": 1.5085126832308843e-05,
"loss": 0.0532,
"step": 304
},
{
"epoch": 1.967741935483871,
"grad_norm": 0.4738634526729584,
"learning_rate": 1.5016231501108253e-05,
"loss": 0.0615,
"step": 305
},
{
"epoch": 1.9741935483870967,
"grad_norm": 0.38560569286346436,
"learning_rate": 1.494725641070654e-05,
"loss": 0.0522,
"step": 306
},
{
"epoch": 1.9806451612903224,
"grad_norm": 0.5569445490837097,
"learning_rate": 1.4878203747467764e-05,
"loss": 0.0731,
"step": 307
},
{
"epoch": 1.9870967741935484,
"grad_norm": 0.38958773016929626,
"learning_rate": 1.480907570021487e-05,
"loss": 0.0461,
"step": 308
},
{
"epoch": 1.9935483870967743,
"grad_norm": 0.4473820924758911,
"learning_rate": 1.4739874460160316e-05,
"loss": 0.0555,
"step": 309
},
{
"epoch": 2.0,
"grad_norm": 0.39891934394836426,
"learning_rate": 1.4670602220836633e-05,
"loss": 0.051,
"step": 310
},
{
"epoch": 2.0064516129032257,
"grad_norm": 0.474127858877182,
"learning_rate": 1.4601261178026854e-05,
"loss": 0.0401,
"step": 311
},
{
"epoch": 2.0129032258064514,
"grad_norm": 0.3391839563846588,
"learning_rate": 1.4531853529694956e-05,
"loss": 0.0333,
"step": 312
},
{
"epoch": 2.0193548387096776,
"grad_norm": 0.3230273723602295,
"learning_rate": 1.446238147591616e-05,
"loss": 0.0282,
"step": 313
},
{
"epoch": 2.0258064516129033,
"grad_norm": 0.3246399462223053,
"learning_rate": 1.439284721880721e-05,
"loss": 0.0345,
"step": 314
},
{
"epoch": 2.032258064516129,
"grad_norm": 0.41817039251327515,
"learning_rate": 1.4323252962456554e-05,
"loss": 0.0288,
"step": 315
},
{
"epoch": 2.0387096774193547,
"grad_norm": 0.48674166202545166,
"learning_rate": 1.4253600912854497e-05,
"loss": 0.0354,
"step": 316
},
{
"epoch": 2.0451612903225804,
"grad_norm": 0.42214757204055786,
"learning_rate": 1.4183893277823265e-05,
"loss": 0.0388,
"step": 317
},
{
"epoch": 2.0516129032258066,
"grad_norm": 0.5475701093673706,
"learning_rate": 1.411413226694702e-05,
"loss": 0.0294,
"step": 318
},
{
"epoch": 2.0580645161290323,
"grad_norm": 0.5432962775230408,
"learning_rate": 1.4044320091501834e-05,
"loss": 0.0372,
"step": 319
},
{
"epoch": 2.064516129032258,
"grad_norm": 0.49539855122566223,
"learning_rate": 1.3974458964385579e-05,
"loss": 0.0425,
"step": 320
},
{
"epoch": 2.0709677419354837,
"grad_norm": 0.340425044298172,
"learning_rate": 1.3904551100047791e-05,
"loss": 0.026,
"step": 321
},
{
"epoch": 2.07741935483871,
"grad_norm": 0.4815217852592468,
"learning_rate": 1.3834598714419486e-05,
"loss": 0.0352,
"step": 322
},
{
"epoch": 2.0838709677419356,
"grad_norm": 0.4457317888736725,
"learning_rate": 1.3764604024842903e-05,
"loss": 0.028,
"step": 323
},
{
"epoch": 2.0903225806451613,
"grad_norm": 0.45776546001434326,
"learning_rate": 1.369456925000123e-05,
"loss": 0.0287,
"step": 324
},
{
"epoch": 2.096774193548387,
"grad_norm": 0.3825792968273163,
"learning_rate": 1.362449660984826e-05,
"loss": 0.0257,
"step": 325
},
{
"epoch": 2.1032258064516127,
"grad_norm": 0.44209763407707214,
"learning_rate": 1.3554388325538059e-05,
"loss": 0.0274,
"step": 326
},
{
"epoch": 2.109677419354839,
"grad_norm": 0.37732046842575073,
"learning_rate": 1.3484246619354524e-05,
"loss": 0.0263,
"step": 327
},
{
"epoch": 2.1161290322580646,
"grad_norm": 0.4975365698337555,
"learning_rate": 1.3414073714640951e-05,
"loss": 0.0294,
"step": 328
},
{
"epoch": 2.1225806451612903,
"grad_norm": 0.37548768520355225,
"learning_rate": 1.3343871835729565e-05,
"loss": 0.0261,
"step": 329
},
{
"epoch": 2.129032258064516,
"grad_norm": 0.722154438495636,
"learning_rate": 1.3273643207871025e-05,
"loss": 0.0296,
"step": 330
},
{
"epoch": 2.135483870967742,
"grad_norm": 0.513611912727356,
"learning_rate": 1.3203390057163855e-05,
"loss": 0.0326,
"step": 331
},
{
"epoch": 2.141935483870968,
"grad_norm": 0.43579375743865967,
"learning_rate": 1.3133114610483909e-05,
"loss": 0.035,
"step": 332
},
{
"epoch": 2.1483870967741936,
"grad_norm": 0.4927336275577545,
"learning_rate": 1.3062819095413786e-05,
"loss": 0.0358,
"step": 333
},
{
"epoch": 2.1548387096774193,
"grad_norm": 0.43542489409446716,
"learning_rate": 1.2992505740172196e-05,
"loss": 0.035,
"step": 334
},
{
"epoch": 2.161290322580645,
"grad_norm": 0.34009236097335815,
"learning_rate": 1.2922176773543355e-05,
"loss": 0.0264,
"step": 335
},
{
"epoch": 2.167741935483871,
"grad_norm": 0.4710192084312439,
"learning_rate": 1.2851834424806314e-05,
"loss": 0.0403,
"step": 336
},
{
"epoch": 2.174193548387097,
"grad_norm": 0.8653304576873779,
"learning_rate": 1.2781480923664326e-05,
"loss": 0.0839,
"step": 337
},
{
"epoch": 2.1806451612903226,
"grad_norm": 0.7528795599937439,
"learning_rate": 1.2711118500174138e-05,
"loss": 0.0488,
"step": 338
},
{
"epoch": 2.1870967741935483,
"grad_norm": 0.5551451444625854,
"learning_rate": 1.2640749384675324e-05,
"loss": 0.0223,
"step": 339
},
{
"epoch": 2.193548387096774,
"grad_norm": 0.42200708389282227,
"learning_rate": 1.2570375807719576e-05,
"loss": 0.0305,
"step": 340
},
{
"epoch": 2.2,
"grad_norm": 0.5258976817131042,
"learning_rate": 1.25e-05,
"loss": 0.0455,
"step": 341
},
{
"epoch": 2.206451612903226,
"grad_norm": 0.495807945728302,
"learning_rate": 1.242962419228043e-05,
"loss": 0.0323,
"step": 342
},
{
"epoch": 2.2129032258064516,
"grad_norm": 0.5464356541633606,
"learning_rate": 1.2359250615324678e-05,
"loss": 0.0325,
"step": 343
},
{
"epoch": 2.2193548387096773,
"grad_norm": 0.5555934906005859,
"learning_rate": 1.2288881499825863e-05,
"loss": 0.0504,
"step": 344
},
{
"epoch": 2.225806451612903,
"grad_norm": 0.41927701234817505,
"learning_rate": 1.2218519076335677e-05,
"loss": 0.0288,
"step": 345
},
{
"epoch": 2.232258064516129,
"grad_norm": 0.5449569821357727,
"learning_rate": 1.2148165575193685e-05,
"loss": 0.0328,
"step": 346
},
{
"epoch": 2.238709677419355,
"grad_norm": 0.4198172688484192,
"learning_rate": 1.2077823226456648e-05,
"loss": 0.0284,
"step": 347
},
{
"epoch": 2.2451612903225806,
"grad_norm": 0.5396814346313477,
"learning_rate": 1.2007494259827809e-05,
"loss": 0.0379,
"step": 348
},
{
"epoch": 2.2516129032258063,
"grad_norm": 0.4842919409275055,
"learning_rate": 1.1937180904586215e-05,
"loss": 0.0316,
"step": 349
},
{
"epoch": 2.258064516129032,
"grad_norm": 0.5152572989463806,
"learning_rate": 1.1866885389516092e-05,
"loss": 0.0321,
"step": 350
},
{
"epoch": 2.264516129032258,
"grad_norm": 0.556614875793457,
"learning_rate": 1.179660994283615e-05,
"loss": 0.0372,
"step": 351
},
{
"epoch": 2.270967741935484,
"grad_norm": 0.5159235000610352,
"learning_rate": 1.1726356792128978e-05,
"loss": 0.0328,
"step": 352
},
{
"epoch": 2.2774193548387096,
"grad_norm": 0.5564429759979248,
"learning_rate": 1.1656128164270436e-05,
"loss": 0.0304,
"step": 353
},
{
"epoch": 2.2838709677419353,
"grad_norm": 0.6227903366088867,
"learning_rate": 1.1585926285359049e-05,
"loss": 0.0321,
"step": 354
},
{
"epoch": 2.2903225806451615,
"grad_norm": 0.5218878388404846,
"learning_rate": 1.1515753380645479e-05,
"loss": 0.0358,
"step": 355
},
{
"epoch": 2.296774193548387,
"grad_norm": 0.49731266498565674,
"learning_rate": 1.1445611674461942e-05,
"loss": 0.0331,
"step": 356
},
{
"epoch": 2.303225806451613,
"grad_norm": 0.5095941424369812,
"learning_rate": 1.1375503390151737e-05,
"loss": 0.0315,
"step": 357
},
{
"epoch": 2.3096774193548386,
"grad_norm": 0.4576358199119568,
"learning_rate": 1.1305430749998775e-05,
"loss": 0.0304,
"step": 358
},
{
"epoch": 2.3161290322580643,
"grad_norm": 0.5103798508644104,
"learning_rate": 1.12353959751571e-05,
"loss": 0.0281,
"step": 359
},
{
"epoch": 2.3225806451612905,
"grad_norm": 0.5072308778762817,
"learning_rate": 1.1165401285580515e-05,
"loss": 0.0267,
"step": 360
},
{
"epoch": 2.329032258064516,
"grad_norm": 0.45558691024780273,
"learning_rate": 1.1095448899952212e-05,
"loss": 0.0302,
"step": 361
},
{
"epoch": 2.335483870967742,
"grad_norm": 0.4773171842098236,
"learning_rate": 1.1025541035614427e-05,
"loss": 0.0307,
"step": 362
},
{
"epoch": 2.3419354838709676,
"grad_norm": 0.4630301892757416,
"learning_rate": 1.0955679908498171e-05,
"loss": 0.0292,
"step": 363
},
{
"epoch": 2.3483870967741938,
"grad_norm": 0.5814460515975952,
"learning_rate": 1.0885867733052985e-05,
"loss": 0.034,
"step": 364
},
{
"epoch": 2.3548387096774195,
"grad_norm": 0.3135308623313904,
"learning_rate": 1.0816106722176741e-05,
"loss": 0.0264,
"step": 365
},
{
"epoch": 2.361290322580645,
"grad_norm": 0.4219888150691986,
"learning_rate": 1.0746399087145504e-05,
"loss": 0.0304,
"step": 366
},
{
"epoch": 2.367741935483871,
"grad_norm": 0.4246158003807068,
"learning_rate": 1.0676747037543447e-05,
"loss": 0.032,
"step": 367
},
{
"epoch": 2.3741935483870966,
"grad_norm": 0.4565359950065613,
"learning_rate": 1.0607152781192796e-05,
"loss": 0.0326,
"step": 368
},
{
"epoch": 2.3806451612903228,
"grad_norm": 0.4495943486690521,
"learning_rate": 1.053761852408384e-05,
"loss": 0.0307,
"step": 369
},
{
"epoch": 2.3870967741935485,
"grad_norm": 0.47505924105644226,
"learning_rate": 1.0468146470305047e-05,
"loss": 0.0366,
"step": 370
},
{
"epoch": 2.393548387096774,
"grad_norm": 0.41802337765693665,
"learning_rate": 1.039873882197315e-05,
"loss": 0.0242,
"step": 371
},
{
"epoch": 2.4,
"grad_norm": 0.4308302104473114,
"learning_rate": 1.0329397779163372e-05,
"loss": 0.0303,
"step": 372
},
{
"epoch": 2.4064516129032256,
"grad_norm": 0.3776704967021942,
"learning_rate": 1.0260125539839686e-05,
"loss": 0.0224,
"step": 373
},
{
"epoch": 2.412903225806452,
"grad_norm": 0.3952430188655853,
"learning_rate": 1.0190924299785138e-05,
"loss": 0.0236,
"step": 374
},
{
"epoch": 2.4193548387096775,
"grad_norm": 0.5212628841400146,
"learning_rate": 1.0121796252532237e-05,
"loss": 0.0352,
"step": 375
},
{
"epoch": 2.425806451612903,
"grad_norm": 0.5264010429382324,
"learning_rate": 1.0052743589293463e-05,
"loss": 0.0366,
"step": 376
},
{
"epoch": 2.432258064516129,
"grad_norm": 0.42148974537849426,
"learning_rate": 9.983768498891747e-06,
"loss": 0.0281,
"step": 377
},
{
"epoch": 2.4387096774193546,
"grad_norm": 0.4387865960597992,
"learning_rate": 9.91487316769116e-06,
"loss": 0.0321,
"step": 378
},
{
"epoch": 2.445161290322581,
"grad_norm": 0.4530801475048065,
"learning_rate": 9.846059779527552e-06,
"loss": 0.03,
"step": 379
},
{
"epoch": 2.4516129032258065,
"grad_norm": 0.44786474108695984,
"learning_rate": 9.777330515639356e-06,
"loss": 0.0312,
"step": 380
},
{
"epoch": 2.458064516129032,
"grad_norm": 0.42808324098587036,
"learning_rate": 9.708687554598454e-06,
"loss": 0.0321,
"step": 381
},
{
"epoch": 2.464516129032258,
"grad_norm": 0.4658293128013611,
"learning_rate": 9.640133072241105e-06,
"loss": 0.0335,
"step": 382
},
{
"epoch": 2.4709677419354836,
"grad_norm": 0.45854416489601135,
"learning_rate": 9.571669241598974e-06,
"loss": 0.0306,
"step": 383
},
{
"epoch": 2.47741935483871,
"grad_norm": 0.5602400302886963,
"learning_rate": 9.503298232830274e-06,
"loss": 0.0425,
"step": 384
},
{
"epoch": 2.4838709677419355,
"grad_norm": 0.4135296940803528,
"learning_rate": 9.43502221315095e-06,
"loss": 0.0317,
"step": 385
},
{
"epoch": 2.490322580645161,
"grad_norm": 0.6756112575531006,
"learning_rate": 9.366843346765992e-06,
"loss": 0.0492,
"step": 386
},
{
"epoch": 2.496774193548387,
"grad_norm": 0.6048617362976074,
"learning_rate": 9.298763794800856e-06,
"loss": 0.0326,
"step": 387
},
{
"epoch": 2.5032258064516126,
"grad_norm": 0.3737858831882477,
"learning_rate": 9.230785715232917e-06,
"loss": 0.0226,
"step": 388
},
{
"epoch": 2.509677419354839,
"grad_norm": 0.49958306550979614,
"learning_rate": 9.162911262823104e-06,
"loss": 0.0293,
"step": 389
},
{
"epoch": 2.5161290322580645,
"grad_norm": 0.4132345914840698,
"learning_rate": 9.095142589047586e-06,
"loss": 0.0268,
"step": 390
},
{
"epoch": 2.52258064516129,
"grad_norm": 0.5339500308036804,
"learning_rate": 9.027481842029567e-06,
"loss": 0.0308,
"step": 391
},
{
"epoch": 2.5290322580645164,
"grad_norm": 0.5680338740348816,
"learning_rate": 8.9599311664712e-06,
"loss": 0.026,
"step": 392
},
{
"epoch": 2.535483870967742,
"grad_norm": 0.4945621192455292,
"learning_rate": 8.89249270358562e-06,
"loss": 0.0414,
"step": 393
},
{
"epoch": 2.541935483870968,
"grad_norm": 0.478188157081604,
"learning_rate": 8.825168591029042e-06,
"loss": 0.0325,
"step": 394
},
{
"epoch": 2.5483870967741935,
"grad_norm": 0.41539856791496277,
"learning_rate": 8.757960962833026e-06,
"loss": 0.0276,
"step": 395
},
{
"epoch": 2.554838709677419,
"grad_norm": 0.41548025608062744,
"learning_rate": 8.69087194933683e-06,
"loss": 0.0258,
"step": 396
},
{
"epoch": 2.5612903225806454,
"grad_norm": 0.7209835052490234,
"learning_rate": 8.623903677119866e-06,
"loss": 0.0275,
"step": 397
},
{
"epoch": 2.567741935483871,
"grad_norm": 0.45113834738731384,
"learning_rate": 8.557058268934306e-06,
"loss": 0.0276,
"step": 398
},
{
"epoch": 2.574193548387097,
"grad_norm": 0.4919924736022949,
"learning_rate": 8.490337843637807e-06,
"loss": 0.0352,
"step": 399
},
{
"epoch": 2.5806451612903225,
"grad_norm": 0.4441167414188385,
"learning_rate": 8.423744516126313e-06,
"loss": 0.0312,
"step": 400
},
{
"epoch": 2.587096774193548,
"grad_norm": 0.3870048522949219,
"learning_rate": 8.357280397267054e-06,
"loss": 0.0273,
"step": 401
},
{
"epoch": 2.5935483870967744,
"grad_norm": 0.4747593104839325,
"learning_rate": 8.29094759383161e-06,
"loss": 0.0428,
"step": 402
},
{
"epoch": 2.6,
"grad_norm": 0.3494237959384918,
"learning_rate": 8.224748208429142e-06,
"loss": 0.0249,
"step": 403
},
{
"epoch": 2.606451612903226,
"grad_norm": 0.3618505001068115,
"learning_rate": 8.158684339439748e-06,
"loss": 0.0221,
"step": 404
},
{
"epoch": 2.6129032258064515,
"grad_norm": 0.45744070410728455,
"learning_rate": 8.092758080947939e-06,
"loss": 0.0303,
"step": 405
},
{
"epoch": 2.6193548387096772,
"grad_norm": 0.3921363055706024,
"learning_rate": 8.02697152267626e-06,
"loss": 0.0267,
"step": 406
},
{
"epoch": 2.6258064516129034,
"grad_norm": 0.5149343013763428,
"learning_rate": 7.961326749919069e-06,
"loss": 0.0347,
"step": 407
},
{
"epoch": 2.632258064516129,
"grad_norm": 0.5246243476867676,
"learning_rate": 7.895825843476412e-06,
"loss": 0.0318,
"step": 408
},
{
"epoch": 2.638709677419355,
"grad_norm": 0.5338672995567322,
"learning_rate": 7.83047087958808e-06,
"loss": 0.0331,
"step": 409
},
{
"epoch": 2.6451612903225805,
"grad_norm": 0.4028920531272888,
"learning_rate": 7.7652639298678e-06,
"loss": 0.0251,
"step": 410
},
{
"epoch": 2.6516129032258062,
"grad_norm": 0.3391985297203064,
"learning_rate": 7.70020706123756e-06,
"loss": 0.0206,
"step": 411
},
{
"epoch": 2.6580645161290324,
"grad_norm": 0.4651046097278595,
"learning_rate": 7.635302335862094e-06,
"loss": 0.0242,
"step": 412
},
{
"epoch": 2.664516129032258,
"grad_norm": 0.4581477642059326,
"learning_rate": 7.570551811083521e-06,
"loss": 0.0334,
"step": 413
},
{
"epoch": 2.670967741935484,
"grad_norm": 0.629748523235321,
"learning_rate": 7.505957539356126e-06,
"loss": 0.0426,
"step": 414
},
{
"epoch": 2.6774193548387095,
"grad_norm": 0.44972798228263855,
"learning_rate": 7.441521568181299e-06,
"loss": 0.0279,
"step": 415
},
{
"epoch": 2.6838709677419352,
"grad_norm": 0.49497148394584656,
"learning_rate": 7.37724594004266e-06,
"loss": 0.0331,
"step": 416
},
{
"epoch": 2.6903225806451614,
"grad_norm": 0.4186260998249054,
"learning_rate": 7.313132692341263e-06,
"loss": 0.0294,
"step": 417
},
{
"epoch": 2.696774193548387,
"grad_norm": 0.4715961813926697,
"learning_rate": 7.249183857331064e-06,
"loss": 0.0293,
"step": 418
},
{
"epoch": 2.703225806451613,
"grad_norm": 0.48064178228378296,
"learning_rate": 7.185401462054495e-06,
"loss": 0.0312,
"step": 419
},
{
"epoch": 2.709677419354839,
"grad_norm": 0.4826470613479614,
"learning_rate": 7.121787528278177e-06,
"loss": 0.0303,
"step": 420
},
{
"epoch": 2.7161290322580647,
"grad_norm": 0.39333951473236084,
"learning_rate": 7.058344072428877e-06,
"loss": 0.0211,
"step": 421
},
{
"epoch": 2.7225806451612904,
"grad_norm": 0.3964556157588959,
"learning_rate": 6.99507310552957e-06,
"loss": 0.031,
"step": 422
},
{
"epoch": 2.729032258064516,
"grad_norm": 0.5450259447097778,
"learning_rate": 6.931976633135695e-06,
"loss": 0.0344,
"step": 423
},
{
"epoch": 2.735483870967742,
"grad_norm": 0.4331640601158142,
"learning_rate": 6.869056655271588e-06,
"loss": 0.0261,
"step": 424
},
{
"epoch": 2.741935483870968,
"grad_norm": 0.46446603536605835,
"learning_rate": 6.806315166367075e-06,
"loss": 0.0311,
"step": 425
},
{
"epoch": 2.7483870967741937,
"grad_norm": 0.5200790166854858,
"learning_rate": 6.743754155194268e-06,
"loss": 0.0292,
"step": 426
},
{
"epoch": 2.7548387096774194,
"grad_norm": 0.6154363751411438,
"learning_rate": 6.681375604804521e-06,
"loss": 0.0252,
"step": 427
},
{
"epoch": 2.761290322580645,
"grad_norm": 0.43054288625717163,
"learning_rate": 6.619181492465557e-06,
"loss": 0.0225,
"step": 428
},
{
"epoch": 2.767741935483871,
"grad_norm": 0.5042747259140015,
"learning_rate": 6.55717378959881e-06,
"loss": 0.0266,
"step": 429
},
{
"epoch": 2.774193548387097,
"grad_norm": 0.5118414759635925,
"learning_rate": 6.4953544617169376e-06,
"loss": 0.0324,
"step": 430
},
{
"epoch": 2.7806451612903227,
"grad_norm": 0.4071415364742279,
"learning_rate": 6.43372546836149e-06,
"loss": 0.0306,
"step": 431
},
{
"epoch": 2.7870967741935484,
"grad_norm": 0.43498843908309937,
"learning_rate": 6.372288763040833e-06,
"loss": 0.0267,
"step": 432
},
{
"epoch": 2.793548387096774,
"grad_norm": 0.6407294273376465,
"learning_rate": 6.3110462931682075e-06,
"loss": 0.0529,
"step": 433
},
{
"epoch": 2.8,
"grad_norm": 0.4058496356010437,
"learning_rate": 6.250000000000003e-06,
"loss": 0.0289,
"step": 434
},
{
"epoch": 2.806451612903226,
"grad_norm": 0.34818795323371887,
"learning_rate": 6.1891518185742116e-06,
"loss": 0.0236,
"step": 435
},
{
"epoch": 2.8129032258064517,
"grad_norm": 0.4517665505409241,
"learning_rate": 6.1285036776491165e-06,
"loss": 0.0341,
"step": 436
},
{
"epoch": 2.8193548387096774,
"grad_norm": 0.5423181056976318,
"learning_rate": 6.068057499642144e-06,
"loss": 0.0406,
"step": 437
},
{
"epoch": 2.825806451612903,
"grad_norm": 0.4574117362499237,
"learning_rate": 6.007815200568906e-06,
"loss": 0.0344,
"step": 438
},
{
"epoch": 2.832258064516129,
"grad_norm": 0.4028095006942749,
"learning_rate": 5.9477786899825024e-06,
"loss": 0.026,
"step": 439
},
{
"epoch": 2.838709677419355,
"grad_norm": 0.4277281165122986,
"learning_rate": 5.8879498709129735e-06,
"loss": 0.032,
"step": 440
},
{
"epoch": 2.8451612903225807,
"grad_norm": 0.4217607080936432,
"learning_rate": 5.82833063980696e-06,
"loss": 0.0275,
"step": 441
},
{
"epoch": 2.8516129032258064,
"grad_norm": 0.4865557849407196,
"learning_rate": 5.7689228864676394e-06,
"loss": 0.0344,
"step": 442
},
{
"epoch": 2.858064516129032,
"grad_norm": 0.44111689925193787,
"learning_rate": 5.70972849399477e-06,
"loss": 0.028,
"step": 443
},
{
"epoch": 2.864516129032258,
"grad_norm": 0.5612359046936035,
"learning_rate": 5.650749338725019e-06,
"loss": 0.0433,
"step": 444
},
{
"epoch": 2.870967741935484,
"grad_norm": 0.42652663588523865,
"learning_rate": 5.591987290172518e-06,
"loss": 0.0266,
"step": 445
},
{
"epoch": 2.8774193548387097,
"grad_norm": 0.43139341473579407,
"learning_rate": 5.533444210969546e-06,
"loss": 0.0228,
"step": 446
},
{
"epoch": 2.8838709677419354,
"grad_norm": 0.3348155915737152,
"learning_rate": 5.475121956807537e-06,
"loss": 0.0236,
"step": 447
},
{
"epoch": 2.8903225806451616,
"grad_norm": 0.41803082823753357,
"learning_rate": 5.417022376378239e-06,
"loss": 0.0275,
"step": 448
},
{
"epoch": 2.896774193548387,
"grad_norm": 0.4131038784980774,
"learning_rate": 5.359147311315094e-06,
"loss": 0.0265,
"step": 449
},
{
"epoch": 2.903225806451613,
"grad_norm": 0.5227479934692383,
"learning_rate": 5.30149859613492e-06,
"loss": 0.025,
"step": 450
},
{
"epoch": 2.9096774193548387,
"grad_norm": 0.43142953515052795,
"learning_rate": 5.244078058179691e-06,
"loss": 0.0249,
"step": 451
},
{
"epoch": 2.9161290322580644,
"grad_norm": 0.4158158600330353,
"learning_rate": 5.186887517558653e-06,
"loss": 0.0297,
"step": 452
},
{
"epoch": 2.9225806451612906,
"grad_norm": 0.3552153706550598,
"learning_rate": 5.129928787090646e-06,
"loss": 0.0234,
"step": 453
},
{
"epoch": 2.9290322580645163,
"grad_norm": 0.49204781651496887,
"learning_rate": 5.073203672246593e-06,
"loss": 0.0379,
"step": 454
},
{
"epoch": 2.935483870967742,
"grad_norm": 0.38140571117401123,
"learning_rate": 5.016713971092311e-06,
"loss": 0.0294,
"step": 455
},
{
"epoch": 2.9419354838709677,
"grad_norm": 0.5261517763137817,
"learning_rate": 4.960461474231505e-06,
"loss": 0.0305,
"step": 456
},
{
"epoch": 2.9483870967741934,
"grad_norm": 0.6391315460205078,
"learning_rate": 4.904447964748993e-06,
"loss": 0.038,
"step": 457
},
{
"epoch": 2.9548387096774196,
"grad_norm": 0.3812016546726227,
"learning_rate": 4.848675218154214e-06,
"loss": 0.0259,
"step": 458
},
{
"epoch": 2.9612903225806453,
"grad_norm": 0.4748527407646179,
"learning_rate": 4.793145002324933e-06,
"loss": 0.0329,
"step": 459
},
{
"epoch": 2.967741935483871,
"grad_norm": 0.4919755458831787,
"learning_rate": 4.737859077451191e-06,
"loss": 0.0253,
"step": 460
},
{
"epoch": 2.9741935483870967,
"grad_norm": 0.4986102879047394,
"learning_rate": 4.68281919597954e-06,
"loss": 0.0293,
"step": 461
},
{
"epoch": 2.9806451612903224,
"grad_norm": 0.48589223623275757,
"learning_rate": 4.6280271025574695e-06,
"loss": 0.0287,
"step": 462
},
{
"epoch": 2.9870967741935486,
"grad_norm": 0.4930824935436249,
"learning_rate": 4.573484533978119e-06,
"loss": 0.0258,
"step": 463
},
{
"epoch": 2.9935483870967743,
"grad_norm": 0.38358667492866516,
"learning_rate": 4.5191932191252075e-06,
"loss": 0.0235,
"step": 464
},
{
"epoch": 3.0,
"grad_norm": 0.5300599336624146,
"learning_rate": 4.465154878918258e-06,
"loss": 0.0309,
"step": 465
},
{
"epoch": 3.0064516129032257,
"grad_norm": 0.24884271621704102,
"learning_rate": 4.411371226258032e-06,
"loss": 0.0182,
"step": 466
},
{
"epoch": 3.0129032258064514,
"grad_norm": 0.3061859607696533,
"learning_rate": 4.3578439659722246e-06,
"loss": 0.0161,
"step": 467
},
{
"epoch": 3.0193548387096776,
"grad_norm": 0.3014248311519623,
"learning_rate": 4.304574794761447e-06,
"loss": 0.0135,
"step": 468
},
{
"epoch": 3.0258064516129033,
"grad_norm": 0.31640511751174927,
"learning_rate": 4.251565401145432e-06,
"loss": 0.0168,
"step": 469
},
{
"epoch": 3.032258064516129,
"grad_norm": 0.25447705388069153,
"learning_rate": 4.1988174654095104e-06,
"loss": 0.0117,
"step": 470
},
{
"epoch": 3.0387096774193547,
"grad_norm": 0.31153520941734314,
"learning_rate": 4.146332659551364e-06,
"loss": 0.0147,
"step": 471
},
{
"epoch": 3.0451612903225804,
"grad_norm": 0.32956749200820923,
"learning_rate": 4.094112647227996e-06,
"loss": 0.015,
"step": 472
},
{
"epoch": 3.0516129032258066,
"grad_norm": 0.3105918765068054,
"learning_rate": 4.042159083703031e-06,
"loss": 0.0136,
"step": 473
},
{
"epoch": 3.0580645161290323,
"grad_norm": 0.3890332281589508,
"learning_rate": 3.9904736157942355e-06,
"loss": 0.0128,
"step": 474
},
{
"epoch": 3.064516129032258,
"grad_norm": 0.2500901222229004,
"learning_rate": 3.939057881821295e-06,
"loss": 0.0109,
"step": 475
},
{
"epoch": 3.0709677419354837,
"grad_norm": 0.33506497740745544,
"learning_rate": 3.887913511553917e-06,
"loss": 0.0138,
"step": 476
},
{
"epoch": 3.07741935483871,
"grad_norm": 0.35200193524360657,
"learning_rate": 3.837042126160157e-06,
"loss": 0.0163,
"step": 477
},
{
"epoch": 3.0838709677419356,
"grad_norm": 0.33882763981819153,
"learning_rate": 3.786445338155013e-06,
"loss": 0.0125,
"step": 478
},
{
"epoch": 3.0903225806451613,
"grad_norm": 0.4239828586578369,
"learning_rate": 3.736124751349343e-06,
"loss": 0.0146,
"step": 479
},
{
"epoch": 3.096774193548387,
"grad_norm": 0.39569729566574097,
"learning_rate": 3.6860819607990108e-06,
"loss": 0.0149,
"step": 480
},
{
"epoch": 3.1032258064516127,
"grad_norm": 0.4424724578857422,
"learning_rate": 3.6363185527543156e-06,
"loss": 0.0147,
"step": 481
},
{
"epoch": 3.109677419354839,
"grad_norm": 0.54300457239151,
"learning_rate": 3.5868361046097475e-06,
"loss": 0.0166,
"step": 482
},
{
"epoch": 3.1161290322580646,
"grad_norm": 0.41813674569129944,
"learning_rate": 3.537636184853939e-06,
"loss": 0.0131,
"step": 483
},
{
"epoch": 3.1225806451612903,
"grad_norm": 0.4122736155986786,
"learning_rate": 3.4887203530199864e-06,
"loss": 0.0141,
"step": 484
},
{
"epoch": 3.129032258064516,
"grad_norm": 0.39058443903923035,
"learning_rate": 3.440090159636003e-06,
"loss": 0.0115,
"step": 485
},
{
"epoch": 3.135483870967742,
"grad_norm": 0.42365285754203796,
"learning_rate": 3.391747146175954e-06,
"loss": 0.0097,
"step": 486
},
{
"epoch": 3.141935483870968,
"grad_norm": 0.5885961055755615,
"learning_rate": 3.3436928450108264e-06,
"loss": 0.0219,
"step": 487
},
{
"epoch": 3.1483870967741936,
"grad_norm": 0.39843595027923584,
"learning_rate": 3.2959287793600356e-06,
"loss": 0.0098,
"step": 488
},
{
"epoch": 3.1548387096774193,
"grad_norm": 0.27729499340057373,
"learning_rate": 3.2484564632431396e-06,
"loss": 0.0068,
"step": 489
},
{
"epoch": 3.161290322580645,
"grad_norm": 0.4415301978588104,
"learning_rate": 3.2012774014318625e-06,
"loss": 0.0151,
"step": 490
},
{
"epoch": 3.167741935483871,
"grad_norm": 0.40353745222091675,
"learning_rate": 3.154393089402391e-06,
"loss": 0.016,
"step": 491
},
{
"epoch": 3.174193548387097,
"grad_norm": 0.4263345003128052,
"learning_rate": 3.107805013287958e-06,
"loss": 0.0112,
"step": 492
},
{
"epoch": 3.1806451612903226,
"grad_norm": 0.37084028124809265,
"learning_rate": 3.061514649831755e-06,
"loss": 0.014,
"step": 493
},
{
"epoch": 3.1870967741935483,
"grad_norm": 0.5308308005332947,
"learning_rate": 3.0155234663401146e-06,
"loss": 0.0146,
"step": 494
},
{
"epoch": 3.193548387096774,
"grad_norm": 0.47034651041030884,
"learning_rate": 2.9698329206359925e-06,
"loss": 0.0124,
"step": 495
},
{
"epoch": 3.2,
"grad_norm": 0.3541916012763977,
"learning_rate": 2.9244444610127764e-06,
"loss": 0.0121,
"step": 496
},
{
"epoch": 3.206451612903226,
"grad_norm": 0.4616714417934418,
"learning_rate": 2.8793595261883465e-06,
"loss": 0.0181,
"step": 497
},
{
"epoch": 3.2129032258064516,
"grad_norm": 0.36847707629203796,
"learning_rate": 2.8345795452595095e-06,
"loss": 0.0165,
"step": 498
},
{
"epoch": 3.2193548387096773,
"grad_norm": 0.3731675148010254,
"learning_rate": 2.790105937656673e-06,
"loss": 0.013,
"step": 499
},
{
"epoch": 3.225806451612903,
"grad_norm": 0.42840346693992615,
"learning_rate": 2.7459401130988534e-06,
"loss": 0.0109,
"step": 500
},
{
"epoch": 3.232258064516129,
"grad_norm": 0.33602991700172424,
"learning_rate": 2.7020834715490093e-06,
"loss": 0.0106,
"step": 501
},
{
"epoch": 3.238709677419355,
"grad_norm": 0.47426825761795044,
"learning_rate": 2.6585374031696474e-06,
"loss": 0.0133,
"step": 502
},
{
"epoch": 3.2451612903225806,
"grad_norm": 0.41371604800224304,
"learning_rate": 2.61530328827877e-06,
"loss": 0.0094,
"step": 503
},
{
"epoch": 3.2516129032258063,
"grad_norm": 0.4533410668373108,
"learning_rate": 2.5723824973061e-06,
"loss": 0.0123,
"step": 504
},
{
"epoch": 3.258064516129032,
"grad_norm": 0.2636722922325134,
"learning_rate": 2.5297763907496746e-06,
"loss": 0.0086,
"step": 505
},
{
"epoch": 3.264516129032258,
"grad_norm": 0.48444676399230957,
"learning_rate": 2.4874863191326953e-06,
"loss": 0.0169,
"step": 506
},
{
"epoch": 3.270967741935484,
"grad_norm": 0.5979859828948975,
"learning_rate": 2.44551362296072e-06,
"loss": 0.0112,
"step": 507
},
{
"epoch": 3.2774193548387096,
"grad_norm": 0.43151959776878357,
"learning_rate": 2.4038596326791884e-06,
"loss": 0.0109,
"step": 508
},
{
"epoch": 3.2838709677419353,
"grad_norm": 0.4825892746448517,
"learning_rate": 2.362525668631238e-06,
"loss": 0.013,
"step": 509
},
{
"epoch": 3.2903225806451615,
"grad_norm": 0.3168151080608368,
"learning_rate": 2.3215130410158424e-06,
"loss": 0.0106,
"step": 510
},
{
"epoch": 3.296774193548387,
"grad_norm": 0.4605632722377777,
"learning_rate": 2.2808230498463116e-06,
"loss": 0.0189,
"step": 511
},
{
"epoch": 3.303225806451613,
"grad_norm": 0.47640544176101685,
"learning_rate": 2.240456984909049e-06,
"loss": 0.015,
"step": 512
},
{
"epoch": 3.3096774193548386,
"grad_norm": 0.5328596234321594,
"learning_rate": 2.2004161257226805e-06,
"loss": 0.0201,
"step": 513
},
{
"epoch": 3.3161290322580643,
"grad_norm": 0.5342445969581604,
"learning_rate": 2.16070174149752e-06,
"loss": 0.0109,
"step": 514
},
{
"epoch": 3.3225806451612905,
"grad_norm": 0.5308839082717896,
"learning_rate": 2.121315091095297e-06,
"loss": 0.014,
"step": 515
},
{
"epoch": 3.329032258064516,
"grad_norm": 0.4669474959373474,
"learning_rate": 2.082257422989281e-06,
"loss": 0.0105,
"step": 516
},
{
"epoch": 3.335483870967742,
"grad_norm": 0.37382492423057556,
"learning_rate": 2.0435299752247077e-06,
"loss": 0.0132,
"step": 517
},
{
"epoch": 3.3419354838709676,
"grad_norm": 0.4566926062107086,
"learning_rate": 2.0051339753795125e-06,
"loss": 0.0159,
"step": 518
},
{
"epoch": 3.3483870967741938,
"grad_norm": 0.4399929642677307,
"learning_rate": 1.9670706405254548e-06,
"loss": 0.0149,
"step": 519
},
{
"epoch": 3.3548387096774195,
"grad_norm": 0.3071390986442566,
"learning_rate": 1.929341177189506e-06,
"loss": 0.0085,
"step": 520
},
{
"epoch": 3.361290322580645,
"grad_norm": 0.39541929960250854,
"learning_rate": 1.8919467813156121e-06,
"loss": 0.0088,
"step": 521
},
{
"epoch": 3.367741935483871,
"grad_norm": 0.49959710240364075,
"learning_rate": 1.854888638226815e-06,
"loss": 0.0147,
"step": 522
},
{
"epoch": 3.3741935483870966,
"grad_norm": 0.3740963339805603,
"learning_rate": 1.8181679225876324e-06,
"loss": 0.0099,
"step": 523
},
{
"epoch": 3.3806451612903228,
"grad_norm": 0.27066710591316223,
"learning_rate": 1.7817857983668612e-06,
"loss": 0.0071,
"step": 524
},
{
"epoch": 3.3870967741935485,
"grad_norm": 0.47990116477012634,
"learning_rate": 1.745743418800669e-06,
"loss": 0.012,
"step": 525
},
{
"epoch": 3.393548387096774,
"grad_norm": 0.40311211347579956,
"learning_rate": 1.7100419263560263e-06,
"loss": 0.0102,
"step": 526
},
{
"epoch": 3.4,
"grad_norm": 0.4134123623371124,
"learning_rate": 1.6746824526945163e-06,
"loss": 0.0123,
"step": 527
},
{
"epoch": 3.4064516129032256,
"grad_norm": 0.44432902336120605,
"learning_rate": 1.6396661186364543e-06,
"loss": 0.0106,
"step": 528
},
{
"epoch": 3.412903225806452,
"grad_norm": 0.4438421428203583,
"learning_rate": 1.6049940341253442e-06,
"loss": 0.0172,
"step": 529
},
{
"epoch": 3.4193548387096775,
"grad_norm": 0.5001305341720581,
"learning_rate": 1.570667298192724e-06,
"loss": 0.0115,
"step": 530
},
{
"epoch": 3.425806451612903,
"grad_norm": 0.36009618639945984,
"learning_rate": 1.5366869989233062e-06,
"loss": 0.0111,
"step": 531
},
{
"epoch": 3.432258064516129,
"grad_norm": 0.35556045174598694,
"learning_rate": 1.5030542134205003e-06,
"loss": 0.0129,
"step": 532
},
{
"epoch": 3.4387096774193546,
"grad_norm": 0.38800477981567383,
"learning_rate": 1.4697700077722616e-06,
"loss": 0.0111,
"step": 533
},
{
"epoch": 3.445161290322581,
"grad_norm": 0.35132497549057007,
"learning_rate": 1.4368354370173073e-06,
"loss": 0.0133,
"step": 534
},
{
"epoch": 3.4516129032258065,
"grad_norm": 0.49624019861221313,
"learning_rate": 1.404251545111672e-06,
"loss": 0.0152,
"step": 535
},
{
"epoch": 3.458064516129032,
"grad_norm": 0.34581199288368225,
"learning_rate": 1.3720193648956062e-06,
"loss": 0.0093,
"step": 536
},
{
"epoch": 3.464516129032258,
"grad_norm": 0.4548514783382416,
"learning_rate": 1.3401399180608551e-06,
"loss": 0.0174,
"step": 537
},
{
"epoch": 3.4709677419354836,
"grad_norm": 0.33973830938339233,
"learning_rate": 1.3086142151182605e-06,
"loss": 0.0143,
"step": 538
},
{
"epoch": 3.47741935483871,
"grad_norm": 0.3562283515930176,
"learning_rate": 1.2774432553657303e-06,
"loss": 0.0129,
"step": 539
},
{
"epoch": 3.4838709677419355,
"grad_norm": 0.42894405126571655,
"learning_rate": 1.2466280268565708e-06,
"loss": 0.0136,
"step": 540
},
{
"epoch": 3.490322580645161,
"grad_norm": 0.36266642808914185,
"learning_rate": 1.2161695063681589e-06,
"loss": 0.0152,
"step": 541
},
{
"epoch": 3.496774193548387,
"grad_norm": 0.41463732719421387,
"learning_rate": 1.186068659370984e-06,
"loss": 0.0126,
"step": 542
},
{
"epoch": 3.5032258064516126,
"grad_norm": 0.3517482876777649,
"learning_rate": 1.1563264399980512e-06,
"loss": 0.0106,
"step": 543
},
{
"epoch": 3.509677419354839,
"grad_norm": 0.3592299520969391,
"learning_rate": 1.1269437910146173e-06,
"loss": 0.01,
"step": 544
},
{
"epoch": 3.5161290322580645,
"grad_norm": 0.3486897051334381,
"learning_rate": 1.0979216437883327e-06,
"loss": 0.0132,
"step": 545
},
{
"epoch": 3.52258064516129,
"grad_norm": 0.3669939935207367,
"learning_rate": 1.069260918259704e-06,
"loss": 0.0108,
"step": 546
},
{
"epoch": 3.5290322580645164,
"grad_norm": 0.345688134431839,
"learning_rate": 1.0409625229129292e-06,
"loss": 0.0112,
"step": 547
},
{
"epoch": 3.535483870967742,
"grad_norm": 0.40567103028297424,
"learning_rate": 1.0130273547471176e-06,
"loss": 0.017,
"step": 548
},
{
"epoch": 3.541935483870968,
"grad_norm": 0.45447733998298645,
"learning_rate": 9.854562992478445e-07,
"loss": 0.0293,
"step": 549
},
{
"epoch": 3.5483870967741935,
"grad_norm": 0.40127208828926086,
"learning_rate": 9.582502303590798e-07,
"loss": 0.0151,
"step": 550
},
{
"epoch": 3.554838709677419,
"grad_norm": 0.32802486419677734,
"learning_rate": 9.314100104555066e-07,
"loss": 0.0101,
"step": 551
},
{
"epoch": 3.5612903225806454,
"grad_norm": 0.35557428002357483,
"learning_rate": 9.049364903151558e-07,
"loss": 0.0108,
"step": 552
},
{
"epoch": 3.567741935483871,
"grad_norm": 0.5105459094047546,
"learning_rate": 8.788305090924556e-07,
"loss": 0.0167,
"step": 553
},
{
"epoch": 3.574193548387097,
"grad_norm": 0.3577045202255249,
"learning_rate": 8.530928942916447e-07,
"loss": 0.0076,
"step": 554
},
{
"epoch": 3.5806451612903225,
"grad_norm": 0.2892685532569885,
"learning_rate": 8.277244617405102e-07,
"loss": 0.0077,
"step": 555
},
{
"epoch": 3.587096774193548,
"grad_norm": 0.47886940836906433,
"learning_rate": 8.027260155645546e-07,
"loss": 0.0109,
"step": 556
},
{
"epoch": 3.5935483870967744,
"grad_norm": 0.3236874043941498,
"learning_rate": 7.780983481614962e-07,
"loss": 0.0074,
"step": 557
},
{
"epoch": 3.6,
"grad_norm": 0.46806618571281433,
"learning_rate": 7.538422401761461e-07,
"loss": 0.0174,
"step": 558
},
{
"epoch": 3.606451612903226,
"grad_norm": 0.4186045229434967,
"learning_rate": 7.299584604756784e-07,
"loss": 0.0111,
"step": 559
},
{
"epoch": 3.6129032258064515,
"grad_norm": 0.4132605791091919,
"learning_rate": 7.064477661252483e-07,
"loss": 0.0132,
"step": 560
},
{
"epoch": 3.6193548387096772,
"grad_norm": 0.5827385783195496,
"learning_rate": 6.833109023639928e-07,
"loss": 0.017,
"step": 561
},
{
"epoch": 3.6258064516129034,
"grad_norm": 0.3105774521827698,
"learning_rate": 6.605486025814164e-07,
"loss": 0.0091,
"step": 562
},
{
"epoch": 3.632258064516129,
"grad_norm": 0.34796178340911865,
"learning_rate": 6.381615882941366e-07,
"loss": 0.0083,
"step": 563
},
{
"epoch": 3.638709677419355,
"grad_norm": 0.3462621867656708,
"learning_rate": 6.16150569123021e-07,
"loss": 0.0143,
"step": 564
},
{
"epoch": 3.6451612903225805,
"grad_norm": 0.4699903428554535,
"learning_rate": 5.945162427706888e-07,
"loss": 0.0145,
"step": 565
},
{
"epoch": 3.6516129032258062,
"grad_norm": 0.42084646224975586,
"learning_rate": 5.732592949993898e-07,
"loss": 0.015,
"step": 566
},
{
"epoch": 3.6580645161290324,
"grad_norm": 0.4539680778980255,
"learning_rate": 5.5238039960928e-07,
"loss": 0.0154,
"step": 567
},
{
"epoch": 3.664516129032258,
"grad_norm": 0.3853324353694916,
"learning_rate": 5.318802184170565e-07,
"loss": 0.0126,
"step": 568
},
{
"epoch": 3.670967741935484,
"grad_norm": 0.409679651260376,
"learning_rate": 5.117594012349735e-07,
"loss": 0.0143,
"step": 569
},
{
"epoch": 3.6774193548387095,
"grad_norm": 0.42005378007888794,
"learning_rate": 4.920185858502596e-07,
"loss": 0.0129,
"step": 570
},
{
"epoch": 3.6838709677419352,
"grad_norm": 0.34078460931777954,
"learning_rate": 4.7265839800488543e-07,
"loss": 0.0132,
"step": 571
},
{
"epoch": 3.6903225806451614,
"grad_norm": 0.5289260149002075,
"learning_rate": 4.5367945137573946e-07,
"loss": 0.0114,
"step": 572
},
{
"epoch": 3.696774193548387,
"grad_norm": 0.43742361664772034,
"learning_rate": 4.350823475551713e-07,
"loss": 0.0099,
"step": 573
},
{
"epoch": 3.703225806451613,
"grad_norm": 0.3581911623477936,
"learning_rate": 4.1686767603192344e-07,
"loss": 0.0153,
"step": 574
},
{
"epoch": 3.709677419354839,
"grad_norm": 0.3517850637435913,
"learning_rate": 3.990360141724478e-07,
"loss": 0.0123,
"step": 575
},
{
"epoch": 3.7161290322580647,
"grad_norm": 0.44253072142601013,
"learning_rate": 3.815879272025966e-07,
"loss": 0.0068,
"step": 576
},
{
"epoch": 3.7225806451612904,
"grad_norm": 0.3429562449455261,
"learning_rate": 3.6452396818971863e-07,
"loss": 0.0121,
"step": 577
},
{
"epoch": 3.729032258064516,
"grad_norm": 0.7084751129150391,
"learning_rate": 3.4784467802511797e-07,
"loss": 0.0145,
"step": 578
},
{
"epoch": 3.735483870967742,
"grad_norm": 0.388698011636734,
"learning_rate": 3.3155058540691037e-07,
"loss": 0.0107,
"step": 579
},
{
"epoch": 3.741935483870968,
"grad_norm": 0.41982078552246094,
"learning_rate": 3.1564220682327314e-07,
"loss": 0.014,
"step": 580
},
{
"epoch": 3.7483870967741937,
"grad_norm": 0.645720899105072,
"learning_rate": 3.001200465360593e-07,
"loss": 0.015,
"step": 581
},
{
"epoch": 3.7548387096774194,
"grad_norm": 0.5690763592720032,
"learning_rate": 2.8498459656482317e-07,
"loss": 0.0189,
"step": 582
},
{
"epoch": 3.761290322580645,
"grad_norm": 0.4491289556026459,
"learning_rate": 2.702363366712257e-07,
"loss": 0.0127,
"step": 583
},
{
"epoch": 3.767741935483871,
"grad_norm": 0.4925324618816376,
"learning_rate": 2.5587573434381895e-07,
"loss": 0.0138,
"step": 584
},
{
"epoch": 3.774193548387097,
"grad_norm": 0.6388445496559143,
"learning_rate": 2.41903244783237e-07,
"loss": 0.0145,
"step": 585
},
{
"epoch": 3.7806451612903227,
"grad_norm": 0.31376415491104126,
"learning_rate": 2.2831931088775904e-07,
"loss": 0.0135,
"step": 586
},
{
"epoch": 3.7870967741935484,
"grad_norm": 0.3549552261829376,
"learning_rate": 2.1512436323927604e-07,
"loss": 0.014,
"step": 587
},
{
"epoch": 3.793548387096774,
"grad_norm": 0.446304053068161,
"learning_rate": 2.0231882008963783e-07,
"loss": 0.0144,
"step": 588
},
{
"epoch": 3.8,
"grad_norm": 0.45615440607070923,
"learning_rate": 1.8990308734739976e-07,
"loss": 0.0143,
"step": 589
},
{
"epoch": 3.806451612903226,
"grad_norm": 0.3915248513221741,
"learning_rate": 1.7787755856495254e-07,
"loss": 0.0131,
"step": 590
},
{
"epoch": 3.8129032258064517,
"grad_norm": 0.2556948959827423,
"learning_rate": 1.6624261492605153e-07,
"loss": 0.0061,
"step": 591
},
{
"epoch": 3.8193548387096774,
"grad_norm": 0.5648970603942871,
"learning_rate": 1.5499862523372933e-07,
"loss": 0.011,
"step": 592
},
{
"epoch": 3.825806451612903,
"grad_norm": 0.30211833119392395,
"learning_rate": 1.4414594589860774e-07,
"loss": 0.0087,
"step": 593
},
{
"epoch": 3.832258064516129,
"grad_norm": 0.21295692026615143,
"learning_rate": 1.3368492092760142e-07,
"loss": 0.0059,
"step": 594
},
{
"epoch": 3.838709677419355,
"grad_norm": 0.49749764800071716,
"learning_rate": 1.2361588191300983e-07,
"loss": 0.0112,
"step": 595
},
{
"epoch": 3.8451612903225807,
"grad_norm": 0.3051888048648834,
"learning_rate": 1.139391480220145e-07,
"loss": 0.0077,
"step": 596
},
{
"epoch": 3.8516129032258064,
"grad_norm": 0.3583107590675354,
"learning_rate": 1.0465502598655114e-07,
"loss": 0.0115,
"step": 597
},
{
"epoch": 3.858064516129032,
"grad_norm": 0.4116378426551819,
"learning_rate": 9.576381009359508e-08,
"loss": 0.0127,
"step": 598
},
{
"epoch": 3.864516129032258,
"grad_norm": 0.3633911907672882,
"learning_rate": 8.726578217582993e-08,
"loss": 0.0109,
"step": 599
},
{
"epoch": 3.870967741935484,
"grad_norm": 0.35112428665161133,
"learning_rate": 7.916121160271572e-08,
"loss": 0.0112,
"step": 600
},
{
"epoch": 3.8774193548387097,
"grad_norm": 0.4247336685657501,
"learning_rate": 7.145035527194588e-08,
"loss": 0.0103,
"step": 601
},
{
"epoch": 3.8838709677419354,
"grad_norm": 0.36888250708580017,
"learning_rate": 6.413345760131057e-08,
"loss": 0.0145,
"step": 602
},
{
"epoch": 3.8903225806451616,
"grad_norm": 0.3833377957344055,
"learning_rate": 5.721075052094599e-08,
"loss": 0.0094,
"step": 603
},
{
"epoch": 3.896774193548387,
"grad_norm": 0.45575153827667236,
"learning_rate": 5.068245346598332e-08,
"loss": 0.0111,
"step": 604
},
{
"epoch": 3.903225806451613,
"grad_norm": 0.3035842776298523,
"learning_rate": 4.454877336958763e-08,
"loss": 0.0071,
"step": 605
},
{
"epoch": 3.9096774193548387,
"grad_norm": 0.3646605908870697,
"learning_rate": 3.8809904656410264e-08,
"loss": 0.0111,
"step": 606
},
{
"epoch": 3.9161290322580644,
"grad_norm": 0.3893778920173645,
"learning_rate": 3.346602923641473e-08,
"loss": 0.0084,
"step": 607
},
{
"epoch": 3.9225806451612906,
"grad_norm": 0.29794201254844666,
"learning_rate": 2.8517316499115932e-08,
"loss": 0.0069,
"step": 608
},
{
"epoch": 3.9290322580645163,
"grad_norm": 0.38667142391204834,
"learning_rate": 2.3963923308212288e-08,
"loss": 0.0144,
"step": 609
},
{
"epoch": 3.935483870967742,
"grad_norm": 0.39387401938438416,
"learning_rate": 1.9805993996606376e-08,
"loss": 0.0087,
"step": 610
},
{
"epoch": 3.9419354838709677,
"grad_norm": 0.3552229106426239,
"learning_rate": 1.604366036184052e-08,
"loss": 0.0093,
"step": 611
},
{
"epoch": 3.9483870967741934,
"grad_norm": 0.5617074966430664,
"learning_rate": 1.2677041661907085e-08,
"loss": 0.008,
"step": 612
},
{
"epoch": 3.9548387096774196,
"grad_norm": 0.3588564693927765,
"learning_rate": 9.706244611480674e-09,
"loss": 0.016,
"step": 613
},
{
"epoch": 3.9612903225806453,
"grad_norm": 0.36114564538002014,
"learning_rate": 7.131363378524991e-09,
"loss": 0.0113,
"step": 614
},
{
"epoch": 3.967741935483871,
"grad_norm": 0.5072866678237915,
"learning_rate": 4.952479581311897e-09,
"loss": 0.0128,
"step": 615
},
{
"epoch": 3.9741935483870967,
"grad_norm": 0.3469507396221161,
"learning_rate": 3.1696622858373716e-09,
"loss": 0.01,
"step": 616
},
{
"epoch": 3.9806451612903224,
"grad_norm": 0.6267412900924683,
"learning_rate": 1.7829680036274276e-09,
"loss": 0.0201,
"step": 617
},
{
"epoch": 3.9870967741935486,
"grad_norm": 0.4204852879047394,
"learning_rate": 7.924406899492698e-10,
"loss": 0.0108,
"step": 618
},
{
"epoch": 3.9935483870967743,
"grad_norm": 0.2933880090713501,
"learning_rate": 1.9811174241796127e-10,
"loss": 0.0113,
"step": 619
},
{
"epoch": 4.0,
"grad_norm": 0.3794356882572174,
"learning_rate": 0.0,
"loss": 0.0109,
"step": 620
},
{
"epoch": 4.0,
"step": 620,
"total_flos": 2.582134138035241e+17,
"train_loss": 0.053207253262911355,
"train_runtime": 778.335,
"train_samples_per_second": 25.449,
"train_steps_per_second": 0.797
}
],
"logging_steps": 1,
"max_steps": 620,
"num_input_tokens_seen": 0,
"num_train_epochs": 4,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.582134138035241e+17,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}