SuperCoder-7B-Qwen2.5-0525-peft / trainer_state.json
ferdinandjasong's picture
Model save
ffc2771 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.9984559958826558,
"eval_steps": 500,
"global_step": 970,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.002058672156459084,
"grad_norm": 0.16932366788387299,
"learning_rate": 6.666666666666667e-07,
"loss": 0.6431,
"step": 1
},
{
"epoch": 0.004117344312918168,
"grad_norm": 0.19655312597751617,
"learning_rate": 1.3333333333333334e-06,
"loss": 0.6827,
"step": 2
},
{
"epoch": 0.006176016469377252,
"grad_norm": 0.17550164461135864,
"learning_rate": 2.0000000000000003e-06,
"loss": 0.6469,
"step": 3
},
{
"epoch": 0.008234688625836336,
"grad_norm": 0.15466387569904327,
"learning_rate": 2.666666666666667e-06,
"loss": 0.6673,
"step": 4
},
{
"epoch": 0.010293360782295419,
"grad_norm": 0.1689433455467224,
"learning_rate": 3.3333333333333333e-06,
"loss": 0.659,
"step": 5
},
{
"epoch": 0.012352032938754504,
"grad_norm": 0.17134369909763336,
"learning_rate": 4.000000000000001e-06,
"loss": 0.6405,
"step": 6
},
{
"epoch": 0.014410705095213588,
"grad_norm": 0.16362226009368896,
"learning_rate": 4.666666666666667e-06,
"loss": 0.663,
"step": 7
},
{
"epoch": 0.016469377251672673,
"grad_norm": 0.17139802873134613,
"learning_rate": 5.333333333333334e-06,
"loss": 0.6571,
"step": 8
},
{
"epoch": 0.018528049408131755,
"grad_norm": 0.1583557277917862,
"learning_rate": 6e-06,
"loss": 0.6724,
"step": 9
},
{
"epoch": 0.020586721564590838,
"grad_norm": 0.1418927162885666,
"learning_rate": 6.666666666666667e-06,
"loss": 0.6629,
"step": 10
},
{
"epoch": 0.022645393721049924,
"grad_norm": 0.15474237501621246,
"learning_rate": 7.333333333333333e-06,
"loss": 0.6738,
"step": 11
},
{
"epoch": 0.024704065877509007,
"grad_norm": 0.1299714297056198,
"learning_rate": 8.000000000000001e-06,
"loss": 0.6195,
"step": 12
},
{
"epoch": 0.02676273803396809,
"grad_norm": 0.1421244591474533,
"learning_rate": 8.666666666666668e-06,
"loss": 0.6726,
"step": 13
},
{
"epoch": 0.028821410190427176,
"grad_norm": 0.13614629209041595,
"learning_rate": 9.333333333333334e-06,
"loss": 0.6725,
"step": 14
},
{
"epoch": 0.03088008234688626,
"grad_norm": 0.1408453732728958,
"learning_rate": 1e-05,
"loss": 0.6572,
"step": 15
},
{
"epoch": 0.032938754503345345,
"grad_norm": 0.14334328472614288,
"learning_rate": 9.999899472488811e-06,
"loss": 0.6799,
"step": 16
},
{
"epoch": 0.034997426659804425,
"grad_norm": 0.13737936317920685,
"learning_rate": 9.999597894446702e-06,
"loss": 0.6685,
"step": 17
},
{
"epoch": 0.03705609881626351,
"grad_norm": 0.14840838313102722,
"learning_rate": 9.999095279347846e-06,
"loss": 0.649,
"step": 18
},
{
"epoch": 0.0391147709727226,
"grad_norm": 0.16579611599445343,
"learning_rate": 9.998391649648529e-06,
"loss": 0.6383,
"step": 19
},
{
"epoch": 0.041173443129181676,
"grad_norm": 0.15359951555728912,
"learning_rate": 9.997487036786146e-06,
"loss": 0.6714,
"step": 20
},
{
"epoch": 0.04323211528564076,
"grad_norm": 0.14463751018047333,
"learning_rate": 9.996381481177804e-06,
"loss": 0.6824,
"step": 21
},
{
"epoch": 0.04529078744209985,
"grad_norm": 0.1506095975637436,
"learning_rate": 9.995075032218501e-06,
"loss": 0.6714,
"step": 22
},
{
"epoch": 0.04734945959855893,
"grad_norm": 0.15253892540931702,
"learning_rate": 9.993567748278931e-06,
"loss": 0.6719,
"step": 23
},
{
"epoch": 0.049408131755018014,
"grad_norm": 0.1528758406639099,
"learning_rate": 9.991859696702876e-06,
"loss": 0.6337,
"step": 24
},
{
"epoch": 0.0514668039114771,
"grad_norm": 0.147063210606575,
"learning_rate": 9.98995095380419e-06,
"loss": 0.694,
"step": 25
},
{
"epoch": 0.05352547606793618,
"grad_norm": 0.13172896206378937,
"learning_rate": 9.987841604863393e-06,
"loss": 0.6552,
"step": 26
},
{
"epoch": 0.055584148224395266,
"grad_norm": 0.15532605350017548,
"learning_rate": 9.985531744123863e-06,
"loss": 0.6665,
"step": 27
},
{
"epoch": 0.05764282038085435,
"grad_norm": 0.153465136885643,
"learning_rate": 9.983021474787623e-06,
"loss": 0.6723,
"step": 28
},
{
"epoch": 0.05970149253731343,
"grad_norm": 0.15754538774490356,
"learning_rate": 9.980310909010732e-06,
"loss": 0.6681,
"step": 29
},
{
"epoch": 0.06176016469377252,
"grad_norm": 0.14468301832675934,
"learning_rate": 9.97740016789827e-06,
"loss": 0.6462,
"step": 30
},
{
"epoch": 0.0638188368502316,
"grad_norm": 0.13467183709144592,
"learning_rate": 9.974289381498927e-06,
"loss": 0.6647,
"step": 31
},
{
"epoch": 0.06587750900669069,
"grad_norm": 0.1391790211200714,
"learning_rate": 9.970978688799202e-06,
"loss": 0.6668,
"step": 32
},
{
"epoch": 0.06793618116314977,
"grad_norm": 0.13762535154819489,
"learning_rate": 9.967468237717179e-06,
"loss": 0.6503,
"step": 33
},
{
"epoch": 0.06999485331960885,
"grad_norm": 0.14493001997470856,
"learning_rate": 9.963758185095935e-06,
"loss": 0.6573,
"step": 34
},
{
"epoch": 0.07205352547606794,
"grad_norm": 0.13914668560028076,
"learning_rate": 9.95984869669651e-06,
"loss": 0.6595,
"step": 35
},
{
"epoch": 0.07411219763252702,
"grad_norm": 0.14349870383739471,
"learning_rate": 9.95573994719053e-06,
"loss": 0.6477,
"step": 36
},
{
"epoch": 0.0761708697889861,
"grad_norm": 0.14882634580135345,
"learning_rate": 9.95143212015237e-06,
"loss": 0.676,
"step": 37
},
{
"epoch": 0.0782295419454452,
"grad_norm": 0.1246320977807045,
"learning_rate": 9.94692540805098e-06,
"loss": 0.6445,
"step": 38
},
{
"epoch": 0.08028821410190427,
"grad_norm": 0.14476200938224792,
"learning_rate": 9.942220012241274e-06,
"loss": 0.6721,
"step": 39
},
{
"epoch": 0.08234688625836335,
"grad_norm": 0.1350948065519333,
"learning_rate": 9.937316142955129e-06,
"loss": 0.6467,
"step": 40
},
{
"epoch": 0.08440555841482245,
"grad_norm": 0.14155423641204834,
"learning_rate": 9.932214019292002e-06,
"loss": 0.629,
"step": 41
},
{
"epoch": 0.08646423057128153,
"grad_norm": 0.14425547420978546,
"learning_rate": 9.926913869209132e-06,
"loss": 0.6652,
"step": 42
},
{
"epoch": 0.0885229027277406,
"grad_norm": 0.14249692857265472,
"learning_rate": 9.921415929511367e-06,
"loss": 0.6534,
"step": 43
},
{
"epoch": 0.0905815748841997,
"grad_norm": 0.13896256685256958,
"learning_rate": 9.915720445840566e-06,
"loss": 0.6697,
"step": 44
},
{
"epoch": 0.09264024704065878,
"grad_norm": 0.12779143452644348,
"learning_rate": 9.909827672664642e-06,
"loss": 0.6411,
"step": 45
},
{
"epoch": 0.09469891919711786,
"grad_norm": 0.13794159889221191,
"learning_rate": 9.90373787326618e-06,
"loss": 0.6512,
"step": 46
},
{
"epoch": 0.09675759135357695,
"grad_norm": 0.1406450718641281,
"learning_rate": 9.897451319730686e-06,
"loss": 0.6267,
"step": 47
},
{
"epoch": 0.09881626351003603,
"grad_norm": 0.1331593543291092,
"learning_rate": 9.890968292934413e-06,
"loss": 0.6478,
"step": 48
},
{
"epoch": 0.10087493566649511,
"grad_norm": 0.1347612887620926,
"learning_rate": 9.884289082531829e-06,
"loss": 0.6762,
"step": 49
},
{
"epoch": 0.1029336078229542,
"grad_norm": 0.15122374892234802,
"learning_rate": 9.877413986942668e-06,
"loss": 0.6322,
"step": 50
},
{
"epoch": 0.10499227997941328,
"grad_norm": 0.1411016583442688,
"learning_rate": 9.870343313338594e-06,
"loss": 0.6832,
"step": 51
},
{
"epoch": 0.10705095213587236,
"grad_norm": 0.14521309733390808,
"learning_rate": 9.863077377629484e-06,
"loss": 0.66,
"step": 52
},
{
"epoch": 0.10910962429233145,
"grad_norm": 0.1375553160905838,
"learning_rate": 9.855616504449308e-06,
"loss": 0.6558,
"step": 53
},
{
"epoch": 0.11116829644879053,
"grad_norm": 0.16293329000473022,
"learning_rate": 9.847961027141625e-06,
"loss": 0.6613,
"step": 54
},
{
"epoch": 0.11322696860524961,
"grad_norm": 0.14329375326633453,
"learning_rate": 9.840111287744696e-06,
"loss": 0.6355,
"step": 55
},
{
"epoch": 0.1152856407617087,
"grad_norm": 0.14197561144828796,
"learning_rate": 9.832067636976193e-06,
"loss": 0.6418,
"step": 56
},
{
"epoch": 0.11734431291816778,
"grad_norm": 0.14646419882774353,
"learning_rate": 9.823830434217533e-06,
"loss": 0.646,
"step": 57
},
{
"epoch": 0.11940298507462686,
"grad_norm": 0.15349656343460083,
"learning_rate": 9.815400047497829e-06,
"loss": 0.6742,
"step": 58
},
{
"epoch": 0.12146165723108594,
"grad_norm": 0.13744938373565674,
"learning_rate": 9.80677685347743e-06,
"loss": 0.6418,
"step": 59
},
{
"epoch": 0.12352032938754504,
"grad_norm": 0.152848482131958,
"learning_rate": 9.797961237431106e-06,
"loss": 0.6962,
"step": 60
},
{
"epoch": 0.12557900154400412,
"grad_norm": 0.1377212554216385,
"learning_rate": 9.788953593230835e-06,
"loss": 0.6485,
"step": 61
},
{
"epoch": 0.1276376737004632,
"grad_norm": 0.14834953844547272,
"learning_rate": 9.779754323328192e-06,
"loss": 0.6509,
"step": 62
},
{
"epoch": 0.12969634585692227,
"grad_norm": 0.1412557065486908,
"learning_rate": 9.770363838736382e-06,
"loss": 0.6409,
"step": 63
},
{
"epoch": 0.13175501801338138,
"grad_norm": 0.13826924562454224,
"learning_rate": 9.760782559011871e-06,
"loss": 0.6635,
"step": 64
},
{
"epoch": 0.13381369016984046,
"grad_norm": 0.15501998364925385,
"learning_rate": 9.751010912235635e-06,
"loss": 0.6759,
"step": 65
},
{
"epoch": 0.13587236232629954,
"grad_norm": 0.1448049247264862,
"learning_rate": 9.741049334994047e-06,
"loss": 0.6692,
"step": 66
},
{
"epoch": 0.13793103448275862,
"grad_norm": 0.16344545781612396,
"learning_rate": 9.730898272359358e-06,
"loss": 0.6603,
"step": 67
},
{
"epoch": 0.1399897066392177,
"grad_norm": 0.13649916648864746,
"learning_rate": 9.720558177869817e-06,
"loss": 0.6554,
"step": 68
},
{
"epoch": 0.14204837879567678,
"grad_norm": 0.14864695072174072,
"learning_rate": 9.71002951350941e-06,
"loss": 0.6478,
"step": 69
},
{
"epoch": 0.14410705095213588,
"grad_norm": 0.13324975967407227,
"learning_rate": 9.699312749687211e-06,
"loss": 0.6623,
"step": 70
},
{
"epoch": 0.14616572310859496,
"grad_norm": 0.12993821501731873,
"learning_rate": 9.688408365216379e-06,
"loss": 0.6548,
"step": 71
},
{
"epoch": 0.14822439526505404,
"grad_norm": 0.15033897757530212,
"learning_rate": 9.677316847292746e-06,
"loss": 0.6316,
"step": 72
},
{
"epoch": 0.15028306742151312,
"grad_norm": 0.14298583567142487,
"learning_rate": 9.666038691473067e-06,
"loss": 0.6864,
"step": 73
},
{
"epoch": 0.1523417395779722,
"grad_norm": 0.13992749154567719,
"learning_rate": 9.654574401652864e-06,
"loss": 0.6843,
"step": 74
},
{
"epoch": 0.15440041173443128,
"grad_norm": 0.12663426995277405,
"learning_rate": 9.64292449004393e-06,
"loss": 0.6492,
"step": 75
},
{
"epoch": 0.1564590838908904,
"grad_norm": 0.14738014340400696,
"learning_rate": 9.631089477151428e-06,
"loss": 0.6499,
"step": 76
},
{
"epoch": 0.15851775604734947,
"grad_norm": 0.12475479394197464,
"learning_rate": 9.619069891750642e-06,
"loss": 0.6465,
"step": 77
},
{
"epoch": 0.16057642820380855,
"grad_norm": 0.13470493257045746,
"learning_rate": 9.606866270863352e-06,
"loss": 0.6282,
"step": 78
},
{
"epoch": 0.16263510036026763,
"grad_norm": 0.13865548372268677,
"learning_rate": 9.594479159733841e-06,
"loss": 0.6694,
"step": 79
},
{
"epoch": 0.1646937725167267,
"grad_norm": 0.12641094624996185,
"learning_rate": 9.581909111804534e-06,
"loss": 0.6569,
"step": 80
},
{
"epoch": 0.16675244467318578,
"grad_norm": 0.15122997760772705,
"learning_rate": 9.569156688691267e-06,
"loss": 0.6489,
"step": 81
},
{
"epoch": 0.1688111168296449,
"grad_norm": 0.13927607238292694,
"learning_rate": 9.556222460158196e-06,
"loss": 0.6323,
"step": 82
},
{
"epoch": 0.17086978898610397,
"grad_norm": 0.13584642112255096,
"learning_rate": 9.543107004092351e-06,
"loss": 0.6404,
"step": 83
},
{
"epoch": 0.17292846114256305,
"grad_norm": 0.15319298207759857,
"learning_rate": 9.529810906477795e-06,
"loss": 0.6286,
"step": 84
},
{
"epoch": 0.17498713329902213,
"grad_norm": 0.14342650771141052,
"learning_rate": 9.516334761369467e-06,
"loss": 0.6528,
"step": 85
},
{
"epoch": 0.1770458054554812,
"grad_norm": 0.13270069658756256,
"learning_rate": 9.50267917086662e-06,
"loss": 0.6755,
"step": 86
},
{
"epoch": 0.1791044776119403,
"grad_norm": 0.1380058228969574,
"learning_rate": 9.488844745085935e-06,
"loss": 0.6686,
"step": 87
},
{
"epoch": 0.1811631497683994,
"grad_norm": 0.1321735680103302,
"learning_rate": 9.474832102134254e-06,
"loss": 0.6312,
"step": 88
},
{
"epoch": 0.18322182192485847,
"grad_norm": 0.17360003292560577,
"learning_rate": 9.460641868080961e-06,
"loss": 0.6722,
"step": 89
},
{
"epoch": 0.18528049408131755,
"grad_norm": 0.14511004090309143,
"learning_rate": 9.446274676930022e-06,
"loss": 0.6567,
"step": 90
},
{
"epoch": 0.18733916623777663,
"grad_norm": 0.1359574794769287,
"learning_rate": 9.43173117059164e-06,
"loss": 0.6462,
"step": 91
},
{
"epoch": 0.1893978383942357,
"grad_norm": 0.14268562197685242,
"learning_rate": 9.417011998853596e-06,
"loss": 0.6531,
"step": 92
},
{
"epoch": 0.1914565105506948,
"grad_norm": 0.14946860074996948,
"learning_rate": 9.4021178193522e-06,
"loss": 0.6442,
"step": 93
},
{
"epoch": 0.1935151827071539,
"grad_norm": 0.13256080448627472,
"learning_rate": 9.387049297542914e-06,
"loss": 0.6528,
"step": 94
},
{
"epoch": 0.19557385486361298,
"grad_norm": 0.15630853176116943,
"learning_rate": 9.371807106670628e-06,
"loss": 0.6434,
"step": 95
},
{
"epoch": 0.19763252702007206,
"grad_norm": 0.1315910816192627,
"learning_rate": 9.356391927739569e-06,
"loss": 0.6463,
"step": 96
},
{
"epoch": 0.19969119917653114,
"grad_norm": 0.14017589390277863,
"learning_rate": 9.340804449482882e-06,
"loss": 0.6752,
"step": 97
},
{
"epoch": 0.20174987133299022,
"grad_norm": 0.16451334953308105,
"learning_rate": 9.325045368331851e-06,
"loss": 0.659,
"step": 98
},
{
"epoch": 0.2038085434894493,
"grad_norm": 0.14280985295772552,
"learning_rate": 9.309115388384793e-06,
"loss": 0.6447,
"step": 99
},
{
"epoch": 0.2058672156459084,
"grad_norm": 0.15770740807056427,
"learning_rate": 9.293015221375588e-06,
"loss": 0.6925,
"step": 100
},
{
"epoch": 0.20792588780236748,
"grad_norm": 0.16106918454170227,
"learning_rate": 9.27674558664189e-06,
"loss": 0.6865,
"step": 101
},
{
"epoch": 0.20998455995882656,
"grad_norm": 0.13006511330604553,
"learning_rate": 9.260307211092987e-06,
"loss": 0.6452,
"step": 102
},
{
"epoch": 0.21204323211528564,
"grad_norm": 0.14992979168891907,
"learning_rate": 9.243700829177307e-06,
"loss": 0.6787,
"step": 103
},
{
"epoch": 0.21410190427174472,
"grad_norm": 0.1523681879043579,
"learning_rate": 9.226927182849631e-06,
"loss": 0.6534,
"step": 104
},
{
"epoch": 0.2161605764282038,
"grad_norm": 0.14899896085262299,
"learning_rate": 9.209987021537921e-06,
"loss": 0.6547,
"step": 105
},
{
"epoch": 0.2182192485846629,
"grad_norm": 0.14096584916114807,
"learning_rate": 9.192881102109848e-06,
"loss": 0.647,
"step": 106
},
{
"epoch": 0.22027792074112199,
"grad_norm": 0.1324293166399002,
"learning_rate": 9.17561018883897e-06,
"loss": 0.6622,
"step": 107
},
{
"epoch": 0.22233659289758106,
"grad_norm": 0.14914196729660034,
"learning_rate": 9.158175053370585e-06,
"loss": 0.6642,
"step": 108
},
{
"epoch": 0.22439526505404014,
"grad_norm": 0.1294255554676056,
"learning_rate": 9.140576474687263e-06,
"loss": 0.6697,
"step": 109
},
{
"epoch": 0.22645393721049922,
"grad_norm": 0.15517495572566986,
"learning_rate": 9.122815239074034e-06,
"loss": 0.6626,
"step": 110
},
{
"epoch": 0.2285126093669583,
"grad_norm": 0.15496940910816193,
"learning_rate": 9.104892140083257e-06,
"loss": 0.6505,
"step": 111
},
{
"epoch": 0.2305712815234174,
"grad_norm": 0.13310006260871887,
"learning_rate": 9.086807978499167e-06,
"loss": 0.6519,
"step": 112
},
{
"epoch": 0.2326299536798765,
"grad_norm": 0.1449541598558426,
"learning_rate": 9.068563562302106e-06,
"loss": 0.6604,
"step": 113
},
{
"epoch": 0.23468862583633557,
"grad_norm": 0.1366063952445984,
"learning_rate": 9.050159706632403e-06,
"loss": 0.6431,
"step": 114
},
{
"epoch": 0.23674729799279465,
"grad_norm": 0.15107031166553497,
"learning_rate": 9.031597233753975e-06,
"loss": 0.6639,
"step": 115
},
{
"epoch": 0.23880597014925373,
"grad_norm": 0.14455774426460266,
"learning_rate": 9.012876973017578e-06,
"loss": 0.6661,
"step": 116
},
{
"epoch": 0.2408646423057128,
"grad_norm": 0.13330498337745667,
"learning_rate": 8.993999760823753e-06,
"loss": 0.6575,
"step": 117
},
{
"epoch": 0.24292331446217189,
"grad_norm": 0.13691799342632294,
"learning_rate": 8.974966440585461e-06,
"loss": 0.6786,
"step": 118
},
{
"epoch": 0.244981986618631,
"grad_norm": 0.1397402137517929,
"learning_rate": 8.955777862690397e-06,
"loss": 0.6583,
"step": 119
},
{
"epoch": 0.24704065877509007,
"grad_norm": 0.1374320238828659,
"learning_rate": 8.936434884462995e-06,
"loss": 0.6721,
"step": 120
},
{
"epoch": 0.24909933093154915,
"grad_norm": 0.15251614153385162,
"learning_rate": 8.916938370126127e-06,
"loss": 0.6693,
"step": 121
},
{
"epoch": 0.25115800308800823,
"grad_norm": 0.1461092084646225,
"learning_rate": 8.897289190762488e-06,
"loss": 0.6599,
"step": 122
},
{
"epoch": 0.25321667524446734,
"grad_norm": 0.1487017124891281,
"learning_rate": 8.877488224275676e-06,
"loss": 0.6565,
"step": 123
},
{
"epoch": 0.2552753474009264,
"grad_norm": 0.14306996762752533,
"learning_rate": 8.857536355350972e-06,
"loss": 0.6866,
"step": 124
},
{
"epoch": 0.2573340195573855,
"grad_norm": 0.15157969295978546,
"learning_rate": 8.837434475415811e-06,
"loss": 0.6626,
"step": 125
},
{
"epoch": 0.25939269171384455,
"grad_norm": 0.14696666598320007,
"learning_rate": 8.81718348259995e-06,
"loss": 0.6777,
"step": 126
},
{
"epoch": 0.26145136387030365,
"grad_norm": 0.15098857879638672,
"learning_rate": 8.79678428169535e-06,
"loss": 0.6868,
"step": 127
},
{
"epoch": 0.26351003602676276,
"grad_norm": 0.15824772417545319,
"learning_rate": 8.776237784115743e-06,
"loss": 0.6542,
"step": 128
},
{
"epoch": 0.2655687081832218,
"grad_norm": 0.13801230490207672,
"learning_rate": 8.755544907855913e-06,
"loss": 0.6728,
"step": 129
},
{
"epoch": 0.2676273803396809,
"grad_norm": 0.13847684860229492,
"learning_rate": 8.734706577450684e-06,
"loss": 0.6426,
"step": 130
},
{
"epoch": 0.26968605249613997,
"grad_norm": 0.15203307569026947,
"learning_rate": 8.713723723933604e-06,
"loss": 0.6381,
"step": 131
},
{
"epoch": 0.2717447246525991,
"grad_norm": 0.1289345920085907,
"learning_rate": 8.692597284795363e-06,
"loss": 0.6587,
"step": 132
},
{
"epoch": 0.27380339680905813,
"grad_norm": 0.13953810930252075,
"learning_rate": 8.671328203941889e-06,
"loss": 0.6473,
"step": 133
},
{
"epoch": 0.27586206896551724,
"grad_norm": 0.153923898935318,
"learning_rate": 8.649917431652191e-06,
"loss": 0.6685,
"step": 134
},
{
"epoch": 0.27792074112197634,
"grad_norm": 0.15305279195308685,
"learning_rate": 8.628365924535892e-06,
"loss": 0.6582,
"step": 135
},
{
"epoch": 0.2799794132784354,
"grad_norm": 0.13822555541992188,
"learning_rate": 8.606674645490487e-06,
"loss": 0.6305,
"step": 136
},
{
"epoch": 0.2820380854348945,
"grad_norm": 0.13595739006996155,
"learning_rate": 8.584844563658334e-06,
"loss": 0.6627,
"step": 137
},
{
"epoch": 0.28409675759135355,
"grad_norm": 0.145783469080925,
"learning_rate": 8.56287665438334e-06,
"loss": 0.6484,
"step": 138
},
{
"epoch": 0.28615542974781266,
"grad_norm": 0.14241014420986176,
"learning_rate": 8.540771899167395e-06,
"loss": 0.6313,
"step": 139
},
{
"epoch": 0.28821410190427177,
"grad_norm": 0.13967099785804749,
"learning_rate": 8.518531285626506e-06,
"loss": 0.671,
"step": 140
},
{
"epoch": 0.2902727740607308,
"grad_norm": 0.13454201817512512,
"learning_rate": 8.496155807446688e-06,
"loss": 0.6626,
"step": 141
},
{
"epoch": 0.2923314462171899,
"grad_norm": 0.14464090764522552,
"learning_rate": 8.473646464339557e-06,
"loss": 0.6537,
"step": 142
},
{
"epoch": 0.294390118373649,
"grad_norm": 0.1445484757423401,
"learning_rate": 8.451004261997664e-06,
"loss": 0.6588,
"step": 143
},
{
"epoch": 0.2964487905301081,
"grad_norm": 0.13212668895721436,
"learning_rate": 8.428230212049562e-06,
"loss": 0.6209,
"step": 144
},
{
"epoch": 0.29850746268656714,
"grad_norm": 0.1413717269897461,
"learning_rate": 8.40532533201461e-06,
"loss": 0.6491,
"step": 145
},
{
"epoch": 0.30056613484302624,
"grad_norm": 0.14644797146320343,
"learning_rate": 8.382290645257518e-06,
"loss": 0.6551,
"step": 146
},
{
"epoch": 0.30262480699948535,
"grad_norm": 0.1352260410785675,
"learning_rate": 8.359127180942607e-06,
"loss": 0.6731,
"step": 147
},
{
"epoch": 0.3046834791559444,
"grad_norm": 0.1495533436536789,
"learning_rate": 8.335835973987841e-06,
"loss": 0.6717,
"step": 148
},
{
"epoch": 0.3067421513124035,
"grad_norm": 0.15269213914871216,
"learning_rate": 8.312418065018582e-06,
"loss": 0.6738,
"step": 149
},
{
"epoch": 0.30880082346886256,
"grad_norm": 0.13821226358413696,
"learning_rate": 8.288874500321103e-06,
"loss": 0.6348,
"step": 150
},
{
"epoch": 0.31085949562532167,
"grad_norm": 0.13883084058761597,
"learning_rate": 8.26520633179583e-06,
"loss": 0.6408,
"step": 151
},
{
"epoch": 0.3129181677817808,
"grad_norm": 0.1385805606842041,
"learning_rate": 8.24141461691035e-06,
"loss": 0.6686,
"step": 152
},
{
"epoch": 0.3149768399382398,
"grad_norm": 0.1448226422071457,
"learning_rate": 8.21750041865217e-06,
"loss": 0.6555,
"step": 153
},
{
"epoch": 0.31703551209469893,
"grad_norm": 0.153200164437294,
"learning_rate": 8.193464805481218e-06,
"loss": 0.6632,
"step": 154
},
{
"epoch": 0.319094184251158,
"grad_norm": 0.13748742640018463,
"learning_rate": 8.1693088512821e-06,
"loss": 0.6241,
"step": 155
},
{
"epoch": 0.3211528564076171,
"grad_norm": 0.14238247275352478,
"learning_rate": 8.14503363531613e-06,
"loss": 0.6813,
"step": 156
},
{
"epoch": 0.32321152856407614,
"grad_norm": 0.13559186458587646,
"learning_rate": 8.120640242173107e-06,
"loss": 0.6463,
"step": 157
},
{
"epoch": 0.32527020072053525,
"grad_norm": 0.1391640305519104,
"learning_rate": 8.096129761722855e-06,
"loss": 0.6454,
"step": 158
},
{
"epoch": 0.32732887287699436,
"grad_norm": 0.13450267910957336,
"learning_rate": 8.071503289066526e-06,
"loss": 0.6459,
"step": 159
},
{
"epoch": 0.3293875450334534,
"grad_norm": 0.13703607022762299,
"learning_rate": 8.046761924487679e-06,
"loss": 0.645,
"step": 160
},
{
"epoch": 0.3314462171899125,
"grad_norm": 0.12534037232398987,
"learning_rate": 8.021906773403116e-06,
"loss": 0.6535,
"step": 161
},
{
"epoch": 0.33350488934637157,
"grad_norm": 0.13822266459465027,
"learning_rate": 7.996938946313496e-06,
"loss": 0.6436,
"step": 162
},
{
"epoch": 0.3355635615028307,
"grad_norm": 0.1410212516784668,
"learning_rate": 7.971859558753714e-06,
"loss": 0.635,
"step": 163
},
{
"epoch": 0.3376222336592898,
"grad_norm": 0.13720424473285675,
"learning_rate": 7.946669731243064e-06,
"loss": 0.6512,
"step": 164
},
{
"epoch": 0.33968090581574883,
"grad_norm": 0.14550641179084778,
"learning_rate": 7.921370589235178e-06,
"loss": 0.662,
"step": 165
},
{
"epoch": 0.34173957797220794,
"grad_norm": 0.15036438405513763,
"learning_rate": 7.895963263067734e-06,
"loss": 0.6653,
"step": 166
},
{
"epoch": 0.343798250128667,
"grad_norm": 0.14174431562423706,
"learning_rate": 7.870448887911961e-06,
"loss": 0.6855,
"step": 167
},
{
"epoch": 0.3458569222851261,
"grad_norm": 0.13360486924648285,
"learning_rate": 7.844828603721923e-06,
"loss": 0.6621,
"step": 168
},
{
"epoch": 0.34791559444158515,
"grad_norm": 0.15754815936088562,
"learning_rate": 7.819103555183575e-06,
"loss": 0.6709,
"step": 169
},
{
"epoch": 0.34997426659804426,
"grad_norm": 0.13917994499206543,
"learning_rate": 7.79327489166363e-06,
"loss": 0.6765,
"step": 170
},
{
"epoch": 0.35203293875450337,
"grad_norm": 0.1421346664428711,
"learning_rate": 7.767343767158201e-06,
"loss": 0.6689,
"step": 171
},
{
"epoch": 0.3540916109109624,
"grad_norm": 0.14150221645832062,
"learning_rate": 7.741311340241256e-06,
"loss": 0.6553,
"step": 172
},
{
"epoch": 0.3561502830674215,
"grad_norm": 0.13491779565811157,
"learning_rate": 7.715178774012825e-06,
"loss": 0.6792,
"step": 173
},
{
"epoch": 0.3582089552238806,
"grad_norm": 0.14319780468940735,
"learning_rate": 7.688947236047066e-06,
"loss": 0.6313,
"step": 174
},
{
"epoch": 0.3602676273803397,
"grad_norm": 0.14893461763858795,
"learning_rate": 7.662617898340077e-06,
"loss": 0.6551,
"step": 175
},
{
"epoch": 0.3623262995367988,
"grad_norm": 0.13200290501117706,
"learning_rate": 7.636191937257545e-06,
"loss": 0.6639,
"step": 176
},
{
"epoch": 0.36438497169325784,
"grad_norm": 0.12563662230968475,
"learning_rate": 7.609670533482181e-06,
"loss": 0.6628,
"step": 177
},
{
"epoch": 0.36644364384971695,
"grad_norm": 0.13975931704044342,
"learning_rate": 7.583054871960968e-06,
"loss": 0.6412,
"step": 178
},
{
"epoch": 0.368502316006176,
"grad_norm": 0.1629699319601059,
"learning_rate": 7.556346141852221e-06,
"loss": 0.6511,
"step": 179
},
{
"epoch": 0.3705609881626351,
"grad_norm": 0.1409747153520584,
"learning_rate": 7.529545536472459e-06,
"loss": 0.6096,
"step": 180
},
{
"epoch": 0.37261966031909416,
"grad_norm": 0.15238557755947113,
"learning_rate": 7.502654253243084e-06,
"loss": 0.623,
"step": 181
},
{
"epoch": 0.37467833247555327,
"grad_norm": 0.14103642106056213,
"learning_rate": 7.475673493636889e-06,
"loss": 0.6397,
"step": 182
},
{
"epoch": 0.3767370046320124,
"grad_norm": 0.14412514865398407,
"learning_rate": 7.448604463124365e-06,
"loss": 0.6388,
"step": 183
},
{
"epoch": 0.3787956767884714,
"grad_norm": 0.14426256716251373,
"learning_rate": 7.4214483711198535e-06,
"loss": 0.6335,
"step": 184
},
{
"epoch": 0.38085434894493053,
"grad_norm": 0.15213246643543243,
"learning_rate": 7.394206430927509e-06,
"loss": 0.6639,
"step": 185
},
{
"epoch": 0.3829130211013896,
"grad_norm": 0.1403135508298874,
"learning_rate": 7.366879859687083e-06,
"loss": 0.6571,
"step": 186
},
{
"epoch": 0.3849716932578487,
"grad_norm": 0.12427900731563568,
"learning_rate": 7.339469878319554e-06,
"loss": 0.6428,
"step": 187
},
{
"epoch": 0.3870303654143078,
"grad_norm": 0.13845951855182648,
"learning_rate": 7.311977711472569e-06,
"loss": 0.654,
"step": 188
},
{
"epoch": 0.38908903757076685,
"grad_norm": 0.13073399662971497,
"learning_rate": 7.284404587465733e-06,
"loss": 0.6454,
"step": 189
},
{
"epoch": 0.39114770972722596,
"grad_norm": 0.14320863783359528,
"learning_rate": 7.2567517382357265e-06,
"loss": 0.6602,
"step": 190
},
{
"epoch": 0.393206381883685,
"grad_norm": 0.12681038677692413,
"learning_rate": 7.229020399281268e-06,
"loss": 0.6597,
"step": 191
},
{
"epoch": 0.3952650540401441,
"grad_norm": 0.13371235132217407,
"learning_rate": 7.201211809607903e-06,
"loss": 0.6425,
"step": 192
},
{
"epoch": 0.39732372619660317,
"grad_norm": 0.13812264800071716,
"learning_rate": 7.17332721167266e-06,
"loss": 0.6728,
"step": 193
},
{
"epoch": 0.3993823983530623,
"grad_norm": 0.158253014087677,
"learning_rate": 7.145367851328531e-06,
"loss": 0.6973,
"step": 194
},
{
"epoch": 0.4014410705095214,
"grad_norm": 0.15043658018112183,
"learning_rate": 7.1173349777688075e-06,
"loss": 0.6485,
"step": 195
},
{
"epoch": 0.40349974266598043,
"grad_norm": 0.14149758219718933,
"learning_rate": 7.089229843471276e-06,
"loss": 0.6643,
"step": 196
},
{
"epoch": 0.40555841482243954,
"grad_norm": 0.16252540051937103,
"learning_rate": 7.061053704142244e-06,
"loss": 0.6628,
"step": 197
},
{
"epoch": 0.4076170869788986,
"grad_norm": 0.14584672451019287,
"learning_rate": 7.032807818660451e-06,
"loss": 0.644,
"step": 198
},
{
"epoch": 0.4096757591353577,
"grad_norm": 0.13865599036216736,
"learning_rate": 7.004493449020818e-06,
"loss": 0.6649,
"step": 199
},
{
"epoch": 0.4117344312918168,
"grad_norm": 0.14170795679092407,
"learning_rate": 6.976111860278061e-06,
"loss": 0.6651,
"step": 200
},
{
"epoch": 0.41379310344827586,
"grad_norm": 0.14801864326000214,
"learning_rate": 6.947664320490169e-06,
"loss": 0.6367,
"step": 201
},
{
"epoch": 0.41585177560473496,
"grad_norm": 0.16109682619571686,
"learning_rate": 6.919152100661758e-06,
"loss": 0.6734,
"step": 202
},
{
"epoch": 0.417910447761194,
"grad_norm": 0.144635409116745,
"learning_rate": 6.890576474687264e-06,
"loss": 0.6564,
"step": 203
},
{
"epoch": 0.4199691199176531,
"grad_norm": 0.14229914546012878,
"learning_rate": 6.8619387192940516e-06,
"loss": 0.6759,
"step": 204
},
{
"epoch": 0.4220277920741122,
"grad_norm": 0.1457197219133377,
"learning_rate": 6.8332401139853545e-06,
"loss": 0.6601,
"step": 205
},
{
"epoch": 0.4240864642305713,
"grad_norm": 0.14699086546897888,
"learning_rate": 6.804481940983111e-06,
"loss": 0.6723,
"step": 206
},
{
"epoch": 0.4261451363870304,
"grad_norm": 0.13042482733726501,
"learning_rate": 6.775665485170681e-06,
"loss": 0.6187,
"step": 207
},
{
"epoch": 0.42820380854348944,
"grad_norm": 0.16707850992679596,
"learning_rate": 6.746792034035437e-06,
"loss": 0.656,
"step": 208
},
{
"epoch": 0.43026248069994855,
"grad_norm": 0.13840922713279724,
"learning_rate": 6.7178628776112355e-06,
"loss": 0.6637,
"step": 209
},
{
"epoch": 0.4323211528564076,
"grad_norm": 0.13450464606285095,
"learning_rate": 6.6888793084207885e-06,
"loss": 0.6457,
"step": 210
},
{
"epoch": 0.4343798250128667,
"grad_norm": 0.15876543521881104,
"learning_rate": 6.6598426214179045e-06,
"loss": 0.6644,
"step": 211
},
{
"epoch": 0.4364384971693258,
"grad_norm": 0.1447768658399582,
"learning_rate": 6.6307541139296415e-06,
"loss": 0.6595,
"step": 212
},
{
"epoch": 0.43849716932578486,
"grad_norm": 0.14029446244239807,
"learning_rate": 6.60161508559834e-06,
"loss": 0.6564,
"step": 213
},
{
"epoch": 0.44055584148224397,
"grad_norm": 0.14148728549480438,
"learning_rate": 6.57242683832355e-06,
"loss": 0.6398,
"step": 214
},
{
"epoch": 0.442614513638703,
"grad_norm": 0.15543389320373535,
"learning_rate": 6.543190676203877e-06,
"loss": 0.6546,
"step": 215
},
{
"epoch": 0.44467318579516213,
"grad_norm": 0.13774473965168,
"learning_rate": 6.5139079054787055e-06,
"loss": 0.6935,
"step": 216
},
{
"epoch": 0.4467318579516212,
"grad_norm": 0.14050574600696564,
"learning_rate": 6.484579834469838e-06,
"loss": 0.6388,
"step": 217
},
{
"epoch": 0.4487905301080803,
"grad_norm": 0.1455702781677246,
"learning_rate": 6.4552077735230485e-06,
"loss": 0.6726,
"step": 218
},
{
"epoch": 0.4508492022645394,
"grad_norm": 0.15097936987876892,
"learning_rate": 6.425793034949528e-06,
"loss": 0.6568,
"step": 219
},
{
"epoch": 0.45290787442099845,
"grad_norm": 0.14591309428215027,
"learning_rate": 6.396336932967262e-06,
"loss": 0.6456,
"step": 220
},
{
"epoch": 0.45496654657745755,
"grad_norm": 0.14051714539527893,
"learning_rate": 6.366840783642305e-06,
"loss": 0.6795,
"step": 221
},
{
"epoch": 0.4570252187339166,
"grad_norm": 0.14078466594219208,
"learning_rate": 6.337305904829979e-06,
"loss": 0.6237,
"step": 222
},
{
"epoch": 0.4590838908903757,
"grad_norm": 0.13030259311199188,
"learning_rate": 6.307733616116002e-06,
"loss": 0.6472,
"step": 223
},
{
"epoch": 0.4611425630468348,
"grad_norm": 0.12666334211826324,
"learning_rate": 6.2781252387575206e-06,
"loss": 0.6554,
"step": 224
},
{
"epoch": 0.46320123520329387,
"grad_norm": 0.13531836867332458,
"learning_rate": 6.248482095624087e-06,
"loss": 0.6548,
"step": 225
},
{
"epoch": 0.465259907359753,
"grad_norm": 0.14652900397777557,
"learning_rate": 6.218805511138544e-06,
"loss": 0.6849,
"step": 226
},
{
"epoch": 0.46731857951621203,
"grad_norm": 0.14632223546504974,
"learning_rate": 6.18909681121786e-06,
"loss": 0.6702,
"step": 227
},
{
"epoch": 0.46937725167267114,
"grad_norm": 0.133872389793396,
"learning_rate": 6.15935732321389e-06,
"loss": 0.6188,
"step": 228
},
{
"epoch": 0.4714359238291302,
"grad_norm": 0.133718803524971,
"learning_rate": 6.1295883758540595e-06,
"loss": 0.6622,
"step": 229
},
{
"epoch": 0.4734945959855893,
"grad_norm": 0.14238756895065308,
"learning_rate": 6.0997912991820065e-06,
"loss": 0.671,
"step": 230
},
{
"epoch": 0.4755532681420484,
"grad_norm": 0.13712720572948456,
"learning_rate": 6.069967424498157e-06,
"loss": 0.6622,
"step": 231
},
{
"epoch": 0.47761194029850745,
"grad_norm": 0.13542483747005463,
"learning_rate": 6.0401180843002435e-06,
"loss": 0.683,
"step": 232
},
{
"epoch": 0.47967061245496656,
"grad_norm": 0.1468353122472763,
"learning_rate": 6.01024461222377e-06,
"loss": 0.666,
"step": 233
},
{
"epoch": 0.4817292846114256,
"grad_norm": 0.1414223313331604,
"learning_rate": 5.980348342982424e-06,
"loss": 0.652,
"step": 234
},
{
"epoch": 0.4837879567678847,
"grad_norm": 0.13714027404785156,
"learning_rate": 5.950430612308444e-06,
"loss": 0.654,
"step": 235
},
{
"epoch": 0.48584662892434377,
"grad_norm": 0.12817879021167755,
"learning_rate": 5.9204927568929446e-06,
"loss": 0.6361,
"step": 236
},
{
"epoch": 0.4879053010808029,
"grad_norm": 0.13338102400302887,
"learning_rate": 5.89053611432619e-06,
"loss": 0.6557,
"step": 237
},
{
"epoch": 0.489963973237262,
"grad_norm": 0.14501504600048065,
"learning_rate": 5.860562023037834e-06,
"loss": 0.6799,
"step": 238
},
{
"epoch": 0.49202264539372104,
"grad_norm": 0.13377492129802704,
"learning_rate": 5.83057182223712e-06,
"loss": 0.6598,
"step": 239
},
{
"epoch": 0.49408131755018014,
"grad_norm": 0.15723681449890137,
"learning_rate": 5.800566851853048e-06,
"loss": 0.6385,
"step": 240
},
{
"epoch": 0.4961399897066392,
"grad_norm": 0.15144281089305878,
"learning_rate": 5.770548452474503e-06,
"loss": 0.6564,
"step": 241
},
{
"epoch": 0.4981986618630983,
"grad_norm": 0.13463923335075378,
"learning_rate": 5.740517965290366e-06,
"loss": 0.6809,
"step": 242
},
{
"epoch": 0.5002573340195574,
"grad_norm": 0.13458071649074554,
"learning_rate": 5.710476732029584e-06,
"loss": 0.6654,
"step": 243
},
{
"epoch": 0.5023160061760165,
"grad_norm": 0.1466810703277588,
"learning_rate": 5.680426094901233e-06,
"loss": 0.6519,
"step": 244
},
{
"epoch": 0.5043746783324755,
"grad_norm": 0.13636760413646698,
"learning_rate": 5.650367396534536e-06,
"loss": 0.637,
"step": 245
},
{
"epoch": 0.5064333504889347,
"grad_norm": 0.13849836587905884,
"learning_rate": 5.620301979918891e-06,
"loss": 0.6548,
"step": 246
},
{
"epoch": 0.5084920226453937,
"grad_norm": 0.13412857055664062,
"learning_rate": 5.590231188343852e-06,
"loss": 0.6706,
"step": 247
},
{
"epoch": 0.5105506948018528,
"grad_norm": 0.12860961258411407,
"learning_rate": 5.560156365339128e-06,
"loss": 0.6136,
"step": 248
},
{
"epoch": 0.5126093669583119,
"grad_norm": 0.12716427445411682,
"learning_rate": 5.530078854614541e-06,
"loss": 0.6531,
"step": 249
},
{
"epoch": 0.514668039114771,
"grad_norm": 0.13519296050071716,
"learning_rate": 5.500000000000001e-06,
"loss": 0.6526,
"step": 250
},
{
"epoch": 0.51672671127123,
"grad_norm": 0.15988165140151978,
"learning_rate": 5.469921145385462e-06,
"loss": 0.6177,
"step": 251
},
{
"epoch": 0.5187853834276891,
"grad_norm": 0.1526211053133011,
"learning_rate": 5.4398436346608726e-06,
"loss": 0.6473,
"step": 252
},
{
"epoch": 0.5208440555841483,
"grad_norm": 0.1384144276380539,
"learning_rate": 5.409768811656149e-06,
"loss": 0.6601,
"step": 253
},
{
"epoch": 0.5229027277406073,
"grad_norm": 0.1304050236940384,
"learning_rate": 5.379698020081111e-06,
"loss": 0.6667,
"step": 254
},
{
"epoch": 0.5249613998970664,
"grad_norm": 0.1473226696252823,
"learning_rate": 5.349632603465467e-06,
"loss": 0.6623,
"step": 255
},
{
"epoch": 0.5270200720535255,
"grad_norm": 0.14035162329673767,
"learning_rate": 5.3195739050987695e-06,
"loss": 0.6598,
"step": 256
},
{
"epoch": 0.5290787442099846,
"grad_norm": 0.12912693619728088,
"learning_rate": 5.289523267970417e-06,
"loss": 0.6669,
"step": 257
},
{
"epoch": 0.5311374163664436,
"grad_norm": 0.15704086422920227,
"learning_rate": 5.259482034709637e-06,
"loss": 0.6643,
"step": 258
},
{
"epoch": 0.5331960885229027,
"grad_norm": 0.15738819539546967,
"learning_rate": 5.229451547525499e-06,
"loss": 0.6695,
"step": 259
},
{
"epoch": 0.5352547606793618,
"grad_norm": 0.14451304078102112,
"learning_rate": 5.199433148146954e-06,
"loss": 0.6946,
"step": 260
},
{
"epoch": 0.5373134328358209,
"grad_norm": 0.13368944823741913,
"learning_rate": 5.169428177762882e-06,
"loss": 0.6802,
"step": 261
},
{
"epoch": 0.5393721049922799,
"grad_norm": 0.13744553923606873,
"learning_rate": 5.139437976962168e-06,
"loss": 0.6501,
"step": 262
},
{
"epoch": 0.5414307771487391,
"grad_norm": 0.1409062147140503,
"learning_rate": 5.109463885673812e-06,
"loss": 0.6462,
"step": 263
},
{
"epoch": 0.5434894493051982,
"grad_norm": 0.13469253480434418,
"learning_rate": 5.079507243107057e-06,
"loss": 0.6515,
"step": 264
},
{
"epoch": 0.5455481214616572,
"grad_norm": 0.13426664471626282,
"learning_rate": 5.0495693876915574e-06,
"loss": 0.6475,
"step": 265
},
{
"epoch": 0.5476067936181163,
"grad_norm": 0.14371594786643982,
"learning_rate": 5.019651657017578e-06,
"loss": 0.648,
"step": 266
},
{
"epoch": 0.5496654657745754,
"grad_norm": 0.1432650238275528,
"learning_rate": 4.989755387776231e-06,
"loss": 0.6851,
"step": 267
},
{
"epoch": 0.5517241379310345,
"grad_norm": 0.15230941772460938,
"learning_rate": 4.959881915699757e-06,
"loss": 0.6654,
"step": 268
},
{
"epoch": 0.5537828100874935,
"grad_norm": 0.13175268471240997,
"learning_rate": 4.930032575501845e-06,
"loss": 0.6681,
"step": 269
},
{
"epoch": 0.5558414822439527,
"grad_norm": 0.131820410490036,
"learning_rate": 4.9002087008179966e-06,
"loss": 0.6463,
"step": 270
},
{
"epoch": 0.5579001544004117,
"grad_norm": 0.14459112286567688,
"learning_rate": 4.870411624145942e-06,
"loss": 0.6762,
"step": 271
},
{
"epoch": 0.5599588265568708,
"grad_norm": 0.14455579221248627,
"learning_rate": 4.840642676786111e-06,
"loss": 0.6613,
"step": 272
},
{
"epoch": 0.56201749871333,
"grad_norm": 0.14118269085884094,
"learning_rate": 4.81090318878214e-06,
"loss": 0.6571,
"step": 273
},
{
"epoch": 0.564076170869789,
"grad_norm": 0.13893471658229828,
"learning_rate": 4.781194488861459e-06,
"loss": 0.6875,
"step": 274
},
{
"epoch": 0.5661348430262481,
"grad_norm": 0.14105477929115295,
"learning_rate": 4.751517904375915e-06,
"loss": 0.6367,
"step": 275
},
{
"epoch": 0.5681935151827071,
"grad_norm": 0.14537222683429718,
"learning_rate": 4.721874761242482e-06,
"loss": 0.6349,
"step": 276
},
{
"epoch": 0.5702521873391663,
"grad_norm": 0.1382058560848236,
"learning_rate": 4.692266383884e-06,
"loss": 0.6656,
"step": 277
},
{
"epoch": 0.5723108594956253,
"grad_norm": 0.15627695620059967,
"learning_rate": 4.662694095170023e-06,
"loss": 0.645,
"step": 278
},
{
"epoch": 0.5743695316520844,
"grad_norm": 0.13674962520599365,
"learning_rate": 4.633159216357696e-06,
"loss": 0.6777,
"step": 279
},
{
"epoch": 0.5764282038085435,
"grad_norm": 0.14432717859745026,
"learning_rate": 4.603663067032738e-06,
"loss": 0.6732,
"step": 280
},
{
"epoch": 0.5784868759650026,
"grad_norm": 0.14750322699546814,
"learning_rate": 4.574206965050472e-06,
"loss": 0.644,
"step": 281
},
{
"epoch": 0.5805455481214616,
"grad_norm": 0.13499832153320312,
"learning_rate": 4.544792226476954e-06,
"loss": 0.6534,
"step": 282
},
{
"epoch": 0.5826042202779207,
"grad_norm": 0.14756721258163452,
"learning_rate": 4.515420165530162e-06,
"loss": 0.6328,
"step": 283
},
{
"epoch": 0.5846628924343799,
"grad_norm": 0.13895276188850403,
"learning_rate": 4.486092094521296e-06,
"loss": 0.6573,
"step": 284
},
{
"epoch": 0.5867215645908389,
"grad_norm": 0.14413511753082275,
"learning_rate": 4.456809323796123e-06,
"loss": 0.6686,
"step": 285
},
{
"epoch": 0.588780236747298,
"grad_norm": 0.13578923046588898,
"learning_rate": 4.427573161676452e-06,
"loss": 0.6282,
"step": 286
},
{
"epoch": 0.5908389089037571,
"grad_norm": 0.14140614867210388,
"learning_rate": 4.398384914401663e-06,
"loss": 0.6761,
"step": 287
},
{
"epoch": 0.5928975810602162,
"grad_norm": 0.12944689393043518,
"learning_rate": 4.36924588607036e-06,
"loss": 0.6515,
"step": 288
},
{
"epoch": 0.5949562532166752,
"grad_norm": 0.13431216776371002,
"learning_rate": 4.3401573785820986e-06,
"loss": 0.6561,
"step": 289
},
{
"epoch": 0.5970149253731343,
"grad_norm": 0.14476732909679413,
"learning_rate": 4.311120691579214e-06,
"loss": 0.6593,
"step": 290
},
{
"epoch": 0.5990735975295934,
"grad_norm": 0.14597386121749878,
"learning_rate": 4.282137122388765e-06,
"loss": 0.6523,
"step": 291
},
{
"epoch": 0.6011322696860525,
"grad_norm": 0.13887791335582733,
"learning_rate": 4.253207965964565e-06,
"loss": 0.6537,
"step": 292
},
{
"epoch": 0.6031909418425115,
"grad_norm": 0.13218793272972107,
"learning_rate": 4.2243345148293215e-06,
"loss": 0.645,
"step": 293
},
{
"epoch": 0.6052496139989707,
"grad_norm": 0.13599814474582672,
"learning_rate": 4.195518059016891e-06,
"loss": 0.6041,
"step": 294
},
{
"epoch": 0.6073082861554298,
"grad_norm": 0.1398853212594986,
"learning_rate": 4.166759886014649e-06,
"loss": 0.6351,
"step": 295
},
{
"epoch": 0.6093669583118888,
"grad_norm": 0.13316257297992706,
"learning_rate": 4.13806128070595e-06,
"loss": 0.6566,
"step": 296
},
{
"epoch": 0.611425630468348,
"grad_norm": 0.14696818590164185,
"learning_rate": 4.109423525312738e-06,
"loss": 0.6408,
"step": 297
},
{
"epoch": 0.613484302624807,
"grad_norm": 0.1275351196527481,
"learning_rate": 4.080847899338244e-06,
"loss": 0.6514,
"step": 298
},
{
"epoch": 0.6155429747812661,
"grad_norm": 0.1334499716758728,
"learning_rate": 4.052335679509831e-06,
"loss": 0.6318,
"step": 299
},
{
"epoch": 0.6176016469377251,
"grad_norm": 0.1320067048072815,
"learning_rate": 4.02388813972194e-06,
"loss": 0.6527,
"step": 300
},
{
"epoch": 0.6196603190941843,
"grad_norm": 0.13550327718257904,
"learning_rate": 3.995506550979182e-06,
"loss": 0.6618,
"step": 301
},
{
"epoch": 0.6217189912506433,
"grad_norm": 0.14642766118049622,
"learning_rate": 3.967192181339549e-06,
"loss": 0.6611,
"step": 302
},
{
"epoch": 0.6237776634071024,
"grad_norm": 0.1360124945640564,
"learning_rate": 3.938946295857758e-06,
"loss": 0.6722,
"step": 303
},
{
"epoch": 0.6258363355635616,
"grad_norm": 0.14635741710662842,
"learning_rate": 3.910770156528726e-06,
"loss": 0.6529,
"step": 304
},
{
"epoch": 0.6278950077200206,
"grad_norm": 0.13334567844867706,
"learning_rate": 3.882665022231193e-06,
"loss": 0.6351,
"step": 305
},
{
"epoch": 0.6299536798764797,
"grad_norm": 0.13501949608325958,
"learning_rate": 3.8546321486714714e-06,
"loss": 0.6657,
"step": 306
},
{
"epoch": 0.6320123520329387,
"grad_norm": 0.14261625707149506,
"learning_rate": 3.826672788327341e-06,
"loss": 0.6573,
"step": 307
},
{
"epoch": 0.6340710241893979,
"grad_norm": 0.15793120861053467,
"learning_rate": 3.798788190392099e-06,
"loss": 0.6607,
"step": 308
},
{
"epoch": 0.6361296963458569,
"grad_norm": 0.1266845464706421,
"learning_rate": 3.770979600718734e-06,
"loss": 0.6593,
"step": 309
},
{
"epoch": 0.638188368502316,
"grad_norm": 0.14376017451286316,
"learning_rate": 3.743248261764274e-06,
"loss": 0.6492,
"step": 310
},
{
"epoch": 0.6402470406587751,
"grad_norm": 0.13482894003391266,
"learning_rate": 3.7155954125342685e-06,
"loss": 0.6721,
"step": 311
},
{
"epoch": 0.6423057128152342,
"grad_norm": 0.13621023297309875,
"learning_rate": 3.688022288527433e-06,
"loss": 0.6793,
"step": 312
},
{
"epoch": 0.6443643849716932,
"grad_norm": 0.13253401219844818,
"learning_rate": 3.6605301216804477e-06,
"loss": 0.653,
"step": 313
},
{
"epoch": 0.6464230571281523,
"grad_norm": 0.14206025004386902,
"learning_rate": 3.633120140312919e-06,
"loss": 0.6668,
"step": 314
},
{
"epoch": 0.6484817292846115,
"grad_norm": 0.14922694861888885,
"learning_rate": 3.605793569072493e-06,
"loss": 0.64,
"step": 315
},
{
"epoch": 0.6505404014410705,
"grad_norm": 0.13803939521312714,
"learning_rate": 3.578551628880148e-06,
"loss": 0.634,
"step": 316
},
{
"epoch": 0.6525990735975296,
"grad_norm": 0.14033062756061554,
"learning_rate": 3.551395536875637e-06,
"loss": 0.6335,
"step": 317
},
{
"epoch": 0.6546577457539887,
"grad_norm": 0.14162582159042358,
"learning_rate": 3.5243265063631125e-06,
"loss": 0.6668,
"step": 318
},
{
"epoch": 0.6567164179104478,
"grad_norm": 0.14522220194339752,
"learning_rate": 3.4973457467569155e-06,
"loss": 0.6433,
"step": 319
},
{
"epoch": 0.6587750900669068,
"grad_norm": 0.13454043865203857,
"learning_rate": 3.4704544635275416e-06,
"loss": 0.6498,
"step": 320
},
{
"epoch": 0.660833762223366,
"grad_norm": 0.14392301440238953,
"learning_rate": 3.4436538581477796e-06,
"loss": 0.6653,
"step": 321
},
{
"epoch": 0.662892434379825,
"grad_norm": 0.15344378352165222,
"learning_rate": 3.4169451280390343e-06,
"loss": 0.6941,
"step": 322
},
{
"epoch": 0.6649511065362841,
"grad_norm": 0.14536505937576294,
"learning_rate": 3.3903294665178204e-06,
"loss": 0.6546,
"step": 323
},
{
"epoch": 0.6670097786927431,
"grad_norm": 0.13551141321659088,
"learning_rate": 3.363808062742455e-06,
"loss": 0.6312,
"step": 324
},
{
"epoch": 0.6690684508492023,
"grad_norm": 0.1345638483762741,
"learning_rate": 3.3373821016599236e-06,
"loss": 0.6617,
"step": 325
},
{
"epoch": 0.6711271230056614,
"grad_norm": 0.1479879766702652,
"learning_rate": 3.3110527639529356e-06,
"loss": 0.6536,
"step": 326
},
{
"epoch": 0.6731857951621204,
"grad_norm": 0.14307381212711334,
"learning_rate": 3.2848212259871763e-06,
"loss": 0.6963,
"step": 327
},
{
"epoch": 0.6752444673185796,
"grad_norm": 0.13033559918403625,
"learning_rate": 3.2586886597587466e-06,
"loss": 0.6627,
"step": 328
},
{
"epoch": 0.6773031394750386,
"grad_norm": 0.150541290640831,
"learning_rate": 3.2326562328418e-06,
"loss": 0.6247,
"step": 329
},
{
"epoch": 0.6793618116314977,
"grad_norm": 0.13265936076641083,
"learning_rate": 3.2067251083363714e-06,
"loss": 0.6238,
"step": 330
},
{
"epoch": 0.6814204837879567,
"grad_norm": 0.13777919113636017,
"learning_rate": 3.180896444816427e-06,
"loss": 0.6473,
"step": 331
},
{
"epoch": 0.6834791559444159,
"grad_norm": 0.14997805655002594,
"learning_rate": 3.155171396278078e-06,
"loss": 0.6497,
"step": 332
},
{
"epoch": 0.6855378281008749,
"grad_norm": 0.14183354377746582,
"learning_rate": 3.1295511120880382e-06,
"loss": 0.6601,
"step": 333
},
{
"epoch": 0.687596500257334,
"grad_norm": 0.14279931783676147,
"learning_rate": 3.104036736932268e-06,
"loss": 0.6766,
"step": 334
},
{
"epoch": 0.6896551724137931,
"grad_norm": 0.13766002655029297,
"learning_rate": 3.078629410764824e-06,
"loss": 0.6352,
"step": 335
},
{
"epoch": 0.6917138445702522,
"grad_norm": 0.12679381668567657,
"learning_rate": 3.0533302687569368e-06,
"loss": 0.6538,
"step": 336
},
{
"epoch": 0.6937725167267113,
"grad_norm": 0.15894080698490143,
"learning_rate": 3.0281404412462866e-06,
"loss": 0.6452,
"step": 337
},
{
"epoch": 0.6958311888831703,
"grad_norm": 0.16960495710372925,
"learning_rate": 3.0030610536865047e-06,
"loss": 0.645,
"step": 338
},
{
"epoch": 0.6978898610396295,
"grad_norm": 0.13922789692878723,
"learning_rate": 2.978093226596884e-06,
"loss": 0.6361,
"step": 339
},
{
"epoch": 0.6999485331960885,
"grad_norm": 0.14750796556472778,
"learning_rate": 2.953238075512321e-06,
"loss": 0.6561,
"step": 340
},
{
"epoch": 0.7020072053525476,
"grad_norm": 0.14396293461322784,
"learning_rate": 2.928496710933477e-06,
"loss": 0.6676,
"step": 341
},
{
"epoch": 0.7040658775090067,
"grad_norm": 0.12205420434474945,
"learning_rate": 2.9038702382771476e-06,
"loss": 0.6441,
"step": 342
},
{
"epoch": 0.7061245496654658,
"grad_norm": 0.13652381300926208,
"learning_rate": 2.879359757826895e-06,
"loss": 0.653,
"step": 343
},
{
"epoch": 0.7081832218219248,
"grad_norm": 0.13769610226154327,
"learning_rate": 2.854966364683872e-06,
"loss": 0.6436,
"step": 344
},
{
"epoch": 0.7102418939783839,
"grad_norm": 0.12619943916797638,
"learning_rate": 2.8306911487179023e-06,
"loss": 0.6339,
"step": 345
},
{
"epoch": 0.712300566134843,
"grad_norm": 0.14851100742816925,
"learning_rate": 2.8065351945187837e-06,
"loss": 0.6616,
"step": 346
},
{
"epoch": 0.7143592382913021,
"grad_norm": 0.13649091124534607,
"learning_rate": 2.7824995813478295e-06,
"loss": 0.6575,
"step": 347
},
{
"epoch": 0.7164179104477612,
"grad_norm": 0.1376604586839676,
"learning_rate": 2.7585853830896527e-06,
"loss": 0.6612,
"step": 348
},
{
"epoch": 0.7184765826042203,
"grad_norm": 0.1331382691860199,
"learning_rate": 2.734793668204172e-06,
"loss": 0.671,
"step": 349
},
{
"epoch": 0.7205352547606794,
"grad_norm": 0.16621175408363342,
"learning_rate": 2.7111254996788995e-06,
"loss": 0.6829,
"step": 350
},
{
"epoch": 0.7225939269171384,
"grad_norm": 0.15275327861309052,
"learning_rate": 2.687581934981419e-06,
"loss": 0.6399,
"step": 351
},
{
"epoch": 0.7246525990735976,
"grad_norm": 0.1319313943386078,
"learning_rate": 2.664164026012161e-06,
"loss": 0.6693,
"step": 352
},
{
"epoch": 0.7267112712300566,
"grad_norm": 0.13566693663597107,
"learning_rate": 2.640872819057394e-06,
"loss": 0.6319,
"step": 353
},
{
"epoch": 0.7287699433865157,
"grad_norm": 0.14120978116989136,
"learning_rate": 2.6177093547424826e-06,
"loss": 0.6548,
"step": 354
},
{
"epoch": 0.7308286155429747,
"grad_norm": 0.13419358432292938,
"learning_rate": 2.5946746679853894e-06,
"loss": 0.6424,
"step": 355
},
{
"epoch": 0.7328872876994339,
"grad_norm": 0.14351977407932281,
"learning_rate": 2.57176978795044e-06,
"loss": 0.6382,
"step": 356
},
{
"epoch": 0.734945959855893,
"grad_norm": 0.1318623423576355,
"learning_rate": 2.548995738002338e-06,
"loss": 0.6725,
"step": 357
},
{
"epoch": 0.737004632012352,
"grad_norm": 0.12857410311698914,
"learning_rate": 2.5263535356604428e-06,
"loss": 0.6435,
"step": 358
},
{
"epoch": 0.7390633041688112,
"grad_norm": 0.1396513730287552,
"learning_rate": 2.503844192553313e-06,
"loss": 0.6557,
"step": 359
},
{
"epoch": 0.7411219763252702,
"grad_norm": 0.14913763105869293,
"learning_rate": 2.481468714373496e-06,
"loss": 0.6629,
"step": 360
},
{
"epoch": 0.7431806484817293,
"grad_norm": 0.1364419162273407,
"learning_rate": 2.4592281008326075e-06,
"loss": 0.6343,
"step": 361
},
{
"epoch": 0.7452393206381883,
"grad_norm": 0.15360961854457855,
"learning_rate": 2.437123345616661e-06,
"loss": 0.6321,
"step": 362
},
{
"epoch": 0.7472979927946475,
"grad_norm": 0.13863076269626617,
"learning_rate": 2.4151554363416676e-06,
"loss": 0.6363,
"step": 363
},
{
"epoch": 0.7493566649511065,
"grad_norm": 0.13245247304439545,
"learning_rate": 2.3933253545095143e-06,
"loss": 0.6604,
"step": 364
},
{
"epoch": 0.7514153371075656,
"grad_norm": 0.13168726861476898,
"learning_rate": 2.37163407546411e-06,
"loss": 0.6654,
"step": 365
},
{
"epoch": 0.7534740092640247,
"grad_norm": 0.13630832731723785,
"learning_rate": 2.3500825683478096e-06,
"loss": 0.6388,
"step": 366
},
{
"epoch": 0.7555326814204838,
"grad_norm": 0.14733514189720154,
"learning_rate": 2.328671796058113e-06,
"loss": 0.6632,
"step": 367
},
{
"epoch": 0.7575913535769428,
"grad_norm": 0.1482279747724533,
"learning_rate": 2.3074027152046384e-06,
"loss": 0.6596,
"step": 368
},
{
"epoch": 0.7596500257334019,
"grad_norm": 0.14237797260284424,
"learning_rate": 2.286276276066398e-06,
"loss": 0.6673,
"step": 369
},
{
"epoch": 0.7617086978898611,
"grad_norm": 0.1356516182422638,
"learning_rate": 2.265293422549319e-06,
"loss": 0.6612,
"step": 370
},
{
"epoch": 0.7637673700463201,
"grad_norm": 0.14135190844535828,
"learning_rate": 2.2444550921440884e-06,
"loss": 0.659,
"step": 371
},
{
"epoch": 0.7658260422027792,
"grad_norm": 0.1370486319065094,
"learning_rate": 2.2237622158842584e-06,
"loss": 0.6655,
"step": 372
},
{
"epoch": 0.7678847143592383,
"grad_norm": 0.14796264469623566,
"learning_rate": 2.2032157183046515e-06,
"loss": 0.6632,
"step": 373
},
{
"epoch": 0.7699433865156974,
"grad_norm": 0.1222207248210907,
"learning_rate": 2.1828165174000513e-06,
"loss": 0.6472,
"step": 374
},
{
"epoch": 0.7720020586721564,
"grad_norm": 0.12472368031740189,
"learning_rate": 2.162565524584191e-06,
"loss": 0.6539,
"step": 375
},
{
"epoch": 0.7740607308286156,
"grad_norm": 0.1569022238254547,
"learning_rate": 2.142463644649029e-06,
"loss": 0.6456,
"step": 376
},
{
"epoch": 0.7761194029850746,
"grad_norm": 0.13983507454395294,
"learning_rate": 2.1225117757243263e-06,
"loss": 0.6587,
"step": 377
},
{
"epoch": 0.7781780751415337,
"grad_norm": 0.14793013036251068,
"learning_rate": 2.1027108092375147e-06,
"loss": 0.7043,
"step": 378
},
{
"epoch": 0.7802367472979927,
"grad_norm": 0.13598047196865082,
"learning_rate": 2.0830616298738746e-06,
"loss": 0.6327,
"step": 379
},
{
"epoch": 0.7822954194544519,
"grad_norm": 0.150112122297287,
"learning_rate": 2.0635651155370064e-06,
"loss": 0.6451,
"step": 380
},
{
"epoch": 0.784354091610911,
"grad_norm": 0.1408482939004898,
"learning_rate": 2.0442221373096045e-06,
"loss": 0.6566,
"step": 381
},
{
"epoch": 0.78641276376737,
"grad_norm": 0.14770856499671936,
"learning_rate": 2.02503355941454e-06,
"loss": 0.6607,
"step": 382
},
{
"epoch": 0.7884714359238292,
"grad_norm": 0.15583674609661102,
"learning_rate": 2.0060002391762477e-06,
"loss": 0.6475,
"step": 383
},
{
"epoch": 0.7905301080802882,
"grad_norm": 0.14111104607582092,
"learning_rate": 1.987123026982423e-06,
"loss": 0.6368,
"step": 384
},
{
"epoch": 0.7925887802367473,
"grad_norm": 0.12933233380317688,
"learning_rate": 1.968402766246026e-06,
"loss": 0.6415,
"step": 385
},
{
"epoch": 0.7946474523932063,
"grad_norm": 0.14513786137104034,
"learning_rate": 1.9498402933676e-06,
"loss": 0.6619,
"step": 386
},
{
"epoch": 0.7967061245496655,
"grad_norm": 0.14459173381328583,
"learning_rate": 1.931436437697896e-06,
"loss": 0.6403,
"step": 387
},
{
"epoch": 0.7987647967061245,
"grad_norm": 0.14979158341884613,
"learning_rate": 1.9131920215008344e-06,
"loss": 0.6416,
"step": 388
},
{
"epoch": 0.8008234688625836,
"grad_norm": 0.1412138193845749,
"learning_rate": 1.895107859916746e-06,
"loss": 0.6314,
"step": 389
},
{
"epoch": 0.8028821410190428,
"grad_norm": 0.1349021941423416,
"learning_rate": 1.8771847609259675e-06,
"loss": 0.6633,
"step": 390
},
{
"epoch": 0.8049408131755018,
"grad_norm": 0.14212565124034882,
"learning_rate": 1.8594235253127373e-06,
"loss": 0.6393,
"step": 391
},
{
"epoch": 0.8069994853319609,
"grad_norm": 0.12094785273075104,
"learning_rate": 1.8418249466294153e-06,
"loss": 0.6438,
"step": 392
},
{
"epoch": 0.8090581574884199,
"grad_norm": 0.1392224282026291,
"learning_rate": 1.8243898111610314e-06,
"loss": 0.6443,
"step": 393
},
{
"epoch": 0.8111168296448791,
"grad_norm": 0.13410909473896027,
"learning_rate": 1.807118897890152e-06,
"loss": 0.6721,
"step": 394
},
{
"epoch": 0.8131755018013381,
"grad_norm": 0.13233357667922974,
"learning_rate": 1.7900129784620798e-06,
"loss": 0.6509,
"step": 395
},
{
"epoch": 0.8152341739577972,
"grad_norm": 0.1279245764017105,
"learning_rate": 1.7730728171503704e-06,
"loss": 0.6591,
"step": 396
},
{
"epoch": 0.8172928461142563,
"grad_norm": 0.1380516141653061,
"learning_rate": 1.7562991708226945e-06,
"loss": 0.6454,
"step": 397
},
{
"epoch": 0.8193515182707154,
"grad_norm": 0.14562861621379852,
"learning_rate": 1.7396927889070164e-06,
"loss": 0.6629,
"step": 398
},
{
"epoch": 0.8214101904271744,
"grad_norm": 0.14138072729110718,
"learning_rate": 1.723254413358111e-06,
"loss": 0.6479,
"step": 399
},
{
"epoch": 0.8234688625836336,
"grad_norm": 0.14233651757240295,
"learning_rate": 1.7069847786244136e-06,
"loss": 0.657,
"step": 400
},
{
"epoch": 0.8255275347400927,
"grad_norm": 0.14190144836902618,
"learning_rate": 1.690884611615209e-06,
"loss": 0.6598,
"step": 401
},
{
"epoch": 0.8275862068965517,
"grad_norm": 0.14529059827327728,
"learning_rate": 1.67495463166815e-06,
"loss": 0.6573,
"step": 402
},
{
"epoch": 0.8296448790530108,
"grad_norm": 0.13673891127109528,
"learning_rate": 1.6591955505171198e-06,
"loss": 0.6227,
"step": 403
},
{
"epoch": 0.8317035512094699,
"grad_norm": 0.1400144100189209,
"learning_rate": 1.6436080722604314e-06,
"loss": 0.6707,
"step": 404
},
{
"epoch": 0.833762223365929,
"grad_norm": 0.151056170463562,
"learning_rate": 1.628192893329374e-06,
"loss": 0.6468,
"step": 405
},
{
"epoch": 0.835820895522388,
"grad_norm": 0.1367853730916977,
"learning_rate": 1.612950702457087e-06,
"loss": 0.6765,
"step": 406
},
{
"epoch": 0.8378795676788472,
"grad_norm": 0.13104775547981262,
"learning_rate": 1.5978821806478027e-06,
"loss": 0.6541,
"step": 407
},
{
"epoch": 0.8399382398353062,
"grad_norm": 0.1309516578912735,
"learning_rate": 1.582988001146405e-06,
"loss": 0.6632,
"step": 408
},
{
"epoch": 0.8419969119917653,
"grad_norm": 0.15327829122543335,
"learning_rate": 1.5682688294083594e-06,
"loss": 0.6623,
"step": 409
},
{
"epoch": 0.8440555841482243,
"grad_norm": 0.12894751131534576,
"learning_rate": 1.5537253230699784e-06,
"loss": 0.6586,
"step": 410
},
{
"epoch": 0.8461142563046835,
"grad_norm": 0.13982105255126953,
"learning_rate": 1.5393581319190382e-06,
"loss": 0.6654,
"step": 411
},
{
"epoch": 0.8481729284611426,
"grad_norm": 0.1252405196428299,
"learning_rate": 1.5251678978657464e-06,
"loss": 0.6316,
"step": 412
},
{
"epoch": 0.8502316006176016,
"grad_norm": 0.12443134933710098,
"learning_rate": 1.511155254914065e-06,
"loss": 0.6814,
"step": 413
},
{
"epoch": 0.8522902727740608,
"grad_norm": 0.13635645806789398,
"learning_rate": 1.4973208291333813e-06,
"loss": 0.65,
"step": 414
},
{
"epoch": 0.8543489449305198,
"grad_norm": 0.18179555237293243,
"learning_rate": 1.4836652386305351e-06,
"loss": 0.6496,
"step": 415
},
{
"epoch": 0.8564076170869789,
"grad_norm": 0.1520225703716278,
"learning_rate": 1.4701890935222062e-06,
"loss": 0.6516,
"step": 416
},
{
"epoch": 0.8584662892434379,
"grad_norm": 0.1353698968887329,
"learning_rate": 1.4568929959076512e-06,
"loss": 0.6652,
"step": 417
},
{
"epoch": 0.8605249613998971,
"grad_norm": 0.13589033484458923,
"learning_rate": 1.4437775398418042e-06,
"loss": 0.6541,
"step": 418
},
{
"epoch": 0.8625836335563561,
"grad_norm": 0.1323903501033783,
"learning_rate": 1.4308433113087346e-06,
"loss": 0.6498,
"step": 419
},
{
"epoch": 0.8646423057128152,
"grad_norm": 0.1342097967863083,
"learning_rate": 1.4180908881954668e-06,
"loss": 0.6643,
"step": 420
},
{
"epoch": 0.8667009778692744,
"grad_norm": 0.14252899587154388,
"learning_rate": 1.405520840266159e-06,
"loss": 0.639,
"step": 421
},
{
"epoch": 0.8687596500257334,
"grad_norm": 0.13460814952850342,
"learning_rate": 1.3931337291366488e-06,
"loss": 0.6562,
"step": 422
},
{
"epoch": 0.8708183221821925,
"grad_norm": 0.12760917842388153,
"learning_rate": 1.3809301082493592e-06,
"loss": 0.6512,
"step": 423
},
{
"epoch": 0.8728769943386516,
"grad_norm": 0.14362888038158417,
"learning_rate": 1.3689105228485739e-06,
"loss": 0.6511,
"step": 424
},
{
"epoch": 0.8749356664951107,
"grad_norm": 0.1235094666481018,
"learning_rate": 1.3570755099560701e-06,
"loss": 0.6529,
"step": 425
},
{
"epoch": 0.8769943386515697,
"grad_norm": 0.14431904256343842,
"learning_rate": 1.3454255983471367e-06,
"loss": 0.6494,
"step": 426
},
{
"epoch": 0.8790530108080288,
"grad_norm": 0.1322588473558426,
"learning_rate": 1.3339613085269357e-06,
"loss": 0.6408,
"step": 427
},
{
"epoch": 0.8811116829644879,
"grad_norm": 0.12662683427333832,
"learning_rate": 1.322683152707255e-06,
"loss": 0.6432,
"step": 428
},
{
"epoch": 0.883170355120947,
"grad_norm": 0.14211656153202057,
"learning_rate": 1.3115916347836222e-06,
"loss": 0.6364,
"step": 429
},
{
"epoch": 0.885229027277406,
"grad_norm": 0.13972091674804688,
"learning_rate": 1.3006872503127887e-06,
"loss": 0.6654,
"step": 430
},
{
"epoch": 0.8872876994338652,
"grad_norm": 0.13244371116161346,
"learning_rate": 1.2899704864905922e-06,
"loss": 0.656,
"step": 431
},
{
"epoch": 0.8893463715903243,
"grad_norm": 0.13926206529140472,
"learning_rate": 1.2794418221301842e-06,
"loss": 0.6674,
"step": 432
},
{
"epoch": 0.8914050437467833,
"grad_norm": 0.14457055926322937,
"learning_rate": 1.269101727640644e-06,
"loss": 0.6424,
"step": 433
},
{
"epoch": 0.8934637159032424,
"grad_norm": 0.12490473687648773,
"learning_rate": 1.2589506650059544e-06,
"loss": 0.6623,
"step": 434
},
{
"epoch": 0.8955223880597015,
"grad_norm": 0.1367175132036209,
"learning_rate": 1.248989087764366e-06,
"loss": 0.6541,
"step": 435
},
{
"epoch": 0.8975810602161606,
"grad_norm": 0.1517862230539322,
"learning_rate": 1.2392174409881311e-06,
"loss": 0.649,
"step": 436
},
{
"epoch": 0.8996397323726196,
"grad_norm": 0.11842308938503265,
"learning_rate": 1.229636161263619e-06,
"loss": 0.6481,
"step": 437
},
{
"epoch": 0.9016984045290788,
"grad_norm": 0.1355244219303131,
"learning_rate": 1.2202456766718092e-06,
"loss": 0.6285,
"step": 438
},
{
"epoch": 0.9037570766855378,
"grad_norm": 0.13762575387954712,
"learning_rate": 1.2110464067691666e-06,
"loss": 0.6547,
"step": 439
},
{
"epoch": 0.9058157488419969,
"grad_norm": 0.12868613004684448,
"learning_rate": 1.2020387625688943e-06,
"loss": 0.6812,
"step": 440
},
{
"epoch": 0.9078744209984559,
"grad_norm": 0.14232338964939117,
"learning_rate": 1.1932231465225714e-06,
"loss": 0.6541,
"step": 441
},
{
"epoch": 0.9099330931549151,
"grad_norm": 0.13460494577884674,
"learning_rate": 1.1845999525021723e-06,
"loss": 0.6328,
"step": 442
},
{
"epoch": 0.9119917653113742,
"grad_norm": 0.14562971889972687,
"learning_rate": 1.1761695657824677e-06,
"loss": 0.6581,
"step": 443
},
{
"epoch": 0.9140504374678332,
"grad_norm": 0.13268987834453583,
"learning_rate": 1.1679323630238087e-06,
"loss": 0.6619,
"step": 444
},
{
"epoch": 0.9161091096242924,
"grad_norm": 0.14215055108070374,
"learning_rate": 1.1598887122553061e-06,
"loss": 0.6404,
"step": 445
},
{
"epoch": 0.9181677817807514,
"grad_norm": 0.14430803060531616,
"learning_rate": 1.1520389728583763e-06,
"loss": 0.6501,
"step": 446
},
{
"epoch": 0.9202264539372105,
"grad_norm": 0.15590347349643707,
"learning_rate": 1.1443834955506942e-06,
"loss": 0.657,
"step": 447
},
{
"epoch": 0.9222851260936696,
"grad_norm": 0.13576243817806244,
"learning_rate": 1.1369226223705176e-06,
"loss": 0.6495,
"step": 448
},
{
"epoch": 0.9243437982501287,
"grad_norm": 0.1325463354587555,
"learning_rate": 1.1296566866614067e-06,
"loss": 0.6725,
"step": 449
},
{
"epoch": 0.9264024704065877,
"grad_norm": 0.1399555504322052,
"learning_rate": 1.1225860130573334e-06,
"loss": 0.5978,
"step": 450
},
{
"epoch": 0.9284611425630468,
"grad_norm": 0.11871365457773209,
"learning_rate": 1.1157109174681713e-06,
"loss": 0.6021,
"step": 451
},
{
"epoch": 0.930519814719506,
"grad_norm": 0.12837566435337067,
"learning_rate": 1.109031707065588e-06,
"loss": 0.6716,
"step": 452
},
{
"epoch": 0.932578486875965,
"grad_norm": 0.1489572525024414,
"learning_rate": 1.1025486802693158e-06,
"loss": 0.6626,
"step": 453
},
{
"epoch": 0.9346371590324241,
"grad_norm": 0.12535393238067627,
"learning_rate": 1.0962621267338198e-06,
"loss": 0.6421,
"step": 454
},
{
"epoch": 0.9366958311888832,
"grad_norm": 0.13144055008888245,
"learning_rate": 1.0901723273353599e-06,
"loss": 0.6488,
"step": 455
},
{
"epoch": 0.9387545033453423,
"grad_norm": 0.15396398305892944,
"learning_rate": 1.0842795541594354e-06,
"loss": 0.6482,
"step": 456
},
{
"epoch": 0.9408131755018013,
"grad_norm": 0.14828471839427948,
"learning_rate": 1.078584070488635e-06,
"loss": 0.6516,
"step": 457
},
{
"epoch": 0.9428718476582604,
"grad_norm": 0.1343657374382019,
"learning_rate": 1.0730861307908677e-06,
"loss": 0.6596,
"step": 458
},
{
"epoch": 0.9449305198147195,
"grad_norm": 0.1373317688703537,
"learning_rate": 1.0677859807079994e-06,
"loss": 0.6562,
"step": 459
},
{
"epoch": 0.9469891919711786,
"grad_norm": 0.14464855194091797,
"learning_rate": 1.0626838570448716e-06,
"loss": 0.6652,
"step": 460
},
{
"epoch": 0.9490478641276376,
"grad_norm": 0.14019303023815155,
"learning_rate": 1.057779987758727e-06,
"loss": 0.6602,
"step": 461
},
{
"epoch": 0.9511065362840968,
"grad_norm": 0.1424388289451599,
"learning_rate": 1.0530745919490201e-06,
"loss": 0.6713,
"step": 462
},
{
"epoch": 0.9531652084405559,
"grad_norm": 0.1371728628873825,
"learning_rate": 1.048567879847631e-06,
"loss": 0.6766,
"step": 463
},
{
"epoch": 0.9552238805970149,
"grad_norm": 0.14678040146827698,
"learning_rate": 1.0442600528094722e-06,
"loss": 0.6555,
"step": 464
},
{
"epoch": 0.957282552753474,
"grad_norm": 0.14888082444667816,
"learning_rate": 1.04015130330349e-06,
"loss": 0.6501,
"step": 465
},
{
"epoch": 0.9593412249099331,
"grad_norm": 0.13902142643928528,
"learning_rate": 1.0362418149040673e-06,
"loss": 0.6462,
"step": 466
},
{
"epoch": 0.9613998970663922,
"grad_norm": 0.14032316207885742,
"learning_rate": 1.0325317622828216e-06,
"loss": 0.66,
"step": 467
},
{
"epoch": 0.9634585692228512,
"grad_norm": 0.1258266568183899,
"learning_rate": 1.0290213112007999e-06,
"loss": 0.6103,
"step": 468
},
{
"epoch": 0.9655172413793104,
"grad_norm": 0.14587683975696564,
"learning_rate": 1.0257106185010746e-06,
"loss": 0.6521,
"step": 469
},
{
"epoch": 0.9675759135357694,
"grad_norm": 0.15115003287792206,
"learning_rate": 1.0225998321017314e-06,
"loss": 0.6454,
"step": 470
},
{
"epoch": 0.9696345856922285,
"grad_norm": 0.157597154378891,
"learning_rate": 1.019689090989268e-06,
"loss": 0.6759,
"step": 471
},
{
"epoch": 0.9716932578486875,
"grad_norm": 0.14675654470920563,
"learning_rate": 1.0169785252123765e-06,
"loss": 0.6572,
"step": 472
},
{
"epoch": 0.9737519300051467,
"grad_norm": 0.14471760392189026,
"learning_rate": 1.0144682558761371e-06,
"loss": 0.6398,
"step": 473
},
{
"epoch": 0.9758106021616058,
"grad_norm": 0.12943477928638458,
"learning_rate": 1.0121583951366075e-06,
"loss": 0.6914,
"step": 474
},
{
"epoch": 0.9778692743180648,
"grad_norm": 0.1375255137681961,
"learning_rate": 1.010049046195811e-06,
"loss": 0.6363,
"step": 475
},
{
"epoch": 0.979927946474524,
"grad_norm": 0.13072469830513,
"learning_rate": 1.0081403032971245e-06,
"loss": 0.633,
"step": 476
},
{
"epoch": 0.981986618630983,
"grad_norm": 0.13308100402355194,
"learning_rate": 1.006432251721069e-06,
"loss": 0.643,
"step": 477
},
{
"epoch": 0.9840452907874421,
"grad_norm": 0.15122868120670319,
"learning_rate": 1.0049249677815005e-06,
"loss": 0.656,
"step": 478
},
{
"epoch": 0.9861039629439012,
"grad_norm": 0.12934233248233795,
"learning_rate": 1.0036185188221976e-06,
"loss": 0.6727,
"step": 479
},
{
"epoch": 0.9881626351003603,
"grad_norm": 0.15089304745197296,
"learning_rate": 1.0025129632138545e-06,
"loss": 0.6458,
"step": 480
},
{
"epoch": 0.9902213072568193,
"grad_norm": 0.1326705515384674,
"learning_rate": 1.0016083503514734e-06,
"loss": 0.6371,
"step": 481
},
{
"epoch": 0.9922799794132784,
"grad_norm": 0.13898327946662903,
"learning_rate": 1.0009047206521559e-06,
"loss": 0.6513,
"step": 482
},
{
"epoch": 0.9943386515697376,
"grad_norm": 0.13787615299224854,
"learning_rate": 1.000402105553299e-06,
"loss": 0.6604,
"step": 483
},
{
"epoch": 0.9963973237261966,
"grad_norm": 0.12259743362665176,
"learning_rate": 1.0001005275111895e-06,
"loss": 0.6671,
"step": 484
},
{
"epoch": 0.9984559958826557,
"grad_norm": 0.1266549974679947,
"learning_rate": 1.0000000000000002e-06,
"loss": 0.6372,
"step": 485
},
{
"epoch": 1.002058672156459,
"grad_norm": 0.13590970635414124,
"learning_rate": 5.710476732029584e-06,
"loss": 0.635,
"step": 486
},
{
"epoch": 1.004117344312918,
"grad_norm": 0.14185404777526855,
"learning_rate": 5.695452505043673e-06,
"loss": 0.6325,
"step": 487
},
{
"epoch": 1.0061760164693772,
"grad_norm": 0.1392943561077118,
"learning_rate": 5.680426094901233e-06,
"loss": 0.6399,
"step": 488
},
{
"epoch": 1.0082346886258364,
"grad_norm": 0.13933075964450836,
"learning_rate": 5.665397669443578e-06,
"loss": 0.6378,
"step": 489
},
{
"epoch": 1.0102933607822955,
"grad_norm": 0.1552516222000122,
"learning_rate": 5.650367396534536e-06,
"loss": 0.6406,
"step": 490
},
{
"epoch": 1.0123520329387545,
"grad_norm": 0.13938720524311066,
"learning_rate": 5.6353354440585676e-06,
"loss": 0.6613,
"step": 491
},
{
"epoch": 1.0144107050952136,
"grad_norm": 0.13705240190029144,
"learning_rate": 5.620301979918891e-06,
"loss": 0.6763,
"step": 492
},
{
"epoch": 1.0164693772516726,
"grad_norm": 0.1428154855966568,
"learning_rate": 5.605267172035615e-06,
"loss": 0.6577,
"step": 493
},
{
"epoch": 1.0185280494081317,
"grad_norm": 0.140982523560524,
"learning_rate": 5.590231188343852e-06,
"loss": 0.64,
"step": 494
},
{
"epoch": 1.0205867215645907,
"grad_norm": 0.13976819813251495,
"learning_rate": 5.575194196791854e-06,
"loss": 0.6583,
"step": 495
},
{
"epoch": 1.02264539372105,
"grad_norm": 0.14706793427467346,
"learning_rate": 5.560156365339128e-06,
"loss": 0.6421,
"step": 496
},
{
"epoch": 1.024704065877509,
"grad_norm": 0.15089593827724457,
"learning_rate": 5.545117861954558e-06,
"loss": 0.6454,
"step": 497
},
{
"epoch": 1.0267627380339681,
"grad_norm": 0.14291644096374512,
"learning_rate": 5.530078854614541e-06,
"loss": 0.6612,
"step": 498
},
{
"epoch": 1.0288214101904272,
"grad_norm": 0.1260952353477478,
"learning_rate": 5.515039511301097e-06,
"loss": 0.6513,
"step": 499
},
{
"epoch": 1.0308800823468862,
"grad_norm": 0.14070680737495422,
"learning_rate": 5.500000000000001e-06,
"loss": 0.6398,
"step": 500
},
{
"epoch": 1.0329387545033453,
"grad_norm": 0.14129748940467834,
"learning_rate": 5.484960488698905e-06,
"loss": 0.6323,
"step": 501
},
{
"epoch": 1.0349974266598043,
"grad_norm": 0.15318261086940765,
"learning_rate": 5.469921145385462e-06,
"loss": 0.6362,
"step": 502
},
{
"epoch": 1.0370560988162636,
"grad_norm": 0.14378410577774048,
"learning_rate": 5.4548821380454444e-06,
"loss": 0.6475,
"step": 503
},
{
"epoch": 1.0391147709727226,
"grad_norm": 0.14161469042301178,
"learning_rate": 5.4398436346608726e-06,
"loss": 0.651,
"step": 504
},
{
"epoch": 1.0411734431291817,
"grad_norm": 0.14112524688243866,
"learning_rate": 5.424805803208147e-06,
"loss": 0.6742,
"step": 505
},
{
"epoch": 1.0432321152856407,
"grad_norm": 0.13087144494056702,
"learning_rate": 5.409768811656149e-06,
"loss": 0.6566,
"step": 506
},
{
"epoch": 1.0452907874420998,
"grad_norm": 0.12194062024354935,
"learning_rate": 5.394732827964388e-06,
"loss": 0.659,
"step": 507
},
{
"epoch": 1.0473494595985589,
"grad_norm": 0.14136454463005066,
"learning_rate": 5.379698020081111e-06,
"loss": 0.6352,
"step": 508
},
{
"epoch": 1.0494081317550181,
"grad_norm": 0.12959690392017365,
"learning_rate": 5.364664555941434e-06,
"loss": 0.679,
"step": 509
},
{
"epoch": 1.0514668039114772,
"grad_norm": 0.1391456127166748,
"learning_rate": 5.349632603465467e-06,
"loss": 0.6519,
"step": 510
},
{
"epoch": 1.0535254760679362,
"grad_norm": 0.12429799884557724,
"learning_rate": 5.334602330556423e-06,
"loss": 0.651,
"step": 511
},
{
"epoch": 1.0555841482243953,
"grad_norm": 0.1405404508113861,
"learning_rate": 5.3195739050987695e-06,
"loss": 0.6595,
"step": 512
},
{
"epoch": 1.0576428203808543,
"grad_norm": 0.13673624396324158,
"learning_rate": 5.304547494956329e-06,
"loss": 0.6732,
"step": 513
},
{
"epoch": 1.0597014925373134,
"grad_norm": 0.13902150094509125,
"learning_rate": 5.289523267970417e-06,
"loss": 0.6598,
"step": 514
},
{
"epoch": 1.0617601646937724,
"grad_norm": 0.15974578261375427,
"learning_rate": 5.274501391957964e-06,
"loss": 0.6682,
"step": 515
},
{
"epoch": 1.0638188368502317,
"grad_norm": 0.14193038642406464,
"learning_rate": 5.259482034709637e-06,
"loss": 0.6586,
"step": 516
},
{
"epoch": 1.0658775090066908,
"grad_norm": 0.1465134620666504,
"learning_rate": 5.244465363987972e-06,
"loss": 0.646,
"step": 517
},
{
"epoch": 1.0679361811631498,
"grad_norm": 0.13889746367931366,
"learning_rate": 5.229451547525499e-06,
"loss": 0.6395,
"step": 518
},
{
"epoch": 1.0699948533196089,
"grad_norm": 0.14239132404327393,
"learning_rate": 5.214440753022863e-06,
"loss": 0.6753,
"step": 519
},
{
"epoch": 1.072053525476068,
"grad_norm": 0.16652631759643555,
"learning_rate": 5.199433148146954e-06,
"loss": 0.6524,
"step": 520
},
{
"epoch": 1.074112197632527,
"grad_norm": 0.1389884054660797,
"learning_rate": 5.184428900529039e-06,
"loss": 0.635,
"step": 521
},
{
"epoch": 1.076170869788986,
"grad_norm": 0.13372258841991425,
"learning_rate": 5.169428177762882e-06,
"loss": 0.6306,
"step": 522
},
{
"epoch": 1.0782295419454453,
"grad_norm": 0.14761124551296234,
"learning_rate": 5.154431147402874e-06,
"loss": 0.6519,
"step": 523
},
{
"epoch": 1.0802882141019043,
"grad_norm": 0.14540445804595947,
"learning_rate": 5.139437976962168e-06,
"loss": 0.6409,
"step": 524
},
{
"epoch": 1.0823468862583634,
"grad_norm": 0.15441715717315674,
"learning_rate": 5.124448833910797e-06,
"loss": 0.6712,
"step": 525
},
{
"epoch": 1.0844055584148224,
"grad_norm": 0.1473764181137085,
"learning_rate": 5.109463885673812e-06,
"loss": 0.6775,
"step": 526
},
{
"epoch": 1.0864642305712815,
"grad_norm": 0.1535579264163971,
"learning_rate": 5.094483299629409e-06,
"loss": 0.6507,
"step": 527
},
{
"epoch": 1.0885229027277405,
"grad_norm": 0.1290774792432785,
"learning_rate": 5.079507243107057e-06,
"loss": 0.6693,
"step": 528
},
{
"epoch": 1.0905815748841996,
"grad_norm": 0.1406625360250473,
"learning_rate": 5.0645358833856365e-06,
"loss": 0.6608,
"step": 529
},
{
"epoch": 1.0926402470406589,
"grad_norm": 0.14666709303855896,
"learning_rate": 5.0495693876915574e-06,
"loss": 0.6594,
"step": 530
},
{
"epoch": 1.094698919197118,
"grad_norm": 0.13475078344345093,
"learning_rate": 5.034607923196911e-06,
"loss": 0.6665,
"step": 531
},
{
"epoch": 1.096757591353577,
"grad_norm": 0.14045949280261993,
"learning_rate": 5.019651657017578e-06,
"loss": 0.6517,
"step": 532
},
{
"epoch": 1.098816263510036,
"grad_norm": 0.1452544629573822,
"learning_rate": 5.004700756211388e-06,
"loss": 0.6355,
"step": 533
},
{
"epoch": 1.100874935666495,
"grad_norm": 0.1488482654094696,
"learning_rate": 4.989755387776231e-06,
"loss": 0.6488,
"step": 534
},
{
"epoch": 1.1029336078229541,
"grad_norm": 0.11999811977148056,
"learning_rate": 4.974815718648207e-06,
"loss": 0.673,
"step": 535
},
{
"epoch": 1.1049922799794132,
"grad_norm": 0.1541755348443985,
"learning_rate": 4.959881915699757e-06,
"loss": 0.6715,
"step": 536
},
{
"epoch": 1.1070509521358725,
"grad_norm": 0.13446086645126343,
"learning_rate": 4.944954145737794e-06,
"loss": 0.653,
"step": 537
},
{
"epoch": 1.1091096242923315,
"grad_norm": 0.1450193077325821,
"learning_rate": 4.930032575501845e-06,
"loss": 0.6562,
"step": 538
},
{
"epoch": 1.1111682964487906,
"grad_norm": 0.14609019458293915,
"learning_rate": 4.915117371662189e-06,
"loss": 0.6663,
"step": 539
},
{
"epoch": 1.1132269686052496,
"grad_norm": 0.139775812625885,
"learning_rate": 4.9002087008179966e-06,
"loss": 0.6191,
"step": 540
},
{
"epoch": 1.1152856407617087,
"grad_norm": 0.15244266390800476,
"learning_rate": 4.885306729495459e-06,
"loss": 0.6603,
"step": 541
},
{
"epoch": 1.1173443129181677,
"grad_norm": 0.12155251204967499,
"learning_rate": 4.870411624145942e-06,
"loss": 0.6482,
"step": 542
},
{
"epoch": 1.1194029850746268,
"grad_norm": 0.13439474999904633,
"learning_rate": 4.855523551144119e-06,
"loss": 0.6215,
"step": 543
},
{
"epoch": 1.121461657231086,
"grad_norm": 0.1316884607076645,
"learning_rate": 4.840642676786111e-06,
"loss": 0.6374,
"step": 544
},
{
"epoch": 1.123520329387545,
"grad_norm": 0.15030314028263092,
"learning_rate": 4.825769167287634e-06,
"loss": 0.6429,
"step": 545
},
{
"epoch": 1.1255790015440041,
"grad_norm": 0.13496868312358856,
"learning_rate": 4.81090318878214e-06,
"loss": 0.6548,
"step": 546
},
{
"epoch": 1.1276376737004632,
"grad_norm": 0.13917021453380585,
"learning_rate": 4.796044907318961e-06,
"loss": 0.6398,
"step": 547
},
{
"epoch": 1.1296963458569222,
"grad_norm": 0.1306729018688202,
"learning_rate": 4.781194488861459e-06,
"loss": 0.6619,
"step": 548
},
{
"epoch": 1.1317550180133813,
"grad_norm": 0.12746194005012512,
"learning_rate": 4.7663520992851585e-06,
"loss": 0.6365,
"step": 549
},
{
"epoch": 1.1338136901698403,
"grad_norm": 0.13715361058712006,
"learning_rate": 4.751517904375915e-06,
"loss": 0.6825,
"step": 550
},
{
"epoch": 1.1358723623262996,
"grad_norm": 0.13766644895076752,
"learning_rate": 4.736692069828044e-06,
"loss": 0.6392,
"step": 551
},
{
"epoch": 1.1379310344827587,
"grad_norm": 0.13901367783546448,
"learning_rate": 4.721874761242482e-06,
"loss": 0.6571,
"step": 552
},
{
"epoch": 1.1399897066392177,
"grad_norm": 0.14460472762584686,
"learning_rate": 4.7070661441249266e-06,
"loss": 0.651,
"step": 553
},
{
"epoch": 1.1420483787956768,
"grad_norm": 0.14462216198444366,
"learning_rate": 4.692266383884e-06,
"loss": 0.6874,
"step": 554
},
{
"epoch": 1.1441070509521358,
"grad_norm": 0.13282893598079681,
"learning_rate": 4.6774756458293945e-06,
"loss": 0.6513,
"step": 555
},
{
"epoch": 1.1461657231085949,
"grad_norm": 0.13160084187984467,
"learning_rate": 4.662694095170023e-06,
"loss": 0.6778,
"step": 556
},
{
"epoch": 1.148224395265054,
"grad_norm": 0.14130820333957672,
"learning_rate": 4.64792189701218e-06,
"loss": 0.6687,
"step": 557
},
{
"epoch": 1.1502830674215132,
"grad_norm": 0.13801245391368866,
"learning_rate": 4.633159216357696e-06,
"loss": 0.6325,
"step": 558
},
{
"epoch": 1.1523417395779723,
"grad_norm": 0.14570364356040955,
"learning_rate": 4.618406218102093e-06,
"loss": 0.6655,
"step": 559
},
{
"epoch": 1.1544004117344313,
"grad_norm": 0.14590254426002502,
"learning_rate": 4.603663067032738e-06,
"loss": 0.6625,
"step": 560
},
{
"epoch": 1.1564590838908904,
"grad_norm": 0.12933455407619476,
"learning_rate": 4.588929927827015e-06,
"loss": 0.6512,
"step": 561
},
{
"epoch": 1.1585177560473494,
"grad_norm": 0.1645902544260025,
"learning_rate": 4.574206965050472e-06,
"loss": 0.6423,
"step": 562
},
{
"epoch": 1.1605764282038085,
"grad_norm": 0.14364786446094513,
"learning_rate": 4.559494343154993e-06,
"loss": 0.6481,
"step": 563
},
{
"epoch": 1.1626351003602675,
"grad_norm": 0.13910852372646332,
"learning_rate": 4.544792226476954e-06,
"loss": 0.6637,
"step": 564
},
{
"epoch": 1.1646937725167268,
"grad_norm": 0.14113080501556396,
"learning_rate": 4.53010077923539e-06,
"loss": 0.643,
"step": 565
},
{
"epoch": 1.1667524446731858,
"grad_norm": 0.13962271809577942,
"learning_rate": 4.515420165530162e-06,
"loss": 0.6271,
"step": 566
},
{
"epoch": 1.168811116829645,
"grad_norm": 0.15055330097675323,
"learning_rate": 4.500750549340128e-06,
"loss": 0.6424,
"step": 567
},
{
"epoch": 1.170869788986104,
"grad_norm": 0.13697674870491028,
"learning_rate": 4.486092094521296e-06,
"loss": 0.6274,
"step": 568
},
{
"epoch": 1.172928461142563,
"grad_norm": 0.13252243399620056,
"learning_rate": 4.471444964805013e-06,
"loss": 0.6423,
"step": 569
},
{
"epoch": 1.174987133299022,
"grad_norm": 0.13780078291893005,
"learning_rate": 4.456809323796123e-06,
"loss": 0.6397,
"step": 570
},
{
"epoch": 1.177045805455481,
"grad_norm": 0.141806960105896,
"learning_rate": 4.442185334971148e-06,
"loss": 0.6563,
"step": 571
},
{
"epoch": 1.1791044776119404,
"grad_norm": 0.14122751355171204,
"learning_rate": 4.427573161676452e-06,
"loss": 0.645,
"step": 572
},
{
"epoch": 1.1811631497683994,
"grad_norm": 0.14296391606330872,
"learning_rate": 4.412972967126425e-06,
"loss": 0.6743,
"step": 573
},
{
"epoch": 1.1832218219248585,
"grad_norm": 0.14250048995018005,
"learning_rate": 4.398384914401663e-06,
"loss": 0.6527,
"step": 574
},
{
"epoch": 1.1852804940813175,
"grad_norm": 0.15447309613227844,
"learning_rate": 4.383809166447131e-06,
"loss": 0.6253,
"step": 575
},
{
"epoch": 1.1873391662377766,
"grad_norm": 0.13448724150657654,
"learning_rate": 4.36924588607036e-06,
"loss": 0.6809,
"step": 576
},
{
"epoch": 1.1893978383942356,
"grad_norm": 0.1386733055114746,
"learning_rate": 4.354695235939618e-06,
"loss": 0.6613,
"step": 577
},
{
"epoch": 1.1914565105506947,
"grad_norm": 0.13458538055419922,
"learning_rate": 4.3401573785820986e-06,
"loss": 0.6525,
"step": 578
},
{
"epoch": 1.193515182707154,
"grad_norm": 0.14328433573246002,
"learning_rate": 4.325632476382098e-06,
"loss": 0.659,
"step": 579
},
{
"epoch": 1.195573854863613,
"grad_norm": 0.15153314173221588,
"learning_rate": 4.311120691579214e-06,
"loss": 0.6261,
"step": 580
},
{
"epoch": 1.197632527020072,
"grad_norm": 0.136255145072937,
"learning_rate": 4.29662218626652e-06,
"loss": 0.6272,
"step": 581
},
{
"epoch": 1.199691199176531,
"grad_norm": 0.15096060931682587,
"learning_rate": 4.282137122388765e-06,
"loss": 0.6685,
"step": 582
},
{
"epoch": 1.2017498713329902,
"grad_norm": 0.14815060794353485,
"learning_rate": 4.267665661740559e-06,
"loss": 0.5883,
"step": 583
},
{
"epoch": 1.2038085434894492,
"grad_norm": 0.13184097409248352,
"learning_rate": 4.253207965964565e-06,
"loss": 0.6463,
"step": 584
},
{
"epoch": 1.2058672156459085,
"grad_norm": 0.1358867883682251,
"learning_rate": 4.238764196549697e-06,
"loss": 0.6438,
"step": 585
},
{
"epoch": 1.2079258878023675,
"grad_norm": 0.15228141844272614,
"learning_rate": 4.2243345148293215e-06,
"loss": 0.6477,
"step": 586
},
{
"epoch": 1.2099845599588266,
"grad_norm": 0.14477193355560303,
"learning_rate": 4.209919081979435e-06,
"loss": 0.6794,
"step": 587
},
{
"epoch": 1.2120432321152856,
"grad_norm": 0.15588507056236267,
"learning_rate": 4.195518059016891e-06,
"loss": 0.6412,
"step": 588
},
{
"epoch": 1.2141019042717447,
"grad_norm": 0.13339221477508545,
"learning_rate": 4.181131606797582e-06,
"loss": 0.6358,
"step": 589
},
{
"epoch": 1.2161605764282037,
"grad_norm": 0.1495896428823471,
"learning_rate": 4.166759886014649e-06,
"loss": 0.6815,
"step": 590
},
{
"epoch": 1.218219248584663,
"grad_norm": 0.13854841887950897,
"learning_rate": 4.152403057196686e-06,
"loss": 0.656,
"step": 591
},
{
"epoch": 1.220277920741122,
"grad_norm": 0.14208003878593445,
"learning_rate": 4.13806128070595e-06,
"loss": 0.6522,
"step": 592
},
{
"epoch": 1.2223365928975811,
"grad_norm": 0.16169045865535736,
"learning_rate": 4.123734716736566e-06,
"loss": 0.6577,
"step": 593
},
{
"epoch": 1.2243952650540402,
"grad_norm": 0.13855846226215363,
"learning_rate": 4.109423525312738e-06,
"loss": 0.6519,
"step": 594
},
{
"epoch": 1.2264539372104992,
"grad_norm": 0.13011138141155243,
"learning_rate": 4.095127866286962e-06,
"loss": 0.6372,
"step": 595
},
{
"epoch": 1.2285126093669583,
"grad_norm": 0.1390632688999176,
"learning_rate": 4.080847899338244e-06,
"loss": 0.6259,
"step": 596
},
{
"epoch": 1.2305712815234173,
"grad_norm": 0.15639440715312958,
"learning_rate": 4.066583783970311e-06,
"loss": 0.641,
"step": 597
},
{
"epoch": 1.2326299536798766,
"grad_norm": 0.12465671449899673,
"learning_rate": 4.052335679509831e-06,
"loss": 0.6595,
"step": 598
},
{
"epoch": 1.2346886258363357,
"grad_norm": 0.12938636541366577,
"learning_rate": 4.038103745104635e-06,
"loss": 0.6462,
"step": 599
},
{
"epoch": 1.2367472979927947,
"grad_norm": 0.13924743235111237,
"learning_rate": 4.02388813972194e-06,
"loss": 0.6435,
"step": 600
},
{
"epoch": 1.2388059701492538,
"grad_norm": 0.13049103319644928,
"learning_rate": 4.009689022146569e-06,
"loss": 0.6194,
"step": 601
},
{
"epoch": 1.2408646423057128,
"grad_norm": 0.13774777948856354,
"learning_rate": 3.995506550979182e-06,
"loss": 0.6673,
"step": 602
},
{
"epoch": 1.2429233144621719,
"grad_norm": 0.14476458728313446,
"learning_rate": 3.981340884634504e-06,
"loss": 0.6524,
"step": 603
},
{
"epoch": 1.244981986618631,
"grad_norm": 0.15251493453979492,
"learning_rate": 3.967192181339549e-06,
"loss": 0.6518,
"step": 604
},
{
"epoch": 1.2470406587750902,
"grad_norm": 0.13875854015350342,
"learning_rate": 3.953060599131867e-06,
"loss": 0.6422,
"step": 605
},
{
"epoch": 1.2490993309315492,
"grad_norm": 0.13512182235717773,
"learning_rate": 3.938946295857758e-06,
"loss": 0.6467,
"step": 606
},
{
"epoch": 1.2511580030880083,
"grad_norm": 0.12567898631095886,
"learning_rate": 3.924849429170531e-06,
"loss": 0.6521,
"step": 607
},
{
"epoch": 1.2532166752444673,
"grad_norm": 0.13632598519325256,
"learning_rate": 3.910770156528726e-06,
"loss": 0.6442,
"step": 608
},
{
"epoch": 1.2552753474009264,
"grad_norm": 0.13966801762580872,
"learning_rate": 3.8967086351943665e-06,
"loss": 0.6642,
"step": 609
},
{
"epoch": 1.2573340195573854,
"grad_norm": 0.13744676113128662,
"learning_rate": 3.882665022231193e-06,
"loss": 0.6637,
"step": 610
},
{
"epoch": 1.2593926917138445,
"grad_norm": 0.1319979578256607,
"learning_rate": 3.868639474502918e-06,
"loss": 0.6473,
"step": 611
},
{
"epoch": 1.2614513638703038,
"grad_norm": 0.13453079760074615,
"learning_rate": 3.8546321486714714e-06,
"loss": 0.6675,
"step": 612
},
{
"epoch": 1.2635100360267628,
"grad_norm": 0.1409771889448166,
"learning_rate": 3.840643201195241e-06,
"loss": 0.6544,
"step": 613
},
{
"epoch": 1.2655687081832219,
"grad_norm": 0.14161136746406555,
"learning_rate": 3.826672788327341e-06,
"loss": 0.6815,
"step": 614
},
{
"epoch": 1.267627380339681,
"grad_norm": 0.1445418894290924,
"learning_rate": 3.812721066113856e-06,
"loss": 0.6496,
"step": 615
},
{
"epoch": 1.26968605249614,
"grad_norm": 0.13115796446800232,
"learning_rate": 3.798788190392099e-06,
"loss": 0.6662,
"step": 616
},
{
"epoch": 1.271744724652599,
"grad_norm": 0.13190345466136932,
"learning_rate": 3.7848743167888735e-06,
"loss": 0.6495,
"step": 617
},
{
"epoch": 1.273803396809058,
"grad_norm": 0.13790887594223022,
"learning_rate": 3.770979600718734e-06,
"loss": 0.6783,
"step": 618
},
{
"epoch": 1.2758620689655173,
"grad_norm": 0.14959625899791718,
"learning_rate": 3.7571041973822497e-06,
"loss": 0.6354,
"step": 619
},
{
"epoch": 1.2779207411219764,
"grad_norm": 0.14615973830223083,
"learning_rate": 3.743248261764274e-06,
"loss": 0.6663,
"step": 620
},
{
"epoch": 1.2799794132784355,
"grad_norm": 0.1481384038925171,
"learning_rate": 3.7294119486322054e-06,
"loss": 0.709,
"step": 621
},
{
"epoch": 1.2820380854348945,
"grad_norm": 0.13109134137630463,
"learning_rate": 3.7155954125342685e-06,
"loss": 0.6524,
"step": 622
},
{
"epoch": 1.2840967575913536,
"grad_norm": 0.15184319019317627,
"learning_rate": 3.7017988077977796e-06,
"loss": 0.6863,
"step": 623
},
{
"epoch": 1.2861554297478126,
"grad_norm": 0.13635946810245514,
"learning_rate": 3.688022288527433e-06,
"loss": 0.6599,
"step": 624
},
{
"epoch": 1.2882141019042717,
"grad_norm": 0.13947440683841705,
"learning_rate": 3.6742660086035644e-06,
"loss": 0.6553,
"step": 625
},
{
"epoch": 1.290272774060731,
"grad_norm": 0.12968918681144714,
"learning_rate": 3.6605301216804477e-06,
"loss": 0.6342,
"step": 626
},
{
"epoch": 1.29233144621719,
"grad_norm": 0.13788466155529022,
"learning_rate": 3.64681478118457e-06,
"loss": 0.6441,
"step": 627
},
{
"epoch": 1.294390118373649,
"grad_norm": 0.1530154049396515,
"learning_rate": 3.633120140312919e-06,
"loss": 0.6798,
"step": 628
},
{
"epoch": 1.296448790530108,
"grad_norm": 0.14195966720581055,
"learning_rate": 3.619446352031273e-06,
"loss": 0.6628,
"step": 629
},
{
"epoch": 1.2985074626865671,
"grad_norm": 0.16132400929927826,
"learning_rate": 3.605793569072493e-06,
"loss": 0.6335,
"step": 630
},
{
"epoch": 1.3005661348430262,
"grad_norm": 0.15894006192684174,
"learning_rate": 3.5921619439348167e-06,
"loss": 0.6573,
"step": 631
},
{
"epoch": 1.3026248069994852,
"grad_norm": 0.1499107927083969,
"learning_rate": 3.578551628880148e-06,
"loss": 0.6414,
"step": 632
},
{
"epoch": 1.3046834791559445,
"grad_norm": 0.14822602272033691,
"learning_rate": 3.5649627759323714e-06,
"loss": 0.6327,
"step": 633
},
{
"epoch": 1.3067421513124036,
"grad_norm": 0.12993744015693665,
"learning_rate": 3.551395536875637e-06,
"loss": 0.6677,
"step": 634
},
{
"epoch": 1.3088008234688626,
"grad_norm": 0.13447578251361847,
"learning_rate": 3.537850063252679e-06,
"loss": 0.663,
"step": 635
},
{
"epoch": 1.3108594956253217,
"grad_norm": 0.1351039707660675,
"learning_rate": 3.5243265063631125e-06,
"loss": 0.658,
"step": 636
},
{
"epoch": 1.3129181677817807,
"grad_norm": 0.14985619485378265,
"learning_rate": 3.510825017261752e-06,
"loss": 0.6628,
"step": 637
},
{
"epoch": 1.3149768399382398,
"grad_norm": 0.13566601276397705,
"learning_rate": 3.4973457467569155e-06,
"loss": 0.6628,
"step": 638
},
{
"epoch": 1.3170355120946988,
"grad_norm": 0.1341661512851715,
"learning_rate": 3.4838888454087517e-06,
"loss": 0.6601,
"step": 639
},
{
"epoch": 1.319094184251158,
"grad_norm": 0.14291465282440186,
"learning_rate": 3.4704544635275416e-06,
"loss": 0.6678,
"step": 640
},
{
"epoch": 1.3211528564076171,
"grad_norm": 0.14012368023395538,
"learning_rate": 3.45704275117204e-06,
"loss": 0.6644,
"step": 641
},
{
"epoch": 1.3232115285640762,
"grad_norm": 0.15221066772937775,
"learning_rate": 3.4436538581477796e-06,
"loss": 0.6909,
"step": 642
},
{
"epoch": 1.3252702007205353,
"grad_norm": 0.13185715675354004,
"learning_rate": 3.430287934005414e-06,
"loss": 0.6618,
"step": 643
},
{
"epoch": 1.3273288728769943,
"grad_norm": 0.12956033647060394,
"learning_rate": 3.4169451280390343e-06,
"loss": 0.6422,
"step": 644
},
{
"epoch": 1.3293875450334534,
"grad_norm": 0.13954877853393555,
"learning_rate": 3.403625589284509e-06,
"loss": 0.6683,
"step": 645
},
{
"epoch": 1.3314462171899124,
"grad_norm": 0.13885624706745148,
"learning_rate": 3.3903294665178204e-06,
"loss": 0.673,
"step": 646
},
{
"epoch": 1.3335048893463717,
"grad_norm": 0.13849471509456635,
"learning_rate": 3.3770569082533943e-06,
"loss": 0.6436,
"step": 647
},
{
"epoch": 1.3355635615028307,
"grad_norm": 0.1327974945306778,
"learning_rate": 3.363808062742455e-06,
"loss": 0.6658,
"step": 648
},
{
"epoch": 1.3376222336592898,
"grad_norm": 0.13802169263362885,
"learning_rate": 3.350583077971352e-06,
"loss": 0.6523,
"step": 649
},
{
"epoch": 1.3396809058157488,
"grad_norm": 0.12908506393432617,
"learning_rate": 3.3373821016599236e-06,
"loss": 0.627,
"step": 650
},
{
"epoch": 1.3417395779722079,
"grad_norm": 0.13945898413658142,
"learning_rate": 3.3242052812598353e-06,
"loss": 0.6388,
"step": 651
},
{
"epoch": 1.343798250128667,
"grad_norm": 0.13595330715179443,
"learning_rate": 3.3110527639529356e-06,
"loss": 0.6333,
"step": 652
},
{
"epoch": 1.345856922285126,
"grad_norm": 0.14493203163146973,
"learning_rate": 3.297924696649619e-06,
"loss": 0.6817,
"step": 653
},
{
"epoch": 1.3479155944415853,
"grad_norm": 0.14481669664382935,
"learning_rate": 3.2848212259871763e-06,
"loss": 0.6535,
"step": 654
},
{
"epoch": 1.3499742665980443,
"grad_norm": 0.16757646203041077,
"learning_rate": 3.2717424983281598e-06,
"loss": 0.6417,
"step": 655
},
{
"epoch": 1.3520329387545034,
"grad_norm": 0.16268573701381683,
"learning_rate": 3.2586886597587466e-06,
"loss": 0.6621,
"step": 656
},
{
"epoch": 1.3540916109109624,
"grad_norm": 0.1454005092382431,
"learning_rate": 3.2456598560871133e-06,
"loss": 0.66,
"step": 657
},
{
"epoch": 1.3561502830674215,
"grad_norm": 0.12595096230506897,
"learning_rate": 3.2326562328418e-06,
"loss": 0.6469,
"step": 658
},
{
"epoch": 1.3582089552238805,
"grad_norm": 0.14004173874855042,
"learning_rate": 3.2196779352700857e-06,
"loss": 0.5994,
"step": 659
},
{
"epoch": 1.3602676273803396,
"grad_norm": 0.13839490711688995,
"learning_rate": 3.2067251083363714e-06,
"loss": 0.6671,
"step": 660
},
{
"epoch": 1.3623262995367988,
"grad_norm": 0.13650719821453094,
"learning_rate": 3.1937978967205585e-06,
"loss": 0.6521,
"step": 661
},
{
"epoch": 1.364384971693258,
"grad_norm": 0.14337030053138733,
"learning_rate": 3.180896444816427e-06,
"loss": 0.647,
"step": 662
},
{
"epoch": 1.366443643849717,
"grad_norm": 0.1445995718240738,
"learning_rate": 3.168020896730028e-06,
"loss": 0.6534,
"step": 663
},
{
"epoch": 1.368502316006176,
"grad_norm": 0.14641247689723969,
"learning_rate": 3.155171396278078e-06,
"loss": 0.6362,
"step": 664
},
{
"epoch": 1.370560988162635,
"grad_norm": 0.13869577646255493,
"learning_rate": 3.142348086986342e-06,
"loss": 0.6706,
"step": 665
},
{
"epoch": 1.372619660319094,
"grad_norm": 0.1305873841047287,
"learning_rate": 3.1295511120880382e-06,
"loss": 0.651,
"step": 666
},
{
"epoch": 1.3746783324755532,
"grad_norm": 0.12877009809017181,
"learning_rate": 3.116780614522238e-06,
"loss": 0.6582,
"step": 667
},
{
"epoch": 1.3767370046320124,
"grad_norm": 0.13101081550121307,
"learning_rate": 3.104036736932268e-06,
"loss": 0.6499,
"step": 668
},
{
"epoch": 1.3787956767884715,
"grad_norm": 0.1467534303665161,
"learning_rate": 3.0913196216641105e-06,
"loss": 0.6344,
"step": 669
},
{
"epoch": 1.3808543489449305,
"grad_norm": 0.1271016001701355,
"learning_rate": 3.078629410764824e-06,
"loss": 0.6807,
"step": 670
},
{
"epoch": 1.3829130211013896,
"grad_norm": 0.14458723366260529,
"learning_rate": 3.0659662459809525e-06,
"loss": 0.6496,
"step": 671
},
{
"epoch": 1.3849716932578486,
"grad_norm": 0.13007880747318268,
"learning_rate": 3.0533302687569368e-06,
"loss": 0.6656,
"step": 672
},
{
"epoch": 1.387030365414308,
"grad_norm": 0.13446252048015594,
"learning_rate": 3.040721620233546e-06,
"loss": 0.6453,
"step": 673
},
{
"epoch": 1.3890890375707667,
"grad_norm": 0.13023251295089722,
"learning_rate": 3.0281404412462866e-06,
"loss": 0.6548,
"step": 674
},
{
"epoch": 1.391147709727226,
"grad_norm": 0.16312147676944733,
"learning_rate": 3.015586872323846e-06,
"loss": 0.6588,
"step": 675
},
{
"epoch": 1.393206381883685,
"grad_norm": 0.1303606480360031,
"learning_rate": 3.0030610536865047e-06,
"loss": 0.6561,
"step": 676
},
{
"epoch": 1.3952650540401441,
"grad_norm": 0.1296800971031189,
"learning_rate": 2.9905631252445864e-06,
"loss": 0.6457,
"step": 677
},
{
"epoch": 1.3973237261966032,
"grad_norm": 0.13874535262584686,
"learning_rate": 2.978093226596884e-06,
"loss": 0.6561,
"step": 678
},
{
"epoch": 1.3993823983530622,
"grad_norm": 0.14332440495491028,
"learning_rate": 2.965651497029108e-06,
"loss": 0.6723,
"step": 679
},
{
"epoch": 1.4014410705095215,
"grad_norm": 0.15567727386951447,
"learning_rate": 2.953238075512321e-06,
"loss": 0.6654,
"step": 680
},
{
"epoch": 1.4034997426659803,
"grad_norm": 0.1367722749710083,
"learning_rate": 2.940853100701402e-06,
"loss": 0.6593,
"step": 681
},
{
"epoch": 1.4055584148224396,
"grad_norm": 0.13934288918972015,
"learning_rate": 2.928496710933477e-06,
"loss": 0.6909,
"step": 682
},
{
"epoch": 1.4076170869788986,
"grad_norm": 0.1457984745502472,
"learning_rate": 2.916169044226387e-06,
"loss": 0.6477,
"step": 683
},
{
"epoch": 1.4096757591353577,
"grad_norm": 0.13270024955272675,
"learning_rate": 2.9038702382771476e-06,
"loss": 0.6588,
"step": 684
},
{
"epoch": 1.4117344312918167,
"grad_norm": 0.13029582798480988,
"learning_rate": 2.891600430460402e-06,
"loss": 0.6564,
"step": 685
},
{
"epoch": 1.4137931034482758,
"grad_norm": 0.14011262357234955,
"learning_rate": 2.879359757826895e-06,
"loss": 0.6368,
"step": 686
},
{
"epoch": 1.415851775604735,
"grad_norm": 0.14998824894428253,
"learning_rate": 2.8671483571019337e-06,
"loss": 0.629,
"step": 687
},
{
"epoch": 1.417910447761194,
"grad_norm": 0.14513614773750305,
"learning_rate": 2.854966364683872e-06,
"loss": 0.656,
"step": 688
},
{
"epoch": 1.4199691199176532,
"grad_norm": 0.1355086714029312,
"learning_rate": 2.842813916642574e-06,
"loss": 0.6649,
"step": 689
},
{
"epoch": 1.4220277920741122,
"grad_norm": 0.143692284822464,
"learning_rate": 2.8306911487179023e-06,
"loss": 0.64,
"step": 690
},
{
"epoch": 1.4240864642305713,
"grad_norm": 0.13343499600887299,
"learning_rate": 2.818598196318201e-06,
"loss": 0.6644,
"step": 691
},
{
"epoch": 1.4261451363870303,
"grad_norm": 0.14596882462501526,
"learning_rate": 2.8065351945187837e-06,
"loss": 0.6387,
"step": 692
},
{
"epoch": 1.4282038085434894,
"grad_norm": 0.1412520557641983,
"learning_rate": 2.7945022780604185e-06,
"loss": 0.6451,
"step": 693
},
{
"epoch": 1.4302624806999487,
"grad_norm": 0.12341731041669846,
"learning_rate": 2.7824995813478295e-06,
"loss": 0.6593,
"step": 694
},
{
"epoch": 1.4323211528564075,
"grad_norm": 0.1608169823884964,
"learning_rate": 2.7705272384481972e-06,
"loss": 0.6376,
"step": 695
},
{
"epoch": 1.4343798250128668,
"grad_norm": 0.13672709465026855,
"learning_rate": 2.7585853830896527e-06,
"loss": 0.6308,
"step": 696
},
{
"epoch": 1.4364384971693258,
"grad_norm": 0.13107231259346008,
"learning_rate": 2.746674148659788e-06,
"loss": 0.6537,
"step": 697
},
{
"epoch": 1.4384971693257849,
"grad_norm": 0.1523398607969284,
"learning_rate": 2.734793668204172e-06,
"loss": 0.6627,
"step": 698
},
{
"epoch": 1.440555841482244,
"grad_norm": 0.13535884022712708,
"learning_rate": 2.722944074424858e-06,
"loss": 0.6389,
"step": 699
},
{
"epoch": 1.442614513638703,
"grad_norm": 0.15502335131168365,
"learning_rate": 2.7111254996788995e-06,
"loss": 0.6563,
"step": 700
},
{
"epoch": 1.4446731857951622,
"grad_norm": 0.14101667702198029,
"learning_rate": 2.699338075976875e-06,
"loss": 0.6613,
"step": 701
},
{
"epoch": 1.446731857951621,
"grad_norm": 0.12796726822853088,
"learning_rate": 2.687581934981419e-06,
"loss": 0.6556,
"step": 702
},
{
"epoch": 1.4487905301080803,
"grad_norm": 0.1300426423549652,
"learning_rate": 2.6758572080057398e-06,
"loss": 0.6375,
"step": 703
},
{
"epoch": 1.4508492022645394,
"grad_norm": 0.14779579639434814,
"learning_rate": 2.664164026012161e-06,
"loss": 0.6371,
"step": 704
},
{
"epoch": 1.4529078744209984,
"grad_norm": 0.14179164171218872,
"learning_rate": 2.652502519610657e-06,
"loss": 0.6609,
"step": 705
},
{
"epoch": 1.4549665465774575,
"grad_norm": 0.1387166529893875,
"learning_rate": 2.640872819057394e-06,
"loss": 0.6509,
"step": 706
},
{
"epoch": 1.4570252187339165,
"grad_norm": 0.13394972681999207,
"learning_rate": 2.629275054253274e-06,
"loss": 0.6236,
"step": 707
},
{
"epoch": 1.4590838908903758,
"grad_norm": 0.16035096347332,
"learning_rate": 2.6177093547424826e-06,
"loss": 0.6318,
"step": 708
},
{
"epoch": 1.4611425630468349,
"grad_norm": 0.14260070025920868,
"learning_rate": 2.606175849711048e-06,
"loss": 0.6625,
"step": 709
},
{
"epoch": 1.463201235203294,
"grad_norm": 0.1396186351776123,
"learning_rate": 2.5946746679853894e-06,
"loss": 0.6741,
"step": 710
},
{
"epoch": 1.465259907359753,
"grad_norm": 0.14683037996292114,
"learning_rate": 2.583205938030888e-06,
"loss": 0.6482,
"step": 711
},
{
"epoch": 1.467318579516212,
"grad_norm": 0.15719832479953766,
"learning_rate": 2.57176978795044e-06,
"loss": 0.6416,
"step": 712
},
{
"epoch": 1.469377251672671,
"grad_norm": 0.13454781472682953,
"learning_rate": 2.5603663454830395e-06,
"loss": 0.6622,
"step": 713
},
{
"epoch": 1.4714359238291301,
"grad_norm": 0.13637712597846985,
"learning_rate": 2.548995738002338e-06,
"loss": 0.662,
"step": 714
},
{
"epoch": 1.4734945959855894,
"grad_norm": 0.13223333656787872,
"learning_rate": 2.5376580925152335e-06,
"loss": 0.6284,
"step": 715
},
{
"epoch": 1.4755532681420485,
"grad_norm": 0.14275510609149933,
"learning_rate": 2.5263535356604428e-06,
"loss": 0.6585,
"step": 716
},
{
"epoch": 1.4776119402985075,
"grad_norm": 0.1544455587863922,
"learning_rate": 2.5150821937070946e-06,
"loss": 0.6852,
"step": 717
},
{
"epoch": 1.4796706124549666,
"grad_norm": 0.1354699581861496,
"learning_rate": 2.503844192553313e-06,
"loss": 0.6394,
"step": 718
},
{
"epoch": 1.4817292846114256,
"grad_norm": 0.16445569694042206,
"learning_rate": 2.4926396577248097e-06,
"loss": 0.6811,
"step": 719
},
{
"epoch": 1.4837879567678847,
"grad_norm": 0.13989467918872833,
"learning_rate": 2.481468714373496e-06,
"loss": 0.638,
"step": 720
},
{
"epoch": 1.4858466289243437,
"grad_norm": 0.13825134932994843,
"learning_rate": 2.4703314872760623e-06,
"loss": 0.6662,
"step": 721
},
{
"epoch": 1.487905301080803,
"grad_norm": 0.12880556285381317,
"learning_rate": 2.4592281008326075e-06,
"loss": 0.6502,
"step": 722
},
{
"epoch": 1.489963973237262,
"grad_norm": 0.12772305309772491,
"learning_rate": 2.448158679065231e-06,
"loss": 0.6763,
"step": 723
},
{
"epoch": 1.492022645393721,
"grad_norm": 0.13914746046066284,
"learning_rate": 2.437123345616661e-06,
"loss": 0.645,
"step": 724
},
{
"epoch": 1.4940813175501801,
"grad_norm": 0.1409793198108673,
"learning_rate": 2.4261222237488616e-06,
"loss": 0.6588,
"step": 725
},
{
"epoch": 1.4961399897066392,
"grad_norm": 0.1305626630783081,
"learning_rate": 2.4151554363416676e-06,
"loss": 0.6603,
"step": 726
},
{
"epoch": 1.4981986618630982,
"grad_norm": 0.13002273440361023,
"learning_rate": 2.4042231058914027e-06,
"loss": 0.6248,
"step": 727
},
{
"epoch": 1.5002573340195573,
"grad_norm": 0.14312808215618134,
"learning_rate": 2.3933253545095143e-06,
"loss": 0.6459,
"step": 728
},
{
"epoch": 1.5023160061760166,
"grad_norm": 0.133903369307518,
"learning_rate": 2.382462303921213e-06,
"loss": 0.6477,
"step": 729
},
{
"epoch": 1.5043746783324754,
"grad_norm": 0.1325589120388031,
"learning_rate": 2.37163407546411e-06,
"loss": 0.6816,
"step": 730
},
{
"epoch": 1.5064333504889347,
"grad_norm": 0.13143044710159302,
"learning_rate": 2.3608407900868597e-06,
"loss": 0.6492,
"step": 731
},
{
"epoch": 1.5084920226453937,
"grad_norm": 0.13435474038124084,
"learning_rate": 2.3500825683478096e-06,
"loss": 0.6528,
"step": 732
},
{
"epoch": 1.5105506948018528,
"grad_norm": 0.1382218599319458,
"learning_rate": 2.33935953041366e-06,
"loss": 0.6292,
"step": 733
},
{
"epoch": 1.512609366958312,
"grad_norm": 0.14546315371990204,
"learning_rate": 2.328671796058113e-06,
"loss": 0.6548,
"step": 734
},
{
"epoch": 1.5146680391147709,
"grad_norm": 0.14825578033924103,
"learning_rate": 2.3180194846605367e-06,
"loss": 0.65,
"step": 735
},
{
"epoch": 1.5167267112712302,
"grad_norm": 0.1339276134967804,
"learning_rate": 2.3074027152046384e-06,
"loss": 0.6475,
"step": 736
},
{
"epoch": 1.518785383427689,
"grad_norm": 0.1629195660352707,
"learning_rate": 2.296821606277129e-06,
"loss": 0.6302,
"step": 737
},
{
"epoch": 1.5208440555841483,
"grad_norm": 0.13913467526435852,
"learning_rate": 2.286276276066398e-06,
"loss": 0.6858,
"step": 738
},
{
"epoch": 1.5229027277406073,
"grad_norm": 0.13781745731830597,
"learning_rate": 2.275766842361195e-06,
"loss": 0.6477,
"step": 739
},
{
"epoch": 1.5249613998970664,
"grad_norm": 0.13842599093914032,
"learning_rate": 2.265293422549319e-06,
"loss": 0.6507,
"step": 740
},
{
"epoch": 1.5270200720535256,
"grad_norm": 0.1484973430633545,
"learning_rate": 2.2548561336162984e-06,
"loss": 0.6636,
"step": 741
},
{
"epoch": 1.5290787442099845,
"grad_norm": 0.13376715779304504,
"learning_rate": 2.2444550921440884e-06,
"loss": 0.6561,
"step": 742
},
{
"epoch": 1.5311374163664437,
"grad_norm": 0.1416800618171692,
"learning_rate": 2.2340904143097715e-06,
"loss": 0.6542,
"step": 743
},
{
"epoch": 1.5331960885229026,
"grad_norm": 0.1370648294687271,
"learning_rate": 2.2237622158842584e-06,
"loss": 0.6755,
"step": 744
},
{
"epoch": 1.5352547606793618,
"grad_norm": 0.12818501889705658,
"learning_rate": 2.213470612230989e-06,
"loss": 0.6177,
"step": 745
},
{
"epoch": 1.537313432835821,
"grad_norm": 0.1407281905412674,
"learning_rate": 2.2032157183046515e-06,
"loss": 0.6507,
"step": 746
},
{
"epoch": 1.53937210499228,
"grad_norm": 0.13501091301441193,
"learning_rate": 2.192997648649896e-06,
"loss": 0.6468,
"step": 747
},
{
"epoch": 1.5414307771487392,
"grad_norm": 0.12218450754880905,
"learning_rate": 2.1828165174000513e-06,
"loss": 0.6429,
"step": 748
},
{
"epoch": 1.543489449305198,
"grad_norm": 0.13836924731731415,
"learning_rate": 2.172672438275859e-06,
"loss": 0.6598,
"step": 749
},
{
"epoch": 1.5455481214616573,
"grad_norm": 0.14301835000514984,
"learning_rate": 2.162565524584191e-06,
"loss": 0.6469,
"step": 750
},
{
"epoch": 1.5476067936181161,
"grad_norm": 0.13408608734607697,
"learning_rate": 2.1524958892167968e-06,
"loss": 0.6468,
"step": 751
},
{
"epoch": 1.5496654657745754,
"grad_norm": 0.14460250735282898,
"learning_rate": 2.142463644649029e-06,
"loss": 0.6726,
"step": 752
},
{
"epoch": 1.5517241379310345,
"grad_norm": 0.12662199139595032,
"learning_rate": 2.132468902938602e-06,
"loss": 0.6322,
"step": 753
},
{
"epoch": 1.5537828100874935,
"grad_norm": 0.13386270403862,
"learning_rate": 2.1225117757243263e-06,
"loss": 0.6694,
"step": 754
},
{
"epoch": 1.5558414822439528,
"grad_norm": 0.13018876314163208,
"learning_rate": 2.112592374224869e-06,
"loss": 0.6514,
"step": 755
},
{
"epoch": 1.5579001544004116,
"grad_norm": 0.13406234979629517,
"learning_rate": 2.1027108092375147e-06,
"loss": 0.6556,
"step": 756
},
{
"epoch": 1.559958826556871,
"grad_norm": 0.1314801722764969,
"learning_rate": 2.0928671911369157e-06,
"loss": 0.6597,
"step": 757
},
{
"epoch": 1.56201749871333,
"grad_norm": 0.1464812457561493,
"learning_rate": 2.0830616298738746e-06,
"loss": 0.659,
"step": 758
},
{
"epoch": 1.564076170869789,
"grad_norm": 0.13219048082828522,
"learning_rate": 2.0732942349741025e-06,
"loss": 0.6683,
"step": 759
},
{
"epoch": 1.566134843026248,
"grad_norm": 0.12457743287086487,
"learning_rate": 2.0635651155370064e-06,
"loss": 0.6264,
"step": 760
},
{
"epoch": 1.568193515182707,
"grad_norm": 0.14103274047374725,
"learning_rate": 2.053874380234461e-06,
"loss": 0.65,
"step": 761
},
{
"epoch": 1.5702521873391664,
"grad_norm": 0.12605510652065277,
"learning_rate": 2.0442221373096045e-06,
"loss": 0.6407,
"step": 762
},
{
"epoch": 1.5723108594956252,
"grad_norm": 0.1612055003643036,
"learning_rate": 2.03460849457562e-06,
"loss": 0.6741,
"step": 763
},
{
"epoch": 1.5743695316520845,
"grad_norm": 0.1546323448419571,
"learning_rate": 2.02503355941454e-06,
"loss": 0.6852,
"step": 764
},
{
"epoch": 1.5764282038085435,
"grad_norm": 0.13762885332107544,
"learning_rate": 2.0154974387760404e-06,
"loss": 0.6535,
"step": 765
},
{
"epoch": 1.5784868759650026,
"grad_norm": 0.1335284411907196,
"learning_rate": 2.0060002391762477e-06,
"loss": 0.6552,
"step": 766
},
{
"epoch": 1.5805455481214616,
"grad_norm": 0.1296115517616272,
"learning_rate": 1.996542066696553e-06,
"loss": 0.6613,
"step": 767
},
{
"epoch": 1.5826042202779207,
"grad_norm": 0.15091270208358765,
"learning_rate": 1.987123026982423e-06,
"loss": 0.6548,
"step": 768
},
{
"epoch": 1.58466289243438,
"grad_norm": 0.1307397335767746,
"learning_rate": 1.97774322524222e-06,
"loss": 0.6492,
"step": 769
},
{
"epoch": 1.5867215645908388,
"grad_norm": 0.14567354321479797,
"learning_rate": 1.968402766246026e-06,
"loss": 0.6574,
"step": 770
},
{
"epoch": 1.588780236747298,
"grad_norm": 0.13739728927612305,
"learning_rate": 1.959101754324479e-06,
"loss": 0.6759,
"step": 771
},
{
"epoch": 1.5908389089037571,
"grad_norm": 0.13658419251441956,
"learning_rate": 1.9498402933676e-06,
"loss": 0.6705,
"step": 772
},
{
"epoch": 1.5928975810602162,
"grad_norm": 0.12363572418689728,
"learning_rate": 1.940618486823632e-06,
"loss": 0.666,
"step": 773
},
{
"epoch": 1.5949562532166752,
"grad_norm": 0.15004467964172363,
"learning_rate": 1.931436437697896e-06,
"loss": 0.6403,
"step": 774
},
{
"epoch": 1.5970149253731343,
"grad_norm": 0.12857025861740112,
"learning_rate": 1.9222942485516265e-06,
"loss": 0.6526,
"step": 775
},
{
"epoch": 1.5990735975295935,
"grad_norm": 0.1368139386177063,
"learning_rate": 1.9131920215008344e-06,
"loss": 0.6429,
"step": 776
},
{
"epoch": 1.6011322696860524,
"grad_norm": 0.138963520526886,
"learning_rate": 1.904129858215159e-06,
"loss": 0.6561,
"step": 777
},
{
"epoch": 1.6031909418425117,
"grad_norm": 0.1443481296300888,
"learning_rate": 1.895107859916746e-06,
"loss": 0.6668,
"step": 778
},
{
"epoch": 1.6052496139989707,
"grad_norm": 0.13580967485904694,
"learning_rate": 1.8861261273790994e-06,
"loss": 0.6736,
"step": 779
},
{
"epoch": 1.6073082861554298,
"grad_norm": 0.1417776346206665,
"learning_rate": 1.8771847609259675e-06,
"loss": 0.6597,
"step": 780
},
{
"epoch": 1.6093669583118888,
"grad_norm": 0.14724156260490417,
"learning_rate": 1.8682838604302217e-06,
"loss": 0.6525,
"step": 781
},
{
"epoch": 1.6114256304683479,
"grad_norm": 0.13658173382282257,
"learning_rate": 1.8594235253127373e-06,
"loss": 0.6571,
"step": 782
},
{
"epoch": 1.6134843026248071,
"grad_norm": 0.12604515254497528,
"learning_rate": 1.8506038545412823e-06,
"loss": 0.6514,
"step": 783
},
{
"epoch": 1.615542974781266,
"grad_norm": 0.1426461786031723,
"learning_rate": 1.8418249466294153e-06,
"loss": 0.665,
"step": 784
},
{
"epoch": 1.6176016469377252,
"grad_norm": 0.13183601200580597,
"learning_rate": 1.8330868996353851e-06,
"loss": 0.6575,
"step": 785
},
{
"epoch": 1.6196603190941843,
"grad_norm": 0.1418440341949463,
"learning_rate": 1.8243898111610314e-06,
"loss": 0.6344,
"step": 786
},
{
"epoch": 1.6217189912506433,
"grad_norm": 0.1399330198764801,
"learning_rate": 1.8157337783506998e-06,
"loss": 0.6418,
"step": 787
},
{
"epoch": 1.6237776634071024,
"grad_norm": 0.13530658185482025,
"learning_rate": 1.807118897890152e-06,
"loss": 0.6339,
"step": 788
},
{
"epoch": 1.6258363355635614,
"grad_norm": 0.12609177827835083,
"learning_rate": 1.7985452660054908e-06,
"loss": 0.6319,
"step": 789
},
{
"epoch": 1.6278950077200207,
"grad_norm": 0.16546158492565155,
"learning_rate": 1.7900129784620798e-06,
"loss": 0.6191,
"step": 790
},
{
"epoch": 1.6299536798764795,
"grad_norm": 0.13399174809455872,
"learning_rate": 1.7815221305634764e-06,
"loss": 0.6648,
"step": 791
},
{
"epoch": 1.6320123520329388,
"grad_norm": 0.1417001485824585,
"learning_rate": 1.7730728171503704e-06,
"loss": 0.6445,
"step": 792
},
{
"epoch": 1.6340710241893979,
"grad_norm": 0.14180314540863037,
"learning_rate": 1.7646651325995178e-06,
"loss": 0.6386,
"step": 793
},
{
"epoch": 1.636129696345857,
"grad_norm": 0.14928773045539856,
"learning_rate": 1.7562991708226945e-06,
"loss": 0.6492,
"step": 794
},
{
"epoch": 1.638188368502316,
"grad_norm": 0.1402241438627243,
"learning_rate": 1.7479750252656388e-06,
"loss": 0.6238,
"step": 795
},
{
"epoch": 1.640247040658775,
"grad_norm": 0.13829921185970306,
"learning_rate": 1.7396927889070164e-06,
"loss": 0.6583,
"step": 796
},
{
"epoch": 1.6423057128152343,
"grad_norm": 0.1358102262020111,
"learning_rate": 1.731452554257373e-06,
"loss": 0.6501,
"step": 797
},
{
"epoch": 1.6443643849716931,
"grad_norm": 0.1420017033815384,
"learning_rate": 1.723254413358111e-06,
"loss": 0.6591,
"step": 798
},
{
"epoch": 1.6464230571281524,
"grad_norm": 0.1424987018108368,
"learning_rate": 1.715098457780449e-06,
"loss": 0.6729,
"step": 799
},
{
"epoch": 1.6484817292846115,
"grad_norm": 0.13872523605823517,
"learning_rate": 1.7069847786244136e-06,
"loss": 0.6307,
"step": 800
},
{
"epoch": 1.6505404014410705,
"grad_norm": 0.13062597811222076,
"learning_rate": 1.698913466517808e-06,
"loss": 0.6428,
"step": 801
},
{
"epoch": 1.6525990735975296,
"grad_norm": 0.14626429975032806,
"learning_rate": 1.690884611615209e-06,
"loss": 0.6198,
"step": 802
},
{
"epoch": 1.6546577457539886,
"grad_norm": 0.15756435692310333,
"learning_rate": 1.6828983035969565e-06,
"loss": 0.6731,
"step": 803
},
{
"epoch": 1.6567164179104479,
"grad_norm": 0.14816921949386597,
"learning_rate": 1.67495463166815e-06,
"loss": 0.6242,
"step": 804
},
{
"epoch": 1.6587750900669067,
"grad_norm": 0.13179270923137665,
"learning_rate": 1.6670536845576573e-06,
"loss": 0.6626,
"step": 805
},
{
"epoch": 1.660833762223366,
"grad_norm": 0.13587595522403717,
"learning_rate": 1.6591955505171198e-06,
"loss": 0.6534,
"step": 806
},
{
"epoch": 1.662892434379825,
"grad_norm": 0.13836577534675598,
"learning_rate": 1.6513803173199653e-06,
"loss": 0.655,
"step": 807
},
{
"epoch": 1.664951106536284,
"grad_norm": 0.1396552473306656,
"learning_rate": 1.6436080722604314e-06,
"loss": 0.6557,
"step": 808
},
{
"epoch": 1.6670097786927431,
"grad_norm": 0.14703871309757233,
"learning_rate": 1.63587890215259e-06,
"loss": 0.5978,
"step": 809
},
{
"epoch": 1.6690684508492022,
"grad_norm": 0.1638912856578827,
"learning_rate": 1.628192893329374e-06,
"loss": 0.6394,
"step": 810
},
{
"epoch": 1.6711271230056615,
"grad_norm": 0.14806923270225525,
"learning_rate": 1.620550131641615e-06,
"loss": 0.6538,
"step": 811
},
{
"epoch": 1.6731857951621203,
"grad_norm": 0.1463487446308136,
"learning_rate": 1.612950702457087e-06,
"loss": 0.6587,
"step": 812
},
{
"epoch": 1.6752444673185796,
"grad_norm": 0.1333266645669937,
"learning_rate": 1.6053946906595502e-06,
"loss": 0.6237,
"step": 813
},
{
"epoch": 1.6773031394750386,
"grad_norm": 0.1463502198457718,
"learning_rate": 1.5978821806478027e-06,
"loss": 0.675,
"step": 814
},
{
"epoch": 1.6793618116314977,
"grad_norm": 0.13438794016838074,
"learning_rate": 1.590413256334736e-06,
"loss": 0.6444,
"step": 815
},
{
"epoch": 1.6814204837879567,
"grad_norm": 0.1534508913755417,
"learning_rate": 1.582988001146405e-06,
"loss": 0.6584,
"step": 816
},
{
"epoch": 1.6834791559444158,
"grad_norm": 0.12801076471805573,
"learning_rate": 1.5756064980210867e-06,
"loss": 0.667,
"step": 817
},
{
"epoch": 1.685537828100875,
"grad_norm": 0.14711901545524597,
"learning_rate": 1.5682688294083594e-06,
"loss": 0.6405,
"step": 818
},
{
"epoch": 1.6875965002573339,
"grad_norm": 0.12615957856178284,
"learning_rate": 1.5609750772681826e-06,
"loss": 0.6632,
"step": 819
},
{
"epoch": 1.6896551724137931,
"grad_norm": 0.14474698901176453,
"learning_rate": 1.5537253230699784e-06,
"loss": 0.6381,
"step": 820
},
{
"epoch": 1.6917138445702522,
"grad_norm": 0.14288835227489471,
"learning_rate": 1.5465196477917225e-06,
"loss": 0.6771,
"step": 821
},
{
"epoch": 1.6937725167267113,
"grad_norm": 0.14714893698692322,
"learning_rate": 1.5393581319190382e-06,
"loss": 0.6537,
"step": 822
},
{
"epoch": 1.6958311888831703,
"grad_norm": 0.13413602113723755,
"learning_rate": 1.5322408554443027e-06,
"loss": 0.6582,
"step": 823
},
{
"epoch": 1.6978898610396294,
"grad_norm": 0.14149825274944305,
"learning_rate": 1.5251678978657464e-06,
"loss": 0.6479,
"step": 824
},
{
"epoch": 1.6999485331960886,
"grad_norm": 0.14568348228931427,
"learning_rate": 1.5181393381865716e-06,
"loss": 0.668,
"step": 825
},
{
"epoch": 1.7020072053525475,
"grad_norm": 0.13481447100639343,
"learning_rate": 1.511155254914065e-06,
"loss": 0.6473,
"step": 826
},
{
"epoch": 1.7040658775090067,
"grad_norm": 0.15289506316184998,
"learning_rate": 1.5042157260587231e-06,
"loss": 0.6351,
"step": 827
},
{
"epoch": 1.7061245496654658,
"grad_norm": 0.12859320640563965,
"learning_rate": 1.4973208291333813e-06,
"loss": 0.6305,
"step": 828
},
{
"epoch": 1.7081832218219248,
"grad_norm": 0.1490619033575058,
"learning_rate": 1.490470641152345e-06,
"loss": 0.6296,
"step": 829
},
{
"epoch": 1.7102418939783839,
"grad_norm": 0.12138031423091888,
"learning_rate": 1.4836652386305351e-06,
"loss": 0.639,
"step": 830
},
{
"epoch": 1.712300566134843,
"grad_norm": 0.13470202684402466,
"learning_rate": 1.4769046975826267e-06,
"loss": 0.6458,
"step": 831
},
{
"epoch": 1.7143592382913022,
"grad_norm": 0.14960254728794098,
"learning_rate": 1.4701890935222062e-06,
"loss": 0.6717,
"step": 832
},
{
"epoch": 1.716417910447761,
"grad_norm": 0.13615302741527557,
"learning_rate": 1.4635185014609216e-06,
"loss": 0.6509,
"step": 833
},
{
"epoch": 1.7184765826042203,
"grad_norm": 0.12218176573514938,
"learning_rate": 1.4568929959076512e-06,
"loss": 0.647,
"step": 834
},
{
"epoch": 1.7205352547606794,
"grad_norm": 0.13748720288276672,
"learning_rate": 1.4503126508676652e-06,
"loss": 0.672,
"step": 835
},
{
"epoch": 1.7225939269171384,
"grad_norm": 0.1389789581298828,
"learning_rate": 1.4437775398418042e-06,
"loss": 0.6587,
"step": 836
},
{
"epoch": 1.7246525990735977,
"grad_norm": 0.13413724303245544,
"learning_rate": 1.4372877358256543e-06,
"loss": 0.6538,
"step": 837
},
{
"epoch": 1.7267112712300565,
"grad_norm": 0.1562497615814209,
"learning_rate": 1.4308433113087346e-06,
"loss": 0.6617,
"step": 838
},
{
"epoch": 1.7287699433865158,
"grad_norm": 0.1408907175064087,
"learning_rate": 1.4244443382736858e-06,
"loss": 0.6504,
"step": 839
},
{
"epoch": 1.7308286155429746,
"grad_norm": 0.13438640534877777,
"learning_rate": 1.4180908881954668e-06,
"loss": 0.654,
"step": 840
},
{
"epoch": 1.732887287699434,
"grad_norm": 0.13639037311077118,
"learning_rate": 1.4117830320405568e-06,
"loss": 0.6685,
"step": 841
},
{
"epoch": 1.734945959855893,
"grad_norm": 0.14213210344314575,
"learning_rate": 1.405520840266159e-06,
"loss": 0.6556,
"step": 842
},
{
"epoch": 1.737004632012352,
"grad_norm": 0.1482432335615158,
"learning_rate": 1.3993043828194217e-06,
"loss": 0.6419,
"step": 843
},
{
"epoch": 1.7390633041688113,
"grad_norm": 0.1303771287202835,
"learning_rate": 1.3931337291366488e-06,
"loss": 0.6361,
"step": 844
},
{
"epoch": 1.74112197632527,
"grad_norm": 0.1479034125804901,
"learning_rate": 1.387008948142528e-06,
"loss": 0.6569,
"step": 845
},
{
"epoch": 1.7431806484817294,
"grad_norm": 0.168193057179451,
"learning_rate": 1.3809301082493592e-06,
"loss": 0.6567,
"step": 846
},
{
"epoch": 1.7452393206381882,
"grad_norm": 0.13786673545837402,
"learning_rate": 1.3748972773562946e-06,
"loss": 0.6405,
"step": 847
},
{
"epoch": 1.7472979927946475,
"grad_norm": 0.13587407767772675,
"learning_rate": 1.3689105228485739e-06,
"loss": 0.6353,
"step": 848
},
{
"epoch": 1.7493566649511065,
"grad_norm": 0.13992324471473694,
"learning_rate": 1.3629699115967757e-06,
"loss": 0.655,
"step": 849
},
{
"epoch": 1.7514153371075656,
"grad_norm": 0.13453035056591034,
"learning_rate": 1.3570755099560701e-06,
"loss": 0.658,
"step": 850
},
{
"epoch": 1.7534740092640249,
"grad_norm": 0.15303651988506317,
"learning_rate": 1.3512273837654793e-06,
"loss": 0.6135,
"step": 851
},
{
"epoch": 1.7555326814204837,
"grad_norm": 0.13186489045619965,
"learning_rate": 1.3454255983471367e-06,
"loss": 0.6647,
"step": 852
},
{
"epoch": 1.757591353576943,
"grad_norm": 0.13427408039569855,
"learning_rate": 1.3396702185055614e-06,
"loss": 0.6597,
"step": 853
},
{
"epoch": 1.7596500257334018,
"grad_norm": 0.15728993713855743,
"learning_rate": 1.3339613085269357e-06,
"loss": 0.659,
"step": 854
},
{
"epoch": 1.761708697889861,
"grad_norm": 0.13846909999847412,
"learning_rate": 1.3282989321783822e-06,
"loss": 0.6543,
"step": 855
},
{
"epoch": 1.7637673700463201,
"grad_norm": 0.13936255872249603,
"learning_rate": 1.322683152707255e-06,
"loss": 0.6436,
"step": 856
},
{
"epoch": 1.7658260422027792,
"grad_norm": 0.1383313685655594,
"learning_rate": 1.3171140328404339e-06,
"loss": 0.6493,
"step": 857
},
{
"epoch": 1.7678847143592384,
"grad_norm": 0.14984488487243652,
"learning_rate": 1.3115916347836222e-06,
"loss": 0.6628,
"step": 858
},
{
"epoch": 1.7699433865156973,
"grad_norm": 0.1229867935180664,
"learning_rate": 1.3061160202206501e-06,
"loss": 0.6301,
"step": 859
},
{
"epoch": 1.7720020586721565,
"grad_norm": 0.12617282569408417,
"learning_rate": 1.3006872503127887e-06,
"loss": 0.6368,
"step": 860
},
{
"epoch": 1.7740607308286156,
"grad_norm": 0.1278039813041687,
"learning_rate": 1.2953053856980674e-06,
"loss": 0.6588,
"step": 861
},
{
"epoch": 1.7761194029850746,
"grad_norm": 0.1354069858789444,
"learning_rate": 1.2899704864905922e-06,
"loss": 0.6669,
"step": 862
},
{
"epoch": 1.7781780751415337,
"grad_norm": 0.1457299292087555,
"learning_rate": 1.284682612279878e-06,
"loss": 0.6522,
"step": 863
},
{
"epoch": 1.7802367472979927,
"grad_norm": 0.13067875802516937,
"learning_rate": 1.2794418221301842e-06,
"loss": 0.6367,
"step": 864
},
{
"epoch": 1.782295419454452,
"grad_norm": 0.14541006088256836,
"learning_rate": 1.2742481745798496e-06,
"loss": 0.6594,
"step": 865
},
{
"epoch": 1.7843540916109109,
"grad_norm": 0.14047367870807648,
"learning_rate": 1.269101727640644e-06,
"loss": 0.635,
"step": 866
},
{
"epoch": 1.7864127637673701,
"grad_norm": 0.1398647427558899,
"learning_rate": 1.2640025387971156e-06,
"loss": 0.6349,
"step": 867
},
{
"epoch": 1.7884714359238292,
"grad_norm": 0.13229133188724518,
"learning_rate": 1.2589506650059544e-06,
"loss": 0.6436,
"step": 868
},
{
"epoch": 1.7905301080802882,
"grad_norm": 0.18162629008293152,
"learning_rate": 1.2539461626953498e-06,
"loss": 0.6548,
"step": 869
},
{
"epoch": 1.7925887802367473,
"grad_norm": 0.14638854563236237,
"learning_rate": 1.248989087764366e-06,
"loss": 0.675,
"step": 870
},
{
"epoch": 1.7946474523932063,
"grad_norm": 0.12400197237730026,
"learning_rate": 1.2440794955823128e-06,
"loss": 0.6442,
"step": 871
},
{
"epoch": 1.7967061245496656,
"grad_norm": 0.13715320825576782,
"learning_rate": 1.2392174409881311e-06,
"loss": 0.6573,
"step": 872
},
{
"epoch": 1.7987647967061244,
"grad_norm": 0.1651625782251358,
"learning_rate": 1.2344029782897774e-06,
"loss": 0.6412,
"step": 873
},
{
"epoch": 1.8008234688625837,
"grad_norm": 0.13429532945156097,
"learning_rate": 1.229636161263619e-06,
"loss": 0.6756,
"step": 874
},
{
"epoch": 1.8028821410190428,
"grad_norm": 0.13396626710891724,
"learning_rate": 1.224917043153832e-06,
"loss": 0.6354,
"step": 875
},
{
"epoch": 1.8049408131755018,
"grad_norm": 0.1430749148130417,
"learning_rate": 1.2202456766718092e-06,
"loss": 0.6482,
"step": 876
},
{
"epoch": 1.8069994853319609,
"grad_norm": 0.15483106672763824,
"learning_rate": 1.2156221139955669e-06,
"loss": 0.6767,
"step": 877
},
{
"epoch": 1.80905815748842,
"grad_norm": 0.1451236456632614,
"learning_rate": 1.2110464067691666e-06,
"loss": 0.6449,
"step": 878
},
{
"epoch": 1.8111168296448792,
"grad_norm": 0.1282985806465149,
"learning_rate": 1.206518606102135e-06,
"loss": 0.6602,
"step": 879
},
{
"epoch": 1.813175501801338,
"grad_norm": 0.13331440091133118,
"learning_rate": 1.2020387625688943e-06,
"loss": 0.66,
"step": 880
},
{
"epoch": 1.8152341739577973,
"grad_norm": 0.13615739345550537,
"learning_rate": 1.1976069262081987e-06,
"loss": 0.6377,
"step": 881
},
{
"epoch": 1.8172928461142563,
"grad_norm": 0.14270727336406708,
"learning_rate": 1.1932231465225714e-06,
"loss": 0.6456,
"step": 882
},
{
"epoch": 1.8193515182707154,
"grad_norm": 0.13713142275810242,
"learning_rate": 1.1888874724777582e-06,
"loss": 0.6569,
"step": 883
},
{
"epoch": 1.8214101904271744,
"grad_norm": 0.13675430417060852,
"learning_rate": 1.1845999525021723e-06,
"loss": 0.6474,
"step": 884
},
{
"epoch": 1.8234688625836335,
"grad_norm": 0.14368852972984314,
"learning_rate": 1.1803606344863615e-06,
"loss": 0.5877,
"step": 885
},
{
"epoch": 1.8255275347400928,
"grad_norm": 0.14031407237052917,
"learning_rate": 1.1761695657824677e-06,
"loss": 0.6448,
"step": 886
},
{
"epoch": 1.8275862068965516,
"grad_norm": 0.150904580950737,
"learning_rate": 1.1720267932036986e-06,
"loss": 0.6559,
"step": 887
},
{
"epoch": 1.8296448790530109,
"grad_norm": 0.14362327754497528,
"learning_rate": 1.1679323630238087e-06,
"loss": 0.6355,
"step": 888
},
{
"epoch": 1.83170355120947,
"grad_norm": 0.1305018961429596,
"learning_rate": 1.163886320976579e-06,
"loss": 0.6691,
"step": 889
},
{
"epoch": 1.833762223365929,
"grad_norm": 0.131569966673851,
"learning_rate": 1.1598887122553061e-06,
"loss": 0.6731,
"step": 890
},
{
"epoch": 1.835820895522388,
"grad_norm": 0.14277830719947815,
"learning_rate": 1.1559395815122975e-06,
"loss": 0.6614,
"step": 891
},
{
"epoch": 1.837879567678847,
"grad_norm": 0.15379874408245087,
"learning_rate": 1.1520389728583763e-06,
"loss": 0.6481,
"step": 892
},
{
"epoch": 1.8399382398353064,
"grad_norm": 0.13738073408603668,
"learning_rate": 1.1481869298623837e-06,
"loss": 0.6665,
"step": 893
},
{
"epoch": 1.8419969119917652,
"grad_norm": 0.1339586228132248,
"learning_rate": 1.1443834955506942e-06,
"loss": 0.6573,
"step": 894
},
{
"epoch": 1.8440555841482245,
"grad_norm": 0.14855796098709106,
"learning_rate": 1.140628712406736e-06,
"loss": 0.6795,
"step": 895
},
{
"epoch": 1.8461142563046835,
"grad_norm": 0.132362961769104,
"learning_rate": 1.1369226223705176e-06,
"loss": 0.6795,
"step": 896
},
{
"epoch": 1.8481729284611426,
"grad_norm": 0.1348879635334015,
"learning_rate": 1.133265266838153e-06,
"loss": 0.6529,
"step": 897
},
{
"epoch": 1.8502316006176016,
"grad_norm": 0.14411653578281403,
"learning_rate": 1.1296566866614067e-06,
"loss": 0.6538,
"step": 898
},
{
"epoch": 1.8522902727740607,
"grad_norm": 0.14565573632717133,
"learning_rate": 1.1260969221472352e-06,
"loss": 0.6572,
"step": 899
},
{
"epoch": 1.85434894493052,
"grad_norm": 0.1327780932188034,
"learning_rate": 1.1225860130573334e-06,
"loss": 0.6655,
"step": 900
},
{
"epoch": 1.8564076170869788,
"grad_norm": 0.14732906222343445,
"learning_rate": 1.1191239986076947e-06,
"loss": 0.6755,
"step": 901
},
{
"epoch": 1.858466289243438,
"grad_norm": 0.15718206763267517,
"learning_rate": 1.1157109174681713e-06,
"loss": 0.6408,
"step": 902
},
{
"epoch": 1.860524961399897,
"grad_norm": 0.12371277809143066,
"learning_rate": 1.1123468077620423e-06,
"loss": 0.6387,
"step": 903
},
{
"epoch": 1.8625836335563561,
"grad_norm": 0.12767182290554047,
"learning_rate": 1.109031707065588e-06,
"loss": 0.647,
"step": 904
},
{
"epoch": 1.8646423057128152,
"grad_norm": 0.13031242787837982,
"learning_rate": 1.1057656524076691e-06,
"loss": 0.6287,
"step": 905
},
{
"epoch": 1.8667009778692742,
"grad_norm": 0.14342977106571198,
"learning_rate": 1.1025486802693158e-06,
"loss": 0.6728,
"step": 906
},
{
"epoch": 1.8687596500257335,
"grad_norm": 0.13070468604564667,
"learning_rate": 1.099380826583316e-06,
"loss": 0.6233,
"step": 907
},
{
"epoch": 1.8708183221821924,
"grad_norm": 0.145916149020195,
"learning_rate": 1.0962621267338198e-06,
"loss": 0.6327,
"step": 908
},
{
"epoch": 1.8728769943386516,
"grad_norm": 0.14288493990898132,
"learning_rate": 1.0931926155559384e-06,
"loss": 0.6465,
"step": 909
},
{
"epoch": 1.8749356664951107,
"grad_norm": 0.1527261734008789,
"learning_rate": 1.0901723273353599e-06,
"loss": 0.6797,
"step": 910
},
{
"epoch": 1.8769943386515697,
"grad_norm": 0.13376379013061523,
"learning_rate": 1.0872012958079609e-06,
"loss": 0.6446,
"step": 911
},
{
"epoch": 1.8790530108080288,
"grad_norm": 0.13035869598388672,
"learning_rate": 1.0842795541594354e-06,
"loss": 0.6892,
"step": 912
},
{
"epoch": 1.8811116829644878,
"grad_norm": 0.13996127247810364,
"learning_rate": 1.0814071350249213e-06,
"loss": 0.6529,
"step": 913
},
{
"epoch": 1.883170355120947,
"grad_norm": 0.14141958951950073,
"learning_rate": 1.078584070488635e-06,
"loss": 0.6645,
"step": 914
},
{
"epoch": 1.885229027277406,
"grad_norm": 0.13510560989379883,
"learning_rate": 1.0758103920835144e-06,
"loss": 0.6366,
"step": 915
},
{
"epoch": 1.8872876994338652,
"grad_norm": 0.15664884448051453,
"learning_rate": 1.0730861307908677e-06,
"loss": 0.6738,
"step": 916
},
{
"epoch": 1.8893463715903243,
"grad_norm": 0.1363374888896942,
"learning_rate": 1.0704113170400253e-06,
"loss": 0.6188,
"step": 917
},
{
"epoch": 1.8914050437467833,
"grad_norm": 0.1372835487127304,
"learning_rate": 1.0677859807079994e-06,
"loss": 0.6599,
"step": 918
},
{
"epoch": 1.8934637159032424,
"grad_norm": 0.1461261808872223,
"learning_rate": 1.0652101511191535e-06,
"loss": 0.6595,
"step": 919
},
{
"epoch": 1.8955223880597014,
"grad_norm": 0.11850570142269135,
"learning_rate": 1.0626838570448716e-06,
"loss": 0.6557,
"step": 920
},
{
"epoch": 1.8975810602161607,
"grad_norm": 0.13033467531204224,
"learning_rate": 1.0602071267032394e-06,
"loss": 0.6523,
"step": 921
},
{
"epoch": 1.8996397323726195,
"grad_norm": 0.12491913139820099,
"learning_rate": 1.057779987758727e-06,
"loss": 0.6439,
"step": 922
},
{
"epoch": 1.9016984045290788,
"grad_norm": 0.1398879587650299,
"learning_rate": 1.0554024673218808e-06,
"loss": 0.6636,
"step": 923
},
{
"epoch": 1.9037570766855378,
"grad_norm": 0.1425112634897232,
"learning_rate": 1.0530745919490201e-06,
"loss": 0.6634,
"step": 924
},
{
"epoch": 1.905815748841997,
"grad_norm": 0.1285870522260666,
"learning_rate": 1.0507963876419424e-06,
"loss": 0.6532,
"step": 925
},
{
"epoch": 1.907874420998456,
"grad_norm": 0.13235710561275482,
"learning_rate": 1.048567879847631e-06,
"loss": 0.6711,
"step": 926
},
{
"epoch": 1.909933093154915,
"grad_norm": 0.13371641933918,
"learning_rate": 1.0463890934579714e-06,
"loss": 0.6642,
"step": 927
},
{
"epoch": 1.9119917653113743,
"grad_norm": 0.14254343509674072,
"learning_rate": 1.0442600528094722e-06,
"loss": 0.6514,
"step": 928
},
{
"epoch": 1.914050437467833,
"grad_norm": 0.13420189917087555,
"learning_rate": 1.0421807816829955e-06,
"loss": 0.6522,
"step": 929
},
{
"epoch": 1.9161091096242924,
"grad_norm": 0.1435345858335495,
"learning_rate": 1.04015130330349e-06,
"loss": 0.6384,
"step": 930
},
{
"epoch": 1.9181677817807514,
"grad_norm": 0.14257173240184784,
"learning_rate": 1.0381716403397304e-06,
"loss": 0.6587,
"step": 931
},
{
"epoch": 1.9202264539372105,
"grad_norm": 0.12754391133785248,
"learning_rate": 1.0362418149040673e-06,
"loss": 0.6407,
"step": 932
},
{
"epoch": 1.9222851260936697,
"grad_norm": 0.13912741839885712,
"learning_rate": 1.0343618485521762e-06,
"loss": 0.642,
"step": 933
},
{
"epoch": 1.9243437982501286,
"grad_norm": 0.1380196213722229,
"learning_rate": 1.0325317622828216e-06,
"loss": 0.6418,
"step": 934
},
{
"epoch": 1.9264024704065879,
"grad_norm": 0.15211521089076996,
"learning_rate": 1.0307515765376167e-06,
"loss": 0.6884,
"step": 935
},
{
"epoch": 1.9284611425630467,
"grad_norm": 0.13326020538806915,
"learning_rate": 1.0290213112007999e-06,
"loss": 0.6478,
"step": 936
},
{
"epoch": 1.930519814719506,
"grad_norm": 0.1361563354730606,
"learning_rate": 1.0273409855990113e-06,
"loss": 0.6719,
"step": 937
},
{
"epoch": 1.932578486875965,
"grad_norm": 0.1280031055212021,
"learning_rate": 1.0257106185010746e-06,
"loss": 0.6288,
"step": 938
},
{
"epoch": 1.934637159032424,
"grad_norm": 0.12487500160932541,
"learning_rate": 1.0241302281177906e-06,
"loss": 0.6469,
"step": 939
},
{
"epoch": 1.9366958311888833,
"grad_norm": 0.13708752393722534,
"learning_rate": 1.0225998321017314e-06,
"loss": 0.6216,
"step": 940
},
{
"epoch": 1.9387545033453422,
"grad_norm": 0.1262640804052353,
"learning_rate": 1.021119447547047e-06,
"loss": 0.6456,
"step": 941
},
{
"epoch": 1.9408131755018014,
"grad_norm": 0.1493413746356964,
"learning_rate": 1.019689090989268e-06,
"loss": 0.7005,
"step": 942
},
{
"epoch": 1.9428718476582603,
"grad_norm": 0.1342114359140396,
"learning_rate": 1.0183087784051269e-06,
"loss": 0.66,
"step": 943
},
{
"epoch": 1.9449305198147195,
"grad_norm": 0.1446632593870163,
"learning_rate": 1.0169785252123765e-06,
"loss": 0.6647,
"step": 944
},
{
"epoch": 1.9469891919711786,
"grad_norm": 0.14843548834323883,
"learning_rate": 1.015698346269618e-06,
"loss": 0.648,
"step": 945
},
{
"epoch": 1.9490478641276376,
"grad_norm": 0.12779954075813293,
"learning_rate": 1.0144682558761371e-06,
"loss": 0.6595,
"step": 946
},
{
"epoch": 1.951106536284097,
"grad_norm": 0.14572598040103912,
"learning_rate": 1.013288267771741e-06,
"loss": 0.6551,
"step": 947
},
{
"epoch": 1.9531652084405557,
"grad_norm": 0.15109211206436157,
"learning_rate": 1.0121583951366075e-06,
"loss": 0.6552,
"step": 948
},
{
"epoch": 1.955223880597015,
"grad_norm": 0.12881575524806976,
"learning_rate": 1.011078650591137e-06,
"loss": 0.6818,
"step": 949
},
{
"epoch": 1.9572825527534738,
"grad_norm": 0.1423639953136444,
"learning_rate": 1.010049046195811e-06,
"loss": 0.662,
"step": 950
},
{
"epoch": 1.9593412249099331,
"grad_norm": 0.13927972316741943,
"learning_rate": 1.0090695934510577e-06,
"loss": 0.6184,
"step": 951
},
{
"epoch": 1.9613998970663922,
"grad_norm": 0.13898450136184692,
"learning_rate": 1.0081403032971245e-06,
"loss": 0.6391,
"step": 952
},
{
"epoch": 1.9634585692228512,
"grad_norm": 0.13634267449378967,
"learning_rate": 1.0072611861139538e-06,
"loss": 0.6273,
"step": 953
},
{
"epoch": 1.9655172413793105,
"grad_norm": 0.15343570709228516,
"learning_rate": 1.006432251721069e-06,
"loss": 0.6302,
"step": 954
},
{
"epoch": 1.9675759135357693,
"grad_norm": 0.13033926486968994,
"learning_rate": 1.0056535093774643e-06,
"loss": 0.6322,
"step": 955
},
{
"epoch": 1.9696345856922286,
"grad_norm": 0.14002519845962524,
"learning_rate": 1.0049249677815005e-06,
"loss": 0.6646,
"step": 956
},
{
"epoch": 1.9716932578486874,
"grad_norm": 0.14621154963970184,
"learning_rate": 1.0042466350708083e-06,
"loss": 0.6923,
"step": 957
},
{
"epoch": 1.9737519300051467,
"grad_norm": 0.1436609923839569,
"learning_rate": 1.0036185188221976e-06,
"loss": 0.6494,
"step": 958
},
{
"epoch": 1.9758106021616058,
"grad_norm": 0.14785051345825195,
"learning_rate": 1.0030406260515726e-06,
"loss": 0.6472,
"step": 959
},
{
"epoch": 1.9778692743180648,
"grad_norm": 0.13290195167064667,
"learning_rate": 1.0025129632138545e-06,
"loss": 0.6662,
"step": 960
},
{
"epoch": 1.979927946474524,
"grad_norm": 0.14163339138031006,
"learning_rate": 1.002035536202907e-06,
"loss": 0.6727,
"step": 961
},
{
"epoch": 1.981986618630983,
"grad_norm": 0.2116987407207489,
"learning_rate": 1.0016083503514734e-06,
"loss": 0.6291,
"step": 962
},
{
"epoch": 1.9840452907874422,
"grad_norm": 0.1317073106765747,
"learning_rate": 1.0012314104311142e-06,
"loss": 0.6564,
"step": 963
},
{
"epoch": 1.9861039629439012,
"grad_norm": 0.13808391988277435,
"learning_rate": 1.0009047206521559e-06,
"loss": 0.6372,
"step": 964
},
{
"epoch": 1.9881626351003603,
"grad_norm": 0.1466529816389084,
"learning_rate": 1.0006282846636434e-06,
"loss": 0.6741,
"step": 965
},
{
"epoch": 1.9902213072568193,
"grad_norm": 0.1410931497812271,
"learning_rate": 1.000402105553299e-06,
"loss": 0.633,
"step": 966
},
{
"epoch": 1.9922799794132784,
"grad_norm": 0.1295842081308365,
"learning_rate": 1.0002261858474878e-06,
"loss": 0.6724,
"step": 967
},
{
"epoch": 1.9943386515697377,
"grad_norm": 0.13751675188541412,
"learning_rate": 1.0001005275111895e-06,
"loss": 0.6354,
"step": 968
},
{
"epoch": 1.9963973237261965,
"grad_norm": 0.13173769414424896,
"learning_rate": 1.0000251319479768e-06,
"loss": 0.6481,
"step": 969
},
{
"epoch": 1.9984559958826558,
"grad_norm": 0.13309703767299652,
"learning_rate": 1.0000000000000002e-06,
"loss": 0.6489,
"step": 970
},
{
"epoch": 1.9984559958826558,
"step": 970,
"total_flos": 1.3119474486813327e+19,
"train_loss": 0.32624263990785657,
"train_runtime": 85452.3313,
"train_samples_per_second": 0.182,
"train_steps_per_second": 0.011
}
],
"logging_steps": 1,
"max_steps": 970,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.3119474486813327e+19,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}