{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.8458242699887943,
"eval_steps": 500,
"global_step": 7000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.001318304660206974,
"grad_norm": 4.59375,
"learning_rate": 0.0002,
"loss": 1.9624,
"step": 5
},
{
"epoch": 0.002636609320413948,
"grad_norm": 1.7421875,
"learning_rate": 0.00019986805647183008,
"loss": 0.6513,
"step": 10
},
{
"epoch": 0.003954913980620921,
"grad_norm": 1.84375,
"learning_rate": 0.00019973611294366012,
"loss": 0.1146,
"step": 15
},
{
"epoch": 0.005273218640827896,
"grad_norm": 1.3203125,
"learning_rate": 0.0001996041694154902,
"loss": 0.0529,
"step": 20
},
{
"epoch": 0.006591523301034869,
"grad_norm": 0.40234375,
"learning_rate": 0.00019947222588732023,
"loss": 0.1214,
"step": 25
},
{
"epoch": 0.007909827961241843,
"grad_norm": 1.5390625,
"learning_rate": 0.0001993402823591503,
"loss": 0.0919,
"step": 30
},
{
"epoch": 0.009228132621448816,
"grad_norm": 0.06201171875,
"learning_rate": 0.00019920833883098034,
"loss": 0.09,
"step": 35
},
{
"epoch": 0.010546437281655791,
"grad_norm": 1.53125,
"learning_rate": 0.0001990763953028104,
"loss": 0.1945,
"step": 40
},
{
"epoch": 0.011864741941862765,
"grad_norm": 0.2890625,
"learning_rate": 0.00019894445177464048,
"loss": 0.1259,
"step": 45
},
{
"epoch": 0.013183046602069738,
"grad_norm": 0.609375,
"learning_rate": 0.00019881250824647052,
"loss": 0.027,
"step": 50
},
{
"epoch": 0.014501351262276712,
"grad_norm": 0.369140625,
"learning_rate": 0.00019868056471830057,
"loss": 0.1068,
"step": 55
},
{
"epoch": 0.015819655922483685,
"grad_norm": 0.34765625,
"learning_rate": 0.00019854862119013064,
"loss": 0.0542,
"step": 60
},
{
"epoch": 0.01713796058269066,
"grad_norm": 0.055419921875,
"learning_rate": 0.00019841667766196068,
"loss": 0.0901,
"step": 65
},
{
"epoch": 0.018456265242897632,
"grad_norm": 0.0247802734375,
"learning_rate": 0.00019828473413379075,
"loss": 0.0091,
"step": 70
},
{
"epoch": 0.019774569903104607,
"grad_norm": 0.0079345703125,
"learning_rate": 0.0001981527906056208,
"loss": 0.0744,
"step": 75
},
{
"epoch": 0.021092874563311582,
"grad_norm": 0.65234375,
"learning_rate": 0.00019802084707745086,
"loss": 0.1108,
"step": 80
},
{
"epoch": 0.022411179223518554,
"grad_norm": 0.50390625,
"learning_rate": 0.0001978889035492809,
"loss": 0.0446,
"step": 85
},
{
"epoch": 0.02372948388372553,
"grad_norm": 0.1787109375,
"learning_rate": 0.00019775696002111097,
"loss": 0.0982,
"step": 90
},
{
"epoch": 0.0250477885439325,
"grad_norm": 0.490234375,
"learning_rate": 0.00019762501649294104,
"loss": 0.1035,
"step": 95
},
{
"epoch": 0.026366093204139476,
"grad_norm": 0.12158203125,
"learning_rate": 0.00019749307296477108,
"loss": 0.0401,
"step": 100
},
{
"epoch": 0.02768439786434645,
"grad_norm": 0.16015625,
"learning_rate": 0.00019736112943660115,
"loss": 0.0309,
"step": 105
},
{
"epoch": 0.029002702524553423,
"grad_norm": 1.359375,
"learning_rate": 0.0001972291859084312,
"loss": 0.1032,
"step": 110
},
{
"epoch": 0.0303210071847604,
"grad_norm": 0.52734375,
"learning_rate": 0.00019709724238026126,
"loss": 0.0811,
"step": 115
},
{
"epoch": 0.03163931184496737,
"grad_norm": 0.177734375,
"learning_rate": 0.00019696529885209133,
"loss": 0.0258,
"step": 120
},
{
"epoch": 0.03295761650517435,
"grad_norm": 0.234375,
"learning_rate": 0.00019683335532392137,
"loss": 0.0437,
"step": 125
},
{
"epoch": 0.03427592116538132,
"grad_norm": 1.3046875,
"learning_rate": 0.00019670141179575144,
"loss": 0.0967,
"step": 130
},
{
"epoch": 0.03559422582558829,
"grad_norm": 0.2734375,
"learning_rate": 0.00019656946826758148,
"loss": 0.0132,
"step": 135
},
{
"epoch": 0.036912530485795264,
"grad_norm": 0.66015625,
"learning_rate": 0.00019643752473941155,
"loss": 0.0396,
"step": 140
},
{
"epoch": 0.03823083514600224,
"grad_norm": 1.0546875,
"learning_rate": 0.0001963055812112416,
"loss": 0.0449,
"step": 145
},
{
"epoch": 0.039549139806209214,
"grad_norm": 0.2021484375,
"learning_rate": 0.00019617363768307166,
"loss": 0.1196,
"step": 150
},
{
"epoch": 0.040867444466416186,
"grad_norm": 0.5859375,
"learning_rate": 0.0001960416941549017,
"loss": 0.0588,
"step": 155
},
{
"epoch": 0.042185749126623165,
"grad_norm": 0.06005859375,
"learning_rate": 0.00019590975062673175,
"loss": 0.0234,
"step": 160
},
{
"epoch": 0.04350405378683014,
"grad_norm": 0.4921875,
"learning_rate": 0.00019577780709856182,
"loss": 0.0916,
"step": 165
},
{
"epoch": 0.04482235844703711,
"grad_norm": 0.84375,
"learning_rate": 0.0001956458635703919,
"loss": 0.0271,
"step": 170
},
{
"epoch": 0.04614066310724409,
"grad_norm": 0.8828125,
"learning_rate": 0.00019551392004222193,
"loss": 0.0175,
"step": 175
},
{
"epoch": 0.04745896776745106,
"grad_norm": 0.0152587890625,
"learning_rate": 0.000195381976514052,
"loss": 0.0356,
"step": 180
},
{
"epoch": 0.04877727242765803,
"grad_norm": 0.09326171875,
"learning_rate": 0.00019525003298588204,
"loss": 0.0057,
"step": 185
},
{
"epoch": 0.050095577087865,
"grad_norm": 0.24609375,
"learning_rate": 0.0001951180894577121,
"loss": 0.0082,
"step": 190
},
{
"epoch": 0.05141388174807198,
"grad_norm": 0.05029296875,
"learning_rate": 0.00019498614592954215,
"loss": 0.0178,
"step": 195
},
{
"epoch": 0.05273218640827895,
"grad_norm": 0.0390625,
"learning_rate": 0.00019485420240137222,
"loss": 0.0789,
"step": 200
},
{
"epoch": 0.054050491068485924,
"grad_norm": 0.5625,
"learning_rate": 0.0001947222588732023,
"loss": 0.0645,
"step": 205
},
{
"epoch": 0.0553687957286929,
"grad_norm": 0.53515625,
"learning_rate": 0.00019459031534503233,
"loss": 0.116,
"step": 210
},
{
"epoch": 0.056687100388899875,
"grad_norm": 0.55078125,
"learning_rate": 0.0001944583718168624,
"loss": 0.0516,
"step": 215
},
{
"epoch": 0.058005405049106847,
"grad_norm": 0.314453125,
"learning_rate": 0.00019432642828869244,
"loss": 0.1019,
"step": 220
},
{
"epoch": 0.059323709709313825,
"grad_norm": 0.1123046875,
"learning_rate": 0.0001941944847605225,
"loss": 0.0529,
"step": 225
},
{
"epoch": 0.0606420143695208,
"grad_norm": 0.4921875,
"learning_rate": 0.00019406254123235256,
"loss": 0.0368,
"step": 230
},
{
"epoch": 0.06196031902972777,
"grad_norm": 0.054443359375,
"learning_rate": 0.00019393059770418262,
"loss": 0.037,
"step": 235
},
{
"epoch": 0.06327862368993474,
"grad_norm": 0.008544921875,
"learning_rate": 0.0001937986541760127,
"loss": 0.0324,
"step": 240
},
{
"epoch": 0.06459692835014172,
"grad_norm": 1.5,
"learning_rate": 0.00019366671064784274,
"loss": 0.0334,
"step": 245
},
{
"epoch": 0.0659152330103487,
"grad_norm": 0.2109375,
"learning_rate": 0.0001935347671196728,
"loss": 0.0671,
"step": 250
},
{
"epoch": 0.06723353767055566,
"grad_norm": 2.0625,
"learning_rate": 0.00019340282359150285,
"loss": 0.1559,
"step": 255
},
{
"epoch": 0.06855184233076264,
"grad_norm": 0.7734375,
"learning_rate": 0.0001932708800633329,
"loss": 0.0198,
"step": 260
},
{
"epoch": 0.06987014699096962,
"grad_norm": 0.42578125,
"learning_rate": 0.00019313893653516296,
"loss": 0.0151,
"step": 265
},
{
"epoch": 0.07118845165117658,
"grad_norm": 0.1884765625,
"learning_rate": 0.000193006993006993,
"loss": 0.0269,
"step": 270
},
{
"epoch": 0.07250675631138356,
"grad_norm": 1.546875,
"learning_rate": 0.00019287504947882307,
"loss": 0.0565,
"step": 275
},
{
"epoch": 0.07382506097159053,
"grad_norm": 0.5078125,
"learning_rate": 0.0001927431059506531,
"loss": 0.0942,
"step": 280
},
{
"epoch": 0.0751433656317975,
"grad_norm": 0.392578125,
"learning_rate": 0.00019261116242248318,
"loss": 0.0061,
"step": 285
},
{
"epoch": 0.07646167029200449,
"grad_norm": 1.9140625,
"learning_rate": 0.00019247921889431325,
"loss": 0.0497,
"step": 290
},
{
"epoch": 0.07777997495221145,
"grad_norm": 0.08837890625,
"learning_rate": 0.0001923472753661433,
"loss": 0.0573,
"step": 295
},
{
"epoch": 0.07909827961241843,
"grad_norm": 1.046875,
"learning_rate": 0.00019221533183797336,
"loss": 0.0528,
"step": 300
},
{
"epoch": 0.08041658427262541,
"grad_norm": 0.2275390625,
"learning_rate": 0.0001920833883098034,
"loss": 0.0506,
"step": 305
},
{
"epoch": 0.08173488893283237,
"grad_norm": 0.08203125,
"learning_rate": 0.00019195144478163347,
"loss": 0.0307,
"step": 310
},
{
"epoch": 0.08305319359303935,
"grad_norm": 0.111328125,
"learning_rate": 0.00019181950125346354,
"loss": 0.0365,
"step": 315
},
{
"epoch": 0.08437149825324633,
"grad_norm": 1.2890625,
"learning_rate": 0.00019168755772529358,
"loss": 0.0447,
"step": 320
},
{
"epoch": 0.0856898029134533,
"grad_norm": 0.6015625,
"learning_rate": 0.00019155561419712365,
"loss": 0.0605,
"step": 325
},
{
"epoch": 0.08700810757366027,
"grad_norm": 0.71875,
"learning_rate": 0.0001914236706689537,
"loss": 0.0846,
"step": 330
},
{
"epoch": 0.08832641223386725,
"grad_norm": 0.1494140625,
"learning_rate": 0.00019129172714078376,
"loss": 0.0713,
"step": 335
},
{
"epoch": 0.08964471689407422,
"grad_norm": 0.1669921875,
"learning_rate": 0.0001911597836126138,
"loss": 0.0826,
"step": 340
},
{
"epoch": 0.0909630215542812,
"grad_norm": 2.203125,
"learning_rate": 0.00019102784008444388,
"loss": 0.0441,
"step": 345
},
{
"epoch": 0.09228132621448817,
"grad_norm": 1.21875,
"learning_rate": 0.00019089589655627395,
"loss": 0.1378,
"step": 350
},
{
"epoch": 0.09359963087469514,
"grad_norm": 3.0625,
"learning_rate": 0.00019076395302810396,
"loss": 0.1552,
"step": 355
},
{
"epoch": 0.09491793553490212,
"grad_norm": 0.232421875,
"learning_rate": 0.00019063200949993403,
"loss": 0.0458,
"step": 360
},
{
"epoch": 0.0962362401951091,
"grad_norm": 0.71875,
"learning_rate": 0.0001905000659717641,
"loss": 0.0312,
"step": 365
},
{
"epoch": 0.09755454485531606,
"grad_norm": 0.0218505859375,
"learning_rate": 0.00019036812244359414,
"loss": 0.0247,
"step": 370
},
{
"epoch": 0.09887284951552304,
"grad_norm": 0.064453125,
"learning_rate": 0.0001902361789154242,
"loss": 0.054,
"step": 375
},
{
"epoch": 0.10019115417573,
"grad_norm": 0.021240234375,
"learning_rate": 0.00019010423538725425,
"loss": 0.0023,
"step": 380
},
{
"epoch": 0.10150945883593698,
"grad_norm": 0.0361328125,
"learning_rate": 0.00018997229185908432,
"loss": 0.0884,
"step": 385
},
{
"epoch": 0.10282776349614396,
"grad_norm": 1.703125,
"learning_rate": 0.00018984034833091436,
"loss": 0.0506,
"step": 390
},
{
"epoch": 0.10414606815635093,
"grad_norm": 0.08837890625,
"learning_rate": 0.00018970840480274443,
"loss": 0.1123,
"step": 395
},
{
"epoch": 0.1054643728165579,
"grad_norm": 0.6953125,
"learning_rate": 0.0001895764612745745,
"loss": 0.0597,
"step": 400
},
{
"epoch": 0.10678267747676488,
"grad_norm": 0.18359375,
"learning_rate": 0.00018944451774640454,
"loss": 0.0138,
"step": 405
},
{
"epoch": 0.10810098213697185,
"grad_norm": 0.0272216796875,
"learning_rate": 0.0001893125742182346,
"loss": 0.0249,
"step": 410
},
{
"epoch": 0.10941928679717883,
"grad_norm": 0.00970458984375,
"learning_rate": 0.00018918063069006466,
"loss": 0.0084,
"step": 415
},
{
"epoch": 0.1107375914573858,
"grad_norm": 0.54296875,
"learning_rate": 0.00018904868716189472,
"loss": 0.0541,
"step": 420
},
{
"epoch": 0.11205589611759277,
"grad_norm": 0.74609375,
"learning_rate": 0.00018891674363372477,
"loss": 0.007,
"step": 425
},
{
"epoch": 0.11337420077779975,
"grad_norm": 0.0211181640625,
"learning_rate": 0.00018878480010555484,
"loss": 0.0875,
"step": 430
},
{
"epoch": 0.11469250543800673,
"grad_norm": 0.9296875,
"learning_rate": 0.0001886528565773849,
"loss": 0.1207,
"step": 435
},
{
"epoch": 0.11601081009821369,
"grad_norm": 1.2734375,
"learning_rate": 0.00018852091304921495,
"loss": 0.1143,
"step": 440
},
{
"epoch": 0.11732911475842067,
"grad_norm": 0.6484375,
"learning_rate": 0.00018838896952104502,
"loss": 0.0393,
"step": 445
},
{
"epoch": 0.11864741941862765,
"grad_norm": 0.1552734375,
"learning_rate": 0.00018825702599287506,
"loss": 0.02,
"step": 450
},
{
"epoch": 0.11996572407883462,
"grad_norm": 0.486328125,
"learning_rate": 0.0001881250824647051,
"loss": 0.0891,
"step": 455
},
{
"epoch": 0.1212840287390416,
"grad_norm": 1.0,
"learning_rate": 0.00018799313893653517,
"loss": 0.0469,
"step": 460
},
{
"epoch": 0.12260233339924857,
"grad_norm": 0.2099609375,
"learning_rate": 0.0001878611954083652,
"loss": 0.019,
"step": 465
},
{
"epoch": 0.12392063805945554,
"grad_norm": 0.03857421875,
"learning_rate": 0.00018772925188019528,
"loss": 0.007,
"step": 470
},
{
"epoch": 0.12523894271966252,
"grad_norm": 0.0257568359375,
"learning_rate": 0.00018759730835202532,
"loss": 0.0039,
"step": 475
},
{
"epoch": 0.12655724737986948,
"grad_norm": 0.014404296875,
"learning_rate": 0.0001874653648238554,
"loss": 0.0043,
"step": 480
},
{
"epoch": 0.12787555204007647,
"grad_norm": 0.51953125,
"learning_rate": 0.00018733342129568546,
"loss": 0.1326,
"step": 485
},
{
"epoch": 0.12919385670028344,
"grad_norm": 0.99609375,
"learning_rate": 0.0001872014777675155,
"loss": 0.0369,
"step": 490
},
{
"epoch": 0.1305121613604904,
"grad_norm": 0.2734375,
"learning_rate": 0.00018706953423934557,
"loss": 0.0395,
"step": 495
},
{
"epoch": 0.1318304660206974,
"grad_norm": 0.083984375,
"learning_rate": 0.00018693759071117561,
"loss": 0.0284,
"step": 500
},
{
"epoch": 0.1318304660206974,
"eval_loss": 0.04542969539761543,
"eval_model_preparation_time": 0.0076,
"eval_runtime": 457.5293,
"eval_samples_per_second": 7.37,
"eval_steps_per_second": 3.685,
"step": 500
},
{
"epoch": 0.13314877068090436,
"grad_norm": 0.0291748046875,
"learning_rate": 0.00018680564718300568,
"loss": 0.0533,
"step": 505
},
{
"epoch": 0.13446707534111133,
"grad_norm": 0.71484375,
"learning_rate": 0.00018667370365483575,
"loss": 0.0183,
"step": 510
},
{
"epoch": 0.13578538000131832,
"grad_norm": 0.018798828125,
"learning_rate": 0.0001865417601266658,
"loss": 0.0473,
"step": 515
},
{
"epoch": 0.13710368466152528,
"grad_norm": 0.388671875,
"learning_rate": 0.00018640981659849586,
"loss": 0.0562,
"step": 520
},
{
"epoch": 0.13842198932173225,
"grad_norm": 0.77734375,
"learning_rate": 0.0001862778730703259,
"loss": 0.0755,
"step": 525
},
{
"epoch": 0.13974029398193924,
"grad_norm": 2.8125,
"learning_rate": 0.00018614592954215598,
"loss": 0.0422,
"step": 530
},
{
"epoch": 0.1410585986421462,
"grad_norm": 0.48828125,
"learning_rate": 0.00018601398601398602,
"loss": 0.0882,
"step": 535
},
{
"epoch": 0.14237690330235317,
"grad_norm": 0.16015625,
"learning_rate": 0.0001858820424858161,
"loss": 0.0131,
"step": 540
},
{
"epoch": 0.14369520796256013,
"grad_norm": 0.31640625,
"learning_rate": 0.00018575009895764616,
"loss": 0.03,
"step": 545
},
{
"epoch": 0.14501351262276713,
"grad_norm": 0.0120849609375,
"learning_rate": 0.0001856181554294762,
"loss": 0.0425,
"step": 550
},
{
"epoch": 0.1463318172829741,
"grad_norm": 0.390625,
"learning_rate": 0.00018548621190130624,
"loss": 0.011,
"step": 555
},
{
"epoch": 0.14765012194318106,
"grad_norm": 1.9609375,
"learning_rate": 0.0001853542683731363,
"loss": 0.0807,
"step": 560
},
{
"epoch": 0.14896842660338805,
"grad_norm": 0.609375,
"learning_rate": 0.00018522232484496635,
"loss": 0.0278,
"step": 565
},
{
"epoch": 0.150286731263595,
"grad_norm": 0.087890625,
"learning_rate": 0.00018509038131679642,
"loss": 0.0484,
"step": 570
},
{
"epoch": 0.15160503592380198,
"grad_norm": 0.5078125,
"learning_rate": 0.00018495843778862646,
"loss": 0.1277,
"step": 575
},
{
"epoch": 0.15292334058400897,
"grad_norm": 0.8125,
"learning_rate": 0.00018482649426045653,
"loss": 0.058,
"step": 580
},
{
"epoch": 0.15424164524421594,
"grad_norm": 0.22265625,
"learning_rate": 0.00018469455073228657,
"loss": 0.0259,
"step": 585
},
{
"epoch": 0.1555599499044229,
"grad_norm": 1.8984375,
"learning_rate": 0.00018456260720411664,
"loss": 0.113,
"step": 590
},
{
"epoch": 0.1568782545646299,
"grad_norm": 0.12451171875,
"learning_rate": 0.0001844306636759467,
"loss": 0.0312,
"step": 595
},
{
"epoch": 0.15819655922483686,
"grad_norm": 0.0322265625,
"learning_rate": 0.00018429872014777676,
"loss": 0.0476,
"step": 600
},
{
"epoch": 0.15951486388504382,
"grad_norm": 0.0281982421875,
"learning_rate": 0.00018416677661960682,
"loss": 0.0232,
"step": 605
},
{
"epoch": 0.16083316854525082,
"grad_norm": 0.57421875,
"learning_rate": 0.00018403483309143687,
"loss": 0.1287,
"step": 610
},
{
"epoch": 0.16215147320545778,
"grad_norm": 0.765625,
"learning_rate": 0.00018390288956326694,
"loss": 0.0991,
"step": 615
},
{
"epoch": 0.16346977786566474,
"grad_norm": 0.3125,
"learning_rate": 0.00018377094603509698,
"loss": 0.0247,
"step": 620
},
{
"epoch": 0.16478808252587174,
"grad_norm": 0.37890625,
"learning_rate": 0.00018363900250692705,
"loss": 0.0632,
"step": 625
},
{
"epoch": 0.1661063871860787,
"grad_norm": 0.1494140625,
"learning_rate": 0.00018350705897875712,
"loss": 0.0314,
"step": 630
},
{
"epoch": 0.16742469184628567,
"grad_norm": 0.0673828125,
"learning_rate": 0.00018337511545058716,
"loss": 0.0425,
"step": 635
},
{
"epoch": 0.16874299650649266,
"grad_norm": 0.396484375,
"learning_rate": 0.00018324317192241723,
"loss": 0.0613,
"step": 640
},
{
"epoch": 0.17006130116669962,
"grad_norm": 0.057373046875,
"learning_rate": 0.00018311122839424727,
"loss": 0.0569,
"step": 645
},
{
"epoch": 0.1713796058269066,
"grad_norm": 0.001373291015625,
"learning_rate": 0.00018297928486607734,
"loss": 0.007,
"step": 650
},
{
"epoch": 0.17269791048711358,
"grad_norm": 1.0859375,
"learning_rate": 0.00018284734133790738,
"loss": 0.0189,
"step": 655
},
{
"epoch": 0.17401621514732055,
"grad_norm": 0.6015625,
"learning_rate": 0.00018271539780973742,
"loss": 0.0601,
"step": 660
},
{
"epoch": 0.1753345198075275,
"grad_norm": 0.25390625,
"learning_rate": 0.0001825834542815675,
"loss": 0.0211,
"step": 665
},
{
"epoch": 0.1766528244677345,
"grad_norm": 2.6875,
"learning_rate": 0.00018245151075339753,
"loss": 0.0713,
"step": 670
},
{
"epoch": 0.17797112912794147,
"grad_norm": 1.1875,
"learning_rate": 0.0001823195672252276,
"loss": 0.0522,
"step": 675
},
{
"epoch": 0.17928943378814843,
"grad_norm": 0.025146484375,
"learning_rate": 0.00018218762369705767,
"loss": 0.0242,
"step": 680
},
{
"epoch": 0.18060773844835543,
"grad_norm": 0.048095703125,
"learning_rate": 0.00018205568016888772,
"loss": 0.0129,
"step": 685
},
{
"epoch": 0.1819260431085624,
"grad_norm": 0.04541015625,
"learning_rate": 0.00018192373664071778,
"loss": 0.0142,
"step": 690
},
{
"epoch": 0.18324434776876936,
"grad_norm": 0.00830078125,
"learning_rate": 0.00018179179311254783,
"loss": 0.0121,
"step": 695
},
{
"epoch": 0.18456265242897635,
"grad_norm": 0.53125,
"learning_rate": 0.0001816598495843779,
"loss": 0.0163,
"step": 700
},
{
"epoch": 0.1858809570891833,
"grad_norm": 0.185546875,
"learning_rate": 0.00018152790605620796,
"loss": 0.0203,
"step": 705
},
{
"epoch": 0.18719926174939028,
"grad_norm": 1.2578125,
"learning_rate": 0.000181395962528038,
"loss": 0.1548,
"step": 710
},
{
"epoch": 0.18851756640959727,
"grad_norm": 0.0247802734375,
"learning_rate": 0.00018126401899986808,
"loss": 0.0543,
"step": 715
},
{
"epoch": 0.18983587106980424,
"grad_norm": 0.07568359375,
"learning_rate": 0.00018113207547169812,
"loss": 0.0346,
"step": 720
},
{
"epoch": 0.1911541757300112,
"grad_norm": 0.1318359375,
"learning_rate": 0.0001810001319435282,
"loss": 0.03,
"step": 725
},
{
"epoch": 0.1924724803902182,
"grad_norm": 0.1455078125,
"learning_rate": 0.00018086818841535823,
"loss": 0.0796,
"step": 730
},
{
"epoch": 0.19379078505042516,
"grad_norm": 0.09814453125,
"learning_rate": 0.0001807362448871883,
"loss": 0.0662,
"step": 735
},
{
"epoch": 0.19510908971063212,
"grad_norm": 0.91015625,
"learning_rate": 0.00018060430135901837,
"loss": 0.0675,
"step": 740
},
{
"epoch": 0.19642739437083911,
"grad_norm": 0.10693359375,
"learning_rate": 0.0001804723578308484,
"loss": 0.0377,
"step": 745
},
{
"epoch": 0.19774569903104608,
"grad_norm": 0.95703125,
"learning_rate": 0.00018034041430267848,
"loss": 0.0174,
"step": 750
},
{
"epoch": 0.19906400369125304,
"grad_norm": 1.7890625,
"learning_rate": 0.00018020847077450852,
"loss": 0.0278,
"step": 755
},
{
"epoch": 0.20038230835146,
"grad_norm": 0.8515625,
"learning_rate": 0.00018007652724633856,
"loss": 0.0113,
"step": 760
},
{
"epoch": 0.201700613011667,
"grad_norm": 0.016845703125,
"learning_rate": 0.00017994458371816863,
"loss": 0.0589,
"step": 765
},
{
"epoch": 0.20301891767187397,
"grad_norm": 0.01043701171875,
"learning_rate": 0.00017981264018999867,
"loss": 0.0203,
"step": 770
},
{
"epoch": 0.20433722233208093,
"grad_norm": 0.0242919921875,
"learning_rate": 0.00017968069666182874,
"loss": 0.0494,
"step": 775
},
{
"epoch": 0.20565552699228792,
"grad_norm": 0.56640625,
"learning_rate": 0.00017954875313365879,
"loss": 0.0394,
"step": 780
},
{
"epoch": 0.2069738316524949,
"grad_norm": 0.06591796875,
"learning_rate": 0.00017941680960548886,
"loss": 0.0848,
"step": 785
},
{
"epoch": 0.20829213631270185,
"grad_norm": 0.40234375,
"learning_rate": 0.00017928486607731892,
"loss": 0.0464,
"step": 790
},
{
"epoch": 0.20961044097290885,
"grad_norm": 0.06298828125,
"learning_rate": 0.00017915292254914897,
"loss": 0.0222,
"step": 795
},
{
"epoch": 0.2109287456331158,
"grad_norm": 0.5390625,
"learning_rate": 0.00017902097902097904,
"loss": 0.0434,
"step": 800
},
{
"epoch": 0.21224705029332278,
"grad_norm": 1.390625,
"learning_rate": 0.00017888903549280908,
"loss": 0.0222,
"step": 805
},
{
"epoch": 0.21356535495352977,
"grad_norm": 0.0272216796875,
"learning_rate": 0.00017875709196463915,
"loss": 0.0099,
"step": 810
},
{
"epoch": 0.21488365961373673,
"grad_norm": 0.10009765625,
"learning_rate": 0.0001786251484364692,
"loss": 0.0086,
"step": 815
},
{
"epoch": 0.2162019642739437,
"grad_norm": 0.06396484375,
"learning_rate": 0.00017849320490829926,
"loss": 0.0715,
"step": 820
},
{
"epoch": 0.2175202689341507,
"grad_norm": 0.365234375,
"learning_rate": 0.00017836126138012933,
"loss": 0.0642,
"step": 825
},
{
"epoch": 0.21883857359435765,
"grad_norm": 0.01519775390625,
"learning_rate": 0.00017822931785195937,
"loss": 0.0111,
"step": 830
},
{
"epoch": 0.22015687825456462,
"grad_norm": 1.1640625,
"learning_rate": 0.00017809737432378944,
"loss": 0.0518,
"step": 835
},
{
"epoch": 0.2214751829147716,
"grad_norm": 0.00921630859375,
"learning_rate": 0.00017796543079561948,
"loss": 0.0384,
"step": 840
},
{
"epoch": 0.22279348757497858,
"grad_norm": 0.33984375,
"learning_rate": 0.00017783348726744955,
"loss": 0.0204,
"step": 845
},
{
"epoch": 0.22411179223518554,
"grad_norm": 0.294921875,
"learning_rate": 0.00017770154373927962,
"loss": 0.0075,
"step": 850
},
{
"epoch": 0.22543009689539253,
"grad_norm": 0.033203125,
"learning_rate": 0.00017756960021110963,
"loss": 0.0895,
"step": 855
},
{
"epoch": 0.2267484015555995,
"grad_norm": 0.08056640625,
"learning_rate": 0.0001774376566829397,
"loss": 0.1039,
"step": 860
},
{
"epoch": 0.22806670621580646,
"grad_norm": 0.55078125,
"learning_rate": 0.00017730571315476975,
"loss": 0.0125,
"step": 865
},
{
"epoch": 0.22938501087601346,
"grad_norm": 0.5859375,
"learning_rate": 0.00017717376962659982,
"loss": 0.0381,
"step": 870
},
{
"epoch": 0.23070331553622042,
"grad_norm": 0.029052734375,
"learning_rate": 0.00017704182609842988,
"loss": 0.0434,
"step": 875
},
{
"epoch": 0.23202162019642739,
"grad_norm": 0.43359375,
"learning_rate": 0.00017690988257025993,
"loss": 0.0799,
"step": 880
},
{
"epoch": 0.23333992485663438,
"grad_norm": 0.04150390625,
"learning_rate": 0.00017677793904209,
"loss": 0.0692,
"step": 885
},
{
"epoch": 0.23465822951684134,
"grad_norm": 0.435546875,
"learning_rate": 0.00017664599551392004,
"loss": 0.0544,
"step": 890
},
{
"epoch": 0.2359765341770483,
"grad_norm": 1.171875,
"learning_rate": 0.0001765140519857501,
"loss": 0.0619,
"step": 895
},
{
"epoch": 0.2372948388372553,
"grad_norm": 0.01263427734375,
"learning_rate": 0.00017638210845758018,
"loss": 0.0418,
"step": 900
},
{
"epoch": 0.23861314349746227,
"grad_norm": 0.017578125,
"learning_rate": 0.00017625016492941022,
"loss": 0.0195,
"step": 905
},
{
"epoch": 0.23993144815766923,
"grad_norm": 0.6171875,
"learning_rate": 0.0001761182214012403,
"loss": 0.067,
"step": 910
},
{
"epoch": 0.24124975281787622,
"grad_norm": 0.59765625,
"learning_rate": 0.00017598627787307033,
"loss": 0.049,
"step": 915
},
{
"epoch": 0.2425680574780832,
"grad_norm": 1.2421875,
"learning_rate": 0.0001758543343449004,
"loss": 0.0539,
"step": 920
},
{
"epoch": 0.24388636213829015,
"grad_norm": 0.10302734375,
"learning_rate": 0.00017572239081673044,
"loss": 0.0725,
"step": 925
},
{
"epoch": 0.24520466679849715,
"grad_norm": 0.330078125,
"learning_rate": 0.0001755904472885605,
"loss": 0.064,
"step": 930
},
{
"epoch": 0.2465229714587041,
"grad_norm": 0.220703125,
"learning_rate": 0.00017545850376039058,
"loss": 0.0271,
"step": 935
},
{
"epoch": 0.24784127611891107,
"grad_norm": 0.01470947265625,
"learning_rate": 0.00017532656023222062,
"loss": 0.0247,
"step": 940
},
{
"epoch": 0.24915958077911807,
"grad_norm": 0.013427734375,
"learning_rate": 0.0001751946167040507,
"loss": 0.017,
"step": 945
},
{
"epoch": 0.25047788543932503,
"grad_norm": 0.58984375,
"learning_rate": 0.00017506267317588073,
"loss": 0.0254,
"step": 950
},
{
"epoch": 0.251796190099532,
"grad_norm": 0.412109375,
"learning_rate": 0.00017493072964771078,
"loss": 0.0186,
"step": 955
},
{
"epoch": 0.25311449475973896,
"grad_norm": 0.66796875,
"learning_rate": 0.00017479878611954084,
"loss": 0.0617,
"step": 960
},
{
"epoch": 0.25443279941994595,
"grad_norm": 0.322265625,
"learning_rate": 0.00017466684259137089,
"loss": 0.0173,
"step": 965
},
{
"epoch": 0.25575110408015295,
"grad_norm": 0.83203125,
"learning_rate": 0.00017453489906320096,
"loss": 0.0512,
"step": 970
},
{
"epoch": 0.2570694087403599,
"grad_norm": 0.08447265625,
"learning_rate": 0.000174402955535031,
"loss": 0.0361,
"step": 975
},
{
"epoch": 0.2583877134005669,
"grad_norm": 0.423828125,
"learning_rate": 0.00017427101200686107,
"loss": 0.0175,
"step": 980
},
{
"epoch": 0.25970601806077387,
"grad_norm": 0.77734375,
"learning_rate": 0.00017413906847869114,
"loss": 0.0139,
"step": 985
},
{
"epoch": 0.2610243227209808,
"grad_norm": 0.515625,
"learning_rate": 0.00017400712495052118,
"loss": 0.0948,
"step": 990
},
{
"epoch": 0.2623426273811878,
"grad_norm": 1.421875,
"learning_rate": 0.00017387518142235125,
"loss": 0.0406,
"step": 995
},
{
"epoch": 0.2636609320413948,
"grad_norm": 0.058837890625,
"learning_rate": 0.0001737432378941813,
"loss": 0.1011,
"step": 1000
},
{
"epoch": 0.2636609320413948,
"eval_loss": 0.045552924275398254,
"eval_model_preparation_time": 0.0076,
"eval_runtime": 457.6113,
"eval_samples_per_second": 7.369,
"eval_steps_per_second": 3.684,
"step": 1000
},
{
"epoch": 0.26497923670160173,
"grad_norm": 0.380859375,
"learning_rate": 0.00017361129436601136,
"loss": 0.0711,
"step": 1005
},
{
"epoch": 0.2662975413618087,
"grad_norm": 0.0208740234375,
"learning_rate": 0.00017347935083784143,
"loss": 0.0218,
"step": 1010
},
{
"epoch": 0.2676158460220157,
"grad_norm": 0.04345703125,
"learning_rate": 0.00017334740730967147,
"loss": 0.0301,
"step": 1015
},
{
"epoch": 0.26893415068222265,
"grad_norm": 0.2734375,
"learning_rate": 0.00017321546378150154,
"loss": 0.0721,
"step": 1020
},
{
"epoch": 0.27025245534242964,
"grad_norm": 0.25390625,
"learning_rate": 0.00017308352025333158,
"loss": 0.0363,
"step": 1025
},
{
"epoch": 0.27157076000263664,
"grad_norm": 0.04345703125,
"learning_rate": 0.00017295157672516165,
"loss": 0.0313,
"step": 1030
},
{
"epoch": 0.2728890646628436,
"grad_norm": 0.0211181640625,
"learning_rate": 0.0001728196331969917,
"loss": 0.0385,
"step": 1035
},
{
"epoch": 0.27420736932305056,
"grad_norm": 0.00787353515625,
"learning_rate": 0.00017268768966882176,
"loss": 0.0405,
"step": 1040
},
{
"epoch": 0.27552567398325756,
"grad_norm": 0.484375,
"learning_rate": 0.00017255574614065183,
"loss": 0.0616,
"step": 1045
},
{
"epoch": 0.2768439786434645,
"grad_norm": 0.0908203125,
"learning_rate": 0.00017242380261248185,
"loss": 0.0057,
"step": 1050
},
{
"epoch": 0.2781622833036715,
"grad_norm": 0.1904296875,
"learning_rate": 0.00017229185908431192,
"loss": 0.0417,
"step": 1055
},
{
"epoch": 0.2794805879638785,
"grad_norm": 0.30078125,
"learning_rate": 0.00017215991555614196,
"loss": 0.0346,
"step": 1060
},
{
"epoch": 0.2807988926240854,
"grad_norm": 0.016357421875,
"learning_rate": 0.00017202797202797203,
"loss": 0.0295,
"step": 1065
},
{
"epoch": 0.2821171972842924,
"grad_norm": 0.490234375,
"learning_rate": 0.0001718960284998021,
"loss": 0.0448,
"step": 1070
},
{
"epoch": 0.28343550194449935,
"grad_norm": 0.004241943359375,
"learning_rate": 0.00017176408497163214,
"loss": 0.0051,
"step": 1075
},
{
"epoch": 0.28475380660470634,
"grad_norm": 0.01904296875,
"learning_rate": 0.0001716321414434622,
"loss": 0.0894,
"step": 1080
},
{
"epoch": 0.28607211126491333,
"grad_norm": 0.83984375,
"learning_rate": 0.00017150019791529225,
"loss": 0.0288,
"step": 1085
},
{
"epoch": 0.28739041592512027,
"grad_norm": 0.2021484375,
"learning_rate": 0.00017136825438712232,
"loss": 0.0222,
"step": 1090
},
{
"epoch": 0.28870872058532726,
"grad_norm": 0.322265625,
"learning_rate": 0.0001712363108589524,
"loss": 0.0444,
"step": 1095
},
{
"epoch": 0.29002702524553425,
"grad_norm": 0.408203125,
"learning_rate": 0.00017110436733078243,
"loss": 0.0828,
"step": 1100
},
{
"epoch": 0.2913453299057412,
"grad_norm": 0.04052734375,
"learning_rate": 0.0001709724238026125,
"loss": 0.0725,
"step": 1105
},
{
"epoch": 0.2926636345659482,
"grad_norm": 0.2578125,
"learning_rate": 0.00017084048027444254,
"loss": 0.0204,
"step": 1110
},
{
"epoch": 0.2939819392261552,
"grad_norm": 0.67578125,
"learning_rate": 0.0001707085367462726,
"loss": 0.0503,
"step": 1115
},
{
"epoch": 0.2953002438863621,
"grad_norm": 0.0059814453125,
"learning_rate": 0.00017057659321810265,
"loss": 0.0144,
"step": 1120
},
{
"epoch": 0.2966185485465691,
"grad_norm": 0.0269775390625,
"learning_rate": 0.00017044464968993272,
"loss": 0.0044,
"step": 1125
},
{
"epoch": 0.2979368532067761,
"grad_norm": 0.1396484375,
"learning_rate": 0.0001703127061617628,
"loss": 0.013,
"step": 1130
},
{
"epoch": 0.29925515786698303,
"grad_norm": 0.287109375,
"learning_rate": 0.00017018076263359283,
"loss": 0.0245,
"step": 1135
},
{
"epoch": 0.30057346252719,
"grad_norm": 0.26171875,
"learning_rate": 0.0001700488191054229,
"loss": 0.0247,
"step": 1140
},
{
"epoch": 0.301891767187397,
"grad_norm": 0.40625,
"learning_rate": 0.00016991687557725294,
"loss": 0.0402,
"step": 1145
},
{
"epoch": 0.30321007184760396,
"grad_norm": 1.2578125,
"learning_rate": 0.000169784932049083,
"loss": 0.0071,
"step": 1150
},
{
"epoch": 0.30452837650781095,
"grad_norm": 0.330078125,
"learning_rate": 0.00016965298852091306,
"loss": 0.0177,
"step": 1155
},
{
"epoch": 0.30584668116801794,
"grad_norm": 0.07275390625,
"learning_rate": 0.0001695210449927431,
"loss": 0.0029,
"step": 1160
},
{
"epoch": 0.3071649858282249,
"grad_norm": 0.455078125,
"learning_rate": 0.00016938910146457317,
"loss": 0.0262,
"step": 1165
},
{
"epoch": 0.30848329048843187,
"grad_norm": 0.002655029296875,
"learning_rate": 0.0001692571579364032,
"loss": 0.0346,
"step": 1170
},
{
"epoch": 0.30980159514863886,
"grad_norm": 0.1748046875,
"learning_rate": 0.00016912521440823328,
"loss": 0.0494,
"step": 1175
},
{
"epoch": 0.3111198998088458,
"grad_norm": 1.4609375,
"learning_rate": 0.00016899327088006335,
"loss": 0.0603,
"step": 1180
},
{
"epoch": 0.3124382044690528,
"grad_norm": 0.1572265625,
"learning_rate": 0.0001688613273518934,
"loss": 0.0366,
"step": 1185
},
{
"epoch": 0.3137565091292598,
"grad_norm": 0.01422119140625,
"learning_rate": 0.00016872938382372346,
"loss": 0.0678,
"step": 1190
},
{
"epoch": 0.3150748137894667,
"grad_norm": 0.2412109375,
"learning_rate": 0.0001685974402955535,
"loss": 0.0359,
"step": 1195
},
{
"epoch": 0.3163931184496737,
"grad_norm": 0.275390625,
"learning_rate": 0.00016846549676738357,
"loss": 0.1099,
"step": 1200
},
{
"epoch": 0.3177114231098807,
"grad_norm": 0.212890625,
"learning_rate": 0.00016833355323921364,
"loss": 0.0343,
"step": 1205
},
{
"epoch": 0.31902972777008765,
"grad_norm": 0.0302734375,
"learning_rate": 0.00016820160971104368,
"loss": 0.0138,
"step": 1210
},
{
"epoch": 0.32034803243029464,
"grad_norm": 0.016845703125,
"learning_rate": 0.00016806966618287375,
"loss": 0.0202,
"step": 1215
},
{
"epoch": 0.32166633709050163,
"grad_norm": 0.1474609375,
"learning_rate": 0.0001679377226547038,
"loss": 0.0442,
"step": 1220
},
{
"epoch": 0.32298464175070857,
"grad_norm": 0.049072265625,
"learning_rate": 0.00016780577912653386,
"loss": 0.0375,
"step": 1225
},
{
"epoch": 0.32430294641091556,
"grad_norm": 0.1337890625,
"learning_rate": 0.0001676738355983639,
"loss": 0.01,
"step": 1230
},
{
"epoch": 0.32562125107112255,
"grad_norm": 0.02197265625,
"learning_rate": 0.00016754189207019397,
"loss": 0.0139,
"step": 1235
},
{
"epoch": 0.3269395557313295,
"grad_norm": 0.09228515625,
"learning_rate": 0.00016740994854202404,
"loss": 0.014,
"step": 1240
},
{
"epoch": 0.3282578603915365,
"grad_norm": 0.47265625,
"learning_rate": 0.00016727800501385408,
"loss": 0.1546,
"step": 1245
},
{
"epoch": 0.3295761650517435,
"grad_norm": 0.02294921875,
"learning_rate": 0.00016714606148568413,
"loss": 0.0803,
"step": 1250
},
{
"epoch": 0.3308944697119504,
"grad_norm": 0.185546875,
"learning_rate": 0.00016701411795751417,
"loss": 0.0376,
"step": 1255
},
{
"epoch": 0.3322127743721574,
"grad_norm": 0.1123046875,
"learning_rate": 0.00016688217442934424,
"loss": 0.0375,
"step": 1260
},
{
"epoch": 0.3335310790323644,
"grad_norm": 1.03125,
"learning_rate": 0.0001667502309011743,
"loss": 0.0442,
"step": 1265
},
{
"epoch": 0.33484938369257133,
"grad_norm": 0.0172119140625,
"learning_rate": 0.00016661828737300435,
"loss": 0.0261,
"step": 1270
},
{
"epoch": 0.3361676883527783,
"grad_norm": 0.42578125,
"learning_rate": 0.00016648634384483442,
"loss": 0.0553,
"step": 1275
},
{
"epoch": 0.3374859930129853,
"grad_norm": 0.1328125,
"learning_rate": 0.00016635440031666446,
"loss": 0.0065,
"step": 1280
},
{
"epoch": 0.33880429767319226,
"grad_norm": 0.263671875,
"learning_rate": 0.00016622245678849453,
"loss": 0.0527,
"step": 1285
},
{
"epoch": 0.34012260233339925,
"grad_norm": 0.314453125,
"learning_rate": 0.0001660905132603246,
"loss": 0.0297,
"step": 1290
},
{
"epoch": 0.34144090699360624,
"grad_norm": 0.04345703125,
"learning_rate": 0.00016595856973215464,
"loss": 0.0477,
"step": 1295
},
{
"epoch": 0.3427592116538132,
"grad_norm": 0.08154296875,
"learning_rate": 0.0001658266262039847,
"loss": 0.0298,
"step": 1300
},
{
"epoch": 0.34407751631402017,
"grad_norm": 0.08935546875,
"learning_rate": 0.00016569468267581475,
"loss": 0.0481,
"step": 1305
},
{
"epoch": 0.34539582097422716,
"grad_norm": 0.06640625,
"learning_rate": 0.00016556273914764482,
"loss": 0.0153,
"step": 1310
},
{
"epoch": 0.3467141256344341,
"grad_norm": 0.00592041015625,
"learning_rate": 0.00016543079561947486,
"loss": 0.0111,
"step": 1315
},
{
"epoch": 0.3480324302946411,
"grad_norm": 0.2236328125,
"learning_rate": 0.00016529885209130493,
"loss": 0.0309,
"step": 1320
},
{
"epoch": 0.3493507349548481,
"grad_norm": 0.0198974609375,
"learning_rate": 0.000165166908563135,
"loss": 0.0579,
"step": 1325
},
{
"epoch": 0.350669039615055,
"grad_norm": 0.10107421875,
"learning_rate": 0.00016503496503496504,
"loss": 0.0055,
"step": 1330
},
{
"epoch": 0.351987344275262,
"grad_norm": 0.71875,
"learning_rate": 0.00016490302150679511,
"loss": 0.0299,
"step": 1335
},
{
"epoch": 0.353305648935469,
"grad_norm": 0.01348876953125,
"learning_rate": 0.00016477107797862516,
"loss": 0.0943,
"step": 1340
},
{
"epoch": 0.35462395359567594,
"grad_norm": 0.3046875,
"learning_rate": 0.00016463913445045523,
"loss": 0.0216,
"step": 1345
},
{
"epoch": 0.35594225825588294,
"grad_norm": 0.02392578125,
"learning_rate": 0.00016450719092228527,
"loss": 0.0265,
"step": 1350
},
{
"epoch": 0.35726056291608993,
"grad_norm": 0.453125,
"learning_rate": 0.0001643752473941153,
"loss": 0.0539,
"step": 1355
},
{
"epoch": 0.35857886757629687,
"grad_norm": 0.00823974609375,
"learning_rate": 0.00016424330386594538,
"loss": 0.0139,
"step": 1360
},
{
"epoch": 0.35989717223650386,
"grad_norm": 0.55859375,
"learning_rate": 0.00016411136033777542,
"loss": 0.0428,
"step": 1365
},
{
"epoch": 0.36121547689671085,
"grad_norm": 0.052734375,
"learning_rate": 0.0001639794168096055,
"loss": 0.0346,
"step": 1370
},
{
"epoch": 0.3625337815569178,
"grad_norm": 0.12158203125,
"learning_rate": 0.00016384747328143556,
"loss": 0.0095,
"step": 1375
},
{
"epoch": 0.3638520862171248,
"grad_norm": 0.0240478515625,
"learning_rate": 0.0001637155297532656,
"loss": 0.0224,
"step": 1380
},
{
"epoch": 0.3651703908773318,
"grad_norm": 0.01318359375,
"learning_rate": 0.00016358358622509567,
"loss": 0.0316,
"step": 1385
},
{
"epoch": 0.3664886955375387,
"grad_norm": 0.011962890625,
"learning_rate": 0.0001634516426969257,
"loss": 0.0051,
"step": 1390
},
{
"epoch": 0.3678070001977457,
"grad_norm": 0.00396728515625,
"learning_rate": 0.00016331969916875578,
"loss": 0.038,
"step": 1395
},
{
"epoch": 0.3691253048579527,
"grad_norm": 0.375,
"learning_rate": 0.00016318775564058585,
"loss": 0.029,
"step": 1400
},
{
"epoch": 0.37044360951815963,
"grad_norm": 0.265625,
"learning_rate": 0.0001630558121124159,
"loss": 0.0072,
"step": 1405
},
{
"epoch": 0.3717619141783666,
"grad_norm": 0.00127410888671875,
"learning_rate": 0.00016292386858424596,
"loss": 0.0381,
"step": 1410
},
{
"epoch": 0.3730802188385736,
"grad_norm": 1.15625,
"learning_rate": 0.000162791925056076,
"loss": 0.0573,
"step": 1415
},
{
"epoch": 0.37439852349878056,
"grad_norm": 0.0244140625,
"learning_rate": 0.00016265998152790607,
"loss": 0.051,
"step": 1420
},
{
"epoch": 0.37571682815898755,
"grad_norm": 0.0015106201171875,
"learning_rate": 0.00016252803799973612,
"loss": 0.0239,
"step": 1425
},
{
"epoch": 0.37703513281919454,
"grad_norm": 0.26953125,
"learning_rate": 0.00016239609447156618,
"loss": 0.0165,
"step": 1430
},
{
"epoch": 0.3783534374794015,
"grad_norm": 0.006134033203125,
"learning_rate": 0.00016226415094339625,
"loss": 0.0071,
"step": 1435
},
{
"epoch": 0.37967174213960847,
"grad_norm": 2.828125,
"learning_rate": 0.0001621322074152263,
"loss": 0.0272,
"step": 1440
},
{
"epoch": 0.38099004679981546,
"grad_norm": 0.349609375,
"learning_rate": 0.00016200026388705637,
"loss": 0.0647,
"step": 1445
},
{
"epoch": 0.3823083514600224,
"grad_norm": 0.09326171875,
"learning_rate": 0.00016186832035888638,
"loss": 0.0262,
"step": 1450
},
{
"epoch": 0.3836266561202294,
"grad_norm": 0.041015625,
"learning_rate": 0.00016173637683071645,
"loss": 0.0576,
"step": 1455
},
{
"epoch": 0.3849449607804364,
"grad_norm": 0.033935546875,
"learning_rate": 0.00016160443330254652,
"loss": 0.0142,
"step": 1460
},
{
"epoch": 0.3862632654406433,
"grad_norm": 0.09130859375,
"learning_rate": 0.00016147248977437656,
"loss": 0.0348,
"step": 1465
},
{
"epoch": 0.3875815701008503,
"grad_norm": 2.390625,
"learning_rate": 0.00016134054624620663,
"loss": 0.0672,
"step": 1470
},
{
"epoch": 0.3888998747610573,
"grad_norm": 0.439453125,
"learning_rate": 0.00016120860271803667,
"loss": 0.0121,
"step": 1475
},
{
"epoch": 0.39021817942126424,
"grad_norm": 0.1298828125,
"learning_rate": 0.00016107665918986674,
"loss": 0.0114,
"step": 1480
},
{
"epoch": 0.39153648408147124,
"grad_norm": 0.85546875,
"learning_rate": 0.0001609447156616968,
"loss": 0.0968,
"step": 1485
},
{
"epoch": 0.39285478874167823,
"grad_norm": 0.703125,
"learning_rate": 0.00016081277213352685,
"loss": 0.0349,
"step": 1490
},
{
"epoch": 0.39417309340188517,
"grad_norm": 0.021728515625,
"learning_rate": 0.00016068082860535692,
"loss": 0.0106,
"step": 1495
},
{
"epoch": 0.39549139806209216,
"grad_norm": 0.7265625,
"learning_rate": 0.00016054888507718696,
"loss": 0.0225,
"step": 1500
},
{
"epoch": 0.39549139806209216,
"eval_loss": 0.03515048325061798,
"eval_model_preparation_time": 0.0076,
"eval_runtime": 457.3497,
"eval_samples_per_second": 7.373,
"eval_steps_per_second": 3.686,
"step": 1500
},
{
"epoch": 0.3968097027222991,
"grad_norm": 0.016519820317626,
"learning_rate": 0.00016041694154901703,
"loss": 0.0202,
"step": 1505
},
{
"epoch": 0.3981280073825061,
"grad_norm": 0.8505942225456238,
"learning_rate": 0.00016028499802084708,
"loss": 0.0541,
"step": 1510
},
{
"epoch": 0.3994463120427131,
"grad_norm": 0.04163295030593872,
"learning_rate": 0.00016015305449267714,
"loss": 0.0037,
"step": 1515
},
{
"epoch": 0.40076461670292,
"grad_norm": 0.011332935653626919,
"learning_rate": 0.00016002111096450721,
"loss": 0.0459,
"step": 1520
},
{
"epoch": 0.402082921363127,
"grad_norm": 0.9360129833221436,
"learning_rate": 0.00015988916743633726,
"loss": 0.013,
"step": 1525
},
{
"epoch": 0.403401226023334,
"grad_norm": 0.11991436779499054,
"learning_rate": 0.00015975722390816733,
"loss": 0.0079,
"step": 1530
},
{
"epoch": 0.40471953068354094,
"grad_norm": 0.36911076307296753,
"learning_rate": 0.00015962528037999737,
"loss": 0.0638,
"step": 1535
},
{
"epoch": 0.40603783534374793,
"grad_norm": 0.020278634503483772,
"learning_rate": 0.00015949333685182744,
"loss": 0.0217,
"step": 1540
},
{
"epoch": 0.4073561400039549,
"grad_norm": 0.14263059198856354,
"learning_rate": 0.0001593613933236575,
"loss": 0.0495,
"step": 1545
},
{
"epoch": 0.40867444466416186,
"grad_norm": 0.09494803845882416,
"learning_rate": 0.00015922944979548752,
"loss": 0.0248,
"step": 1550
},
{
"epoch": 0.40999274932436885,
"grad_norm": 0.23064319789409637,
"learning_rate": 0.0001590975062673176,
"loss": 0.0285,
"step": 1555
},
{
"epoch": 0.41131105398457585,
"grad_norm": 0.32220256328582764,
"learning_rate": 0.00015896556273914763,
"loss": 0.0537,
"step": 1560
},
{
"epoch": 0.4126293586447828,
"grad_norm": 0.41208815574645996,
"learning_rate": 0.0001588336192109777,
"loss": 0.0453,
"step": 1565
},
{
"epoch": 0.4139476633049898,
"grad_norm": 0.03775424137711525,
"learning_rate": 0.00015870167568280777,
"loss": 0.0134,
"step": 1570
},
{
"epoch": 0.41526596796519677,
"grad_norm": 0.6526333093643188,
"learning_rate": 0.0001585697321546378,
"loss": 0.0329,
"step": 1575
},
{
"epoch": 0.4165842726254037,
"grad_norm": 1.001305103302002,
"learning_rate": 0.00015843778862646788,
"loss": 0.0912,
"step": 1580
},
{
"epoch": 0.4179025772856107,
"grad_norm": 0.4055219888687134,
"learning_rate": 0.00015830584509829792,
"loss": 0.0519,
"step": 1585
},
{
"epoch": 0.4192208819458177,
"grad_norm": 0.035015616565942764,
"learning_rate": 0.000158173901570128,
"loss": 0.0191,
"step": 1590
},
{
"epoch": 0.42053918660602463,
"grad_norm": 0.09326844662427902,
"learning_rate": 0.00015804195804195806,
"loss": 0.0106,
"step": 1595
},
{
"epoch": 0.4218574912662316,
"grad_norm": 0.06223440542817116,
"learning_rate": 0.0001579100145137881,
"loss": 0.0113,
"step": 1600
},
{
"epoch": 0.4231757959264386,
"grad_norm": 0.0625135526061058,
"learning_rate": 0.00015777807098561817,
"loss": 0.0191,
"step": 1605
},
{
"epoch": 0.42449410058664555,
"grad_norm": 0.2645983099937439,
"learning_rate": 0.00015764612745744822,
"loss": 0.0829,
"step": 1610
},
{
"epoch": 0.42581240524685254,
"grad_norm": 0.009632415138185024,
"learning_rate": 0.00015751418392927829,
"loss": 0.0542,
"step": 1615
},
{
"epoch": 0.42713070990705954,
"grad_norm": 0.01979319378733635,
"learning_rate": 0.00015738224040110833,
"loss": 0.0517,
"step": 1620
},
{
"epoch": 0.4284490145672665,
"grad_norm": 0.3065454065799713,
"learning_rate": 0.0001572502968729384,
"loss": 0.0738,
"step": 1625
},
{
"epoch": 0.42976731922747347,
"grad_norm": 0.09581473469734192,
"learning_rate": 0.00015711835334476847,
"loss": 0.0571,
"step": 1630
},
{
"epoch": 0.43108562388768046,
"grad_norm": 0.23746591806411743,
"learning_rate": 0.0001569864098165985,
"loss": 0.0128,
"step": 1635
},
{
"epoch": 0.4324039285478874,
"grad_norm": 0.936278760433197,
"learning_rate": 0.00015685446628842858,
"loss": 0.0665,
"step": 1640
},
{
"epoch": 0.4337222332080944,
"grad_norm": 0.18487441539764404,
"learning_rate": 0.00015672252276025862,
"loss": 0.0527,
"step": 1645
},
{
"epoch": 0.4350405378683014,
"grad_norm": 0.6980624794960022,
"learning_rate": 0.00015659057923208866,
"loss": 0.0613,
"step": 1650
},
{
"epoch": 0.4363588425285083,
"grad_norm": 0.4696301221847534,
"learning_rate": 0.00015645863570391873,
"loss": 0.0569,
"step": 1655
},
{
"epoch": 0.4376771471887153,
"grad_norm": 0.15083105862140656,
"learning_rate": 0.00015632669217574877,
"loss": 0.0394,
"step": 1660
},
{
"epoch": 0.4389954518489223,
"grad_norm": 0.44701239466667175,
"learning_rate": 0.00015619474864757884,
"loss": 0.0494,
"step": 1665
},
{
"epoch": 0.44031375650912924,
"grad_norm": 0.07418403029441833,
"learning_rate": 0.00015606280511940888,
"loss": 0.0291,
"step": 1670
},
{
"epoch": 0.44163206116933623,
"grad_norm": 0.02311861515045166,
"learning_rate": 0.00015593086159123895,
"loss": 0.0304,
"step": 1675
},
{
"epoch": 0.4429503658295432,
"grad_norm": 0.4416038990020752,
"learning_rate": 0.00015579891806306902,
"loss": 0.0176,
"step": 1680
},
{
"epoch": 0.44426867048975016,
"grad_norm": 0.5124915242195129,
"learning_rate": 0.00015566697453489906,
"loss": 0.0454,
"step": 1685
},
{
"epoch": 0.44558697514995715,
"grad_norm": 0.3159286081790924,
"learning_rate": 0.00015553503100672913,
"loss": 0.047,
"step": 1690
},
{
"epoch": 0.44690527981016415,
"grad_norm": 0.032126396894454956,
"learning_rate": 0.00015540308747855918,
"loss": 0.0151,
"step": 1695
},
{
"epoch": 0.4482235844703711,
"grad_norm": 0.04663548618555069,
"learning_rate": 0.00015527114395038924,
"loss": 0.0375,
"step": 1700
},
{
"epoch": 0.4495418891305781,
"grad_norm": 0.013753900304436684,
"learning_rate": 0.0001551392004222193,
"loss": 0.0485,
"step": 1705
},
{
"epoch": 0.45086019379078507,
"grad_norm": 1.9952393770217896,
"learning_rate": 0.00015500725689404936,
"loss": 0.0625,
"step": 1710
},
{
"epoch": 0.452178498450992,
"grad_norm": 0.014283270575106144,
"learning_rate": 0.00015487531336587943,
"loss": 0.0037,
"step": 1715
},
{
"epoch": 0.453496803111199,
"grad_norm": 0.3897913098335266,
"learning_rate": 0.00015474336983770947,
"loss": 0.0304,
"step": 1720
},
{
"epoch": 0.454815107771406,
"grad_norm": 0.3730885684490204,
"learning_rate": 0.00015461142630953954,
"loss": 0.0115,
"step": 1725
},
{
"epoch": 0.45613341243161293,
"grad_norm": 0.035858724266290665,
"learning_rate": 0.00015447948278136958,
"loss": 0.0021,
"step": 1730
},
{
"epoch": 0.4574517170918199,
"grad_norm": 0.20589517056941986,
"learning_rate": 0.00015434753925319965,
"loss": 0.0132,
"step": 1735
},
{
"epoch": 0.4587700217520269,
"grad_norm": 0.004939342383295298,
"learning_rate": 0.00015421559572502972,
"loss": 0.0471,
"step": 1740
},
{
"epoch": 0.46008832641223385,
"grad_norm": 0.03493283689022064,
"learning_rate": 0.00015408365219685976,
"loss": 0.0062,
"step": 1745
},
{
"epoch": 0.46140663107244084,
"grad_norm": 0.045927103608846664,
"learning_rate": 0.0001539517086686898,
"loss": 0.0283,
"step": 1750
},
{
"epoch": 0.46272493573264784,
"grad_norm": 0.012629454955458641,
"learning_rate": 0.00015381976514051984,
"loss": 0.0133,
"step": 1755
},
{
"epoch": 0.46404324039285477,
"grad_norm": 0.8001697659492493,
"learning_rate": 0.0001536878216123499,
"loss": 0.0224,
"step": 1760
},
{
"epoch": 0.46536154505306176,
"grad_norm": 0.002036362886428833,
"learning_rate": 0.00015355587808417998,
"loss": 0.0066,
"step": 1765
},
{
"epoch": 0.46667984971326876,
"grad_norm": 1.0261330604553223,
"learning_rate": 0.00015342393455601002,
"loss": 0.191,
"step": 1770
},
{
"epoch": 0.4679981543734757,
"grad_norm": 0.3033429682254791,
"learning_rate": 0.0001532919910278401,
"loss": 0.0222,
"step": 1775
},
{
"epoch": 0.4693164590336827,
"grad_norm": 0.36911338567733765,
"learning_rate": 0.00015316004749967014,
"loss": 0.0363,
"step": 1780
},
{
"epoch": 0.4706347636938897,
"grad_norm": 0.0406811460852623,
"learning_rate": 0.0001530281039715002,
"loss": 0.0283,
"step": 1785
},
{
"epoch": 0.4719530683540966,
"grad_norm": 0.23334211111068726,
"learning_rate": 0.00015289616044333027,
"loss": 0.0274,
"step": 1790
},
{
"epoch": 0.4732713730143036,
"grad_norm": 0.013081169687211514,
"learning_rate": 0.00015276421691516032,
"loss": 0.0221,
"step": 1795
},
{
"epoch": 0.4745896776745106,
"grad_norm": 0.2480790615081787,
"learning_rate": 0.00015263227338699039,
"loss": 0.019,
"step": 1800
},
{
"epoch": 0.47590798233471754,
"grad_norm": 0.0373196005821228,
"learning_rate": 0.00015250032985882043,
"loss": 0.0292,
"step": 1805
},
{
"epoch": 0.47722628699492453,
"grad_norm": 0.004609994124621153,
"learning_rate": 0.0001523683863306505,
"loss": 0.0918,
"step": 1810
},
{
"epoch": 0.4785445916551315,
"grad_norm": 0.02370987832546234,
"learning_rate": 0.00015223644280248054,
"loss": 0.0462,
"step": 1815
},
{
"epoch": 0.47986289631533846,
"grad_norm": 0.05842221528291702,
"learning_rate": 0.0001521044992743106,
"loss": 0.0595,
"step": 1820
},
{
"epoch": 0.48118120097554545,
"grad_norm": 0.009685276076197624,
"learning_rate": 0.00015197255574614068,
"loss": 0.0074,
"step": 1825
},
{
"epoch": 0.48249950563575245,
"grad_norm": 0.8933250308036804,
"learning_rate": 0.00015184061221797072,
"loss": 0.0757,
"step": 1830
},
{
"epoch": 0.4838178102959594,
"grad_norm": 0.07075401395559311,
"learning_rate": 0.0001517086686898008,
"loss": 0.0226,
"step": 1835
},
{
"epoch": 0.4851361149561664,
"grad_norm": 0.732706606388092,
"learning_rate": 0.00015157672516163083,
"loss": 0.0161,
"step": 1840
},
{
"epoch": 0.48645441961637337,
"grad_norm": 1.1897023916244507,
"learning_rate": 0.0001514447816334609,
"loss": 0.0265,
"step": 1845
},
{
"epoch": 0.4877727242765803,
"grad_norm": 0.052572328597307205,
"learning_rate": 0.00015131283810529094,
"loss": 0.0094,
"step": 1850
},
{
"epoch": 0.4890910289367873,
"grad_norm": 0.08263898640871048,
"learning_rate": 0.00015118089457712098,
"loss": 0.0631,
"step": 1855
},
{
"epoch": 0.4904093335969943,
"grad_norm": 0.03225664421916008,
"learning_rate": 0.00015104895104895105,
"loss": 0.023,
"step": 1860
},
{
"epoch": 0.4917276382572012,
"grad_norm": 0.007935039699077606,
"learning_rate": 0.0001509170075207811,
"loss": 0.0039,
"step": 1865
},
{
"epoch": 0.4930459429174082,
"grad_norm": 0.00830796267837286,
"learning_rate": 0.00015078506399261116,
"loss": 0.007,
"step": 1870
},
{
"epoch": 0.4943642475776152,
"grad_norm": 0.08042234182357788,
"learning_rate": 0.00015065312046444123,
"loss": 0.0366,
"step": 1875
},
{
"epoch": 0.49568255223782215,
"grad_norm": 0.009092851541936398,
"learning_rate": 0.00015052117693627128,
"loss": 0.0107,
"step": 1880
},
{
"epoch": 0.49700085689802914,
"grad_norm": 0.2674141824245453,
"learning_rate": 0.00015038923340810135,
"loss": 0.0076,
"step": 1885
},
{
"epoch": 0.49831916155823613,
"grad_norm": 0.07694366574287415,
"learning_rate": 0.0001502572898799314,
"loss": 0.0252,
"step": 1890
},
{
"epoch": 0.49963746621844307,
"grad_norm": 0.5699467062950134,
"learning_rate": 0.00015012534635176146,
"loss": 0.0487,
"step": 1895
},
{
"epoch": 0.5009557708786501,
"grad_norm": 0.18800878524780273,
"learning_rate": 0.0001499934028235915,
"loss": 0.0183,
"step": 1900
},
{
"epoch": 0.5022740755388571,
"grad_norm": 0.019469989463686943,
"learning_rate": 0.00014986145929542157,
"loss": 0.0268,
"step": 1905
},
{
"epoch": 0.503592380199064,
"grad_norm": 0.01890506222844124,
"learning_rate": 0.00014972951576725164,
"loss": 0.0449,
"step": 1910
},
{
"epoch": 0.5049106848592709,
"grad_norm": 0.0006314461352303624,
"learning_rate": 0.00014959757223908168,
"loss": 0.0056,
"step": 1915
},
{
"epoch": 0.5062289895194779,
"grad_norm": 0.32654041051864624,
"learning_rate": 0.00014946562871091175,
"loss": 0.0256,
"step": 1920
},
{
"epoch": 0.5075472941796849,
"grad_norm": 0.7803483605384827,
"learning_rate": 0.0001493336851827418,
"loss": 0.0374,
"step": 1925
},
{
"epoch": 0.5088655988398919,
"grad_norm": 0.028441445901989937,
"learning_rate": 0.00014920174165457186,
"loss": 0.0161,
"step": 1930
},
{
"epoch": 0.5101839035000989,
"grad_norm": 0.028379200026392937,
"learning_rate": 0.00014906979812640193,
"loss": 0.0151,
"step": 1935
},
{
"epoch": 0.5115022081603059,
"grad_norm": 0.021159596741199493,
"learning_rate": 0.00014893785459823197,
"loss": 0.0303,
"step": 1940
},
{
"epoch": 0.5128205128205128,
"grad_norm": 0.24903325736522675,
"learning_rate": 0.000148805911070062,
"loss": 0.0076,
"step": 1945
},
{
"epoch": 0.5141388174807198,
"grad_norm": 0.007065301761031151,
"learning_rate": 0.00014867396754189206,
"loss": 0.022,
"step": 1950
},
{
"epoch": 0.5154571221409268,
"grad_norm": 0.004032329190522432,
"learning_rate": 0.00014854202401372212,
"loss": 0.0083,
"step": 1955
},
{
"epoch": 0.5167754268011338,
"grad_norm": 0.3045775592327118,
"learning_rate": 0.0001484100804855522,
"loss": 0.0113,
"step": 1960
},
{
"epoch": 0.5180937314613407,
"grad_norm": 0.36974939703941345,
"learning_rate": 0.00014827813695738224,
"loss": 0.0267,
"step": 1965
},
{
"epoch": 0.5194120361215477,
"grad_norm": 0.009729950688779354,
"learning_rate": 0.0001481461934292123,
"loss": 0.027,
"step": 1970
},
{
"epoch": 0.5207303407817546,
"grad_norm": 0.0013097926275804639,
"learning_rate": 0.00014801424990104235,
"loss": 0.003,
"step": 1975
},
{
"epoch": 0.5220486454419616,
"grad_norm": 0.0706263929605484,
"learning_rate": 0.00014788230637287242,
"loss": 0.0193,
"step": 1980
},
{
"epoch": 0.5233669501021686,
"grad_norm": 1.435702919960022,
"learning_rate": 0.00014775036284470249,
"loss": 0.0647,
"step": 1985
},
{
"epoch": 0.5246852547623756,
"grad_norm": 0.00661757867783308,
"learning_rate": 0.00014761841931653253,
"loss": 0.0373,
"step": 1990
},
{
"epoch": 0.5260035594225826,
"grad_norm": 0.12014541029930115,
"learning_rate": 0.0001474864757883626,
"loss": 0.0178,
"step": 1995
},
{
"epoch": 0.5273218640827896,
"grad_norm": 1.0549248456954956,
"learning_rate": 0.00014735453226019264,
"loss": 0.0191,
"step": 2000
},
{
"epoch": 0.5273218640827896,
"eval_loss": 0.037292081862688065,
"eval_runtime": 454.3033,
"eval_samples_per_second": 7.422,
"eval_steps_per_second": 3.711,
"step": 2000
},
{
"epoch": 0.5286401687429965,
"grad_norm": 0.47634151577949524,
"learning_rate": 0.0001472225887320227,
"loss": 0.0404,
"step": 2005
},
{
"epoch": 0.5299584734032035,
"grad_norm": 0.006752463988959789,
"learning_rate": 0.00014709064520385275,
"loss": 0.034,
"step": 2010
},
{
"epoch": 0.5312767780634104,
"grad_norm": 0.20780125260353088,
"learning_rate": 0.00014695870167568282,
"loss": 0.0421,
"step": 2015
},
{
"epoch": 0.5325950827236174,
"grad_norm": 0.010941066779196262,
"learning_rate": 0.0001468267581475129,
"loss": 0.0086,
"step": 2020
},
{
"epoch": 0.5339133873838244,
"grad_norm": 0.3439581096172333,
"learning_rate": 0.00014669481461934293,
"loss": 0.0187,
"step": 2025
},
{
"epoch": 0.5352316920440314,
"grad_norm": 0.14961636066436768,
"learning_rate": 0.000146562871091173,
"loss": 0.0504,
"step": 2030
},
{
"epoch": 0.5365499967042383,
"grad_norm": 0.0044641937129199505,
"learning_rate": 0.00014643092756300304,
"loss": 0.0134,
"step": 2035
},
{
"epoch": 0.5378683013644453,
"grad_norm": 0.14088386297225952,
"learning_rate": 0.0001462989840348331,
"loss": 0.0096,
"step": 2040
},
{
"epoch": 0.5391866060246523,
"grad_norm": 0.48116979002952576,
"learning_rate": 0.00014616704050666315,
"loss": 0.0124,
"step": 2045
},
{
"epoch": 0.5405049106848593,
"grad_norm": 0.3688766360282898,
"learning_rate": 0.0001460350969784932,
"loss": 0.0226,
"step": 2050
},
{
"epoch": 0.5418232153450663,
"grad_norm": 0.002938181860372424,
"learning_rate": 0.00014590315345032326,
"loss": 0.0267,
"step": 2055
},
{
"epoch": 0.5431415200052733,
"grad_norm": 0.3335214853286743,
"learning_rate": 0.0001457712099221533,
"loss": 0.0367,
"step": 2060
},
{
"epoch": 0.5444598246654802,
"grad_norm": 0.004644686821848154,
"learning_rate": 0.00014563926639398338,
"loss": 0.0121,
"step": 2065
},
{
"epoch": 0.5457781293256871,
"grad_norm": 0.19505545496940613,
"learning_rate": 0.00014550732286581345,
"loss": 0.0591,
"step": 2070
},
{
"epoch": 0.5470964339858941,
"grad_norm": 0.018028756603598595,
"learning_rate": 0.0001453753793376435,
"loss": 0.0131,
"step": 2075
},
{
"epoch": 0.5484147386461011,
"grad_norm": 0.045639291405677795,
"learning_rate": 0.00014524343580947356,
"loss": 0.0443,
"step": 2080
},
{
"epoch": 0.5497330433063081,
"grad_norm": 0.727981686592102,
"learning_rate": 0.0001451114922813036,
"loss": 0.0205,
"step": 2085
},
{
"epoch": 0.5510513479665151,
"grad_norm": 0.03766491636633873,
"learning_rate": 0.00014497954875313367,
"loss": 0.0067,
"step": 2090
},
{
"epoch": 0.552369652626722,
"grad_norm": 0.1911504715681076,
"learning_rate": 0.0001448476052249637,
"loss": 0.0397,
"step": 2095
},
{
"epoch": 0.553687957286929,
"grad_norm": 0.08238353580236435,
"learning_rate": 0.00014471566169679378,
"loss": 0.0513,
"step": 2100
},
{
"epoch": 0.555006261947136,
"grad_norm": 0.06317206472158432,
"learning_rate": 0.00014458371816862385,
"loss": 0.0178,
"step": 2105
},
{
"epoch": 0.556324566607343,
"grad_norm": 0.0652734637260437,
"learning_rate": 0.0001444517746404539,
"loss": 0.0184,
"step": 2110
},
{
"epoch": 0.55764287126755,
"grad_norm": 0.05471858009696007,
"learning_rate": 0.00014431983111228396,
"loss": 0.0089,
"step": 2115
},
{
"epoch": 0.558961175927757,
"grad_norm": 0.005062670446932316,
"learning_rate": 0.000144187887584114,
"loss": 0.0052,
"step": 2120
},
{
"epoch": 0.5602794805879638,
"grad_norm": 0.06337414681911469,
"learning_rate": 0.00014405594405594407,
"loss": 0.053,
"step": 2125
},
{
"epoch": 0.5615977852481708,
"grad_norm": 0.33745357394218445,
"learning_rate": 0.00014392400052777414,
"loss": 0.0166,
"step": 2130
},
{
"epoch": 0.5629160899083778,
"grad_norm": 0.7382741570472717,
"learning_rate": 0.00014379205699960418,
"loss": 0.0191,
"step": 2135
},
{
"epoch": 0.5642343945685848,
"grad_norm": 0.007551972754299641,
"learning_rate": 0.00014366011347143425,
"loss": 0.0022,
"step": 2140
},
{
"epoch": 0.5655526992287918,
"grad_norm": 0.6260896921157837,
"learning_rate": 0.00014352816994326427,
"loss": 0.0095,
"step": 2145
},
{
"epoch": 0.5668710038889987,
"grad_norm": 0.11619322001934052,
"learning_rate": 0.00014339622641509434,
"loss": 0.015,
"step": 2150
},
{
"epoch": 0.5681893085492057,
"grad_norm": 1.1440670490264893,
"learning_rate": 0.0001432642828869244,
"loss": 0.1343,
"step": 2155
},
{
"epoch": 0.5695076132094127,
"grad_norm": 1.1793878078460693,
"learning_rate": 0.00014313233935875445,
"loss": 0.0968,
"step": 2160
},
{
"epoch": 0.5708259178696197,
"grad_norm": 0.6865736842155457,
"learning_rate": 0.00014300039583058452,
"loss": 0.0195,
"step": 2165
},
{
"epoch": 0.5721442225298267,
"grad_norm": 0.140816792845726,
"learning_rate": 0.00014286845230241456,
"loss": 0.0761,
"step": 2170
},
{
"epoch": 0.5734625271900337,
"grad_norm": 0.04071786254644394,
"learning_rate": 0.00014273650877424463,
"loss": 0.0193,
"step": 2175
},
{
"epoch": 0.5747808318502405,
"grad_norm": 0.044617727398872375,
"learning_rate": 0.0001426045652460747,
"loss": 0.0112,
"step": 2180
},
{
"epoch": 0.5760991365104475,
"grad_norm": 0.11001799255609512,
"learning_rate": 0.00014247262171790474,
"loss": 0.0039,
"step": 2185
},
{
"epoch": 0.5774174411706545,
"grad_norm": 0.0036315324250608683,
"learning_rate": 0.0001423406781897348,
"loss": 0.0038,
"step": 2190
},
{
"epoch": 0.5787357458308615,
"grad_norm": 0.9866570830345154,
"learning_rate": 0.00014220873466156485,
"loss": 0.025,
"step": 2195
},
{
"epoch": 0.5800540504910685,
"grad_norm": 0.023570384830236435,
"learning_rate": 0.00014207679113339492,
"loss": 0.0468,
"step": 2200
},
{
"epoch": 0.5813723551512755,
"grad_norm": 0.20010559260845184,
"learning_rate": 0.00014194484760522496,
"loss": 0.0198,
"step": 2205
},
{
"epoch": 0.5826906598114824,
"grad_norm": 0.06153270602226257,
"learning_rate": 0.00014181290407705503,
"loss": 0.0764,
"step": 2210
},
{
"epoch": 0.5840089644716894,
"grad_norm": 0.033162448555231094,
"learning_rate": 0.0001416809605488851,
"loss": 0.028,
"step": 2215
},
{
"epoch": 0.5853272691318964,
"grad_norm": 0.428382933139801,
"learning_rate": 0.00014154901702071514,
"loss": 0.0652,
"step": 2220
},
{
"epoch": 0.5866455737921034,
"grad_norm": 0.25004762411117554,
"learning_rate": 0.0001414170734925452,
"loss": 0.0411,
"step": 2225
},
{
"epoch": 0.5879638784523104,
"grad_norm": 0.22649863362312317,
"learning_rate": 0.00014128512996437525,
"loss": 0.0517,
"step": 2230
},
{
"epoch": 0.5892821831125173,
"grad_norm": 0.035932112485170364,
"learning_rate": 0.00014115318643620532,
"loss": 0.015,
"step": 2235
},
{
"epoch": 0.5906004877727242,
"grad_norm": 0.3800172507762909,
"learning_rate": 0.00014102124290803536,
"loss": 0.0324,
"step": 2240
},
{
"epoch": 0.5919187924329312,
"grad_norm": 0.6974118947982788,
"learning_rate": 0.0001408892993798654,
"loss": 0.0216,
"step": 2245
},
{
"epoch": 0.5932370970931382,
"grad_norm": 0.15472032129764557,
"learning_rate": 0.00014075735585169548,
"loss": 0.0164,
"step": 2250
},
{
"epoch": 0.5945554017533452,
"grad_norm": 0.015000814571976662,
"learning_rate": 0.00014062541232352552,
"loss": 0.0395,
"step": 2255
},
{
"epoch": 0.5958737064135522,
"grad_norm": 0.052086081355810165,
"learning_rate": 0.0001404934687953556,
"loss": 0.0032,
"step": 2260
},
{
"epoch": 0.5971920110737592,
"grad_norm": 0.004600350745022297,
"learning_rate": 0.00014036152526718566,
"loss": 0.0056,
"step": 2265
},
{
"epoch": 0.5985103157339661,
"grad_norm": 0.4940958321094513,
"learning_rate": 0.0001402295817390157,
"loss": 0.0206,
"step": 2270
},
{
"epoch": 0.5998286203941731,
"grad_norm": 0.09658394008874893,
"learning_rate": 0.00014009763821084577,
"loss": 0.0052,
"step": 2275
},
{
"epoch": 0.60114692505438,
"grad_norm": 0.00020539117394946516,
"learning_rate": 0.0001399656946826758,
"loss": 0.087,
"step": 2280
},
{
"epoch": 0.602465229714587,
"grad_norm": 0.1871018409729004,
"learning_rate": 0.00013983375115450588,
"loss": 0.0812,
"step": 2285
},
{
"epoch": 0.603783534374794,
"grad_norm": 0.02583954855799675,
"learning_rate": 0.00013970180762633592,
"loss": 0.0232,
"step": 2290
},
{
"epoch": 0.605101839035001,
"grad_norm": 1.2103784084320068,
"learning_rate": 0.000139569864098166,
"loss": 0.0151,
"step": 2295
},
{
"epoch": 0.6064201436952079,
"grad_norm": 0.023514943197369576,
"learning_rate": 0.00013943792056999606,
"loss": 0.0193,
"step": 2300
},
{
"epoch": 0.6077384483554149,
"grad_norm": 0.0076395305804908276,
"learning_rate": 0.0001393059770418261,
"loss": 0.0379,
"step": 2305
},
{
"epoch": 0.6090567530156219,
"grad_norm": 0.12412039190530777,
"learning_rate": 0.00013917403351365617,
"loss": 0.0095,
"step": 2310
},
{
"epoch": 0.6103750576758289,
"grad_norm": 0.021904783323407173,
"learning_rate": 0.0001390420899854862,
"loss": 0.0166,
"step": 2315
},
{
"epoch": 0.6116933623360359,
"grad_norm": 0.004012851510196924,
"learning_rate": 0.00013891014645731628,
"loss": 0.0103,
"step": 2320
},
{
"epoch": 0.6130116669962429,
"grad_norm": 0.007267913781106472,
"learning_rate": 0.00013877820292914635,
"loss": 0.0708,
"step": 2325
},
{
"epoch": 0.6143299716564498,
"grad_norm": 0.10363642126321793,
"learning_rate": 0.0001386462594009764,
"loss": 0.0473,
"step": 2330
},
{
"epoch": 0.6156482763166568,
"grad_norm": 0.04899830371141434,
"learning_rate": 0.00013851431587280646,
"loss": 0.0283,
"step": 2335
},
{
"epoch": 0.6169665809768637,
"grad_norm": 0.39460498094558716,
"learning_rate": 0.0001383823723446365,
"loss": 0.0597,
"step": 2340
},
{
"epoch": 0.6182848856370707,
"grad_norm": 0.04092290997505188,
"learning_rate": 0.00013825042881646655,
"loss": 0.0167,
"step": 2345
},
{
"epoch": 0.6196031902972777,
"grad_norm": 0.2781132161617279,
"learning_rate": 0.00013811848528829662,
"loss": 0.0097,
"step": 2350
},
{
"epoch": 0.6209214949574847,
"grad_norm": 0.041443537920713425,
"learning_rate": 0.00013798654176012666,
"loss": 0.0226,
"step": 2355
},
{
"epoch": 0.6222397996176916,
"grad_norm": 0.1242462694644928,
"learning_rate": 0.00013785459823195673,
"loss": 0.0055,
"step": 2360
},
{
"epoch": 0.6235581042778986,
"grad_norm": 0.4440467357635498,
"learning_rate": 0.00013772265470378677,
"loss": 0.049,
"step": 2365
},
{
"epoch": 0.6248764089381056,
"grad_norm": 0.014354427345097065,
"learning_rate": 0.00013759071117561684,
"loss": 0.0327,
"step": 2370
},
{
"epoch": 0.6261947135983126,
"grad_norm": 0.011539973318576813,
"learning_rate": 0.0001374587676474469,
"loss": 0.0222,
"step": 2375
},
{
"epoch": 0.6275130182585196,
"grad_norm": 0.23539051413536072,
"learning_rate": 0.00013732682411927695,
"loss": 0.0816,
"step": 2380
},
{
"epoch": 0.6288313229187266,
"grad_norm": 0.26793941855430603,
"learning_rate": 0.00013719488059110702,
"loss": 0.0325,
"step": 2385
},
{
"epoch": 0.6301496275789334,
"grad_norm": 0.01662217453122139,
"learning_rate": 0.00013706293706293706,
"loss": 0.0221,
"step": 2390
},
{
"epoch": 0.6314679322391404,
"grad_norm": 0.30669671297073364,
"learning_rate": 0.00013693099353476713,
"loss": 0.026,
"step": 2395
},
{
"epoch": 0.6327862368993474,
"grad_norm": 0.03350894898176193,
"learning_rate": 0.00013679905000659717,
"loss": 0.0072,
"step": 2400
},
{
"epoch": 0.6341045415595544,
"grad_norm": 0.014983875676989555,
"learning_rate": 0.00013666710647842724,
"loss": 0.049,
"step": 2405
},
{
"epoch": 0.6354228462197614,
"grad_norm": 1.8989384174346924,
"learning_rate": 0.0001365351629502573,
"loss": 0.0335,
"step": 2410
},
{
"epoch": 0.6367411508799684,
"grad_norm": 0.030135562643408775,
"learning_rate": 0.00013640321942208735,
"loss": 0.0051,
"step": 2415
},
{
"epoch": 0.6380594555401753,
"grad_norm": 0.02079075388610363,
"learning_rate": 0.00013627127589391742,
"loss": 0.0138,
"step": 2420
},
{
"epoch": 0.6393777602003823,
"grad_norm": 0.06065403297543526,
"learning_rate": 0.00013613933236574746,
"loss": 0.0357,
"step": 2425
},
{
"epoch": 0.6406960648605893,
"grad_norm": 0.2980937659740448,
"learning_rate": 0.00013600738883757753,
"loss": 0.0138,
"step": 2430
},
{
"epoch": 0.6420143695207963,
"grad_norm": 0.4820438623428345,
"learning_rate": 0.00013587544530940758,
"loss": 0.01,
"step": 2435
},
{
"epoch": 0.6433326741810033,
"grad_norm": 0.005618259310722351,
"learning_rate": 0.00013574350178123765,
"loss": 0.0052,
"step": 2440
},
{
"epoch": 0.6446509788412103,
"grad_norm": 0.7173821926116943,
"learning_rate": 0.0001356115582530677,
"loss": 0.0133,
"step": 2445
},
{
"epoch": 0.6459692835014171,
"grad_norm": 0.0053142281249165535,
"learning_rate": 0.00013547961472489773,
"loss": 0.0045,
"step": 2450
},
{
"epoch": 0.6472875881616241,
"grad_norm": 0.06118829548358917,
"learning_rate": 0.0001353476711967278,
"loss": 0.056,
"step": 2455
},
{
"epoch": 0.6486058928218311,
"grad_norm": 3.5878078937530518,
"learning_rate": 0.00013521572766855787,
"loss": 0.0232,
"step": 2460
},
{
"epoch": 0.6499241974820381,
"grad_norm": 0.004911276511847973,
"learning_rate": 0.0001350837841403879,
"loss": 0.0074,
"step": 2465
},
{
"epoch": 0.6512425021422451,
"grad_norm": 0.0028026222717016935,
"learning_rate": 0.00013495184061221798,
"loss": 0.0782,
"step": 2470
},
{
"epoch": 0.6525608068024521,
"grad_norm": 0.7317615747451782,
"learning_rate": 0.00013481989708404802,
"loss": 0.0222,
"step": 2475
},
{
"epoch": 0.653879111462659,
"grad_norm": 0.01835751160979271,
"learning_rate": 0.0001346879535558781,
"loss": 0.0661,
"step": 2480
},
{
"epoch": 0.655197416122866,
"grad_norm": 0.03598962351679802,
"learning_rate": 0.00013455601002770813,
"loss": 0.0395,
"step": 2485
},
{
"epoch": 0.656515720783073,
"grad_norm": 0.013886351138353348,
"learning_rate": 0.0001344240664995382,
"loss": 0.0156,
"step": 2490
},
{
"epoch": 0.65783402544328,
"grad_norm": 5.741530895233154,
"learning_rate": 0.00013429212297136827,
"loss": 0.0317,
"step": 2495
},
{
"epoch": 0.659152330103487,
"grad_norm": 0.20793496072292328,
"learning_rate": 0.0001341601794431983,
"loss": 0.0072,
"step": 2500
},
{
"epoch": 0.659152330103487,
"eval_loss": 0.0300898440182209,
"eval_runtime": 453.0554,
"eval_samples_per_second": 7.443,
"eval_steps_per_second": 3.721,
"step": 2500
},
{
"epoch": 0.6604706347636939,
"grad_norm": 0.03460961952805519,
"learning_rate": 0.00013402823591502838,
"loss": 0.0097,
"step": 2505
},
{
"epoch": 0.6617889394239008,
"grad_norm": 0.31785696744918823,
"learning_rate": 0.00013389629238685842,
"loss": 0.0303,
"step": 2510
},
{
"epoch": 0.6631072440841078,
"grad_norm": 0.4273851215839386,
"learning_rate": 0.0001337643488586885,
"loss": 0.0499,
"step": 2515
},
{
"epoch": 0.6644255487443148,
"grad_norm": 0.02236153744161129,
"learning_rate": 0.00013363240533051856,
"loss": 0.0069,
"step": 2520
},
{
"epoch": 0.6657438534045218,
"grad_norm": 0.1592864990234375,
"learning_rate": 0.0001335004618023486,
"loss": 0.0326,
"step": 2525
},
{
"epoch": 0.6670621580647288,
"grad_norm": 0.029961545020341873,
"learning_rate": 0.00013336851827417867,
"loss": 0.0178,
"step": 2530
},
{
"epoch": 0.6683804627249358,
"grad_norm": 0.03120764158666134,
"learning_rate": 0.00013323657474600872,
"loss": 0.115,
"step": 2535
},
{
"epoch": 0.6696987673851427,
"grad_norm": 0.01060028001666069,
"learning_rate": 0.00013310463121783879,
"loss": 0.0036,
"step": 2540
},
{
"epoch": 0.6710170720453497,
"grad_norm": 0.053470809012651443,
"learning_rate": 0.00013297268768966883,
"loss": 0.0079,
"step": 2545
},
{
"epoch": 0.6723353767055567,
"grad_norm": 0.022777097299695015,
"learning_rate": 0.00013284074416149887,
"loss": 0.0078,
"step": 2550
},
{
"epoch": 0.6736536813657636,
"grad_norm": 0.0548521913588047,
"learning_rate": 0.00013270880063332894,
"loss": 0.0503,
"step": 2555
},
{
"epoch": 0.6749719860259706,
"grad_norm": 0.02028457075357437,
"learning_rate": 0.00013257685710515898,
"loss": 0.0096,
"step": 2560
},
{
"epoch": 0.6762902906861776,
"grad_norm": 0.01569107361137867,
"learning_rate": 0.00013244491357698905,
"loss": 0.008,
"step": 2565
},
{
"epoch": 0.6776085953463845,
"grad_norm": 0.00743742985650897,
"learning_rate": 0.00013231297004881912,
"loss": 0.005,
"step": 2570
},
{
"epoch": 0.6789269000065915,
"grad_norm": 0.025164416059851646,
"learning_rate": 0.00013218102652064916,
"loss": 0.018,
"step": 2575
},
{
"epoch": 0.6802452046667985,
"grad_norm": 0.3653188645839691,
"learning_rate": 0.00013204908299247923,
"loss": 0.0295,
"step": 2580
},
{
"epoch": 0.6815635093270055,
"grad_norm": 0.685422956943512,
"learning_rate": 0.00013191713946430927,
"loss": 0.0335,
"step": 2585
},
{
"epoch": 0.6828818139872125,
"grad_norm": 0.675740122795105,
"learning_rate": 0.00013178519593613934,
"loss": 0.0592,
"step": 2590
},
{
"epoch": 0.6842001186474194,
"grad_norm": 0.10513252764940262,
"learning_rate": 0.00013165325240796938,
"loss": 0.0353,
"step": 2595
},
{
"epoch": 0.6855184233076264,
"grad_norm": 0.43512973189353943,
"learning_rate": 0.00013152130887979945,
"loss": 0.0142,
"step": 2600
},
{
"epoch": 0.6868367279678333,
"grad_norm": 0.029436839744448662,
"learning_rate": 0.00013138936535162952,
"loss": 0.0042,
"step": 2605
},
{
"epoch": 0.6881550326280403,
"grad_norm": 0.5607122778892517,
"learning_rate": 0.00013125742182345957,
"loss": 0.0184,
"step": 2610
},
{
"epoch": 0.6894733372882473,
"grad_norm": 0.11365406215190887,
"learning_rate": 0.00013112547829528963,
"loss": 0.006,
"step": 2615
},
{
"epoch": 0.6907916419484543,
"grad_norm": 0.047227244824171066,
"learning_rate": 0.00013099353476711968,
"loss": 0.008,
"step": 2620
},
{
"epoch": 0.6921099466086612,
"grad_norm": 0.0005877618095837533,
"learning_rate": 0.00013086159123894975,
"loss": 0.0286,
"step": 2625
},
{
"epoch": 0.6934282512688682,
"grad_norm": 0.010759112425148487,
"learning_rate": 0.0001307296477107798,
"loss": 0.0062,
"step": 2630
},
{
"epoch": 0.6947465559290752,
"grad_norm": 0.07117745280265808,
"learning_rate": 0.00013059770418260986,
"loss": 0.0891,
"step": 2635
},
{
"epoch": 0.6960648605892822,
"grad_norm": 0.0639057606458664,
"learning_rate": 0.00013046576065443993,
"loss": 0.0072,
"step": 2640
},
{
"epoch": 0.6973831652494892,
"grad_norm": 0.027350090444087982,
"learning_rate": 0.00013033381712626994,
"loss": 0.0103,
"step": 2645
},
{
"epoch": 0.6987014699096962,
"grad_norm": 0.015336195938289165,
"learning_rate": 0.0001302018735981,
"loss": 0.0041,
"step": 2650
},
{
"epoch": 0.700019774569903,
"grad_norm": 1.0650830268859863,
"learning_rate": 0.00013006993006993008,
"loss": 0.0443,
"step": 2655
},
{
"epoch": 0.70133807923011,
"grad_norm": 0.019073212519288063,
"learning_rate": 0.00012993798654176012,
"loss": 0.0331,
"step": 2660
},
{
"epoch": 0.702656383890317,
"grad_norm": 0.10109209269285202,
"learning_rate": 0.0001298060430135902,
"loss": 0.0054,
"step": 2665
},
{
"epoch": 0.703974688550524,
"grad_norm": 0.03528957813978195,
"learning_rate": 0.00012967409948542023,
"loss": 0.0427,
"step": 2670
},
{
"epoch": 0.705292993210731,
"grad_norm": 0.03577788919210434,
"learning_rate": 0.0001295421559572503,
"loss": 0.023,
"step": 2675
},
{
"epoch": 0.706611297870938,
"grad_norm": 0.5576180815696716,
"learning_rate": 0.00012941021242908034,
"loss": 0.0416,
"step": 2680
},
{
"epoch": 0.7079296025311449,
"grad_norm": 0.017131298780441284,
"learning_rate": 0.0001292782689009104,
"loss": 0.0235,
"step": 2685
},
{
"epoch": 0.7092479071913519,
"grad_norm": 0.8517888784408569,
"learning_rate": 0.00012914632537274048,
"loss": 0.0168,
"step": 2690
},
{
"epoch": 0.7105662118515589,
"grad_norm": 0.23812156915664673,
"learning_rate": 0.00012901438184457052,
"loss": 0.0483,
"step": 2695
},
{
"epoch": 0.7118845165117659,
"grad_norm": 0.11746613681316376,
"learning_rate": 0.0001288824383164006,
"loss": 0.0255,
"step": 2700
},
{
"epoch": 0.7132028211719729,
"grad_norm": 0.20089928805828094,
"learning_rate": 0.00012875049478823064,
"loss": 0.0267,
"step": 2705
},
{
"epoch": 0.7145211258321799,
"grad_norm": 0.8301129937171936,
"learning_rate": 0.0001286185512600607,
"loss": 0.016,
"step": 2710
},
{
"epoch": 0.7158394304923867,
"grad_norm": 0.01838674768805504,
"learning_rate": 0.00012848660773189077,
"loss": 0.0229,
"step": 2715
},
{
"epoch": 0.7171577351525937,
"grad_norm": 0.03670337051153183,
"learning_rate": 0.00012835466420372082,
"loss": 0.038,
"step": 2720
},
{
"epoch": 0.7184760398128007,
"grad_norm": 0.0452633760869503,
"learning_rate": 0.00012822272067555089,
"loss": 0.0622,
"step": 2725
},
{
"epoch": 0.7197943444730077,
"grad_norm": 0.09503110498189926,
"learning_rate": 0.00012809077714738093,
"loss": 0.0209,
"step": 2730
},
{
"epoch": 0.7211126491332147,
"grad_norm": 1.0327308177947998,
"learning_rate": 0.000127958833619211,
"loss": 0.0361,
"step": 2735
},
{
"epoch": 0.7224309537934217,
"grad_norm": 1.0049290657043457,
"learning_rate": 0.00012782689009104104,
"loss": 0.0365,
"step": 2740
},
{
"epoch": 0.7237492584536286,
"grad_norm": 0.029774073511362076,
"learning_rate": 0.00012769494656287108,
"loss": 0.0257,
"step": 2745
},
{
"epoch": 0.7250675631138356,
"grad_norm": 0.20974040031433105,
"learning_rate": 0.00012756300303470115,
"loss": 0.0542,
"step": 2750
},
{
"epoch": 0.7263858677740426,
"grad_norm": 0.8153854608535767,
"learning_rate": 0.0001274310595065312,
"loss": 0.0216,
"step": 2755
},
{
"epoch": 0.7277041724342496,
"grad_norm": 0.4393698573112488,
"learning_rate": 0.00012729911597836126,
"loss": 0.0451,
"step": 2760
},
{
"epoch": 0.7290224770944566,
"grad_norm": 0.06990349292755127,
"learning_rate": 0.00012716717245019133,
"loss": 0.03,
"step": 2765
},
{
"epoch": 0.7303407817546635,
"grad_norm": 0.32689470052719116,
"learning_rate": 0.00012703522892202137,
"loss": 0.0263,
"step": 2770
},
{
"epoch": 0.7316590864148704,
"grad_norm": 0.026600876823067665,
"learning_rate": 0.00012690328539385144,
"loss": 0.0404,
"step": 2775
},
{
"epoch": 0.7329773910750774,
"grad_norm": 0.11228257417678833,
"learning_rate": 0.00012677134186568148,
"loss": 0.0224,
"step": 2780
},
{
"epoch": 0.7342956957352844,
"grad_norm": 0.6469443440437317,
"learning_rate": 0.00012663939833751155,
"loss": 0.0178,
"step": 2785
},
{
"epoch": 0.7356140003954914,
"grad_norm": 0.020773250609636307,
"learning_rate": 0.0001265074548093416,
"loss": 0.011,
"step": 2790
},
{
"epoch": 0.7369323050556984,
"grad_norm": 0.7378728985786438,
"learning_rate": 0.00012637551128117167,
"loss": 0.0227,
"step": 2795
},
{
"epoch": 0.7382506097159054,
"grad_norm": 0.008189595304429531,
"learning_rate": 0.00012624356775300173,
"loss": 0.0892,
"step": 2800
},
{
"epoch": 0.7395689143761123,
"grad_norm": 0.031633853912353516,
"learning_rate": 0.00012611162422483178,
"loss": 0.0093,
"step": 2805
},
{
"epoch": 0.7408872190363193,
"grad_norm": 0.5078475475311279,
"learning_rate": 0.00012597968069666185,
"loss": 0.0567,
"step": 2810
},
{
"epoch": 0.7422055236965263,
"grad_norm": 0.21766887605190277,
"learning_rate": 0.0001258477371684919,
"loss": 0.0485,
"step": 2815
},
{
"epoch": 0.7435238283567333,
"grad_norm": 0.3029612898826599,
"learning_rate": 0.00012571579364032196,
"loss": 0.032,
"step": 2820
},
{
"epoch": 0.7448421330169402,
"grad_norm": 1.2135159969329834,
"learning_rate": 0.00012558385011215203,
"loss": 0.0139,
"step": 2825
},
{
"epoch": 0.7461604376771472,
"grad_norm": 0.016875172033905983,
"learning_rate": 0.00012545190658398207,
"loss": 0.0323,
"step": 2830
},
{
"epoch": 0.7474787423373541,
"grad_norm": 0.08923230320215225,
"learning_rate": 0.00012531996305581214,
"loss": 0.0343,
"step": 2835
},
{
"epoch": 0.7487970469975611,
"grad_norm": 0.2958766520023346,
"learning_rate": 0.00012518801952764215,
"loss": 0.0431,
"step": 2840
},
{
"epoch": 0.7501153516577681,
"grad_norm": 0.7344386577606201,
"learning_rate": 0.00012505607599947222,
"loss": 0.0389,
"step": 2845
},
{
"epoch": 0.7514336563179751,
"grad_norm": 0.03681635856628418,
"learning_rate": 0.0001249241324713023,
"loss": 0.0258,
"step": 2850
},
{
"epoch": 0.7527519609781821,
"grad_norm": 0.22866861522197723,
"learning_rate": 0.00012479218894313233,
"loss": 0.0223,
"step": 2855
},
{
"epoch": 0.7540702656383891,
"grad_norm": 0.029770435765385628,
"learning_rate": 0.0001246602454149624,
"loss": 0.0205,
"step": 2860
},
{
"epoch": 0.755388570298596,
"grad_norm": 0.011845707893371582,
"learning_rate": 0.00012452830188679244,
"loss": 0.0252,
"step": 2865
},
{
"epoch": 0.756706874958803,
"grad_norm": 0.06696149706840515,
"learning_rate": 0.00012439635835862251,
"loss": 0.0166,
"step": 2870
},
{
"epoch": 0.75802517961901,
"grad_norm": 0.01653144136071205,
"learning_rate": 0.00012426441483045256,
"loss": 0.0487,
"step": 2875
},
{
"epoch": 0.7593434842792169,
"grad_norm": 0.031312476843595505,
"learning_rate": 0.00012413247130228263,
"loss": 0.0155,
"step": 2880
},
{
"epoch": 0.7606617889394239,
"grad_norm": 0.011625733226537704,
"learning_rate": 0.0001240005277741127,
"loss": 0.0333,
"step": 2885
},
{
"epoch": 0.7619800935996309,
"grad_norm": 0.012089414522051811,
"learning_rate": 0.00012386858424594274,
"loss": 0.003,
"step": 2890
},
{
"epoch": 0.7632983982598378,
"grad_norm": 0.3012307584285736,
"learning_rate": 0.0001237366407177728,
"loss": 0.0172,
"step": 2895
},
{
"epoch": 0.7646167029200448,
"grad_norm": 0.31575000286102295,
"learning_rate": 0.00012360469718960285,
"loss": 0.0409,
"step": 2900
},
{
"epoch": 0.7659350075802518,
"grad_norm": 0.009794364683330059,
"learning_rate": 0.00012347275366143292,
"loss": 0.0214,
"step": 2905
},
{
"epoch": 0.7672533122404588,
"grad_norm": 0.5973085165023804,
"learning_rate": 0.00012334081013326299,
"loss": 0.0245,
"step": 2910
},
{
"epoch": 0.7685716169006658,
"grad_norm": 0.019750040024518967,
"learning_rate": 0.00012320886660509303,
"loss": 0.0063,
"step": 2915
},
{
"epoch": 0.7698899215608728,
"grad_norm": 0.06402858346700668,
"learning_rate": 0.0001230769230769231,
"loss": 0.0444,
"step": 2920
},
{
"epoch": 0.7712082262210797,
"grad_norm": 0.02876671403646469,
"learning_rate": 0.00012294497954875314,
"loss": 0.0103,
"step": 2925
},
{
"epoch": 0.7725265308812866,
"grad_norm": 0.6962207555770874,
"learning_rate": 0.0001228130360205832,
"loss": 0.0318,
"step": 2930
},
{
"epoch": 0.7738448355414936,
"grad_norm": 0.006536522414535284,
"learning_rate": 0.00012268109249241325,
"loss": 0.0096,
"step": 2935
},
{
"epoch": 0.7751631402017006,
"grad_norm": 0.07097168266773224,
"learning_rate": 0.0001225491489642433,
"loss": 0.0174,
"step": 2940
},
{
"epoch": 0.7764814448619076,
"grad_norm": 0.042360126972198486,
"learning_rate": 0.00012241720543607336,
"loss": 0.0158,
"step": 2945
},
{
"epoch": 0.7777997495221146,
"grad_norm": 0.01159572321921587,
"learning_rate": 0.0001222852619079034,
"loss": 0.0265,
"step": 2950
},
{
"epoch": 0.7791180541823215,
"grad_norm": 0.38408163189888,
"learning_rate": 0.00012215331837973347,
"loss": 0.0233,
"step": 2955
},
{
"epoch": 0.7804363588425285,
"grad_norm": 0.15588605403900146,
"learning_rate": 0.00012202137485156353,
"loss": 0.0041,
"step": 2960
},
{
"epoch": 0.7817546635027355,
"grad_norm": 0.006892362609505653,
"learning_rate": 0.00012188943132339358,
"loss": 0.0026,
"step": 2965
},
{
"epoch": 0.7830729681629425,
"grad_norm": 0.030915727838873863,
"learning_rate": 0.00012175748779522364,
"loss": 0.0028,
"step": 2970
},
{
"epoch": 0.7843912728231495,
"grad_norm": 0.8151025772094727,
"learning_rate": 0.00012162554426705371,
"loss": 0.0429,
"step": 2975
},
{
"epoch": 0.7857095774833565,
"grad_norm": 0.6765475273132324,
"learning_rate": 0.00012149360073888377,
"loss": 0.0319,
"step": 2980
},
{
"epoch": 0.7870278821435633,
"grad_norm": 0.054469238966703415,
"learning_rate": 0.00012136165721071382,
"loss": 0.0413,
"step": 2985
},
{
"epoch": 0.7883461868037703,
"grad_norm": 0.045610666275024414,
"learning_rate": 0.00012122971368254388,
"loss": 0.0521,
"step": 2990
},
{
"epoch": 0.7896644914639773,
"grad_norm": 0.4222470223903656,
"learning_rate": 0.00012109777015437393,
"loss": 0.0846,
"step": 2995
},
{
"epoch": 0.7909827961241843,
"grad_norm": 0.0272397268563509,
"learning_rate": 0.00012096582662620399,
"loss": 0.0364,
"step": 3000
},
{
"epoch": 0.7909827961241843,
"eval_loss": 0.033312585204839706,
"eval_runtime": 452.2552,
"eval_samples_per_second": 7.456,
"eval_steps_per_second": 3.728,
"step": 3000
},
{
"epoch": 0.7923011007843913,
"grad_norm": 0.08674059063196182,
"learning_rate": 0.00012083388309803406,
"loss": 0.0081,
"step": 3005
},
{
"epoch": 0.7936194054445982,
"grad_norm": 0.21960832178592682,
"learning_rate": 0.00012070193956986411,
"loss": 0.0468,
"step": 3010
},
{
"epoch": 0.7949377101048052,
"grad_norm": 0.11259289085865021,
"learning_rate": 0.00012056999604169417,
"loss": 0.0124,
"step": 3015
},
{
"epoch": 0.7962560147650122,
"grad_norm": 0.02945362776517868,
"learning_rate": 0.00012043805251352422,
"loss": 0.0298,
"step": 3020
},
{
"epoch": 0.7975743194252192,
"grad_norm": 0.27889615297317505,
"learning_rate": 0.00012030610898535428,
"loss": 0.0251,
"step": 3025
},
{
"epoch": 0.7988926240854262,
"grad_norm": 0.05873241275548935,
"learning_rate": 0.00012017416545718434,
"loss": 0.0132,
"step": 3030
},
{
"epoch": 0.8002109287456332,
"grad_norm": 0.1570046991109848,
"learning_rate": 0.00012004222192901439,
"loss": 0.0228,
"step": 3035
},
{
"epoch": 0.80152923340584,
"grad_norm": 0.12575332820415497,
"learning_rate": 0.00011991027840084443,
"loss": 0.0049,
"step": 3040
},
{
"epoch": 0.802847538066047,
"grad_norm": 0.8416435122489929,
"learning_rate": 0.00011977833487267449,
"loss": 0.0542,
"step": 3045
},
{
"epoch": 0.804165842726254,
"grad_norm": 0.2605098485946655,
"learning_rate": 0.00011964639134450454,
"loss": 0.0084,
"step": 3050
},
{
"epoch": 0.805484147386461,
"grad_norm": 0.8996294736862183,
"learning_rate": 0.00011951444781633461,
"loss": 0.0442,
"step": 3055
},
{
"epoch": 0.806802452046668,
"grad_norm": 2.7525105476379395,
"learning_rate": 0.00011938250428816467,
"loss": 0.0642,
"step": 3060
},
{
"epoch": 0.808120756706875,
"grad_norm": 0.14955930411815643,
"learning_rate": 0.00011925056075999473,
"loss": 0.0384,
"step": 3065
},
{
"epoch": 0.8094390613670819,
"grad_norm": 0.018756115809082985,
"learning_rate": 0.00011911861723182478,
"loss": 0.0154,
"step": 3070
},
{
"epoch": 0.8107573660272889,
"grad_norm": 0.23998615145683289,
"learning_rate": 0.00011898667370365484,
"loss": 0.0413,
"step": 3075
},
{
"epoch": 0.8120756706874959,
"grad_norm": 0.27253249287605286,
"learning_rate": 0.00011885473017548489,
"loss": 0.0081,
"step": 3080
},
{
"epoch": 0.8133939753477029,
"grad_norm": 0.2925993502140045,
"learning_rate": 0.00011872278664731495,
"loss": 0.0332,
"step": 3085
},
{
"epoch": 0.8147122800079099,
"grad_norm": 0.5364832878112793,
"learning_rate": 0.00011859084311914502,
"loss": 0.0143,
"step": 3090
},
{
"epoch": 0.8160305846681168,
"grad_norm": 0.32104921340942383,
"learning_rate": 0.00011845889959097507,
"loss": 0.0216,
"step": 3095
},
{
"epoch": 0.8173488893283237,
"grad_norm": 0.0205856766551733,
"learning_rate": 0.00011832695606280513,
"loss": 0.0346,
"step": 3100
},
{
"epoch": 0.8186671939885307,
"grad_norm": 0.2541547417640686,
"learning_rate": 0.00011819501253463518,
"loss": 0.0793,
"step": 3105
},
{
"epoch": 0.8199854986487377,
"grad_norm": 0.08333491533994675,
"learning_rate": 0.00011806306900646524,
"loss": 0.0049,
"step": 3110
},
{
"epoch": 0.8213038033089447,
"grad_norm": 0.0355968177318573,
"learning_rate": 0.0001179311254782953,
"loss": 0.0051,
"step": 3115
},
{
"epoch": 0.8226221079691517,
"grad_norm": 0.06948401033878326,
"learning_rate": 0.00011779918195012536,
"loss": 0.013,
"step": 3120
},
{
"epoch": 0.8239404126293587,
"grad_norm": 0.03328891843557358,
"learning_rate": 0.00011766723842195542,
"loss": 0.0122,
"step": 3125
},
{
"epoch": 0.8252587172895656,
"grad_norm": 0.013782350346446037,
"learning_rate": 0.00011753529489378548,
"loss": 0.0073,
"step": 3130
},
{
"epoch": 0.8265770219497726,
"grad_norm": 0.024390392005443573,
"learning_rate": 0.00011740335136561553,
"loss": 0.0143,
"step": 3135
},
{
"epoch": 0.8278953266099796,
"grad_norm": 0.002548128366470337,
"learning_rate": 0.00011727140783744557,
"loss": 0.0027,
"step": 3140
},
{
"epoch": 0.8292136312701865,
"grad_norm": 0.11674848943948746,
"learning_rate": 0.00011713946430927563,
"loss": 0.0253,
"step": 3145
},
{
"epoch": 0.8305319359303935,
"grad_norm": 0.005774884019047022,
"learning_rate": 0.00011700752078110568,
"loss": 0.0018,
"step": 3150
},
{
"epoch": 0.8318502405906005,
"grad_norm": 0.5763069987297058,
"learning_rate": 0.00011687557725293574,
"loss": 0.0119,
"step": 3155
},
{
"epoch": 0.8331685452508074,
"grad_norm": 0.0027607593219727278,
"learning_rate": 0.0001167436337247658,
"loss": 0.0279,
"step": 3160
},
{
"epoch": 0.8344868499110144,
"grad_norm": 1.859642505645752,
"learning_rate": 0.00011661169019659585,
"loss": 0.0228,
"step": 3165
},
{
"epoch": 0.8358051545712214,
"grad_norm": 0.16597022116184235,
"learning_rate": 0.00011647974666842592,
"loss": 0.1228,
"step": 3170
},
{
"epoch": 0.8371234592314284,
"grad_norm": 0.33833742141723633,
"learning_rate": 0.00011634780314025598,
"loss": 0.073,
"step": 3175
},
{
"epoch": 0.8384417638916354,
"grad_norm": 0.024682912975549698,
"learning_rate": 0.00011621585961208603,
"loss": 0.0042,
"step": 3180
},
{
"epoch": 0.8397600685518424,
"grad_norm": 0.05926942452788353,
"learning_rate": 0.00011608391608391609,
"loss": 0.0066,
"step": 3185
},
{
"epoch": 0.8410783732120493,
"grad_norm": 0.1414029747247696,
"learning_rate": 0.00011595197255574614,
"loss": 0.0603,
"step": 3190
},
{
"epoch": 0.8423966778722562,
"grad_norm": 0.37928736209869385,
"learning_rate": 0.0001158200290275762,
"loss": 0.0266,
"step": 3195
},
{
"epoch": 0.8437149825324632,
"grad_norm": 0.018329354003071785,
"learning_rate": 0.00011568808549940627,
"loss": 0.0047,
"step": 3200
},
{
"epoch": 0.8450332871926702,
"grad_norm": 0.2993735373020172,
"learning_rate": 0.00011555614197123632,
"loss": 0.0218,
"step": 3205
},
{
"epoch": 0.8463515918528772,
"grad_norm": 0.1767728328704834,
"learning_rate": 0.00011542419844306638,
"loss": 0.0363,
"step": 3210
},
{
"epoch": 0.8476698965130842,
"grad_norm": 0.39774414896965027,
"learning_rate": 0.00011529225491489644,
"loss": 0.0506,
"step": 3215
},
{
"epoch": 0.8489882011732911,
"grad_norm": 0.021896762773394585,
"learning_rate": 0.00011516031138672649,
"loss": 0.0081,
"step": 3220
},
{
"epoch": 0.8503065058334981,
"grad_norm": 0.358372300863266,
"learning_rate": 0.00011502836785855655,
"loss": 0.0224,
"step": 3225
},
{
"epoch": 0.8516248104937051,
"grad_norm": 0.01605542004108429,
"learning_rate": 0.00011489642433038662,
"loss": 0.0215,
"step": 3230
},
{
"epoch": 0.8529431151539121,
"grad_norm": 0.021189266815781593,
"learning_rate": 0.00011476448080221667,
"loss": 0.0051,
"step": 3235
},
{
"epoch": 0.8542614198141191,
"grad_norm": 0.013394076377153397,
"learning_rate": 0.0001146325372740467,
"loss": 0.021,
"step": 3240
},
{
"epoch": 0.8555797244743261,
"grad_norm": 0.19848507642745972,
"learning_rate": 0.00011450059374587676,
"loss": 0.0285,
"step": 3245
},
{
"epoch": 0.856898029134533,
"grad_norm": 0.2463046759366989,
"learning_rate": 0.00011436865021770683,
"loss": 0.0384,
"step": 3250
},
{
"epoch": 0.8582163337947399,
"grad_norm": 0.37432390451431274,
"learning_rate": 0.00011423670668953688,
"loss": 0.0098,
"step": 3255
},
{
"epoch": 0.8595346384549469,
"grad_norm": 0.060943394899368286,
"learning_rate": 0.00011410476316136694,
"loss": 0.0087,
"step": 3260
},
{
"epoch": 0.8608529431151539,
"grad_norm": 0.2846696674823761,
"learning_rate": 0.00011397281963319699,
"loss": 0.0148,
"step": 3265
},
{
"epoch": 0.8621712477753609,
"grad_norm": 0.009311323054134846,
"learning_rate": 0.00011384087610502705,
"loss": 0.0024,
"step": 3270
},
{
"epoch": 0.8634895524355679,
"grad_norm": 0.046277035027742386,
"learning_rate": 0.0001137089325768571,
"loss": 0.0274,
"step": 3275
},
{
"epoch": 0.8648078570957748,
"grad_norm": 0.006024620030075312,
"learning_rate": 0.00011357698904868716,
"loss": 0.0286,
"step": 3280
},
{
"epoch": 0.8661261617559818,
"grad_norm": 0.033578380942344666,
"learning_rate": 0.00011344504552051723,
"loss": 0.0153,
"step": 3285
},
{
"epoch": 0.8674444664161888,
"grad_norm": 0.8537917137145996,
"learning_rate": 0.00011331310199234728,
"loss": 0.0304,
"step": 3290
},
{
"epoch": 0.8687627710763958,
"grad_norm": 0.013933337293565273,
"learning_rate": 0.00011318115846417734,
"loss": 0.0112,
"step": 3295
},
{
"epoch": 0.8700810757366028,
"grad_norm": 0.35437721014022827,
"learning_rate": 0.0001130492149360074,
"loss": 0.0228,
"step": 3300
},
{
"epoch": 0.8713993803968098,
"grad_norm": 1.3024121522903442,
"learning_rate": 0.00011291727140783745,
"loss": 0.0203,
"step": 3305
},
{
"epoch": 0.8727176850570166,
"grad_norm": 0.5131255984306335,
"learning_rate": 0.00011278532787966751,
"loss": 0.0181,
"step": 3310
},
{
"epoch": 0.8740359897172236,
"grad_norm": 0.039366886019706726,
"learning_rate": 0.00011265338435149758,
"loss": 0.0192,
"step": 3315
},
{
"epoch": 0.8753542943774306,
"grad_norm": 0.13679669797420502,
"learning_rate": 0.00011252144082332763,
"loss": 0.004,
"step": 3320
},
{
"epoch": 0.8766725990376376,
"grad_norm": 0.003076886525377631,
"learning_rate": 0.00011238949729515769,
"loss": 0.0405,
"step": 3325
},
{
"epoch": 0.8779909036978446,
"grad_norm": 0.019953785464167595,
"learning_rate": 0.00011225755376698774,
"loss": 0.0241,
"step": 3330
},
{
"epoch": 0.8793092083580516,
"grad_norm": 0.007980377413332462,
"learning_rate": 0.0001121256102388178,
"loss": 0.0064,
"step": 3335
},
{
"epoch": 0.8806275130182585,
"grad_norm": 0.018761295825242996,
"learning_rate": 0.00011199366671064784,
"loss": 0.0032,
"step": 3340
},
{
"epoch": 0.8819458176784655,
"grad_norm": 0.022511709481477737,
"learning_rate": 0.0001118617231824779,
"loss": 0.0055,
"step": 3345
},
{
"epoch": 0.8832641223386725,
"grad_norm": 0.021270718425512314,
"learning_rate": 0.00011172977965430795,
"loss": 0.033,
"step": 3350
},
{
"epoch": 0.8845824269988795,
"grad_norm": 0.02710561640560627,
"learning_rate": 0.00011159783612613801,
"loss": 0.0094,
"step": 3355
},
{
"epoch": 0.8859007316590864,
"grad_norm": 0.4353378117084503,
"learning_rate": 0.00011146589259796806,
"loss": 0.0089,
"step": 3360
},
{
"epoch": 0.8872190363192934,
"grad_norm": 0.0257766991853714,
"learning_rate": 0.00011133394906979813,
"loss": 0.0059,
"step": 3365
},
{
"epoch": 0.8885373409795003,
"grad_norm": 0.80838942527771,
"learning_rate": 0.00011120200554162819,
"loss": 0.0263,
"step": 3370
},
{
"epoch": 0.8898556456397073,
"grad_norm": 0.007799761835485697,
"learning_rate": 0.00011107006201345824,
"loss": 0.0028,
"step": 3375
},
{
"epoch": 0.8911739502999143,
"grad_norm": 0.007315775845199823,
"learning_rate": 0.0001109381184852883,
"loss": 0.0127,
"step": 3380
},
{
"epoch": 0.8924922549601213,
"grad_norm": 1.4861233234405518,
"learning_rate": 0.00011080617495711836,
"loss": 0.0562,
"step": 3385
},
{
"epoch": 0.8938105596203283,
"grad_norm": 0.010219530202448368,
"learning_rate": 0.00011067423142894841,
"loss": 0.0438,
"step": 3390
},
{
"epoch": 0.8951288642805353,
"grad_norm": 1.0191857814788818,
"learning_rate": 0.00011054228790077848,
"loss": 0.0493,
"step": 3395
},
{
"epoch": 0.8964471689407422,
"grad_norm": 0.01459536887705326,
"learning_rate": 0.00011041034437260854,
"loss": 0.0117,
"step": 3400
},
{
"epoch": 0.8977654736009492,
"grad_norm": 0.008682495914399624,
"learning_rate": 0.00011027840084443859,
"loss": 0.02,
"step": 3405
},
{
"epoch": 0.8990837782611562,
"grad_norm": 0.02197263017296791,
"learning_rate": 0.00011014645731626865,
"loss": 0.0454,
"step": 3410
},
{
"epoch": 0.9004020829213631,
"grad_norm": 0.01436714269220829,
"learning_rate": 0.0001100145137880987,
"loss": 0.0283,
"step": 3415
},
{
"epoch": 0.9017203875815701,
"grad_norm": 0.14327946305274963,
"learning_rate": 0.00010988257025992876,
"loss": 0.0461,
"step": 3420
},
{
"epoch": 0.9030386922417771,
"grad_norm": 1.671773910522461,
"learning_rate": 0.00010975062673175883,
"loss": 0.054,
"step": 3425
},
{
"epoch": 0.904356996901984,
"grad_norm": 0.009926804341375828,
"learning_rate": 0.00010961868320358888,
"loss": 0.0429,
"step": 3430
},
{
"epoch": 0.905675301562191,
"grad_norm": 0.554020881652832,
"learning_rate": 0.00010948673967541894,
"loss": 0.0618,
"step": 3435
},
{
"epoch": 0.906993606222398,
"grad_norm": 0.1399248093366623,
"learning_rate": 0.00010935479614724897,
"loss": 0.0229,
"step": 3440
},
{
"epoch": 0.908311910882605,
"grad_norm": 0.02739197015762329,
"learning_rate": 0.00010922285261907904,
"loss": 0.0082,
"step": 3445
},
{
"epoch": 0.909630215542812,
"grad_norm": 0.33394527435302734,
"learning_rate": 0.00010909090909090909,
"loss": 0.0403,
"step": 3450
},
{
"epoch": 0.9109485202030189,
"grad_norm": 0.08083894103765488,
"learning_rate": 0.00010895896556273915,
"loss": 0.0406,
"step": 3455
},
{
"epoch": 0.9122668248632259,
"grad_norm": 0.39336663484573364,
"learning_rate": 0.0001088270220345692,
"loss": 0.02,
"step": 3460
},
{
"epoch": 0.9135851295234328,
"grad_norm": 0.20481553673744202,
"learning_rate": 0.00010869507850639926,
"loss": 0.0221,
"step": 3465
},
{
"epoch": 0.9149034341836398,
"grad_norm": 1.4507408142089844,
"learning_rate": 0.00010856313497822932,
"loss": 0.0357,
"step": 3470
},
{
"epoch": 0.9162217388438468,
"grad_norm": 0.2678806483745575,
"learning_rate": 0.00010843119145005937,
"loss": 0.0181,
"step": 3475
},
{
"epoch": 0.9175400435040538,
"grad_norm": 0.007361674215644598,
"learning_rate": 0.00010829924792188944,
"loss": 0.0978,
"step": 3480
},
{
"epoch": 0.9188583481642607,
"grad_norm": 0.773695707321167,
"learning_rate": 0.0001081673043937195,
"loss": 0.0401,
"step": 3485
},
{
"epoch": 0.9201766528244677,
"grad_norm": 0.0010772625682875514,
"learning_rate": 0.00010803536086554955,
"loss": 0.0233,
"step": 3490
},
{
"epoch": 0.9214949574846747,
"grad_norm": 0.08971104770898819,
"learning_rate": 0.00010790341733737961,
"loss": 0.0319,
"step": 3495
},
{
"epoch": 0.9228132621448817,
"grad_norm": 0.21372731029987335,
"learning_rate": 0.00010777147380920966,
"loss": 0.0315,
"step": 3500
},
{
"epoch": 0.9228132621448817,
"eval_loss": 0.02952708676457405,
"eval_runtime": 451.5837,
"eval_samples_per_second": 7.467,
"eval_steps_per_second": 3.734,
"step": 3500
},
{
"epoch": 0.9241315668050887,
"grad_norm": 0.016639264300465584,
"learning_rate": 0.00010763953028103972,
"loss": 0.0125,
"step": 3505
},
{
"epoch": 0.9254498714652957,
"grad_norm": 0.46340492367744446,
"learning_rate": 0.00010750758675286979,
"loss": 0.0186,
"step": 3510
},
{
"epoch": 0.9267681761255026,
"grad_norm": 0.01847526989877224,
"learning_rate": 0.00010737564322469984,
"loss": 0.0026,
"step": 3515
},
{
"epoch": 0.9280864807857095,
"grad_norm": 0.5947860479354858,
"learning_rate": 0.0001072436996965299,
"loss": 0.0259,
"step": 3520
},
{
"epoch": 0.9294047854459165,
"grad_norm": 0.06145291402935982,
"learning_rate": 0.00010711175616835995,
"loss": 0.0057,
"step": 3525
},
{
"epoch": 0.9307230901061235,
"grad_norm": 0.0143959429115057,
"learning_rate": 0.00010697981264019001,
"loss": 0.0145,
"step": 3530
},
{
"epoch": 0.9320413947663305,
"grad_norm": 0.21143831312656403,
"learning_rate": 0.00010684786911202007,
"loss": 0.0459,
"step": 3535
},
{
"epoch": 0.9333596994265375,
"grad_norm": 0.02548077143728733,
"learning_rate": 0.00010671592558385011,
"loss": 0.0051,
"step": 3540
},
{
"epoch": 0.9346780040867444,
"grad_norm": 0.008077048696577549,
"learning_rate": 0.00010658398205568016,
"loss": 0.0306,
"step": 3545
},
{
"epoch": 0.9359963087469514,
"grad_norm": 0.0030760422814637423,
"learning_rate": 0.00010645203852751022,
"loss": 0.0575,
"step": 3550
},
{
"epoch": 0.9373146134071584,
"grad_norm": 0.18114158511161804,
"learning_rate": 0.00010632009499934027,
"loss": 0.0885,
"step": 3555
},
{
"epoch": 0.9386329180673654,
"grad_norm": 0.02450549602508545,
"learning_rate": 0.00010618815147117034,
"loss": 0.0045,
"step": 3560
},
{
"epoch": 0.9399512227275724,
"grad_norm": 0.1238626018166542,
"learning_rate": 0.0001060562079430004,
"loss": 0.0166,
"step": 3565
},
{
"epoch": 0.9412695273877794,
"grad_norm": 0.1879919469356537,
"learning_rate": 0.00010592426441483046,
"loss": 0.0077,
"step": 3570
},
{
"epoch": 0.9425878320479862,
"grad_norm": 0.11323565989732742,
"learning_rate": 0.00010579232088666051,
"loss": 0.0213,
"step": 3575
},
{
"epoch": 0.9439061367081932,
"grad_norm": 0.35575854778289795,
"learning_rate": 0.00010566037735849057,
"loss": 0.0336,
"step": 3580
},
{
"epoch": 0.9452244413684002,
"grad_norm": 0.14052227139472961,
"learning_rate": 0.00010552843383032062,
"loss": 0.0325,
"step": 3585
},
{
"epoch": 0.9465427460286072,
"grad_norm": 0.2643798887729645,
"learning_rate": 0.00010539649030215069,
"loss": 0.0192,
"step": 3590
},
{
"epoch": 0.9478610506888142,
"grad_norm": 0.3207031190395355,
"learning_rate": 0.00010526454677398075,
"loss": 0.0221,
"step": 3595
},
{
"epoch": 0.9491793553490212,
"grad_norm": 0.022803861647844315,
"learning_rate": 0.0001051326032458108,
"loss": 0.029,
"step": 3600
},
{
"epoch": 0.9504976600092281,
"grad_norm": 0.02511664852499962,
"learning_rate": 0.00010500065971764086,
"loss": 0.0422,
"step": 3605
},
{
"epoch": 0.9518159646694351,
"grad_norm": 0.06505445390939713,
"learning_rate": 0.00010486871618947091,
"loss": 0.0092,
"step": 3610
},
{
"epoch": 0.9531342693296421,
"grad_norm": 0.09998584538698196,
"learning_rate": 0.00010473677266130097,
"loss": 0.0242,
"step": 3615
},
{
"epoch": 0.9544525739898491,
"grad_norm": 0.9645698666572571,
"learning_rate": 0.00010460482913313104,
"loss": 0.0124,
"step": 3620
},
{
"epoch": 0.955770878650056,
"grad_norm": 0.2389964610338211,
"learning_rate": 0.0001044728856049611,
"loss": 0.0169,
"step": 3625
},
{
"epoch": 0.957089183310263,
"grad_norm": 2.030608654022217,
"learning_rate": 0.00010434094207679115,
"loss": 0.0518,
"step": 3630
},
{
"epoch": 0.9584074879704699,
"grad_norm": 0.05979987606406212,
"learning_rate": 0.0001042089985486212,
"loss": 0.0081,
"step": 3635
},
{
"epoch": 0.9597257926306769,
"grad_norm": 0.15761719644069672,
"learning_rate": 0.00010407705502045125,
"loss": 0.0061,
"step": 3640
},
{
"epoch": 0.9610440972908839,
"grad_norm": 0.6534290909767151,
"learning_rate": 0.0001039451114922813,
"loss": 0.0104,
"step": 3645
},
{
"epoch": 0.9623624019510909,
"grad_norm": 1.0324147939682007,
"learning_rate": 0.00010381316796411136,
"loss": 0.0381,
"step": 3650
},
{
"epoch": 0.9636807066112979,
"grad_norm": 0.002968872431665659,
"learning_rate": 0.00010368122443594142,
"loss": 0.0343,
"step": 3655
},
{
"epoch": 0.9649990112715049,
"grad_norm": 0.011243184097111225,
"learning_rate": 0.00010354928090777147,
"loss": 0.019,
"step": 3660
},
{
"epoch": 0.9663173159317118,
"grad_norm": 0.17663739621639252,
"learning_rate": 0.00010341733737960153,
"loss": 0.0452,
"step": 3665
},
{
"epoch": 0.9676356205919188,
"grad_norm": 1.2647719383239746,
"learning_rate": 0.00010328539385143158,
"loss": 0.0154,
"step": 3670
},
{
"epoch": 0.9689539252521258,
"grad_norm": 0.3691752552986145,
"learning_rate": 0.00010315345032326165,
"loss": 0.028,
"step": 3675
},
{
"epoch": 0.9702722299123328,
"grad_norm": 0.0015879774000495672,
"learning_rate": 0.00010302150679509171,
"loss": 0.0202,
"step": 3680
},
{
"epoch": 0.9715905345725397,
"grad_norm": 0.1441984623670578,
"learning_rate": 0.00010288956326692176,
"loss": 0.0221,
"step": 3685
},
{
"epoch": 0.9729088392327467,
"grad_norm": 0.20431455969810486,
"learning_rate": 0.00010275761973875182,
"loss": 0.0072,
"step": 3690
},
{
"epoch": 0.9742271438929536,
"grad_norm": 0.861625611782074,
"learning_rate": 0.00010262567621058187,
"loss": 0.0523,
"step": 3695
},
{
"epoch": 0.9755454485531606,
"grad_norm": 0.005049478262662888,
"learning_rate": 0.00010249373268241193,
"loss": 0.0051,
"step": 3700
},
{
"epoch": 0.9768637532133676,
"grad_norm": 0.49685510993003845,
"learning_rate": 0.000102361789154242,
"loss": 0.023,
"step": 3705
},
{
"epoch": 0.9781820578735746,
"grad_norm": 0.08789395540952682,
"learning_rate": 0.00010222984562607205,
"loss": 0.0159,
"step": 3710
},
{
"epoch": 0.9795003625337816,
"grad_norm": 0.027168691158294678,
"learning_rate": 0.00010209790209790211,
"loss": 0.0083,
"step": 3715
},
{
"epoch": 0.9808186671939886,
"grad_norm": 0.0006773864733986557,
"learning_rate": 0.00010196595856973217,
"loss": 0.0048,
"step": 3720
},
{
"epoch": 0.9821369718541955,
"grad_norm": 0.01636457070708275,
"learning_rate": 0.00010183401504156222,
"loss": 0.0159,
"step": 3725
},
{
"epoch": 0.9834552765144025,
"grad_norm": 0.10160859674215317,
"learning_rate": 0.00010170207151339228,
"loss": 0.0047,
"step": 3730
},
{
"epoch": 0.9847735811746094,
"grad_norm": 0.14173269271850586,
"learning_rate": 0.00010157012798522232,
"loss": 0.006,
"step": 3735
},
{
"epoch": 0.9860918858348164,
"grad_norm": 0.003458512481302023,
"learning_rate": 0.00010143818445705238,
"loss": 0.0193,
"step": 3740
},
{
"epoch": 0.9874101904950234,
"grad_norm": 0.005163820460438728,
"learning_rate": 0.00010130624092888243,
"loss": 0.0039,
"step": 3745
},
{
"epoch": 0.9887284951552304,
"grad_norm": 0.005913791712373495,
"learning_rate": 0.00010117429740071249,
"loss": 0.0119,
"step": 3750
},
{
"epoch": 0.9900467998154373,
"grad_norm": 0.00800853967666626,
"learning_rate": 0.00010104235387254256,
"loss": 0.044,
"step": 3755
},
{
"epoch": 0.9913651044756443,
"grad_norm": 0.18146778643131256,
"learning_rate": 0.00010091041034437261,
"loss": 0.0048,
"step": 3760
},
{
"epoch": 0.9926834091358513,
"grad_norm": 0.01235104724764824,
"learning_rate": 0.00010077846681620267,
"loss": 0.0017,
"step": 3765
},
{
"epoch": 0.9940017137960583,
"grad_norm": 0.17677897214889526,
"learning_rate": 0.00010064652328803272,
"loss": 0.0339,
"step": 3770
},
{
"epoch": 0.9953200184562653,
"grad_norm": 0.0017472271574661136,
"learning_rate": 0.00010051457975986278,
"loss": 0.0494,
"step": 3775
},
{
"epoch": 0.9966383231164723,
"grad_norm": 0.10814860463142395,
"learning_rate": 0.00010038263623169283,
"loss": 0.0741,
"step": 3780
},
{
"epoch": 0.9979566277766792,
"grad_norm": 0.11329760402441025,
"learning_rate": 0.0001002506927035229,
"loss": 0.0182,
"step": 3785
},
{
"epoch": 0.9992749324368861,
"grad_norm": 0.11573276668787003,
"learning_rate": 0.00010011874917535296,
"loss": 0.0068,
"step": 3790
},
{
"epoch": 1.000790982796124,
"grad_norm": 0.08449886739253998,
"learning_rate": 9.998680564718301e-05,
"loss": 0.0141,
"step": 3795
},
{
"epoch": 1.002109287456331,
"grad_norm": 0.05035184696316719,
"learning_rate": 9.985486211901307e-05,
"loss": 0.0293,
"step": 3800
},
{
"epoch": 1.003427592116538,
"grad_norm": 0.0255444198846817,
"learning_rate": 9.972291859084313e-05,
"loss": 0.0054,
"step": 3805
},
{
"epoch": 1.004745896776745,
"grad_norm": 0.0033677336759865284,
"learning_rate": 9.959097506267318e-05,
"loss": 0.0567,
"step": 3810
},
{
"epoch": 1.006064201436952,
"grad_norm": 0.09453682601451874,
"learning_rate": 9.945903153450324e-05,
"loss": 0.0589,
"step": 3815
},
{
"epoch": 1.007382506097159,
"grad_norm": 0.01592979207634926,
"learning_rate": 9.932708800633329e-05,
"loss": 0.0043,
"step": 3820
},
{
"epoch": 1.008700810757366,
"grad_norm": 0.002263693604618311,
"learning_rate": 9.919514447816335e-05,
"loss": 0.0195,
"step": 3825
},
{
"epoch": 1.010019115417573,
"grad_norm": 0.013390793465077877,
"learning_rate": 9.90632009499934e-05,
"loss": 0.0152,
"step": 3830
},
{
"epoch": 1.01133742007778,
"grad_norm": 0.10473847389221191,
"learning_rate": 9.893125742182346e-05,
"loss": 0.0606,
"step": 3835
},
{
"epoch": 1.012655724737987,
"grad_norm": 0.05837221071124077,
"learning_rate": 9.879931389365353e-05,
"loss": 0.0121,
"step": 3840
},
{
"epoch": 1.013974029398194,
"grad_norm": 0.3803791105747223,
"learning_rate": 9.866737036548358e-05,
"loss": 0.0386,
"step": 3845
},
{
"epoch": 1.0152923340584008,
"grad_norm": 0.4067519009113312,
"learning_rate": 9.853542683731364e-05,
"loss": 0.0115,
"step": 3850
},
{
"epoch": 1.0166106387186078,
"grad_norm": 0.02585229091346264,
"learning_rate": 9.84034833091437e-05,
"loss": 0.0214,
"step": 3855
},
{
"epoch": 1.0179289433788148,
"grad_norm": 0.03670825809240341,
"learning_rate": 9.827153978097374e-05,
"loss": 0.0059,
"step": 3860
},
{
"epoch": 1.0192472480390218,
"grad_norm": 0.014171554706990719,
"learning_rate": 9.81395962528038e-05,
"loss": 0.0145,
"step": 3865
},
{
"epoch": 1.0205655526992288,
"grad_norm": 0.027376385405659676,
"learning_rate": 9.800765272463386e-05,
"loss": 0.0089,
"step": 3870
},
{
"epoch": 1.0218838573594358,
"grad_norm": 0.03168405964970589,
"learning_rate": 9.787570919646392e-05,
"loss": 0.0132,
"step": 3875
},
{
"epoch": 1.0232021620196428,
"grad_norm": 0.03346199914813042,
"learning_rate": 9.774376566829397e-05,
"loss": 0.0246,
"step": 3880
},
{
"epoch": 1.0245204666798498,
"grad_norm": 0.00894144270569086,
"learning_rate": 9.761182214012403e-05,
"loss": 0.0105,
"step": 3885
},
{
"epoch": 1.0258387713400567,
"grad_norm": 0.3172806203365326,
"learning_rate": 9.747987861195409e-05,
"loss": 0.0103,
"step": 3890
},
{
"epoch": 1.0271570760002637,
"grad_norm": 0.009055040776729584,
"learning_rate": 9.734793508378414e-05,
"loss": 0.0103,
"step": 3895
},
{
"epoch": 1.0284753806604707,
"grad_norm": 0.014140011742711067,
"learning_rate": 9.721599155561421e-05,
"loss": 0.0037,
"step": 3900
},
{
"epoch": 1.0297936853206777,
"grad_norm": 0.008317383006215096,
"learning_rate": 9.708404802744427e-05,
"loss": 0.002,
"step": 3905
},
{
"epoch": 1.0311119899808845,
"grad_norm": 0.005038558971136808,
"learning_rate": 9.695210449927431e-05,
"loss": 0.0017,
"step": 3910
},
{
"epoch": 1.0324302946410915,
"grad_norm": 0.40058520436286926,
"learning_rate": 9.682016097110436e-05,
"loss": 0.0065,
"step": 3915
},
{
"epoch": 1.0337485993012985,
"grad_norm": 0.005197151098400354,
"learning_rate": 9.668821744293442e-05,
"loss": 0.0031,
"step": 3920
},
{
"epoch": 1.0350669039615055,
"grad_norm": 0.014353781007230282,
"learning_rate": 9.655627391476449e-05,
"loss": 0.0009,
"step": 3925
},
{
"epoch": 1.0363852086217125,
"grad_norm": 0.13260559737682343,
"learning_rate": 9.642433038659454e-05,
"loss": 0.0323,
"step": 3930
},
{
"epoch": 1.0377035132819195,
"grad_norm": 0.006795065477490425,
"learning_rate": 9.62923868584246e-05,
"loss": 0.0022,
"step": 3935
},
{
"epoch": 1.0390218179421264,
"grad_norm": 0.2276086062192917,
"learning_rate": 9.616044333025466e-05,
"loss": 0.0221,
"step": 3940
},
{
"epoch": 1.0403401226023334,
"grad_norm": 0.06121920794248581,
"learning_rate": 9.602849980208471e-05,
"loss": 0.0037,
"step": 3945
},
{
"epoch": 1.0416584272625404,
"grad_norm": 0.9180755019187927,
"learning_rate": 9.589655627391477e-05,
"loss": 0.0589,
"step": 3950
},
{
"epoch": 1.0429767319227474,
"grad_norm": 0.07515591382980347,
"learning_rate": 9.576461274574484e-05,
"loss": 0.0653,
"step": 3955
},
{
"epoch": 1.0442950365829544,
"grad_norm": 0.018060607835650444,
"learning_rate": 9.563266921757488e-05,
"loss": 0.0178,
"step": 3960
},
{
"epoch": 1.0456133412431612,
"grad_norm": 0.02751368284225464,
"learning_rate": 9.550072568940493e-05,
"loss": 0.0076,
"step": 3965
},
{
"epoch": 1.0469316459033682,
"grad_norm": 0.653998613357544,
"learning_rate": 9.536878216123499e-05,
"loss": 0.0066,
"step": 3970
},
{
"epoch": 1.0482499505635752,
"grad_norm": 0.3117768168449402,
"learning_rate": 9.523683863306505e-05,
"loss": 0.0087,
"step": 3975
},
{
"epoch": 1.0495682552237822,
"grad_norm": 0.013952831737697124,
"learning_rate": 9.510489510489511e-05,
"loss": 0.0037,
"step": 3980
},
{
"epoch": 1.0508865598839892,
"grad_norm": 0.01806250400841236,
"learning_rate": 9.497295157672517e-05,
"loss": 0.0028,
"step": 3985
},
{
"epoch": 1.0522048645441962,
"grad_norm": 0.13678006827831268,
"learning_rate": 9.484100804855523e-05,
"loss": 0.0533,
"step": 3990
},
{
"epoch": 1.0535231692044031,
"grad_norm": 0.14869382977485657,
"learning_rate": 9.470906452038528e-05,
"loss": 0.009,
"step": 3995
},
{
"epoch": 1.0548414738646101,
"grad_norm": 0.33614659309387207,
"learning_rate": 9.457712099221534e-05,
"loss": 0.0555,
"step": 4000
},
{
"epoch": 1.0548414738646101,
"eval_loss": 0.026165226474404335,
"eval_runtime": 452.2482,
"eval_samples_per_second": 7.456,
"eval_steps_per_second": 3.728,
"step": 4000
},
{
"epoch": 1.0561597785248171,
"grad_norm": 0.007546027656644583,
"learning_rate": 9.444517746404539e-05,
"loss": 0.0029,
"step": 4005
},
{
"epoch": 1.0574780831850241,
"grad_norm": 0.3720332384109497,
"learning_rate": 9.431323393587545e-05,
"loss": 0.0353,
"step": 4010
},
{
"epoch": 1.0587963878452311,
"grad_norm": 1.1335264444351196,
"learning_rate": 9.41812904077055e-05,
"loss": 0.0142,
"step": 4015
},
{
"epoch": 1.060114692505438,
"grad_norm": 0.024723488837480545,
"learning_rate": 9.404934687953556e-05,
"loss": 0.006,
"step": 4020
},
{
"epoch": 1.0614329971656449,
"grad_norm": 0.040354058146476746,
"learning_rate": 9.391740335136562e-05,
"loss": 0.0107,
"step": 4025
},
{
"epoch": 1.0627513018258519,
"grad_norm": 0.222810298204422,
"learning_rate": 9.378545982319567e-05,
"loss": 0.0273,
"step": 4030
},
{
"epoch": 1.0640696064860589,
"grad_norm": 0.025684095919132233,
"learning_rate": 9.365351629502574e-05,
"loss": 0.0033,
"step": 4035
},
{
"epoch": 1.0653879111462659,
"grad_norm": 0.05338352546095848,
"learning_rate": 9.35215727668558e-05,
"loss": 0.0052,
"step": 4040
},
{
"epoch": 1.0667062158064728,
"grad_norm": 0.06182330474257469,
"learning_rate": 9.338962923868585e-05,
"loss": 0.0038,
"step": 4045
},
{
"epoch": 1.0680245204666798,
"grad_norm": 0.012170832604169846,
"learning_rate": 9.325768571051591e-05,
"loss": 0.0018,
"step": 4050
},
{
"epoch": 1.0693428251268868,
"grad_norm": 0.5424306392669678,
"learning_rate": 9.312574218234596e-05,
"loss": 0.0445,
"step": 4055
},
{
"epoch": 1.0706611297870938,
"grad_norm": 0.017939254641532898,
"learning_rate": 9.299379865417602e-05,
"loss": 0.0389,
"step": 4060
},
{
"epoch": 1.0719794344473008,
"grad_norm": 0.0060431682504713535,
"learning_rate": 9.286185512600607e-05,
"loss": 0.0025,
"step": 4065
},
{
"epoch": 1.0732977391075078,
"grad_norm": 0.0071444883942604065,
"learning_rate": 9.272991159783613e-05,
"loss": 0.0333,
"step": 4070
},
{
"epoch": 1.0746160437677148,
"grad_norm": 0.29632750153541565,
"learning_rate": 9.259796806966619e-05,
"loss": 0.0151,
"step": 4075
},
{
"epoch": 1.0759343484279218,
"grad_norm": 0.004526323173195124,
"learning_rate": 9.246602454149624e-05,
"loss": 0.006,
"step": 4080
},
{
"epoch": 1.0772526530881286,
"grad_norm": 0.023945212364196777,
"learning_rate": 9.23340810133263e-05,
"loss": 0.004,
"step": 4085
},
{
"epoch": 1.0785709577483356,
"grad_norm": 0.13235126435756683,
"learning_rate": 9.220213748515635e-05,
"loss": 0.0059,
"step": 4090
},
{
"epoch": 1.0798892624085425,
"grad_norm": 0.17592330276966095,
"learning_rate": 9.207019395698642e-05,
"loss": 0.0302,
"step": 4095
},
{
"epoch": 1.0812075670687495,
"grad_norm": 0.004582866560667753,
"learning_rate": 9.193825042881648e-05,
"loss": 0.009,
"step": 4100
},
{
"epoch": 1.0825258717289565,
"grad_norm": 0.15214525163173676,
"learning_rate": 9.180630690064653e-05,
"loss": 0.0062,
"step": 4105
},
{
"epoch": 1.0838441763891635,
"grad_norm": 0.16535983979701996,
"learning_rate": 9.167436337247658e-05,
"loss": 0.0926,
"step": 4110
},
{
"epoch": 1.0851624810493705,
"grad_norm": 0.013285227119922638,
"learning_rate": 9.154241984430663e-05,
"loss": 0.0043,
"step": 4115
},
{
"epoch": 1.0864807857095775,
"grad_norm": 0.012116984464228153,
"learning_rate": 9.14104763161367e-05,
"loss": 0.0037,
"step": 4120
},
{
"epoch": 1.0877990903697845,
"grad_norm": 0.0373845212161541,
"learning_rate": 9.127853278796676e-05,
"loss": 0.0081,
"step": 4125
},
{
"epoch": 1.0891173950299915,
"grad_norm": 0.09324615448713303,
"learning_rate": 9.114658925979681e-05,
"loss": 0.0534,
"step": 4130
},
{
"epoch": 1.0904356996901985,
"grad_norm": 0.010992968454957008,
"learning_rate": 9.101464573162687e-05,
"loss": 0.0025,
"step": 4135
},
{
"epoch": 1.0917540043504055,
"grad_norm": 0.13710318505764008,
"learning_rate": 9.088270220345692e-05,
"loss": 0.0555,
"step": 4140
},
{
"epoch": 1.0930723090106123,
"grad_norm": 0.010403074324131012,
"learning_rate": 9.075075867528698e-05,
"loss": 0.0042,
"step": 4145
},
{
"epoch": 1.0943906136708192,
"grad_norm": 0.21544460952281952,
"learning_rate": 9.061881514711705e-05,
"loss": 0.0144,
"step": 4150
},
{
"epoch": 1.0957089183310262,
"grad_norm": 0.04194799065589905,
"learning_rate": 9.04868716189471e-05,
"loss": 0.0106,
"step": 4155
},
{
"epoch": 1.0970272229912332,
"grad_norm": 0.029204202815890312,
"learning_rate": 9.035492809077715e-05,
"loss": 0.0085,
"step": 4160
},
{
"epoch": 1.0983455276514402,
"grad_norm": 0.006751026958227158,
"learning_rate": 9.02229845626072e-05,
"loss": 0.0049,
"step": 4165
},
{
"epoch": 1.0996638323116472,
"grad_norm": 0.008232722990214825,
"learning_rate": 9.009104103443726e-05,
"loss": 0.0172,
"step": 4170
},
{
"epoch": 1.1009821369718542,
"grad_norm": 0.05630079656839371,
"learning_rate": 8.995909750626733e-05,
"loss": 0.0112,
"step": 4175
},
{
"epoch": 1.1023004416320612,
"grad_norm": 0.0011601662263274193,
"learning_rate": 8.982715397809738e-05,
"loss": 0.0317,
"step": 4180
},
{
"epoch": 1.1036187462922682,
"grad_norm": 0.006554402410984039,
"learning_rate": 8.969521044992744e-05,
"loss": 0.0035,
"step": 4185
},
{
"epoch": 1.1049370509524752,
"grad_norm": 0.34513652324676514,
"learning_rate": 8.956326692175749e-05,
"loss": 0.0036,
"step": 4190
},
{
"epoch": 1.1062553556126822,
"grad_norm": 0.283669650554657,
"learning_rate": 8.943132339358755e-05,
"loss": 0.0182,
"step": 4195
},
{
"epoch": 1.1075736602728892,
"grad_norm": 0.5376952290534973,
"learning_rate": 8.92993798654176e-05,
"loss": 0.0293,
"step": 4200
},
{
"epoch": 1.108891964933096,
"grad_norm": 0.01689724065363407,
"learning_rate": 8.916743633724767e-05,
"loss": 0.0206,
"step": 4205
},
{
"epoch": 1.110210269593303,
"grad_norm": 0.026538770645856857,
"learning_rate": 8.903549280907772e-05,
"loss": 0.0181,
"step": 4210
},
{
"epoch": 1.11152857425351,
"grad_norm": 0.6372873783111572,
"learning_rate": 8.890354928090777e-05,
"loss": 0.021,
"step": 4215
},
{
"epoch": 1.112846878913717,
"grad_norm": 0.06177428737282753,
"learning_rate": 8.877160575273783e-05,
"loss": 0.0033,
"step": 4220
},
{
"epoch": 1.114165183573924,
"grad_norm": 0.3712109923362732,
"learning_rate": 8.863966222456788e-05,
"loss": 0.0075,
"step": 4225
},
{
"epoch": 1.115483488234131,
"grad_norm": 0.030514653772115707,
"learning_rate": 8.850771869639795e-05,
"loss": 0.0183,
"step": 4230
},
{
"epoch": 1.116801792894338,
"grad_norm": 0.012861707247793674,
"learning_rate": 8.837577516822801e-05,
"loss": 0.0032,
"step": 4235
},
{
"epoch": 1.118120097554545,
"grad_norm": 0.3278522789478302,
"learning_rate": 8.824383164005806e-05,
"loss": 0.0058,
"step": 4240
},
{
"epoch": 1.1194384022147519,
"grad_norm": 0.580259382724762,
"learning_rate": 8.811188811188812e-05,
"loss": 0.0068,
"step": 4245
},
{
"epoch": 1.1207567068749589,
"grad_norm": 0.007002575788646936,
"learning_rate": 8.797994458371817e-05,
"loss": 0.0063,
"step": 4250
},
{
"epoch": 1.1220750115351659,
"grad_norm": 0.22484643757343292,
"learning_rate": 8.784800105554823e-05,
"loss": 0.0167,
"step": 4255
},
{
"epoch": 1.1233933161953726,
"grad_norm": 0.004122686106711626,
"learning_rate": 8.771605752737829e-05,
"loss": 0.002,
"step": 4260
},
{
"epoch": 1.1247116208555796,
"grad_norm": 0.009832561016082764,
"learning_rate": 8.758411399920834e-05,
"loss": 0.0029,
"step": 4265
},
{
"epoch": 1.1260299255157866,
"grad_norm": 0.04854527860879898,
"learning_rate": 8.74521704710384e-05,
"loss": 0.0068,
"step": 4270
},
{
"epoch": 1.1273482301759936,
"grad_norm": 0.12221235036849976,
"learning_rate": 8.732022694286845e-05,
"loss": 0.003,
"step": 4275
},
{
"epoch": 1.1286665348362006,
"grad_norm": 0.005857539363205433,
"learning_rate": 8.718828341469851e-05,
"loss": 0.0022,
"step": 4280
},
{
"epoch": 1.1299848394964076,
"grad_norm": 0.10582758486270905,
"learning_rate": 8.705633988652856e-05,
"loss": 0.002,
"step": 4285
},
{
"epoch": 1.1313031441566146,
"grad_norm": 0.006190940272063017,
"learning_rate": 8.692439635835863e-05,
"loss": 0.0022,
"step": 4290
},
{
"epoch": 1.1326214488168216,
"grad_norm": 0.00221514655277133,
"learning_rate": 8.679245283018869e-05,
"loss": 0.0314,
"step": 4295
},
{
"epoch": 1.1339397534770286,
"grad_norm": 0.0796755850315094,
"learning_rate": 8.666050930201874e-05,
"loss": 0.0347,
"step": 4300
},
{
"epoch": 1.1352580581372356,
"grad_norm": 0.20088806748390198,
"learning_rate": 8.65285657738488e-05,
"loss": 0.0048,
"step": 4305
},
{
"epoch": 1.1365763627974426,
"grad_norm": 0.4018377363681793,
"learning_rate": 8.639662224567884e-05,
"loss": 0.0234,
"step": 4310
},
{
"epoch": 1.1378946674576496,
"grad_norm": 0.014961684122681618,
"learning_rate": 8.626467871750891e-05,
"loss": 0.0033,
"step": 4315
},
{
"epoch": 1.1392129721178565,
"grad_norm": 0.004534922540187836,
"learning_rate": 8.613273518933897e-05,
"loss": 0.0021,
"step": 4320
},
{
"epoch": 1.1405312767780633,
"grad_norm": 0.06340984255075455,
"learning_rate": 8.600079166116902e-05,
"loss": 0.0538,
"step": 4325
},
{
"epoch": 1.1418495814382703,
"grad_norm": 0.007374623324722052,
"learning_rate": 8.586884813299908e-05,
"loss": 0.0157,
"step": 4330
},
{
"epoch": 1.1431678860984773,
"grad_norm": 0.02313193492591381,
"learning_rate": 8.573690460482913e-05,
"loss": 0.0307,
"step": 4335
},
{
"epoch": 1.1444861907586843,
"grad_norm": 0.014071634039282799,
"learning_rate": 8.560496107665919e-05,
"loss": 0.0058,
"step": 4340
},
{
"epoch": 1.1458044954188913,
"grad_norm": 1.4664901494979858,
"learning_rate": 8.547301754848926e-05,
"loss": 0.0566,
"step": 4345
},
{
"epoch": 1.1471228000790983,
"grad_norm": 0.023680074140429497,
"learning_rate": 8.534107402031931e-05,
"loss": 0.0048,
"step": 4350
},
{
"epoch": 1.1484411047393053,
"grad_norm": 0.012555698864161968,
"learning_rate": 8.520913049214937e-05,
"loss": 0.0076,
"step": 4355
},
{
"epoch": 1.1497594093995123,
"grad_norm": 0.013624129816889763,
"learning_rate": 8.507718696397941e-05,
"loss": 0.0373,
"step": 4360
},
{
"epoch": 1.1510777140597193,
"grad_norm": 0.015372387133538723,
"learning_rate": 8.494524343580947e-05,
"loss": 0.0147,
"step": 4365
},
{
"epoch": 1.1523960187199263,
"grad_norm": 0.3312993347644806,
"learning_rate": 8.481329990763954e-05,
"loss": 0.0299,
"step": 4370
},
{
"epoch": 1.1537143233801332,
"grad_norm": 0.023838184773921967,
"learning_rate": 8.468135637946959e-05,
"loss": 0.0226,
"step": 4375
},
{
"epoch": 1.15503262804034,
"grad_norm": 0.42516952753067017,
"learning_rate": 8.454941285129965e-05,
"loss": 0.0088,
"step": 4380
},
{
"epoch": 1.156350932700547,
"grad_norm": 0.6900278925895691,
"learning_rate": 8.44174693231297e-05,
"loss": 0.0245,
"step": 4385
},
{
"epoch": 1.157669237360754,
"grad_norm": 0.2932703197002411,
"learning_rate": 8.428552579495976e-05,
"loss": 0.0207,
"step": 4390
},
{
"epoch": 1.158987542020961,
"grad_norm": 0.12942780554294586,
"learning_rate": 8.415358226678982e-05,
"loss": 0.0037,
"step": 4395
},
{
"epoch": 1.160305846681168,
"grad_norm": 0.9499046802520752,
"learning_rate": 8.402163873861989e-05,
"loss": 0.0246,
"step": 4400
},
{
"epoch": 1.161624151341375,
"grad_norm": 0.008869118988513947,
"learning_rate": 8.388969521044994e-05,
"loss": 0.0171,
"step": 4405
},
{
"epoch": 1.162942456001582,
"grad_norm": 1.7409231662750244,
"learning_rate": 8.375775168227998e-05,
"loss": 0.017,
"step": 4410
},
{
"epoch": 1.164260760661789,
"grad_norm": 0.0020101398695260286,
"learning_rate": 8.362580815411004e-05,
"loss": 0.0027,
"step": 4415
},
{
"epoch": 1.165579065321996,
"grad_norm": 0.0785067081451416,
"learning_rate": 8.34938646259401e-05,
"loss": 0.0043,
"step": 4420
},
{
"epoch": 1.166897369982203,
"grad_norm": 0.0029506285209208727,
"learning_rate": 8.336192109777016e-05,
"loss": 0.0109,
"step": 4425
},
{
"epoch": 1.16821567464241,
"grad_norm": 0.02216683328151703,
"learning_rate": 8.322997756960022e-05,
"loss": 0.0026,
"step": 4430
},
{
"epoch": 1.1695339793026167,
"grad_norm": 0.02216639369726181,
"learning_rate": 8.309803404143027e-05,
"loss": 0.0045,
"step": 4435
},
{
"epoch": 1.170852283962824,
"grad_norm": 0.0,
"learning_rate": 8.296609051326033e-05,
"loss": 0.006,
"step": 4440
},
{
"epoch": 1.1721705886230307,
"grad_norm": 0.0019736960530281067,
"learning_rate": 8.283414698509039e-05,
"loss": 0.0078,
"step": 4445
},
{
"epoch": 1.1734888932832377,
"grad_norm": 0.012957746163010597,
"learning_rate": 8.270220345692044e-05,
"loss": 0.002,
"step": 4450
},
{
"epoch": 1.1748071979434447,
"grad_norm": 0.010877869091928005,
"learning_rate": 8.25702599287505e-05,
"loss": 0.0237,
"step": 4455
},
{
"epoch": 1.1761255026036517,
"grad_norm": 0.005947659723460674,
"learning_rate": 8.243831640058055e-05,
"loss": 0.0341,
"step": 4460
},
{
"epoch": 1.1774438072638587,
"grad_norm": 0.0005026470171287656,
"learning_rate": 8.230637287241061e-05,
"loss": 0.0033,
"step": 4465
},
{
"epoch": 1.1787621119240657,
"grad_norm": 0.022054588422179222,
"learning_rate": 8.217442934424066e-05,
"loss": 0.0042,
"step": 4470
},
{
"epoch": 1.1800804165842727,
"grad_norm": 0.7929030656814575,
"learning_rate": 8.204248581607072e-05,
"loss": 0.0076,
"step": 4475
},
{
"epoch": 1.1813987212444796,
"grad_norm": 0.39052629470825195,
"learning_rate": 8.191054228790078e-05,
"loss": 0.0228,
"step": 4480
},
{
"epoch": 1.1827170259046866,
"grad_norm": 0.007177622988820076,
"learning_rate": 8.177859875973084e-05,
"loss": 0.01,
"step": 4485
},
{
"epoch": 1.1840353305648936,
"grad_norm": 0.006175135262310505,
"learning_rate": 8.16466552315609e-05,
"loss": 0.0037,
"step": 4490
},
{
"epoch": 1.1853536352251006,
"grad_norm": 0.0356481671333313,
"learning_rate": 8.151471170339096e-05,
"loss": 0.0024,
"step": 4495
},
{
"epoch": 1.1866719398853074,
"grad_norm": 0.19069480895996094,
"learning_rate": 8.138276817522101e-05,
"loss": 0.0048,
"step": 4500
},
{
"epoch": 1.1866719398853074,
"eval_loss": 0.026386437937617302,
"eval_runtime": 452.2896,
"eval_samples_per_second": 7.455,
"eval_steps_per_second": 3.728,
"step": 4500
},
{
"epoch": 1.1879902445455144,
"grad_norm": 0.002254961524158716,
"learning_rate": 8.125082464705107e-05,
"loss": 0.0014,
"step": 4505
},
{
"epoch": 1.1893085492057214,
"grad_norm": 0.8026870489120483,
"learning_rate": 8.111888111888112e-05,
"loss": 0.0411,
"step": 4510
},
{
"epoch": 1.1906268538659284,
"grad_norm": 0.47328072786331177,
"learning_rate": 8.098693759071118e-05,
"loss": 0.0271,
"step": 4515
},
{
"epoch": 1.1919451585261354,
"grad_norm": 0.4888288676738739,
"learning_rate": 8.085499406254123e-05,
"loss": 0.039,
"step": 4520
},
{
"epoch": 1.1932634631863424,
"grad_norm": 0.000925812462810427,
"learning_rate": 8.072305053437129e-05,
"loss": 0.0461,
"step": 4525
},
{
"epoch": 1.1945817678465493,
"grad_norm": 0.12472371757030487,
"learning_rate": 8.059110700620135e-05,
"loss": 0.0037,
"step": 4530
},
{
"epoch": 1.1959000725067563,
"grad_norm": 0.002875336678698659,
"learning_rate": 8.04591634780314e-05,
"loss": 0.0425,
"step": 4535
},
{
"epoch": 1.1972183771669633,
"grad_norm": 0.042056187987327576,
"learning_rate": 8.032721994986147e-05,
"loss": 0.0068,
"step": 4540
},
{
"epoch": 1.1985366818271703,
"grad_norm": 0.157605841755867,
"learning_rate": 8.019527642169153e-05,
"loss": 0.0179,
"step": 4545
},
{
"epoch": 1.1998549864873773,
"grad_norm": 0.005153563339263201,
"learning_rate": 8.006333289352158e-05,
"loss": 0.0045,
"step": 4550
},
{
"epoch": 1.201173291147584,
"grad_norm": 0.02541598491370678,
"learning_rate": 7.993138936535164e-05,
"loss": 0.0041,
"step": 4555
},
{
"epoch": 1.2024915958077913,
"grad_norm": 0.04266195371747017,
"learning_rate": 7.979944583718168e-05,
"loss": 0.0121,
"step": 4560
},
{
"epoch": 1.203809900467998,
"grad_norm": 0.36108532547950745,
"learning_rate": 7.966750230901175e-05,
"loss": 0.0147,
"step": 4565
},
{
"epoch": 1.205128205128205,
"grad_norm": 0.40405452251434326,
"learning_rate": 7.95355587808418e-05,
"loss": 0.0056,
"step": 4570
},
{
"epoch": 1.206446509788412,
"grad_norm": 0.030422702431678772,
"learning_rate": 7.940361525267186e-05,
"loss": 0.0055,
"step": 4575
},
{
"epoch": 1.207764814448619,
"grad_norm": 0.014555396512150764,
"learning_rate": 7.927167172450192e-05,
"loss": 0.0029,
"step": 4580
},
{
"epoch": 1.209083119108826,
"grad_norm": 0.33962950110435486,
"learning_rate": 7.913972819633197e-05,
"loss": 0.0191,
"step": 4585
},
{
"epoch": 1.210401423769033,
"grad_norm": 0.040150560438632965,
"learning_rate": 7.900778466816203e-05,
"loss": 0.0096,
"step": 4590
},
{
"epoch": 1.21171972842924,
"grad_norm": 0.2968510091304779,
"learning_rate": 7.88758411399921e-05,
"loss": 0.0311,
"step": 4595
},
{
"epoch": 1.213038033089447,
"grad_norm": 0.04709814116358757,
"learning_rate": 7.874389761182215e-05,
"loss": 0.0175,
"step": 4600
},
{
"epoch": 1.214356337749654,
"grad_norm": 0.1379537284374237,
"learning_rate": 7.861195408365221e-05,
"loss": 0.02,
"step": 4605
},
{
"epoch": 1.215674642409861,
"grad_norm": 0.018291711807250977,
"learning_rate": 7.848001055548225e-05,
"loss": 0.003,
"step": 4610
},
{
"epoch": 1.216992947070068,
"grad_norm": 0.041676126420497894,
"learning_rate": 7.83480670273123e-05,
"loss": 0.0054,
"step": 4615
},
{
"epoch": 1.2183112517302748,
"grad_norm": 0.0013747498160228133,
"learning_rate": 7.821612349914237e-05,
"loss": 0.0132,
"step": 4620
},
{
"epoch": 1.2196295563904818,
"grad_norm": 0.0050489697605371475,
"learning_rate": 7.808417997097243e-05,
"loss": 0.0272,
"step": 4625
},
{
"epoch": 1.2209478610506888,
"grad_norm": 0.017974581569433212,
"learning_rate": 7.795223644280249e-05,
"loss": 0.0037,
"step": 4630
},
{
"epoch": 1.2222661657108957,
"grad_norm": 0.001916698063723743,
"learning_rate": 7.782029291463254e-05,
"loss": 0.002,
"step": 4635
},
{
"epoch": 1.2235844703711027,
"grad_norm": 0.05344574153423309,
"learning_rate": 7.76883493864626e-05,
"loss": 0.0114,
"step": 4640
},
{
"epoch": 1.2249027750313097,
"grad_norm": 0.22823786735534668,
"learning_rate": 7.755640585829265e-05,
"loss": 0.0296,
"step": 4645
},
{
"epoch": 1.2262210796915167,
"grad_norm": 0.02051074244081974,
"learning_rate": 7.742446233012272e-05,
"loss": 0.0037,
"step": 4650
},
{
"epoch": 1.2275393843517237,
"grad_norm": 0.9797061681747437,
"learning_rate": 7.729251880195276e-05,
"loss": 0.011,
"step": 4655
},
{
"epoch": 1.2288576890119307,
"grad_norm": 0.0017285927897319198,
"learning_rate": 7.716057527378282e-05,
"loss": 0.0224,
"step": 4660
},
{
"epoch": 1.2301759936721377,
"grad_norm": 0.021783018484711647,
"learning_rate": 7.702863174561288e-05,
"loss": 0.0174,
"step": 4665
},
{
"epoch": 1.2314942983323447,
"grad_norm": 0.00763307698071003,
"learning_rate": 7.689668821744293e-05,
"loss": 0.0516,
"step": 4670
},
{
"epoch": 1.2328126029925515,
"grad_norm": 0.32605209946632385,
"learning_rate": 7.676474468927299e-05,
"loss": 0.0301,
"step": 4675
},
{
"epoch": 1.2341309076527585,
"grad_norm": 1.2027722597122192,
"learning_rate": 7.663280116110306e-05,
"loss": 0.0474,
"step": 4680
},
{
"epoch": 1.2354492123129655,
"grad_norm": 0.10201717168092728,
"learning_rate": 7.650085763293311e-05,
"loss": 0.0144,
"step": 4685
},
{
"epoch": 1.2367675169731724,
"grad_norm": 0.013835664838552475,
"learning_rate": 7.636891410476317e-05,
"loss": 0.0024,
"step": 4690
},
{
"epoch": 1.2380858216333794,
"grad_norm": 0.005699916277080774,
"learning_rate": 7.623697057659322e-05,
"loss": 0.0089,
"step": 4695
},
{
"epoch": 1.2394041262935864,
"grad_norm": 0.16583332419395447,
"learning_rate": 7.610502704842328e-05,
"loss": 0.019,
"step": 4700
},
{
"epoch": 1.2407224309537934,
"grad_norm": 0.2734023332595825,
"learning_rate": 7.597308352025333e-05,
"loss": 0.0041,
"step": 4705
},
{
"epoch": 1.2420407356140004,
"grad_norm": 0.04209504276514053,
"learning_rate": 7.584113999208339e-05,
"loss": 0.0292,
"step": 4710
},
{
"epoch": 1.2433590402742074,
"grad_norm": 0.0303195733577013,
"learning_rate": 7.570919646391345e-05,
"loss": 0.0019,
"step": 4715
},
{
"epoch": 1.2446773449344144,
"grad_norm": 0.014011899940669537,
"learning_rate": 7.55772529357435e-05,
"loss": 0.0236,
"step": 4720
},
{
"epoch": 1.2459956495946214,
"grad_norm": 0.37838876247406006,
"learning_rate": 7.544530940757356e-05,
"loss": 0.0081,
"step": 4725
},
{
"epoch": 1.2473139542548284,
"grad_norm": 0.003717717481777072,
"learning_rate": 7.531336587940361e-05,
"loss": 0.0036,
"step": 4730
},
{
"epoch": 1.2486322589150354,
"grad_norm": 1.2284752130508423,
"learning_rate": 7.518142235123368e-05,
"loss": 0.0089,
"step": 4735
},
{
"epoch": 1.2499505635752421,
"grad_norm": 0.015356095507740974,
"learning_rate": 7.504947882306374e-05,
"loss": 0.0074,
"step": 4740
},
{
"epoch": 1.2512688682354491,
"grad_norm": 0.0020383282098919153,
"learning_rate": 7.49175352948938e-05,
"loss": 0.0444,
"step": 4745
},
{
"epoch": 1.2525871728956561,
"grad_norm": 0.006680132355540991,
"learning_rate": 7.478559176672385e-05,
"loss": 0.009,
"step": 4750
},
{
"epoch": 1.2539054775558631,
"grad_norm": 0.01650019735097885,
"learning_rate": 7.465364823855389e-05,
"loss": 0.0022,
"step": 4755
},
{
"epoch": 1.2552237822160701,
"grad_norm": 0.009536102414131165,
"learning_rate": 7.452170471038396e-05,
"loss": 0.0026,
"step": 4760
},
{
"epoch": 1.256542086876277,
"grad_norm": 0.04677430912852287,
"learning_rate": 7.438976118221402e-05,
"loss": 0.004,
"step": 4765
},
{
"epoch": 1.257860391536484,
"grad_norm": 0.007777783088386059,
"learning_rate": 7.425781765404407e-05,
"loss": 0.0112,
"step": 4770
},
{
"epoch": 1.259178696196691,
"grad_norm": 0.03724197298288345,
"learning_rate": 7.412587412587413e-05,
"loss": 0.0065,
"step": 4775
},
{
"epoch": 1.260497000856898,
"grad_norm": 0.0023958412930369377,
"learning_rate": 7.399393059770418e-05,
"loss": 0.0238,
"step": 4780
},
{
"epoch": 1.261815305517105,
"grad_norm": 0.0036889975890517235,
"learning_rate": 7.386198706953424e-05,
"loss": 0.0012,
"step": 4785
},
{
"epoch": 1.263133610177312,
"grad_norm": 0.0009220903157256544,
"learning_rate": 7.373004354136431e-05,
"loss": 0.0017,
"step": 4790
},
{
"epoch": 1.2644519148375188,
"grad_norm": 0.0033395602367818356,
"learning_rate": 7.359810001319436e-05,
"loss": 0.0474,
"step": 4795
},
{
"epoch": 1.265770219497726,
"grad_norm": 0.004093261435627937,
"learning_rate": 7.346615648502442e-05,
"loss": 0.0025,
"step": 4800
},
{
"epoch": 1.2670885241579328,
"grad_norm": 0.004395488649606705,
"learning_rate": 7.333421295685446e-05,
"loss": 0.0011,
"step": 4805
},
{
"epoch": 1.2684068288181398,
"grad_norm": 0.024034051224589348,
"learning_rate": 7.320226942868452e-05,
"loss": 0.0027,
"step": 4810
},
{
"epoch": 1.2697251334783468,
"grad_norm": 0.9501499533653259,
"learning_rate": 7.307032590051459e-05,
"loss": 0.0279,
"step": 4815
},
{
"epoch": 1.2710434381385538,
"grad_norm": 0.008805549703538418,
"learning_rate": 7.293838237234464e-05,
"loss": 0.0403,
"step": 4820
},
{
"epoch": 1.2723617427987608,
"grad_norm": 0.01750873774290085,
"learning_rate": 7.28064388441747e-05,
"loss": 0.0571,
"step": 4825
},
{
"epoch": 1.2736800474589678,
"grad_norm": 0.004490260500460863,
"learning_rate": 7.267449531600475e-05,
"loss": 0.0269,
"step": 4830
},
{
"epoch": 1.2749983521191748,
"grad_norm": 0.07510064542293549,
"learning_rate": 7.254255178783481e-05,
"loss": 0.0123,
"step": 4835
},
{
"epoch": 1.2763166567793818,
"grad_norm": 0.039783038198947906,
"learning_rate": 7.241060825966486e-05,
"loss": 0.0137,
"step": 4840
},
{
"epoch": 1.2776349614395888,
"grad_norm": 0.019004900008440018,
"learning_rate": 7.227866473149493e-05,
"loss": 0.0047,
"step": 4845
},
{
"epoch": 1.2789532660997955,
"grad_norm": 0.04813052713871002,
"learning_rate": 7.214672120332499e-05,
"loss": 0.0021,
"step": 4850
},
{
"epoch": 1.2802715707600028,
"grad_norm": 0.00835048221051693,
"learning_rate": 7.201477767515503e-05,
"loss": 0.0014,
"step": 4855
},
{
"epoch": 1.2815898754202095,
"grad_norm": 0.008609198965132236,
"learning_rate": 7.188283414698509e-05,
"loss": 0.0219,
"step": 4860
},
{
"epoch": 1.2829081800804165,
"grad_norm": 0.007337458431720734,
"learning_rate": 7.175089061881514e-05,
"loss": 0.0014,
"step": 4865
},
{
"epoch": 1.2842264847406235,
"grad_norm": 0.0032645913306623697,
"learning_rate": 7.161894709064521e-05,
"loss": 0.0026,
"step": 4870
},
{
"epoch": 1.2855447894008305,
"grad_norm": 0.27384671568870544,
"learning_rate": 7.148700356247527e-05,
"loss": 0.0227,
"step": 4875
},
{
"epoch": 1.2868630940610375,
"grad_norm": 0.03584875538945198,
"learning_rate": 7.135506003430532e-05,
"loss": 0.0299,
"step": 4880
},
{
"epoch": 1.2881813987212445,
"grad_norm": 0.03482440486550331,
"learning_rate": 7.122311650613538e-05,
"loss": 0.0125,
"step": 4885
},
{
"epoch": 1.2894997033814515,
"grad_norm": 0.005974395200610161,
"learning_rate": 7.109117297796543e-05,
"loss": 0.0029,
"step": 4890
},
{
"epoch": 1.2908180080416585,
"grad_norm": 0.01820153370499611,
"learning_rate": 7.095922944979549e-05,
"loss": 0.0254,
"step": 4895
},
{
"epoch": 1.2921363127018655,
"grad_norm": 0.1733965277671814,
"learning_rate": 7.082728592162555e-05,
"loss": 0.028,
"step": 4900
},
{
"epoch": 1.2934546173620725,
"grad_norm": 1.3017303943634033,
"learning_rate": 7.06953423934556e-05,
"loss": 0.0213,
"step": 4905
},
{
"epoch": 1.2947729220222794,
"grad_norm": 0.01360877975821495,
"learning_rate": 7.056339886528566e-05,
"loss": 0.0039,
"step": 4910
},
{
"epoch": 1.2960912266824862,
"grad_norm": 0.01503999624401331,
"learning_rate": 7.043145533711571e-05,
"loss": 0.0102,
"step": 4915
},
{
"epoch": 1.2974095313426934,
"grad_norm": 0.2200804352760315,
"learning_rate": 7.029951180894577e-05,
"loss": 0.0461,
"step": 4920
},
{
"epoch": 1.2987278360029002,
"grad_norm": 0.08512946963310242,
"learning_rate": 7.016756828077582e-05,
"loss": 0.0066,
"step": 4925
},
{
"epoch": 1.3000461406631072,
"grad_norm": 0.08296570926904678,
"learning_rate": 7.00356247526059e-05,
"loss": 0.0223,
"step": 4930
},
{
"epoch": 1.3013644453233142,
"grad_norm": 0.008866079151630402,
"learning_rate": 6.990368122443595e-05,
"loss": 0.0032,
"step": 4935
},
{
"epoch": 1.3026827499835212,
"grad_norm": 0.024493014439940453,
"learning_rate": 6.9771737696266e-05,
"loss": 0.0128,
"step": 4940
},
{
"epoch": 1.3040010546437282,
"grad_norm": 0.08965341746807098,
"learning_rate": 6.963979416809606e-05,
"loss": 0.028,
"step": 4945
},
{
"epoch": 1.3053193593039352,
"grad_norm": 0.023156631737947464,
"learning_rate": 6.950785063992612e-05,
"loss": 0.0187,
"step": 4950
},
{
"epoch": 1.3066376639641422,
"grad_norm": 0.18552155792713165,
"learning_rate": 6.937590711175617e-05,
"loss": 0.0424,
"step": 4955
},
{
"epoch": 1.3079559686243492,
"grad_norm": 0.02200198918581009,
"learning_rate": 6.924396358358623e-05,
"loss": 0.0148,
"step": 4960
},
{
"epoch": 1.3092742732845561,
"grad_norm": 0.00568364467471838,
"learning_rate": 6.911202005541628e-05,
"loss": 0.0199,
"step": 4965
},
{
"epoch": 1.310592577944763,
"grad_norm": 0.021591177210211754,
"learning_rate": 6.898007652724634e-05,
"loss": 0.0092,
"step": 4970
},
{
"epoch": 1.3119108826049701,
"grad_norm": 0.327177494764328,
"learning_rate": 6.88481329990764e-05,
"loss": 0.0047,
"step": 4975
},
{
"epoch": 1.313229187265177,
"grad_norm": 0.024512887001037598,
"learning_rate": 6.871618947090645e-05,
"loss": 0.0046,
"step": 4980
},
{
"epoch": 1.314547491925384,
"grad_norm": 0.05725006014108658,
"learning_rate": 6.858424594273652e-05,
"loss": 0.0227,
"step": 4985
},
{
"epoch": 1.3158657965855909,
"grad_norm": 0.011280277743935585,
"learning_rate": 6.845230241456658e-05,
"loss": 0.0056,
"step": 4990
},
{
"epoch": 1.3171841012457979,
"grad_norm": 0.022504402324557304,
"learning_rate": 6.832035888639663e-05,
"loss": 0.0029,
"step": 4995
},
{
"epoch": 1.3185024059060049,
"grad_norm": 0.02168826013803482,
"learning_rate": 6.818841535822669e-05,
"loss": 0.0198,
"step": 5000
},
{
"epoch": 1.3185024059060049,
"eval_loss": 0.025039294734597206,
"eval_runtime": 452.1097,
"eval_samples_per_second": 7.458,
"eval_steps_per_second": 3.729,
"step": 5000
},
{
"epoch": 1.3198207105662119,
"grad_norm": 0.0064329709857702255,
"learning_rate": 6.805647183005673e-05,
"loss": 0.0299,
"step": 5005
},
{
"epoch": 1.3211390152264189,
"grad_norm": 0.00267885928042233,
"learning_rate": 6.79245283018868e-05,
"loss": 0.0065,
"step": 5010
},
{
"epoch": 1.3224573198866258,
"grad_norm": 0.6842889189720154,
"learning_rate": 6.779258477371685e-05,
"loss": 0.008,
"step": 5015
},
{
"epoch": 1.3237756245468328,
"grad_norm": 0.002985635306686163,
"learning_rate": 6.766064124554691e-05,
"loss": 0.0119,
"step": 5020
},
{
"epoch": 1.3250939292070396,
"grad_norm": 0.019304940477013588,
"learning_rate": 6.752869771737696e-05,
"loss": 0.0041,
"step": 5025
},
{
"epoch": 1.3264122338672468,
"grad_norm": 0.011305035091936588,
"learning_rate": 6.739675418920702e-05,
"loss": 0.0031,
"step": 5030
},
{
"epoch": 1.3277305385274536,
"grad_norm": 0.006184784695506096,
"learning_rate": 6.726481066103708e-05,
"loss": 0.0081,
"step": 5035
},
{
"epoch": 1.3290488431876606,
"grad_norm": 0.0073184361681342125,
"learning_rate": 6.713286713286715e-05,
"loss": 0.0202,
"step": 5040
},
{
"epoch": 1.3303671478478676,
"grad_norm": 0.006566181313246489,
"learning_rate": 6.70009236046972e-05,
"loss": 0.0052,
"step": 5045
},
{
"epoch": 1.3316854525080746,
"grad_norm": 0.31427526473999023,
"learning_rate": 6.686898007652726e-05,
"loss": 0.017,
"step": 5050
},
{
"epoch": 1.3330037571682816,
"grad_norm": 0.005085447803139687,
"learning_rate": 6.67370365483573e-05,
"loss": 0.009,
"step": 5055
},
{
"epoch": 1.3343220618284886,
"grad_norm": 0.2745366096496582,
"learning_rate": 6.660509302018735e-05,
"loss": 0.0119,
"step": 5060
},
{
"epoch": 1.3356403664886956,
"grad_norm": 0.2871796786785126,
"learning_rate": 6.647314949201742e-05,
"loss": 0.0158,
"step": 5065
},
{
"epoch": 1.3369586711489025,
"grad_norm": 0.2774186134338379,
"learning_rate": 6.634120596384748e-05,
"loss": 0.0084,
"step": 5070
},
{
"epoch": 1.3382769758091095,
"grad_norm": 0.013278775848448277,
"learning_rate": 6.620926243567753e-05,
"loss": 0.0111,
"step": 5075
},
{
"epoch": 1.3395952804693165,
"grad_norm": 0.01614517532289028,
"learning_rate": 6.607731890750759e-05,
"loss": 0.0066,
"step": 5080
},
{
"epoch": 1.3409135851295235,
"grad_norm": 0.0037789656780660152,
"learning_rate": 6.594537537933765e-05,
"loss": 0.0142,
"step": 5085
},
{
"epoch": 1.3422318897897303,
"grad_norm": 0.03221861273050308,
"learning_rate": 6.58134318511677e-05,
"loss": 0.0155,
"step": 5090
},
{
"epoch": 1.3435501944499375,
"grad_norm": 0.005637989845126867,
"learning_rate": 6.568148832299776e-05,
"loss": 0.0022,
"step": 5095
},
{
"epoch": 1.3448684991101443,
"grad_norm": 0.0017844432732090354,
"learning_rate": 6.554954479482783e-05,
"loss": 0.0217,
"step": 5100
},
{
"epoch": 1.3461868037703513,
"grad_norm": 0.08099021762609482,
"learning_rate": 6.541760126665787e-05,
"loss": 0.0222,
"step": 5105
},
{
"epoch": 1.3475051084305583,
"grad_norm": 0.011909045279026031,
"learning_rate": 6.528565773848792e-05,
"loss": 0.0058,
"step": 5110
},
{
"epoch": 1.3488234130907653,
"grad_norm": 0.7332578301429749,
"learning_rate": 6.515371421031798e-05,
"loss": 0.0286,
"step": 5115
},
{
"epoch": 1.3501417177509722,
"grad_norm": 0.3415885865688324,
"learning_rate": 6.502177068214804e-05,
"loss": 0.1191,
"step": 5120
},
{
"epoch": 1.3514600224111792,
"grad_norm": 0.00904211588203907,
"learning_rate": 6.48898271539781e-05,
"loss": 0.0043,
"step": 5125
},
{
"epoch": 1.3527783270713862,
"grad_norm": 0.1978830248117447,
"learning_rate": 6.475788362580816e-05,
"loss": 0.0316,
"step": 5130
},
{
"epoch": 1.3540966317315932,
"grad_norm": 0.10229042172431946,
"learning_rate": 6.462594009763822e-05,
"loss": 0.0194,
"step": 5135
},
{
"epoch": 1.3554149363918002,
"grad_norm": 0.4457210600376129,
"learning_rate": 6.449399656946827e-05,
"loss": 0.0276,
"step": 5140
},
{
"epoch": 1.356733241052007,
"grad_norm": 0.023706572130322456,
"learning_rate": 6.436205304129833e-05,
"loss": 0.0163,
"step": 5145
},
{
"epoch": 1.3580515457122142,
"grad_norm": 1.166896939277649,
"learning_rate": 6.423010951312838e-05,
"loss": 0.0189,
"step": 5150
},
{
"epoch": 1.359369850372421,
"grad_norm": 0.0016115796752274036,
"learning_rate": 6.409816598495844e-05,
"loss": 0.0191,
"step": 5155
},
{
"epoch": 1.360688155032628,
"grad_norm": 0.00786682777106762,
"learning_rate": 6.39662224567885e-05,
"loss": 0.0119,
"step": 5160
},
{
"epoch": 1.362006459692835,
"grad_norm": 1.042732834815979,
"learning_rate": 6.383427892861855e-05,
"loss": 0.0497,
"step": 5165
},
{
"epoch": 1.363324764353042,
"grad_norm": 0.007983304560184479,
"learning_rate": 6.37023354004486e-05,
"loss": 0.044,
"step": 5170
},
{
"epoch": 1.364643069013249,
"grad_norm": 0.009767642244696617,
"learning_rate": 6.357039187227866e-05,
"loss": 0.0405,
"step": 5175
},
{
"epoch": 1.365961373673456,
"grad_norm": 0.03164628520607948,
"learning_rate": 6.343844834410873e-05,
"loss": 0.0138,
"step": 5180
},
{
"epoch": 1.367279678333663,
"grad_norm": 0.004159921780228615,
"learning_rate": 6.330650481593879e-05,
"loss": 0.0045,
"step": 5185
},
{
"epoch": 1.36859798299387,
"grad_norm": 0.004395391326397657,
"learning_rate": 6.317456128776884e-05,
"loss": 0.0046,
"step": 5190
},
{
"epoch": 1.369916287654077,
"grad_norm": 0.011886746622622013,
"learning_rate": 6.30426177595989e-05,
"loss": 0.0064,
"step": 5195
},
{
"epoch": 1.371234592314284,
"grad_norm": 0.2259266972541809,
"learning_rate": 6.291067423142895e-05,
"loss": 0.0076,
"step": 5200
},
{
"epoch": 1.372552896974491,
"grad_norm": 0.01407301053404808,
"learning_rate": 6.277873070325901e-05,
"loss": 0.0201,
"step": 5205
},
{
"epoch": 1.3738712016346977,
"grad_norm": 0.00911578256636858,
"learning_rate": 6.264678717508906e-05,
"loss": 0.0164,
"step": 5210
},
{
"epoch": 1.3751895062949049,
"grad_norm": 0.20968014001846313,
"learning_rate": 6.251484364691912e-05,
"loss": 0.0075,
"step": 5215
},
{
"epoch": 1.3765078109551117,
"grad_norm": 0.008801166899502277,
"learning_rate": 6.238290011874918e-05,
"loss": 0.0068,
"step": 5220
},
{
"epoch": 1.3778261156153186,
"grad_norm": 0.007181806955486536,
"learning_rate": 6.225095659057923e-05,
"loss": 0.0136,
"step": 5225
},
{
"epoch": 1.3791444202755256,
"grad_norm": 0.7527109980583191,
"learning_rate": 6.211901306240929e-05,
"loss": 0.0287,
"step": 5230
},
{
"epoch": 1.3804627249357326,
"grad_norm": 0.039015207439661026,
"learning_rate": 6.198706953423936e-05,
"loss": 0.0326,
"step": 5235
},
{
"epoch": 1.3817810295959396,
"grad_norm": 0.021076606586575508,
"learning_rate": 6.185512600606941e-05,
"loss": 0.0191,
"step": 5240
},
{
"epoch": 1.3830993342561466,
"grad_norm": 0.016630731523036957,
"learning_rate": 6.172318247789947e-05,
"loss": 0.0131,
"step": 5245
},
{
"epoch": 1.3844176389163536,
"grad_norm": 0.011133644729852676,
"learning_rate": 6.159123894972952e-05,
"loss": 0.0029,
"step": 5250
},
{
"epoch": 1.3857359435765606,
"grad_norm": 0.6434677243232727,
"learning_rate": 6.145929542155957e-05,
"loss": 0.0091,
"step": 5255
},
{
"epoch": 1.3870542482367676,
"grad_norm": 0.051020298153162,
"learning_rate": 6.132735189338964e-05,
"loss": 0.0086,
"step": 5260
},
{
"epoch": 1.3883725528969744,
"grad_norm": 0.016413932666182518,
"learning_rate": 6.119540836521969e-05,
"loss": 0.0061,
"step": 5265
},
{
"epoch": 1.3896908575571816,
"grad_norm": 0.005769540090113878,
"learning_rate": 6.106346483704975e-05,
"loss": 0.0027,
"step": 5270
},
{
"epoch": 1.3910091622173884,
"grad_norm": 0.06687796860933304,
"learning_rate": 6.09315213088798e-05,
"loss": 0.0423,
"step": 5275
},
{
"epoch": 1.3923274668775953,
"grad_norm": 0.005641553085297346,
"learning_rate": 6.079957778070986e-05,
"loss": 0.0353,
"step": 5280
},
{
"epoch": 1.3936457715378023,
"grad_norm": 0.04460568353533745,
"learning_rate": 6.066763425253992e-05,
"loss": 0.0041,
"step": 5285
},
{
"epoch": 1.3949640761980093,
"grad_norm": 0.0387534461915493,
"learning_rate": 6.0535690724369976e-05,
"loss": 0.006,
"step": 5290
},
{
"epoch": 1.3962823808582163,
"grad_norm": 0.010292598977684975,
"learning_rate": 6.040374719620003e-05,
"loss": 0.0038,
"step": 5295
},
{
"epoch": 1.3976006855184233,
"grad_norm": 0.3646155297756195,
"learning_rate": 6.0271803668030094e-05,
"loss": 0.0111,
"step": 5300
},
{
"epoch": 1.3989189901786303,
"grad_norm": 0.022035539150238037,
"learning_rate": 6.0139860139860136e-05,
"loss": 0.0507,
"step": 5305
},
{
"epoch": 1.4002372948388373,
"grad_norm": 0.003314939560368657,
"learning_rate": 6.00079166116902e-05,
"loss": 0.0132,
"step": 5310
},
{
"epoch": 1.4015555994990443,
"grad_norm": 0.0838267058134079,
"learning_rate": 5.9875973083520254e-05,
"loss": 0.0105,
"step": 5315
},
{
"epoch": 1.4028739041592513,
"grad_norm": 0.009368584491312504,
"learning_rate": 5.974402955535031e-05,
"loss": 0.0026,
"step": 5320
},
{
"epoch": 1.4041922088194583,
"grad_norm": 0.031248098239302635,
"learning_rate": 5.961208602718037e-05,
"loss": 0.0151,
"step": 5325
},
{
"epoch": 1.405510513479665,
"grad_norm": 0.06447605788707733,
"learning_rate": 5.948014249901043e-05,
"loss": 0.0219,
"step": 5330
},
{
"epoch": 1.4068288181398723,
"grad_norm": 0.010814374312758446,
"learning_rate": 5.9348198970840484e-05,
"loss": 0.0038,
"step": 5335
},
{
"epoch": 1.408147122800079,
"grad_norm": 0.6235967874526978,
"learning_rate": 5.9216255442670546e-05,
"loss": 0.0354,
"step": 5340
},
{
"epoch": 1.409465427460286,
"grad_norm": 0.026741521432995796,
"learning_rate": 5.90843119145006e-05,
"loss": 0.0032,
"step": 5345
},
{
"epoch": 1.410783732120493,
"grad_norm": 0.019413433969020844,
"learning_rate": 5.895236838633066e-05,
"loss": 0.0216,
"step": 5350
},
{
"epoch": 1.4121020367807,
"grad_norm": 0.0735543966293335,
"learning_rate": 5.8820424858160706e-05,
"loss": 0.0033,
"step": 5355
},
{
"epoch": 1.413420341440907,
"grad_norm": 0.005189546383917332,
"learning_rate": 5.868848132999076e-05,
"loss": 0.021,
"step": 5360
},
{
"epoch": 1.414738646101114,
"grad_norm": 0.21240335702896118,
"learning_rate": 5.8556537801820824e-05,
"loss": 0.0294,
"step": 5365
},
{
"epoch": 1.416056950761321,
"grad_norm": 0.010165920481085777,
"learning_rate": 5.842459427365088e-05,
"loss": 0.0021,
"step": 5370
},
{
"epoch": 1.417375255421528,
"grad_norm": 0.026774069294333458,
"learning_rate": 5.8292650745480936e-05,
"loss": 0.0299,
"step": 5375
},
{
"epoch": 1.418693560081735,
"grad_norm": 0.0019810455851256847,
"learning_rate": 5.816070721731099e-05,
"loss": 0.0029,
"step": 5380
},
{
"epoch": 1.4200118647419417,
"grad_norm": 0.038888879120349884,
"learning_rate": 5.8028763689141054e-05,
"loss": 0.0069,
"step": 5385
},
{
"epoch": 1.421330169402149,
"grad_norm": 0.016180936247110367,
"learning_rate": 5.789682016097111e-05,
"loss": 0.0032,
"step": 5390
},
{
"epoch": 1.4226484740623557,
"grad_norm": 0.01119404286146164,
"learning_rate": 5.7764876632801165e-05,
"loss": 0.0024,
"step": 5395
},
{
"epoch": 1.4239667787225627,
"grad_norm": 0.010486694052815437,
"learning_rate": 5.763293310463123e-05,
"loss": 0.0324,
"step": 5400
},
{
"epoch": 1.4252850833827697,
"grad_norm": 0.005453066434711218,
"learning_rate": 5.750098957646127e-05,
"loss": 0.0038,
"step": 5405
},
{
"epoch": 1.4266033880429767,
"grad_norm": 0.17556461691856384,
"learning_rate": 5.736904604829133e-05,
"loss": 0.0305,
"step": 5410
},
{
"epoch": 1.4279216927031837,
"grad_norm": 0.03074715845286846,
"learning_rate": 5.723710252012139e-05,
"loss": 0.003,
"step": 5415
},
{
"epoch": 1.4292399973633907,
"grad_norm": 1.7238941192626953,
"learning_rate": 5.710515899195144e-05,
"loss": 0.0254,
"step": 5420
},
{
"epoch": 1.4305583020235977,
"grad_norm": 0.012462320737540722,
"learning_rate": 5.6973215463781506e-05,
"loss": 0.0018,
"step": 5425
},
{
"epoch": 1.4318766066838047,
"grad_norm": 0.021576853469014168,
"learning_rate": 5.684127193561156e-05,
"loss": 0.0472,
"step": 5430
},
{
"epoch": 1.4331949113440117,
"grad_norm": 0.2862134575843811,
"learning_rate": 5.670932840744162e-05,
"loss": 0.0258,
"step": 5435
},
{
"epoch": 1.4345132160042184,
"grad_norm": 0.28419312834739685,
"learning_rate": 5.657738487927168e-05,
"loss": 0.0053,
"step": 5440
},
{
"epoch": 1.4358315206644257,
"grad_norm": 0.013650139793753624,
"learning_rate": 5.6445441351101735e-05,
"loss": 0.0126,
"step": 5445
},
{
"epoch": 1.4371498253246324,
"grad_norm": 0.01203097216784954,
"learning_rate": 5.631349782293179e-05,
"loss": 0.0076,
"step": 5450
},
{
"epoch": 1.4384681299848394,
"grad_norm": 0.0881054624915123,
"learning_rate": 5.618155429476184e-05,
"loss": 0.0178,
"step": 5455
},
{
"epoch": 1.4397864346450464,
"grad_norm": 0.5258516669273376,
"learning_rate": 5.6049610766591895e-05,
"loss": 0.0112,
"step": 5460
},
{
"epoch": 1.4411047393052534,
"grad_norm": 0.001202153041958809,
"learning_rate": 5.591766723842196e-05,
"loss": 0.0089,
"step": 5465
},
{
"epoch": 1.4424230439654604,
"grad_norm": 0.4498993456363678,
"learning_rate": 5.5785723710252014e-05,
"loss": 0.0252,
"step": 5470
},
{
"epoch": 1.4437413486256674,
"grad_norm": 0.17477644979953766,
"learning_rate": 5.565378018208207e-05,
"loss": 0.0169,
"step": 5475
},
{
"epoch": 1.4450596532858744,
"grad_norm": 0.019443338736891747,
"learning_rate": 5.552183665391213e-05,
"loss": 0.0019,
"step": 5480
},
{
"epoch": 1.4463779579460814,
"grad_norm": 0.005653039086610079,
"learning_rate": 5.538989312574219e-05,
"loss": 0.0231,
"step": 5485
},
{
"epoch": 1.4476962626062884,
"grad_norm": 0.01554112322628498,
"learning_rate": 5.525794959757224e-05,
"loss": 0.0167,
"step": 5490
},
{
"epoch": 1.4490145672664954,
"grad_norm": 0.044272180646657944,
"learning_rate": 5.5126006069402305e-05,
"loss": 0.007,
"step": 5495
},
{
"epoch": 1.4503328719267023,
"grad_norm": 0.014857172966003418,
"learning_rate": 5.499406254123236e-05,
"loss": 0.0045,
"step": 5500
},
{
"epoch": 1.4503328719267023,
"eval_loss": 0.02392147295176983,
"eval_runtime": 452.468,
"eval_samples_per_second": 7.452,
"eval_steps_per_second": 3.726,
"step": 5500
},
{
"epoch": 1.4516511765869091,
"grad_norm": 0.007390835788100958,
"learning_rate": 5.486211901306241e-05,
"loss": 0.0171,
"step": 5505
},
{
"epoch": 1.4529694812471163,
"grad_norm": 0.0050474610179662704,
"learning_rate": 5.4730175484892466e-05,
"loss": 0.004,
"step": 5510
},
{
"epoch": 1.454287785907323,
"grad_norm": 0.08066163957118988,
"learning_rate": 5.459823195672252e-05,
"loss": 0.0103,
"step": 5515
},
{
"epoch": 1.45560609056753,
"grad_norm": 0.0062376330606639385,
"learning_rate": 5.4466288428552584e-05,
"loss": 0.0066,
"step": 5520
},
{
"epoch": 1.456924395227737,
"grad_norm": 0.00711809890344739,
"learning_rate": 5.433434490038264e-05,
"loss": 0.003,
"step": 5525
},
{
"epoch": 1.458242699887944,
"grad_norm": 0.004010149277746677,
"learning_rate": 5.4202401372212695e-05,
"loss": 0.0231,
"step": 5530
},
{
"epoch": 1.459561004548151,
"grad_norm": 0.4791967272758484,
"learning_rate": 5.407045784404276e-05,
"loss": 0.0277,
"step": 5535
},
{
"epoch": 1.460879309208358,
"grad_norm": 0.03979189693927765,
"learning_rate": 5.393851431587281e-05,
"loss": 0.0033,
"step": 5540
},
{
"epoch": 1.462197613868565,
"grad_norm": 0.03331119939684868,
"learning_rate": 5.380657078770287e-05,
"loss": 0.0187,
"step": 5545
},
{
"epoch": 1.463515918528772,
"grad_norm": 0.0042802803218364716,
"learning_rate": 5.367462725953293e-05,
"loss": 0.0032,
"step": 5550
},
{
"epoch": 1.464834223188979,
"grad_norm": 0.05439918115735054,
"learning_rate": 5.354268373136297e-05,
"loss": 0.0043,
"step": 5555
},
{
"epoch": 1.4661525278491858,
"grad_norm": 0.042643506079912186,
"learning_rate": 5.3410740203193036e-05,
"loss": 0.0059,
"step": 5560
},
{
"epoch": 1.467470832509393,
"grad_norm": 0.023453116416931152,
"learning_rate": 5.327879667502309e-05,
"loss": 0.0043,
"step": 5565
},
{
"epoch": 1.4687891371695998,
"grad_norm": 0.037712760269641876,
"learning_rate": 5.314685314685315e-05,
"loss": 0.0033,
"step": 5570
},
{
"epoch": 1.4701074418298068,
"grad_norm": 1.0485608577728271,
"learning_rate": 5.301490961868321e-05,
"loss": 0.0489,
"step": 5575
},
{
"epoch": 1.4714257464900138,
"grad_norm": 0.004728829488158226,
"learning_rate": 5.2882966090513265e-05,
"loss": 0.0067,
"step": 5580
},
{
"epoch": 1.4727440511502208,
"grad_norm": 0.027893677353858948,
"learning_rate": 5.275102256234332e-05,
"loss": 0.0208,
"step": 5585
},
{
"epoch": 1.4740623558104278,
"grad_norm": 0.02256879396736622,
"learning_rate": 5.2619079034173377e-05,
"loss": 0.0036,
"step": 5590
},
{
"epoch": 1.4753806604706348,
"grad_norm": 0.12636558711528778,
"learning_rate": 5.248713550600344e-05,
"loss": 0.0046,
"step": 5595
},
{
"epoch": 1.4766989651308418,
"grad_norm": 0.000997041119262576,
"learning_rate": 5.235519197783348e-05,
"loss": 0.0101,
"step": 5600
},
{
"epoch": 1.4780172697910487,
"grad_norm": 0.023494020104408264,
"learning_rate": 5.2223248449663543e-05,
"loss": 0.0039,
"step": 5605
},
{
"epoch": 1.4793355744512557,
"grad_norm": 0.01525307446718216,
"learning_rate": 5.20913049214936e-05,
"loss": 0.021,
"step": 5610
},
{
"epoch": 1.4806538791114627,
"grad_norm": 0.0024215306621044874,
"learning_rate": 5.1959361393323655e-05,
"loss": 0.0017,
"step": 5615
},
{
"epoch": 1.4819721837716697,
"grad_norm": 1.4708061218261719,
"learning_rate": 5.182741786515372e-05,
"loss": 0.04,
"step": 5620
},
{
"epoch": 1.4832904884318765,
"grad_norm": 0.015033531002700329,
"learning_rate": 5.169547433698377e-05,
"loss": 0.0042,
"step": 5625
},
{
"epoch": 1.4846087930920837,
"grad_norm": 0.0035444959066808224,
"learning_rate": 5.156353080881383e-05,
"loss": 0.0087,
"step": 5630
},
{
"epoch": 1.4859270977522905,
"grad_norm": 0.010087919421494007,
"learning_rate": 5.143158728064389e-05,
"loss": 0.0158,
"step": 5635
},
{
"epoch": 1.4872454024124975,
"grad_norm": 0.05779251083731651,
"learning_rate": 5.129964375247395e-05,
"loss": 0.0157,
"step": 5640
},
{
"epoch": 1.4885637070727045,
"grad_norm": 0.14927980303764343,
"learning_rate": 5.1167700224304e-05,
"loss": 0.0257,
"step": 5645
},
{
"epoch": 1.4898820117329115,
"grad_norm": 0.004252352751791477,
"learning_rate": 5.103575669613405e-05,
"loss": 0.0198,
"step": 5650
},
{
"epoch": 1.4912003163931185,
"grad_norm": 0.0029206848703324795,
"learning_rate": 5.090381316796411e-05,
"loss": 0.0016,
"step": 5655
},
{
"epoch": 1.4925186210533254,
"grad_norm": 0.005047530401498079,
"learning_rate": 5.077186963979417e-05,
"loss": 0.0023,
"step": 5660
},
{
"epoch": 1.4938369257135324,
"grad_norm": 0.003732564626261592,
"learning_rate": 5.0639926111624225e-05,
"loss": 0.0336,
"step": 5665
},
{
"epoch": 1.4951552303737394,
"grad_norm": 0.3832889497280121,
"learning_rate": 5.050798258345428e-05,
"loss": 0.0476,
"step": 5670
},
{
"epoch": 1.4964735350339464,
"grad_norm": 0.06733009219169617,
"learning_rate": 5.037603905528434e-05,
"loss": 0.0044,
"step": 5675
},
{
"epoch": 1.4977918396941532,
"grad_norm": 0.008067069575190544,
"learning_rate": 5.02440955271144e-05,
"loss": 0.0035,
"step": 5680
},
{
"epoch": 1.4991101443543604,
"grad_norm": 0.01706300489604473,
"learning_rate": 5.0112151998944454e-05,
"loss": 0.0031,
"step": 5685
},
{
"epoch": 1.5004284490145672,
"grad_norm": 0.009932024404406548,
"learning_rate": 4.998020847077451e-05,
"loss": 0.0587,
"step": 5690
},
{
"epoch": 1.5017467536747744,
"grad_norm": 0.006488936021924019,
"learning_rate": 4.9848264942604566e-05,
"loss": 0.002,
"step": 5695
},
{
"epoch": 1.5030650583349812,
"grad_norm": 0.17488756775856018,
"learning_rate": 4.971632141443462e-05,
"loss": 0.0245,
"step": 5700
},
{
"epoch": 1.5043833629951882,
"grad_norm": 0.3327178359031677,
"learning_rate": 4.9584377886264684e-05,
"loss": 0.0404,
"step": 5705
},
{
"epoch": 1.5057016676553951,
"grad_norm": 0.18467263877391815,
"learning_rate": 4.945243435809474e-05,
"loss": 0.0248,
"step": 5710
},
{
"epoch": 1.5070199723156021,
"grad_norm": 0.020061776041984558,
"learning_rate": 4.9320490829924795e-05,
"loss": 0.0034,
"step": 5715
},
{
"epoch": 1.5083382769758091,
"grad_norm": 0.0005288647953420877,
"learning_rate": 4.918854730175485e-05,
"loss": 0.0076,
"step": 5720
},
{
"epoch": 1.5096565816360161,
"grad_norm": 0.007515576668083668,
"learning_rate": 4.9056603773584906e-05,
"loss": 0.004,
"step": 5725
},
{
"epoch": 1.5109748862962231,
"grad_norm": 0.05365758761763573,
"learning_rate": 4.892466024541497e-05,
"loss": 0.0222,
"step": 5730
},
{
"epoch": 1.51229319095643,
"grad_norm": 0.00572391040623188,
"learning_rate": 4.8792716717245025e-05,
"loss": 0.0132,
"step": 5735
},
{
"epoch": 1.513611495616637,
"grad_norm": 0.21178627014160156,
"learning_rate": 4.8660773189075073e-05,
"loss": 0.0417,
"step": 5740
},
{
"epoch": 1.5149298002768439,
"grad_norm": 0.0641486868262291,
"learning_rate": 4.8528829660905136e-05,
"loss": 0.011,
"step": 5745
},
{
"epoch": 1.516248104937051,
"grad_norm": 0.04451924189925194,
"learning_rate": 4.839688613273519e-05,
"loss": 0.012,
"step": 5750
},
{
"epoch": 1.5175664095972579,
"grad_norm": 0.019951259717345238,
"learning_rate": 4.826494260456525e-05,
"loss": 0.009,
"step": 5755
},
{
"epoch": 1.5188847142574649,
"grad_norm": 0.021919893100857735,
"learning_rate": 4.813299907639531e-05,
"loss": 0.0081,
"step": 5760
},
{
"epoch": 1.5202030189176718,
"grad_norm": 0.5730367302894592,
"learning_rate": 4.800105554822536e-05,
"loss": 0.0254,
"step": 5765
},
{
"epoch": 1.5215213235778788,
"grad_norm": 0.02501523122191429,
"learning_rate": 4.786911202005542e-05,
"loss": 0.0045,
"step": 5770
},
{
"epoch": 1.5228396282380858,
"grad_norm": 0.01574208028614521,
"learning_rate": 4.773716849188548e-05,
"loss": 0.0081,
"step": 5775
},
{
"epoch": 1.5241579328982928,
"grad_norm": 0.009626791812479496,
"learning_rate": 4.760522496371553e-05,
"loss": 0.0037,
"step": 5780
},
{
"epoch": 1.5254762375584998,
"grad_norm": 0.535539448261261,
"learning_rate": 4.747328143554559e-05,
"loss": 0.0149,
"step": 5785
},
{
"epoch": 1.5267945422187066,
"grad_norm": 0.004934845492243767,
"learning_rate": 4.7341337907375644e-05,
"loss": 0.0048,
"step": 5790
},
{
"epoch": 1.5281128468789138,
"grad_norm": 0.009070080704987049,
"learning_rate": 4.72093943792057e-05,
"loss": 0.0028,
"step": 5795
},
{
"epoch": 1.5294311515391206,
"grad_norm": 0.0040720063261687756,
"learning_rate": 4.707745085103576e-05,
"loss": 0.0016,
"step": 5800
},
{
"epoch": 1.5307494561993278,
"grad_norm": 0.45212000608444214,
"learning_rate": 4.694550732286582e-05,
"loss": 0.0111,
"step": 5805
},
{
"epoch": 1.5320677608595346,
"grad_norm": 0.024048497900366783,
"learning_rate": 4.681356379469587e-05,
"loss": 0.0149,
"step": 5810
},
{
"epoch": 1.5333860655197418,
"grad_norm": 0.11899136006832123,
"learning_rate": 4.668162026652593e-05,
"loss": 0.0034,
"step": 5815
},
{
"epoch": 1.5347043701799485,
"grad_norm": 0.011249657720327377,
"learning_rate": 4.6549676738355984e-05,
"loss": 0.0052,
"step": 5820
},
{
"epoch": 1.5360226748401555,
"grad_norm": 0.051634710282087326,
"learning_rate": 4.641773321018604e-05,
"loss": 0.0031,
"step": 5825
},
{
"epoch": 1.5373409795003625,
"grad_norm": 0.3726826012134552,
"learning_rate": 4.62857896820161e-05,
"loss": 0.0582,
"step": 5830
},
{
"epoch": 1.5386592841605695,
"grad_norm": 0.5827310681343079,
"learning_rate": 4.615384615384616e-05,
"loss": 0.0652,
"step": 5835
},
{
"epoch": 1.5399775888207765,
"grad_norm": 0.006390869617462158,
"learning_rate": 4.6021902625676214e-05,
"loss": 0.0022,
"step": 5840
},
{
"epoch": 1.5412958934809835,
"grad_norm": 0.022760871797800064,
"learning_rate": 4.588995909750627e-05,
"loss": 0.0311,
"step": 5845
},
{
"epoch": 1.5426141981411905,
"grad_norm": 0.22773241996765137,
"learning_rate": 4.5758015569336325e-05,
"loss": 0.0051,
"step": 5850
},
{
"epoch": 1.5439325028013973,
"grad_norm": 0.015375247225165367,
"learning_rate": 4.562607204116639e-05,
"loss": 0.0023,
"step": 5855
},
{
"epoch": 1.5452508074616045,
"grad_norm": 0.007347101345658302,
"learning_rate": 4.549412851299644e-05,
"loss": 0.0437,
"step": 5860
},
{
"epoch": 1.5465691121218113,
"grad_norm": 0.012344900518655777,
"learning_rate": 4.536218498482649e-05,
"loss": 0.004,
"step": 5865
},
{
"epoch": 1.5478874167820185,
"grad_norm": 0.27038896083831787,
"learning_rate": 4.5230241456656555e-05,
"loss": 0.0047,
"step": 5870
},
{
"epoch": 1.5492057214422252,
"grad_norm": 0.016395213082432747,
"learning_rate": 4.509829792848661e-05,
"loss": 0.0026,
"step": 5875
},
{
"epoch": 1.5505240261024322,
"grad_norm": 0.4217267632484436,
"learning_rate": 4.4966354400316666e-05,
"loss": 0.0364,
"step": 5880
},
{
"epoch": 1.5518423307626392,
"grad_norm": 0.20046105980873108,
"learning_rate": 4.483441087214673e-05,
"loss": 0.0243,
"step": 5885
},
{
"epoch": 1.5531606354228462,
"grad_norm": 0.004307698458433151,
"learning_rate": 4.470246734397678e-05,
"loss": 0.0064,
"step": 5890
},
{
"epoch": 1.5544789400830532,
"grad_norm": 0.46102187037467957,
"learning_rate": 4.457052381580683e-05,
"loss": 0.0115,
"step": 5895
},
{
"epoch": 1.5557972447432602,
"grad_norm": 0.0689118504524231,
"learning_rate": 4.4438580287636895e-05,
"loss": 0.0334,
"step": 5900
},
{
"epoch": 1.5571155494034672,
"grad_norm": 0.003091114340350032,
"learning_rate": 4.430663675946695e-05,
"loss": 0.0246,
"step": 5905
},
{
"epoch": 1.558433854063674,
"grad_norm": 0.003877349430695176,
"learning_rate": 4.417469323129701e-05,
"loss": 0.0032,
"step": 5910
},
{
"epoch": 1.5597521587238812,
"grad_norm": 0.30713143944740295,
"learning_rate": 4.404274970312706e-05,
"loss": 0.0229,
"step": 5915
},
{
"epoch": 1.561070463384088,
"grad_norm": 0.07344445586204529,
"learning_rate": 4.391080617495712e-05,
"loss": 0.0078,
"step": 5920
},
{
"epoch": 1.5623887680442952,
"grad_norm": 0.01774723082780838,
"learning_rate": 4.377886264678718e-05,
"loss": 0.0034,
"step": 5925
},
{
"epoch": 1.563707072704502,
"grad_norm": 0.476324200630188,
"learning_rate": 4.3646919118617236e-05,
"loss": 0.0071,
"step": 5930
},
{
"epoch": 1.5650253773647091,
"grad_norm": 0.11624465882778168,
"learning_rate": 4.351497559044729e-05,
"loss": 0.0236,
"step": 5935
},
{
"epoch": 1.566343682024916,
"grad_norm": 0.190691277384758,
"learning_rate": 4.338303206227735e-05,
"loss": 0.006,
"step": 5940
},
{
"epoch": 1.567661986685123,
"grad_norm": 0.20517045259475708,
"learning_rate": 4.32510885341074e-05,
"loss": 0.009,
"step": 5945
},
{
"epoch": 1.56898029134533,
"grad_norm": 0.008122317492961884,
"learning_rate": 4.311914500593746e-05,
"loss": 0.0041,
"step": 5950
},
{
"epoch": 1.570298596005537,
"grad_norm": 0.01982291042804718,
"learning_rate": 4.298720147776752e-05,
"loss": 0.0258,
"step": 5955
},
{
"epoch": 1.5716169006657439,
"grad_norm": 0.000996922142803669,
"learning_rate": 4.285525794959758e-05,
"loss": 0.0233,
"step": 5960
},
{
"epoch": 1.5729352053259509,
"grad_norm": 0.09725592285394669,
"learning_rate": 4.272331442142763e-05,
"loss": 0.0218,
"step": 5965
},
{
"epoch": 1.5742535099861579,
"grad_norm": 0.0672350749373436,
"learning_rate": 4.259137089325769e-05,
"loss": 0.0194,
"step": 5970
},
{
"epoch": 1.5755718146463646,
"grad_norm": 0.014844833873212337,
"learning_rate": 4.2459427365087744e-05,
"loss": 0.0298,
"step": 5975
},
{
"epoch": 1.5768901193065719,
"grad_norm": 0.030519040301442146,
"learning_rate": 4.2327483836917806e-05,
"loss": 0.0178,
"step": 5980
},
{
"epoch": 1.5782084239667786,
"grad_norm": 0.018561460077762604,
"learning_rate": 4.219554030874786e-05,
"loss": 0.0154,
"step": 5985
},
{
"epoch": 1.5795267286269858,
"grad_norm": 0.02470085583627224,
"learning_rate": 4.206359678057791e-05,
"loss": 0.0361,
"step": 5990
},
{
"epoch": 1.5808450332871926,
"grad_norm": 0.055412422865629196,
"learning_rate": 4.193165325240797e-05,
"loss": 0.0162,
"step": 5995
},
{
"epoch": 1.5821633379473996,
"grad_norm": 0.0034158769994974136,
"learning_rate": 4.179970972423803e-05,
"loss": 0.0068,
"step": 6000
},
{
"epoch": 1.5821633379473996,
"eval_loss": 0.024797894060611725,
"eval_runtime": 452.1611,
"eval_samples_per_second": 7.458,
"eval_steps_per_second": 3.729,
"step": 6000
},
{
"epoch": 1.5834816426076066,
"grad_norm": 0.01284120511263609,
"learning_rate": 4.1667766196068085e-05,
"loss": 0.0036,
"step": 6005
},
{
"epoch": 1.5847999472678136,
"grad_norm": 0.01274865586310625,
"learning_rate": 4.153582266789815e-05,
"loss": 0.0447,
"step": 6010
},
{
"epoch": 1.5861182519280206,
"grad_norm": 0.03555435314774513,
"learning_rate": 4.1403879139728196e-05,
"loss": 0.0078,
"step": 6015
},
{
"epoch": 1.5874365565882276,
"grad_norm": 0.0011938117677345872,
"learning_rate": 4.127193561155825e-05,
"loss": 0.0136,
"step": 6020
},
{
"epoch": 1.5887548612484346,
"grad_norm": 0.9741255640983582,
"learning_rate": 4.1139992083388314e-05,
"loss": 0.0153,
"step": 6025
},
{
"epoch": 1.5900731659086413,
"grad_norm": 0.011220674030482769,
"learning_rate": 4.100804855521837e-05,
"loss": 0.0262,
"step": 6030
},
{
"epoch": 1.5913914705688486,
"grad_norm": 0.021556466817855835,
"learning_rate": 4.0876105027048425e-05,
"loss": 0.0044,
"step": 6035
},
{
"epoch": 1.5927097752290553,
"grad_norm": 0.2725502848625183,
"learning_rate": 4.074416149887848e-05,
"loss": 0.0558,
"step": 6040
},
{
"epoch": 1.5940280798892625,
"grad_norm": 0.6407182216644287,
"learning_rate": 4.0612217970708537e-05,
"loss": 0.0261,
"step": 6045
},
{
"epoch": 1.5953463845494693,
"grad_norm": 0.0024960115551948547,
"learning_rate": 4.04802744425386e-05,
"loss": 0.0128,
"step": 6050
},
{
"epoch": 1.5966646892096763,
"grad_norm": 0.11380109190940857,
"learning_rate": 4.0348330914368655e-05,
"loss": 0.0199,
"step": 6055
},
{
"epoch": 1.5979829938698833,
"grad_norm": 0.18358005583286285,
"learning_rate": 4.0216387386198704e-05,
"loss": 0.0083,
"step": 6060
},
{
"epoch": 1.5993012985300903,
"grad_norm": 0.06412303447723389,
"learning_rate": 4.0084443858028766e-05,
"loss": 0.0548,
"step": 6065
},
{
"epoch": 1.6006196031902973,
"grad_norm": 0.6999421119689941,
"learning_rate": 3.995250032985882e-05,
"loss": 0.0074,
"step": 6070
},
{
"epoch": 1.6019379078505043,
"grad_norm": 0.18698133528232574,
"learning_rate": 3.982055680168888e-05,
"loss": 0.0542,
"step": 6075
},
{
"epoch": 1.6032562125107113,
"grad_norm": 0.014717207290232182,
"learning_rate": 3.968861327351894e-05,
"loss": 0.0071,
"step": 6080
},
{
"epoch": 1.604574517170918,
"grad_norm": 0.0765385851264,
"learning_rate": 3.955666974534899e-05,
"loss": 0.0063,
"step": 6085
},
{
"epoch": 1.6058928218311253,
"grad_norm": 0.4332450330257416,
"learning_rate": 3.9424726217179044e-05,
"loss": 0.0071,
"step": 6090
},
{
"epoch": 1.607211126491332,
"grad_norm": 0.003700035158544779,
"learning_rate": 3.929278268900911e-05,
"loss": 0.0052,
"step": 6095
},
{
"epoch": 1.6085294311515392,
"grad_norm": 0.02500278130173683,
"learning_rate": 3.916083916083916e-05,
"loss": 0.0387,
"step": 6100
},
{
"epoch": 1.609847735811746,
"grad_norm": 0.023568281903862953,
"learning_rate": 3.902889563266922e-05,
"loss": 0.0594,
"step": 6105
},
{
"epoch": 1.6111660404719532,
"grad_norm": 0.02687825821340084,
"learning_rate": 3.8896952104499274e-05,
"loss": 0.0229,
"step": 6110
},
{
"epoch": 1.61248434513216,
"grad_norm": 0.005178579594939947,
"learning_rate": 3.876500857632933e-05,
"loss": 0.0293,
"step": 6115
},
{
"epoch": 1.613802649792367,
"grad_norm": 0.3987988531589508,
"learning_rate": 3.863306504815939e-05,
"loss": 0.015,
"step": 6120
},
{
"epoch": 1.615120954452574,
"grad_norm": 0.18915466964244843,
"learning_rate": 3.850112151998945e-05,
"loss": 0.023,
"step": 6125
},
{
"epoch": 1.616439259112781,
"grad_norm": 0.015252528712153435,
"learning_rate": 3.83691779918195e-05,
"loss": 0.0185,
"step": 6130
},
{
"epoch": 1.617757563772988,
"grad_norm": 0.04947187379002571,
"learning_rate": 3.823723446364956e-05,
"loss": 0.0131,
"step": 6135
},
{
"epoch": 1.619075868433195,
"grad_norm": 0.017095958814024925,
"learning_rate": 3.8105290935479615e-05,
"loss": 0.0071,
"step": 6140
},
{
"epoch": 1.620394173093402,
"grad_norm": 0.013050337322056293,
"learning_rate": 3.797334740730967e-05,
"loss": 0.0038,
"step": 6145
},
{
"epoch": 1.6217124777536087,
"grad_norm": 0.08132806420326233,
"learning_rate": 3.784140387913973e-05,
"loss": 0.0043,
"step": 6150
},
{
"epoch": 1.623030782413816,
"grad_norm": 0.020741304382681847,
"learning_rate": 3.770946035096979e-05,
"loss": 0.006,
"step": 6155
},
{
"epoch": 1.6243490870740227,
"grad_norm": 0.0576217919588089,
"learning_rate": 3.7577516822799844e-05,
"loss": 0.0033,
"step": 6160
},
{
"epoch": 1.62566739173423,
"grad_norm": 0.03032900020480156,
"learning_rate": 3.74455732946299e-05,
"loss": 0.0318,
"step": 6165
},
{
"epoch": 1.6269856963944367,
"grad_norm": 0.8868799209594727,
"learning_rate": 3.7313629766459955e-05,
"loss": 0.0304,
"step": 6170
},
{
"epoch": 1.6283040010546437,
"grad_norm": 0.003816834883764386,
"learning_rate": 3.718168623829002e-05,
"loss": 0.003,
"step": 6175
},
{
"epoch": 1.6296223057148507,
"grad_norm": 0.05368296429514885,
"learning_rate": 3.704974271012007e-05,
"loss": 0.0064,
"step": 6180
},
{
"epoch": 1.6309406103750577,
"grad_norm": 0.09963366389274597,
"learning_rate": 3.691779918195012e-05,
"loss": 0.0097,
"step": 6185
},
{
"epoch": 1.6322589150352647,
"grad_norm": 0.006273225415498018,
"learning_rate": 3.6785855653780185e-05,
"loss": 0.0071,
"step": 6190
},
{
"epoch": 1.6335772196954716,
"grad_norm": 0.15079188346862793,
"learning_rate": 3.665391212561024e-05,
"loss": 0.0058,
"step": 6195
},
{
"epoch": 1.6348955243556786,
"grad_norm": 0.004980973433703184,
"learning_rate": 3.6521968597440296e-05,
"loss": 0.0051,
"step": 6200
},
{
"epoch": 1.6362138290158854,
"grad_norm": 0.004235363099724054,
"learning_rate": 3.639002506927036e-05,
"loss": 0.0028,
"step": 6205
},
{
"epoch": 1.6375321336760926,
"grad_norm": 0.003829963505268097,
"learning_rate": 3.625808154110041e-05,
"loss": 0.0347,
"step": 6210
},
{
"epoch": 1.6388504383362994,
"grad_norm": 0.021650686860084534,
"learning_rate": 3.612613801293046e-05,
"loss": 0.0036,
"step": 6215
},
{
"epoch": 1.6401687429965066,
"grad_norm": 0.06326934695243835,
"learning_rate": 3.5994194484760525e-05,
"loss": 0.0228,
"step": 6220
},
{
"epoch": 1.6414870476567134,
"grad_norm": 0.017276322469115257,
"learning_rate": 3.586225095659058e-05,
"loss": 0.0025,
"step": 6225
},
{
"epoch": 1.6428053523169206,
"grad_norm": 0.005066063720732927,
"learning_rate": 3.573030742842064e-05,
"loss": 0.0047,
"step": 6230
},
{
"epoch": 1.6441236569771274,
"grad_norm": 0.003512267954647541,
"learning_rate": 3.559836390025069e-05,
"loss": 0.0018,
"step": 6235
},
{
"epoch": 1.6454419616373344,
"grad_norm": 0.004347699694335461,
"learning_rate": 3.546642037208075e-05,
"loss": 0.0045,
"step": 6240
},
{
"epoch": 1.6467602662975414,
"grad_norm": 0.008277533575892448,
"learning_rate": 3.533447684391081e-05,
"loss": 0.0456,
"step": 6245
},
{
"epoch": 1.6480785709577483,
"grad_norm": 0.00973033718764782,
"learning_rate": 3.5202533315740866e-05,
"loss": 0.0215,
"step": 6250
},
{
"epoch": 1.6493968756179553,
"grad_norm": 1.9432978630065918,
"learning_rate": 3.507058978757092e-05,
"loss": 0.0132,
"step": 6255
},
{
"epoch": 1.6507151802781623,
"grad_norm": 0.2693535387516022,
"learning_rate": 3.493864625940098e-05,
"loss": 0.0037,
"step": 6260
},
{
"epoch": 1.6520334849383693,
"grad_norm": 0.02107766456902027,
"learning_rate": 3.480670273123103e-05,
"loss": 0.0031,
"step": 6265
},
{
"epoch": 1.653351789598576,
"grad_norm": 0.07168436795473099,
"learning_rate": 3.467475920306109e-05,
"loss": 0.0101,
"step": 6270
},
{
"epoch": 1.6546700942587833,
"grad_norm": 0.06479799002408981,
"learning_rate": 3.454281567489115e-05,
"loss": 0.0032,
"step": 6275
},
{
"epoch": 1.65598839891899,
"grad_norm": 0.0013557536294683814,
"learning_rate": 3.441087214672121e-05,
"loss": 0.0037,
"step": 6280
},
{
"epoch": 1.6573067035791973,
"grad_norm": 0.07330150157213211,
"learning_rate": 3.427892861855126e-05,
"loss": 0.0031,
"step": 6285
},
{
"epoch": 1.658625008239404,
"grad_norm": 0.08246012777090073,
"learning_rate": 3.414698509038132e-05,
"loss": 0.0028,
"step": 6290
},
{
"epoch": 1.659943312899611,
"grad_norm": 0.6232367157936096,
"learning_rate": 3.4015041562211374e-05,
"loss": 0.0042,
"step": 6295
},
{
"epoch": 1.661261617559818,
"grad_norm": 0.007676729932427406,
"learning_rate": 3.388309803404143e-05,
"loss": 0.0501,
"step": 6300
},
{
"epoch": 1.662579922220025,
"grad_norm": 0.02081216312944889,
"learning_rate": 3.375115450587149e-05,
"loss": 0.0047,
"step": 6305
},
{
"epoch": 1.663898226880232,
"grad_norm": 0.008829087018966675,
"learning_rate": 3.361921097770154e-05,
"loss": 0.0298,
"step": 6310
},
{
"epoch": 1.665216531540439,
"grad_norm": 0.4426127076148987,
"learning_rate": 3.34872674495316e-05,
"loss": 0.0045,
"step": 6315
},
{
"epoch": 1.666534836200646,
"grad_norm": 0.025818035006523132,
"learning_rate": 3.335532392136166e-05,
"loss": 0.0028,
"step": 6320
},
{
"epoch": 1.6678531408608528,
"grad_norm": 0.6068133115768433,
"learning_rate": 3.3223380393191715e-05,
"loss": 0.0202,
"step": 6325
},
{
"epoch": 1.66917144552106,
"grad_norm": 0.02740122564136982,
"learning_rate": 3.309143686502178e-05,
"loss": 0.0025,
"step": 6330
},
{
"epoch": 1.6704897501812668,
"grad_norm": 0.15878735482692719,
"learning_rate": 3.2959493336851826e-05,
"loss": 0.004,
"step": 6335
},
{
"epoch": 1.671808054841474,
"grad_norm": 0.006827466655522585,
"learning_rate": 3.282754980868188e-05,
"loss": 0.0048,
"step": 6340
},
{
"epoch": 1.6731263595016808,
"grad_norm": 0.19508551061153412,
"learning_rate": 3.2695606280511944e-05,
"loss": 0.0025,
"step": 6345
},
{
"epoch": 1.674444664161888,
"grad_norm": 0.8176754713058472,
"learning_rate": 3.2563662752342e-05,
"loss": 0.0151,
"step": 6350
},
{
"epoch": 1.6757629688220947,
"grad_norm": 0.011672024615108967,
"learning_rate": 3.2431719224172055e-05,
"loss": 0.0452,
"step": 6355
},
{
"epoch": 1.6770812734823017,
"grad_norm": 0.015824951231479645,
"learning_rate": 3.229977569600211e-05,
"loss": 0.0236,
"step": 6360
},
{
"epoch": 1.6783995781425087,
"grad_norm": 0.1358737051486969,
"learning_rate": 3.216783216783217e-05,
"loss": 0.0078,
"step": 6365
},
{
"epoch": 1.6797178828027157,
"grad_norm": 0.004896901547908783,
"learning_rate": 3.203588863966223e-05,
"loss": 0.0042,
"step": 6370
},
{
"epoch": 1.6810361874629227,
"grad_norm": 0.22593103349208832,
"learning_rate": 3.1903945111492285e-05,
"loss": 0.0053,
"step": 6375
},
{
"epoch": 1.6823544921231297,
"grad_norm": 0.0073196059092879295,
"learning_rate": 3.177200158332234e-05,
"loss": 0.0287,
"step": 6380
},
{
"epoch": 1.6836727967833367,
"grad_norm": 0.018524926155805588,
"learning_rate": 3.1640058055152396e-05,
"loss": 0.0122,
"step": 6385
},
{
"epoch": 1.6849911014435435,
"grad_norm": 0.7453815937042236,
"learning_rate": 3.150811452698245e-05,
"loss": 0.0378,
"step": 6390
},
{
"epoch": 1.6863094061037507,
"grad_norm": 0.22409795224666595,
"learning_rate": 3.137617099881251e-05,
"loss": 0.0282,
"step": 6395
},
{
"epoch": 1.6876277107639575,
"grad_norm": 0.005432693753391504,
"learning_rate": 3.124422747064257e-05,
"loss": 0.0162,
"step": 6400
},
{
"epoch": 1.6889460154241647,
"grad_norm": 0.1493055820465088,
"learning_rate": 3.1112283942472626e-05,
"loss": 0.0123,
"step": 6405
},
{
"epoch": 1.6902643200843714,
"grad_norm": 0.1638440042734146,
"learning_rate": 3.0980340414302674e-05,
"loss": 0.0058,
"step": 6410
},
{
"epoch": 1.6915826247445784,
"grad_norm": 0.015779908746480942,
"learning_rate": 3.084839688613274e-05,
"loss": 0.0157,
"step": 6415
},
{
"epoch": 1.6929009294047854,
"grad_norm": 0.0012348912423476577,
"learning_rate": 3.071645335796279e-05,
"loss": 0.0016,
"step": 6420
},
{
"epoch": 1.6942192340649924,
"grad_norm": 0.05294624716043472,
"learning_rate": 3.058450982979285e-05,
"loss": 0.0037,
"step": 6425
},
{
"epoch": 1.6955375387251994,
"grad_norm": 0.01926981844007969,
"learning_rate": 3.045256630162291e-05,
"loss": 0.0053,
"step": 6430
},
{
"epoch": 1.6968558433854064,
"grad_norm": 0.005958891473710537,
"learning_rate": 3.0320622773452963e-05,
"loss": 0.0025,
"step": 6435
},
{
"epoch": 1.6981741480456134,
"grad_norm": 0.001902201445773244,
"learning_rate": 3.018867924528302e-05,
"loss": 0.0027,
"step": 6440
},
{
"epoch": 1.6994924527058202,
"grad_norm": 0.036614127457141876,
"learning_rate": 3.0056735717113078e-05,
"loss": 0.0026,
"step": 6445
},
{
"epoch": 1.7008107573660274,
"grad_norm": 0.07294526696205139,
"learning_rate": 2.9924792188943133e-05,
"loss": 0.0042,
"step": 6450
},
{
"epoch": 1.7021290620262342,
"grad_norm": 0.42822372913360596,
"learning_rate": 2.9792848660773192e-05,
"loss": 0.013,
"step": 6455
},
{
"epoch": 1.7034473666864414,
"grad_norm": 0.036622967571020126,
"learning_rate": 2.9660905132603245e-05,
"loss": 0.0029,
"step": 6460
},
{
"epoch": 1.7047656713466481,
"grad_norm": 0.08314034342765808,
"learning_rate": 2.9528961604433304e-05,
"loss": 0.0043,
"step": 6465
},
{
"epoch": 1.7060839760068551,
"grad_norm": 0.0005654952838085592,
"learning_rate": 2.939701807626336e-05,
"loss": 0.0595,
"step": 6470
},
{
"epoch": 1.7074022806670621,
"grad_norm": 0.004545385017991066,
"learning_rate": 2.926507454809342e-05,
"loss": 0.0044,
"step": 6475
},
{
"epoch": 1.7087205853272691,
"grad_norm": 0.00033831383916549385,
"learning_rate": 2.9133131019923477e-05,
"loss": 0.0046,
"step": 6480
},
{
"epoch": 1.710038889987476,
"grad_norm": 0.0019903562497347593,
"learning_rate": 2.900118749175353e-05,
"loss": 0.0026,
"step": 6485
},
{
"epoch": 1.711357194647683,
"grad_norm": 0.10188104957342148,
"learning_rate": 2.8869243963583585e-05,
"loss": 0.0069,
"step": 6490
},
{
"epoch": 1.71267549930789,
"grad_norm": 0.2123432606458664,
"learning_rate": 2.8737300435413644e-05,
"loss": 0.0199,
"step": 6495
},
{
"epoch": 1.7139938039680969,
"grad_norm": 0.43209517002105713,
"learning_rate": 2.8605356907243703e-05,
"loss": 0.0099,
"step": 6500
},
{
"epoch": 1.7139938039680969,
"eval_loss": 0.024327505379915237,
"eval_runtime": 452.0052,
"eval_samples_per_second": 7.46,
"eval_steps_per_second": 3.73,
"step": 6500
},
{
"epoch": 1.715312108628304,
"grad_norm": 0.009868285618722439,
"learning_rate": 2.847341337907376e-05,
"loss": 0.0025,
"step": 6505
},
{
"epoch": 1.7166304132885108,
"grad_norm": 0.00778606254607439,
"learning_rate": 2.834146985090381e-05,
"loss": 0.0028,
"step": 6510
},
{
"epoch": 1.717948717948718,
"grad_norm": 0.02987460047006607,
"learning_rate": 2.820952632273387e-05,
"loss": 0.0068,
"step": 6515
},
{
"epoch": 1.7192670226089248,
"grad_norm": 0.04475142061710358,
"learning_rate": 2.807758279456393e-05,
"loss": 0.0022,
"step": 6520
},
{
"epoch": 1.720585327269132,
"grad_norm": 0.12720516324043274,
"learning_rate": 2.7945639266393985e-05,
"loss": 0.0488,
"step": 6525
},
{
"epoch": 1.7219036319293388,
"grad_norm": 0.0011463731061667204,
"learning_rate": 2.7813695738224044e-05,
"loss": 0.0023,
"step": 6530
},
{
"epoch": 1.7232219365895458,
"grad_norm": 0.008907752111554146,
"learning_rate": 2.7681752210054096e-05,
"loss": 0.0039,
"step": 6535
},
{
"epoch": 1.7245402412497528,
"grad_norm": 0.008416680619120598,
"learning_rate": 2.7549808681884156e-05,
"loss": 0.0055,
"step": 6540
},
{
"epoch": 1.7258585459099598,
"grad_norm": 0.26278871297836304,
"learning_rate": 2.741786515371421e-05,
"loss": 0.0386,
"step": 6545
},
{
"epoch": 1.7271768505701668,
"grad_norm": 0.01750275492668152,
"learning_rate": 2.728592162554427e-05,
"loss": 0.0048,
"step": 6550
},
{
"epoch": 1.7284951552303738,
"grad_norm": 0.009483959525823593,
"learning_rate": 2.7153978097374326e-05,
"loss": 0.0061,
"step": 6555
},
{
"epoch": 1.7298134598905808,
"grad_norm": 0.016591722145676613,
"learning_rate": 2.7022034569204378e-05,
"loss": 0.0058,
"step": 6560
},
{
"epoch": 1.7311317645507875,
"grad_norm": 0.5120682716369629,
"learning_rate": 2.6890091041034437e-05,
"loss": 0.0229,
"step": 6565
},
{
"epoch": 1.7324500692109948,
"grad_norm": 0.03748248517513275,
"learning_rate": 2.6758147512864496e-05,
"loss": 0.0026,
"step": 6570
},
{
"epoch": 1.7337683738712015,
"grad_norm": 0.08328749984502792,
"learning_rate": 2.6626203984694552e-05,
"loss": 0.0052,
"step": 6575
},
{
"epoch": 1.7350866785314087,
"grad_norm": 0.012284482829272747,
"learning_rate": 2.649426045652461e-05,
"loss": 0.0353,
"step": 6580
},
{
"epoch": 1.7364049831916155,
"grad_norm": 0.06362583488225937,
"learning_rate": 2.6362316928354663e-05,
"loss": 0.0309,
"step": 6585
},
{
"epoch": 1.7377232878518225,
"grad_norm": 0.01475360058248043,
"learning_rate": 2.6230373400184722e-05,
"loss": 0.0034,
"step": 6590
},
{
"epoch": 1.7390415925120295,
"grad_norm": 0.002241638721898198,
"learning_rate": 2.6098429872014778e-05,
"loss": 0.0365,
"step": 6595
},
{
"epoch": 1.7403598971722365,
"grad_norm": 0.11375941336154938,
"learning_rate": 2.5966486343844837e-05,
"loss": 0.0241,
"step": 6600
},
{
"epoch": 1.7416782018324435,
"grad_norm": 0.009631779976189137,
"learning_rate": 2.5834542815674896e-05,
"loss": 0.0026,
"step": 6605
},
{
"epoch": 1.7429965064926505,
"grad_norm": 0.12113262712955475,
"learning_rate": 2.570259928750495e-05,
"loss": 0.0207,
"step": 6610
},
{
"epoch": 1.7443148111528575,
"grad_norm": 0.006536155007779598,
"learning_rate": 2.5570655759335004e-05,
"loss": 0.0022,
"step": 6615
},
{
"epoch": 1.7456331158130642,
"grad_norm": 0.043030887842178345,
"learning_rate": 2.5438712231165063e-05,
"loss": 0.003,
"step": 6620
},
{
"epoch": 1.7469514204732715,
"grad_norm": 0.00860620103776455,
"learning_rate": 2.5306768702995122e-05,
"loss": 0.027,
"step": 6625
},
{
"epoch": 1.7482697251334782,
"grad_norm": 0.014589210972189903,
"learning_rate": 2.5174825174825178e-05,
"loss": 0.0224,
"step": 6630
},
{
"epoch": 1.7495880297936854,
"grad_norm": 0.01215316355228424,
"learning_rate": 2.504288164665523e-05,
"loss": 0.011,
"step": 6635
},
{
"epoch": 1.7509063344538922,
"grad_norm": 0.10951556265354156,
"learning_rate": 2.491093811848529e-05,
"loss": 0.0384,
"step": 6640
},
{
"epoch": 1.7522246391140994,
"grad_norm": 0.30859875679016113,
"learning_rate": 2.4778994590315345e-05,
"loss": 0.0031,
"step": 6645
},
{
"epoch": 1.7535429437743062,
"grad_norm": 0.025427229702472687,
"learning_rate": 2.4647051062145404e-05,
"loss": 0.0171,
"step": 6650
},
{
"epoch": 1.7548612484345132,
"grad_norm": 0.03334197774529457,
"learning_rate": 2.451510753397546e-05,
"loss": 0.0473,
"step": 6655
},
{
"epoch": 1.7561795530947202,
"grad_norm": 0.013445639982819557,
"learning_rate": 2.438316400580552e-05,
"loss": 0.0056,
"step": 6660
},
{
"epoch": 1.7574978577549272,
"grad_norm": 0.008306960575282574,
"learning_rate": 2.425122047763557e-05,
"loss": 0.0104,
"step": 6665
},
{
"epoch": 1.7588161624151342,
"grad_norm": 0.012615012936294079,
"learning_rate": 2.411927694946563e-05,
"loss": 0.0097,
"step": 6670
},
{
"epoch": 1.7601344670753412,
"grad_norm": 0.006827410310506821,
"learning_rate": 2.398733342129569e-05,
"loss": 0.0057,
"step": 6675
},
{
"epoch": 1.7614527717355482,
"grad_norm": 0.017035294324159622,
"learning_rate": 2.3855389893125745e-05,
"loss": 0.0035,
"step": 6680
},
{
"epoch": 1.762771076395755,
"grad_norm": 0.036102693527936935,
"learning_rate": 2.37234463649558e-05,
"loss": 0.0031,
"step": 6685
},
{
"epoch": 1.7640893810559621,
"grad_norm": 0.5004498958587646,
"learning_rate": 2.3591502836785856e-05,
"loss": 0.0217,
"step": 6690
},
{
"epoch": 1.765407685716169,
"grad_norm": 0.017726672813296318,
"learning_rate": 2.3459559308615915e-05,
"loss": 0.0112,
"step": 6695
},
{
"epoch": 1.7667259903763761,
"grad_norm": 0.00940331444144249,
"learning_rate": 2.332761578044597e-05,
"loss": 0.0107,
"step": 6700
},
{
"epoch": 1.768044295036583,
"grad_norm": 0.007495497819036245,
"learning_rate": 2.3195672252276026e-05,
"loss": 0.0032,
"step": 6705
},
{
"epoch": 1.7693625996967899,
"grad_norm": 0.6863199472427368,
"learning_rate": 2.3063728724106085e-05,
"loss": 0.034,
"step": 6710
},
{
"epoch": 1.7706809043569969,
"grad_norm": 0.004587489180266857,
"learning_rate": 2.293178519593614e-05,
"loss": 0.0032,
"step": 6715
},
{
"epoch": 1.7719992090172039,
"grad_norm": 0.017706016078591347,
"learning_rate": 2.2799841667766197e-05,
"loss": 0.0036,
"step": 6720
},
{
"epoch": 1.7733175136774109,
"grad_norm": 0.012740216217935085,
"learning_rate": 2.2667898139596252e-05,
"loss": 0.0147,
"step": 6725
},
{
"epoch": 1.7746358183376179,
"grad_norm": 0.010391579940915108,
"learning_rate": 2.253595461142631e-05,
"loss": 0.0041,
"step": 6730
},
{
"epoch": 1.7759541229978248,
"grad_norm": 0.021570540964603424,
"learning_rate": 2.2404011083256367e-05,
"loss": 0.0363,
"step": 6735
},
{
"epoch": 1.7772724276580316,
"grad_norm": 0.005778402555733919,
"learning_rate": 2.2272067555086423e-05,
"loss": 0.002,
"step": 6740
},
{
"epoch": 1.7785907323182388,
"grad_norm": 0.0,
"learning_rate": 2.2140124026916482e-05,
"loss": 0.0058,
"step": 6745
},
{
"epoch": 1.7799090369784456,
"grad_norm": 0.010869967751204967,
"learning_rate": 2.2008180498746537e-05,
"loss": 0.0036,
"step": 6750
},
{
"epoch": 1.7812273416386528,
"grad_norm": 0.04336518794298172,
"learning_rate": 2.1876236970576593e-05,
"loss": 0.0074,
"step": 6755
},
{
"epoch": 1.7825456462988596,
"grad_norm": 0.008664094842970371,
"learning_rate": 2.1744293442406652e-05,
"loss": 0.0027,
"step": 6760
},
{
"epoch": 1.7838639509590668,
"grad_norm": 0.9408183097839355,
"learning_rate": 2.1612349914236708e-05,
"loss": 0.0371,
"step": 6765
},
{
"epoch": 1.7851822556192736,
"grad_norm": 0.016822539269924164,
"learning_rate": 2.1480406386066763e-05,
"loss": 0.0137,
"step": 6770
},
{
"epoch": 1.7865005602794806,
"grad_norm": 0.00829544197767973,
"learning_rate": 2.134846285789682e-05,
"loss": 0.0134,
"step": 6775
},
{
"epoch": 1.7878188649396876,
"grad_norm": 0.0035508016590029,
"learning_rate": 2.1216519329726878e-05,
"loss": 0.0231,
"step": 6780
},
{
"epoch": 1.7891371695998946,
"grad_norm": 0.13871321082115173,
"learning_rate": 2.1084575801556937e-05,
"loss": 0.0296,
"step": 6785
},
{
"epoch": 1.7904554742601015,
"grad_norm": 0.002578354673460126,
"learning_rate": 2.095263227338699e-05,
"loss": 0.0178,
"step": 6790
},
{
"epoch": 1.7917737789203085,
"grad_norm": 0.5279458165168762,
"learning_rate": 2.082068874521705e-05,
"loss": 0.0336,
"step": 6795
},
{
"epoch": 1.7930920835805155,
"grad_norm": 0.0017439400544390082,
"learning_rate": 2.0688745217047104e-05,
"loss": 0.0031,
"step": 6800
},
{
"epoch": 1.7944103882407223,
"grad_norm": 0.007989778183400631,
"learning_rate": 2.055680168887716e-05,
"loss": 0.0081,
"step": 6805
},
{
"epoch": 1.7957286929009295,
"grad_norm": 0.015163813717663288,
"learning_rate": 2.042485816070722e-05,
"loss": 0.0234,
"step": 6810
},
{
"epoch": 1.7970469975611363,
"grad_norm": 0.10615389794111252,
"learning_rate": 2.0292914632537275e-05,
"loss": 0.0144,
"step": 6815
},
{
"epoch": 1.7983653022213435,
"grad_norm": 0.03466172143816948,
"learning_rate": 2.0160971104367334e-05,
"loss": 0.0036,
"step": 6820
},
{
"epoch": 1.7996836068815503,
"grad_norm": 0.047511328011751175,
"learning_rate": 2.0029027576197386e-05,
"loss": 0.002,
"step": 6825
},
{
"epoch": 1.8010019115417573,
"grad_norm": 0.019772246479988098,
"learning_rate": 1.9897084048027445e-05,
"loss": 0.0049,
"step": 6830
},
{
"epoch": 1.8023202162019643,
"grad_norm": 0.1156701073050499,
"learning_rate": 1.9765140519857504e-05,
"loss": 0.0033,
"step": 6835
},
{
"epoch": 1.8036385208621712,
"grad_norm": 0.010991690680384636,
"learning_rate": 1.963319699168756e-05,
"loss": 0.0036,
"step": 6840
},
{
"epoch": 1.8049568255223782,
"grad_norm": 0.29658815264701843,
"learning_rate": 1.9501253463517615e-05,
"loss": 0.0042,
"step": 6845
},
{
"epoch": 1.8062751301825852,
"grad_norm": 0.056147243827581406,
"learning_rate": 1.936930993534767e-05,
"loss": 0.0052,
"step": 6850
},
{
"epoch": 1.8075934348427922,
"grad_norm": 0.010382590815424919,
"learning_rate": 1.923736640717773e-05,
"loss": 0.0033,
"step": 6855
},
{
"epoch": 1.808911739502999,
"grad_norm": 1.1247020959854126,
"learning_rate": 1.9105422879007786e-05,
"loss": 0.0112,
"step": 6860
},
{
"epoch": 1.8102300441632062,
"grad_norm": 1.4515737295150757,
"learning_rate": 1.897347935083784e-05,
"loss": 0.0202,
"step": 6865
},
{
"epoch": 1.811548348823413,
"grad_norm": 0.016307830810546875,
"learning_rate": 1.88415358226679e-05,
"loss": 0.0148,
"step": 6870
},
{
"epoch": 1.8128666534836202,
"grad_norm": 0.0745878592133522,
"learning_rate": 1.8709592294497956e-05,
"loss": 0.0062,
"step": 6875
},
{
"epoch": 1.814184958143827,
"grad_norm": 0.02554013952612877,
"learning_rate": 1.8577648766328012e-05,
"loss": 0.003,
"step": 6880
},
{
"epoch": 1.815503262804034,
"grad_norm": 0.45748665928840637,
"learning_rate": 1.844570523815807e-05,
"loss": 0.0386,
"step": 6885
},
{
"epoch": 1.816821567464241,
"grad_norm": 0.013801589608192444,
"learning_rate": 1.8313761709988126e-05,
"loss": 0.0342,
"step": 6890
},
{
"epoch": 1.818139872124448,
"grad_norm": 0.6251696944236755,
"learning_rate": 1.8181818181818182e-05,
"loss": 0.0101,
"step": 6895
},
{
"epoch": 1.819458176784655,
"grad_norm": 0.28203102946281433,
"learning_rate": 1.8049874653648238e-05,
"loss": 0.0032,
"step": 6900
},
{
"epoch": 1.820776481444862,
"grad_norm": 0.28511062264442444,
"learning_rate": 1.7917931125478297e-05,
"loss": 0.0343,
"step": 6905
},
{
"epoch": 1.822094786105069,
"grad_norm": 0.004940215498209,
"learning_rate": 1.7785987597308352e-05,
"loss": 0.0265,
"step": 6910
},
{
"epoch": 1.8234130907652757,
"grad_norm": 0.002903093583881855,
"learning_rate": 1.7654044069138408e-05,
"loss": 0.0025,
"step": 6915
},
{
"epoch": 1.824731395425483,
"grad_norm": 0.008801674470305443,
"learning_rate": 1.7522100540968467e-05,
"loss": 0.0246,
"step": 6920
},
{
"epoch": 1.8260497000856897,
"grad_norm": 0.13823826611042023,
"learning_rate": 1.7390157012798523e-05,
"loss": 0.0058,
"step": 6925
},
{
"epoch": 1.827368004745897,
"grad_norm": 0.020868878811597824,
"learning_rate": 1.725821348462858e-05,
"loss": 0.0014,
"step": 6930
},
{
"epoch": 1.8286863094061037,
"grad_norm": 0.0027356524951756,
"learning_rate": 1.7126269956458638e-05,
"loss": 0.0035,
"step": 6935
},
{
"epoch": 1.8300046140663109,
"grad_norm": 0.06023023650050163,
"learning_rate": 1.6994326428288693e-05,
"loss": 0.0212,
"step": 6940
},
{
"epoch": 1.8313229187265176,
"grad_norm": 0.0009826788445934653,
"learning_rate": 1.686238290011875e-05,
"loss": 0.0034,
"step": 6945
},
{
"epoch": 1.8326412233867246,
"grad_norm": 0.2867647707462311,
"learning_rate": 1.6730439371948805e-05,
"loss": 0.0146,
"step": 6950
},
{
"epoch": 1.8339595280469316,
"grad_norm": 0.004501632414758205,
"learning_rate": 1.6598495843778864e-05,
"loss": 0.0026,
"step": 6955
},
{
"epoch": 1.8352778327071386,
"grad_norm": 0.01251616608351469,
"learning_rate": 1.6466552315608923e-05,
"loss": 0.0107,
"step": 6960
},
{
"epoch": 1.8365961373673456,
"grad_norm": 0.054781850427389145,
"learning_rate": 1.6334608787438975e-05,
"loss": 0.0044,
"step": 6965
},
{
"epoch": 1.8379144420275526,
"grad_norm": 0.1120501235127449,
"learning_rate": 1.6202665259269034e-05,
"loss": 0.0284,
"step": 6970
},
{
"epoch": 1.8392327466877596,
"grad_norm": 0.001668553682975471,
"learning_rate": 1.607072173109909e-05,
"loss": 0.0169,
"step": 6975
},
{
"epoch": 1.8405510513479664,
"grad_norm": 1.6374458074569702,
"learning_rate": 1.593877820292915e-05,
"loss": 0.031,
"step": 6980
},
{
"epoch": 1.8418693560081736,
"grad_norm": 0.012474550865590572,
"learning_rate": 1.5806834674759204e-05,
"loss": 0.0037,
"step": 6985
},
{
"epoch": 1.8431876606683804,
"grad_norm": 0.014898869208991528,
"learning_rate": 1.567489114658926e-05,
"loss": 0.003,
"step": 6990
},
{
"epoch": 1.8445059653285876,
"grad_norm": 0.035570453852415085,
"learning_rate": 1.554294761841932e-05,
"loss": 0.0038,
"step": 6995
},
{
"epoch": 1.8458242699887943,
"grad_norm": 0.9279152750968933,
"learning_rate": 1.541100409024937e-05,
"loss": 0.0235,
"step": 7000
},
{
"epoch": 1.8458242699887943,
"eval_loss": 0.022339830175042152,
"eval_runtime": 451.9068,
"eval_samples_per_second": 7.462,
"eval_steps_per_second": 3.731,
"step": 7000
}
],
"logging_steps": 5,
"max_steps": 7584,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 5.6496806486741606e+17,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}