diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,10558 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.2999670003299966, + "eval_steps": 50000, + "global_step": 150000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0021999780002199976, + "grad_norm": 1.3319692611694336, + "learning_rate": 9.9e-06, + "loss": 0.5606, + "step": 100 + }, + { + "epoch": 0.004399956000439995, + "grad_norm": 1.4906107187271118, + "learning_rate": 9.99564212611423e-06, + "loss": 0.5478, + "step": 200 + }, + { + "epoch": 0.006599934000659994, + "grad_norm": 1.542100191116333, + "learning_rate": 9.99124023330032e-06, + "loss": 0.5508, + "step": 300 + }, + { + "epoch": 0.00879991200087999, + "grad_norm": 1.59752357006073, + "learning_rate": 9.98683834048641e-06, + "loss": 0.566, + "step": 400 + }, + { + "epoch": 0.010999890001099988, + "grad_norm": 1.535962462425232, + "learning_rate": 9.9824364476725e-06, + "loss": 0.5604, + "step": 500 + }, + { + "epoch": 0.013199868001319988, + "grad_norm": 1.6737797260284424, + "learning_rate": 9.97803455485859e-06, + "loss": 0.5651, + "step": 600 + }, + { + "epoch": 0.015399846001539985, + "grad_norm": 1.5698915719985962, + "learning_rate": 9.97363266204468e-06, + "loss": 0.5384, + "step": 700 + }, + { + "epoch": 0.01759982400175998, + "grad_norm": 1.501681923866272, + "learning_rate": 9.96923076923077e-06, + "loss": 0.5454, + "step": 800 + }, + { + "epoch": 0.01979980200197998, + "grad_norm": 1.6730457544326782, + "learning_rate": 9.96482887641686e-06, + "loss": 0.5515, + "step": 900 + }, + { + "epoch": 0.021999780002199976, + "grad_norm": 1.7415289878845215, + "learning_rate": 9.960426983602949e-06, + "loss": 0.5641, + "step": 1000 + }, + { + "epoch": 0.024199758002419976, + "grad_norm": 1.7273190021514893, + "learning_rate": 9.95602509078904e-06, + "loss": 0.559, + "step": 1100 + }, + { + "epoch": 0.026399736002639975, + "grad_norm": 1.7402335405349731, + "learning_rate": 9.95162319797513e-06, + "loss": 0.5391, + "step": 1200 + }, + { + "epoch": 0.02859971400285997, + "grad_norm": 1.8390350341796875, + "learning_rate": 9.94722130516122e-06, + "loss": 0.5563, + "step": 1300 + }, + { + "epoch": 0.03079969200307997, + "grad_norm": 1.3122905492782593, + "learning_rate": 9.94281941234731e-06, + "loss": 0.5594, + "step": 1400 + }, + { + "epoch": 0.032999670003299966, + "grad_norm": 1.3811813592910767, + "learning_rate": 9.9384175195334e-06, + "loss": 0.5592, + "step": 1500 + }, + { + "epoch": 0.03519964800351996, + "grad_norm": 1.8546792268753052, + "learning_rate": 9.934015626719489e-06, + "loss": 0.5522, + "step": 1600 + }, + { + "epoch": 0.037399626003739965, + "grad_norm": 1.6485520601272583, + "learning_rate": 9.92961373390558e-06, + "loss": 0.5354, + "step": 1700 + }, + { + "epoch": 0.03959960400395996, + "grad_norm": 1.366682767868042, + "learning_rate": 9.92521184109167e-06, + "loss": 0.5507, + "step": 1800 + }, + { + "epoch": 0.04179958200417996, + "grad_norm": 1.7690378427505493, + "learning_rate": 9.92080994827776e-06, + "loss": 0.5444, + "step": 1900 + }, + { + "epoch": 0.04399956000439995, + "grad_norm": 1.5437382459640503, + "learning_rate": 9.91640805546385e-06, + "loss": 0.5651, + "step": 2000 + }, + { + "epoch": 0.046199538004619956, + "grad_norm": 1.156587839126587, + "learning_rate": 9.91200616264994e-06, + "loss": 0.562, + "step": 2100 + }, + { + "epoch": 0.04839951600483995, + "grad_norm": 1.7941553592681885, + "learning_rate": 9.90760426983603e-06, + "loss": 0.5659, + "step": 2200 + }, + { + "epoch": 0.05059949400505995, + "grad_norm": 1.4848283529281616, + "learning_rate": 9.903202377022121e-06, + "loss": 0.5629, + "step": 2300 + }, + { + "epoch": 0.05279947200527995, + "grad_norm": 1.4486836194992065, + "learning_rate": 9.898800484208211e-06, + "loss": 0.5459, + "step": 2400 + }, + { + "epoch": 0.054999450005499946, + "grad_norm": 1.731554388999939, + "learning_rate": 9.894398591394302e-06, + "loss": 0.5626, + "step": 2500 + }, + { + "epoch": 0.05719942800571994, + "grad_norm": 1.6251667737960815, + "learning_rate": 9.88999669858039e-06, + "loss": 0.5516, + "step": 2600 + }, + { + "epoch": 0.05939940600593994, + "grad_norm": 1.256371021270752, + "learning_rate": 9.88559480576648e-06, + "loss": 0.5459, + "step": 2700 + }, + { + "epoch": 0.06159938400615994, + "grad_norm": 1.418700933456421, + "learning_rate": 9.88119291295257e-06, + "loss": 0.5495, + "step": 2800 + }, + { + "epoch": 0.06379936200637994, + "grad_norm": 1.6376900672912598, + "learning_rate": 9.876791020138661e-06, + "loss": 0.5641, + "step": 2900 + }, + { + "epoch": 0.06599934000659993, + "grad_norm": 1.5085667371749878, + "learning_rate": 9.872389127324751e-06, + "loss": 0.5625, + "step": 3000 + }, + { + "epoch": 0.06819931800681993, + "grad_norm": 1.5381278991699219, + "learning_rate": 9.86798723451084e-06, + "loss": 0.5603, + "step": 3100 + }, + { + "epoch": 0.07039929600703992, + "grad_norm": 1.5536515712738037, + "learning_rate": 9.86358534169693e-06, + "loss": 0.5529, + "step": 3200 + }, + { + "epoch": 0.07259927400725993, + "grad_norm": 1.9047861099243164, + "learning_rate": 9.85918344888302e-06, + "loss": 0.549, + "step": 3300 + }, + { + "epoch": 0.07479925200747993, + "grad_norm": 1.517338514328003, + "learning_rate": 9.85478155606911e-06, + "loss": 0.561, + "step": 3400 + }, + { + "epoch": 0.07699923000769993, + "grad_norm": 1.5779054164886475, + "learning_rate": 9.850379663255201e-06, + "loss": 0.5706, + "step": 3500 + }, + { + "epoch": 0.07919920800791992, + "grad_norm": 1.704124927520752, + "learning_rate": 9.845977770441291e-06, + "loss": 0.5523, + "step": 3600 + }, + { + "epoch": 0.08139918600813992, + "grad_norm": 1.5121921300888062, + "learning_rate": 9.84157587762738e-06, + "loss": 0.5539, + "step": 3700 + }, + { + "epoch": 0.08359916400835991, + "grad_norm": 1.6511967182159424, + "learning_rate": 9.83717398481347e-06, + "loss": 0.5443, + "step": 3800 + }, + { + "epoch": 0.08579914200857991, + "grad_norm": 1.719138503074646, + "learning_rate": 9.83277209199956e-06, + "loss": 0.55, + "step": 3900 + }, + { + "epoch": 0.0879991200087999, + "grad_norm": 1.6003084182739258, + "learning_rate": 9.82837019918565e-06, + "loss": 0.5588, + "step": 4000 + }, + { + "epoch": 0.09019909800901992, + "grad_norm": 1.787855625152588, + "learning_rate": 9.823968306371741e-06, + "loss": 0.5636, + "step": 4100 + }, + { + "epoch": 0.09239907600923991, + "grad_norm": 1.6582859754562378, + "learning_rate": 9.819566413557831e-06, + "loss": 0.5618, + "step": 4200 + }, + { + "epoch": 0.09459905400945991, + "grad_norm": 1.696978211402893, + "learning_rate": 9.81516452074392e-06, + "loss": 0.5546, + "step": 4300 + }, + { + "epoch": 0.0967990320096799, + "grad_norm": 1.8410296440124512, + "learning_rate": 9.81076262793001e-06, + "loss": 0.5471, + "step": 4400 + }, + { + "epoch": 0.0989990100098999, + "grad_norm": 1.736607313156128, + "learning_rate": 9.8063607351161e-06, + "loss": 0.5461, + "step": 4500 + }, + { + "epoch": 0.1011989880101199, + "grad_norm": 1.507016897201538, + "learning_rate": 9.80195884230219e-06, + "loss": 0.5609, + "step": 4600 + }, + { + "epoch": 0.10339896601033989, + "grad_norm": 1.6941606998443604, + "learning_rate": 9.797556949488281e-06, + "loss": 0.5656, + "step": 4700 + }, + { + "epoch": 0.1055989440105599, + "grad_norm": 1.6578975915908813, + "learning_rate": 9.793155056674371e-06, + "loss": 0.5624, + "step": 4800 + }, + { + "epoch": 0.1077989220107799, + "grad_norm": 1.6376292705535889, + "learning_rate": 9.78875316386046e-06, + "loss": 0.5483, + "step": 4900 + }, + { + "epoch": 0.10999890001099989, + "grad_norm": 1.8150690793991089, + "learning_rate": 9.78435127104655e-06, + "loss": 0.5739, + "step": 5000 + }, + { + "epoch": 0.11219887801121989, + "grad_norm": 1.8733948469161987, + "learning_rate": 9.77994937823264e-06, + "loss": 0.5511, + "step": 5100 + }, + { + "epoch": 0.11439885601143988, + "grad_norm": 1.3109201192855835, + "learning_rate": 9.77554748541873e-06, + "loss": 0.5584, + "step": 5200 + }, + { + "epoch": 0.11659883401165988, + "grad_norm": 2.0025064945220947, + "learning_rate": 9.771145592604821e-06, + "loss": 0.5638, + "step": 5300 + }, + { + "epoch": 0.11879881201187988, + "grad_norm": 1.584830641746521, + "learning_rate": 9.76674369979091e-06, + "loss": 0.575, + "step": 5400 + }, + { + "epoch": 0.12099879001209989, + "grad_norm": 1.7688754796981812, + "learning_rate": 9.762341806977e-06, + "loss": 0.5603, + "step": 5500 + }, + { + "epoch": 0.12319876801231988, + "grad_norm": 1.6688051223754883, + "learning_rate": 9.75793991416309e-06, + "loss": 0.5746, + "step": 5600 + }, + { + "epoch": 0.12539874601253986, + "grad_norm": 1.6409167051315308, + "learning_rate": 9.753538021349182e-06, + "loss": 0.5469, + "step": 5700 + }, + { + "epoch": 0.12759872401275987, + "grad_norm": 1.5867542028427124, + "learning_rate": 9.74913612853527e-06, + "loss": 0.5414, + "step": 5800 + }, + { + "epoch": 0.12979870201297988, + "grad_norm": 1.7665027379989624, + "learning_rate": 9.744734235721361e-06, + "loss": 0.5574, + "step": 5900 + }, + { + "epoch": 0.13199868001319986, + "grad_norm": 1.298757553100586, + "learning_rate": 9.740332342907451e-06, + "loss": 0.5356, + "step": 6000 + }, + { + "epoch": 0.13419865801341987, + "grad_norm": 1.381654143333435, + "learning_rate": 9.735930450093542e-06, + "loss": 0.5525, + "step": 6100 + }, + { + "epoch": 0.13639863601363986, + "grad_norm": 1.398958683013916, + "learning_rate": 9.731528557279632e-06, + "loss": 0.5427, + "step": 6200 + }, + { + "epoch": 0.13859861401385987, + "grad_norm": 1.4779409170150757, + "learning_rate": 9.727126664465722e-06, + "loss": 0.5583, + "step": 6300 + }, + { + "epoch": 0.14079859201407985, + "grad_norm": 1.5421425104141235, + "learning_rate": 9.72272477165181e-06, + "loss": 0.5484, + "step": 6400 + }, + { + "epoch": 0.14299857001429986, + "grad_norm": 1.7208441495895386, + "learning_rate": 9.718322878837901e-06, + "loss": 0.5478, + "step": 6500 + }, + { + "epoch": 0.14519854801451987, + "grad_norm": 1.643373727798462, + "learning_rate": 9.713920986023991e-06, + "loss": 0.5742, + "step": 6600 + }, + { + "epoch": 0.14739852601473985, + "grad_norm": 1.5801072120666504, + "learning_rate": 9.709519093210082e-06, + "loss": 0.5516, + "step": 6700 + }, + { + "epoch": 0.14959850401495986, + "grad_norm": 1.5034841299057007, + "learning_rate": 9.705117200396172e-06, + "loss": 0.558, + "step": 6800 + }, + { + "epoch": 0.15179848201517984, + "grad_norm": 1.6282888650894165, + "learning_rate": 9.70071530758226e-06, + "loss": 0.5575, + "step": 6900 + }, + { + "epoch": 0.15399846001539985, + "grad_norm": 1.4846858978271484, + "learning_rate": 9.69631341476835e-06, + "loss": 0.5487, + "step": 7000 + }, + { + "epoch": 0.15619843801561983, + "grad_norm": 1.6254215240478516, + "learning_rate": 9.691911521954441e-06, + "loss": 0.5443, + "step": 7100 + }, + { + "epoch": 0.15839841601583984, + "grad_norm": 1.7018550634384155, + "learning_rate": 9.687509629140531e-06, + "loss": 0.556, + "step": 7200 + }, + { + "epoch": 0.16059839401605983, + "grad_norm": 1.6466326713562012, + "learning_rate": 9.683107736326622e-06, + "loss": 0.5541, + "step": 7300 + }, + { + "epoch": 0.16279837201627984, + "grad_norm": 1.4446876049041748, + "learning_rate": 9.678705843512712e-06, + "loss": 0.5464, + "step": 7400 + }, + { + "epoch": 0.16499835001649985, + "grad_norm": 1.5896605253219604, + "learning_rate": 9.6743039506988e-06, + "loss": 0.5394, + "step": 7500 + }, + { + "epoch": 0.16719832801671983, + "grad_norm": 1.837875485420227, + "learning_rate": 9.66990205788489e-06, + "loss": 0.5351, + "step": 7600 + }, + { + "epoch": 0.16939830601693984, + "grad_norm": 1.5089105367660522, + "learning_rate": 9.665500165070981e-06, + "loss": 0.5434, + "step": 7700 + }, + { + "epoch": 0.17159828401715982, + "grad_norm": 1.5068552494049072, + "learning_rate": 9.661098272257071e-06, + "loss": 0.5542, + "step": 7800 + }, + { + "epoch": 0.17379826201737983, + "grad_norm": 1.7671160697937012, + "learning_rate": 9.656696379443162e-06, + "loss": 0.5434, + "step": 7900 + }, + { + "epoch": 0.1759982400175998, + "grad_norm": 1.612404227256775, + "learning_rate": 9.652294486629252e-06, + "loss": 0.5481, + "step": 8000 + }, + { + "epoch": 0.17819821801781982, + "grad_norm": 1.403520941734314, + "learning_rate": 9.64789259381534e-06, + "loss": 0.5436, + "step": 8100 + }, + { + "epoch": 0.18039819601803983, + "grad_norm": 1.786060094833374, + "learning_rate": 9.64349070100143e-06, + "loss": 0.5571, + "step": 8200 + }, + { + "epoch": 0.1825981740182598, + "grad_norm": 1.6619782447814941, + "learning_rate": 9.639088808187521e-06, + "loss": 0.5402, + "step": 8300 + }, + { + "epoch": 0.18479815201847982, + "grad_norm": 1.805365800857544, + "learning_rate": 9.634686915373611e-06, + "loss": 0.5705, + "step": 8400 + }, + { + "epoch": 0.1869981300186998, + "grad_norm": 1.5753322839736938, + "learning_rate": 9.630285022559702e-06, + "loss": 0.5477, + "step": 8500 + }, + { + "epoch": 0.18919810801891981, + "grad_norm": 1.688490629196167, + "learning_rate": 9.625883129745792e-06, + "loss": 0.5497, + "step": 8600 + }, + { + "epoch": 0.1913980860191398, + "grad_norm": 1.5862349271774292, + "learning_rate": 9.62148123693188e-06, + "loss": 0.5374, + "step": 8700 + }, + { + "epoch": 0.1935980640193598, + "grad_norm": 1.8771247863769531, + "learning_rate": 9.61707934411797e-06, + "loss": 0.5445, + "step": 8800 + }, + { + "epoch": 0.19579804201957982, + "grad_norm": 1.432055115699768, + "learning_rate": 9.612677451304061e-06, + "loss": 0.5478, + "step": 8900 + }, + { + "epoch": 0.1979980200197998, + "grad_norm": 1.7091459035873413, + "learning_rate": 9.608275558490151e-06, + "loss": 0.5509, + "step": 9000 + }, + { + "epoch": 0.2001979980200198, + "grad_norm": 1.5979877710342407, + "learning_rate": 9.603873665676242e-06, + "loss": 0.5439, + "step": 9100 + }, + { + "epoch": 0.2023979760202398, + "grad_norm": 1.5256608724594116, + "learning_rate": 9.599471772862332e-06, + "loss": 0.546, + "step": 9200 + }, + { + "epoch": 0.2045979540204598, + "grad_norm": 1.7038841247558594, + "learning_rate": 9.595069880048422e-06, + "loss": 0.5455, + "step": 9300 + }, + { + "epoch": 0.20679793202067978, + "grad_norm": 1.6116039752960205, + "learning_rate": 9.590667987234512e-06, + "loss": 0.5448, + "step": 9400 + }, + { + "epoch": 0.2089979100208998, + "grad_norm": 1.6021257638931274, + "learning_rate": 9.586266094420603e-06, + "loss": 0.5373, + "step": 9500 + }, + { + "epoch": 0.2111978880211198, + "grad_norm": 1.8599495887756348, + "learning_rate": 9.581864201606691e-06, + "loss": 0.5445, + "step": 9600 + }, + { + "epoch": 0.21339786602133978, + "grad_norm": 1.5737359523773193, + "learning_rate": 9.577462308792782e-06, + "loss": 0.554, + "step": 9700 + }, + { + "epoch": 0.2155978440215598, + "grad_norm": 1.9932422637939453, + "learning_rate": 9.573060415978872e-06, + "loss": 0.5466, + "step": 9800 + }, + { + "epoch": 0.21779782202177977, + "grad_norm": 1.2846128940582275, + "learning_rate": 9.568658523164962e-06, + "loss": 0.552, + "step": 9900 + }, + { + "epoch": 0.21999780002199978, + "grad_norm": 1.845566987991333, + "learning_rate": 9.564256630351052e-06, + "loss": 0.5351, + "step": 10000 + }, + { + "epoch": 0.22219777802221977, + "grad_norm": 1.7098534107208252, + "learning_rate": 9.559854737537143e-06, + "loss": 0.5701, + "step": 10100 + }, + { + "epoch": 0.22439775602243978, + "grad_norm": 1.6359370946884155, + "learning_rate": 9.555452844723231e-06, + "loss": 0.5399, + "step": 10200 + }, + { + "epoch": 0.22659773402265979, + "grad_norm": 1.8628222942352295, + "learning_rate": 9.551050951909322e-06, + "loss": 0.5428, + "step": 10300 + }, + { + "epoch": 0.22879771202287977, + "grad_norm": 1.7202619314193726, + "learning_rate": 9.546649059095412e-06, + "loss": 0.5473, + "step": 10400 + }, + { + "epoch": 0.23099769002309978, + "grad_norm": 1.6408450603485107, + "learning_rate": 9.542247166281502e-06, + "loss": 0.5566, + "step": 10500 + }, + { + "epoch": 0.23319766802331976, + "grad_norm": 1.6586904525756836, + "learning_rate": 9.537845273467592e-06, + "loss": 0.5357, + "step": 10600 + }, + { + "epoch": 0.23539764602353977, + "grad_norm": 1.8505043983459473, + "learning_rate": 9.533443380653683e-06, + "loss": 0.5596, + "step": 10700 + }, + { + "epoch": 0.23759762402375975, + "grad_norm": 1.9244803190231323, + "learning_rate": 9.529041487839771e-06, + "loss": 0.5428, + "step": 10800 + }, + { + "epoch": 0.23979760202397976, + "grad_norm": 1.5375540256500244, + "learning_rate": 9.524639595025862e-06, + "loss": 0.5478, + "step": 10900 + }, + { + "epoch": 0.24199758002419977, + "grad_norm": 1.7372453212738037, + "learning_rate": 9.520237702211952e-06, + "loss": 0.5458, + "step": 11000 + }, + { + "epoch": 0.24419755802441975, + "grad_norm": 1.5542049407958984, + "learning_rate": 9.515835809398042e-06, + "loss": 0.5412, + "step": 11100 + }, + { + "epoch": 0.24639753602463976, + "grad_norm": 1.5235602855682373, + "learning_rate": 9.511433916584132e-06, + "loss": 0.5631, + "step": 11200 + }, + { + "epoch": 0.24859751402485974, + "grad_norm": 1.7347521781921387, + "learning_rate": 9.507032023770221e-06, + "loss": 0.5508, + "step": 11300 + }, + { + "epoch": 0.2507974920250797, + "grad_norm": 1.8189500570297241, + "learning_rate": 9.502630130956311e-06, + "loss": 0.5346, + "step": 11400 + }, + { + "epoch": 0.25299747002529976, + "grad_norm": 1.5607105493545532, + "learning_rate": 9.498228238142402e-06, + "loss": 0.5454, + "step": 11500 + }, + { + "epoch": 0.25519744802551975, + "grad_norm": 1.5799516439437866, + "learning_rate": 9.493826345328492e-06, + "loss": 0.5271, + "step": 11600 + }, + { + "epoch": 0.25739742602573973, + "grad_norm": 1.4460997581481934, + "learning_rate": 9.489424452514582e-06, + "loss": 0.5437, + "step": 11700 + }, + { + "epoch": 0.25959740402595977, + "grad_norm": 1.368635892868042, + "learning_rate": 9.485022559700672e-06, + "loss": 0.5442, + "step": 11800 + }, + { + "epoch": 0.26179738202617975, + "grad_norm": 1.8246245384216309, + "learning_rate": 9.480620666886761e-06, + "loss": 0.5321, + "step": 11900 + }, + { + "epoch": 0.26399736002639973, + "grad_norm": 1.8881937265396118, + "learning_rate": 9.476218774072851e-06, + "loss": 0.5639, + "step": 12000 + }, + { + "epoch": 0.2661973380266197, + "grad_norm": 1.39218008518219, + "learning_rate": 9.471816881258942e-06, + "loss": 0.5634, + "step": 12100 + }, + { + "epoch": 0.26839731602683975, + "grad_norm": 1.5577659606933594, + "learning_rate": 9.467414988445032e-06, + "loss": 0.5422, + "step": 12200 + }, + { + "epoch": 0.27059729402705973, + "grad_norm": 1.9022492170333862, + "learning_rate": 9.463013095631122e-06, + "loss": 0.5429, + "step": 12300 + }, + { + "epoch": 0.2727972720272797, + "grad_norm": 1.7101701498031616, + "learning_rate": 9.458611202817212e-06, + "loss": 0.5473, + "step": 12400 + }, + { + "epoch": 0.27499725002749975, + "grad_norm": 2.0155210494995117, + "learning_rate": 9.454209310003301e-06, + "loss": 0.5689, + "step": 12500 + }, + { + "epoch": 0.27719722802771973, + "grad_norm": 1.994775414466858, + "learning_rate": 9.449807417189393e-06, + "loss": 0.514, + "step": 12600 + }, + { + "epoch": 0.2793972060279397, + "grad_norm": 1.5826818943023682, + "learning_rate": 9.445405524375483e-06, + "loss": 0.5413, + "step": 12700 + }, + { + "epoch": 0.2815971840281597, + "grad_norm": 1.589729905128479, + "learning_rate": 9.441003631561574e-06, + "loss": 0.5339, + "step": 12800 + }, + { + "epoch": 0.28379716202837973, + "grad_norm": 1.8156132698059082, + "learning_rate": 9.436601738747662e-06, + "loss": 0.5546, + "step": 12900 + }, + { + "epoch": 0.2859971400285997, + "grad_norm": 1.576416254043579, + "learning_rate": 9.432199845933752e-06, + "loss": 0.5465, + "step": 13000 + }, + { + "epoch": 0.2881971180288197, + "grad_norm": 1.9609074592590332, + "learning_rate": 9.427797953119843e-06, + "loss": 0.553, + "step": 13100 + }, + { + "epoch": 0.29039709602903974, + "grad_norm": 1.5881434679031372, + "learning_rate": 9.423396060305933e-06, + "loss": 0.5377, + "step": 13200 + }, + { + "epoch": 0.2925970740292597, + "grad_norm": 1.569200038909912, + "learning_rate": 9.418994167492023e-06, + "loss": 0.5467, + "step": 13300 + }, + { + "epoch": 0.2947970520294797, + "grad_norm": 1.7305947542190552, + "learning_rate": 9.414592274678112e-06, + "loss": 0.5388, + "step": 13400 + }, + { + "epoch": 0.2969970300296997, + "grad_norm": 1.9278624057769775, + "learning_rate": 9.410190381864202e-06, + "loss": 0.5419, + "step": 13500 + }, + { + "epoch": 0.2991970080299197, + "grad_norm": 1.6430861949920654, + "learning_rate": 9.405788489050292e-06, + "loss": 0.5579, + "step": 13600 + }, + { + "epoch": 0.3013969860301397, + "grad_norm": 1.4233689308166504, + "learning_rate": 9.401386596236383e-06, + "loss": 0.5385, + "step": 13700 + }, + { + "epoch": 0.3035969640303597, + "grad_norm": 1.705346941947937, + "learning_rate": 9.396984703422473e-06, + "loss": 0.5491, + "step": 13800 + }, + { + "epoch": 0.3057969420305797, + "grad_norm": 1.7933902740478516, + "learning_rate": 9.392582810608563e-06, + "loss": 0.5513, + "step": 13900 + }, + { + "epoch": 0.3079969200307997, + "grad_norm": 1.901663899421692, + "learning_rate": 9.388180917794652e-06, + "loss": 0.5614, + "step": 14000 + }, + { + "epoch": 0.3101968980310197, + "grad_norm": 1.6877708435058594, + "learning_rate": 9.383779024980742e-06, + "loss": 0.5334, + "step": 14100 + }, + { + "epoch": 0.31239687603123967, + "grad_norm": 1.7979609966278076, + "learning_rate": 9.379377132166832e-06, + "loss": 0.5527, + "step": 14200 + }, + { + "epoch": 0.3145968540314597, + "grad_norm": 1.7708429098129272, + "learning_rate": 9.374975239352923e-06, + "loss": 0.5386, + "step": 14300 + }, + { + "epoch": 0.3167968320316797, + "grad_norm": 1.3621147871017456, + "learning_rate": 9.370573346539013e-06, + "loss": 0.5626, + "step": 14400 + }, + { + "epoch": 0.31899681003189967, + "grad_norm": 1.5842787027359009, + "learning_rate": 9.366171453725103e-06, + "loss": 0.529, + "step": 14500 + }, + { + "epoch": 0.32119678803211965, + "grad_norm": 1.817987084388733, + "learning_rate": 9.361769560911192e-06, + "loss": 0.538, + "step": 14600 + }, + { + "epoch": 0.3233967660323397, + "grad_norm": 1.6293082237243652, + "learning_rate": 9.357367668097282e-06, + "loss": 0.5481, + "step": 14700 + }, + { + "epoch": 0.32559674403255967, + "grad_norm": 1.5916519165039062, + "learning_rate": 9.352965775283372e-06, + "loss": 0.5534, + "step": 14800 + }, + { + "epoch": 0.32779672203277965, + "grad_norm": 1.5773463249206543, + "learning_rate": 9.348563882469463e-06, + "loss": 0.5501, + "step": 14900 + }, + { + "epoch": 0.3299967000329997, + "grad_norm": 1.9787790775299072, + "learning_rate": 9.344161989655553e-06, + "loss": 0.541, + "step": 15000 + }, + { + "epoch": 0.3321966780332197, + "grad_norm": 1.3281339406967163, + "learning_rate": 9.339760096841642e-06, + "loss": 0.539, + "step": 15100 + }, + { + "epoch": 0.33439665603343965, + "grad_norm": 2.091588020324707, + "learning_rate": 9.335358204027732e-06, + "loss": 0.5393, + "step": 15200 + }, + { + "epoch": 0.33659663403365964, + "grad_norm": 1.912660837173462, + "learning_rate": 9.330956311213822e-06, + "loss": 0.5168, + "step": 15300 + }, + { + "epoch": 0.3387966120338797, + "grad_norm": 1.7248882055282593, + "learning_rate": 9.326554418399912e-06, + "loss": 0.538, + "step": 15400 + }, + { + "epoch": 0.34099659003409966, + "grad_norm": 1.8949754238128662, + "learning_rate": 9.322152525586003e-06, + "loss": 0.5444, + "step": 15500 + }, + { + "epoch": 0.34319656803431964, + "grad_norm": 1.4323865175247192, + "learning_rate": 9.317750632772093e-06, + "loss": 0.542, + "step": 15600 + }, + { + "epoch": 0.3453965460345397, + "grad_norm": 1.7454142570495605, + "learning_rate": 9.313348739958182e-06, + "loss": 0.5346, + "step": 15700 + }, + { + "epoch": 0.34759652403475966, + "grad_norm": 2.214750289916992, + "learning_rate": 9.308946847144272e-06, + "loss": 0.5391, + "step": 15800 + }, + { + "epoch": 0.34979650203497964, + "grad_norm": 1.7991106510162354, + "learning_rate": 9.304544954330362e-06, + "loss": 0.551, + "step": 15900 + }, + { + "epoch": 0.3519964800351996, + "grad_norm": 1.7487062215805054, + "learning_rate": 9.300143061516452e-06, + "loss": 0.5536, + "step": 16000 + }, + { + "epoch": 0.35419645803541966, + "grad_norm": 1.7137202024459839, + "learning_rate": 9.295741168702543e-06, + "loss": 0.5472, + "step": 16100 + }, + { + "epoch": 0.35639643603563964, + "grad_norm": 1.569287657737732, + "learning_rate": 9.291339275888633e-06, + "loss": 0.5286, + "step": 16200 + }, + { + "epoch": 0.3585964140358596, + "grad_norm": 1.805232286453247, + "learning_rate": 9.286937383074723e-06, + "loss": 0.535, + "step": 16300 + }, + { + "epoch": 0.36079639203607966, + "grad_norm": 1.8445895910263062, + "learning_rate": 9.282535490260814e-06, + "loss": 0.5297, + "step": 16400 + }, + { + "epoch": 0.36299637003629964, + "grad_norm": 1.8282471895217896, + "learning_rate": 9.278133597446904e-06, + "loss": 0.5341, + "step": 16500 + }, + { + "epoch": 0.3651963480365196, + "grad_norm": 1.5979552268981934, + "learning_rate": 9.273731704632994e-06, + "loss": 0.5471, + "step": 16600 + }, + { + "epoch": 0.3673963260367396, + "grad_norm": 1.6148823499679565, + "learning_rate": 9.269329811819083e-06, + "loss": 0.534, + "step": 16700 + }, + { + "epoch": 0.36959630403695964, + "grad_norm": 1.7306467294692993, + "learning_rate": 9.264927919005173e-06, + "loss": 0.5475, + "step": 16800 + }, + { + "epoch": 0.3717962820371796, + "grad_norm": 1.5774517059326172, + "learning_rate": 9.260526026191263e-06, + "loss": 0.5604, + "step": 16900 + }, + { + "epoch": 0.3739962600373996, + "grad_norm": 1.6581697463989258, + "learning_rate": 9.256124133377354e-06, + "loss": 0.5474, + "step": 17000 + }, + { + "epoch": 0.37619623803761965, + "grad_norm": 1.8324202299118042, + "learning_rate": 9.251722240563444e-06, + "loss": 0.5341, + "step": 17100 + }, + { + "epoch": 0.37839621603783963, + "grad_norm": 1.7121940851211548, + "learning_rate": 9.247320347749532e-06, + "loss": 0.5538, + "step": 17200 + }, + { + "epoch": 0.3805961940380596, + "grad_norm": 1.8483502864837646, + "learning_rate": 9.242918454935623e-06, + "loss": 0.5231, + "step": 17300 + }, + { + "epoch": 0.3827961720382796, + "grad_norm": 1.7600507736206055, + "learning_rate": 9.238516562121713e-06, + "loss": 0.5581, + "step": 17400 + }, + { + "epoch": 0.38499615003849963, + "grad_norm": 1.779398798942566, + "learning_rate": 9.234114669307803e-06, + "loss": 0.5468, + "step": 17500 + }, + { + "epoch": 0.3871961280387196, + "grad_norm": 1.7732363939285278, + "learning_rate": 9.229712776493894e-06, + "loss": 0.558, + "step": 17600 + }, + { + "epoch": 0.3893961060389396, + "grad_norm": 1.7597503662109375, + "learning_rate": 9.225310883679984e-06, + "loss": 0.5231, + "step": 17700 + }, + { + "epoch": 0.39159608403915963, + "grad_norm": 1.8344216346740723, + "learning_rate": 9.220908990866072e-06, + "loss": 0.5428, + "step": 17800 + }, + { + "epoch": 0.3937960620393796, + "grad_norm": 1.662919044494629, + "learning_rate": 9.216507098052163e-06, + "loss": 0.5314, + "step": 17900 + }, + { + "epoch": 0.3959960400395996, + "grad_norm": 1.3180632591247559, + "learning_rate": 9.212105205238253e-06, + "loss": 0.5335, + "step": 18000 + }, + { + "epoch": 0.3981960180398196, + "grad_norm": 1.8466808795928955, + "learning_rate": 9.207703312424343e-06, + "loss": 0.5251, + "step": 18100 + }, + { + "epoch": 0.4003959960400396, + "grad_norm": 1.942530632019043, + "learning_rate": 9.203301419610434e-06, + "loss": 0.5361, + "step": 18200 + }, + { + "epoch": 0.4025959740402596, + "grad_norm": 1.6795586347579956, + "learning_rate": 9.198899526796524e-06, + "loss": 0.5322, + "step": 18300 + }, + { + "epoch": 0.4047959520404796, + "grad_norm": 1.8028258085250854, + "learning_rate": 9.194497633982612e-06, + "loss": 0.5332, + "step": 18400 + }, + { + "epoch": 0.4069959300406996, + "grad_norm": 1.9072916507720947, + "learning_rate": 9.190095741168703e-06, + "loss": 0.5436, + "step": 18500 + }, + { + "epoch": 0.4091959080409196, + "grad_norm": 1.849950909614563, + "learning_rate": 9.185693848354793e-06, + "loss": 0.5464, + "step": 18600 + }, + { + "epoch": 0.4113958860411396, + "grad_norm": 1.8676297664642334, + "learning_rate": 9.181291955540883e-06, + "loss": 0.5598, + "step": 18700 + }, + { + "epoch": 0.41359586404135956, + "grad_norm": 1.8260865211486816, + "learning_rate": 9.176890062726974e-06, + "loss": 0.5433, + "step": 18800 + }, + { + "epoch": 0.4157958420415796, + "grad_norm": 1.6370753049850464, + "learning_rate": 9.172488169913064e-06, + "loss": 0.5473, + "step": 18900 + }, + { + "epoch": 0.4179958200417996, + "grad_norm": 1.583030104637146, + "learning_rate": 9.168086277099152e-06, + "loss": 0.5478, + "step": 19000 + }, + { + "epoch": 0.42019579804201956, + "grad_norm": 1.895065188407898, + "learning_rate": 9.163684384285243e-06, + "loss": 0.5391, + "step": 19100 + }, + { + "epoch": 0.4223957760422396, + "grad_norm": 1.6694116592407227, + "learning_rate": 9.159282491471333e-06, + "loss": 0.5206, + "step": 19200 + }, + { + "epoch": 0.4245957540424596, + "grad_norm": 1.630575180053711, + "learning_rate": 9.154880598657423e-06, + "loss": 0.5451, + "step": 19300 + }, + { + "epoch": 0.42679573204267957, + "grad_norm": 2.0224249362945557, + "learning_rate": 9.150478705843514e-06, + "loss": 0.5334, + "step": 19400 + }, + { + "epoch": 0.42899571004289955, + "grad_norm": 1.6329941749572754, + "learning_rate": 9.146076813029602e-06, + "loss": 0.5279, + "step": 19500 + }, + { + "epoch": 0.4311956880431196, + "grad_norm": 1.3999661207199097, + "learning_rate": 9.141674920215694e-06, + "loss": 0.5366, + "step": 19600 + }, + { + "epoch": 0.43339566604333957, + "grad_norm": 1.5041108131408691, + "learning_rate": 9.137273027401784e-06, + "loss": 0.5324, + "step": 19700 + }, + { + "epoch": 0.43559564404355955, + "grad_norm": 1.714513897895813, + "learning_rate": 9.132871134587875e-06, + "loss": 0.5341, + "step": 19800 + }, + { + "epoch": 0.4377956220437796, + "grad_norm": 1.7554248571395874, + "learning_rate": 9.128469241773963e-06, + "loss": 0.5436, + "step": 19900 + }, + { + "epoch": 0.43999560004399957, + "grad_norm": 1.665436029434204, + "learning_rate": 9.124067348960054e-06, + "loss": 0.5299, + "step": 20000 + }, + { + "epoch": 0.44219557804421955, + "grad_norm": 1.668437123298645, + "learning_rate": 9.119665456146144e-06, + "loss": 0.5188, + "step": 20100 + }, + { + "epoch": 0.44439555604443953, + "grad_norm": 1.9339295625686646, + "learning_rate": 9.115263563332234e-06, + "loss": 0.5574, + "step": 20200 + }, + { + "epoch": 0.44659553404465957, + "grad_norm": 1.7263190746307373, + "learning_rate": 9.110861670518324e-06, + "loss": 0.5469, + "step": 20300 + }, + { + "epoch": 0.44879551204487955, + "grad_norm": 1.5733555555343628, + "learning_rate": 9.106459777704415e-06, + "loss": 0.529, + "step": 20400 + }, + { + "epoch": 0.45099549004509953, + "grad_norm": 1.6786284446716309, + "learning_rate": 9.102057884890503e-06, + "loss": 0.539, + "step": 20500 + }, + { + "epoch": 0.45319546804531957, + "grad_norm": 1.6025316715240479, + "learning_rate": 9.097655992076594e-06, + "loss": 0.5394, + "step": 20600 + }, + { + "epoch": 0.45539544604553955, + "grad_norm": 1.7945187091827393, + "learning_rate": 9.093254099262684e-06, + "loss": 0.5233, + "step": 20700 + }, + { + "epoch": 0.45759542404575954, + "grad_norm": 1.6407737731933594, + "learning_rate": 9.088852206448774e-06, + "loss": 0.547, + "step": 20800 + }, + { + "epoch": 0.4597954020459795, + "grad_norm": 1.623547911643982, + "learning_rate": 9.084450313634864e-06, + "loss": 0.5609, + "step": 20900 + }, + { + "epoch": 0.46199538004619956, + "grad_norm": 1.7454668283462524, + "learning_rate": 9.080048420820953e-06, + "loss": 0.5484, + "step": 21000 + }, + { + "epoch": 0.46419535804641954, + "grad_norm": 2.0362443923950195, + "learning_rate": 9.075646528007043e-06, + "loss": 0.5199, + "step": 21100 + }, + { + "epoch": 0.4663953360466395, + "grad_norm": 1.8968782424926758, + "learning_rate": 9.071244635193134e-06, + "loss": 0.5471, + "step": 21200 + }, + { + "epoch": 0.46859531404685956, + "grad_norm": 1.7040385007858276, + "learning_rate": 9.066842742379224e-06, + "loss": 0.5167, + "step": 21300 + }, + { + "epoch": 0.47079529204707954, + "grad_norm": 1.8420989513397217, + "learning_rate": 9.062440849565314e-06, + "loss": 0.5359, + "step": 21400 + }, + { + "epoch": 0.4729952700472995, + "grad_norm": 1.6311464309692383, + "learning_rate": 9.058038956751404e-06, + "loss": 0.5375, + "step": 21500 + }, + { + "epoch": 0.4751952480475195, + "grad_norm": 2.0437209606170654, + "learning_rate": 9.053637063937493e-06, + "loss": 0.5427, + "step": 21600 + }, + { + "epoch": 0.47739522604773954, + "grad_norm": 1.6111825704574585, + "learning_rate": 9.049235171123583e-06, + "loss": 0.526, + "step": 21700 + }, + { + "epoch": 0.4795952040479595, + "grad_norm": 1.3677709102630615, + "learning_rate": 9.044833278309674e-06, + "loss": 0.5328, + "step": 21800 + }, + { + "epoch": 0.4817951820481795, + "grad_norm": 2.1056365966796875, + "learning_rate": 9.040431385495764e-06, + "loss": 0.5391, + "step": 21900 + }, + { + "epoch": 0.48399516004839954, + "grad_norm": 1.807760238647461, + "learning_rate": 9.036029492681854e-06, + "loss": 0.5606, + "step": 22000 + }, + { + "epoch": 0.4861951380486195, + "grad_norm": 1.8556056022644043, + "learning_rate": 9.031627599867944e-06, + "loss": 0.5351, + "step": 22100 + }, + { + "epoch": 0.4883951160488395, + "grad_norm": 2.0106847286224365, + "learning_rate": 9.027225707054033e-06, + "loss": 0.5542, + "step": 22200 + }, + { + "epoch": 0.4905950940490595, + "grad_norm": 1.6676563024520874, + "learning_rate": 9.022823814240123e-06, + "loss": 0.538, + "step": 22300 + }, + { + "epoch": 0.4927950720492795, + "grad_norm": 1.4103186130523682, + "learning_rate": 9.018421921426214e-06, + "loss": 0.5241, + "step": 22400 + }, + { + "epoch": 0.4949950500494995, + "grad_norm": 1.8032267093658447, + "learning_rate": 9.014020028612304e-06, + "loss": 0.5367, + "step": 22500 + }, + { + "epoch": 0.4971950280497195, + "grad_norm": 1.6195557117462158, + "learning_rate": 9.009618135798394e-06, + "loss": 0.5434, + "step": 22600 + }, + { + "epoch": 0.4993950060499395, + "grad_norm": 1.7808386087417603, + "learning_rate": 9.005216242984484e-06, + "loss": 0.5421, + "step": 22700 + }, + { + "epoch": 0.5015949840501595, + "grad_norm": 1.746341586112976, + "learning_rate": 9.000814350170573e-06, + "loss": 0.5362, + "step": 22800 + }, + { + "epoch": 0.5037949620503795, + "grad_norm": 2.1744487285614014, + "learning_rate": 8.996412457356663e-06, + "loss": 0.5243, + "step": 22900 + }, + { + "epoch": 0.5059949400505995, + "grad_norm": 1.7973219156265259, + "learning_rate": 8.992010564542755e-06, + "loss": 0.5504, + "step": 23000 + }, + { + "epoch": 0.5081949180508195, + "grad_norm": 1.6203027963638306, + "learning_rate": 8.987608671728844e-06, + "loss": 0.5426, + "step": 23100 + }, + { + "epoch": 0.5103948960510395, + "grad_norm": 1.6453986167907715, + "learning_rate": 8.983206778914934e-06, + "loss": 0.548, + "step": 23200 + }, + { + "epoch": 0.5125948740512595, + "grad_norm": 1.8163201808929443, + "learning_rate": 8.978804886101024e-06, + "loss": 0.5306, + "step": 23300 + }, + { + "epoch": 0.5147948520514795, + "grad_norm": 1.7606194019317627, + "learning_rate": 8.974402993287115e-06, + "loss": 0.5318, + "step": 23400 + }, + { + "epoch": 0.5169948300516994, + "grad_norm": 1.9621275663375854, + "learning_rate": 8.970001100473205e-06, + "loss": 0.5289, + "step": 23500 + }, + { + "epoch": 0.5191948080519195, + "grad_norm": 1.707217812538147, + "learning_rate": 8.965599207659295e-06, + "loss": 0.5374, + "step": 23600 + }, + { + "epoch": 0.5213947860521395, + "grad_norm": 1.9041409492492676, + "learning_rate": 8.961197314845384e-06, + "loss": 0.5512, + "step": 23700 + }, + { + "epoch": 0.5235947640523595, + "grad_norm": 1.7021831274032593, + "learning_rate": 8.956795422031474e-06, + "loss": 0.5363, + "step": 23800 + }, + { + "epoch": 0.5257947420525795, + "grad_norm": 1.6546313762664795, + "learning_rate": 8.952393529217564e-06, + "loss": 0.5355, + "step": 23900 + }, + { + "epoch": 0.5279947200527995, + "grad_norm": 2.1298437118530273, + "learning_rate": 8.947991636403655e-06, + "loss": 0.5336, + "step": 24000 + }, + { + "epoch": 0.5301946980530194, + "grad_norm": 1.6351710557937622, + "learning_rate": 8.943589743589745e-06, + "loss": 0.5298, + "step": 24100 + }, + { + "epoch": 0.5323946760532394, + "grad_norm": 1.7850167751312256, + "learning_rate": 8.939187850775835e-06, + "loss": 0.5295, + "step": 24200 + }, + { + "epoch": 0.5345946540534595, + "grad_norm": 1.6639127731323242, + "learning_rate": 8.934785957961924e-06, + "loss": 0.5482, + "step": 24300 + }, + { + "epoch": 0.5367946320536795, + "grad_norm": 1.6761794090270996, + "learning_rate": 8.930384065148014e-06, + "loss": 0.5398, + "step": 24400 + }, + { + "epoch": 0.5389946100538995, + "grad_norm": 2.0362918376922607, + "learning_rate": 8.925982172334104e-06, + "loss": 0.5387, + "step": 24500 + }, + { + "epoch": 0.5411945880541195, + "grad_norm": 1.5029228925704956, + "learning_rate": 8.921580279520195e-06, + "loss": 0.5296, + "step": 24600 + }, + { + "epoch": 0.5433945660543394, + "grad_norm": 1.7153294086456299, + "learning_rate": 8.917178386706285e-06, + "loss": 0.5395, + "step": 24700 + }, + { + "epoch": 0.5455945440545594, + "grad_norm": 1.6009351015090942, + "learning_rate": 8.912776493892375e-06, + "loss": 0.5301, + "step": 24800 + }, + { + "epoch": 0.5477945220547794, + "grad_norm": 1.7909400463104248, + "learning_rate": 8.908374601078464e-06, + "loss": 0.5292, + "step": 24900 + }, + { + "epoch": 0.5499945000549995, + "grad_norm": 2.1847472190856934, + "learning_rate": 8.903972708264554e-06, + "loss": 0.5326, + "step": 25000 + }, + { + "epoch": 0.5521944780552195, + "grad_norm": 2.270923614501953, + "learning_rate": 8.899570815450644e-06, + "loss": 0.545, + "step": 25100 + }, + { + "epoch": 0.5543944560554395, + "grad_norm": 2.044668436050415, + "learning_rate": 8.895168922636735e-06, + "loss": 0.5335, + "step": 25200 + }, + { + "epoch": 0.5565944340556594, + "grad_norm": 1.9989433288574219, + "learning_rate": 8.890767029822825e-06, + "loss": 0.5516, + "step": 25300 + }, + { + "epoch": 0.5587944120558794, + "grad_norm": 1.7529683113098145, + "learning_rate": 8.886365137008914e-06, + "loss": 0.5379, + "step": 25400 + }, + { + "epoch": 0.5609943900560994, + "grad_norm": 1.4954921007156372, + "learning_rate": 8.881963244195004e-06, + "loss": 0.5346, + "step": 25500 + }, + { + "epoch": 0.5631943680563194, + "grad_norm": 1.7510510683059692, + "learning_rate": 8.877561351381094e-06, + "loss": 0.5186, + "step": 25600 + }, + { + "epoch": 0.5653943460565395, + "grad_norm": 1.8264451026916504, + "learning_rate": 8.873159458567184e-06, + "loss": 0.5419, + "step": 25700 + }, + { + "epoch": 0.5675943240567595, + "grad_norm": 2.1004931926727295, + "learning_rate": 8.868757565753275e-06, + "loss": 0.5419, + "step": 25800 + }, + { + "epoch": 0.5697943020569795, + "grad_norm": 1.9316984415054321, + "learning_rate": 8.864355672939365e-06, + "loss": 0.5209, + "step": 25900 + }, + { + "epoch": 0.5719942800571994, + "grad_norm": 2.182731866836548, + "learning_rate": 8.859953780125454e-06, + "loss": 0.5356, + "step": 26000 + }, + { + "epoch": 0.5741942580574194, + "grad_norm": 1.6151630878448486, + "learning_rate": 8.855551887311544e-06, + "loss": 0.5419, + "step": 26100 + }, + { + "epoch": 0.5763942360576394, + "grad_norm": 1.8083909749984741, + "learning_rate": 8.851149994497634e-06, + "loss": 0.5218, + "step": 26200 + }, + { + "epoch": 0.5785942140578594, + "grad_norm": 1.6356123685836792, + "learning_rate": 8.846748101683724e-06, + "loss": 0.5256, + "step": 26300 + }, + { + "epoch": 0.5807941920580795, + "grad_norm": 2.2701175212860107, + "learning_rate": 8.842346208869815e-06, + "loss": 0.534, + "step": 26400 + }, + { + "epoch": 0.5829941700582995, + "grad_norm": 1.9146398305892944, + "learning_rate": 8.837944316055905e-06, + "loss": 0.5399, + "step": 26500 + }, + { + "epoch": 0.5851941480585194, + "grad_norm": 1.9954113960266113, + "learning_rate": 8.833542423241995e-06, + "loss": 0.537, + "step": 26600 + }, + { + "epoch": 0.5873941260587394, + "grad_norm": 1.6357481479644775, + "learning_rate": 8.829140530428086e-06, + "loss": 0.5322, + "step": 26700 + }, + { + "epoch": 0.5895941040589594, + "grad_norm": 1.7142163515090942, + "learning_rate": 8.824738637614176e-06, + "loss": 0.5475, + "step": 26800 + }, + { + "epoch": 0.5917940820591794, + "grad_norm": 1.7539161443710327, + "learning_rate": 8.820336744800266e-06, + "loss": 0.523, + "step": 26900 + }, + { + "epoch": 0.5939940600593994, + "grad_norm": 1.6141777038574219, + "learning_rate": 8.815934851986355e-06, + "loss": 0.5318, + "step": 27000 + }, + { + "epoch": 0.5961940380596195, + "grad_norm": 2.0629382133483887, + "learning_rate": 8.811532959172445e-06, + "loss": 0.5334, + "step": 27100 + }, + { + "epoch": 0.5983940160598394, + "grad_norm": 1.999254584312439, + "learning_rate": 8.807131066358535e-06, + "loss": 0.5504, + "step": 27200 + }, + { + "epoch": 0.6005939940600594, + "grad_norm": 1.8531382083892822, + "learning_rate": 8.802729173544626e-06, + "loss": 0.5376, + "step": 27300 + }, + { + "epoch": 0.6027939720602794, + "grad_norm": 1.4768983125686646, + "learning_rate": 8.798327280730716e-06, + "loss": 0.5344, + "step": 27400 + }, + { + "epoch": 0.6049939500604994, + "grad_norm": 1.7571672201156616, + "learning_rate": 8.793925387916804e-06, + "loss": 0.5342, + "step": 27500 + }, + { + "epoch": 0.6071939280607194, + "grad_norm": 1.7986180782318115, + "learning_rate": 8.789523495102895e-06, + "loss": 0.5474, + "step": 27600 + }, + { + "epoch": 0.6093939060609393, + "grad_norm": 1.9569381475448608, + "learning_rate": 8.785121602288985e-06, + "loss": 0.5403, + "step": 27700 + }, + { + "epoch": 0.6115938840611594, + "grad_norm": 2.1773102283477783, + "learning_rate": 8.780719709475075e-06, + "loss": 0.5239, + "step": 27800 + }, + { + "epoch": 0.6137938620613794, + "grad_norm": 2.050550937652588, + "learning_rate": 8.776317816661166e-06, + "loss": 0.5253, + "step": 27900 + }, + { + "epoch": 0.6159938400615994, + "grad_norm": 1.7763617038726807, + "learning_rate": 8.771915923847256e-06, + "loss": 0.5283, + "step": 28000 + }, + { + "epoch": 0.6181938180618194, + "grad_norm": 1.6701637506484985, + "learning_rate": 8.767514031033344e-06, + "loss": 0.5316, + "step": 28100 + }, + { + "epoch": 0.6203937960620394, + "grad_norm": 1.6922410726547241, + "learning_rate": 8.763112138219435e-06, + "loss": 0.5384, + "step": 28200 + }, + { + "epoch": 0.6225937740622594, + "grad_norm": 2.3351800441741943, + "learning_rate": 8.758710245405525e-06, + "loss": 0.5462, + "step": 28300 + }, + { + "epoch": 0.6247937520624793, + "grad_norm": 1.7946525812149048, + "learning_rate": 8.754308352591615e-06, + "loss": 0.5341, + "step": 28400 + }, + { + "epoch": 0.6269937300626994, + "grad_norm": 1.6485981941223145, + "learning_rate": 8.749906459777706e-06, + "loss": 0.5229, + "step": 28500 + }, + { + "epoch": 0.6291937080629194, + "grad_norm": 2.138338327407837, + "learning_rate": 8.745504566963796e-06, + "loss": 0.5489, + "step": 28600 + }, + { + "epoch": 0.6313936860631394, + "grad_norm": 1.7668613195419312, + "learning_rate": 8.741102674149884e-06, + "loss": 0.5239, + "step": 28700 + }, + { + "epoch": 0.6335936640633594, + "grad_norm": 2.0970587730407715, + "learning_rate": 8.736700781335975e-06, + "loss": 0.5313, + "step": 28800 + }, + { + "epoch": 0.6357936420635794, + "grad_norm": 1.7800394296646118, + "learning_rate": 8.732298888522065e-06, + "loss": 0.5322, + "step": 28900 + }, + { + "epoch": 0.6379936200637993, + "grad_norm": 1.7388654947280884, + "learning_rate": 8.727896995708155e-06, + "loss": 0.5291, + "step": 29000 + }, + { + "epoch": 0.6401935980640193, + "grad_norm": 1.6228729486465454, + "learning_rate": 8.723495102894246e-06, + "loss": 0.5318, + "step": 29100 + }, + { + "epoch": 0.6423935760642393, + "grad_norm": 2.1541671752929688, + "learning_rate": 8.719093210080334e-06, + "loss": 0.5376, + "step": 29200 + }, + { + "epoch": 0.6445935540644594, + "grad_norm": 2.0600032806396484, + "learning_rate": 8.714691317266424e-06, + "loss": 0.5342, + "step": 29300 + }, + { + "epoch": 0.6467935320646794, + "grad_norm": 1.673624873161316, + "learning_rate": 8.710289424452515e-06, + "loss": 0.5533, + "step": 29400 + }, + { + "epoch": 0.6489935100648994, + "grad_norm": 1.8217624425888062, + "learning_rate": 8.705887531638605e-06, + "loss": 0.526, + "step": 29500 + }, + { + "epoch": 0.6511934880651193, + "grad_norm": 2.1350643634796143, + "learning_rate": 8.701485638824695e-06, + "loss": 0.5254, + "step": 29600 + }, + { + "epoch": 0.6533934660653393, + "grad_norm": 1.7675269842147827, + "learning_rate": 8.697083746010786e-06, + "loss": 0.5191, + "step": 29700 + }, + { + "epoch": 0.6555934440655593, + "grad_norm": 2.134058952331543, + "learning_rate": 8.692681853196874e-06, + "loss": 0.5329, + "step": 29800 + }, + { + "epoch": 0.6577934220657793, + "grad_norm": 1.6623740196228027, + "learning_rate": 8.688279960382964e-06, + "loss": 0.5287, + "step": 29900 + }, + { + "epoch": 0.6599934000659994, + "grad_norm": 2.05334210395813, + "learning_rate": 8.683878067569056e-06, + "loss": 0.5393, + "step": 30000 + }, + { + "epoch": 0.6621933780662194, + "grad_norm": 1.7684849500656128, + "learning_rate": 8.679476174755147e-06, + "loss": 0.527, + "step": 30100 + }, + { + "epoch": 0.6643933560664393, + "grad_norm": 1.825725793838501, + "learning_rate": 8.675074281941235e-06, + "loss": 0.5314, + "step": 30200 + }, + { + "epoch": 0.6665933340666593, + "grad_norm": 1.9619163274765015, + "learning_rate": 8.670672389127326e-06, + "loss": 0.5238, + "step": 30300 + }, + { + "epoch": 0.6687933120668793, + "grad_norm": 1.7254787683486938, + "learning_rate": 8.666270496313416e-06, + "loss": 0.5253, + "step": 30400 + }, + { + "epoch": 0.6709932900670993, + "grad_norm": 1.739046335220337, + "learning_rate": 8.661868603499506e-06, + "loss": 0.5452, + "step": 30500 + }, + { + "epoch": 0.6731932680673193, + "grad_norm": 1.9458619356155396, + "learning_rate": 8.657466710685596e-06, + "loss": 0.5253, + "step": 30600 + }, + { + "epoch": 0.6753932460675394, + "grad_norm": 1.9501069784164429, + "learning_rate": 8.653064817871687e-06, + "loss": 0.5313, + "step": 30700 + }, + { + "epoch": 0.6775932240677593, + "grad_norm": 1.4754610061645508, + "learning_rate": 8.648662925057775e-06, + "loss": 0.5409, + "step": 30800 + }, + { + "epoch": 0.6797932020679793, + "grad_norm": 1.7951412200927734, + "learning_rate": 8.644261032243866e-06, + "loss": 0.558, + "step": 30900 + }, + { + "epoch": 0.6819931800681993, + "grad_norm": 1.5883880853652954, + "learning_rate": 8.639859139429956e-06, + "loss": 0.5668, + "step": 31000 + }, + { + "epoch": 0.6841931580684193, + "grad_norm": 1.7715564966201782, + "learning_rate": 8.635457246616046e-06, + "loss": 0.5567, + "step": 31100 + }, + { + "epoch": 0.6863931360686393, + "grad_norm": 1.7103959321975708, + "learning_rate": 8.631055353802136e-06, + "loss": 0.5646, + "step": 31200 + }, + { + "epoch": 0.6885931140688593, + "grad_norm": 2.053924322128296, + "learning_rate": 8.626653460988225e-06, + "loss": 0.5554, + "step": 31300 + }, + { + "epoch": 0.6907930920690794, + "grad_norm": 1.3964165449142456, + "learning_rate": 8.622251568174315e-06, + "loss": 0.5341, + "step": 31400 + }, + { + "epoch": 0.6929930700692993, + "grad_norm": 1.623286485671997, + "learning_rate": 8.617849675360406e-06, + "loss": 0.5475, + "step": 31500 + }, + { + "epoch": 0.6951930480695193, + "grad_norm": 1.5909929275512695, + "learning_rate": 8.613447782546496e-06, + "loss": 0.543, + "step": 31600 + }, + { + "epoch": 0.6973930260697393, + "grad_norm": 1.6793596744537354, + "learning_rate": 8.609045889732586e-06, + "loss": 0.5642, + "step": 31700 + }, + { + "epoch": 0.6995930040699593, + "grad_norm": 1.5003210306167603, + "learning_rate": 8.604643996918676e-06, + "loss": 0.5528, + "step": 31800 + }, + { + "epoch": 0.7017929820701793, + "grad_norm": 1.6098058223724365, + "learning_rate": 8.600242104104765e-06, + "loss": 0.5591, + "step": 31900 + }, + { + "epoch": 0.7039929600703992, + "grad_norm": 1.8180344104766846, + "learning_rate": 8.595840211290855e-06, + "loss": 0.5575, + "step": 32000 + }, + { + "epoch": 0.7061929380706193, + "grad_norm": 1.6185832023620605, + "learning_rate": 8.591438318476946e-06, + "loss": 0.5555, + "step": 32100 + }, + { + "epoch": 0.7083929160708393, + "grad_norm": 1.7686482667922974, + "learning_rate": 8.587036425663036e-06, + "loss": 0.5562, + "step": 32200 + }, + { + "epoch": 0.7105928940710593, + "grad_norm": 1.6809719800949097, + "learning_rate": 8.582634532849126e-06, + "loss": 0.5519, + "step": 32300 + }, + { + "epoch": 0.7127928720712793, + "grad_norm": 1.8532384634017944, + "learning_rate": 8.578232640035216e-06, + "loss": 0.5466, + "step": 32400 + }, + { + "epoch": 0.7149928500714993, + "grad_norm": 1.6389007568359375, + "learning_rate": 8.573830747221305e-06, + "loss": 0.5527, + "step": 32500 + }, + { + "epoch": 0.7171928280717192, + "grad_norm": 1.6388925313949585, + "learning_rate": 8.569428854407395e-06, + "loss": 0.5439, + "step": 32600 + }, + { + "epoch": 0.7193928060719392, + "grad_norm": 1.7384296655654907, + "learning_rate": 8.565026961593486e-06, + "loss": 0.5375, + "step": 32700 + }, + { + "epoch": 0.7215927840721593, + "grad_norm": 1.7327488660812378, + "learning_rate": 8.560625068779576e-06, + "loss": 0.5548, + "step": 32800 + }, + { + "epoch": 0.7237927620723793, + "grad_norm": 1.564349889755249, + "learning_rate": 8.556223175965666e-06, + "loss": 0.5573, + "step": 32900 + }, + { + "epoch": 0.7259927400725993, + "grad_norm": 1.8052953481674194, + "learning_rate": 8.551821283151756e-06, + "loss": 0.524, + "step": 33000 + }, + { + "epoch": 0.7281927180728193, + "grad_norm": 1.5981229543685913, + "learning_rate": 8.547419390337845e-06, + "loss": 0.5449, + "step": 33100 + }, + { + "epoch": 0.7303926960730392, + "grad_norm": 1.4789613485336304, + "learning_rate": 8.543017497523935e-06, + "loss": 0.5356, + "step": 33200 + }, + { + "epoch": 0.7325926740732592, + "grad_norm": 1.8192943334579468, + "learning_rate": 8.538615604710026e-06, + "loss": 0.5691, + "step": 33300 + }, + { + "epoch": 0.7347926520734792, + "grad_norm": 1.874607801437378, + "learning_rate": 8.534213711896116e-06, + "loss": 0.5539, + "step": 33400 + }, + { + "epoch": 0.7369926300736993, + "grad_norm": 1.6394860744476318, + "learning_rate": 8.529811819082206e-06, + "loss": 0.5653, + "step": 33500 + }, + { + "epoch": 0.7391926080739193, + "grad_norm": 1.9063067436218262, + "learning_rate": 8.525409926268296e-06, + "loss": 0.5515, + "step": 33600 + }, + { + "epoch": 0.7413925860741393, + "grad_norm": 1.6854544878005981, + "learning_rate": 8.521008033454387e-06, + "loss": 0.5534, + "step": 33700 + }, + { + "epoch": 0.7435925640743593, + "grad_norm": 1.7821418046951294, + "learning_rate": 8.516606140640477e-06, + "loss": 0.5521, + "step": 33800 + }, + { + "epoch": 0.7457925420745792, + "grad_norm": 1.5063166618347168, + "learning_rate": 8.512204247826567e-06, + "loss": 0.5667, + "step": 33900 + }, + { + "epoch": 0.7479925200747992, + "grad_norm": 1.9604572057724, + "learning_rate": 8.507802355012656e-06, + "loss": 0.5434, + "step": 34000 + }, + { + "epoch": 0.7501924980750192, + "grad_norm": 1.8538181781768799, + "learning_rate": 8.503400462198746e-06, + "loss": 0.5366, + "step": 34100 + }, + { + "epoch": 0.7523924760752393, + "grad_norm": 1.8284313678741455, + "learning_rate": 8.498998569384836e-06, + "loss": 0.5549, + "step": 34200 + }, + { + "epoch": 0.7545924540754593, + "grad_norm": 1.5392765998840332, + "learning_rate": 8.494596676570927e-06, + "loss": 0.5459, + "step": 34300 + }, + { + "epoch": 0.7567924320756793, + "grad_norm": 1.601608157157898, + "learning_rate": 8.490194783757017e-06, + "loss": 0.5478, + "step": 34400 + }, + { + "epoch": 0.7589924100758992, + "grad_norm": 1.602129340171814, + "learning_rate": 8.485792890943107e-06, + "loss": 0.5264, + "step": 34500 + }, + { + "epoch": 0.7611923880761192, + "grad_norm": 1.5455442667007446, + "learning_rate": 8.481390998129196e-06, + "loss": 0.5452, + "step": 34600 + }, + { + "epoch": 0.7633923660763392, + "grad_norm": 1.7308459281921387, + "learning_rate": 8.476989105315286e-06, + "loss": 0.5346, + "step": 34700 + }, + { + "epoch": 0.7655923440765592, + "grad_norm": 1.9421132802963257, + "learning_rate": 8.472587212501376e-06, + "loss": 0.5502, + "step": 34800 + }, + { + "epoch": 0.7677923220767793, + "grad_norm": 1.6126275062561035, + "learning_rate": 8.468185319687467e-06, + "loss": 0.5531, + "step": 34900 + }, + { + "epoch": 0.7699923000769993, + "grad_norm": 1.9307098388671875, + "learning_rate": 8.463783426873557e-06, + "loss": 0.5451, + "step": 35000 + }, + { + "epoch": 0.7721922780772192, + "grad_norm": 1.785501480102539, + "learning_rate": 8.459381534059646e-06, + "loss": 0.5657, + "step": 35100 + }, + { + "epoch": 0.7743922560774392, + "grad_norm": 1.3118321895599365, + "learning_rate": 8.454979641245736e-06, + "loss": 0.5425, + "step": 35200 + }, + { + "epoch": 0.7765922340776592, + "grad_norm": 1.6785212755203247, + "learning_rate": 8.450577748431826e-06, + "loss": 0.5608, + "step": 35300 + }, + { + "epoch": 0.7787922120778792, + "grad_norm": 1.687156081199646, + "learning_rate": 8.446175855617916e-06, + "loss": 0.5268, + "step": 35400 + }, + { + "epoch": 0.7809921900780992, + "grad_norm": 1.6766939163208008, + "learning_rate": 8.441773962804007e-06, + "loss": 0.5505, + "step": 35500 + }, + { + "epoch": 0.7831921680783193, + "grad_norm": 1.3873755931854248, + "learning_rate": 8.437372069990097e-06, + "loss": 0.5346, + "step": 35600 + }, + { + "epoch": 0.7853921460785392, + "grad_norm": 1.4507646560668945, + "learning_rate": 8.432970177176186e-06, + "loss": 0.5456, + "step": 35700 + }, + { + "epoch": 0.7875921240787592, + "grad_norm": 1.7354850769042969, + "learning_rate": 8.428568284362276e-06, + "loss": 0.5502, + "step": 35800 + }, + { + "epoch": 0.7897921020789792, + "grad_norm": 1.4922300577163696, + "learning_rate": 8.424166391548366e-06, + "loss": 0.5628, + "step": 35900 + }, + { + "epoch": 0.7919920800791992, + "grad_norm": 1.722380518913269, + "learning_rate": 8.419764498734456e-06, + "loss": 0.5556, + "step": 36000 + }, + { + "epoch": 0.7941920580794192, + "grad_norm": 1.905194640159607, + "learning_rate": 8.415362605920547e-06, + "loss": 0.5529, + "step": 36100 + }, + { + "epoch": 0.7963920360796392, + "grad_norm": 2.140815496444702, + "learning_rate": 8.410960713106637e-06, + "loss": 0.5567, + "step": 36200 + }, + { + "epoch": 0.7985920140798592, + "grad_norm": 1.5261491537094116, + "learning_rate": 8.406558820292726e-06, + "loss": 0.554, + "step": 36300 + }, + { + "epoch": 0.8007919920800792, + "grad_norm": 1.6273101568222046, + "learning_rate": 8.402156927478816e-06, + "loss": 0.5534, + "step": 36400 + }, + { + "epoch": 0.8029919700802992, + "grad_norm": 1.7818236351013184, + "learning_rate": 8.397755034664906e-06, + "loss": 0.5408, + "step": 36500 + }, + { + "epoch": 0.8051919480805192, + "grad_norm": 1.9317457675933838, + "learning_rate": 8.393353141850996e-06, + "loss": 0.5726, + "step": 36600 + }, + { + "epoch": 0.8073919260807392, + "grad_norm": 1.813769817352295, + "learning_rate": 8.388951249037087e-06, + "loss": 0.5605, + "step": 36700 + }, + { + "epoch": 0.8095919040809592, + "grad_norm": 1.9883424043655396, + "learning_rate": 8.384549356223177e-06, + "loss": 0.5489, + "step": 36800 + }, + { + "epoch": 0.8117918820811791, + "grad_norm": 1.709024429321289, + "learning_rate": 8.380147463409267e-06, + "loss": 0.5411, + "step": 36900 + }, + { + "epoch": 0.8139918600813992, + "grad_norm": 1.4431244134902954, + "learning_rate": 8.375745570595357e-06, + "loss": 0.5472, + "step": 37000 + }, + { + "epoch": 0.8161918380816192, + "grad_norm": 1.5251537561416626, + "learning_rate": 8.371343677781448e-06, + "loss": 0.5479, + "step": 37100 + }, + { + "epoch": 0.8183918160818392, + "grad_norm": 1.687023401260376, + "learning_rate": 8.366941784967536e-06, + "loss": 0.543, + "step": 37200 + }, + { + "epoch": 0.8205917940820592, + "grad_norm": 1.5462446212768555, + "learning_rate": 8.362539892153627e-06, + "loss": 0.55, + "step": 37300 + }, + { + "epoch": 0.8227917720822792, + "grad_norm": 1.984750747680664, + "learning_rate": 8.358137999339717e-06, + "loss": 0.5495, + "step": 37400 + }, + { + "epoch": 0.8249917500824991, + "grad_norm": 1.6375317573547363, + "learning_rate": 8.353736106525807e-06, + "loss": 0.5479, + "step": 37500 + }, + { + "epoch": 0.8271917280827191, + "grad_norm": 1.8285633325576782, + "learning_rate": 8.349334213711897e-06, + "loss": 0.5398, + "step": 37600 + }, + { + "epoch": 0.8293917060829392, + "grad_norm": 1.7603964805603027, + "learning_rate": 8.344932320897988e-06, + "loss": 0.5343, + "step": 37700 + }, + { + "epoch": 0.8315916840831592, + "grad_norm": 1.4836808443069458, + "learning_rate": 8.340530428084076e-06, + "loss": 0.5559, + "step": 37800 + }, + { + "epoch": 0.8337916620833792, + "grad_norm": 1.4867973327636719, + "learning_rate": 8.336128535270167e-06, + "loss": 0.5433, + "step": 37900 + }, + { + "epoch": 0.8359916400835992, + "grad_norm": 1.784264326095581, + "learning_rate": 8.331726642456257e-06, + "loss": 0.5451, + "step": 38000 + }, + { + "epoch": 0.8381916180838191, + "grad_norm": 1.3747423887252808, + "learning_rate": 8.327324749642347e-06, + "loss": 0.538, + "step": 38100 + }, + { + "epoch": 0.8403915960840391, + "grad_norm": 1.8073352575302124, + "learning_rate": 8.322922856828437e-06, + "loss": 0.545, + "step": 38200 + }, + { + "epoch": 0.8425915740842591, + "grad_norm": 1.6162651777267456, + "learning_rate": 8.318520964014528e-06, + "loss": 0.5448, + "step": 38300 + }, + { + "epoch": 0.8447915520844792, + "grad_norm": 1.6627821922302246, + "learning_rate": 8.314119071200616e-06, + "loss": 0.5504, + "step": 38400 + }, + { + "epoch": 0.8469915300846992, + "grad_norm": 1.594759464263916, + "learning_rate": 8.309717178386707e-06, + "loss": 0.5344, + "step": 38500 + }, + { + "epoch": 0.8491915080849192, + "grad_norm": 1.7449952363967896, + "learning_rate": 8.305315285572797e-06, + "loss": 0.5558, + "step": 38600 + }, + { + "epoch": 0.8513914860851391, + "grad_norm": 1.6787577867507935, + "learning_rate": 8.300913392758887e-06, + "loss": 0.5282, + "step": 38700 + }, + { + "epoch": 0.8535914640853591, + "grad_norm": 2.2145471572875977, + "learning_rate": 8.296511499944977e-06, + "loss": 0.5371, + "step": 38800 + }, + { + "epoch": 0.8557914420855791, + "grad_norm": 1.7959023714065552, + "learning_rate": 8.292109607131068e-06, + "loss": 0.5467, + "step": 38900 + }, + { + "epoch": 0.8579914200857991, + "grad_norm": 1.7362741231918335, + "learning_rate": 8.287707714317156e-06, + "loss": 0.5334, + "step": 39000 + }, + { + "epoch": 0.8601913980860192, + "grad_norm": 1.471660852432251, + "learning_rate": 8.283305821503247e-06, + "loss": 0.5563, + "step": 39100 + }, + { + "epoch": 0.8623913760862392, + "grad_norm": 1.9247560501098633, + "learning_rate": 8.278903928689337e-06, + "loss": 0.5422, + "step": 39200 + }, + { + "epoch": 0.8645913540864592, + "grad_norm": 1.4459770917892456, + "learning_rate": 8.274502035875427e-06, + "loss": 0.5549, + "step": 39300 + }, + { + "epoch": 0.8667913320866791, + "grad_norm": 1.8843663930892944, + "learning_rate": 8.270100143061517e-06, + "loss": 0.5463, + "step": 39400 + }, + { + "epoch": 0.8689913100868991, + "grad_norm": 1.6664437055587769, + "learning_rate": 8.265698250247606e-06, + "loss": 0.557, + "step": 39500 + }, + { + "epoch": 0.8711912880871191, + "grad_norm": 1.8281344175338745, + "learning_rate": 8.261296357433696e-06, + "loss": 0.5306, + "step": 39600 + }, + { + "epoch": 0.8733912660873391, + "grad_norm": 1.9608473777770996, + "learning_rate": 8.256894464619787e-06, + "loss": 0.5458, + "step": 39700 + }, + { + "epoch": 0.8755912440875592, + "grad_norm": 1.9003684520721436, + "learning_rate": 8.252492571805877e-06, + "loss": 0.55, + "step": 39800 + }, + { + "epoch": 0.8777912220877792, + "grad_norm": 1.8628289699554443, + "learning_rate": 8.248090678991967e-06, + "loss": 0.5379, + "step": 39900 + }, + { + "epoch": 0.8799912000879991, + "grad_norm": 1.5854053497314453, + "learning_rate": 8.243688786178057e-06, + "loss": 0.5352, + "step": 40000 + }, + { + "epoch": 0.8821911780882191, + "grad_norm": 1.957435965538025, + "learning_rate": 8.239286893364146e-06, + "loss": 0.5358, + "step": 40100 + }, + { + "epoch": 0.8843911560884391, + "grad_norm": 1.838132381439209, + "learning_rate": 8.234885000550236e-06, + "loss": 0.5423, + "step": 40200 + }, + { + "epoch": 0.8865911340886591, + "grad_norm": 1.936266541481018, + "learning_rate": 8.230483107736327e-06, + "loss": 0.5335, + "step": 40300 + }, + { + "epoch": 0.8887911120888791, + "grad_norm": 1.5629870891571045, + "learning_rate": 8.226081214922419e-06, + "loss": 0.5354, + "step": 40400 + }, + { + "epoch": 0.8909910900890992, + "grad_norm": 1.7080520391464233, + "learning_rate": 8.221679322108507e-06, + "loss": 0.5532, + "step": 40500 + }, + { + "epoch": 0.8931910680893191, + "grad_norm": 1.795921802520752, + "learning_rate": 8.217277429294597e-06, + "loss": 0.5528, + "step": 40600 + }, + { + "epoch": 0.8953910460895391, + "grad_norm": 1.955198884010315, + "learning_rate": 8.212875536480688e-06, + "loss": 0.5598, + "step": 40700 + }, + { + "epoch": 0.8975910240897591, + "grad_norm": 1.865143895149231, + "learning_rate": 8.208473643666778e-06, + "loss": 0.5371, + "step": 40800 + }, + { + "epoch": 0.8997910020899791, + "grad_norm": 1.8305407762527466, + "learning_rate": 8.204071750852868e-06, + "loss": 0.5459, + "step": 40900 + }, + { + "epoch": 0.9019909800901991, + "grad_norm": 2.158996820449829, + "learning_rate": 8.199669858038959e-06, + "loss": 0.5477, + "step": 41000 + }, + { + "epoch": 0.904190958090419, + "grad_norm": 1.5184693336486816, + "learning_rate": 8.195267965225047e-06, + "loss": 0.5536, + "step": 41100 + }, + { + "epoch": 0.9063909360906391, + "grad_norm": 1.2580761909484863, + "learning_rate": 8.190866072411137e-06, + "loss": 0.5444, + "step": 41200 + }, + { + "epoch": 0.9085909140908591, + "grad_norm": 1.5662882328033447, + "learning_rate": 8.186464179597228e-06, + "loss": 0.5474, + "step": 41300 + }, + { + "epoch": 0.9107908920910791, + "grad_norm": 1.775161623954773, + "learning_rate": 8.182062286783318e-06, + "loss": 0.5405, + "step": 41400 + }, + { + "epoch": 0.9129908700912991, + "grad_norm": 1.604435920715332, + "learning_rate": 8.177660393969408e-06, + "loss": 0.5425, + "step": 41500 + }, + { + "epoch": 0.9151908480915191, + "grad_norm": 1.9549158811569214, + "learning_rate": 8.173258501155497e-06, + "loss": 0.5398, + "step": 41600 + }, + { + "epoch": 0.917390826091739, + "grad_norm": 1.4547535181045532, + "learning_rate": 8.168856608341587e-06, + "loss": 0.5511, + "step": 41700 + }, + { + "epoch": 0.919590804091959, + "grad_norm": 1.8771201372146606, + "learning_rate": 8.164454715527677e-06, + "loss": 0.5481, + "step": 41800 + }, + { + "epoch": 0.9217907820921791, + "grad_norm": 2.0473129749298096, + "learning_rate": 8.160052822713768e-06, + "loss": 0.5418, + "step": 41900 + }, + { + "epoch": 0.9239907600923991, + "grad_norm": 1.8082759380340576, + "learning_rate": 8.155650929899858e-06, + "loss": 0.5346, + "step": 42000 + }, + { + "epoch": 0.9261907380926191, + "grad_norm": 1.8849467039108276, + "learning_rate": 8.151249037085948e-06, + "loss": 0.5563, + "step": 42100 + }, + { + "epoch": 0.9283907160928391, + "grad_norm": 1.6767569780349731, + "learning_rate": 8.146847144272037e-06, + "loss": 0.536, + "step": 42200 + }, + { + "epoch": 0.9305906940930591, + "grad_norm": 1.9930092096328735, + "learning_rate": 8.142445251458127e-06, + "loss": 0.5507, + "step": 42300 + }, + { + "epoch": 0.932790672093279, + "grad_norm": 1.9420870542526245, + "learning_rate": 8.138043358644217e-06, + "loss": 0.5405, + "step": 42400 + }, + { + "epoch": 0.934990650093499, + "grad_norm": 1.6965640783309937, + "learning_rate": 8.133641465830308e-06, + "loss": 0.5469, + "step": 42500 + }, + { + "epoch": 0.9371906280937191, + "grad_norm": 1.4808323383331299, + "learning_rate": 8.129239573016398e-06, + "loss": 0.5341, + "step": 42600 + }, + { + "epoch": 0.9393906060939391, + "grad_norm": 1.516119122505188, + "learning_rate": 8.124837680202488e-06, + "loss": 0.5515, + "step": 42700 + }, + { + "epoch": 0.9415905840941591, + "grad_norm": 1.6243934631347656, + "learning_rate": 8.120435787388577e-06, + "loss": 0.541, + "step": 42800 + }, + { + "epoch": 0.9437905620943791, + "grad_norm": 1.6918444633483887, + "learning_rate": 8.116033894574667e-06, + "loss": 0.5302, + "step": 42900 + }, + { + "epoch": 0.945990540094599, + "grad_norm": 1.6359889507293701, + "learning_rate": 8.111632001760757e-06, + "loss": 0.5295, + "step": 43000 + }, + { + "epoch": 0.948190518094819, + "grad_norm": 1.7587625980377197, + "learning_rate": 8.107230108946848e-06, + "loss": 0.5415, + "step": 43100 + }, + { + "epoch": 0.950390496095039, + "grad_norm": 1.8017805814743042, + "learning_rate": 8.102828216132938e-06, + "loss": 0.5422, + "step": 43200 + }, + { + "epoch": 0.9525904740952591, + "grad_norm": 1.970982313156128, + "learning_rate": 8.098426323319027e-06, + "loss": 0.5296, + "step": 43300 + }, + { + "epoch": 0.9547904520954791, + "grad_norm": 1.8112688064575195, + "learning_rate": 8.094024430505117e-06, + "loss": 0.5539, + "step": 43400 + }, + { + "epoch": 0.9569904300956991, + "grad_norm": 1.7808321714401245, + "learning_rate": 8.089622537691207e-06, + "loss": 0.5498, + "step": 43500 + }, + { + "epoch": 0.959190408095919, + "grad_norm": 1.9657952785491943, + "learning_rate": 8.085220644877297e-06, + "loss": 0.5424, + "step": 43600 + }, + { + "epoch": 0.961390386096139, + "grad_norm": 1.8520526885986328, + "learning_rate": 8.080818752063388e-06, + "loss": 0.5392, + "step": 43700 + }, + { + "epoch": 0.963590364096359, + "grad_norm": 1.7919948101043701, + "learning_rate": 8.076416859249478e-06, + "loss": 0.532, + "step": 43800 + }, + { + "epoch": 0.965790342096579, + "grad_norm": 1.600967288017273, + "learning_rate": 8.072014966435568e-06, + "loss": 0.5406, + "step": 43900 + }, + { + "epoch": 0.9679903200967991, + "grad_norm": 1.638075351715088, + "learning_rate": 8.067613073621659e-06, + "loss": 0.553, + "step": 44000 + }, + { + "epoch": 0.9701902980970191, + "grad_norm": 1.5249767303466797, + "learning_rate": 8.063211180807749e-06, + "loss": 0.5533, + "step": 44100 + }, + { + "epoch": 0.972390276097239, + "grad_norm": 1.6304973363876343, + "learning_rate": 8.05880928799384e-06, + "loss": 0.5377, + "step": 44200 + }, + { + "epoch": 0.974590254097459, + "grad_norm": 1.8152045011520386, + "learning_rate": 8.054407395179928e-06, + "loss": 0.5284, + "step": 44300 + }, + { + "epoch": 0.976790232097679, + "grad_norm": 1.652199625968933, + "learning_rate": 8.050005502366018e-06, + "loss": 0.5448, + "step": 44400 + }, + { + "epoch": 0.978990210097899, + "grad_norm": 1.7338589429855347, + "learning_rate": 8.045603609552108e-06, + "loss": 0.5395, + "step": 44500 + }, + { + "epoch": 0.981190188098119, + "grad_norm": 1.5801849365234375, + "learning_rate": 8.041201716738199e-06, + "loss": 0.5297, + "step": 44600 + }, + { + "epoch": 0.9833901660983391, + "grad_norm": 2.031813621520996, + "learning_rate": 8.036799823924289e-06, + "loss": 0.5617, + "step": 44700 + }, + { + "epoch": 0.985590144098559, + "grad_norm": 1.934370756149292, + "learning_rate": 8.03239793111038e-06, + "loss": 0.5329, + "step": 44800 + }, + { + "epoch": 0.987790122098779, + "grad_norm": 1.849741816520691, + "learning_rate": 8.027996038296468e-06, + "loss": 0.5413, + "step": 44900 + }, + { + "epoch": 0.989990100098999, + "grad_norm": 1.757784366607666, + "learning_rate": 8.023594145482558e-06, + "loss": 0.5319, + "step": 45000 + }, + { + "epoch": 0.992190078099219, + "grad_norm": 1.6084299087524414, + "learning_rate": 8.019192252668648e-06, + "loss": 0.5465, + "step": 45100 + }, + { + "epoch": 0.994390056099439, + "grad_norm": 1.9279767274856567, + "learning_rate": 8.014790359854739e-06, + "loss": 0.5425, + "step": 45200 + }, + { + "epoch": 0.996590034099659, + "grad_norm": 1.5739712715148926, + "learning_rate": 8.010388467040829e-06, + "loss": 0.5471, + "step": 45300 + }, + { + "epoch": 0.998790012099879, + "grad_norm": 1.5087926387786865, + "learning_rate": 8.005986574226917e-06, + "loss": 0.5417, + "step": 45400 + }, + { + "epoch": 1.000989990100099, + "grad_norm": 2.411069393157959, + "learning_rate": 8.001584681413008e-06, + "loss": 0.5328, + "step": 45500 + }, + { + "epoch": 1.003189968100319, + "grad_norm": 2.560279607772827, + "learning_rate": 7.997182788599098e-06, + "loss": 0.5018, + "step": 45600 + }, + { + "epoch": 1.005389946100539, + "grad_norm": 1.8764352798461914, + "learning_rate": 7.992780895785188e-06, + "loss": 0.4947, + "step": 45700 + }, + { + "epoch": 1.007589924100759, + "grad_norm": 2.0531773567199707, + "learning_rate": 7.988379002971279e-06, + "loss": 0.5016, + "step": 45800 + }, + { + "epoch": 1.009789902100979, + "grad_norm": 2.1719043254852295, + "learning_rate": 7.983977110157369e-06, + "loss": 0.504, + "step": 45900 + }, + { + "epoch": 1.011989880101199, + "grad_norm": 1.8235334157943726, + "learning_rate": 7.979575217343457e-06, + "loss": 0.4967, + "step": 46000 + }, + { + "epoch": 1.014189858101419, + "grad_norm": 2.329827308654785, + "learning_rate": 7.975173324529548e-06, + "loss": 0.5121, + "step": 46100 + }, + { + "epoch": 1.016389836101639, + "grad_norm": 2.2712931632995605, + "learning_rate": 7.970771431715638e-06, + "loss": 0.4901, + "step": 46200 + }, + { + "epoch": 1.018589814101859, + "grad_norm": 1.9942501783370972, + "learning_rate": 7.966369538901728e-06, + "loss": 0.5052, + "step": 46300 + }, + { + "epoch": 1.020789792102079, + "grad_norm": 2.014451742172241, + "learning_rate": 7.961967646087819e-06, + "loss": 0.5117, + "step": 46400 + }, + { + "epoch": 1.022989770102299, + "grad_norm": 2.1809909343719482, + "learning_rate": 7.957565753273909e-06, + "loss": 0.5106, + "step": 46500 + }, + { + "epoch": 1.025189748102519, + "grad_norm": 1.6118221282958984, + "learning_rate": 7.953163860459997e-06, + "loss": 0.4959, + "step": 46600 + }, + { + "epoch": 1.027389726102739, + "grad_norm": 1.9853328466415405, + "learning_rate": 7.948761967646088e-06, + "loss": 0.5127, + "step": 46700 + }, + { + "epoch": 1.029589704102959, + "grad_norm": 2.3931078910827637, + "learning_rate": 7.944360074832178e-06, + "loss": 0.5084, + "step": 46800 + }, + { + "epoch": 1.031789682103179, + "grad_norm": 1.6679604053497314, + "learning_rate": 7.939958182018268e-06, + "loss": 0.4913, + "step": 46900 + }, + { + "epoch": 1.0339896601033989, + "grad_norm": 2.377412796020508, + "learning_rate": 7.935556289204359e-06, + "loss": 0.4915, + "step": 47000 + }, + { + "epoch": 1.0361896381036189, + "grad_norm": 2.0759618282318115, + "learning_rate": 7.931154396390449e-06, + "loss": 0.5011, + "step": 47100 + }, + { + "epoch": 1.038389616103839, + "grad_norm": 2.061979055404663, + "learning_rate": 7.926752503576537e-06, + "loss": 0.4945, + "step": 47200 + }, + { + "epoch": 1.040589594104059, + "grad_norm": 1.912423849105835, + "learning_rate": 7.92235061076263e-06, + "loss": 0.496, + "step": 47300 + }, + { + "epoch": 1.042789572104279, + "grad_norm": 2.3455774784088135, + "learning_rate": 7.91794871794872e-06, + "loss": 0.5063, + "step": 47400 + }, + { + "epoch": 1.044989550104499, + "grad_norm": 1.7976536750793457, + "learning_rate": 7.913546825134808e-06, + "loss": 0.5053, + "step": 47500 + }, + { + "epoch": 1.047189528104719, + "grad_norm": 2.056267023086548, + "learning_rate": 7.909144932320899e-06, + "loss": 0.4939, + "step": 47600 + }, + { + "epoch": 1.049389506104939, + "grad_norm": 2.216721534729004, + "learning_rate": 7.904743039506989e-06, + "loss": 0.5007, + "step": 47700 + }, + { + "epoch": 1.051589484105159, + "grad_norm": 1.4782536029815674, + "learning_rate": 7.90034114669308e-06, + "loss": 0.4765, + "step": 47800 + }, + { + "epoch": 1.053789462105379, + "grad_norm": 1.739716649055481, + "learning_rate": 7.89593925387917e-06, + "loss": 0.5245, + "step": 47900 + }, + { + "epoch": 1.055989440105599, + "grad_norm": 1.5695744752883911, + "learning_rate": 7.89153736106526e-06, + "loss": 0.511, + "step": 48000 + }, + { + "epoch": 1.058189418105819, + "grad_norm": 2.0835139751434326, + "learning_rate": 7.887135468251348e-06, + "loss": 0.4989, + "step": 48100 + }, + { + "epoch": 1.0603893961060389, + "grad_norm": 1.9040948152542114, + "learning_rate": 7.882733575437439e-06, + "loss": 0.5001, + "step": 48200 + }, + { + "epoch": 1.0625893741062589, + "grad_norm": 2.1570136547088623, + "learning_rate": 7.878331682623529e-06, + "loss": 0.5031, + "step": 48300 + }, + { + "epoch": 1.0647893521064788, + "grad_norm": 1.8248552083969116, + "learning_rate": 7.873929789809619e-06, + "loss": 0.504, + "step": 48400 + }, + { + "epoch": 1.0669893301066988, + "grad_norm": 1.8128606081008911, + "learning_rate": 7.86952789699571e-06, + "loss": 0.4825, + "step": 48500 + }, + { + "epoch": 1.069189308106919, + "grad_norm": 2.15380597114563, + "learning_rate": 7.8651260041818e-06, + "loss": 0.4843, + "step": 48600 + }, + { + "epoch": 1.071389286107139, + "grad_norm": 2.4410858154296875, + "learning_rate": 7.860724111367888e-06, + "loss": 0.4973, + "step": 48700 + }, + { + "epoch": 1.073589264107359, + "grad_norm": 1.9602640867233276, + "learning_rate": 7.856322218553979e-06, + "loss": 0.5039, + "step": 48800 + }, + { + "epoch": 1.075789242107579, + "grad_norm": 2.189321994781494, + "learning_rate": 7.851920325740069e-06, + "loss": 0.5002, + "step": 48900 + }, + { + "epoch": 1.077989220107799, + "grad_norm": 2.153059244155884, + "learning_rate": 7.847518432926159e-06, + "loss": 0.5074, + "step": 49000 + }, + { + "epoch": 1.080189198108019, + "grad_norm": 1.9804766178131104, + "learning_rate": 7.84311654011225e-06, + "loss": 0.4981, + "step": 49100 + }, + { + "epoch": 1.082389176108239, + "grad_norm": 2.228227376937866, + "learning_rate": 7.838714647298338e-06, + "loss": 0.5115, + "step": 49200 + }, + { + "epoch": 1.084589154108459, + "grad_norm": 2.639230489730835, + "learning_rate": 7.834312754484428e-06, + "loss": 0.4956, + "step": 49300 + }, + { + "epoch": 1.086789132108679, + "grad_norm": 2.2388269901275635, + "learning_rate": 7.829910861670519e-06, + "loss": 0.4957, + "step": 49400 + }, + { + "epoch": 1.0889891101088989, + "grad_norm": 2.2344448566436768, + "learning_rate": 7.825508968856609e-06, + "loss": 0.5191, + "step": 49500 + }, + { + "epoch": 1.0911890881091189, + "grad_norm": 2.1383955478668213, + "learning_rate": 7.821107076042699e-06, + "loss": 0.5035, + "step": 49600 + }, + { + "epoch": 1.0933890661093388, + "grad_norm": 2.0469112396240234, + "learning_rate": 7.81670518322879e-06, + "loss": 0.4991, + "step": 49700 + }, + { + "epoch": 1.0955890441095588, + "grad_norm": 2.091733694076538, + "learning_rate": 7.812303290414878e-06, + "loss": 0.5213, + "step": 49800 + }, + { + "epoch": 1.0977890221097788, + "grad_norm": 2.2485196590423584, + "learning_rate": 7.807901397600968e-06, + "loss": 0.5159, + "step": 49900 + }, + { + "epoch": 1.099989000109999, + "grad_norm": 2.335508108139038, + "learning_rate": 7.803499504787059e-06, + "loss": 0.5035, + "step": 50000 + }, + { + "epoch": 1.099989000109999, + "eval_loss": 0.579010546207428, + "eval_runtime": 378.8096, + "eval_samples_per_second": 158.391, + "eval_steps_per_second": 4.95, + "step": 50000 + }, + { + "epoch": 1.102188978110219, + "grad_norm": 2.1119778156280518, + "learning_rate": 7.799097611973149e-06, + "loss": 0.5081, + "step": 50100 + }, + { + "epoch": 1.104388956110439, + "grad_norm": 2.182777166366577, + "learning_rate": 7.794695719159239e-06, + "loss": 0.4925, + "step": 50200 + }, + { + "epoch": 1.106588934110659, + "grad_norm": 2.2675302028656006, + "learning_rate": 7.79029382634533e-06, + "loss": 0.4865, + "step": 50300 + }, + { + "epoch": 1.108788912110879, + "grad_norm": 1.858472228050232, + "learning_rate": 7.785891933531418e-06, + "loss": 0.5118, + "step": 50400 + }, + { + "epoch": 1.110988890111099, + "grad_norm": 1.8882789611816406, + "learning_rate": 7.781490040717508e-06, + "loss": 0.5087, + "step": 50500 + }, + { + "epoch": 1.113188868111319, + "grad_norm": 1.9170640707015991, + "learning_rate": 7.777088147903599e-06, + "loss": 0.491, + "step": 50600 + }, + { + "epoch": 1.1153888461115389, + "grad_norm": 1.9825174808502197, + "learning_rate": 7.772686255089689e-06, + "loss": 0.5072, + "step": 50700 + }, + { + "epoch": 1.1175888241117589, + "grad_norm": 2.3916232585906982, + "learning_rate": 7.768284362275779e-06, + "loss": 0.5111, + "step": 50800 + }, + { + "epoch": 1.1197888021119788, + "grad_norm": 2.069160223007202, + "learning_rate": 7.76388246946187e-06, + "loss": 0.4927, + "step": 50900 + }, + { + "epoch": 1.1219887801121988, + "grad_norm": 1.780382752418518, + "learning_rate": 7.75948057664796e-06, + "loss": 0.4959, + "step": 51000 + }, + { + "epoch": 1.1241887581124188, + "grad_norm": 2.5268094539642334, + "learning_rate": 7.75507868383405e-06, + "loss": 0.4975, + "step": 51100 + }, + { + "epoch": 1.1263887361126388, + "grad_norm": 1.9989362955093384, + "learning_rate": 7.75067679102014e-06, + "loss": 0.504, + "step": 51200 + }, + { + "epoch": 1.1285887141128588, + "grad_norm": 2.230954647064209, + "learning_rate": 7.746274898206229e-06, + "loss": 0.5172, + "step": 51300 + }, + { + "epoch": 1.1307886921130788, + "grad_norm": 2.2332351207733154, + "learning_rate": 7.741873005392319e-06, + "loss": 0.5026, + "step": 51400 + }, + { + "epoch": 1.132988670113299, + "grad_norm": 2.234415054321289, + "learning_rate": 7.73747111257841e-06, + "loss": 0.5169, + "step": 51500 + }, + { + "epoch": 1.135188648113519, + "grad_norm": 1.9074784517288208, + "learning_rate": 7.7330692197645e-06, + "loss": 0.4878, + "step": 51600 + }, + { + "epoch": 1.137388626113739, + "grad_norm": 1.9809048175811768, + "learning_rate": 7.72866732695059e-06, + "loss": 0.4794, + "step": 51700 + }, + { + "epoch": 1.139588604113959, + "grad_norm": 1.90762460231781, + "learning_rate": 7.72426543413668e-06, + "loss": 0.4996, + "step": 51800 + }, + { + "epoch": 1.1417885821141789, + "grad_norm": 2.3830220699310303, + "learning_rate": 7.719863541322769e-06, + "loss": 0.5028, + "step": 51900 + }, + { + "epoch": 1.1439885601143989, + "grad_norm": 2.052335023880005, + "learning_rate": 7.715461648508859e-06, + "loss": 0.5189, + "step": 52000 + }, + { + "epoch": 1.1461885381146188, + "grad_norm": 2.3055222034454346, + "learning_rate": 7.71105975569495e-06, + "loss": 0.5117, + "step": 52100 + }, + { + "epoch": 1.1483885161148388, + "grad_norm": 2.7478485107421875, + "learning_rate": 7.70665786288104e-06, + "loss": 0.503, + "step": 52200 + }, + { + "epoch": 1.1505884941150588, + "grad_norm": 1.8724684715270996, + "learning_rate": 7.70225597006713e-06, + "loss": 0.5017, + "step": 52300 + }, + { + "epoch": 1.1527884721152788, + "grad_norm": 2.1905338764190674, + "learning_rate": 7.69785407725322e-06, + "loss": 0.4995, + "step": 52400 + }, + { + "epoch": 1.1549884501154988, + "grad_norm": 2.169680118560791, + "learning_rate": 7.693452184439309e-06, + "loss": 0.5012, + "step": 52500 + }, + { + "epoch": 1.1571884281157188, + "grad_norm": 2.3531687259674072, + "learning_rate": 7.689050291625399e-06, + "loss": 0.4835, + "step": 52600 + }, + { + "epoch": 1.159388406115939, + "grad_norm": 1.9876978397369385, + "learning_rate": 7.68464839881149e-06, + "loss": 0.4949, + "step": 52700 + }, + { + "epoch": 1.161588384116159, + "grad_norm": 2.463718891143799, + "learning_rate": 7.68024650599758e-06, + "loss": 0.5121, + "step": 52800 + }, + { + "epoch": 1.163788362116379, + "grad_norm": 2.4976985454559326, + "learning_rate": 7.67584461318367e-06, + "loss": 0.4882, + "step": 52900 + }, + { + "epoch": 1.165988340116599, + "grad_norm": 1.968513011932373, + "learning_rate": 7.67144272036976e-06, + "loss": 0.5052, + "step": 53000 + }, + { + "epoch": 1.168188318116819, + "grad_norm": 1.998396396636963, + "learning_rate": 7.667040827555849e-06, + "loss": 0.4912, + "step": 53100 + }, + { + "epoch": 1.1703882961170389, + "grad_norm": 2.0211946964263916, + "learning_rate": 7.662638934741939e-06, + "loss": 0.5087, + "step": 53200 + }, + { + "epoch": 1.1725882741172589, + "grad_norm": 1.97858464717865, + "learning_rate": 7.65823704192803e-06, + "loss": 0.5015, + "step": 53300 + }, + { + "epoch": 1.1747882521174788, + "grad_norm": 2.1665027141571045, + "learning_rate": 7.65383514911412e-06, + "loss": 0.5088, + "step": 53400 + }, + { + "epoch": 1.1769882301176988, + "grad_norm": 2.3747305870056152, + "learning_rate": 7.64943325630021e-06, + "loss": 0.4971, + "step": 53500 + }, + { + "epoch": 1.1791882081179188, + "grad_norm": 2.0653445720672607, + "learning_rate": 7.645031363486299e-06, + "loss": 0.4999, + "step": 53600 + }, + { + "epoch": 1.1813881861181388, + "grad_norm": 2.0202314853668213, + "learning_rate": 7.640629470672389e-06, + "loss": 0.4857, + "step": 53700 + }, + { + "epoch": 1.1835881641183588, + "grad_norm": 2.1644513607025146, + "learning_rate": 7.636227577858479e-06, + "loss": 0.4925, + "step": 53800 + }, + { + "epoch": 1.1857881421185787, + "grad_norm": 2.2284882068634033, + "learning_rate": 7.63182568504457e-06, + "loss": 0.5076, + "step": 53900 + }, + { + "epoch": 1.1879881201187987, + "grad_norm": 1.9216992855072021, + "learning_rate": 7.62742379223066e-06, + "loss": 0.4937, + "step": 54000 + }, + { + "epoch": 1.1901880981190187, + "grad_norm": 2.151033401489258, + "learning_rate": 7.623021899416749e-06, + "loss": 0.5042, + "step": 54100 + }, + { + "epoch": 1.1923880761192387, + "grad_norm": 2.544735908508301, + "learning_rate": 7.618620006602839e-06, + "loss": 0.5016, + "step": 54200 + }, + { + "epoch": 1.194588054119459, + "grad_norm": 2.404811382293701, + "learning_rate": 7.6142181137889306e-06, + "loss": 0.4859, + "step": 54300 + }, + { + "epoch": 1.1967880321196789, + "grad_norm": 2.071399450302124, + "learning_rate": 7.60981622097502e-06, + "loss": 0.5009, + "step": 54400 + }, + { + "epoch": 1.1989880101198989, + "grad_norm": 2.0729258060455322, + "learning_rate": 7.60541432816111e-06, + "loss": 0.5068, + "step": 54500 + }, + { + "epoch": 1.2011879881201188, + "grad_norm": 1.9438556432724, + "learning_rate": 7.6010124353472006e-06, + "loss": 0.5151, + "step": 54600 + }, + { + "epoch": 1.2033879661203388, + "grad_norm": 2.3928163051605225, + "learning_rate": 7.59661054253329e-06, + "loss": 0.5152, + "step": 54700 + }, + { + "epoch": 1.2055879441205588, + "grad_norm": 2.0218889713287354, + "learning_rate": 7.59220864971938e-06, + "loss": 0.4935, + "step": 54800 + }, + { + "epoch": 1.2077879221207788, + "grad_norm": 2.0265040397644043, + "learning_rate": 7.5878067569054706e-06, + "loss": 0.4995, + "step": 54900 + }, + { + "epoch": 1.2099879001209988, + "grad_norm": 2.6148312091827393, + "learning_rate": 7.58340486409156e-06, + "loss": 0.5082, + "step": 55000 + }, + { + "epoch": 1.2121878781212188, + "grad_norm": 2.4383389949798584, + "learning_rate": 7.57900297127765e-06, + "loss": 0.4982, + "step": 55100 + }, + { + "epoch": 1.2143878561214387, + "grad_norm": 2.649778366088867, + "learning_rate": 7.5746010784637406e-06, + "loss": 0.4974, + "step": 55200 + }, + { + "epoch": 1.2165878341216587, + "grad_norm": 2.525026559829712, + "learning_rate": 7.57019918564983e-06, + "loss": 0.4953, + "step": 55300 + }, + { + "epoch": 1.2187878121218787, + "grad_norm": 2.795290470123291, + "learning_rate": 7.56579729283592e-06, + "loss": 0.5118, + "step": 55400 + }, + { + "epoch": 1.220987790122099, + "grad_norm": 1.8484504222869873, + "learning_rate": 7.5613954000220105e-06, + "loss": 0.4897, + "step": 55500 + }, + { + "epoch": 1.2231877681223189, + "grad_norm": 2.673802614212036, + "learning_rate": 7.5569935072081e-06, + "loss": 0.4856, + "step": 55600 + }, + { + "epoch": 1.2253877461225389, + "grad_norm": 2.250032663345337, + "learning_rate": 7.55259161439419e-06, + "loss": 0.4942, + "step": 55700 + }, + { + "epoch": 1.2275877241227588, + "grad_norm": 2.281285285949707, + "learning_rate": 7.5481897215802805e-06, + "loss": 0.492, + "step": 55800 + }, + { + "epoch": 1.2297877021229788, + "grad_norm": 2.1768269538879395, + "learning_rate": 7.54378782876637e-06, + "loss": 0.5014, + "step": 55900 + }, + { + "epoch": 1.2319876801231988, + "grad_norm": 2.172852039337158, + "learning_rate": 7.53938593595246e-06, + "loss": 0.5055, + "step": 56000 + }, + { + "epoch": 1.2341876581234188, + "grad_norm": 2.2055068016052246, + "learning_rate": 7.5349840431385505e-06, + "loss": 0.4994, + "step": 56100 + }, + { + "epoch": 1.2363876361236388, + "grad_norm": 2.2056238651275635, + "learning_rate": 7.53058215032464e-06, + "loss": 0.5082, + "step": 56200 + }, + { + "epoch": 1.2385876141238588, + "grad_norm": 1.8684000968933105, + "learning_rate": 7.52618025751073e-06, + "loss": 0.5001, + "step": 56300 + }, + { + "epoch": 1.2407875921240787, + "grad_norm": 1.8799563646316528, + "learning_rate": 7.52177836469682e-06, + "loss": 0.4863, + "step": 56400 + }, + { + "epoch": 1.2429875701242987, + "grad_norm": 2.0053553581237793, + "learning_rate": 7.51737647188291e-06, + "loss": 0.5019, + "step": 56500 + }, + { + "epoch": 1.2451875481245187, + "grad_norm": 2.526304244995117, + "learning_rate": 7.512974579069e-06, + "loss": 0.4783, + "step": 56600 + }, + { + "epoch": 1.2473875261247387, + "grad_norm": 2.2301254272460938, + "learning_rate": 7.50857268625509e-06, + "loss": 0.4975, + "step": 56700 + }, + { + "epoch": 1.2495875041249587, + "grad_norm": 1.8377426862716675, + "learning_rate": 7.50417079344118e-06, + "loss": 0.4929, + "step": 56800 + }, + { + "epoch": 1.2517874821251787, + "grad_norm": 2.6410109996795654, + "learning_rate": 7.49976890062727e-06, + "loss": 0.4816, + "step": 56900 + }, + { + "epoch": 1.2539874601253986, + "grad_norm": 2.0295798778533936, + "learning_rate": 7.49536700781336e-06, + "loss": 0.5038, + "step": 57000 + }, + { + "epoch": 1.2561874381256186, + "grad_norm": 2.7886478900909424, + "learning_rate": 7.49096511499945e-06, + "loss": 0.5147, + "step": 57100 + }, + { + "epoch": 1.2583874161258388, + "grad_norm": 2.330388307571411, + "learning_rate": 7.48656322218554e-06, + "loss": 0.4929, + "step": 57200 + }, + { + "epoch": 1.2605873941260588, + "grad_norm": 1.756525993347168, + "learning_rate": 7.48216132937163e-06, + "loss": 0.4873, + "step": 57300 + }, + { + "epoch": 1.2627873721262788, + "grad_norm": 1.7345948219299316, + "learning_rate": 7.47775943655772e-06, + "loss": 0.4906, + "step": 57400 + }, + { + "epoch": 1.2649873501264988, + "grad_norm": 2.1234254837036133, + "learning_rate": 7.47335754374381e-06, + "loss": 0.5082, + "step": 57500 + }, + { + "epoch": 1.2671873281267187, + "grad_norm": 1.7519376277923584, + "learning_rate": 7.4689556509299e-06, + "loss": 0.5061, + "step": 57600 + }, + { + "epoch": 1.2693873061269387, + "grad_norm": 2.4111804962158203, + "learning_rate": 7.464553758115991e-06, + "loss": 0.4903, + "step": 57700 + }, + { + "epoch": 1.2715872841271587, + "grad_norm": 1.9729013442993164, + "learning_rate": 7.460151865302081e-06, + "loss": 0.4881, + "step": 57800 + }, + { + "epoch": 1.2737872621273787, + "grad_norm": 2.7246460914611816, + "learning_rate": 7.455749972488171e-06, + "loss": 0.517, + "step": 57900 + }, + { + "epoch": 1.2759872401275987, + "grad_norm": 1.660434603691101, + "learning_rate": 7.451348079674261e-06, + "loss": 0.502, + "step": 58000 + }, + { + "epoch": 1.2781872181278187, + "grad_norm": 2.782742500305176, + "learning_rate": 7.446946186860351e-06, + "loss": 0.4985, + "step": 58100 + }, + { + "epoch": 1.2803871961280386, + "grad_norm": 2.264404296875, + "learning_rate": 7.4425442940464405e-06, + "loss": 0.5016, + "step": 58200 + }, + { + "epoch": 1.2825871741282588, + "grad_norm": 2.0111939907073975, + "learning_rate": 7.438142401232531e-06, + "loss": 0.4905, + "step": 58300 + }, + { + "epoch": 1.2847871521284788, + "grad_norm": 2.0050606727600098, + "learning_rate": 7.433740508418621e-06, + "loss": 0.4864, + "step": 58400 + }, + { + "epoch": 1.2869871301286988, + "grad_norm": 1.3107115030288696, + "learning_rate": 7.4293386156047105e-06, + "loss": 0.4915, + "step": 58500 + }, + { + "epoch": 1.2891871081289188, + "grad_norm": 1.8996055126190186, + "learning_rate": 7.424936722790801e-06, + "loss": 0.49, + "step": 58600 + }, + { + "epoch": 1.2913870861291388, + "grad_norm": 1.7696682214736938, + "learning_rate": 7.420534829976891e-06, + "loss": 0.4968, + "step": 58700 + }, + { + "epoch": 1.2935870641293588, + "grad_norm": 2.1315739154815674, + "learning_rate": 7.4161329371629805e-06, + "loss": 0.4916, + "step": 58800 + }, + { + "epoch": 1.2957870421295787, + "grad_norm": 2.2679789066314697, + "learning_rate": 7.411731044349071e-06, + "loss": 0.5021, + "step": 58900 + }, + { + "epoch": 1.2979870201297987, + "grad_norm": 2.128899097442627, + "learning_rate": 7.407329151535161e-06, + "loss": 0.5148, + "step": 59000 + }, + { + "epoch": 1.3001869981300187, + "grad_norm": 2.085585832595825, + "learning_rate": 7.4029272587212505e-06, + "loss": 0.4804, + "step": 59100 + }, + { + "epoch": 1.3023869761302387, + "grad_norm": 2.367190361022949, + "learning_rate": 7.398525365907341e-06, + "loss": 0.4921, + "step": 59200 + }, + { + "epoch": 1.3045869541304587, + "grad_norm": 2.3802804946899414, + "learning_rate": 7.394123473093431e-06, + "loss": 0.4974, + "step": 59300 + }, + { + "epoch": 1.3067869321306786, + "grad_norm": 2.332484483718872, + "learning_rate": 7.3897215802795205e-06, + "loss": 0.5115, + "step": 59400 + }, + { + "epoch": 1.3089869101308986, + "grad_norm": 2.1906321048736572, + "learning_rate": 7.385319687465611e-06, + "loss": 0.505, + "step": 59500 + }, + { + "epoch": 1.3111868881311186, + "grad_norm": 1.942108154296875, + "learning_rate": 7.380917794651701e-06, + "loss": 0.4937, + "step": 59600 + }, + { + "epoch": 1.3133868661313386, + "grad_norm": 2.0868446826934814, + "learning_rate": 7.3765159018377905e-06, + "loss": 0.4963, + "step": 59700 + }, + { + "epoch": 1.3155868441315586, + "grad_norm": 2.3469884395599365, + "learning_rate": 7.372114009023881e-06, + "loss": 0.5038, + "step": 59800 + }, + { + "epoch": 1.3177868221317786, + "grad_norm": 2.1203341484069824, + "learning_rate": 7.367712116209971e-06, + "loss": 0.4891, + "step": 59900 + }, + { + "epoch": 1.3199868001319988, + "grad_norm": 1.7752751111984253, + "learning_rate": 7.3633102233960605e-06, + "loss": 0.5036, + "step": 60000 + }, + { + "epoch": 1.3221867781322187, + "grad_norm": 2.311631441116333, + "learning_rate": 7.358908330582151e-06, + "loss": 0.5104, + "step": 60100 + }, + { + "epoch": 1.3243867561324387, + "grad_norm": 1.9225836992263794, + "learning_rate": 7.354506437768241e-06, + "loss": 0.4926, + "step": 60200 + }, + { + "epoch": 1.3265867341326587, + "grad_norm": 1.9772847890853882, + "learning_rate": 7.3501045449543305e-06, + "loss": 0.4923, + "step": 60300 + }, + { + "epoch": 1.3287867121328787, + "grad_norm": 1.6036473512649536, + "learning_rate": 7.345702652140421e-06, + "loss": 0.4955, + "step": 60400 + }, + { + "epoch": 1.3309866901330987, + "grad_norm": 1.8488271236419678, + "learning_rate": 7.34130075932651e-06, + "loss": 0.512, + "step": 60500 + }, + { + "epoch": 1.3331866681333187, + "grad_norm": 2.149338722229004, + "learning_rate": 7.3368988665126005e-06, + "loss": 0.4914, + "step": 60600 + }, + { + "epoch": 1.3353866461335386, + "grad_norm": 2.4873788356781006, + "learning_rate": 7.332496973698691e-06, + "loss": 0.4965, + "step": 60700 + }, + { + "epoch": 1.3375866241337586, + "grad_norm": 2.4446520805358887, + "learning_rate": 7.32809508088478e-06, + "loss": 0.4917, + "step": 60800 + }, + { + "epoch": 1.3397866021339786, + "grad_norm": 2.2292611598968506, + "learning_rate": 7.3236931880708705e-06, + "loss": 0.4876, + "step": 60900 + }, + { + "epoch": 1.3419865801341986, + "grad_norm": 2.0160257816314697, + "learning_rate": 7.319291295256961e-06, + "loss": 0.4875, + "step": 61000 + }, + { + "epoch": 1.3441865581344188, + "grad_norm": 2.0969207286834717, + "learning_rate": 7.31488940244305e-06, + "loss": 0.5031, + "step": 61100 + }, + { + "epoch": 1.3463865361346388, + "grad_norm": 2.283207416534424, + "learning_rate": 7.310487509629141e-06, + "loss": 0.4907, + "step": 61200 + }, + { + "epoch": 1.3485865141348587, + "grad_norm": 1.9769617319107056, + "learning_rate": 7.306085616815232e-06, + "loss": 0.4942, + "step": 61300 + }, + { + "epoch": 1.3507864921350787, + "grad_norm": 2.156163454055786, + "learning_rate": 7.301683724001322e-06, + "loss": 0.4992, + "step": 61400 + }, + { + "epoch": 1.3529864701352987, + "grad_norm": 1.6328924894332886, + "learning_rate": 7.297281831187411e-06, + "loss": 0.4861, + "step": 61500 + }, + { + "epoch": 1.3551864481355187, + "grad_norm": 2.365056276321411, + "learning_rate": 7.292879938373502e-06, + "loss": 0.4915, + "step": 61600 + }, + { + "epoch": 1.3573864261357387, + "grad_norm": 2.6308701038360596, + "learning_rate": 7.288478045559592e-06, + "loss": 0.4837, + "step": 61700 + }, + { + "epoch": 1.3595864041359587, + "grad_norm": 2.454827070236206, + "learning_rate": 7.284076152745681e-06, + "loss": 0.4921, + "step": 61800 + }, + { + "epoch": 1.3617863821361786, + "grad_norm": 2.19412899017334, + "learning_rate": 7.279674259931772e-06, + "loss": 0.501, + "step": 61900 + }, + { + "epoch": 1.3639863601363986, + "grad_norm": 2.183582305908203, + "learning_rate": 7.275272367117862e-06, + "loss": 0.4934, + "step": 62000 + }, + { + "epoch": 1.3661863381366186, + "grad_norm": 2.2355942726135254, + "learning_rate": 7.270870474303951e-06, + "loss": 0.5037, + "step": 62100 + }, + { + "epoch": 1.3683863161368386, + "grad_norm": 1.8665735721588135, + "learning_rate": 7.266468581490042e-06, + "loss": 0.5054, + "step": 62200 + }, + { + "epoch": 1.3705862941370586, + "grad_norm": 2.457763433456421, + "learning_rate": 7.262066688676131e-06, + "loss": 0.4986, + "step": 62300 + }, + { + "epoch": 1.3727862721372786, + "grad_norm": 2.2373385429382324, + "learning_rate": 7.257664795862221e-06, + "loss": 0.4807, + "step": 62400 + }, + { + "epoch": 1.3749862501374985, + "grad_norm": 2.129803419113159, + "learning_rate": 7.253262903048312e-06, + "loss": 0.4877, + "step": 62500 + }, + { + "epoch": 1.3771862281377185, + "grad_norm": 2.2858309745788574, + "learning_rate": 7.248861010234401e-06, + "loss": 0.492, + "step": 62600 + }, + { + "epoch": 1.3793862061379385, + "grad_norm": 2.4332919120788574, + "learning_rate": 7.244459117420491e-06, + "loss": 0.4907, + "step": 62700 + }, + { + "epoch": 1.3815861841381587, + "grad_norm": 1.7995531558990479, + "learning_rate": 7.240057224606582e-06, + "loss": 0.5037, + "step": 62800 + }, + { + "epoch": 1.3837861621383787, + "grad_norm": 2.672942876815796, + "learning_rate": 7.235655331792671e-06, + "loss": 0.4968, + "step": 62900 + }, + { + "epoch": 1.3859861401385987, + "grad_norm": 2.1194186210632324, + "learning_rate": 7.231253438978761e-06, + "loss": 0.4944, + "step": 63000 + }, + { + "epoch": 1.3881861181388186, + "grad_norm": 2.5758581161499023, + "learning_rate": 7.226851546164852e-06, + "loss": 0.4844, + "step": 63100 + }, + { + "epoch": 1.3903860961390386, + "grad_norm": 2.359781503677368, + "learning_rate": 7.222449653350941e-06, + "loss": 0.4847, + "step": 63200 + }, + { + "epoch": 1.3925860741392586, + "grad_norm": 2.3243279457092285, + "learning_rate": 7.218047760537031e-06, + "loss": 0.4986, + "step": 63300 + }, + { + "epoch": 1.3947860521394786, + "grad_norm": 2.4134695529937744, + "learning_rate": 7.213645867723122e-06, + "loss": 0.4961, + "step": 63400 + }, + { + "epoch": 1.3969860301396986, + "grad_norm": 2.3432512283325195, + "learning_rate": 7.209243974909211e-06, + "loss": 0.5028, + "step": 63500 + }, + { + "epoch": 1.3991860081399186, + "grad_norm": 2.474076747894287, + "learning_rate": 7.204842082095301e-06, + "loss": 0.5004, + "step": 63600 + }, + { + "epoch": 1.4013859861401385, + "grad_norm": 2.43440580368042, + "learning_rate": 7.200440189281392e-06, + "loss": 0.5031, + "step": 63700 + }, + { + "epoch": 1.4035859641403585, + "grad_norm": 2.1737067699432373, + "learning_rate": 7.196038296467481e-06, + "loss": 0.4871, + "step": 63800 + }, + { + "epoch": 1.4057859421405787, + "grad_norm": 1.9419715404510498, + "learning_rate": 7.191636403653571e-06, + "loss": 0.4903, + "step": 63900 + }, + { + "epoch": 1.4079859201407987, + "grad_norm": 2.1449568271636963, + "learning_rate": 7.187234510839662e-06, + "loss": 0.4819, + "step": 64000 + }, + { + "epoch": 1.4101858981410187, + "grad_norm": 2.1790225505828857, + "learning_rate": 7.182832618025751e-06, + "loss": 0.5155, + "step": 64100 + }, + { + "epoch": 1.4123858761412387, + "grad_norm": 2.4493134021759033, + "learning_rate": 7.178430725211841e-06, + "loss": 0.4922, + "step": 64200 + }, + { + "epoch": 1.4145858541414587, + "grad_norm": 2.250734806060791, + "learning_rate": 7.174028832397932e-06, + "loss": 0.4911, + "step": 64300 + }, + { + "epoch": 1.4167858321416786, + "grad_norm": 2.312277317047119, + "learning_rate": 7.169626939584021e-06, + "loss": 0.4884, + "step": 64400 + }, + { + "epoch": 1.4189858101418986, + "grad_norm": 2.0889904499053955, + "learning_rate": 7.165225046770111e-06, + "loss": 0.5023, + "step": 64500 + }, + { + "epoch": 1.4211857881421186, + "grad_norm": 2.2084124088287354, + "learning_rate": 7.160823153956201e-06, + "loss": 0.4974, + "step": 64600 + }, + { + "epoch": 1.4233857661423386, + "grad_norm": 2.046213150024414, + "learning_rate": 7.156421261142292e-06, + "loss": 0.4935, + "step": 64700 + }, + { + "epoch": 1.4255857441425586, + "grad_norm": 2.1457226276397705, + "learning_rate": 7.152019368328382e-06, + "loss": 0.4903, + "step": 64800 + }, + { + "epoch": 1.4277857221427785, + "grad_norm": 2.058285713195801, + "learning_rate": 7.1476174755144725e-06, + "loss": 0.5002, + "step": 64900 + }, + { + "epoch": 1.4299857001429985, + "grad_norm": 2.269285202026367, + "learning_rate": 7.143215582700562e-06, + "loss": 0.4891, + "step": 65000 + }, + { + "epoch": 1.4321856781432185, + "grad_norm": 2.030383586883545, + "learning_rate": 7.138813689886652e-06, + "loss": 0.5101, + "step": 65100 + }, + { + "epoch": 1.4343856561434385, + "grad_norm": 2.0629866123199463, + "learning_rate": 7.1344117970727425e-06, + "loss": 0.4931, + "step": 65200 + }, + { + "epoch": 1.4365856341436585, + "grad_norm": 2.064944267272949, + "learning_rate": 7.130009904258832e-06, + "loss": 0.4992, + "step": 65300 + }, + { + "epoch": 1.4387856121438785, + "grad_norm": 2.1032135486602783, + "learning_rate": 7.125608011444922e-06, + "loss": 0.4919, + "step": 65400 + }, + { + "epoch": 1.4409855901440984, + "grad_norm": 2.3275599479675293, + "learning_rate": 7.1212061186310125e-06, + "loss": 0.5119, + "step": 65500 + }, + { + "epoch": 1.4431855681443184, + "grad_norm": 2.2477211952209473, + "learning_rate": 7.116804225817102e-06, + "loss": 0.5092, + "step": 65600 + }, + { + "epoch": 1.4453855461445386, + "grad_norm": 1.8756898641586304, + "learning_rate": 7.112402333003192e-06, + "loss": 0.4977, + "step": 65700 + }, + { + "epoch": 1.4475855241447586, + "grad_norm": 2.839963436126709, + "learning_rate": 7.1080004401892825e-06, + "loss": 0.4939, + "step": 65800 + }, + { + "epoch": 1.4497855021449786, + "grad_norm": 1.8775593042373657, + "learning_rate": 7.103598547375372e-06, + "loss": 0.4851, + "step": 65900 + }, + { + "epoch": 1.4519854801451986, + "grad_norm": 2.1938886642456055, + "learning_rate": 7.099196654561462e-06, + "loss": 0.4797, + "step": 66000 + }, + { + "epoch": 1.4541854581454186, + "grad_norm": 2.063523769378662, + "learning_rate": 7.0947947617475525e-06, + "loss": 0.4949, + "step": 66100 + }, + { + "epoch": 1.4563854361456385, + "grad_norm": 2.156369924545288, + "learning_rate": 7.090392868933642e-06, + "loss": 0.4936, + "step": 66200 + }, + { + "epoch": 1.4585854141458585, + "grad_norm": 2.4886789321899414, + "learning_rate": 7.085990976119732e-06, + "loss": 0.4979, + "step": 66300 + }, + { + "epoch": 1.4607853921460785, + "grad_norm": 2.3196351528167725, + "learning_rate": 7.081589083305822e-06, + "loss": 0.5121, + "step": 66400 + }, + { + "epoch": 1.4629853701462985, + "grad_norm": 2.057623863220215, + "learning_rate": 7.077187190491912e-06, + "loss": 0.4827, + "step": 66500 + }, + { + "epoch": 1.4651853481465185, + "grad_norm": 1.9187816381454468, + "learning_rate": 7.072785297678002e-06, + "loss": 0.4972, + "step": 66600 + }, + { + "epoch": 1.4673853261467387, + "grad_norm": 1.9243098497390747, + "learning_rate": 7.068383404864092e-06, + "loss": 0.48, + "step": 66700 + }, + { + "epoch": 1.4695853041469586, + "grad_norm": 2.221501111984253, + "learning_rate": 7.063981512050182e-06, + "loss": 0.4817, + "step": 66800 + }, + { + "epoch": 1.4717852821471786, + "grad_norm": 2.145901679992676, + "learning_rate": 7.059579619236272e-06, + "loss": 0.4974, + "step": 66900 + }, + { + "epoch": 1.4739852601473986, + "grad_norm": 2.7018229961395264, + "learning_rate": 7.055177726422362e-06, + "loss": 0.4776, + "step": 67000 + }, + { + "epoch": 1.4761852381476186, + "grad_norm": 1.826542854309082, + "learning_rate": 7.050775833608452e-06, + "loss": 0.494, + "step": 67100 + }, + { + "epoch": 1.4783852161478386, + "grad_norm": 2.528482437133789, + "learning_rate": 7.046373940794542e-06, + "loss": 0.4804, + "step": 67200 + }, + { + "epoch": 1.4805851941480586, + "grad_norm": 2.3805463314056396, + "learning_rate": 7.041972047980632e-06, + "loss": 0.5, + "step": 67300 + }, + { + "epoch": 1.4827851721482785, + "grad_norm": 2.379004716873169, + "learning_rate": 7.037570155166722e-06, + "loss": 0.5008, + "step": 67400 + }, + { + "epoch": 1.4849851501484985, + "grad_norm": 2.351308584213257, + "learning_rate": 7.033168262352812e-06, + "loss": 0.4917, + "step": 67500 + }, + { + "epoch": 1.4871851281487185, + "grad_norm": 2.390312910079956, + "learning_rate": 7.028766369538902e-06, + "loss": 0.4962, + "step": 67600 + }, + { + "epoch": 1.4893851061489385, + "grad_norm": 2.4329919815063477, + "learning_rate": 7.024364476724992e-06, + "loss": 0.4877, + "step": 67700 + }, + { + "epoch": 1.4915850841491585, + "grad_norm": 2.452253580093384, + "learning_rate": 7.019962583911082e-06, + "loss": 0.4908, + "step": 67800 + }, + { + "epoch": 1.4937850621493785, + "grad_norm": 2.1782665252685547, + "learning_rate": 7.015560691097172e-06, + "loss": 0.4804, + "step": 67900 + }, + { + "epoch": 1.4959850401495984, + "grad_norm": 2.0464863777160645, + "learning_rate": 7.011158798283262e-06, + "loss": 0.4947, + "step": 68000 + }, + { + "epoch": 1.4981850181498184, + "grad_norm": 1.713578701019287, + "learning_rate": 7.006756905469353e-06, + "loss": 0.4875, + "step": 68100 + }, + { + "epoch": 1.5003849961500384, + "grad_norm": 2.025834560394287, + "learning_rate": 7.002355012655443e-06, + "loss": 0.5027, + "step": 68200 + }, + { + "epoch": 1.5025849741502584, + "grad_norm": 2.509138822555542, + "learning_rate": 6.997953119841533e-06, + "loss": 0.4822, + "step": 68300 + }, + { + "epoch": 1.5047849521504784, + "grad_norm": 2.0234317779541016, + "learning_rate": 6.993551227027623e-06, + "loss": 0.4975, + "step": 68400 + }, + { + "epoch": 1.5069849301506983, + "grad_norm": 2.465769052505493, + "learning_rate": 6.9891493342137125e-06, + "loss": 0.5012, + "step": 68500 + }, + { + "epoch": 1.5091849081509183, + "grad_norm": 2.5200085639953613, + "learning_rate": 6.984747441399803e-06, + "loss": 0.5017, + "step": 68600 + }, + { + "epoch": 1.5113848861511385, + "grad_norm": 2.2190017700195312, + "learning_rate": 6.980345548585893e-06, + "loss": 0.4898, + "step": 68700 + }, + { + "epoch": 1.5135848641513585, + "grad_norm": 2.2302262783050537, + "learning_rate": 6.9759436557719825e-06, + "loss": 0.4989, + "step": 68800 + }, + { + "epoch": 1.5157848421515785, + "grad_norm": 2.4511725902557373, + "learning_rate": 6.971541762958073e-06, + "loss": 0.4934, + "step": 68900 + }, + { + "epoch": 1.5179848201517985, + "grad_norm": 2.3731210231781006, + "learning_rate": 6.967139870144163e-06, + "loss": 0.4724, + "step": 69000 + }, + { + "epoch": 1.5201847981520185, + "grad_norm": 2.2834906578063965, + "learning_rate": 6.9627379773302525e-06, + "loss": 0.4833, + "step": 69100 + }, + { + "epoch": 1.5223847761522384, + "grad_norm": 2.483689785003662, + "learning_rate": 6.958336084516343e-06, + "loss": 0.4923, + "step": 69200 + }, + { + "epoch": 1.5245847541524584, + "grad_norm": 2.316864490509033, + "learning_rate": 6.953934191702433e-06, + "loss": 0.5233, + "step": 69300 + }, + { + "epoch": 1.5267847321526786, + "grad_norm": 2.1905770301818848, + "learning_rate": 6.9495322988885225e-06, + "loss": 0.5233, + "step": 69400 + }, + { + "epoch": 1.5289847101528986, + "grad_norm": 2.5095105171203613, + "learning_rate": 6.945130406074613e-06, + "loss": 0.4927, + "step": 69500 + }, + { + "epoch": 1.5311846881531186, + "grad_norm": 2.210827112197876, + "learning_rate": 6.940728513260703e-06, + "loss": 0.4965, + "step": 69600 + }, + { + "epoch": 1.5333846661533386, + "grad_norm": 2.6142313480377197, + "learning_rate": 6.9363266204467925e-06, + "loss": 0.5025, + "step": 69700 + }, + { + "epoch": 1.5355846441535586, + "grad_norm": 2.3923892974853516, + "learning_rate": 6.931924727632883e-06, + "loss": 0.4793, + "step": 69800 + }, + { + "epoch": 1.5377846221537785, + "grad_norm": 2.1831846237182617, + "learning_rate": 6.927522834818973e-06, + "loss": 0.4935, + "step": 69900 + }, + { + "epoch": 1.5399846001539985, + "grad_norm": 2.030944347381592, + "learning_rate": 6.9231209420050625e-06, + "loss": 0.494, + "step": 70000 + }, + { + "epoch": 1.5421845781542185, + "grad_norm": 2.089087724685669, + "learning_rate": 6.918719049191153e-06, + "loss": 0.4989, + "step": 70100 + }, + { + "epoch": 1.5443845561544385, + "grad_norm": 2.7058706283569336, + "learning_rate": 6.914317156377243e-06, + "loss": 0.4982, + "step": 70200 + }, + { + "epoch": 1.5465845341546585, + "grad_norm": 2.312584638595581, + "learning_rate": 6.9099152635633325e-06, + "loss": 0.4981, + "step": 70300 + }, + { + "epoch": 1.5487845121548784, + "grad_norm": 2.5172085762023926, + "learning_rate": 6.905513370749423e-06, + "loss": 0.4871, + "step": 70400 + }, + { + "epoch": 1.5509844901550984, + "grad_norm": 2.035313367843628, + "learning_rate": 6.901111477935512e-06, + "loss": 0.4859, + "step": 70500 + }, + { + "epoch": 1.5531844681553184, + "grad_norm": 2.3374691009521484, + "learning_rate": 6.8967095851216025e-06, + "loss": 0.4831, + "step": 70600 + }, + { + "epoch": 1.5553844461555384, + "grad_norm": 2.2027342319488525, + "learning_rate": 6.892307692307693e-06, + "loss": 0.4974, + "step": 70700 + }, + { + "epoch": 1.5575844241557584, + "grad_norm": 2.4372105598449707, + "learning_rate": 6.887905799493782e-06, + "loss": 0.4902, + "step": 70800 + }, + { + "epoch": 1.5597844021559784, + "grad_norm": 2.320554256439209, + "learning_rate": 6.8835039066798725e-06, + "loss": 0.4917, + "step": 70900 + }, + { + "epoch": 1.5619843801561983, + "grad_norm": 2.323988437652588, + "learning_rate": 6.879102013865963e-06, + "loss": 0.5034, + "step": 71000 + }, + { + "epoch": 1.5641843581564183, + "grad_norm": 2.111454725265503, + "learning_rate": 6.874700121052052e-06, + "loss": 0.492, + "step": 71100 + }, + { + "epoch": 1.5663843361566383, + "grad_norm": 2.664884328842163, + "learning_rate": 6.8702982282381425e-06, + "loss": 0.4982, + "step": 71200 + }, + { + "epoch": 1.5685843141568583, + "grad_norm": 1.9500539302825928, + "learning_rate": 6.865896335424233e-06, + "loss": 0.5147, + "step": 71300 + }, + { + "epoch": 1.5707842921570783, + "grad_norm": 2.3592636585235596, + "learning_rate": 6.861494442610322e-06, + "loss": 0.4825, + "step": 71400 + }, + { + "epoch": 1.5729842701572985, + "grad_norm": 2.4548308849334717, + "learning_rate": 6.8570925497964125e-06, + "loss": 0.4949, + "step": 71500 + }, + { + "epoch": 1.5751842481575185, + "grad_norm": 2.971724033355713, + "learning_rate": 6.852690656982504e-06, + "loss": 0.4945, + "step": 71600 + }, + { + "epoch": 1.5773842261577384, + "grad_norm": 2.399245023727417, + "learning_rate": 6.848288764168594e-06, + "loss": 0.4888, + "step": 71700 + }, + { + "epoch": 1.5795842041579584, + "grad_norm": 2.2702841758728027, + "learning_rate": 6.843886871354683e-06, + "loss": 0.49, + "step": 71800 + }, + { + "epoch": 1.5817841821581784, + "grad_norm": 1.9252210855484009, + "learning_rate": 6.839484978540774e-06, + "loss": 0.494, + "step": 71900 + }, + { + "epoch": 1.5839841601583984, + "grad_norm": 2.4878454208374023, + "learning_rate": 6.835083085726864e-06, + "loss": 0.4984, + "step": 72000 + }, + { + "epoch": 1.5861841381586184, + "grad_norm": 2.035708427429199, + "learning_rate": 6.830681192912953e-06, + "loss": 0.4825, + "step": 72100 + }, + { + "epoch": 1.5883841161588386, + "grad_norm": 2.55355167388916, + "learning_rate": 6.826279300099044e-06, + "loss": 0.5056, + "step": 72200 + }, + { + "epoch": 1.5905840941590585, + "grad_norm": 2.4391555786132812, + "learning_rate": 6.821877407285133e-06, + "loss": 0.4928, + "step": 72300 + }, + { + "epoch": 1.5927840721592785, + "grad_norm": 2.2338058948516846, + "learning_rate": 6.817475514471223e-06, + "loss": 0.4874, + "step": 72400 + }, + { + "epoch": 1.5949840501594985, + "grad_norm": 2.7937569618225098, + "learning_rate": 6.813073621657314e-06, + "loss": 0.477, + "step": 72500 + }, + { + "epoch": 1.5971840281597185, + "grad_norm": 2.2559831142425537, + "learning_rate": 6.808671728843403e-06, + "loss": 0.501, + "step": 72600 + }, + { + "epoch": 1.5993840061599385, + "grad_norm": 2.1428000926971436, + "learning_rate": 6.804269836029493e-06, + "loss": 0.4872, + "step": 72700 + }, + { + "epoch": 1.6015839841601585, + "grad_norm": 2.306943655014038, + "learning_rate": 6.799867943215584e-06, + "loss": 0.5002, + "step": 72800 + }, + { + "epoch": 1.6037839621603784, + "grad_norm": 2.3396975994110107, + "learning_rate": 6.795466050401673e-06, + "loss": 0.4951, + "step": 72900 + }, + { + "epoch": 1.6059839401605984, + "grad_norm": 1.8894736766815186, + "learning_rate": 6.791064157587763e-06, + "loss": 0.4872, + "step": 73000 + }, + { + "epoch": 1.6081839181608184, + "grad_norm": 2.0049326419830322, + "learning_rate": 6.786662264773854e-06, + "loss": 0.4877, + "step": 73100 + }, + { + "epoch": 1.6103838961610384, + "grad_norm": 2.3615005016326904, + "learning_rate": 6.782260371959943e-06, + "loss": 0.4925, + "step": 73200 + }, + { + "epoch": 1.6125838741612584, + "grad_norm": 2.386545419692993, + "learning_rate": 6.777858479146033e-06, + "loss": 0.4881, + "step": 73300 + }, + { + "epoch": 1.6147838521614784, + "grad_norm": 2.3752076625823975, + "learning_rate": 6.773456586332124e-06, + "loss": 0.4813, + "step": 73400 + }, + { + "epoch": 1.6169838301616983, + "grad_norm": 2.156837224960327, + "learning_rate": 6.769054693518213e-06, + "loss": 0.4793, + "step": 73500 + }, + { + "epoch": 1.6191838081619183, + "grad_norm": 2.788848638534546, + "learning_rate": 6.764652800704303e-06, + "loss": 0.4946, + "step": 73600 + }, + { + "epoch": 1.6213837861621383, + "grad_norm": 2.1992275714874268, + "learning_rate": 6.760250907890394e-06, + "loss": 0.5019, + "step": 73700 + }, + { + "epoch": 1.6235837641623583, + "grad_norm": 2.664424419403076, + "learning_rate": 6.755849015076483e-06, + "loss": 0.4885, + "step": 73800 + }, + { + "epoch": 1.6257837421625783, + "grad_norm": 2.3380892276763916, + "learning_rate": 6.751447122262573e-06, + "loss": 0.4947, + "step": 73900 + }, + { + "epoch": 1.6279837201627982, + "grad_norm": 2.3588438034057617, + "learning_rate": 6.747045229448664e-06, + "loss": 0.4652, + "step": 74000 + }, + { + "epoch": 1.6301836981630182, + "grad_norm": 2.6669723987579346, + "learning_rate": 6.742643336634753e-06, + "loss": 0.491, + "step": 74100 + }, + { + "epoch": 1.6323836761632382, + "grad_norm": 2.4595651626586914, + "learning_rate": 6.738241443820843e-06, + "loss": 0.501, + "step": 74200 + }, + { + "epoch": 1.6345836541634584, + "grad_norm": 2.2686636447906494, + "learning_rate": 6.733839551006934e-06, + "loss": 0.482, + "step": 74300 + }, + { + "epoch": 1.6367836321636784, + "grad_norm": 2.4227776527404785, + "learning_rate": 6.729437658193023e-06, + "loss": 0.4958, + "step": 74400 + }, + { + "epoch": 1.6389836101638984, + "grad_norm": 1.9847477674484253, + "learning_rate": 6.725035765379113e-06, + "loss": 0.4834, + "step": 74500 + }, + { + "epoch": 1.6411835881641184, + "grad_norm": 2.6502370834350586, + "learning_rate": 6.720633872565203e-06, + "loss": 0.4815, + "step": 74600 + }, + { + "epoch": 1.6433835661643383, + "grad_norm": 2.2831785678863525, + "learning_rate": 6.716231979751293e-06, + "loss": 0.4826, + "step": 74700 + }, + { + "epoch": 1.6455835441645583, + "grad_norm": 1.8865406513214111, + "learning_rate": 6.711830086937383e-06, + "loss": 0.4986, + "step": 74800 + }, + { + "epoch": 1.6477835221647783, + "grad_norm": 2.026791572570801, + "learning_rate": 6.707428194123473e-06, + "loss": 0.4872, + "step": 74900 + }, + { + "epoch": 1.6499835001649985, + "grad_norm": 2.772639036178589, + "learning_rate": 6.703026301309563e-06, + "loss": 0.4891, + "step": 75000 + }, + { + "epoch": 1.6521834781652185, + "grad_norm": 2.4932167530059814, + "learning_rate": 6.698624408495654e-06, + "loss": 0.4868, + "step": 75100 + }, + { + "epoch": 1.6543834561654385, + "grad_norm": 2.5153396129608154, + "learning_rate": 6.6942225156817445e-06, + "loss": 0.5073, + "step": 75200 + }, + { + "epoch": 1.6565834341656585, + "grad_norm": 1.7845731973648071, + "learning_rate": 6.689820622867834e-06, + "loss": 0.496, + "step": 75300 + }, + { + "epoch": 1.6587834121658784, + "grad_norm": 2.392333745956421, + "learning_rate": 6.685418730053924e-06, + "loss": 0.5044, + "step": 75400 + }, + { + "epoch": 1.6609833901660984, + "grad_norm": 2.624262809753418, + "learning_rate": 6.6810168372400145e-06, + "loss": 0.5196, + "step": 75500 + }, + { + "epoch": 1.6631833681663184, + "grad_norm": 2.421013355255127, + "learning_rate": 6.676614944426104e-06, + "loss": 0.4938, + "step": 75600 + }, + { + "epoch": 1.6653833461665384, + "grad_norm": 2.1836936473846436, + "learning_rate": 6.672213051612194e-06, + "loss": 0.4735, + "step": 75700 + }, + { + "epoch": 1.6675833241667584, + "grad_norm": 2.523780345916748, + "learning_rate": 6.6678111587982845e-06, + "loss": 0.4868, + "step": 75800 + }, + { + "epoch": 1.6697833021669783, + "grad_norm": 3.20668363571167, + "learning_rate": 6.663409265984374e-06, + "loss": 0.4902, + "step": 75900 + }, + { + "epoch": 1.6719832801671983, + "grad_norm": 2.6450743675231934, + "learning_rate": 6.659007373170464e-06, + "loss": 0.4852, + "step": 76000 + }, + { + "epoch": 1.6741832581674183, + "grad_norm": 2.3257484436035156, + "learning_rate": 6.6546054803565545e-06, + "loss": 0.4913, + "step": 76100 + }, + { + "epoch": 1.6763832361676383, + "grad_norm": 1.7676602602005005, + "learning_rate": 6.650203587542644e-06, + "loss": 0.5051, + "step": 76200 + }, + { + "epoch": 1.6785832141678583, + "grad_norm": 2.2192280292510986, + "learning_rate": 6.645801694728734e-06, + "loss": 0.4959, + "step": 76300 + }, + { + "epoch": 1.6807831921680783, + "grad_norm": 2.4453659057617188, + "learning_rate": 6.641399801914824e-06, + "loss": 0.4841, + "step": 76400 + }, + { + "epoch": 1.6829831701682982, + "grad_norm": 1.9458132982254028, + "learning_rate": 6.636997909100914e-06, + "loss": 0.4911, + "step": 76500 + }, + { + "epoch": 1.6851831481685182, + "grad_norm": 2.2809267044067383, + "learning_rate": 6.632596016287004e-06, + "loss": 0.4871, + "step": 76600 + }, + { + "epoch": 1.6873831261687382, + "grad_norm": 2.630840301513672, + "learning_rate": 6.628194123473094e-06, + "loss": 0.4813, + "step": 76700 + }, + { + "epoch": 1.6895831041689582, + "grad_norm": 2.8288991451263428, + "learning_rate": 6.623792230659184e-06, + "loss": 0.4918, + "step": 76800 + }, + { + "epoch": 1.6917830821691782, + "grad_norm": 2.220552921295166, + "learning_rate": 6.619390337845274e-06, + "loss": 0.4958, + "step": 76900 + }, + { + "epoch": 1.6939830601693981, + "grad_norm": 2.3790931701660156, + "learning_rate": 6.614988445031364e-06, + "loss": 0.5098, + "step": 77000 + }, + { + "epoch": 1.6961830381696184, + "grad_norm": 2.605365753173828, + "learning_rate": 6.610586552217454e-06, + "loss": 0.4999, + "step": 77100 + }, + { + "epoch": 1.6983830161698383, + "grad_norm": 2.526428461074829, + "learning_rate": 6.606184659403544e-06, + "loss": 0.5008, + "step": 77200 + }, + { + "epoch": 1.7005829941700583, + "grad_norm": 2.2195465564727783, + "learning_rate": 6.601782766589634e-06, + "loss": 0.4846, + "step": 77300 + }, + { + "epoch": 1.7027829721702783, + "grad_norm": 2.925656318664551, + "learning_rate": 6.597380873775724e-06, + "loss": 0.4773, + "step": 77400 + }, + { + "epoch": 1.7049829501704983, + "grad_norm": 2.5258848667144775, + "learning_rate": 6.592978980961814e-06, + "loss": 0.4972, + "step": 77500 + }, + { + "epoch": 1.7071829281707183, + "grad_norm": 3.0461318492889404, + "learning_rate": 6.588577088147904e-06, + "loss": 0.4857, + "step": 77600 + }, + { + "epoch": 1.7093829061709382, + "grad_norm": 2.3932976722717285, + "learning_rate": 6.584175195333994e-06, + "loss": 0.4999, + "step": 77700 + }, + { + "epoch": 1.7115828841711584, + "grad_norm": 2.044865369796753, + "learning_rate": 6.579773302520084e-06, + "loss": 0.4898, + "step": 77800 + }, + { + "epoch": 1.7137828621713784, + "grad_norm": 2.366441011428833, + "learning_rate": 6.575371409706174e-06, + "loss": 0.4786, + "step": 77900 + }, + { + "epoch": 1.7159828401715984, + "grad_norm": 2.57084584236145, + "learning_rate": 6.570969516892264e-06, + "loss": 0.4766, + "step": 78000 + }, + { + "epoch": 1.7181828181718184, + "grad_norm": 2.560520887374878, + "learning_rate": 6.566567624078354e-06, + "loss": 0.4891, + "step": 78100 + }, + { + "epoch": 1.7203827961720384, + "grad_norm": 2.1307547092437744, + "learning_rate": 6.562165731264444e-06, + "loss": 0.4852, + "step": 78200 + }, + { + "epoch": 1.7225827741722584, + "grad_norm": 2.4924020767211914, + "learning_rate": 6.557763838450534e-06, + "loss": 0.4836, + "step": 78300 + }, + { + "epoch": 1.7247827521724783, + "grad_norm": 2.323122978210449, + "learning_rate": 6.553361945636624e-06, + "loss": 0.4926, + "step": 78400 + }, + { + "epoch": 1.7269827301726983, + "grad_norm": 2.1391868591308594, + "learning_rate": 6.5489600528227145e-06, + "loss": 0.4974, + "step": 78500 + }, + { + "epoch": 1.7291827081729183, + "grad_norm": 2.2388463020324707, + "learning_rate": 6.544558160008805e-06, + "loss": 0.4825, + "step": 78600 + }, + { + "epoch": 1.7313826861731383, + "grad_norm": 2.617159843444824, + "learning_rate": 6.540156267194895e-06, + "loss": 0.4969, + "step": 78700 + }, + { + "epoch": 1.7335826641733583, + "grad_norm": 1.9445505142211914, + "learning_rate": 6.5357543743809845e-06, + "loss": 0.494, + "step": 78800 + }, + { + "epoch": 1.7357826421735782, + "grad_norm": 1.8033205270767212, + "learning_rate": 6.531352481567075e-06, + "loss": 0.4901, + "step": 78900 + }, + { + "epoch": 1.7379826201737982, + "grad_norm": 2.480191469192505, + "learning_rate": 6.526950588753165e-06, + "loss": 0.4756, + "step": 79000 + }, + { + "epoch": 1.7401825981740182, + "grad_norm": 2.203779697418213, + "learning_rate": 6.5225486959392545e-06, + "loss": 0.4949, + "step": 79100 + }, + { + "epoch": 1.7423825761742382, + "grad_norm": 2.6420180797576904, + "learning_rate": 6.518146803125345e-06, + "loss": 0.476, + "step": 79200 + }, + { + "epoch": 1.7445825541744582, + "grad_norm": 2.4949381351470947, + "learning_rate": 6.513744910311435e-06, + "loss": 0.4805, + "step": 79300 + }, + { + "epoch": 1.7467825321746782, + "grad_norm": 1.6507716178894043, + "learning_rate": 6.5093430174975245e-06, + "loss": 0.4928, + "step": 79400 + }, + { + "epoch": 1.7489825101748981, + "grad_norm": 2.849067211151123, + "learning_rate": 6.504941124683615e-06, + "loss": 0.4879, + "step": 79500 + }, + { + "epoch": 1.7511824881751181, + "grad_norm": 2.404705047607422, + "learning_rate": 6.500539231869705e-06, + "loss": 0.4761, + "step": 79600 + }, + { + "epoch": 1.753382466175338, + "grad_norm": 2.653310537338257, + "learning_rate": 6.4961373390557945e-06, + "loss": 0.5017, + "step": 79700 + }, + { + "epoch": 1.755582444175558, + "grad_norm": 2.31355619430542, + "learning_rate": 6.491735446241885e-06, + "loss": 0.4802, + "step": 79800 + }, + { + "epoch": 1.7577824221757783, + "grad_norm": 2.361945867538452, + "learning_rate": 6.487333553427975e-06, + "loss": 0.4816, + "step": 79900 + }, + { + "epoch": 1.7599824001759983, + "grad_norm": 2.199768304824829, + "learning_rate": 6.4829316606140645e-06, + "loss": 0.4632, + "step": 80000 + }, + { + "epoch": 1.7621823781762183, + "grad_norm": 1.8634425401687622, + "learning_rate": 6.478529767800155e-06, + "loss": 0.4909, + "step": 80100 + }, + { + "epoch": 1.7643823561764382, + "grad_norm": 2.742694616317749, + "learning_rate": 6.474127874986245e-06, + "loss": 0.4939, + "step": 80200 + }, + { + "epoch": 1.7665823341766582, + "grad_norm": 2.8734514713287354, + "learning_rate": 6.4697259821723345e-06, + "loss": 0.4917, + "step": 80300 + }, + { + "epoch": 1.7687823121768782, + "grad_norm": 2.59197735786438, + "learning_rate": 6.465324089358425e-06, + "loss": 0.4781, + "step": 80400 + }, + { + "epoch": 1.7709822901770982, + "grad_norm": 2.3575127124786377, + "learning_rate": 6.460922196544514e-06, + "loss": 0.4801, + "step": 80500 + }, + { + "epoch": 1.7731822681773184, + "grad_norm": 2.599222421646118, + "learning_rate": 6.4565203037306045e-06, + "loss": 0.4891, + "step": 80600 + }, + { + "epoch": 1.7753822461775384, + "grad_norm": 2.7138659954071045, + "learning_rate": 6.452118410916695e-06, + "loss": 0.491, + "step": 80700 + }, + { + "epoch": 1.7775822241777584, + "grad_norm": 2.467128038406372, + "learning_rate": 6.447716518102784e-06, + "loss": 0.4984, + "step": 80800 + }, + { + "epoch": 1.7797822021779783, + "grad_norm": 2.4047677516937256, + "learning_rate": 6.4433146252888745e-06, + "loss": 0.4756, + "step": 80900 + }, + { + "epoch": 1.7819821801781983, + "grad_norm": 2.0229098796844482, + "learning_rate": 6.438912732474965e-06, + "loss": 0.4792, + "step": 81000 + }, + { + "epoch": 1.7841821581784183, + "grad_norm": 2.463090658187866, + "learning_rate": 6.434510839661054e-06, + "loss": 0.4824, + "step": 81100 + }, + { + "epoch": 1.7863821361786383, + "grad_norm": 2.3522398471832275, + "learning_rate": 6.4301089468471445e-06, + "loss": 0.4938, + "step": 81200 + }, + { + "epoch": 1.7885821141788583, + "grad_norm": 2.1566226482391357, + "learning_rate": 6.425707054033235e-06, + "loss": 0.4858, + "step": 81300 + }, + { + "epoch": 1.7907820921790782, + "grad_norm": 2.452099084854126, + "learning_rate": 6.421305161219324e-06, + "loss": 0.4879, + "step": 81400 + }, + { + "epoch": 1.7929820701792982, + "grad_norm": 2.3728647232055664, + "learning_rate": 6.4169032684054144e-06, + "loss": 0.499, + "step": 81500 + }, + { + "epoch": 1.7951820481795182, + "grad_norm": 2.499342441558838, + "learning_rate": 6.412501375591505e-06, + "loss": 0.4799, + "step": 81600 + }, + { + "epoch": 1.7973820261797382, + "grad_norm": 2.281799077987671, + "learning_rate": 6.408099482777594e-06, + "loss": 0.4823, + "step": 81700 + }, + { + "epoch": 1.7995820041799582, + "grad_norm": 2.5670275688171387, + "learning_rate": 6.4036975899636844e-06, + "loss": 0.4956, + "step": 81800 + }, + { + "epoch": 1.8017819821801782, + "grad_norm": 2.830780506134033, + "learning_rate": 6.399295697149775e-06, + "loss": 0.4909, + "step": 81900 + }, + { + "epoch": 1.8039819601803981, + "grad_norm": 2.3581204414367676, + "learning_rate": 6.394893804335866e-06, + "loss": 0.4906, + "step": 82000 + }, + { + "epoch": 1.8061819381806181, + "grad_norm": 2.6061856746673584, + "learning_rate": 6.390491911521955e-06, + "loss": 0.488, + "step": 82100 + }, + { + "epoch": 1.808381916180838, + "grad_norm": 2.3762636184692383, + "learning_rate": 6.386090018708046e-06, + "loss": 0.4957, + "step": 82200 + }, + { + "epoch": 1.810581894181058, + "grad_norm": 2.7238190174102783, + "learning_rate": 6.381688125894136e-06, + "loss": 0.4866, + "step": 82300 + }, + { + "epoch": 1.812781872181278, + "grad_norm": 2.1085996627807617, + "learning_rate": 6.377286233080225e-06, + "loss": 0.4666, + "step": 82400 + }, + { + "epoch": 1.814981850181498, + "grad_norm": 2.127675771713257, + "learning_rate": 6.372884340266316e-06, + "loss": 0.4975, + "step": 82500 + }, + { + "epoch": 1.817181828181718, + "grad_norm": 2.0977835655212402, + "learning_rate": 6.368482447452405e-06, + "loss": 0.5016, + "step": 82600 + }, + { + "epoch": 1.8193818061819382, + "grad_norm": 2.5928144454956055, + "learning_rate": 6.364080554638495e-06, + "loss": 0.4904, + "step": 82700 + }, + { + "epoch": 1.8215817841821582, + "grad_norm": 2.5363171100616455, + "learning_rate": 6.359678661824586e-06, + "loss": 0.4739, + "step": 82800 + }, + { + "epoch": 1.8237817621823782, + "grad_norm": 1.779845952987671, + "learning_rate": 6.355276769010675e-06, + "loss": 0.475, + "step": 82900 + }, + { + "epoch": 1.8259817401825982, + "grad_norm": 2.3891873359680176, + "learning_rate": 6.350874876196765e-06, + "loss": 0.4867, + "step": 83000 + }, + { + "epoch": 1.8281817181828182, + "grad_norm": 2.5663325786590576, + "learning_rate": 6.3464729833828556e-06, + "loss": 0.4706, + "step": 83100 + }, + { + "epoch": 1.8303816961830381, + "grad_norm": 2.2070469856262207, + "learning_rate": 6.342071090568945e-06, + "loss": 0.4894, + "step": 83200 + }, + { + "epoch": 1.8325816741832581, + "grad_norm": 2.3300230503082275, + "learning_rate": 6.337669197755035e-06, + "loss": 0.4843, + "step": 83300 + }, + { + "epoch": 1.8347816521834783, + "grad_norm": 2.1778311729431152, + "learning_rate": 6.3332673049411256e-06, + "loss": 0.5032, + "step": 83400 + }, + { + "epoch": 1.8369816301836983, + "grad_norm": 2.106933832168579, + "learning_rate": 6.328865412127215e-06, + "loss": 0.4875, + "step": 83500 + }, + { + "epoch": 1.8391816081839183, + "grad_norm": 2.6579482555389404, + "learning_rate": 6.324463519313305e-06, + "loss": 0.4892, + "step": 83600 + }, + { + "epoch": 1.8413815861841383, + "grad_norm": 2.3309366703033447, + "learning_rate": 6.3200616264993956e-06, + "loss": 0.4699, + "step": 83700 + }, + { + "epoch": 1.8435815641843583, + "grad_norm": 2.503455400466919, + "learning_rate": 6.315659733685485e-06, + "loss": 0.4801, + "step": 83800 + }, + { + "epoch": 1.8457815421845782, + "grad_norm": 2.5221006870269775, + "learning_rate": 6.311257840871575e-06, + "loss": 0.4834, + "step": 83900 + }, + { + "epoch": 1.8479815201847982, + "grad_norm": 2.271540403366089, + "learning_rate": 6.3068559480576656e-06, + "loss": 0.4759, + "step": 84000 + }, + { + "epoch": 1.8501814981850182, + "grad_norm": 2.2240519523620605, + "learning_rate": 6.302454055243755e-06, + "loss": 0.4858, + "step": 84100 + }, + { + "epoch": 1.8523814761852382, + "grad_norm": 2.41463041305542, + "learning_rate": 6.298052162429845e-06, + "loss": 0.4951, + "step": 84200 + }, + { + "epoch": 1.8545814541854582, + "grad_norm": 2.420825242996216, + "learning_rate": 6.2936502696159356e-06, + "loss": 0.4949, + "step": 84300 + }, + { + "epoch": 1.8567814321856781, + "grad_norm": 2.6283483505249023, + "learning_rate": 6.289248376802025e-06, + "loss": 0.4928, + "step": 84400 + }, + { + "epoch": 1.8589814101858981, + "grad_norm": 2.6053175926208496, + "learning_rate": 6.284846483988115e-06, + "loss": 0.4951, + "step": 84500 + }, + { + "epoch": 1.8611813881861181, + "grad_norm": 2.556842803955078, + "learning_rate": 6.280444591174205e-06, + "loss": 0.4766, + "step": 84600 + }, + { + "epoch": 1.863381366186338, + "grad_norm": 2.583364248275757, + "learning_rate": 6.276042698360295e-06, + "loss": 0.4964, + "step": 84700 + }, + { + "epoch": 1.865581344186558, + "grad_norm": 2.407144069671631, + "learning_rate": 6.271640805546385e-06, + "loss": 0.4882, + "step": 84800 + }, + { + "epoch": 1.867781322186778, + "grad_norm": 2.20274019241333, + "learning_rate": 6.267238912732475e-06, + "loss": 0.488, + "step": 84900 + }, + { + "epoch": 1.869981300186998, + "grad_norm": 2.537299871444702, + "learning_rate": 6.262837019918565e-06, + "loss": 0.4912, + "step": 85000 + }, + { + "epoch": 1.872181278187218, + "grad_norm": 2.4242103099823, + "learning_rate": 6.258435127104655e-06, + "loss": 0.4857, + "step": 85100 + }, + { + "epoch": 1.874381256187438, + "grad_norm": 1.9029467105865479, + "learning_rate": 6.254033234290745e-06, + "loss": 0.4969, + "step": 85200 + }, + { + "epoch": 1.876581234187658, + "grad_norm": 3.0369937419891357, + "learning_rate": 6.249631341476835e-06, + "loss": 0.4854, + "step": 85300 + }, + { + "epoch": 1.878781212187878, + "grad_norm": 2.6991753578186035, + "learning_rate": 6.245229448662925e-06, + "loss": 0.4771, + "step": 85400 + }, + { + "epoch": 1.8809811901880982, + "grad_norm": 2.336350679397583, + "learning_rate": 6.240827555849016e-06, + "loss": 0.4922, + "step": 85500 + }, + { + "epoch": 1.8831811681883182, + "grad_norm": 2.731637477874756, + "learning_rate": 6.236425663035106e-06, + "loss": 0.4877, + "step": 85600 + }, + { + "epoch": 1.8853811461885381, + "grad_norm": 2.438896417617798, + "learning_rate": 6.232023770221196e-06, + "loss": 0.4743, + "step": 85700 + }, + { + "epoch": 1.8875811241887581, + "grad_norm": 2.8118035793304443, + "learning_rate": 6.227621877407286e-06, + "loss": 0.4804, + "step": 85800 + }, + { + "epoch": 1.889781102188978, + "grad_norm": 2.5621535778045654, + "learning_rate": 6.223219984593376e-06, + "loss": 0.4849, + "step": 85900 + }, + { + "epoch": 1.891981080189198, + "grad_norm": 2.3240880966186523, + "learning_rate": 6.218818091779466e-06, + "loss": 0.4919, + "step": 86000 + }, + { + "epoch": 1.894181058189418, + "grad_norm": 2.481004238128662, + "learning_rate": 6.214416198965556e-06, + "loss": 0.4794, + "step": 86100 + }, + { + "epoch": 1.8963810361896383, + "grad_norm": 2.4835259914398193, + "learning_rate": 6.210014306151646e-06, + "loss": 0.479, + "step": 86200 + }, + { + "epoch": 1.8985810141898583, + "grad_norm": 2.3219950199127197, + "learning_rate": 6.205612413337736e-06, + "loss": 0.4743, + "step": 86300 + }, + { + "epoch": 1.9007809921900782, + "grad_norm": 2.9407191276550293, + "learning_rate": 6.201210520523826e-06, + "loss": 0.4641, + "step": 86400 + }, + { + "epoch": 1.9029809701902982, + "grad_norm": 2.64907169342041, + "learning_rate": 6.196808627709916e-06, + "loss": 0.4821, + "step": 86500 + }, + { + "epoch": 1.9051809481905182, + "grad_norm": 2.1783690452575684, + "learning_rate": 6.192406734896006e-06, + "loss": 0.4709, + "step": 86600 + }, + { + "epoch": 1.9073809261907382, + "grad_norm": 2.755631685256958, + "learning_rate": 6.1880048420820956e-06, + "loss": 0.4816, + "step": 86700 + }, + { + "epoch": 1.9095809041909582, + "grad_norm": 2.761409044265747, + "learning_rate": 6.183602949268186e-06, + "loss": 0.4833, + "step": 86800 + }, + { + "epoch": 1.9117808821911781, + "grad_norm": 2.676274061203003, + "learning_rate": 6.179201056454276e-06, + "loss": 0.4962, + "step": 86900 + }, + { + "epoch": 1.9139808601913981, + "grad_norm": 2.450660467147827, + "learning_rate": 6.1747991636403656e-06, + "loss": 0.473, + "step": 87000 + }, + { + "epoch": 1.916180838191618, + "grad_norm": 2.693134069442749, + "learning_rate": 6.170397270826456e-06, + "loss": 0.4781, + "step": 87100 + }, + { + "epoch": 1.918380816191838, + "grad_norm": 2.411348581314087, + "learning_rate": 6.165995378012546e-06, + "loss": 0.4804, + "step": 87200 + }, + { + "epoch": 1.920580794192058, + "grad_norm": 2.500234842300415, + "learning_rate": 6.1615934851986356e-06, + "loss": 0.4837, + "step": 87300 + }, + { + "epoch": 1.922780772192278, + "grad_norm": 3.033048391342163, + "learning_rate": 6.157191592384726e-06, + "loss": 0.471, + "step": 87400 + }, + { + "epoch": 1.924980750192498, + "grad_norm": 1.847033143043518, + "learning_rate": 6.152789699570816e-06, + "loss": 0.4823, + "step": 87500 + }, + { + "epoch": 1.927180728192718, + "grad_norm": 2.5302257537841797, + "learning_rate": 6.1483878067569056e-06, + "loss": 0.4826, + "step": 87600 + }, + { + "epoch": 1.929380706192938, + "grad_norm": 1.998494029045105, + "learning_rate": 6.143985913942996e-06, + "loss": 0.4891, + "step": 87700 + }, + { + "epoch": 1.931580684193158, + "grad_norm": 2.995784044265747, + "learning_rate": 6.139584021129086e-06, + "loss": 0.4847, + "step": 87800 + }, + { + "epoch": 1.933780662193378, + "grad_norm": 2.2645761966705322, + "learning_rate": 6.1351821283151756e-06, + "loss": 0.5042, + "step": 87900 + }, + { + "epoch": 1.935980640193598, + "grad_norm": 2.3474481105804443, + "learning_rate": 6.130780235501266e-06, + "loss": 0.4845, + "step": 88000 + }, + { + "epoch": 1.938180618193818, + "grad_norm": 2.570206880569458, + "learning_rate": 6.126378342687356e-06, + "loss": 0.4794, + "step": 88100 + }, + { + "epoch": 1.940380596194038, + "grad_norm": 1.8715978860855103, + "learning_rate": 6.1219764498734456e-06, + "loss": 0.4775, + "step": 88200 + }, + { + "epoch": 1.942580574194258, + "grad_norm": 2.443993330001831, + "learning_rate": 6.117574557059536e-06, + "loss": 0.4824, + "step": 88300 + }, + { + "epoch": 1.944780552194478, + "grad_norm": 2.4730186462402344, + "learning_rate": 6.113172664245626e-06, + "loss": 0.4914, + "step": 88400 + }, + { + "epoch": 1.946980530194698, + "grad_norm": 2.6471264362335205, + "learning_rate": 6.1087707714317156e-06, + "loss": 0.4826, + "step": 88500 + }, + { + "epoch": 1.949180508194918, + "grad_norm": 2.5795907974243164, + "learning_rate": 6.104368878617806e-06, + "loss": 0.4871, + "step": 88600 + }, + { + "epoch": 1.951380486195138, + "grad_norm": 2.3072896003723145, + "learning_rate": 6.099966985803895e-06, + "loss": 0.4937, + "step": 88700 + }, + { + "epoch": 1.953580464195358, + "grad_norm": 2.5398294925689697, + "learning_rate": 6.0955650929899856e-06, + "loss": 0.4919, + "step": 88800 + }, + { + "epoch": 1.955780442195578, + "grad_norm": 2.15952730178833, + "learning_rate": 6.091163200176077e-06, + "loss": 0.4934, + "step": 88900 + }, + { + "epoch": 1.957980420195798, + "grad_norm": 2.4487977027893066, + "learning_rate": 6.086761307362167e-06, + "loss": 0.4842, + "step": 89000 + }, + { + "epoch": 1.9601803981960182, + "grad_norm": 2.4906442165374756, + "learning_rate": 6.082359414548256e-06, + "loss": 0.484, + "step": 89100 + }, + { + "epoch": 1.9623803761962382, + "grad_norm": 2.605121374130249, + "learning_rate": 6.077957521734347e-06, + "loss": 0.4903, + "step": 89200 + }, + { + "epoch": 1.9645803541964582, + "grad_norm": 2.7144834995269775, + "learning_rate": 6.073555628920437e-06, + "loss": 0.4931, + "step": 89300 + }, + { + "epoch": 1.9667803321966781, + "grad_norm": 2.7881131172180176, + "learning_rate": 6.069153736106526e-06, + "loss": 0.495, + "step": 89400 + }, + { + "epoch": 1.9689803101968981, + "grad_norm": 3.044265031814575, + "learning_rate": 6.064751843292617e-06, + "loss": 0.4757, + "step": 89500 + }, + { + "epoch": 1.971180288197118, + "grad_norm": 2.3652849197387695, + "learning_rate": 6.060349950478707e-06, + "loss": 0.4761, + "step": 89600 + }, + { + "epoch": 1.973380266197338, + "grad_norm": 1.9909372329711914, + "learning_rate": 6.055948057664796e-06, + "loss": 0.492, + "step": 89700 + }, + { + "epoch": 1.975580244197558, + "grad_norm": 2.1215572357177734, + "learning_rate": 6.051546164850887e-06, + "loss": 0.4787, + "step": 89800 + }, + { + "epoch": 1.977780222197778, + "grad_norm": 2.807328701019287, + "learning_rate": 6.047144272036977e-06, + "loss": 0.4845, + "step": 89900 + }, + { + "epoch": 1.979980200197998, + "grad_norm": 2.344365358352661, + "learning_rate": 6.042742379223066e-06, + "loss": 0.4892, + "step": 90000 + }, + { + "epoch": 1.982180178198218, + "grad_norm": 2.1772940158843994, + "learning_rate": 6.038340486409157e-06, + "loss": 0.4849, + "step": 90100 + }, + { + "epoch": 1.984380156198438, + "grad_norm": 2.4292235374450684, + "learning_rate": 6.033938593595247e-06, + "loss": 0.4869, + "step": 90200 + }, + { + "epoch": 1.986580134198658, + "grad_norm": 2.350494861602783, + "learning_rate": 6.029536700781336e-06, + "loss": 0.4945, + "step": 90300 + }, + { + "epoch": 1.988780112198878, + "grad_norm": 2.447011709213257, + "learning_rate": 6.025134807967427e-06, + "loss": 0.4632, + "step": 90400 + }, + { + "epoch": 1.990980090199098, + "grad_norm": 2.229335069656372, + "learning_rate": 6.020732915153516e-06, + "loss": 0.491, + "step": 90500 + }, + { + "epoch": 1.993180068199318, + "grad_norm": 2.659064292907715, + "learning_rate": 6.016331022339606e-06, + "loss": 0.4788, + "step": 90600 + }, + { + "epoch": 1.995380046199538, + "grad_norm": 2.435239791870117, + "learning_rate": 6.011929129525697e-06, + "loss": 0.4947, + "step": 90700 + }, + { + "epoch": 1.9975800241997579, + "grad_norm": 2.0373647212982178, + "learning_rate": 6.007527236711786e-06, + "loss": 0.4832, + "step": 90800 + }, + { + "epoch": 1.9997800021999779, + "grad_norm": 2.644747734069824, + "learning_rate": 6.003125343897876e-06, + "loss": 0.4884, + "step": 90900 + }, + { + "epoch": 2.001979980200198, + "grad_norm": 2.4957003593444824, + "learning_rate": 5.998723451083967e-06, + "loss": 0.4441, + "step": 91000 + }, + { + "epoch": 2.004179958200418, + "grad_norm": 2.8672921657562256, + "learning_rate": 5.994321558270056e-06, + "loss": 0.4586, + "step": 91100 + }, + { + "epoch": 2.006379936200638, + "grad_norm": 2.2238707542419434, + "learning_rate": 5.989919665456146e-06, + "loss": 0.4508, + "step": 91200 + }, + { + "epoch": 2.008579914200858, + "grad_norm": 3.085266590118408, + "learning_rate": 5.985517772642237e-06, + "loss": 0.4454, + "step": 91300 + }, + { + "epoch": 2.010779892201078, + "grad_norm": 2.7190568447113037, + "learning_rate": 5.981115879828326e-06, + "loss": 0.4421, + "step": 91400 + }, + { + "epoch": 2.012979870201298, + "grad_norm": 2.966407537460327, + "learning_rate": 5.976713987014416e-06, + "loss": 0.4334, + "step": 91500 + }, + { + "epoch": 2.015179848201518, + "grad_norm": 2.963914394378662, + "learning_rate": 5.972312094200507e-06, + "loss": 0.4428, + "step": 91600 + }, + { + "epoch": 2.017379826201738, + "grad_norm": 3.2475080490112305, + "learning_rate": 5.967910201386596e-06, + "loss": 0.4387, + "step": 91700 + }, + { + "epoch": 2.019579804201958, + "grad_norm": 2.248386859893799, + "learning_rate": 5.963508308572686e-06, + "loss": 0.4509, + "step": 91800 + }, + { + "epoch": 2.021779782202178, + "grad_norm": 2.9276363849639893, + "learning_rate": 5.959106415758777e-06, + "loss": 0.4509, + "step": 91900 + }, + { + "epoch": 2.023979760202398, + "grad_norm": 3.2354319095611572, + "learning_rate": 5.954704522944866e-06, + "loss": 0.4396, + "step": 92000 + }, + { + "epoch": 2.026179738202618, + "grad_norm": 3.478252649307251, + "learning_rate": 5.950302630130956e-06, + "loss": 0.454, + "step": 92100 + }, + { + "epoch": 2.028379716202838, + "grad_norm": 2.1570658683776855, + "learning_rate": 5.945900737317047e-06, + "loss": 0.4426, + "step": 92200 + }, + { + "epoch": 2.030579694203058, + "grad_norm": 3.555510997772217, + "learning_rate": 5.941498844503136e-06, + "loss": 0.4278, + "step": 92300 + }, + { + "epoch": 2.032779672203278, + "grad_norm": 3.0837221145629883, + "learning_rate": 5.937096951689227e-06, + "loss": 0.4582, + "step": 92400 + }, + { + "epoch": 2.034979650203498, + "grad_norm": 3.023439407348633, + "learning_rate": 5.9326950588753175e-06, + "loss": 0.445, + "step": 92500 + }, + { + "epoch": 2.037179628203718, + "grad_norm": 2.8164618015289307, + "learning_rate": 5.928293166061407e-06, + "loss": 0.4474, + "step": 92600 + }, + { + "epoch": 2.039379606203938, + "grad_norm": 2.4497897624969482, + "learning_rate": 5.923891273247497e-06, + "loss": 0.4581, + "step": 92700 + }, + { + "epoch": 2.041579584204158, + "grad_norm": 2.560822010040283, + "learning_rate": 5.9194893804335875e-06, + "loss": 0.4402, + "step": 92800 + }, + { + "epoch": 2.043779562204378, + "grad_norm": 2.457819938659668, + "learning_rate": 5.915087487619677e-06, + "loss": 0.457, + "step": 92900 + }, + { + "epoch": 2.045979540204598, + "grad_norm": 2.840198278427124, + "learning_rate": 5.910685594805767e-06, + "loss": 0.4457, + "step": 93000 + }, + { + "epoch": 2.048179518204818, + "grad_norm": 3.289562940597534, + "learning_rate": 5.9062837019918575e-06, + "loss": 0.4458, + "step": 93100 + }, + { + "epoch": 2.050379496205038, + "grad_norm": 3.20574688911438, + "learning_rate": 5.901881809177947e-06, + "loss": 0.4464, + "step": 93200 + }, + { + "epoch": 2.052579474205258, + "grad_norm": 3.1382062435150146, + "learning_rate": 5.897479916364037e-06, + "loss": 0.4407, + "step": 93300 + }, + { + "epoch": 2.054779452205478, + "grad_norm": 2.4946656227111816, + "learning_rate": 5.8930780235501275e-06, + "loss": 0.4404, + "step": 93400 + }, + { + "epoch": 2.056979430205698, + "grad_norm": 3.4237630367279053, + "learning_rate": 5.888676130736217e-06, + "loss": 0.4549, + "step": 93500 + }, + { + "epoch": 2.059179408205918, + "grad_norm": 2.6181180477142334, + "learning_rate": 5.884274237922307e-06, + "loss": 0.4305, + "step": 93600 + }, + { + "epoch": 2.061379386206138, + "grad_norm": 2.9076225757598877, + "learning_rate": 5.8798723451083975e-06, + "loss": 0.4543, + "step": 93700 + }, + { + "epoch": 2.063579364206358, + "grad_norm": 2.6111700534820557, + "learning_rate": 5.875470452294487e-06, + "loss": 0.4426, + "step": 93800 + }, + { + "epoch": 2.0657793422065778, + "grad_norm": 3.1381430625915527, + "learning_rate": 5.871068559480577e-06, + "loss": 0.4509, + "step": 93900 + }, + { + "epoch": 2.0679793202067978, + "grad_norm": 2.934509754180908, + "learning_rate": 5.8666666666666675e-06, + "loss": 0.4538, + "step": 94000 + }, + { + "epoch": 2.0701792982070177, + "grad_norm": 2.8510279655456543, + "learning_rate": 5.862264773852757e-06, + "loss": 0.4396, + "step": 94100 + }, + { + "epoch": 2.0723792762072377, + "grad_norm": 2.753408670425415, + "learning_rate": 5.857862881038847e-06, + "loss": 0.4498, + "step": 94200 + }, + { + "epoch": 2.0745792542074577, + "grad_norm": 2.5191516876220703, + "learning_rate": 5.8534609882249375e-06, + "loss": 0.4355, + "step": 94300 + }, + { + "epoch": 2.076779232207678, + "grad_norm": 3.058117628097534, + "learning_rate": 5.849059095411027e-06, + "loss": 0.4496, + "step": 94400 + }, + { + "epoch": 2.078979210207898, + "grad_norm": 2.3892626762390137, + "learning_rate": 5.844657202597117e-06, + "loss": 0.448, + "step": 94500 + }, + { + "epoch": 2.081179188208118, + "grad_norm": 3.303252935409546, + "learning_rate": 5.840255309783207e-06, + "loss": 0.4423, + "step": 94600 + }, + { + "epoch": 2.083379166208338, + "grad_norm": 2.571668863296509, + "learning_rate": 5.835853416969297e-06, + "loss": 0.4477, + "step": 94700 + }, + { + "epoch": 2.085579144208558, + "grad_norm": 2.8675763607025146, + "learning_rate": 5.831451524155387e-06, + "loss": 0.4402, + "step": 94800 + }, + { + "epoch": 2.087779122208778, + "grad_norm": 1.920617938041687, + "learning_rate": 5.827049631341477e-06, + "loss": 0.4469, + "step": 94900 + }, + { + "epoch": 2.089979100208998, + "grad_norm": 2.4607462882995605, + "learning_rate": 5.822647738527567e-06, + "loss": 0.4578, + "step": 95000 + }, + { + "epoch": 2.092179078209218, + "grad_norm": 2.3950858116149902, + "learning_rate": 5.818245845713657e-06, + "loss": 0.449, + "step": 95100 + }, + { + "epoch": 2.094379056209438, + "grad_norm": 2.5188486576080322, + "learning_rate": 5.813843952899747e-06, + "loss": 0.4411, + "step": 95200 + }, + { + "epoch": 2.096579034209658, + "grad_norm": 2.665241003036499, + "learning_rate": 5.809442060085837e-06, + "loss": 0.4555, + "step": 95300 + }, + { + "epoch": 2.098779012209878, + "grad_norm": 3.0195603370666504, + "learning_rate": 5.805040167271927e-06, + "loss": 0.4605, + "step": 95400 + }, + { + "epoch": 2.100978990210098, + "grad_norm": 3.2705276012420654, + "learning_rate": 5.800638274458017e-06, + "loss": 0.437, + "step": 95500 + }, + { + "epoch": 2.103178968210318, + "grad_norm": 2.4358837604522705, + "learning_rate": 5.796236381644107e-06, + "loss": 0.4556, + "step": 95600 + }, + { + "epoch": 2.105378946210538, + "grad_norm": 2.609314203262329, + "learning_rate": 5.791834488830197e-06, + "loss": 0.4396, + "step": 95700 + }, + { + "epoch": 2.107578924210758, + "grad_norm": 2.715202808380127, + "learning_rate": 5.787432596016287e-06, + "loss": 0.4409, + "step": 95800 + }, + { + "epoch": 2.109778902210978, + "grad_norm": 2.89326548576355, + "learning_rate": 5.783030703202378e-06, + "loss": 0.4473, + "step": 95900 + }, + { + "epoch": 2.111978880211198, + "grad_norm": 2.722426414489746, + "learning_rate": 5.778628810388468e-06, + "loss": 0.4392, + "step": 96000 + }, + { + "epoch": 2.114178858211418, + "grad_norm": 2.5516304969787598, + "learning_rate": 5.774226917574558e-06, + "loss": 0.4327, + "step": 96100 + }, + { + "epoch": 2.116378836211638, + "grad_norm": 1.6953123807907104, + "learning_rate": 5.769825024760648e-06, + "loss": 0.4354, + "step": 96200 + }, + { + "epoch": 2.118578814211858, + "grad_norm": 3.260712146759033, + "learning_rate": 5.765423131946738e-06, + "loss": 0.4587, + "step": 96300 + }, + { + "epoch": 2.1207787922120778, + "grad_norm": 3.15496826171875, + "learning_rate": 5.761021239132828e-06, + "loss": 0.4455, + "step": 96400 + }, + { + "epoch": 2.1229787702122977, + "grad_norm": 3.02713680267334, + "learning_rate": 5.756619346318918e-06, + "loss": 0.443, + "step": 96500 + }, + { + "epoch": 2.1251787482125177, + "grad_norm": 2.6551177501678467, + "learning_rate": 5.752217453505008e-06, + "loss": 0.4361, + "step": 96600 + }, + { + "epoch": 2.1273787262127377, + "grad_norm": 3.143676996231079, + "learning_rate": 5.7478155606910975e-06, + "loss": 0.4463, + "step": 96700 + }, + { + "epoch": 2.1295787042129577, + "grad_norm": 3.07769775390625, + "learning_rate": 5.743413667877188e-06, + "loss": 0.4563, + "step": 96800 + }, + { + "epoch": 2.1317786822131777, + "grad_norm": 2.862227439880371, + "learning_rate": 5.739011775063278e-06, + "loss": 0.4393, + "step": 96900 + }, + { + "epoch": 2.1339786602133977, + "grad_norm": 2.652214288711548, + "learning_rate": 5.7346098822493675e-06, + "loss": 0.443, + "step": 97000 + }, + { + "epoch": 2.136178638213618, + "grad_norm": 2.3733363151550293, + "learning_rate": 5.730207989435458e-06, + "loss": 0.4449, + "step": 97100 + }, + { + "epoch": 2.138378616213838, + "grad_norm": 2.734473705291748, + "learning_rate": 5.725806096621548e-06, + "loss": 0.4357, + "step": 97200 + }, + { + "epoch": 2.140578594214058, + "grad_norm": 2.783421039581299, + "learning_rate": 5.7214042038076375e-06, + "loss": 0.434, + "step": 97300 + }, + { + "epoch": 2.142778572214278, + "grad_norm": 2.4740219116210938, + "learning_rate": 5.717002310993728e-06, + "loss": 0.4417, + "step": 97400 + }, + { + "epoch": 2.144978550214498, + "grad_norm": 2.809589147567749, + "learning_rate": 5.712600418179818e-06, + "loss": 0.4507, + "step": 97500 + }, + { + "epoch": 2.147178528214718, + "grad_norm": 2.179594039916992, + "learning_rate": 5.7081985253659075e-06, + "loss": 0.4552, + "step": 97600 + }, + { + "epoch": 2.149378506214938, + "grad_norm": 2.5812172889709473, + "learning_rate": 5.703796632551998e-06, + "loss": 0.4462, + "step": 97700 + }, + { + "epoch": 2.151578484215158, + "grad_norm": 2.6970343589782715, + "learning_rate": 5.699394739738088e-06, + "loss": 0.4448, + "step": 97800 + }, + { + "epoch": 2.153778462215378, + "grad_norm": 3.2081048488616943, + "learning_rate": 5.6949928469241775e-06, + "loss": 0.4477, + "step": 97900 + }, + { + "epoch": 2.155978440215598, + "grad_norm": 2.283027410507202, + "learning_rate": 5.690590954110268e-06, + "loss": 0.4554, + "step": 98000 + }, + { + "epoch": 2.158178418215818, + "grad_norm": 2.4790256023406982, + "learning_rate": 5.686189061296358e-06, + "loss": 0.4443, + "step": 98100 + }, + { + "epoch": 2.160378396216038, + "grad_norm": 3.0653131008148193, + "learning_rate": 5.6817871684824475e-06, + "loss": 0.4435, + "step": 98200 + }, + { + "epoch": 2.162578374216258, + "grad_norm": 3.14249849319458, + "learning_rate": 5.677385275668538e-06, + "loss": 0.4528, + "step": 98300 + }, + { + "epoch": 2.164778352216478, + "grad_norm": 3.3730337619781494, + "learning_rate": 5.672983382854628e-06, + "loss": 0.4397, + "step": 98400 + }, + { + "epoch": 2.166978330216698, + "grad_norm": 3.2641589641571045, + "learning_rate": 5.6685814900407175e-06, + "loss": 0.4365, + "step": 98500 + }, + { + "epoch": 2.169178308216918, + "grad_norm": 3.698474407196045, + "learning_rate": 5.664179597226808e-06, + "loss": 0.4416, + "step": 98600 + }, + { + "epoch": 2.171378286217138, + "grad_norm": 2.253495454788208, + "learning_rate": 5.659777704412897e-06, + "loss": 0.4534, + "step": 98700 + }, + { + "epoch": 2.173578264217358, + "grad_norm": 3.342864990234375, + "learning_rate": 5.6553758115989875e-06, + "loss": 0.4546, + "step": 98800 + }, + { + "epoch": 2.1757782422175778, + "grad_norm": 2.818357229232788, + "learning_rate": 5.650973918785078e-06, + "loss": 0.4327, + "step": 98900 + }, + { + "epoch": 2.1779782202177977, + "grad_norm": 3.623086452484131, + "learning_rate": 5.646572025971167e-06, + "loss": 0.4566, + "step": 99000 + }, + { + "epoch": 2.1801781982180177, + "grad_norm": 3.0294673442840576, + "learning_rate": 5.6421701331572575e-06, + "loss": 0.4437, + "step": 99100 + }, + { + "epoch": 2.1823781762182377, + "grad_norm": 2.562649726867676, + "learning_rate": 5.637768240343348e-06, + "loss": 0.4504, + "step": 99200 + }, + { + "epoch": 2.1845781542184577, + "grad_norm": 2.9399819374084473, + "learning_rate": 5.633366347529439e-06, + "loss": 0.4405, + "step": 99300 + }, + { + "epoch": 2.1867781322186777, + "grad_norm": 2.589012861251831, + "learning_rate": 5.628964454715528e-06, + "loss": 0.4332, + "step": 99400 + }, + { + "epoch": 2.1889781102188977, + "grad_norm": 3.24257230758667, + "learning_rate": 5.624562561901619e-06, + "loss": 0.4486, + "step": 99500 + }, + { + "epoch": 2.1911780882191176, + "grad_norm": 2.6864874362945557, + "learning_rate": 5.620160669087709e-06, + "loss": 0.4476, + "step": 99600 + }, + { + "epoch": 2.1933780662193376, + "grad_norm": 2.183894634246826, + "learning_rate": 5.615758776273798e-06, + "loss": 0.4517, + "step": 99700 + }, + { + "epoch": 2.1955780442195576, + "grad_norm": 2.297757625579834, + "learning_rate": 5.611356883459889e-06, + "loss": 0.4414, + "step": 99800 + }, + { + "epoch": 2.1977780222197776, + "grad_norm": 2.6887316703796387, + "learning_rate": 5.606954990645979e-06, + "loss": 0.4359, + "step": 99900 + }, + { + "epoch": 2.199978000219998, + "grad_norm": 2.8383491039276123, + "learning_rate": 5.602553097832068e-06, + "loss": 0.4455, + "step": 100000 + }, + { + "epoch": 2.199978000219998, + "eval_loss": 0.5539576411247253, + "eval_runtime": 386.4228, + "eval_samples_per_second": 155.27, + "eval_steps_per_second": 4.852, + "step": 100000 + }, + { + "epoch": 2.202177978220218, + "grad_norm": 2.4842607975006104, + "learning_rate": 5.598151205018159e-06, + "loss": 0.4421, + "step": 100100 + }, + { + "epoch": 2.204377956220438, + "grad_norm": 2.3061771392822266, + "learning_rate": 5.593749312204249e-06, + "loss": 0.4529, + "step": 100200 + }, + { + "epoch": 2.206577934220658, + "grad_norm": 2.9890830516815186, + "learning_rate": 5.589347419390338e-06, + "loss": 0.4251, + "step": 100300 + }, + { + "epoch": 2.208777912220878, + "grad_norm": 2.5472826957702637, + "learning_rate": 5.584945526576429e-06, + "loss": 0.4384, + "step": 100400 + }, + { + "epoch": 2.210977890221098, + "grad_norm": 3.314694881439209, + "learning_rate": 5.580543633762519e-06, + "loss": 0.4372, + "step": 100500 + }, + { + "epoch": 2.213177868221318, + "grad_norm": 3.1046979427337646, + "learning_rate": 5.576141740948608e-06, + "loss": 0.434, + "step": 100600 + }, + { + "epoch": 2.215377846221538, + "grad_norm": 2.180180788040161, + "learning_rate": 5.571739848134699e-06, + "loss": 0.43, + "step": 100700 + }, + { + "epoch": 2.217577824221758, + "grad_norm": 3.7238945960998535, + "learning_rate": 5.567337955320788e-06, + "loss": 0.4404, + "step": 100800 + }, + { + "epoch": 2.219777802221978, + "grad_norm": 3.2101945877075195, + "learning_rate": 5.562936062506878e-06, + "loss": 0.4393, + "step": 100900 + }, + { + "epoch": 2.221977780222198, + "grad_norm": 2.822737455368042, + "learning_rate": 5.558534169692969e-06, + "loss": 0.4407, + "step": 101000 + }, + { + "epoch": 2.224177758222418, + "grad_norm": 2.736593723297119, + "learning_rate": 5.554132276879058e-06, + "loss": 0.4603, + "step": 101100 + }, + { + "epoch": 2.226377736222638, + "grad_norm": 2.5259158611297607, + "learning_rate": 5.549730384065148e-06, + "loss": 0.438, + "step": 101200 + }, + { + "epoch": 2.2285777142228578, + "grad_norm": 2.8023760318756104, + "learning_rate": 5.545328491251239e-06, + "loss": 0.4476, + "step": 101300 + }, + { + "epoch": 2.2307776922230778, + "grad_norm": 3.469649076461792, + "learning_rate": 5.540926598437328e-06, + "loss": 0.4498, + "step": 101400 + }, + { + "epoch": 2.2329776702232977, + "grad_norm": 2.2170920372009277, + "learning_rate": 5.536524705623418e-06, + "loss": 0.4531, + "step": 101500 + }, + { + "epoch": 2.2351776482235177, + "grad_norm": 2.9399514198303223, + "learning_rate": 5.532122812809509e-06, + "loss": 0.4496, + "step": 101600 + }, + { + "epoch": 2.2373776262237377, + "grad_norm": 3.1350746154785156, + "learning_rate": 5.527720919995598e-06, + "loss": 0.4412, + "step": 101700 + }, + { + "epoch": 2.2395776042239577, + "grad_norm": 2.7231826782226562, + "learning_rate": 5.523319027181688e-06, + "loss": 0.4434, + "step": 101800 + }, + { + "epoch": 2.2417775822241777, + "grad_norm": 2.8241002559661865, + "learning_rate": 5.518917134367779e-06, + "loss": 0.4405, + "step": 101900 + }, + { + "epoch": 2.2439775602243976, + "grad_norm": 2.6854066848754883, + "learning_rate": 5.514515241553868e-06, + "loss": 0.4558, + "step": 102000 + }, + { + "epoch": 2.2461775382246176, + "grad_norm": 3.1952197551727295, + "learning_rate": 5.510113348739958e-06, + "loss": 0.4354, + "step": 102100 + }, + { + "epoch": 2.2483775162248376, + "grad_norm": 2.9026472568511963, + "learning_rate": 5.505711455926049e-06, + "loss": 0.4485, + "step": 102200 + }, + { + "epoch": 2.2505774942250576, + "grad_norm": 3.1712558269500732, + "learning_rate": 5.501309563112138e-06, + "loss": 0.4468, + "step": 102300 + }, + { + "epoch": 2.2527774722252776, + "grad_norm": 2.9717068672180176, + "learning_rate": 5.496907670298228e-06, + "loss": 0.4386, + "step": 102400 + }, + { + "epoch": 2.2549774502254976, + "grad_norm": 2.8104095458984375, + "learning_rate": 5.492505777484319e-06, + "loss": 0.4452, + "step": 102500 + }, + { + "epoch": 2.2571774282257175, + "grad_norm": 3.142512798309326, + "learning_rate": 5.488103884670408e-06, + "loss": 0.4487, + "step": 102600 + }, + { + "epoch": 2.259377406225938, + "grad_norm": 3.723659038543701, + "learning_rate": 5.483701991856498e-06, + "loss": 0.449, + "step": 102700 + }, + { + "epoch": 2.2615773842261575, + "grad_norm": 3.365520477294922, + "learning_rate": 5.4793000990425895e-06, + "loss": 0.4409, + "step": 102800 + }, + { + "epoch": 2.263777362226378, + "grad_norm": 2.1158196926116943, + "learning_rate": 5.474898206228679e-06, + "loss": 0.4526, + "step": 102900 + }, + { + "epoch": 2.265977340226598, + "grad_norm": 2.77187442779541, + "learning_rate": 5.470496313414769e-06, + "loss": 0.4597, + "step": 103000 + }, + { + "epoch": 2.268177318226818, + "grad_norm": 3.1668035984039307, + "learning_rate": 5.4660944206008595e-06, + "loss": 0.4515, + "step": 103100 + }, + { + "epoch": 2.270377296227038, + "grad_norm": 3.3199713230133057, + "learning_rate": 5.461692527786949e-06, + "loss": 0.4421, + "step": 103200 + }, + { + "epoch": 2.272577274227258, + "grad_norm": 3.0452702045440674, + "learning_rate": 5.457290634973039e-06, + "loss": 0.451, + "step": 103300 + }, + { + "epoch": 2.274777252227478, + "grad_norm": 2.889191150665283, + "learning_rate": 5.4528887421591295e-06, + "loss": 0.4433, + "step": 103400 + }, + { + "epoch": 2.276977230227698, + "grad_norm": 3.1005496978759766, + "learning_rate": 5.448486849345219e-06, + "loss": 0.459, + "step": 103500 + }, + { + "epoch": 2.279177208227918, + "grad_norm": 3.024289131164551, + "learning_rate": 5.444084956531309e-06, + "loss": 0.4369, + "step": 103600 + }, + { + "epoch": 2.281377186228138, + "grad_norm": 2.3427116870880127, + "learning_rate": 5.4396830637173995e-06, + "loss": 0.4461, + "step": 103700 + }, + { + "epoch": 2.2835771642283578, + "grad_norm": 3.6452486515045166, + "learning_rate": 5.435281170903489e-06, + "loss": 0.4626, + "step": 103800 + }, + { + "epoch": 2.2857771422285778, + "grad_norm": 3.5883066654205322, + "learning_rate": 5.430879278089579e-06, + "loss": 0.4439, + "step": 103900 + }, + { + "epoch": 2.2879771202287977, + "grad_norm": 3.1896305084228516, + "learning_rate": 5.4264773852756695e-06, + "loss": 0.4342, + "step": 104000 + }, + { + "epoch": 2.2901770982290177, + "grad_norm": 3.0149104595184326, + "learning_rate": 5.422075492461759e-06, + "loss": 0.4503, + "step": 104100 + }, + { + "epoch": 2.2923770762292377, + "grad_norm": 3.1118035316467285, + "learning_rate": 5.417673599647849e-06, + "loss": 0.4402, + "step": 104200 + }, + { + "epoch": 2.2945770542294577, + "grad_norm": 3.0000152587890625, + "learning_rate": 5.4132717068339395e-06, + "loss": 0.4321, + "step": 104300 + }, + { + "epoch": 2.2967770322296777, + "grad_norm": 3.1988613605499268, + "learning_rate": 5.408869814020029e-06, + "loss": 0.4458, + "step": 104400 + }, + { + "epoch": 2.2989770102298976, + "grad_norm": 2.5336127281188965, + "learning_rate": 5.404467921206119e-06, + "loss": 0.4412, + "step": 104500 + }, + { + "epoch": 2.3011769882301176, + "grad_norm": 2.478907823562622, + "learning_rate": 5.400066028392209e-06, + "loss": 0.4391, + "step": 104600 + }, + { + "epoch": 2.3033769662303376, + "grad_norm": 3.0680346488952637, + "learning_rate": 5.395664135578299e-06, + "loss": 0.4466, + "step": 104700 + }, + { + "epoch": 2.3055769442305576, + "grad_norm": 2.976754665374756, + "learning_rate": 5.391262242764389e-06, + "loss": 0.4534, + "step": 104800 + }, + { + "epoch": 2.3077769222307776, + "grad_norm": 2.921550989151001, + "learning_rate": 5.386860349950479e-06, + "loss": 0.4461, + "step": 104900 + }, + { + "epoch": 2.3099769002309976, + "grad_norm": 2.6085400581359863, + "learning_rate": 5.382458457136569e-06, + "loss": 0.439, + "step": 105000 + }, + { + "epoch": 2.3121768782312175, + "grad_norm": 3.231365203857422, + "learning_rate": 5.378056564322659e-06, + "loss": 0.4511, + "step": 105100 + }, + { + "epoch": 2.3143768562314375, + "grad_norm": 3.2471604347229004, + "learning_rate": 5.373654671508749e-06, + "loss": 0.434, + "step": 105200 + }, + { + "epoch": 2.3165768342316575, + "grad_norm": 3.265526056289673, + "learning_rate": 5.369252778694839e-06, + "loss": 0.4414, + "step": 105300 + }, + { + "epoch": 2.318776812231878, + "grad_norm": 2.632627010345459, + "learning_rate": 5.364850885880929e-06, + "loss": 0.4469, + "step": 105400 + }, + { + "epoch": 2.3209767902320975, + "grad_norm": 3.3575692176818848, + "learning_rate": 5.360448993067019e-06, + "loss": 0.4517, + "step": 105500 + }, + { + "epoch": 2.323176768232318, + "grad_norm": 2.751236915588379, + "learning_rate": 5.356047100253109e-06, + "loss": 0.4321, + "step": 105600 + }, + { + "epoch": 2.3253767462325374, + "grad_norm": 3.4512314796447754, + "learning_rate": 5.351645207439199e-06, + "loss": 0.4513, + "step": 105700 + }, + { + "epoch": 2.327576724232758, + "grad_norm": 2.5892844200134277, + "learning_rate": 5.347243314625289e-06, + "loss": 0.448, + "step": 105800 + }, + { + "epoch": 2.329776702232978, + "grad_norm": 3.1810543537139893, + "learning_rate": 5.342841421811379e-06, + "loss": 0.4489, + "step": 105900 + }, + { + "epoch": 2.331976680233198, + "grad_norm": 3.527425765991211, + "learning_rate": 5.338439528997469e-06, + "loss": 0.4362, + "step": 106000 + }, + { + "epoch": 2.334176658233418, + "grad_norm": 2.97705340385437, + "learning_rate": 5.334037636183559e-06, + "loss": 0.4424, + "step": 106100 + }, + { + "epoch": 2.336376636233638, + "grad_norm": 2.3554928302764893, + "learning_rate": 5.329635743369649e-06, + "loss": 0.4354, + "step": 106200 + }, + { + "epoch": 2.3385766142338578, + "grad_norm": 3.598785161972046, + "learning_rate": 5.32523385055574e-06, + "loss": 0.4429, + "step": 106300 + }, + { + "epoch": 2.3407765922340777, + "grad_norm": 3.603203058242798, + "learning_rate": 5.32083195774183e-06, + "loss": 0.4508, + "step": 106400 + }, + { + "epoch": 2.3429765702342977, + "grad_norm": 2.5761771202087402, + "learning_rate": 5.31643006492792e-06, + "loss": 0.448, + "step": 106500 + }, + { + "epoch": 2.3451765482345177, + "grad_norm": 3.6221818923950195, + "learning_rate": 5.31202817211401e-06, + "loss": 0.4305, + "step": 106600 + }, + { + "epoch": 2.3473765262347377, + "grad_norm": 3.062361717224121, + "learning_rate": 5.3076262793000995e-06, + "loss": 0.45, + "step": 106700 + }, + { + "epoch": 2.3495765042349577, + "grad_norm": 2.6559677124023438, + "learning_rate": 5.30322438648619e-06, + "loss": 0.4569, + "step": 106800 + }, + { + "epoch": 2.3517764822351777, + "grad_norm": 2.8080978393554688, + "learning_rate": 5.29882249367228e-06, + "loss": 0.4376, + "step": 106900 + }, + { + "epoch": 2.3539764602353976, + "grad_norm": 2.880061388015747, + "learning_rate": 5.2944206008583695e-06, + "loss": 0.4435, + "step": 107000 + }, + { + "epoch": 2.3561764382356176, + "grad_norm": 2.902592420578003, + "learning_rate": 5.29001870804446e-06, + "loss": 0.4446, + "step": 107100 + }, + { + "epoch": 2.3583764162358376, + "grad_norm": 2.560408592224121, + "learning_rate": 5.28561681523055e-06, + "loss": 0.4533, + "step": 107200 + }, + { + "epoch": 2.3605763942360576, + "grad_norm": 3.5301778316497803, + "learning_rate": 5.2812149224166395e-06, + "loss": 0.4499, + "step": 107300 + }, + { + "epoch": 2.3627763722362776, + "grad_norm": 3.1170268058776855, + "learning_rate": 5.27681302960273e-06, + "loss": 0.4392, + "step": 107400 + }, + { + "epoch": 2.3649763502364975, + "grad_norm": 2.9975242614746094, + "learning_rate": 5.27241113678882e-06, + "loss": 0.4443, + "step": 107500 + }, + { + "epoch": 2.3671763282367175, + "grad_norm": 2.9318737983703613, + "learning_rate": 5.2680092439749095e-06, + "loss": 0.4382, + "step": 107600 + }, + { + "epoch": 2.3693763062369375, + "grad_norm": 2.7009778022766113, + "learning_rate": 5.263607351161e-06, + "loss": 0.4486, + "step": 107700 + }, + { + "epoch": 2.3715762842371575, + "grad_norm": 3.265301465988159, + "learning_rate": 5.25920545834709e-06, + "loss": 0.4386, + "step": 107800 + }, + { + "epoch": 2.3737762622373775, + "grad_norm": 3.5099949836730957, + "learning_rate": 5.2548035655331795e-06, + "loss": 0.4354, + "step": 107900 + }, + { + "epoch": 2.3759762402375975, + "grad_norm": 2.997199296951294, + "learning_rate": 5.25040167271927e-06, + "loss": 0.4449, + "step": 108000 + }, + { + "epoch": 2.3781762182378174, + "grad_norm": 3.5661022663116455, + "learning_rate": 5.24599977990536e-06, + "loss": 0.4533, + "step": 108100 + }, + { + "epoch": 2.3803761962380374, + "grad_norm": 2.6311588287353516, + "learning_rate": 5.2415978870914495e-06, + "loss": 0.4535, + "step": 108200 + }, + { + "epoch": 2.382576174238258, + "grad_norm": 2.3854992389678955, + "learning_rate": 5.23719599427754e-06, + "loss": 0.441, + "step": 108300 + }, + { + "epoch": 2.3847761522384774, + "grad_norm": 3.3768720626831055, + "learning_rate": 5.23279410146363e-06, + "loss": 0.4467, + "step": 108400 + }, + { + "epoch": 2.386976130238698, + "grad_norm": 3.2119550704956055, + "learning_rate": 5.2283922086497195e-06, + "loss": 0.4608, + "step": 108500 + }, + { + "epoch": 2.389176108238918, + "grad_norm": 3.434720277786255, + "learning_rate": 5.22399031583581e-06, + "loss": 0.4415, + "step": 108600 + }, + { + "epoch": 2.3913760862391378, + "grad_norm": 2.7094149589538574, + "learning_rate": 5.219588423021899e-06, + "loss": 0.4562, + "step": 108700 + }, + { + "epoch": 2.3935760642393578, + "grad_norm": 2.9980342388153076, + "learning_rate": 5.2151865302079895e-06, + "loss": 0.4564, + "step": 108800 + }, + { + "epoch": 2.3957760422395777, + "grad_norm": 3.241988182067871, + "learning_rate": 5.21078463739408e-06, + "loss": 0.4494, + "step": 108900 + }, + { + "epoch": 2.3979760202397977, + "grad_norm": 3.08487606048584, + "learning_rate": 5.206382744580169e-06, + "loss": 0.439, + "step": 109000 + }, + { + "epoch": 2.4001759982400177, + "grad_norm": 3.0313308238983154, + "learning_rate": 5.2019808517662595e-06, + "loss": 0.4412, + "step": 109100 + }, + { + "epoch": 2.4023759762402377, + "grad_norm": 3.303107738494873, + "learning_rate": 5.19757895895235e-06, + "loss": 0.4437, + "step": 109200 + }, + { + "epoch": 2.4045759542404577, + "grad_norm": 2.7623887062072754, + "learning_rate": 5.193177066138439e-06, + "loss": 0.4426, + "step": 109300 + }, + { + "epoch": 2.4067759322406777, + "grad_norm": 3.3932597637176514, + "learning_rate": 5.1887751733245295e-06, + "loss": 0.4381, + "step": 109400 + }, + { + "epoch": 2.4089759102408976, + "grad_norm": 2.618532657623291, + "learning_rate": 5.18437328051062e-06, + "loss": 0.4466, + "step": 109500 + }, + { + "epoch": 2.4111758882411176, + "grad_norm": 2.4478089809417725, + "learning_rate": 5.179971387696709e-06, + "loss": 0.4494, + "step": 109600 + }, + { + "epoch": 2.4133758662413376, + "grad_norm": 3.646751642227173, + "learning_rate": 5.1755694948828e-06, + "loss": 0.4684, + "step": 109700 + }, + { + "epoch": 2.4155758442415576, + "grad_norm": 2.54317569732666, + "learning_rate": 5.171167602068891e-06, + "loss": 0.4467, + "step": 109800 + }, + { + "epoch": 2.4177758222417776, + "grad_norm": 2.939131021499634, + "learning_rate": 5.166765709254981e-06, + "loss": 0.4438, + "step": 109900 + }, + { + "epoch": 2.4199758002419975, + "grad_norm": 3.4897677898406982, + "learning_rate": 5.16236381644107e-06, + "loss": 0.4444, + "step": 110000 + }, + { + "epoch": 2.4221757782422175, + "grad_norm": 3.108614683151245, + "learning_rate": 5.157961923627161e-06, + "loss": 0.4639, + "step": 110100 + }, + { + "epoch": 2.4243757562424375, + "grad_norm": 3.135338068008423, + "learning_rate": 5.153560030813251e-06, + "loss": 0.4424, + "step": 110200 + }, + { + "epoch": 2.4265757342426575, + "grad_norm": 2.249314785003662, + "learning_rate": 5.14915813799934e-06, + "loss": 0.4298, + "step": 110300 + }, + { + "epoch": 2.4287757122428775, + "grad_norm": 2.582010269165039, + "learning_rate": 5.144756245185431e-06, + "loss": 0.4404, + "step": 110400 + }, + { + "epoch": 2.4309756902430975, + "grad_norm": 3.1904852390289307, + "learning_rate": 5.140354352371521e-06, + "loss": 0.4526, + "step": 110500 + }, + { + "epoch": 2.4331756682433174, + "grad_norm": 3.054769277572632, + "learning_rate": 5.13595245955761e-06, + "loss": 0.4332, + "step": 110600 + }, + { + "epoch": 2.4353756462435374, + "grad_norm": 2.8248226642608643, + "learning_rate": 5.131550566743701e-06, + "loss": 0.446, + "step": 110700 + }, + { + "epoch": 2.4375756242437574, + "grad_norm": 3.6720070838928223, + "learning_rate": 5.12714867392979e-06, + "loss": 0.445, + "step": 110800 + }, + { + "epoch": 2.4397756022439774, + "grad_norm": 2.920863389968872, + "learning_rate": 5.12274678111588e-06, + "loss": 0.4482, + "step": 110900 + }, + { + "epoch": 2.441975580244198, + "grad_norm": 2.426818370819092, + "learning_rate": 5.118344888301971e-06, + "loss": 0.452, + "step": 111000 + }, + { + "epoch": 2.4441755582444173, + "grad_norm": 3.390026330947876, + "learning_rate": 5.11394299548806e-06, + "loss": 0.4413, + "step": 111100 + }, + { + "epoch": 2.4463755362446378, + "grad_norm": 3.1187210083007812, + "learning_rate": 5.10954110267415e-06, + "loss": 0.4381, + "step": 111200 + }, + { + "epoch": 2.4485755142448573, + "grad_norm": 3.0595436096191406, + "learning_rate": 5.105139209860241e-06, + "loss": 0.4432, + "step": 111300 + }, + { + "epoch": 2.4507754922450777, + "grad_norm": 2.9489197731018066, + "learning_rate": 5.10073731704633e-06, + "loss": 0.438, + "step": 111400 + }, + { + "epoch": 2.4529754702452977, + "grad_norm": 2.4834353923797607, + "learning_rate": 5.09633542423242e-06, + "loss": 0.4452, + "step": 111500 + }, + { + "epoch": 2.4551754482455177, + "grad_norm": 3.135232925415039, + "learning_rate": 5.091933531418511e-06, + "loss": 0.4493, + "step": 111600 + }, + { + "epoch": 2.4573754262457377, + "grad_norm": 3.158200263977051, + "learning_rate": 5.0875316386046e-06, + "loss": 0.446, + "step": 111700 + }, + { + "epoch": 2.4595754042459577, + "grad_norm": 3.199408531188965, + "learning_rate": 5.08312974579069e-06, + "loss": 0.4457, + "step": 111800 + }, + { + "epoch": 2.4617753822461776, + "grad_norm": 2.692122459411621, + "learning_rate": 5.078727852976781e-06, + "loss": 0.4453, + "step": 111900 + }, + { + "epoch": 2.4639753602463976, + "grad_norm": 2.708963632583618, + "learning_rate": 5.07432596016287e-06, + "loss": 0.4375, + "step": 112000 + }, + { + "epoch": 2.4661753382466176, + "grad_norm": 2.9427683353424072, + "learning_rate": 5.06992406734896e-06, + "loss": 0.4419, + "step": 112100 + }, + { + "epoch": 2.4683753162468376, + "grad_norm": 3.154421329498291, + "learning_rate": 5.065522174535051e-06, + "loss": 0.4539, + "step": 112200 + }, + { + "epoch": 2.4705752942470576, + "grad_norm": 2.364830255508423, + "learning_rate": 5.06112028172114e-06, + "loss": 0.4535, + "step": 112300 + }, + { + "epoch": 2.4727752722472776, + "grad_norm": 3.534681797027588, + "learning_rate": 5.05671838890723e-06, + "loss": 0.4457, + "step": 112400 + }, + { + "epoch": 2.4749752502474975, + "grad_norm": 3.2701926231384277, + "learning_rate": 5.052316496093321e-06, + "loss": 0.447, + "step": 112500 + }, + { + "epoch": 2.4771752282477175, + "grad_norm": 2.395355701446533, + "learning_rate": 5.04791460327941e-06, + "loss": 0.4399, + "step": 112600 + }, + { + "epoch": 2.4793752062479375, + "grad_norm": 3.506509780883789, + "learning_rate": 5.0435127104655e-06, + "loss": 0.4527, + "step": 112700 + }, + { + "epoch": 2.4815751842481575, + "grad_norm": 2.844534397125244, + "learning_rate": 5.03911081765159e-06, + "loss": 0.4436, + "step": 112800 + }, + { + "epoch": 2.4837751622483775, + "grad_norm": 3.3356661796569824, + "learning_rate": 5.03470892483768e-06, + "loss": 0.4432, + "step": 112900 + }, + { + "epoch": 2.4859751402485974, + "grad_norm": 2.6262450218200684, + "learning_rate": 5.03030703202377e-06, + "loss": 0.4508, + "step": 113000 + }, + { + "epoch": 2.4881751182488174, + "grad_norm": 2.852914810180664, + "learning_rate": 5.02590513920986e-06, + "loss": 0.453, + "step": 113100 + }, + { + "epoch": 2.4903750962490374, + "grad_norm": 3.224490165710449, + "learning_rate": 5.021503246395951e-06, + "loss": 0.4637, + "step": 113200 + }, + { + "epoch": 2.4925750742492574, + "grad_norm": 2.180025577545166, + "learning_rate": 5.017101353582041e-06, + "loss": 0.4316, + "step": 113300 + }, + { + "epoch": 2.4947750522494774, + "grad_norm": 2.4598264694213867, + "learning_rate": 5.0126994607681314e-06, + "loss": 0.4381, + "step": 113400 + }, + { + "epoch": 2.4969750302496974, + "grad_norm": 2.587557315826416, + "learning_rate": 5.008297567954221e-06, + "loss": 0.4469, + "step": 113500 + }, + { + "epoch": 2.4991750082499173, + "grad_norm": 2.93766450881958, + "learning_rate": 5.003895675140311e-06, + "loss": 0.459, + "step": 113600 + }, + { + "epoch": 2.5013749862501378, + "grad_norm": 2.926692485809326, + "learning_rate": 4.999493782326401e-06, + "loss": 0.4444, + "step": 113700 + }, + { + "epoch": 2.5035749642503573, + "grad_norm": 3.22385311126709, + "learning_rate": 4.995091889512491e-06, + "loss": 0.4395, + "step": 113800 + }, + { + "epoch": 2.5057749422505777, + "grad_norm": 2.241689682006836, + "learning_rate": 4.99068999669858e-06, + "loss": 0.4461, + "step": 113900 + }, + { + "epoch": 2.5079749202507973, + "grad_norm": 3.1882591247558594, + "learning_rate": 4.986288103884671e-06, + "loss": 0.4482, + "step": 114000 + }, + { + "epoch": 2.5101748982510177, + "grad_norm": 2.0357823371887207, + "learning_rate": 4.981886211070761e-06, + "loss": 0.4291, + "step": 114100 + }, + { + "epoch": 2.5123748762512372, + "grad_norm": 3.0307114124298096, + "learning_rate": 4.977484318256851e-06, + "loss": 0.4556, + "step": 114200 + }, + { + "epoch": 2.5145748542514577, + "grad_norm": 3.274482488632202, + "learning_rate": 4.9730824254429414e-06, + "loss": 0.4561, + "step": 114300 + }, + { + "epoch": 2.5167748322516776, + "grad_norm": 2.6364364624023438, + "learning_rate": 4.968680532629031e-06, + "loss": 0.4462, + "step": 114400 + }, + { + "epoch": 2.5189748102518976, + "grad_norm": 3.74102520942688, + "learning_rate": 4.964278639815121e-06, + "loss": 0.4343, + "step": 114500 + }, + { + "epoch": 2.5211747882521176, + "grad_norm": 2.6400420665740967, + "learning_rate": 4.9598767470012114e-06, + "loss": 0.431, + "step": 114600 + }, + { + "epoch": 2.5233747662523376, + "grad_norm": 3.313936948776245, + "learning_rate": 4.955474854187301e-06, + "loss": 0.4361, + "step": 114700 + }, + { + "epoch": 2.5255747442525576, + "grad_norm": 3.1733415126800537, + "learning_rate": 4.951072961373391e-06, + "loss": 0.4346, + "step": 114800 + }, + { + "epoch": 2.5277747222527776, + "grad_norm": 3.2925596237182617, + "learning_rate": 4.946671068559481e-06, + "loss": 0.4382, + "step": 114900 + }, + { + "epoch": 2.5299747002529975, + "grad_norm": 3.0472724437713623, + "learning_rate": 4.942269175745571e-06, + "loss": 0.4294, + "step": 115000 + }, + { + "epoch": 2.5321746782532175, + "grad_norm": 2.684380054473877, + "learning_rate": 4.937867282931661e-06, + "loss": 0.446, + "step": 115100 + }, + { + "epoch": 2.5343746562534375, + "grad_norm": 2.86007022857666, + "learning_rate": 4.933465390117751e-06, + "loss": 0.4445, + "step": 115200 + }, + { + "epoch": 2.5365746342536575, + "grad_norm": 3.1201276779174805, + "learning_rate": 4.929063497303841e-06, + "loss": 0.4363, + "step": 115300 + }, + { + "epoch": 2.5387746122538775, + "grad_norm": 2.558084726333618, + "learning_rate": 4.924661604489931e-06, + "loss": 0.4272, + "step": 115400 + }, + { + "epoch": 2.5409745902540974, + "grad_norm": 3.3204970359802246, + "learning_rate": 4.920259711676021e-06, + "loss": 0.425, + "step": 115500 + }, + { + "epoch": 2.5431745682543174, + "grad_norm": 2.824089288711548, + "learning_rate": 4.915857818862111e-06, + "loss": 0.4428, + "step": 115600 + }, + { + "epoch": 2.5453745462545374, + "grad_norm": 3.430072784423828, + "learning_rate": 4.911455926048201e-06, + "loss": 0.435, + "step": 115700 + }, + { + "epoch": 2.5475745242547574, + "grad_norm": 3.415888547897339, + "learning_rate": 4.9070540332342914e-06, + "loss": 0.4546, + "step": 115800 + }, + { + "epoch": 2.5497745022549774, + "grad_norm": 3.0503039360046387, + "learning_rate": 4.902652140420382e-06, + "loss": 0.4359, + "step": 115900 + }, + { + "epoch": 2.5519744802551974, + "grad_norm": 3.0880868434906006, + "learning_rate": 4.898250247606471e-06, + "loss": 0.445, + "step": 116000 + }, + { + "epoch": 2.5541744582554173, + "grad_norm": 3.1325924396514893, + "learning_rate": 4.8938483547925614e-06, + "loss": 0.4339, + "step": 116100 + }, + { + "epoch": 2.5563744362556373, + "grad_norm": 3.6278367042541504, + "learning_rate": 4.889446461978652e-06, + "loss": 0.4573, + "step": 116200 + }, + { + "epoch": 2.5585744142558573, + "grad_norm": 2.8526251316070557, + "learning_rate": 4.885044569164741e-06, + "loss": 0.4359, + "step": 116300 + }, + { + "epoch": 2.5607743922560773, + "grad_norm": 2.335749864578247, + "learning_rate": 4.8806426763508314e-06, + "loss": 0.4336, + "step": 116400 + }, + { + "epoch": 2.5629743702562973, + "grad_norm": 2.912464141845703, + "learning_rate": 4.876240783536922e-06, + "loss": 0.4463, + "step": 116500 + }, + { + "epoch": 2.5651743482565177, + "grad_norm": 3.1221654415130615, + "learning_rate": 4.871838890723011e-06, + "loss": 0.4394, + "step": 116600 + }, + { + "epoch": 2.5673743262567372, + "grad_norm": 3.0189766883850098, + "learning_rate": 4.8674369979091014e-06, + "loss": 0.4279, + "step": 116700 + }, + { + "epoch": 2.5695743042569577, + "grad_norm": 2.7130327224731445, + "learning_rate": 4.863035105095191e-06, + "loss": 0.4397, + "step": 116800 + }, + { + "epoch": 2.571774282257177, + "grad_norm": 3.0579302310943604, + "learning_rate": 4.858633212281281e-06, + "loss": 0.4509, + "step": 116900 + }, + { + "epoch": 2.5739742602573976, + "grad_norm": 2.568333625793457, + "learning_rate": 4.8542313194673714e-06, + "loss": 0.4391, + "step": 117000 + }, + { + "epoch": 2.576174238257617, + "grad_norm": 3.574082612991333, + "learning_rate": 4.849829426653461e-06, + "loss": 0.4354, + "step": 117100 + }, + { + "epoch": 2.5783742162578376, + "grad_norm": 2.516016721725464, + "learning_rate": 4.845427533839551e-06, + "loss": 0.4417, + "step": 117200 + }, + { + "epoch": 2.5805741942580576, + "grad_norm": 2.464613199234009, + "learning_rate": 4.8410256410256414e-06, + "loss": 0.4464, + "step": 117300 + }, + { + "epoch": 2.5827741722582775, + "grad_norm": 2.841379404067993, + "learning_rate": 4.836623748211731e-06, + "loss": 0.45, + "step": 117400 + }, + { + "epoch": 2.5849741502584975, + "grad_norm": 2.6032309532165527, + "learning_rate": 4.832221855397821e-06, + "loss": 0.4258, + "step": 117500 + }, + { + "epoch": 2.5871741282587175, + "grad_norm": 2.964139938354492, + "learning_rate": 4.8278199625839114e-06, + "loss": 0.4495, + "step": 117600 + }, + { + "epoch": 2.5893741062589375, + "grad_norm": 4.020530700683594, + "learning_rate": 4.823418069770002e-06, + "loss": 0.435, + "step": 117700 + }, + { + "epoch": 2.5915740842591575, + "grad_norm": 2.493910312652588, + "learning_rate": 4.819016176956092e-06, + "loss": 0.4371, + "step": 117800 + }, + { + "epoch": 2.5937740622593775, + "grad_norm": 3.542248249053955, + "learning_rate": 4.8146142841421814e-06, + "loss": 0.4489, + "step": 117900 + }, + { + "epoch": 2.5959740402595974, + "grad_norm": 3.5736639499664307, + "learning_rate": 4.810212391328272e-06, + "loss": 0.43, + "step": 118000 + }, + { + "epoch": 2.5981740182598174, + "grad_norm": 2.6904780864715576, + "learning_rate": 4.805810498514362e-06, + "loss": 0.4326, + "step": 118100 + }, + { + "epoch": 2.6003739962600374, + "grad_norm": 3.0727078914642334, + "learning_rate": 4.8014086057004514e-06, + "loss": 0.4338, + "step": 118200 + }, + { + "epoch": 2.6025739742602574, + "grad_norm": 3.355623722076416, + "learning_rate": 4.797006712886542e-06, + "loss": 0.4489, + "step": 118300 + }, + { + "epoch": 2.6047739522604774, + "grad_norm": 2.657305955886841, + "learning_rate": 4.792604820072632e-06, + "loss": 0.4416, + "step": 118400 + }, + { + "epoch": 2.6069739302606973, + "grad_norm": 2.6770079135894775, + "learning_rate": 4.7882029272587214e-06, + "loss": 0.4317, + "step": 118500 + }, + { + "epoch": 2.6091739082609173, + "grad_norm": 3.4358301162719727, + "learning_rate": 4.783801034444812e-06, + "loss": 0.4307, + "step": 118600 + }, + { + "epoch": 2.6113738862611373, + "grad_norm": 2.719377040863037, + "learning_rate": 4.779399141630901e-06, + "loss": 0.4452, + "step": 118700 + }, + { + "epoch": 2.6135738642613573, + "grad_norm": 3.2705419063568115, + "learning_rate": 4.7749972488169914e-06, + "loss": 0.4391, + "step": 118800 + }, + { + "epoch": 2.6157738422615773, + "grad_norm": 2.5769264698028564, + "learning_rate": 4.770595356003082e-06, + "loss": 0.4566, + "step": 118900 + }, + { + "epoch": 2.6179738202617973, + "grad_norm": 3.0442352294921875, + "learning_rate": 4.766193463189171e-06, + "loss": 0.4394, + "step": 119000 + }, + { + "epoch": 2.6201737982620172, + "grad_norm": 2.913459539413452, + "learning_rate": 4.7617915703752614e-06, + "loss": 0.4445, + "step": 119100 + }, + { + "epoch": 2.622373776262237, + "grad_norm": 2.6879146099090576, + "learning_rate": 4.757389677561352e-06, + "loss": 0.4433, + "step": 119200 + }, + { + "epoch": 2.6245737542624576, + "grad_norm": 3.6051576137542725, + "learning_rate": 4.752987784747442e-06, + "loss": 0.4385, + "step": 119300 + }, + { + "epoch": 2.626773732262677, + "grad_norm": 2.6867752075195312, + "learning_rate": 4.748585891933532e-06, + "loss": 0.4569, + "step": 119400 + }, + { + "epoch": 2.6289737102628976, + "grad_norm": 2.795522928237915, + "learning_rate": 4.744183999119622e-06, + "loss": 0.4413, + "step": 119500 + }, + { + "epoch": 2.631173688263117, + "grad_norm": 3.2469236850738525, + "learning_rate": 4.739782106305712e-06, + "loss": 0.4539, + "step": 119600 + }, + { + "epoch": 2.6333736662633376, + "grad_norm": 3.3304011821746826, + "learning_rate": 4.735380213491802e-06, + "loss": 0.46, + "step": 119700 + }, + { + "epoch": 2.635573644263557, + "grad_norm": 3.8114166259765625, + "learning_rate": 4.730978320677892e-06, + "loss": 0.4472, + "step": 119800 + }, + { + "epoch": 2.6377736222637775, + "grad_norm": 3.1319470405578613, + "learning_rate": 4.726576427863982e-06, + "loss": 0.4381, + "step": 119900 + }, + { + "epoch": 2.6399736002639975, + "grad_norm": 3.4921023845672607, + "learning_rate": 4.722174535050072e-06, + "loss": 0.4397, + "step": 120000 + }, + { + "epoch": 2.6421735782642175, + "grad_norm": 2.191277265548706, + "learning_rate": 4.717772642236162e-06, + "loss": 0.4329, + "step": 120100 + }, + { + "epoch": 2.6443735562644375, + "grad_norm": 3.3526830673217773, + "learning_rate": 4.713370749422252e-06, + "loss": 0.4425, + "step": 120200 + }, + { + "epoch": 2.6465735342646575, + "grad_norm": 3.1728663444519043, + "learning_rate": 4.708968856608342e-06, + "loss": 0.4304, + "step": 120300 + }, + { + "epoch": 2.6487735122648774, + "grad_norm": 2.8686399459838867, + "learning_rate": 4.704566963794432e-06, + "loss": 0.4414, + "step": 120400 + }, + { + "epoch": 2.6509734902650974, + "grad_norm": 3.0160744190216064, + "learning_rate": 4.700165070980522e-06, + "loss": 0.4318, + "step": 120500 + }, + { + "epoch": 2.6531734682653174, + "grad_norm": 3.1512398719787598, + "learning_rate": 4.695763178166612e-06, + "loss": 0.4469, + "step": 120600 + }, + { + "epoch": 2.6553734462655374, + "grad_norm": 3.48527193069458, + "learning_rate": 4.691361285352702e-06, + "loss": 0.4239, + "step": 120700 + }, + { + "epoch": 2.6575734242657574, + "grad_norm": 3.1018311977386475, + "learning_rate": 4.686959392538792e-06, + "loss": 0.4555, + "step": 120800 + }, + { + "epoch": 2.6597734022659774, + "grad_norm": 2.867196559906006, + "learning_rate": 4.6825574997248814e-06, + "loss": 0.4392, + "step": 120900 + }, + { + "epoch": 2.6619733802661973, + "grad_norm": 3.6857316493988037, + "learning_rate": 4.6781556069109726e-06, + "loss": 0.4384, + "step": 121000 + }, + { + "epoch": 2.6641733582664173, + "grad_norm": 3.517019748687744, + "learning_rate": 4.673753714097062e-06, + "loss": 0.4323, + "step": 121100 + }, + { + "epoch": 2.6663733362666373, + "grad_norm": 2.7562782764434814, + "learning_rate": 4.669351821283152e-06, + "loss": 0.4425, + "step": 121200 + }, + { + "epoch": 2.6685733142668573, + "grad_norm": 3.2904388904571533, + "learning_rate": 4.6649499284692426e-06, + "loss": 0.4365, + "step": 121300 + }, + { + "epoch": 2.6707732922670773, + "grad_norm": 2.5950496196746826, + "learning_rate": 4.660548035655332e-06, + "loss": 0.4451, + "step": 121400 + }, + { + "epoch": 2.6729732702672973, + "grad_norm": 2.883255958557129, + "learning_rate": 4.656146142841422e-06, + "loss": 0.4327, + "step": 121500 + }, + { + "epoch": 2.6751732482675172, + "grad_norm": 2.8640213012695312, + "learning_rate": 4.6517442500275126e-06, + "loss": 0.4507, + "step": 121600 + }, + { + "epoch": 2.677373226267737, + "grad_norm": 2.547304630279541, + "learning_rate": 4.647342357213602e-06, + "loss": 0.4391, + "step": 121700 + }, + { + "epoch": 2.679573204267957, + "grad_norm": 2.599860668182373, + "learning_rate": 4.642940464399692e-06, + "loss": 0.4351, + "step": 121800 + }, + { + "epoch": 2.681773182268177, + "grad_norm": 2.898108959197998, + "learning_rate": 4.6385385715857826e-06, + "loss": 0.4486, + "step": 121900 + }, + { + "epoch": 2.683973160268397, + "grad_norm": 3.2468693256378174, + "learning_rate": 4.634136678771872e-06, + "loss": 0.4465, + "step": 122000 + }, + { + "epoch": 2.686173138268617, + "grad_norm": 2.5715901851654053, + "learning_rate": 4.629734785957962e-06, + "loss": 0.4464, + "step": 122100 + }, + { + "epoch": 2.6883731162688376, + "grad_norm": 3.2512638568878174, + "learning_rate": 4.6253328931440526e-06, + "loss": 0.436, + "step": 122200 + }, + { + "epoch": 2.690573094269057, + "grad_norm": 3.1215593814849854, + "learning_rate": 4.620931000330142e-06, + "loss": 0.4333, + "step": 122300 + }, + { + "epoch": 2.6927730722692775, + "grad_norm": 3.264613151550293, + "learning_rate": 4.616529107516232e-06, + "loss": 0.4288, + "step": 122400 + }, + { + "epoch": 2.694973050269497, + "grad_norm": 3.0146634578704834, + "learning_rate": 4.6121272147023226e-06, + "loss": 0.4436, + "step": 122500 + }, + { + "epoch": 2.6971730282697175, + "grad_norm": 3.1037158966064453, + "learning_rate": 4.607725321888412e-06, + "loss": 0.4545, + "step": 122600 + }, + { + "epoch": 2.699373006269937, + "grad_norm": 3.4465529918670654, + "learning_rate": 4.603323429074502e-06, + "loss": 0.421, + "step": 122700 + }, + { + "epoch": 2.7015729842701575, + "grad_norm": 3.160284996032715, + "learning_rate": 4.5989215362605926e-06, + "loss": 0.4323, + "step": 122800 + }, + { + "epoch": 2.7037729622703774, + "grad_norm": 2.4882216453552246, + "learning_rate": 4.594519643446683e-06, + "loss": 0.4477, + "step": 122900 + }, + { + "epoch": 2.7059729402705974, + "grad_norm": 2.794928550720215, + "learning_rate": 4.590117750632772e-06, + "loss": 0.4365, + "step": 123000 + }, + { + "epoch": 2.7081729182708174, + "grad_norm": 3.3920443058013916, + "learning_rate": 4.5857158578188626e-06, + "loss": 0.4419, + "step": 123100 + }, + { + "epoch": 2.7103728962710374, + "grad_norm": 3.186394214630127, + "learning_rate": 4.581313965004953e-06, + "loss": 0.4226, + "step": 123200 + }, + { + "epoch": 2.7125728742712574, + "grad_norm": 3.5185060501098633, + "learning_rate": 4.576912072191042e-06, + "loss": 0.4487, + "step": 123300 + }, + { + "epoch": 2.7147728522714774, + "grad_norm": 2.8069283962249756, + "learning_rate": 4.5725101793771326e-06, + "loss": 0.443, + "step": 123400 + }, + { + "epoch": 2.7169728302716973, + "grad_norm": 4.11994743347168, + "learning_rate": 4.568108286563223e-06, + "loss": 0.4387, + "step": 123500 + }, + { + "epoch": 2.7191728082719173, + "grad_norm": 3.6650454998016357, + "learning_rate": 4.563706393749312e-06, + "loss": 0.4431, + "step": 123600 + }, + { + "epoch": 2.7213727862721373, + "grad_norm": 2.7214787006378174, + "learning_rate": 4.5593045009354026e-06, + "loss": 0.4299, + "step": 123700 + }, + { + "epoch": 2.7235727642723573, + "grad_norm": 3.1517221927642822, + "learning_rate": 4.554902608121493e-06, + "loss": 0.4417, + "step": 123800 + }, + { + "epoch": 2.7257727422725773, + "grad_norm": 2.9995832443237305, + "learning_rate": 4.550500715307582e-06, + "loss": 0.4357, + "step": 123900 + }, + { + "epoch": 2.7279727202727972, + "grad_norm": 3.1918044090270996, + "learning_rate": 4.5460988224936726e-06, + "loss": 0.442, + "step": 124000 + }, + { + "epoch": 2.7301726982730172, + "grad_norm": 3.101876735687256, + "learning_rate": 4.541696929679763e-06, + "loss": 0.4333, + "step": 124100 + }, + { + "epoch": 2.732372676273237, + "grad_norm": 3.4324588775634766, + "learning_rate": 4.537295036865852e-06, + "loss": 0.431, + "step": 124200 + }, + { + "epoch": 2.734572654273457, + "grad_norm": 3.6786587238311768, + "learning_rate": 4.5328931440519426e-06, + "loss": 0.4499, + "step": 124300 + }, + { + "epoch": 2.736772632273677, + "grad_norm": 2.661198139190674, + "learning_rate": 4.528491251238033e-06, + "loss": 0.4446, + "step": 124400 + }, + { + "epoch": 2.738972610273897, + "grad_norm": 2.958374500274658, + "learning_rate": 4.524089358424123e-06, + "loss": 0.4364, + "step": 124500 + }, + { + "epoch": 2.741172588274117, + "grad_norm": 3.012861967086792, + "learning_rate": 4.519687465610213e-06, + "loss": 0.432, + "step": 124600 + }, + { + "epoch": 2.743372566274337, + "grad_norm": 2.911194324493408, + "learning_rate": 4.515285572796303e-06, + "loss": 0.4524, + "step": 124700 + }, + { + "epoch": 2.745572544274557, + "grad_norm": 2.891263961791992, + "learning_rate": 4.510883679982393e-06, + "loss": 0.435, + "step": 124800 + }, + { + "epoch": 2.7477725222747775, + "grad_norm": 3.8306422233581543, + "learning_rate": 4.5064817871684826e-06, + "loss": 0.4331, + "step": 124900 + }, + { + "epoch": 2.749972500274997, + "grad_norm": 3.3201122283935547, + "learning_rate": 4.502079894354573e-06, + "loss": 0.4483, + "step": 125000 + }, + { + "epoch": 2.7521724782752175, + "grad_norm": 3.1668500900268555, + "learning_rate": 4.497678001540663e-06, + "loss": 0.4525, + "step": 125100 + }, + { + "epoch": 2.754372456275437, + "grad_norm": 2.9911599159240723, + "learning_rate": 4.4932761087267525e-06, + "loss": 0.426, + "step": 125200 + }, + { + "epoch": 2.7565724342756575, + "grad_norm": 2.79228138923645, + "learning_rate": 4.488874215912843e-06, + "loss": 0.4494, + "step": 125300 + }, + { + "epoch": 2.758772412275877, + "grad_norm": 3.6654934883117676, + "learning_rate": 4.484472323098933e-06, + "loss": 0.4246, + "step": 125400 + }, + { + "epoch": 2.7609723902760974, + "grad_norm": 3.1065101623535156, + "learning_rate": 4.4800704302850225e-06, + "loss": 0.4419, + "step": 125500 + }, + { + "epoch": 2.7631723682763174, + "grad_norm": 3.4590702056884766, + "learning_rate": 4.475668537471113e-06, + "loss": 0.4429, + "step": 125600 + }, + { + "epoch": 2.7653723462765374, + "grad_norm": 2.956099033355713, + "learning_rate": 4.471266644657203e-06, + "loss": 0.444, + "step": 125700 + }, + { + "epoch": 2.7675723242767574, + "grad_norm": 3.131133556365967, + "learning_rate": 4.4668647518432925e-06, + "loss": 0.436, + "step": 125800 + }, + { + "epoch": 2.7697723022769773, + "grad_norm": 2.866058349609375, + "learning_rate": 4.462462859029383e-06, + "loss": 0.4308, + "step": 125900 + }, + { + "epoch": 2.7719722802771973, + "grad_norm": 2.9503538608551025, + "learning_rate": 4.458060966215473e-06, + "loss": 0.4339, + "step": 126000 + }, + { + "epoch": 2.7741722582774173, + "grad_norm": 2.9972116947174072, + "learning_rate": 4.4536590734015625e-06, + "loss": 0.4457, + "step": 126100 + }, + { + "epoch": 2.7763722362776373, + "grad_norm": 2.7631094455718994, + "learning_rate": 4.449257180587654e-06, + "loss": 0.4387, + "step": 126200 + }, + { + "epoch": 2.7785722142778573, + "grad_norm": 2.7902297973632812, + "learning_rate": 4.444855287773743e-06, + "loss": 0.44, + "step": 126300 + }, + { + "epoch": 2.7807721922780773, + "grad_norm": 2.688309907913208, + "learning_rate": 4.440453394959833e-06, + "loss": 0.4432, + "step": 126400 + }, + { + "epoch": 2.7829721702782972, + "grad_norm": 2.6654300689697266, + "learning_rate": 4.436051502145924e-06, + "loss": 0.4347, + "step": 126500 + }, + { + "epoch": 2.7851721482785172, + "grad_norm": 3.6261539459228516, + "learning_rate": 4.431649609332013e-06, + "loss": 0.4267, + "step": 126600 + }, + { + "epoch": 2.787372126278737, + "grad_norm": 3.265857696533203, + "learning_rate": 4.427247716518103e-06, + "loss": 0.4374, + "step": 126700 + }, + { + "epoch": 2.789572104278957, + "grad_norm": 3.311096668243408, + "learning_rate": 4.422845823704193e-06, + "loss": 0.4432, + "step": 126800 + }, + { + "epoch": 2.791772082279177, + "grad_norm": 3.3290562629699707, + "learning_rate": 4.418443930890283e-06, + "loss": 0.4326, + "step": 126900 + }, + { + "epoch": 2.793972060279397, + "grad_norm": 3.593282461166382, + "learning_rate": 4.414042038076373e-06, + "loss": 0.4422, + "step": 127000 + }, + { + "epoch": 2.796172038279617, + "grad_norm": 2.8509931564331055, + "learning_rate": 4.409640145262463e-06, + "loss": 0.4443, + "step": 127100 + }, + { + "epoch": 2.798372016279837, + "grad_norm": 2.7307536602020264, + "learning_rate": 4.405238252448553e-06, + "loss": 0.4362, + "step": 127200 + }, + { + "epoch": 2.800571994280057, + "grad_norm": 3.677067518234253, + "learning_rate": 4.400836359634643e-06, + "loss": 0.4391, + "step": 127300 + }, + { + "epoch": 2.802771972280277, + "grad_norm": 3.1093156337738037, + "learning_rate": 4.396434466820733e-06, + "loss": 0.4299, + "step": 127400 + }, + { + "epoch": 2.804971950280497, + "grad_norm": 3.1432149410247803, + "learning_rate": 4.392032574006823e-06, + "loss": 0.4454, + "step": 127500 + }, + { + "epoch": 2.807171928280717, + "grad_norm": 3.175234079360962, + "learning_rate": 4.387630681192913e-06, + "loss": 0.4361, + "step": 127600 + }, + { + "epoch": 2.809371906280937, + "grad_norm": 3.7147324085235596, + "learning_rate": 4.383228788379003e-06, + "loss": 0.434, + "step": 127700 + }, + { + "epoch": 2.8115718842811575, + "grad_norm": 3.4218947887420654, + "learning_rate": 4.378826895565093e-06, + "loss": 0.4392, + "step": 127800 + }, + { + "epoch": 2.813771862281377, + "grad_norm": 4.2883195877075195, + "learning_rate": 4.374425002751183e-06, + "loss": 0.4254, + "step": 127900 + }, + { + "epoch": 2.8159718402815974, + "grad_norm": 2.9192450046539307, + "learning_rate": 4.370023109937274e-06, + "loss": 0.4596, + "step": 128000 + }, + { + "epoch": 2.818171818281817, + "grad_norm": 3.3479373455047607, + "learning_rate": 4.365621217123364e-06, + "loss": 0.4299, + "step": 128100 + }, + { + "epoch": 2.8203717962820374, + "grad_norm": 3.0389039516448975, + "learning_rate": 4.361219324309453e-06, + "loss": 0.4348, + "step": 128200 + }, + { + "epoch": 2.822571774282257, + "grad_norm": 3.236820697784424, + "learning_rate": 4.356817431495544e-06, + "loss": 0.4244, + "step": 128300 + }, + { + "epoch": 2.8247717522824773, + "grad_norm": 3.319667339324951, + "learning_rate": 4.352415538681634e-06, + "loss": 0.4316, + "step": 128400 + }, + { + "epoch": 2.8269717302826973, + "grad_norm": 3.5438737869262695, + "learning_rate": 4.348013645867723e-06, + "loss": 0.4427, + "step": 128500 + }, + { + "epoch": 2.8291717082829173, + "grad_norm": 2.995439052581787, + "learning_rate": 4.343611753053814e-06, + "loss": 0.4402, + "step": 128600 + }, + { + "epoch": 2.8313716862831373, + "grad_norm": 2.894104480743408, + "learning_rate": 4.339209860239904e-06, + "loss": 0.4351, + "step": 128700 + }, + { + "epoch": 2.8335716642833573, + "grad_norm": 3.5675222873687744, + "learning_rate": 4.334807967425993e-06, + "loss": 0.4359, + "step": 128800 + }, + { + "epoch": 2.8357716422835773, + "grad_norm": 2.749405860900879, + "learning_rate": 4.330406074612084e-06, + "loss": 0.4353, + "step": 128900 + }, + { + "epoch": 2.8379716202837972, + "grad_norm": 3.581648826599121, + "learning_rate": 4.326004181798173e-06, + "loss": 0.4465, + "step": 129000 + }, + { + "epoch": 2.840171598284017, + "grad_norm": 2.5396058559417725, + "learning_rate": 4.321602288984263e-06, + "loss": 0.4505, + "step": 129100 + }, + { + "epoch": 2.842371576284237, + "grad_norm": 3.2663464546203613, + "learning_rate": 4.317200396170354e-06, + "loss": 0.4315, + "step": 129200 + }, + { + "epoch": 2.844571554284457, + "grad_norm": 3.687699556350708, + "learning_rate": 4.312798503356443e-06, + "loss": 0.4341, + "step": 129300 + }, + { + "epoch": 2.846771532284677, + "grad_norm": 3.822061538696289, + "learning_rate": 4.308396610542533e-06, + "loss": 0.4204, + "step": 129400 + }, + { + "epoch": 2.848971510284897, + "grad_norm": 4.063410758972168, + "learning_rate": 4.303994717728624e-06, + "loss": 0.4281, + "step": 129500 + }, + { + "epoch": 2.851171488285117, + "grad_norm": 3.304727554321289, + "learning_rate": 4.299592824914713e-06, + "loss": 0.4276, + "step": 129600 + }, + { + "epoch": 2.853371466285337, + "grad_norm": 3.195687770843506, + "learning_rate": 4.295190932100804e-06, + "loss": 0.4195, + "step": 129700 + }, + { + "epoch": 2.855571444285557, + "grad_norm": 3.390817403793335, + "learning_rate": 4.290789039286894e-06, + "loss": 0.4442, + "step": 129800 + }, + { + "epoch": 2.857771422285777, + "grad_norm": 3.095522403717041, + "learning_rate": 4.286387146472984e-06, + "loss": 0.4459, + "step": 129900 + }, + { + "epoch": 2.859971400285997, + "grad_norm": 2.7765722274780273, + "learning_rate": 4.281985253659074e-06, + "loss": 0.4347, + "step": 130000 + }, + { + "epoch": 2.862171378286217, + "grad_norm": 3.3501715660095215, + "learning_rate": 4.277583360845164e-06, + "loss": 0.4315, + "step": 130100 + }, + { + "epoch": 2.864371356286437, + "grad_norm": 2.8992860317230225, + "learning_rate": 4.273181468031254e-06, + "loss": 0.4196, + "step": 130200 + }, + { + "epoch": 2.866571334286657, + "grad_norm": 3.240837335586548, + "learning_rate": 4.268779575217344e-06, + "loss": 0.4253, + "step": 130300 + }, + { + "epoch": 2.868771312286877, + "grad_norm": 2.687161445617676, + "learning_rate": 4.264377682403434e-06, + "loss": 0.4297, + "step": 130400 + }, + { + "epoch": 2.870971290287097, + "grad_norm": 3.1937789916992188, + "learning_rate": 4.259975789589524e-06, + "loss": 0.4367, + "step": 130500 + }, + { + "epoch": 2.873171268287317, + "grad_norm": 2.9205288887023926, + "learning_rate": 4.255573896775614e-06, + "loss": 0.434, + "step": 130600 + }, + { + "epoch": 2.8753712462875374, + "grad_norm": 2.830369710922241, + "learning_rate": 4.251172003961704e-06, + "loss": 0.4487, + "step": 130700 + }, + { + "epoch": 2.877571224287757, + "grad_norm": 3.458214044570923, + "learning_rate": 4.246770111147794e-06, + "loss": 0.4326, + "step": 130800 + }, + { + "epoch": 2.8797712022879773, + "grad_norm": 3.2541399002075195, + "learning_rate": 4.242368218333883e-06, + "loss": 0.4541, + "step": 130900 + }, + { + "epoch": 2.881971180288197, + "grad_norm": 3.3345773220062256, + "learning_rate": 4.237966325519974e-06, + "loss": 0.434, + "step": 131000 + }, + { + "epoch": 2.8841711582884173, + "grad_norm": 3.172386646270752, + "learning_rate": 4.233564432706064e-06, + "loss": 0.4399, + "step": 131100 + }, + { + "epoch": 2.886371136288637, + "grad_norm": 2.489182472229004, + "learning_rate": 4.229162539892153e-06, + "loss": 0.4324, + "step": 131200 + }, + { + "epoch": 2.8885711142888573, + "grad_norm": 3.0109496116638184, + "learning_rate": 4.224760647078244e-06, + "loss": 0.4342, + "step": 131300 + }, + { + "epoch": 2.8907710922890772, + "grad_norm": 3.0989527702331543, + "learning_rate": 4.220358754264334e-06, + "loss": 0.4317, + "step": 131400 + }, + { + "epoch": 2.8929710702892972, + "grad_norm": 4.138854026794434, + "learning_rate": 4.215956861450424e-06, + "loss": 0.4243, + "step": 131500 + }, + { + "epoch": 2.895171048289517, + "grad_norm": 2.925975799560547, + "learning_rate": 4.2115549686365145e-06, + "loss": 0.4307, + "step": 131600 + }, + { + "epoch": 2.897371026289737, + "grad_norm": 2.7254014015197754, + "learning_rate": 4.207153075822604e-06, + "loss": 0.4311, + "step": 131700 + }, + { + "epoch": 2.899571004289957, + "grad_norm": 3.5956203937530518, + "learning_rate": 4.202751183008694e-06, + "loss": 0.43, + "step": 131800 + }, + { + "epoch": 2.901770982290177, + "grad_norm": 3.406620502471924, + "learning_rate": 4.1983492901947845e-06, + "loss": 0.4321, + "step": 131900 + }, + { + "epoch": 2.903970960290397, + "grad_norm": 3.0268537998199463, + "learning_rate": 4.193947397380874e-06, + "loss": 0.4328, + "step": 132000 + }, + { + "epoch": 2.906170938290617, + "grad_norm": 3.0812931060791016, + "learning_rate": 4.189545504566964e-06, + "loss": 0.4443, + "step": 132100 + }, + { + "epoch": 2.908370916290837, + "grad_norm": 2.5374112129211426, + "learning_rate": 4.1851436117530545e-06, + "loss": 0.4324, + "step": 132200 + }, + { + "epoch": 2.910570894291057, + "grad_norm": 2.906034231185913, + "learning_rate": 4.180741718939144e-06, + "loss": 0.4358, + "step": 132300 + }, + { + "epoch": 2.912770872291277, + "grad_norm": 3.593029499053955, + "learning_rate": 4.176339826125234e-06, + "loss": 0.4517, + "step": 132400 + }, + { + "epoch": 2.914970850291497, + "grad_norm": 2.914520025253296, + "learning_rate": 4.1719379333113245e-06, + "loss": 0.4485, + "step": 132500 + }, + { + "epoch": 2.917170828291717, + "grad_norm": 2.874202013015747, + "learning_rate": 4.167536040497414e-06, + "loss": 0.4506, + "step": 132600 + }, + { + "epoch": 2.919370806291937, + "grad_norm": 3.172924041748047, + "learning_rate": 4.163134147683504e-06, + "loss": 0.4416, + "step": 132700 + }, + { + "epoch": 2.921570784292157, + "grad_norm": 4.034905433654785, + "learning_rate": 4.1587322548695945e-06, + "loss": 0.4297, + "step": 132800 + }, + { + "epoch": 2.923770762292377, + "grad_norm": 2.940948963165283, + "learning_rate": 4.154330362055684e-06, + "loss": 0.4277, + "step": 132900 + }, + { + "epoch": 2.925970740292597, + "grad_norm": 4.134010314941406, + "learning_rate": 4.149928469241774e-06, + "loss": 0.4206, + "step": 133000 + }, + { + "epoch": 2.928170718292817, + "grad_norm": 3.641511917114258, + "learning_rate": 4.145526576427864e-06, + "loss": 0.4433, + "step": 133100 + }, + { + "epoch": 2.930370696293037, + "grad_norm": 3.1284308433532715, + "learning_rate": 4.141124683613955e-06, + "loss": 0.42, + "step": 133200 + }, + { + "epoch": 2.932570674293257, + "grad_norm": 3.499300956726074, + "learning_rate": 4.136722790800044e-06, + "loss": 0.4429, + "step": 133300 + }, + { + "epoch": 2.9347706522934773, + "grad_norm": 3.7782890796661377, + "learning_rate": 4.1323208979861345e-06, + "loss": 0.4207, + "step": 133400 + }, + { + "epoch": 2.936970630293697, + "grad_norm": 2.8001630306243896, + "learning_rate": 4.127919005172225e-06, + "loss": 0.423, + "step": 133500 + }, + { + "epoch": 2.9391706082939173, + "grad_norm": 3.0386412143707275, + "learning_rate": 4.123517112358314e-06, + "loss": 0.4353, + "step": 133600 + }, + { + "epoch": 2.941370586294137, + "grad_norm": 3.480564594268799, + "learning_rate": 4.1191152195444045e-06, + "loss": 0.4373, + "step": 133700 + }, + { + "epoch": 2.9435705642943573, + "grad_norm": 3.148545503616333, + "learning_rate": 4.114713326730495e-06, + "loss": 0.4359, + "step": 133800 + }, + { + "epoch": 2.945770542294577, + "grad_norm": 2.8668603897094727, + "learning_rate": 4.110311433916584e-06, + "loss": 0.4435, + "step": 133900 + }, + { + "epoch": 2.9479705202947972, + "grad_norm": 3.410372495651245, + "learning_rate": 4.1059095411026745e-06, + "loss": 0.4515, + "step": 134000 + }, + { + "epoch": 2.950170498295017, + "grad_norm": 3.0960798263549805, + "learning_rate": 4.101507648288765e-06, + "loss": 0.4359, + "step": 134100 + }, + { + "epoch": 2.952370476295237, + "grad_norm": 2.3949267864227295, + "learning_rate": 4.097105755474854e-06, + "loss": 0.4283, + "step": 134200 + }, + { + "epoch": 2.954570454295457, + "grad_norm": 3.325115442276001, + "learning_rate": 4.0927038626609445e-06, + "loss": 0.4281, + "step": 134300 + }, + { + "epoch": 2.956770432295677, + "grad_norm": 3.046936511993408, + "learning_rate": 4.088301969847035e-06, + "loss": 0.4431, + "step": 134400 + }, + { + "epoch": 2.958970410295897, + "grad_norm": 3.0470268726348877, + "learning_rate": 4.083900077033124e-06, + "loss": 0.4235, + "step": 134500 + }, + { + "epoch": 2.961170388296117, + "grad_norm": 2.8730931282043457, + "learning_rate": 4.0794981842192145e-06, + "loss": 0.4248, + "step": 134600 + }, + { + "epoch": 2.963370366296337, + "grad_norm": 2.930630922317505, + "learning_rate": 4.075096291405305e-06, + "loss": 0.4398, + "step": 134700 + }, + { + "epoch": 2.965570344296557, + "grad_norm": 3.920790672302246, + "learning_rate": 4.070694398591394e-06, + "loss": 0.4423, + "step": 134800 + }, + { + "epoch": 2.967770322296777, + "grad_norm": 3.2532870769500732, + "learning_rate": 4.0662925057774845e-06, + "loss": 0.424, + "step": 134900 + }, + { + "epoch": 2.969970300296997, + "grad_norm": 3.4274468421936035, + "learning_rate": 4.061890612963575e-06, + "loss": 0.4408, + "step": 135000 + }, + { + "epoch": 2.972170278297217, + "grad_norm": 3.6045656204223633, + "learning_rate": 4.057488720149665e-06, + "loss": 0.4439, + "step": 135100 + }, + { + "epoch": 2.974370256297437, + "grad_norm": 3.063582181930542, + "learning_rate": 4.0530868273357545e-06, + "loss": 0.4305, + "step": 135200 + }, + { + "epoch": 2.976570234297657, + "grad_norm": 3.4174232482910156, + "learning_rate": 4.048684934521845e-06, + "loss": 0.4334, + "step": 135300 + }, + { + "epoch": 2.978770212297877, + "grad_norm": 2.2402751445770264, + "learning_rate": 4.044283041707935e-06, + "loss": 0.4376, + "step": 135400 + }, + { + "epoch": 2.980970190298097, + "grad_norm": 3.14042067527771, + "learning_rate": 4.0398811488940245e-06, + "loss": 0.443, + "step": 135500 + }, + { + "epoch": 2.983170168298317, + "grad_norm": 4.143354892730713, + "learning_rate": 4.035479256080115e-06, + "loss": 0.4336, + "step": 135600 + }, + { + "epoch": 2.985370146298537, + "grad_norm": 3.4250524044036865, + "learning_rate": 4.031077363266205e-06, + "loss": 0.4286, + "step": 135700 + }, + { + "epoch": 2.987570124298757, + "grad_norm": 3.041456937789917, + "learning_rate": 4.0266754704522945e-06, + "loss": 0.4296, + "step": 135800 + }, + { + "epoch": 2.989770102298977, + "grad_norm": 2.1302220821380615, + "learning_rate": 4.022273577638385e-06, + "loss": 0.4336, + "step": 135900 + }, + { + "epoch": 2.991970080299197, + "grad_norm": 3.928239107131958, + "learning_rate": 4.017871684824475e-06, + "loss": 0.4352, + "step": 136000 + }, + { + "epoch": 2.994170058299417, + "grad_norm": 2.7161359786987305, + "learning_rate": 4.0134697920105645e-06, + "loss": 0.4409, + "step": 136100 + }, + { + "epoch": 2.996370036299637, + "grad_norm": 2.8443000316619873, + "learning_rate": 4.009067899196655e-06, + "loss": 0.4227, + "step": 136200 + }, + { + "epoch": 2.9985700142998573, + "grad_norm": 2.336637020111084, + "learning_rate": 4.004666006382745e-06, + "loss": 0.4296, + "step": 136300 + }, + { + "epoch": 3.000769992300077, + "grad_norm": 2.747061252593994, + "learning_rate": 4.0002641135688345e-06, + "loss": 0.4243, + "step": 136400 + }, + { + "epoch": 3.0029699703002968, + "grad_norm": 2.6186234951019287, + "learning_rate": 3.995862220754925e-06, + "loss": 0.3946, + "step": 136500 + }, + { + "epoch": 3.005169948300517, + "grad_norm": 3.578420400619507, + "learning_rate": 3.991460327941015e-06, + "loss": 0.3841, + "step": 136600 + }, + { + "epoch": 3.007369926300737, + "grad_norm": 3.9675164222717285, + "learning_rate": 3.987058435127105e-06, + "loss": 0.4017, + "step": 136700 + }, + { + "epoch": 3.009569904300957, + "grad_norm": 3.1522490978240967, + "learning_rate": 3.982656542313195e-06, + "loss": 0.3878, + "step": 136800 + }, + { + "epoch": 3.011769882301177, + "grad_norm": 3.3388068675994873, + "learning_rate": 3.978254649499285e-06, + "loss": 0.4005, + "step": 136900 + }, + { + "epoch": 3.013969860301397, + "grad_norm": 3.6714820861816406, + "learning_rate": 3.973852756685375e-06, + "loss": 0.406, + "step": 137000 + }, + { + "epoch": 3.016169838301617, + "grad_norm": 2.9617388248443604, + "learning_rate": 3.969450863871465e-06, + "loss": 0.3915, + "step": 137100 + }, + { + "epoch": 3.018369816301837, + "grad_norm": 3.648895263671875, + "learning_rate": 3.965048971057555e-06, + "loss": 0.3919, + "step": 137200 + }, + { + "epoch": 3.020569794302057, + "grad_norm": 3.127763509750366, + "learning_rate": 3.960647078243645e-06, + "loss": 0.3969, + "step": 137300 + }, + { + "epoch": 3.022769772302277, + "grad_norm": 4.054533004760742, + "learning_rate": 3.956245185429735e-06, + "loss": 0.4023, + "step": 137400 + }, + { + "epoch": 3.024969750302497, + "grad_norm": 3.8178627490997314, + "learning_rate": 3.951843292615825e-06, + "loss": 0.3848, + "step": 137500 + }, + { + "epoch": 3.027169728302717, + "grad_norm": 3.450464963912964, + "learning_rate": 3.947441399801915e-06, + "loss": 0.3983, + "step": 137600 + }, + { + "epoch": 3.029369706302937, + "grad_norm": 3.210991144180298, + "learning_rate": 3.943039506988005e-06, + "loss": 0.3993, + "step": 137700 + }, + { + "epoch": 3.031569684303157, + "grad_norm": 3.706838369369507, + "learning_rate": 3.938637614174095e-06, + "loss": 0.3916, + "step": 137800 + }, + { + "epoch": 3.033769662303377, + "grad_norm": 3.306352138519287, + "learning_rate": 3.934235721360185e-06, + "loss": 0.404, + "step": 137900 + }, + { + "epoch": 3.035969640303597, + "grad_norm": 4.016099452972412, + "learning_rate": 3.929833828546275e-06, + "loss": 0.4058, + "step": 138000 + }, + { + "epoch": 3.038169618303817, + "grad_norm": 3.057190418243408, + "learning_rate": 3.925431935732365e-06, + "loss": 0.4105, + "step": 138100 + }, + { + "epoch": 3.040369596304037, + "grad_norm": 3.2745933532714844, + "learning_rate": 3.921030042918455e-06, + "loss": 0.3888, + "step": 138200 + }, + { + "epoch": 3.042569574304257, + "grad_norm": 3.0036211013793945, + "learning_rate": 3.916628150104545e-06, + "loss": 0.4039, + "step": 138300 + }, + { + "epoch": 3.044769552304477, + "grad_norm": 3.3982667922973633, + "learning_rate": 3.912226257290636e-06, + "loss": 0.3955, + "step": 138400 + }, + { + "epoch": 3.046969530304697, + "grad_norm": 2.8318960666656494, + "learning_rate": 3.907824364476725e-06, + "loss": 0.406, + "step": 138500 + }, + { + "epoch": 3.049169508304917, + "grad_norm": 2.669373035430908, + "learning_rate": 3.903422471662816e-06, + "loss": 0.4009, + "step": 138600 + }, + { + "epoch": 3.051369486305137, + "grad_norm": 3.476454257965088, + "learning_rate": 3.899020578848906e-06, + "loss": 0.3958, + "step": 138700 + }, + { + "epoch": 3.053569464305357, + "grad_norm": 3.4908926486968994, + "learning_rate": 3.894618686034995e-06, + "loss": 0.4008, + "step": 138800 + }, + { + "epoch": 3.055769442305577, + "grad_norm": 3.15459942817688, + "learning_rate": 3.890216793221086e-06, + "loss": 0.3928, + "step": 138900 + }, + { + "epoch": 3.0579694203057968, + "grad_norm": 3.2345471382141113, + "learning_rate": 3.885814900407175e-06, + "loss": 0.4051, + "step": 139000 + }, + { + "epoch": 3.0601693983060168, + "grad_norm": 3.2796826362609863, + "learning_rate": 3.881413007593265e-06, + "loss": 0.3985, + "step": 139100 + }, + { + "epoch": 3.0623693763062367, + "grad_norm": 3.1456501483917236, + "learning_rate": 3.877011114779356e-06, + "loss": 0.4006, + "step": 139200 + }, + { + "epoch": 3.0645693543064567, + "grad_norm": 3.5905213356018066, + "learning_rate": 3.872609221965445e-06, + "loss": 0.4005, + "step": 139300 + }, + { + "epoch": 3.066769332306677, + "grad_norm": 3.593623399734497, + "learning_rate": 3.868207329151535e-06, + "loss": 0.3984, + "step": 139400 + }, + { + "epoch": 3.068969310306897, + "grad_norm": 3.059357166290283, + "learning_rate": 3.863805436337626e-06, + "loss": 0.4105, + "step": 139500 + }, + { + "epoch": 3.071169288307117, + "grad_norm": 3.4862234592437744, + "learning_rate": 3.859403543523715e-06, + "loss": 0.3943, + "step": 139600 + }, + { + "epoch": 3.073369266307337, + "grad_norm": 3.381134033203125, + "learning_rate": 3.855001650709805e-06, + "loss": 0.3865, + "step": 139700 + }, + { + "epoch": 3.075569244307557, + "grad_norm": 3.13862681388855, + "learning_rate": 3.850599757895896e-06, + "loss": 0.3895, + "step": 139800 + }, + { + "epoch": 3.077769222307777, + "grad_norm": 3.6578209400177, + "learning_rate": 3.846197865081985e-06, + "loss": 0.3972, + "step": 139900 + }, + { + "epoch": 3.079969200307997, + "grad_norm": 3.353710174560547, + "learning_rate": 3.841795972268075e-06, + "loss": 0.3935, + "step": 140000 + }, + { + "epoch": 3.082169178308217, + "grad_norm": 3.3863718509674072, + "learning_rate": 3.837394079454166e-06, + "loss": 0.401, + "step": 140100 + }, + { + "epoch": 3.084369156308437, + "grad_norm": 3.574791669845581, + "learning_rate": 3.832992186640256e-06, + "loss": 0.3932, + "step": 140200 + }, + { + "epoch": 3.086569134308657, + "grad_norm": 3.1452407836914062, + "learning_rate": 3.828590293826346e-06, + "loss": 0.3982, + "step": 140300 + }, + { + "epoch": 3.088769112308877, + "grad_norm": 3.7013602256774902, + "learning_rate": 3.824188401012436e-06, + "loss": 0.3871, + "step": 140400 + }, + { + "epoch": 3.090969090309097, + "grad_norm": 3.909804105758667, + "learning_rate": 3.819786508198526e-06, + "loss": 0.3843, + "step": 140500 + }, + { + "epoch": 3.093169068309317, + "grad_norm": 3.576997995376587, + "learning_rate": 3.815384615384616e-06, + "loss": 0.3895, + "step": 140600 + }, + { + "epoch": 3.095369046309537, + "grad_norm": 3.9983808994293213, + "learning_rate": 3.8109827225707056e-06, + "loss": 0.3864, + "step": 140700 + }, + { + "epoch": 3.097569024309757, + "grad_norm": 3.6163980960845947, + "learning_rate": 3.806580829756796e-06, + "loss": 0.3903, + "step": 140800 + }, + { + "epoch": 3.099769002309977, + "grad_norm": 3.2333366870880127, + "learning_rate": 3.8021789369428858e-06, + "loss": 0.3939, + "step": 140900 + }, + { + "epoch": 3.101968980310197, + "grad_norm": 3.2053537368774414, + "learning_rate": 3.7977770441289756e-06, + "loss": 0.3983, + "step": 141000 + }, + { + "epoch": 3.104168958310417, + "grad_norm": 3.423635721206665, + "learning_rate": 3.793375151315066e-06, + "loss": 0.3937, + "step": 141100 + }, + { + "epoch": 3.106368936310637, + "grad_norm": 3.9658424854278564, + "learning_rate": 3.7889732585011558e-06, + "loss": 0.3952, + "step": 141200 + }, + { + "epoch": 3.108568914310857, + "grad_norm": 3.184368848800659, + "learning_rate": 3.7845713656872456e-06, + "loss": 0.3854, + "step": 141300 + }, + { + "epoch": 3.110768892311077, + "grad_norm": 3.8191583156585693, + "learning_rate": 3.7801694728733355e-06, + "loss": 0.3965, + "step": 141400 + }, + { + "epoch": 3.1129688703112968, + "grad_norm": 3.2895469665527344, + "learning_rate": 3.7757675800594258e-06, + "loss": 0.4101, + "step": 141500 + }, + { + "epoch": 3.1151688483115167, + "grad_norm": 3.220507860183716, + "learning_rate": 3.7713656872455156e-06, + "loss": 0.3977, + "step": 141600 + }, + { + "epoch": 3.1173688263117367, + "grad_norm": 3.0123960971832275, + "learning_rate": 3.7669637944316055e-06, + "loss": 0.4042, + "step": 141700 + }, + { + "epoch": 3.1195688043119567, + "grad_norm": 3.4893038272857666, + "learning_rate": 3.762561901617696e-06, + "loss": 0.4052, + "step": 141800 + }, + { + "epoch": 3.1217687823121767, + "grad_norm": 3.987698793411255, + "learning_rate": 3.758160008803786e-06, + "loss": 0.3979, + "step": 141900 + }, + { + "epoch": 3.1239687603123967, + "grad_norm": 3.5184412002563477, + "learning_rate": 3.7537581159898763e-06, + "loss": 0.4114, + "step": 142000 + }, + { + "epoch": 3.126168738312617, + "grad_norm": 4.024544715881348, + "learning_rate": 3.749356223175966e-06, + "loss": 0.3996, + "step": 142100 + }, + { + "epoch": 3.1283687163128366, + "grad_norm": 3.358962059020996, + "learning_rate": 3.744954330362056e-06, + "loss": 0.3981, + "step": 142200 + }, + { + "epoch": 3.130568694313057, + "grad_norm": 2.8024742603302, + "learning_rate": 3.7405524375481463e-06, + "loss": 0.402, + "step": 142300 + }, + { + "epoch": 3.132768672313277, + "grad_norm": 3.1832938194274902, + "learning_rate": 3.736150544734236e-06, + "loss": 0.3951, + "step": 142400 + }, + { + "epoch": 3.134968650313497, + "grad_norm": 3.7349486351013184, + "learning_rate": 3.731748651920326e-06, + "loss": 0.3911, + "step": 142500 + }, + { + "epoch": 3.137168628313717, + "grad_norm": 3.722146987915039, + "learning_rate": 3.727346759106416e-06, + "loss": 0.3991, + "step": 142600 + }, + { + "epoch": 3.139368606313937, + "grad_norm": 3.13198184967041, + "learning_rate": 3.722944866292506e-06, + "loss": 0.3937, + "step": 142700 + }, + { + "epoch": 3.141568584314157, + "grad_norm": 3.511359453201294, + "learning_rate": 3.718542973478596e-06, + "loss": 0.3989, + "step": 142800 + }, + { + "epoch": 3.143768562314377, + "grad_norm": 4.363528251647949, + "learning_rate": 3.714141080664686e-06, + "loss": 0.4041, + "step": 142900 + }, + { + "epoch": 3.145968540314597, + "grad_norm": 3.696638584136963, + "learning_rate": 3.709739187850776e-06, + "loss": 0.3971, + "step": 143000 + }, + { + "epoch": 3.148168518314817, + "grad_norm": 3.170654058456421, + "learning_rate": 3.705337295036866e-06, + "loss": 0.4002, + "step": 143100 + }, + { + "epoch": 3.150368496315037, + "grad_norm": 4.185492992401123, + "learning_rate": 3.700935402222956e-06, + "loss": 0.3912, + "step": 143200 + }, + { + "epoch": 3.152568474315257, + "grad_norm": 3.829686403274536, + "learning_rate": 3.696533509409046e-06, + "loss": 0.3914, + "step": 143300 + }, + { + "epoch": 3.154768452315477, + "grad_norm": 2.9345476627349854, + "learning_rate": 3.692131616595136e-06, + "loss": 0.3962, + "step": 143400 + }, + { + "epoch": 3.156968430315697, + "grad_norm": 3.666574239730835, + "learning_rate": 3.687729723781226e-06, + "loss": 0.401, + "step": 143500 + }, + { + "epoch": 3.159168408315917, + "grad_norm": 3.359739303588867, + "learning_rate": 3.6833278309673166e-06, + "loss": 0.3992, + "step": 143600 + }, + { + "epoch": 3.161368386316137, + "grad_norm": 4.227367877960205, + "learning_rate": 3.6789259381534065e-06, + "loss": 0.401, + "step": 143700 + }, + { + "epoch": 3.163568364316357, + "grad_norm": 2.7858405113220215, + "learning_rate": 3.6745240453394963e-06, + "loss": 0.3907, + "step": 143800 + }, + { + "epoch": 3.1657683423165768, + "grad_norm": 3.08479905128479, + "learning_rate": 3.6701221525255866e-06, + "loss": 0.395, + "step": 143900 + }, + { + "epoch": 3.1679683203167968, + "grad_norm": 3.606621265411377, + "learning_rate": 3.6657202597116765e-06, + "loss": 0.3916, + "step": 144000 + }, + { + "epoch": 3.1701682983170167, + "grad_norm": 4.141706466674805, + "learning_rate": 3.6613183668977663e-06, + "loss": 0.3987, + "step": 144100 + }, + { + "epoch": 3.1723682763172367, + "grad_norm": 3.2608320713043213, + "learning_rate": 3.6569164740838566e-06, + "loss": 0.4112, + "step": 144200 + }, + { + "epoch": 3.1745682543174567, + "grad_norm": 3.0765554904937744, + "learning_rate": 3.6525145812699465e-06, + "loss": 0.4057, + "step": 144300 + }, + { + "epoch": 3.1767682323176767, + "grad_norm": 3.198472738265991, + "learning_rate": 3.6481126884560363e-06, + "loss": 0.3928, + "step": 144400 + }, + { + "epoch": 3.1789682103178967, + "grad_norm": 3.3553693294525146, + "learning_rate": 3.643710795642126e-06, + "loss": 0.3934, + "step": 144500 + }, + { + "epoch": 3.1811681883181167, + "grad_norm": 3.7630527019500732, + "learning_rate": 3.6393089028282165e-06, + "loss": 0.3957, + "step": 144600 + }, + { + "epoch": 3.1833681663183366, + "grad_norm": 3.3282408714294434, + "learning_rate": 3.6349070100143063e-06, + "loss": 0.4038, + "step": 144700 + }, + { + "epoch": 3.1855681443185566, + "grad_norm": 3.896204710006714, + "learning_rate": 3.630505117200396e-06, + "loss": 0.4176, + "step": 144800 + }, + { + "epoch": 3.1877681223187766, + "grad_norm": 3.9070045948028564, + "learning_rate": 3.6261032243864865e-06, + "loss": 0.394, + "step": 144900 + }, + { + "epoch": 3.189968100318997, + "grad_norm": 4.341803073883057, + "learning_rate": 3.6217013315725763e-06, + "loss": 0.4016, + "step": 145000 + }, + { + "epoch": 3.192168078319217, + "grad_norm": 3.0518646240234375, + "learning_rate": 3.617299438758666e-06, + "loss": 0.4021, + "step": 145100 + }, + { + "epoch": 3.194368056319437, + "grad_norm": 2.9907262325286865, + "learning_rate": 3.6128975459447565e-06, + "loss": 0.401, + "step": 145200 + }, + { + "epoch": 3.196568034319657, + "grad_norm": 3.994093894958496, + "learning_rate": 3.6084956531308467e-06, + "loss": 0.3869, + "step": 145300 + }, + { + "epoch": 3.198768012319877, + "grad_norm": 4.31938362121582, + "learning_rate": 3.6040937603169366e-06, + "loss": 0.3942, + "step": 145400 + }, + { + "epoch": 3.200967990320097, + "grad_norm": 3.5487558841705322, + "learning_rate": 3.599691867503027e-06, + "loss": 0.3958, + "step": 145500 + }, + { + "epoch": 3.203167968320317, + "grad_norm": 4.53445291519165, + "learning_rate": 3.5952899746891167e-06, + "loss": 0.3962, + "step": 145600 + }, + { + "epoch": 3.205367946320537, + "grad_norm": 3.816943645477295, + "learning_rate": 3.5908880818752066e-06, + "loss": 0.4112, + "step": 145700 + }, + { + "epoch": 3.207567924320757, + "grad_norm": 4.102901935577393, + "learning_rate": 3.586486189061297e-06, + "loss": 0.3914, + "step": 145800 + }, + { + "epoch": 3.209767902320977, + "grad_norm": 3.5486576557159424, + "learning_rate": 3.5820842962473867e-06, + "loss": 0.3993, + "step": 145900 + }, + { + "epoch": 3.211967880321197, + "grad_norm": 3.8645424842834473, + "learning_rate": 3.5776824034334766e-06, + "loss": 0.4078, + "step": 146000 + }, + { + "epoch": 3.214167858321417, + "grad_norm": 3.282376527786255, + "learning_rate": 3.573280510619567e-06, + "loss": 0.4007, + "step": 146100 + }, + { + "epoch": 3.216367836321637, + "grad_norm": 3.293292999267578, + "learning_rate": 3.5688786178056567e-06, + "loss": 0.3909, + "step": 146200 + }, + { + "epoch": 3.218567814321857, + "grad_norm": 3.7592716217041016, + "learning_rate": 3.5644767249917466e-06, + "loss": 0.3916, + "step": 146300 + }, + { + "epoch": 3.2207677923220768, + "grad_norm": 3.8671295642852783, + "learning_rate": 3.5600748321778365e-06, + "loss": 0.4028, + "step": 146400 + }, + { + "epoch": 3.2229677703222968, + "grad_norm": 3.9341673851013184, + "learning_rate": 3.5556729393639267e-06, + "loss": 0.3957, + "step": 146500 + }, + { + "epoch": 3.2251677483225167, + "grad_norm": 3.232847213745117, + "learning_rate": 3.5512710465500166e-06, + "loss": 0.39, + "step": 146600 + }, + { + "epoch": 3.2273677263227367, + "grad_norm": 3.2240495681762695, + "learning_rate": 3.5468691537361065e-06, + "loss": 0.3947, + "step": 146700 + }, + { + "epoch": 3.2295677043229567, + "grad_norm": 3.527489423751831, + "learning_rate": 3.5424672609221967e-06, + "loss": 0.3968, + "step": 146800 + }, + { + "epoch": 3.2317676823231767, + "grad_norm": 3.633652687072754, + "learning_rate": 3.5380653681082866e-06, + "loss": 0.3883, + "step": 146900 + }, + { + "epoch": 3.2339676603233967, + "grad_norm": 3.5555477142333984, + "learning_rate": 3.5336634752943773e-06, + "loss": 0.3965, + "step": 147000 + }, + { + "epoch": 3.2361676383236166, + "grad_norm": 3.5308194160461426, + "learning_rate": 3.529261582480467e-06, + "loss": 0.3881, + "step": 147100 + }, + { + "epoch": 3.2383676163238366, + "grad_norm": 4.819995880126953, + "learning_rate": 3.524859689666557e-06, + "loss": 0.4036, + "step": 147200 + }, + { + "epoch": 3.2405675943240566, + "grad_norm": 4.3929033279418945, + "learning_rate": 3.5204577968526473e-06, + "loss": 0.3931, + "step": 147300 + }, + { + "epoch": 3.2427675723242766, + "grad_norm": 2.9747936725616455, + "learning_rate": 3.516055904038737e-06, + "loss": 0.3986, + "step": 147400 + }, + { + "epoch": 3.2449675503244966, + "grad_norm": 3.78959321975708, + "learning_rate": 3.511654011224827e-06, + "loss": 0.4019, + "step": 147500 + }, + { + "epoch": 3.2471675283247166, + "grad_norm": 3.943894624710083, + "learning_rate": 3.507252118410917e-06, + "loss": 0.3924, + "step": 147600 + }, + { + "epoch": 3.249367506324937, + "grad_norm": 3.963569402694702, + "learning_rate": 3.502850225597007e-06, + "loss": 0.4118, + "step": 147700 + }, + { + "epoch": 3.2515674843251565, + "grad_norm": 3.0138792991638184, + "learning_rate": 3.498448332783097e-06, + "loss": 0.3914, + "step": 147800 + }, + { + "epoch": 3.253767462325377, + "grad_norm": 3.723484754562378, + "learning_rate": 3.494046439969187e-06, + "loss": 0.3847, + "step": 147900 + }, + { + "epoch": 3.255967440325597, + "grad_norm": 3.7332823276519775, + "learning_rate": 3.489644547155277e-06, + "loss": 0.3943, + "step": 148000 + }, + { + "epoch": 3.258167418325817, + "grad_norm": 4.216028690338135, + "learning_rate": 3.485242654341367e-06, + "loss": 0.3959, + "step": 148100 + }, + { + "epoch": 3.260367396326037, + "grad_norm": 2.8157236576080322, + "learning_rate": 3.480840761527457e-06, + "loss": 0.4163, + "step": 148200 + }, + { + "epoch": 3.262567374326257, + "grad_norm": 3.428497076034546, + "learning_rate": 3.476438868713547e-06, + "loss": 0.4064, + "step": 148300 + }, + { + "epoch": 3.264767352326477, + "grad_norm": 3.9073712825775146, + "learning_rate": 3.472036975899637e-06, + "loss": 0.4014, + "step": 148400 + }, + { + "epoch": 3.266967330326697, + "grad_norm": 4.03035306930542, + "learning_rate": 3.467635083085727e-06, + "loss": 0.3991, + "step": 148500 + }, + { + "epoch": 3.269167308326917, + "grad_norm": 2.9807870388031006, + "learning_rate": 3.4632331902718167e-06, + "loss": 0.4106, + "step": 148600 + }, + { + "epoch": 3.271367286327137, + "grad_norm": 2.9369282722473145, + "learning_rate": 3.458831297457907e-06, + "loss": 0.4048, + "step": 148700 + }, + { + "epoch": 3.273567264327357, + "grad_norm": 3.9154739379882812, + "learning_rate": 3.4544294046439973e-06, + "loss": 0.3984, + "step": 148800 + }, + { + "epoch": 3.2757672423275768, + "grad_norm": 4.0573601722717285, + "learning_rate": 3.4500275118300876e-06, + "loss": 0.3994, + "step": 148900 + }, + { + "epoch": 3.2779672203277967, + "grad_norm": 3.521632671356201, + "learning_rate": 3.4456256190161774e-06, + "loss": 0.4101, + "step": 149000 + }, + { + "epoch": 3.2801671983280167, + "grad_norm": 3.2614357471466064, + "learning_rate": 3.4412237262022673e-06, + "loss": 0.404, + "step": 149100 + }, + { + "epoch": 3.2823671763282367, + "grad_norm": 2.79972767829895, + "learning_rate": 3.4368218333883576e-06, + "loss": 0.3997, + "step": 149200 + }, + { + "epoch": 3.2845671543284567, + "grad_norm": 3.753329038619995, + "learning_rate": 3.4324199405744474e-06, + "loss": 0.3992, + "step": 149300 + }, + { + "epoch": 3.2867671323286767, + "grad_norm": 2.569004774093628, + "learning_rate": 3.4280180477605373e-06, + "loss": 0.4007, + "step": 149400 + }, + { + "epoch": 3.2889671103288967, + "grad_norm": 3.0934865474700928, + "learning_rate": 3.423616154946627e-06, + "loss": 0.3978, + "step": 149500 + }, + { + "epoch": 3.2911670883291166, + "grad_norm": 3.7940945625305176, + "learning_rate": 3.4192142621327174e-06, + "loss": 0.4009, + "step": 149600 + }, + { + "epoch": 3.2933670663293366, + "grad_norm": 3.0418498516082764, + "learning_rate": 3.4148123693188073e-06, + "loss": 0.3977, + "step": 149700 + }, + { + "epoch": 3.2955670443295566, + "grad_norm": 3.6287832260131836, + "learning_rate": 3.410410476504897e-06, + "loss": 0.399, + "step": 149800 + }, + { + "epoch": 3.2977670223297766, + "grad_norm": 3.9616570472717285, + "learning_rate": 3.4060085836909874e-06, + "loss": 0.392, + "step": 149900 + }, + { + "epoch": 3.2999670003299966, + "grad_norm": 3.6250250339508057, + "learning_rate": 3.4016066908770773e-06, + "loss": 0.3837, + "step": 150000 + }, + { + "epoch": 3.2999670003299966, + "eval_loss": 0.5354483723640442, + "eval_runtime": 386.9888, + "eval_samples_per_second": 155.043, + "eval_steps_per_second": 4.845, + "step": 150000 + } + ], + "logging_steps": 100, + "max_steps": 227275, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 50000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.913014241035995e+17, + "train_batch_size": 22, + "trial_name": null, + "trial_params": null +}