diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,63639 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 908, + "global_step": 9075, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00011019283746556474, + "grad_norm": 59.863380432128906, + "learning_rate": 1.098901098901099e-07, + "loss": 1.0565, + "step": 1 + }, + { + "epoch": 0.00022038567493112948, + "grad_norm": 172.912353515625, + "learning_rate": 2.197802197802198e-07, + "loss": 1.6306, + "step": 2 + }, + { + "epoch": 0.00033057851239669424, + "grad_norm": 193.34146118164062, + "learning_rate": 3.296703296703297e-07, + "loss": 1.5845, + "step": 3 + }, + { + "epoch": 0.00044077134986225897, + "grad_norm": 71.46421813964844, + "learning_rate": 4.395604395604396e-07, + "loss": 1.1894, + "step": 4 + }, + { + "epoch": 0.0005509641873278236, + "grad_norm": 214.38052368164062, + "learning_rate": 5.494505494505495e-07, + "loss": 1.5854, + "step": 5 + }, + { + "epoch": 0.0006611570247933885, + "grad_norm": 92.93484497070312, + "learning_rate": 6.593406593406594e-07, + "loss": 1.1533, + "step": 6 + }, + { + "epoch": 0.0007713498622589532, + "grad_norm": 88.05059814453125, + "learning_rate": 7.692307692307694e-07, + "loss": 1.2465, + "step": 7 + }, + { + "epoch": 0.0008815426997245179, + "grad_norm": 90.69667053222656, + "learning_rate": 8.791208791208792e-07, + "loss": 1.0339, + "step": 8 + }, + { + "epoch": 0.0009917355371900827, + "grad_norm": 174.13861083984375, + "learning_rate": 9.890109890109891e-07, + "loss": 1.653, + "step": 9 + }, + { + "epoch": 0.0011019283746556473, + "grad_norm": 62.3985710144043, + "learning_rate": 1.098901098901099e-06, + "loss": 0.986, + "step": 10 + }, + { + "epoch": 0.0012121212121212121, + "grad_norm": 39.197818756103516, + "learning_rate": 1.2087912087912089e-06, + "loss": 0.9859, + "step": 11 + }, + { + "epoch": 0.001322314049586777, + "grad_norm": 97.90790557861328, + "learning_rate": 1.3186813186813187e-06, + "loss": 1.057, + "step": 12 + }, + { + "epoch": 0.0014325068870523416, + "grad_norm": 34.72150421142578, + "learning_rate": 1.4285714285714286e-06, + "loss": 0.9492, + "step": 13 + }, + { + "epoch": 0.0015426997245179064, + "grad_norm": 48.048912048339844, + "learning_rate": 1.5384615384615387e-06, + "loss": 1.0598, + "step": 14 + }, + { + "epoch": 0.001652892561983471, + "grad_norm": 26.897621154785156, + "learning_rate": 1.6483516483516484e-06, + "loss": 0.9134, + "step": 15 + }, + { + "epoch": 0.0017630853994490359, + "grad_norm": 40.0106201171875, + "learning_rate": 1.7582417582417585e-06, + "loss": 0.7494, + "step": 16 + }, + { + "epoch": 0.0018732782369146005, + "grad_norm": 24.09613037109375, + "learning_rate": 1.8681318681318684e-06, + "loss": 0.8329, + "step": 17 + }, + { + "epoch": 0.0019834710743801653, + "grad_norm": 27.178569793701172, + "learning_rate": 1.9780219780219782e-06, + "loss": 0.8935, + "step": 18 + }, + { + "epoch": 0.00209366391184573, + "grad_norm": 21.927278518676758, + "learning_rate": 2.0879120879120883e-06, + "loss": 0.869, + "step": 19 + }, + { + "epoch": 0.0022038567493112946, + "grad_norm": 32.57326126098633, + "learning_rate": 2.197802197802198e-06, + "loss": 0.8756, + "step": 20 + }, + { + "epoch": 0.0023140495867768596, + "grad_norm": 26.621749877929688, + "learning_rate": 2.307692307692308e-06, + "loss": 0.8861, + "step": 21 + }, + { + "epoch": 
0.0024242424242424242, + "grad_norm": 21.098466873168945, + "learning_rate": 2.4175824175824177e-06, + "loss": 0.8368, + "step": 22 + }, + { + "epoch": 0.002534435261707989, + "grad_norm": 18.737295150756836, + "learning_rate": 2.5274725274725274e-06, + "loss": 0.7743, + "step": 23 + }, + { + "epoch": 0.002644628099173554, + "grad_norm": 19.97951316833496, + "learning_rate": 2.6373626373626375e-06, + "loss": 0.8272, + "step": 24 + }, + { + "epoch": 0.0027548209366391185, + "grad_norm": 26.72895050048828, + "learning_rate": 2.7472527472527476e-06, + "loss": 0.7512, + "step": 25 + }, + { + "epoch": 0.002865013774104683, + "grad_norm": 38.96323776245117, + "learning_rate": 2.8571428571428573e-06, + "loss": 0.6617, + "step": 26 + }, + { + "epoch": 0.0029752066115702478, + "grad_norm": 29.704008102416992, + "learning_rate": 2.9670329670329673e-06, + "loss": 0.7189, + "step": 27 + }, + { + "epoch": 0.003085399449035813, + "grad_norm": 31.563518524169922, + "learning_rate": 3.0769230769230774e-06, + "loss": 0.782, + "step": 28 + }, + { + "epoch": 0.0031955922865013774, + "grad_norm": 26.969806671142578, + "learning_rate": 3.1868131868131867e-06, + "loss": 0.6745, + "step": 29 + }, + { + "epoch": 0.003305785123966942, + "grad_norm": 26.056306838989258, + "learning_rate": 3.2967032967032968e-06, + "loss": 0.5712, + "step": 30 + }, + { + "epoch": 0.0034159779614325067, + "grad_norm": 23.492544174194336, + "learning_rate": 3.406593406593407e-06, + "loss": 0.6993, + "step": 31 + }, + { + "epoch": 0.0035261707988980717, + "grad_norm": 26.434640884399414, + "learning_rate": 3.516483516483517e-06, + "loss": 0.5461, + "step": 32 + }, + { + "epoch": 0.0036363636363636364, + "grad_norm": 21.654865264892578, + "learning_rate": 3.6263736263736266e-06, + "loss": 0.6576, + "step": 33 + }, + { + "epoch": 0.003746556473829201, + "grad_norm": 25.91067123413086, + "learning_rate": 3.7362637362637367e-06, + "loss": 0.6175, + "step": 34 + }, + { + "epoch": 0.003856749311294766, + "grad_norm": 18.07984733581543, + "learning_rate": 3.846153846153847e-06, + "loss": 0.5718, + "step": 35 + }, + { + "epoch": 0.003966942148760331, + "grad_norm": 18.15633201599121, + "learning_rate": 3.9560439560439565e-06, + "loss": 0.6699, + "step": 36 + }, + { + "epoch": 0.004077134986225896, + "grad_norm": 28.740493774414062, + "learning_rate": 4.065934065934066e-06, + "loss": 0.6541, + "step": 37 + }, + { + "epoch": 0.00418732782369146, + "grad_norm": 30.440296173095703, + "learning_rate": 4.175824175824177e-06, + "loss": 0.678, + "step": 38 + }, + { + "epoch": 0.004297520661157025, + "grad_norm": 16.116561889648438, + "learning_rate": 4.2857142857142855e-06, + "loss": 0.5679, + "step": 39 + }, + { + "epoch": 0.004407713498622589, + "grad_norm": 25.8539981842041, + "learning_rate": 4.395604395604396e-06, + "loss": 0.7524, + "step": 40 + }, + { + "epoch": 0.004517906336088154, + "grad_norm": 16.08936882019043, + "learning_rate": 4.505494505494506e-06, + "loss": 0.5679, + "step": 41 + }, + { + "epoch": 0.004628099173553719, + "grad_norm": 11.73170280456543, + "learning_rate": 4.615384615384616e-06, + "loss": 0.5828, + "step": 42 + }, + { + "epoch": 0.004738292011019283, + "grad_norm": 22.269550323486328, + "learning_rate": 4.725274725274726e-06, + "loss": 0.6695, + "step": 43 + }, + { + "epoch": 0.0048484848484848485, + "grad_norm": 22.19915008544922, + "learning_rate": 4.8351648351648355e-06, + "loss": 0.6622, + "step": 44 + }, + { + "epoch": 0.0049586776859504135, + "grad_norm": 13.642354011535645, + "learning_rate": 
4.945054945054946e-06, + "loss": 0.5272, + "step": 45 + }, + { + "epoch": 0.005068870523415978, + "grad_norm": 17.455495834350586, + "learning_rate": 5.054945054945055e-06, + "loss": 0.604, + "step": 46 + }, + { + "epoch": 0.005179063360881543, + "grad_norm": 19.724523544311523, + "learning_rate": 5.164835164835166e-06, + "loss": 0.5195, + "step": 47 + }, + { + "epoch": 0.005289256198347108, + "grad_norm": 35.24604034423828, + "learning_rate": 5.274725274725275e-06, + "loss": 0.5939, + "step": 48 + }, + { + "epoch": 0.005399449035812672, + "grad_norm": 23.06145668029785, + "learning_rate": 5.384615384615385e-06, + "loss": 0.6269, + "step": 49 + }, + { + "epoch": 0.005509641873278237, + "grad_norm": 19.79766273498535, + "learning_rate": 5.494505494505495e-06, + "loss": 0.5724, + "step": 50 + }, + { + "epoch": 0.005619834710743801, + "grad_norm": 11.354966163635254, + "learning_rate": 5.604395604395605e-06, + "loss": 0.5952, + "step": 51 + }, + { + "epoch": 0.005730027548209366, + "grad_norm": 11.693944931030273, + "learning_rate": 5.7142857142857145e-06, + "loss": 0.6348, + "step": 52 + }, + { + "epoch": 0.005840220385674931, + "grad_norm": 43.29163360595703, + "learning_rate": 5.824175824175825e-06, + "loss": 0.6324, + "step": 53 + }, + { + "epoch": 0.0059504132231404955, + "grad_norm": 27.70946502685547, + "learning_rate": 5.934065934065935e-06, + "loss": 0.7219, + "step": 54 + }, + { + "epoch": 0.006060606060606061, + "grad_norm": 21.75235939025879, + "learning_rate": 6.043956043956044e-06, + "loss": 0.6411, + "step": 55 + }, + { + "epoch": 0.006170798898071626, + "grad_norm": 17.24085235595703, + "learning_rate": 6.153846153846155e-06, + "loss": 0.582, + "step": 56 + }, + { + "epoch": 0.00628099173553719, + "grad_norm": 14.344873428344727, + "learning_rate": 6.2637362637362645e-06, + "loss": 0.5875, + "step": 57 + }, + { + "epoch": 0.006391184573002755, + "grad_norm": 9.679116249084473, + "learning_rate": 6.373626373626373e-06, + "loss": 0.6291, + "step": 58 + }, + { + "epoch": 0.00650137741046832, + "grad_norm": 27.68067169189453, + "learning_rate": 6.483516483516485e-06, + "loss": 0.6413, + "step": 59 + }, + { + "epoch": 0.006611570247933884, + "grad_norm": 27.99264144897461, + "learning_rate": 6.5934065934065935e-06, + "loss": 0.678, + "step": 60 + }, + { + "epoch": 0.006721763085399449, + "grad_norm": 11.32085132598877, + "learning_rate": 6.703296703296703e-06, + "loss": 0.5968, + "step": 61 + }, + { + "epoch": 0.006831955922865013, + "grad_norm": 15.186779975891113, + "learning_rate": 6.813186813186814e-06, + "loss": 0.4404, + "step": 62 + }, + { + "epoch": 0.006942148760330578, + "grad_norm": 9.521608352661133, + "learning_rate": 6.923076923076923e-06, + "loss": 0.5429, + "step": 63 + }, + { + "epoch": 0.0070523415977961435, + "grad_norm": 12.57529067993164, + "learning_rate": 7.032967032967034e-06, + "loss": 0.5389, + "step": 64 + }, + { + "epoch": 0.007162534435261708, + "grad_norm": 22.200571060180664, + "learning_rate": 7.1428571428571436e-06, + "loss": 0.6364, + "step": 65 + }, + { + "epoch": 0.007272727272727273, + "grad_norm": 15.877410888671875, + "learning_rate": 7.252747252747253e-06, + "loss": 0.5471, + "step": 66 + }, + { + "epoch": 0.007382920110192838, + "grad_norm": 24.132150650024414, + "learning_rate": 7.362637362637364e-06, + "loss": 0.5143, + "step": 67 + }, + { + "epoch": 0.007493112947658402, + "grad_norm": 21.11246109008789, + "learning_rate": 7.472527472527473e-06, + "loss": 0.562, + "step": 68 + }, + { + "epoch": 0.007603305785123967, + "grad_norm": 
22.65631675720215, + "learning_rate": 7.582417582417583e-06, + "loss": 0.5674, + "step": 69 + }, + { + "epoch": 0.007713498622589532, + "grad_norm": 22.620630264282227, + "learning_rate": 7.692307692307694e-06, + "loss": 0.6273, + "step": 70 + }, + { + "epoch": 0.007823691460055097, + "grad_norm": 19.626869201660156, + "learning_rate": 7.802197802197802e-06, + "loss": 0.5974, + "step": 71 + }, + { + "epoch": 0.007933884297520661, + "grad_norm": 12.321710586547852, + "learning_rate": 7.912087912087913e-06, + "loss": 0.6292, + "step": 72 + }, + { + "epoch": 0.008044077134986225, + "grad_norm": 18.073720932006836, + "learning_rate": 8.021978021978023e-06, + "loss": 0.5283, + "step": 73 + }, + { + "epoch": 0.008154269972451791, + "grad_norm": 19.79140853881836, + "learning_rate": 8.131868131868132e-06, + "loss": 0.5381, + "step": 74 + }, + { + "epoch": 0.008264462809917356, + "grad_norm": 26.322410583496094, + "learning_rate": 8.241758241758243e-06, + "loss": 0.4972, + "step": 75 + }, + { + "epoch": 0.00837465564738292, + "grad_norm": 13.333000183105469, + "learning_rate": 8.351648351648353e-06, + "loss": 0.5632, + "step": 76 + }, + { + "epoch": 0.008484848484848486, + "grad_norm": 13.923554420471191, + "learning_rate": 8.461538461538462e-06, + "loss": 0.52, + "step": 77 + }, + { + "epoch": 0.00859504132231405, + "grad_norm": 12.857089042663574, + "learning_rate": 8.571428571428571e-06, + "loss": 0.5205, + "step": 78 + }, + { + "epoch": 0.008705234159779614, + "grad_norm": 16.085172653198242, + "learning_rate": 8.681318681318681e-06, + "loss": 0.5551, + "step": 79 + }, + { + "epoch": 0.008815426997245178, + "grad_norm": 15.384976387023926, + "learning_rate": 8.791208791208792e-06, + "loss": 0.5361, + "step": 80 + }, + { + "epoch": 0.008925619834710744, + "grad_norm": 12.480789184570312, + "learning_rate": 8.9010989010989e-06, + "loss": 0.4877, + "step": 81 + }, + { + "epoch": 0.009035812672176308, + "grad_norm": 28.784650802612305, + "learning_rate": 9.010989010989011e-06, + "loss": 0.6361, + "step": 82 + }, + { + "epoch": 0.009146005509641873, + "grad_norm": 20.274999618530273, + "learning_rate": 9.120879120879122e-06, + "loss": 0.5687, + "step": 83 + }, + { + "epoch": 0.009256198347107438, + "grad_norm": 9.781147956848145, + "learning_rate": 9.230769230769232e-06, + "loss": 0.5868, + "step": 84 + }, + { + "epoch": 0.009366391184573003, + "grad_norm": 34.077537536621094, + "learning_rate": 9.340659340659341e-06, + "loss": 0.4965, + "step": 85 + }, + { + "epoch": 0.009476584022038567, + "grad_norm": 16.664121627807617, + "learning_rate": 9.450549450549452e-06, + "loss": 0.5218, + "step": 86 + }, + { + "epoch": 0.009586776859504133, + "grad_norm": 17.106098175048828, + "learning_rate": 9.560439560439562e-06, + "loss": 0.5994, + "step": 87 + }, + { + "epoch": 0.009696969696969697, + "grad_norm": 11.295900344848633, + "learning_rate": 9.670329670329671e-06, + "loss": 0.5267, + "step": 88 + }, + { + "epoch": 0.009807162534435261, + "grad_norm": 13.43213176727295, + "learning_rate": 9.780219780219781e-06, + "loss": 0.6087, + "step": 89 + }, + { + "epoch": 0.009917355371900827, + "grad_norm": 20.79828643798828, + "learning_rate": 9.890109890109892e-06, + "loss": 0.5151, + "step": 90 + }, + { + "epoch": 0.010027548209366391, + "grad_norm": 8.796585083007812, + "learning_rate": 1e-05, + "loss": 0.4638, + "step": 91 + }, + { + "epoch": 0.010137741046831955, + "grad_norm": 14.324283599853516, + "learning_rate": 9.999999694296605e-06, + "loss": 0.642, + "step": 92 + }, + { + "epoch": 
0.010247933884297521, + "grad_norm": 24.24766731262207, + "learning_rate": 9.999998777186455e-06, + "loss": 0.6757, + "step": 93 + }, + { + "epoch": 0.010358126721763086, + "grad_norm": 5.7737932205200195, + "learning_rate": 9.999997248669662e-06, + "loss": 0.593, + "step": 94 + }, + { + "epoch": 0.01046831955922865, + "grad_norm": 9.869256019592285, + "learning_rate": 9.999995108746413e-06, + "loss": 0.5359, + "step": 95 + }, + { + "epoch": 0.010578512396694216, + "grad_norm": 15.801307678222656, + "learning_rate": 9.999992357416972e-06, + "loss": 0.5176, + "step": 96 + }, + { + "epoch": 0.01068870523415978, + "grad_norm": 19.859272003173828, + "learning_rate": 9.999988994681672e-06, + "loss": 0.5891, + "step": 97 + }, + { + "epoch": 0.010798898071625344, + "grad_norm": 11.07324504852295, + "learning_rate": 9.999985020540928e-06, + "loss": 0.6453, + "step": 98 + }, + { + "epoch": 0.01090909090909091, + "grad_norm": 9.245484352111816, + "learning_rate": 9.999980434995223e-06, + "loss": 0.5534, + "step": 99 + }, + { + "epoch": 0.011019283746556474, + "grad_norm": 10.798912048339844, + "learning_rate": 9.999975238045117e-06, + "loss": 0.5368, + "step": 100 + }, + { + "epoch": 0.011129476584022038, + "grad_norm": 17.250713348388672, + "learning_rate": 9.99996942969125e-06, + "loss": 0.609, + "step": 101 + }, + { + "epoch": 0.011239669421487603, + "grad_norm": 14.687119483947754, + "learning_rate": 9.999963009934327e-06, + "loss": 0.5903, + "step": 102 + }, + { + "epoch": 0.011349862258953168, + "grad_norm": 13.010708808898926, + "learning_rate": 9.999955978775135e-06, + "loss": 0.4922, + "step": 103 + }, + { + "epoch": 0.011460055096418733, + "grad_norm": 12.1796875, + "learning_rate": 9.999948336214536e-06, + "loss": 0.5215, + "step": 104 + }, + { + "epoch": 0.011570247933884297, + "grad_norm": 22.230316162109375, + "learning_rate": 9.999940082253462e-06, + "loss": 0.4579, + "step": 105 + }, + { + "epoch": 0.011680440771349863, + "grad_norm": 11.79276180267334, + "learning_rate": 9.999931216892924e-06, + "loss": 0.5166, + "step": 106 + }, + { + "epoch": 0.011790633608815427, + "grad_norm": 13.385542869567871, + "learning_rate": 9.999921740134003e-06, + "loss": 0.6159, + "step": 107 + }, + { + "epoch": 0.011900826446280991, + "grad_norm": 15.622406005859375, + "learning_rate": 9.99991165197786e-06, + "loss": 0.5958, + "step": 108 + }, + { + "epoch": 0.012011019283746557, + "grad_norm": 11.1148099899292, + "learning_rate": 9.999900952425729e-06, + "loss": 0.5128, + "step": 109 + }, + { + "epoch": 0.012121212121212121, + "grad_norm": 11.416248321533203, + "learning_rate": 9.999889641478919e-06, + "loss": 0.5847, + "step": 110 + }, + { + "epoch": 0.012231404958677685, + "grad_norm": 14.961265563964844, + "learning_rate": 9.999877719138812e-06, + "loss": 0.53, + "step": 111 + }, + { + "epoch": 0.012341597796143251, + "grad_norm": 11.567359924316406, + "learning_rate": 9.999865185406865e-06, + "loss": 0.4921, + "step": 112 + }, + { + "epoch": 0.012451790633608815, + "grad_norm": 13.228514671325684, + "learning_rate": 9.999852040284612e-06, + "loss": 0.5288, + "step": 113 + }, + { + "epoch": 0.01256198347107438, + "grad_norm": 19.065542221069336, + "learning_rate": 9.999838283773658e-06, + "loss": 0.49, + "step": 114 + }, + { + "epoch": 0.012672176308539946, + "grad_norm": 13.72010612487793, + "learning_rate": 9.999823915875689e-06, + "loss": 0.4847, + "step": 115 + }, + { + "epoch": 0.01278236914600551, + "grad_norm": 9.950152397155762, + "learning_rate": 9.999808936592459e-06, + "loss": 0.6273, + 
"step": 116 + }, + { + "epoch": 0.012892561983471074, + "grad_norm": 10.342408180236816, + "learning_rate": 9.9997933459258e-06, + "loss": 0.5014, + "step": 117 + }, + { + "epoch": 0.01300275482093664, + "grad_norm": 14.401764869689941, + "learning_rate": 9.999777143877622e-06, + "loss": 0.5781, + "step": 118 + }, + { + "epoch": 0.013112947658402204, + "grad_norm": 15.089703559875488, + "learning_rate": 9.999760330449902e-06, + "loss": 0.611, + "step": 119 + }, + { + "epoch": 0.013223140495867768, + "grad_norm": 16.48484992980957, + "learning_rate": 9.999742905644697e-06, + "loss": 0.6119, + "step": 120 + }, + { + "epoch": 0.013333333333333334, + "grad_norm": 14.737231254577637, + "learning_rate": 9.999724869464136e-06, + "loss": 0.6894, + "step": 121 + }, + { + "epoch": 0.013443526170798898, + "grad_norm": 12.05566120147705, + "learning_rate": 9.999706221910428e-06, + "loss": 0.5635, + "step": 122 + }, + { + "epoch": 0.013553719008264463, + "grad_norm": 10.84809398651123, + "learning_rate": 9.999686962985852e-06, + "loss": 0.5556, + "step": 123 + }, + { + "epoch": 0.013663911845730027, + "grad_norm": 9.9436616897583, + "learning_rate": 9.999667092692763e-06, + "loss": 0.5408, + "step": 124 + }, + { + "epoch": 0.013774104683195593, + "grad_norm": 18.544424057006836, + "learning_rate": 9.99964661103359e-06, + "loss": 0.5044, + "step": 125 + }, + { + "epoch": 0.013884297520661157, + "grad_norm": 22.588855743408203, + "learning_rate": 9.999625518010837e-06, + "loss": 0.6151, + "step": 126 + }, + { + "epoch": 0.013994490358126721, + "grad_norm": 18.303138732910156, + "learning_rate": 9.999603813627087e-06, + "loss": 0.5957, + "step": 127 + }, + { + "epoch": 0.014104683195592287, + "grad_norm": 9.987324714660645, + "learning_rate": 9.999581497884992e-06, + "loss": 0.5337, + "step": 128 + }, + { + "epoch": 0.014214876033057851, + "grad_norm": 8.876851081848145, + "learning_rate": 9.999558570787277e-06, + "loss": 0.5816, + "step": 129 + }, + { + "epoch": 0.014325068870523415, + "grad_norm": 16.323097229003906, + "learning_rate": 9.999535032336749e-06, + "loss": 0.4789, + "step": 130 + }, + { + "epoch": 0.014435261707988981, + "grad_norm": 12.875208854675293, + "learning_rate": 9.999510882536288e-06, + "loss": 0.5265, + "step": 131 + }, + { + "epoch": 0.014545454545454545, + "grad_norm": 14.685235977172852, + "learning_rate": 9.999486121388844e-06, + "loss": 0.4931, + "step": 132 + }, + { + "epoch": 0.01465564738292011, + "grad_norm": 9.357027053833008, + "learning_rate": 9.999460748897447e-06, + "loss": 0.5076, + "step": 133 + }, + { + "epoch": 0.014765840220385676, + "grad_norm": 21.911592483520508, + "learning_rate": 9.999434765065197e-06, + "loss": 0.5248, + "step": 134 + }, + { + "epoch": 0.01487603305785124, + "grad_norm": 11.090331077575684, + "learning_rate": 9.999408169895273e-06, + "loss": 0.5696, + "step": 135 + }, + { + "epoch": 0.014986225895316804, + "grad_norm": 13.339033126831055, + "learning_rate": 9.999380963390929e-06, + "loss": 0.5207, + "step": 136 + }, + { + "epoch": 0.01509641873278237, + "grad_norm": 24.932645797729492, + "learning_rate": 9.999353145555486e-06, + "loss": 0.6721, + "step": 137 + }, + { + "epoch": 0.015206611570247934, + "grad_norm": 11.57662296295166, + "learning_rate": 9.999324716392352e-06, + "loss": 0.4752, + "step": 138 + }, + { + "epoch": 0.015316804407713498, + "grad_norm": 16.003211975097656, + "learning_rate": 9.999295675905001e-06, + "loss": 0.518, + "step": 139 + }, + { + "epoch": 0.015426997245179064, + "grad_norm": 13.036341667175293, + 
"learning_rate": 9.999266024096982e-06, + "loss": 0.5324, + "step": 140 + }, + { + "epoch": 0.015537190082644628, + "grad_norm": 11.445280075073242, + "learning_rate": 9.999235760971925e-06, + "loss": 0.5302, + "step": 141 + }, + { + "epoch": 0.015647382920110194, + "grad_norm": 12.373127937316895, + "learning_rate": 9.999204886533527e-06, + "loss": 0.4233, + "step": 142 + }, + { + "epoch": 0.01575757575757576, + "grad_norm": 13.878166198730469, + "learning_rate": 9.999173400785564e-06, + "loss": 0.5741, + "step": 143 + }, + { + "epoch": 0.015867768595041323, + "grad_norm": 44.70547866821289, + "learning_rate": 9.999141303731889e-06, + "loss": 0.5027, + "step": 144 + }, + { + "epoch": 0.015977961432506887, + "grad_norm": 19.48044776916504, + "learning_rate": 9.999108595376424e-06, + "loss": 0.5468, + "step": 145 + }, + { + "epoch": 0.01608815426997245, + "grad_norm": 21.51380157470703, + "learning_rate": 9.999075275723169e-06, + "loss": 0.6482, + "step": 146 + }, + { + "epoch": 0.016198347107438015, + "grad_norm": 13.136129379272461, + "learning_rate": 9.999041344776198e-06, + "loss": 0.6616, + "step": 147 + }, + { + "epoch": 0.016308539944903583, + "grad_norm": 8.277416229248047, + "learning_rate": 9.999006802539662e-06, + "loss": 0.6248, + "step": 148 + }, + { + "epoch": 0.016418732782369147, + "grad_norm": 12.341249465942383, + "learning_rate": 9.998971649017784e-06, + "loss": 0.5269, + "step": 149 + }, + { + "epoch": 0.01652892561983471, + "grad_norm": 12.141154289245605, + "learning_rate": 9.99893588421486e-06, + "loss": 0.5146, + "step": 150 + }, + { + "epoch": 0.016639118457300275, + "grad_norm": 14.272335052490234, + "learning_rate": 9.998899508135267e-06, + "loss": 0.5359, + "step": 151 + }, + { + "epoch": 0.01674931129476584, + "grad_norm": 11.248027801513672, + "learning_rate": 9.998862520783452e-06, + "loss": 0.5398, + "step": 152 + }, + { + "epoch": 0.016859504132231404, + "grad_norm": 7.830145835876465, + "learning_rate": 9.998824922163938e-06, + "loss": 0.4999, + "step": 153 + }, + { + "epoch": 0.01696969696969697, + "grad_norm": 14.68532943725586, + "learning_rate": 9.998786712281322e-06, + "loss": 0.497, + "step": 154 + }, + { + "epoch": 0.017079889807162536, + "grad_norm": 23.5339298248291, + "learning_rate": 9.998747891140277e-06, + "loss": 0.6275, + "step": 155 + }, + { + "epoch": 0.0171900826446281, + "grad_norm": 22.34769058227539, + "learning_rate": 9.99870845874555e-06, + "loss": 0.6367, + "step": 156 + }, + { + "epoch": 0.017300275482093664, + "grad_norm": 20.661041259765625, + "learning_rate": 9.99866841510196e-06, + "loss": 0.5138, + "step": 157 + }, + { + "epoch": 0.017410468319559228, + "grad_norm": 11.57186508178711, + "learning_rate": 9.99862776021441e-06, + "loss": 0.5041, + "step": 158 + }, + { + "epoch": 0.017520661157024792, + "grad_norm": 11.123800277709961, + "learning_rate": 9.998586494087865e-06, + "loss": 0.4916, + "step": 159 + }, + { + "epoch": 0.017630853994490357, + "grad_norm": 18.798917770385742, + "learning_rate": 9.998544616727374e-06, + "loss": 0.4933, + "step": 160 + }, + { + "epoch": 0.017741046831955924, + "grad_norm": 18.14601707458496, + "learning_rate": 9.998502128138056e-06, + "loss": 0.4565, + "step": 161 + }, + { + "epoch": 0.01785123966942149, + "grad_norm": 13.255478858947754, + "learning_rate": 9.99845902832511e-06, + "loss": 0.516, + "step": 162 + }, + { + "epoch": 0.017961432506887053, + "grad_norm": 15.851000785827637, + "learning_rate": 9.998415317293805e-06, + "loss": 0.5405, + "step": 163 + }, + { + "epoch": 
0.018071625344352617, + "grad_norm": 10.505412101745605, + "learning_rate": 9.998370995049485e-06, + "loss": 0.5449, + "step": 164 + }, + { + "epoch": 0.01818181818181818, + "grad_norm": 10.862316131591797, + "learning_rate": 9.998326061597567e-06, + "loss": 0.4108, + "step": 165 + }, + { + "epoch": 0.018292011019283745, + "grad_norm": 10.101140022277832, + "learning_rate": 9.998280516943553e-06, + "loss": 0.4293, + "step": 166 + }, + { + "epoch": 0.018402203856749313, + "grad_norm": 14.068907737731934, + "learning_rate": 9.998234361093005e-06, + "loss": 0.4723, + "step": 167 + }, + { + "epoch": 0.018512396694214877, + "grad_norm": 15.875856399536133, + "learning_rate": 9.99818759405157e-06, + "loss": 0.5235, + "step": 168 + }, + { + "epoch": 0.01862258953168044, + "grad_norm": 18.25687026977539, + "learning_rate": 9.998140215824967e-06, + "loss": 0.3877, + "step": 169 + }, + { + "epoch": 0.018732782369146005, + "grad_norm": 21.297866821289062, + "learning_rate": 9.99809222641899e-06, + "loss": 0.5717, + "step": 170 + }, + { + "epoch": 0.01884297520661157, + "grad_norm": 11.994351387023926, + "learning_rate": 9.998043625839506e-06, + "loss": 0.51, + "step": 171 + }, + { + "epoch": 0.018953168044077134, + "grad_norm": 7.859072685241699, + "learning_rate": 9.997994414092458e-06, + "loss": 0.4668, + "step": 172 + }, + { + "epoch": 0.0190633608815427, + "grad_norm": 18.620986938476562, + "learning_rate": 9.997944591183864e-06, + "loss": 0.5655, + "step": 173 + }, + { + "epoch": 0.019173553719008266, + "grad_norm": 8.813560485839844, + "learning_rate": 9.997894157119816e-06, + "loss": 0.6078, + "step": 174 + }, + { + "epoch": 0.01928374655647383, + "grad_norm": 7.580974578857422, + "learning_rate": 9.997843111906482e-06, + "loss": 0.3836, + "step": 175 + }, + { + "epoch": 0.019393939393939394, + "grad_norm": 14.975537300109863, + "learning_rate": 9.997791455550102e-06, + "loss": 0.615, + "step": 176 + }, + { + "epoch": 0.019504132231404958, + "grad_norm": 18.99844741821289, + "learning_rate": 9.997739188056995e-06, + "loss": 0.5388, + "step": 177 + }, + { + "epoch": 0.019614325068870522, + "grad_norm": 11.795282363891602, + "learning_rate": 9.997686309433552e-06, + "loss": 0.454, + "step": 178 + }, + { + "epoch": 0.019724517906336086, + "grad_norm": 10.889883041381836, + "learning_rate": 9.997632819686238e-06, + "loss": 0.5309, + "step": 179 + }, + { + "epoch": 0.019834710743801654, + "grad_norm": 10.928999900817871, + "learning_rate": 9.997578718821594e-06, + "loss": 0.5196, + "step": 180 + }, + { + "epoch": 0.01994490358126722, + "grad_norm": 13.715165138244629, + "learning_rate": 9.997524006846235e-06, + "loss": 0.5389, + "step": 181 + }, + { + "epoch": 0.020055096418732783, + "grad_norm": 13.190074920654297, + "learning_rate": 9.997468683766853e-06, + "loss": 0.5283, + "step": 182 + }, + { + "epoch": 0.020165289256198347, + "grad_norm": 10.726973533630371, + "learning_rate": 9.997412749590212e-06, + "loss": 0.593, + "step": 183 + }, + { + "epoch": 0.02027548209366391, + "grad_norm": 10.734817504882812, + "learning_rate": 9.997356204323153e-06, + "loss": 0.5979, + "step": 184 + }, + { + "epoch": 0.020385674931129475, + "grad_norm": 9.629424095153809, + "learning_rate": 9.997299047972586e-06, + "loss": 0.5061, + "step": 185 + }, + { + "epoch": 0.020495867768595043, + "grad_norm": 19.34947395324707, + "learning_rate": 9.997241280545505e-06, + "loss": 0.5885, + "step": 186 + }, + { + "epoch": 0.020606060606060607, + "grad_norm": 15.34253215789795, + "learning_rate": 9.997182902048973e-06, + 
"loss": 0.5997, + "step": 187 + }, + { + "epoch": 0.02071625344352617, + "grad_norm": 11.760794639587402, + "learning_rate": 9.997123912490126e-06, + "loss": 0.4872, + "step": 188 + }, + { + "epoch": 0.020826446280991735, + "grad_norm": 8.705877304077148, + "learning_rate": 9.997064311876179e-06, + "loss": 0.5161, + "step": 189 + }, + { + "epoch": 0.0209366391184573, + "grad_norm": 14.717255592346191, + "learning_rate": 9.99700410021442e-06, + "loss": 0.6922, + "step": 190 + }, + { + "epoch": 0.021046831955922864, + "grad_norm": 11.279083251953125, + "learning_rate": 9.996943277512214e-06, + "loss": 0.5453, + "step": 191 + }, + { + "epoch": 0.02115702479338843, + "grad_norm": 15.692731857299805, + "learning_rate": 9.996881843776994e-06, + "loss": 0.5641, + "step": 192 + }, + { + "epoch": 0.021267217630853995, + "grad_norm": 13.773262023925781, + "learning_rate": 9.996819799016275e-06, + "loss": 0.5335, + "step": 193 + }, + { + "epoch": 0.02137741046831956, + "grad_norm": 11.219247817993164, + "learning_rate": 9.996757143237645e-06, + "loss": 0.5444, + "step": 194 + }, + { + "epoch": 0.021487603305785124, + "grad_norm": 15.12265396118164, + "learning_rate": 9.996693876448761e-06, + "loss": 0.5041, + "step": 195 + }, + { + "epoch": 0.021597796143250688, + "grad_norm": 14.708301544189453, + "learning_rate": 9.996629998657365e-06, + "loss": 0.5643, + "step": 196 + }, + { + "epoch": 0.021707988980716252, + "grad_norm": 11.828309059143066, + "learning_rate": 9.996565509871265e-06, + "loss": 0.6107, + "step": 197 + }, + { + "epoch": 0.02181818181818182, + "grad_norm": 22.66375732421875, + "learning_rate": 9.996500410098347e-06, + "loss": 0.6399, + "step": 198 + }, + { + "epoch": 0.021928374655647384, + "grad_norm": 13.554588317871094, + "learning_rate": 9.996434699346574e-06, + "loss": 0.5173, + "step": 199 + }, + { + "epoch": 0.02203856749311295, + "grad_norm": 12.405920028686523, + "learning_rate": 9.996368377623975e-06, + "loss": 0.5303, + "step": 200 + }, + { + "epoch": 0.022148760330578512, + "grad_norm": 9.011127471923828, + "learning_rate": 9.996301444938668e-06, + "loss": 0.4698, + "step": 201 + }, + { + "epoch": 0.022258953168044077, + "grad_norm": 11.63690185546875, + "learning_rate": 9.99623390129883e-06, + "loss": 0.5527, + "step": 202 + }, + { + "epoch": 0.02236914600550964, + "grad_norm": 8.766032218933105, + "learning_rate": 9.996165746712725e-06, + "loss": 0.5798, + "step": 203 + }, + { + "epoch": 0.022479338842975205, + "grad_norm": 8.39252758026123, + "learning_rate": 9.996096981188687e-06, + "loss": 0.5477, + "step": 204 + }, + { + "epoch": 0.022589531680440773, + "grad_norm": 6.701710224151611, + "learning_rate": 9.996027604735122e-06, + "loss": 0.4578, + "step": 205 + }, + { + "epoch": 0.022699724517906337, + "grad_norm": 8.533823013305664, + "learning_rate": 9.995957617360515e-06, + "loss": 0.5107, + "step": 206 + }, + { + "epoch": 0.0228099173553719, + "grad_norm": 11.137310028076172, + "learning_rate": 9.995887019073427e-06, + "loss": 0.5688, + "step": 207 + }, + { + "epoch": 0.022920110192837465, + "grad_norm": 8.286704063415527, + "learning_rate": 9.995815809882485e-06, + "loss": 0.5068, + "step": 208 + }, + { + "epoch": 0.02303030303030303, + "grad_norm": 11.508268356323242, + "learning_rate": 9.9957439897964e-06, + "loss": 0.5123, + "step": 209 + }, + { + "epoch": 0.023140495867768594, + "grad_norm": 9.160122871398926, + "learning_rate": 9.995671558823955e-06, + "loss": 0.445, + "step": 210 + }, + { + "epoch": 0.02325068870523416, + "grad_norm": 17.174598693847656, + 
"learning_rate": 9.995598516974005e-06, + "loss": 0.5242, + "step": 211 + }, + { + "epoch": 0.023360881542699725, + "grad_norm": 11.791619300842285, + "learning_rate": 9.995524864255484e-06, + "loss": 0.4747, + "step": 212 + }, + { + "epoch": 0.02347107438016529, + "grad_norm": 12.3840970993042, + "learning_rate": 9.995450600677395e-06, + "loss": 0.5269, + "step": 213 + }, + { + "epoch": 0.023581267217630854, + "grad_norm": 9.73430061340332, + "learning_rate": 9.995375726248821e-06, + "loss": 0.5148, + "step": 214 + }, + { + "epoch": 0.023691460055096418, + "grad_norm": 8.56908130645752, + "learning_rate": 9.995300240978918e-06, + "loss": 0.4607, + "step": 215 + }, + { + "epoch": 0.023801652892561982, + "grad_norm": 13.244146347045898, + "learning_rate": 9.995224144876916e-06, + "loss": 0.5644, + "step": 216 + }, + { + "epoch": 0.02391184573002755, + "grad_norm": 8.972795486450195, + "learning_rate": 9.995147437952121e-06, + "loss": 0.5192, + "step": 217 + }, + { + "epoch": 0.024022038567493114, + "grad_norm": 8.984622955322266, + "learning_rate": 9.995070120213913e-06, + "loss": 0.5259, + "step": 218 + }, + { + "epoch": 0.024132231404958678, + "grad_norm": 12.068046569824219, + "learning_rate": 9.994992191671743e-06, + "loss": 0.5816, + "step": 219 + }, + { + "epoch": 0.024242424242424242, + "grad_norm": 11.006875991821289, + "learning_rate": 9.994913652335144e-06, + "loss": 0.4294, + "step": 220 + }, + { + "epoch": 0.024352617079889807, + "grad_norm": 13.247452735900879, + "learning_rate": 9.994834502213718e-06, + "loss": 0.5899, + "step": 221 + }, + { + "epoch": 0.02446280991735537, + "grad_norm": 17.99578094482422, + "learning_rate": 9.994754741317146e-06, + "loss": 0.6063, + "step": 222 + }, + { + "epoch": 0.024573002754820935, + "grad_norm": 15.888850212097168, + "learning_rate": 9.994674369655178e-06, + "loss": 0.5367, + "step": 223 + }, + { + "epoch": 0.024683195592286503, + "grad_norm": 10.986605644226074, + "learning_rate": 9.994593387237643e-06, + "loss": 0.4896, + "step": 224 + }, + { + "epoch": 0.024793388429752067, + "grad_norm": 9.367854118347168, + "learning_rate": 9.994511794074446e-06, + "loss": 0.4505, + "step": 225 + }, + { + "epoch": 0.02490358126721763, + "grad_norm": 15.278581619262695, + "learning_rate": 9.99442959017556e-06, + "loss": 0.5066, + "step": 226 + }, + { + "epoch": 0.025013774104683195, + "grad_norm": 9.596366882324219, + "learning_rate": 9.99434677555104e-06, + "loss": 0.5934, + "step": 227 + }, + { + "epoch": 0.02512396694214876, + "grad_norm": 7.759309768676758, + "learning_rate": 9.994263350211014e-06, + "loss": 0.5601, + "step": 228 + }, + { + "epoch": 0.025234159779614324, + "grad_norm": 11.995270729064941, + "learning_rate": 9.994179314165681e-06, + "loss": 0.5547, + "step": 229 + }, + { + "epoch": 0.02534435261707989, + "grad_norm": 14.3855619430542, + "learning_rate": 9.994094667425316e-06, + "loss": 0.5354, + "step": 230 + }, + { + "epoch": 0.025454545454545455, + "grad_norm": 11.691323280334473, + "learning_rate": 9.994009410000273e-06, + "loss": 0.4525, + "step": 231 + }, + { + "epoch": 0.02556473829201102, + "grad_norm": 7.918416976928711, + "learning_rate": 9.993923541900974e-06, + "loss": 0.5041, + "step": 232 + }, + { + "epoch": 0.025674931129476584, + "grad_norm": 15.92847728729248, + "learning_rate": 9.993837063137923e-06, + "loss": 0.5382, + "step": 233 + }, + { + "epoch": 0.025785123966942148, + "grad_norm": 18.090808868408203, + "learning_rate": 9.99374997372169e-06, + "loss": 0.5493, + "step": 234 + }, + { + "epoch": 
0.025895316804407712, + "grad_norm": 24.221498489379883, + "learning_rate": 9.993662273662928e-06, + "loss": 0.6946, + "step": 235 + }, + { + "epoch": 0.02600550964187328, + "grad_norm": 14.936357498168945, + "learning_rate": 9.99357396297236e-06, + "loss": 0.5798, + "step": 236 + }, + { + "epoch": 0.026115702479338844, + "grad_norm": 11.742161750793457, + "learning_rate": 9.993485041660784e-06, + "loss": 0.5504, + "step": 237 + }, + { + "epoch": 0.026225895316804408, + "grad_norm": 15.549546241760254, + "learning_rate": 9.993395509739076e-06, + "loss": 0.5448, + "step": 238 + }, + { + "epoch": 0.026336088154269972, + "grad_norm": 12.071475982666016, + "learning_rate": 9.99330536721818e-06, + "loss": 0.5485, + "step": 239 + }, + { + "epoch": 0.026446280991735537, + "grad_norm": 14.900094985961914, + "learning_rate": 9.993214614109122e-06, + "loss": 0.4189, + "step": 240 + }, + { + "epoch": 0.0265564738292011, + "grad_norm": 13.360709190368652, + "learning_rate": 9.993123250422998e-06, + "loss": 0.4946, + "step": 241 + }, + { + "epoch": 0.02666666666666667, + "grad_norm": 14.573027610778809, + "learning_rate": 9.993031276170981e-06, + "loss": 0.5292, + "step": 242 + }, + { + "epoch": 0.026776859504132233, + "grad_norm": 11.061827659606934, + "learning_rate": 9.992938691364317e-06, + "loss": 0.5122, + "step": 243 + }, + { + "epoch": 0.026887052341597797, + "grad_norm": 11.963610649108887, + "learning_rate": 9.992845496014327e-06, + "loss": 0.4796, + "step": 244 + }, + { + "epoch": 0.02699724517906336, + "grad_norm": 9.625136375427246, + "learning_rate": 9.99275169013241e-06, + "loss": 0.4729, + "step": 245 + }, + { + "epoch": 0.027107438016528925, + "grad_norm": 11.104870796203613, + "learning_rate": 9.992657273730031e-06, + "loss": 0.5244, + "step": 246 + }, + { + "epoch": 0.02721763085399449, + "grad_norm": 24.728092193603516, + "learning_rate": 9.99256224681874e-06, + "loss": 0.5564, + "step": 247 + }, + { + "epoch": 0.027327823691460053, + "grad_norm": 19.725374221801758, + "learning_rate": 9.992466609410156e-06, + "loss": 0.5839, + "step": 248 + }, + { + "epoch": 0.02743801652892562, + "grad_norm": 12.868062973022461, + "learning_rate": 9.992370361515973e-06, + "loss": 0.4347, + "step": 249 + }, + { + "epoch": 0.027548209366391185, + "grad_norm": 22.5147705078125, + "learning_rate": 9.99227350314796e-06, + "loss": 0.532, + "step": 250 + }, + { + "epoch": 0.02765840220385675, + "grad_norm": 7.776773929595947, + "learning_rate": 9.992176034317963e-06, + "loss": 0.4972, + "step": 251 + }, + { + "epoch": 0.027768595041322314, + "grad_norm": 9.029813766479492, + "learning_rate": 9.9920779550379e-06, + "loss": 0.5039, + "step": 252 + }, + { + "epoch": 0.027878787878787878, + "grad_norm": 13.0871000289917, + "learning_rate": 9.991979265319762e-06, + "loss": 0.6035, + "step": 253 + }, + { + "epoch": 0.027988980716253442, + "grad_norm": 12.87193489074707, + "learning_rate": 9.99187996517562e-06, + "loss": 0.5451, + "step": 254 + }, + { + "epoch": 0.02809917355371901, + "grad_norm": 16.561864852905273, + "learning_rate": 9.991780054617613e-06, + "loss": 0.4178, + "step": 255 + }, + { + "epoch": 0.028209366391184574, + "grad_norm": 8.838342666625977, + "learning_rate": 9.991679533657962e-06, + "loss": 0.5038, + "step": 256 + }, + { + "epoch": 0.028319559228650138, + "grad_norm": 10.147689819335938, + "learning_rate": 9.991578402308957e-06, + "loss": 0.5399, + "step": 257 + }, + { + "epoch": 0.028429752066115702, + "grad_norm": 9.908218383789062, + "learning_rate": 9.991476660582964e-06, + "loss": 
0.512, + "step": 258 + }, + { + "epoch": 0.028539944903581266, + "grad_norm": 9.494897842407227, + "learning_rate": 9.991374308492424e-06, + "loss": 0.4804, + "step": 259 + }, + { + "epoch": 0.02865013774104683, + "grad_norm": 12.115522384643555, + "learning_rate": 9.991271346049855e-06, + "loss": 0.592, + "step": 260 + }, + { + "epoch": 0.0287603305785124, + "grad_norm": 9.767868995666504, + "learning_rate": 9.991167773267845e-06, + "loss": 0.4458, + "step": 261 + }, + { + "epoch": 0.028870523415977963, + "grad_norm": 15.636098861694336, + "learning_rate": 9.99106359015906e-06, + "loss": 0.6042, + "step": 262 + }, + { + "epoch": 0.028980716253443527, + "grad_norm": 14.885655403137207, + "learning_rate": 9.990958796736239e-06, + "loss": 0.4564, + "step": 263 + }, + { + "epoch": 0.02909090909090909, + "grad_norm": 15.355427742004395, + "learning_rate": 9.990853393012196e-06, + "loss": 0.4221, + "step": 264 + }, + { + "epoch": 0.029201101928374655, + "grad_norm": 15.80118179321289, + "learning_rate": 9.990747378999823e-06, + "loss": 0.5817, + "step": 265 + }, + { + "epoch": 0.02931129476584022, + "grad_norm": 10.542325019836426, + "learning_rate": 9.99064075471208e-06, + "loss": 0.572, + "step": 266 + }, + { + "epoch": 0.029421487603305783, + "grad_norm": 11.173530578613281, + "learning_rate": 9.990533520162007e-06, + "loss": 0.5448, + "step": 267 + }, + { + "epoch": 0.02953168044077135, + "grad_norm": 12.251981735229492, + "learning_rate": 9.990425675362715e-06, + "loss": 0.5747, + "step": 268 + }, + { + "epoch": 0.029641873278236915, + "grad_norm": 11.749959945678711, + "learning_rate": 9.990317220327393e-06, + "loss": 0.6011, + "step": 269 + }, + { + "epoch": 0.02975206611570248, + "grad_norm": 8.856796264648438, + "learning_rate": 9.990208155069303e-06, + "loss": 0.5938, + "step": 270 + }, + { + "epoch": 0.029862258953168044, + "grad_norm": 14.874869346618652, + "learning_rate": 9.99009847960178e-06, + "loss": 0.5323, + "step": 271 + }, + { + "epoch": 0.029972451790633608, + "grad_norm": 7.231379985809326, + "learning_rate": 9.989988193938239e-06, + "loss": 0.4536, + "step": 272 + }, + { + "epoch": 0.030082644628099172, + "grad_norm": 12.203042984008789, + "learning_rate": 9.989877298092161e-06, + "loss": 0.5048, + "step": 273 + }, + { + "epoch": 0.03019283746556474, + "grad_norm": 15.385665893554688, + "learning_rate": 9.98976579207711e-06, + "loss": 0.5121, + "step": 274 + }, + { + "epoch": 0.030303030303030304, + "grad_norm": 14.260980606079102, + "learning_rate": 9.989653675906722e-06, + "loss": 0.5526, + "step": 275 + }, + { + "epoch": 0.030413223140495868, + "grad_norm": 18.089174270629883, + "learning_rate": 9.989540949594701e-06, + "loss": 0.6646, + "step": 276 + }, + { + "epoch": 0.030523415977961432, + "grad_norm": 9.740517616271973, + "learning_rate": 9.989427613154838e-06, + "loss": 0.513, + "step": 277 + }, + { + "epoch": 0.030633608815426996, + "grad_norm": 13.526970863342285, + "learning_rate": 9.989313666600987e-06, + "loss": 0.4703, + "step": 278 + }, + { + "epoch": 0.03074380165289256, + "grad_norm": 13.345050811767578, + "learning_rate": 9.989199109947084e-06, + "loss": 0.4925, + "step": 279 + }, + { + "epoch": 0.03085399449035813, + "grad_norm": 18.05704116821289, + "learning_rate": 9.989083943207137e-06, + "loss": 0.5549, + "step": 280 + }, + { + "epoch": 0.030964187327823692, + "grad_norm": 10.237497329711914, + "learning_rate": 9.98896816639523e-06, + "loss": 0.5152, + "step": 281 + }, + { + "epoch": 0.031074380165289257, + "grad_norm": 7.69012975692749, + 
"learning_rate": 9.988851779525515e-06, + "loss": 0.5107, + "step": 282 + }, + { + "epoch": 0.03118457300275482, + "grad_norm": 10.806635856628418, + "learning_rate": 9.98873478261223e-06, + "loss": 0.5326, + "step": 283 + }, + { + "epoch": 0.03129476584022039, + "grad_norm": 9.593619346618652, + "learning_rate": 9.98861717566968e-06, + "loss": 0.3565, + "step": 284 + }, + { + "epoch": 0.03140495867768595, + "grad_norm": 9.333944320678711, + "learning_rate": 9.988498958712245e-06, + "loss": 0.55, + "step": 285 + }, + { + "epoch": 0.03151515151515152, + "grad_norm": 14.35074520111084, + "learning_rate": 9.98838013175438e-06, + "loss": 0.5326, + "step": 286 + }, + { + "epoch": 0.03162534435261708, + "grad_norm": 15.526893615722656, + "learning_rate": 9.988260694810616e-06, + "loss": 0.4703, + "step": 287 + }, + { + "epoch": 0.031735537190082645, + "grad_norm": 12.787149429321289, + "learning_rate": 9.988140647895562e-06, + "loss": 0.4842, + "step": 288 + }, + { + "epoch": 0.03184573002754821, + "grad_norm": 17.571022033691406, + "learning_rate": 9.98801999102389e-06, + "loss": 0.5092, + "step": 289 + }, + { + "epoch": 0.031955922865013774, + "grad_norm": 10.923630714416504, + "learning_rate": 9.987898724210359e-06, + "loss": 0.506, + "step": 290 + }, + { + "epoch": 0.03206611570247934, + "grad_norm": 15.550124168395996, + "learning_rate": 9.987776847469797e-06, + "loss": 0.4913, + "step": 291 + }, + { + "epoch": 0.0321763085399449, + "grad_norm": 10.694622039794922, + "learning_rate": 9.987654360817106e-06, + "loss": 0.4885, + "step": 292 + }, + { + "epoch": 0.032286501377410466, + "grad_norm": 9.674568176269531, + "learning_rate": 9.987531264267265e-06, + "loss": 0.4923, + "step": 293 + }, + { + "epoch": 0.03239669421487603, + "grad_norm": 9.947579383850098, + "learning_rate": 9.987407557835327e-06, + "loss": 0.4283, + "step": 294 + }, + { + "epoch": 0.032506887052341595, + "grad_norm": 10.365708351135254, + "learning_rate": 9.987283241536419e-06, + "loss": 0.3912, + "step": 295 + }, + { + "epoch": 0.032617079889807166, + "grad_norm": 11.711014747619629, + "learning_rate": 9.987158315385738e-06, + "loss": 0.4957, + "step": 296 + }, + { + "epoch": 0.03272727272727273, + "grad_norm": 13.972429275512695, + "learning_rate": 9.987032779398566e-06, + "loss": 0.5765, + "step": 297 + }, + { + "epoch": 0.032837465564738294, + "grad_norm": 11.309210777282715, + "learning_rate": 9.986906633590252e-06, + "loss": 0.4206, + "step": 298 + }, + { + "epoch": 0.03294765840220386, + "grad_norm": 14.466939926147461, + "learning_rate": 9.986779877976221e-06, + "loss": 0.4687, + "step": 299 + }, + { + "epoch": 0.03305785123966942, + "grad_norm": 11.72865104675293, + "learning_rate": 9.986652512571972e-06, + "loss": 0.5604, + "step": 300 + }, + { + "epoch": 0.03316804407713499, + "grad_norm": 12.25109577178955, + "learning_rate": 9.98652453739308e-06, + "loss": 0.4229, + "step": 301 + }, + { + "epoch": 0.03327823691460055, + "grad_norm": 10.369767189025879, + "learning_rate": 9.986395952455194e-06, + "loss": 0.4978, + "step": 302 + }, + { + "epoch": 0.033388429752066115, + "grad_norm": 14.900583267211914, + "learning_rate": 9.986266757774038e-06, + "loss": 0.4373, + "step": 303 + }, + { + "epoch": 0.03349862258953168, + "grad_norm": 10.342029571533203, + "learning_rate": 9.986136953365409e-06, + "loss": 0.4624, + "step": 304 + }, + { + "epoch": 0.03360881542699724, + "grad_norm": 10.64696216583252, + "learning_rate": 9.986006539245181e-06, + "loss": 0.4459, + "step": 305 + }, + { + "epoch": 0.03371900826446281, + 
"grad_norm": 11.714919090270996, + "learning_rate": 9.9858755154293e-06, + "loss": 0.5102, + "step": 306 + }, + { + "epoch": 0.03382920110192837, + "grad_norm": 9.903362274169922, + "learning_rate": 9.985743881933789e-06, + "loss": 0.5264, + "step": 307 + }, + { + "epoch": 0.03393939393939394, + "grad_norm": 7.611393451690674, + "learning_rate": 9.985611638774744e-06, + "loss": 0.5104, + "step": 308 + }, + { + "epoch": 0.03404958677685951, + "grad_norm": 9.278596878051758, + "learning_rate": 9.985478785968334e-06, + "loss": 0.4472, + "step": 309 + }, + { + "epoch": 0.03415977961432507, + "grad_norm": 14.639801025390625, + "learning_rate": 9.985345323530806e-06, + "loss": 0.4893, + "step": 310 + }, + { + "epoch": 0.034269972451790635, + "grad_norm": 12.649656295776367, + "learning_rate": 9.985211251478482e-06, + "loss": 0.4276, + "step": 311 + }, + { + "epoch": 0.0343801652892562, + "grad_norm": 16.50640106201172, + "learning_rate": 9.985076569827752e-06, + "loss": 0.4735, + "step": 312 + }, + { + "epoch": 0.034490358126721764, + "grad_norm": 12.83117961883545, + "learning_rate": 9.984941278595088e-06, + "loss": 0.5901, + "step": 313 + }, + { + "epoch": 0.03460055096418733, + "grad_norm": 14.302864074707031, + "learning_rate": 9.984805377797033e-06, + "loss": 0.4452, + "step": 314 + }, + { + "epoch": 0.03471074380165289, + "grad_norm": 14.504166603088379, + "learning_rate": 9.984668867450207e-06, + "loss": 0.5536, + "step": 315 + }, + { + "epoch": 0.034820936639118456, + "grad_norm": 17.524925231933594, + "learning_rate": 9.9845317475713e-06, + "loss": 0.582, + "step": 316 + }, + { + "epoch": 0.03493112947658402, + "grad_norm": 15.99123764038086, + "learning_rate": 9.984394018177079e-06, + "loss": 0.5481, + "step": 317 + }, + { + "epoch": 0.035041322314049585, + "grad_norm": 12.897750854492188, + "learning_rate": 9.984255679284388e-06, + "loss": 0.4505, + "step": 318 + }, + { + "epoch": 0.03515151515151515, + "grad_norm": 11.626313209533691, + "learning_rate": 9.984116730910141e-06, + "loss": 0.5878, + "step": 319 + }, + { + "epoch": 0.03526170798898071, + "grad_norm": 11.26834774017334, + "learning_rate": 9.98397717307133e-06, + "loss": 0.5804, + "step": 320 + }, + { + "epoch": 0.035371900826446284, + "grad_norm": 11.204084396362305, + "learning_rate": 9.983837005785022e-06, + "loss": 0.5339, + "step": 321 + }, + { + "epoch": 0.03548209366391185, + "grad_norm": 11.40246295928955, + "learning_rate": 9.983696229068354e-06, + "loss": 0.5181, + "step": 322 + }, + { + "epoch": 0.03559228650137741, + "grad_norm": 12.926445960998535, + "learning_rate": 9.98355484293854e-06, + "loss": 0.5742, + "step": 323 + }, + { + "epoch": 0.03570247933884298, + "grad_norm": 10.973544120788574, + "learning_rate": 9.983412847412872e-06, + "loss": 0.5005, + "step": 324 + }, + { + "epoch": 0.03581267217630854, + "grad_norm": 7.6059956550598145, + "learning_rate": 9.983270242508712e-06, + "loss": 0.4821, + "step": 325 + }, + { + "epoch": 0.035922865013774105, + "grad_norm": 15.33000659942627, + "learning_rate": 9.983127028243497e-06, + "loss": 0.503, + "step": 326 + }, + { + "epoch": 0.03603305785123967, + "grad_norm": 21.503162384033203, + "learning_rate": 9.98298320463474e-06, + "loss": 0.5549, + "step": 327 + }, + { + "epoch": 0.036143250688705233, + "grad_norm": 17.429031372070312, + "learning_rate": 9.982838771700027e-06, + "loss": 0.5355, + "step": 328 + }, + { + "epoch": 0.0362534435261708, + "grad_norm": 9.791876792907715, + "learning_rate": 9.982693729457023e-06, + "loss": 0.4157, + "step": 329 + }, + { + 
"epoch": 0.03636363636363636, + "grad_norm": 13.283187866210938, + "learning_rate": 9.98254807792346e-06, + "loss": 0.5341, + "step": 330 + }, + { + "epoch": 0.036473829201101926, + "grad_norm": 28.495004653930664, + "learning_rate": 9.982401817117149e-06, + "loss": 0.4931, + "step": 331 + }, + { + "epoch": 0.03658402203856749, + "grad_norm": 10.677569389343262, + "learning_rate": 9.982254947055976e-06, + "loss": 0.4693, + "step": 332 + }, + { + "epoch": 0.036694214876033054, + "grad_norm": 14.609735488891602, + "learning_rate": 9.982107467757902e-06, + "loss": 0.5492, + "step": 333 + }, + { + "epoch": 0.036804407713498626, + "grad_norm": 7.3842339515686035, + "learning_rate": 9.981959379240957e-06, + "loss": 0.4593, + "step": 334 + }, + { + "epoch": 0.03691460055096419, + "grad_norm": 11.509512901306152, + "learning_rate": 9.981810681523254e-06, + "loss": 0.482, + "step": 335 + }, + { + "epoch": 0.037024793388429754, + "grad_norm": 12.058653831481934, + "learning_rate": 9.981661374622974e-06, + "loss": 0.474, + "step": 336 + }, + { + "epoch": 0.03713498622589532, + "grad_norm": 9.072152137756348, + "learning_rate": 9.981511458558373e-06, + "loss": 0.5597, + "step": 337 + }, + { + "epoch": 0.03724517906336088, + "grad_norm": 12.551641464233398, + "learning_rate": 9.981360933347783e-06, + "loss": 0.4377, + "step": 338 + }, + { + "epoch": 0.037355371900826446, + "grad_norm": 17.34666633605957, + "learning_rate": 9.981209799009613e-06, + "loss": 0.496, + "step": 339 + }, + { + "epoch": 0.03746556473829201, + "grad_norm": 21.156679153442383, + "learning_rate": 9.981058055562343e-06, + "loss": 0.6343, + "step": 340 + }, + { + "epoch": 0.037575757575757575, + "grad_norm": 10.933553695678711, + "learning_rate": 9.980905703024525e-06, + "loss": 0.5036, + "step": 341 + }, + { + "epoch": 0.03768595041322314, + "grad_norm": 7.121741771697998, + "learning_rate": 9.980752741414796e-06, + "loss": 0.5217, + "step": 342 + }, + { + "epoch": 0.0377961432506887, + "grad_norm": 7.140209674835205, + "learning_rate": 9.980599170751852e-06, + "loss": 0.4371, + "step": 343 + }, + { + "epoch": 0.03790633608815427, + "grad_norm": 8.137228012084961, + "learning_rate": 9.980444991054478e-06, + "loss": 0.4995, + "step": 344 + }, + { + "epoch": 0.03801652892561983, + "grad_norm": 7.849926948547363, + "learning_rate": 9.980290202341525e-06, + "loss": 0.5248, + "step": 345 + }, + { + "epoch": 0.0381267217630854, + "grad_norm": 15.131035804748535, + "learning_rate": 9.980134804631922e-06, + "loss": 0.5769, + "step": 346 + }, + { + "epoch": 0.03823691460055097, + "grad_norm": 9.500950813293457, + "learning_rate": 9.97997879794467e-06, + "loss": 0.5525, + "step": 347 + }, + { + "epoch": 0.03834710743801653, + "grad_norm": 15.258380889892578, + "learning_rate": 9.979822182298843e-06, + "loss": 0.5377, + "step": 348 + }, + { + "epoch": 0.038457300275482095, + "grad_norm": 20.885284423828125, + "learning_rate": 9.9796649577136e-06, + "loss": 0.6536, + "step": 349 + }, + { + "epoch": 0.03856749311294766, + "grad_norm": 12.07435417175293, + "learning_rate": 9.979507124208158e-06, + "loss": 0.583, + "step": 350 + }, + { + "epoch": 0.038677685950413224, + "grad_norm": 12.532731056213379, + "learning_rate": 9.979348681801821e-06, + "loss": 0.4871, + "step": 351 + }, + { + "epoch": 0.03878787878787879, + "grad_norm": 9.900496482849121, + "learning_rate": 9.979189630513966e-06, + "loss": 0.5749, + "step": 352 + }, + { + "epoch": 0.03889807162534435, + "grad_norm": 12.795638084411621, + "learning_rate": 9.979029970364038e-06, + 
"loss": 0.5578, + "step": 353 + }, + { + "epoch": 0.039008264462809916, + "grad_norm": 20.84184455871582, + "learning_rate": 9.978869701371562e-06, + "loss": 0.6099, + "step": 354 + }, + { + "epoch": 0.03911845730027548, + "grad_norm": 8.181387901306152, + "learning_rate": 9.978708823556135e-06, + "loss": 0.4692, + "step": 355 + }, + { + "epoch": 0.039228650137741045, + "grad_norm": 9.632551193237305, + "learning_rate": 9.97854733693743e-06, + "loss": 0.3728, + "step": 356 + }, + { + "epoch": 0.03933884297520661, + "grad_norm": 10.69709587097168, + "learning_rate": 9.978385241535194e-06, + "loss": 0.5292, + "step": 357 + }, + { + "epoch": 0.03944903581267217, + "grad_norm": 8.557096481323242, + "learning_rate": 9.978222537369249e-06, + "loss": 0.4841, + "step": 358 + }, + { + "epoch": 0.039559228650137744, + "grad_norm": 9.893890380859375, + "learning_rate": 9.97805922445949e-06, + "loss": 0.4939, + "step": 359 + }, + { + "epoch": 0.03966942148760331, + "grad_norm": 10.700851440429688, + "learning_rate": 9.977895302825886e-06, + "loss": 0.3914, + "step": 360 + }, + { + "epoch": 0.03977961432506887, + "grad_norm": 11.766791343688965, + "learning_rate": 9.977730772488483e-06, + "loss": 0.5544, + "step": 361 + }, + { + "epoch": 0.03988980716253444, + "grad_norm": 13.675890922546387, + "learning_rate": 9.977565633467401e-06, + "loss": 0.6028, + "step": 362 + }, + { + "epoch": 0.04, + "grad_norm": 7.105637073516846, + "learning_rate": 9.97739988578283e-06, + "loss": 0.4251, + "step": 363 + }, + { + "epoch": 0.040110192837465565, + "grad_norm": 15.840269088745117, + "learning_rate": 9.977233529455042e-06, + "loss": 0.5866, + "step": 364 + }, + { + "epoch": 0.04022038567493113, + "grad_norm": 10.23617172241211, + "learning_rate": 9.977066564504374e-06, + "loss": 0.5122, + "step": 365 + }, + { + "epoch": 0.04033057851239669, + "grad_norm": 8.172809600830078, + "learning_rate": 9.976898990951249e-06, + "loss": 0.4953, + "step": 366 + }, + { + "epoch": 0.04044077134986226, + "grad_norm": 11.545899391174316, + "learning_rate": 9.976730808816153e-06, + "loss": 0.5851, + "step": 367 + }, + { + "epoch": 0.04055096418732782, + "grad_norm": 10.631319999694824, + "learning_rate": 9.976562018119654e-06, + "loss": 0.4278, + "step": 368 + }, + { + "epoch": 0.040661157024793386, + "grad_norm": 14.287742614746094, + "learning_rate": 9.976392618882391e-06, + "loss": 0.5445, + "step": 369 + }, + { + "epoch": 0.04077134986225895, + "grad_norm": 11.023226737976074, + "learning_rate": 9.976222611125079e-06, + "loss": 0.4763, + "step": 370 + }, + { + "epoch": 0.04088154269972452, + "grad_norm": 10.713460922241211, + "learning_rate": 9.976051994868506e-06, + "loss": 0.5459, + "step": 371 + }, + { + "epoch": 0.040991735537190085, + "grad_norm": 9.844223976135254, + "learning_rate": 9.975880770133537e-06, + "loss": 0.4924, + "step": 372 + }, + { + "epoch": 0.04110192837465565, + "grad_norm": 16.06312370300293, + "learning_rate": 9.975708936941107e-06, + "loss": 0.5548, + "step": 373 + }, + { + "epoch": 0.041212121212121214, + "grad_norm": 12.113286972045898, + "learning_rate": 9.97553649531223e-06, + "loss": 0.4829, + "step": 374 + }, + { + "epoch": 0.04132231404958678, + "grad_norm": 11.757156372070312, + "learning_rate": 9.975363445267993e-06, + "loss": 0.4318, + "step": 375 + }, + { + "epoch": 0.04143250688705234, + "grad_norm": 14.062914848327637, + "learning_rate": 9.975189786829554e-06, + "loss": 0.5562, + "step": 376 + }, + { + "epoch": 0.041542699724517906, + "grad_norm": 11.511475563049316, + "learning_rate": 
9.975015520018149e-06, + "loss": 0.5945, + "step": 377 + }, + { + "epoch": 0.04165289256198347, + "grad_norm": 20.82297706604004, + "learning_rate": 9.974840644855091e-06, + "loss": 0.5634, + "step": 378 + }, + { + "epoch": 0.041763085399449035, + "grad_norm": 13.976396560668945, + "learning_rate": 9.974665161361759e-06, + "loss": 0.5392, + "step": 379 + }, + { + "epoch": 0.0418732782369146, + "grad_norm": 12.412278175354004, + "learning_rate": 9.974489069559615e-06, + "loss": 0.6214, + "step": 380 + }, + { + "epoch": 0.04198347107438016, + "grad_norm": 15.933039665222168, + "learning_rate": 9.97431236947019e-06, + "loss": 0.5612, + "step": 381 + }, + { + "epoch": 0.04209366391184573, + "grad_norm": 11.01435375213623, + "learning_rate": 9.974135061115091e-06, + "loss": 0.4561, + "step": 382 + }, + { + "epoch": 0.04220385674931129, + "grad_norm": 6.11425256729126, + "learning_rate": 9.973957144516002e-06, + "loss": 0.4799, + "step": 383 + }, + { + "epoch": 0.04231404958677686, + "grad_norm": 9.104811668395996, + "learning_rate": 9.973778619694673e-06, + "loss": 0.4598, + "step": 384 + }, + { + "epoch": 0.04242424242424243, + "grad_norm": 11.529860496520996, + "learning_rate": 9.973599486672942e-06, + "loss": 0.4746, + "step": 385 + }, + { + "epoch": 0.04253443526170799, + "grad_norm": 10.589902877807617, + "learning_rate": 9.973419745472708e-06, + "loss": 0.448, + "step": 386 + }, + { + "epoch": 0.042644628099173555, + "grad_norm": 6.479124546051025, + "learning_rate": 9.973239396115952e-06, + "loss": 0.5251, + "step": 387 + }, + { + "epoch": 0.04275482093663912, + "grad_norm": 38.29296112060547, + "learning_rate": 9.973058438624727e-06, + "loss": 0.5769, + "step": 388 + }, + { + "epoch": 0.042865013774104684, + "grad_norm": 11.937189102172852, + "learning_rate": 9.972876873021162e-06, + "loss": 0.5102, + "step": 389 + }, + { + "epoch": 0.04297520661157025, + "grad_norm": 12.845355987548828, + "learning_rate": 9.972694699327456e-06, + "loss": 0.549, + "step": 390 + }, + { + "epoch": 0.04308539944903581, + "grad_norm": 7.100241661071777, + "learning_rate": 9.972511917565889e-06, + "loss": 0.4335, + "step": 391 + }, + { + "epoch": 0.043195592286501376, + "grad_norm": 9.807534217834473, + "learning_rate": 9.97232852775881e-06, + "loss": 0.5071, + "step": 392 + }, + { + "epoch": 0.04330578512396694, + "grad_norm": 9.859899520874023, + "learning_rate": 9.972144529928644e-06, + "loss": 0.4604, + "step": 393 + }, + { + "epoch": 0.043415977961432504, + "grad_norm": 16.952932357788086, + "learning_rate": 9.971959924097892e-06, + "loss": 0.5819, + "step": 394 + }, + { + "epoch": 0.04352617079889807, + "grad_norm": 14.593550682067871, + "learning_rate": 9.971774710289124e-06, + "loss": 0.5367, + "step": 395 + }, + { + "epoch": 0.04363636363636364, + "grad_norm": 13.790374755859375, + "learning_rate": 9.971588888524993e-06, + "loss": 0.4683, + "step": 396 + }, + { + "epoch": 0.043746556473829204, + "grad_norm": 15.670136451721191, + "learning_rate": 9.971402458828218e-06, + "loss": 0.5072, + "step": 397 + }, + { + "epoch": 0.04385674931129477, + "grad_norm": 16.61530113220215, + "learning_rate": 9.9712154212216e-06, + "loss": 0.5691, + "step": 398 + }, + { + "epoch": 0.04396694214876033, + "grad_norm": 9.777688980102539, + "learning_rate": 9.971027775728007e-06, + "loss": 0.4844, + "step": 399 + }, + { + "epoch": 0.0440771349862259, + "grad_norm": 10.27904224395752, + "learning_rate": 9.970839522370383e-06, + "loss": 0.5481, + "step": 400 + }, + { + "epoch": 0.04418732782369146, + "grad_norm": 
7.96864128112793, + "learning_rate": 9.970650661171751e-06, + "loss": 0.4588, + "step": 401 + }, + { + "epoch": 0.044297520661157025, + "grad_norm": 8.261791229248047, + "learning_rate": 9.970461192155205e-06, + "loss": 0.5101, + "step": 402 + }, + { + "epoch": 0.04440771349862259, + "grad_norm": 11.426614761352539, + "learning_rate": 9.970271115343911e-06, + "loss": 0.5948, + "step": 403 + }, + { + "epoch": 0.04451790633608815, + "grad_norm": 13.521474838256836, + "learning_rate": 9.970080430761116e-06, + "loss": 0.5626, + "step": 404 + }, + { + "epoch": 0.04462809917355372, + "grad_norm": 8.598532676696777, + "learning_rate": 9.969889138430133e-06, + "loss": 0.4888, + "step": 405 + }, + { + "epoch": 0.04473829201101928, + "grad_norm": 13.616643905639648, + "learning_rate": 9.969697238374355e-06, + "loss": 0.5023, + "step": 406 + }, + { + "epoch": 0.044848484848484846, + "grad_norm": 7.1386003494262695, + "learning_rate": 9.969504730617248e-06, + "loss": 0.4734, + "step": 407 + }, + { + "epoch": 0.04495867768595041, + "grad_norm": 10.026013374328613, + "learning_rate": 9.969311615182353e-06, + "loss": 0.4859, + "step": 408 + }, + { + "epoch": 0.04506887052341598, + "grad_norm": 15.475004196166992, + "learning_rate": 9.969117892093283e-06, + "loss": 0.588, + "step": 409 + }, + { + "epoch": 0.045179063360881545, + "grad_norm": 15.79782772064209, + "learning_rate": 9.968923561373728e-06, + "loss": 0.633, + "step": 410 + }, + { + "epoch": 0.04528925619834711, + "grad_norm": 22.108070373535156, + "learning_rate": 9.96872862304745e-06, + "loss": 0.5475, + "step": 411 + }, + { + "epoch": 0.045399449035812674, + "grad_norm": 8.265995979309082, + "learning_rate": 9.968533077138287e-06, + "loss": 0.4666, + "step": 412 + }, + { + "epoch": 0.04550964187327824, + "grad_norm": 7.072658061981201, + "learning_rate": 9.96833692367015e-06, + "loss": 0.4725, + "step": 413 + }, + { + "epoch": 0.0456198347107438, + "grad_norm": 10.56517219543457, + "learning_rate": 9.968140162667024e-06, + "loss": 0.4867, + "step": 414 + }, + { + "epoch": 0.045730027548209366, + "grad_norm": 16.286352157592773, + "learning_rate": 9.967942794152972e-06, + "loss": 0.4781, + "step": 415 + }, + { + "epoch": 0.04584022038567493, + "grad_norm": 8.922318458557129, + "learning_rate": 9.967744818152125e-06, + "loss": 0.4878, + "step": 416 + }, + { + "epoch": 0.045950413223140495, + "grad_norm": 6.937612056732178, + "learning_rate": 9.967546234688694e-06, + "loss": 0.5643, + "step": 417 + }, + { + "epoch": 0.04606060606060606, + "grad_norm": 12.612809181213379, + "learning_rate": 9.967347043786964e-06, + "loss": 0.5331, + "step": 418 + }, + { + "epoch": 0.04617079889807162, + "grad_norm": 7.663732051849365, + "learning_rate": 9.967147245471287e-06, + "loss": 0.4905, + "step": 419 + }, + { + "epoch": 0.04628099173553719, + "grad_norm": 10.40149974822998, + "learning_rate": 9.9669468397661e-06, + "loss": 0.4748, + "step": 420 + }, + { + "epoch": 0.04639118457300275, + "grad_norm": 8.454051971435547, + "learning_rate": 9.966745826695905e-06, + "loss": 0.3773, + "step": 421 + }, + { + "epoch": 0.04650137741046832, + "grad_norm": 7.636160373687744, + "learning_rate": 9.966544206285285e-06, + "loss": 0.4961, + "step": 422 + }, + { + "epoch": 0.04661157024793389, + "grad_norm": 6.887092113494873, + "learning_rate": 9.96634197855889e-06, + "loss": 0.509, + "step": 423 + }, + { + "epoch": 0.04672176308539945, + "grad_norm": 9.089897155761719, + "learning_rate": 9.966139143541455e-06, + "loss": 0.4543, + "step": 424 + }, + { + "epoch": 
0.046831955922865015, + "grad_norm": 11.715934753417969, + "learning_rate": 9.965935701257779e-06, + "loss": 0.5041, + "step": 425 + }, + { + "epoch": 0.04694214876033058, + "grad_norm": 12.82231330871582, + "learning_rate": 9.96573165173274e-06, + "loss": 0.5387, + "step": 426 + }, + { + "epoch": 0.04705234159779614, + "grad_norm": 6.379404067993164, + "learning_rate": 9.965526994991288e-06, + "loss": 0.5053, + "step": 427 + }, + { + "epoch": 0.04716253443526171, + "grad_norm": 11.586793899536133, + "learning_rate": 9.96532173105845e-06, + "loss": 0.4907, + "step": 428 + }, + { + "epoch": 0.04727272727272727, + "grad_norm": 5.423346996307373, + "learning_rate": 9.965115859959327e-06, + "loss": 0.4862, + "step": 429 + }, + { + "epoch": 0.047382920110192836, + "grad_norm": 5.6065754890441895, + "learning_rate": 9.964909381719092e-06, + "loss": 0.5168, + "step": 430 + }, + { + "epoch": 0.0474931129476584, + "grad_norm": 11.98960018157959, + "learning_rate": 9.964702296362995e-06, + "loss": 0.4249, + "step": 431 + }, + { + "epoch": 0.047603305785123964, + "grad_norm": 19.07823944091797, + "learning_rate": 9.964494603916356e-06, + "loss": 0.5719, + "step": 432 + }, + { + "epoch": 0.04771349862258953, + "grad_norm": 8.011237144470215, + "learning_rate": 9.964286304404573e-06, + "loss": 0.4781, + "step": 433 + }, + { + "epoch": 0.0478236914600551, + "grad_norm": 11.941851615905762, + "learning_rate": 9.964077397853117e-06, + "loss": 0.5121, + "step": 434 + }, + { + "epoch": 0.047933884297520664, + "grad_norm": 12.690646171569824, + "learning_rate": 9.963867884287534e-06, + "loss": 0.341, + "step": 435 + }, + { + "epoch": 0.04804407713498623, + "grad_norm": 9.887231826782227, + "learning_rate": 9.963657763733445e-06, + "loss": 0.4575, + "step": 436 + }, + { + "epoch": 0.04815426997245179, + "grad_norm": 25.173494338989258, + "learning_rate": 9.96344703621654e-06, + "loss": 0.5522, + "step": 437 + }, + { + "epoch": 0.048264462809917356, + "grad_norm": 9.900611877441406, + "learning_rate": 9.963235701762591e-06, + "loss": 0.4944, + "step": 438 + }, + { + "epoch": 0.04837465564738292, + "grad_norm": 9.656048774719238, + "learning_rate": 9.963023760397437e-06, + "loss": 0.5256, + "step": 439 + }, + { + "epoch": 0.048484848484848485, + "grad_norm": 18.26109504699707, + "learning_rate": 9.962811212146997e-06, + "loss": 0.5189, + "step": 440 + }, + { + "epoch": 0.04859504132231405, + "grad_norm": 9.127241134643555, + "learning_rate": 9.96259805703726e-06, + "loss": 0.529, + "step": 441 + }, + { + "epoch": 0.04870523415977961, + "grad_norm": 9.09994125366211, + "learning_rate": 9.962384295094293e-06, + "loss": 0.4587, + "step": 442 + }, + { + "epoch": 0.04881542699724518, + "grad_norm": 6.427917003631592, + "learning_rate": 9.962169926344231e-06, + "loss": 0.4969, + "step": 443 + }, + { + "epoch": 0.04892561983471074, + "grad_norm": 12.126689910888672, + "learning_rate": 9.961954950813292e-06, + "loss": 0.6163, + "step": 444 + }, + { + "epoch": 0.049035812672176306, + "grad_norm": 13.614275932312012, + "learning_rate": 9.96173936852776e-06, + "loss": 0.4255, + "step": 445 + }, + { + "epoch": 0.04914600550964187, + "grad_norm": 10.977042198181152, + "learning_rate": 9.961523179514e-06, + "loss": 0.4867, + "step": 446 + }, + { + "epoch": 0.04925619834710744, + "grad_norm": 7.973937034606934, + "learning_rate": 9.961306383798445e-06, + "loss": 0.4506, + "step": 447 + }, + { + "epoch": 0.049366391184573005, + "grad_norm": 12.840161323547363, + "learning_rate": 9.961088981407607e-06, + "loss": 0.6008, + 
"step": 448 + }, + { + "epoch": 0.04947658402203857, + "grad_norm": 7.462771415710449, + "learning_rate": 9.960870972368068e-06, + "loss": 0.5071, + "step": 449 + }, + { + "epoch": 0.049586776859504134, + "grad_norm": 8.614627838134766, + "learning_rate": 9.960652356706489e-06, + "loss": 0.5687, + "step": 450 + }, + { + "epoch": 0.0496969696969697, + "grad_norm": 8.726750373840332, + "learning_rate": 9.960433134449601e-06, + "loss": 0.4525, + "step": 451 + }, + { + "epoch": 0.04980716253443526, + "grad_norm": 11.282340049743652, + "learning_rate": 9.960213305624211e-06, + "loss": 0.5434, + "step": 452 + }, + { + "epoch": 0.049917355371900826, + "grad_norm": 18.417638778686523, + "learning_rate": 9.9599928702572e-06, + "loss": 0.5903, + "step": 453 + }, + { + "epoch": 0.05002754820936639, + "grad_norm": 11.008544921875, + "learning_rate": 9.959771828375523e-06, + "loss": 0.4515, + "step": 454 + }, + { + "epoch": 0.050137741046831955, + "grad_norm": 7.547202110290527, + "learning_rate": 9.95955018000621e-06, + "loss": 0.3885, + "step": 455 + }, + { + "epoch": 0.05024793388429752, + "grad_norm": 7.816680431365967, + "learning_rate": 9.959327925176365e-06, + "loss": 0.4768, + "step": 456 + }, + { + "epoch": 0.05035812672176308, + "grad_norm": 10.285447120666504, + "learning_rate": 9.959105063913164e-06, + "loss": 0.501, + "step": 457 + }, + { + "epoch": 0.05046831955922865, + "grad_norm": 9.738030433654785, + "learning_rate": 9.95888159624386e-06, + "loss": 0.5915, + "step": 458 + }, + { + "epoch": 0.05057851239669422, + "grad_norm": 8.554464340209961, + "learning_rate": 9.958657522195779e-06, + "loss": 0.4127, + "step": 459 + }, + { + "epoch": 0.05068870523415978, + "grad_norm": 13.582873344421387, + "learning_rate": 9.958432841796319e-06, + "loss": 0.4419, + "step": 460 + }, + { + "epoch": 0.05079889807162535, + "grad_norm": 10.61713695526123, + "learning_rate": 9.958207555072957e-06, + "loss": 0.5171, + "step": 461 + }, + { + "epoch": 0.05090909090909091, + "grad_norm": 14.762274742126465, + "learning_rate": 9.957981662053239e-06, + "loss": 0.4755, + "step": 462 + }, + { + "epoch": 0.051019283746556475, + "grad_norm": 10.927807807922363, + "learning_rate": 9.957755162764789e-06, + "loss": 0.3426, + "step": 463 + }, + { + "epoch": 0.05112947658402204, + "grad_norm": 9.77183723449707, + "learning_rate": 9.957528057235301e-06, + "loss": 0.476, + "step": 464 + }, + { + "epoch": 0.0512396694214876, + "grad_norm": 9.146940231323242, + "learning_rate": 9.95730034549255e-06, + "loss": 0.5547, + "step": 465 + }, + { + "epoch": 0.05134986225895317, + "grad_norm": 9.617725372314453, + "learning_rate": 9.95707202756438e-06, + "loss": 0.4616, + "step": 466 + }, + { + "epoch": 0.05146005509641873, + "grad_norm": 7.482451915740967, + "learning_rate": 9.956843103478709e-06, + "loss": 0.399, + "step": 467 + }, + { + "epoch": 0.051570247933884296, + "grad_norm": 12.296363830566406, + "learning_rate": 9.95661357326353e-06, + "loss": 0.4369, + "step": 468 + }, + { + "epoch": 0.05168044077134986, + "grad_norm": 9.191376686096191, + "learning_rate": 9.956383436946908e-06, + "loss": 0.4178, + "step": 469 + }, + { + "epoch": 0.051790633608815424, + "grad_norm": 24.099687576293945, + "learning_rate": 9.956152694556988e-06, + "loss": 0.5529, + "step": 470 + }, + { + "epoch": 0.05190082644628099, + "grad_norm": 10.05209732055664, + "learning_rate": 9.955921346121985e-06, + "loss": 0.4258, + "step": 471 + }, + { + "epoch": 0.05201101928374656, + "grad_norm": 10.024127006530762, + "learning_rate": 
9.955689391670188e-06, + "loss": 0.4693, + "step": 472 + }, + { + "epoch": 0.052121212121212124, + "grad_norm": 12.266153335571289, + "learning_rate": 9.95545683122996e-06, + "loss": 0.5465, + "step": 473 + }, + { + "epoch": 0.05223140495867769, + "grad_norm": 6.339637279510498, + "learning_rate": 9.955223664829739e-06, + "loss": 0.4511, + "step": 474 + }, + { + "epoch": 0.05234159779614325, + "grad_norm": 7.533390522003174, + "learning_rate": 9.954989892498037e-06, + "loss": 0.3883, + "step": 475 + }, + { + "epoch": 0.052451790633608816, + "grad_norm": 9.525851249694824, + "learning_rate": 9.954755514263442e-06, + "loss": 0.4514, + "step": 476 + }, + { + "epoch": 0.05256198347107438, + "grad_norm": 9.640487670898438, + "learning_rate": 9.95452053015461e-06, + "loss": 0.5306, + "step": 477 + }, + { + "epoch": 0.052672176308539945, + "grad_norm": 11.502488136291504, + "learning_rate": 9.95428494020028e-06, + "loss": 0.5595, + "step": 478 + }, + { + "epoch": 0.05278236914600551, + "grad_norm": 13.235862731933594, + "learning_rate": 9.954048744429256e-06, + "loss": 0.5212, + "step": 479 + }, + { + "epoch": 0.05289256198347107, + "grad_norm": 11.967812538146973, + "learning_rate": 9.953811942870422e-06, + "loss": 0.6275, + "step": 480 + }, + { + "epoch": 0.05300275482093664, + "grad_norm": 11.516127586364746, + "learning_rate": 9.953574535552735e-06, + "loss": 0.4844, + "step": 481 + }, + { + "epoch": 0.0531129476584022, + "grad_norm": 7.050947666168213, + "learning_rate": 9.953336522505227e-06, + "loss": 0.4414, + "step": 482 + }, + { + "epoch": 0.053223140495867766, + "grad_norm": 10.11103630065918, + "learning_rate": 9.953097903756997e-06, + "loss": 0.4532, + "step": 483 + }, + { + "epoch": 0.05333333333333334, + "grad_norm": 9.065771102905273, + "learning_rate": 9.95285867933723e-06, + "loss": 0.4776, + "step": 484 + }, + { + "epoch": 0.0534435261707989, + "grad_norm": 12.761256217956543, + "learning_rate": 9.952618849275173e-06, + "loss": 0.4895, + "step": 485 + }, + { + "epoch": 0.053553719008264465, + "grad_norm": 9.490694999694824, + "learning_rate": 9.952378413600159e-06, + "loss": 0.4583, + "step": 486 + }, + { + "epoch": 0.05366391184573003, + "grad_norm": 6.050409317016602, + "learning_rate": 9.952137372341584e-06, + "loss": 0.5119, + "step": 487 + }, + { + "epoch": 0.053774104683195594, + "grad_norm": 9.085648536682129, + "learning_rate": 9.951895725528924e-06, + "loss": 0.4401, + "step": 488 + }, + { + "epoch": 0.05388429752066116, + "grad_norm": 10.04773235321045, + "learning_rate": 9.951653473191727e-06, + "loss": 0.4676, + "step": 489 + }, + { + "epoch": 0.05399449035812672, + "grad_norm": 14.638328552246094, + "learning_rate": 9.951410615359619e-06, + "loss": 0.5522, + "step": 490 + }, + { + "epoch": 0.054104683195592286, + "grad_norm": 12.053609848022461, + "learning_rate": 9.951167152062296e-06, + "loss": 0.5225, + "step": 491 + }, + { + "epoch": 0.05421487603305785, + "grad_norm": 12.283226013183594, + "learning_rate": 9.950923083329525e-06, + "loss": 0.4905, + "step": 492 + }, + { + "epoch": 0.054325068870523414, + "grad_norm": 9.185400009155273, + "learning_rate": 9.950678409191157e-06, + "loss": 0.4756, + "step": 493 + }, + { + "epoch": 0.05443526170798898, + "grad_norm": 7.565023899078369, + "learning_rate": 9.950433129677106e-06, + "loss": 0.5202, + "step": 494 + }, + { + "epoch": 0.05454545454545454, + "grad_norm": 11.195167541503906, + "learning_rate": 9.950187244817368e-06, + "loss": 0.4532, + "step": 495 + }, + { + "epoch": 0.05465564738292011, + "grad_norm": 
10.009805679321289, + "learning_rate": 9.94994075464201e-06, + "loss": 0.509, + "step": 496 + }, + { + "epoch": 0.05476584022038568, + "grad_norm": 11.064774513244629, + "learning_rate": 9.949693659181175e-06, + "loss": 0.534, + "step": 497 + }, + { + "epoch": 0.05487603305785124, + "grad_norm": 9.216227531433105, + "learning_rate": 9.949445958465074e-06, + "loss": 0.4937, + "step": 498 + }, + { + "epoch": 0.054986225895316806, + "grad_norm": 9.245732307434082, + "learning_rate": 9.949197652523996e-06, + "loss": 0.5174, + "step": 499 + }, + { + "epoch": 0.05509641873278237, + "grad_norm": 15.904149055480957, + "learning_rate": 9.94894874138831e-06, + "loss": 0.5527, + "step": 500 + }, + { + "epoch": 0.055206611570247935, + "grad_norm": 8.78382682800293, + "learning_rate": 9.948699225088446e-06, + "loss": 0.5693, + "step": 501 + }, + { + "epoch": 0.0553168044077135, + "grad_norm": 9.095039367675781, + "learning_rate": 9.94844910365492e-06, + "loss": 0.5161, + "step": 502 + }, + { + "epoch": 0.05542699724517906, + "grad_norm": 29.34706687927246, + "learning_rate": 9.948198377118316e-06, + "loss": 0.7604, + "step": 503 + }, + { + "epoch": 0.05553719008264463, + "grad_norm": 11.551289558410645, + "learning_rate": 9.947947045509292e-06, + "loss": 0.518, + "step": 504 + }, + { + "epoch": 0.05564738292011019, + "grad_norm": 12.79025936126709, + "learning_rate": 9.947695108858583e-06, + "loss": 0.5482, + "step": 505 + }, + { + "epoch": 0.055757575757575756, + "grad_norm": 5.822357654571533, + "learning_rate": 9.947442567196996e-06, + "loss": 0.4895, + "step": 506 + }, + { + "epoch": 0.05586776859504132, + "grad_norm": 7.41774320602417, + "learning_rate": 9.94718942055541e-06, + "loss": 0.4525, + "step": 507 + }, + { + "epoch": 0.055977961432506884, + "grad_norm": 6.24639368057251, + "learning_rate": 9.946935668964784e-06, + "loss": 0.579, + "step": 508 + }, + { + "epoch": 0.05608815426997245, + "grad_norm": 8.718338012695312, + "learning_rate": 9.946681312456142e-06, + "loss": 0.5288, + "step": 509 + }, + { + "epoch": 0.05619834710743802, + "grad_norm": 8.52311897277832, + "learning_rate": 9.946426351060589e-06, + "loss": 0.4902, + "step": 510 + }, + { + "epoch": 0.056308539944903584, + "grad_norm": 5.036555767059326, + "learning_rate": 9.946170784809307e-06, + "loss": 0.4719, + "step": 511 + }, + { + "epoch": 0.05641873278236915, + "grad_norm": 3.8602375984191895, + "learning_rate": 9.945914613733538e-06, + "loss": 0.481, + "step": 512 + }, + { + "epoch": 0.05652892561983471, + "grad_norm": 12.633068084716797, + "learning_rate": 9.945657837864615e-06, + "loss": 0.5185, + "step": 513 + }, + { + "epoch": 0.056639118457300276, + "grad_norm": 9.873920440673828, + "learning_rate": 9.945400457233931e-06, + "loss": 0.5364, + "step": 514 + }, + { + "epoch": 0.05674931129476584, + "grad_norm": 9.537657737731934, + "learning_rate": 9.945142471872963e-06, + "loss": 0.519, + "step": 515 + }, + { + "epoch": 0.056859504132231405, + "grad_norm": 10.282795906066895, + "learning_rate": 9.944883881813257e-06, + "loss": 0.4402, + "step": 516 + }, + { + "epoch": 0.05696969696969697, + "grad_norm": 12.182283401489258, + "learning_rate": 9.94462468708643e-06, + "loss": 0.437, + "step": 517 + }, + { + "epoch": 0.05707988980716253, + "grad_norm": 8.637264251708984, + "learning_rate": 9.944364887724182e-06, + "loss": 0.4978, + "step": 518 + }, + { + "epoch": 0.0571900826446281, + "grad_norm": 11.6417236328125, + "learning_rate": 9.94410448375828e-06, + "loss": 0.5038, + "step": 519 + }, + { + "epoch": 
0.05730027548209366, + "grad_norm": 9.156976699829102, + "learning_rate": 9.943843475220565e-06, + "loss": 0.4275, + "step": 520 + }, + { + "epoch": 0.057410468319559226, + "grad_norm": 11.194124221801758, + "learning_rate": 9.943581862142953e-06, + "loss": 0.504, + "step": 521 + }, + { + "epoch": 0.0575206611570248, + "grad_norm": 13.460176467895508, + "learning_rate": 9.943319644557436e-06, + "loss": 0.5085, + "step": 522 + }, + { + "epoch": 0.05763085399449036, + "grad_norm": 13.589484214782715, + "learning_rate": 9.94305682249608e-06, + "loss": 0.4618, + "step": 523 + }, + { + "epoch": 0.057741046831955925, + "grad_norm": 11.005972862243652, + "learning_rate": 9.94279339599102e-06, + "loss": 0.5412, + "step": 524 + }, + { + "epoch": 0.05785123966942149, + "grad_norm": 12.504033088684082, + "learning_rate": 9.94252936507447e-06, + "loss": 0.4531, + "step": 525 + }, + { + "epoch": 0.05796143250688705, + "grad_norm": 8.159317016601562, + "learning_rate": 9.942264729778713e-06, + "loss": 0.5107, + "step": 526 + }, + { + "epoch": 0.05807162534435262, + "grad_norm": 11.655434608459473, + "learning_rate": 9.941999490136114e-06, + "loss": 0.5816, + "step": 527 + }, + { + "epoch": 0.05818181818181818, + "grad_norm": 12.202296257019043, + "learning_rate": 9.941733646179103e-06, + "loss": 0.537, + "step": 528 + }, + { + "epoch": 0.058292011019283746, + "grad_norm": 15.81233024597168, + "learning_rate": 9.94146719794019e-06, + "loss": 0.6009, + "step": 529 + }, + { + "epoch": 0.05840220385674931, + "grad_norm": 17.76038932800293, + "learning_rate": 9.941200145451955e-06, + "loss": 0.5005, + "step": 530 + }, + { + "epoch": 0.058512396694214874, + "grad_norm": 14.642497062683105, + "learning_rate": 9.940932488747054e-06, + "loss": 0.4706, + "step": 531 + }, + { + "epoch": 0.05862258953168044, + "grad_norm": 24.03742027282715, + "learning_rate": 9.940664227858218e-06, + "loss": 0.6553, + "step": 532 + }, + { + "epoch": 0.058732782369146, + "grad_norm": 13.785540580749512, + "learning_rate": 9.940395362818249e-06, + "loss": 0.5425, + "step": 533 + }, + { + "epoch": 0.05884297520661157, + "grad_norm": 13.390655517578125, + "learning_rate": 9.940125893660022e-06, + "loss": 0.5043, + "step": 534 + }, + { + "epoch": 0.05895316804407714, + "grad_norm": 8.240135192871094, + "learning_rate": 9.939855820416492e-06, + "loss": 0.5053, + "step": 535 + }, + { + "epoch": 0.0590633608815427, + "grad_norm": 6.785595417022705, + "learning_rate": 9.939585143120683e-06, + "loss": 0.5584, + "step": 536 + }, + { + "epoch": 0.059173553719008266, + "grad_norm": 9.69683837890625, + "learning_rate": 9.93931386180569e-06, + "loss": 0.4352, + "step": 537 + }, + { + "epoch": 0.05928374655647383, + "grad_norm": 7.655026912689209, + "learning_rate": 9.939041976504691e-06, + "loss": 0.4534, + "step": 538 + }, + { + "epoch": 0.059393939393939395, + "grad_norm": 5.39308500289917, + "learning_rate": 9.938769487250928e-06, + "loss": 0.4584, + "step": 539 + }, + { + "epoch": 0.05950413223140496, + "grad_norm": 9.690875053405762, + "learning_rate": 9.938496394077725e-06, + "loss": 0.4614, + "step": 540 + }, + { + "epoch": 0.05961432506887052, + "grad_norm": 17.464853286743164, + "learning_rate": 9.938222697018475e-06, + "loss": 0.4885, + "step": 541 + }, + { + "epoch": 0.05972451790633609, + "grad_norm": 11.740934371948242, + "learning_rate": 9.937948396106645e-06, + "loss": 0.5212, + "step": 542 + }, + { + "epoch": 0.05983471074380165, + "grad_norm": 12.96544361114502, + "learning_rate": 9.937673491375777e-06, + "loss": 0.4148, + 
"step": 543 + }, + { + "epoch": 0.059944903581267216, + "grad_norm": 11.744980812072754, + "learning_rate": 9.937397982859489e-06, + "loss": 0.5195, + "step": 544 + }, + { + "epoch": 0.06005509641873278, + "grad_norm": 11.595945358276367, + "learning_rate": 9.937121870591469e-06, + "loss": 0.4163, + "step": 545 + }, + { + "epoch": 0.060165289256198344, + "grad_norm": 10.950522422790527, + "learning_rate": 9.936845154605477e-06, + "loss": 0.4574, + "step": 546 + }, + { + "epoch": 0.060275482093663915, + "grad_norm": 8.607457160949707, + "learning_rate": 9.936567834935355e-06, + "loss": 0.3833, + "step": 547 + }, + { + "epoch": 0.06038567493112948, + "grad_norm": 10.207286834716797, + "learning_rate": 9.936289911615015e-06, + "loss": 0.4857, + "step": 548 + }, + { + "epoch": 0.060495867768595044, + "grad_norm": 11.0805025100708, + "learning_rate": 9.936011384678437e-06, + "loss": 0.4441, + "step": 549 + }, + { + "epoch": 0.06060606060606061, + "grad_norm": 6.993634223937988, + "learning_rate": 9.935732254159683e-06, + "loss": 0.4395, + "step": 550 + }, + { + "epoch": 0.06071625344352617, + "grad_norm": 9.192682266235352, + "learning_rate": 9.935452520092884e-06, + "loss": 0.5288, + "step": 551 + }, + { + "epoch": 0.060826446280991736, + "grad_norm": 5.134696006774902, + "learning_rate": 9.935172182512245e-06, + "loss": 0.5139, + "step": 552 + }, + { + "epoch": 0.0609366391184573, + "grad_norm": 8.106890678405762, + "learning_rate": 9.93489124145205e-06, + "loss": 0.4753, + "step": 553 + }, + { + "epoch": 0.061046831955922864, + "grad_norm": 9.831403732299805, + "learning_rate": 9.934609696946648e-06, + "loss": 0.4797, + "step": 554 + }, + { + "epoch": 0.06115702479338843, + "grad_norm": 16.136892318725586, + "learning_rate": 9.934327549030471e-06, + "loss": 0.5622, + "step": 555 + }, + { + "epoch": 0.06126721763085399, + "grad_norm": 8.392415046691895, + "learning_rate": 9.93404479773802e-06, + "loss": 0.4825, + "step": 556 + }, + { + "epoch": 0.06137741046831956, + "grad_norm": 10.084752082824707, + "learning_rate": 9.933761443103868e-06, + "loss": 0.4658, + "step": 557 + }, + { + "epoch": 0.06148760330578512, + "grad_norm": 11.000602722167969, + "learning_rate": 9.933477485162664e-06, + "loss": 0.5345, + "step": 558 + }, + { + "epoch": 0.061597796143250685, + "grad_norm": 9.608716011047363, + "learning_rate": 9.933192923949132e-06, + "loss": 0.5332, + "step": 559 + }, + { + "epoch": 0.06170798898071626, + "grad_norm": 12.702254295349121, + "learning_rate": 9.932907759498069e-06, + "loss": 0.4959, + "step": 560 + }, + { + "epoch": 0.06181818181818182, + "grad_norm": 13.748756408691406, + "learning_rate": 9.932621991844342e-06, + "loss": 0.5641, + "step": 561 + }, + { + "epoch": 0.061928374655647385, + "grad_norm": 16.632835388183594, + "learning_rate": 9.9323356210229e-06, + "loss": 0.578, + "step": 562 + }, + { + "epoch": 0.06203856749311295, + "grad_norm": 11.966531753540039, + "learning_rate": 9.932048647068759e-06, + "loss": 0.4808, + "step": 563 + }, + { + "epoch": 0.06214876033057851, + "grad_norm": 7.218803405761719, + "learning_rate": 9.931761070017008e-06, + "loss": 0.4511, + "step": 564 + }, + { + "epoch": 0.06225895316804408, + "grad_norm": 7.731607913970947, + "learning_rate": 9.931472889902814e-06, + "loss": 0.5155, + "step": 565 + }, + { + "epoch": 0.06236914600550964, + "grad_norm": 10.343323707580566, + "learning_rate": 9.931184106761419e-06, + "loss": 0.5582, + "step": 566 + }, + { + "epoch": 0.062479338842975206, + "grad_norm": 7.1000895500183105, + "learning_rate": 
9.930894720628129e-06, + "loss": 0.506, + "step": 567 + }, + { + "epoch": 0.06258953168044078, + "grad_norm": 9.204298973083496, + "learning_rate": 9.930604731538337e-06, + "loss": 0.4962, + "step": 568 + }, + { + "epoch": 0.06269972451790634, + "grad_norm": 7.463597297668457, + "learning_rate": 9.930314139527501e-06, + "loss": 0.5024, + "step": 569 + }, + { + "epoch": 0.0628099173553719, + "grad_norm": 11.228713989257812, + "learning_rate": 9.930022944631155e-06, + "loss": 0.4776, + "step": 570 + }, + { + "epoch": 0.06292011019283747, + "grad_norm": 8.012906074523926, + "learning_rate": 9.929731146884904e-06, + "loss": 0.4753, + "step": 571 + }, + { + "epoch": 0.06303030303030303, + "grad_norm": 8.836130142211914, + "learning_rate": 9.929438746324436e-06, + "loss": 0.4902, + "step": 572 + }, + { + "epoch": 0.0631404958677686, + "grad_norm": 12.457344055175781, + "learning_rate": 9.929145742985498e-06, + "loss": 0.5347, + "step": 573 + }, + { + "epoch": 0.06325068870523416, + "grad_norm": 11.039780616760254, + "learning_rate": 9.928852136903926e-06, + "loss": 0.4673, + "step": 574 + }, + { + "epoch": 0.06336088154269973, + "grad_norm": 14.96688175201416, + "learning_rate": 9.928557928115619e-06, + "loss": 0.5673, + "step": 575 + }, + { + "epoch": 0.06347107438016529, + "grad_norm": 35.09209060668945, + "learning_rate": 9.928263116656554e-06, + "loss": 0.5101, + "step": 576 + }, + { + "epoch": 0.06358126721763085, + "grad_norm": 13.416524887084961, + "learning_rate": 9.92796770256278e-06, + "loss": 0.5174, + "step": 577 + }, + { + "epoch": 0.06369146005509642, + "grad_norm": 7.597424030303955, + "learning_rate": 9.92767168587042e-06, + "loss": 0.507, + "step": 578 + }, + { + "epoch": 0.06380165289256198, + "grad_norm": 9.764474868774414, + "learning_rate": 9.927375066615674e-06, + "loss": 0.5125, + "step": 579 + }, + { + "epoch": 0.06391184573002755, + "grad_norm": 11.765339851379395, + "learning_rate": 9.927077844834811e-06, + "loss": 0.4751, + "step": 580 + }, + { + "epoch": 0.06402203856749311, + "grad_norm": 8.978880882263184, + "learning_rate": 9.926780020564178e-06, + "loss": 0.4576, + "step": 581 + }, + { + "epoch": 0.06413223140495868, + "grad_norm": 8.072242736816406, + "learning_rate": 9.92648159384019e-06, + "loss": 0.556, + "step": 582 + }, + { + "epoch": 0.06424242424242424, + "grad_norm": 12.509285926818848, + "learning_rate": 9.926182564699343e-06, + "loss": 0.4711, + "step": 583 + }, + { + "epoch": 0.0643526170798898, + "grad_norm": 12.04463005065918, + "learning_rate": 9.925882933178199e-06, + "loss": 0.4865, + "step": 584 + }, + { + "epoch": 0.06446280991735537, + "grad_norm": 10.25395393371582, + "learning_rate": 9.925582699313397e-06, + "loss": 0.5415, + "step": 585 + }, + { + "epoch": 0.06457300275482093, + "grad_norm": 6.9483442306518555, + "learning_rate": 9.925281863141653e-06, + "loss": 0.4399, + "step": 586 + }, + { + "epoch": 0.0646831955922865, + "grad_norm": 9.354958534240723, + "learning_rate": 9.924980424699754e-06, + "loss": 0.4824, + "step": 587 + }, + { + "epoch": 0.06479338842975206, + "grad_norm": 8.679173469543457, + "learning_rate": 9.924678384024557e-06, + "loss": 0.5696, + "step": 588 + }, + { + "epoch": 0.06490358126721762, + "grad_norm": 7.74788761138916, + "learning_rate": 9.924375741152998e-06, + "loss": 0.3965, + "step": 589 + }, + { + "epoch": 0.06501377410468319, + "grad_norm": 7.50650691986084, + "learning_rate": 9.924072496122085e-06, + "loss": 0.4672, + "step": 590 + }, + { + "epoch": 0.06512396694214877, + "grad_norm": 15.767810821533203, + 
"learning_rate": 9.9237686489689e-06, + "loss": 0.5774, + "step": 591 + }, + { + "epoch": 0.06523415977961433, + "grad_norm": 10.386406898498535, + "learning_rate": 9.923464199730593e-06, + "loss": 0.4665, + "step": 592 + }, + { + "epoch": 0.0653443526170799, + "grad_norm": 12.454659461975098, + "learning_rate": 9.923159148444397e-06, + "loss": 0.4118, + "step": 593 + }, + { + "epoch": 0.06545454545454546, + "grad_norm": 12.94180679321289, + "learning_rate": 9.922853495147613e-06, + "loss": 0.5064, + "step": 594 + }, + { + "epoch": 0.06556473829201102, + "grad_norm": 10.078128814697266, + "learning_rate": 9.922547239877617e-06, + "loss": 0.4827, + "step": 595 + }, + { + "epoch": 0.06567493112947659, + "grad_norm": 7.623451232910156, + "learning_rate": 9.922240382671858e-06, + "loss": 0.4234, + "step": 596 + }, + { + "epoch": 0.06578512396694215, + "grad_norm": 12.806289672851562, + "learning_rate": 9.921932923567858e-06, + "loss": 0.5223, + "step": 597 + }, + { + "epoch": 0.06589531680440772, + "grad_norm": 16.55751609802246, + "learning_rate": 9.921624862603214e-06, + "loss": 0.5422, + "step": 598 + }, + { + "epoch": 0.06600550964187328, + "grad_norm": 10.228744506835938, + "learning_rate": 9.921316199815597e-06, + "loss": 0.4944, + "step": 599 + }, + { + "epoch": 0.06611570247933884, + "grad_norm": 7.3604960441589355, + "learning_rate": 9.92100693524275e-06, + "loss": 0.4693, + "step": 600 + }, + { + "epoch": 0.06622589531680441, + "grad_norm": 10.923355102539062, + "learning_rate": 9.920697068922491e-06, + "loss": 0.5234, + "step": 601 + }, + { + "epoch": 0.06633608815426997, + "grad_norm": 9.79210376739502, + "learning_rate": 9.92038660089271e-06, + "loss": 0.4189, + "step": 602 + }, + { + "epoch": 0.06644628099173554, + "grad_norm": 6.217829704284668, + "learning_rate": 9.920075531191371e-06, + "loss": 0.4696, + "step": 603 + }, + { + "epoch": 0.0665564738292011, + "grad_norm": 9.0442533493042, + "learning_rate": 9.919763859856514e-06, + "loss": 0.5369, + "step": 604 + }, + { + "epoch": 0.06666666666666667, + "grad_norm": 12.135570526123047, + "learning_rate": 9.919451586926249e-06, + "loss": 0.6227, + "step": 605 + }, + { + "epoch": 0.06677685950413223, + "grad_norm": 14.060270309448242, + "learning_rate": 9.91913871243876e-06, + "loss": 0.5006, + "step": 606 + }, + { + "epoch": 0.0668870523415978, + "grad_norm": 11.584596633911133, + "learning_rate": 9.91882523643231e-06, + "loss": 0.5498, + "step": 607 + }, + { + "epoch": 0.06699724517906336, + "grad_norm": 8.220637321472168, + "learning_rate": 9.918511158945226e-06, + "loss": 0.4144, + "step": 608 + }, + { + "epoch": 0.06710743801652892, + "grad_norm": 7.712873458862305, + "learning_rate": 9.918196480015918e-06, + "loss": 0.4731, + "step": 609 + }, + { + "epoch": 0.06721763085399449, + "grad_norm": 10.395918846130371, + "learning_rate": 9.917881199682864e-06, + "loss": 0.5316, + "step": 610 + }, + { + "epoch": 0.06732782369146005, + "grad_norm": 10.288063049316406, + "learning_rate": 9.917565317984614e-06, + "loss": 0.5693, + "step": 611 + }, + { + "epoch": 0.06743801652892562, + "grad_norm": 6.308658599853516, + "learning_rate": 9.917248834959799e-06, + "loss": 0.488, + "step": 612 + }, + { + "epoch": 0.06754820936639118, + "grad_norm": 7.876396179199219, + "learning_rate": 9.916931750647118e-06, + "loss": 0.4428, + "step": 613 + }, + { + "epoch": 0.06765840220385674, + "grad_norm": 9.849668502807617, + "learning_rate": 9.916614065085342e-06, + "loss": 0.4908, + "step": 614 + }, + { + "epoch": 0.06776859504132231, + "grad_norm": 
9.0003662109375, + "learning_rate": 9.91629577831332e-06, + "loss": 0.4933, + "step": 615 + }, + { + "epoch": 0.06787878787878789, + "grad_norm": 11.975523948669434, + "learning_rate": 9.915976890369972e-06, + "loss": 0.5003, + "step": 616 + }, + { + "epoch": 0.06798898071625345, + "grad_norm": 7.651215553283691, + "learning_rate": 9.915657401294291e-06, + "loss": 0.5143, + "step": 617 + }, + { + "epoch": 0.06809917355371901, + "grad_norm": 6.625883102416992, + "learning_rate": 9.915337311125348e-06, + "loss": 0.471, + "step": 618 + }, + { + "epoch": 0.06820936639118458, + "grad_norm": 8.365671157836914, + "learning_rate": 9.91501661990228e-06, + "loss": 0.4172, + "step": 619 + }, + { + "epoch": 0.06831955922865014, + "grad_norm": 10.616379737854004, + "learning_rate": 9.914695327664306e-06, + "loss": 0.5127, + "step": 620 + }, + { + "epoch": 0.0684297520661157, + "grad_norm": 9.061238288879395, + "learning_rate": 9.914373434450707e-06, + "loss": 0.5785, + "step": 621 + }, + { + "epoch": 0.06853994490358127, + "grad_norm": 7.105732440948486, + "learning_rate": 9.914050940300852e-06, + "loss": 0.3874, + "step": 622 + }, + { + "epoch": 0.06865013774104683, + "grad_norm": 13.91950798034668, + "learning_rate": 9.913727845254173e-06, + "loss": 0.5533, + "step": 623 + }, + { + "epoch": 0.0687603305785124, + "grad_norm": 8.040175437927246, + "learning_rate": 9.913404149350177e-06, + "loss": 0.4253, + "step": 624 + }, + { + "epoch": 0.06887052341597796, + "grad_norm": 4.891575336456299, + "learning_rate": 9.91307985262845e-06, + "loss": 0.4509, + "step": 625 + }, + { + "epoch": 0.06898071625344353, + "grad_norm": 5.809885025024414, + "learning_rate": 9.912754955128641e-06, + "loss": 0.4876, + "step": 626 + }, + { + "epoch": 0.06909090909090909, + "grad_norm": 6.048772811889648, + "learning_rate": 9.912429456890484e-06, + "loss": 0.4972, + "step": 627 + }, + { + "epoch": 0.06920110192837466, + "grad_norm": 10.386425018310547, + "learning_rate": 9.912103357953782e-06, + "loss": 0.5544, + "step": 628 + }, + { + "epoch": 0.06931129476584022, + "grad_norm": 9.411628723144531, + "learning_rate": 9.911776658358408e-06, + "loss": 0.5733, + "step": 629 + }, + { + "epoch": 0.06942148760330578, + "grad_norm": 10.796219825744629, + "learning_rate": 9.911449358144311e-06, + "loss": 0.4771, + "step": 630 + }, + { + "epoch": 0.06953168044077135, + "grad_norm": 8.182342529296875, + "learning_rate": 9.911121457351516e-06, + "loss": 0.5043, + "step": 631 + }, + { + "epoch": 0.06964187327823691, + "grad_norm": 17.62633514404297, + "learning_rate": 9.910792956020119e-06, + "loss": 0.6273, + "step": 632 + }, + { + "epoch": 0.06975206611570248, + "grad_norm": 7.0770344734191895, + "learning_rate": 9.910463854190287e-06, + "loss": 0.4609, + "step": 633 + }, + { + "epoch": 0.06986225895316804, + "grad_norm": 15.625011444091797, + "learning_rate": 9.910134151902267e-06, + "loss": 0.523, + "step": 634 + }, + { + "epoch": 0.0699724517906336, + "grad_norm": 7.356435298919678, + "learning_rate": 9.90980384919637e-06, + "loss": 0.442, + "step": 635 + }, + { + "epoch": 0.07008264462809917, + "grad_norm": 10.88286018371582, + "learning_rate": 9.90947294611299e-06, + "loss": 0.4141, + "step": 636 + }, + { + "epoch": 0.07019283746556473, + "grad_norm": 9.952497482299805, + "learning_rate": 9.909141442692592e-06, + "loss": 0.4882, + "step": 637 + }, + { + "epoch": 0.0703030303030303, + "grad_norm": 9.48213005065918, + "learning_rate": 9.908809338975706e-06, + "loss": 0.4936, + "step": 638 + }, + { + "epoch": 0.07041322314049586, + 
"grad_norm": 18.07328987121582, + "learning_rate": 9.908476635002948e-06, + "loss": 0.5932, + "step": 639 + }, + { + "epoch": 0.07052341597796143, + "grad_norm": 8.986546516418457, + "learning_rate": 9.908143330815e-06, + "loss": 0.4723, + "step": 640 + }, + { + "epoch": 0.07063360881542699, + "grad_norm": 9.191007614135742, + "learning_rate": 9.907809426452617e-06, + "loss": 0.424, + "step": 641 + }, + { + "epoch": 0.07074380165289257, + "grad_norm": 7.707602500915527, + "learning_rate": 9.907474921956632e-06, + "loss": 0.5195, + "step": 642 + }, + { + "epoch": 0.07085399449035813, + "grad_norm": 8.087730407714844, + "learning_rate": 9.907139817367948e-06, + "loss": 0.5119, + "step": 643 + }, + { + "epoch": 0.0709641873278237, + "grad_norm": 10.142461776733398, + "learning_rate": 9.90680411272754e-06, + "loss": 0.5339, + "step": 644 + }, + { + "epoch": 0.07107438016528926, + "grad_norm": 8.02434253692627, + "learning_rate": 9.906467808076461e-06, + "loss": 0.4649, + "step": 645 + }, + { + "epoch": 0.07118457300275483, + "grad_norm": 10.722000122070312, + "learning_rate": 9.906130903455833e-06, + "loss": 0.4916, + "step": 646 + }, + { + "epoch": 0.07129476584022039, + "grad_norm": 9.784269332885742, + "learning_rate": 9.905793398906853e-06, + "loss": 0.4741, + "step": 647 + }, + { + "epoch": 0.07140495867768595, + "grad_norm": 7.949434757232666, + "learning_rate": 9.905455294470793e-06, + "loss": 0.4754, + "step": 648 + }, + { + "epoch": 0.07151515151515152, + "grad_norm": 14.611332893371582, + "learning_rate": 9.905116590188996e-06, + "loss": 0.565, + "step": 649 + }, + { + "epoch": 0.07162534435261708, + "grad_norm": 13.625049591064453, + "learning_rate": 9.90477728610288e-06, + "loss": 0.5274, + "step": 650 + }, + { + "epoch": 0.07173553719008265, + "grad_norm": 9.991351127624512, + "learning_rate": 9.904437382253935e-06, + "loss": 0.5799, + "step": 651 + }, + { + "epoch": 0.07184573002754821, + "grad_norm": 9.200335502624512, + "learning_rate": 9.904096878683724e-06, + "loss": 0.5184, + "step": 652 + }, + { + "epoch": 0.07195592286501377, + "grad_norm": 8.283576011657715, + "learning_rate": 9.903755775433886e-06, + "loss": 0.5627, + "step": 653 + }, + { + "epoch": 0.07206611570247934, + "grad_norm": 8.832159996032715, + "learning_rate": 9.90341407254613e-06, + "loss": 0.4769, + "step": 654 + }, + { + "epoch": 0.0721763085399449, + "grad_norm": 6.476308345794678, + "learning_rate": 9.90307177006224e-06, + "loss": 0.5206, + "step": 655 + }, + { + "epoch": 0.07228650137741047, + "grad_norm": 8.467257499694824, + "learning_rate": 9.902728868024075e-06, + "loss": 0.391, + "step": 656 + }, + { + "epoch": 0.07239669421487603, + "grad_norm": 6.937216758728027, + "learning_rate": 9.902385366473564e-06, + "loss": 0.479, + "step": 657 + }, + { + "epoch": 0.0725068870523416, + "grad_norm": 7.917789936065674, + "learning_rate": 9.90204126545271e-06, + "loss": 0.5145, + "step": 658 + }, + { + "epoch": 0.07261707988980716, + "grad_norm": 5.8603434562683105, + "learning_rate": 9.901696565003593e-06, + "loss": 0.5029, + "step": 659 + }, + { + "epoch": 0.07272727272727272, + "grad_norm": 7.931520462036133, + "learning_rate": 9.901351265168363e-06, + "loss": 0.4938, + "step": 660 + }, + { + "epoch": 0.07283746556473829, + "grad_norm": 10.546648979187012, + "learning_rate": 9.901005365989241e-06, + "loss": 0.4907, + "step": 661 + }, + { + "epoch": 0.07294765840220385, + "grad_norm": 8.893937110900879, + "learning_rate": 9.900658867508524e-06, + "loss": 0.4804, + "step": 662 + }, + { + "epoch": 
0.07305785123966942, + "grad_norm": 7.418328762054443, + "learning_rate": 9.900311769768585e-06, + "loss": 0.4477, + "step": 663 + }, + { + "epoch": 0.07316804407713498, + "grad_norm": 9.132783889770508, + "learning_rate": 9.899964072811865e-06, + "loss": 0.4554, + "step": 664 + }, + { + "epoch": 0.07327823691460054, + "grad_norm": 15.406439781188965, + "learning_rate": 9.899615776680885e-06, + "loss": 0.4194, + "step": 665 + }, + { + "epoch": 0.07338842975206611, + "grad_norm": 10.79430866241455, + "learning_rate": 9.89926688141823e-06, + "loss": 0.4427, + "step": 666 + }, + { + "epoch": 0.07349862258953169, + "grad_norm": 7.434822082519531, + "learning_rate": 9.898917387066566e-06, + "loss": 0.4463, + "step": 667 + }, + { + "epoch": 0.07360881542699725, + "grad_norm": 6.936038970947266, + "learning_rate": 9.89856729366863e-06, + "loss": 0.3917, + "step": 668 + }, + { + "epoch": 0.07371900826446282, + "grad_norm": 7.734316349029541, + "learning_rate": 9.898216601267232e-06, + "loss": 0.468, + "step": 669 + }, + { + "epoch": 0.07382920110192838, + "grad_norm": 12.051238059997559, + "learning_rate": 9.897865309905254e-06, + "loss": 0.4934, + "step": 670 + }, + { + "epoch": 0.07393939393939394, + "grad_norm": 6.236438751220703, + "learning_rate": 9.897513419625653e-06, + "loss": 0.4516, + "step": 671 + }, + { + "epoch": 0.07404958677685951, + "grad_norm": 8.817441940307617, + "learning_rate": 9.897160930471457e-06, + "loss": 0.4753, + "step": 672 + }, + { + "epoch": 0.07415977961432507, + "grad_norm": 15.634927749633789, + "learning_rate": 9.896807842485772e-06, + "loss": 0.6153, + "step": 673 + }, + { + "epoch": 0.07426997245179064, + "grad_norm": 12.055826187133789, + "learning_rate": 9.896454155711771e-06, + "loss": 0.6136, + "step": 674 + }, + { + "epoch": 0.0743801652892562, + "grad_norm": 5.280544757843018, + "learning_rate": 9.896099870192706e-06, + "loss": 0.49, + "step": 675 + }, + { + "epoch": 0.07449035812672176, + "grad_norm": 8.10863208770752, + "learning_rate": 9.895744985971895e-06, + "loss": 0.5093, + "step": 676 + }, + { + "epoch": 0.07460055096418733, + "grad_norm": 5.737893104553223, + "learning_rate": 9.89538950309274e-06, + "loss": 0.5611, + "step": 677 + }, + { + "epoch": 0.07471074380165289, + "grad_norm": 9.80198860168457, + "learning_rate": 9.895033421598708e-06, + "loss": 0.5501, + "step": 678 + }, + { + "epoch": 0.07482093663911846, + "grad_norm": 12.883217811584473, + "learning_rate": 9.894676741533337e-06, + "loss": 0.5309, + "step": 679 + }, + { + "epoch": 0.07493112947658402, + "grad_norm": 9.768318176269531, + "learning_rate": 9.894319462940246e-06, + "loss": 0.5148, + "step": 680 + }, + { + "epoch": 0.07504132231404959, + "grad_norm": 6.506410598754883, + "learning_rate": 9.893961585863124e-06, + "loss": 0.4572, + "step": 681 + }, + { + "epoch": 0.07515151515151515, + "grad_norm": 9.509140968322754, + "learning_rate": 9.89360311034573e-06, + "loss": 0.5298, + "step": 682 + }, + { + "epoch": 0.07526170798898071, + "grad_norm": 4.768259048461914, + "learning_rate": 9.893244036431901e-06, + "loss": 0.4822, + "step": 683 + }, + { + "epoch": 0.07537190082644628, + "grad_norm": 6.9128289222717285, + "learning_rate": 9.892884364165545e-06, + "loss": 0.4858, + "step": 684 + }, + { + "epoch": 0.07548209366391184, + "grad_norm": 16.79252052307129, + "learning_rate": 9.89252409359064e-06, + "loss": 0.5622, + "step": 685 + }, + { + "epoch": 0.0755922865013774, + "grad_norm": 10.07437801361084, + "learning_rate": 9.892163224751245e-06, + "loss": 0.4842, + "step": 686 + }, 
+ { + "epoch": 0.07570247933884297, + "grad_norm": 5.11728572845459, + "learning_rate": 9.891801757691487e-06, + "loss": 0.492, + "step": 687 + }, + { + "epoch": 0.07581267217630853, + "grad_norm": 7.831786155700684, + "learning_rate": 9.891439692455563e-06, + "loss": 0.5006, + "step": 688 + }, + { + "epoch": 0.0759228650137741, + "grad_norm": 12.940075874328613, + "learning_rate": 9.89107702908775e-06, + "loss": 0.4705, + "step": 689 + }, + { + "epoch": 0.07603305785123966, + "grad_norm": 15.072134017944336, + "learning_rate": 9.890713767632394e-06, + "loss": 0.6851, + "step": 690 + }, + { + "epoch": 0.07614325068870523, + "grad_norm": 8.534189224243164, + "learning_rate": 9.890349908133914e-06, + "loss": 0.4492, + "step": 691 + }, + { + "epoch": 0.0762534435261708, + "grad_norm": 9.891030311584473, + "learning_rate": 9.889985450636806e-06, + "loss": 0.4826, + "step": 692 + }, + { + "epoch": 0.07636363636363637, + "grad_norm": 18.696834564208984, + "learning_rate": 9.889620395185635e-06, + "loss": 0.4704, + "step": 693 + }, + { + "epoch": 0.07647382920110193, + "grad_norm": 11.502964973449707, + "learning_rate": 9.889254741825038e-06, + "loss": 0.4521, + "step": 694 + }, + { + "epoch": 0.0765840220385675, + "grad_norm": 6.212857246398926, + "learning_rate": 9.888888490599731e-06, + "loss": 0.5045, + "step": 695 + }, + { + "epoch": 0.07669421487603306, + "grad_norm": 13.820978164672852, + "learning_rate": 9.888521641554499e-06, + "loss": 0.5168, + "step": 696 + }, + { + "epoch": 0.07680440771349863, + "grad_norm": 10.593164443969727, + "learning_rate": 9.888154194734198e-06, + "loss": 0.4951, + "step": 697 + }, + { + "epoch": 0.07691460055096419, + "grad_norm": 10.364981651306152, + "learning_rate": 9.887786150183765e-06, + "loss": 0.5434, + "step": 698 + }, + { + "epoch": 0.07702479338842975, + "grad_norm": 15.26873779296875, + "learning_rate": 9.8874175079482e-06, + "loss": 0.5559, + "step": 699 + }, + { + "epoch": 0.07713498622589532, + "grad_norm": 7.997010707855225, + "learning_rate": 9.887048268072585e-06, + "loss": 0.459, + "step": 700 + }, + { + "epoch": 0.07724517906336088, + "grad_norm": 9.9561767578125, + "learning_rate": 9.886678430602068e-06, + "loss": 0.442, + "step": 701 + }, + { + "epoch": 0.07735537190082645, + "grad_norm": 12.93139934539795, + "learning_rate": 9.886307995581877e-06, + "loss": 0.4559, + "step": 702 + }, + { + "epoch": 0.07746556473829201, + "grad_norm": 8.539533615112305, + "learning_rate": 9.885936963057303e-06, + "loss": 0.487, + "step": 703 + }, + { + "epoch": 0.07757575757575758, + "grad_norm": 5.245579242706299, + "learning_rate": 9.885565333073723e-06, + "loss": 0.4743, + "step": 704 + }, + { + "epoch": 0.07768595041322314, + "grad_norm": 13.183752059936523, + "learning_rate": 9.885193105676577e-06, + "loss": 0.3537, + "step": 705 + }, + { + "epoch": 0.0777961432506887, + "grad_norm": 8.59929370880127, + "learning_rate": 9.884820280911383e-06, + "loss": 0.4825, + "step": 706 + }, + { + "epoch": 0.07790633608815427, + "grad_norm": 11.603909492492676, + "learning_rate": 9.884446858823728e-06, + "loss": 0.4862, + "step": 707 + }, + { + "epoch": 0.07801652892561983, + "grad_norm": 11.469616889953613, + "learning_rate": 9.884072839459278e-06, + "loss": 0.4586, + "step": 708 + }, + { + "epoch": 0.0781267217630854, + "grad_norm": 9.469440460205078, + "learning_rate": 9.883698222863765e-06, + "loss": 0.5705, + "step": 709 + }, + { + "epoch": 0.07823691460055096, + "grad_norm": 7.1158647537231445, + "learning_rate": 9.883323009083e-06, + "loss": 0.3444, + 
"step": 710 + }, + { + "epoch": 0.07834710743801652, + "grad_norm": 8.297913551330566, + "learning_rate": 9.882947198162865e-06, + "loss": 0.3462, + "step": 711 + }, + { + "epoch": 0.07845730027548209, + "grad_norm": 13.624194145202637, + "learning_rate": 9.882570790149313e-06, + "loss": 0.5259, + "step": 712 + }, + { + "epoch": 0.07856749311294765, + "grad_norm": 10.769474983215332, + "learning_rate": 9.882193785088372e-06, + "loss": 0.5351, + "step": 713 + }, + { + "epoch": 0.07867768595041322, + "grad_norm": 11.514634132385254, + "learning_rate": 9.881816183026145e-06, + "loss": 0.4189, + "step": 714 + }, + { + "epoch": 0.07878787878787878, + "grad_norm": 9.002857208251953, + "learning_rate": 9.881437984008801e-06, + "loss": 0.5667, + "step": 715 + }, + { + "epoch": 0.07889807162534435, + "grad_norm": 10.22419548034668, + "learning_rate": 9.881059188082592e-06, + "loss": 0.4815, + "step": 716 + }, + { + "epoch": 0.07900826446280992, + "grad_norm": 10.344335556030273, + "learning_rate": 9.880679795293835e-06, + "loss": 0.5151, + "step": 717 + }, + { + "epoch": 0.07911845730027549, + "grad_norm": 8.122965812683105, + "learning_rate": 9.880299805688922e-06, + "loss": 0.514, + "step": 718 + }, + { + "epoch": 0.07922865013774105, + "grad_norm": 8.082174301147461, + "learning_rate": 9.87991921931432e-06, + "loss": 0.4556, + "step": 719 + }, + { + "epoch": 0.07933884297520662, + "grad_norm": 8.465988159179688, + "learning_rate": 9.879538036216567e-06, + "loss": 0.4653, + "step": 720 + }, + { + "epoch": 0.07944903581267218, + "grad_norm": 12.201423645019531, + "learning_rate": 9.879156256442276e-06, + "loss": 0.5005, + "step": 721 + }, + { + "epoch": 0.07955922865013774, + "grad_norm": 7.324477672576904, + "learning_rate": 9.878773880038127e-06, + "loss": 0.4988, + "step": 722 + }, + { + "epoch": 0.07966942148760331, + "grad_norm": 10.313101768493652, + "learning_rate": 9.878390907050882e-06, + "loss": 0.5054, + "step": 723 + }, + { + "epoch": 0.07977961432506887, + "grad_norm": 7.98211669921875, + "learning_rate": 9.878007337527373e-06, + "loss": 0.5633, + "step": 724 + }, + { + "epoch": 0.07988980716253444, + "grad_norm": 7.980637073516846, + "learning_rate": 9.877623171514498e-06, + "loss": 0.4389, + "step": 725 + }, + { + "epoch": 0.08, + "grad_norm": 6.355591773986816, + "learning_rate": 9.877238409059237e-06, + "loss": 0.4547, + "step": 726 + }, + { + "epoch": 0.08011019283746557, + "grad_norm": 9.528438568115234, + "learning_rate": 9.876853050208637e-06, + "loss": 0.5235, + "step": 727 + }, + { + "epoch": 0.08022038567493113, + "grad_norm": 9.217560768127441, + "learning_rate": 9.876467095009823e-06, + "loss": 0.4124, + "step": 728 + }, + { + "epoch": 0.0803305785123967, + "grad_norm": 6.823374271392822, + "learning_rate": 9.876080543509987e-06, + "loss": 0.503, + "step": 729 + }, + { + "epoch": 0.08044077134986226, + "grad_norm": 14.821892738342285, + "learning_rate": 9.8756933957564e-06, + "loss": 0.462, + "step": 730 + }, + { + "epoch": 0.08055096418732782, + "grad_norm": 12.123024940490723, + "learning_rate": 9.8753056517964e-06, + "loss": 0.5251, + "step": 731 + }, + { + "epoch": 0.08066115702479339, + "grad_norm": 8.066936492919922, + "learning_rate": 9.874917311677405e-06, + "loss": 0.507, + "step": 732 + }, + { + "epoch": 0.08077134986225895, + "grad_norm": 9.832659721374512, + "learning_rate": 9.874528375446898e-06, + "loss": 0.4783, + "step": 733 + }, + { + "epoch": 0.08088154269972452, + "grad_norm": 7.567358016967773, + "learning_rate": 9.874138843152438e-06, + "loss": 0.466, 
+ "step": 734 + }, + { + "epoch": 0.08099173553719008, + "grad_norm": 11.371232032775879, + "learning_rate": 9.873748714841661e-06, + "loss": 0.4871, + "step": 735 + }, + { + "epoch": 0.08110192837465564, + "grad_norm": 10.023006439208984, + "learning_rate": 9.873357990562272e-06, + "loss": 0.4108, + "step": 736 + }, + { + "epoch": 0.08121212121212121, + "grad_norm": 16.237262725830078, + "learning_rate": 9.872966670362048e-06, + "loss": 0.4999, + "step": 737 + }, + { + "epoch": 0.08132231404958677, + "grad_norm": 10.57867431640625, + "learning_rate": 9.872574754288838e-06, + "loss": 0.4475, + "step": 738 + }, + { + "epoch": 0.08143250688705234, + "grad_norm": 8.24145793914795, + "learning_rate": 9.87218224239057e-06, + "loss": 0.4632, + "step": 739 + }, + { + "epoch": 0.0815426997245179, + "grad_norm": 9.584731101989746, + "learning_rate": 9.87178913471524e-06, + "loss": 0.5202, + "step": 740 + }, + { + "epoch": 0.08165289256198346, + "grad_norm": 9.388596534729004, + "learning_rate": 9.871395431310915e-06, + "loss": 0.4376, + "step": 741 + }, + { + "epoch": 0.08176308539944904, + "grad_norm": 10.384413719177246, + "learning_rate": 9.87100113222574e-06, + "loss": 0.512, + "step": 742 + }, + { + "epoch": 0.0818732782369146, + "grad_norm": 13.565887451171875, + "learning_rate": 9.87060623750793e-06, + "loss": 0.5288, + "step": 743 + }, + { + "epoch": 0.08198347107438017, + "grad_norm": 8.631324768066406, + "learning_rate": 9.870210747205772e-06, + "loss": 0.452, + "step": 744 + }, + { + "epoch": 0.08209366391184574, + "grad_norm": 12.596695899963379, + "learning_rate": 9.869814661367631e-06, + "loss": 0.4553, + "step": 745 + }, + { + "epoch": 0.0822038567493113, + "grad_norm": 10.302438735961914, + "learning_rate": 9.869417980041937e-06, + "loss": 0.5333, + "step": 746 + }, + { + "epoch": 0.08231404958677686, + "grad_norm": 12.104216575622559, + "learning_rate": 9.869020703277197e-06, + "loss": 0.4825, + "step": 747 + }, + { + "epoch": 0.08242424242424243, + "grad_norm": 8.994564056396484, + "learning_rate": 9.868622831121992e-06, + "loss": 0.5453, + "step": 748 + }, + { + "epoch": 0.08253443526170799, + "grad_norm": 8.422258377075195, + "learning_rate": 9.868224363624975e-06, + "loss": 0.506, + "step": 749 + }, + { + "epoch": 0.08264462809917356, + "grad_norm": 7.729913711547852, + "learning_rate": 9.867825300834868e-06, + "loss": 0.4059, + "step": 750 + }, + { + "epoch": 0.08275482093663912, + "grad_norm": 6.395445823669434, + "learning_rate": 9.867425642800473e-06, + "loss": 0.4778, + "step": 751 + }, + { + "epoch": 0.08286501377410468, + "grad_norm": 7.772154808044434, + "learning_rate": 9.867025389570658e-06, + "loss": 0.4095, + "step": 752 + }, + { + "epoch": 0.08297520661157025, + "grad_norm": 7.181577205657959, + "learning_rate": 9.866624541194367e-06, + "loss": 0.5259, + "step": 753 + }, + { + "epoch": 0.08308539944903581, + "grad_norm": 7.02896785736084, + "learning_rate": 9.866223097720616e-06, + "loss": 0.4161, + "step": 754 + }, + { + "epoch": 0.08319559228650138, + "grad_norm": 5.862891674041748, + "learning_rate": 9.865821059198494e-06, + "loss": 0.4926, + "step": 755 + }, + { + "epoch": 0.08330578512396694, + "grad_norm": 9.70177936553955, + "learning_rate": 9.865418425677165e-06, + "loss": 0.4415, + "step": 756 + }, + { + "epoch": 0.0834159779614325, + "grad_norm": 9.973360061645508, + "learning_rate": 9.86501519720586e-06, + "loss": 0.4388, + "step": 757 + }, + { + "epoch": 0.08352617079889807, + "grad_norm": 12.362749099731445, + "learning_rate": 9.86461137383389e-06, + 
"loss": 0.5289, + "step": 758 + }, + { + "epoch": 0.08363636363636363, + "grad_norm": 12.198518753051758, + "learning_rate": 9.864206955610632e-06, + "loss": 0.4405, + "step": 759 + }, + { + "epoch": 0.0837465564738292, + "grad_norm": 8.491933822631836, + "learning_rate": 9.86380194258554e-06, + "loss": 0.4767, + "step": 760 + }, + { + "epoch": 0.08385674931129476, + "grad_norm": 13.432280540466309, + "learning_rate": 9.863396334808141e-06, + "loss": 0.5168, + "step": 761 + }, + { + "epoch": 0.08396694214876033, + "grad_norm": 7.089749813079834, + "learning_rate": 9.862990132328032e-06, + "loss": 0.4916, + "step": 762 + }, + { + "epoch": 0.08407713498622589, + "grad_norm": 15.24958610534668, + "learning_rate": 9.862583335194882e-06, + "loss": 0.5959, + "step": 763 + }, + { + "epoch": 0.08418732782369145, + "grad_norm": 9.171639442443848, + "learning_rate": 9.862175943458438e-06, + "loss": 0.4483, + "step": 764 + }, + { + "epoch": 0.08429752066115702, + "grad_norm": 13.582493782043457, + "learning_rate": 9.861767957168514e-06, + "loss": 0.5253, + "step": 765 + }, + { + "epoch": 0.08440771349862258, + "grad_norm": 9.245279312133789, + "learning_rate": 9.861359376375002e-06, + "loss": 0.5328, + "step": 766 + }, + { + "epoch": 0.08451790633608816, + "grad_norm": 12.219204902648926, + "learning_rate": 9.86095020112786e-06, + "loss": 0.5749, + "step": 767 + }, + { + "epoch": 0.08462809917355373, + "grad_norm": 11.779691696166992, + "learning_rate": 9.860540431477126e-06, + "loss": 0.5183, + "step": 768 + }, + { + "epoch": 0.08473829201101929, + "grad_norm": 5.6311187744140625, + "learning_rate": 9.860130067472904e-06, + "loss": 0.4032, + "step": 769 + }, + { + "epoch": 0.08484848484848485, + "grad_norm": 6.74190616607666, + "learning_rate": 9.859719109165376e-06, + "loss": 0.5194, + "step": 770 + }, + { + "epoch": 0.08495867768595042, + "grad_norm": 6.302004814147949, + "learning_rate": 9.859307556604794e-06, + "loss": 0.5004, + "step": 771 + }, + { + "epoch": 0.08506887052341598, + "grad_norm": 5.843480110168457, + "learning_rate": 9.858895409841485e-06, + "loss": 0.4876, + "step": 772 + }, + { + "epoch": 0.08517906336088155, + "grad_norm": 7.050701141357422, + "learning_rate": 9.858482668925843e-06, + "loss": 0.4691, + "step": 773 + }, + { + "epoch": 0.08528925619834711, + "grad_norm": 11.953496932983398, + "learning_rate": 9.858069333908341e-06, + "loss": 0.4997, + "step": 774 + }, + { + "epoch": 0.08539944903581267, + "grad_norm": 8.992799758911133, + "learning_rate": 9.857655404839522e-06, + "loss": 0.4915, + "step": 775 + }, + { + "epoch": 0.08550964187327824, + "grad_norm": 7.738897323608398, + "learning_rate": 9.857240881770003e-06, + "loss": 0.4843, + "step": 776 + }, + { + "epoch": 0.0856198347107438, + "grad_norm": 10.122507095336914, + "learning_rate": 9.856825764750468e-06, + "loss": 0.5273, + "step": 777 + }, + { + "epoch": 0.08573002754820937, + "grad_norm": 9.262826919555664, + "learning_rate": 9.856410053831685e-06, + "loss": 0.5056, + "step": 778 + }, + { + "epoch": 0.08584022038567493, + "grad_norm": 6.276966571807861, + "learning_rate": 9.85599374906448e-06, + "loss": 0.4164, + "step": 779 + }, + { + "epoch": 0.0859504132231405, + "grad_norm": 5.466647148132324, + "learning_rate": 9.855576850499767e-06, + "loss": 0.376, + "step": 780 + }, + { + "epoch": 0.08606060606060606, + "grad_norm": 7.285747051239014, + "learning_rate": 9.855159358188517e-06, + "loss": 0.4791, + "step": 781 + }, + { + "epoch": 0.08617079889807162, + "grad_norm": 6.792855739593506, + "learning_rate": 
9.854741272181789e-06, + "loss": 0.5095, + "step": 782 + }, + { + "epoch": 0.08628099173553719, + "grad_norm": 6.635336875915527, + "learning_rate": 9.854322592530702e-06, + "loss": 0.5065, + "step": 783 + }, + { + "epoch": 0.08639118457300275, + "grad_norm": 7.093437194824219, + "learning_rate": 9.853903319286456e-06, + "loss": 0.3857, + "step": 784 + }, + { + "epoch": 0.08650137741046832, + "grad_norm": 7.978708267211914, + "learning_rate": 9.853483452500316e-06, + "loss": 0.4088, + "step": 785 + }, + { + "epoch": 0.08661157024793388, + "grad_norm": 9.05033016204834, + "learning_rate": 9.853062992223629e-06, + "loss": 0.4697, + "step": 786 + }, + { + "epoch": 0.08672176308539944, + "grad_norm": 9.65661334991455, + "learning_rate": 9.852641938507806e-06, + "loss": 0.4212, + "step": 787 + }, + { + "epoch": 0.08683195592286501, + "grad_norm": 8.787781715393066, + "learning_rate": 9.852220291404335e-06, + "loss": 0.4903, + "step": 788 + }, + { + "epoch": 0.08694214876033057, + "grad_norm": 10.779501914978027, + "learning_rate": 9.851798050964775e-06, + "loss": 0.5753, + "step": 789 + }, + { + "epoch": 0.08705234159779614, + "grad_norm": 8.177962303161621, + "learning_rate": 9.851375217240761e-06, + "loss": 0.4619, + "step": 790 + }, + { + "epoch": 0.0871625344352617, + "grad_norm": 6.387087345123291, + "learning_rate": 9.850951790283993e-06, + "loss": 0.4211, + "step": 791 + }, + { + "epoch": 0.08727272727272728, + "grad_norm": 8.48974323272705, + "learning_rate": 9.850527770146253e-06, + "loss": 0.511, + "step": 792 + }, + { + "epoch": 0.08738292011019284, + "grad_norm": 7.000217437744141, + "learning_rate": 9.850103156879386e-06, + "loss": 0.4904, + "step": 793 + }, + { + "epoch": 0.08749311294765841, + "grad_norm": 9.408488273620605, + "learning_rate": 9.849677950535319e-06, + "loss": 0.5271, + "step": 794 + }, + { + "epoch": 0.08760330578512397, + "grad_norm": 5.422463417053223, + "learning_rate": 9.849252151166044e-06, + "loss": 0.3922, + "step": 795 + }, + { + "epoch": 0.08771349862258954, + "grad_norm": 5.552674770355225, + "learning_rate": 9.848825758823629e-06, + "loss": 0.4377, + "step": 796 + }, + { + "epoch": 0.0878236914600551, + "grad_norm": 6.646964073181152, + "learning_rate": 9.848398773560213e-06, + "loss": 0.4523, + "step": 797 + }, + { + "epoch": 0.08793388429752066, + "grad_norm": 9.012452125549316, + "learning_rate": 9.84797119542801e-06, + "loss": 0.4846, + "step": 798 + }, + { + "epoch": 0.08804407713498623, + "grad_norm": 11.543864250183105, + "learning_rate": 9.847543024479304e-06, + "loss": 0.5324, + "step": 799 + }, + { + "epoch": 0.0881542699724518, + "grad_norm": 9.367711067199707, + "learning_rate": 9.847114260766451e-06, + "loss": 0.3494, + "step": 800 + }, + { + "epoch": 0.08826446280991736, + "grad_norm": 9.736997604370117, + "learning_rate": 9.846684904341883e-06, + "loss": 0.5678, + "step": 801 + }, + { + "epoch": 0.08837465564738292, + "grad_norm": 12.38846206665039, + "learning_rate": 9.846254955258101e-06, + "loss": 0.4822, + "step": 802 + }, + { + "epoch": 0.08848484848484849, + "grad_norm": 13.669387817382812, + "learning_rate": 9.845824413567679e-06, + "loss": 0.4647, + "step": 803 + }, + { + "epoch": 0.08859504132231405, + "grad_norm": 13.912272453308105, + "learning_rate": 9.845393279323268e-06, + "loss": 0.4578, + "step": 804 + }, + { + "epoch": 0.08870523415977961, + "grad_norm": 6.685051918029785, + "learning_rate": 9.844961552577583e-06, + "loss": 0.503, + "step": 805 + }, + { + "epoch": 0.08881542699724518, + "grad_norm": 9.136698722839355, + 
"learning_rate": 9.844529233383418e-06, + "loss": 0.5278, + "step": 806 + }, + { + "epoch": 0.08892561983471074, + "grad_norm": 7.765626430511475, + "learning_rate": 9.844096321793638e-06, + "loss": 0.4546, + "step": 807 + }, + { + "epoch": 0.0890358126721763, + "grad_norm": 7.0093092918396, + "learning_rate": 9.84366281786118e-06, + "loss": 0.4084, + "step": 808 + }, + { + "epoch": 0.08914600550964187, + "grad_norm": 11.89829158782959, + "learning_rate": 9.843228721639053e-06, + "loss": 0.4895, + "step": 809 + }, + { + "epoch": 0.08925619834710743, + "grad_norm": 10.255476951599121, + "learning_rate": 9.842794033180339e-06, + "loss": 0.4462, + "step": 810 + }, + { + "epoch": 0.089366391184573, + "grad_norm": 9.761069297790527, + "learning_rate": 9.842358752538193e-06, + "loss": 0.4419, + "step": 811 + }, + { + "epoch": 0.08947658402203856, + "grad_norm": 9.967449188232422, + "learning_rate": 9.84192287976584e-06, + "loss": 0.5406, + "step": 812 + }, + { + "epoch": 0.08958677685950413, + "grad_norm": 8.725493431091309, + "learning_rate": 9.841486414916581e-06, + "loss": 0.4606, + "step": 813 + }, + { + "epoch": 0.08969696969696969, + "grad_norm": 8.53592300415039, + "learning_rate": 9.841049358043787e-06, + "loss": 0.451, + "step": 814 + }, + { + "epoch": 0.08980716253443526, + "grad_norm": 11.516461372375488, + "learning_rate": 9.8406117092009e-06, + "loss": 0.5207, + "step": 815 + }, + { + "epoch": 0.08991735537190082, + "grad_norm": 6.130514621734619, + "learning_rate": 9.840173468441438e-06, + "loss": 0.401, + "step": 816 + }, + { + "epoch": 0.09002754820936638, + "grad_norm": 28.716140747070312, + "learning_rate": 9.83973463581899e-06, + "loss": 0.628, + "step": 817 + }, + { + "epoch": 0.09013774104683196, + "grad_norm": 4.71995210647583, + "learning_rate": 9.839295211387218e-06, + "loss": 0.4665, + "step": 818 + }, + { + "epoch": 0.09024793388429753, + "grad_norm": 12.816455841064453, + "learning_rate": 9.838855195199852e-06, + "loss": 0.3792, + "step": 819 + }, + { + "epoch": 0.09035812672176309, + "grad_norm": 6.196542739868164, + "learning_rate": 9.838414587310701e-06, + "loss": 0.4371, + "step": 820 + }, + { + "epoch": 0.09046831955922865, + "grad_norm": 9.20396900177002, + "learning_rate": 9.837973387773642e-06, + "loss": 0.4938, + "step": 821 + }, + { + "epoch": 0.09057851239669422, + "grad_norm": 11.53105354309082, + "learning_rate": 9.837531596642624e-06, + "loss": 0.5538, + "step": 822 + }, + { + "epoch": 0.09068870523415978, + "grad_norm": 6.2707085609436035, + "learning_rate": 9.837089213971674e-06, + "loss": 0.3766, + "step": 823 + }, + { + "epoch": 0.09079889807162535, + "grad_norm": 9.616811752319336, + "learning_rate": 9.836646239814883e-06, + "loss": 0.446, + "step": 824 + }, + { + "epoch": 0.09090909090909091, + "grad_norm": 18.91120147705078, + "learning_rate": 9.836202674226418e-06, + "loss": 0.4863, + "step": 825 + }, + { + "epoch": 0.09101928374655648, + "grad_norm": 12.505661964416504, + "learning_rate": 9.835758517260522e-06, + "loss": 0.5459, + "step": 826 + }, + { + "epoch": 0.09112947658402204, + "grad_norm": 14.198311805725098, + "learning_rate": 9.835313768971507e-06, + "loss": 0.5098, + "step": 827 + }, + { + "epoch": 0.0912396694214876, + "grad_norm": 6.708476543426514, + "learning_rate": 9.834868429413753e-06, + "loss": 0.467, + "step": 828 + }, + { + "epoch": 0.09134986225895317, + "grad_norm": 9.290276527404785, + "learning_rate": 9.834422498641722e-06, + "loss": 0.4402, + "step": 829 + }, + { + "epoch": 0.09146005509641873, + "grad_norm": 
5.819812297821045, + "learning_rate": 9.833975976709942e-06, + "loss": 0.4294, + "step": 830 + }, + { + "epoch": 0.0915702479338843, + "grad_norm": 5.357295513153076, + "learning_rate": 9.833528863673013e-06, + "loss": 0.4823, + "step": 831 + }, + { + "epoch": 0.09168044077134986, + "grad_norm": 7.369269371032715, + "learning_rate": 9.833081159585607e-06, + "loss": 0.4368, + "step": 832 + }, + { + "epoch": 0.09179063360881543, + "grad_norm": 12.383874893188477, + "learning_rate": 9.832632864502472e-06, + "loss": 0.5421, + "step": 833 + }, + { + "epoch": 0.09190082644628099, + "grad_norm": 8.38817024230957, + "learning_rate": 9.832183978478426e-06, + "loss": 0.5052, + "step": 834 + }, + { + "epoch": 0.09201101928374655, + "grad_norm": 5.2538065910339355, + "learning_rate": 9.831734501568362e-06, + "loss": 0.4917, + "step": 835 + }, + { + "epoch": 0.09212121212121212, + "grad_norm": 9.768592834472656, + "learning_rate": 9.831284433827238e-06, + "loss": 0.5308, + "step": 836 + }, + { + "epoch": 0.09223140495867768, + "grad_norm": 7.479040145874023, + "learning_rate": 9.83083377531009e-06, + "loss": 0.5141, + "step": 837 + }, + { + "epoch": 0.09234159779614325, + "grad_norm": 11.684398651123047, + "learning_rate": 9.830382526072027e-06, + "loss": 0.496, + "step": 838 + }, + { + "epoch": 0.09245179063360881, + "grad_norm": 5.878693103790283, + "learning_rate": 9.829930686168225e-06, + "loss": 0.4375, + "step": 839 + }, + { + "epoch": 0.09256198347107437, + "grad_norm": 6.7616963386535645, + "learning_rate": 9.82947825565394e-06, + "loss": 0.3976, + "step": 840 + }, + { + "epoch": 0.09267217630853994, + "grad_norm": 6.9319167137146, + "learning_rate": 9.829025234584493e-06, + "loss": 0.4774, + "step": 841 + }, + { + "epoch": 0.0927823691460055, + "grad_norm": 7.360224723815918, + "learning_rate": 9.828571623015282e-06, + "loss": 0.4056, + "step": 842 + }, + { + "epoch": 0.09289256198347108, + "grad_norm": 10.568782806396484, + "learning_rate": 9.828117421001773e-06, + "loss": 0.4599, + "step": 843 + }, + { + "epoch": 0.09300275482093665, + "grad_norm": 13.017199516296387, + "learning_rate": 9.827662628599507e-06, + "loss": 0.5339, + "step": 844 + }, + { + "epoch": 0.09311294765840221, + "grad_norm": 8.242898941040039, + "learning_rate": 9.827207245864097e-06, + "loss": 0.5639, + "step": 845 + }, + { + "epoch": 0.09322314049586777, + "grad_norm": 10.402332305908203, + "learning_rate": 9.826751272851228e-06, + "loss": 0.5339, + "step": 846 + }, + { + "epoch": 0.09333333333333334, + "grad_norm": 7.68806266784668, + "learning_rate": 9.826294709616657e-06, + "loss": 0.3979, + "step": 847 + }, + { + "epoch": 0.0934435261707989, + "grad_norm": 5.082879066467285, + "learning_rate": 9.825837556216214e-06, + "loss": 0.4721, + "step": 848 + }, + { + "epoch": 0.09355371900826447, + "grad_norm": 15.04353141784668, + "learning_rate": 9.825379812705797e-06, + "loss": 0.6114, + "step": 849 + }, + { + "epoch": 0.09366391184573003, + "grad_norm": 7.5867462158203125, + "learning_rate": 9.824921479141385e-06, + "loss": 0.4747, + "step": 850 + }, + { + "epoch": 0.0937741046831956, + "grad_norm": 5.760891437530518, + "learning_rate": 9.824462555579019e-06, + "loss": 0.4118, + "step": 851 + }, + { + "epoch": 0.09388429752066116, + "grad_norm": 11.626730918884277, + "learning_rate": 9.824003042074818e-06, + "loss": 0.5749, + "step": 852 + }, + { + "epoch": 0.09399449035812672, + "grad_norm": 11.019942283630371, + "learning_rate": 9.823542938684972e-06, + "loss": 0.4681, + "step": 853 + }, + { + "epoch": 
0.09410468319559229, + "grad_norm": 6.9120402336120605, + "learning_rate": 9.823082245465743e-06, + "loss": 0.4679, + "step": 854 + }, + { + "epoch": 0.09421487603305785, + "grad_norm": 6.156190395355225, + "learning_rate": 9.822620962473466e-06, + "loss": 0.4294, + "step": 855 + }, + { + "epoch": 0.09432506887052342, + "grad_norm": 11.259407997131348, + "learning_rate": 9.822159089764549e-06, + "loss": 0.5178, + "step": 856 + }, + { + "epoch": 0.09443526170798898, + "grad_norm": 9.78303050994873, + "learning_rate": 9.821696627395465e-06, + "loss": 0.4396, + "step": 857 + }, + { + "epoch": 0.09454545454545454, + "grad_norm": 11.318187713623047, + "learning_rate": 9.82123357542277e-06, + "loss": 0.442, + "step": 858 + }, + { + "epoch": 0.09465564738292011, + "grad_norm": 8.33297061920166, + "learning_rate": 9.820769933903082e-06, + "loss": 0.4594, + "step": 859 + }, + { + "epoch": 0.09476584022038567, + "grad_norm": 7.704953193664551, + "learning_rate": 9.8203057028931e-06, + "loss": 0.4941, + "step": 860 + }, + { + "epoch": 0.09487603305785124, + "grad_norm": 7.631678581237793, + "learning_rate": 9.81984088244959e-06, + "loss": 0.4377, + "step": 861 + }, + { + "epoch": 0.0949862258953168, + "grad_norm": 8.531377792358398, + "learning_rate": 9.819375472629388e-06, + "loss": 0.418, + "step": 862 + }, + { + "epoch": 0.09509641873278236, + "grad_norm": 7.013405799865723, + "learning_rate": 9.818909473489406e-06, + "loss": 0.4727, + "step": 863 + }, + { + "epoch": 0.09520661157024793, + "grad_norm": 9.04765796661377, + "learning_rate": 9.81844288508663e-06, + "loss": 0.4904, + "step": 864 + }, + { + "epoch": 0.09531680440771349, + "grad_norm": 6.089433193206787, + "learning_rate": 9.817975707478111e-06, + "loss": 0.5192, + "step": 865 + }, + { + "epoch": 0.09542699724517906, + "grad_norm": 10.373228073120117, + "learning_rate": 9.817507940720978e-06, + "loss": 0.5992, + "step": 866 + }, + { + "epoch": 0.09553719008264462, + "grad_norm": 12.56816577911377, + "learning_rate": 9.817039584872433e-06, + "loss": 0.5209, + "step": 867 + }, + { + "epoch": 0.0956473829201102, + "grad_norm": 7.873315811157227, + "learning_rate": 9.81657063998974e-06, + "loss": 0.472, + "step": 868 + }, + { + "epoch": 0.09575757575757576, + "grad_norm": 8.665975570678711, + "learning_rate": 9.816101106130249e-06, + "loss": 0.4502, + "step": 869 + }, + { + "epoch": 0.09586776859504133, + "grad_norm": 8.683157920837402, + "learning_rate": 9.815630983351372e-06, + "loss": 0.5341, + "step": 870 + }, + { + "epoch": 0.09597796143250689, + "grad_norm": 5.375618934631348, + "learning_rate": 9.815160271710596e-06, + "loss": 0.4331, + "step": 871 + }, + { + "epoch": 0.09608815426997246, + "grad_norm": 8.436710357666016, + "learning_rate": 9.814688971265482e-06, + "loss": 0.3948, + "step": 872 + }, + { + "epoch": 0.09619834710743802, + "grad_norm": 11.841187477111816, + "learning_rate": 9.814217082073662e-06, + "loss": 0.6072, + "step": 873 + }, + { + "epoch": 0.09630853994490358, + "grad_norm": 9.360101699829102, + "learning_rate": 9.813744604192836e-06, + "loss": 0.4533, + "step": 874 + }, + { + "epoch": 0.09641873278236915, + "grad_norm": 8.671298027038574, + "learning_rate": 9.81327153768078e-06, + "loss": 0.4271, + "step": 875 + }, + { + "epoch": 0.09652892561983471, + "grad_norm": 7.9733052253723145, + "learning_rate": 9.812797882595345e-06, + "loss": 0.5111, + "step": 876 + }, + { + "epoch": 0.09663911845730028, + "grad_norm": 5.683726787567139, + "learning_rate": 9.812323638994446e-06, + "loss": 0.4318, + "step": 877 + }, + 
{ + "epoch": 0.09674931129476584, + "grad_norm": 13.997456550598145, + "learning_rate": 9.811848806936076e-06, + "loss": 0.513, + "step": 878 + }, + { + "epoch": 0.0968595041322314, + "grad_norm": 10.461240768432617, + "learning_rate": 9.811373386478296e-06, + "loss": 0.4362, + "step": 879 + }, + { + "epoch": 0.09696969696969697, + "grad_norm": 10.092257499694824, + "learning_rate": 9.810897377679243e-06, + "loss": 0.5318, + "step": 880 + }, + { + "epoch": 0.09707988980716253, + "grad_norm": 10.155533790588379, + "learning_rate": 9.810420780597126e-06, + "loss": 0.5287, + "step": 881 + }, + { + "epoch": 0.0971900826446281, + "grad_norm": 13.351264953613281, + "learning_rate": 9.809943595290219e-06, + "loss": 0.6615, + "step": 882 + }, + { + "epoch": 0.09730027548209366, + "grad_norm": 10.560681343078613, + "learning_rate": 9.809465821816877e-06, + "loss": 0.4489, + "step": 883 + }, + { + "epoch": 0.09741046831955923, + "grad_norm": 6.292139530181885, + "learning_rate": 9.808987460235521e-06, + "loss": 0.4243, + "step": 884 + }, + { + "epoch": 0.09752066115702479, + "grad_norm": 23.3513126373291, + "learning_rate": 9.808508510604647e-06, + "loss": 0.569, + "step": 885 + }, + { + "epoch": 0.09763085399449035, + "grad_norm": 8.537981033325195, + "learning_rate": 9.808028972982818e-06, + "loss": 0.4615, + "step": 886 + }, + { + "epoch": 0.09774104683195592, + "grad_norm": 11.562088012695312, + "learning_rate": 9.807548847428678e-06, + "loss": 0.4932, + "step": 887 + }, + { + "epoch": 0.09785123966942148, + "grad_norm": 9.631924629211426, + "learning_rate": 9.807068134000933e-06, + "loss": 0.4441, + "step": 888 + }, + { + "epoch": 0.09796143250688705, + "grad_norm": 6.422639846801758, + "learning_rate": 9.806586832758367e-06, + "loss": 0.524, + "step": 889 + }, + { + "epoch": 0.09807162534435261, + "grad_norm": 8.23627758026123, + "learning_rate": 9.806104943759832e-06, + "loss": 0.507, + "step": 890 + }, + { + "epoch": 0.09818181818181818, + "grad_norm": 7.2150654792785645, + "learning_rate": 9.80562246706426e-06, + "loss": 0.4919, + "step": 891 + }, + { + "epoch": 0.09829201101928374, + "grad_norm": 8.93393325805664, + "learning_rate": 9.805139402730641e-06, + "loss": 0.5218, + "step": 892 + }, + { + "epoch": 0.09840220385674932, + "grad_norm": 14.821327209472656, + "learning_rate": 9.804655750818051e-06, + "loss": 0.5575, + "step": 893 + }, + { + "epoch": 0.09851239669421488, + "grad_norm": 8.87337875366211, + "learning_rate": 9.80417151138563e-06, + "loss": 0.509, + "step": 894 + }, + { + "epoch": 0.09862258953168045, + "grad_norm": 7.318422317504883, + "learning_rate": 9.803686684492589e-06, + "loss": 0.4545, + "step": 895 + }, + { + "epoch": 0.09873278236914601, + "grad_norm": 10.265007019042969, + "learning_rate": 9.803201270198215e-06, + "loss": 0.5321, + "step": 896 + }, + { + "epoch": 0.09884297520661157, + "grad_norm": 8.000826835632324, + "learning_rate": 9.802715268561867e-06, + "loss": 0.4072, + "step": 897 + }, + { + "epoch": 0.09895316804407714, + "grad_norm": 9.18450927734375, + "learning_rate": 9.802228679642971e-06, + "loss": 0.4839, + "step": 898 + }, + { + "epoch": 0.0990633608815427, + "grad_norm": 8.497360229492188, + "learning_rate": 9.801741503501028e-06, + "loss": 0.5484, + "step": 899 + }, + { + "epoch": 0.09917355371900827, + "grad_norm": 14.225048065185547, + "learning_rate": 9.801253740195613e-06, + "loss": 0.551, + "step": 900 + }, + { + "epoch": 0.09928374655647383, + "grad_norm": 6.575657844543457, + "learning_rate": 9.800765389786368e-06, + "loss": 0.4461, + 
"step": 901 + }, + { + "epoch": 0.0993939393939394, + "grad_norm": 6.189450740814209, + "learning_rate": 9.80027645233301e-06, + "loss": 0.4668, + "step": 902 + }, + { + "epoch": 0.09950413223140496, + "grad_norm": 6.756643295288086, + "learning_rate": 9.799786927895328e-06, + "loss": 0.5309, + "step": 903 + }, + { + "epoch": 0.09961432506887052, + "grad_norm": 7.314714431762695, + "learning_rate": 9.799296816533178e-06, + "loss": 0.473, + "step": 904 + }, + { + "epoch": 0.09972451790633609, + "grad_norm": 5.048830032348633, + "learning_rate": 9.798806118306496e-06, + "loss": 0.5075, + "step": 905 + }, + { + "epoch": 0.09983471074380165, + "grad_norm": 10.5199613571167, + "learning_rate": 9.798314833275281e-06, + "loss": 0.6008, + "step": 906 + }, + { + "epoch": 0.09994490358126722, + "grad_norm": 14.887784004211426, + "learning_rate": 9.797822961499614e-06, + "loss": 0.4552, + "step": 907 + }, + { + "epoch": 0.10005509641873278, + "grad_norm": 7.37053918838501, + "learning_rate": 9.797330503039636e-06, + "loss": 0.4901, + "step": 908 + }, + { + "epoch": 0.10005509641873278, + "eval_loss": 0.47280246019363403, + "eval_runtime": 41.9516, + "eval_samples_per_second": 17.496, + "eval_steps_per_second": 2.193, + "step": 908 + }, + { + "epoch": 0.10016528925619834, + "grad_norm": 7.933466911315918, + "learning_rate": 9.796837457955568e-06, + "loss": 0.4259, + "step": 909 + }, + { + "epoch": 0.10027548209366391, + "grad_norm": 5.126168727874756, + "learning_rate": 9.7963438263077e-06, + "loss": 0.466, + "step": 910 + }, + { + "epoch": 0.10038567493112947, + "grad_norm": 9.237092018127441, + "learning_rate": 9.795849608156393e-06, + "loss": 0.5476, + "step": 911 + }, + { + "epoch": 0.10049586776859504, + "grad_norm": 9.323186874389648, + "learning_rate": 9.79535480356208e-06, + "loss": 0.4387, + "step": 912 + }, + { + "epoch": 0.1006060606060606, + "grad_norm": 9.716706275939941, + "learning_rate": 9.79485941258527e-06, + "loss": 0.4289, + "step": 913 + }, + { + "epoch": 0.10071625344352617, + "grad_norm": 12.264853477478027, + "learning_rate": 9.794363435286538e-06, + "loss": 0.4621, + "step": 914 + }, + { + "epoch": 0.10082644628099173, + "grad_norm": 10.040712356567383, + "learning_rate": 9.793866871726533e-06, + "loss": 0.5254, + "step": 915 + }, + { + "epoch": 0.1009366391184573, + "grad_norm": 8.736623764038086, + "learning_rate": 9.793369721965973e-06, + "loss": 0.4578, + "step": 916 + }, + { + "epoch": 0.10104683195592286, + "grad_norm": 10.712364196777344, + "learning_rate": 9.792871986065653e-06, + "loss": 0.4672, + "step": 917 + }, + { + "epoch": 0.10115702479338844, + "grad_norm": 6.687889099121094, + "learning_rate": 9.792373664086437e-06, + "loss": 0.4849, + "step": 918 + }, + { + "epoch": 0.101267217630854, + "grad_norm": 9.594983100891113, + "learning_rate": 9.791874756089258e-06, + "loss": 0.5466, + "step": 919 + }, + { + "epoch": 0.10137741046831956, + "grad_norm": 14.560150146484375, + "learning_rate": 9.791375262135126e-06, + "loss": 0.5258, + "step": 920 + }, + { + "epoch": 0.10148760330578513, + "grad_norm": 9.519214630126953, + "learning_rate": 9.790875182285119e-06, + "loss": 0.4361, + "step": 921 + }, + { + "epoch": 0.1015977961432507, + "grad_norm": 10.702827453613281, + "learning_rate": 9.790374516600384e-06, + "loss": 0.4343, + "step": 922 + }, + { + "epoch": 0.10170798898071626, + "grad_norm": 13.46514892578125, + "learning_rate": 9.78987326514215e-06, + "loss": 0.465, + "step": 923 + }, + { + "epoch": 0.10181818181818182, + "grad_norm": 9.053117752075195, + 
"learning_rate": 9.789371427971703e-06, + "loss": 0.4946, + "step": 924 + }, + { + "epoch": 0.10192837465564739, + "grad_norm": 8.024469375610352, + "learning_rate": 9.788869005150415e-06, + "loss": 0.5081, + "step": 925 + }, + { + "epoch": 0.10203856749311295, + "grad_norm": 8.866372108459473, + "learning_rate": 9.788365996739719e-06, + "loss": 0.4919, + "step": 926 + }, + { + "epoch": 0.10214876033057851, + "grad_norm": 9.30793285369873, + "learning_rate": 9.787862402801125e-06, + "loss": 0.5046, + "step": 927 + }, + { + "epoch": 0.10225895316804408, + "grad_norm": 7.904647350311279, + "learning_rate": 9.787358223396211e-06, + "loss": 0.5348, + "step": 928 + }, + { + "epoch": 0.10236914600550964, + "grad_norm": 6.154333591461182, + "learning_rate": 9.786853458586632e-06, + "loss": 0.4739, + "step": 929 + }, + { + "epoch": 0.1024793388429752, + "grad_norm": 6.14103889465332, + "learning_rate": 9.78634810843411e-06, + "loss": 0.496, + "step": 930 + }, + { + "epoch": 0.10258953168044077, + "grad_norm": 6.138935089111328, + "learning_rate": 9.785842173000439e-06, + "loss": 0.4887, + "step": 931 + }, + { + "epoch": 0.10269972451790634, + "grad_norm": 8.407147407531738, + "learning_rate": 9.785335652347485e-06, + "loss": 0.4524, + "step": 932 + }, + { + "epoch": 0.1028099173553719, + "grad_norm": 7.617428302764893, + "learning_rate": 9.784828546537189e-06, + "loss": 0.4698, + "step": 933 + }, + { + "epoch": 0.10292011019283746, + "grad_norm": 6.29920768737793, + "learning_rate": 9.784320855631558e-06, + "loss": 0.4895, + "step": 934 + }, + { + "epoch": 0.10303030303030303, + "grad_norm": 9.820000648498535, + "learning_rate": 9.783812579692675e-06, + "loss": 0.3639, + "step": 935 + }, + { + "epoch": 0.10314049586776859, + "grad_norm": 8.362419128417969, + "learning_rate": 9.78330371878269e-06, + "loss": 0.432, + "step": 936 + }, + { + "epoch": 0.10325068870523416, + "grad_norm": 7.344079971313477, + "learning_rate": 9.782794272963829e-06, + "loss": 0.4791, + "step": 937 + }, + { + "epoch": 0.10336088154269972, + "grad_norm": 12.409881591796875, + "learning_rate": 9.782284242298388e-06, + "loss": 0.5734, + "step": 938 + }, + { + "epoch": 0.10347107438016528, + "grad_norm": 8.979248046875, + "learning_rate": 9.781773626848735e-06, + "loss": 0.4816, + "step": 939 + }, + { + "epoch": 0.10358126721763085, + "grad_norm": 13.881377220153809, + "learning_rate": 9.781262426677304e-06, + "loss": 0.569, + "step": 940 + }, + { + "epoch": 0.10369146005509641, + "grad_norm": 8.64345645904541, + "learning_rate": 9.780750641846613e-06, + "loss": 0.4471, + "step": 941 + }, + { + "epoch": 0.10380165289256198, + "grad_norm": 9.803803443908691, + "learning_rate": 9.780238272419237e-06, + "loss": 0.5504, + "step": 942 + }, + { + "epoch": 0.10391184573002755, + "grad_norm": 8.662821769714355, + "learning_rate": 9.779725318457833e-06, + "loss": 0.4825, + "step": 943 + }, + { + "epoch": 0.10402203856749312, + "grad_norm": 6.113463878631592, + "learning_rate": 9.779211780025122e-06, + "loss": 0.5457, + "step": 944 + }, + { + "epoch": 0.10413223140495868, + "grad_norm": 7.981666564941406, + "learning_rate": 9.778697657183906e-06, + "loss": 0.5118, + "step": 945 + }, + { + "epoch": 0.10424242424242425, + "grad_norm": 5.331852436065674, + "learning_rate": 9.778182949997047e-06, + "loss": 0.4881, + "step": 946 + }, + { + "epoch": 0.10435261707988981, + "grad_norm": 9.32343864440918, + "learning_rate": 9.777667658527487e-06, + "loss": 0.4887, + "step": 947 + }, + { + "epoch": 0.10446280991735538, + "grad_norm": 
9.610448837280273, + "learning_rate": 9.777151782838236e-06, + "loss": 0.4562, + "step": 948 + }, + { + "epoch": 0.10457300275482094, + "grad_norm": 8.85430908203125, + "learning_rate": 9.776635322992377e-06, + "loss": 0.3829, + "step": 949 + }, + { + "epoch": 0.1046831955922865, + "grad_norm": 9.623668670654297, + "learning_rate": 9.77611827905306e-06, + "loss": 0.4647, + "step": 950 + }, + { + "epoch": 0.10479338842975207, + "grad_norm": 8.303234100341797, + "learning_rate": 9.775600651083511e-06, + "loss": 0.455, + "step": 951 + }, + { + "epoch": 0.10490358126721763, + "grad_norm": 6.702611446380615, + "learning_rate": 9.77508243914703e-06, + "loss": 0.4579, + "step": 952 + }, + { + "epoch": 0.1050137741046832, + "grad_norm": 8.47449016571045, + "learning_rate": 9.774563643306982e-06, + "loss": 0.469, + "step": 953 + }, + { + "epoch": 0.10512396694214876, + "grad_norm": 9.198047637939453, + "learning_rate": 9.774044263626804e-06, + "loss": 0.4984, + "step": 954 + }, + { + "epoch": 0.10523415977961433, + "grad_norm": 9.3027925491333, + "learning_rate": 9.773524300170012e-06, + "loss": 0.4517, + "step": 955 + }, + { + "epoch": 0.10534435261707989, + "grad_norm": 10.216512680053711, + "learning_rate": 9.773003753000184e-06, + "loss": 0.4854, + "step": 956 + }, + { + "epoch": 0.10545454545454545, + "grad_norm": 9.105542182922363, + "learning_rate": 9.77248262218097e-06, + "loss": 0.5126, + "step": 957 + }, + { + "epoch": 0.10556473829201102, + "grad_norm": 6.106430530548096, + "learning_rate": 9.771960907776102e-06, + "loss": 0.437, + "step": 958 + }, + { + "epoch": 0.10567493112947658, + "grad_norm": 17.164031982421875, + "learning_rate": 9.771438609849368e-06, + "loss": 0.4824, + "step": 959 + }, + { + "epoch": 0.10578512396694215, + "grad_norm": 8.929496765136719, + "learning_rate": 9.770915728464643e-06, + "loss": 0.5196, + "step": 960 + }, + { + "epoch": 0.10589531680440771, + "grad_norm": 6.128411769866943, + "learning_rate": 9.770392263685861e-06, + "loss": 0.424, + "step": 961 + }, + { + "epoch": 0.10600550964187327, + "grad_norm": 8.186613082885742, + "learning_rate": 9.769868215577033e-06, + "loss": 0.5163, + "step": 962 + }, + { + "epoch": 0.10611570247933884, + "grad_norm": 9.785344123840332, + "learning_rate": 9.76934358420224e-06, + "loss": 0.388, + "step": 963 + }, + { + "epoch": 0.1062258953168044, + "grad_norm": 4.475244522094727, + "learning_rate": 9.768818369625635e-06, + "loss": 0.3925, + "step": 964 + }, + { + "epoch": 0.10633608815426997, + "grad_norm": 7.028677463531494, + "learning_rate": 9.768292571911443e-06, + "loss": 0.4047, + "step": 965 + }, + { + "epoch": 0.10644628099173553, + "grad_norm": 7.164196968078613, + "learning_rate": 9.767766191123957e-06, + "loss": 0.385, + "step": 966 + }, + { + "epoch": 0.1065564738292011, + "grad_norm": 9.74470329284668, + "learning_rate": 9.767239227327545e-06, + "loss": 0.4755, + "step": 967 + }, + { + "epoch": 0.10666666666666667, + "grad_norm": 8.157772064208984, + "learning_rate": 9.766711680586644e-06, + "loss": 0.3744, + "step": 968 + }, + { + "epoch": 0.10677685950413224, + "grad_norm": 9.312668800354004, + "learning_rate": 9.766183550965767e-06, + "loss": 0.5155, + "step": 969 + }, + { + "epoch": 0.1068870523415978, + "grad_norm": 14.372354507446289, + "learning_rate": 9.765654838529488e-06, + "loss": 0.5237, + "step": 970 + }, + { + "epoch": 0.10699724517906337, + "grad_norm": 8.154521942138672, + "learning_rate": 9.765125543342461e-06, + "loss": 0.4389, + "step": 971 + }, + { + "epoch": 0.10710743801652893, + 
"grad_norm": 8.324005126953125, + "learning_rate": 9.764595665469413e-06, + "loss": 0.4308, + "step": 972 + }, + { + "epoch": 0.1072176308539945, + "grad_norm": 8.264619827270508, + "learning_rate": 9.764065204975132e-06, + "loss": 0.4676, + "step": 973 + }, + { + "epoch": 0.10732782369146006, + "grad_norm": 14.091426849365234, + "learning_rate": 9.763534161924489e-06, + "loss": 0.4343, + "step": 974 + }, + { + "epoch": 0.10743801652892562, + "grad_norm": 14.29140567779541, + "learning_rate": 9.763002536382416e-06, + "loss": 0.469, + "step": 975 + }, + { + "epoch": 0.10754820936639119, + "grad_norm": 7.155638217926025, + "learning_rate": 9.762470328413925e-06, + "loss": 0.4953, + "step": 976 + }, + { + "epoch": 0.10765840220385675, + "grad_norm": 11.622149467468262, + "learning_rate": 9.761937538084092e-06, + "loss": 0.4917, + "step": 977 + }, + { + "epoch": 0.10776859504132232, + "grad_norm": 8.021838188171387, + "learning_rate": 9.761404165458068e-06, + "loss": 0.4889, + "step": 978 + }, + { + "epoch": 0.10787878787878788, + "grad_norm": 13.566268920898438, + "learning_rate": 9.760870210601074e-06, + "loss": 0.6036, + "step": 979 + }, + { + "epoch": 0.10798898071625344, + "grad_norm": 15.852530479431152, + "learning_rate": 9.760335673578405e-06, + "loss": 0.4561, + "step": 980 + }, + { + "epoch": 0.10809917355371901, + "grad_norm": 7.8272294998168945, + "learning_rate": 9.759800554455424e-06, + "loss": 0.4706, + "step": 981 + }, + { + "epoch": 0.10820936639118457, + "grad_norm": 7.1814398765563965, + "learning_rate": 9.759264853297565e-06, + "loss": 0.424, + "step": 982 + }, + { + "epoch": 0.10831955922865014, + "grad_norm": 7.137000560760498, + "learning_rate": 9.758728570170335e-06, + "loss": 0.3955, + "step": 983 + }, + { + "epoch": 0.1084297520661157, + "grad_norm": 11.43804931640625, + "learning_rate": 9.75819170513931e-06, + "loss": 0.5115, + "step": 984 + }, + { + "epoch": 0.10853994490358126, + "grad_norm": 7.88161039352417, + "learning_rate": 9.757654258270141e-06, + "loss": 0.4613, + "step": 985 + }, + { + "epoch": 0.10865013774104683, + "grad_norm": 6.659247875213623, + "learning_rate": 9.757116229628547e-06, + "loss": 0.486, + "step": 986 + }, + { + "epoch": 0.10876033057851239, + "grad_norm": 12.036238670349121, + "learning_rate": 9.756577619280319e-06, + "loss": 0.5612, + "step": 987 + }, + { + "epoch": 0.10887052341597796, + "grad_norm": 12.239153861999512, + "learning_rate": 9.756038427291318e-06, + "loss": 0.5577, + "step": 988 + }, + { + "epoch": 0.10898071625344352, + "grad_norm": 8.325892448425293, + "learning_rate": 9.755498653727477e-06, + "loss": 0.551, + "step": 989 + }, + { + "epoch": 0.10909090909090909, + "grad_norm": 8.936971664428711, + "learning_rate": 9.754958298654802e-06, + "loss": 0.4942, + "step": 990 + }, + { + "epoch": 0.10920110192837465, + "grad_norm": 7.798034191131592, + "learning_rate": 9.754417362139366e-06, + "loss": 0.5114, + "step": 991 + }, + { + "epoch": 0.10931129476584021, + "grad_norm": 8.612783432006836, + "learning_rate": 9.753875844247318e-06, + "loss": 0.4531, + "step": 992 + }, + { + "epoch": 0.10942148760330579, + "grad_norm": 8.013750076293945, + "learning_rate": 9.753333745044873e-06, + "loss": 0.499, + "step": 993 + }, + { + "epoch": 0.10953168044077136, + "grad_norm": 8.845736503601074, + "learning_rate": 9.752791064598322e-06, + "loss": 0.4827, + "step": 994 + }, + { + "epoch": 0.10964187327823692, + "grad_norm": 9.831127166748047, + "learning_rate": 9.752247802974023e-06, + "loss": 0.5425, + "step": 995 + }, + { + "epoch": 
0.10975206611570248, + "grad_norm": 7.089080810546875, + "learning_rate": 9.751703960238408e-06, + "loss": 0.3942, + "step": 996 + }, + { + "epoch": 0.10986225895316805, + "grad_norm": 8.943671226501465, + "learning_rate": 9.751159536457977e-06, + "loss": 0.4379, + "step": 997 + }, + { + "epoch": 0.10997245179063361, + "grad_norm": 7.014477729797363, + "learning_rate": 9.750614531699304e-06, + "loss": 0.4104, + "step": 998 + }, + { + "epoch": 0.11008264462809918, + "grad_norm": 6.420469284057617, + "learning_rate": 9.750068946029034e-06, + "loss": 0.4419, + "step": 999 + }, + { + "epoch": 0.11019283746556474, + "grad_norm": 14.385880470275879, + "learning_rate": 9.749522779513883e-06, + "loss": 0.5737, + "step": 1000 + }, + { + "epoch": 0.1103030303030303, + "grad_norm": 19.77135467529297, + "learning_rate": 9.748976032220632e-06, + "loss": 0.534, + "step": 1001 + }, + { + "epoch": 0.11041322314049587, + "grad_norm": 9.207255363464355, + "learning_rate": 9.748428704216141e-06, + "loss": 0.5468, + "step": 1002 + }, + { + "epoch": 0.11052341597796143, + "grad_norm": 10.451583862304688, + "learning_rate": 9.747880795567338e-06, + "loss": 0.4539, + "step": 1003 + }, + { + "epoch": 0.110633608815427, + "grad_norm": 10.347034454345703, + "learning_rate": 9.747332306341222e-06, + "loss": 0.472, + "step": 1004 + }, + { + "epoch": 0.11074380165289256, + "grad_norm": 8.932332992553711, + "learning_rate": 9.746783236604864e-06, + "loss": 0.4288, + "step": 1005 + }, + { + "epoch": 0.11085399449035813, + "grad_norm": 8.662117958068848, + "learning_rate": 9.746233586425404e-06, + "loss": 0.507, + "step": 1006 + }, + { + "epoch": 0.11096418732782369, + "grad_norm": 7.668501377105713, + "learning_rate": 9.745683355870053e-06, + "loss": 0.5003, + "step": 1007 + }, + { + "epoch": 0.11107438016528925, + "grad_norm": 9.935391426086426, + "learning_rate": 9.745132545006096e-06, + "loss": 0.4623, + "step": 1008 + }, + { + "epoch": 0.11118457300275482, + "grad_norm": 9.756633758544922, + "learning_rate": 9.744581153900883e-06, + "loss": 0.5415, + "step": 1009 + }, + { + "epoch": 0.11129476584022038, + "grad_norm": 7.257644176483154, + "learning_rate": 9.744029182621845e-06, + "loss": 0.4688, + "step": 1010 + }, + { + "epoch": 0.11140495867768595, + "grad_norm": 11.103131294250488, + "learning_rate": 9.743476631236473e-06, + "loss": 0.5904, + "step": 1011 + }, + { + "epoch": 0.11151515151515151, + "grad_norm": 8.020723342895508, + "learning_rate": 9.742923499812335e-06, + "loss": 0.4852, + "step": 1012 + }, + { + "epoch": 0.11162534435261708, + "grad_norm": 8.784106254577637, + "learning_rate": 9.742369788417068e-06, + "loss": 0.4, + "step": 1013 + }, + { + "epoch": 0.11173553719008264, + "grad_norm": 8.087194442749023, + "learning_rate": 9.741815497118383e-06, + "loss": 0.4862, + "step": 1014 + }, + { + "epoch": 0.1118457300275482, + "grad_norm": 9.28781509399414, + "learning_rate": 9.741260625984057e-06, + "loss": 0.5297, + "step": 1015 + }, + { + "epoch": 0.11195592286501377, + "grad_norm": 7.464603424072266, + "learning_rate": 9.74070517508194e-06, + "loss": 0.5161, + "step": 1016 + }, + { + "epoch": 0.11206611570247933, + "grad_norm": 6.450089454650879, + "learning_rate": 9.740149144479957e-06, + "loss": 0.4945, + "step": 1017 + }, + { + "epoch": 0.1121763085399449, + "grad_norm": 7.314095973968506, + "learning_rate": 9.739592534246098e-06, + "loss": 0.4962, + "step": 1018 + }, + { + "epoch": 0.11228650137741047, + "grad_norm": 9.615967750549316, + "learning_rate": 9.739035344448425e-06, + "loss": 0.4424, + 
"step": 1019 + }, + { + "epoch": 0.11239669421487604, + "grad_norm": 5.225827693939209, + "learning_rate": 9.738477575155072e-06, + "loss": 0.4328, + "step": 1020 + }, + { + "epoch": 0.1125068870523416, + "grad_norm": 8.105597496032715, + "learning_rate": 9.737919226434245e-06, + "loss": 0.4604, + "step": 1021 + }, + { + "epoch": 0.11261707988980717, + "grad_norm": 6.439534664154053, + "learning_rate": 9.73736029835422e-06, + "loss": 0.4346, + "step": 1022 + }, + { + "epoch": 0.11272727272727273, + "grad_norm": 8.215489387512207, + "learning_rate": 9.73680079098334e-06, + "loss": 0.4113, + "step": 1023 + }, + { + "epoch": 0.1128374655647383, + "grad_norm": 9.294937133789062, + "learning_rate": 9.736240704390027e-06, + "loss": 0.5111, + "step": 1024 + }, + { + "epoch": 0.11294765840220386, + "grad_norm": 6.806695461273193, + "learning_rate": 9.735680038642767e-06, + "loss": 0.4648, + "step": 1025 + }, + { + "epoch": 0.11305785123966942, + "grad_norm": 7.230663776397705, + "learning_rate": 9.735118793810118e-06, + "loss": 0.4098, + "step": 1026 + }, + { + "epoch": 0.11316804407713499, + "grad_norm": 8.778502464294434, + "learning_rate": 9.734556969960712e-06, + "loss": 0.4613, + "step": 1027 + }, + { + "epoch": 0.11327823691460055, + "grad_norm": 8.630878448486328, + "learning_rate": 9.733994567163248e-06, + "loss": 0.4937, + "step": 1028 + }, + { + "epoch": 0.11338842975206612, + "grad_norm": 7.268901348114014, + "learning_rate": 9.733431585486499e-06, + "loss": 0.4959, + "step": 1029 + }, + { + "epoch": 0.11349862258953168, + "grad_norm": 7.849322319030762, + "learning_rate": 9.732868024999305e-06, + "loss": 0.4461, + "step": 1030 + }, + { + "epoch": 0.11360881542699725, + "grad_norm": 7.055856227874756, + "learning_rate": 9.73230388577058e-06, + "loss": 0.4543, + "step": 1031 + }, + { + "epoch": 0.11371900826446281, + "grad_norm": 6.915589332580566, + "learning_rate": 9.731739167869308e-06, + "loss": 0.555, + "step": 1032 + }, + { + "epoch": 0.11382920110192837, + "grad_norm": 19.499521255493164, + "learning_rate": 9.731173871364542e-06, + "loss": 0.5882, + "step": 1033 + }, + { + "epoch": 0.11393939393939394, + "grad_norm": 13.201210975646973, + "learning_rate": 9.730607996325408e-06, + "loss": 0.4885, + "step": 1034 + }, + { + "epoch": 0.1140495867768595, + "grad_norm": 12.455816268920898, + "learning_rate": 9.730041542821105e-06, + "loss": 0.4669, + "step": 1035 + }, + { + "epoch": 0.11415977961432507, + "grad_norm": 5.530920028686523, + "learning_rate": 9.729474510920895e-06, + "loss": 0.4181, + "step": 1036 + }, + { + "epoch": 0.11426997245179063, + "grad_norm": 5.375738620758057, + "learning_rate": 9.728906900694117e-06, + "loss": 0.2722, + "step": 1037 + }, + { + "epoch": 0.1143801652892562, + "grad_norm": 8.542257308959961, + "learning_rate": 9.728338712210181e-06, + "loss": 0.4797, + "step": 1038 + }, + { + "epoch": 0.11449035812672176, + "grad_norm": 9.09620475769043, + "learning_rate": 9.727769945538563e-06, + "loss": 0.466, + "step": 1039 + }, + { + "epoch": 0.11460055096418732, + "grad_norm": 6.134823322296143, + "learning_rate": 9.727200600748815e-06, + "loss": 0.4626, + "step": 1040 + }, + { + "epoch": 0.11471074380165289, + "grad_norm": 6.514139175415039, + "learning_rate": 9.726630677910556e-06, + "loss": 0.4281, + "step": 1041 + }, + { + "epoch": 0.11482093663911845, + "grad_norm": 7.309274673461914, + "learning_rate": 9.726060177093477e-06, + "loss": 0.4084, + "step": 1042 + }, + { + "epoch": 0.11493112947658402, + "grad_norm": 10.32424259185791, + "learning_rate": 
9.72548909836734e-06, + "loss": 0.3898, + "step": 1043 + }, + { + "epoch": 0.1150413223140496, + "grad_norm": 11.615346908569336, + "learning_rate": 9.724917441801977e-06, + "loss": 0.393, + "step": 1044 + }, + { + "epoch": 0.11515151515151516, + "grad_norm": 6.7931389808654785, + "learning_rate": 9.724345207467292e-06, + "loss": 0.4691, + "step": 1045 + }, + { + "epoch": 0.11526170798898072, + "grad_norm": 11.140974044799805, + "learning_rate": 9.723772395433257e-06, + "loss": 0.4047, + "step": 1046 + }, + { + "epoch": 0.11537190082644629, + "grad_norm": 6.690062522888184, + "learning_rate": 9.723199005769917e-06, + "loss": 0.4866, + "step": 1047 + }, + { + "epoch": 0.11548209366391185, + "grad_norm": 9.408793449401855, + "learning_rate": 9.722625038547386e-06, + "loss": 0.4788, + "step": 1048 + }, + { + "epoch": 0.11559228650137741, + "grad_norm": 8.867337226867676, + "learning_rate": 9.722050493835852e-06, + "loss": 0.4866, + "step": 1049 + }, + { + "epoch": 0.11570247933884298, + "grad_norm": 6.5331597328186035, + "learning_rate": 9.721475371705567e-06, + "loss": 0.5087, + "step": 1050 + }, + { + "epoch": 0.11581267217630854, + "grad_norm": 11.793696403503418, + "learning_rate": 9.720899672226863e-06, + "loss": 0.4775, + "step": 1051 + }, + { + "epoch": 0.1159228650137741, + "grad_norm": 16.762985229492188, + "learning_rate": 9.720323395470132e-06, + "loss": 0.5354, + "step": 1052 + }, + { + "epoch": 0.11603305785123967, + "grad_norm": 13.210665702819824, + "learning_rate": 9.719746541505844e-06, + "loss": 0.5214, + "step": 1053 + }, + { + "epoch": 0.11614325068870524, + "grad_norm": 5.774608135223389, + "learning_rate": 9.719169110404538e-06, + "loss": 0.4265, + "step": 1054 + }, + { + "epoch": 0.1162534435261708, + "grad_norm": 8.105388641357422, + "learning_rate": 9.718591102236823e-06, + "loss": 0.5145, + "step": 1055 + }, + { + "epoch": 0.11636363636363636, + "grad_norm": 5.87872314453125, + "learning_rate": 9.71801251707338e-06, + "loss": 0.45, + "step": 1056 + }, + { + "epoch": 0.11647382920110193, + "grad_norm": 5.883322715759277, + "learning_rate": 9.717433354984957e-06, + "loss": 0.4363, + "step": 1057 + }, + { + "epoch": 0.11658402203856749, + "grad_norm": 9.888565063476562, + "learning_rate": 9.716853616042375e-06, + "loss": 0.4482, + "step": 1058 + }, + { + "epoch": 0.11669421487603306, + "grad_norm": 10.839025497436523, + "learning_rate": 9.716273300316526e-06, + "loss": 0.5161, + "step": 1059 + }, + { + "epoch": 0.11680440771349862, + "grad_norm": 5.918877124786377, + "learning_rate": 9.71569240787837e-06, + "loss": 0.428, + "step": 1060 + }, + { + "epoch": 0.11691460055096418, + "grad_norm": 7.210424900054932, + "learning_rate": 9.715110938798942e-06, + "loss": 0.4684, + "step": 1061 + }, + { + "epoch": 0.11702479338842975, + "grad_norm": 9.123066902160645, + "learning_rate": 9.714528893149343e-06, + "loss": 0.4826, + "step": 1062 + }, + { + "epoch": 0.11713498622589531, + "grad_norm": 7.897624969482422, + "learning_rate": 9.713946271000747e-06, + "loss": 0.5176, + "step": 1063 + }, + { + "epoch": 0.11724517906336088, + "grad_norm": 7.387955188751221, + "learning_rate": 9.713363072424398e-06, + "loss": 0.3383, + "step": 1064 + }, + { + "epoch": 0.11735537190082644, + "grad_norm": 5.582972526550293, + "learning_rate": 9.712779297491609e-06, + "loss": 0.5117, + "step": 1065 + }, + { + "epoch": 0.117465564738292, + "grad_norm": 9.218341827392578, + "learning_rate": 9.712194946273767e-06, + "loss": 0.4588, + "step": 1066 + }, + { + "epoch": 0.11757575757575757, + 
"grad_norm": 8.894552230834961, + "learning_rate": 9.711610018842325e-06, + "loss": 0.4658, + "step": 1067 + }, + { + "epoch": 0.11768595041322313, + "grad_norm": 6.712638854980469, + "learning_rate": 9.71102451526881e-06, + "loss": 0.4981, + "step": 1068 + }, + { + "epoch": 0.11779614325068871, + "grad_norm": 5.404588222503662, + "learning_rate": 9.710438435624818e-06, + "loss": 0.443, + "step": 1069 + }, + { + "epoch": 0.11790633608815428, + "grad_norm": 7.288143634796143, + "learning_rate": 9.709851779982017e-06, + "loss": 0.5119, + "step": 1070 + }, + { + "epoch": 0.11801652892561984, + "grad_norm": 6.820754528045654, + "learning_rate": 9.709264548412141e-06, + "loss": 0.463, + "step": 1071 + }, + { + "epoch": 0.1181267217630854, + "grad_norm": 10.298928260803223, + "learning_rate": 9.708676740986999e-06, + "loss": 0.5043, + "step": 1072 + }, + { + "epoch": 0.11823691460055097, + "grad_norm": 9.05601978302002, + "learning_rate": 9.708088357778472e-06, + "loss": 0.5091, + "step": 1073 + }, + { + "epoch": 0.11834710743801653, + "grad_norm": 10.932220458984375, + "learning_rate": 9.707499398858501e-06, + "loss": 0.4609, + "step": 1074 + }, + { + "epoch": 0.1184573002754821, + "grad_norm": 7.302265167236328, + "learning_rate": 9.706909864299112e-06, + "loss": 0.4627, + "step": 1075 + }, + { + "epoch": 0.11856749311294766, + "grad_norm": 6.128036022186279, + "learning_rate": 9.70631975417239e-06, + "loss": 0.4745, + "step": 1076 + }, + { + "epoch": 0.11867768595041323, + "grad_norm": 8.51804256439209, + "learning_rate": 9.705729068550495e-06, + "loss": 0.504, + "step": 1077 + }, + { + "epoch": 0.11878787878787879, + "grad_norm": 9.401533126831055, + "learning_rate": 9.70513780750566e-06, + "loss": 0.3415, + "step": 1078 + }, + { + "epoch": 0.11889807162534435, + "grad_norm": 9.22064208984375, + "learning_rate": 9.70454597111018e-06, + "loss": 0.4953, + "step": 1079 + }, + { + "epoch": 0.11900826446280992, + "grad_norm": 10.248266220092773, + "learning_rate": 9.703953559436429e-06, + "loss": 0.5212, + "step": 1080 + }, + { + "epoch": 0.11911845730027548, + "grad_norm": 7.634253978729248, + "learning_rate": 9.703360572556845e-06, + "loss": 0.4165, + "step": 1081 + }, + { + "epoch": 0.11922865013774105, + "grad_norm": 6.5556182861328125, + "learning_rate": 9.702767010543945e-06, + "loss": 0.4619, + "step": 1082 + }, + { + "epoch": 0.11933884297520661, + "grad_norm": 8.908390998840332, + "learning_rate": 9.702172873470304e-06, + "loss": 0.5006, + "step": 1083 + }, + { + "epoch": 0.11944903581267217, + "grad_norm": 8.97170639038086, + "learning_rate": 9.701578161408578e-06, + "loss": 0.511, + "step": 1084 + }, + { + "epoch": 0.11955922865013774, + "grad_norm": 7.142294883728027, + "learning_rate": 9.700982874431488e-06, + "loss": 0.4188, + "step": 1085 + }, + { + "epoch": 0.1196694214876033, + "grad_norm": 6.893550872802734, + "learning_rate": 9.700387012611827e-06, + "loss": 0.4255, + "step": 1086 + }, + { + "epoch": 0.11977961432506887, + "grad_norm": 7.846024513244629, + "learning_rate": 9.699790576022456e-06, + "loss": 0.4637, + "step": 1087 + }, + { + "epoch": 0.11988980716253443, + "grad_norm": 7.985848426818848, + "learning_rate": 9.699193564736308e-06, + "loss": 0.4641, + "step": 1088 + }, + { + "epoch": 0.12, + "grad_norm": 12.337224006652832, + "learning_rate": 9.69859597882639e-06, + "loss": 0.5153, + "step": 1089 + }, + { + "epoch": 0.12011019283746556, + "grad_norm": 7.637415885925293, + "learning_rate": 9.697997818365774e-06, + "loss": 0.4042, + "step": 1090 + }, + { + "epoch": 
0.12022038567493112, + "grad_norm": 7.815561771392822, + "learning_rate": 9.697399083427602e-06, + "loss": 0.5332, + "step": 1091 + }, + { + "epoch": 0.12033057851239669, + "grad_norm": 10.246246337890625, + "learning_rate": 9.69679977408509e-06, + "loss": 0.4325, + "step": 1092 + }, + { + "epoch": 0.12044077134986225, + "grad_norm": 6.883440017700195, + "learning_rate": 9.69619989041152e-06, + "loss": 0.4729, + "step": 1093 + }, + { + "epoch": 0.12055096418732783, + "grad_norm": 8.58263874053955, + "learning_rate": 9.695599432480249e-06, + "loss": 0.4118, + "step": 1094 + }, + { + "epoch": 0.1206611570247934, + "grad_norm": 9.461421966552734, + "learning_rate": 9.6949984003647e-06, + "loss": 0.4378, + "step": 1095 + }, + { + "epoch": 0.12077134986225896, + "grad_norm": 10.19222354888916, + "learning_rate": 9.694396794138373e-06, + "loss": 0.4299, + "step": 1096 + }, + { + "epoch": 0.12088154269972452, + "grad_norm": 9.905485153198242, + "learning_rate": 9.693794613874825e-06, + "loss": 0.4034, + "step": 1097 + }, + { + "epoch": 0.12099173553719009, + "grad_norm": 7.480471134185791, + "learning_rate": 9.693191859647696e-06, + "loss": 0.4503, + "step": 1098 + }, + { + "epoch": 0.12110192837465565, + "grad_norm": 27.981821060180664, + "learning_rate": 9.692588531530693e-06, + "loss": 0.6124, + "step": 1099 + }, + { + "epoch": 0.12121212121212122, + "grad_norm": 12.169344902038574, + "learning_rate": 9.69198462959759e-06, + "loss": 0.5834, + "step": 1100 + }, + { + "epoch": 0.12132231404958678, + "grad_norm": 7.219008445739746, + "learning_rate": 9.691380153922235e-06, + "loss": 0.4682, + "step": 1101 + }, + { + "epoch": 0.12143250688705234, + "grad_norm": 12.253581047058105, + "learning_rate": 9.690775104578539e-06, + "loss": 0.5751, + "step": 1102 + }, + { + "epoch": 0.12154269972451791, + "grad_norm": 8.646245002746582, + "learning_rate": 9.690169481640492e-06, + "loss": 0.4849, + "step": 1103 + }, + { + "epoch": 0.12165289256198347, + "grad_norm": 8.885684967041016, + "learning_rate": 9.68956328518215e-06, + "loss": 0.4056, + "step": 1104 + }, + { + "epoch": 0.12176308539944904, + "grad_norm": 17.205263137817383, + "learning_rate": 9.68895651527764e-06, + "loss": 0.522, + "step": 1105 + }, + { + "epoch": 0.1218732782369146, + "grad_norm": 9.57472038269043, + "learning_rate": 9.688349172001157e-06, + "loss": 0.5177, + "step": 1106 + }, + { + "epoch": 0.12198347107438016, + "grad_norm": 7.7095794677734375, + "learning_rate": 9.687741255426969e-06, + "loss": 0.3892, + "step": 1107 + }, + { + "epoch": 0.12209366391184573, + "grad_norm": 7.4513092041015625, + "learning_rate": 9.687132765629412e-06, + "loss": 0.5235, + "step": 1108 + }, + { + "epoch": 0.1222038567493113, + "grad_norm": 9.538368225097656, + "learning_rate": 9.686523702682896e-06, + "loss": 0.4363, + "step": 1109 + }, + { + "epoch": 0.12231404958677686, + "grad_norm": 5.501028537750244, + "learning_rate": 9.685914066661893e-06, + "loss": 0.3747, + "step": 1110 + }, + { + "epoch": 0.12242424242424242, + "grad_norm": 8.349221229553223, + "learning_rate": 9.685303857640954e-06, + "loss": 0.4618, + "step": 1111 + }, + { + "epoch": 0.12253443526170799, + "grad_norm": 7.227693557739258, + "learning_rate": 9.684693075694696e-06, + "loss": 0.4728, + "step": 1112 + }, + { + "epoch": 0.12264462809917355, + "grad_norm": 7.649456024169922, + "learning_rate": 9.684081720897802e-06, + "loss": 0.3752, + "step": 1113 + }, + { + "epoch": 0.12275482093663911, + "grad_norm": 9.91862678527832, + "learning_rate": 9.683469793325036e-06, + "loss": 
0.4001, + "step": 1114 + }, + { + "epoch": 0.12286501377410468, + "grad_norm": 9.030213356018066, + "learning_rate": 9.68285729305122e-06, + "loss": 0.5393, + "step": 1115 + }, + { + "epoch": 0.12297520661157024, + "grad_norm": 10.885235786437988, + "learning_rate": 9.682244220151253e-06, + "loss": 0.4393, + "step": 1116 + }, + { + "epoch": 0.1230853994490358, + "grad_norm": 5.922689914703369, + "learning_rate": 9.681630574700102e-06, + "loss": 0.4039, + "step": 1117 + }, + { + "epoch": 0.12319559228650137, + "grad_norm": 5.201834678649902, + "learning_rate": 9.681016356772805e-06, + "loss": 0.3368, + "step": 1118 + }, + { + "epoch": 0.12330578512396695, + "grad_norm": 11.609149932861328, + "learning_rate": 9.680401566444472e-06, + "loss": 0.3857, + "step": 1119 + }, + { + "epoch": 0.12341597796143251, + "grad_norm": 11.552797317504883, + "learning_rate": 9.679786203790276e-06, + "loss": 0.4921, + "step": 1120 + }, + { + "epoch": 0.12352617079889808, + "grad_norm": 6.618052959442139, + "learning_rate": 9.679170268885464e-06, + "loss": 0.4078, + "step": 1121 + }, + { + "epoch": 0.12363636363636364, + "grad_norm": 6.38631010055542, + "learning_rate": 9.67855376180536e-06, + "loss": 0.4434, + "step": 1122 + }, + { + "epoch": 0.1237465564738292, + "grad_norm": 7.464217662811279, + "learning_rate": 9.677936682625344e-06, + "loss": 0.4414, + "step": 1123 + }, + { + "epoch": 0.12385674931129477, + "grad_norm": 7.782440662384033, + "learning_rate": 9.677319031420875e-06, + "loss": 0.4064, + "step": 1124 + }, + { + "epoch": 0.12396694214876033, + "grad_norm": 7.2428154945373535, + "learning_rate": 9.676700808267483e-06, + "loss": 0.3844, + "step": 1125 + }, + { + "epoch": 0.1240771349862259, + "grad_norm": 22.8863525390625, + "learning_rate": 9.676082013240764e-06, + "loss": 0.5105, + "step": 1126 + }, + { + "epoch": 0.12418732782369146, + "grad_norm": 10.620203018188477, + "learning_rate": 9.675462646416385e-06, + "loss": 0.4406, + "step": 1127 + }, + { + "epoch": 0.12429752066115703, + "grad_norm": 7.522141933441162, + "learning_rate": 9.67484270787008e-06, + "loss": 0.3763, + "step": 1128 + }, + { + "epoch": 0.12440771349862259, + "grad_norm": 7.644880294799805, + "learning_rate": 9.67422219767766e-06, + "loss": 0.4832, + "step": 1129 + }, + { + "epoch": 0.12451790633608815, + "grad_norm": 8.263649940490723, + "learning_rate": 9.673601115915001e-06, + "loss": 0.4586, + "step": 1130 + }, + { + "epoch": 0.12462809917355372, + "grad_norm": 13.16008186340332, + "learning_rate": 9.672979462658047e-06, + "loss": 0.5466, + "step": 1131 + }, + { + "epoch": 0.12473829201101928, + "grad_norm": 10.177546501159668, + "learning_rate": 9.672357237982819e-06, + "loss": 0.4415, + "step": 1132 + }, + { + "epoch": 0.12484848484848485, + "grad_norm": 9.804990768432617, + "learning_rate": 9.6717344419654e-06, + "loss": 0.4931, + "step": 1133 + }, + { + "epoch": 0.12495867768595041, + "grad_norm": 10.962959289550781, + "learning_rate": 9.67111107468195e-06, + "loss": 0.4532, + "step": 1134 + }, + { + "epoch": 0.12506887052341598, + "grad_norm": 6.473902702331543, + "learning_rate": 9.670487136208688e-06, + "loss": 0.4059, + "step": 1135 + }, + { + "epoch": 0.12517906336088155, + "grad_norm": 12.173033714294434, + "learning_rate": 9.669862626621918e-06, + "loss": 0.3997, + "step": 1136 + }, + { + "epoch": 0.1252892561983471, + "grad_norm": 10.380783081054688, + "learning_rate": 9.669237545998002e-06, + "loss": 0.4907, + "step": 1137 + }, + { + "epoch": 0.12539944903581268, + "grad_norm": 14.192329406738281, + 
"learning_rate": 9.668611894413376e-06, + "loss": 0.4613, + "step": 1138 + }, + { + "epoch": 0.12550964187327823, + "grad_norm": 10.058757781982422, + "learning_rate": 9.667985671944546e-06, + "loss": 0.3814, + "step": 1139 + }, + { + "epoch": 0.1256198347107438, + "grad_norm": 8.420469284057617, + "learning_rate": 9.667358878668088e-06, + "loss": 0.4661, + "step": 1140 + }, + { + "epoch": 0.12573002754820936, + "grad_norm": 7.803491592407227, + "learning_rate": 9.666731514660646e-06, + "loss": 0.4629, + "step": 1141 + }, + { + "epoch": 0.12584022038567494, + "grad_norm": 11.714037895202637, + "learning_rate": 9.666103579998935e-06, + "loss": 0.5287, + "step": 1142 + }, + { + "epoch": 0.1259504132231405, + "grad_norm": 9.14343547821045, + "learning_rate": 9.665475074759739e-06, + "loss": 0.4649, + "step": 1143 + }, + { + "epoch": 0.12606060606060607, + "grad_norm": 8.096257209777832, + "learning_rate": 9.664845999019914e-06, + "loss": 0.4587, + "step": 1144 + }, + { + "epoch": 0.12617079889807162, + "grad_norm": 8.276843070983887, + "learning_rate": 9.664216352856386e-06, + "loss": 0.51, + "step": 1145 + }, + { + "epoch": 0.1262809917355372, + "grad_norm": 7.214012145996094, + "learning_rate": 9.663586136346143e-06, + "loss": 0.4942, + "step": 1146 + }, + { + "epoch": 0.12639118457300275, + "grad_norm": 9.526751518249512, + "learning_rate": 9.662955349566254e-06, + "loss": 0.4873, + "step": 1147 + }, + { + "epoch": 0.12650137741046832, + "grad_norm": 6.470208644866943, + "learning_rate": 9.662323992593852e-06, + "loss": 0.4211, + "step": 1148 + }, + { + "epoch": 0.12661157024793387, + "grad_norm": 6.078673362731934, + "learning_rate": 9.661692065506136e-06, + "loss": 0.4929, + "step": 1149 + }, + { + "epoch": 0.12672176308539945, + "grad_norm": 7.904278755187988, + "learning_rate": 9.661059568380384e-06, + "loss": 0.4637, + "step": 1150 + }, + { + "epoch": 0.126831955922865, + "grad_norm": 5.249787330627441, + "learning_rate": 9.660426501293937e-06, + "loss": 0.3808, + "step": 1151 + }, + { + "epoch": 0.12694214876033058, + "grad_norm": 6.705897331237793, + "learning_rate": 9.659792864324207e-06, + "loss": 0.4353, + "step": 1152 + }, + { + "epoch": 0.12705234159779613, + "grad_norm": 6.71006965637207, + "learning_rate": 9.659158657548676e-06, + "loss": 0.3755, + "step": 1153 + }, + { + "epoch": 0.1271625344352617, + "grad_norm": 15.06179141998291, + "learning_rate": 9.658523881044892e-06, + "loss": 0.5519, + "step": 1154 + }, + { + "epoch": 0.12727272727272726, + "grad_norm": 12.418374061584473, + "learning_rate": 9.657888534890484e-06, + "loss": 0.5228, + "step": 1155 + }, + { + "epoch": 0.12738292011019284, + "grad_norm": 10.345029830932617, + "learning_rate": 9.657252619163136e-06, + "loss": 0.4888, + "step": 1156 + }, + { + "epoch": 0.12749311294765842, + "grad_norm": 9.189409255981445, + "learning_rate": 9.656616133940612e-06, + "loss": 0.4751, + "step": 1157 + }, + { + "epoch": 0.12760330578512397, + "grad_norm": 6.832703590393066, + "learning_rate": 9.655979079300744e-06, + "loss": 0.4348, + "step": 1158 + }, + { + "epoch": 0.12771349862258954, + "grad_norm": 9.597253799438477, + "learning_rate": 9.655341455321427e-06, + "loss": 0.5213, + "step": 1159 + }, + { + "epoch": 0.1278236914600551, + "grad_norm": 7.796607971191406, + "learning_rate": 9.654703262080636e-06, + "loss": 0.4712, + "step": 1160 + }, + { + "epoch": 0.12793388429752067, + "grad_norm": 5.447074890136719, + "learning_rate": 9.654064499656405e-06, + "loss": 0.4379, + "step": 1161 + }, + { + "epoch": 
0.12804407713498622, + "grad_norm": 7.612880706787109, + "learning_rate": 9.653425168126846e-06, + "loss": 0.4716, + "step": 1162 + }, + { + "epoch": 0.1281542699724518, + "grad_norm": 13.605120658874512, + "learning_rate": 9.652785267570136e-06, + "loss": 0.5116, + "step": 1163 + }, + { + "epoch": 0.12826446280991735, + "grad_norm": 9.965607643127441, + "learning_rate": 9.652144798064523e-06, + "loss": 0.4869, + "step": 1164 + }, + { + "epoch": 0.12837465564738293, + "grad_norm": 8.977864265441895, + "learning_rate": 9.651503759688325e-06, + "loss": 0.4552, + "step": 1165 + }, + { + "epoch": 0.12848484848484848, + "grad_norm": 12.288677215576172, + "learning_rate": 9.65086215251993e-06, + "loss": 0.4344, + "step": 1166 + }, + { + "epoch": 0.12859504132231406, + "grad_norm": 7.44070291519165, + "learning_rate": 9.650219976637792e-06, + "loss": 0.5138, + "step": 1167 + }, + { + "epoch": 0.1287052341597796, + "grad_norm": 5.929866790771484, + "learning_rate": 9.64957723212044e-06, + "loss": 0.4361, + "step": 1168 + }, + { + "epoch": 0.12881542699724519, + "grad_norm": 8.705809593200684, + "learning_rate": 9.648933919046466e-06, + "loss": 0.4308, + "step": 1169 + }, + { + "epoch": 0.12892561983471074, + "grad_norm": 11.498370170593262, + "learning_rate": 9.648290037494538e-06, + "loss": 0.5356, + "step": 1170 + }, + { + "epoch": 0.12903581267217631, + "grad_norm": 9.883990287780762, + "learning_rate": 9.647645587543391e-06, + "loss": 0.547, + "step": 1171 + }, + { + "epoch": 0.12914600550964186, + "grad_norm": 11.769598007202148, + "learning_rate": 9.647000569271829e-06, + "loss": 0.4811, + "step": 1172 + }, + { + "epoch": 0.12925619834710744, + "grad_norm": 13.798337936401367, + "learning_rate": 9.646354982758724e-06, + "loss": 0.5142, + "step": 1173 + }, + { + "epoch": 0.129366391184573, + "grad_norm": 15.07916259765625, + "learning_rate": 9.64570882808302e-06, + "loss": 0.5676, + "step": 1174 + }, + { + "epoch": 0.12947658402203857, + "grad_norm": 3.993927240371704, + "learning_rate": 9.64506210532373e-06, + "loss": 0.4158, + "step": 1175 + }, + { + "epoch": 0.12958677685950412, + "grad_norm": 9.357967376708984, + "learning_rate": 9.644414814559937e-06, + "loss": 0.407, + "step": 1176 + }, + { + "epoch": 0.1296969696969697, + "grad_norm": 11.830809593200684, + "learning_rate": 9.64376695587079e-06, + "loss": 0.5259, + "step": 1177 + }, + { + "epoch": 0.12980716253443525, + "grad_norm": 7.312831878662109, + "learning_rate": 9.643118529335514e-06, + "loss": 0.4122, + "step": 1178 + }, + { + "epoch": 0.12991735537190083, + "grad_norm": 7.975347518920898, + "learning_rate": 9.642469535033396e-06, + "loss": 0.4104, + "step": 1179 + }, + { + "epoch": 0.13002754820936638, + "grad_norm": 6.080129623413086, + "learning_rate": 9.641819973043796e-06, + "loss": 0.4823, + "step": 1180 + }, + { + "epoch": 0.13013774104683196, + "grad_norm": 7.574034690856934, + "learning_rate": 9.641169843446146e-06, + "loss": 0.4133, + "step": 1181 + }, + { + "epoch": 0.13024793388429753, + "grad_norm": 7.223039150238037, + "learning_rate": 9.640519146319941e-06, + "loss": 0.4473, + "step": 1182 + }, + { + "epoch": 0.13035812672176308, + "grad_norm": 20.465892791748047, + "learning_rate": 9.639867881744753e-06, + "loss": 0.4561, + "step": 1183 + }, + { + "epoch": 0.13046831955922866, + "grad_norm": 5.304706573486328, + "learning_rate": 9.63921604980022e-06, + "loss": 0.4085, + "step": 1184 + }, + { + "epoch": 0.1305785123966942, + "grad_norm": 13.216215133666992, + "learning_rate": 9.638563650566044e-06, + "loss": 
0.5271, + "step": 1185 + }, + { + "epoch": 0.1306887052341598, + "grad_norm": 5.724482536315918, + "learning_rate": 9.637910684122003e-06, + "loss": 0.4576, + "step": 1186 + }, + { + "epoch": 0.13079889807162534, + "grad_norm": 9.094064712524414, + "learning_rate": 9.637257150547945e-06, + "loss": 0.4702, + "step": 1187 + }, + { + "epoch": 0.13090909090909092, + "grad_norm": 6.847584247589111, + "learning_rate": 9.636603049923783e-06, + "loss": 0.5027, + "step": 1188 + }, + { + "epoch": 0.13101928374655647, + "grad_norm": 5.101464748382568, + "learning_rate": 9.635948382329502e-06, + "loss": 0.3008, + "step": 1189 + }, + { + "epoch": 0.13112947658402205, + "grad_norm": 10.623517990112305, + "learning_rate": 9.635293147845156e-06, + "loss": 0.4668, + "step": 1190 + }, + { + "epoch": 0.1312396694214876, + "grad_norm": 9.423023223876953, + "learning_rate": 9.634637346550866e-06, + "loss": 0.5525, + "step": 1191 + }, + { + "epoch": 0.13134986225895318, + "grad_norm": 6.9419050216674805, + "learning_rate": 9.633980978526826e-06, + "loss": 0.5164, + "step": 1192 + }, + { + "epoch": 0.13146005509641873, + "grad_norm": 5.715521335601807, + "learning_rate": 9.6333240438533e-06, + "loss": 0.4139, + "step": 1193 + }, + { + "epoch": 0.1315702479338843, + "grad_norm": 5.699647903442383, + "learning_rate": 9.632666542610614e-06, + "loss": 0.4201, + "step": 1194 + }, + { + "epoch": 0.13168044077134985, + "grad_norm": 7.819087505340576, + "learning_rate": 9.632008474879171e-06, + "loss": 0.5354, + "step": 1195 + }, + { + "epoch": 0.13179063360881543, + "grad_norm": 7.091444492340088, + "learning_rate": 9.63134984073944e-06, + "loss": 0.4269, + "step": 1196 + }, + { + "epoch": 0.13190082644628098, + "grad_norm": 5.056596755981445, + "learning_rate": 9.630690640271958e-06, + "loss": 0.4025, + "step": 1197 + }, + { + "epoch": 0.13201101928374656, + "grad_norm": 13.227201461791992, + "learning_rate": 9.630030873557335e-06, + "loss": 0.5259, + "step": 1198 + }, + { + "epoch": 0.1321212121212121, + "grad_norm": 12.597100257873535, + "learning_rate": 9.629370540676246e-06, + "loss": 0.5291, + "step": 1199 + }, + { + "epoch": 0.1322314049586777, + "grad_norm": 11.199701309204102, + "learning_rate": 9.628709641709441e-06, + "loss": 0.5655, + "step": 1200 + }, + { + "epoch": 0.13234159779614324, + "grad_norm": 10.168828010559082, + "learning_rate": 9.628048176737734e-06, + "loss": 0.4606, + "step": 1201 + }, + { + "epoch": 0.13245179063360882, + "grad_norm": 8.4885835647583, + "learning_rate": 9.627386145842008e-06, + "loss": 0.538, + "step": 1202 + }, + { + "epoch": 0.13256198347107437, + "grad_norm": 5.746736526489258, + "learning_rate": 9.626723549103218e-06, + "loss": 0.4401, + "step": 1203 + }, + { + "epoch": 0.13267217630853995, + "grad_norm": 7.922615051269531, + "learning_rate": 9.62606038660239e-06, + "loss": 0.4987, + "step": 1204 + }, + { + "epoch": 0.1327823691460055, + "grad_norm": 6.999356269836426, + "learning_rate": 9.625396658420611e-06, + "loss": 0.4446, + "step": 1205 + }, + { + "epoch": 0.13289256198347107, + "grad_norm": 9.101819038391113, + "learning_rate": 9.624732364639046e-06, + "loss": 0.5127, + "step": 1206 + }, + { + "epoch": 0.13300275482093665, + "grad_norm": 7.7360358238220215, + "learning_rate": 9.624067505338928e-06, + "loss": 0.4511, + "step": 1207 + }, + { + "epoch": 0.1331129476584022, + "grad_norm": 7.754938125610352, + "learning_rate": 9.623402080601552e-06, + "loss": 0.4846, + "step": 1208 + }, + { + "epoch": 0.13322314049586778, + "grad_norm": 9.81610107421875, + 
"learning_rate": 9.62273609050829e-06, + "loss": 0.5006, + "step": 1209 + }, + { + "epoch": 0.13333333333333333, + "grad_norm": 7.27468204498291, + "learning_rate": 9.622069535140579e-06, + "loss": 0.4858, + "step": 1210 + }, + { + "epoch": 0.1334435261707989, + "grad_norm": 11.147741317749023, + "learning_rate": 9.621402414579928e-06, + "loss": 0.485, + "step": 1211 + }, + { + "epoch": 0.13355371900826446, + "grad_norm": 4.764684200286865, + "learning_rate": 9.620734728907912e-06, + "loss": 0.4267, + "step": 1212 + }, + { + "epoch": 0.13366391184573004, + "grad_norm": 8.194490432739258, + "learning_rate": 9.620066478206176e-06, + "loss": 0.5357, + "step": 1213 + }, + { + "epoch": 0.1337741046831956, + "grad_norm": 7.495524883270264, + "learning_rate": 9.619397662556434e-06, + "loss": 0.4516, + "step": 1214 + }, + { + "epoch": 0.13388429752066117, + "grad_norm": 7.69467306137085, + "learning_rate": 9.618728282040472e-06, + "loss": 0.4443, + "step": 1215 + }, + { + "epoch": 0.13399449035812672, + "grad_norm": 5.7928466796875, + "learning_rate": 9.618058336740144e-06, + "loss": 0.4539, + "step": 1216 + }, + { + "epoch": 0.1341046831955923, + "grad_norm": 6.241455554962158, + "learning_rate": 9.617387826737367e-06, + "loss": 0.4858, + "step": 1217 + }, + { + "epoch": 0.13421487603305784, + "grad_norm": 7.450160980224609, + "learning_rate": 9.616716752114135e-06, + "loss": 0.453, + "step": 1218 + }, + { + "epoch": 0.13432506887052342, + "grad_norm": 10.77891731262207, + "learning_rate": 9.616045112952508e-06, + "loss": 0.446, + "step": 1219 + }, + { + "epoch": 0.13443526170798897, + "grad_norm": 6.600894451141357, + "learning_rate": 9.615372909334612e-06, + "loss": 0.446, + "step": 1220 + }, + { + "epoch": 0.13454545454545455, + "grad_norm": 9.839051246643066, + "learning_rate": 9.61470014134265e-06, + "loss": 0.4126, + "step": 1221 + }, + { + "epoch": 0.1346556473829201, + "grad_norm": 5.878121852874756, + "learning_rate": 9.614026809058886e-06, + "loss": 0.4811, + "step": 1222 + }, + { + "epoch": 0.13476584022038568, + "grad_norm": 6.809379577636719, + "learning_rate": 9.613352912565656e-06, + "loss": 0.4785, + "step": 1223 + }, + { + "epoch": 0.13487603305785123, + "grad_norm": 11.274439811706543, + "learning_rate": 9.612678451945364e-06, + "loss": 0.5477, + "step": 1224 + }, + { + "epoch": 0.1349862258953168, + "grad_norm": 7.883140563964844, + "learning_rate": 9.612003427280487e-06, + "loss": 0.4922, + "step": 1225 + }, + { + "epoch": 0.13509641873278236, + "grad_norm": 11.556133270263672, + "learning_rate": 9.611327838653563e-06, + "loss": 0.5489, + "step": 1226 + }, + { + "epoch": 0.13520661157024794, + "grad_norm": 5.750260829925537, + "learning_rate": 9.61065168614721e-06, + "loss": 0.3844, + "step": 1227 + }, + { + "epoch": 0.1353168044077135, + "grad_norm": 7.41603422164917, + "learning_rate": 9.609974969844105e-06, + "loss": 0.3361, + "step": 1228 + }, + { + "epoch": 0.13542699724517906, + "grad_norm": 6.057075500488281, + "learning_rate": 9.609297689827e-06, + "loss": 0.39, + "step": 1229 + }, + { + "epoch": 0.13553719008264462, + "grad_norm": 7.7945146560668945, + "learning_rate": 9.608619846178711e-06, + "loss": 0.4986, + "step": 1230 + }, + { + "epoch": 0.1356473829201102, + "grad_norm": 7.78212833404541, + "learning_rate": 9.607941438982127e-06, + "loss": 0.4787, + "step": 1231 + }, + { + "epoch": 0.13575757575757577, + "grad_norm": 11.500925064086914, + "learning_rate": 9.607262468320205e-06, + "loss": 0.4491, + "step": 1232 + }, + { + "epoch": 0.13586776859504132, + 
"grad_norm": 8.095635414123535, + "learning_rate": 9.606582934275968e-06, + "loss": 0.4334, + "step": 1233 + }, + { + "epoch": 0.1359779614325069, + "grad_norm": 5.727466583251953, + "learning_rate": 9.605902836932514e-06, + "loss": 0.3908, + "step": 1234 + }, + { + "epoch": 0.13608815426997245, + "grad_norm": 8.273818016052246, + "learning_rate": 9.605222176373006e-06, + "loss": 0.4466, + "step": 1235 + }, + { + "epoch": 0.13619834710743803, + "grad_norm": 4.12173318862915, + "learning_rate": 9.604540952680672e-06, + "loss": 0.3868, + "step": 1236 + }, + { + "epoch": 0.13630853994490358, + "grad_norm": 7.504457950592041, + "learning_rate": 9.603859165938817e-06, + "loss": 0.507, + "step": 1237 + }, + { + "epoch": 0.13641873278236916, + "grad_norm": 8.912880897521973, + "learning_rate": 9.60317681623081e-06, + "loss": 0.4625, + "step": 1238 + }, + { + "epoch": 0.1365289256198347, + "grad_norm": 7.629209995269775, + "learning_rate": 9.602493903640089e-06, + "loss": 0.4457, + "step": 1239 + }, + { + "epoch": 0.13663911845730028, + "grad_norm": 11.610555648803711, + "learning_rate": 9.60181042825016e-06, + "loss": 0.4087, + "step": 1240 + }, + { + "epoch": 0.13674931129476584, + "grad_norm": 13.225961685180664, + "learning_rate": 9.601126390144602e-06, + "loss": 0.4733, + "step": 1241 + }, + { + "epoch": 0.1368595041322314, + "grad_norm": 9.693376541137695, + "learning_rate": 9.60044178940706e-06, + "loss": 0.4659, + "step": 1242 + }, + { + "epoch": 0.13696969696969696, + "grad_norm": 10.25334358215332, + "learning_rate": 9.599756626121244e-06, + "loss": 0.4908, + "step": 1243 + }, + { + "epoch": 0.13707988980716254, + "grad_norm": 15.683958053588867, + "learning_rate": 9.599070900370943e-06, + "loss": 0.5867, + "step": 1244 + }, + { + "epoch": 0.1371900826446281, + "grad_norm": 5.411210060119629, + "learning_rate": 9.598384612240004e-06, + "loss": 0.4215, + "step": 1245 + }, + { + "epoch": 0.13730027548209367, + "grad_norm": 6.647460460662842, + "learning_rate": 9.597697761812347e-06, + "loss": 0.4412, + "step": 1246 + }, + { + "epoch": 0.13741046831955922, + "grad_norm": 6.99817419052124, + "learning_rate": 9.597010349171964e-06, + "loss": 0.4447, + "step": 1247 + }, + { + "epoch": 0.1375206611570248, + "grad_norm": 7.737760543823242, + "learning_rate": 9.596322374402908e-06, + "loss": 0.4342, + "step": 1248 + }, + { + "epoch": 0.13763085399449035, + "grad_norm": 6.537383556365967, + "learning_rate": 9.595633837589313e-06, + "loss": 0.5261, + "step": 1249 + }, + { + "epoch": 0.13774104683195593, + "grad_norm": 9.799365043640137, + "learning_rate": 9.594944738815366e-06, + "loss": 0.5021, + "step": 1250 + }, + { + "epoch": 0.13785123966942148, + "grad_norm": 7.652403831481934, + "learning_rate": 9.594255078165338e-06, + "loss": 0.4058, + "step": 1251 + }, + { + "epoch": 0.13796143250688706, + "grad_norm": 6.344549655914307, + "learning_rate": 9.593564855723557e-06, + "loss": 0.3838, + "step": 1252 + }, + { + "epoch": 0.1380716253443526, + "grad_norm": 5.8052215576171875, + "learning_rate": 9.592874071574424e-06, + "loss": 0.4224, + "step": 1253 + }, + { + "epoch": 0.13818181818181818, + "grad_norm": 5.907143592834473, + "learning_rate": 9.592182725802412e-06, + "loss": 0.4224, + "step": 1254 + }, + { + "epoch": 0.13829201101928373, + "grad_norm": 8.23563289642334, + "learning_rate": 9.591490818492059e-06, + "loss": 0.4484, + "step": 1255 + }, + { + "epoch": 0.1384022038567493, + "grad_norm": 7.861181259155273, + "learning_rate": 9.590798349727972e-06, + "loss": 0.531, + "step": 1256 + }, + { 
+ "epoch": 0.1385123966942149, + "grad_norm": 11.337368965148926, + "learning_rate": 9.590105319594825e-06, + "loss": 0.4131, + "step": 1257 + }, + { + "epoch": 0.13862258953168044, + "grad_norm": 6.5620269775390625, + "learning_rate": 9.589411728177367e-06, + "loss": 0.4461, + "step": 1258 + }, + { + "epoch": 0.13873278236914602, + "grad_norm": 14.474329948425293, + "learning_rate": 9.588717575560407e-06, + "loss": 0.544, + "step": 1259 + }, + { + "epoch": 0.13884297520661157, + "grad_norm": 13.015029907226562, + "learning_rate": 9.58802286182883e-06, + "loss": 0.475, + "step": 1260 + }, + { + "epoch": 0.13895316804407715, + "grad_norm": 8.727346420288086, + "learning_rate": 9.587327587067583e-06, + "loss": 0.5245, + "step": 1261 + }, + { + "epoch": 0.1390633608815427, + "grad_norm": 17.652963638305664, + "learning_rate": 9.58663175136169e-06, + "loss": 0.4731, + "step": 1262 + }, + { + "epoch": 0.13917355371900827, + "grad_norm": 6.671087265014648, + "learning_rate": 9.585935354796235e-06, + "loss": 0.47, + "step": 1263 + }, + { + "epoch": 0.13928374655647383, + "grad_norm": 4.637528896331787, + "learning_rate": 9.585238397456373e-06, + "loss": 0.4266, + "step": 1264 + }, + { + "epoch": 0.1393939393939394, + "grad_norm": 12.280485153198242, + "learning_rate": 9.584540879427335e-06, + "loss": 0.5641, + "step": 1265 + }, + { + "epoch": 0.13950413223140495, + "grad_norm": 6.294337272644043, + "learning_rate": 9.58384280079441e-06, + "loss": 0.4443, + "step": 1266 + }, + { + "epoch": 0.13961432506887053, + "grad_norm": 8.127622604370117, + "learning_rate": 9.583144161642958e-06, + "loss": 0.4848, + "step": 1267 + }, + { + "epoch": 0.13972451790633608, + "grad_norm": 6.222387313842773, + "learning_rate": 9.582444962058416e-06, + "loss": 0.4791, + "step": 1268 + }, + { + "epoch": 0.13983471074380166, + "grad_norm": 5.600038528442383, + "learning_rate": 9.581745202126278e-06, + "loss": 0.3918, + "step": 1269 + }, + { + "epoch": 0.1399449035812672, + "grad_norm": 5.010919094085693, + "learning_rate": 9.581044881932113e-06, + "loss": 0.4981, + "step": 1270 + }, + { + "epoch": 0.1400550964187328, + "grad_norm": 8.86003589630127, + "learning_rate": 9.580344001561557e-06, + "loss": 0.5298, + "step": 1271 + }, + { + "epoch": 0.14016528925619834, + "grad_norm": 7.720181941986084, + "learning_rate": 9.579642561100314e-06, + "loss": 0.429, + "step": 1272 + }, + { + "epoch": 0.14027548209366392, + "grad_norm": 9.330036163330078, + "learning_rate": 9.578940560634157e-06, + "loss": 0.4489, + "step": 1273 + }, + { + "epoch": 0.14038567493112947, + "grad_norm": 9.135281562805176, + "learning_rate": 9.578238000248931e-06, + "loss": 0.3906, + "step": 1274 + }, + { + "epoch": 0.14049586776859505, + "grad_norm": 8.328913688659668, + "learning_rate": 9.577534880030543e-06, + "loss": 0.5004, + "step": 1275 + }, + { + "epoch": 0.1406060606060606, + "grad_norm": 8.757101058959961, + "learning_rate": 9.576831200064972e-06, + "loss": 0.5734, + "step": 1276 + }, + { + "epoch": 0.14071625344352617, + "grad_norm": 7.532750606536865, + "learning_rate": 9.576126960438264e-06, + "loss": 0.4672, + "step": 1277 + }, + { + "epoch": 0.14082644628099172, + "grad_norm": 7.0799970626831055, + "learning_rate": 9.575422161236533e-06, + "loss": 0.4517, + "step": 1278 + }, + { + "epoch": 0.1409366391184573, + "grad_norm": 9.143688201904297, + "learning_rate": 9.574716802545968e-06, + "loss": 0.5049, + "step": 1279 + }, + { + "epoch": 0.14104683195592285, + "grad_norm": 4.941235542297363, + "learning_rate": 9.574010884452817e-06, + 
"loss": 0.4532, + "step": 1280 + }, + { + "epoch": 0.14115702479338843, + "grad_norm": 9.055264472961426, + "learning_rate": 9.573304407043402e-06, + "loss": 0.4694, + "step": 1281 + }, + { + "epoch": 0.14126721763085398, + "grad_norm": 5.88823127746582, + "learning_rate": 9.572597370404114e-06, + "loss": 0.4428, + "step": 1282 + }, + { + "epoch": 0.14137741046831956, + "grad_norm": 11.091938972473145, + "learning_rate": 9.571889774621406e-06, + "loss": 0.528, + "step": 1283 + }, + { + "epoch": 0.14148760330578514, + "grad_norm": 18.7547607421875, + "learning_rate": 9.571181619781806e-06, + "loss": 0.4894, + "step": 1284 + }, + { + "epoch": 0.1415977961432507, + "grad_norm": 18.28177833557129, + "learning_rate": 9.57047290597191e-06, + "loss": 0.5075, + "step": 1285 + }, + { + "epoch": 0.14170798898071627, + "grad_norm": 5.0757975578308105, + "learning_rate": 9.569763633278377e-06, + "loss": 0.4046, + "step": 1286 + }, + { + "epoch": 0.14181818181818182, + "grad_norm": 12.676048278808594, + "learning_rate": 9.56905380178794e-06, + "loss": 0.4505, + "step": 1287 + }, + { + "epoch": 0.1419283746556474, + "grad_norm": 6.966454982757568, + "learning_rate": 9.568343411587397e-06, + "loss": 0.4769, + "step": 1288 + }, + { + "epoch": 0.14203856749311294, + "grad_norm": 6.308920860290527, + "learning_rate": 9.567632462763617e-06, + "loss": 0.5722, + "step": 1289 + }, + { + "epoch": 0.14214876033057852, + "grad_norm": 8.201804161071777, + "learning_rate": 9.566920955403533e-06, + "loss": 0.4732, + "step": 1290 + }, + { + "epoch": 0.14225895316804407, + "grad_norm": 5.8679890632629395, + "learning_rate": 9.566208889594154e-06, + "loss": 0.5137, + "step": 1291 + }, + { + "epoch": 0.14236914600550965, + "grad_norm": 8.620732307434082, + "learning_rate": 9.565496265422549e-06, + "loss": 0.5064, + "step": 1292 + }, + { + "epoch": 0.1424793388429752, + "grad_norm": 7.340023040771484, + "learning_rate": 9.564783082975856e-06, + "loss": 0.5066, + "step": 1293 + }, + { + "epoch": 0.14258953168044078, + "grad_norm": 8.920328140258789, + "learning_rate": 9.56406934234129e-06, + "loss": 0.3834, + "step": 1294 + }, + { + "epoch": 0.14269972451790633, + "grad_norm": 10.822632789611816, + "learning_rate": 9.563355043606124e-06, + "loss": 0.5246, + "step": 1295 + }, + { + "epoch": 0.1428099173553719, + "grad_norm": 8.7300443649292, + "learning_rate": 9.562640186857706e-06, + "loss": 0.5316, + "step": 1296 + }, + { + "epoch": 0.14292011019283746, + "grad_norm": 5.638113498687744, + "learning_rate": 9.561924772183446e-06, + "loss": 0.3808, + "step": 1297 + }, + { + "epoch": 0.14303030303030304, + "grad_norm": 4.380406856536865, + "learning_rate": 9.561208799670828e-06, + "loss": 0.4787, + "step": 1298 + }, + { + "epoch": 0.14314049586776859, + "grad_norm": 10.42844295501709, + "learning_rate": 9.560492269407405e-06, + "loss": 0.4683, + "step": 1299 + }, + { + "epoch": 0.14325068870523416, + "grad_norm": 6.256180286407471, + "learning_rate": 9.559775181480791e-06, + "loss": 0.3951, + "step": 1300 + }, + { + "epoch": 0.14336088154269971, + "grad_norm": 5.697518825531006, + "learning_rate": 9.559057535978673e-06, + "loss": 0.4077, + "step": 1301 + }, + { + "epoch": 0.1434710743801653, + "grad_norm": 7.972506523132324, + "learning_rate": 9.558339332988807e-06, + "loss": 0.4687, + "step": 1302 + }, + { + "epoch": 0.14358126721763084, + "grad_norm": 6.847010612487793, + "learning_rate": 9.557620572599015e-06, + "loss": 0.4682, + "step": 1303 + }, + { + "epoch": 0.14369146005509642, + "grad_norm": 8.559355735778809, + 
"learning_rate": 9.55690125489719e-06, + "loss": 0.4327, + "step": 1304 + }, + { + "epoch": 0.14380165289256197, + "grad_norm": 8.214713096618652, + "learning_rate": 9.55618137997129e-06, + "loss": 0.5354, + "step": 1305 + }, + { + "epoch": 0.14391184573002755, + "grad_norm": 9.412150382995605, + "learning_rate": 9.55546094790934e-06, + "loss": 0.491, + "step": 1306 + }, + { + "epoch": 0.1440220385674931, + "grad_norm": 10.300345420837402, + "learning_rate": 9.554739958799438e-06, + "loss": 0.5273, + "step": 1307 + }, + { + "epoch": 0.14413223140495868, + "grad_norm": 10.135249137878418, + "learning_rate": 9.554018412729747e-06, + "loss": 0.3953, + "step": 1308 + }, + { + "epoch": 0.14424242424242426, + "grad_norm": 7.0443620681762695, + "learning_rate": 9.553296309788498e-06, + "loss": 0.4439, + "step": 1309 + }, + { + "epoch": 0.1443526170798898, + "grad_norm": 7.326988220214844, + "learning_rate": 9.552573650063992e-06, + "loss": 0.5392, + "step": 1310 + }, + { + "epoch": 0.14446280991735538, + "grad_norm": 9.309418678283691, + "learning_rate": 9.551850433644596e-06, + "loss": 0.4253, + "step": 1311 + }, + { + "epoch": 0.14457300275482093, + "grad_norm": 11.001035690307617, + "learning_rate": 9.551126660618746e-06, + "loss": 0.5079, + "step": 1312 + }, + { + "epoch": 0.1446831955922865, + "grad_norm": 13.418405532836914, + "learning_rate": 9.550402331074945e-06, + "loss": 0.4895, + "step": 1313 + }, + { + "epoch": 0.14479338842975206, + "grad_norm": 8.14391040802002, + "learning_rate": 9.549677445101766e-06, + "loss": 0.4726, + "step": 1314 + }, + { + "epoch": 0.14490358126721764, + "grad_norm": 8.856776237487793, + "learning_rate": 9.54895200278785e-06, + "loss": 0.4434, + "step": 1315 + }, + { + "epoch": 0.1450137741046832, + "grad_norm": 7.328104019165039, + "learning_rate": 9.548226004221903e-06, + "loss": 0.4649, + "step": 1316 + }, + { + "epoch": 0.14512396694214877, + "grad_norm": 8.873720169067383, + "learning_rate": 9.547499449492701e-06, + "loss": 0.4917, + "step": 1317 + }, + { + "epoch": 0.14523415977961432, + "grad_norm": 8.50603199005127, + "learning_rate": 9.54677233868909e-06, + "loss": 0.4602, + "step": 1318 + }, + { + "epoch": 0.1453443526170799, + "grad_norm": 6.062439441680908, + "learning_rate": 9.546044671899982e-06, + "loss": 0.4393, + "step": 1319 + }, + { + "epoch": 0.14545454545454545, + "grad_norm": 10.409941673278809, + "learning_rate": 9.545316449214354e-06, + "loss": 0.4722, + "step": 1320 + }, + { + "epoch": 0.14556473829201103, + "grad_norm": 14.732938766479492, + "learning_rate": 9.54458767072126e-06, + "loss": 0.4956, + "step": 1321 + }, + { + "epoch": 0.14567493112947658, + "grad_norm": 7.161686897277832, + "learning_rate": 9.54385833650981e-06, + "loss": 0.455, + "step": 1322 + }, + { + "epoch": 0.14578512396694215, + "grad_norm": 7.07647180557251, + "learning_rate": 9.543128446669191e-06, + "loss": 0.4994, + "step": 1323 + }, + { + "epoch": 0.1458953168044077, + "grad_norm": 8.985993385314941, + "learning_rate": 9.542398001288654e-06, + "loss": 0.4947, + "step": 1324 + }, + { + "epoch": 0.14600550964187328, + "grad_norm": 7.013548374176025, + "learning_rate": 9.54166700045752e-06, + "loss": 0.4678, + "step": 1325 + }, + { + "epoch": 0.14611570247933883, + "grad_norm": 11.606616020202637, + "learning_rate": 9.540935444265175e-06, + "loss": 0.5178, + "step": 1326 + }, + { + "epoch": 0.1462258953168044, + "grad_norm": 9.271910667419434, + "learning_rate": 9.540203332801075e-06, + "loss": 0.5439, + "step": 1327 + }, + { + "epoch": 0.14633608815426996, + 
"grad_norm": 6.152432918548584, + "learning_rate": 9.539470666154747e-06, + "loss": 0.455, + "step": 1328 + }, + { + "epoch": 0.14644628099173554, + "grad_norm": 6.23866081237793, + "learning_rate": 9.538737444415777e-06, + "loss": 0.4138, + "step": 1329 + }, + { + "epoch": 0.1465564738292011, + "grad_norm": 7.096884727478027, + "learning_rate": 9.538003667673828e-06, + "loss": 0.5072, + "step": 1330 + }, + { + "epoch": 0.14666666666666667, + "grad_norm": 6.731375694274902, + "learning_rate": 9.537269336018627e-06, + "loss": 0.4115, + "step": 1331 + }, + { + "epoch": 0.14677685950413222, + "grad_norm": 6.049683094024658, + "learning_rate": 9.536534449539966e-06, + "loss": 0.472, + "step": 1332 + }, + { + "epoch": 0.1468870523415978, + "grad_norm": 8.583109855651855, + "learning_rate": 9.535799008327711e-06, + "loss": 0.4451, + "step": 1333 + }, + { + "epoch": 0.14699724517906337, + "grad_norm": 5.552403450012207, + "learning_rate": 9.535063012471793e-06, + "loss": 0.3823, + "step": 1334 + }, + { + "epoch": 0.14710743801652892, + "grad_norm": 7.0078606605529785, + "learning_rate": 9.53432646206221e-06, + "loss": 0.4477, + "step": 1335 + }, + { + "epoch": 0.1472176308539945, + "grad_norm": 8.977633476257324, + "learning_rate": 9.533589357189026e-06, + "loss": 0.4246, + "step": 1336 + }, + { + "epoch": 0.14732782369146005, + "grad_norm": 11.107526779174805, + "learning_rate": 9.53285169794238e-06, + "loss": 0.5381, + "step": 1337 + }, + { + "epoch": 0.14743801652892563, + "grad_norm": 8.975915908813477, + "learning_rate": 9.532113484412468e-06, + "loss": 0.4812, + "step": 1338 + }, + { + "epoch": 0.14754820936639118, + "grad_norm": 7.837002277374268, + "learning_rate": 9.531374716689565e-06, + "loss": 0.5246, + "step": 1339 + }, + { + "epoch": 0.14765840220385676, + "grad_norm": 8.374943733215332, + "learning_rate": 9.530635394864006e-06, + "loss": 0.4484, + "step": 1340 + }, + { + "epoch": 0.1477685950413223, + "grad_norm": 7.772285461425781, + "learning_rate": 9.529895519026196e-06, + "loss": 0.4227, + "step": 1341 + }, + { + "epoch": 0.1478787878787879, + "grad_norm": 4.9404072761535645, + "learning_rate": 9.529155089266609e-06, + "loss": 0.4677, + "step": 1342 + }, + { + "epoch": 0.14798898071625344, + "grad_norm": 9.528486251831055, + "learning_rate": 9.528414105675785e-06, + "loss": 0.5177, + "step": 1343 + }, + { + "epoch": 0.14809917355371902, + "grad_norm": 6.80415153503418, + "learning_rate": 9.527672568344332e-06, + "loss": 0.5118, + "step": 1344 + }, + { + "epoch": 0.14820936639118457, + "grad_norm": 7.836904525756836, + "learning_rate": 9.52693047736293e-06, + "loss": 0.4326, + "step": 1345 + }, + { + "epoch": 0.14831955922865014, + "grad_norm": 6.0371012687683105, + "learning_rate": 9.526187832822318e-06, + "loss": 0.4778, + "step": 1346 + }, + { + "epoch": 0.1484297520661157, + "grad_norm": 5.394172668457031, + "learning_rate": 9.52544463481331e-06, + "loss": 0.436, + "step": 1347 + }, + { + "epoch": 0.14853994490358127, + "grad_norm": 7.3542351722717285, + "learning_rate": 9.524700883426786e-06, + "loss": 0.4585, + "step": 1348 + }, + { + "epoch": 0.14865013774104682, + "grad_norm": 7.725783824920654, + "learning_rate": 9.523956578753688e-06, + "loss": 0.4353, + "step": 1349 + }, + { + "epoch": 0.1487603305785124, + "grad_norm": 9.414610862731934, + "learning_rate": 9.523211720885038e-06, + "loss": 0.4457, + "step": 1350 + }, + { + "epoch": 0.14887052341597795, + "grad_norm": 5.255350112915039, + "learning_rate": 9.522466309911913e-06, + "loss": 0.4497, + "step": 1351 + }, + { 
+ "epoch": 0.14898071625344353, + "grad_norm": 5.908344745635986, + "learning_rate": 9.521720345925464e-06, + "loss": 0.4214, + "step": 1352 + }, + { + "epoch": 0.14909090909090908, + "grad_norm": 10.101250648498535, + "learning_rate": 9.52097382901691e-06, + "loss": 0.4965, + "step": 1353 + }, + { + "epoch": 0.14920110192837466, + "grad_norm": 5.879767417907715, + "learning_rate": 9.520226759277536e-06, + "loss": 0.4725, + "step": 1354 + }, + { + "epoch": 0.1493112947658402, + "grad_norm": 7.752197265625, + "learning_rate": 9.519479136798693e-06, + "loss": 0.4929, + "step": 1355 + }, + { + "epoch": 0.14942148760330579, + "grad_norm": 7.521350860595703, + "learning_rate": 9.518730961671802e-06, + "loss": 0.379, + "step": 1356 + }, + { + "epoch": 0.14953168044077134, + "grad_norm": 6.87856912612915, + "learning_rate": 9.51798223398835e-06, + "loss": 0.4206, + "step": 1357 + }, + { + "epoch": 0.14964187327823691, + "grad_norm": 7.299850940704346, + "learning_rate": 9.517232953839894e-06, + "loss": 0.4177, + "step": 1358 + }, + { + "epoch": 0.1497520661157025, + "grad_norm": 14.304620742797852, + "learning_rate": 9.516483121318057e-06, + "loss": 0.5049, + "step": 1359 + }, + { + "epoch": 0.14986225895316804, + "grad_norm": 5.096722602844238, + "learning_rate": 9.515732736514526e-06, + "loss": 0.4325, + "step": 1360 + }, + { + "epoch": 0.14997245179063362, + "grad_norm": 5.899183750152588, + "learning_rate": 9.514981799521066e-06, + "loss": 0.3767, + "step": 1361 + }, + { + "epoch": 0.15008264462809917, + "grad_norm": 4.823487758636475, + "learning_rate": 9.514230310429498e-06, + "loss": 0.3536, + "step": 1362 + }, + { + "epoch": 0.15019283746556475, + "grad_norm": 11.252744674682617, + "learning_rate": 9.513478269331713e-06, + "loss": 0.4375, + "step": 1363 + }, + { + "epoch": 0.1503030303030303, + "grad_norm": 13.921759605407715, + "learning_rate": 9.512725676319677e-06, + "loss": 0.496, + "step": 1364 + }, + { + "epoch": 0.15041322314049588, + "grad_norm": 10.286101341247559, + "learning_rate": 9.511972531485414e-06, + "loss": 0.4727, + "step": 1365 + }, + { + "epoch": 0.15052341597796143, + "grad_norm": 4.919619083404541, + "learning_rate": 9.51121883492102e-06, + "loss": 0.4386, + "step": 1366 + }, + { + "epoch": 0.150633608815427, + "grad_norm": 8.442239761352539, + "learning_rate": 9.51046458671866e-06, + "loss": 0.4446, + "step": 1367 + }, + { + "epoch": 0.15074380165289256, + "grad_norm": 9.221701622009277, + "learning_rate": 9.509709786970564e-06, + "loss": 0.4601, + "step": 1368 + }, + { + "epoch": 0.15085399449035813, + "grad_norm": 11.813467979431152, + "learning_rate": 9.50895443576903e-06, + "loss": 0.5873, + "step": 1369 + }, + { + "epoch": 0.15096418732782368, + "grad_norm": 5.503012657165527, + "learning_rate": 9.50819853320642e-06, + "loss": 0.4285, + "step": 1370 + }, + { + "epoch": 0.15107438016528926, + "grad_norm": 8.24340534210205, + "learning_rate": 9.507442079375171e-06, + "loss": 0.4181, + "step": 1371 + }, + { + "epoch": 0.1511845730027548, + "grad_norm": 6.293192386627197, + "learning_rate": 9.506685074367782e-06, + "loss": 0.4747, + "step": 1372 + }, + { + "epoch": 0.1512947658402204, + "grad_norm": 9.944341659545898, + "learning_rate": 9.505927518276821e-06, + "loss": 0.4881, + "step": 1373 + }, + { + "epoch": 0.15140495867768594, + "grad_norm": 7.372109889984131, + "learning_rate": 9.505169411194921e-06, + "loss": 0.4513, + "step": 1374 + }, + { + "epoch": 0.15151515151515152, + "grad_norm": 6.490156650543213, + "learning_rate": 9.504410753214786e-06, + "loss": 
0.356, + "step": 1375 + }, + { + "epoch": 0.15162534435261707, + "grad_norm": 3.989199638366699, + "learning_rate": 9.503651544429186e-06, + "loss": 0.4293, + "step": 1376 + }, + { + "epoch": 0.15173553719008265, + "grad_norm": 5.597970962524414, + "learning_rate": 9.502891784930957e-06, + "loss": 0.4755, + "step": 1377 + }, + { + "epoch": 0.1518457300275482, + "grad_norm": 7.86866569519043, + "learning_rate": 9.502131474813006e-06, + "loss": 0.4134, + "step": 1378 + }, + { + "epoch": 0.15195592286501378, + "grad_norm": 7.2889556884765625, + "learning_rate": 9.5013706141683e-06, + "loss": 0.4354, + "step": 1379 + }, + { + "epoch": 0.15206611570247933, + "grad_norm": 15.950618743896484, + "learning_rate": 9.500609203089883e-06, + "loss": 0.3591, + "step": 1380 + }, + { + "epoch": 0.1521763085399449, + "grad_norm": 6.034360885620117, + "learning_rate": 9.499847241670857e-06, + "loss": 0.3366, + "step": 1381 + }, + { + "epoch": 0.15228650137741045, + "grad_norm": 13.032929420471191, + "learning_rate": 9.499084730004398e-06, + "loss": 0.4793, + "step": 1382 + }, + { + "epoch": 0.15239669421487603, + "grad_norm": 9.218709945678711, + "learning_rate": 9.498321668183749e-06, + "loss": 0.4686, + "step": 1383 + }, + { + "epoch": 0.1525068870523416, + "grad_norm": 12.696232795715332, + "learning_rate": 9.497558056302214e-06, + "loss": 0.5946, + "step": 1384 + }, + { + "epoch": 0.15261707988980716, + "grad_norm": 8.161370277404785, + "learning_rate": 9.496793894453171e-06, + "loss": 0.4742, + "step": 1385 + }, + { + "epoch": 0.15272727272727274, + "grad_norm": 11.497420310974121, + "learning_rate": 9.496029182730063e-06, + "loss": 0.6843, + "step": 1386 + }, + { + "epoch": 0.1528374655647383, + "grad_norm": 12.493870735168457, + "learning_rate": 9.495263921226399e-06, + "loss": 0.5355, + "step": 1387 + }, + { + "epoch": 0.15294765840220387, + "grad_norm": 4.5893473625183105, + "learning_rate": 9.494498110035756e-06, + "loss": 0.4677, + "step": 1388 + }, + { + "epoch": 0.15305785123966942, + "grad_norm": 7.156816005706787, + "learning_rate": 9.49373174925178e-06, + "loss": 0.4558, + "step": 1389 + }, + { + "epoch": 0.153168044077135, + "grad_norm": 9.895354270935059, + "learning_rate": 9.49296483896818e-06, + "loss": 0.5318, + "step": 1390 + }, + { + "epoch": 0.15327823691460055, + "grad_norm": 7.410556793212891, + "learning_rate": 9.492197379278738e-06, + "loss": 0.4879, + "step": 1391 + }, + { + "epoch": 0.15338842975206612, + "grad_norm": 4.698634624481201, + "learning_rate": 9.491429370277298e-06, + "loss": 0.3865, + "step": 1392 + }, + { + "epoch": 0.15349862258953167, + "grad_norm": 9.832266807556152, + "learning_rate": 9.490660812057772e-06, + "loss": 0.4181, + "step": 1393 + }, + { + "epoch": 0.15360881542699725, + "grad_norm": 6.800455093383789, + "learning_rate": 9.489891704714145e-06, + "loss": 0.517, + "step": 1394 + }, + { + "epoch": 0.1537190082644628, + "grad_norm": 6.298616886138916, + "learning_rate": 9.489122048340457e-06, + "loss": 0.3812, + "step": 1395 + }, + { + "epoch": 0.15382920110192838, + "grad_norm": 7.242441654205322, + "learning_rate": 9.488351843030832e-06, + "loss": 0.5227, + "step": 1396 + }, + { + "epoch": 0.15393939393939393, + "grad_norm": 6.825589656829834, + "learning_rate": 9.487581088879443e-06, + "loss": 0.4433, + "step": 1397 + }, + { + "epoch": 0.1540495867768595, + "grad_norm": 6.0639519691467285, + "learning_rate": 9.486809785980544e-06, + "loss": 0.4206, + "step": 1398 + }, + { + "epoch": 0.15415977961432506, + "grad_norm": 6.887888431549072, + 
"learning_rate": 9.486037934428451e-06, + "loss": 0.5139, + "step": 1399 + }, + { + "epoch": 0.15426997245179064, + "grad_norm": 9.926239967346191, + "learning_rate": 9.485265534317544e-06, + "loss": 0.3926, + "step": 1400 + }, + { + "epoch": 0.1543801652892562, + "grad_norm": 6.610692977905273, + "learning_rate": 9.484492585742275e-06, + "loss": 0.3983, + "step": 1401 + }, + { + "epoch": 0.15449035812672177, + "grad_norm": 6.125588893890381, + "learning_rate": 9.483719088797161e-06, + "loss": 0.4405, + "step": 1402 + }, + { + "epoch": 0.15460055096418732, + "grad_norm": 11.86706256866455, + "learning_rate": 9.482945043576787e-06, + "loss": 0.4647, + "step": 1403 + }, + { + "epoch": 0.1547107438016529, + "grad_norm": 10.825469017028809, + "learning_rate": 9.482170450175804e-06, + "loss": 0.4889, + "step": 1404 + }, + { + "epoch": 0.15482093663911844, + "grad_norm": 8.743223190307617, + "learning_rate": 9.481395308688928e-06, + "loss": 0.524, + "step": 1405 + }, + { + "epoch": 0.15493112947658402, + "grad_norm": 10.586660385131836, + "learning_rate": 9.48061961921095e-06, + "loss": 0.4492, + "step": 1406 + }, + { + "epoch": 0.15504132231404957, + "grad_norm": 11.887472152709961, + "learning_rate": 9.479843381836714e-06, + "loss": 0.5481, + "step": 1407 + }, + { + "epoch": 0.15515151515151515, + "grad_norm": 10.426525115966797, + "learning_rate": 9.479066596661146e-06, + "loss": 0.4112, + "step": 1408 + }, + { + "epoch": 0.15526170798898073, + "grad_norm": 7.2527689933776855, + "learning_rate": 9.47828926377923e-06, + "loss": 0.4969, + "step": 1409 + }, + { + "epoch": 0.15537190082644628, + "grad_norm": 5.922959804534912, + "learning_rate": 9.47751138328602e-06, + "loss": 0.4402, + "step": 1410 + }, + { + "epoch": 0.15548209366391186, + "grad_norm": 12.400938987731934, + "learning_rate": 9.476732955276637e-06, + "loss": 0.4873, + "step": 1411 + }, + { + "epoch": 0.1555922865013774, + "grad_norm": 9.209698677062988, + "learning_rate": 9.475953979846267e-06, + "loss": 0.4264, + "step": 1412 + }, + { + "epoch": 0.155702479338843, + "grad_norm": 5.971480846405029, + "learning_rate": 9.475174457090162e-06, + "loss": 0.4584, + "step": 1413 + }, + { + "epoch": 0.15581267217630854, + "grad_norm": 6.5802788734436035, + "learning_rate": 9.474394387103648e-06, + "loss": 0.3941, + "step": 1414 + }, + { + "epoch": 0.15592286501377411, + "grad_norm": 7.03208589553833, + "learning_rate": 9.473613769982108e-06, + "loss": 0.3746, + "step": 1415 + }, + { + "epoch": 0.15603305785123966, + "grad_norm": 9.911009788513184, + "learning_rate": 9.472832605821003e-06, + "loss": 0.4821, + "step": 1416 + }, + { + "epoch": 0.15614325068870524, + "grad_norm": 4.675900936126709, + "learning_rate": 9.472050894715849e-06, + "loss": 0.3468, + "step": 1417 + }, + { + "epoch": 0.1562534435261708, + "grad_norm": 8.639227867126465, + "learning_rate": 9.471268636762237e-06, + "loss": 0.5741, + "step": 1418 + }, + { + "epoch": 0.15636363636363637, + "grad_norm": 5.883412837982178, + "learning_rate": 9.470485832055822e-06, + "loss": 0.4819, + "step": 1419 + }, + { + "epoch": 0.15647382920110192, + "grad_norm": 7.617645740509033, + "learning_rate": 9.469702480692326e-06, + "loss": 0.4767, + "step": 1420 + }, + { + "epoch": 0.1565840220385675, + "grad_norm": 6.042135715484619, + "learning_rate": 9.46891858276754e-06, + "loss": 0.481, + "step": 1421 + }, + { + "epoch": 0.15669421487603305, + "grad_norm": 8.383898735046387, + "learning_rate": 9.468134138377321e-06, + "loss": 0.4136, + "step": 1422 + }, + { + "epoch": 
0.15680440771349863, + "grad_norm": 6.613709926605225, + "learning_rate": 9.467349147617589e-06, + "loss": 0.4291, + "step": 1423 + }, + { + "epoch": 0.15691460055096418, + "grad_norm": 6.821608543395996, + "learning_rate": 9.466563610584336e-06, + "loss": 0.447, + "step": 1424 + }, + { + "epoch": 0.15702479338842976, + "grad_norm": 8.442924499511719, + "learning_rate": 9.465777527373616e-06, + "loss": 0.423, + "step": 1425 + }, + { + "epoch": 0.1571349862258953, + "grad_norm": 6.351988315582275, + "learning_rate": 9.464990898081554e-06, + "loss": 0.4263, + "step": 1426 + }, + { + "epoch": 0.15724517906336088, + "grad_norm": 12.619184494018555, + "learning_rate": 9.46420372280434e-06, + "loss": 0.5921, + "step": 1427 + }, + { + "epoch": 0.15735537190082644, + "grad_norm": 14.88255500793457, + "learning_rate": 9.46341600163823e-06, + "loss": 0.5316, + "step": 1428 + }, + { + "epoch": 0.157465564738292, + "grad_norm": 9.838242530822754, + "learning_rate": 9.46262773467955e-06, + "loss": 0.507, + "step": 1429 + }, + { + "epoch": 0.15757575757575756, + "grad_norm": 11.328462600708008, + "learning_rate": 9.46183892202469e-06, + "loss": 0.4047, + "step": 1430 + }, + { + "epoch": 0.15768595041322314, + "grad_norm": 7.212308883666992, + "learning_rate": 9.461049563770102e-06, + "loss": 0.4398, + "step": 1431 + }, + { + "epoch": 0.1577961432506887, + "grad_norm": 6.799179553985596, + "learning_rate": 9.460259660012316e-06, + "loss": 0.4984, + "step": 1432 + }, + { + "epoch": 0.15790633608815427, + "grad_norm": 11.3588228225708, + "learning_rate": 9.459469210847919e-06, + "loss": 0.5055, + "step": 1433 + }, + { + "epoch": 0.15801652892561985, + "grad_norm": 5.606801509857178, + "learning_rate": 9.45867821637357e-06, + "loss": 0.4606, + "step": 1434 + }, + { + "epoch": 0.1581267217630854, + "grad_norm": 6.175350666046143, + "learning_rate": 9.457886676685992e-06, + "loss": 0.4075, + "step": 1435 + }, + { + "epoch": 0.15823691460055098, + "grad_norm": 7.256425857543945, + "learning_rate": 9.457094591881975e-06, + "loss": 0.4129, + "step": 1436 + }, + { + "epoch": 0.15834710743801653, + "grad_norm": 8.640177726745605, + "learning_rate": 9.456301962058377e-06, + "loss": 0.3757, + "step": 1437 + }, + { + "epoch": 0.1584573002754821, + "grad_norm": 8.584975242614746, + "learning_rate": 9.455508787312123e-06, + "loss": 0.5124, + "step": 1438 + }, + { + "epoch": 0.15856749311294766, + "grad_norm": 8.673279762268066, + "learning_rate": 9.454715067740202e-06, + "loss": 0.4775, + "step": 1439 + }, + { + "epoch": 0.15867768595041323, + "grad_norm": 3.941206693649292, + "learning_rate": 9.45392080343967e-06, + "loss": 0.4469, + "step": 1440 + }, + { + "epoch": 0.15878787878787878, + "grad_norm": 5.717761516571045, + "learning_rate": 9.453125994507654e-06, + "loss": 0.4708, + "step": 1441 + }, + { + "epoch": 0.15889807162534436, + "grad_norm": 11.770964622497559, + "learning_rate": 9.452330641041341e-06, + "loss": 0.4513, + "step": 1442 + }, + { + "epoch": 0.1590082644628099, + "grad_norm": 9.43468952178955, + "learning_rate": 9.45153474313799e-06, + "loss": 0.5151, + "step": 1443 + }, + { + "epoch": 0.1591184573002755, + "grad_norm": 8.744233131408691, + "learning_rate": 9.450738300894924e-06, + "loss": 0.4464, + "step": 1444 + }, + { + "epoch": 0.15922865013774104, + "grad_norm": 8.976760864257812, + "learning_rate": 9.449941314409532e-06, + "loss": 0.3869, + "step": 1445 + }, + { + "epoch": 0.15933884297520662, + "grad_norm": 22.744741439819336, + "learning_rate": 9.449143783779273e-06, + "loss": 0.4933, + 
"step": 1446 + }, + { + "epoch": 0.15944903581267217, + "grad_norm": 8.330665588378906, + "learning_rate": 9.448345709101667e-06, + "loss": 0.4662, + "step": 1447 + }, + { + "epoch": 0.15955922865013775, + "grad_norm": 6.56828498840332, + "learning_rate": 9.447547090474306e-06, + "loss": 0.4644, + "step": 1448 + }, + { + "epoch": 0.1596694214876033, + "grad_norm": 7.922265529632568, + "learning_rate": 9.446747927994844e-06, + "loss": 0.5257, + "step": 1449 + }, + { + "epoch": 0.15977961432506887, + "grad_norm": 10.60759449005127, + "learning_rate": 9.445948221761007e-06, + "loss": 0.4497, + "step": 1450 + }, + { + "epoch": 0.15988980716253443, + "grad_norm": 18.729232788085938, + "learning_rate": 9.445147971870581e-06, + "loss": 0.5753, + "step": 1451 + }, + { + "epoch": 0.16, + "grad_norm": 13.113868713378906, + "learning_rate": 9.444347178421423e-06, + "loss": 0.4972, + "step": 1452 + }, + { + "epoch": 0.16011019283746555, + "grad_norm": 5.94268798828125, + "learning_rate": 9.443545841511456e-06, + "loss": 0.4448, + "step": 1453 + }, + { + "epoch": 0.16022038567493113, + "grad_norm": 6.834156036376953, + "learning_rate": 9.442743961238665e-06, + "loss": 0.4677, + "step": 1454 + }, + { + "epoch": 0.16033057851239668, + "grad_norm": 5.015316486358643, + "learning_rate": 9.44194153770111e-06, + "loss": 0.3224, + "step": 1455 + }, + { + "epoch": 0.16044077134986226, + "grad_norm": 17.945768356323242, + "learning_rate": 9.44113857099691e-06, + "loss": 0.4532, + "step": 1456 + }, + { + "epoch": 0.1605509641873278, + "grad_norm": 12.237314224243164, + "learning_rate": 9.44033506122425e-06, + "loss": 0.4405, + "step": 1457 + }, + { + "epoch": 0.1606611570247934, + "grad_norm": 14.865547180175781, + "learning_rate": 9.439531008481392e-06, + "loss": 0.534, + "step": 1458 + }, + { + "epoch": 0.16077134986225897, + "grad_norm": 8.56263256072998, + "learning_rate": 9.438726412866648e-06, + "loss": 0.436, + "step": 1459 + }, + { + "epoch": 0.16088154269972452, + "grad_norm": 6.990463733673096, + "learning_rate": 9.43792127447841e-06, + "loss": 0.4856, + "step": 1460 + }, + { + "epoch": 0.1609917355371901, + "grad_norm": 9.773675918579102, + "learning_rate": 9.437115593415129e-06, + "loss": 0.5639, + "step": 1461 + }, + { + "epoch": 0.16110192837465565, + "grad_norm": 7.388878345489502, + "learning_rate": 9.436309369775328e-06, + "loss": 0.4377, + "step": 1462 + }, + { + "epoch": 0.16121212121212122, + "grad_norm": 6.132890701293945, + "learning_rate": 9.43550260365759e-06, + "loss": 0.4315, + "step": 1463 + }, + { + "epoch": 0.16132231404958677, + "grad_norm": 9.686576843261719, + "learning_rate": 9.434695295160568e-06, + "loss": 0.4449, + "step": 1464 + }, + { + "epoch": 0.16143250688705235, + "grad_norm": 8.422321319580078, + "learning_rate": 9.433887444382982e-06, + "loss": 0.498, + "step": 1465 + }, + { + "epoch": 0.1615426997245179, + "grad_norm": 11.859896659851074, + "learning_rate": 9.433079051423616e-06, + "loss": 0.5934, + "step": 1466 + }, + { + "epoch": 0.16165289256198348, + "grad_norm": 5.726517677307129, + "learning_rate": 9.432270116381323e-06, + "loss": 0.4092, + "step": 1467 + }, + { + "epoch": 0.16176308539944903, + "grad_norm": 10.08298397064209, + "learning_rate": 9.431460639355019e-06, + "loss": 0.421, + "step": 1468 + }, + { + "epoch": 0.1618732782369146, + "grad_norm": 8.94924259185791, + "learning_rate": 9.430650620443688e-06, + "loss": 0.5167, + "step": 1469 + }, + { + "epoch": 0.16198347107438016, + "grad_norm": 8.591166496276855, + "learning_rate": 9.42984005974638e-06, + 
"loss": 0.5073, + "step": 1470 + }, + { + "epoch": 0.16209366391184574, + "grad_norm": 7.914938926696777, + "learning_rate": 9.429028957362215e-06, + "loss": 0.4906, + "step": 1471 + }, + { + "epoch": 0.1622038567493113, + "grad_norm": 5.708611011505127, + "learning_rate": 9.428217313390371e-06, + "loss": 0.5026, + "step": 1472 + }, + { + "epoch": 0.16231404958677687, + "grad_norm": 7.096518039703369, + "learning_rate": 9.427405127930097e-06, + "loss": 0.4185, + "step": 1473 + }, + { + "epoch": 0.16242424242424242, + "grad_norm": 7.8787336349487305, + "learning_rate": 9.426592401080712e-06, + "loss": 0.4419, + "step": 1474 + }, + { + "epoch": 0.162534435261708, + "grad_norm": 8.509092330932617, + "learning_rate": 9.425779132941595e-06, + "loss": 0.534, + "step": 1475 + }, + { + "epoch": 0.16264462809917354, + "grad_norm": 4.901118755340576, + "learning_rate": 9.424965323612195e-06, + "loss": 0.3961, + "step": 1476 + }, + { + "epoch": 0.16275482093663912, + "grad_norm": 5.733903884887695, + "learning_rate": 9.424150973192023e-06, + "loss": 0.4299, + "step": 1477 + }, + { + "epoch": 0.16286501377410467, + "grad_norm": 14.781890869140625, + "learning_rate": 9.42333608178066e-06, + "loss": 0.5192, + "step": 1478 + }, + { + "epoch": 0.16297520661157025, + "grad_norm": 6.482476711273193, + "learning_rate": 9.422520649477754e-06, + "loss": 0.4981, + "step": 1479 + }, + { + "epoch": 0.1630853994490358, + "grad_norm": 10.953235626220703, + "learning_rate": 9.421704676383014e-06, + "loss": 0.51, + "step": 1480 + }, + { + "epoch": 0.16319559228650138, + "grad_norm": 9.130579948425293, + "learning_rate": 9.420888162596221e-06, + "loss": 0.4976, + "step": 1481 + }, + { + "epoch": 0.16330578512396693, + "grad_norm": 6.881532192230225, + "learning_rate": 9.420071108217216e-06, + "loss": 0.4551, + "step": 1482 + }, + { + "epoch": 0.1634159779614325, + "grad_norm": 5.993616104125977, + "learning_rate": 9.419253513345916e-06, + "loss": 0.4523, + "step": 1483 + }, + { + "epoch": 0.16352617079889809, + "grad_norm": 8.481162071228027, + "learning_rate": 9.41843537808229e-06, + "loss": 0.5667, + "step": 1484 + }, + { + "epoch": 0.16363636363636364, + "grad_norm": 7.955751419067383, + "learning_rate": 9.417616702526387e-06, + "loss": 0.4281, + "step": 1485 + }, + { + "epoch": 0.1637465564738292, + "grad_norm": 5.160492420196533, + "learning_rate": 9.41679748677831e-06, + "loss": 0.4632, + "step": 1486 + }, + { + "epoch": 0.16385674931129476, + "grad_norm": 6.2219767570495605, + "learning_rate": 9.415977730938237e-06, + "loss": 0.4349, + "step": 1487 + }, + { + "epoch": 0.16396694214876034, + "grad_norm": 11.044354438781738, + "learning_rate": 9.41515743510641e-06, + "loss": 0.5752, + "step": 1488 + }, + { + "epoch": 0.1640771349862259, + "grad_norm": 5.493617057800293, + "learning_rate": 9.414336599383133e-06, + "loss": 0.4504, + "step": 1489 + }, + { + "epoch": 0.16418732782369147, + "grad_norm": 5.468198776245117, + "learning_rate": 9.413515223868782e-06, + "loss": 0.4254, + "step": 1490 + }, + { + "epoch": 0.16429752066115702, + "grad_norm": 5.371224403381348, + "learning_rate": 9.412693308663793e-06, + "loss": 0.3265, + "step": 1491 + }, + { + "epoch": 0.1644077134986226, + "grad_norm": 5.7264404296875, + "learning_rate": 9.411870853868673e-06, + "loss": 0.4068, + "step": 1492 + }, + { + "epoch": 0.16451790633608815, + "grad_norm": 11.388520240783691, + "learning_rate": 9.41104785958399e-06, + "loss": 0.5433, + "step": 1493 + }, + { + "epoch": 0.16462809917355373, + "grad_norm": 12.780411720275879, + 
"learning_rate": 9.410224325910384e-06, + "loss": 0.5371, + "step": 1494 + }, + { + "epoch": 0.16473829201101928, + "grad_norm": 8.704211235046387, + "learning_rate": 9.409400252948558e-06, + "loss": 0.431, + "step": 1495 + }, + { + "epoch": 0.16484848484848486, + "grad_norm": 7.5440826416015625, + "learning_rate": 9.40857564079928e-06, + "loss": 0.3368, + "step": 1496 + }, + { + "epoch": 0.1649586776859504, + "grad_norm": 6.04235315322876, + "learning_rate": 9.407750489563381e-06, + "loss": 0.414, + "step": 1497 + }, + { + "epoch": 0.16506887052341598, + "grad_norm": 7.066317081451416, + "learning_rate": 9.406924799341767e-06, + "loss": 0.4522, + "step": 1498 + }, + { + "epoch": 0.16517906336088153, + "grad_norm": 10.957195281982422, + "learning_rate": 9.406098570235402e-06, + "loss": 0.402, + "step": 1499 + }, + { + "epoch": 0.1652892561983471, + "grad_norm": 8.854328155517578, + "learning_rate": 9.405271802345319e-06, + "loss": 0.4694, + "step": 1500 + }, + { + "epoch": 0.16539944903581266, + "grad_norm": 11.334278106689453, + "learning_rate": 9.404444495772615e-06, + "loss": 0.4365, + "step": 1501 + }, + { + "epoch": 0.16550964187327824, + "grad_norm": 12.177138328552246, + "learning_rate": 9.403616650618456e-06, + "loss": 0.6228, + "step": 1502 + }, + { + "epoch": 0.1656198347107438, + "grad_norm": 6.889142990112305, + "learning_rate": 9.402788266984071e-06, + "loss": 0.412, + "step": 1503 + }, + { + "epoch": 0.16573002754820937, + "grad_norm": 8.409960746765137, + "learning_rate": 9.401959344970756e-06, + "loss": 0.393, + "step": 1504 + }, + { + "epoch": 0.16584022038567492, + "grad_norm": 10.087620735168457, + "learning_rate": 9.401129884679874e-06, + "loss": 0.4533, + "step": 1505 + }, + { + "epoch": 0.1659504132231405, + "grad_norm": 8.47231674194336, + "learning_rate": 9.40029988621285e-06, + "loss": 0.4001, + "step": 1506 + }, + { + "epoch": 0.16606060606060605, + "grad_norm": 4.585776329040527, + "learning_rate": 9.39946934967118e-06, + "loss": 0.456, + "step": 1507 + }, + { + "epoch": 0.16617079889807163, + "grad_norm": 5.115619659423828, + "learning_rate": 9.39863827515642e-06, + "loss": 0.4247, + "step": 1508 + }, + { + "epoch": 0.1662809917355372, + "grad_norm": 7.151246547698975, + "learning_rate": 9.397806662770198e-06, + "loss": 0.2597, + "step": 1509 + }, + { + "epoch": 0.16639118457300275, + "grad_norm": 6.416628360748291, + "learning_rate": 9.396974512614203e-06, + "loss": 0.3686, + "step": 1510 + }, + { + "epoch": 0.16650137741046833, + "grad_norm": 8.886362075805664, + "learning_rate": 9.396141824790193e-06, + "loss": 0.499, + "step": 1511 + }, + { + "epoch": 0.16661157024793388, + "grad_norm": 10.70858383178711, + "learning_rate": 9.395308599399987e-06, + "loss": 0.47, + "step": 1512 + }, + { + "epoch": 0.16672176308539946, + "grad_norm": 9.563263893127441, + "learning_rate": 9.394474836545477e-06, + "loss": 0.4403, + "step": 1513 + }, + { + "epoch": 0.166831955922865, + "grad_norm": 5.888492584228516, + "learning_rate": 9.393640536328613e-06, + "loss": 0.3563, + "step": 1514 + }, + { + "epoch": 0.1669421487603306, + "grad_norm": 7.1910810470581055, + "learning_rate": 9.392805698851417e-06, + "loss": 0.4313, + "step": 1515 + }, + { + "epoch": 0.16705234159779614, + "grad_norm": 6.051369667053223, + "learning_rate": 9.391970324215973e-06, + "loss": 0.4821, + "step": 1516 + }, + { + "epoch": 0.16716253443526172, + "grad_norm": 6.591337203979492, + "learning_rate": 9.391134412524432e-06, + "loss": 0.394, + "step": 1517 + }, + { + "epoch": 0.16727272727272727, + 
"grad_norm": 7.165696620941162, + "learning_rate": 9.390297963879008e-06, + "loss": 0.431, + "step": 1518 + }, + { + "epoch": 0.16738292011019285, + "grad_norm": 8.706212997436523, + "learning_rate": 9.38946097838199e-06, + "loss": 0.4573, + "step": 1519 + }, + { + "epoch": 0.1674931129476584, + "grad_norm": 6.006828784942627, + "learning_rate": 9.388623456135717e-06, + "loss": 0.3732, + "step": 1520 + }, + { + "epoch": 0.16760330578512397, + "grad_norm": 7.291321277618408, + "learning_rate": 9.387785397242608e-06, + "loss": 0.4585, + "step": 1521 + }, + { + "epoch": 0.16771349862258952, + "grad_norm": 8.707494735717773, + "learning_rate": 9.386946801805141e-06, + "loss": 0.4481, + "step": 1522 + }, + { + "epoch": 0.1678236914600551, + "grad_norm": 7.811737537384033, + "learning_rate": 9.386107669925858e-06, + "loss": 0.4742, + "step": 1523 + }, + { + "epoch": 0.16793388429752065, + "grad_norm": 6.985987663269043, + "learning_rate": 9.385268001707373e-06, + "loss": 0.3789, + "step": 1524 + }, + { + "epoch": 0.16804407713498623, + "grad_norm": 7.202207088470459, + "learning_rate": 9.38442779725236e-06, + "loss": 0.4366, + "step": 1525 + }, + { + "epoch": 0.16815426997245178, + "grad_norm": 7.785396575927734, + "learning_rate": 9.38358705666356e-06, + "loss": 0.5049, + "step": 1526 + }, + { + "epoch": 0.16826446280991736, + "grad_norm": 5.80039119720459, + "learning_rate": 9.38274578004378e-06, + "loss": 0.4372, + "step": 1527 + }, + { + "epoch": 0.1683746556473829, + "grad_norm": 8.934199333190918, + "learning_rate": 9.381903967495893e-06, + "loss": 0.48, + "step": 1528 + }, + { + "epoch": 0.1684848484848485, + "grad_norm": 9.693472862243652, + "learning_rate": 9.381061619122835e-06, + "loss": 0.417, + "step": 1529 + }, + { + "epoch": 0.16859504132231404, + "grad_norm": 6.487861156463623, + "learning_rate": 9.380218735027614e-06, + "loss": 0.4508, + "step": 1530 + }, + { + "epoch": 0.16870523415977962, + "grad_norm": 11.514814376831055, + "learning_rate": 9.379375315313292e-06, + "loss": 0.5513, + "step": 1531 + }, + { + "epoch": 0.16881542699724517, + "grad_norm": 6.065979480743408, + "learning_rate": 9.378531360083011e-06, + "loss": 0.4021, + "step": 1532 + }, + { + "epoch": 0.16892561983471074, + "grad_norm": 12.198055267333984, + "learning_rate": 9.377686869439967e-06, + "loss": 0.5575, + "step": 1533 + }, + { + "epoch": 0.16903581267217632, + "grad_norm": 6.628361701965332, + "learning_rate": 9.376841843487427e-06, + "loss": 0.4029, + "step": 1534 + }, + { + "epoch": 0.16914600550964187, + "grad_norm": 7.487825393676758, + "learning_rate": 9.37599628232872e-06, + "loss": 0.4811, + "step": 1535 + }, + { + "epoch": 0.16925619834710745, + "grad_norm": 5.830073833465576, + "learning_rate": 9.375150186067243e-06, + "loss": 0.3705, + "step": 1536 + }, + { + "epoch": 0.169366391184573, + "grad_norm": 8.890256881713867, + "learning_rate": 9.374303554806458e-06, + "loss": 0.4355, + "step": 1537 + }, + { + "epoch": 0.16947658402203858, + "grad_norm": 12.017580032348633, + "learning_rate": 9.373456388649893e-06, + "loss": 0.4822, + "step": 1538 + }, + { + "epoch": 0.16958677685950413, + "grad_norm": 5.915216445922852, + "learning_rate": 9.37260868770114e-06, + "loss": 0.4844, + "step": 1539 + }, + { + "epoch": 0.1696969696969697, + "grad_norm": 12.784900665283203, + "learning_rate": 9.371760452063857e-06, + "loss": 0.5914, + "step": 1540 + }, + { + "epoch": 0.16980716253443526, + "grad_norm": 7.340192794799805, + "learning_rate": 9.370911681841768e-06, + "loss": 0.4692, + "step": 1541 + }, + { + 
"epoch": 0.16991735537190084, + "grad_norm": 5.451550006866455, + "learning_rate": 9.37006237713866e-06, + "loss": 0.4002, + "step": 1542 + }, + { + "epoch": 0.17002754820936639, + "grad_norm": 6.147626876831055, + "learning_rate": 9.369212538058389e-06, + "loss": 0.4145, + "step": 1543 + }, + { + "epoch": 0.17013774104683196, + "grad_norm": 8.239680290222168, + "learning_rate": 9.368362164704873e-06, + "loss": 0.4248, + "step": 1544 + }, + { + "epoch": 0.17024793388429751, + "grad_norm": 4.821244239807129, + "learning_rate": 9.3675112571821e-06, + "loss": 0.4278, + "step": 1545 + }, + { + "epoch": 0.1703581267217631, + "grad_norm": 8.407042503356934, + "learning_rate": 9.366659815594116e-06, + "loss": 0.5224, + "step": 1546 + }, + { + "epoch": 0.17046831955922864, + "grad_norm": 6.209753513336182, + "learning_rate": 9.365807840045037e-06, + "loss": 0.4261, + "step": 1547 + }, + { + "epoch": 0.17057851239669422, + "grad_norm": 7.687352657318115, + "learning_rate": 9.364955330639048e-06, + "loss": 0.4171, + "step": 1548 + }, + { + "epoch": 0.17068870523415977, + "grad_norm": 9.114327430725098, + "learning_rate": 9.36410228748039e-06, + "loss": 0.5157, + "step": 1549 + }, + { + "epoch": 0.17079889807162535, + "grad_norm": 7.95366907119751, + "learning_rate": 9.363248710673375e-06, + "loss": 0.553, + "step": 1550 + }, + { + "epoch": 0.1709090909090909, + "grad_norm": 10.192009925842285, + "learning_rate": 9.362394600322384e-06, + "loss": 0.424, + "step": 1551 + }, + { + "epoch": 0.17101928374655648, + "grad_norm": 12.724431991577148, + "learning_rate": 9.361539956531853e-06, + "loss": 0.5071, + "step": 1552 + }, + { + "epoch": 0.17112947658402203, + "grad_norm": 7.3048295974731445, + "learning_rate": 9.360684779406294e-06, + "loss": 0.4098, + "step": 1553 + }, + { + "epoch": 0.1712396694214876, + "grad_norm": 5.551223278045654, + "learning_rate": 9.359829069050274e-06, + "loss": 0.4497, + "step": 1554 + }, + { + "epoch": 0.17134986225895316, + "grad_norm": 12.205377578735352, + "learning_rate": 9.358972825568436e-06, + "loss": 0.4724, + "step": 1555 + }, + { + "epoch": 0.17146005509641873, + "grad_norm": 9.10533332824707, + "learning_rate": 9.358116049065478e-06, + "loss": 0.4131, + "step": 1556 + }, + { + "epoch": 0.17157024793388428, + "grad_norm": 5.6637492179870605, + "learning_rate": 9.35725873964617e-06, + "loss": 0.498, + "step": 1557 + }, + { + "epoch": 0.17168044077134986, + "grad_norm": 6.103901386260986, + "learning_rate": 9.356400897415345e-06, + "loss": 0.3669, + "step": 1558 + }, + { + "epoch": 0.17179063360881544, + "grad_norm": 6.015481472015381, + "learning_rate": 9.3555425224779e-06, + "loss": 0.3158, + "step": 1559 + }, + { + "epoch": 0.171900826446281, + "grad_norm": 5.316213607788086, + "learning_rate": 9.354683614938798e-06, + "loss": 0.4558, + "step": 1560 + }, + { + "epoch": 0.17201101928374657, + "grad_norm": 10.183836936950684, + "learning_rate": 9.35382417490307e-06, + "loss": 0.4287, + "step": 1561 + }, + { + "epoch": 0.17212121212121212, + "grad_norm": 6.446548938751221, + "learning_rate": 9.352964202475808e-06, + "loss": 0.4508, + "step": 1562 + }, + { + "epoch": 0.1722314049586777, + "grad_norm": 10.281747817993164, + "learning_rate": 9.352103697762169e-06, + "loss": 0.5018, + "step": 1563 + }, + { + "epoch": 0.17234159779614325, + "grad_norm": 7.714334011077881, + "learning_rate": 9.351242660867378e-06, + "loss": 0.4423, + "step": 1564 + }, + { + "epoch": 0.17245179063360883, + "grad_norm": 6.705694675445557, + "learning_rate": 9.350381091896725e-06, + "loss": 
0.495, + "step": 1565 + }, + { + "epoch": 0.17256198347107438, + "grad_norm": 7.069931507110596, + "learning_rate": 9.349518990955561e-06, + "loss": 0.4489, + "step": 1566 + }, + { + "epoch": 0.17267217630853995, + "grad_norm": 4.913645267486572, + "learning_rate": 9.348656358149308e-06, + "loss": 0.4078, + "step": 1567 + }, + { + "epoch": 0.1727823691460055, + "grad_norm": 7.224776744842529, + "learning_rate": 9.34779319358345e-06, + "loss": 0.4023, + "step": 1568 + }, + { + "epoch": 0.17289256198347108, + "grad_norm": 7.386590003967285, + "learning_rate": 9.346929497363533e-06, + "loss": 0.428, + "step": 1569 + }, + { + "epoch": 0.17300275482093663, + "grad_norm": 8.202399253845215, + "learning_rate": 9.34606526959517e-06, + "loss": 0.4701, + "step": 1570 + }, + { + "epoch": 0.1731129476584022, + "grad_norm": 8.172073364257812, + "learning_rate": 9.345200510384044e-06, + "loss": 0.4355, + "step": 1571 + }, + { + "epoch": 0.17322314049586776, + "grad_norm": 5.922333717346191, + "learning_rate": 9.344335219835899e-06, + "loss": 0.4772, + "step": 1572 + }, + { + "epoch": 0.17333333333333334, + "grad_norm": 7.50675630569458, + "learning_rate": 9.34346939805654e-06, + "loss": 0.4776, + "step": 1573 + }, + { + "epoch": 0.1734435261707989, + "grad_norm": 4.919284343719482, + "learning_rate": 9.342603045151844e-06, + "loss": 0.412, + "step": 1574 + }, + { + "epoch": 0.17355371900826447, + "grad_norm": 6.294493198394775, + "learning_rate": 9.341736161227749e-06, + "loss": 0.4684, + "step": 1575 + }, + { + "epoch": 0.17366391184573002, + "grad_norm": 8.999917984008789, + "learning_rate": 9.340868746390257e-06, + "loss": 0.4699, + "step": 1576 + }, + { + "epoch": 0.1737741046831956, + "grad_norm": 7.87579870223999, + "learning_rate": 9.34000080074544e-06, + "loss": 0.4248, + "step": 1577 + }, + { + "epoch": 0.17388429752066115, + "grad_norm": 15.802863121032715, + "learning_rate": 9.339132324399427e-06, + "loss": 0.5674, + "step": 1578 + }, + { + "epoch": 0.17399449035812672, + "grad_norm": 4.844115257263184, + "learning_rate": 9.338263317458422e-06, + "loss": 0.4537, + "step": 1579 + }, + { + "epoch": 0.17410468319559227, + "grad_norm": 11.231072425842285, + "learning_rate": 9.337393780028684e-06, + "loss": 0.4172, + "step": 1580 + }, + { + "epoch": 0.17421487603305785, + "grad_norm": 9.87650203704834, + "learning_rate": 9.336523712216545e-06, + "loss": 0.54, + "step": 1581 + }, + { + "epoch": 0.1743250688705234, + "grad_norm": 6.8644022941589355, + "learning_rate": 9.335653114128393e-06, + "loss": 0.4106, + "step": 1582 + }, + { + "epoch": 0.17443526170798898, + "grad_norm": 11.86081600189209, + "learning_rate": 9.33478198587069e-06, + "loss": 0.6136, + "step": 1583 + }, + { + "epoch": 0.17454545454545456, + "grad_norm": 7.47705602645874, + "learning_rate": 9.333910327549958e-06, + "loss": 0.4817, + "step": 1584 + }, + { + "epoch": 0.1746556473829201, + "grad_norm": 8.04872989654541, + "learning_rate": 9.333038139272783e-06, + "loss": 0.4366, + "step": 1585 + }, + { + "epoch": 0.1747658402203857, + "grad_norm": 9.860109329223633, + "learning_rate": 9.332165421145821e-06, + "loss": 0.3969, + "step": 1586 + }, + { + "epoch": 0.17487603305785124, + "grad_norm": 9.992838859558105, + "learning_rate": 9.331292173275783e-06, + "loss": 0.4956, + "step": 1587 + }, + { + "epoch": 0.17498622589531682, + "grad_norm": 5.045669078826904, + "learning_rate": 9.330418395769457e-06, + "loss": 0.4504, + "step": 1588 + }, + { + "epoch": 0.17509641873278237, + "grad_norm": 7.532293319702148, + "learning_rate": 
9.329544088733686e-06, + "loss": 0.3748, + "step": 1589 + }, + { + "epoch": 0.17520661157024794, + "grad_norm": 6.9422926902771, + "learning_rate": 9.328669252275385e-06, + "loss": 0.4797, + "step": 1590 + }, + { + "epoch": 0.1753168044077135, + "grad_norm": 13.932552337646484, + "learning_rate": 9.327793886501526e-06, + "loss": 0.4177, + "step": 1591 + }, + { + "epoch": 0.17542699724517907, + "grad_norm": 4.638037204742432, + "learning_rate": 9.326917991519153e-06, + "loss": 0.4402, + "step": 1592 + }, + { + "epoch": 0.17553719008264462, + "grad_norm": 6.164183616638184, + "learning_rate": 9.326041567435368e-06, + "loss": 0.4583, + "step": 1593 + }, + { + "epoch": 0.1756473829201102, + "grad_norm": 5.5203118324279785, + "learning_rate": 9.325164614357347e-06, + "loss": 0.4683, + "step": 1594 + }, + { + "epoch": 0.17575757575757575, + "grad_norm": 10.047639846801758, + "learning_rate": 9.32428713239232e-06, + "loss": 0.5439, + "step": 1595 + }, + { + "epoch": 0.17586776859504133, + "grad_norm": 5.9899163246154785, + "learning_rate": 9.323409121647588e-06, + "loss": 0.3647, + "step": 1596 + }, + { + "epoch": 0.17597796143250688, + "grad_norm": 7.982687950134277, + "learning_rate": 9.322530582230517e-06, + "loss": 0.457, + "step": 1597 + }, + { + "epoch": 0.17608815426997246, + "grad_norm": 7.037700176239014, + "learning_rate": 9.321651514248534e-06, + "loss": 0.4356, + "step": 1598 + }, + { + "epoch": 0.176198347107438, + "grad_norm": 6.525335311889648, + "learning_rate": 9.320771917809134e-06, + "loss": 0.4025, + "step": 1599 + }, + { + "epoch": 0.1763085399449036, + "grad_norm": 5.2210493087768555, + "learning_rate": 9.319891793019874e-06, + "loss": 0.3519, + "step": 1600 + }, + { + "epoch": 0.17641873278236914, + "grad_norm": 11.669242858886719, + "learning_rate": 9.319011139988378e-06, + "loss": 0.5395, + "step": 1601 + }, + { + "epoch": 0.17652892561983471, + "grad_norm": 13.469799995422363, + "learning_rate": 9.318129958822334e-06, + "loss": 0.4634, + "step": 1602 + }, + { + "epoch": 0.17663911845730026, + "grad_norm": 5.899416923522949, + "learning_rate": 9.31724824962949e-06, + "loss": 0.4997, + "step": 1603 + }, + { + "epoch": 0.17674931129476584, + "grad_norm": 4.695652484893799, + "learning_rate": 9.31636601251767e-06, + "loss": 0.4003, + "step": 1604 + }, + { + "epoch": 0.1768595041322314, + "grad_norm": 6.482518196105957, + "learning_rate": 9.315483247594748e-06, + "loss": 0.4392, + "step": 1605 + }, + { + "epoch": 0.17696969696969697, + "grad_norm": 6.833725452423096, + "learning_rate": 9.314599954968673e-06, + "loss": 0.5056, + "step": 1606 + }, + { + "epoch": 0.17707988980716252, + "grad_norm": 9.069969177246094, + "learning_rate": 9.313716134747455e-06, + "loss": 0.4752, + "step": 1607 + }, + { + "epoch": 0.1771900826446281, + "grad_norm": 12.654067993164062, + "learning_rate": 9.312831787039169e-06, + "loss": 0.5105, + "step": 1608 + }, + { + "epoch": 0.17730027548209368, + "grad_norm": 9.427743911743164, + "learning_rate": 9.311946911951952e-06, + "loss": 0.5026, + "step": 1609 + }, + { + "epoch": 0.17741046831955923, + "grad_norm": 6.150701999664307, + "learning_rate": 9.311061509594011e-06, + "loss": 0.489, + "step": 1610 + }, + { + "epoch": 0.1775206611570248, + "grad_norm": 4.896096229553223, + "learning_rate": 9.31017558007361e-06, + "loss": 0.4465, + "step": 1611 + }, + { + "epoch": 0.17763085399449036, + "grad_norm": 7.014410495758057, + "learning_rate": 9.309289123499088e-06, + "loss": 0.4055, + "step": 1612 + }, + { + "epoch": 0.17774104683195593, + "grad_norm": 
6.980282306671143, + "learning_rate": 9.308402139978836e-06, + "loss": 0.4678, + "step": 1613 + }, + { + "epoch": 0.17785123966942148, + "grad_norm": 7.26424503326416, + "learning_rate": 9.307514629621318e-06, + "loss": 0.4338, + "step": 1614 + }, + { + "epoch": 0.17796143250688706, + "grad_norm": 11.836024284362793, + "learning_rate": 9.30662659253506e-06, + "loss": 0.5679, + "step": 1615 + }, + { + "epoch": 0.1780716253443526, + "grad_norm": 7.45238733291626, + "learning_rate": 9.305738028828653e-06, + "loss": 0.4848, + "step": 1616 + }, + { + "epoch": 0.1781818181818182, + "grad_norm": 9.33069896697998, + "learning_rate": 9.30484893861075e-06, + "loss": 0.4438, + "step": 1617 + }, + { + "epoch": 0.17829201101928374, + "grad_norm": 10.457403182983398, + "learning_rate": 9.303959321990072e-06, + "loss": 0.4788, + "step": 1618 + }, + { + "epoch": 0.17840220385674932, + "grad_norm": 5.067116737365723, + "learning_rate": 9.303069179075402e-06, + "loss": 0.3975, + "step": 1619 + }, + { + "epoch": 0.17851239669421487, + "grad_norm": 6.72924280166626, + "learning_rate": 9.302178509975588e-06, + "loss": 0.3877, + "step": 1620 + }, + { + "epoch": 0.17862258953168045, + "grad_norm": 19.54122543334961, + "learning_rate": 9.30128731479954e-06, + "loss": 0.4767, + "step": 1621 + }, + { + "epoch": 0.178732782369146, + "grad_norm": 13.766802787780762, + "learning_rate": 9.300395593656237e-06, + "loss": 0.4729, + "step": 1622 + }, + { + "epoch": 0.17884297520661158, + "grad_norm": 14.010546684265137, + "learning_rate": 9.299503346654721e-06, + "loss": 0.5181, + "step": 1623 + }, + { + "epoch": 0.17895316804407713, + "grad_norm": 7.248737812042236, + "learning_rate": 9.298610573904094e-06, + "loss": 0.4805, + "step": 1624 + }, + { + "epoch": 0.1790633608815427, + "grad_norm": 7.094707489013672, + "learning_rate": 9.297717275513526e-06, + "loss": 0.4545, + "step": 1625 + }, + { + "epoch": 0.17917355371900826, + "grad_norm": 7.688851833343506, + "learning_rate": 9.296823451592253e-06, + "loss": 0.3955, + "step": 1626 + }, + { + "epoch": 0.17928374655647383, + "grad_norm": 9.634920120239258, + "learning_rate": 9.295929102249572e-06, + "loss": 0.4597, + "step": 1627 + }, + { + "epoch": 0.17939393939393938, + "grad_norm": 9.308319091796875, + "learning_rate": 9.295034227594846e-06, + "loss": 0.5443, + "step": 1628 + }, + { + "epoch": 0.17950413223140496, + "grad_norm": 6.050390243530273, + "learning_rate": 9.294138827737498e-06, + "loss": 0.406, + "step": 1629 + }, + { + "epoch": 0.1796143250688705, + "grad_norm": 7.437664031982422, + "learning_rate": 9.293242902787023e-06, + "loss": 0.4229, + "step": 1630 + }, + { + "epoch": 0.1797245179063361, + "grad_norm": 13.359318733215332, + "learning_rate": 9.292346452852974e-06, + "loss": 0.4789, + "step": 1631 + }, + { + "epoch": 0.17983471074380164, + "grad_norm": 6.764444351196289, + "learning_rate": 9.291449478044968e-06, + "loss": 0.4367, + "step": 1632 + }, + { + "epoch": 0.17994490358126722, + "grad_norm": 7.310115337371826, + "learning_rate": 9.290551978472692e-06, + "loss": 0.4633, + "step": 1633 + }, + { + "epoch": 0.18005509641873277, + "grad_norm": 5.299081802368164, + "learning_rate": 9.289653954245892e-06, + "loss": 0.4097, + "step": 1634 + }, + { + "epoch": 0.18016528925619835, + "grad_norm": 8.130375862121582, + "learning_rate": 9.288755405474379e-06, + "loss": 0.4932, + "step": 1635 + }, + { + "epoch": 0.18027548209366392, + "grad_norm": 19.780685424804688, + "learning_rate": 9.28785633226803e-06, + "loss": 0.4528, + "step": 1636 + }, + { + "epoch": 
0.18038567493112947, + "grad_norm": 6.51214599609375, + "learning_rate": 9.286956734736782e-06, + "loss": 0.4531, + "step": 1637 + }, + { + "epoch": 0.18049586776859505, + "grad_norm": 9.116598129272461, + "learning_rate": 9.286056612990644e-06, + "loss": 0.4667, + "step": 1638 + }, + { + "epoch": 0.1806060606060606, + "grad_norm": 7.414033889770508, + "learning_rate": 9.28515596713968e-06, + "loss": 0.3955, + "step": 1639 + }, + { + "epoch": 0.18071625344352618, + "grad_norm": 7.945151329040527, + "learning_rate": 9.284254797294025e-06, + "loss": 0.3675, + "step": 1640 + }, + { + "epoch": 0.18082644628099173, + "grad_norm": 7.031407833099365, + "learning_rate": 9.283353103563872e-06, + "loss": 0.472, + "step": 1641 + }, + { + "epoch": 0.1809366391184573, + "grad_norm": 6.033279895782471, + "learning_rate": 9.282450886059485e-06, + "loss": 0.4593, + "step": 1642 + }, + { + "epoch": 0.18104683195592286, + "grad_norm": 8.067496299743652, + "learning_rate": 9.281548144891183e-06, + "loss": 0.4434, + "step": 1643 + }, + { + "epoch": 0.18115702479338844, + "grad_norm": 7.44157075881958, + "learning_rate": 9.28064488016936e-06, + "loss": 0.5043, + "step": 1644 + }, + { + "epoch": 0.181267217630854, + "grad_norm": 8.112360000610352, + "learning_rate": 9.279741092004469e-06, + "loss": 0.489, + "step": 1645 + }, + { + "epoch": 0.18137741046831957, + "grad_norm": 6.331960678100586, + "learning_rate": 9.27883678050702e-06, + "loss": 0.5035, + "step": 1646 + }, + { + "epoch": 0.18148760330578512, + "grad_norm": 6.156603813171387, + "learning_rate": 9.2779319457876e-06, + "loss": 0.4342, + "step": 1647 + }, + { + "epoch": 0.1815977961432507, + "grad_norm": 8.530582427978516, + "learning_rate": 9.277026587956849e-06, + "loss": 0.4378, + "step": 1648 + }, + { + "epoch": 0.18170798898071625, + "grad_norm": 4.40578031539917, + "learning_rate": 9.276120707125477e-06, + "loss": 0.4722, + "step": 1649 + }, + { + "epoch": 0.18181818181818182, + "grad_norm": 8.593558311462402, + "learning_rate": 9.275214303404256e-06, + "loss": 0.4426, + "step": 1650 + }, + { + "epoch": 0.18192837465564737, + "grad_norm": 5.8893961906433105, + "learning_rate": 9.274307376904023e-06, + "loss": 0.4641, + "step": 1651 + }, + { + "epoch": 0.18203856749311295, + "grad_norm": 5.242407321929932, + "learning_rate": 9.273399927735679e-06, + "loss": 0.4018, + "step": 1652 + }, + { + "epoch": 0.1821487603305785, + "grad_norm": 6.03156042098999, + "learning_rate": 9.272491956010185e-06, + "loss": 0.4375, + "step": 1653 + }, + { + "epoch": 0.18225895316804408, + "grad_norm": 4.4334845542907715, + "learning_rate": 9.271583461838573e-06, + "loss": 0.4105, + "step": 1654 + }, + { + "epoch": 0.18236914600550963, + "grad_norm": 5.8249897956848145, + "learning_rate": 9.270674445331932e-06, + "loss": 0.46, + "step": 1655 + }, + { + "epoch": 0.1824793388429752, + "grad_norm": 6.178356647491455, + "learning_rate": 9.269764906601419e-06, + "loss": 0.456, + "step": 1656 + }, + { + "epoch": 0.18258953168044076, + "grad_norm": 5.149049282073975, + "learning_rate": 9.268854845758254e-06, + "loss": 0.331, + "step": 1657 + }, + { + "epoch": 0.18269972451790634, + "grad_norm": 5.921223163604736, + "learning_rate": 9.26794426291372e-06, + "loss": 0.3743, + "step": 1658 + }, + { + "epoch": 0.1828099173553719, + "grad_norm": 6.545019149780273, + "learning_rate": 9.267033158179162e-06, + "loss": 0.5212, + "step": 1659 + }, + { + "epoch": 0.18292011019283747, + "grad_norm": 9.221651077270508, + "learning_rate": 9.266121531665994e-06, + "loss": 0.49, + "step": 
1660 + }, + { + "epoch": 0.18303030303030304, + "grad_norm": 8.458298683166504, + "learning_rate": 9.265209383485692e-06, + "loss": 0.4456, + "step": 1661 + }, + { + "epoch": 0.1831404958677686, + "grad_norm": 6.811192035675049, + "learning_rate": 9.26429671374979e-06, + "loss": 0.4542, + "step": 1662 + }, + { + "epoch": 0.18325068870523417, + "grad_norm": 10.922497749328613, + "learning_rate": 9.263383522569896e-06, + "loss": 0.5119, + "step": 1663 + }, + { + "epoch": 0.18336088154269972, + "grad_norm": 10.04183292388916, + "learning_rate": 9.26246981005767e-06, + "loss": 0.4339, + "step": 1664 + }, + { + "epoch": 0.1834710743801653, + "grad_norm": 5.958286285400391, + "learning_rate": 9.26155557632485e-06, + "loss": 0.4527, + "step": 1665 + }, + { + "epoch": 0.18358126721763085, + "grad_norm": 7.782229900360107, + "learning_rate": 9.260640821483222e-06, + "loss": 0.4145, + "step": 1666 + }, + { + "epoch": 0.18369146005509643, + "grad_norm": 7.860006332397461, + "learning_rate": 9.259725545644649e-06, + "loss": 0.4499, + "step": 1667 + }, + { + "epoch": 0.18380165289256198, + "grad_norm": 7.201568603515625, + "learning_rate": 9.25880974892105e-06, + "loss": 0.4353, + "step": 1668 + }, + { + "epoch": 0.18391184573002756, + "grad_norm": 5.932833194732666, + "learning_rate": 9.257893431424408e-06, + "loss": 0.436, + "step": 1669 + }, + { + "epoch": 0.1840220385674931, + "grad_norm": 8.118557929992676, + "learning_rate": 9.256976593266774e-06, + "loss": 0.441, + "step": 1670 + }, + { + "epoch": 0.18413223140495869, + "grad_norm": 6.2253313064575195, + "learning_rate": 9.25605923456026e-06, + "loss": 0.4287, + "step": 1671 + }, + { + "epoch": 0.18424242424242424, + "grad_norm": 7.285488605499268, + "learning_rate": 9.255141355417042e-06, + "loss": 0.4498, + "step": 1672 + }, + { + "epoch": 0.1843526170798898, + "grad_norm": 6.142855644226074, + "learning_rate": 9.254222955949359e-06, + "loss": 0.4375, + "step": 1673 + }, + { + "epoch": 0.18446280991735536, + "grad_norm": 9.95976448059082, + "learning_rate": 9.253304036269513e-06, + "loss": 0.4994, + "step": 1674 + }, + { + "epoch": 0.18457300275482094, + "grad_norm": 8.551350593566895, + "learning_rate": 9.252384596489874e-06, + "loss": 0.5382, + "step": 1675 + }, + { + "epoch": 0.1846831955922865, + "grad_norm": 6.680281639099121, + "learning_rate": 9.251464636722868e-06, + "loss": 0.3891, + "step": 1676 + }, + { + "epoch": 0.18479338842975207, + "grad_norm": 7.300052165985107, + "learning_rate": 9.250544157080992e-06, + "loss": 0.4391, + "step": 1677 + }, + { + "epoch": 0.18490358126721762, + "grad_norm": 6.606064796447754, + "learning_rate": 9.249623157676804e-06, + "loss": 0.4586, + "step": 1678 + }, + { + "epoch": 0.1850137741046832, + "grad_norm": 6.0628461837768555, + "learning_rate": 9.248701638622921e-06, + "loss": 0.4395, + "step": 1679 + }, + { + "epoch": 0.18512396694214875, + "grad_norm": 8.079413414001465, + "learning_rate": 9.247779600032032e-06, + "loss": 0.435, + "step": 1680 + }, + { + "epoch": 0.18523415977961433, + "grad_norm": 10.073164939880371, + "learning_rate": 9.246857042016883e-06, + "loss": 0.4077, + "step": 1681 + }, + { + "epoch": 0.18534435261707988, + "grad_norm": 7.541385173797607, + "learning_rate": 9.245933964690288e-06, + "loss": 0.3633, + "step": 1682 + }, + { + "epoch": 0.18545454545454546, + "grad_norm": 6.038715839385986, + "learning_rate": 9.245010368165118e-06, + "loss": 0.4782, + "step": 1683 + }, + { + "epoch": 0.185564738292011, + "grad_norm": 14.045479774475098, + "learning_rate": 
9.244086252554313e-06, + "loss": 0.478, + "step": 1684 + }, + { + "epoch": 0.18567493112947658, + "grad_norm": 5.501763343811035, + "learning_rate": 9.24316161797088e-06, + "loss": 0.4166, + "step": 1685 + }, + { + "epoch": 0.18578512396694216, + "grad_norm": 6.4724555015563965, + "learning_rate": 9.242236464527877e-06, + "loss": 0.4311, + "step": 1686 + }, + { + "epoch": 0.1858953168044077, + "grad_norm": 5.08769416809082, + "learning_rate": 9.241310792338439e-06, + "loss": 0.3645, + "step": 1687 + }, + { + "epoch": 0.1860055096418733, + "grad_norm": 8.996004104614258, + "learning_rate": 9.240384601515753e-06, + "loss": 0.4901, + "step": 1688 + }, + { + "epoch": 0.18611570247933884, + "grad_norm": 6.509644985198975, + "learning_rate": 9.23945789217308e-06, + "loss": 0.3987, + "step": 1689 + }, + { + "epoch": 0.18622589531680442, + "grad_norm": 9.581886291503906, + "learning_rate": 9.238530664423737e-06, + "loss": 0.4781, + "step": 1690 + }, + { + "epoch": 0.18633608815426997, + "grad_norm": 11.073467254638672, + "learning_rate": 9.237602918381107e-06, + "loss": 0.5167, + "step": 1691 + }, + { + "epoch": 0.18644628099173555, + "grad_norm": 10.145783424377441, + "learning_rate": 9.236674654158637e-06, + "loss": 0.4679, + "step": 1692 + }, + { + "epoch": 0.1865564738292011, + "grad_norm": 9.544154167175293, + "learning_rate": 9.235745871869834e-06, + "loss": 0.4067, + "step": 1693 + }, + { + "epoch": 0.18666666666666668, + "grad_norm": 6.963332653045654, + "learning_rate": 9.23481657162827e-06, + "loss": 0.4524, + "step": 1694 + }, + { + "epoch": 0.18677685950413223, + "grad_norm": 6.609317302703857, + "learning_rate": 9.233886753547588e-06, + "loss": 0.4254, + "step": 1695 + }, + { + "epoch": 0.1868870523415978, + "grad_norm": 6.9944376945495605, + "learning_rate": 9.232956417741478e-06, + "loss": 0.4096, + "step": 1696 + }, + { + "epoch": 0.18699724517906335, + "grad_norm": 10.562012672424316, + "learning_rate": 9.23202556432371e-06, + "loss": 0.4147, + "step": 1697 + }, + { + "epoch": 0.18710743801652893, + "grad_norm": 6.0145134925842285, + "learning_rate": 9.231094193408107e-06, + "loss": 0.4501, + "step": 1698 + }, + { + "epoch": 0.18721763085399448, + "grad_norm": 7.005607604980469, + "learning_rate": 9.230162305108558e-06, + "loss": 0.4592, + "step": 1699 + }, + { + "epoch": 0.18732782369146006, + "grad_norm": 10.507290840148926, + "learning_rate": 9.229229899539018e-06, + "loss": 0.5106, + "step": 1700 + }, + { + "epoch": 0.1874380165289256, + "grad_norm": 8.026799201965332, + "learning_rate": 9.2282969768135e-06, + "loss": 0.4859, + "step": 1701 + }, + { + "epoch": 0.1875482093663912, + "grad_norm": 11.039427757263184, + "learning_rate": 9.227363537046083e-06, + "loss": 0.4278, + "step": 1702 + }, + { + "epoch": 0.18765840220385674, + "grad_norm": 10.61426830291748, + "learning_rate": 9.22642958035091e-06, + "loss": 0.4875, + "step": 1703 + }, + { + "epoch": 0.18776859504132232, + "grad_norm": 11.166681289672852, + "learning_rate": 9.225495106842188e-06, + "loss": 0.4511, + "step": 1704 + }, + { + "epoch": 0.18787878787878787, + "grad_norm": 7.04632568359375, + "learning_rate": 9.224560116634184e-06, + "loss": 0.3696, + "step": 1705 + }, + { + "epoch": 0.18798898071625345, + "grad_norm": 13.060858726501465, + "learning_rate": 9.223624609841232e-06, + "loss": 0.4687, + "step": 1706 + }, + { + "epoch": 0.188099173553719, + "grad_norm": 12.081947326660156, + "learning_rate": 9.222688586577724e-06, + "loss": 0.5184, + "step": 1707 + }, + { + "epoch": 0.18820936639118457, + "grad_norm": 
13.18545913696289, + "learning_rate": 9.221752046958122e-06, + "loss": 0.4923, + "step": 1708 + }, + { + "epoch": 0.18831955922865012, + "grad_norm": 7.636499404907227, + "learning_rate": 9.220814991096943e-06, + "loss": 0.487, + "step": 1709 + }, + { + "epoch": 0.1884297520661157, + "grad_norm": 4.343592166900635, + "learning_rate": 9.219877419108773e-06, + "loss": 0.434, + "step": 1710 + }, + { + "epoch": 0.18853994490358128, + "grad_norm": 10.054169654846191, + "learning_rate": 9.218939331108261e-06, + "loss": 0.5029, + "step": 1711 + }, + { + "epoch": 0.18865013774104683, + "grad_norm": 7.799861431121826, + "learning_rate": 9.218000727210115e-06, + "loss": 0.4746, + "step": 1712 + }, + { + "epoch": 0.1887603305785124, + "grad_norm": 8.256338119506836, + "learning_rate": 9.217061607529111e-06, + "loss": 0.5506, + "step": 1713 + }, + { + "epoch": 0.18887052341597796, + "grad_norm": 9.735845565795898, + "learning_rate": 9.216121972180087e-06, + "loss": 0.4109, + "step": 1714 + }, + { + "epoch": 0.18898071625344354, + "grad_norm": 7.928435802459717, + "learning_rate": 9.21518182127794e-06, + "loss": 0.5023, + "step": 1715 + }, + { + "epoch": 0.1890909090909091, + "grad_norm": 15.789814949035645, + "learning_rate": 9.214241154937635e-06, + "loss": 0.6017, + "step": 1716 + }, + { + "epoch": 0.18920110192837467, + "grad_norm": 5.908916473388672, + "learning_rate": 9.213299973274197e-06, + "loss": 0.496, + "step": 1717 + }, + { + "epoch": 0.18931129476584022, + "grad_norm": 8.4912691116333, + "learning_rate": 9.212358276402716e-06, + "loss": 0.5218, + "step": 1718 + }, + { + "epoch": 0.1894214876033058, + "grad_norm": 9.090007781982422, + "learning_rate": 9.211416064438342e-06, + "loss": 0.4659, + "step": 1719 + }, + { + "epoch": 0.18953168044077134, + "grad_norm": 9.479738235473633, + "learning_rate": 9.210473337496289e-06, + "loss": 0.4745, + "step": 1720 + }, + { + "epoch": 0.18964187327823692, + "grad_norm": 7.842554569244385, + "learning_rate": 9.209530095691839e-06, + "loss": 0.4553, + "step": 1721 + }, + { + "epoch": 0.18975206611570247, + "grad_norm": 5.904482364654541, + "learning_rate": 9.20858633914033e-06, + "loss": 0.3563, + "step": 1722 + }, + { + "epoch": 0.18986225895316805, + "grad_norm": 8.66420841217041, + "learning_rate": 9.207642067957168e-06, + "loss": 0.5095, + "step": 1723 + }, + { + "epoch": 0.1899724517906336, + "grad_norm": 5.422713279724121, + "learning_rate": 9.206697282257817e-06, + "loss": 0.4117, + "step": 1724 + }, + { + "epoch": 0.19008264462809918, + "grad_norm": 8.360159873962402, + "learning_rate": 9.20575198215781e-06, + "loss": 0.4775, + "step": 1725 + }, + { + "epoch": 0.19019283746556473, + "grad_norm": 9.186153411865234, + "learning_rate": 9.204806167772734e-06, + "loss": 0.5071, + "step": 1726 + }, + { + "epoch": 0.1903030303030303, + "grad_norm": 14.28857135772705, + "learning_rate": 9.20385983921825e-06, + "loss": 0.6133, + "step": 1727 + }, + { + "epoch": 0.19041322314049586, + "grad_norm": 5.504502773284912, + "learning_rate": 9.202912996610076e-06, + "loss": 0.4208, + "step": 1728 + }, + { + "epoch": 0.19052341597796144, + "grad_norm": 9.405652046203613, + "learning_rate": 9.20196564006399e-06, + "loss": 0.4877, + "step": 1729 + }, + { + "epoch": 0.19063360881542699, + "grad_norm": 7.375481605529785, + "learning_rate": 9.201017769695838e-06, + "loss": 0.4022, + "step": 1730 + }, + { + "epoch": 0.19074380165289256, + "grad_norm": 15.241425514221191, + "learning_rate": 9.200069385621528e-06, + "loss": 0.4715, + "step": 1731 + }, + { + "epoch": 
0.19085399449035811, + "grad_norm": 11.097306251525879, + "learning_rate": 9.199120487957027e-06, + "loss": 0.3701, + "step": 1732 + }, + { + "epoch": 0.1909641873278237, + "grad_norm": 7.114978313446045, + "learning_rate": 9.198171076818368e-06, + "loss": 0.3833, + "step": 1733 + }, + { + "epoch": 0.19107438016528924, + "grad_norm": 9.978882789611816, + "learning_rate": 9.197221152321648e-06, + "loss": 0.4948, + "step": 1734 + }, + { + "epoch": 0.19118457300275482, + "grad_norm": 7.850578308105469, + "learning_rate": 9.196270714583024e-06, + "loss": 0.4935, + "step": 1735 + }, + { + "epoch": 0.1912947658402204, + "grad_norm": 7.1560869216918945, + "learning_rate": 9.195319763718717e-06, + "loss": 0.5109, + "step": 1736 + }, + { + "epoch": 0.19140495867768595, + "grad_norm": 5.753309726715088, + "learning_rate": 9.194368299845012e-06, + "loss": 0.4673, + "step": 1737 + }, + { + "epoch": 0.19151515151515153, + "grad_norm": 9.460782051086426, + "learning_rate": 9.193416323078252e-06, + "loss": 0.4019, + "step": 1738 + }, + { + "epoch": 0.19162534435261708, + "grad_norm": 10.26589584350586, + "learning_rate": 9.192463833534848e-06, + "loss": 0.4986, + "step": 1739 + }, + { + "epoch": 0.19173553719008266, + "grad_norm": 10.60505199432373, + "learning_rate": 9.191510831331271e-06, + "loss": 0.4664, + "step": 1740 + }, + { + "epoch": 0.1918457300275482, + "grad_norm": 4.823023796081543, + "learning_rate": 9.190557316584057e-06, + "loss": 0.4589, + "step": 1741 + }, + { + "epoch": 0.19195592286501378, + "grad_norm": 17.65654754638672, + "learning_rate": 9.189603289409802e-06, + "loss": 0.5577, + "step": 1742 + }, + { + "epoch": 0.19206611570247933, + "grad_norm": 8.283257484436035, + "learning_rate": 9.188648749925165e-06, + "loss": 0.4656, + "step": 1743 + }, + { + "epoch": 0.1921763085399449, + "grad_norm": 8.1823148727417, + "learning_rate": 9.18769369824687e-06, + "loss": 0.4467, + "step": 1744 + }, + { + "epoch": 0.19228650137741046, + "grad_norm": 6.611006259918213, + "learning_rate": 9.1867381344917e-06, + "loss": 0.3631, + "step": 1745 + }, + { + "epoch": 0.19239669421487604, + "grad_norm": 6.430954933166504, + "learning_rate": 9.185782058776504e-06, + "loss": 0.3733, + "step": 1746 + }, + { + "epoch": 0.1925068870523416, + "grad_norm": 8.7865629196167, + "learning_rate": 9.184825471218193e-06, + "loss": 0.4406, + "step": 1747 + }, + { + "epoch": 0.19261707988980717, + "grad_norm": 6.636740684509277, + "learning_rate": 9.18386837193374e-06, + "loss": 0.4312, + "step": 1748 + }, + { + "epoch": 0.19272727272727272, + "grad_norm": 6.261380195617676, + "learning_rate": 9.182910761040177e-06, + "loss": 0.4475, + "step": 1749 + }, + { + "epoch": 0.1928374655647383, + "grad_norm": 6.949450492858887, + "learning_rate": 9.181952638654604e-06, + "loss": 0.4866, + "step": 1750 + }, + { + "epoch": 0.19294765840220385, + "grad_norm": 8.320935249328613, + "learning_rate": 9.180994004894184e-06, + "loss": 0.5215, + "step": 1751 + }, + { + "epoch": 0.19305785123966943, + "grad_norm": 6.757778167724609, + "learning_rate": 9.180034859876135e-06, + "loss": 0.4685, + "step": 1752 + }, + { + "epoch": 0.19316804407713498, + "grad_norm": 7.9120097160339355, + "learning_rate": 9.179075203717746e-06, + "loss": 0.4522, + "step": 1753 + }, + { + "epoch": 0.19327823691460055, + "grad_norm": 8.696321487426758, + "learning_rate": 9.178115036536365e-06, + "loss": 0.4557, + "step": 1754 + }, + { + "epoch": 0.1933884297520661, + "grad_norm": 29.67196273803711, + "learning_rate": 9.177154358449403e-06, + "loss": 0.4847, + 
"step": 1755 + }, + { + "epoch": 0.19349862258953168, + "grad_norm": 8.443095207214355, + "learning_rate": 9.176193169574332e-06, + "loss": 0.5122, + "step": 1756 + }, + { + "epoch": 0.19360881542699723, + "grad_norm": 8.4793062210083, + "learning_rate": 9.175231470028685e-06, + "loss": 0.4016, + "step": 1757 + }, + { + "epoch": 0.1937190082644628, + "grad_norm": 6.380115509033203, + "learning_rate": 9.174269259930064e-06, + "loss": 0.4166, + "step": 1758 + }, + { + "epoch": 0.19382920110192836, + "grad_norm": 10.314026832580566, + "learning_rate": 9.173306539396128e-06, + "loss": 0.4556, + "step": 1759 + }, + { + "epoch": 0.19393939393939394, + "grad_norm": 7.1314239501953125, + "learning_rate": 9.172343308544598e-06, + "loss": 0.4413, + "step": 1760 + }, + { + "epoch": 0.19404958677685952, + "grad_norm": 6.917520999908447, + "learning_rate": 9.171379567493261e-06, + "loss": 0.4119, + "step": 1761 + }, + { + "epoch": 0.19415977961432507, + "grad_norm": 6.420129776000977, + "learning_rate": 9.170415316359966e-06, + "loss": 0.4111, + "step": 1762 + }, + { + "epoch": 0.19426997245179065, + "grad_norm": 6.119045734405518, + "learning_rate": 9.16945055526262e-06, + "loss": 0.3883, + "step": 1763 + }, + { + "epoch": 0.1943801652892562, + "grad_norm": 9.331291198730469, + "learning_rate": 9.168485284319195e-06, + "loss": 0.4095, + "step": 1764 + }, + { + "epoch": 0.19449035812672177, + "grad_norm": 7.21309232711792, + "learning_rate": 9.167519503647729e-06, + "loss": 0.4418, + "step": 1765 + }, + { + "epoch": 0.19460055096418732, + "grad_norm": 11.516107559204102, + "learning_rate": 9.166553213366316e-06, + "loss": 0.5187, + "step": 1766 + }, + { + "epoch": 0.1947107438016529, + "grad_norm": 17.424747467041016, + "learning_rate": 9.165586413593118e-06, + "loss": 0.5168, + "step": 1767 + }, + { + "epoch": 0.19482093663911845, + "grad_norm": 6.920599937438965, + "learning_rate": 9.164619104446354e-06, + "loss": 0.4324, + "step": 1768 + }, + { + "epoch": 0.19493112947658403, + "grad_norm": 10.168035507202148, + "learning_rate": 9.163651286044308e-06, + "loss": 0.5094, + "step": 1769 + }, + { + "epoch": 0.19504132231404958, + "grad_norm": 8.317641258239746, + "learning_rate": 9.16268295850533e-06, + "loss": 0.3652, + "step": 1770 + }, + { + "epoch": 0.19515151515151516, + "grad_norm": 6.406209468841553, + "learning_rate": 9.161714121947822e-06, + "loss": 0.431, + "step": 1771 + }, + { + "epoch": 0.1952617079889807, + "grad_norm": 7.211702346801758, + "learning_rate": 9.160744776490258e-06, + "loss": 0.4585, + "step": 1772 + }, + { + "epoch": 0.1953719008264463, + "grad_norm": 8.599669456481934, + "learning_rate": 9.159774922251173e-06, + "loss": 0.4845, + "step": 1773 + }, + { + "epoch": 0.19548209366391184, + "grad_norm": 7.3090057373046875, + "learning_rate": 9.158804559349158e-06, + "loss": 0.3688, + "step": 1774 + }, + { + "epoch": 0.19559228650137742, + "grad_norm": 6.770844459533691, + "learning_rate": 9.157833687902872e-06, + "loss": 0.4231, + "step": 1775 + }, + { + "epoch": 0.19570247933884297, + "grad_norm": 8.398971557617188, + "learning_rate": 9.156862308031037e-06, + "loss": 0.3886, + "step": 1776 + }, + { + "epoch": 0.19581267217630854, + "grad_norm": 7.647953033447266, + "learning_rate": 9.155890419852432e-06, + "loss": 0.3672, + "step": 1777 + }, + { + "epoch": 0.1959228650137741, + "grad_norm": 11.409097671508789, + "learning_rate": 9.154918023485901e-06, + "loss": 0.5574, + "step": 1778 + }, + { + "epoch": 0.19603305785123967, + "grad_norm": 14.514908790588379, + "learning_rate": 
9.153945119050349e-06, + "loss": 0.4787, + "step": 1779 + }, + { + "epoch": 0.19614325068870522, + "grad_norm": 6.008779525756836, + "learning_rate": 9.152971706664745e-06, + "loss": 0.4832, + "step": 1780 + }, + { + "epoch": 0.1962534435261708, + "grad_norm": 5.2900710105896, + "learning_rate": 9.151997786448123e-06, + "loss": 0.395, + "step": 1781 + }, + { + "epoch": 0.19636363636363635, + "grad_norm": 13.773778915405273, + "learning_rate": 9.151023358519569e-06, + "loss": 0.4648, + "step": 1782 + }, + { + "epoch": 0.19647382920110193, + "grad_norm": 6.885417938232422, + "learning_rate": 9.15004842299824e-06, + "loss": 0.3971, + "step": 1783 + }, + { + "epoch": 0.19658402203856748, + "grad_norm": 14.44425106048584, + "learning_rate": 9.149072980003354e-06, + "loss": 0.5117, + "step": 1784 + }, + { + "epoch": 0.19669421487603306, + "grad_norm": 6.065454483032227, + "learning_rate": 9.148097029654186e-06, + "loss": 0.4183, + "step": 1785 + }, + { + "epoch": 0.19680440771349864, + "grad_norm": 5.698667049407959, + "learning_rate": 9.14712057207008e-06, + "loss": 0.4628, + "step": 1786 + }, + { + "epoch": 0.1969146005509642, + "grad_norm": 6.582290172576904, + "learning_rate": 9.146143607370436e-06, + "loss": 0.4249, + "step": 1787 + }, + { + "epoch": 0.19702479338842976, + "grad_norm": 8.020262718200684, + "learning_rate": 9.14516613567472e-06, + "loss": 0.5491, + "step": 1788 + }, + { + "epoch": 0.19713498622589531, + "grad_norm": 6.097904682159424, + "learning_rate": 9.14418815710246e-06, + "loss": 0.4536, + "step": 1789 + }, + { + "epoch": 0.1972451790633609, + "grad_norm": 6.816990375518799, + "learning_rate": 9.14320967177324e-06, + "loss": 0.4149, + "step": 1790 + }, + { + "epoch": 0.19735537190082644, + "grad_norm": 12.400146484375, + "learning_rate": 9.142230679806716e-06, + "loss": 0.5054, + "step": 1791 + }, + { + "epoch": 0.19746556473829202, + "grad_norm": 6.677716255187988, + "learning_rate": 9.141251181322597e-06, + "loss": 0.3187, + "step": 1792 + }, + { + "epoch": 0.19757575757575757, + "grad_norm": 13.185030937194824, + "learning_rate": 9.140271176440658e-06, + "loss": 0.4739, + "step": 1793 + }, + { + "epoch": 0.19768595041322315, + "grad_norm": 11.510183334350586, + "learning_rate": 9.139290665280736e-06, + "loss": 0.4501, + "step": 1794 + }, + { + "epoch": 0.1977961432506887, + "grad_norm": 9.512884140014648, + "learning_rate": 9.138309647962729e-06, + "loss": 0.5343, + "step": 1795 + }, + { + "epoch": 0.19790633608815428, + "grad_norm": 5.829929828643799, + "learning_rate": 9.137328124606596e-06, + "loss": 0.4693, + "step": 1796 + }, + { + "epoch": 0.19801652892561983, + "grad_norm": 6.9572529792785645, + "learning_rate": 9.13634609533236e-06, + "loss": 0.4533, + "step": 1797 + }, + { + "epoch": 0.1981267217630854, + "grad_norm": 11.407742500305176, + "learning_rate": 9.135363560260105e-06, + "loss": 0.508, + "step": 1798 + }, + { + "epoch": 0.19823691460055096, + "grad_norm": 9.053552627563477, + "learning_rate": 9.134380519509976e-06, + "loss": 0.4438, + "step": 1799 + }, + { + "epoch": 0.19834710743801653, + "grad_norm": 9.490631103515625, + "learning_rate": 9.133396973202181e-06, + "loss": 0.4095, + "step": 1800 + }, + { + "epoch": 0.19845730027548208, + "grad_norm": 12.728962898254395, + "learning_rate": 9.13241292145699e-06, + "loss": 0.4589, + "step": 1801 + }, + { + "epoch": 0.19856749311294766, + "grad_norm": 7.135961055755615, + "learning_rate": 9.131428364394735e-06, + "loss": 0.361, + "step": 1802 + }, + { + "epoch": 0.1986776859504132, + "grad_norm": 
13.275293350219727, + "learning_rate": 9.130443302135804e-06, + "loss": 0.4761, + "step": 1803 + }, + { + "epoch": 0.1987878787878788, + "grad_norm": 10.185133934020996, + "learning_rate": 9.129457734800659e-06, + "loss": 0.5152, + "step": 1804 + }, + { + "epoch": 0.19889807162534434, + "grad_norm": 5.512964725494385, + "learning_rate": 9.128471662509811e-06, + "loss": 0.3932, + "step": 1805 + }, + { + "epoch": 0.19900826446280992, + "grad_norm": 5.863252639770508, + "learning_rate": 9.127485085383841e-06, + "loss": 0.4458, + "step": 1806 + }, + { + "epoch": 0.19911845730027547, + "grad_norm": 7.236845016479492, + "learning_rate": 9.126498003543387e-06, + "loss": 0.42, + "step": 1807 + }, + { + "epoch": 0.19922865013774105, + "grad_norm": 4.469096660614014, + "learning_rate": 9.125510417109152e-06, + "loss": 0.445, + "step": 1808 + }, + { + "epoch": 0.1993388429752066, + "grad_norm": 7.3282341957092285, + "learning_rate": 9.1245223262019e-06, + "loss": 0.4747, + "step": 1809 + }, + { + "epoch": 0.19944903581267218, + "grad_norm": 4.438247203826904, + "learning_rate": 9.123533730942456e-06, + "loss": 0.3745, + "step": 1810 + }, + { + "epoch": 0.19955922865013775, + "grad_norm": 8.355430603027344, + "learning_rate": 9.122544631451703e-06, + "loss": 0.4216, + "step": 1811 + }, + { + "epoch": 0.1996694214876033, + "grad_norm": 6.029199600219727, + "learning_rate": 9.121555027850597e-06, + "loss": 0.4154, + "step": 1812 + }, + { + "epoch": 0.19977961432506888, + "grad_norm": 4.891571998596191, + "learning_rate": 9.12056492026014e-06, + "loss": 0.4122, + "step": 1813 + }, + { + "epoch": 0.19988980716253443, + "grad_norm": 8.599620819091797, + "learning_rate": 9.11957430880141e-06, + "loss": 0.4457, + "step": 1814 + }, + { + "epoch": 0.2, + "grad_norm": 6.806952953338623, + "learning_rate": 9.118583193595536e-06, + "loss": 0.4634, + "step": 1815 + }, + { + "epoch": 0.20011019283746556, + "grad_norm": 7.122880935668945, + "learning_rate": 9.117591574763714e-06, + "loss": 0.4144, + "step": 1816 + }, + { + "epoch": 0.20011019283746556, + "eval_loss": 0.45613893866539, + "eval_runtime": 41.9691, + "eval_samples_per_second": 17.489, + "eval_steps_per_second": 2.192, + "step": 1816 + }, + { + "epoch": 0.20022038567493114, + "grad_norm": 8.090353965759277, + "learning_rate": 9.116599452427201e-06, + "loss": 0.4032, + "step": 1817 + }, + { + "epoch": 0.2003305785123967, + "grad_norm": 6.1090593338012695, + "learning_rate": 9.115606826707317e-06, + "loss": 0.4011, + "step": 1818 + }, + { + "epoch": 0.20044077134986227, + "grad_norm": 12.8853120803833, + "learning_rate": 9.114613697725438e-06, + "loss": 0.4132, + "step": 1819 + }, + { + "epoch": 0.20055096418732782, + "grad_norm": 5.833970546722412, + "learning_rate": 9.113620065603008e-06, + "loss": 0.4547, + "step": 1820 + }, + { + "epoch": 0.2006611570247934, + "grad_norm": 6.591116428375244, + "learning_rate": 9.112625930461528e-06, + "loss": 0.3708, + "step": 1821 + }, + { + "epoch": 0.20077134986225895, + "grad_norm": 15.06023120880127, + "learning_rate": 9.111631292422562e-06, + "loss": 0.4409, + "step": 1822 + }, + { + "epoch": 0.20088154269972452, + "grad_norm": 13.558335304260254, + "learning_rate": 9.11063615160774e-06, + "loss": 0.4381, + "step": 1823 + }, + { + "epoch": 0.20099173553719007, + "grad_norm": 6.851561069488525, + "learning_rate": 9.109640508138742e-06, + "loss": 0.4188, + "step": 1824 + }, + { + "epoch": 0.20110192837465565, + "grad_norm": 7.203578948974609, + "learning_rate": 9.10864436213732e-06, + "loss": 0.4574, + "step": 1825 
+ }, + { + "epoch": 0.2012121212121212, + "grad_norm": 6.365950107574463, + "learning_rate": 9.107647713725287e-06, + "loss": 0.4895, + "step": 1826 + }, + { + "epoch": 0.20132231404958678, + "grad_norm": 6.2642388343811035, + "learning_rate": 9.10665056302451e-06, + "loss": 0.4239, + "step": 1827 + }, + { + "epoch": 0.20143250688705233, + "grad_norm": 10.329350471496582, + "learning_rate": 9.105652910156924e-06, + "loss": 0.4563, + "step": 1828 + }, + { + "epoch": 0.2015426997245179, + "grad_norm": 7.78670597076416, + "learning_rate": 9.104654755244524e-06, + "loss": 0.396, + "step": 1829 + }, + { + "epoch": 0.20165289256198346, + "grad_norm": 9.769115447998047, + "learning_rate": 9.103656098409364e-06, + "loss": 0.4818, + "step": 1830 + }, + { + "epoch": 0.20176308539944904, + "grad_norm": 5.7953009605407715, + "learning_rate": 9.102656939773561e-06, + "loss": 0.481, + "step": 1831 + }, + { + "epoch": 0.2018732782369146, + "grad_norm": 6.397300720214844, + "learning_rate": 9.101657279459297e-06, + "loss": 0.4416, + "step": 1832 + }, + { + "epoch": 0.20198347107438017, + "grad_norm": 6.988760948181152, + "learning_rate": 9.10065711758881e-06, + "loss": 0.4666, + "step": 1833 + }, + { + "epoch": 0.20209366391184572, + "grad_norm": 5.883671760559082, + "learning_rate": 9.099656454284396e-06, + "loss": 0.4417, + "step": 1834 + }, + { + "epoch": 0.2022038567493113, + "grad_norm": 7.063892841339111, + "learning_rate": 9.098655289668426e-06, + "loss": 0.4467, + "step": 1835 + }, + { + "epoch": 0.20231404958677687, + "grad_norm": 5.584319591522217, + "learning_rate": 9.097653623863319e-06, + "loss": 0.4041, + "step": 1836 + }, + { + "epoch": 0.20242424242424242, + "grad_norm": 8.371783256530762, + "learning_rate": 9.09665145699156e-06, + "loss": 0.5629, + "step": 1837 + }, + { + "epoch": 0.202534435261708, + "grad_norm": 4.433701992034912, + "learning_rate": 9.095648789175695e-06, + "loss": 0.4506, + "step": 1838 + }, + { + "epoch": 0.20264462809917355, + "grad_norm": 6.148390293121338, + "learning_rate": 9.094645620538334e-06, + "loss": 0.5054, + "step": 1839 + }, + { + "epoch": 0.20275482093663913, + "grad_norm": 5.67417573928833, + "learning_rate": 9.093641951202143e-06, + "loss": 0.393, + "step": 1840 + }, + { + "epoch": 0.20286501377410468, + "grad_norm": 12.673815727233887, + "learning_rate": 9.092637781289856e-06, + "loss": 0.4929, + "step": 1841 + }, + { + "epoch": 0.20297520661157026, + "grad_norm": 5.681674003601074, + "learning_rate": 9.09163311092426e-06, + "loss": 0.3919, + "step": 1842 + }, + { + "epoch": 0.2030853994490358, + "grad_norm": 5.172900676727295, + "learning_rate": 9.090627940228211e-06, + "loss": 0.502, + "step": 1843 + }, + { + "epoch": 0.2031955922865014, + "grad_norm": 14.66736888885498, + "learning_rate": 9.089622269324619e-06, + "loss": 0.4077, + "step": 1844 + }, + { + "epoch": 0.20330578512396694, + "grad_norm": 8.680099487304688, + "learning_rate": 9.088616098336461e-06, + "loss": 0.4933, + "step": 1845 + }, + { + "epoch": 0.20341597796143251, + "grad_norm": 6.403596878051758, + "learning_rate": 9.087609427386774e-06, + "loss": 0.4898, + "step": 1846 + }, + { + "epoch": 0.20352617079889807, + "grad_norm": 15.408370018005371, + "learning_rate": 9.086602256598654e-06, + "loss": 0.5043, + "step": 1847 + }, + { + "epoch": 0.20363636363636364, + "grad_norm": 9.622623443603516, + "learning_rate": 9.085594586095256e-06, + "loss": 0.4478, + "step": 1848 + }, + { + "epoch": 0.2037465564738292, + "grad_norm": 6.941022872924805, + "learning_rate": 9.084586415999804e-06, + 
"loss": 0.4172, + "step": 1849 + }, + { + "epoch": 0.20385674931129477, + "grad_norm": 5.249694347381592, + "learning_rate": 9.083577746435577e-06, + "loss": 0.4757, + "step": 1850 + }, + { + "epoch": 0.20396694214876032, + "grad_norm": 8.428592681884766, + "learning_rate": 9.082568577525916e-06, + "loss": 0.4937, + "step": 1851 + }, + { + "epoch": 0.2040771349862259, + "grad_norm": 9.229290008544922, + "learning_rate": 9.081558909394223e-06, + "loss": 0.4867, + "step": 1852 + }, + { + "epoch": 0.20418732782369145, + "grad_norm": 6.427026748657227, + "learning_rate": 9.080548742163963e-06, + "loss": 0.4162, + "step": 1853 + }, + { + "epoch": 0.20429752066115703, + "grad_norm": 5.985427379608154, + "learning_rate": 9.079538075958661e-06, + "loss": 0.456, + "step": 1854 + }, + { + "epoch": 0.20440771349862258, + "grad_norm": 6.108369827270508, + "learning_rate": 9.0785269109019e-06, + "loss": 0.3918, + "step": 1855 + }, + { + "epoch": 0.20451790633608816, + "grad_norm": 10.243864059448242, + "learning_rate": 9.077515247117329e-06, + "loss": 0.3983, + "step": 1856 + }, + { + "epoch": 0.2046280991735537, + "grad_norm": 7.947391033172607, + "learning_rate": 9.076503084728655e-06, + "loss": 0.4175, + "step": 1857 + }, + { + "epoch": 0.20473829201101928, + "grad_norm": 6.859309196472168, + "learning_rate": 9.075490423859645e-06, + "loss": 0.3996, + "step": 1858 + }, + { + "epoch": 0.20484848484848484, + "grad_norm": 9.099756240844727, + "learning_rate": 9.074477264634131e-06, + "loss": 0.4904, + "step": 1859 + }, + { + "epoch": 0.2049586776859504, + "grad_norm": 8.364721298217773, + "learning_rate": 9.073463607176003e-06, + "loss": 0.3858, + "step": 1860 + }, + { + "epoch": 0.205068870523416, + "grad_norm": 6.850193977355957, + "learning_rate": 9.072449451609211e-06, + "loss": 0.4272, + "step": 1861 + }, + { + "epoch": 0.20517906336088154, + "grad_norm": 9.688082695007324, + "learning_rate": 9.071434798057767e-06, + "loss": 0.4758, + "step": 1862 + }, + { + "epoch": 0.20528925619834712, + "grad_norm": 7.483328819274902, + "learning_rate": 9.070419646645747e-06, + "loss": 0.4202, + "step": 1863 + }, + { + "epoch": 0.20539944903581267, + "grad_norm": 7.1225199699401855, + "learning_rate": 9.069403997497283e-06, + "loss": 0.4982, + "step": 1864 + }, + { + "epoch": 0.20550964187327825, + "grad_norm": 11.751843452453613, + "learning_rate": 9.068387850736572e-06, + "loss": 0.4909, + "step": 1865 + }, + { + "epoch": 0.2056198347107438, + "grad_norm": 15.210118293762207, + "learning_rate": 9.067371206487867e-06, + "loss": 0.477, + "step": 1866 + }, + { + "epoch": 0.20573002754820938, + "grad_norm": 9.114174842834473, + "learning_rate": 9.066354064875486e-06, + "loss": 0.4353, + "step": 1867 + }, + { + "epoch": 0.20584022038567493, + "grad_norm": 9.445465087890625, + "learning_rate": 9.065336426023806e-06, + "loss": 0.435, + "step": 1868 + }, + { + "epoch": 0.2059504132231405, + "grad_norm": 10.78199291229248, + "learning_rate": 9.064318290057266e-06, + "loss": 0.458, + "step": 1869 + }, + { + "epoch": 0.20606060606060606, + "grad_norm": 6.4792656898498535, + "learning_rate": 9.063299657100363e-06, + "loss": 0.4008, + "step": 1870 + }, + { + "epoch": 0.20617079889807163, + "grad_norm": 9.852862358093262, + "learning_rate": 9.06228052727766e-06, + "loss": 0.4086, + "step": 1871 + }, + { + "epoch": 0.20628099173553718, + "grad_norm": 5.965283393859863, + "learning_rate": 9.061260900713777e-06, + "loss": 0.4791, + "step": 1872 + }, + { + "epoch": 0.20639118457300276, + "grad_norm": 6.138115882873535, + 
"learning_rate": 9.060240777533394e-06, + "loss": 0.4356, + "step": 1873 + }, + { + "epoch": 0.2065013774104683, + "grad_norm": 5.7559614181518555, + "learning_rate": 9.059220157861252e-06, + "loss": 0.4752, + "step": 1874 + }, + { + "epoch": 0.2066115702479339, + "grad_norm": 4.700835704803467, + "learning_rate": 9.058199041822155e-06, + "loss": 0.4183, + "step": 1875 + }, + { + "epoch": 0.20672176308539944, + "grad_norm": 6.3946075439453125, + "learning_rate": 9.057177429540969e-06, + "loss": 0.4635, + "step": 1876 + }, + { + "epoch": 0.20683195592286502, + "grad_norm": 5.3536458015441895, + "learning_rate": 9.056155321142615e-06, + "loss": 0.3633, + "step": 1877 + }, + { + "epoch": 0.20694214876033057, + "grad_norm": 9.934891700744629, + "learning_rate": 9.055132716752077e-06, + "loss": 0.5228, + "step": 1878 + }, + { + "epoch": 0.20705234159779615, + "grad_norm": 7.625913619995117, + "learning_rate": 9.054109616494403e-06, + "loss": 0.4418, + "step": 1879 + }, + { + "epoch": 0.2071625344352617, + "grad_norm": 6.623529434204102, + "learning_rate": 9.053086020494697e-06, + "loss": 0.3677, + "step": 1880 + }, + { + "epoch": 0.20727272727272728, + "grad_norm": 4.113265037536621, + "learning_rate": 9.052061928878128e-06, + "loss": 0.478, + "step": 1881 + }, + { + "epoch": 0.20738292011019283, + "grad_norm": 8.305842399597168, + "learning_rate": 9.051037341769923e-06, + "loss": 0.4773, + "step": 1882 + }, + { + "epoch": 0.2074931129476584, + "grad_norm": 6.318554878234863, + "learning_rate": 9.050012259295368e-06, + "loss": 0.3304, + "step": 1883 + }, + { + "epoch": 0.20760330578512395, + "grad_norm": 11.366905212402344, + "learning_rate": 9.048986681579814e-06, + "loss": 0.4307, + "step": 1884 + }, + { + "epoch": 0.20771349862258953, + "grad_norm": 11.362711906433105, + "learning_rate": 9.047960608748667e-06, + "loss": 0.5466, + "step": 1885 + }, + { + "epoch": 0.2078236914600551, + "grad_norm": 10.7440824508667, + "learning_rate": 9.046934040927398e-06, + "loss": 0.493, + "step": 1886 + }, + { + "epoch": 0.20793388429752066, + "grad_norm": 7.004652500152588, + "learning_rate": 9.045906978241538e-06, + "loss": 0.3823, + "step": 1887 + }, + { + "epoch": 0.20804407713498624, + "grad_norm": 7.787408828735352, + "learning_rate": 9.044879420816676e-06, + "loss": 0.4317, + "step": 1888 + }, + { + "epoch": 0.2081542699724518, + "grad_norm": 10.921992301940918, + "learning_rate": 9.043851368778464e-06, + "loss": 0.4119, + "step": 1889 + }, + { + "epoch": 0.20826446280991737, + "grad_norm": 6.97713565826416, + "learning_rate": 9.042822822252615e-06, + "loss": 0.4869, + "step": 1890 + }, + { + "epoch": 0.20837465564738292, + "grad_norm": 5.4596428871154785, + "learning_rate": 9.041793781364898e-06, + "loss": 0.4907, + "step": 1891 + }, + { + "epoch": 0.2084848484848485, + "grad_norm": 7.856202602386475, + "learning_rate": 9.040764246241148e-06, + "loss": 0.48, + "step": 1892 + }, + { + "epoch": 0.20859504132231405, + "grad_norm": 6.265694618225098, + "learning_rate": 9.039734217007258e-06, + "loss": 0.4903, + "step": 1893 + }, + { + "epoch": 0.20870523415977962, + "grad_norm": 8.078190803527832, + "learning_rate": 9.03870369378918e-06, + "loss": 0.3312, + "step": 1894 + }, + { + "epoch": 0.20881542699724517, + "grad_norm": 15.330924987792969, + "learning_rate": 9.037672676712928e-06, + "loss": 0.3968, + "step": 1895 + }, + { + "epoch": 0.20892561983471075, + "grad_norm": 12.826431274414062, + "learning_rate": 9.036641165904575e-06, + "loss": 0.5816, + "step": 1896 + }, + { + "epoch": 
0.2090358126721763, + "grad_norm": 8.1292142868042, + "learning_rate": 9.035609161490258e-06, + "loss": 0.3688, + "step": 1897 + }, + { + "epoch": 0.20914600550964188, + "grad_norm": 14.929841995239258, + "learning_rate": 9.034576663596171e-06, + "loss": 0.5262, + "step": 1898 + }, + { + "epoch": 0.20925619834710743, + "grad_norm": 7.926431655883789, + "learning_rate": 9.03354367234857e-06, + "loss": 0.4719, + "step": 1899 + }, + { + "epoch": 0.209366391184573, + "grad_norm": 7.180224418640137, + "learning_rate": 9.032510187873769e-06, + "loss": 0.4552, + "step": 1900 + }, + { + "epoch": 0.20947658402203856, + "grad_norm": 8.119571685791016, + "learning_rate": 9.031476210298144e-06, + "loss": 0.4513, + "step": 1901 + }, + { + "epoch": 0.20958677685950414, + "grad_norm": 6.797088146209717, + "learning_rate": 9.030441739748133e-06, + "loss": 0.4122, + "step": 1902 + }, + { + "epoch": 0.2096969696969697, + "grad_norm": 15.355571746826172, + "learning_rate": 9.029406776350232e-06, + "loss": 0.4267, + "step": 1903 + }, + { + "epoch": 0.20980716253443527, + "grad_norm": 9.266566276550293, + "learning_rate": 9.028371320230996e-06, + "loss": 0.4733, + "step": 1904 + }, + { + "epoch": 0.20991735537190082, + "grad_norm": 10.276440620422363, + "learning_rate": 9.027335371517041e-06, + "loss": 0.4745, + "step": 1905 + }, + { + "epoch": 0.2100275482093664, + "grad_norm": 7.7341485023498535, + "learning_rate": 9.02629893033505e-06, + "loss": 0.4525, + "step": 1906 + }, + { + "epoch": 0.21013774104683194, + "grad_norm": 7.565461158752441, + "learning_rate": 9.025261996811752e-06, + "loss": 0.5141, + "step": 1907 + }, + { + "epoch": 0.21024793388429752, + "grad_norm": 10.702898025512695, + "learning_rate": 9.024224571073953e-06, + "loss": 0.4816, + "step": 1908 + }, + { + "epoch": 0.21035812672176307, + "grad_norm": 4.73010778427124, + "learning_rate": 9.023186653248506e-06, + "loss": 0.426, + "step": 1909 + }, + { + "epoch": 0.21046831955922865, + "grad_norm": 8.425341606140137, + "learning_rate": 9.02214824346233e-06, + "loss": 0.5173, + "step": 1910 + }, + { + "epoch": 0.21057851239669423, + "grad_norm": 6.806413173675537, + "learning_rate": 9.021109341842403e-06, + "loss": 0.431, + "step": 1911 + }, + { + "epoch": 0.21068870523415978, + "grad_norm": 8.63928508758545, + "learning_rate": 9.020069948515764e-06, + "loss": 0.4368, + "step": 1912 + }, + { + "epoch": 0.21079889807162536, + "grad_norm": 9.580857276916504, + "learning_rate": 9.01903006360951e-06, + "loss": 0.3569, + "step": 1913 + }, + { + "epoch": 0.2109090909090909, + "grad_norm": 6.788583755493164, + "learning_rate": 9.0179896872508e-06, + "loss": 0.4989, + "step": 1914 + }, + { + "epoch": 0.21101928374655649, + "grad_norm": 7.399003982543945, + "learning_rate": 9.016948819566855e-06, + "loss": 0.4294, + "step": 1915 + }, + { + "epoch": 0.21112947658402204, + "grad_norm": 9.5567045211792, + "learning_rate": 9.01590746068495e-06, + "loss": 0.4322, + "step": 1916 + }, + { + "epoch": 0.2112396694214876, + "grad_norm": 4.259730339050293, + "learning_rate": 9.014865610732429e-06, + "loss": 0.3784, + "step": 1917 + }, + { + "epoch": 0.21134986225895316, + "grad_norm": 8.83730697631836, + "learning_rate": 9.013823269836683e-06, + "loss": 0.4126, + "step": 1918 + }, + { + "epoch": 0.21146005509641874, + "grad_norm": 6.528851509094238, + "learning_rate": 9.012780438125178e-06, + "loss": 0.4805, + "step": 1919 + }, + { + "epoch": 0.2115702479338843, + "grad_norm": 15.830177307128906, + "learning_rate": 9.01173711572543e-06, + "loss": 0.45, + "step": 
1920 + }, + { + "epoch": 0.21168044077134987, + "grad_norm": 7.5151872634887695, + "learning_rate": 9.010693302765018e-06, + "loss": 0.4472, + "step": 1921 + }, + { + "epoch": 0.21179063360881542, + "grad_norm": 7.604508399963379, + "learning_rate": 9.009648999371581e-06, + "loss": 0.489, + "step": 1922 + }, + { + "epoch": 0.211900826446281, + "grad_norm": 5.260583400726318, + "learning_rate": 9.008604205672818e-06, + "loss": 0.4191, + "step": 1923 + }, + { + "epoch": 0.21201101928374655, + "grad_norm": 9.22535228729248, + "learning_rate": 9.007558921796487e-06, + "loss": 0.3623, + "step": 1924 + }, + { + "epoch": 0.21212121212121213, + "grad_norm": 11.922446250915527, + "learning_rate": 9.006513147870406e-06, + "loss": 0.4881, + "step": 1925 + }, + { + "epoch": 0.21223140495867768, + "grad_norm": 6.315326690673828, + "learning_rate": 9.005466884022457e-06, + "loss": 0.488, + "step": 1926 + }, + { + "epoch": 0.21234159779614326, + "grad_norm": 9.142075538635254, + "learning_rate": 9.004420130380576e-06, + "loss": 0.4033, + "step": 1927 + }, + { + "epoch": 0.2124517906336088, + "grad_norm": 7.655306339263916, + "learning_rate": 9.003372887072761e-06, + "loss": 0.3743, + "step": 1928 + }, + { + "epoch": 0.21256198347107438, + "grad_norm": 4.443755149841309, + "learning_rate": 9.002325154227073e-06, + "loss": 0.3184, + "step": 1929 + }, + { + "epoch": 0.21267217630853993, + "grad_norm": 7.230881690979004, + "learning_rate": 9.001276931971628e-06, + "loss": 0.388, + "step": 1930 + }, + { + "epoch": 0.2127823691460055, + "grad_norm": 9.036759376525879, + "learning_rate": 9.000228220434604e-06, + "loss": 0.4088, + "step": 1931 + }, + { + "epoch": 0.21289256198347106, + "grad_norm": 8.434646606445312, + "learning_rate": 8.999179019744239e-06, + "loss": 0.4923, + "step": 1932 + }, + { + "epoch": 0.21300275482093664, + "grad_norm": 9.558442115783691, + "learning_rate": 8.998129330028833e-06, + "loss": 0.4631, + "step": 1933 + }, + { + "epoch": 0.2131129476584022, + "grad_norm": 7.401463031768799, + "learning_rate": 8.99707915141674e-06, + "loss": 0.3914, + "step": 1934 + }, + { + "epoch": 0.21322314049586777, + "grad_norm": 7.793752670288086, + "learning_rate": 8.99602848403638e-06, + "loss": 0.3831, + "step": 1935 + }, + { + "epoch": 0.21333333333333335, + "grad_norm": 9.429425239562988, + "learning_rate": 8.994977328016226e-06, + "loss": 0.4538, + "step": 1936 + }, + { + "epoch": 0.2134435261707989, + "grad_norm": 8.313745498657227, + "learning_rate": 8.993925683484821e-06, + "loss": 0.3812, + "step": 1937 + }, + { + "epoch": 0.21355371900826448, + "grad_norm": 11.815062522888184, + "learning_rate": 8.992873550570758e-06, + "loss": 0.5145, + "step": 1938 + }, + { + "epoch": 0.21366391184573003, + "grad_norm": 10.813628196716309, + "learning_rate": 8.991820929402692e-06, + "loss": 0.5057, + "step": 1939 + }, + { + "epoch": 0.2137741046831956, + "grad_norm": 11.73471736907959, + "learning_rate": 8.990767820109341e-06, + "loss": 0.5166, + "step": 1940 + }, + { + "epoch": 0.21388429752066115, + "grad_norm": 12.178422927856445, + "learning_rate": 8.989714222819479e-06, + "loss": 0.5423, + "step": 1941 + }, + { + "epoch": 0.21399449035812673, + "grad_norm": 6.593087673187256, + "learning_rate": 8.988660137661942e-06, + "loss": 0.5057, + "step": 1942 + }, + { + "epoch": 0.21410468319559228, + "grad_norm": 5.574822902679443, + "learning_rate": 8.987605564765628e-06, + "loss": 0.4922, + "step": 1943 + }, + { + "epoch": 0.21421487603305786, + "grad_norm": 10.696877479553223, + "learning_rate": 
8.986550504259487e-06, + "loss": 0.4365, + "step": 1944 + }, + { + "epoch": 0.2143250688705234, + "grad_norm": 10.210994720458984, + "learning_rate": 8.985494956272536e-06, + "loss": 0.508, + "step": 1945 + }, + { + "epoch": 0.214435261707989, + "grad_norm": 6.919703483581543, + "learning_rate": 8.984438920933847e-06, + "loss": 0.443, + "step": 1946 + }, + { + "epoch": 0.21454545454545454, + "grad_norm": 14.901093482971191, + "learning_rate": 8.983382398372555e-06, + "loss": 0.4736, + "step": 1947 + }, + { + "epoch": 0.21465564738292012, + "grad_norm": 9.82280158996582, + "learning_rate": 8.982325388717853e-06, + "loss": 0.3837, + "step": 1948 + }, + { + "epoch": 0.21476584022038567, + "grad_norm": 4.7476301193237305, + "learning_rate": 8.981267892098993e-06, + "loss": 0.3799, + "step": 1949 + }, + { + "epoch": 0.21487603305785125, + "grad_norm": 5.6726555824279785, + "learning_rate": 8.980209908645286e-06, + "loss": 0.4045, + "step": 1950 + }, + { + "epoch": 0.2149862258953168, + "grad_norm": 7.372026443481445, + "learning_rate": 8.979151438486105e-06, + "loss": 0.419, + "step": 1951 + }, + { + "epoch": 0.21509641873278237, + "grad_norm": 14.011527061462402, + "learning_rate": 8.978092481750883e-06, + "loss": 0.4625, + "step": 1952 + }, + { + "epoch": 0.21520661157024792, + "grad_norm": 8.68128490447998, + "learning_rate": 8.977033038569106e-06, + "loss": 0.4081, + "step": 1953 + }, + { + "epoch": 0.2153168044077135, + "grad_norm": 4.661523342132568, + "learning_rate": 8.975973109070328e-06, + "loss": 0.3807, + "step": 1954 + }, + { + "epoch": 0.21542699724517905, + "grad_norm": 8.467016220092773, + "learning_rate": 8.974912693384156e-06, + "loss": 0.4912, + "step": 1955 + }, + { + "epoch": 0.21553719008264463, + "grad_norm": 10.955033302307129, + "learning_rate": 8.973851791640262e-06, + "loss": 0.5152, + "step": 1956 + }, + { + "epoch": 0.21564738292011018, + "grad_norm": 12.558293342590332, + "learning_rate": 8.972790403968374e-06, + "loss": 0.5096, + "step": 1957 + }, + { + "epoch": 0.21575757575757576, + "grad_norm": 6.917547225952148, + "learning_rate": 8.971728530498276e-06, + "loss": 0.4213, + "step": 1958 + }, + { + "epoch": 0.2158677685950413, + "grad_norm": 13.767768859863281, + "learning_rate": 8.970666171359821e-06, + "loss": 0.4437, + "step": 1959 + }, + { + "epoch": 0.2159779614325069, + "grad_norm": 7.672903060913086, + "learning_rate": 8.969603326682911e-06, + "loss": 0.5086, + "step": 1960 + }, + { + "epoch": 0.21608815426997247, + "grad_norm": 7.62104606628418, + "learning_rate": 8.968539996597514e-06, + "loss": 0.4778, + "step": 1961 + }, + { + "epoch": 0.21619834710743802, + "grad_norm": 9.937533378601074, + "learning_rate": 8.967476181233656e-06, + "loss": 0.5214, + "step": 1962 + }, + { + "epoch": 0.2163085399449036, + "grad_norm": 15.488443374633789, + "learning_rate": 8.966411880721422e-06, + "loss": 0.6193, + "step": 1963 + }, + { + "epoch": 0.21641873278236914, + "grad_norm": 6.0433735847473145, + "learning_rate": 8.965347095190956e-06, + "loss": 0.4644, + "step": 1964 + }, + { + "epoch": 0.21652892561983472, + "grad_norm": 9.517822265625, + "learning_rate": 8.964281824772458e-06, + "loss": 0.3872, + "step": 1965 + }, + { + "epoch": 0.21663911845730027, + "grad_norm": 9.647416114807129, + "learning_rate": 8.963216069596197e-06, + "loss": 0.5504, + "step": 1966 + }, + { + "epoch": 0.21674931129476585, + "grad_norm": 7.2650532722473145, + "learning_rate": 8.962149829792489e-06, + "loss": 0.3576, + "step": 1967 + }, + { + "epoch": 0.2168595041322314, + "grad_norm": 
6.231147289276123, + "learning_rate": 8.961083105491718e-06, + "loss": 0.4159, + "step": 1968 + }, + { + "epoch": 0.21696969696969698, + "grad_norm": 7.0674567222595215, + "learning_rate": 8.960015896824324e-06, + "loss": 0.4666, + "step": 1969 + }, + { + "epoch": 0.21707988980716253, + "grad_norm": 9.527992248535156, + "learning_rate": 8.958948203920808e-06, + "loss": 0.3766, + "step": 1970 + }, + { + "epoch": 0.2171900826446281, + "grad_norm": 9.872891426086426, + "learning_rate": 8.957880026911727e-06, + "loss": 0.4628, + "step": 1971 + }, + { + "epoch": 0.21730027548209366, + "grad_norm": 8.264708518981934, + "learning_rate": 8.956811365927702e-06, + "loss": 0.3907, + "step": 1972 + }, + { + "epoch": 0.21741046831955924, + "grad_norm": 4.303475379943848, + "learning_rate": 8.955742221099405e-06, + "loss": 0.3753, + "step": 1973 + }, + { + "epoch": 0.21752066115702479, + "grad_norm": 7.1407318115234375, + "learning_rate": 8.954672592557578e-06, + "loss": 0.4638, + "step": 1974 + }, + { + "epoch": 0.21763085399449036, + "grad_norm": 6.071427822113037, + "learning_rate": 8.953602480433016e-06, + "loss": 0.4365, + "step": 1975 + }, + { + "epoch": 0.21774104683195591, + "grad_norm": 4.606165409088135, + "learning_rate": 8.95253188485657e-06, + "loss": 0.4439, + "step": 1976 + }, + { + "epoch": 0.2178512396694215, + "grad_norm": 8.184374809265137, + "learning_rate": 8.951460805959159e-06, + "loss": 0.5013, + "step": 1977 + }, + { + "epoch": 0.21796143250688704, + "grad_norm": 4.882009983062744, + "learning_rate": 8.95038924387175e-06, + "loss": 0.3109, + "step": 1978 + }, + { + "epoch": 0.21807162534435262, + "grad_norm": 10.992392539978027, + "learning_rate": 8.949317198725379e-06, + "loss": 0.5092, + "step": 1979 + }, + { + "epoch": 0.21818181818181817, + "grad_norm": 11.858559608459473, + "learning_rate": 8.948244670651137e-06, + "loss": 0.5491, + "step": 1980 + }, + { + "epoch": 0.21829201101928375, + "grad_norm": 5.792891502380371, + "learning_rate": 8.947171659780172e-06, + "loss": 0.4797, + "step": 1981 + }, + { + "epoch": 0.2184022038567493, + "grad_norm": 8.452299118041992, + "learning_rate": 8.946098166243696e-06, + "loss": 0.4986, + "step": 1982 + }, + { + "epoch": 0.21851239669421488, + "grad_norm": 4.906167507171631, + "learning_rate": 8.945024190172975e-06, + "loss": 0.4088, + "step": 1983 + }, + { + "epoch": 0.21862258953168043, + "grad_norm": 7.811947345733643, + "learning_rate": 8.943949731699337e-06, + "loss": 0.4175, + "step": 1984 + }, + { + "epoch": 0.218732782369146, + "grad_norm": 6.3078083992004395, + "learning_rate": 8.94287479095417e-06, + "loss": 0.4038, + "step": 1985 + }, + { + "epoch": 0.21884297520661158, + "grad_norm": 8.789570808410645, + "learning_rate": 8.941799368068916e-06, + "loss": 0.469, + "step": 1986 + }, + { + "epoch": 0.21895316804407713, + "grad_norm": 7.786570072174072, + "learning_rate": 8.940723463175083e-06, + "loss": 0.4931, + "step": 1987 + }, + { + "epoch": 0.2190633608815427, + "grad_norm": 6.052577972412109, + "learning_rate": 8.93964707640423e-06, + "loss": 0.4266, + "step": 1988 + }, + { + "epoch": 0.21917355371900826, + "grad_norm": 11.581875801086426, + "learning_rate": 8.938570207887981e-06, + "loss": 0.4935, + "step": 1989 + }, + { + "epoch": 0.21928374655647384, + "grad_norm": 7.972194671630859, + "learning_rate": 8.93749285775802e-06, + "loss": 0.5139, + "step": 1990 + }, + { + "epoch": 0.2193939393939394, + "grad_norm": 6.227068901062012, + "learning_rate": 8.93641502614608e-06, + "loss": 0.4265, + "step": 1991 + }, + { + 
"epoch": 0.21950413223140497, + "grad_norm": 8.917132377624512, + "learning_rate": 8.935336713183965e-06, + "loss": 0.4938, + "step": 1992 + }, + { + "epoch": 0.21961432506887052, + "grad_norm": 11.940751075744629, + "learning_rate": 8.934257919003532e-06, + "loss": 0.5562, + "step": 1993 + }, + { + "epoch": 0.2197245179063361, + "grad_norm": 7.603494644165039, + "learning_rate": 8.933178643736696e-06, + "loss": 0.4801, + "step": 1994 + }, + { + "epoch": 0.21983471074380165, + "grad_norm": 7.002358913421631, + "learning_rate": 8.932098887515432e-06, + "loss": 0.4775, + "step": 1995 + }, + { + "epoch": 0.21994490358126723, + "grad_norm": 7.376613616943359, + "learning_rate": 8.931018650471775e-06, + "loss": 0.4974, + "step": 1996 + }, + { + "epoch": 0.22005509641873278, + "grad_norm": 6.262222766876221, + "learning_rate": 8.929937932737818e-06, + "loss": 0.4584, + "step": 1997 + }, + { + "epoch": 0.22016528925619835, + "grad_norm": 3.9996824264526367, + "learning_rate": 8.928856734445712e-06, + "loss": 0.4186, + "step": 1998 + }, + { + "epoch": 0.2202754820936639, + "grad_norm": 7.933164119720459, + "learning_rate": 8.927775055727668e-06, + "loss": 0.4332, + "step": 1999 + }, + { + "epoch": 0.22038567493112948, + "grad_norm": 7.3600382804870605, + "learning_rate": 8.926692896715955e-06, + "loss": 0.5018, + "step": 2000 + }, + { + "epoch": 0.22049586776859503, + "grad_norm": 6.7328410148620605, + "learning_rate": 8.9256102575429e-06, + "loss": 0.421, + "step": 2001 + }, + { + "epoch": 0.2206060606060606, + "grad_norm": 10.002666473388672, + "learning_rate": 8.92452713834089e-06, + "loss": 0.4698, + "step": 2002 + }, + { + "epoch": 0.22071625344352616, + "grad_norm": 6.482827186584473, + "learning_rate": 8.923443539242371e-06, + "loss": 0.4616, + "step": 2003 + }, + { + "epoch": 0.22082644628099174, + "grad_norm": 12.589330673217773, + "learning_rate": 8.922359460379848e-06, + "loss": 0.4643, + "step": 2004 + }, + { + "epoch": 0.2209366391184573, + "grad_norm": 5.59968376159668, + "learning_rate": 8.92127490188588e-06, + "loss": 0.4637, + "step": 2005 + }, + { + "epoch": 0.22104683195592287, + "grad_norm": 4.8191609382629395, + "learning_rate": 8.920189863893092e-06, + "loss": 0.385, + "step": 2006 + }, + { + "epoch": 0.22115702479338842, + "grad_norm": 8.757711410522461, + "learning_rate": 8.919104346534162e-06, + "loss": 0.4544, + "step": 2007 + }, + { + "epoch": 0.221267217630854, + "grad_norm": 8.402291297912598, + "learning_rate": 8.918018349941829e-06, + "loss": 0.4519, + "step": 2008 + }, + { + "epoch": 0.22137741046831955, + "grad_norm": 17.948301315307617, + "learning_rate": 8.916931874248889e-06, + "loss": 0.4922, + "step": 2009 + }, + { + "epoch": 0.22148760330578512, + "grad_norm": 8.477241516113281, + "learning_rate": 8.9158449195882e-06, + "loss": 0.4376, + "step": 2010 + }, + { + "epoch": 0.22159779614325067, + "grad_norm": 10.58144760131836, + "learning_rate": 8.914757486092676e-06, + "loss": 0.4337, + "step": 2011 + }, + { + "epoch": 0.22170798898071625, + "grad_norm": 5.602427005767822, + "learning_rate": 8.913669573895285e-06, + "loss": 0.467, + "step": 2012 + }, + { + "epoch": 0.22181818181818183, + "grad_norm": 7.449388027191162, + "learning_rate": 8.912581183129067e-06, + "loss": 0.4084, + "step": 2013 + }, + { + "epoch": 0.22192837465564738, + "grad_norm": 8.887518882751465, + "learning_rate": 8.911492313927104e-06, + "loss": 0.4231, + "step": 2014 + }, + { + "epoch": 0.22203856749311296, + "grad_norm": 7.231354713439941, + "learning_rate": 8.910402966422549e-06, + 
"loss": 0.3867, + "step": 2015 + }, + { + "epoch": 0.2221487603305785, + "grad_norm": 10.660465240478516, + "learning_rate": 8.909313140748607e-06, + "loss": 0.4835, + "step": 2016 + }, + { + "epoch": 0.2222589531680441, + "grad_norm": 11.831543922424316, + "learning_rate": 8.908222837038545e-06, + "loss": 0.3749, + "step": 2017 + }, + { + "epoch": 0.22236914600550964, + "grad_norm": 7.562001705169678, + "learning_rate": 8.907132055425685e-06, + "loss": 0.5091, + "step": 2018 + }, + { + "epoch": 0.22247933884297522, + "grad_norm": 5.9218597412109375, + "learning_rate": 8.906040796043409e-06, + "loss": 0.403, + "step": 2019 + }, + { + "epoch": 0.22258953168044077, + "grad_norm": 7.158297538757324, + "learning_rate": 8.904949059025158e-06, + "loss": 0.4251, + "step": 2020 + }, + { + "epoch": 0.22269972451790634, + "grad_norm": 7.283941268920898, + "learning_rate": 8.903856844504435e-06, + "loss": 0.4382, + "step": 2021 + }, + { + "epoch": 0.2228099173553719, + "grad_norm": 14.837738037109375, + "learning_rate": 8.902764152614792e-06, + "loss": 0.5329, + "step": 2022 + }, + { + "epoch": 0.22292011019283747, + "grad_norm": 7.059011459350586, + "learning_rate": 8.901670983489848e-06, + "loss": 0.4003, + "step": 2023 + }, + { + "epoch": 0.22303030303030302, + "grad_norm": 17.28598976135254, + "learning_rate": 8.900577337263274e-06, + "loss": 0.4001, + "step": 2024 + }, + { + "epoch": 0.2231404958677686, + "grad_norm": 6.224469184875488, + "learning_rate": 8.899483214068807e-06, + "loss": 0.4522, + "step": 2025 + }, + { + "epoch": 0.22325068870523415, + "grad_norm": 7.4331374168396, + "learning_rate": 8.898388614040235e-06, + "loss": 0.4645, + "step": 2026 + }, + { + "epoch": 0.22336088154269973, + "grad_norm": 9.30374813079834, + "learning_rate": 8.897293537311408e-06, + "loss": 0.4421, + "step": 2027 + }, + { + "epoch": 0.22347107438016528, + "grad_norm": 11.287163734436035, + "learning_rate": 8.896197984016233e-06, + "loss": 0.4098, + "step": 2028 + }, + { + "epoch": 0.22358126721763086, + "grad_norm": 9.276102066040039, + "learning_rate": 8.895101954288675e-06, + "loss": 0.463, + "step": 2029 + }, + { + "epoch": 0.2236914600550964, + "grad_norm": 9.553190231323242, + "learning_rate": 8.894005448262762e-06, + "loss": 0.4344, + "step": 2030 + }, + { + "epoch": 0.223801652892562, + "grad_norm": 4.667308807373047, + "learning_rate": 8.89290846607257e-06, + "loss": 0.3673, + "step": 2031 + }, + { + "epoch": 0.22391184573002754, + "grad_norm": 8.357771873474121, + "learning_rate": 8.891811007852245e-06, + "loss": 0.4999, + "step": 2032 + }, + { + "epoch": 0.22402203856749311, + "grad_norm": 6.221590995788574, + "learning_rate": 8.890713073735983e-06, + "loss": 0.3871, + "step": 2033 + }, + { + "epoch": 0.22413223140495867, + "grad_norm": 5.789371967315674, + "learning_rate": 8.889614663858041e-06, + "loss": 0.4755, + "step": 2034 + }, + { + "epoch": 0.22424242424242424, + "grad_norm": 12.528146743774414, + "learning_rate": 8.888515778352735e-06, + "loss": 0.4535, + "step": 2035 + }, + { + "epoch": 0.2243526170798898, + "grad_norm": 8.438453674316406, + "learning_rate": 8.887416417354437e-06, + "loss": 0.5561, + "step": 2036 + }, + { + "epoch": 0.22446280991735537, + "grad_norm": 10.135050773620605, + "learning_rate": 8.886316580997578e-06, + "loss": 0.448, + "step": 2037 + }, + { + "epoch": 0.22457300275482095, + "grad_norm": 5.991178512573242, + "learning_rate": 8.88521626941665e-06, + "loss": 0.4274, + "step": 2038 + }, + { + "epoch": 0.2246831955922865, + "grad_norm": 7.5936055183410645, + 
"learning_rate": 8.884115482746199e-06, + "loss": 0.4437, + "step": 2039 + }, + { + "epoch": 0.22479338842975208, + "grad_norm": 6.577114105224609, + "learning_rate": 8.883014221120829e-06, + "loss": 0.446, + "step": 2040 + }, + { + "epoch": 0.22490358126721763, + "grad_norm": 12.858460426330566, + "learning_rate": 8.881912484675207e-06, + "loss": 0.5337, + "step": 2041 + }, + { + "epoch": 0.2250137741046832, + "grad_norm": 5.964365005493164, + "learning_rate": 8.880810273544053e-06, + "loss": 0.4245, + "step": 2042 + }, + { + "epoch": 0.22512396694214876, + "grad_norm": 6.543353080749512, + "learning_rate": 8.879707587862148e-06, + "loss": 0.4069, + "step": 2043 + }, + { + "epoch": 0.22523415977961433, + "grad_norm": 7.215315341949463, + "learning_rate": 8.878604427764326e-06, + "loss": 0.4115, + "step": 2044 + }, + { + "epoch": 0.22534435261707988, + "grad_norm": 6.376750469207764, + "learning_rate": 8.87750079338549e-06, + "loss": 0.3905, + "step": 2045 + }, + { + "epoch": 0.22545454545454546, + "grad_norm": 10.623973846435547, + "learning_rate": 8.876396684860588e-06, + "loss": 0.4954, + "step": 2046 + }, + { + "epoch": 0.225564738292011, + "grad_norm": 9.294370651245117, + "learning_rate": 8.875292102324634e-06, + "loss": 0.5065, + "step": 2047 + }, + { + "epoch": 0.2256749311294766, + "grad_norm": 7.6773271560668945, + "learning_rate": 8.874187045912697e-06, + "loss": 0.4181, + "step": 2048 + }, + { + "epoch": 0.22578512396694214, + "grad_norm": 8.934910774230957, + "learning_rate": 8.873081515759908e-06, + "loss": 0.4274, + "step": 2049 + }, + { + "epoch": 0.22589531680440772, + "grad_norm": 5.578096866607666, + "learning_rate": 8.871975512001448e-06, + "loss": 0.387, + "step": 2050 + }, + { + "epoch": 0.22600550964187327, + "grad_norm": 7.378517150878906, + "learning_rate": 8.870869034772563e-06, + "loss": 0.3857, + "step": 2051 + }, + { + "epoch": 0.22611570247933885, + "grad_norm": 8.191890716552734, + "learning_rate": 8.869762084208553e-06, + "loss": 0.4835, + "step": 2052 + }, + { + "epoch": 0.2262258953168044, + "grad_norm": 12.504240989685059, + "learning_rate": 8.86865466044478e-06, + "loss": 0.5257, + "step": 2053 + }, + { + "epoch": 0.22633608815426998, + "grad_norm": 8.958276748657227, + "learning_rate": 8.867546763616662e-06, + "loss": 0.4435, + "step": 2054 + }, + { + "epoch": 0.22644628099173553, + "grad_norm": 15.308467864990234, + "learning_rate": 8.86643839385967e-06, + "loss": 0.6179, + "step": 2055 + }, + { + "epoch": 0.2265564738292011, + "grad_norm": 7.645038604736328, + "learning_rate": 8.865329551309338e-06, + "loss": 0.4695, + "step": 2056 + }, + { + "epoch": 0.22666666666666666, + "grad_norm": 7.3347368240356445, + "learning_rate": 8.864220236101261e-06, + "loss": 0.3826, + "step": 2057 + }, + { + "epoch": 0.22677685950413223, + "grad_norm": 7.0317769050598145, + "learning_rate": 8.863110448371082e-06, + "loss": 0.3772, + "step": 2058 + }, + { + "epoch": 0.22688705234159778, + "grad_norm": 7.45179557800293, + "learning_rate": 8.862000188254512e-06, + "loss": 0.5059, + "step": 2059 + }, + { + "epoch": 0.22699724517906336, + "grad_norm": 3.892314910888672, + "learning_rate": 8.860889455887312e-06, + "loss": 0.4543, + "step": 2060 + }, + { + "epoch": 0.2271074380165289, + "grad_norm": 6.642572402954102, + "learning_rate": 8.859778251405304e-06, + "loss": 0.4144, + "step": 2061 + }, + { + "epoch": 0.2272176308539945, + "grad_norm": 6.904155254364014, + "learning_rate": 8.85866657494437e-06, + "loss": 0.4312, + "step": 2062 + }, + { + "epoch": 
0.22732782369146007, + "grad_norm": 8.384150505065918, + "learning_rate": 8.857554426640445e-06, + "loss": 0.4316, + "step": 2063 + }, + { + "epoch": 0.22743801652892562, + "grad_norm": 9.582276344299316, + "learning_rate": 8.856441806629524e-06, + "loss": 0.5277, + "step": 2064 + }, + { + "epoch": 0.2275482093663912, + "grad_norm": 7.464855194091797, + "learning_rate": 8.855328715047662e-06, + "loss": 0.4663, + "step": 2065 + }, + { + "epoch": 0.22765840220385675, + "grad_norm": 10.291353225708008, + "learning_rate": 8.854215152030966e-06, + "loss": 0.4829, + "step": 2066 + }, + { + "epoch": 0.22776859504132232, + "grad_norm": 7.483318328857422, + "learning_rate": 8.853101117715609e-06, + "loss": 0.418, + "step": 2067 + }, + { + "epoch": 0.22787878787878788, + "grad_norm": 12.582602500915527, + "learning_rate": 8.851986612237809e-06, + "loss": 0.5183, + "step": 2068 + }, + { + "epoch": 0.22798898071625345, + "grad_norm": 9.944835662841797, + "learning_rate": 8.850871635733856e-06, + "loss": 0.4539, + "step": 2069 + }, + { + "epoch": 0.228099173553719, + "grad_norm": 7.1502532958984375, + "learning_rate": 8.849756188340089e-06, + "loss": 0.4384, + "step": 2070 + }, + { + "epoch": 0.22820936639118458, + "grad_norm": 7.528603553771973, + "learning_rate": 8.848640270192903e-06, + "loss": 0.4036, + "step": 2071 + }, + { + "epoch": 0.22831955922865013, + "grad_norm": 6.340047359466553, + "learning_rate": 8.84752388142876e-06, + "loss": 0.4259, + "step": 2072 + }, + { + "epoch": 0.2284297520661157, + "grad_norm": 6.14491605758667, + "learning_rate": 8.846407022184169e-06, + "loss": 0.4458, + "step": 2073 + }, + { + "epoch": 0.22853994490358126, + "grad_norm": 8.84805965423584, + "learning_rate": 8.845289692595703e-06, + "loss": 0.4089, + "step": 2074 + }, + { + "epoch": 0.22865013774104684, + "grad_norm": 5.433241367340088, + "learning_rate": 8.84417189279999e-06, + "loss": 0.4659, + "step": 2075 + }, + { + "epoch": 0.2287603305785124, + "grad_norm": 6.685497283935547, + "learning_rate": 8.843053622933716e-06, + "loss": 0.4496, + "step": 2076 + }, + { + "epoch": 0.22887052341597797, + "grad_norm": 6.667261123657227, + "learning_rate": 8.841934883133624e-06, + "loss": 0.4822, + "step": 2077 + }, + { + "epoch": 0.22898071625344352, + "grad_norm": 5.1992926597595215, + "learning_rate": 8.840815673536518e-06, + "loss": 0.3928, + "step": 2078 + }, + { + "epoch": 0.2290909090909091, + "grad_norm": 8.6360502243042, + "learning_rate": 8.839695994279253e-06, + "loss": 0.3625, + "step": 2079 + }, + { + "epoch": 0.22920110192837465, + "grad_norm": 11.109519958496094, + "learning_rate": 8.838575845498744e-06, + "loss": 0.4753, + "step": 2080 + }, + { + "epoch": 0.22931129476584022, + "grad_norm": 4.837209701538086, + "learning_rate": 8.83745522733197e-06, + "loss": 0.434, + "step": 2081 + }, + { + "epoch": 0.22942148760330577, + "grad_norm": 6.786084175109863, + "learning_rate": 8.836334139915957e-06, + "loss": 0.4481, + "step": 2082 + }, + { + "epoch": 0.22953168044077135, + "grad_norm": 8.713823318481445, + "learning_rate": 8.835212583387794e-06, + "loss": 0.4768, + "step": 2083 + }, + { + "epoch": 0.2296418732782369, + "grad_norm": 7.901473045349121, + "learning_rate": 8.834090557884625e-06, + "loss": 0.4223, + "step": 2084 + }, + { + "epoch": 0.22975206611570248, + "grad_norm": 5.997265815734863, + "learning_rate": 8.832968063543657e-06, + "loss": 0.2915, + "step": 2085 + }, + { + "epoch": 0.22986225895316803, + "grad_norm": 10.123592376708984, + "learning_rate": 8.831845100502145e-06, + "loss": 0.4141, 
+ "step": 2086 + }, + { + "epoch": 0.2299724517906336, + "grad_norm": 7.859190940856934, + "learning_rate": 8.830721668897411e-06, + "loss": 0.4084, + "step": 2087 + }, + { + "epoch": 0.2300826446280992, + "grad_norm": 7.773241996765137, + "learning_rate": 8.829597768866827e-06, + "loss": 0.405, + "step": 2088 + }, + { + "epoch": 0.23019283746556474, + "grad_norm": 6.04829216003418, + "learning_rate": 8.828473400547825e-06, + "loss": 0.3989, + "step": 2089 + }, + { + "epoch": 0.23030303030303031, + "grad_norm": 10.272017478942871, + "learning_rate": 8.827348564077897e-06, + "loss": 0.5083, + "step": 2090 + }, + { + "epoch": 0.23041322314049587, + "grad_norm": 11.257768630981445, + "learning_rate": 8.826223259594587e-06, + "loss": 0.5208, + "step": 2091 + }, + { + "epoch": 0.23052341597796144, + "grad_norm": 20.224720001220703, + "learning_rate": 8.8250974872355e-06, + "loss": 0.481, + "step": 2092 + }, + { + "epoch": 0.230633608815427, + "grad_norm": 6.637451648712158, + "learning_rate": 8.823971247138296e-06, + "loss": 0.4079, + "step": 2093 + }, + { + "epoch": 0.23074380165289257, + "grad_norm": 10.840591430664062, + "learning_rate": 8.822844539440693e-06, + "loss": 0.4461, + "step": 2094 + }, + { + "epoch": 0.23085399449035812, + "grad_norm": 7.156713962554932, + "learning_rate": 8.821717364280467e-06, + "loss": 0.4566, + "step": 2095 + }, + { + "epoch": 0.2309641873278237, + "grad_norm": 16.366134643554688, + "learning_rate": 8.820589721795451e-06, + "loss": 0.4134, + "step": 2096 + }, + { + "epoch": 0.23107438016528925, + "grad_norm": 10.64284896850586, + "learning_rate": 8.819461612123532e-06, + "loss": 0.4492, + "step": 2097 + }, + { + "epoch": 0.23118457300275483, + "grad_norm": 5.6441802978515625, + "learning_rate": 8.81833303540266e-06, + "loss": 0.456, + "step": 2098 + }, + { + "epoch": 0.23129476584022038, + "grad_norm": 7.566303730010986, + "learning_rate": 8.81720399177084e-06, + "loss": 0.4573, + "step": 2099 + }, + { + "epoch": 0.23140495867768596, + "grad_norm": 10.060577392578125, + "learning_rate": 8.816074481366128e-06, + "loss": 0.4211, + "step": 2100 + }, + { + "epoch": 0.2315151515151515, + "grad_norm": 7.7261643409729, + "learning_rate": 8.814944504326645e-06, + "loss": 0.4656, + "step": 2101 + }, + { + "epoch": 0.23162534435261709, + "grad_norm": 6.533786296844482, + "learning_rate": 8.813814060790567e-06, + "loss": 0.3302, + "step": 2102 + }, + { + "epoch": 0.23173553719008264, + "grad_norm": 7.083710193634033, + "learning_rate": 8.812683150896126e-06, + "loss": 0.3828, + "step": 2103 + }, + { + "epoch": 0.2318457300275482, + "grad_norm": 10.25291919708252, + "learning_rate": 8.811551774781608e-06, + "loss": 0.5114, + "step": 2104 + }, + { + "epoch": 0.23195592286501376, + "grad_norm": 12.033116340637207, + "learning_rate": 8.810419932585362e-06, + "loss": 0.588, + "step": 2105 + }, + { + "epoch": 0.23206611570247934, + "grad_norm": 15.983132362365723, + "learning_rate": 8.809287624445792e-06, + "loss": 0.679, + "step": 2106 + }, + { + "epoch": 0.2321763085399449, + "grad_norm": 13.827681541442871, + "learning_rate": 8.808154850501356e-06, + "loss": 0.4535, + "step": 2107 + }, + { + "epoch": 0.23228650137741047, + "grad_norm": 5.498868465423584, + "learning_rate": 8.807021610890571e-06, + "loss": 0.4278, + "step": 2108 + }, + { + "epoch": 0.23239669421487602, + "grad_norm": 7.119879722595215, + "learning_rate": 8.805887905752015e-06, + "loss": 0.3981, + "step": 2109 + }, + { + "epoch": 0.2325068870523416, + "grad_norm": 6.168232440948486, + "learning_rate": 
8.804753735224312e-06, + "loss": 0.4268, + "step": 2110 + }, + { + "epoch": 0.23261707988980715, + "grad_norm": 5.046046257019043, + "learning_rate": 8.803619099446157e-06, + "loss": 0.4618, + "step": 2111 + }, + { + "epoch": 0.23272727272727273, + "grad_norm": 4.919585227966309, + "learning_rate": 8.80248399855629e-06, + "loss": 0.4289, + "step": 2112 + }, + { + "epoch": 0.2328374655647383, + "grad_norm": 8.216880798339844, + "learning_rate": 8.801348432693518e-06, + "loss": 0.5318, + "step": 2113 + }, + { + "epoch": 0.23294765840220386, + "grad_norm": 11.217514991760254, + "learning_rate": 8.800212401996692e-06, + "loss": 0.456, + "step": 2114 + }, + { + "epoch": 0.23305785123966943, + "grad_norm": 10.725457191467285, + "learning_rate": 8.799075906604732e-06, + "loss": 0.5251, + "step": 2115 + }, + { + "epoch": 0.23316804407713498, + "grad_norm": 5.096673011779785, + "learning_rate": 8.797938946656612e-06, + "loss": 0.4617, + "step": 2116 + }, + { + "epoch": 0.23327823691460056, + "grad_norm": 6.513415336608887, + "learning_rate": 8.796801522291357e-06, + "loss": 0.4937, + "step": 2117 + }, + { + "epoch": 0.2333884297520661, + "grad_norm": 4.589240074157715, + "learning_rate": 8.795663633648055e-06, + "loss": 0.4308, + "step": 2118 + }, + { + "epoch": 0.2334986225895317, + "grad_norm": 4.732120990753174, + "learning_rate": 8.794525280865846e-06, + "loss": 0.4112, + "step": 2119 + }, + { + "epoch": 0.23360881542699724, + "grad_norm": 14.606962203979492, + "learning_rate": 8.793386464083932e-06, + "loss": 0.424, + "step": 2120 + }, + { + "epoch": 0.23371900826446282, + "grad_norm": 11.576171875, + "learning_rate": 8.792247183441572e-06, + "loss": 0.5453, + "step": 2121 + }, + { + "epoch": 0.23382920110192837, + "grad_norm": 10.318958282470703, + "learning_rate": 8.79110743907807e-06, + "loss": 0.4187, + "step": 2122 + }, + { + "epoch": 0.23393939393939395, + "grad_norm": 7.910745143890381, + "learning_rate": 8.789967231132805e-06, + "loss": 0.4416, + "step": 2123 + }, + { + "epoch": 0.2340495867768595, + "grad_norm": 8.218287467956543, + "learning_rate": 8.788826559745197e-06, + "loss": 0.4353, + "step": 2124 + }, + { + "epoch": 0.23415977961432508, + "grad_norm": 6.097707271575928, + "learning_rate": 8.787685425054729e-06, + "loss": 0.4175, + "step": 2125 + }, + { + "epoch": 0.23426997245179063, + "grad_norm": 9.37572956085205, + "learning_rate": 8.786543827200944e-06, + "loss": 0.427, + "step": 2126 + }, + { + "epoch": 0.2343801652892562, + "grad_norm": 9.844120025634766, + "learning_rate": 8.785401766323437e-06, + "loss": 0.5319, + "step": 2127 + }, + { + "epoch": 0.23449035812672175, + "grad_norm": 5.837325572967529, + "learning_rate": 8.784259242561858e-06, + "loss": 0.4415, + "step": 2128 + }, + { + "epoch": 0.23460055096418733, + "grad_norm": 9.294026374816895, + "learning_rate": 8.78311625605592e-06, + "loss": 0.4406, + "step": 2129 + }, + { + "epoch": 0.23471074380165288, + "grad_norm": 6.600424289703369, + "learning_rate": 8.781972806945385e-06, + "loss": 0.4437, + "step": 2130 + }, + { + "epoch": 0.23482093663911846, + "grad_norm": 6.484173774719238, + "learning_rate": 8.78082889537008e-06, + "loss": 0.4426, + "step": 2131 + }, + { + "epoch": 0.234931129476584, + "grad_norm": 5.145078659057617, + "learning_rate": 8.779684521469882e-06, + "loss": 0.4433, + "step": 2132 + }, + { + "epoch": 0.2350413223140496, + "grad_norm": 7.097766399383545, + "learning_rate": 8.778539685384726e-06, + "loss": 0.4173, + "step": 2133 + }, + { + "epoch": 0.23515151515151514, + "grad_norm": 
11.189253807067871, + "learning_rate": 8.777394387254604e-06, + "loss": 0.4288, + "step": 2134 + }, + { + "epoch": 0.23526170798898072, + "grad_norm": 7.566659450531006, + "learning_rate": 8.776248627219566e-06, + "loss": 0.4971, + "step": 2135 + }, + { + "epoch": 0.23537190082644627, + "grad_norm": 6.61622953414917, + "learning_rate": 8.775102405419717e-06, + "loss": 0.4815, + "step": 2136 + }, + { + "epoch": 0.23548209366391185, + "grad_norm": 5.723613262176514, + "learning_rate": 8.773955721995217e-06, + "loss": 0.487, + "step": 2137 + }, + { + "epoch": 0.23559228650137742, + "grad_norm": 5.3530049324035645, + "learning_rate": 8.772808577086285e-06, + "loss": 0.4057, + "step": 2138 + }, + { + "epoch": 0.23570247933884297, + "grad_norm": 12.398603439331055, + "learning_rate": 8.771660970833194e-06, + "loss": 0.6096, + "step": 2139 + }, + { + "epoch": 0.23581267217630855, + "grad_norm": 6.584178924560547, + "learning_rate": 8.770512903376277e-06, + "loss": 0.4814, + "step": 2140 + }, + { + "epoch": 0.2359228650137741, + "grad_norm": 6.775885581970215, + "learning_rate": 8.769364374855923e-06, + "loss": 0.4945, + "step": 2141 + }, + { + "epoch": 0.23603305785123968, + "grad_norm": 9.103728294372559, + "learning_rate": 8.76821538541257e-06, + "loss": 0.5035, + "step": 2142 + }, + { + "epoch": 0.23614325068870523, + "grad_norm": 13.791357040405273, + "learning_rate": 8.767065935186723e-06, + "loss": 0.5347, + "step": 2143 + }, + { + "epoch": 0.2362534435261708, + "grad_norm": 11.703580856323242, + "learning_rate": 8.765916024318935e-06, + "loss": 0.4276, + "step": 2144 + }, + { + "epoch": 0.23636363636363636, + "grad_norm": 6.4634623527526855, + "learning_rate": 8.76476565294982e-06, + "loss": 0.4393, + "step": 2145 + }, + { + "epoch": 0.23647382920110194, + "grad_norm": 8.717704772949219, + "learning_rate": 8.763614821220047e-06, + "loss": 0.3366, + "step": 2146 + }, + { + "epoch": 0.2365840220385675, + "grad_norm": 6.30748987197876, + "learning_rate": 8.762463529270341e-06, + "loss": 0.4848, + "step": 2147 + }, + { + "epoch": 0.23669421487603307, + "grad_norm": 9.31478500366211, + "learning_rate": 8.761311777241485e-06, + "loss": 0.5241, + "step": 2148 + }, + { + "epoch": 0.23680440771349862, + "grad_norm": 5.8757452964782715, + "learning_rate": 8.760159565274316e-06, + "loss": 0.3795, + "step": 2149 + }, + { + "epoch": 0.2369146005509642, + "grad_norm": 6.232647895812988, + "learning_rate": 8.759006893509726e-06, + "loss": 0.4356, + "step": 2150 + }, + { + "epoch": 0.23702479338842974, + "grad_norm": 5.690622806549072, + "learning_rate": 8.757853762088671e-06, + "loss": 0.4515, + "step": 2151 + }, + { + "epoch": 0.23713498622589532, + "grad_norm": 4.078435897827148, + "learning_rate": 8.756700171152149e-06, + "loss": 0.4059, + "step": 2152 + }, + { + "epoch": 0.23724517906336087, + "grad_norm": 7.967753887176514, + "learning_rate": 8.755546120841229e-06, + "loss": 0.4858, + "step": 2153 + }, + { + "epoch": 0.23735537190082645, + "grad_norm": 5.991828918457031, + "learning_rate": 8.754391611297026e-06, + "loss": 0.4147, + "step": 2154 + }, + { + "epoch": 0.237465564738292, + "grad_norm": 8.03773021697998, + "learning_rate": 8.753236642660719e-06, + "loss": 0.3884, + "step": 2155 + }, + { + "epoch": 0.23757575757575758, + "grad_norm": 7.971235275268555, + "learning_rate": 8.752081215073536e-06, + "loss": 0.4222, + "step": 2156 + }, + { + "epoch": 0.23768595041322313, + "grad_norm": 7.698493957519531, + "learning_rate": 8.750925328676766e-06, + "loss": 0.41, + "step": 2157 + }, + { + 
"epoch": 0.2377961432506887, + "grad_norm": 8.206059455871582, + "learning_rate": 8.749768983611751e-06, + "loss": 0.4505, + "step": 2158 + }, + { + "epoch": 0.23790633608815426, + "grad_norm": 6.654517650604248, + "learning_rate": 8.748612180019893e-06, + "loss": 0.4268, + "step": 2159 + }, + { + "epoch": 0.23801652892561984, + "grad_norm": 7.217126846313477, + "learning_rate": 8.747454918042645e-06, + "loss": 0.4172, + "step": 2160 + }, + { + "epoch": 0.23812672176308539, + "grad_norm": 6.2145586013793945, + "learning_rate": 8.746297197821516e-06, + "loss": 0.4729, + "step": 2161 + }, + { + "epoch": 0.23823691460055096, + "grad_norm": 8.109328269958496, + "learning_rate": 8.745139019498079e-06, + "loss": 0.3934, + "step": 2162 + }, + { + "epoch": 0.23834710743801654, + "grad_norm": 5.94710636138916, + "learning_rate": 8.743980383213956e-06, + "loss": 0.4261, + "step": 2163 + }, + { + "epoch": 0.2384573002754821, + "grad_norm": 8.663981437683105, + "learning_rate": 8.742821289110825e-06, + "loss": 0.3978, + "step": 2164 + }, + { + "epoch": 0.23856749311294767, + "grad_norm": 8.286674499511719, + "learning_rate": 8.741661737330425e-06, + "loss": 0.4266, + "step": 2165 + }, + { + "epoch": 0.23867768595041322, + "grad_norm": 9.72221851348877, + "learning_rate": 8.740501728014543e-06, + "loss": 0.4871, + "step": 2166 + }, + { + "epoch": 0.2387878787878788, + "grad_norm": 4.6482415199279785, + "learning_rate": 8.73934126130503e-06, + "loss": 0.395, + "step": 2167 + }, + { + "epoch": 0.23889807162534435, + "grad_norm": 5.311193466186523, + "learning_rate": 8.738180337343788e-06, + "loss": 0.4496, + "step": 2168 + }, + { + "epoch": 0.23900826446280993, + "grad_norm": 7.596034049987793, + "learning_rate": 8.737018956272774e-06, + "loss": 0.3588, + "step": 2169 + }, + { + "epoch": 0.23911845730027548, + "grad_norm": 9.180919647216797, + "learning_rate": 8.735857118234008e-06, + "loss": 0.4978, + "step": 2170 + }, + { + "epoch": 0.23922865013774106, + "grad_norm": 11.98186206817627, + "learning_rate": 8.734694823369559e-06, + "loss": 0.4307, + "step": 2171 + }, + { + "epoch": 0.2393388429752066, + "grad_norm": 8.02349853515625, + "learning_rate": 8.733532071821553e-06, + "loss": 0.3434, + "step": 2172 + }, + { + "epoch": 0.23944903581267218, + "grad_norm": 7.891348838806152, + "learning_rate": 8.732368863732175e-06, + "loss": 0.5116, + "step": 2173 + }, + { + "epoch": 0.23955922865013773, + "grad_norm": 8.01990032196045, + "learning_rate": 8.731205199243661e-06, + "loss": 0.455, + "step": 2174 + }, + { + "epoch": 0.2396694214876033, + "grad_norm": 5.899777889251709, + "learning_rate": 8.730041078498307e-06, + "loss": 0.4518, + "step": 2175 + }, + { + "epoch": 0.23977961432506886, + "grad_norm": 7.121553421020508, + "learning_rate": 8.728876501638464e-06, + "loss": 0.5194, + "step": 2176 + }, + { + "epoch": 0.23988980716253444, + "grad_norm": 7.066677093505859, + "learning_rate": 8.727711468806537e-06, + "loss": 0.4485, + "step": 2177 + }, + { + "epoch": 0.24, + "grad_norm": 7.9821624755859375, + "learning_rate": 8.726545980144988e-06, + "loss": 0.4904, + "step": 2178 + }, + { + "epoch": 0.24011019283746557, + "grad_norm": 5.491579532623291, + "learning_rate": 8.725380035796334e-06, + "loss": 0.4295, + "step": 2179 + }, + { + "epoch": 0.24022038567493112, + "grad_norm": 6.961433410644531, + "learning_rate": 8.724213635903149e-06, + "loss": 0.4909, + "step": 2180 + }, + { + "epoch": 0.2403305785123967, + "grad_norm": 4.592325210571289, + "learning_rate": 8.723046780608061e-06, + "loss": 0.424, + 
"step": 2181 + }, + { + "epoch": 0.24044077134986225, + "grad_norm": 12.062141418457031, + "learning_rate": 8.721879470053758e-06, + "loss": 0.5157, + "step": 2182 + }, + { + "epoch": 0.24055096418732783, + "grad_norm": 8.86048698425293, + "learning_rate": 8.720711704382976e-06, + "loss": 0.4732, + "step": 2183 + }, + { + "epoch": 0.24066115702479338, + "grad_norm": 16.834884643554688, + "learning_rate": 8.719543483738513e-06, + "loss": 0.4236, + "step": 2184 + }, + { + "epoch": 0.24077134986225895, + "grad_norm": 5.7272443771362305, + "learning_rate": 8.718374808263221e-06, + "loss": 0.4204, + "step": 2185 + }, + { + "epoch": 0.2408815426997245, + "grad_norm": 7.467537879943848, + "learning_rate": 8.717205678100004e-06, + "loss": 0.5018, + "step": 2186 + }, + { + "epoch": 0.24099173553719008, + "grad_norm": 5.631223201751709, + "learning_rate": 8.71603609339183e-06, + "loss": 0.424, + "step": 2187 + }, + { + "epoch": 0.24110192837465566, + "grad_norm": 8.913368225097656, + "learning_rate": 8.714866054281714e-06, + "loss": 0.4667, + "step": 2188 + }, + { + "epoch": 0.2412121212121212, + "grad_norm": 5.795899391174316, + "learning_rate": 8.71369556091273e-06, + "loss": 0.3866, + "step": 2189 + }, + { + "epoch": 0.2413223140495868, + "grad_norm": 5.947133541107178, + "learning_rate": 8.712524613428009e-06, + "loss": 0.4024, + "step": 2190 + }, + { + "epoch": 0.24143250688705234, + "grad_norm": 7.146727085113525, + "learning_rate": 8.711353211970734e-06, + "loss": 0.4076, + "step": 2191 + }, + { + "epoch": 0.24154269972451792, + "grad_norm": 11.562288284301758, + "learning_rate": 8.710181356684149e-06, + "loss": 0.5004, + "step": 2192 + }, + { + "epoch": 0.24165289256198347, + "grad_norm": 9.729698181152344, + "learning_rate": 8.709009047711547e-06, + "loss": 0.4714, + "step": 2193 + }, + { + "epoch": 0.24176308539944905, + "grad_norm": 6.6752400398254395, + "learning_rate": 8.707836285196281e-06, + "loss": 0.3855, + "step": 2194 + }, + { + "epoch": 0.2418732782369146, + "grad_norm": 11.59221363067627, + "learning_rate": 8.706663069281755e-06, + "loss": 0.5057, + "step": 2195 + }, + { + "epoch": 0.24198347107438017, + "grad_norm": 9.448982238769531, + "learning_rate": 8.705489400111437e-06, + "loss": 0.5545, + "step": 2196 + }, + { + "epoch": 0.24209366391184572, + "grad_norm": 7.867563724517822, + "learning_rate": 8.70431527782884e-06, + "loss": 0.4745, + "step": 2197 + }, + { + "epoch": 0.2422038567493113, + "grad_norm": 7.003779888153076, + "learning_rate": 8.703140702577539e-06, + "loss": 0.4257, + "step": 2198 + }, + { + "epoch": 0.24231404958677685, + "grad_norm": 7.950750350952148, + "learning_rate": 8.701965674501162e-06, + "loss": 0.4784, + "step": 2199 + }, + { + "epoch": 0.24242424242424243, + "grad_norm": 6.600383281707764, + "learning_rate": 8.700790193743395e-06, + "loss": 0.4715, + "step": 2200 + }, + { + "epoch": 0.24253443526170798, + "grad_norm": 11.465103149414062, + "learning_rate": 8.699614260447976e-06, + "loss": 0.4216, + "step": 2201 + }, + { + "epoch": 0.24264462809917356, + "grad_norm": 6.537487030029297, + "learning_rate": 8.698437874758701e-06, + "loss": 0.4132, + "step": 2202 + }, + { + "epoch": 0.2427548209366391, + "grad_norm": 6.177744388580322, + "learning_rate": 8.697261036819417e-06, + "loss": 0.4844, + "step": 2203 + }, + { + "epoch": 0.2428650137741047, + "grad_norm": 7.86129903793335, + "learning_rate": 8.696083746774031e-06, + "loss": 0.4171, + "step": 2204 + }, + { + "epoch": 0.24297520661157024, + "grad_norm": 10.450275421142578, + "learning_rate": 
8.694906004766505e-06, + "loss": 0.515, + "step": 2205 + }, + { + "epoch": 0.24308539944903582, + "grad_norm": 6.404592990875244, + "learning_rate": 8.693727810940853e-06, + "loss": 0.4126, + "step": 2206 + }, + { + "epoch": 0.24319559228650137, + "grad_norm": 6.1069865226745605, + "learning_rate": 8.692549165441146e-06, + "loss": 0.3866, + "step": 2207 + }, + { + "epoch": 0.24330578512396694, + "grad_norm": 6.747042179107666, + "learning_rate": 8.691370068411513e-06, + "loss": 0.4579, + "step": 2208 + }, + { + "epoch": 0.2434159779614325, + "grad_norm": 6.704173564910889, + "learning_rate": 8.690190519996133e-06, + "loss": 0.4246, + "step": 2209 + }, + { + "epoch": 0.24352617079889807, + "grad_norm": 5.756845951080322, + "learning_rate": 8.689010520339245e-06, + "loss": 0.4082, + "step": 2210 + }, + { + "epoch": 0.24363636363636362, + "grad_norm": 10.910465240478516, + "learning_rate": 8.687830069585138e-06, + "loss": 0.4536, + "step": 2211 + }, + { + "epoch": 0.2437465564738292, + "grad_norm": 7.12267541885376, + "learning_rate": 8.68664916787816e-06, + "loss": 0.3574, + "step": 2212 + }, + { + "epoch": 0.24385674931129478, + "grad_norm": 10.122380256652832, + "learning_rate": 8.685467815362714e-06, + "loss": 0.432, + "step": 2213 + }, + { + "epoch": 0.24396694214876033, + "grad_norm": 11.504134178161621, + "learning_rate": 8.684286012183259e-06, + "loss": 0.4656, + "step": 2214 + }, + { + "epoch": 0.2440771349862259, + "grad_norm": 8.178743362426758, + "learning_rate": 8.683103758484304e-06, + "loss": 0.4386, + "step": 2215 + }, + { + "epoch": 0.24418732782369146, + "grad_norm": 11.35354232788086, + "learning_rate": 8.68192105441042e-06, + "loss": 0.5298, + "step": 2216 + }, + { + "epoch": 0.24429752066115704, + "grad_norm": 7.735526084899902, + "learning_rate": 8.680737900106227e-06, + "loss": 0.4393, + "step": 2217 + }, + { + "epoch": 0.2444077134986226, + "grad_norm": 6.364700794219971, + "learning_rate": 8.679554295716403e-06, + "loss": 0.4907, + "step": 2218 + }, + { + "epoch": 0.24451790633608816, + "grad_norm": 6.380803108215332, + "learning_rate": 8.678370241385683e-06, + "loss": 0.4302, + "step": 2219 + }, + { + "epoch": 0.24462809917355371, + "grad_norm": 36.914852142333984, + "learning_rate": 8.677185737258854e-06, + "loss": 0.4743, + "step": 2220 + }, + { + "epoch": 0.2447382920110193, + "grad_norm": 19.26903533935547, + "learning_rate": 8.676000783480758e-06, + "loss": 0.5094, + "step": 2221 + }, + { + "epoch": 0.24484848484848484, + "grad_norm": 6.038789749145508, + "learning_rate": 8.674815380196291e-06, + "loss": 0.5194, + "step": 2222 + }, + { + "epoch": 0.24495867768595042, + "grad_norm": 7.294865608215332, + "learning_rate": 8.673629527550409e-06, + "loss": 0.4517, + "step": 2223 + }, + { + "epoch": 0.24506887052341597, + "grad_norm": 8.526937484741211, + "learning_rate": 8.672443225688117e-06, + "loss": 0.4507, + "step": 2224 + }, + { + "epoch": 0.24517906336088155, + "grad_norm": 11.056537628173828, + "learning_rate": 8.67125647475448e-06, + "loss": 0.5316, + "step": 2225 + }, + { + "epoch": 0.2452892561983471, + "grad_norm": 8.909558296203613, + "learning_rate": 8.670069274894613e-06, + "loss": 0.523, + "step": 2226 + }, + { + "epoch": 0.24539944903581268, + "grad_norm": 9.7420654296875, + "learning_rate": 8.668881626253692e-06, + "loss": 0.4595, + "step": 2227 + }, + { + "epoch": 0.24550964187327823, + "grad_norm": 6.577530384063721, + "learning_rate": 8.667693528976938e-06, + "loss": 0.5038, + "step": 2228 + }, + { + "epoch": 0.2456198347107438, + "grad_norm": 
7.473630428314209, + "learning_rate": 8.666504983209641e-06, + "loss": 0.4432, + "step": 2229 + }, + { + "epoch": 0.24573002754820936, + "grad_norm": 7.230944633483887, + "learning_rate": 8.665315989097135e-06, + "loss": 0.4073, + "step": 2230 + }, + { + "epoch": 0.24584022038567493, + "grad_norm": 6.275236129760742, + "learning_rate": 8.664126546784808e-06, + "loss": 0.4011, + "step": 2231 + }, + { + "epoch": 0.24595041322314048, + "grad_norm": 7.976849555969238, + "learning_rate": 8.662936656418111e-06, + "loss": 0.5406, + "step": 2232 + }, + { + "epoch": 0.24606060606060606, + "grad_norm": 5.796950817108154, + "learning_rate": 8.661746318142544e-06, + "loss": 0.4271, + "step": 2233 + }, + { + "epoch": 0.2461707988980716, + "grad_norm": 5.247142314910889, + "learning_rate": 8.660555532103663e-06, + "loss": 0.4404, + "step": 2234 + }, + { + "epoch": 0.2462809917355372, + "grad_norm": 8.870091438293457, + "learning_rate": 8.659364298447079e-06, + "loss": 0.4442, + "step": 2235 + }, + { + "epoch": 0.24639118457300274, + "grad_norm": 8.041191101074219, + "learning_rate": 8.658172617318457e-06, + "loss": 0.3981, + "step": 2236 + }, + { + "epoch": 0.24650137741046832, + "grad_norm": 6.859613418579102, + "learning_rate": 8.65698048886352e-06, + "loss": 0.456, + "step": 2237 + }, + { + "epoch": 0.2466115702479339, + "grad_norm": 6.626808166503906, + "learning_rate": 8.65578791322804e-06, + "loss": 0.4438, + "step": 2238 + }, + { + "epoch": 0.24672176308539945, + "grad_norm": 5.693751335144043, + "learning_rate": 8.654594890557847e-06, + "loss": 0.4119, + "step": 2239 + }, + { + "epoch": 0.24683195592286503, + "grad_norm": 7.935238838195801, + "learning_rate": 8.653401420998831e-06, + "loss": 0.4847, + "step": 2240 + }, + { + "epoch": 0.24694214876033058, + "grad_norm": 6.543578624725342, + "learning_rate": 8.652207504696922e-06, + "loss": 0.519, + "step": 2241 + }, + { + "epoch": 0.24705234159779615, + "grad_norm": 5.309280872344971, + "learning_rate": 8.651013141798121e-06, + "loss": 0.3942, + "step": 2242 + }, + { + "epoch": 0.2471625344352617, + "grad_norm": 7.597204685211182, + "learning_rate": 8.649818332448472e-06, + "loss": 0.4737, + "step": 2243 + }, + { + "epoch": 0.24727272727272728, + "grad_norm": 4.780295372009277, + "learning_rate": 8.64862307679408e-06, + "loss": 0.461, + "step": 2244 + }, + { + "epoch": 0.24738292011019283, + "grad_norm": 10.477855682373047, + "learning_rate": 8.647427374981101e-06, + "loss": 0.4543, + "step": 2245 + }, + { + "epoch": 0.2474931129476584, + "grad_norm": 15.061379432678223, + "learning_rate": 8.646231227155751e-06, + "loss": 0.4741, + "step": 2246 + }, + { + "epoch": 0.24760330578512396, + "grad_norm": 8.661005973815918, + "learning_rate": 8.645034633464292e-06, + "loss": 0.4148, + "step": 2247 + }, + { + "epoch": 0.24771349862258954, + "grad_norm": 8.025259017944336, + "learning_rate": 8.643837594053045e-06, + "loss": 0.5376, + "step": 2248 + }, + { + "epoch": 0.2478236914600551, + "grad_norm": 10.17959213256836, + "learning_rate": 8.64264010906839e-06, + "loss": 0.4585, + "step": 2249 + }, + { + "epoch": 0.24793388429752067, + "grad_norm": 9.788511276245117, + "learning_rate": 8.641442178656752e-06, + "loss": 0.4379, + "step": 2250 + }, + { + "epoch": 0.24804407713498622, + "grad_norm": 5.2130279541015625, + "learning_rate": 8.64024380296462e-06, + "loss": 0.3961, + "step": 2251 + }, + { + "epoch": 0.2481542699724518, + "grad_norm": 14.40140438079834, + "learning_rate": 8.63904498213853e-06, + "loss": 0.4997, + "step": 2252 + }, + { + "epoch": 
0.24826446280991735, + "grad_norm": 7.163171768188477, + "learning_rate": 8.637845716325076e-06, + "loss": 0.4464, + "step": 2253 + }, + { + "epoch": 0.24837465564738292, + "grad_norm": 6.978706359863281, + "learning_rate": 8.636646005670908e-06, + "loss": 0.3715, + "step": 2254 + }, + { + "epoch": 0.24848484848484848, + "grad_norm": 6.105304718017578, + "learning_rate": 8.635445850322725e-06, + "loss": 0.3892, + "step": 2255 + }, + { + "epoch": 0.24859504132231405, + "grad_norm": 9.44076156616211, + "learning_rate": 8.634245250427286e-06, + "loss": 0.4035, + "step": 2256 + }, + { + "epoch": 0.2487052341597796, + "grad_norm": 6.398384094238281, + "learning_rate": 8.633044206131401e-06, + "loss": 0.4568, + "step": 2257 + }, + { + "epoch": 0.24881542699724518, + "grad_norm": 6.775662899017334, + "learning_rate": 8.631842717581934e-06, + "loss": 0.3772, + "step": 2258 + }, + { + "epoch": 0.24892561983471073, + "grad_norm": 5.58888578414917, + "learning_rate": 8.630640784925808e-06, + "loss": 0.465, + "step": 2259 + }, + { + "epoch": 0.2490358126721763, + "grad_norm": 6.437169075012207, + "learning_rate": 8.629438408309994e-06, + "loss": 0.5361, + "step": 2260 + }, + { + "epoch": 0.24914600550964186, + "grad_norm": 9.29654312133789, + "learning_rate": 8.628235587881522e-06, + "loss": 0.5343, + "step": 2261 + }, + { + "epoch": 0.24925619834710744, + "grad_norm": 9.664660453796387, + "learning_rate": 8.627032323787473e-06, + "loss": 0.4821, + "step": 2262 + }, + { + "epoch": 0.24936639118457302, + "grad_norm": 5.591101169586182, + "learning_rate": 8.625828616174984e-06, + "loss": 0.4363, + "step": 2263 + }, + { + "epoch": 0.24947658402203857, + "grad_norm": 7.769825458526611, + "learning_rate": 8.62462446519125e-06, + "loss": 0.4363, + "step": 2264 + }, + { + "epoch": 0.24958677685950414, + "grad_norm": 7.316049575805664, + "learning_rate": 8.62341987098351e-06, + "loss": 0.4827, + "step": 2265 + }, + { + "epoch": 0.2496969696969697, + "grad_norm": 12.398195266723633, + "learning_rate": 8.622214833699067e-06, + "loss": 0.5515, + "step": 2266 + }, + { + "epoch": 0.24980716253443527, + "grad_norm": 5.563145637512207, + "learning_rate": 8.621009353485272e-06, + "loss": 0.4688, + "step": 2267 + }, + { + "epoch": 0.24991735537190082, + "grad_norm": 5.417816162109375, + "learning_rate": 8.619803430489537e-06, + "loss": 0.4319, + "step": 2268 + }, + { + "epoch": 0.2500275482093664, + "grad_norm": 5.636494159698486, + "learning_rate": 8.618597064859321e-06, + "loss": 0.4469, + "step": 2269 + }, + { + "epoch": 0.25013774104683195, + "grad_norm": 6.015632152557373, + "learning_rate": 8.617390256742142e-06, + "loss": 0.4153, + "step": 2270 + }, + { + "epoch": 0.2502479338842975, + "grad_norm": 6.239927768707275, + "learning_rate": 8.616183006285566e-06, + "loss": 0.3865, + "step": 2271 + }, + { + "epoch": 0.2503581267217631, + "grad_norm": 11.210003852844238, + "learning_rate": 8.61497531363722e-06, + "loss": 0.4984, + "step": 2272 + }, + { + "epoch": 0.25046831955922866, + "grad_norm": 14.910019874572754, + "learning_rate": 8.613767178944784e-06, + "loss": 0.4554, + "step": 2273 + }, + { + "epoch": 0.2505785123966942, + "grad_norm": 10.103205680847168, + "learning_rate": 8.612558602355988e-06, + "loss": 0.4278, + "step": 2274 + }, + { + "epoch": 0.25068870523415976, + "grad_norm": 8.149327278137207, + "learning_rate": 8.611349584018618e-06, + "loss": 0.4078, + "step": 2275 + }, + { + "epoch": 0.25079889807162536, + "grad_norm": 7.881106376647949, + "learning_rate": 8.610140124080515e-06, + "loss": 0.4312, 
+ "step": 2276 + }, + { + "epoch": 0.2509090909090909, + "grad_norm": 8.182812690734863, + "learning_rate": 8.608930222689575e-06, + "loss": 0.5198, + "step": 2277 + }, + { + "epoch": 0.25101928374655647, + "grad_norm": 7.4772796630859375, + "learning_rate": 8.607719879993745e-06, + "loss": 0.4961, + "step": 2278 + }, + { + "epoch": 0.251129476584022, + "grad_norm": 7.813830852508545, + "learning_rate": 8.606509096141027e-06, + "loss": 0.4212, + "step": 2279 + }, + { + "epoch": 0.2512396694214876, + "grad_norm": 7.752506732940674, + "learning_rate": 8.605297871279478e-06, + "loss": 0.3529, + "step": 2280 + }, + { + "epoch": 0.25134986225895317, + "grad_norm": 5.12895393371582, + "learning_rate": 8.604086205557206e-06, + "loss": 0.4413, + "step": 2281 + }, + { + "epoch": 0.2514600550964187, + "grad_norm": 5.124846935272217, + "learning_rate": 8.60287409912238e-06, + "loss": 0.4424, + "step": 2282 + }, + { + "epoch": 0.2515702479338843, + "grad_norm": 5.033202648162842, + "learning_rate": 8.601661552123215e-06, + "loss": 0.3773, + "step": 2283 + }, + { + "epoch": 0.2516804407713499, + "grad_norm": 9.819452285766602, + "learning_rate": 8.600448564707982e-06, + "loss": 0.4828, + "step": 2284 + }, + { + "epoch": 0.25179063360881543, + "grad_norm": 11.069389343261719, + "learning_rate": 8.599235137025007e-06, + "loss": 0.4828, + "step": 2285 + }, + { + "epoch": 0.251900826446281, + "grad_norm": 5.964832305908203, + "learning_rate": 8.598021269222672e-06, + "loss": 0.4449, + "step": 2286 + }, + { + "epoch": 0.25201101928374653, + "grad_norm": 7.644845008850098, + "learning_rate": 8.59680696144941e-06, + "loss": 0.3804, + "step": 2287 + }, + { + "epoch": 0.25212121212121213, + "grad_norm": 9.03024673461914, + "learning_rate": 8.595592213853702e-06, + "loss": 0.441, + "step": 2288 + }, + { + "epoch": 0.2522314049586777, + "grad_norm": 10.455665588378906, + "learning_rate": 8.594377026584098e-06, + "loss": 0.2856, + "step": 2289 + }, + { + "epoch": 0.25234159779614324, + "grad_norm": 6.502602577209473, + "learning_rate": 8.593161399789188e-06, + "loss": 0.3827, + "step": 2290 + }, + { + "epoch": 0.25245179063360884, + "grad_norm": 5.824034214019775, + "learning_rate": 8.591945333617622e-06, + "loss": 0.3888, + "step": 2291 + }, + { + "epoch": 0.2525619834710744, + "grad_norm": 5.943631649017334, + "learning_rate": 8.5907288282181e-06, + "loss": 0.4399, + "step": 2292 + }, + { + "epoch": 0.25267217630853994, + "grad_norm": 9.132189750671387, + "learning_rate": 8.589511883739379e-06, + "loss": 0.462, + "step": 2293 + }, + { + "epoch": 0.2527823691460055, + "grad_norm": 6.748810768127441, + "learning_rate": 8.58829450033027e-06, + "loss": 0.454, + "step": 2294 + }, + { + "epoch": 0.2528925619834711, + "grad_norm": 9.523448944091797, + "learning_rate": 8.587076678139635e-06, + "loss": 0.4171, + "step": 2295 + }, + { + "epoch": 0.25300275482093665, + "grad_norm": 7.962812900543213, + "learning_rate": 8.585858417316391e-06, + "loss": 0.51, + "step": 2296 + }, + { + "epoch": 0.2531129476584022, + "grad_norm": 5.420536994934082, + "learning_rate": 8.584639718009508e-06, + "loss": 0.4154, + "step": 2297 + }, + { + "epoch": 0.25322314049586775, + "grad_norm": 5.080401420593262, + "learning_rate": 8.583420580368013e-06, + "loss": 0.469, + "step": 2298 + }, + { + "epoch": 0.25333333333333335, + "grad_norm": 6.994992733001709, + "learning_rate": 8.58220100454098e-06, + "loss": 0.4188, + "step": 2299 + }, + { + "epoch": 0.2534435261707989, + "grad_norm": 6.936811923980713, + "learning_rate": 
8.580980990677543e-06, + "loss": 0.3953, + "step": 2300 + }, + { + "epoch": 0.25355371900826446, + "grad_norm": 5.2105793952941895, + "learning_rate": 8.579760538926887e-06, + "loss": 0.3762, + "step": 2301 + }, + { + "epoch": 0.25366391184573, + "grad_norm": 6.676947593688965, + "learning_rate": 8.57853964943825e-06, + "loss": 0.4554, + "step": 2302 + }, + { + "epoch": 0.2537741046831956, + "grad_norm": 6.580031871795654, + "learning_rate": 8.577318322360922e-06, + "loss": 0.4174, + "step": 2303 + }, + { + "epoch": 0.25388429752066116, + "grad_norm": 9.767011642456055, + "learning_rate": 8.57609655784425e-06, + "loss": 0.5647, + "step": 2304 + }, + { + "epoch": 0.2539944903581267, + "grad_norm": 8.932231903076172, + "learning_rate": 8.574874356037635e-06, + "loss": 0.4578, + "step": 2305 + }, + { + "epoch": 0.25410468319559226, + "grad_norm": 5.41628885269165, + "learning_rate": 8.573651717090526e-06, + "loss": 0.4322, + "step": 2306 + }, + { + "epoch": 0.25421487603305787, + "grad_norm": 7.599315643310547, + "learning_rate": 8.572428641152432e-06, + "loss": 0.3949, + "step": 2307 + }, + { + "epoch": 0.2543250688705234, + "grad_norm": 11.129815101623535, + "learning_rate": 8.57120512837291e-06, + "loss": 0.4471, + "step": 2308 + }, + { + "epoch": 0.25443526170798897, + "grad_norm": 7.195977210998535, + "learning_rate": 8.569981178901575e-06, + "loss": 0.3736, + "step": 2309 + }, + { + "epoch": 0.2545454545454545, + "grad_norm": 6.795074462890625, + "learning_rate": 8.568756792888092e-06, + "loss": 0.4654, + "step": 2310 + }, + { + "epoch": 0.2546556473829201, + "grad_norm": 5.778489589691162, + "learning_rate": 8.56753197048218e-06, + "loss": 0.39, + "step": 2311 + }, + { + "epoch": 0.2547658402203857, + "grad_norm": 3.8139307498931885, + "learning_rate": 8.566306711833613e-06, + "loss": 0.3855, + "step": 2312 + }, + { + "epoch": 0.2548760330578512, + "grad_norm": 6.988625526428223, + "learning_rate": 8.565081017092217e-06, + "loss": 0.4758, + "step": 2313 + }, + { + "epoch": 0.25498622589531683, + "grad_norm": 16.93379783630371, + "learning_rate": 8.563854886407872e-06, + "loss": 0.6658, + "step": 2314 + }, + { + "epoch": 0.2550964187327824, + "grad_norm": 9.037361145019531, + "learning_rate": 8.56262831993051e-06, + "loss": 0.4437, + "step": 2315 + }, + { + "epoch": 0.25520661157024793, + "grad_norm": 8.213639259338379, + "learning_rate": 8.561401317810118e-06, + "loss": 0.481, + "step": 2316 + }, + { + "epoch": 0.2553168044077135, + "grad_norm": 13.289407730102539, + "learning_rate": 8.560173880196734e-06, + "loss": 0.4594, + "step": 2317 + }, + { + "epoch": 0.2554269972451791, + "grad_norm": 6.225028991699219, + "learning_rate": 8.558946007240452e-06, + "loss": 0.3196, + "step": 2318 + }, + { + "epoch": 0.25553719008264464, + "grad_norm": 6.35842227935791, + "learning_rate": 8.557717699091419e-06, + "loss": 0.4687, + "step": 2319 + }, + { + "epoch": 0.2556473829201102, + "grad_norm": 5.915284156799316, + "learning_rate": 8.556488955899833e-06, + "loss": 0.4458, + "step": 2320 + }, + { + "epoch": 0.25575757575757574, + "grad_norm": 10.873257637023926, + "learning_rate": 8.555259777815946e-06, + "loss": 0.4865, + "step": 2321 + }, + { + "epoch": 0.25586776859504134, + "grad_norm": 6.701557636260986, + "learning_rate": 8.554030164990063e-06, + "loss": 0.4758, + "step": 2322 + }, + { + "epoch": 0.2559779614325069, + "grad_norm": 4.650301456451416, + "learning_rate": 8.552800117572546e-06, + "loss": 0.4689, + "step": 2323 + }, + { + "epoch": 0.25608815426997245, + "grad_norm": 
6.8133440017700195, + "learning_rate": 8.551569635713804e-06, + "loss": 0.4373, + "step": 2324 + }, + { + "epoch": 0.256198347107438, + "grad_norm": 6.845016956329346, + "learning_rate": 8.550338719564301e-06, + "loss": 0.4943, + "step": 2325 + }, + { + "epoch": 0.2563085399449036, + "grad_norm": 9.78427791595459, + "learning_rate": 8.549107369274559e-06, + "loss": 0.4677, + "step": 2326 + }, + { + "epoch": 0.25641873278236915, + "grad_norm": 4.695962429046631, + "learning_rate": 8.547875584995146e-06, + "loss": 0.3793, + "step": 2327 + }, + { + "epoch": 0.2565289256198347, + "grad_norm": 5.9332451820373535, + "learning_rate": 8.546643366876686e-06, + "loss": 0.4679, + "step": 2328 + }, + { + "epoch": 0.25663911845730025, + "grad_norm": 6.372260093688965, + "learning_rate": 8.545410715069858e-06, + "loss": 0.4294, + "step": 2329 + }, + { + "epoch": 0.25674931129476586, + "grad_norm": 4.364634990692139, + "learning_rate": 8.544177629725393e-06, + "loss": 0.4569, + "step": 2330 + }, + { + "epoch": 0.2568595041322314, + "grad_norm": 6.3348164558410645, + "learning_rate": 8.542944110994072e-06, + "loss": 0.4367, + "step": 2331 + }, + { + "epoch": 0.25696969696969696, + "grad_norm": 7.209807872772217, + "learning_rate": 8.541710159026733e-06, + "loss": 0.4896, + "step": 2332 + }, + { + "epoch": 0.2570798898071625, + "grad_norm": 9.143774032592773, + "learning_rate": 8.540475773974264e-06, + "loss": 0.4798, + "step": 2333 + }, + { + "epoch": 0.2571900826446281, + "grad_norm": 9.026727676391602, + "learning_rate": 8.539240955987609e-06, + "loss": 0.5241, + "step": 2334 + }, + { + "epoch": 0.25730027548209367, + "grad_norm": 4.808675765991211, + "learning_rate": 8.538005705217762e-06, + "loss": 0.4511, + "step": 2335 + }, + { + "epoch": 0.2574104683195592, + "grad_norm": 11.612485885620117, + "learning_rate": 8.536770021815771e-06, + "loss": 0.4771, + "step": 2336 + }, + { + "epoch": 0.25752066115702477, + "grad_norm": 7.632146835327148, + "learning_rate": 8.535533905932739e-06, + "loss": 0.496, + "step": 2337 + }, + { + "epoch": 0.25763085399449037, + "grad_norm": 4.97585391998291, + "learning_rate": 8.534297357719816e-06, + "loss": 0.407, + "step": 2338 + }, + { + "epoch": 0.2577410468319559, + "grad_norm": 6.8093743324279785, + "learning_rate": 8.533060377328213e-06, + "loss": 0.4495, + "step": 2339 + }, + { + "epoch": 0.2578512396694215, + "grad_norm": 6.089529037475586, + "learning_rate": 8.531822964909188e-06, + "loss": 0.3967, + "step": 2340 + }, + { + "epoch": 0.2579614325068871, + "grad_norm": 5.844442367553711, + "learning_rate": 8.530585120614053e-06, + "loss": 0.4217, + "step": 2341 + }, + { + "epoch": 0.25807162534435263, + "grad_norm": 10.744218826293945, + "learning_rate": 8.529346844594172e-06, + "loss": 0.4048, + "step": 2342 + }, + { + "epoch": 0.2581818181818182, + "grad_norm": 7.852959632873535, + "learning_rate": 8.528108137000968e-06, + "loss": 0.4425, + "step": 2343 + }, + { + "epoch": 0.25829201101928373, + "grad_norm": 8.47117805480957, + "learning_rate": 8.526868997985905e-06, + "loss": 0.5659, + "step": 2344 + }, + { + "epoch": 0.25840220385674934, + "grad_norm": 6.843001842498779, + "learning_rate": 8.525629427700513e-06, + "loss": 0.4097, + "step": 2345 + }, + { + "epoch": 0.2585123966942149, + "grad_norm": 9.291862487792969, + "learning_rate": 8.524389426296364e-06, + "loss": 0.5022, + "step": 2346 + }, + { + "epoch": 0.25862258953168044, + "grad_norm": 4.559871196746826, + "learning_rate": 8.523148993925089e-06, + "loss": 0.5074, + "step": 2347 + }, + { + "epoch": 
0.258732782369146, + "grad_norm": 6.633461952209473, + "learning_rate": 8.521908130738369e-06, + "loss": 0.4084, + "step": 2348 + }, + { + "epoch": 0.2588429752066116, + "grad_norm": 6.937253952026367, + "learning_rate": 8.520666836887939e-06, + "loss": 0.4127, + "step": 2349 + }, + { + "epoch": 0.25895316804407714, + "grad_norm": 11.475006103515625, + "learning_rate": 8.519425112525586e-06, + "loss": 0.4464, + "step": 2350 + }, + { + "epoch": 0.2590633608815427, + "grad_norm": 7.735561847686768, + "learning_rate": 8.518182957803149e-06, + "loss": 0.3952, + "step": 2351 + }, + { + "epoch": 0.25917355371900824, + "grad_norm": 7.2295756340026855, + "learning_rate": 8.51694037287252e-06, + "loss": 0.3981, + "step": 2352 + }, + { + "epoch": 0.25928374655647385, + "grad_norm": 4.578884601593018, + "learning_rate": 8.515697357885648e-06, + "loss": 0.4478, + "step": 2353 + }, + { + "epoch": 0.2593939393939394, + "grad_norm": 10.609971046447754, + "learning_rate": 8.514453912994524e-06, + "loss": 0.4542, + "step": 2354 + }, + { + "epoch": 0.25950413223140495, + "grad_norm": 5.635560035705566, + "learning_rate": 8.513210038351203e-06, + "loss": 0.3859, + "step": 2355 + }, + { + "epoch": 0.2596143250688705, + "grad_norm": 42.48790740966797, + "learning_rate": 8.511965734107787e-06, + "loss": 0.5248, + "step": 2356 + }, + { + "epoch": 0.2597245179063361, + "grad_norm": 5.832466125488281, + "learning_rate": 8.51072100041643e-06, + "loss": 0.3818, + "step": 2357 + }, + { + "epoch": 0.25983471074380166, + "grad_norm": 4.7486252784729, + "learning_rate": 8.509475837429339e-06, + "loss": 0.2956, + "step": 2358 + }, + { + "epoch": 0.2599449035812672, + "grad_norm": 5.074799060821533, + "learning_rate": 8.508230245298778e-06, + "loss": 0.3235, + "step": 2359 + }, + { + "epoch": 0.26005509641873276, + "grad_norm": 10.534666061401367, + "learning_rate": 8.506984224177056e-06, + "loss": 0.4491, + "step": 2360 + }, + { + "epoch": 0.26016528925619836, + "grad_norm": 7.45255184173584, + "learning_rate": 8.505737774216539e-06, + "loss": 0.4088, + "step": 2361 + }, + { + "epoch": 0.2602754820936639, + "grad_norm": 11.84439468383789, + "learning_rate": 8.504490895569645e-06, + "loss": 0.4307, + "step": 2362 + }, + { + "epoch": 0.26038567493112946, + "grad_norm": 7.551827430725098, + "learning_rate": 8.503243588388843e-06, + "loss": 0.3964, + "step": 2363 + }, + { + "epoch": 0.26049586776859507, + "grad_norm": 7.464683532714844, + "learning_rate": 8.501995852826658e-06, + "loss": 0.4345, + "step": 2364 + }, + { + "epoch": 0.2606060606060606, + "grad_norm": 5.859431266784668, + "learning_rate": 8.500747689035663e-06, + "loss": 0.4822, + "step": 2365 + }, + { + "epoch": 0.26071625344352617, + "grad_norm": 6.196678161621094, + "learning_rate": 8.499499097168485e-06, + "loss": 0.4524, + "step": 2366 + }, + { + "epoch": 0.2608264462809917, + "grad_norm": 5.3284759521484375, + "learning_rate": 8.498250077377803e-06, + "loss": 0.4404, + "step": 2367 + }, + { + "epoch": 0.2609366391184573, + "grad_norm": 9.878118515014648, + "learning_rate": 8.49700062981635e-06, + "loss": 0.4371, + "step": 2368 + }, + { + "epoch": 0.2610468319559229, + "grad_norm": 7.886068344116211, + "learning_rate": 8.495750754636909e-06, + "loss": 0.4588, + "step": 2369 + }, + { + "epoch": 0.2611570247933884, + "grad_norm": 9.894314765930176, + "learning_rate": 8.494500451992318e-06, + "loss": 0.4524, + "step": 2370 + }, + { + "epoch": 0.261267217630854, + "grad_norm": 9.895092964172363, + "learning_rate": 8.493249722035464e-06, + "loss": 0.5528, + 
"step": 2371 + }, + { + "epoch": 0.2613774104683196, + "grad_norm": 7.3613386154174805, + "learning_rate": 8.49199856491929e-06, + "loss": 0.3853, + "step": 2372 + }, + { + "epoch": 0.26148760330578513, + "grad_norm": 5.514358043670654, + "learning_rate": 8.490746980796787e-06, + "loss": 0.3426, + "step": 2373 + }, + { + "epoch": 0.2615977961432507, + "grad_norm": 7.374489784240723, + "learning_rate": 8.489494969821004e-06, + "loss": 0.4179, + "step": 2374 + }, + { + "epoch": 0.26170798898071623, + "grad_norm": 6.119042873382568, + "learning_rate": 8.488242532145035e-06, + "loss": 0.4394, + "step": 2375 + }, + { + "epoch": 0.26181818181818184, + "grad_norm": 5.095778465270996, + "learning_rate": 8.48698966792203e-06, + "loss": 0.3747, + "step": 2376 + }, + { + "epoch": 0.2619283746556474, + "grad_norm": 11.36087703704834, + "learning_rate": 8.485736377305191e-06, + "loss": 0.451, + "step": 2377 + }, + { + "epoch": 0.26203856749311294, + "grad_norm": 8.019950866699219, + "learning_rate": 8.484482660447775e-06, + "loss": 0.4427, + "step": 2378 + }, + { + "epoch": 0.2621487603305785, + "grad_norm": 6.189932823181152, + "learning_rate": 8.483228517503085e-06, + "loss": 0.4113, + "step": 2379 + }, + { + "epoch": 0.2622589531680441, + "grad_norm": 10.068389892578125, + "learning_rate": 8.48197394862448e-06, + "loss": 0.4674, + "step": 2380 + }, + { + "epoch": 0.26236914600550965, + "grad_norm": 10.848103523254395, + "learning_rate": 8.48071895396537e-06, + "loss": 0.4179, + "step": 2381 + }, + { + "epoch": 0.2624793388429752, + "grad_norm": 7.704885005950928, + "learning_rate": 8.47946353367922e-06, + "loss": 0.4916, + "step": 2382 + }, + { + "epoch": 0.26258953168044075, + "grad_norm": 18.67957878112793, + "learning_rate": 8.478207687919542e-06, + "loss": 0.4747, + "step": 2383 + }, + { + "epoch": 0.26269972451790635, + "grad_norm": 12.184995651245117, + "learning_rate": 8.476951416839904e-06, + "loss": 0.4911, + "step": 2384 + }, + { + "epoch": 0.2628099173553719, + "grad_norm": 5.447048664093018, + "learning_rate": 8.475694720593923e-06, + "loss": 0.4661, + "step": 2385 + }, + { + "epoch": 0.26292011019283745, + "grad_norm": 10.587584495544434, + "learning_rate": 8.47443759933527e-06, + "loss": 0.3728, + "step": 2386 + }, + { + "epoch": 0.263030303030303, + "grad_norm": 6.572682857513428, + "learning_rate": 8.47318005321767e-06, + "loss": 0.4758, + "step": 2387 + }, + { + "epoch": 0.2631404958677686, + "grad_norm": 9.376725196838379, + "learning_rate": 8.471922082394892e-06, + "loss": 0.4961, + "step": 2388 + }, + { + "epoch": 0.26325068870523416, + "grad_norm": 5.249814033508301, + "learning_rate": 8.470663687020769e-06, + "loss": 0.463, + "step": 2389 + }, + { + "epoch": 0.2633608815426997, + "grad_norm": 8.13807201385498, + "learning_rate": 8.469404867249172e-06, + "loss": 0.5362, + "step": 2390 + }, + { + "epoch": 0.2634710743801653, + "grad_norm": 7.37142276763916, + "learning_rate": 8.468145623234036e-06, + "loss": 0.4614, + "step": 2391 + }, + { + "epoch": 0.26358126721763087, + "grad_norm": 6.937657833099365, + "learning_rate": 8.466885955129345e-06, + "loss": 0.4759, + "step": 2392 + }, + { + "epoch": 0.2636914600550964, + "grad_norm": 9.085661888122559, + "learning_rate": 8.465625863089128e-06, + "loss": 0.4831, + "step": 2393 + }, + { + "epoch": 0.26380165289256197, + "grad_norm": 5.514589786529541, + "learning_rate": 8.464365347267473e-06, + "loss": 0.3941, + "step": 2394 + }, + { + "epoch": 0.26391184573002757, + "grad_norm": 5.748258113861084, + "learning_rate": 
8.463104407818518e-06, + "loss": 0.424, + "step": 2395 + }, + { + "epoch": 0.2640220385674931, + "grad_norm": 5.270674228668213, + "learning_rate": 8.461843044896451e-06, + "loss": 0.4682, + "step": 2396 + }, + { + "epoch": 0.2641322314049587, + "grad_norm": 7.216822624206543, + "learning_rate": 8.460581258655515e-06, + "loss": 0.4456, + "step": 2397 + }, + { + "epoch": 0.2642424242424242, + "grad_norm": 4.967555999755859, + "learning_rate": 8.459319049250001e-06, + "loss": 0.4191, + "step": 2398 + }, + { + "epoch": 0.26435261707988983, + "grad_norm": 5.243597984313965, + "learning_rate": 8.458056416834255e-06, + "loss": 0.4045, + "step": 2399 + }, + { + "epoch": 0.2644628099173554, + "grad_norm": 6.000222206115723, + "learning_rate": 8.456793361562674e-06, + "loss": 0.4342, + "step": 2400 + }, + { + "epoch": 0.26457300275482093, + "grad_norm": 3.6822104454040527, + "learning_rate": 8.455529883589703e-06, + "loss": 0.4325, + "step": 2401 + }, + { + "epoch": 0.2646831955922865, + "grad_norm": 6.141874313354492, + "learning_rate": 8.454265983069848e-06, + "loss": 0.4673, + "step": 2402 + }, + { + "epoch": 0.2647933884297521, + "grad_norm": 8.271032333374023, + "learning_rate": 8.453001660157653e-06, + "loss": 0.4486, + "step": 2403 + }, + { + "epoch": 0.26490358126721764, + "grad_norm": 4.449370861053467, + "learning_rate": 8.451736915007725e-06, + "loss": 0.3562, + "step": 2404 + }, + { + "epoch": 0.2650137741046832, + "grad_norm": 7.038038730621338, + "learning_rate": 8.450471747774718e-06, + "loss": 0.4834, + "step": 2405 + }, + { + "epoch": 0.26512396694214874, + "grad_norm": 12.406217575073242, + "learning_rate": 8.449206158613338e-06, + "loss": 0.5946, + "step": 2406 + }, + { + "epoch": 0.26523415977961434, + "grad_norm": 6.856632709503174, + "learning_rate": 8.447940147678346e-06, + "loss": 0.5554, + "step": 2407 + }, + { + "epoch": 0.2653443526170799, + "grad_norm": 7.30984354019165, + "learning_rate": 8.446673715124548e-06, + "loss": 0.5054, + "step": 2408 + }, + { + "epoch": 0.26545454545454544, + "grad_norm": 4.488925457000732, + "learning_rate": 8.445406861106806e-06, + "loss": 0.4447, + "step": 2409 + }, + { + "epoch": 0.265564738292011, + "grad_norm": 6.564624786376953, + "learning_rate": 8.444139585780034e-06, + "loss": 0.4441, + "step": 2410 + }, + { + "epoch": 0.2656749311294766, + "grad_norm": 8.532626152038574, + "learning_rate": 8.442871889299194e-06, + "loss": 0.4346, + "step": 2411 + }, + { + "epoch": 0.26578512396694215, + "grad_norm": 9.0402193069458, + "learning_rate": 8.441603771819302e-06, + "loss": 0.4117, + "step": 2412 + }, + { + "epoch": 0.2658953168044077, + "grad_norm": 7.293003082275391, + "learning_rate": 8.440335233495428e-06, + "loss": 0.3776, + "step": 2413 + }, + { + "epoch": 0.2660055096418733, + "grad_norm": 6.154115676879883, + "learning_rate": 8.439066274482687e-06, + "loss": 0.4284, + "step": 2414 + }, + { + "epoch": 0.26611570247933886, + "grad_norm": 5.79887580871582, + "learning_rate": 8.43779689493625e-06, + "loss": 0.4172, + "step": 2415 + }, + { + "epoch": 0.2662258953168044, + "grad_norm": 9.61327838897705, + "learning_rate": 8.43652709501134e-06, + "loss": 0.4877, + "step": 2416 + }, + { + "epoch": 0.26633608815426996, + "grad_norm": 12.206517219543457, + "learning_rate": 8.43525687486323e-06, + "loss": 0.5253, + "step": 2417 + }, + { + "epoch": 0.26644628099173556, + "grad_norm": 5.559622764587402, + "learning_rate": 8.43398623464724e-06, + "loss": 0.5098, + "step": 2418 + }, + { + "epoch": 0.2665564738292011, + "grad_norm": 
14.062077522277832, + "learning_rate": 8.43271517451875e-06, + "loss": 0.5241, + "step": 2419 + }, + { + "epoch": 0.26666666666666666, + "grad_norm": 6.438302993774414, + "learning_rate": 8.431443694633187e-06, + "loss": 0.4567, + "step": 2420 + }, + { + "epoch": 0.2667768595041322, + "grad_norm": 9.327301979064941, + "learning_rate": 8.430171795146025e-06, + "loss": 0.4578, + "step": 2421 + }, + { + "epoch": 0.2668870523415978, + "grad_norm": 5.0606818199157715, + "learning_rate": 8.428899476212798e-06, + "loss": 0.4326, + "step": 2422 + }, + { + "epoch": 0.26699724517906337, + "grad_norm": 5.966927528381348, + "learning_rate": 8.427626737989085e-06, + "loss": 0.4125, + "step": 2423 + }, + { + "epoch": 0.2671074380165289, + "grad_norm": 6.134056568145752, + "learning_rate": 8.426353580630519e-06, + "loss": 0.3967, + "step": 2424 + }, + { + "epoch": 0.26721763085399447, + "grad_norm": 6.048746585845947, + "learning_rate": 8.425080004292782e-06, + "loss": 0.4668, + "step": 2425 + }, + { + "epoch": 0.2673278236914601, + "grad_norm": 10.070021629333496, + "learning_rate": 8.42380600913161e-06, + "loss": 0.4439, + "step": 2426 + }, + { + "epoch": 0.2674380165289256, + "grad_norm": 6.099440097808838, + "learning_rate": 8.42253159530279e-06, + "loss": 0.4487, + "step": 2427 + }, + { + "epoch": 0.2675482093663912, + "grad_norm": 10.026384353637695, + "learning_rate": 8.421256762962156e-06, + "loss": 0.4665, + "step": 2428 + }, + { + "epoch": 0.2676584022038567, + "grad_norm": 5.96007776260376, + "learning_rate": 8.419981512265596e-06, + "loss": 0.3705, + "step": 2429 + }, + { + "epoch": 0.26776859504132233, + "grad_norm": 8.469380378723145, + "learning_rate": 8.418705843369055e-06, + "loss": 0.4131, + "step": 2430 + }, + { + "epoch": 0.2678787878787879, + "grad_norm": 5.206123352050781, + "learning_rate": 8.417429756428517e-06, + "loss": 0.4689, + "step": 2431 + }, + { + "epoch": 0.26798898071625343, + "grad_norm": 4.994175910949707, + "learning_rate": 8.416153251600026e-06, + "loss": 0.4722, + "step": 2432 + }, + { + "epoch": 0.268099173553719, + "grad_norm": 5.6883225440979, + "learning_rate": 8.414876329039675e-06, + "loss": 0.5019, + "step": 2433 + }, + { + "epoch": 0.2682093663911846, + "grad_norm": 5.061646938323975, + "learning_rate": 8.41359898890361e-06, + "loss": 0.4814, + "step": 2434 + }, + { + "epoch": 0.26831955922865014, + "grad_norm": 7.3997368812561035, + "learning_rate": 8.412321231348022e-06, + "loss": 0.4339, + "step": 2435 + }, + { + "epoch": 0.2684297520661157, + "grad_norm": 7.136973857879639, + "learning_rate": 8.411043056529158e-06, + "loss": 0.4767, + "step": 2436 + }, + { + "epoch": 0.26853994490358124, + "grad_norm": 4.894901275634766, + "learning_rate": 8.409764464603316e-06, + "loss": 0.4344, + "step": 2437 + }, + { + "epoch": 0.26865013774104685, + "grad_norm": 7.450186252593994, + "learning_rate": 8.408485455726844e-06, + "loss": 0.4932, + "step": 2438 + }, + { + "epoch": 0.2687603305785124, + "grad_norm": 12.19626235961914, + "learning_rate": 8.40720603005614e-06, + "loss": 0.4221, + "step": 2439 + }, + { + "epoch": 0.26887052341597795, + "grad_norm": 7.356841087341309, + "learning_rate": 8.405926187747658e-06, + "loss": 0.4903, + "step": 2440 + }, + { + "epoch": 0.26898071625344355, + "grad_norm": 5.755402565002441, + "learning_rate": 8.404645928957891e-06, + "loss": 0.3814, + "step": 2441 + }, + { + "epoch": 0.2690909090909091, + "grad_norm": 9.090841293334961, + "learning_rate": 8.403365253843397e-06, + "loss": 0.4033, + "step": 2442 + }, + { + "epoch": 
0.26920110192837465, + "grad_norm": 5.735386848449707, + "learning_rate": 8.402084162560776e-06, + "loss": 0.3967, + "step": 2443 + }, + { + "epoch": 0.2693112947658402, + "grad_norm": 7.3089704513549805, + "learning_rate": 8.400802655266682e-06, + "loss": 0.5323, + "step": 2444 + }, + { + "epoch": 0.2694214876033058, + "grad_norm": 7.761696815490723, + "learning_rate": 8.39952073211782e-06, + "loss": 0.4866, + "step": 2445 + }, + { + "epoch": 0.26953168044077136, + "grad_norm": 6.164813041687012, + "learning_rate": 8.398238393270946e-06, + "loss": 0.3989, + "step": 2446 + }, + { + "epoch": 0.2696418732782369, + "grad_norm": 6.546687602996826, + "learning_rate": 8.396955638882864e-06, + "loss": 0.3693, + "step": 2447 + }, + { + "epoch": 0.26975206611570246, + "grad_norm": 7.708877086639404, + "learning_rate": 8.395672469110433e-06, + "loss": 0.4088, + "step": 2448 + }, + { + "epoch": 0.26986225895316807, + "grad_norm": 8.575736999511719, + "learning_rate": 8.394388884110559e-06, + "loss": 0.4556, + "step": 2449 + }, + { + "epoch": 0.2699724517906336, + "grad_norm": 9.647896766662598, + "learning_rate": 8.393104884040202e-06, + "loss": 0.4624, + "step": 2450 + }, + { + "epoch": 0.27008264462809917, + "grad_norm": 6.133730411529541, + "learning_rate": 8.391820469056371e-06, + "loss": 0.3986, + "step": 2451 + }, + { + "epoch": 0.2701928374655647, + "grad_norm": 7.404541015625, + "learning_rate": 8.390535639316124e-06, + "loss": 0.3624, + "step": 2452 + }, + { + "epoch": 0.2703030303030303, + "grad_norm": 8.516987800598145, + "learning_rate": 8.389250394976575e-06, + "loss": 0.4818, + "step": 2453 + }, + { + "epoch": 0.2704132231404959, + "grad_norm": 8.884702682495117, + "learning_rate": 8.387964736194884e-06, + "loss": 0.4364, + "step": 2454 + }, + { + "epoch": 0.2705234159779614, + "grad_norm": 11.272823333740234, + "learning_rate": 8.386678663128263e-06, + "loss": 0.4891, + "step": 2455 + }, + { + "epoch": 0.270633608815427, + "grad_norm": 7.883791446685791, + "learning_rate": 8.385392175933974e-06, + "loss": 0.4202, + "step": 2456 + }, + { + "epoch": 0.2707438016528926, + "grad_norm": 8.155399322509766, + "learning_rate": 8.384105274769331e-06, + "loss": 0.433, + "step": 2457 + }, + { + "epoch": 0.27085399449035813, + "grad_norm": 8.103219032287598, + "learning_rate": 8.3828179597917e-06, + "loss": 0.4528, + "step": 2458 + }, + { + "epoch": 0.2709641873278237, + "grad_norm": 5.788285255432129, + "learning_rate": 8.381530231158493e-06, + "loss": 0.3991, + "step": 2459 + }, + { + "epoch": 0.27107438016528923, + "grad_norm": 10.736172676086426, + "learning_rate": 8.380242089027174e-06, + "loss": 0.4536, + "step": 2460 + }, + { + "epoch": 0.27118457300275484, + "grad_norm": 7.713866710662842, + "learning_rate": 8.378953533555261e-06, + "loss": 0.4379, + "step": 2461 + }, + { + "epoch": 0.2712947658402204, + "grad_norm": 6.560085773468018, + "learning_rate": 8.377664564900322e-06, + "loss": 0.4984, + "step": 2462 + }, + { + "epoch": 0.27140495867768594, + "grad_norm": 7.5404372215271, + "learning_rate": 8.376375183219972e-06, + "loss": 0.3497, + "step": 2463 + }, + { + "epoch": 0.27151515151515154, + "grad_norm": 7.938744068145752, + "learning_rate": 8.375085388671877e-06, + "loss": 0.4557, + "step": 2464 + }, + { + "epoch": 0.2716253443526171, + "grad_norm": 5.309293746948242, + "learning_rate": 8.373795181413757e-06, + "loss": 0.4525, + "step": 2465 + }, + { + "epoch": 0.27173553719008264, + "grad_norm": 3.6332757472991943, + "learning_rate": 8.372504561603379e-06, + "loss": 0.4396, + 
"step": 2466 + }, + { + "epoch": 0.2718457300275482, + "grad_norm": 5.703434944152832, + "learning_rate": 8.371213529398561e-06, + "loss": 0.4696, + "step": 2467 + }, + { + "epoch": 0.2719559228650138, + "grad_norm": 9.001897811889648, + "learning_rate": 8.369922084957174e-06, + "loss": 0.4306, + "step": 2468 + }, + { + "epoch": 0.27206611570247935, + "grad_norm": 6.7894415855407715, + "learning_rate": 8.368630228437137e-06, + "loss": 0.4258, + "step": 2469 + }, + { + "epoch": 0.2721763085399449, + "grad_norm": 4.18749475479126, + "learning_rate": 8.36733795999642e-06, + "loss": 0.3744, + "step": 2470 + }, + { + "epoch": 0.27228650137741045, + "grad_norm": 7.989779949188232, + "learning_rate": 8.366045279793042e-06, + "loss": 0.4467, + "step": 2471 + }, + { + "epoch": 0.27239669421487606, + "grad_norm": 8.87408447265625, + "learning_rate": 8.364752187985077e-06, + "loss": 0.4525, + "step": 2472 + }, + { + "epoch": 0.2725068870523416, + "grad_norm": 10.908501625061035, + "learning_rate": 8.363458684730642e-06, + "loss": 0.4576, + "step": 2473 + }, + { + "epoch": 0.27261707988980716, + "grad_norm": 6.029660224914551, + "learning_rate": 8.36216477018791e-06, + "loss": 0.4239, + "step": 2474 + }, + { + "epoch": 0.2727272727272727, + "grad_norm": 7.507077217102051, + "learning_rate": 8.360870444515104e-06, + "loss": 0.3579, + "step": 2475 + }, + { + "epoch": 0.2728374655647383, + "grad_norm": 6.4907097816467285, + "learning_rate": 8.359575707870495e-06, + "loss": 0.4084, + "step": 2476 + }, + { + "epoch": 0.27294765840220386, + "grad_norm": 8.459695816040039, + "learning_rate": 8.358280560412403e-06, + "loss": 0.4789, + "step": 2477 + }, + { + "epoch": 0.2730578512396694, + "grad_norm": 7.87397575378418, + "learning_rate": 8.356985002299205e-06, + "loss": 0.4314, + "step": 2478 + }, + { + "epoch": 0.27316804407713496, + "grad_norm": 8.098743438720703, + "learning_rate": 8.355689033689321e-06, + "loss": 0.482, + "step": 2479 + }, + { + "epoch": 0.27327823691460057, + "grad_norm": 9.823105812072754, + "learning_rate": 8.35439265474122e-06, + "loss": 0.4553, + "step": 2480 + }, + { + "epoch": 0.2733884297520661, + "grad_norm": 7.965592861175537, + "learning_rate": 8.353095865613433e-06, + "loss": 0.4911, + "step": 2481 + }, + { + "epoch": 0.27349862258953167, + "grad_norm": 6.894061088562012, + "learning_rate": 8.351798666464527e-06, + "loss": 0.432, + "step": 2482 + }, + { + "epoch": 0.2736088154269972, + "grad_norm": 6.235803127288818, + "learning_rate": 8.350501057453127e-06, + "loss": 0.4212, + "step": 2483 + }, + { + "epoch": 0.2737190082644628, + "grad_norm": 9.930096626281738, + "learning_rate": 8.349203038737904e-06, + "loss": 0.43, + "step": 2484 + }, + { + "epoch": 0.2738292011019284, + "grad_norm": 10.2173433303833, + "learning_rate": 8.347904610477588e-06, + "loss": 0.5353, + "step": 2485 + }, + { + "epoch": 0.2739393939393939, + "grad_norm": 7.106429100036621, + "learning_rate": 8.346605772830946e-06, + "loss": 0.512, + "step": 2486 + }, + { + "epoch": 0.2740495867768595, + "grad_norm": 9.963252067565918, + "learning_rate": 8.345306525956807e-06, + "loss": 0.5617, + "step": 2487 + }, + { + "epoch": 0.2741597796143251, + "grad_norm": 5.965456008911133, + "learning_rate": 8.344006870014039e-06, + "loss": 0.3919, + "step": 2488 + }, + { + "epoch": 0.27426997245179063, + "grad_norm": 5.027235507965088, + "learning_rate": 8.34270680516157e-06, + "loss": 0.431, + "step": 2489 + }, + { + "epoch": 0.2743801652892562, + "grad_norm": 7.0271992683410645, + "learning_rate": 8.341406331558373e-06, 
+ "loss": 0.4549, + "step": 2490 + }, + { + "epoch": 0.2744903581267218, + "grad_norm": 7.3226518630981445, + "learning_rate": 8.34010544936347e-06, + "loss": 0.4255, + "step": 2491 + }, + { + "epoch": 0.27460055096418734, + "grad_norm": 6.1253862380981445, + "learning_rate": 8.338804158735934e-06, + "loss": 0.4528, + "step": 2492 + }, + { + "epoch": 0.2747107438016529, + "grad_norm": 6.783452033996582, + "learning_rate": 8.337502459834892e-06, + "loss": 0.4496, + "step": 2493 + }, + { + "epoch": 0.27482093663911844, + "grad_norm": 5.832024097442627, + "learning_rate": 8.336200352819514e-06, + "loss": 0.3755, + "step": 2494 + }, + { + "epoch": 0.27493112947658405, + "grad_norm": 5.6729736328125, + "learning_rate": 8.334897837849027e-06, + "loss": 0.4363, + "step": 2495 + }, + { + "epoch": 0.2750413223140496, + "grad_norm": 7.188202857971191, + "learning_rate": 8.333594915082701e-06, + "loss": 0.4692, + "step": 2496 + }, + { + "epoch": 0.27515151515151515, + "grad_norm": 6.959208965301514, + "learning_rate": 8.33229158467986e-06, + "loss": 0.4514, + "step": 2497 + }, + { + "epoch": 0.2752617079889807, + "grad_norm": 7.196948051452637, + "learning_rate": 8.330987846799881e-06, + "loss": 0.4127, + "step": 2498 + }, + { + "epoch": 0.2753719008264463, + "grad_norm": 9.837214469909668, + "learning_rate": 8.32968370160218e-06, + "loss": 0.4987, + "step": 2499 + }, + { + "epoch": 0.27548209366391185, + "grad_norm": 10.358495712280273, + "learning_rate": 8.328379149246234e-06, + "loss": 0.3694, + "step": 2500 + }, + { + "epoch": 0.2755922865013774, + "grad_norm": 13.23357105255127, + "learning_rate": 8.327074189891564e-06, + "loss": 0.5031, + "step": 2501 + }, + { + "epoch": 0.27570247933884295, + "grad_norm": 8.414778709411621, + "learning_rate": 8.325768823697743e-06, + "loss": 0.4738, + "step": 2502 + }, + { + "epoch": 0.27581267217630856, + "grad_norm": 7.910693645477295, + "learning_rate": 8.324463050824394e-06, + "loss": 0.4116, + "step": 2503 + }, + { + "epoch": 0.2759228650137741, + "grad_norm": 8.252347946166992, + "learning_rate": 8.323156871431186e-06, + "loss": 0.3922, + "step": 2504 + }, + { + "epoch": 0.27603305785123966, + "grad_norm": 8.44232177734375, + "learning_rate": 8.321850285677842e-06, + "loss": 0.4281, + "step": 2505 + }, + { + "epoch": 0.2761432506887052, + "grad_norm": 6.815550804138184, + "learning_rate": 8.320543293724133e-06, + "loss": 0.4442, + "step": 2506 + }, + { + "epoch": 0.2762534435261708, + "grad_norm": 13.305642127990723, + "learning_rate": 8.319235895729878e-06, + "loss": 0.4527, + "step": 2507 + }, + { + "epoch": 0.27636363636363637, + "grad_norm": 5.294179916381836, + "learning_rate": 8.31792809185495e-06, + "loss": 0.4298, + "step": 2508 + }, + { + "epoch": 0.2764738292011019, + "grad_norm": 7.0405097007751465, + "learning_rate": 8.316619882259268e-06, + "loss": 0.4452, + "step": 2509 + }, + { + "epoch": 0.27658402203856747, + "grad_norm": 6.023338794708252, + "learning_rate": 8.315311267102802e-06, + "loss": 0.3899, + "step": 2510 + }, + { + "epoch": 0.2766942148760331, + "grad_norm": 10.251482963562012, + "learning_rate": 8.31400224654557e-06, + "loss": 0.5171, + "step": 2511 + }, + { + "epoch": 0.2768044077134986, + "grad_norm": 6.565580368041992, + "learning_rate": 8.312692820747644e-06, + "loss": 0.3834, + "step": 2512 + }, + { + "epoch": 0.2769146005509642, + "grad_norm": 6.3364577293396, + "learning_rate": 8.311382989869137e-06, + "loss": 0.4053, + "step": 2513 + }, + { + "epoch": 0.2770247933884298, + "grad_norm": 5.889908313751221, + 
"learning_rate": 8.310072754070223e-06, + "loss": 0.4285, + "step": 2514 + }, + { + "epoch": 0.27713498622589533, + "grad_norm": 5.778831958770752, + "learning_rate": 8.308762113511112e-06, + "loss": 0.4834, + "step": 2515 + }, + { + "epoch": 0.2772451790633609, + "grad_norm": 5.996057987213135, + "learning_rate": 8.307451068352078e-06, + "loss": 0.4179, + "step": 2516 + }, + { + "epoch": 0.27735537190082643, + "grad_norm": 8.068934440612793, + "learning_rate": 8.306139618753434e-06, + "loss": 0.4559, + "step": 2517 + }, + { + "epoch": 0.27746556473829204, + "grad_norm": 7.231866836547852, + "learning_rate": 8.304827764875547e-06, + "loss": 0.4152, + "step": 2518 + }, + { + "epoch": 0.2775757575757576, + "grad_norm": 5.904922962188721, + "learning_rate": 8.303515506878833e-06, + "loss": 0.3983, + "step": 2519 + }, + { + "epoch": 0.27768595041322314, + "grad_norm": 10.93997573852539, + "learning_rate": 8.302202844923753e-06, + "loss": 0.5144, + "step": 2520 + }, + { + "epoch": 0.2777961432506887, + "grad_norm": 8.458961486816406, + "learning_rate": 8.300889779170824e-06, + "loss": 0.3975, + "step": 2521 + }, + { + "epoch": 0.2779063360881543, + "grad_norm": 8.097896575927734, + "learning_rate": 8.29957630978061e-06, + "loss": 0.3939, + "step": 2522 + }, + { + "epoch": 0.27801652892561984, + "grad_norm": 6.291425704956055, + "learning_rate": 8.298262436913722e-06, + "loss": 0.3961, + "step": 2523 + }, + { + "epoch": 0.2781267217630854, + "grad_norm": 4.316643714904785, + "learning_rate": 8.296948160730822e-06, + "loss": 0.4179, + "step": 2524 + }, + { + "epoch": 0.27823691460055094, + "grad_norm": 8.72903823852539, + "learning_rate": 8.295633481392625e-06, + "loss": 0.4965, + "step": 2525 + }, + { + "epoch": 0.27834710743801655, + "grad_norm": 6.223062515258789, + "learning_rate": 8.294318399059888e-06, + "loss": 0.4358, + "step": 2526 + }, + { + "epoch": 0.2784573002754821, + "grad_norm": 10.879405975341797, + "learning_rate": 8.293002913893422e-06, + "loss": 0.4807, + "step": 2527 + }, + { + "epoch": 0.27856749311294765, + "grad_norm": 4.838342189788818, + "learning_rate": 8.291687026054086e-06, + "loss": 0.3958, + "step": 2528 + }, + { + "epoch": 0.2786776859504132, + "grad_norm": 7.267243385314941, + "learning_rate": 8.290370735702791e-06, + "loss": 0.49, + "step": 2529 + }, + { + "epoch": 0.2787878787878788, + "grad_norm": 9.317253112792969, + "learning_rate": 8.28905404300049e-06, + "loss": 0.4144, + "step": 2530 + }, + { + "epoch": 0.27889807162534436, + "grad_norm": 5.159974098205566, + "learning_rate": 8.287736948108197e-06, + "loss": 0.4972, + "step": 2531 + }, + { + "epoch": 0.2790082644628099, + "grad_norm": 8.115497589111328, + "learning_rate": 8.28641945118696e-06, + "loss": 0.3957, + "step": 2532 + }, + { + "epoch": 0.27911845730027546, + "grad_norm": 5.645632743835449, + "learning_rate": 8.285101552397892e-06, + "loss": 0.4226, + "step": 2533 + }, + { + "epoch": 0.27922865013774106, + "grad_norm": 7.425549030303955, + "learning_rate": 8.28378325190214e-06, + "loss": 0.4331, + "step": 2534 + }, + { + "epoch": 0.2793388429752066, + "grad_norm": 7.270996570587158, + "learning_rate": 8.282464549860915e-06, + "loss": 0.4335, + "step": 2535 + }, + { + "epoch": 0.27944903581267216, + "grad_norm": 11.127833366394043, + "learning_rate": 8.281145446435463e-06, + "loss": 0.4832, + "step": 2536 + }, + { + "epoch": 0.2795592286501377, + "grad_norm": 5.004961013793945, + "learning_rate": 8.27982594178709e-06, + "loss": 0.4377, + "step": 2537 + }, + { + "epoch": 0.2796694214876033, + 
"grad_norm": 7.438329219818115, + "learning_rate": 8.278506036077146e-06, + "loss": 0.4642, + "step": 2538 + }, + { + "epoch": 0.27977961432506887, + "grad_norm": 6.2618937492370605, + "learning_rate": 8.27718572946703e-06, + "loss": 0.4354, + "step": 2539 + }, + { + "epoch": 0.2798898071625344, + "grad_norm": 4.871488094329834, + "learning_rate": 8.275865022118193e-06, + "loss": 0.3845, + "step": 2540 + }, + { + "epoch": 0.28, + "grad_norm": 6.332246780395508, + "learning_rate": 8.274543914192129e-06, + "loss": 0.4488, + "step": 2541 + }, + { + "epoch": 0.2801101928374656, + "grad_norm": 6.855973243713379, + "learning_rate": 8.273222405850388e-06, + "loss": 0.4389, + "step": 2542 + }, + { + "epoch": 0.2802203856749311, + "grad_norm": 10.263267517089844, + "learning_rate": 8.271900497254566e-06, + "loss": 0.4875, + "step": 2543 + }, + { + "epoch": 0.2803305785123967, + "grad_norm": 4.8031415939331055, + "learning_rate": 8.270578188566305e-06, + "loss": 0.3905, + "step": 2544 + }, + { + "epoch": 0.2804407713498623, + "grad_norm": 3.894521713256836, + "learning_rate": 8.2692554799473e-06, + "loss": 0.3452, + "step": 2545 + }, + { + "epoch": 0.28055096418732783, + "grad_norm": 7.412913799285889, + "learning_rate": 8.267932371559297e-06, + "loss": 0.3899, + "step": 2546 + }, + { + "epoch": 0.2806611570247934, + "grad_norm": 4.74569845199585, + "learning_rate": 8.266608863564083e-06, + "loss": 0.4083, + "step": 2547 + }, + { + "epoch": 0.28077134986225893, + "grad_norm": 5.828033924102783, + "learning_rate": 8.265284956123498e-06, + "loss": 0.3356, + "step": 2548 + }, + { + "epoch": 0.28088154269972454, + "grad_norm": 5.4041948318481445, + "learning_rate": 8.263960649399437e-06, + "loss": 0.4093, + "step": 2549 + }, + { + "epoch": 0.2809917355371901, + "grad_norm": 7.65022611618042, + "learning_rate": 8.26263594355383e-06, + "loss": 0.3738, + "step": 2550 + }, + { + "epoch": 0.28110192837465564, + "grad_norm": 6.111780643463135, + "learning_rate": 8.261310838748671e-06, + "loss": 0.3792, + "step": 2551 + }, + { + "epoch": 0.2812121212121212, + "grad_norm": 7.705667495727539, + "learning_rate": 8.25998533514599e-06, + "loss": 0.4507, + "step": 2552 + }, + { + "epoch": 0.2813223140495868, + "grad_norm": 6.605652332305908, + "learning_rate": 8.258659432907877e-06, + "loss": 0.3666, + "step": 2553 + }, + { + "epoch": 0.28143250688705235, + "grad_norm": 7.341992378234863, + "learning_rate": 8.257333132196461e-06, + "loss": 0.4708, + "step": 2554 + }, + { + "epoch": 0.2815426997245179, + "grad_norm": 4.790026664733887, + "learning_rate": 8.256006433173925e-06, + "loss": 0.3901, + "step": 2555 + }, + { + "epoch": 0.28165289256198345, + "grad_norm": 8.660374641418457, + "learning_rate": 8.254679336002498e-06, + "loss": 0.398, + "step": 2556 + }, + { + "epoch": 0.28176308539944905, + "grad_norm": 7.649413108825684, + "learning_rate": 8.25335184084446e-06, + "loss": 0.4353, + "step": 2557 + }, + { + "epoch": 0.2818732782369146, + "grad_norm": 17.531639099121094, + "learning_rate": 8.252023947862143e-06, + "loss": 0.5034, + "step": 2558 + }, + { + "epoch": 0.28198347107438015, + "grad_norm": 8.118510246276855, + "learning_rate": 8.250695657217919e-06, + "loss": 0.4191, + "step": 2559 + }, + { + "epoch": 0.2820936639118457, + "grad_norm": 8.229848861694336, + "learning_rate": 8.249366969074215e-06, + "loss": 0.4461, + "step": 2560 + }, + { + "epoch": 0.2822038567493113, + "grad_norm": 9.5634765625, + "learning_rate": 8.248037883593502e-06, + "loss": 0.395, + "step": 2561 + }, + { + "epoch": 
0.28231404958677686, + "grad_norm": 4.810074806213379, + "learning_rate": 8.246708400938306e-06, + "loss": 0.4922, + "step": 2562 + }, + { + "epoch": 0.2824242424242424, + "grad_norm": 16.95096778869629, + "learning_rate": 8.245378521271196e-06, + "loss": 0.4841, + "step": 2563 + }, + { + "epoch": 0.28253443526170796, + "grad_norm": 7.1187896728515625, + "learning_rate": 8.244048244754792e-06, + "loss": 0.3845, + "step": 2564 + }, + { + "epoch": 0.28264462809917357, + "grad_norm": 4.641600608825684, + "learning_rate": 8.242717571551763e-06, + "loss": 0.4276, + "step": 2565 + }, + { + "epoch": 0.2827548209366391, + "grad_norm": 7.558225631713867, + "learning_rate": 8.241386501824824e-06, + "loss": 0.4607, + "step": 2566 + }, + { + "epoch": 0.28286501377410467, + "grad_norm": 7.707322597503662, + "learning_rate": 8.24005503573674e-06, + "loss": 0.474, + "step": 2567 + }, + { + "epoch": 0.2829752066115703, + "grad_norm": 12.64372730255127, + "learning_rate": 8.238723173450326e-06, + "loss": 0.4801, + "step": 2568 + }, + { + "epoch": 0.2830853994490358, + "grad_norm": 5.532975673675537, + "learning_rate": 8.237390915128443e-06, + "loss": 0.4439, + "step": 2569 + }, + { + "epoch": 0.2831955922865014, + "grad_norm": 6.830770969390869, + "learning_rate": 8.236058260934e-06, + "loss": 0.4411, + "step": 2570 + }, + { + "epoch": 0.2833057851239669, + "grad_norm": 7.833096981048584, + "learning_rate": 8.234725211029957e-06, + "loss": 0.4083, + "step": 2571 + }, + { + "epoch": 0.28341597796143253, + "grad_norm": 6.798170566558838, + "learning_rate": 8.233391765579323e-06, + "loss": 0.4258, + "step": 2572 + }, + { + "epoch": 0.2835261707988981, + "grad_norm": 5.830904006958008, + "learning_rate": 8.232057924745152e-06, + "loss": 0.3322, + "step": 2573 + }, + { + "epoch": 0.28363636363636363, + "grad_norm": 9.897858619689941, + "learning_rate": 8.230723688690546e-06, + "loss": 0.511, + "step": 2574 + }, + { + "epoch": 0.2837465564738292, + "grad_norm": 9.8292818069458, + "learning_rate": 8.22938905757866e-06, + "loss": 0.5303, + "step": 2575 + }, + { + "epoch": 0.2838567493112948, + "grad_norm": 5.095852851867676, + "learning_rate": 8.228054031572692e-06, + "loss": 0.4417, + "step": 2576 + }, + { + "epoch": 0.28396694214876034, + "grad_norm": 7.318708419799805, + "learning_rate": 8.226718610835894e-06, + "loss": 0.4943, + "step": 2577 + }, + { + "epoch": 0.2840771349862259, + "grad_norm": 5.821365833282471, + "learning_rate": 8.225382795531558e-06, + "loss": 0.479, + "step": 2578 + }, + { + "epoch": 0.28418732782369144, + "grad_norm": 5.989372253417969, + "learning_rate": 8.224046585823035e-06, + "loss": 0.3857, + "step": 2579 + }, + { + "epoch": 0.28429752066115704, + "grad_norm": 8.840926170349121, + "learning_rate": 8.222709981873716e-06, + "loss": 0.4709, + "step": 2580 + }, + { + "epoch": 0.2844077134986226, + "grad_norm": 10.142492294311523, + "learning_rate": 8.221372983847043e-06, + "loss": 0.5157, + "step": 2581 + }, + { + "epoch": 0.28451790633608814, + "grad_norm": 11.084004402160645, + "learning_rate": 8.220035591906505e-06, + "loss": 0.5256, + "step": 2582 + }, + { + "epoch": 0.2846280991735537, + "grad_norm": 7.02859354019165, + "learning_rate": 8.21869780621564e-06, + "loss": 0.4386, + "step": 2583 + }, + { + "epoch": 0.2847382920110193, + "grad_norm": 7.2068939208984375, + "learning_rate": 8.217359626938037e-06, + "loss": 0.4258, + "step": 2584 + }, + { + "epoch": 0.28484848484848485, + "grad_norm": 6.029386520385742, + "learning_rate": 8.216021054237329e-06, + "loss": 0.4751, + "step": 
2585 + }, + { + "epoch": 0.2849586776859504, + "grad_norm": 5.800614833831787, + "learning_rate": 8.214682088277195e-06, + "loss": 0.3853, + "step": 2586 + }, + { + "epoch": 0.28506887052341595, + "grad_norm": 5.438484191894531, + "learning_rate": 8.21334272922137e-06, + "loss": 0.4587, + "step": 2587 + }, + { + "epoch": 0.28517906336088156, + "grad_norm": 5.722385883331299, + "learning_rate": 8.212002977233632e-06, + "loss": 0.3503, + "step": 2588 + }, + { + "epoch": 0.2852892561983471, + "grad_norm": 5.973930835723877, + "learning_rate": 8.210662832477806e-06, + "loss": 0.3744, + "step": 2589 + }, + { + "epoch": 0.28539944903581266, + "grad_norm": 5.600560665130615, + "learning_rate": 8.209322295117768e-06, + "loss": 0.468, + "step": 2590 + }, + { + "epoch": 0.28550964187327826, + "grad_norm": 7.9106831550598145, + "learning_rate": 8.20798136531744e-06, + "loss": 0.4752, + "step": 2591 + }, + { + "epoch": 0.2856198347107438, + "grad_norm": 9.829314231872559, + "learning_rate": 8.206640043240793e-06, + "loss": 0.3866, + "step": 2592 + }, + { + "epoch": 0.28573002754820936, + "grad_norm": 7.940008640289307, + "learning_rate": 8.205298329051845e-06, + "loss": 0.434, + "step": 2593 + }, + { + "epoch": 0.2858402203856749, + "grad_norm": 8.637011528015137, + "learning_rate": 8.203956222914665e-06, + "loss": 0.4344, + "step": 2594 + }, + { + "epoch": 0.2859504132231405, + "grad_norm": 5.530643463134766, + "learning_rate": 8.202613724993364e-06, + "loss": 0.3966, + "step": 2595 + }, + { + "epoch": 0.28606060606060607, + "grad_norm": 4.837628364562988, + "learning_rate": 8.201270835452108e-06, + "loss": 0.375, + "step": 2596 + }, + { + "epoch": 0.2861707988980716, + "grad_norm": 4.013152122497559, + "learning_rate": 8.199927554455106e-06, + "loss": 0.3959, + "step": 2597 + }, + { + "epoch": 0.28628099173553717, + "grad_norm": 5.780310153961182, + "learning_rate": 8.198583882166613e-06, + "loss": 0.4421, + "step": 2598 + }, + { + "epoch": 0.2863911845730028, + "grad_norm": 6.987085342407227, + "learning_rate": 8.197239818750942e-06, + "loss": 0.4186, + "step": 2599 + }, + { + "epoch": 0.2865013774104683, + "grad_norm": 4.513391017913818, + "learning_rate": 8.19589536437244e-06, + "loss": 0.4329, + "step": 2600 + }, + { + "epoch": 0.2866115702479339, + "grad_norm": 5.8561787605285645, + "learning_rate": 8.194550519195512e-06, + "loss": 0.4044, + "step": 2601 + }, + { + "epoch": 0.28672176308539943, + "grad_norm": 11.793116569519043, + "learning_rate": 8.193205283384608e-06, + "loss": 0.4767, + "step": 2602 + }, + { + "epoch": 0.28683195592286503, + "grad_norm": 9.649199485778809, + "learning_rate": 8.191859657104225e-06, + "loss": 0.4942, + "step": 2603 + }, + { + "epoch": 0.2869421487603306, + "grad_norm": 10.381978034973145, + "learning_rate": 8.190513640518906e-06, + "loss": 0.4311, + "step": 2604 + }, + { + "epoch": 0.28705234159779613, + "grad_norm": 7.879476070404053, + "learning_rate": 8.189167233793248e-06, + "loss": 0.3897, + "step": 2605 + }, + { + "epoch": 0.2871625344352617, + "grad_norm": 8.143092155456543, + "learning_rate": 8.187820437091885e-06, + "loss": 0.4834, + "step": 2606 + }, + { + "epoch": 0.2872727272727273, + "grad_norm": 8.470446586608887, + "learning_rate": 8.18647325057951e-06, + "loss": 0.578, + "step": 2607 + }, + { + "epoch": 0.28738292011019284, + "grad_norm": 5.060790538787842, + "learning_rate": 8.185125674420857e-06, + "loss": 0.4648, + "step": 2608 + }, + { + "epoch": 0.2874931129476584, + "grad_norm": 8.086231231689453, + "learning_rate": 8.18377770878071e-06, 
+ "loss": 0.4915, + "step": 2609 + }, + { + "epoch": 0.28760330578512394, + "grad_norm": 9.0546293258667, + "learning_rate": 8.182429353823901e-06, + "loss": 0.4771, + "step": 2610 + }, + { + "epoch": 0.28771349862258955, + "grad_norm": 9.38154125213623, + "learning_rate": 8.181080609715309e-06, + "loss": 0.4612, + "step": 2611 + }, + { + "epoch": 0.2878236914600551, + "grad_norm": 9.869471549987793, + "learning_rate": 8.179731476619858e-06, + "loss": 0.5246, + "step": 2612 + }, + { + "epoch": 0.28793388429752065, + "grad_norm": 6.299503803253174, + "learning_rate": 8.178381954702521e-06, + "loss": 0.3416, + "step": 2613 + }, + { + "epoch": 0.2880440771349862, + "grad_norm": 5.722007751464844, + "learning_rate": 8.177032044128323e-06, + "loss": 0.4092, + "step": 2614 + }, + { + "epoch": 0.2881542699724518, + "grad_norm": 8.35212516784668, + "learning_rate": 8.175681745062329e-06, + "loss": 0.378, + "step": 2615 + }, + { + "epoch": 0.28826446280991735, + "grad_norm": 5.700195789337158, + "learning_rate": 8.17433105766966e-06, + "loss": 0.4921, + "step": 2616 + }, + { + "epoch": 0.2883746556473829, + "grad_norm": 7.712435245513916, + "learning_rate": 8.172979982115477e-06, + "loss": 0.5392, + "step": 2617 + }, + { + "epoch": 0.2884848484848485, + "grad_norm": 4.76263952255249, + "learning_rate": 8.17162851856499e-06, + "loss": 0.4437, + "step": 2618 + }, + { + "epoch": 0.28859504132231406, + "grad_norm": 6.987651348114014, + "learning_rate": 8.170276667183461e-06, + "loss": 0.2997, + "step": 2619 + }, + { + "epoch": 0.2887052341597796, + "grad_norm": 9.844829559326172, + "learning_rate": 8.168924428136193e-06, + "loss": 0.4491, + "step": 2620 + }, + { + "epoch": 0.28881542699724516, + "grad_norm": 6.718286514282227, + "learning_rate": 8.167571801588542e-06, + "loss": 0.4666, + "step": 2621 + }, + { + "epoch": 0.28892561983471077, + "grad_norm": 8.184840202331543, + "learning_rate": 8.16621878770591e-06, + "loss": 0.474, + "step": 2622 + }, + { + "epoch": 0.2890358126721763, + "grad_norm": 5.251203536987305, + "learning_rate": 8.164865386653743e-06, + "loss": 0.3551, + "step": 2623 + }, + { + "epoch": 0.28914600550964187, + "grad_norm": 4.9718475341796875, + "learning_rate": 8.163511598597537e-06, + "loss": 0.415, + "step": 2624 + }, + { + "epoch": 0.2892561983471074, + "grad_norm": 5.431325435638428, + "learning_rate": 8.162157423702836e-06, + "loss": 0.3606, + "step": 2625 + }, + { + "epoch": 0.289366391184573, + "grad_norm": 7.744053840637207, + "learning_rate": 8.16080286213523e-06, + "loss": 0.503, + "step": 2626 + }, + { + "epoch": 0.2894765840220386, + "grad_norm": 5.625741004943848, + "learning_rate": 8.159447914060357e-06, + "loss": 0.3628, + "step": 2627 + }, + { + "epoch": 0.2895867768595041, + "grad_norm": 6.573037624359131, + "learning_rate": 8.1580925796439e-06, + "loss": 0.4738, + "step": 2628 + }, + { + "epoch": 0.2896969696969697, + "grad_norm": 10.216695785522461, + "learning_rate": 8.156736859051592e-06, + "loss": 0.4733, + "step": 2629 + }, + { + "epoch": 0.2898071625344353, + "grad_norm": 6.306691646575928, + "learning_rate": 8.155380752449213e-06, + "loss": 0.4845, + "step": 2630 + }, + { + "epoch": 0.28991735537190083, + "grad_norm": 5.268336296081543, + "learning_rate": 8.154024260002592e-06, + "loss": 0.4264, + "step": 2631 + }, + { + "epoch": 0.2900275482093664, + "grad_norm": 8.599966049194336, + "learning_rate": 8.152667381877596e-06, + "loss": 0.4741, + "step": 2632 + }, + { + "epoch": 0.29013774104683193, + "grad_norm": 6.03732967376709, + "learning_rate": 
8.151310118240155e-06, + "loss": 0.4128, + "step": 2633 + }, + { + "epoch": 0.29024793388429754, + "grad_norm": 4.368444919586182, + "learning_rate": 8.149952469256228e-06, + "loss": 0.43, + "step": 2634 + }, + { + "epoch": 0.2903581267217631, + "grad_norm": 5.164309024810791, + "learning_rate": 8.148594435091837e-06, + "loss": 0.3865, + "step": 2635 + }, + { + "epoch": 0.29046831955922864, + "grad_norm": 11.640958786010742, + "learning_rate": 8.14723601591304e-06, + "loss": 0.5386, + "step": 2636 + }, + { + "epoch": 0.2905785123966942, + "grad_norm": 4.805738925933838, + "learning_rate": 8.145877211885949e-06, + "loss": 0.3438, + "step": 2637 + }, + { + "epoch": 0.2906887052341598, + "grad_norm": 5.049983501434326, + "learning_rate": 8.144518023176718e-06, + "loss": 0.4094, + "step": 2638 + }, + { + "epoch": 0.29079889807162534, + "grad_norm": 4.325822353363037, + "learning_rate": 8.14315844995155e-06, + "loss": 0.447, + "step": 2639 + }, + { + "epoch": 0.2909090909090909, + "grad_norm": 5.957462787628174, + "learning_rate": 8.141798492376702e-06, + "loss": 0.47, + "step": 2640 + }, + { + "epoch": 0.2910192837465565, + "grad_norm": 9.82888126373291, + "learning_rate": 8.140438150618463e-06, + "loss": 0.4671, + "step": 2641 + }, + { + "epoch": 0.29112947658402205, + "grad_norm": 7.724475383758545, + "learning_rate": 8.139077424843183e-06, + "loss": 0.4302, + "step": 2642 + }, + { + "epoch": 0.2912396694214876, + "grad_norm": 5.9646100997924805, + "learning_rate": 8.13771631521725e-06, + "loss": 0.488, + "step": 2643 + }, + { + "epoch": 0.29134986225895315, + "grad_norm": 5.967033863067627, + "learning_rate": 8.136354821907104e-06, + "loss": 0.4025, + "step": 2644 + }, + { + "epoch": 0.29146005509641876, + "grad_norm": 9.786425590515137, + "learning_rate": 8.13499294507923e-06, + "loss": 0.4533, + "step": 2645 + }, + { + "epoch": 0.2915702479338843, + "grad_norm": 9.216123580932617, + "learning_rate": 8.133630684900162e-06, + "loss": 0.5309, + "step": 2646 + }, + { + "epoch": 0.29168044077134986, + "grad_norm": 12.148828506469727, + "learning_rate": 8.132268041536476e-06, + "loss": 0.4823, + "step": 2647 + }, + { + "epoch": 0.2917906336088154, + "grad_norm": 7.568973064422607, + "learning_rate": 8.130905015154799e-06, + "loss": 0.5063, + "step": 2648 + }, + { + "epoch": 0.291900826446281, + "grad_norm": 6.7498779296875, + "learning_rate": 8.129541605921803e-06, + "loss": 0.4117, + "step": 2649 + }, + { + "epoch": 0.29201101928374656, + "grad_norm": 5.429599761962891, + "learning_rate": 8.12817781400421e-06, + "loss": 0.4413, + "step": 2650 + }, + { + "epoch": 0.2921212121212121, + "grad_norm": 6.940309047698975, + "learning_rate": 8.126813639568783e-06, + "loss": 0.4633, + "step": 2651 + }, + { + "epoch": 0.29223140495867767, + "grad_norm": 11.138673782348633, + "learning_rate": 8.125449082782337e-06, + "loss": 0.4829, + "step": 2652 + }, + { + "epoch": 0.29234159779614327, + "grad_norm": 6.361445903778076, + "learning_rate": 8.124084143811732e-06, + "loss": 0.3944, + "step": 2653 + }, + { + "epoch": 0.2924517906336088, + "grad_norm": 5.243053436279297, + "learning_rate": 8.122718822823877e-06, + "loss": 0.493, + "step": 2654 + }, + { + "epoch": 0.29256198347107437, + "grad_norm": 5.032866954803467, + "learning_rate": 8.12135311998572e-06, + "loss": 0.4236, + "step": 2655 + }, + { + "epoch": 0.2926721763085399, + "grad_norm": 9.60926628112793, + "learning_rate": 8.119987035464263e-06, + "loss": 0.4285, + "step": 2656 + }, + { + "epoch": 0.29278236914600553, + "grad_norm": 5.861676216125488, 
+ "learning_rate": 8.118620569426554e-06, + "loss": 0.4199, + "step": 2657 + }, + { + "epoch": 0.2928925619834711, + "grad_norm": 9.698776245117188, + "learning_rate": 8.117253722039686e-06, + "loss": 0.4075, + "step": 2658 + }, + { + "epoch": 0.29300275482093663, + "grad_norm": 5.61128044128418, + "learning_rate": 8.115886493470797e-06, + "loss": 0.3813, + "step": 2659 + }, + { + "epoch": 0.2931129476584022, + "grad_norm": 6.095882415771484, + "learning_rate": 8.114518883887076e-06, + "loss": 0.3672, + "step": 2660 + }, + { + "epoch": 0.2932231404958678, + "grad_norm": 6.09226655960083, + "learning_rate": 8.113150893455756e-06, + "loss": 0.4277, + "step": 2661 + }, + { + "epoch": 0.29333333333333333, + "grad_norm": 5.465144634246826, + "learning_rate": 8.111782522344114e-06, + "loss": 0.3789, + "step": 2662 + }, + { + "epoch": 0.2934435261707989, + "grad_norm": 7.330477714538574, + "learning_rate": 8.11041377071948e-06, + "loss": 0.3919, + "step": 2663 + }, + { + "epoch": 0.29355371900826444, + "grad_norm": 6.909572124481201, + "learning_rate": 8.109044638749224e-06, + "loss": 0.4748, + "step": 2664 + }, + { + "epoch": 0.29366391184573004, + "grad_norm": 5.801069736480713, + "learning_rate": 8.107675126600767e-06, + "loss": 0.4622, + "step": 2665 + }, + { + "epoch": 0.2937741046831956, + "grad_norm": 7.1224188804626465, + "learning_rate": 8.106305234441573e-06, + "loss": 0.4052, + "step": 2666 + }, + { + "epoch": 0.29388429752066114, + "grad_norm": 5.903815746307373, + "learning_rate": 8.104934962439157e-06, + "loss": 0.3944, + "step": 2667 + }, + { + "epoch": 0.29399449035812675, + "grad_norm": 4.441305637359619, + "learning_rate": 8.103564310761077e-06, + "loss": 0.4419, + "step": 2668 + }, + { + "epoch": 0.2941046831955923, + "grad_norm": 11.590902328491211, + "learning_rate": 8.102193279574935e-06, + "loss": 0.4373, + "step": 2669 + }, + { + "epoch": 0.29421487603305785, + "grad_norm": 5.8332953453063965, + "learning_rate": 8.100821869048385e-06, + "loss": 0.4732, + "step": 2670 + }, + { + "epoch": 0.2943250688705234, + "grad_norm": 5.280449867248535, + "learning_rate": 8.099450079349124e-06, + "loss": 0.4443, + "step": 2671 + }, + { + "epoch": 0.294435261707989, + "grad_norm": 10.186681747436523, + "learning_rate": 8.098077910644901e-06, + "loss": 0.4394, + "step": 2672 + }, + { + "epoch": 0.29454545454545455, + "grad_norm": 8.85413932800293, + "learning_rate": 8.096705363103499e-06, + "loss": 0.4828, + "step": 2673 + }, + { + "epoch": 0.2946556473829201, + "grad_norm": 9.561735153198242, + "learning_rate": 8.095332436892761e-06, + "loss": 0.3634, + "step": 2674 + }, + { + "epoch": 0.29476584022038566, + "grad_norm": 10.236101150512695, + "learning_rate": 8.093959132180567e-06, + "loss": 0.4397, + "step": 2675 + }, + { + "epoch": 0.29487603305785126, + "grad_norm": 7.617365837097168, + "learning_rate": 8.092585449134848e-06, + "loss": 0.4184, + "step": 2676 + }, + { + "epoch": 0.2949862258953168, + "grad_norm": 4.575974941253662, + "learning_rate": 8.091211387923578e-06, + "loss": 0.4487, + "step": 2677 + }, + { + "epoch": 0.29509641873278236, + "grad_norm": 7.067541122436523, + "learning_rate": 8.089836948714782e-06, + "loss": 0.4459, + "step": 2678 + }, + { + "epoch": 0.2952066115702479, + "grad_norm": 7.422731876373291, + "learning_rate": 8.088462131676527e-06, + "loss": 0.4099, + "step": 2679 + }, + { + "epoch": 0.2953168044077135, + "grad_norm": 7.212629795074463, + "learning_rate": 8.087086936976927e-06, + "loss": 0.3918, + "step": 2680 + }, + { + "epoch": 0.29542699724517907, 
+ "grad_norm": 4.199789524078369, + "learning_rate": 8.08571136478414e-06, + "loss": 0.3815, + "step": 2681 + }, + { + "epoch": 0.2955371900826446, + "grad_norm": 9.775740623474121, + "learning_rate": 8.084335415266382e-06, + "loss": 0.4704, + "step": 2682 + }, + { + "epoch": 0.29564738292011017, + "grad_norm": 9.210233688354492, + "learning_rate": 8.082959088591896e-06, + "loss": 0.3792, + "step": 2683 + }, + { + "epoch": 0.2957575757575758, + "grad_norm": 5.724307537078857, + "learning_rate": 8.081582384928983e-06, + "loss": 0.441, + "step": 2684 + }, + { + "epoch": 0.2958677685950413, + "grad_norm": 6.285418510437012, + "learning_rate": 8.080205304445992e-06, + "loss": 0.4501, + "step": 2685 + }, + { + "epoch": 0.2959779614325069, + "grad_norm": 5.50066614151001, + "learning_rate": 8.078827847311313e-06, + "loss": 0.4617, + "step": 2686 + }, + { + "epoch": 0.2960881542699724, + "grad_norm": 4.148617267608643, + "learning_rate": 8.077450013693382e-06, + "loss": 0.4431, + "step": 2687 + }, + { + "epoch": 0.29619834710743803, + "grad_norm": 8.889013290405273, + "learning_rate": 8.076071803760683e-06, + "loss": 0.4252, + "step": 2688 + }, + { + "epoch": 0.2963085399449036, + "grad_norm": 7.282867431640625, + "learning_rate": 8.074693217681747e-06, + "loss": 0.3735, + "step": 2689 + }, + { + "epoch": 0.29641873278236913, + "grad_norm": 9.445550918579102, + "learning_rate": 8.073314255625144e-06, + "loss": 0.5431, + "step": 2690 + }, + { + "epoch": 0.29652892561983474, + "grad_norm": 12.855957984924316, + "learning_rate": 8.071934917759502e-06, + "loss": 0.5333, + "step": 2691 + }, + { + "epoch": 0.2966391184573003, + "grad_norm": 6.184736728668213, + "learning_rate": 8.070555204253485e-06, + "loss": 0.477, + "step": 2692 + }, + { + "epoch": 0.29674931129476584, + "grad_norm": 10.550787925720215, + "learning_rate": 8.069175115275808e-06, + "loss": 0.4769, + "step": 2693 + }, + { + "epoch": 0.2968595041322314, + "grad_norm": 8.451855659484863, + "learning_rate": 8.067794650995226e-06, + "loss": 0.4841, + "step": 2694 + }, + { + "epoch": 0.296969696969697, + "grad_norm": 12.774663925170898, + "learning_rate": 8.066413811580548e-06, + "loss": 0.4728, + "step": 2695 + }, + { + "epoch": 0.29707988980716254, + "grad_norm": 12.18591594696045, + "learning_rate": 8.065032597200624e-06, + "loss": 0.5774, + "step": 2696 + }, + { + "epoch": 0.2971900826446281, + "grad_norm": 10.858301162719727, + "learning_rate": 8.063651008024351e-06, + "loss": 0.5422, + "step": 2697 + }, + { + "epoch": 0.29730027548209365, + "grad_norm": 5.995025634765625, + "learning_rate": 8.06226904422067e-06, + "loss": 0.4087, + "step": 2698 + }, + { + "epoch": 0.29741046831955925, + "grad_norm": 8.615156173706055, + "learning_rate": 8.06088670595857e-06, + "loss": 0.4563, + "step": 2699 + }, + { + "epoch": 0.2975206611570248, + "grad_norm": 8.634101867675781, + "learning_rate": 8.05950399340709e-06, + "loss": 0.4195, + "step": 2700 + }, + { + "epoch": 0.29763085399449035, + "grad_norm": 5.616157531738281, + "learning_rate": 8.058120906735304e-06, + "loss": 0.3747, + "step": 2701 + }, + { + "epoch": 0.2977410468319559, + "grad_norm": 4.679882526397705, + "learning_rate": 8.056737446112338e-06, + "loss": 0.3641, + "step": 2702 + }, + { + "epoch": 0.2978512396694215, + "grad_norm": 5.062495231628418, + "learning_rate": 8.055353611707364e-06, + "loss": 0.5009, + "step": 2703 + }, + { + "epoch": 0.29796143250688706, + "grad_norm": 4.492841720581055, + "learning_rate": 8.0539694036896e-06, + "loss": 0.3847, + "step": 2704 + }, + { + 
"epoch": 0.2980716253443526, + "grad_norm": 7.658977508544922, + "learning_rate": 8.052584822228312e-06, + "loss": 0.4554, + "step": 2705 + }, + { + "epoch": 0.29818181818181816, + "grad_norm": 7.011204719543457, + "learning_rate": 8.051199867492803e-06, + "loss": 0.3854, + "step": 2706 + }, + { + "epoch": 0.29829201101928376, + "grad_norm": 6.54036283493042, + "learning_rate": 8.04981453965243e-06, + "loss": 0.4347, + "step": 2707 + }, + { + "epoch": 0.2984022038567493, + "grad_norm": 4.983130931854248, + "learning_rate": 8.048428838876593e-06, + "loss": 0.4395, + "step": 2708 + }, + { + "epoch": 0.29851239669421487, + "grad_norm": 4.8552374839782715, + "learning_rate": 8.047042765334737e-06, + "loss": 0.4574, + "step": 2709 + }, + { + "epoch": 0.2986225895316804, + "grad_norm": 4.62309455871582, + "learning_rate": 8.045656319196351e-06, + "loss": 0.467, + "step": 2710 + }, + { + "epoch": 0.298732782369146, + "grad_norm": 8.67222785949707, + "learning_rate": 8.044269500630975e-06, + "loss": 0.503, + "step": 2711 + }, + { + "epoch": 0.29884297520661157, + "grad_norm": 6.187382698059082, + "learning_rate": 8.042882309808187e-06, + "loss": 0.3817, + "step": 2712 + }, + { + "epoch": 0.2989531680440771, + "grad_norm": 7.065738201141357, + "learning_rate": 8.041494746897618e-06, + "loss": 0.4222, + "step": 2713 + }, + { + "epoch": 0.2990633608815427, + "grad_norm": 10.733210563659668, + "learning_rate": 8.040106812068943e-06, + "loss": 0.3959, + "step": 2714 + }, + { + "epoch": 0.2991735537190083, + "grad_norm": 8.270208358764648, + "learning_rate": 8.03871850549187e-06, + "loss": 0.4313, + "step": 2715 + }, + { + "epoch": 0.29928374655647383, + "grad_norm": 8.04257583618164, + "learning_rate": 8.037329827336176e-06, + "loss": 0.4242, + "step": 2716 + }, + { + "epoch": 0.2993939393939394, + "grad_norm": 6.929685592651367, + "learning_rate": 8.035940777771664e-06, + "loss": 0.5107, + "step": 2717 + }, + { + "epoch": 0.299504132231405, + "grad_norm": 6.822888374328613, + "learning_rate": 8.03455135696819e-06, + "loss": 0.3517, + "step": 2718 + }, + { + "epoch": 0.29961432506887054, + "grad_norm": 7.220570087432861, + "learning_rate": 8.033161565095654e-06, + "loss": 0.4018, + "step": 2719 + }, + { + "epoch": 0.2997245179063361, + "grad_norm": 8.224783897399902, + "learning_rate": 8.031771402324001e-06, + "loss": 0.4423, + "step": 2720 + }, + { + "epoch": 0.29983471074380164, + "grad_norm": 5.0163254737854, + "learning_rate": 8.030380868823224e-06, + "loss": 0.4316, + "step": 2721 + }, + { + "epoch": 0.29994490358126724, + "grad_norm": 7.272629737854004, + "learning_rate": 8.028989964763356e-06, + "loss": 0.3644, + "step": 2722 + }, + { + "epoch": 0.3000550964187328, + "grad_norm": 9.349287033081055, + "learning_rate": 8.027598690314481e-06, + "loss": 0.431, + "step": 2723 + }, + { + "epoch": 0.30016528925619834, + "grad_norm": 7.657289505004883, + "learning_rate": 8.026207045646728e-06, + "loss": 0.3597, + "step": 2724 + }, + { + "epoch": 0.30016528925619834, + "eval_loss": 0.44847550988197327, + "eval_runtime": 41.9343, + "eval_samples_per_second": 17.504, + "eval_steps_per_second": 2.194, + "step": 2724 + }, + { + "epoch": 0.3002754820936639, + "grad_norm": 9.78979778289795, + "learning_rate": 8.024815030930264e-06, + "loss": 0.4198, + "step": 2725 + }, + { + "epoch": 0.3003856749311295, + "grad_norm": 6.471761226654053, + "learning_rate": 8.023422646335311e-06, + "loss": 0.4108, + "step": 2726 + }, + { + "epoch": 0.30049586776859505, + "grad_norm": 9.229776382446289, + "learning_rate": 
8.022029892032128e-06, + "loss": 0.5366, + "step": 2727 + }, + { + "epoch": 0.3006060606060606, + "grad_norm": 12.983948707580566, + "learning_rate": 8.020636768191027e-06, + "loss": 0.516, + "step": 2728 + }, + { + "epoch": 0.30071625344352615, + "grad_norm": 5.297845363616943, + "learning_rate": 8.019243274982357e-06, + "loss": 0.3972, + "step": 2729 + }, + { + "epoch": 0.30082644628099175, + "grad_norm": 8.642130851745605, + "learning_rate": 8.017849412576517e-06, + "loss": 0.4109, + "step": 2730 + }, + { + "epoch": 0.3009366391184573, + "grad_norm": 6.278255462646484, + "learning_rate": 8.016455181143954e-06, + "loss": 0.4478, + "step": 2731 + }, + { + "epoch": 0.30104683195592286, + "grad_norm": 5.403011798858643, + "learning_rate": 8.015060580855154e-06, + "loss": 0.3878, + "step": 2732 + }, + { + "epoch": 0.3011570247933884, + "grad_norm": 6.093563556671143, + "learning_rate": 8.01366561188065e-06, + "loss": 0.3991, + "step": 2733 + }, + { + "epoch": 0.301267217630854, + "grad_norm": 5.804359436035156, + "learning_rate": 8.012270274391022e-06, + "loss": 0.4626, + "step": 2734 + }, + { + "epoch": 0.30137741046831956, + "grad_norm": 11.595686912536621, + "learning_rate": 8.010874568556892e-06, + "loss": 0.5503, + "step": 2735 + }, + { + "epoch": 0.3014876033057851, + "grad_norm": 5.565887928009033, + "learning_rate": 8.00947849454893e-06, + "loss": 0.4032, + "step": 2736 + }, + { + "epoch": 0.30159779614325066, + "grad_norm": 10.746644020080566, + "learning_rate": 8.008082052537848e-06, + "loss": 0.4069, + "step": 2737 + }, + { + "epoch": 0.30170798898071627, + "grad_norm": 6.30381965637207, + "learning_rate": 8.006685242694409e-06, + "loss": 0.3726, + "step": 2738 + }, + { + "epoch": 0.3018181818181818, + "grad_norm": 10.491253852844238, + "learning_rate": 8.005288065189414e-06, + "loss": 0.5036, + "step": 2739 + }, + { + "epoch": 0.30192837465564737, + "grad_norm": 6.20374870300293, + "learning_rate": 8.003890520193711e-06, + "loss": 0.4082, + "step": 2740 + }, + { + "epoch": 0.302038567493113, + "grad_norm": 6.9139227867126465, + "learning_rate": 8.002492607878197e-06, + "loss": 0.4841, + "step": 2741 + }, + { + "epoch": 0.3021487603305785, + "grad_norm": 8.514464378356934, + "learning_rate": 8.001094328413807e-06, + "loss": 0.4927, + "step": 2742 + }, + { + "epoch": 0.3022589531680441, + "grad_norm": 10.513345718383789, + "learning_rate": 7.999695681971525e-06, + "loss": 0.4676, + "step": 2743 + }, + { + "epoch": 0.3023691460055096, + "grad_norm": 8.520428657531738, + "learning_rate": 7.998296668722381e-06, + "loss": 0.442, + "step": 2744 + }, + { + "epoch": 0.30247933884297523, + "grad_norm": 7.687136173248291, + "learning_rate": 7.996897288837449e-06, + "loss": 0.3255, + "step": 2745 + }, + { + "epoch": 0.3025895316804408, + "grad_norm": 8.961156845092773, + "learning_rate": 7.995497542487845e-06, + "loss": 0.4728, + "step": 2746 + }, + { + "epoch": 0.30269972451790633, + "grad_norm": 14.33635139465332, + "learning_rate": 7.994097429844732e-06, + "loss": 0.5018, + "step": 2747 + }, + { + "epoch": 0.3028099173553719, + "grad_norm": 5.116520404815674, + "learning_rate": 7.992696951079318e-06, + "loss": 0.4029, + "step": 2748 + }, + { + "epoch": 0.3029201101928375, + "grad_norm": 11.656856536865234, + "learning_rate": 7.991296106362855e-06, + "loss": 0.5388, + "step": 2749 + }, + { + "epoch": 0.30303030303030304, + "grad_norm": 7.711641311645508, + "learning_rate": 7.989894895866643e-06, + "loss": 0.4161, + "step": 2750 + }, + { + "epoch": 0.3031404958677686, + "grad_norm": 
6.066614627838135, + "learning_rate": 7.988493319762018e-06, + "loss": 0.4009, + "step": 2751 + }, + { + "epoch": 0.30325068870523414, + "grad_norm": 5.1315717697143555, + "learning_rate": 7.987091378220376e-06, + "loss": 0.4176, + "step": 2752 + }, + { + "epoch": 0.30336088154269975, + "grad_norm": 6.4421844482421875, + "learning_rate": 7.985689071413138e-06, + "loss": 0.4541, + "step": 2753 + }, + { + "epoch": 0.3034710743801653, + "grad_norm": 9.538857460021973, + "learning_rate": 7.984286399511786e-06, + "loss": 0.3356, + "step": 2754 + }, + { + "epoch": 0.30358126721763085, + "grad_norm": 11.166560173034668, + "learning_rate": 7.982883362687839e-06, + "loss": 0.384, + "step": 2755 + }, + { + "epoch": 0.3036914600550964, + "grad_norm": 5.413555145263672, + "learning_rate": 7.981479961112863e-06, + "loss": 0.4505, + "step": 2756 + }, + { + "epoch": 0.303801652892562, + "grad_norm": 6.005543231964111, + "learning_rate": 7.980076194958468e-06, + "loss": 0.462, + "step": 2757 + }, + { + "epoch": 0.30391184573002755, + "grad_norm": 9.219489097595215, + "learning_rate": 7.978672064396307e-06, + "loss": 0.4541, + "step": 2758 + }, + { + "epoch": 0.3040220385674931, + "grad_norm": 5.687552452087402, + "learning_rate": 7.977267569598082e-06, + "loss": 0.3804, + "step": 2759 + }, + { + "epoch": 0.30413223140495865, + "grad_norm": 7.908405780792236, + "learning_rate": 7.975862710735531e-06, + "loss": 0.481, + "step": 2760 + }, + { + "epoch": 0.30424242424242426, + "grad_norm": 8.924123764038086, + "learning_rate": 7.974457487980447e-06, + "loss": 0.3272, + "step": 2761 + }, + { + "epoch": 0.3043526170798898, + "grad_norm": 7.286680698394775, + "learning_rate": 7.97305190150466e-06, + "loss": 0.4686, + "step": 2762 + }, + { + "epoch": 0.30446280991735536, + "grad_norm": 13.811619758605957, + "learning_rate": 7.97164595148005e-06, + "loss": 0.478, + "step": 2763 + }, + { + "epoch": 0.3045730027548209, + "grad_norm": 4.992403030395508, + "learning_rate": 7.970239638078536e-06, + "loss": 0.3503, + "step": 2764 + }, + { + "epoch": 0.3046831955922865, + "grad_norm": 6.27252721786499, + "learning_rate": 7.968832961472084e-06, + "loss": 0.3845, + "step": 2765 + }, + { + "epoch": 0.30479338842975207, + "grad_norm": 7.079043865203857, + "learning_rate": 7.967425921832705e-06, + "loss": 0.3657, + "step": 2766 + }, + { + "epoch": 0.3049035812672176, + "grad_norm": 9.025582313537598, + "learning_rate": 7.966018519332453e-06, + "loss": 0.4906, + "step": 2767 + }, + { + "epoch": 0.3050137741046832, + "grad_norm": 5.517977237701416, + "learning_rate": 7.964610754143427e-06, + "loss": 0.4919, + "step": 2768 + }, + { + "epoch": 0.30512396694214877, + "grad_norm": 5.134316921234131, + "learning_rate": 7.96320262643777e-06, + "loss": 0.4195, + "step": 2769 + }, + { + "epoch": 0.3052341597796143, + "grad_norm": 7.691980361938477, + "learning_rate": 7.961794136387672e-06, + "loss": 0.5515, + "step": 2770 + }, + { + "epoch": 0.3053443526170799, + "grad_norm": 6.8551435470581055, + "learning_rate": 7.960385284165364e-06, + "loss": 0.4347, + "step": 2771 + }, + { + "epoch": 0.3054545454545455, + "grad_norm": 7.076082229614258, + "learning_rate": 7.958976069943123e-06, + "loss": 0.3281, + "step": 2772 + }, + { + "epoch": 0.30556473829201103, + "grad_norm": 6.586147308349609, + "learning_rate": 7.957566493893268e-06, + "loss": 0.3514, + "step": 2773 + }, + { + "epoch": 0.3056749311294766, + "grad_norm": 10.270943641662598, + "learning_rate": 7.956156556188166e-06, + "loss": 0.4335, + "step": 2774 + }, + { + "epoch": 
0.30578512396694213, + "grad_norm": 5.657565116882324, + "learning_rate": 7.954746257000223e-06, + "loss": 0.4522, + "step": 2775 + }, + { + "epoch": 0.30589531680440774, + "grad_norm": 7.81183385848999, + "learning_rate": 7.953335596501892e-06, + "loss": 0.4627, + "step": 2776 + }, + { + "epoch": 0.3060055096418733, + "grad_norm": 8.615931510925293, + "learning_rate": 7.951924574865677e-06, + "loss": 0.5076, + "step": 2777 + }, + { + "epoch": 0.30611570247933884, + "grad_norm": 8.567231178283691, + "learning_rate": 7.950513192264112e-06, + "loss": 0.4647, + "step": 2778 + }, + { + "epoch": 0.3062258953168044, + "grad_norm": 13.009634971618652, + "learning_rate": 7.949101448869787e-06, + "loss": 0.4996, + "step": 2779 + }, + { + "epoch": 0.30633608815427, + "grad_norm": 9.427029609680176, + "learning_rate": 7.947689344855331e-06, + "loss": 0.5276, + "step": 2780 + }, + { + "epoch": 0.30644628099173554, + "grad_norm": 6.252676010131836, + "learning_rate": 7.946276880393418e-06, + "loss": 0.4461, + "step": 2781 + }, + { + "epoch": 0.3065564738292011, + "grad_norm": 27.598886489868164, + "learning_rate": 7.944864055656765e-06, + "loss": 0.5092, + "step": 2782 + }, + { + "epoch": 0.30666666666666664, + "grad_norm": 6.648087024688721, + "learning_rate": 7.943450870818137e-06, + "loss": 0.4466, + "step": 2783 + }, + { + "epoch": 0.30677685950413225, + "grad_norm": 6.301035404205322, + "learning_rate": 7.942037326050336e-06, + "loss": 0.4298, + "step": 2784 + }, + { + "epoch": 0.3068870523415978, + "grad_norm": 5.16077995300293, + "learning_rate": 7.940623421526217e-06, + "loss": 0.4579, + "step": 2785 + }, + { + "epoch": 0.30699724517906335, + "grad_norm": 10.42927074432373, + "learning_rate": 7.939209157418669e-06, + "loss": 0.4618, + "step": 2786 + }, + { + "epoch": 0.3071074380165289, + "grad_norm": 4.532710552215576, + "learning_rate": 7.937794533900634e-06, + "loss": 0.3699, + "step": 2787 + }, + { + "epoch": 0.3072176308539945, + "grad_norm": 17.369544982910156, + "learning_rate": 7.936379551145092e-06, + "loss": 0.4519, + "step": 2788 + }, + { + "epoch": 0.30732782369146006, + "grad_norm": 4.231653213500977, + "learning_rate": 7.934964209325071e-06, + "loss": 0.3588, + "step": 2789 + }, + { + "epoch": 0.3074380165289256, + "grad_norm": 6.629281997680664, + "learning_rate": 7.933548508613638e-06, + "loss": 0.4241, + "step": 2790 + }, + { + "epoch": 0.3075482093663912, + "grad_norm": 5.726194381713867, + "learning_rate": 7.932132449183912e-06, + "loss": 0.4725, + "step": 2791 + }, + { + "epoch": 0.30765840220385676, + "grad_norm": 5.600101947784424, + "learning_rate": 7.930716031209043e-06, + "loss": 0.4293, + "step": 2792 + }, + { + "epoch": 0.3077685950413223, + "grad_norm": 12.127617835998535, + "learning_rate": 7.929299254862239e-06, + "loss": 0.539, + "step": 2793 + }, + { + "epoch": 0.30787878787878786, + "grad_norm": 6.142868995666504, + "learning_rate": 7.927882120316744e-06, + "loss": 0.3327, + "step": 2794 + }, + { + "epoch": 0.30798898071625347, + "grad_norm": 5.726541042327881, + "learning_rate": 7.926464627745844e-06, + "loss": 0.3919, + "step": 2795 + }, + { + "epoch": 0.308099173553719, + "grad_norm": 6.68948221206665, + "learning_rate": 7.925046777322873e-06, + "loss": 0.4414, + "step": 2796 + }, + { + "epoch": 0.30820936639118457, + "grad_norm": 10.885160446166992, + "learning_rate": 7.92362856922121e-06, + "loss": 0.4975, + "step": 2797 + }, + { + "epoch": 0.3083195592286501, + "grad_norm": 6.794032096862793, + "learning_rate": 7.922210003614277e-06, + "loss": 0.4873, + 
"step": 2798 + }, + { + "epoch": 0.3084297520661157, + "grad_norm": 7.296762943267822, + "learning_rate": 7.920791080675534e-06, + "loss": 0.4076, + "step": 2799 + }, + { + "epoch": 0.3085399449035813, + "grad_norm": 9.808841705322266, + "learning_rate": 7.919371800578489e-06, + "loss": 0.463, + "step": 2800 + }, + { + "epoch": 0.3086501377410468, + "grad_norm": 7.044501304626465, + "learning_rate": 7.917952163496695e-06, + "loss": 0.4042, + "step": 2801 + }, + { + "epoch": 0.3087603305785124, + "grad_norm": 8.651405334472656, + "learning_rate": 7.916532169603745e-06, + "loss": 0.4401, + "step": 2802 + }, + { + "epoch": 0.308870523415978, + "grad_norm": 8.043083190917969, + "learning_rate": 7.915111819073282e-06, + "loss": 0.4283, + "step": 2803 + }, + { + "epoch": 0.30898071625344353, + "grad_norm": 6.39584493637085, + "learning_rate": 7.913691112078985e-06, + "loss": 0.4579, + "step": 2804 + }, + { + "epoch": 0.3090909090909091, + "grad_norm": 9.391702651977539, + "learning_rate": 7.912270048794582e-06, + "loss": 0.5076, + "step": 2805 + }, + { + "epoch": 0.30920110192837463, + "grad_norm": 9.41595458984375, + "learning_rate": 7.910848629393842e-06, + "loss": 0.435, + "step": 2806 + }, + { + "epoch": 0.30931129476584024, + "grad_norm": 6.67035436630249, + "learning_rate": 7.909426854050575e-06, + "loss": 0.3859, + "step": 2807 + }, + { + "epoch": 0.3094214876033058, + "grad_norm": 6.468634605407715, + "learning_rate": 7.908004722938643e-06, + "loss": 0.4393, + "step": 2808 + }, + { + "epoch": 0.30953168044077134, + "grad_norm": 5.794666290283203, + "learning_rate": 7.906582236231942e-06, + "loss": 0.4383, + "step": 2809 + }, + { + "epoch": 0.3096418732782369, + "grad_norm": 6.282586574554443, + "learning_rate": 7.905159394104416e-06, + "loss": 0.3891, + "step": 2810 + }, + { + "epoch": 0.3097520661157025, + "grad_norm": 7.2533650398254395, + "learning_rate": 7.903736196730053e-06, + "loss": 0.3635, + "step": 2811 + }, + { + "epoch": 0.30986225895316805, + "grad_norm": 8.431426048278809, + "learning_rate": 7.902312644282886e-06, + "loss": 0.4154, + "step": 2812 + }, + { + "epoch": 0.3099724517906336, + "grad_norm": 8.024810791015625, + "learning_rate": 7.900888736936983e-06, + "loss": 0.4344, + "step": 2813 + }, + { + "epoch": 0.31008264462809915, + "grad_norm": 5.503087043762207, + "learning_rate": 7.899464474866466e-06, + "loss": 0.4049, + "step": 2814 + }, + { + "epoch": 0.31019283746556475, + "grad_norm": 12.236490249633789, + "learning_rate": 7.898039858245496e-06, + "loss": 0.4411, + "step": 2815 + }, + { + "epoch": 0.3103030303030303, + "grad_norm": 7.569835186004639, + "learning_rate": 7.896614887248276e-06, + "loss": 0.4911, + "step": 2816 + }, + { + "epoch": 0.31041322314049585, + "grad_norm": 12.798169136047363, + "learning_rate": 7.895189562049051e-06, + "loss": 0.5159, + "step": 2817 + }, + { + "epoch": 0.31052341597796146, + "grad_norm": 5.333597183227539, + "learning_rate": 7.893763882822115e-06, + "loss": 0.4846, + "step": 2818 + }, + { + "epoch": 0.310633608815427, + "grad_norm": 6.715102195739746, + "learning_rate": 7.8923378497418e-06, + "loss": 0.3047, + "step": 2819 + }, + { + "epoch": 0.31074380165289256, + "grad_norm": 3.3703551292419434, + "learning_rate": 7.890911462982482e-06, + "loss": 0.3674, + "step": 2820 + }, + { + "epoch": 0.3108539944903581, + "grad_norm": 6.084781169891357, + "learning_rate": 7.889484722718586e-06, + "loss": 0.4677, + "step": 2821 + }, + { + "epoch": 0.3109641873278237, + "grad_norm": 5.297740459442139, + "learning_rate": 
7.888057629124573e-06, + "loss": 0.3949, + "step": 2822 + }, + { + "epoch": 0.31107438016528927, + "grad_norm": 3.961949348449707, + "learning_rate": 7.886630182374947e-06, + "loss": 0.3817, + "step": 2823 + }, + { + "epoch": 0.3111845730027548, + "grad_norm": 8.566644668579102, + "learning_rate": 7.885202382644265e-06, + "loss": 0.4124, + "step": 2824 + }, + { + "epoch": 0.31129476584022037, + "grad_norm": 5.797860622406006, + "learning_rate": 7.883774230107115e-06, + "loss": 0.4373, + "step": 2825 + }, + { + "epoch": 0.311404958677686, + "grad_norm": 3.940842628479004, + "learning_rate": 7.882345724938134e-06, + "loss": 0.3944, + "step": 2826 + }, + { + "epoch": 0.3115151515151515, + "grad_norm": 7.025946617126465, + "learning_rate": 7.880916867312003e-06, + "loss": 0.4132, + "step": 2827 + }, + { + "epoch": 0.3116253443526171, + "grad_norm": 10.542343139648438, + "learning_rate": 7.879487657403445e-06, + "loss": 0.4763, + "step": 2828 + }, + { + "epoch": 0.3117355371900826, + "grad_norm": 4.249691009521484, + "learning_rate": 7.878058095387225e-06, + "loss": 0.4395, + "step": 2829 + }, + { + "epoch": 0.31184573002754823, + "grad_norm": 6.69081974029541, + "learning_rate": 7.87662818143815e-06, + "loss": 0.2579, + "step": 2830 + }, + { + "epoch": 0.3119559228650138, + "grad_norm": 11.511882781982422, + "learning_rate": 7.875197915731076e-06, + "loss": 0.3756, + "step": 2831 + }, + { + "epoch": 0.31206611570247933, + "grad_norm": 6.804051876068115, + "learning_rate": 7.873767298440894e-06, + "loss": 0.3955, + "step": 2832 + }, + { + "epoch": 0.3121763085399449, + "grad_norm": 5.571722507476807, + "learning_rate": 7.872336329742543e-06, + "loss": 0.4043, + "step": 2833 + }, + { + "epoch": 0.3122865013774105, + "grad_norm": 8.094839096069336, + "learning_rate": 7.870905009811003e-06, + "loss": 0.4749, + "step": 2834 + }, + { + "epoch": 0.31239669421487604, + "grad_norm": 7.383030891418457, + "learning_rate": 7.869473338821298e-06, + "loss": 0.395, + "step": 2835 + }, + { + "epoch": 0.3125068870523416, + "grad_norm": 7.508321285247803, + "learning_rate": 7.868041316948498e-06, + "loss": 0.4089, + "step": 2836 + }, + { + "epoch": 0.31261707988980714, + "grad_norm": 7.432014465332031, + "learning_rate": 7.86660894436771e-06, + "loss": 0.5163, + "step": 2837 + }, + { + "epoch": 0.31272727272727274, + "grad_norm": 12.985380172729492, + "learning_rate": 7.865176221254084e-06, + "loss": 0.4646, + "step": 2838 + }, + { + "epoch": 0.3128374655647383, + "grad_norm": 8.084906578063965, + "learning_rate": 7.863743147782819e-06, + "loss": 0.4098, + "step": 2839 + }, + { + "epoch": 0.31294765840220384, + "grad_norm": 12.373218536376953, + "learning_rate": 7.862309724129152e-06, + "loss": 0.4765, + "step": 2840 + }, + { + "epoch": 0.31305785123966945, + "grad_norm": 7.3020429611206055, + "learning_rate": 7.860875950468363e-06, + "loss": 0.4874, + "step": 2841 + }, + { + "epoch": 0.313168044077135, + "grad_norm": 12.162711143493652, + "learning_rate": 7.859441826975776e-06, + "loss": 0.4098, + "step": 2842 + }, + { + "epoch": 0.31327823691460055, + "grad_norm": 6.177431106567383, + "learning_rate": 7.858007353826759e-06, + "loss": 0.4592, + "step": 2843 + }, + { + "epoch": 0.3133884297520661, + "grad_norm": 6.27357292175293, + "learning_rate": 7.856572531196722e-06, + "loss": 0.3901, + "step": 2844 + }, + { + "epoch": 0.3134986225895317, + "grad_norm": 7.773303508758545, + "learning_rate": 7.855137359261115e-06, + "loss": 0.3877, + "step": 2845 + }, + { + "epoch": 0.31360881542699726, + "grad_norm": 
10.020936965942383, + "learning_rate": 7.853701838195432e-06, + "loss": 0.5103, + "step": 2846 + }, + { + "epoch": 0.3137190082644628, + "grad_norm": 6.017946243286133, + "learning_rate": 7.852265968175215e-06, + "loss": 0.4733, + "step": 2847 + }, + { + "epoch": 0.31382920110192836, + "grad_norm": 6.867127418518066, + "learning_rate": 7.850829749376037e-06, + "loss": 0.4119, + "step": 2848 + }, + { + "epoch": 0.31393939393939396, + "grad_norm": 7.664295673370361, + "learning_rate": 7.849393181973527e-06, + "loss": 0.4249, + "step": 2849 + }, + { + "epoch": 0.3140495867768595, + "grad_norm": 11.663252830505371, + "learning_rate": 7.847956266143349e-06, + "loss": 0.4911, + "step": 2850 + }, + { + "epoch": 0.31415977961432506, + "grad_norm": 4.299872398376465, + "learning_rate": 7.846519002061208e-06, + "loss": 0.4928, + "step": 2851 + }, + { + "epoch": 0.3142699724517906, + "grad_norm": 8.30838680267334, + "learning_rate": 7.845081389902857e-06, + "loss": 0.4668, + "step": 2852 + }, + { + "epoch": 0.3143801652892562, + "grad_norm": 9.995965957641602, + "learning_rate": 7.84364342984409e-06, + "loss": 0.4979, + "step": 2853 + }, + { + "epoch": 0.31449035812672177, + "grad_norm": 5.872107982635498, + "learning_rate": 7.842205122060742e-06, + "loss": 0.4263, + "step": 2854 + }, + { + "epoch": 0.3146005509641873, + "grad_norm": 12.73546028137207, + "learning_rate": 7.84076646672869e-06, + "loss": 0.412, + "step": 2855 + }, + { + "epoch": 0.31471074380165287, + "grad_norm": 11.293656349182129, + "learning_rate": 7.839327464023856e-06, + "loss": 0.4555, + "step": 2856 + }, + { + "epoch": 0.3148209366391185, + "grad_norm": 4.955324649810791, + "learning_rate": 7.837888114122203e-06, + "loss": 0.4784, + "step": 2857 + }, + { + "epoch": 0.314931129476584, + "grad_norm": 10.688766479492188, + "learning_rate": 7.836448417199735e-06, + "loss": 0.5122, + "step": 2858 + }, + { + "epoch": 0.3150413223140496, + "grad_norm": 4.592641830444336, + "learning_rate": 7.835008373432504e-06, + "loss": 0.4596, + "step": 2859 + }, + { + "epoch": 0.3151515151515151, + "grad_norm": 11.006119728088379, + "learning_rate": 7.833567982996598e-06, + "loss": 0.445, + "step": 2860 + }, + { + "epoch": 0.31526170798898073, + "grad_norm": 13.519087791442871, + "learning_rate": 7.832127246068148e-06, + "loss": 0.4218, + "step": 2861 + }, + { + "epoch": 0.3153719008264463, + "grad_norm": 4.565635681152344, + "learning_rate": 7.830686162823332e-06, + "loss": 0.4107, + "step": 2862 + }, + { + "epoch": 0.31548209366391183, + "grad_norm": 3.7139227390289307, + "learning_rate": 7.829244733438368e-06, + "loss": 0.43, + "step": 2863 + }, + { + "epoch": 0.3155922865013774, + "grad_norm": 8.965399742126465, + "learning_rate": 7.827802958089514e-06, + "loss": 0.4718, + "step": 2864 + }, + { + "epoch": 0.315702479338843, + "grad_norm": 7.846672058105469, + "learning_rate": 7.826360836953073e-06, + "loss": 0.43, + "step": 2865 + }, + { + "epoch": 0.31581267217630854, + "grad_norm": 7.4430766105651855, + "learning_rate": 7.82491837020539e-06, + "loss": 0.4056, + "step": 2866 + }, + { + "epoch": 0.3159228650137741, + "grad_norm": 7.6877946853637695, + "learning_rate": 7.82347555802285e-06, + "loss": 0.5108, + "step": 2867 + }, + { + "epoch": 0.3160330578512397, + "grad_norm": 5.180397033691406, + "learning_rate": 7.822032400581886e-06, + "loss": 0.3812, + "step": 2868 + }, + { + "epoch": 0.31614325068870525, + "grad_norm": 10.081506729125977, + "learning_rate": 7.820588898058966e-06, + "loss": 0.4718, + "step": 2869 + }, + { + "epoch": 
0.3162534435261708, + "grad_norm": 4.8489837646484375, + "learning_rate": 7.819145050630602e-06, + "loss": 0.4374, + "step": 2870 + }, + { + "epoch": 0.31636363636363635, + "grad_norm": 6.359596252441406, + "learning_rate": 7.817700858473353e-06, + "loss": 0.4696, + "step": 2871 + }, + { + "epoch": 0.31647382920110195, + "grad_norm": 4.26806640625, + "learning_rate": 7.816256321763818e-06, + "loss": 0.4418, + "step": 2872 + }, + { + "epoch": 0.3165840220385675, + "grad_norm": 7.734675884246826, + "learning_rate": 7.814811440678632e-06, + "loss": 0.4507, + "step": 2873 + }, + { + "epoch": 0.31669421487603305, + "grad_norm": 8.76938247680664, + "learning_rate": 7.813366215394479e-06, + "loss": 0.4429, + "step": 2874 + }, + { + "epoch": 0.3168044077134986, + "grad_norm": 7.27657413482666, + "learning_rate": 7.811920646088084e-06, + "loss": 0.4837, + "step": 2875 + }, + { + "epoch": 0.3169146005509642, + "grad_norm": 5.2841644287109375, + "learning_rate": 7.810474732936213e-06, + "loss": 0.3626, + "step": 2876 + }, + { + "epoch": 0.31702479338842976, + "grad_norm": 6.800135135650635, + "learning_rate": 7.809028476115674e-06, + "loss": 0.4375, + "step": 2877 + }, + { + "epoch": 0.3171349862258953, + "grad_norm": 9.077492713928223, + "learning_rate": 7.807581875803318e-06, + "loss": 0.4649, + "step": 2878 + }, + { + "epoch": 0.31724517906336086, + "grad_norm": 10.60534954071045, + "learning_rate": 7.806134932176038e-06, + "loss": 0.4719, + "step": 2879 + }, + { + "epoch": 0.31735537190082647, + "grad_norm": 10.929094314575195, + "learning_rate": 7.804687645410764e-06, + "loss": 0.446, + "step": 2880 + }, + { + "epoch": 0.317465564738292, + "grad_norm": 15.249067306518555, + "learning_rate": 7.803240015684475e-06, + "loss": 0.3954, + "step": 2881 + }, + { + "epoch": 0.31757575757575757, + "grad_norm": 10.716742515563965, + "learning_rate": 7.80179204317419e-06, + "loss": 0.4967, + "step": 2882 + }, + { + "epoch": 0.3176859504132231, + "grad_norm": 7.304423809051514, + "learning_rate": 7.800343728056968e-06, + "loss": 0.4558, + "step": 2883 + }, + { + "epoch": 0.3177961432506887, + "grad_norm": 8.222415924072266, + "learning_rate": 7.79889507050991e-06, + "loss": 0.4145, + "step": 2884 + }, + { + "epoch": 0.3179063360881543, + "grad_norm": 5.5766191482543945, + "learning_rate": 7.797446070710161e-06, + "loss": 0.3608, + "step": 2885 + }, + { + "epoch": 0.3180165289256198, + "grad_norm": 8.35886001586914, + "learning_rate": 7.795996728834909e-06, + "loss": 0.459, + "step": 2886 + }, + { + "epoch": 0.3181267217630854, + "grad_norm": 6.088986873626709, + "learning_rate": 7.794547045061375e-06, + "loss": 0.464, + "step": 2887 + }, + { + "epoch": 0.318236914600551, + "grad_norm": 20.60193634033203, + "learning_rate": 7.793097019566836e-06, + "loss": 0.5473, + "step": 2888 + }, + { + "epoch": 0.31834710743801653, + "grad_norm": 9.807446479797363, + "learning_rate": 7.791646652528598e-06, + "loss": 0.4344, + "step": 2889 + }, + { + "epoch": 0.3184573002754821, + "grad_norm": 10.306923866271973, + "learning_rate": 7.790195944124014e-06, + "loss": 0.581, + "step": 2890 + }, + { + "epoch": 0.3185674931129477, + "grad_norm": 6.767941951751709, + "learning_rate": 7.788744894530482e-06, + "loss": 0.4147, + "step": 2891 + }, + { + "epoch": 0.31867768595041324, + "grad_norm": 7.338228702545166, + "learning_rate": 7.787293503925435e-06, + "loss": 0.3658, + "step": 2892 + }, + { + "epoch": 0.3187878787878788, + "grad_norm": 6.129321575164795, + "learning_rate": 7.785841772486353e-06, + "loss": 0.3595, + "step": 
2893 + }, + { + "epoch": 0.31889807162534434, + "grad_norm": 7.529963970184326, + "learning_rate": 7.784389700390754e-06, + "loss": 0.4386, + "step": 2894 + }, + { + "epoch": 0.31900826446280994, + "grad_norm": 9.463605880737305, + "learning_rate": 7.782937287816199e-06, + "loss": 0.5608, + "step": 2895 + }, + { + "epoch": 0.3191184573002755, + "grad_norm": 7.19361686706543, + "learning_rate": 7.781484534940295e-06, + "loss": 0.4318, + "step": 2896 + }, + { + "epoch": 0.31922865013774104, + "grad_norm": 6.3514533042907715, + "learning_rate": 7.780031441940682e-06, + "loss": 0.3292, + "step": 2897 + }, + { + "epoch": 0.3193388429752066, + "grad_norm": 4.113743305206299, + "learning_rate": 7.778578008995048e-06, + "loss": 0.4203, + "step": 2898 + }, + { + "epoch": 0.3194490358126722, + "grad_norm": 6.209310054779053, + "learning_rate": 7.777124236281122e-06, + "loss": 0.4466, + "step": 2899 + }, + { + "epoch": 0.31955922865013775, + "grad_norm": 6.49799919128418, + "learning_rate": 7.77567012397667e-06, + "loss": 0.4178, + "step": 2900 + }, + { + "epoch": 0.3196694214876033, + "grad_norm": 4.771322250366211, + "learning_rate": 7.774215672259506e-06, + "loss": 0.3765, + "step": 2901 + }, + { + "epoch": 0.31977961432506885, + "grad_norm": 4.73738431930542, + "learning_rate": 7.772760881307482e-06, + "loss": 0.367, + "step": 2902 + }, + { + "epoch": 0.31988980716253446, + "grad_norm": 6.7683634757995605, + "learning_rate": 7.77130575129849e-06, + "loss": 0.4299, + "step": 2903 + }, + { + "epoch": 0.32, + "grad_norm": 12.15935230255127, + "learning_rate": 7.769850282410466e-06, + "loss": 0.5786, + "step": 2904 + }, + { + "epoch": 0.32011019283746556, + "grad_norm": 11.941153526306152, + "learning_rate": 7.768394474821388e-06, + "loss": 0.5455, + "step": 2905 + }, + { + "epoch": 0.3202203856749311, + "grad_norm": 9.834383010864258, + "learning_rate": 7.766938328709273e-06, + "loss": 0.5029, + "step": 2906 + }, + { + "epoch": 0.3203305785123967, + "grad_norm": 15.463730812072754, + "learning_rate": 7.76548184425218e-06, + "loss": 0.5836, + "step": 2907 + }, + { + "epoch": 0.32044077134986226, + "grad_norm": 10.36178970336914, + "learning_rate": 7.764025021628211e-06, + "loss": 0.4246, + "step": 2908 + }, + { + "epoch": 0.3205509641873278, + "grad_norm": 9.731084823608398, + "learning_rate": 7.762567861015507e-06, + "loss": 0.3565, + "step": 2909 + }, + { + "epoch": 0.32066115702479336, + "grad_norm": 8.840417861938477, + "learning_rate": 7.761110362592253e-06, + "loss": 0.3893, + "step": 2910 + }, + { + "epoch": 0.32077134986225897, + "grad_norm": 6.205860137939453, + "learning_rate": 7.759652526536675e-06, + "loss": 0.4306, + "step": 2911 + }, + { + "epoch": 0.3208815426997245, + "grad_norm": 6.984999656677246, + "learning_rate": 7.758194353027034e-06, + "loss": 0.4017, + "step": 2912 + }, + { + "epoch": 0.32099173553719007, + "grad_norm": 4.921543598175049, + "learning_rate": 7.756735842241643e-06, + "loss": 0.3812, + "step": 2913 + }, + { + "epoch": 0.3211019283746556, + "grad_norm": 5.417006015777588, + "learning_rate": 7.755276994358847e-06, + "loss": 0.3356, + "step": 2914 + }, + { + "epoch": 0.3212121212121212, + "grad_norm": 7.116862773895264, + "learning_rate": 7.75381780955704e-06, + "loss": 0.402, + "step": 2915 + }, + { + "epoch": 0.3213223140495868, + "grad_norm": 6.648025989532471, + "learning_rate": 7.752358288014649e-06, + "loss": 0.4288, + "step": 2916 + }, + { + "epoch": 0.3214325068870523, + "grad_norm": 7.639983177185059, + "learning_rate": 7.750898429910148e-06, + "loss": 
0.4618, + "step": 2917 + }, + { + "epoch": 0.32154269972451793, + "grad_norm": 4.660123348236084, + "learning_rate": 7.749438235422048e-06, + "loss": 0.4205, + "step": 2918 + }, + { + "epoch": 0.3216528925619835, + "grad_norm": 7.626750469207764, + "learning_rate": 7.747977704728908e-06, + "loss": 0.4429, + "step": 2919 + }, + { + "epoch": 0.32176308539944903, + "grad_norm": 10.333895683288574, + "learning_rate": 7.746516838009324e-06, + "loss": 0.5305, + "step": 2920 + }, + { + "epoch": 0.3218732782369146, + "grad_norm": 7.846880912780762, + "learning_rate": 7.745055635441927e-06, + "loss": 0.508, + "step": 2921 + }, + { + "epoch": 0.3219834710743802, + "grad_norm": 4.909427642822266, + "learning_rate": 7.743594097205398e-06, + "loss": 0.3761, + "step": 2922 + }, + { + "epoch": 0.32209366391184574, + "grad_norm": 6.739473342895508, + "learning_rate": 7.742132223478458e-06, + "loss": 0.4365, + "step": 2923 + }, + { + "epoch": 0.3222038567493113, + "grad_norm": 6.599324703216553, + "learning_rate": 7.740670014439863e-06, + "loss": 0.3779, + "step": 2924 + }, + { + "epoch": 0.32231404958677684, + "grad_norm": 9.731310844421387, + "learning_rate": 7.739207470268418e-06, + "loss": 0.4568, + "step": 2925 + }, + { + "epoch": 0.32242424242424245, + "grad_norm": 31.03965950012207, + "learning_rate": 7.73774459114296e-06, + "loss": 0.5713, + "step": 2926 + }, + { + "epoch": 0.322534435261708, + "grad_norm": 5.701545715332031, + "learning_rate": 7.736281377242376e-06, + "loss": 0.4541, + "step": 2927 + }, + { + "epoch": 0.32264462809917355, + "grad_norm": 8.6945219039917, + "learning_rate": 7.73481782874559e-06, + "loss": 0.5208, + "step": 2928 + }, + { + "epoch": 0.3227548209366391, + "grad_norm": 6.546955585479736, + "learning_rate": 7.73335394583156e-06, + "loss": 0.3758, + "step": 2929 + }, + { + "epoch": 0.3228650137741047, + "grad_norm": 6.786363124847412, + "learning_rate": 7.731889728679301e-06, + "loss": 0.3913, + "step": 2930 + }, + { + "epoch": 0.32297520661157025, + "grad_norm": 5.942480564117432, + "learning_rate": 7.730425177467854e-06, + "loss": 0.4056, + "step": 2931 + }, + { + "epoch": 0.3230853994490358, + "grad_norm": 7.159365177154541, + "learning_rate": 7.728960292376306e-06, + "loss": 0.5533, + "step": 2932 + }, + { + "epoch": 0.32319559228650135, + "grad_norm": 4.457771301269531, + "learning_rate": 7.727495073583788e-06, + "loss": 0.3981, + "step": 2933 + }, + { + "epoch": 0.32330578512396696, + "grad_norm": 5.632359504699707, + "learning_rate": 7.72602952126947e-06, + "loss": 0.3993, + "step": 2934 + }, + { + "epoch": 0.3234159779614325, + "grad_norm": 5.7494001388549805, + "learning_rate": 7.724563635612554e-06, + "loss": 0.3922, + "step": 2935 + }, + { + "epoch": 0.32352617079889806, + "grad_norm": 9.414168357849121, + "learning_rate": 7.723097416792298e-06, + "loss": 0.4567, + "step": 2936 + }, + { + "epoch": 0.3236363636363636, + "grad_norm": 6.057540416717529, + "learning_rate": 7.721630864987992e-06, + "loss": 0.4211, + "step": 2937 + }, + { + "epoch": 0.3237465564738292, + "grad_norm": 8.234047889709473, + "learning_rate": 7.720163980378966e-06, + "loss": 0.4801, + "step": 2938 + }, + { + "epoch": 0.32385674931129477, + "grad_norm": 5.774929046630859, + "learning_rate": 7.718696763144593e-06, + "loss": 0.4454, + "step": 2939 + }, + { + "epoch": 0.3239669421487603, + "grad_norm": 5.500660419464111, + "learning_rate": 7.717229213464287e-06, + "loss": 0.4491, + "step": 2940 + }, + { + "epoch": 0.32407713498622587, + "grad_norm": 7.193687438964844, + "learning_rate": 
7.715761331517501e-06, + "loss": 0.4146, + "step": 2941 + }, + { + "epoch": 0.3241873278236915, + "grad_norm": 7.698046684265137, + "learning_rate": 7.714293117483732e-06, + "loss": 0.4115, + "step": 2942 + }, + { + "epoch": 0.324297520661157, + "grad_norm": 5.544668197631836, + "learning_rate": 7.712824571542512e-06, + "loss": 0.4522, + "step": 2943 + }, + { + "epoch": 0.3244077134986226, + "grad_norm": 5.389462471008301, + "learning_rate": 7.71135569387342e-06, + "loss": 0.5141, + "step": 2944 + }, + { + "epoch": 0.3245179063360882, + "grad_norm": 7.950774669647217, + "learning_rate": 7.709886484656071e-06, + "loss": 0.4294, + "step": 2945 + }, + { + "epoch": 0.32462809917355373, + "grad_norm": 9.045058250427246, + "learning_rate": 7.70841694407012e-06, + "loss": 0.4393, + "step": 2946 + }, + { + "epoch": 0.3247382920110193, + "grad_norm": 8.255905151367188, + "learning_rate": 7.706947072295266e-06, + "loss": 0.4806, + "step": 2947 + }, + { + "epoch": 0.32484848484848483, + "grad_norm": 5.852301597595215, + "learning_rate": 7.705476869511249e-06, + "loss": 0.4002, + "step": 2948 + }, + { + "epoch": 0.32495867768595044, + "grad_norm": 10.095605850219727, + "learning_rate": 7.704006335897843e-06, + "loss": 0.4809, + "step": 2949 + }, + { + "epoch": 0.325068870523416, + "grad_norm": 8.605550765991211, + "learning_rate": 7.70253547163487e-06, + "loss": 0.3596, + "step": 2950 + }, + { + "epoch": 0.32517906336088154, + "grad_norm": 5.249491214752197, + "learning_rate": 7.70106427690219e-06, + "loss": 0.4228, + "step": 2951 + }, + { + "epoch": 0.3252892561983471, + "grad_norm": 4.563051700592041, + "learning_rate": 7.699592751879698e-06, + "loss": 0.4318, + "step": 2952 + }, + { + "epoch": 0.3253994490358127, + "grad_norm": 6.423548698425293, + "learning_rate": 7.69812089674734e-06, + "loss": 0.4508, + "step": 2953 + }, + { + "epoch": 0.32550964187327824, + "grad_norm": 6.7021260261535645, + "learning_rate": 7.696648711685093e-06, + "loss": 0.4523, + "step": 2954 + }, + { + "epoch": 0.3256198347107438, + "grad_norm": 7.479311466217041, + "learning_rate": 7.69517619687298e-06, + "loss": 0.4932, + "step": 2955 + }, + { + "epoch": 0.32573002754820934, + "grad_norm": 5.966842174530029, + "learning_rate": 7.693703352491057e-06, + "loss": 0.3899, + "step": 2956 + }, + { + "epoch": 0.32584022038567495, + "grad_norm": 4.4799346923828125, + "learning_rate": 7.692230178719431e-06, + "loss": 0.3957, + "step": 2957 + }, + { + "epoch": 0.3259504132231405, + "grad_norm": 10.163111686706543, + "learning_rate": 7.690756675738242e-06, + "loss": 0.4173, + "step": 2958 + }, + { + "epoch": 0.32606060606060605, + "grad_norm": 8.435184478759766, + "learning_rate": 7.689282843727672e-06, + "loss": 0.4158, + "step": 2959 + }, + { + "epoch": 0.3261707988980716, + "grad_norm": 5.7857160568237305, + "learning_rate": 7.687808682867943e-06, + "loss": 0.4333, + "step": 2960 + }, + { + "epoch": 0.3262809917355372, + "grad_norm": 7.384538173675537, + "learning_rate": 7.686334193339315e-06, + "loss": 0.3412, + "step": 2961 + }, + { + "epoch": 0.32639118457300276, + "grad_norm": 5.766459941864014, + "learning_rate": 7.684859375322094e-06, + "loss": 0.4108, + "step": 2962 + }, + { + "epoch": 0.3265013774104683, + "grad_norm": 4.282750129699707, + "learning_rate": 7.683384228996624e-06, + "loss": 0.4139, + "step": 2963 + }, + { + "epoch": 0.32661157024793386, + "grad_norm": 6.017995357513428, + "learning_rate": 7.681908754543282e-06, + "loss": 0.3988, + "step": 2964 + }, + { + "epoch": 0.32672176308539946, + "grad_norm": 
5.02705192565918, + "learning_rate": 7.680432952142497e-06, + "loss": 0.4682, + "step": 2965 + }, + { + "epoch": 0.326831955922865, + "grad_norm": 8.65324592590332, + "learning_rate": 7.678956821974728e-06, + "loss": 0.5517, + "step": 2966 + }, + { + "epoch": 0.32694214876033056, + "grad_norm": 5.790050029754639, + "learning_rate": 7.677480364220479e-06, + "loss": 0.4394, + "step": 2967 + }, + { + "epoch": 0.32705234159779617, + "grad_norm": 12.062713623046875, + "learning_rate": 7.676003579060295e-06, + "loss": 0.4488, + "step": 2968 + }, + { + "epoch": 0.3271625344352617, + "grad_norm": 4.4486236572265625, + "learning_rate": 7.67452646667476e-06, + "loss": 0.4409, + "step": 2969 + }, + { + "epoch": 0.32727272727272727, + "grad_norm": 6.7604241371154785, + "learning_rate": 7.673049027244493e-06, + "loss": 0.4054, + "step": 2970 + }, + { + "epoch": 0.3273829201101928, + "grad_norm": 5.401520252227783, + "learning_rate": 7.671571260950162e-06, + "loss": 0.3751, + "step": 2971 + }, + { + "epoch": 0.3274931129476584, + "grad_norm": 4.630212306976318, + "learning_rate": 7.670093167972468e-06, + "loss": 0.3902, + "step": 2972 + }, + { + "epoch": 0.327603305785124, + "grad_norm": 12.318961143493652, + "learning_rate": 7.668614748492154e-06, + "loss": 0.4432, + "step": 2973 + }, + { + "epoch": 0.3277134986225895, + "grad_norm": 6.725958347320557, + "learning_rate": 7.667136002690004e-06, + "loss": 0.3127, + "step": 2974 + }, + { + "epoch": 0.3278236914600551, + "grad_norm": 5.85172176361084, + "learning_rate": 7.66565693074684e-06, + "loss": 0.4157, + "step": 2975 + }, + { + "epoch": 0.3279338842975207, + "grad_norm": 12.489062309265137, + "learning_rate": 7.664177532843525e-06, + "loss": 0.4157, + "step": 2976 + }, + { + "epoch": 0.32804407713498623, + "grad_norm": 6.006800174713135, + "learning_rate": 7.662697809160963e-06, + "loss": 0.4164, + "step": 2977 + }, + { + "epoch": 0.3281542699724518, + "grad_norm": 21.75774574279785, + "learning_rate": 7.661217759880095e-06, + "loss": 0.3391, + "step": 2978 + }, + { + "epoch": 0.32826446280991733, + "grad_norm": 7.501448631286621, + "learning_rate": 7.659737385181907e-06, + "loss": 0.4868, + "step": 2979 + }, + { + "epoch": 0.32837465564738294, + "grad_norm": 7.795146465301514, + "learning_rate": 7.658256685247415e-06, + "loss": 0.4861, + "step": 2980 + }, + { + "epoch": 0.3284848484848485, + "grad_norm": 9.895896911621094, + "learning_rate": 7.656775660257689e-06, + "loss": 0.4656, + "step": 2981 + }, + { + "epoch": 0.32859504132231404, + "grad_norm": 7.72867488861084, + "learning_rate": 7.655294310393822e-06, + "loss": 0.3939, + "step": 2982 + }, + { + "epoch": 0.3287052341597796, + "grad_norm": 6.340445041656494, + "learning_rate": 7.653812635836963e-06, + "loss": 0.4287, + "step": 2983 + }, + { + "epoch": 0.3288154269972452, + "grad_norm": 11.97193431854248, + "learning_rate": 7.652330636768289e-06, + "loss": 0.417, + "step": 2984 + }, + { + "epoch": 0.32892561983471075, + "grad_norm": 11.652042388916016, + "learning_rate": 7.650848313369022e-06, + "loss": 0.4848, + "step": 2985 + }, + { + "epoch": 0.3290358126721763, + "grad_norm": 5.396225452423096, + "learning_rate": 7.649365665820424e-06, + "loss": 0.4078, + "step": 2986 + }, + { + "epoch": 0.32914600550964185, + "grad_norm": 8.694803237915039, + "learning_rate": 7.64788269430379e-06, + "loss": 0.4594, + "step": 2987 + }, + { + "epoch": 0.32925619834710745, + "grad_norm": 7.699805736541748, + "learning_rate": 7.646399399000466e-06, + "loss": 0.4293, + "step": 2988 + }, + { + "epoch": 
0.329366391184573, + "grad_norm": 5.9591755867004395, + "learning_rate": 7.644915780091828e-06, + "loss": 0.4216, + "step": 2989 + }, + { + "epoch": 0.32947658402203855, + "grad_norm": 6.445276260375977, + "learning_rate": 7.643431837759295e-06, + "loss": 0.3882, + "step": 2990 + }, + { + "epoch": 0.3295867768595041, + "grad_norm": 5.513563632965088, + "learning_rate": 7.641947572184328e-06, + "loss": 0.417, + "step": 2991 + }, + { + "epoch": 0.3296969696969697, + "grad_norm": 5.847497463226318, + "learning_rate": 7.64046298354842e-06, + "loss": 0.4099, + "step": 2992 + }, + { + "epoch": 0.32980716253443526, + "grad_norm": 7.973586082458496, + "learning_rate": 7.638978072033114e-06, + "loss": 0.398, + "step": 2993 + }, + { + "epoch": 0.3299173553719008, + "grad_norm": 6.221728324890137, + "learning_rate": 7.637492837819986e-06, + "loss": 0.3629, + "step": 2994 + }, + { + "epoch": 0.3300275482093664, + "grad_norm": 9.28332233428955, + "learning_rate": 7.636007281090647e-06, + "loss": 0.4382, + "step": 2995 + }, + { + "epoch": 0.33013774104683197, + "grad_norm": 19.20231819152832, + "learning_rate": 7.63452140202676e-06, + "loss": 0.4491, + "step": 2996 + }, + { + "epoch": 0.3302479338842975, + "grad_norm": 5.971023082733154, + "learning_rate": 7.633035200810018e-06, + "loss": 0.4333, + "step": 2997 + }, + { + "epoch": 0.33035812672176307, + "grad_norm": 5.851497173309326, + "learning_rate": 7.631548677622152e-06, + "loss": 0.4213, + "step": 2998 + }, + { + "epoch": 0.3304683195592287, + "grad_norm": 7.684570789337158, + "learning_rate": 7.630061832644942e-06, + "loss": 0.3773, + "step": 2999 + }, + { + "epoch": 0.3305785123966942, + "grad_norm": 8.930782318115234, + "learning_rate": 7.628574666060198e-06, + "loss": 0.4315, + "step": 3000 + }, + { + "epoch": 0.3306887052341598, + "grad_norm": 9.754863739013672, + "learning_rate": 7.6270871780497726e-06, + "loss": 0.4276, + "step": 3001 + }, + { + "epoch": 0.3307988980716253, + "grad_norm": 13.1202392578125, + "learning_rate": 7.625599368795558e-06, + "loss": 0.4971, + "step": 3002 + }, + { + "epoch": 0.33090909090909093, + "grad_norm": 7.650578498840332, + "learning_rate": 7.624111238479486e-06, + "loss": 0.3925, + "step": 3003 + }, + { + "epoch": 0.3310192837465565, + "grad_norm": 5.505433082580566, + "learning_rate": 7.622622787283528e-06, + "loss": 0.3684, + "step": 3004 + }, + { + "epoch": 0.33112947658402203, + "grad_norm": 9.210761070251465, + "learning_rate": 7.621134015389693e-06, + "loss": 0.5293, + "step": 3005 + }, + { + "epoch": 0.3312396694214876, + "grad_norm": 9.177719116210938, + "learning_rate": 7.61964492298003e-06, + "loss": 0.4088, + "step": 3006 + }, + { + "epoch": 0.3313498622589532, + "grad_norm": 8.678670883178711, + "learning_rate": 7.618155510236627e-06, + "loss": 0.4763, + "step": 3007 + }, + { + "epoch": 0.33146005509641874, + "grad_norm": 4.918095588684082, + "learning_rate": 7.616665777341612e-06, + "loss": 0.4408, + "step": 3008 + }, + { + "epoch": 0.3315702479338843, + "grad_norm": 18.14687156677246, + "learning_rate": 7.6151757244771514e-06, + "loss": 0.5224, + "step": 3009 + }, + { + "epoch": 0.33168044077134984, + "grad_norm": 7.2442498207092285, + "learning_rate": 7.613685351825451e-06, + "loss": 0.3909, + "step": 3010 + }, + { + "epoch": 0.33179063360881544, + "grad_norm": 6.038534164428711, + "learning_rate": 7.612194659568755e-06, + "loss": 0.3934, + "step": 3011 + }, + { + "epoch": 0.331900826446281, + "grad_norm": 8.074752807617188, + "learning_rate": 7.610703647889348e-06, + "loss": 0.3914, + "step": 
3012 + }, + { + "epoch": 0.33201101928374654, + "grad_norm": 8.205302238464355, + "learning_rate": 7.609212316969553e-06, + "loss": 0.4561, + "step": 3013 + }, + { + "epoch": 0.3321212121212121, + "grad_norm": 10.075181007385254, + "learning_rate": 7.607720666991733e-06, + "loss": 0.4963, + "step": 3014 + }, + { + "epoch": 0.3322314049586777, + "grad_norm": 11.816518783569336, + "learning_rate": 7.606228698138285e-06, + "loss": 0.4361, + "step": 3015 + }, + { + "epoch": 0.33234159779614325, + "grad_norm": 7.756158351898193, + "learning_rate": 7.604736410591651e-06, + "loss": 0.371, + "step": 3016 + }, + { + "epoch": 0.3324517906336088, + "grad_norm": 7.583977699279785, + "learning_rate": 7.603243804534313e-06, + "loss": 0.3731, + "step": 3017 + }, + { + "epoch": 0.3325619834710744, + "grad_norm": 4.773258686065674, + "learning_rate": 7.601750880148786e-06, + "loss": 0.4677, + "step": 3018 + }, + { + "epoch": 0.33267217630853996, + "grad_norm": 13.50361442565918, + "learning_rate": 7.600257637617627e-06, + "loss": 0.3665, + "step": 3019 + }, + { + "epoch": 0.3327823691460055, + "grad_norm": 11.342306137084961, + "learning_rate": 7.5987640771234305e-06, + "loss": 0.4426, + "step": 3020 + }, + { + "epoch": 0.33289256198347106, + "grad_norm": 12.209716796875, + "learning_rate": 7.597270198848834e-06, + "loss": 0.492, + "step": 3021 + }, + { + "epoch": 0.33300275482093666, + "grad_norm": 3.5088915824890137, + "learning_rate": 7.5957760029765106e-06, + "loss": 0.3931, + "step": 3022 + }, + { + "epoch": 0.3331129476584022, + "grad_norm": 4.468617916107178, + "learning_rate": 7.594281489689169e-06, + "loss": 0.3579, + "step": 3023 + }, + { + "epoch": 0.33322314049586776, + "grad_norm": 6.926708698272705, + "learning_rate": 7.5927866591695645e-06, + "loss": 0.4393, + "step": 3024 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 6.028176784515381, + "learning_rate": 7.591291511600485e-06, + "loss": 0.4507, + "step": 3025 + }, + { + "epoch": 0.3334435261707989, + "grad_norm": 6.006053447723389, + "learning_rate": 7.589796047164759e-06, + "loss": 0.4414, + "step": 3026 + }, + { + "epoch": 0.33355371900826447, + "grad_norm": 6.254835605621338, + "learning_rate": 7.588300266045255e-06, + "loss": 0.4751, + "step": 3027 + }, + { + "epoch": 0.33366391184573, + "grad_norm": 6.107983112335205, + "learning_rate": 7.586804168424879e-06, + "loss": 0.5331, + "step": 3028 + }, + { + "epoch": 0.33377410468319557, + "grad_norm": 7.126374244689941, + "learning_rate": 7.585307754486575e-06, + "loss": 0.3972, + "step": 3029 + }, + { + "epoch": 0.3338842975206612, + "grad_norm": 5.774569988250732, + "learning_rate": 7.583811024413328e-06, + "loss": 0.4707, + "step": 3030 + }, + { + "epoch": 0.3339944903581267, + "grad_norm": 9.53724193572998, + "learning_rate": 7.582313978388156e-06, + "loss": 0.4184, + "step": 3031 + }, + { + "epoch": 0.3341046831955923, + "grad_norm": 14.912829399108887, + "learning_rate": 7.580816616594126e-06, + "loss": 0.5473, + "step": 3032 + }, + { + "epoch": 0.33421487603305783, + "grad_norm": 5.911381721496582, + "learning_rate": 7.579318939214334e-06, + "loss": 0.3691, + "step": 3033 + }, + { + "epoch": 0.33432506887052343, + "grad_norm": 8.710693359375, + "learning_rate": 7.577820946431918e-06, + "loss": 0.3979, + "step": 3034 + }, + { + "epoch": 0.334435261707989, + "grad_norm": 11.323448181152344, + "learning_rate": 7.5763226384300555e-06, + "loss": 0.447, + "step": 3035 + }, + { + "epoch": 0.33454545454545453, + "grad_norm": 5.685377597808838, + "learning_rate": 
7.5748240153919605e-06, + "loss": 0.3796, + "step": 3036 + }, + { + "epoch": 0.3346556473829201, + "grad_norm": 8.896135330200195, + "learning_rate": 7.573325077500888e-06, + "loss": 0.4009, + "step": 3037 + }, + { + "epoch": 0.3347658402203857, + "grad_norm": 7.273399829864502, + "learning_rate": 7.571825824940129e-06, + "loss": 0.3532, + "step": 3038 + }, + { + "epoch": 0.33487603305785124, + "grad_norm": 7.050100326538086, + "learning_rate": 7.570326257893015e-06, + "loss": 0.4094, + "step": 3039 + }, + { + "epoch": 0.3349862258953168, + "grad_norm": 5.257210731506348, + "learning_rate": 7.5688263765429145e-06, + "loss": 0.5058, + "step": 3040 + }, + { + "epoch": 0.33509641873278234, + "grad_norm": 4.360664367675781, + "learning_rate": 7.567326181073235e-06, + "loss": 0.4412, + "step": 3041 + }, + { + "epoch": 0.33520661157024795, + "grad_norm": 5.903068542480469, + "learning_rate": 7.565825671667423e-06, + "loss": 0.4536, + "step": 3042 + }, + { + "epoch": 0.3353168044077135, + "grad_norm": 6.916237831115723, + "learning_rate": 7.564324848508963e-06, + "loss": 0.4919, + "step": 3043 + }, + { + "epoch": 0.33542699724517905, + "grad_norm": 9.67279052734375, + "learning_rate": 7.562823711781375e-06, + "loss": 0.3965, + "step": 3044 + }, + { + "epoch": 0.33553719008264465, + "grad_norm": 7.412188529968262, + "learning_rate": 7.561322261668224e-06, + "loss": 0.392, + "step": 3045 + }, + { + "epoch": 0.3356473829201102, + "grad_norm": 11.482963562011719, + "learning_rate": 7.55982049835311e-06, + "loss": 0.4967, + "step": 3046 + }, + { + "epoch": 0.33575757575757575, + "grad_norm": 7.3784708976745605, + "learning_rate": 7.558318422019666e-06, + "loss": 0.4075, + "step": 3047 + }, + { + "epoch": 0.3358677685950413, + "grad_norm": 6.242224216461182, + "learning_rate": 7.556816032851568e-06, + "loss": 0.4608, + "step": 3048 + }, + { + "epoch": 0.3359779614325069, + "grad_norm": 4.955140590667725, + "learning_rate": 7.555313331032537e-06, + "loss": 0.3645, + "step": 3049 + }, + { + "epoch": 0.33608815426997246, + "grad_norm": 9.901180267333984, + "learning_rate": 7.5538103167463175e-06, + "loss": 0.4818, + "step": 3050 + }, + { + "epoch": 0.336198347107438, + "grad_norm": 5.590670108795166, + "learning_rate": 7.552306990176704e-06, + "loss": 0.4435, + "step": 3051 + }, + { + "epoch": 0.33630853994490356, + "grad_norm": 4.528062343597412, + "learning_rate": 7.550803351507525e-06, + "loss": 0.4076, + "step": 3052 + }, + { + "epoch": 0.33641873278236917, + "grad_norm": 4.979361057281494, + "learning_rate": 7.549299400922647e-06, + "loss": 0.407, + "step": 3053 + }, + { + "epoch": 0.3365289256198347, + "grad_norm": 6.397597789764404, + "learning_rate": 7.547795138605976e-06, + "loss": 0.4511, + "step": 3054 + }, + { + "epoch": 0.33663911845730027, + "grad_norm": 5.850242614746094, + "learning_rate": 7.546290564741454e-06, + "loss": 0.4237, + "step": 3055 + }, + { + "epoch": 0.3367493112947658, + "grad_norm": 4.40457010269165, + "learning_rate": 7.544785679513064e-06, + "loss": 0.4449, + "step": 3056 + }, + { + "epoch": 0.3368595041322314, + "grad_norm": 14.284875869750977, + "learning_rate": 7.543280483104824e-06, + "loss": 0.5181, + "step": 3057 + }, + { + "epoch": 0.336969696969697, + "grad_norm": 6.017383098602295, + "learning_rate": 7.541774975700791e-06, + "loss": 0.4222, + "step": 3058 + }, + { + "epoch": 0.3370798898071625, + "grad_norm": 9.808992385864258, + "learning_rate": 7.540269157485061e-06, + "loss": 0.3855, + "step": 3059 + }, + { + "epoch": 0.3371900826446281, + "grad_norm": 
5.58088493347168, + "learning_rate": 7.5387630286417705e-06, + "loss": 0.3747, + "step": 3060 + }, + { + "epoch": 0.3373002754820937, + "grad_norm": 9.518733024597168, + "learning_rate": 7.537256589355085e-06, + "loss": 0.5106, + "step": 3061 + }, + { + "epoch": 0.33741046831955923, + "grad_norm": 8.29925537109375, + "learning_rate": 7.535749839809219e-06, + "loss": 0.4678, + "step": 3062 + }, + { + "epoch": 0.3375206611570248, + "grad_norm": 5.418189525604248, + "learning_rate": 7.534242780188419e-06, + "loss": 0.4085, + "step": 3063 + }, + { + "epoch": 0.33763085399449033, + "grad_norm": 14.174312591552734, + "learning_rate": 7.532735410676968e-06, + "loss": 0.5511, + "step": 3064 + }, + { + "epoch": 0.33774104683195594, + "grad_norm": 4.7229695320129395, + "learning_rate": 7.53122773145919e-06, + "loss": 0.4061, + "step": 3065 + }, + { + "epoch": 0.3378512396694215, + "grad_norm": 7.452611446380615, + "learning_rate": 7.529719742719447e-06, + "loss": 0.442, + "step": 3066 + }, + { + "epoch": 0.33796143250688704, + "grad_norm": 4.89311408996582, + "learning_rate": 7.528211444642138e-06, + "loss": 0.3748, + "step": 3067 + }, + { + "epoch": 0.33807162534435264, + "grad_norm": 7.314619541168213, + "learning_rate": 7.526702837411699e-06, + "loss": 0.3605, + "step": 3068 + }, + { + "epoch": 0.3381818181818182, + "grad_norm": 8.761941909790039, + "learning_rate": 7.525193921212606e-06, + "loss": 0.3487, + "step": 3069 + }, + { + "epoch": 0.33829201101928374, + "grad_norm": 8.771902084350586, + "learning_rate": 7.52368469622937e-06, + "loss": 0.3674, + "step": 3070 + }, + { + "epoch": 0.3384022038567493, + "grad_norm": 9.572006225585938, + "learning_rate": 7.52217516264654e-06, + "loss": 0.4249, + "step": 3071 + }, + { + "epoch": 0.3385123966942149, + "grad_norm": 9.799403190612793, + "learning_rate": 7.520665320648705e-06, + "loss": 0.4328, + "step": 3072 + }, + { + "epoch": 0.33862258953168045, + "grad_norm": 4.222426414489746, + "learning_rate": 7.5191551704204915e-06, + "loss": 0.4301, + "step": 3073 + }, + { + "epoch": 0.338732782369146, + "grad_norm": 6.847049236297607, + "learning_rate": 7.5176447121465615e-06, + "loss": 0.3948, + "step": 3074 + }, + { + "epoch": 0.33884297520661155, + "grad_norm": 4.570529460906982, + "learning_rate": 7.516133946011617e-06, + "loss": 0.3965, + "step": 3075 + }, + { + "epoch": 0.33895316804407716, + "grad_norm": 8.942601203918457, + "learning_rate": 7.514622872200394e-06, + "loss": 0.4789, + "step": 3076 + }, + { + "epoch": 0.3390633608815427, + "grad_norm": 9.412450790405273, + "learning_rate": 7.513111490897672e-06, + "loss": 0.4786, + "step": 3077 + }, + { + "epoch": 0.33917355371900826, + "grad_norm": 5.571948528289795, + "learning_rate": 7.511599802288263e-06, + "loss": 0.4435, + "step": 3078 + }, + { + "epoch": 0.3392837465564738, + "grad_norm": 10.787137031555176, + "learning_rate": 7.5100878065570185e-06, + "loss": 0.4989, + "step": 3079 + }, + { + "epoch": 0.3393939393939394, + "grad_norm": 6.516151428222656, + "learning_rate": 7.5085755038888266e-06, + "loss": 0.3717, + "step": 3080 + }, + { + "epoch": 0.33950413223140496, + "grad_norm": 4.707610607147217, + "learning_rate": 7.507062894468615e-06, + "loss": 0.4428, + "step": 3081 + }, + { + "epoch": 0.3396143250688705, + "grad_norm": 8.729148864746094, + "learning_rate": 7.505549978481345e-06, + "loss": 0.4132, + "step": 3082 + }, + { + "epoch": 0.33972451790633607, + "grad_norm": 7.110918998718262, + "learning_rate": 7.504036756112023e-06, + "loss": 0.4446, + "step": 3083 + }, + { + "epoch": 
0.33983471074380167, + "grad_norm": 4.316981315612793, + "learning_rate": 7.502523227545686e-06, + "loss": 0.4446, + "step": 3084 + }, + { + "epoch": 0.3399449035812672, + "grad_norm": 7.524450778961182, + "learning_rate": 7.5010093929674065e-06, + "loss": 0.4093, + "step": 3085 + }, + { + "epoch": 0.34005509641873277, + "grad_norm": 6.450577259063721, + "learning_rate": 7.499495252562303e-06, + "loss": 0.4341, + "step": 3086 + }, + { + "epoch": 0.3401652892561983, + "grad_norm": 5.516423225402832, + "learning_rate": 7.497980806515524e-06, + "loss": 0.3102, + "step": 3087 + }, + { + "epoch": 0.34027548209366393, + "grad_norm": 4.545234203338623, + "learning_rate": 7.4964660550122595e-06, + "loss": 0.4379, + "step": 3088 + }, + { + "epoch": 0.3403856749311295, + "grad_norm": 9.267948150634766, + "learning_rate": 7.494950998237733e-06, + "loss": 0.5489, + "step": 3089 + }, + { + "epoch": 0.34049586776859503, + "grad_norm": 7.114536762237549, + "learning_rate": 7.493435636377211e-06, + "loss": 0.4453, + "step": 3090 + }, + { + "epoch": 0.3406060606060606, + "grad_norm": 5.326347351074219, + "learning_rate": 7.491919969615993e-06, + "loss": 0.369, + "step": 3091 + }, + { + "epoch": 0.3407162534435262, + "grad_norm": 6.879369735717773, + "learning_rate": 7.490403998139414e-06, + "loss": 0.3687, + "step": 3092 + }, + { + "epoch": 0.34082644628099174, + "grad_norm": 8.012075424194336, + "learning_rate": 7.488887722132853e-06, + "loss": 0.3982, + "step": 3093 + }, + { + "epoch": 0.3409366391184573, + "grad_norm": 4.556394100189209, + "learning_rate": 7.487371141781718e-06, + "loss": 0.444, + "step": 3094 + }, + { + "epoch": 0.3410468319559229, + "grad_norm": 5.1188459396362305, + "learning_rate": 7.485854257271463e-06, + "loss": 0.3585, + "step": 3095 + }, + { + "epoch": 0.34115702479338844, + "grad_norm": 5.984025955200195, + "learning_rate": 7.484337068787574e-06, + "loss": 0.4125, + "step": 3096 + }, + { + "epoch": 0.341267217630854, + "grad_norm": 23.55314826965332, + "learning_rate": 7.482819576515571e-06, + "loss": 0.4087, + "step": 3097 + }, + { + "epoch": 0.34137741046831954, + "grad_norm": 7.297911643981934, + "learning_rate": 7.481301780641019e-06, + "loss": 0.3901, + "step": 3098 + }, + { + "epoch": 0.34148760330578515, + "grad_norm": 6.8249921798706055, + "learning_rate": 7.479783681349515e-06, + "loss": 0.4114, + "step": 3099 + }, + { + "epoch": 0.3415977961432507, + "grad_norm": 12.885574340820312, + "learning_rate": 7.478265278826693e-06, + "loss": 0.5381, + "step": 3100 + }, + { + "epoch": 0.34170798898071625, + "grad_norm": 9.255385398864746, + "learning_rate": 7.476746573258227e-06, + "loss": 0.5265, + "step": 3101 + }, + { + "epoch": 0.3418181818181818, + "grad_norm": 9.08926773071289, + "learning_rate": 7.475227564829826e-06, + "loss": 0.437, + "step": 3102 + }, + { + "epoch": 0.3419283746556474, + "grad_norm": 10.180606842041016, + "learning_rate": 7.473708253727234e-06, + "loss": 0.5285, + "step": 3103 + }, + { + "epoch": 0.34203856749311295, + "grad_norm": 7.8228840827941895, + "learning_rate": 7.472188640136239e-06, + "loss": 0.454, + "step": 3104 + }, + { + "epoch": 0.3421487603305785, + "grad_norm": 6.170624256134033, + "learning_rate": 7.470668724242658e-06, + "loss": 0.4702, + "step": 3105 + }, + { + "epoch": 0.34225895316804406, + "grad_norm": 7.054677963256836, + "learning_rate": 7.46914850623235e-06, + "loss": 0.5059, + "step": 3106 + }, + { + "epoch": 0.34236914600550966, + "grad_norm": 8.298323631286621, + "learning_rate": 7.467627986291207e-06, + "loss": 0.4698, + 
"step": 3107 + }, + { + "epoch": 0.3424793388429752, + "grad_norm": 7.792571544647217, + "learning_rate": 7.466107164605163e-06, + "loss": 0.4285, + "step": 3108 + }, + { + "epoch": 0.34258953168044076, + "grad_norm": 7.682173728942871, + "learning_rate": 7.464586041360186e-06, + "loss": 0.3875, + "step": 3109 + }, + { + "epoch": 0.3426997245179063, + "grad_norm": 9.861799240112305, + "learning_rate": 7.463064616742278e-06, + "loss": 0.4637, + "step": 3110 + }, + { + "epoch": 0.3428099173553719, + "grad_norm": 5.392634868621826, + "learning_rate": 7.461542890937484e-06, + "loss": 0.4372, + "step": 3111 + }, + { + "epoch": 0.34292011019283747, + "grad_norm": 8.271360397338867, + "learning_rate": 7.460020864131883e-06, + "loss": 0.4836, + "step": 3112 + }, + { + "epoch": 0.343030303030303, + "grad_norm": 7.254205703735352, + "learning_rate": 7.458498536511587e-06, + "loss": 0.4199, + "step": 3113 + }, + { + "epoch": 0.34314049586776857, + "grad_norm": 9.592790603637695, + "learning_rate": 7.4569759082627515e-06, + "loss": 0.464, + "step": 3114 + }, + { + "epoch": 0.3432506887052342, + "grad_norm": 9.89777946472168, + "learning_rate": 7.455452979571562e-06, + "loss": 0.4837, + "step": 3115 + }, + { + "epoch": 0.3433608815426997, + "grad_norm": 7.666777610778809, + "learning_rate": 7.453929750624249e-06, + "loss": 0.3973, + "step": 3116 + }, + { + "epoch": 0.3434710743801653, + "grad_norm": 7.6917405128479, + "learning_rate": 7.452406221607073e-06, + "loss": 0.4681, + "step": 3117 + }, + { + "epoch": 0.3435812672176309, + "grad_norm": 6.429231643676758, + "learning_rate": 7.450882392706332e-06, + "loss": 0.4451, + "step": 3118 + }, + { + "epoch": 0.34369146005509643, + "grad_norm": 6.396045207977295, + "learning_rate": 7.449358264108365e-06, + "loss": 0.4587, + "step": 3119 + }, + { + "epoch": 0.343801652892562, + "grad_norm": 8.26191234588623, + "learning_rate": 7.4478338359995405e-06, + "loss": 0.3897, + "step": 3120 + }, + { + "epoch": 0.34391184573002753, + "grad_norm": 4.853174686431885, + "learning_rate": 7.44630910856627e-06, + "loss": 0.4505, + "step": 3121 + }, + { + "epoch": 0.34402203856749314, + "grad_norm": 5.741095066070557, + "learning_rate": 7.444784081994998e-06, + "loss": 0.3397, + "step": 3122 + }, + { + "epoch": 0.3441322314049587, + "grad_norm": 8.979782104492188, + "learning_rate": 7.443258756472207e-06, + "loss": 0.4242, + "step": 3123 + }, + { + "epoch": 0.34424242424242424, + "grad_norm": 7.540767192840576, + "learning_rate": 7.4417331321844174e-06, + "loss": 0.4519, + "step": 3124 + }, + { + "epoch": 0.3443526170798898, + "grad_norm": 5.847742557525635, + "learning_rate": 7.440207209318183e-06, + "loss": 0.3724, + "step": 3125 + }, + { + "epoch": 0.3444628099173554, + "grad_norm": 8.169764518737793, + "learning_rate": 7.4386809880600975e-06, + "loss": 0.4189, + "step": 3126 + }, + { + "epoch": 0.34457300275482095, + "grad_norm": 9.309894561767578, + "learning_rate": 7.437154468596788e-06, + "loss": 0.4038, + "step": 3127 + }, + { + "epoch": 0.3446831955922865, + "grad_norm": 7.024919033050537, + "learning_rate": 7.435627651114919e-06, + "loss": 0.3992, + "step": 3128 + }, + { + "epoch": 0.34479338842975205, + "grad_norm": 13.71273422241211, + "learning_rate": 7.434100535801192e-06, + "loss": 0.4638, + "step": 3129 + }, + { + "epoch": 0.34490358126721765, + "grad_norm": 6.8807830810546875, + "learning_rate": 7.432573122842346e-06, + "loss": 0.4508, + "step": 3130 + }, + { + "epoch": 0.3450137741046832, + "grad_norm": 10.221870422363281, + "learning_rate": 
7.431045412425153e-06, + "loss": 0.4322, + "step": 3131 + }, + { + "epoch": 0.34512396694214875, + "grad_norm": 4.576849460601807, + "learning_rate": 7.429517404736426e-06, + "loss": 0.3949, + "step": 3132 + }, + { + "epoch": 0.3452341597796143, + "grad_norm": 6.163484573364258, + "learning_rate": 7.42798909996301e-06, + "loss": 0.3809, + "step": 3133 + }, + { + "epoch": 0.3453443526170799, + "grad_norm": 8.160926818847656, + "learning_rate": 7.42646049829179e-06, + "loss": 0.432, + "step": 3134 + }, + { + "epoch": 0.34545454545454546, + "grad_norm": 7.556317329406738, + "learning_rate": 7.424931599909682e-06, + "loss": 0.4434, + "step": 3135 + }, + { + "epoch": 0.345564738292011, + "grad_norm": 5.177813529968262, + "learning_rate": 7.423402405003645e-06, + "loss": 0.3683, + "step": 3136 + }, + { + "epoch": 0.34567493112947656, + "grad_norm": 8.282599449157715, + "learning_rate": 7.421872913760671e-06, + "loss": 0.4316, + "step": 3137 + }, + { + "epoch": 0.34578512396694217, + "grad_norm": 4.865157604217529, + "learning_rate": 7.420343126367785e-06, + "loss": 0.4248, + "step": 3138 + }, + { + "epoch": 0.3458953168044077, + "grad_norm": 6.126284599304199, + "learning_rate": 7.418813043012057e-06, + "loss": 0.4936, + "step": 3139 + }, + { + "epoch": 0.34600550964187327, + "grad_norm": 8.956912994384766, + "learning_rate": 7.417282663880582e-06, + "loss": 0.4199, + "step": 3140 + }, + { + "epoch": 0.3461157024793388, + "grad_norm": 4.355523586273193, + "learning_rate": 7.415751989160499e-06, + "loss": 0.3804, + "step": 3141 + }, + { + "epoch": 0.3462258953168044, + "grad_norm": 6.272829055786133, + "learning_rate": 7.414221019038983e-06, + "loss": 0.3715, + "step": 3142 + }, + { + "epoch": 0.34633608815426997, + "grad_norm": 7.41179895401001, + "learning_rate": 7.4126897537032396e-06, + "loss": 0.4484, + "step": 3143 + }, + { + "epoch": 0.3464462809917355, + "grad_norm": 9.036253929138184, + "learning_rate": 7.411158193340517e-06, + "loss": 0.4919, + "step": 3144 + }, + { + "epoch": 0.34655647382920113, + "grad_norm": 9.739233016967773, + "learning_rate": 7.409626338138096e-06, + "loss": 0.3531, + "step": 3145 + }, + { + "epoch": 0.3466666666666667, + "grad_norm": 5.984482288360596, + "learning_rate": 7.408094188283291e-06, + "loss": 0.381, + "step": 3146 + }, + { + "epoch": 0.34677685950413223, + "grad_norm": 5.840088844299316, + "learning_rate": 7.40656174396346e-06, + "loss": 0.5005, + "step": 3147 + }, + { + "epoch": 0.3468870523415978, + "grad_norm": 5.8981475830078125, + "learning_rate": 7.405029005365989e-06, + "loss": 0.3878, + "step": 3148 + }, + { + "epoch": 0.3469972451790634, + "grad_norm": 7.689345359802246, + "learning_rate": 7.403495972678303e-06, + "loss": 0.4888, + "step": 3149 + }, + { + "epoch": 0.34710743801652894, + "grad_norm": 7.819447994232178, + "learning_rate": 7.401962646087867e-06, + "loss": 0.3577, + "step": 3150 + }, + { + "epoch": 0.3472176308539945, + "grad_norm": 7.267233848571777, + "learning_rate": 7.400429025782174e-06, + "loss": 0.5349, + "step": 3151 + }, + { + "epoch": 0.34732782369146004, + "grad_norm": 8.139227867126465, + "learning_rate": 7.398895111948761e-06, + "loss": 0.5192, + "step": 3152 + }, + { + "epoch": 0.34743801652892564, + "grad_norm": 5.7755608558654785, + "learning_rate": 7.397360904775193e-06, + "loss": 0.4215, + "step": 3153 + }, + { + "epoch": 0.3475482093663912, + "grad_norm": 8.518473625183105, + "learning_rate": 7.395826404449078e-06, + "loss": 0.4111, + "step": 3154 + }, + { + "epoch": 0.34765840220385674, + "grad_norm": 
9.734333038330078, + "learning_rate": 7.394291611158056e-06, + "loss": 0.5013, + "step": 3155 + }, + { + "epoch": 0.3477685950413223, + "grad_norm": 5.6584296226501465, + "learning_rate": 7.392756525089804e-06, + "loss": 0.4243, + "step": 3156 + }, + { + "epoch": 0.3478787878787879, + "grad_norm": 5.8375020027160645, + "learning_rate": 7.3912211464320324e-06, + "loss": 0.4266, + "step": 3157 + }, + { + "epoch": 0.34798898071625345, + "grad_norm": 4.557214260101318, + "learning_rate": 7.3896854753724926e-06, + "loss": 0.4037, + "step": 3158 + }, + { + "epoch": 0.348099173553719, + "grad_norm": 5.716039657592773, + "learning_rate": 7.3881495120989644e-06, + "loss": 0.3868, + "step": 3159 + }, + { + "epoch": 0.34820936639118455, + "grad_norm": 9.541804313659668, + "learning_rate": 7.3866132567992725e-06, + "loss": 0.4031, + "step": 3160 + }, + { + "epoch": 0.34831955922865016, + "grad_norm": 7.745457649230957, + "learning_rate": 7.385076709661268e-06, + "loss": 0.4666, + "step": 3161 + }, + { + "epoch": 0.3484297520661157, + "grad_norm": 5.802303314208984, + "learning_rate": 7.3835398708728434e-06, + "loss": 0.4277, + "step": 3162 + }, + { + "epoch": 0.34853994490358126, + "grad_norm": 8.434252738952637, + "learning_rate": 7.382002740621927e-06, + "loss": 0.432, + "step": 3163 + }, + { + "epoch": 0.3486501377410468, + "grad_norm": 6.1905999183654785, + "learning_rate": 7.380465319096478e-06, + "loss": 0.4522, + "step": 3164 + }, + { + "epoch": 0.3487603305785124, + "grad_norm": 8.930573463439941, + "learning_rate": 7.3789276064845e-06, + "loss": 0.5147, + "step": 3165 + }, + { + "epoch": 0.34887052341597796, + "grad_norm": 4.414763450622559, + "learning_rate": 7.3773896029740185e-06, + "loss": 0.3796, + "step": 3166 + }, + { + "epoch": 0.3489807162534435, + "grad_norm": 5.990714073181152, + "learning_rate": 7.375851308753109e-06, + "loss": 0.4047, + "step": 3167 + }, + { + "epoch": 0.3490909090909091, + "grad_norm": 13.556224822998047, + "learning_rate": 7.3743127240098746e-06, + "loss": 0.3992, + "step": 3168 + }, + { + "epoch": 0.34920110192837467, + "grad_norm": 7.954684734344482, + "learning_rate": 7.3727738489324545e-06, + "loss": 0.4949, + "step": 3169 + }, + { + "epoch": 0.3493112947658402, + "grad_norm": 9.754575729370117, + "learning_rate": 7.371234683709025e-06, + "loss": 0.5573, + "step": 3170 + }, + { + "epoch": 0.34942148760330577, + "grad_norm": 6.305998802185059, + "learning_rate": 7.369695228527796e-06, + "loss": 0.4003, + "step": 3171 + }, + { + "epoch": 0.3495316804407714, + "grad_norm": 5.422273635864258, + "learning_rate": 7.368155483577017e-06, + "loss": 0.4312, + "step": 3172 + }, + { + "epoch": 0.3496418732782369, + "grad_norm": 7.9743781089782715, + "learning_rate": 7.366615449044969e-06, + "loss": 0.3525, + "step": 3173 + }, + { + "epoch": 0.3497520661157025, + "grad_norm": 3.8766064643859863, + "learning_rate": 7.365075125119969e-06, + "loss": 0.4194, + "step": 3174 + }, + { + "epoch": 0.349862258953168, + "grad_norm": 5.437851905822754, + "learning_rate": 7.36353451199037e-06, + "loss": 0.4043, + "step": 3175 + }, + { + "epoch": 0.34997245179063363, + "grad_norm": 6.247857570648193, + "learning_rate": 7.36199360984456e-06, + "loss": 0.3724, + "step": 3176 + }, + { + "epoch": 0.3500826446280992, + "grad_norm": 5.689133167266846, + "learning_rate": 7.3604524188709625e-06, + "loss": 0.4144, + "step": 3177 + }, + { + "epoch": 0.35019283746556473, + "grad_norm": 7.601550579071045, + "learning_rate": 7.358910939258038e-06, + "loss": 0.4944, + "step": 3178 + }, + { + 
"epoch": 0.3503030303030303, + "grad_norm": 4.168118000030518, + "learning_rate": 7.35736917119428e-06, + "loss": 0.3522, + "step": 3179 + }, + { + "epoch": 0.3504132231404959, + "grad_norm": 5.866237640380859, + "learning_rate": 7.355827114868216e-06, + "loss": 0.3146, + "step": 3180 + }, + { + "epoch": 0.35052341597796144, + "grad_norm": 8.666057586669922, + "learning_rate": 7.354284770468411e-06, + "loss": 0.4484, + "step": 3181 + }, + { + "epoch": 0.350633608815427, + "grad_norm": 17.02338981628418, + "learning_rate": 7.35274213818347e-06, + "loss": 0.5835, + "step": 3182 + }, + { + "epoch": 0.35074380165289254, + "grad_norm": 5.194803237915039, + "learning_rate": 7.351199218202023e-06, + "loss": 0.4193, + "step": 3183 + }, + { + "epoch": 0.35085399449035815, + "grad_norm": 9.60494613647461, + "learning_rate": 7.3496560107127405e-06, + "loss": 0.4136, + "step": 3184 + }, + { + "epoch": 0.3509641873278237, + "grad_norm": 7.146261692047119, + "learning_rate": 7.348112515904331e-06, + "loss": 0.4132, + "step": 3185 + }, + { + "epoch": 0.35107438016528925, + "grad_norm": 5.76173210144043, + "learning_rate": 7.346568733965534e-06, + "loss": 0.4593, + "step": 3186 + }, + { + "epoch": 0.3511845730027548, + "grad_norm": 5.917609691619873, + "learning_rate": 7.345024665085121e-06, + "loss": 0.4321, + "step": 3187 + }, + { + "epoch": 0.3512947658402204, + "grad_norm": 7.550190448760986, + "learning_rate": 7.3434803094519096e-06, + "loss": 0.4516, + "step": 3188 + }, + { + "epoch": 0.35140495867768595, + "grad_norm": 6.343799114227295, + "learning_rate": 7.3419356672547425e-06, + "loss": 0.4434, + "step": 3189 + }, + { + "epoch": 0.3515151515151515, + "grad_norm": 9.271021842956543, + "learning_rate": 7.3403907386824995e-06, + "loss": 0.4578, + "step": 3190 + }, + { + "epoch": 0.35162534435261705, + "grad_norm": 5.087544918060303, + "learning_rate": 7.3388455239240986e-06, + "loss": 0.3618, + "step": 3191 + }, + { + "epoch": 0.35173553719008266, + "grad_norm": 5.713773727416992, + "learning_rate": 7.33730002316849e-06, + "loss": 0.4621, + "step": 3192 + }, + { + "epoch": 0.3518457300275482, + "grad_norm": 7.254720687866211, + "learning_rate": 7.335754236604661e-06, + "loss": 0.415, + "step": 3193 + }, + { + "epoch": 0.35195592286501376, + "grad_norm": 5.747312068939209, + "learning_rate": 7.33420816442163e-06, + "loss": 0.4226, + "step": 3194 + }, + { + "epoch": 0.35206611570247937, + "grad_norm": 6.117341041564941, + "learning_rate": 7.332661806808452e-06, + "loss": 0.5094, + "step": 3195 + }, + { + "epoch": 0.3521763085399449, + "grad_norm": 5.964598655700684, + "learning_rate": 7.331115163954223e-06, + "loss": 0.4295, + "step": 3196 + }, + { + "epoch": 0.35228650137741047, + "grad_norm": 7.39686393737793, + "learning_rate": 7.329568236048064e-06, + "loss": 0.5029, + "step": 3197 + }, + { + "epoch": 0.352396694214876, + "grad_norm": 8.714959144592285, + "learning_rate": 7.328021023279136e-06, + "loss": 0.4611, + "step": 3198 + }, + { + "epoch": 0.3525068870523416, + "grad_norm": 10.739441871643066, + "learning_rate": 7.326473525836635e-06, + "loss": 0.4679, + "step": 3199 + }, + { + "epoch": 0.3526170798898072, + "grad_norm": 4.571356773376465, + "learning_rate": 7.324925743909792e-06, + "loss": 0.4637, + "step": 3200 + }, + { + "epoch": 0.3527272727272727, + "grad_norm": 5.35941743850708, + "learning_rate": 7.323377677687871e-06, + "loss": 0.4359, + "step": 3201 + }, + { + "epoch": 0.3528374655647383, + "grad_norm": 6.5187482833862305, + "learning_rate": 7.32182932736017e-06, + "loss": 0.4485, 
+ "step": 3202 + }, + { + "epoch": 0.3529476584022039, + "grad_norm": 5.340867519378662, + "learning_rate": 7.320280693116027e-06, + "loss": 0.4535, + "step": 3203 + }, + { + "epoch": 0.35305785123966943, + "grad_norm": 7.739713668823242, + "learning_rate": 7.3187317751448076e-06, + "loss": 0.3692, + "step": 3204 + }, + { + "epoch": 0.353168044077135, + "grad_norm": 5.844455242156982, + "learning_rate": 7.317182573635917e-06, + "loss": 0.3821, + "step": 3205 + }, + { + "epoch": 0.35327823691460053, + "grad_norm": 9.835358619689941, + "learning_rate": 7.315633088778794e-06, + "loss": 0.4196, + "step": 3206 + }, + { + "epoch": 0.35338842975206614, + "grad_norm": 10.456611633300781, + "learning_rate": 7.314083320762913e-06, + "loss": 0.4098, + "step": 3207 + }, + { + "epoch": 0.3534986225895317, + "grad_norm": 10.098512649536133, + "learning_rate": 7.312533269777777e-06, + "loss": 0.5049, + "step": 3208 + }, + { + "epoch": 0.35360881542699724, + "grad_norm": 4.391416072845459, + "learning_rate": 7.310982936012933e-06, + "loss": 0.4541, + "step": 3209 + }, + { + "epoch": 0.3537190082644628, + "grad_norm": 11.859990119934082, + "learning_rate": 7.309432319657957e-06, + "loss": 0.5331, + "step": 3210 + }, + { + "epoch": 0.3538292011019284, + "grad_norm": 9.845305442810059, + "learning_rate": 7.307881420902461e-06, + "loss": 0.3995, + "step": 3211 + }, + { + "epoch": 0.35393939393939394, + "grad_norm": 6.509954452514648, + "learning_rate": 7.3063302399360865e-06, + "loss": 0.405, + "step": 3212 + }, + { + "epoch": 0.3540495867768595, + "grad_norm": 6.005599498748779, + "learning_rate": 7.30477877694852e-06, + "loss": 0.3491, + "step": 3213 + }, + { + "epoch": 0.35415977961432504, + "grad_norm": 4.321752548217773, + "learning_rate": 7.303227032129474e-06, + "loss": 0.4173, + "step": 3214 + }, + { + "epoch": 0.35426997245179065, + "grad_norm": 9.19875431060791, + "learning_rate": 7.301675005668697e-06, + "loss": 0.4207, + "step": 3215 + }, + { + "epoch": 0.3543801652892562, + "grad_norm": 4.377025127410889, + "learning_rate": 7.300122697755974e-06, + "loss": 0.3493, + "step": 3216 + }, + { + "epoch": 0.35449035812672175, + "grad_norm": 4.898425579071045, + "learning_rate": 7.298570108581123e-06, + "loss": 0.4039, + "step": 3217 + }, + { + "epoch": 0.35460055096418736, + "grad_norm": 11.626812934875488, + "learning_rate": 7.297017238333997e-06, + "loss": 0.4473, + "step": 3218 + }, + { + "epoch": 0.3547107438016529, + "grad_norm": 5.8675994873046875, + "learning_rate": 7.295464087204483e-06, + "loss": 0.3992, + "step": 3219 + }, + { + "epoch": 0.35482093663911846, + "grad_norm": 11.243317604064941, + "learning_rate": 7.293910655382501e-06, + "loss": 0.4998, + "step": 3220 + }, + { + "epoch": 0.354931129476584, + "grad_norm": 8.46310043334961, + "learning_rate": 7.292356943058011e-06, + "loss": 0.4724, + "step": 3221 + }, + { + "epoch": 0.3550413223140496, + "grad_norm": 10.018592834472656, + "learning_rate": 7.290802950420998e-06, + "loss": 0.4469, + "step": 3222 + }, + { + "epoch": 0.35515151515151516, + "grad_norm": 6.223527431488037, + "learning_rate": 7.289248677661488e-06, + "loss": 0.5229, + "step": 3223 + }, + { + "epoch": 0.3552617079889807, + "grad_norm": 9.454200744628906, + "learning_rate": 7.287694124969542e-06, + "loss": 0.4397, + "step": 3224 + }, + { + "epoch": 0.35537190082644626, + "grad_norm": 4.261337757110596, + "learning_rate": 7.286139292535249e-06, + "loss": 0.4521, + "step": 3225 + }, + { + "epoch": 0.35548209366391187, + "grad_norm": 12.140992164611816, + "learning_rate": 
7.28458418054874e-06, + "loss": 0.5055, + "step": 3226 + }, + { + "epoch": 0.3555922865013774, + "grad_norm": 10.759506225585938, + "learning_rate": 7.2830287892001705e-06, + "loss": 0.5065, + "step": 3227 + }, + { + "epoch": 0.35570247933884297, + "grad_norm": 9.743803024291992, + "learning_rate": 7.281473118679743e-06, + "loss": 0.5222, + "step": 3228 + }, + { + "epoch": 0.3558126721763085, + "grad_norm": 5.370841026306152, + "learning_rate": 7.2799171691776816e-06, + "loss": 0.4211, + "step": 3229 + }, + { + "epoch": 0.3559228650137741, + "grad_norm": 4.253917217254639, + "learning_rate": 7.278360940884252e-06, + "loss": 0.4418, + "step": 3230 + }, + { + "epoch": 0.3560330578512397, + "grad_norm": 5.469326496124268, + "learning_rate": 7.276804433989753e-06, + "loss": 0.4074, + "step": 3231 + }, + { + "epoch": 0.3561432506887052, + "grad_norm": 6.347237586975098, + "learning_rate": 7.275247648684514e-06, + "loss": 0.4117, + "step": 3232 + }, + { + "epoch": 0.3562534435261708, + "grad_norm": 10.482932090759277, + "learning_rate": 7.273690585158901e-06, + "loss": 0.4599, + "step": 3233 + }, + { + "epoch": 0.3563636363636364, + "grad_norm": 5.696188449859619, + "learning_rate": 7.272133243603317e-06, + "loss": 0.4533, + "step": 3234 + }, + { + "epoch": 0.35647382920110193, + "grad_norm": 8.62669849395752, + "learning_rate": 7.270575624208192e-06, + "loss": 0.4449, + "step": 3235 + }, + { + "epoch": 0.3565840220385675, + "grad_norm": 8.257272720336914, + "learning_rate": 7.269017727163995e-06, + "loss": 0.3958, + "step": 3236 + }, + { + "epoch": 0.35669421487603303, + "grad_norm": 5.441015720367432, + "learning_rate": 7.267459552661229e-06, + "loss": 0.4107, + "step": 3237 + }, + { + "epoch": 0.35680440771349864, + "grad_norm": 7.645590782165527, + "learning_rate": 7.26590110089043e-06, + "loss": 0.4335, + "step": 3238 + }, + { + "epoch": 0.3569146005509642, + "grad_norm": 5.621476650238037, + "learning_rate": 7.264342372042165e-06, + "loss": 0.4162, + "step": 3239 + }, + { + "epoch": 0.35702479338842974, + "grad_norm": 7.011815547943115, + "learning_rate": 7.2627833663070394e-06, + "loss": 0.4693, + "step": 3240 + }, + { + "epoch": 0.3571349862258953, + "grad_norm": 5.472735404968262, + "learning_rate": 7.261224083875688e-06, + "loss": 0.485, + "step": 3241 + }, + { + "epoch": 0.3572451790633609, + "grad_norm": 4.558168888092041, + "learning_rate": 7.2596645249387876e-06, + "loss": 0.4939, + "step": 3242 + }, + { + "epoch": 0.35735537190082645, + "grad_norm": 5.512816905975342, + "learning_rate": 7.258104689687038e-06, + "loss": 0.4622, + "step": 3243 + }, + { + "epoch": 0.357465564738292, + "grad_norm": 10.716435432434082, + "learning_rate": 7.25654457831118e-06, + "loss": 0.503, + "step": 3244 + }, + { + "epoch": 0.3575757575757576, + "grad_norm": 5.423465251922607, + "learning_rate": 7.254984191001986e-06, + "loss": 0.4033, + "step": 3245 + }, + { + "epoch": 0.35768595041322315, + "grad_norm": 7.701810836791992, + "learning_rate": 7.253423527950259e-06, + "loss": 0.4288, + "step": 3246 + }, + { + "epoch": 0.3577961432506887, + "grad_norm": 6.4743242263793945, + "learning_rate": 7.251862589346845e-06, + "loss": 0.389, + "step": 3247 + }, + { + "epoch": 0.35790633608815425, + "grad_norm": 6.115100383758545, + "learning_rate": 7.2503013753826135e-06, + "loss": 0.41, + "step": 3248 + }, + { + "epoch": 0.35801652892561986, + "grad_norm": 4.2141289710998535, + "learning_rate": 7.248739886248475e-06, + "loss": 0.4381, + "step": 3249 + }, + { + "epoch": 0.3581267217630854, + "grad_norm": 
5.334897994995117, + "learning_rate": 7.247178122135368e-06, + "loss": 0.3945, + "step": 3250 + }, + { + "epoch": 0.35823691460055096, + "grad_norm": 6.416749000549316, + "learning_rate": 7.245616083234266e-06, + "loss": 0.4502, + "step": 3251 + }, + { + "epoch": 0.3583471074380165, + "grad_norm": 6.545506954193115, + "learning_rate": 7.244053769736181e-06, + "loss": 0.4508, + "step": 3252 + }, + { + "epoch": 0.3584573002754821, + "grad_norm": 5.6274003982543945, + "learning_rate": 7.242491181832151e-06, + "loss": 0.4527, + "step": 3253 + }, + { + "epoch": 0.35856749311294767, + "grad_norm": 6.930521488189697, + "learning_rate": 7.240928319713253e-06, + "loss": 0.367, + "step": 3254 + }, + { + "epoch": 0.3586776859504132, + "grad_norm": 12.740245819091797, + "learning_rate": 7.239365183570597e-06, + "loss": 0.431, + "step": 3255 + }, + { + "epoch": 0.35878787878787877, + "grad_norm": 5.199628829956055, + "learning_rate": 7.237801773595325e-06, + "loss": 0.4571, + "step": 3256 + }, + { + "epoch": 0.3588980716253444, + "grad_norm": 8.44864559173584, + "learning_rate": 7.236238089978613e-06, + "loss": 0.3375, + "step": 3257 + }, + { + "epoch": 0.3590082644628099, + "grad_norm": 4.241014003753662, + "learning_rate": 7.234674132911668e-06, + "loss": 0.4201, + "step": 3258 + }, + { + "epoch": 0.3591184573002755, + "grad_norm": 10.057315826416016, + "learning_rate": 7.233109902585735e-06, + "loss": 0.42, + "step": 3259 + }, + { + "epoch": 0.359228650137741, + "grad_norm": 6.158026218414307, + "learning_rate": 7.23154539919209e-06, + "loss": 0.4335, + "step": 3260 + }, + { + "epoch": 0.35933884297520663, + "grad_norm": 7.874216079711914, + "learning_rate": 7.2299806229220416e-06, + "loss": 0.5172, + "step": 3261 + }, + { + "epoch": 0.3594490358126722, + "grad_norm": 5.500450134277344, + "learning_rate": 7.228415573966934e-06, + "loss": 0.383, + "step": 3262 + }, + { + "epoch": 0.35955922865013773, + "grad_norm": 6.57933235168457, + "learning_rate": 7.226850252518144e-06, + "loss": 0.4605, + "step": 3263 + }, + { + "epoch": 0.3596694214876033, + "grad_norm": 5.754207611083984, + "learning_rate": 7.225284658767077e-06, + "loss": 0.482, + "step": 3264 + }, + { + "epoch": 0.3597796143250689, + "grad_norm": 8.27151870727539, + "learning_rate": 7.223718792905183e-06, + "loss": 0.4389, + "step": 3265 + }, + { + "epoch": 0.35988980716253444, + "grad_norm": 6.193888187408447, + "learning_rate": 7.222152655123933e-06, + "loss": 0.4712, + "step": 3266 + }, + { + "epoch": 0.36, + "grad_norm": 5.438370227813721, + "learning_rate": 7.220586245614838e-06, + "loss": 0.372, + "step": 3267 + }, + { + "epoch": 0.36011019283746554, + "grad_norm": 5.420002460479736, + "learning_rate": 7.219019564569441e-06, + "loss": 0.394, + "step": 3268 + }, + { + "epoch": 0.36022038567493114, + "grad_norm": 8.265101432800293, + "learning_rate": 7.217452612179314e-06, + "loss": 0.4221, + "step": 3269 + }, + { + "epoch": 0.3603305785123967, + "grad_norm": 8.280856132507324, + "learning_rate": 7.215885388636075e-06, + "loss": 0.4387, + "step": 3270 + }, + { + "epoch": 0.36044077134986224, + "grad_norm": 9.710977554321289, + "learning_rate": 7.214317894131357e-06, + "loss": 0.4793, + "step": 3271 + }, + { + "epoch": 0.36055096418732785, + "grad_norm": 4.9738688468933105, + "learning_rate": 7.212750128856839e-06, + "loss": 0.399, + "step": 3272 + }, + { + "epoch": 0.3606611570247934, + "grad_norm": 18.947298049926758, + "learning_rate": 7.211182093004231e-06, + "loss": 0.4313, + "step": 3273 + }, + { + "epoch": 0.36077134986225895, + 
"grad_norm": 6.143950939178467, + "learning_rate": 7.209613786765272e-06, + "loss": 0.3856, + "step": 3274 + }, + { + "epoch": 0.3608815426997245, + "grad_norm": 5.4570465087890625, + "learning_rate": 7.208045210331738e-06, + "loss": 0.3718, + "step": 3275 + }, + { + "epoch": 0.3609917355371901, + "grad_norm": 5.805082321166992, + "learning_rate": 7.206476363895436e-06, + "loss": 0.3773, + "step": 3276 + }, + { + "epoch": 0.36110192837465566, + "grad_norm": 11.431785583496094, + "learning_rate": 7.204907247648207e-06, + "loss": 0.4925, + "step": 3277 + }, + { + "epoch": 0.3612121212121212, + "grad_norm": 13.084609031677246, + "learning_rate": 7.203337861781926e-06, + "loss": 0.3695, + "step": 3278 + }, + { + "epoch": 0.36132231404958676, + "grad_norm": 6.078426361083984, + "learning_rate": 7.201768206488498e-06, + "loss": 0.4312, + "step": 3279 + }, + { + "epoch": 0.36143250688705236, + "grad_norm": 8.034586906433105, + "learning_rate": 7.200198281959863e-06, + "loss": 0.4721, + "step": 3280 + }, + { + "epoch": 0.3615426997245179, + "grad_norm": 6.784212589263916, + "learning_rate": 7.198628088387992e-06, + "loss": 0.4449, + "step": 3281 + }, + { + "epoch": 0.36165289256198346, + "grad_norm": 6.045942306518555, + "learning_rate": 7.197057625964892e-06, + "loss": 0.3875, + "step": 3282 + }, + { + "epoch": 0.361763085399449, + "grad_norm": 5.697310924530029, + "learning_rate": 7.195486894882602e-06, + "loss": 0.3944, + "step": 3283 + }, + { + "epoch": 0.3618732782369146, + "grad_norm": 7.050083637237549, + "learning_rate": 7.193915895333192e-06, + "loss": 0.4038, + "step": 3284 + }, + { + "epoch": 0.36198347107438017, + "grad_norm": 9.088546752929688, + "learning_rate": 7.192344627508767e-06, + "loss": 0.5356, + "step": 3285 + }, + { + "epoch": 0.3620936639118457, + "grad_norm": 9.426963806152344, + "learning_rate": 7.190773091601461e-06, + "loss": 0.5076, + "step": 3286 + }, + { + "epoch": 0.36220385674931127, + "grad_norm": 5.8113813400268555, + "learning_rate": 7.189201287803447e-06, + "loss": 0.4346, + "step": 3287 + }, + { + "epoch": 0.3623140495867769, + "grad_norm": 7.260709285736084, + "learning_rate": 7.187629216306925e-06, + "loss": 0.4405, + "step": 3288 + }, + { + "epoch": 0.3624242424242424, + "grad_norm": 7.138723850250244, + "learning_rate": 7.18605687730413e-06, + "loss": 0.3697, + "step": 3289 + }, + { + "epoch": 0.362534435261708, + "grad_norm": 8.327974319458008, + "learning_rate": 7.184484270987333e-06, + "loss": 0.4503, + "step": 3290 + }, + { + "epoch": 0.3626446280991735, + "grad_norm": 7.641798496246338, + "learning_rate": 7.182911397548831e-06, + "loss": 0.3697, + "step": 3291 + }, + { + "epoch": 0.36275482093663913, + "grad_norm": 6.12933349609375, + "learning_rate": 7.181338257180956e-06, + "loss": 0.4458, + "step": 3292 + }, + { + "epoch": 0.3628650137741047, + "grad_norm": 5.816046714782715, + "learning_rate": 7.179764850076078e-06, + "loss": 0.438, + "step": 3293 + }, + { + "epoch": 0.36297520661157023, + "grad_norm": 8.879964828491211, + "learning_rate": 7.178191176426594e-06, + "loss": 0.3965, + "step": 3294 + }, + { + "epoch": 0.36308539944903584, + "grad_norm": 7.10797643661499, + "learning_rate": 7.176617236424932e-06, + "loss": 0.3342, + "step": 3295 + }, + { + "epoch": 0.3631955922865014, + "grad_norm": 6.46264123916626, + "learning_rate": 7.17504303026356e-06, + "loss": 0.491, + "step": 3296 + }, + { + "epoch": 0.36330578512396694, + "grad_norm": 6.991508960723877, + "learning_rate": 7.173468558134969e-06, + "loss": 0.3825, + "step": 3297 + }, + { + 
"epoch": 0.3634159779614325, + "grad_norm": 6.524596214294434, + "learning_rate": 7.171893820231693e-06, + "loss": 0.4312, + "step": 3298 + }, + { + "epoch": 0.3635261707988981, + "grad_norm": 25.75640296936035, + "learning_rate": 7.170318816746289e-06, + "loss": 0.4901, + "step": 3299 + }, + { + "epoch": 0.36363636363636365, + "grad_norm": 9.401847839355469, + "learning_rate": 7.168743547871353e-06, + "loss": 0.4732, + "step": 3300 + }, + { + "epoch": 0.3637465564738292, + "grad_norm": 6.0926666259765625, + "learning_rate": 7.167168013799509e-06, + "loss": 0.4114, + "step": 3301 + }, + { + "epoch": 0.36385674931129475, + "grad_norm": 4.8055901527404785, + "learning_rate": 7.165592214723416e-06, + "loss": 0.3633, + "step": 3302 + }, + { + "epoch": 0.36396694214876035, + "grad_norm": 7.8886847496032715, + "learning_rate": 7.164016150835766e-06, + "loss": 0.4952, + "step": 3303 + }, + { + "epoch": 0.3640771349862259, + "grad_norm": 6.765124797821045, + "learning_rate": 7.162439822329282e-06, + "loss": 0.3733, + "step": 3304 + }, + { + "epoch": 0.36418732782369145, + "grad_norm": 7.276710033416748, + "learning_rate": 7.160863229396719e-06, + "loss": 0.4238, + "step": 3305 + }, + { + "epoch": 0.364297520661157, + "grad_norm": 5.627334117889404, + "learning_rate": 7.159286372230865e-06, + "loss": 0.3867, + "step": 3306 + }, + { + "epoch": 0.3644077134986226, + "grad_norm": 6.506944179534912, + "learning_rate": 7.157709251024539e-06, + "loss": 0.4296, + "step": 3307 + }, + { + "epoch": 0.36451790633608816, + "grad_norm": 6.304716110229492, + "learning_rate": 7.156131865970597e-06, + "loss": 0.4234, + "step": 3308 + }, + { + "epoch": 0.3646280991735537, + "grad_norm": 9.835976600646973, + "learning_rate": 7.154554217261921e-06, + "loss": 0.4426, + "step": 3309 + }, + { + "epoch": 0.36473829201101926, + "grad_norm": 8.039626121520996, + "learning_rate": 7.152976305091427e-06, + "loss": 0.4893, + "step": 3310 + }, + { + "epoch": 0.36484848484848487, + "grad_norm": 4.799577236175537, + "learning_rate": 7.151398129652067e-06, + "loss": 0.4305, + "step": 3311 + }, + { + "epoch": 0.3649586776859504, + "grad_norm": 6.675243854522705, + "learning_rate": 7.149819691136822e-06, + "loss": 0.3778, + "step": 3312 + }, + { + "epoch": 0.36506887052341597, + "grad_norm": 7.772543907165527, + "learning_rate": 7.148240989738705e-06, + "loss": 0.4963, + "step": 3313 + }, + { + "epoch": 0.3651790633608815, + "grad_norm": 12.201492309570312, + "learning_rate": 7.1466620256507605e-06, + "loss": 0.5121, + "step": 3314 + }, + { + "epoch": 0.3652892561983471, + "grad_norm": 7.570444583892822, + "learning_rate": 7.145082799066067e-06, + "loss": 0.5187, + "step": 3315 + }, + { + "epoch": 0.3653994490358127, + "grad_norm": 6.17292594909668, + "learning_rate": 7.143503310177737e-06, + "loss": 0.4445, + "step": 3316 + }, + { + "epoch": 0.3655096418732782, + "grad_norm": 9.412154197692871, + "learning_rate": 7.141923559178909e-06, + "loss": 0.4234, + "step": 3317 + }, + { + "epoch": 0.3656198347107438, + "grad_norm": 6.886417388916016, + "learning_rate": 7.14034354626276e-06, + "loss": 0.4243, + "step": 3318 + }, + { + "epoch": 0.3657300275482094, + "grad_norm": 6.714693546295166, + "learning_rate": 7.138763271622494e-06, + "loss": 0.3521, + "step": 3319 + }, + { + "epoch": 0.36584022038567493, + "grad_norm": 6.7171711921691895, + "learning_rate": 7.137182735451349e-06, + "loss": 0.4254, + "step": 3320 + }, + { + "epoch": 0.3659504132231405, + "grad_norm": 8.124762535095215, + "learning_rate": 7.135601937942598e-06, + "loss": 
0.4031, + "step": 3321 + }, + { + "epoch": 0.3660606060606061, + "grad_norm": 3.9232611656188965, + "learning_rate": 7.13402087928954e-06, + "loss": 0.4102, + "step": 3322 + }, + { + "epoch": 0.36617079889807164, + "grad_norm": 8.025232315063477, + "learning_rate": 7.13243955968551e-06, + "loss": 0.394, + "step": 3323 + }, + { + "epoch": 0.3662809917355372, + "grad_norm": 6.724118232727051, + "learning_rate": 7.130857979323875e-06, + "loss": 0.428, + "step": 3324 + }, + { + "epoch": 0.36639118457300274, + "grad_norm": 5.150669097900391, + "learning_rate": 7.12927613839803e-06, + "loss": 0.4169, + "step": 3325 + }, + { + "epoch": 0.36650137741046834, + "grad_norm": 9.238730430603027, + "learning_rate": 7.127694037101409e-06, + "loss": 0.4994, + "step": 3326 + }, + { + "epoch": 0.3666115702479339, + "grad_norm": 7.163279056549072, + "learning_rate": 7.126111675627469e-06, + "loss": 0.4505, + "step": 3327 + }, + { + "epoch": 0.36672176308539944, + "grad_norm": 7.309119701385498, + "learning_rate": 7.124529054169705e-06, + "loss": 0.3855, + "step": 3328 + }, + { + "epoch": 0.366831955922865, + "grad_norm": 7.18917179107666, + "learning_rate": 7.122946172921644e-06, + "loss": 0.4642, + "step": 3329 + }, + { + "epoch": 0.3669421487603306, + "grad_norm": 4.3568572998046875, + "learning_rate": 7.12136303207684e-06, + "loss": 0.401, + "step": 3330 + }, + { + "epoch": 0.36705234159779615, + "grad_norm": 7.7685980796813965, + "learning_rate": 7.119779631828882e-06, + "loss": 0.3312, + "step": 3331 + }, + { + "epoch": 0.3671625344352617, + "grad_norm": 6.122189998626709, + "learning_rate": 7.1181959723713935e-06, + "loss": 0.4262, + "step": 3332 + }, + { + "epoch": 0.36727272727272725, + "grad_norm": 3.9495091438293457, + "learning_rate": 7.116612053898022e-06, + "loss": 0.3853, + "step": 3333 + }, + { + "epoch": 0.36738292011019286, + "grad_norm": 3.9788098335266113, + "learning_rate": 7.115027876602456e-06, + "loss": 0.4517, + "step": 3334 + }, + { + "epoch": 0.3674931129476584, + "grad_norm": 4.041481971740723, + "learning_rate": 7.113443440678406e-06, + "loss": 0.4503, + "step": 3335 + }, + { + "epoch": 0.36760330578512396, + "grad_norm": 18.059675216674805, + "learning_rate": 7.111858746319622e-06, + "loss": 0.5433, + "step": 3336 + }, + { + "epoch": 0.3677134986225895, + "grad_norm": 9.006142616271973, + "learning_rate": 7.110273793719882e-06, + "loss": 0.5144, + "step": 3337 + }, + { + "epoch": 0.3678236914600551, + "grad_norm": 4.624941825866699, + "learning_rate": 7.108688583072996e-06, + "loss": 0.4708, + "step": 3338 + }, + { + "epoch": 0.36793388429752066, + "grad_norm": 7.720061302185059, + "learning_rate": 7.107103114572805e-06, + "loss": 0.4989, + "step": 3339 + }, + { + "epoch": 0.3680440771349862, + "grad_norm": 5.653432846069336, + "learning_rate": 7.1055173884131835e-06, + "loss": 0.4084, + "step": 3340 + }, + { + "epoch": 0.36815426997245176, + "grad_norm": 13.400155067443848, + "learning_rate": 7.103931404788034e-06, + "loss": 0.4775, + "step": 3341 + }, + { + "epoch": 0.36826446280991737, + "grad_norm": 9.454264640808105, + "learning_rate": 7.102345163891297e-06, + "loss": 0.448, + "step": 3342 + }, + { + "epoch": 0.3683746556473829, + "grad_norm": 5.640964984893799, + "learning_rate": 7.100758665916938e-06, + "loss": 0.4596, + "step": 3343 + }, + { + "epoch": 0.36848484848484847, + "grad_norm": 8.44428825378418, + "learning_rate": 7.099171911058954e-06, + "loss": 0.4497, + "step": 3344 + }, + { + "epoch": 0.3685950413223141, + "grad_norm": 5.792271137237549, + "learning_rate": 
7.0975848995113775e-06, + "loss": 0.4347, + "step": 3345 + }, + { + "epoch": 0.3687052341597796, + "grad_norm": 7.476680755615234, + "learning_rate": 7.09599763146827e-06, + "loss": 0.3456, + "step": 3346 + }, + { + "epoch": 0.3688154269972452, + "grad_norm": 5.843869209289551, + "learning_rate": 7.094410107123726e-06, + "loss": 0.4177, + "step": 3347 + }, + { + "epoch": 0.3689256198347107, + "grad_norm": 7.796284198760986, + "learning_rate": 7.092822326671867e-06, + "loss": 0.4709, + "step": 3348 + }, + { + "epoch": 0.36903581267217633, + "grad_norm": 14.648262023925781, + "learning_rate": 7.091234290306853e-06, + "loss": 0.4764, + "step": 3349 + }, + { + "epoch": 0.3691460055096419, + "grad_norm": 6.055613994598389, + "learning_rate": 7.08964599822287e-06, + "loss": 0.3784, + "step": 3350 + }, + { + "epoch": 0.36925619834710743, + "grad_norm": 4.72700834274292, + "learning_rate": 7.088057450614133e-06, + "loss": 0.4069, + "step": 3351 + }, + { + "epoch": 0.369366391184573, + "grad_norm": 6.382406234741211, + "learning_rate": 7.0864686476748965e-06, + "loss": 0.3952, + "step": 3352 + }, + { + "epoch": 0.3694765840220386, + "grad_norm": 7.267900466918945, + "learning_rate": 7.084879589599439e-06, + "loss": 0.4146, + "step": 3353 + }, + { + "epoch": 0.36958677685950414, + "grad_norm": 14.743758201599121, + "learning_rate": 7.083290276582075e-06, + "loss": 0.3825, + "step": 3354 + }, + { + "epoch": 0.3696969696969697, + "grad_norm": 5.35316276550293, + "learning_rate": 7.0817007088171445e-06, + "loss": 0.459, + "step": 3355 + }, + { + "epoch": 0.36980716253443524, + "grad_norm": 4.526485919952393, + "learning_rate": 7.080110886499023e-06, + "loss": 0.3998, + "step": 3356 + }, + { + "epoch": 0.36991735537190085, + "grad_norm": 8.61453628540039, + "learning_rate": 7.078520809822118e-06, + "loss": 0.433, + "step": 3357 + }, + { + "epoch": 0.3700275482093664, + "grad_norm": 4.0356125831604, + "learning_rate": 7.076930478980865e-06, + "loss": 0.3755, + "step": 3358 + }, + { + "epoch": 0.37013774104683195, + "grad_norm": 12.406144142150879, + "learning_rate": 7.07533989416973e-06, + "loss": 0.4947, + "step": 3359 + }, + { + "epoch": 0.3702479338842975, + "grad_norm": 7.419103145599365, + "learning_rate": 7.0737490555832155e-06, + "loss": 0.4703, + "step": 3360 + }, + { + "epoch": 0.3703581267217631, + "grad_norm": 6.0058112144470215, + "learning_rate": 7.072157963415849e-06, + "loss": 0.4211, + "step": 3361 + }, + { + "epoch": 0.37046831955922865, + "grad_norm": 18.789775848388672, + "learning_rate": 7.070566617862192e-06, + "loss": 0.466, + "step": 3362 + }, + { + "epoch": 0.3705785123966942, + "grad_norm": 9.18820858001709, + "learning_rate": 7.068975019116836e-06, + "loss": 0.3948, + "step": 3363 + }, + { + "epoch": 0.37068870523415975, + "grad_norm": 6.847883701324463, + "learning_rate": 7.067383167374405e-06, + "loss": 0.4324, + "step": 3364 + }, + { + "epoch": 0.37079889807162536, + "grad_norm": 8.323562622070312, + "learning_rate": 7.065791062829552e-06, + "loss": 0.4534, + "step": 3365 + }, + { + "epoch": 0.3709090909090909, + "grad_norm": 5.818692684173584, + "learning_rate": 7.064198705676961e-06, + "loss": 0.4155, + "step": 3366 + }, + { + "epoch": 0.37101928374655646, + "grad_norm": 11.364190101623535, + "learning_rate": 7.0626060961113484e-06, + "loss": 0.5086, + "step": 3367 + }, + { + "epoch": 0.371129476584022, + "grad_norm": 7.733094692230225, + "learning_rate": 7.061013234327461e-06, + "loss": 0.4434, + "step": 3368 + }, + { + "epoch": 0.3712396694214876, + "grad_norm": 
5.380969047546387, + "learning_rate": 7.059420120520076e-06, + "loss": 0.4551, + "step": 3369 + }, + { + "epoch": 0.37134986225895317, + "grad_norm": 5.875665664672852, + "learning_rate": 7.057826754884001e-06, + "loss": 0.4119, + "step": 3370 + }, + { + "epoch": 0.3714600550964187, + "grad_norm": 8.532805442810059, + "learning_rate": 7.056233137614075e-06, + "loss": 0.4052, + "step": 3371 + }, + { + "epoch": 0.3715702479338843, + "grad_norm": 5.982562065124512, + "learning_rate": 7.054639268905168e-06, + "loss": 0.3746, + "step": 3372 + }, + { + "epoch": 0.3716804407713499, + "grad_norm": 11.36665153503418, + "learning_rate": 7.05304514895218e-06, + "loss": 0.5034, + "step": 3373 + }, + { + "epoch": 0.3717906336088154, + "grad_norm": 8.417113304138184, + "learning_rate": 7.051450777950042e-06, + "loss": 0.3379, + "step": 3374 + }, + { + "epoch": 0.371900826446281, + "grad_norm": 9.102882385253906, + "learning_rate": 7.049856156093717e-06, + "loss": 0.5195, + "step": 3375 + }, + { + "epoch": 0.3720110192837466, + "grad_norm": 7.243648052215576, + "learning_rate": 7.048261283578196e-06, + "loss": 0.4003, + "step": 3376 + }, + { + "epoch": 0.37212121212121213, + "grad_norm": 6.981441974639893, + "learning_rate": 7.046666160598504e-06, + "loss": 0.4477, + "step": 3377 + }, + { + "epoch": 0.3722314049586777, + "grad_norm": 7.513751983642578, + "learning_rate": 7.045070787349694e-06, + "loss": 0.4497, + "step": 3378 + }, + { + "epoch": 0.37234159779614323, + "grad_norm": 6.859706878662109, + "learning_rate": 7.043475164026848e-06, + "loss": 0.4278, + "step": 3379 + }, + { + "epoch": 0.37245179063360884, + "grad_norm": 7.869282245635986, + "learning_rate": 7.041879290825086e-06, + "loss": 0.5427, + "step": 3380 + }, + { + "epoch": 0.3725619834710744, + "grad_norm": 11.550929069519043, + "learning_rate": 7.040283167939548e-06, + "loss": 0.4197, + "step": 3381 + }, + { + "epoch": 0.37267217630853994, + "grad_norm": 6.503086090087891, + "learning_rate": 7.038686795565414e-06, + "loss": 0.5284, + "step": 3382 + }, + { + "epoch": 0.3727823691460055, + "grad_norm": 4.888057231903076, + "learning_rate": 7.037090173897889e-06, + "loss": 0.392, + "step": 3383 + }, + { + "epoch": 0.3728925619834711, + "grad_norm": 7.019505023956299, + "learning_rate": 7.035493303132211e-06, + "loss": 0.3921, + "step": 3384 + }, + { + "epoch": 0.37300275482093664, + "grad_norm": 4.9363203048706055, + "learning_rate": 7.033896183463648e-06, + "loss": 0.4847, + "step": 3385 + }, + { + "epoch": 0.3731129476584022, + "grad_norm": 7.932403564453125, + "learning_rate": 7.032298815087495e-06, + "loss": 0.4259, + "step": 3386 + }, + { + "epoch": 0.37322314049586774, + "grad_norm": 5.2227091789245605, + "learning_rate": 7.030701198199081e-06, + "loss": 0.3965, + "step": 3387 + }, + { + "epoch": 0.37333333333333335, + "grad_norm": 7.936646938323975, + "learning_rate": 7.0291033329937695e-06, + "loss": 0.438, + "step": 3388 + }, + { + "epoch": 0.3734435261707989, + "grad_norm": 5.427149295806885, + "learning_rate": 7.027505219666945e-06, + "loss": 0.4556, + "step": 3389 + }, + { + "epoch": 0.37355371900826445, + "grad_norm": 5.356679439544678, + "learning_rate": 7.025906858414028e-06, + "loss": 0.3175, + "step": 3390 + }, + { + "epoch": 0.37366391184573, + "grad_norm": 5.085716247558594, + "learning_rate": 7.024308249430467e-06, + "loss": 0.4417, + "step": 3391 + }, + { + "epoch": 0.3737741046831956, + "grad_norm": 5.447735786437988, + "learning_rate": 7.022709392911745e-06, + "loss": 0.4662, + "step": 3392 + }, + { + "epoch": 
0.37388429752066116, + "grad_norm": 8.673369407653809, + "learning_rate": 7.0211102890533715e-06, + "loss": 0.4313, + "step": 3393 + }, + { + "epoch": 0.3739944903581267, + "grad_norm": 5.703742504119873, + "learning_rate": 7.019510938050884e-06, + "loss": 0.4111, + "step": 3394 + }, + { + "epoch": 0.3741046831955923, + "grad_norm": 5.098775863647461, + "learning_rate": 7.017911340099858e-06, + "loss": 0.399, + "step": 3395 + }, + { + "epoch": 0.37421487603305786, + "grad_norm": 5.0954484939575195, + "learning_rate": 7.016311495395891e-06, + "loss": 0.4055, + "step": 3396 + }, + { + "epoch": 0.3743250688705234, + "grad_norm": 8.749516487121582, + "learning_rate": 7.014711404134616e-06, + "loss": 0.3872, + "step": 3397 + }, + { + "epoch": 0.37443526170798896, + "grad_norm": 10.231471061706543, + "learning_rate": 7.013111066511694e-06, + "loss": 0.4641, + "step": 3398 + }, + { + "epoch": 0.37454545454545457, + "grad_norm": 13.29430866241455, + "learning_rate": 7.011510482722817e-06, + "loss": 0.4989, + "step": 3399 + }, + { + "epoch": 0.3746556473829201, + "grad_norm": 7.456292152404785, + "learning_rate": 7.009909652963704e-06, + "loss": 0.5007, + "step": 3400 + }, + { + "epoch": 0.37476584022038567, + "grad_norm": 5.273218154907227, + "learning_rate": 7.0083085774301085e-06, + "loss": 0.2961, + "step": 3401 + }, + { + "epoch": 0.3748760330578512, + "grad_norm": 7.514196395874023, + "learning_rate": 7.006707256317813e-06, + "loss": 0.4883, + "step": 3402 + }, + { + "epoch": 0.3749862258953168, + "grad_norm": 8.081720352172852, + "learning_rate": 7.005105689822629e-06, + "loss": 0.5013, + "step": 3403 + }, + { + "epoch": 0.3750964187327824, + "grad_norm": 12.821732521057129, + "learning_rate": 7.003503878140396e-06, + "loss": 0.4893, + "step": 3404 + }, + { + "epoch": 0.3752066115702479, + "grad_norm": 6.560337066650391, + "learning_rate": 7.001901821466988e-06, + "loss": 0.4052, + "step": 3405 + }, + { + "epoch": 0.3753168044077135, + "grad_norm": 9.20208740234375, + "learning_rate": 7.000299519998307e-06, + "loss": 0.4203, + "step": 3406 + }, + { + "epoch": 0.3754269972451791, + "grad_norm": 4.343939304351807, + "learning_rate": 6.998696973930282e-06, + "loss": 0.4376, + "step": 3407 + }, + { + "epoch": 0.37553719008264463, + "grad_norm": 3.986074209213257, + "learning_rate": 6.997094183458877e-06, + "loss": 0.409, + "step": 3408 + }, + { + "epoch": 0.3756473829201102, + "grad_norm": 7.913437366485596, + "learning_rate": 6.995491148780082e-06, + "loss": 0.426, + "step": 3409 + }, + { + "epoch": 0.37575757575757573, + "grad_norm": 9.18805980682373, + "learning_rate": 6.993887870089918e-06, + "loss": 0.4564, + "step": 3410 + }, + { + "epoch": 0.37586776859504134, + "grad_norm": 4.529977321624756, + "learning_rate": 6.992284347584438e-06, + "loss": 0.4254, + "step": 3411 + }, + { + "epoch": 0.3759779614325069, + "grad_norm": 6.054936408996582, + "learning_rate": 6.990680581459721e-06, + "loss": 0.4183, + "step": 3412 + }, + { + "epoch": 0.37608815426997244, + "grad_norm": 9.715087890625, + "learning_rate": 6.9890765719118805e-06, + "loss": 0.4464, + "step": 3413 + }, + { + "epoch": 0.376198347107438, + "grad_norm": 5.770445346832275, + "learning_rate": 6.987472319137052e-06, + "loss": 0.3828, + "step": 3414 + }, + { + "epoch": 0.3763085399449036, + "grad_norm": 8.281036376953125, + "learning_rate": 6.9858678233314094e-06, + "loss": 0.4281, + "step": 3415 + }, + { + "epoch": 0.37641873278236915, + "grad_norm": 4.728235244750977, + "learning_rate": 6.984263084691153e-06, + "loss": 0.4528, + 
"step": 3416 + }, + { + "epoch": 0.3765289256198347, + "grad_norm": 4.969788551330566, + "learning_rate": 6.98265810341251e-06, + "loss": 0.4187, + "step": 3417 + }, + { + "epoch": 0.37663911845730025, + "grad_norm": 6.069950103759766, + "learning_rate": 6.981052879691742e-06, + "loss": 0.3874, + "step": 3418 + }, + { + "epoch": 0.37674931129476585, + "grad_norm": 5.942756175994873, + "learning_rate": 6.979447413725136e-06, + "loss": 0.4353, + "step": 3419 + }, + { + "epoch": 0.3768595041322314, + "grad_norm": 10.28392505645752, + "learning_rate": 6.977841705709012e-06, + "loss": 0.4416, + "step": 3420 + }, + { + "epoch": 0.37696969696969695, + "grad_norm": 8.235113143920898, + "learning_rate": 6.9762357558397176e-06, + "loss": 0.4165, + "step": 3421 + }, + { + "epoch": 0.37707988980716256, + "grad_norm": 5.41470193862915, + "learning_rate": 6.974629564313629e-06, + "loss": 0.3751, + "step": 3422 + }, + { + "epoch": 0.3771900826446281, + "grad_norm": 6.02674674987793, + "learning_rate": 6.9730231313271565e-06, + "loss": 0.4191, + "step": 3423 + }, + { + "epoch": 0.37730027548209366, + "grad_norm": 8.15639591217041, + "learning_rate": 6.971416457076736e-06, + "loss": 0.4016, + "step": 3424 + }, + { + "epoch": 0.3774104683195592, + "grad_norm": 8.789399147033691, + "learning_rate": 6.969809541758832e-06, + "loss": 0.5495, + "step": 3425 + }, + { + "epoch": 0.3775206611570248, + "grad_norm": 5.830241680145264, + "learning_rate": 6.968202385569942e-06, + "loss": 0.4577, + "step": 3426 + }, + { + "epoch": 0.37763085399449037, + "grad_norm": 8.189421653747559, + "learning_rate": 6.966594988706591e-06, + "loss": 0.4584, + "step": 3427 + }, + { + "epoch": 0.3777410468319559, + "grad_norm": 5.825428009033203, + "learning_rate": 6.964987351365332e-06, + "loss": 0.4046, + "step": 3428 + }, + { + "epoch": 0.37785123966942147, + "grad_norm": 4.946706771850586, + "learning_rate": 6.963379473742752e-06, + "loss": 0.3461, + "step": 3429 + }, + { + "epoch": 0.3779614325068871, + "grad_norm": 6.065890789031982, + "learning_rate": 6.961771356035462e-06, + "loss": 0.4715, + "step": 3430 + }, + { + "epoch": 0.3780716253443526, + "grad_norm": 6.467423439025879, + "learning_rate": 6.960162998440108e-06, + "loss": 0.4751, + "step": 3431 + }, + { + "epoch": 0.3781818181818182, + "grad_norm": 11.82904052734375, + "learning_rate": 6.958554401153357e-06, + "loss": 0.4042, + "step": 3432 + }, + { + "epoch": 0.3782920110192837, + "grad_norm": 6.656799793243408, + "learning_rate": 6.956945564371915e-06, + "loss": 0.4459, + "step": 3433 + }, + { + "epoch": 0.37840220385674933, + "grad_norm": 5.858724117279053, + "learning_rate": 6.955336488292511e-06, + "loss": 0.4509, + "step": 3434 + }, + { + "epoch": 0.3785123966942149, + "grad_norm": 4.343075752258301, + "learning_rate": 6.9537271731119034e-06, + "loss": 0.3628, + "step": 3435 + }, + { + "epoch": 0.37862258953168043, + "grad_norm": 5.194216251373291, + "learning_rate": 6.952117619026886e-06, + "loss": 0.4156, + "step": 3436 + }, + { + "epoch": 0.378732782369146, + "grad_norm": 6.092804908752441, + "learning_rate": 6.9505078262342715e-06, + "loss": 0.4287, + "step": 3437 + }, + { + "epoch": 0.3788429752066116, + "grad_norm": 7.648767471313477, + "learning_rate": 6.948897794930914e-06, + "loss": 0.4498, + "step": 3438 + }, + { + "epoch": 0.37895316804407714, + "grad_norm": 6.104161739349365, + "learning_rate": 6.947287525313685e-06, + "loss": 0.4257, + "step": 3439 + }, + { + "epoch": 0.3790633608815427, + "grad_norm": 5.8925580978393555, + "learning_rate": 
6.945677017579491e-06, + "loss": 0.4253, + "step": 3440 + }, + { + "epoch": 0.37917355371900824, + "grad_norm": 7.46693754196167, + "learning_rate": 6.94406627192527e-06, + "loss": 0.3619, + "step": 3441 + }, + { + "epoch": 0.37928374655647384, + "grad_norm": 10.42000961303711, + "learning_rate": 6.942455288547984e-06, + "loss": 0.4174, + "step": 3442 + }, + { + "epoch": 0.3793939393939394, + "grad_norm": 6.869128227233887, + "learning_rate": 6.940844067644626e-06, + "loss": 0.4107, + "step": 3443 + }, + { + "epoch": 0.37950413223140494, + "grad_norm": 5.960266590118408, + "learning_rate": 6.939232609412221e-06, + "loss": 0.4108, + "step": 3444 + }, + { + "epoch": 0.37961432506887055, + "grad_norm": 6.284941673278809, + "learning_rate": 6.937620914047818e-06, + "loss": 0.4197, + "step": 3445 + }, + { + "epoch": 0.3797245179063361, + "grad_norm": 8.592612266540527, + "learning_rate": 6.936008981748496e-06, + "loss": 0.4845, + "step": 3446 + }, + { + "epoch": 0.37983471074380165, + "grad_norm": 9.778707504272461, + "learning_rate": 6.934396812711367e-06, + "loss": 0.4354, + "step": 3447 + }, + { + "epoch": 0.3799449035812672, + "grad_norm": 8.552468299865723, + "learning_rate": 6.9327844071335684e-06, + "loss": 0.5189, + "step": 3448 + }, + { + "epoch": 0.3800550964187328, + "grad_norm": 5.014615058898926, + "learning_rate": 6.931171765212267e-06, + "loss": 0.4423, + "step": 3449 + }, + { + "epoch": 0.38016528925619836, + "grad_norm": 10.290054321289062, + "learning_rate": 6.929558887144657e-06, + "loss": 0.4075, + "step": 3450 + }, + { + "epoch": 0.3802754820936639, + "grad_norm": 9.373320579528809, + "learning_rate": 6.927945773127967e-06, + "loss": 0.5347, + "step": 3451 + }, + { + "epoch": 0.38038567493112946, + "grad_norm": 4.781619548797607, + "learning_rate": 6.92633242335945e-06, + "loss": 0.3897, + "step": 3452 + }, + { + "epoch": 0.38049586776859506, + "grad_norm": 7.904768466949463, + "learning_rate": 6.924718838036385e-06, + "loss": 0.4536, + "step": 3453 + }, + { + "epoch": 0.3806060606060606, + "grad_norm": 5.603475570678711, + "learning_rate": 6.923105017356087e-06, + "loss": 0.4637, + "step": 3454 + }, + { + "epoch": 0.38071625344352616, + "grad_norm": 7.653232574462891, + "learning_rate": 6.921490961515897e-06, + "loss": 0.4554, + "step": 3455 + }, + { + "epoch": 0.3808264462809917, + "grad_norm": 10.0427827835083, + "learning_rate": 6.91987667071318e-06, + "loss": 0.4982, + "step": 3456 + }, + { + "epoch": 0.3809366391184573, + "grad_norm": 10.740218162536621, + "learning_rate": 6.918262145145336e-06, + "loss": 0.482, + "step": 3457 + }, + { + "epoch": 0.38104683195592287, + "grad_norm": 5.421781063079834, + "learning_rate": 6.916647385009791e-06, + "loss": 0.384, + "step": 3458 + }, + { + "epoch": 0.3811570247933884, + "grad_norm": 6.068757057189941, + "learning_rate": 6.915032390504003e-06, + "loss": 0.4189, + "step": 3459 + }, + { + "epoch": 0.38126721763085397, + "grad_norm": 7.270913600921631, + "learning_rate": 6.913417161825449e-06, + "loss": 0.3768, + "step": 3460 + }, + { + "epoch": 0.3813774104683196, + "grad_norm": 5.31407356262207, + "learning_rate": 6.911801699171648e-06, + "loss": 0.4006, + "step": 3461 + }, + { + "epoch": 0.38148760330578513, + "grad_norm": 4.115329742431641, + "learning_rate": 6.9101860027401376e-06, + "loss": 0.4165, + "step": 3462 + }, + { + "epoch": 0.3815977961432507, + "grad_norm": 9.908076286315918, + "learning_rate": 6.908570072728487e-06, + "loss": 0.4513, + "step": 3463 + }, + { + "epoch": 0.38170798898071623, + "grad_norm": 
6.62583589553833, + "learning_rate": 6.906953909334297e-06, + "loss": 0.396, + "step": 3464 + }, + { + "epoch": 0.38181818181818183, + "grad_norm": 9.345455169677734, + "learning_rate": 6.905337512755191e-06, + "loss": 0.4403, + "step": 3465 + }, + { + "epoch": 0.3819283746556474, + "grad_norm": 4.885265827178955, + "learning_rate": 6.903720883188827e-06, + "loss": 0.429, + "step": 3466 + }, + { + "epoch": 0.38203856749311293, + "grad_norm": 9.006327629089355, + "learning_rate": 6.9021040208328885e-06, + "loss": 0.4824, + "step": 3467 + }, + { + "epoch": 0.3821487603305785, + "grad_norm": 9.241366386413574, + "learning_rate": 6.9004869258850835e-06, + "loss": 0.4928, + "step": 3468 + }, + { + "epoch": 0.3822589531680441, + "grad_norm": 6.628201484680176, + "learning_rate": 6.898869598543158e-06, + "loss": 0.4649, + "step": 3469 + }, + { + "epoch": 0.38236914600550964, + "grad_norm": 7.430251121520996, + "learning_rate": 6.897252039004879e-06, + "loss": 0.4449, + "step": 3470 + }, + { + "epoch": 0.3824793388429752, + "grad_norm": 7.064650058746338, + "learning_rate": 6.8956342474680415e-06, + "loss": 0.4136, + "step": 3471 + }, + { + "epoch": 0.3825895316804408, + "grad_norm": 9.308135032653809, + "learning_rate": 6.894016224130475e-06, + "loss": 0.4978, + "step": 3472 + }, + { + "epoch": 0.38269972451790635, + "grad_norm": 10.293804168701172, + "learning_rate": 6.892397969190031e-06, + "loss": 0.3802, + "step": 3473 + }, + { + "epoch": 0.3828099173553719, + "grad_norm": 5.387298107147217, + "learning_rate": 6.890779482844592e-06, + "loss": 0.3775, + "step": 3474 + }, + { + "epoch": 0.38292011019283745, + "grad_norm": 8.136940956115723, + "learning_rate": 6.889160765292071e-06, + "loss": 0.4118, + "step": 3475 + }, + { + "epoch": 0.38303030303030305, + "grad_norm": 6.427221775054932, + "learning_rate": 6.887541816730406e-06, + "loss": 0.4748, + "step": 3476 + }, + { + "epoch": 0.3831404958677686, + "grad_norm": 7.789750576019287, + "learning_rate": 6.8859226373575625e-06, + "loss": 0.399, + "step": 3477 + }, + { + "epoch": 0.38325068870523415, + "grad_norm": 4.99204158782959, + "learning_rate": 6.884303227371536e-06, + "loss": 0.3875, + "step": 3478 + }, + { + "epoch": 0.3833608815426997, + "grad_norm": 6.423731327056885, + "learning_rate": 6.882683586970352e-06, + "loss": 0.4032, + "step": 3479 + }, + { + "epoch": 0.3834710743801653, + "grad_norm": 5.279046535491943, + "learning_rate": 6.8810637163520635e-06, + "loss": 0.3844, + "step": 3480 + }, + { + "epoch": 0.38358126721763086, + "grad_norm": 8.075996398925781, + "learning_rate": 6.879443615714746e-06, + "loss": 0.4386, + "step": 3481 + }, + { + "epoch": 0.3836914600550964, + "grad_norm": 5.527756690979004, + "learning_rate": 6.877823285256512e-06, + "loss": 0.3602, + "step": 3482 + }, + { + "epoch": 0.38380165289256196, + "grad_norm": 3.9500184059143066, + "learning_rate": 6.876202725175495e-06, + "loss": 0.361, + "step": 3483 + }, + { + "epoch": 0.38391184573002757, + "grad_norm": 4.0957489013671875, + "learning_rate": 6.8745819356698595e-06, + "loss": 0.4082, + "step": 3484 + }, + { + "epoch": 0.3840220385674931, + "grad_norm": 7.210476875305176, + "learning_rate": 6.8729609169377995e-06, + "loss": 0.5056, + "step": 3485 + }, + { + "epoch": 0.38413223140495867, + "grad_norm": 8.166796684265137, + "learning_rate": 6.871339669177535e-06, + "loss": 0.4193, + "step": 3486 + }, + { + "epoch": 0.3842424242424242, + "grad_norm": 10.03830337524414, + "learning_rate": 6.869718192587313e-06, + "loss": 0.5178, + "step": 3487 + }, + { + 
"epoch": 0.3843526170798898, + "grad_norm": 4.627344131469727, + "learning_rate": 6.868096487365411e-06, + "loss": 0.4505, + "step": 3488 + }, + { + "epoch": 0.3844628099173554, + "grad_norm": 4.49107551574707, + "learning_rate": 6.866474553710132e-06, + "loss": 0.4003, + "step": 3489 + }, + { + "epoch": 0.3845730027548209, + "grad_norm": 5.724297523498535, + "learning_rate": 6.864852391819812e-06, + "loss": 0.3477, + "step": 3490 + }, + { + "epoch": 0.3846831955922865, + "grad_norm": 6.456491947174072, + "learning_rate": 6.8632300018928046e-06, + "loss": 0.4606, + "step": 3491 + }, + { + "epoch": 0.3847933884297521, + "grad_norm": 7.052715301513672, + "learning_rate": 6.861607384127504e-06, + "loss": 0.3989, + "step": 3492 + }, + { + "epoch": 0.38490358126721763, + "grad_norm": 5.147886753082275, + "learning_rate": 6.859984538722322e-06, + "loss": 0.4748, + "step": 3493 + }, + { + "epoch": 0.3850137741046832, + "grad_norm": 7.9086594581604, + "learning_rate": 6.8583614658757056e-06, + "loss": 0.4413, + "step": 3494 + }, + { + "epoch": 0.3851239669421488, + "grad_norm": 7.838868618011475, + "learning_rate": 6.8567381657861255e-06, + "loss": 0.4008, + "step": 3495 + }, + { + "epoch": 0.38523415977961434, + "grad_norm": 8.721397399902344, + "learning_rate": 6.855114638652079e-06, + "loss": 0.4771, + "step": 3496 + }, + { + "epoch": 0.3853443526170799, + "grad_norm": 12.337827682495117, + "learning_rate": 6.853490884672094e-06, + "loss": 0.454, + "step": 3497 + }, + { + "epoch": 0.38545454545454544, + "grad_norm": 7.4478840827941895, + "learning_rate": 6.851866904044727e-06, + "loss": 0.3743, + "step": 3498 + }, + { + "epoch": 0.38556473829201104, + "grad_norm": 12.162842750549316, + "learning_rate": 6.850242696968558e-06, + "loss": 0.5879, + "step": 3499 + }, + { + "epoch": 0.3856749311294766, + "grad_norm": 5.296576499938965, + "learning_rate": 6.848618263642201e-06, + "loss": 0.3823, + "step": 3500 + }, + { + "epoch": 0.38578512396694215, + "grad_norm": 7.909220218658447, + "learning_rate": 6.84699360426429e-06, + "loss": 0.3704, + "step": 3501 + }, + { + "epoch": 0.3858953168044077, + "grad_norm": 7.7088236808776855, + "learning_rate": 6.845368719033493e-06, + "loss": 0.4408, + "step": 3502 + }, + { + "epoch": 0.3860055096418733, + "grad_norm": 5.983971118927002, + "learning_rate": 6.843743608148502e-06, + "loss": 0.3819, + "step": 3503 + }, + { + "epoch": 0.38611570247933885, + "grad_norm": 5.924647331237793, + "learning_rate": 6.842118271808038e-06, + "loss": 0.45, + "step": 3504 + }, + { + "epoch": 0.3862258953168044, + "grad_norm": 6.25740909576416, + "learning_rate": 6.84049271021085e-06, + "loss": 0.3646, + "step": 3505 + }, + { + "epoch": 0.38633608815426995, + "grad_norm": 5.766270637512207, + "learning_rate": 6.838866923555712e-06, + "loss": 0.4103, + "step": 3506 + }, + { + "epoch": 0.38644628099173556, + "grad_norm": 8.451875686645508, + "learning_rate": 6.83724091204143e-06, + "loss": 0.5397, + "step": 3507 + }, + { + "epoch": 0.3865564738292011, + "grad_norm": 8.257237434387207, + "learning_rate": 6.835614675866834e-06, + "loss": 0.4081, + "step": 3508 + }, + { + "epoch": 0.38666666666666666, + "grad_norm": 7.948094367980957, + "learning_rate": 6.83398821523078e-06, + "loss": 0.4358, + "step": 3509 + }, + { + "epoch": 0.3867768595041322, + "grad_norm": 5.9864091873168945, + "learning_rate": 6.832361530332158e-06, + "loss": 0.3864, + "step": 3510 + }, + { + "epoch": 0.3868870523415978, + "grad_norm": 6.935376167297363, + "learning_rate": 6.830734621369878e-06, + "loss": 
0.3885, + "step": 3511 + }, + { + "epoch": 0.38699724517906336, + "grad_norm": 8.323222160339355, + "learning_rate": 6.829107488542881e-06, + "loss": 0.4655, + "step": 3512 + }, + { + "epoch": 0.3871074380165289, + "grad_norm": 6.112347602844238, + "learning_rate": 6.827480132050137e-06, + "loss": 0.4251, + "step": 3513 + }, + { + "epoch": 0.38721763085399447, + "grad_norm": 5.319746494293213, + "learning_rate": 6.825852552090639e-06, + "loss": 0.4231, + "step": 3514 + }, + { + "epoch": 0.38732782369146007, + "grad_norm": 11.083109855651855, + "learning_rate": 6.824224748863411e-06, + "loss": 0.4576, + "step": 3515 + }, + { + "epoch": 0.3874380165289256, + "grad_norm": 5.660933494567871, + "learning_rate": 6.822596722567504e-06, + "loss": 0.3755, + "step": 3516 + }, + { + "epoch": 0.38754820936639117, + "grad_norm": 9.598067283630371, + "learning_rate": 6.820968473401992e-06, + "loss": 0.4294, + "step": 3517 + }, + { + "epoch": 0.3876584022038567, + "grad_norm": 12.287252426147461, + "learning_rate": 6.819340001565984e-06, + "loss": 0.5338, + "step": 3518 + }, + { + "epoch": 0.38776859504132233, + "grad_norm": 6.616037845611572, + "learning_rate": 6.817711307258608e-06, + "loss": 0.4248, + "step": 3519 + }, + { + "epoch": 0.3878787878787879, + "grad_norm": 5.660268783569336, + "learning_rate": 6.816082390679023e-06, + "loss": 0.3798, + "step": 3520 + }, + { + "epoch": 0.38798898071625343, + "grad_norm": 16.88709259033203, + "learning_rate": 6.814453252026417e-06, + "loss": 0.4722, + "step": 3521 + }, + { + "epoch": 0.38809917355371903, + "grad_norm": 7.495926380157471, + "learning_rate": 6.812823891500004e-06, + "loss": 0.395, + "step": 3522 + }, + { + "epoch": 0.3882093663911846, + "grad_norm": 8.984149932861328, + "learning_rate": 6.811194309299023e-06, + "loss": 0.5225, + "step": 3523 + }, + { + "epoch": 0.38831955922865014, + "grad_norm": 6.24255895614624, + "learning_rate": 6.80956450562274e-06, + "loss": 0.3885, + "step": 3524 + }, + { + "epoch": 0.3884297520661157, + "grad_norm": 5.3379597663879395, + "learning_rate": 6.807934480670451e-06, + "loss": 0.5105, + "step": 3525 + }, + { + "epoch": 0.3885399449035813, + "grad_norm": 6.046708583831787, + "learning_rate": 6.8063042346414795e-06, + "loss": 0.4401, + "step": 3526 + }, + { + "epoch": 0.38865013774104684, + "grad_norm": 6.267719745635986, + "learning_rate": 6.8046737677351726e-06, + "loss": 0.4637, + "step": 3527 + }, + { + "epoch": 0.3887603305785124, + "grad_norm": 5.851125717163086, + "learning_rate": 6.803043080150905e-06, + "loss": 0.4352, + "step": 3528 + }, + { + "epoch": 0.38887052341597794, + "grad_norm": 4.543891906738281, + "learning_rate": 6.801412172088081e-06, + "loss": 0.3847, + "step": 3529 + }, + { + "epoch": 0.38898071625344355, + "grad_norm": 8.075711250305176, + "learning_rate": 6.799781043746129e-06, + "loss": 0.3763, + "step": 3530 + }, + { + "epoch": 0.3890909090909091, + "grad_norm": 7.912997245788574, + "learning_rate": 6.7981496953245065e-06, + "loss": 0.428, + "step": 3531 + }, + { + "epoch": 0.38920110192837465, + "grad_norm": 7.018909454345703, + "learning_rate": 6.7965181270226965e-06, + "loss": 0.3968, + "step": 3532 + }, + { + "epoch": 0.3893112947658402, + "grad_norm": 7.459453105926514, + "learning_rate": 6.79488633904021e-06, + "loss": 0.4546, + "step": 3533 + }, + { + "epoch": 0.3894214876033058, + "grad_norm": 5.769484519958496, + "learning_rate": 6.793254331576583e-06, + "loss": 0.3698, + "step": 3534 + }, + { + "epoch": 0.38953168044077136, + "grad_norm": 5.911960124969482, + 
"learning_rate": 6.7916221048313815e-06, + "loss": 0.4182, + "step": 3535 + }, + { + "epoch": 0.3896418732782369, + "grad_norm": 6.263228416442871, + "learning_rate": 6.7899896590041954e-06, + "loss": 0.3739, + "step": 3536 + }, + { + "epoch": 0.38975206611570246, + "grad_norm": 11.059060096740723, + "learning_rate": 6.788356994294642e-06, + "loss": 0.4189, + "step": 3537 + }, + { + "epoch": 0.38986225895316806, + "grad_norm": 8.340376853942871, + "learning_rate": 6.7867241109023656e-06, + "loss": 0.3487, + "step": 3538 + }, + { + "epoch": 0.3899724517906336, + "grad_norm": 10.862907409667969, + "learning_rate": 6.7850910090270385e-06, + "loss": 0.5064, + "step": 3539 + }, + { + "epoch": 0.39008264462809916, + "grad_norm": 11.00351619720459, + "learning_rate": 6.783457688868356e-06, + "loss": 0.5565, + "step": 3540 + }, + { + "epoch": 0.3901928374655647, + "grad_norm": 6.101572036743164, + "learning_rate": 6.7818241506260486e-06, + "loss": 0.4638, + "step": 3541 + }, + { + "epoch": 0.3903030303030303, + "grad_norm": 5.2540459632873535, + "learning_rate": 6.78019039449986e-06, + "loss": 0.4955, + "step": 3542 + }, + { + "epoch": 0.39041322314049587, + "grad_norm": 4.800851345062256, + "learning_rate": 6.778556420689573e-06, + "loss": 0.4166, + "step": 3543 + }, + { + "epoch": 0.3905234159779614, + "grad_norm": 4.908843517303467, + "learning_rate": 6.776922229394992e-06, + "loss": 0.4068, + "step": 3544 + }, + { + "epoch": 0.390633608815427, + "grad_norm": 4.958993434906006, + "learning_rate": 6.775287820815946e-06, + "loss": 0.4352, + "step": 3545 + }, + { + "epoch": 0.3907438016528926, + "grad_norm": 5.4108076095581055, + "learning_rate": 6.7736531951522955e-06, + "loss": 0.4019, + "step": 3546 + }, + { + "epoch": 0.3908539944903581, + "grad_norm": 9.655213356018066, + "learning_rate": 6.772018352603922e-06, + "loss": 0.4683, + "step": 3547 + }, + { + "epoch": 0.3909641873278237, + "grad_norm": 10.1361665725708, + "learning_rate": 6.770383293370734e-06, + "loss": 0.4672, + "step": 3548 + }, + { + "epoch": 0.3910743801652893, + "grad_norm": 4.456143856048584, + "learning_rate": 6.768748017652676e-06, + "loss": 0.3904, + "step": 3549 + }, + { + "epoch": 0.39118457300275483, + "grad_norm": 8.810110092163086, + "learning_rate": 6.7671125256497086e-06, + "loss": 0.483, + "step": 3550 + }, + { + "epoch": 0.3912947658402204, + "grad_norm": 6.227502346038818, + "learning_rate": 6.765476817561819e-06, + "loss": 0.4536, + "step": 3551 + }, + { + "epoch": 0.39140495867768593, + "grad_norm": 7.060149192810059, + "learning_rate": 6.763840893589025e-06, + "loss": 0.453, + "step": 3552 + }, + { + "epoch": 0.39151515151515154, + "grad_norm": 5.1962714195251465, + "learning_rate": 6.762204753931373e-06, + "loss": 0.4527, + "step": 3553 + }, + { + "epoch": 0.3916253443526171, + "grad_norm": 7.3671441078186035, + "learning_rate": 6.760568398788929e-06, + "loss": 0.4007, + "step": 3554 + }, + { + "epoch": 0.39173553719008264, + "grad_norm": 5.331968784332275, + "learning_rate": 6.75893182836179e-06, + "loss": 0.4689, + "step": 3555 + }, + { + "epoch": 0.3918457300275482, + "grad_norm": 4.352341651916504, + "learning_rate": 6.757295042850077e-06, + "loss": 0.4117, + "step": 3556 + }, + { + "epoch": 0.3919559228650138, + "grad_norm": 6.480555057525635, + "learning_rate": 6.75565804245394e-06, + "loss": 0.416, + "step": 3557 + }, + { + "epoch": 0.39206611570247935, + "grad_norm": 6.7451653480529785, + "learning_rate": 6.754020827373551e-06, + "loss": 0.3993, + "step": 3558 + }, + { + "epoch": 0.3921763085399449, 
+ "grad_norm": 7.922951698303223, + "learning_rate": 6.752383397809114e-06, + "loss": 0.4779, + "step": 3559 + }, + { + "epoch": 0.39228650137741045, + "grad_norm": 8.732173919677734, + "learning_rate": 6.750745753960855e-06, + "loss": 0.4077, + "step": 3560 + }, + { + "epoch": 0.39239669421487605, + "grad_norm": 10.558531761169434, + "learning_rate": 6.749107896029027e-06, + "loss": 0.4473, + "step": 3561 + }, + { + "epoch": 0.3925068870523416, + "grad_norm": 6.761999607086182, + "learning_rate": 6.747469824213909e-06, + "loss": 0.4593, + "step": 3562 + }, + { + "epoch": 0.39261707988980715, + "grad_norm": 6.093954563140869, + "learning_rate": 6.745831538715807e-06, + "loss": 0.4687, + "step": 3563 + }, + { + "epoch": 0.3927272727272727, + "grad_norm": 6.497241497039795, + "learning_rate": 6.744193039735054e-06, + "loss": 0.3646, + "step": 3564 + }, + { + "epoch": 0.3928374655647383, + "grad_norm": 5.9658613204956055, + "learning_rate": 6.742554327472006e-06, + "loss": 0.4484, + "step": 3565 + }, + { + "epoch": 0.39294765840220386, + "grad_norm": 5.079550266265869, + "learning_rate": 6.740915402127048e-06, + "loss": 0.4256, + "step": 3566 + }, + { + "epoch": 0.3930578512396694, + "grad_norm": 6.680603504180908, + "learning_rate": 6.739276263900591e-06, + "loss": 0.352, + "step": 3567 + }, + { + "epoch": 0.39316804407713496, + "grad_norm": 9.222851753234863, + "learning_rate": 6.737636912993067e-06, + "loss": 0.4939, + "step": 3568 + }, + { + "epoch": 0.39327823691460057, + "grad_norm": 5.447881698608398, + "learning_rate": 6.735997349604943e-06, + "loss": 0.4103, + "step": 3569 + }, + { + "epoch": 0.3933884297520661, + "grad_norm": 5.519157886505127, + "learning_rate": 6.734357573936705e-06, + "loss": 0.4654, + "step": 3570 + }, + { + "epoch": 0.39349862258953167, + "grad_norm": 6.907510757446289, + "learning_rate": 6.732717586188866e-06, + "loss": 0.3764, + "step": 3571 + }, + { + "epoch": 0.39360881542699727, + "grad_norm": 7.562971115112305, + "learning_rate": 6.731077386561968e-06, + "loss": 0.4075, + "step": 3572 + }, + { + "epoch": 0.3937190082644628, + "grad_norm": 10.390762329101562, + "learning_rate": 6.729436975256575e-06, + "loss": 0.4435, + "step": 3573 + }, + { + "epoch": 0.3938292011019284, + "grad_norm": 4.922039985656738, + "learning_rate": 6.727796352473279e-06, + "loss": 0.4382, + "step": 3574 + }, + { + "epoch": 0.3939393939393939, + "grad_norm": 6.189228057861328, + "learning_rate": 6.726155518412701e-06, + "loss": 0.3901, + "step": 3575 + }, + { + "epoch": 0.39404958677685953, + "grad_norm": 5.172933578491211, + "learning_rate": 6.724514473275477e-06, + "loss": 0.3882, + "step": 3576 + }, + { + "epoch": 0.3941597796143251, + "grad_norm": 11.044160842895508, + "learning_rate": 6.722873217262283e-06, + "loss": 0.3986, + "step": 3577 + }, + { + "epoch": 0.39426997245179063, + "grad_norm": 6.9071125984191895, + "learning_rate": 6.721231750573813e-06, + "loss": 0.4145, + "step": 3578 + }, + { + "epoch": 0.3943801652892562, + "grad_norm": 5.4316935539245605, + "learning_rate": 6.719590073410785e-06, + "loss": 0.3772, + "step": 3579 + }, + { + "epoch": 0.3944903581267218, + "grad_norm": 6.2734527587890625, + "learning_rate": 6.717948185973946e-06, + "loss": 0.4587, + "step": 3580 + }, + { + "epoch": 0.39460055096418734, + "grad_norm": 8.695782661437988, + "learning_rate": 6.71630608846407e-06, + "loss": 0.444, + "step": 3581 + }, + { + "epoch": 0.3947107438016529, + "grad_norm": 10.9119873046875, + "learning_rate": 6.714663781081956e-06, + "loss": 0.3928, + "step": 3582 + }, 
+ { + "epoch": 0.39482093663911844, + "grad_norm": 8.182262420654297, + "learning_rate": 6.713021264028423e-06, + "loss": 0.4551, + "step": 3583 + }, + { + "epoch": 0.39493112947658404, + "grad_norm": 8.505237579345703, + "learning_rate": 6.711378537504324e-06, + "loss": 0.4165, + "step": 3584 + }, + { + "epoch": 0.3950413223140496, + "grad_norm": 4.6055521965026855, + "learning_rate": 6.709735601710533e-06, + "loss": 0.3815, + "step": 3585 + }, + { + "epoch": 0.39515151515151514, + "grad_norm": 6.0570220947265625, + "learning_rate": 6.70809245684795e-06, + "loss": 0.4094, + "step": 3586 + }, + { + "epoch": 0.3952617079889807, + "grad_norm": 8.43146800994873, + "learning_rate": 6.7064491031175e-06, + "loss": 0.5455, + "step": 3587 + }, + { + "epoch": 0.3953719008264463, + "grad_norm": 13.617354393005371, + "learning_rate": 6.704805540720139e-06, + "loss": 0.4753, + "step": 3588 + }, + { + "epoch": 0.39548209366391185, + "grad_norm": 6.508190155029297, + "learning_rate": 6.703161769856837e-06, + "loss": 0.4257, + "step": 3589 + }, + { + "epoch": 0.3955922865013774, + "grad_norm": 6.021367073059082, + "learning_rate": 6.701517790728602e-06, + "loss": 0.4027, + "step": 3590 + }, + { + "epoch": 0.39570247933884295, + "grad_norm": 4.801262855529785, + "learning_rate": 6.699873603536459e-06, + "loss": 0.4364, + "step": 3591 + }, + { + "epoch": 0.39581267217630856, + "grad_norm": 5.468711853027344, + "learning_rate": 6.698229208481465e-06, + "loss": 0.4166, + "step": 3592 + }, + { + "epoch": 0.3959228650137741, + "grad_norm": 11.384525299072266, + "learning_rate": 6.696584605764694e-06, + "loss": 0.3336, + "step": 3593 + }, + { + "epoch": 0.39603305785123966, + "grad_norm": 8.580153465270996, + "learning_rate": 6.694939795587254e-06, + "loss": 0.5833, + "step": 3594 + }, + { + "epoch": 0.39614325068870526, + "grad_norm": 6.473851203918457, + "learning_rate": 6.693294778150276e-06, + "loss": 0.4507, + "step": 3595 + }, + { + "epoch": 0.3962534435261708, + "grad_norm": 6.248347759246826, + "learning_rate": 6.691649553654909e-06, + "loss": 0.3913, + "step": 3596 + }, + { + "epoch": 0.39636363636363636, + "grad_norm": 6.736072063446045, + "learning_rate": 6.690004122302337e-06, + "loss": 0.47, + "step": 3597 + }, + { + "epoch": 0.3964738292011019, + "grad_norm": 7.859832286834717, + "learning_rate": 6.688358484293765e-06, + "loss": 0.3682, + "step": 3598 + }, + { + "epoch": 0.3965840220385675, + "grad_norm": 8.565985679626465, + "learning_rate": 6.686712639830426e-06, + "loss": 0.4206, + "step": 3599 + }, + { + "epoch": 0.39669421487603307, + "grad_norm": 5.582574367523193, + "learning_rate": 6.685066589113573e-06, + "loss": 0.3417, + "step": 3600 + }, + { + "epoch": 0.3968044077134986, + "grad_norm": 6.459357738494873, + "learning_rate": 6.683420332344489e-06, + "loss": 0.3929, + "step": 3601 + }, + { + "epoch": 0.39691460055096417, + "grad_norm": 9.563920974731445, + "learning_rate": 6.68177386972448e-06, + "loss": 0.3584, + "step": 3602 + }, + { + "epoch": 0.3970247933884298, + "grad_norm": 6.491499423980713, + "learning_rate": 6.6801272014548775e-06, + "loss": 0.4232, + "step": 3603 + }, + { + "epoch": 0.3971349862258953, + "grad_norm": 8.638758659362793, + "learning_rate": 6.678480327737039e-06, + "loss": 0.4587, + "step": 3604 + }, + { + "epoch": 0.3972451790633609, + "grad_norm": 3.47772216796875, + "learning_rate": 6.6768332487723455e-06, + "loss": 0.3784, + "step": 3605 + }, + { + "epoch": 0.3973553719008264, + "grad_norm": 6.206127166748047, + "learning_rate": 6.6751859647622055e-06, + 
"loss": 0.3892, + "step": 3606 + }, + { + "epoch": 0.39746556473829203, + "grad_norm": 9.063240051269531, + "learning_rate": 6.6735384759080494e-06, + "loss": 0.4438, + "step": 3607 + }, + { + "epoch": 0.3975757575757576, + "grad_norm": 6.677006721496582, + "learning_rate": 6.6718907824113355e-06, + "loss": 0.4103, + "step": 3608 + }, + { + "epoch": 0.39768595041322313, + "grad_norm": 5.7193450927734375, + "learning_rate": 6.670242884473546e-06, + "loss": 0.4243, + "step": 3609 + }, + { + "epoch": 0.3977961432506887, + "grad_norm": 8.467764854431152, + "learning_rate": 6.668594782296187e-06, + "loss": 0.3908, + "step": 3610 + }, + { + "epoch": 0.3979063360881543, + "grad_norm": 5.584086894989014, + "learning_rate": 6.666946476080791e-06, + "loss": 0.4517, + "step": 3611 + }, + { + "epoch": 0.39801652892561984, + "grad_norm": 5.541923522949219, + "learning_rate": 6.665297966028918e-06, + "loss": 0.4184, + "step": 3612 + }, + { + "epoch": 0.3981267217630854, + "grad_norm": 13.841939926147461, + "learning_rate": 6.663649252342146e-06, + "loss": 0.5451, + "step": 3613 + }, + { + "epoch": 0.39823691460055094, + "grad_norm": 7.07025671005249, + "learning_rate": 6.662000335222083e-06, + "loss": 0.4008, + "step": 3614 + }, + { + "epoch": 0.39834710743801655, + "grad_norm": 13.290519714355469, + "learning_rate": 6.660351214870362e-06, + "loss": 0.4857, + "step": 3615 + }, + { + "epoch": 0.3984573002754821, + "grad_norm": 5.437166213989258, + "learning_rate": 6.658701891488639e-06, + "loss": 0.4092, + "step": 3616 + }, + { + "epoch": 0.39856749311294765, + "grad_norm": 6.242116451263428, + "learning_rate": 6.657052365278596e-06, + "loss": 0.4404, + "step": 3617 + }, + { + "epoch": 0.3986776859504132, + "grad_norm": 5.129985332489014, + "learning_rate": 6.655402636441937e-06, + "loss": 0.4212, + "step": 3618 + }, + { + "epoch": 0.3987878787878788, + "grad_norm": 5.419785976409912, + "learning_rate": 6.653752705180396e-06, + "loss": 0.3966, + "step": 3619 + }, + { + "epoch": 0.39889807162534435, + "grad_norm": 5.731557846069336, + "learning_rate": 6.652102571695729e-06, + "loss": 0.4348, + "step": 3620 + }, + { + "epoch": 0.3990082644628099, + "grad_norm": 5.96975564956665, + "learning_rate": 6.650452236189715e-06, + "loss": 0.4983, + "step": 3621 + }, + { + "epoch": 0.3991184573002755, + "grad_norm": 6.626498222351074, + "learning_rate": 6.648801698864159e-06, + "loss": 0.4551, + "step": 3622 + }, + { + "epoch": 0.39922865013774106, + "grad_norm": 7.082576274871826, + "learning_rate": 6.6471509599208935e-06, + "loss": 0.4722, + "step": 3623 + }, + { + "epoch": 0.3993388429752066, + "grad_norm": 10.073821067810059, + "learning_rate": 6.645500019561768e-06, + "loss": 0.4255, + "step": 3624 + }, + { + "epoch": 0.39944903581267216, + "grad_norm": 7.216056823730469, + "learning_rate": 6.643848877988668e-06, + "loss": 0.4229, + "step": 3625 + }, + { + "epoch": 0.39955922865013777, + "grad_norm": 6.034587860107422, + "learning_rate": 6.6421975354034915e-06, + "loss": 0.4199, + "step": 3626 + }, + { + "epoch": 0.3996694214876033, + "grad_norm": 6.705302715301514, + "learning_rate": 6.6405459920081715e-06, + "loss": 0.43, + "step": 3627 + }, + { + "epoch": 0.39977961432506887, + "grad_norm": 9.449960708618164, + "learning_rate": 6.638894248004659e-06, + "loss": 0.4597, + "step": 3628 + }, + { + "epoch": 0.3998898071625344, + "grad_norm": 5.67280912399292, + "learning_rate": 6.637242303594931e-06, + "loss": 0.3125, + "step": 3629 + }, + { + "epoch": 0.4, + "grad_norm": 9.817313194274902, + "learning_rate": 
6.63559015898099e-06, + "loss": 0.502, + "step": 3630 + }, + { + "epoch": 0.4001101928374656, + "grad_norm": 8.02699089050293, + "learning_rate": 6.633937814364864e-06, + "loss": 0.4656, + "step": 3631 + }, + { + "epoch": 0.4002203856749311, + "grad_norm": 4.558982849121094, + "learning_rate": 6.6322852699486e-06, + "loss": 0.4041, + "step": 3632 + }, + { + "epoch": 0.4002203856749311, + "eval_loss": 0.4264827072620392, + "eval_runtime": 41.9502, + "eval_samples_per_second": 17.497, + "eval_steps_per_second": 2.193, + "step": 3632 + }, + { + "epoch": 0.4003305785123967, + "grad_norm": 4.743308067321777, + "learning_rate": 6.630632525934277e-06, + "loss": 0.4498, + "step": 3633 + }, + { + "epoch": 0.4004407713498623, + "grad_norm": 5.383522033691406, + "learning_rate": 6.628979582523995e-06, + "loss": 0.4034, + "step": 3634 + }, + { + "epoch": 0.40055096418732783, + "grad_norm": 6.133452415466309, + "learning_rate": 6.627326439919875e-06, + "loss": 0.4272, + "step": 3635 + }, + { + "epoch": 0.4006611570247934, + "grad_norm": 8.080024719238281, + "learning_rate": 6.62567309832407e-06, + "loss": 0.4828, + "step": 3636 + }, + { + "epoch": 0.40077134986225893, + "grad_norm": 9.888001441955566, + "learning_rate": 6.624019557938749e-06, + "loss": 0.4499, + "step": 3637 + }, + { + "epoch": 0.40088154269972454, + "grad_norm": 5.559149742126465, + "learning_rate": 6.62236581896611e-06, + "loss": 0.3778, + "step": 3638 + }, + { + "epoch": 0.4009917355371901, + "grad_norm": 4.956784248352051, + "learning_rate": 6.620711881608375e-06, + "loss": 0.4253, + "step": 3639 + }, + { + "epoch": 0.40110192837465564, + "grad_norm": 8.00571346282959, + "learning_rate": 6.6190577460677894e-06, + "loss": 0.4107, + "step": 3640 + }, + { + "epoch": 0.4012121212121212, + "grad_norm": 10.162068367004395, + "learning_rate": 6.617403412546625e-06, + "loss": 0.514, + "step": 3641 + }, + { + "epoch": 0.4013223140495868, + "grad_norm": 3.738342761993408, + "learning_rate": 6.615748881247172e-06, + "loss": 0.3903, + "step": 3642 + }, + { + "epoch": 0.40143250688705234, + "grad_norm": 8.659130096435547, + "learning_rate": 6.6140941523717525e-06, + "loss": 0.4029, + "step": 3643 + }, + { + "epoch": 0.4015426997245179, + "grad_norm": 5.845111846923828, + "learning_rate": 6.6124392261227065e-06, + "loss": 0.4482, + "step": 3644 + }, + { + "epoch": 0.40165289256198344, + "grad_norm": 6.543630123138428, + "learning_rate": 6.6107841027024025e-06, + "loss": 0.4043, + "step": 3645 + }, + { + "epoch": 0.40176308539944905, + "grad_norm": 7.756674766540527, + "learning_rate": 6.60912878231323e-06, + "loss": 0.4155, + "step": 3646 + }, + { + "epoch": 0.4018732782369146, + "grad_norm": 5.7334980964660645, + "learning_rate": 6.607473265157604e-06, + "loss": 0.4492, + "step": 3647 + }, + { + "epoch": 0.40198347107438015, + "grad_norm": 4.604136943817139, + "learning_rate": 6.605817551437963e-06, + "loss": 0.4511, + "step": 3648 + }, + { + "epoch": 0.40209366391184576, + "grad_norm": 8.115760803222656, + "learning_rate": 6.604161641356772e-06, + "loss": 0.4525, + "step": 3649 + }, + { + "epoch": 0.4022038567493113, + "grad_norm": 5.188201904296875, + "learning_rate": 6.6025055351165155e-06, + "loss": 0.4432, + "step": 3650 + }, + { + "epoch": 0.40231404958677686, + "grad_norm": 9.08431339263916, + "learning_rate": 6.600849232919707e-06, + "loss": 0.4567, + "step": 3651 + }, + { + "epoch": 0.4024242424242424, + "grad_norm": 4.568078994750977, + "learning_rate": 6.599192734968878e-06, + "loss": 0.4409, + "step": 3652 + }, + { + "epoch": 
0.402534435261708, + "grad_norm": 6.599457740783691, + "learning_rate": 6.597536041466589e-06, + "loss": 0.4305, + "step": 3653 + }, + { + "epoch": 0.40264462809917356, + "grad_norm": 4.810308933258057, + "learning_rate": 6.595879152615423e-06, + "loss": 0.3766, + "step": 3654 + }, + { + "epoch": 0.4027548209366391, + "grad_norm": 4.524010181427002, + "learning_rate": 6.594222068617988e-06, + "loss": 0.4594, + "step": 3655 + }, + { + "epoch": 0.40286501377410466, + "grad_norm": 5.130374431610107, + "learning_rate": 6.592564789676912e-06, + "loss": 0.4735, + "step": 3656 + }, + { + "epoch": 0.40297520661157027, + "grad_norm": 4.945376873016357, + "learning_rate": 6.590907315994849e-06, + "loss": 0.457, + "step": 3657 + }, + { + "epoch": 0.4030853994490358, + "grad_norm": 7.4924726486206055, + "learning_rate": 6.589249647774479e-06, + "loss": 0.4096, + "step": 3658 + }, + { + "epoch": 0.40319559228650137, + "grad_norm": 8.88684368133545, + "learning_rate": 6.587591785218504e-06, + "loss": 0.4629, + "step": 3659 + }, + { + "epoch": 0.4033057851239669, + "grad_norm": 4.510934829711914, + "learning_rate": 6.5859337285296474e-06, + "loss": 0.4635, + "step": 3660 + }, + { + "epoch": 0.4034159779614325, + "grad_norm": 5.477586269378662, + "learning_rate": 6.584275477910662e-06, + "loss": 0.4797, + "step": 3661 + }, + { + "epoch": 0.4035261707988981, + "grad_norm": 10.872736930847168, + "learning_rate": 6.582617033564319e-06, + "loss": 0.4547, + "step": 3662 + }, + { + "epoch": 0.4036363636363636, + "grad_norm": 6.2118048667907715, + "learning_rate": 6.580958395693414e-06, + "loss": 0.4346, + "step": 3663 + }, + { + "epoch": 0.4037465564738292, + "grad_norm": 5.804343223571777, + "learning_rate": 6.5792995645007705e-06, + "loss": 0.4222, + "step": 3664 + }, + { + "epoch": 0.4038567493112948, + "grad_norm": 5.338511943817139, + "learning_rate": 6.577640540189229e-06, + "loss": 0.3204, + "step": 3665 + }, + { + "epoch": 0.40396694214876033, + "grad_norm": 7.646331787109375, + "learning_rate": 6.575981322961662e-06, + "loss": 0.4007, + "step": 3666 + }, + { + "epoch": 0.4040771349862259, + "grad_norm": 5.6988630294799805, + "learning_rate": 6.574321913020956e-06, + "loss": 0.378, + "step": 3667 + }, + { + "epoch": 0.40418732782369143, + "grad_norm": 10.288561820983887, + "learning_rate": 6.572662310570027e-06, + "loss": 0.4368, + "step": 3668 + }, + { + "epoch": 0.40429752066115704, + "grad_norm": 4.442179203033447, + "learning_rate": 6.571002515811818e-06, + "loss": 0.4269, + "step": 3669 + }, + { + "epoch": 0.4044077134986226, + "grad_norm": 14.73851490020752, + "learning_rate": 6.569342528949284e-06, + "loss": 0.4558, + "step": 3670 + }, + { + "epoch": 0.40451790633608814, + "grad_norm": 10.834920883178711, + "learning_rate": 6.567682350185416e-06, + "loss": 0.4896, + "step": 3671 + }, + { + "epoch": 0.40462809917355375, + "grad_norm": 4.4984283447265625, + "learning_rate": 6.566021979723219e-06, + "loss": 0.4421, + "step": 3672 + }, + { + "epoch": 0.4047382920110193, + "grad_norm": 6.032543182373047, + "learning_rate": 6.564361417765727e-06, + "loss": 0.4236, + "step": 3673 + }, + { + "epoch": 0.40484848484848485, + "grad_norm": 5.807493686676025, + "learning_rate": 6.562700664515998e-06, + "loss": 0.4265, + "step": 3674 + }, + { + "epoch": 0.4049586776859504, + "grad_norm": 4.196972846984863, + "learning_rate": 6.561039720177107e-06, + "loss": 0.4221, + "step": 3675 + }, + { + "epoch": 0.405068870523416, + "grad_norm": 5.8588032722473145, + "learning_rate": 6.5593785849521595e-06, + "loss": 
0.3816, + "step": 3676 + }, + { + "epoch": 0.40517906336088155, + "grad_norm": 13.153230667114258, + "learning_rate": 6.55771725904428e-06, + "loss": 0.5647, + "step": 3677 + }, + { + "epoch": 0.4052892561983471, + "grad_norm": 7.3144989013671875, + "learning_rate": 6.556055742656619e-06, + "loss": 0.4653, + "step": 3678 + }, + { + "epoch": 0.40539944903581265, + "grad_norm": 6.19295597076416, + "learning_rate": 6.554394035992348e-06, + "loss": 0.4183, + "step": 3679 + }, + { + "epoch": 0.40550964187327826, + "grad_norm": 8.30089282989502, + "learning_rate": 6.552732139254662e-06, + "loss": 0.4574, + "step": 3680 + }, + { + "epoch": 0.4056198347107438, + "grad_norm": 5.111574172973633, + "learning_rate": 6.55107005264678e-06, + "loss": 0.318, + "step": 3681 + }, + { + "epoch": 0.40573002754820936, + "grad_norm": 7.422606945037842, + "learning_rate": 6.549407776371946e-06, + "loss": 0.4245, + "step": 3682 + }, + { + "epoch": 0.4058402203856749, + "grad_norm": 15.394242286682129, + "learning_rate": 6.547745310633425e-06, + "loss": 0.5091, + "step": 3683 + }, + { + "epoch": 0.4059504132231405, + "grad_norm": 9.286622047424316, + "learning_rate": 6.546082655634505e-06, + "loss": 0.4621, + "step": 3684 + }, + { + "epoch": 0.40606060606060607, + "grad_norm": 5.918662071228027, + "learning_rate": 6.544419811578498e-06, + "loss": 0.4551, + "step": 3685 + }, + { + "epoch": 0.4061707988980716, + "grad_norm": 4.8866496086120605, + "learning_rate": 6.5427567786687376e-06, + "loss": 0.4349, + "step": 3686 + }, + { + "epoch": 0.40628099173553717, + "grad_norm": 7.229520320892334, + "learning_rate": 6.541093557108583e-06, + "loss": 0.4098, + "step": 3687 + }, + { + "epoch": 0.4063911845730028, + "grad_norm": 8.540162086486816, + "learning_rate": 6.539430147101414e-06, + "loss": 0.3744, + "step": 3688 + }, + { + "epoch": 0.4065013774104683, + "grad_norm": 6.524184226989746, + "learning_rate": 6.537766548850637e-06, + "loss": 0.4935, + "step": 3689 + }, + { + "epoch": 0.4066115702479339, + "grad_norm": 11.073874473571777, + "learning_rate": 6.5361027625596775e-06, + "loss": 0.4058, + "step": 3690 + }, + { + "epoch": 0.4067217630853994, + "grad_norm": 5.637555122375488, + "learning_rate": 6.534438788431984e-06, + "loss": 0.4091, + "step": 3691 + }, + { + "epoch": 0.40683195592286503, + "grad_norm": 7.77812385559082, + "learning_rate": 6.532774626671033e-06, + "loss": 0.4041, + "step": 3692 + }, + { + "epoch": 0.4069421487603306, + "grad_norm": 4.523971080780029, + "learning_rate": 6.531110277480317e-06, + "loss": 0.3751, + "step": 3693 + }, + { + "epoch": 0.40705234159779613, + "grad_norm": 5.27182674407959, + "learning_rate": 6.529445741063356e-06, + "loss": 0.326, + "step": 3694 + }, + { + "epoch": 0.4071625344352617, + "grad_norm": 5.846972465515137, + "learning_rate": 6.5277810176236946e-06, + "loss": 0.3946, + "step": 3695 + }, + { + "epoch": 0.4072727272727273, + "grad_norm": 7.611056804656982, + "learning_rate": 6.526116107364893e-06, + "loss": 0.4223, + "step": 3696 + }, + { + "epoch": 0.40738292011019284, + "grad_norm": 4.998781204223633, + "learning_rate": 6.524451010490542e-06, + "loss": 0.4046, + "step": 3697 + }, + { + "epoch": 0.4074931129476584, + "grad_norm": 8.281871795654297, + "learning_rate": 6.52278572720425e-06, + "loss": 0.4542, + "step": 3698 + }, + { + "epoch": 0.407603305785124, + "grad_norm": 11.4763822555542, + "learning_rate": 6.52112025770965e-06, + "loss": 0.4554, + "step": 3699 + }, + { + "epoch": 0.40771349862258954, + "grad_norm": 4.127493858337402, + "learning_rate": 
6.519454602210402e-06, + "loss": 0.4009, + "step": 3700 + }, + { + "epoch": 0.4078236914600551, + "grad_norm": 7.102976322174072, + "learning_rate": 6.517788760910178e-06, + "loss": 0.4302, + "step": 3701 + }, + { + "epoch": 0.40793388429752064, + "grad_norm": 4.179760456085205, + "learning_rate": 6.516122734012684e-06, + "loss": 0.3512, + "step": 3702 + }, + { + "epoch": 0.40804407713498625, + "grad_norm": 5.761562347412109, + "learning_rate": 6.514456521721642e-06, + "loss": 0.4264, + "step": 3703 + }, + { + "epoch": 0.4081542699724518, + "grad_norm": 6.149220943450928, + "learning_rate": 6.5127901242407995e-06, + "loss": 0.3949, + "step": 3704 + }, + { + "epoch": 0.40826446280991735, + "grad_norm": 8.804180145263672, + "learning_rate": 6.511123541773926e-06, + "loss": 0.4283, + "step": 3705 + }, + { + "epoch": 0.4083746556473829, + "grad_norm": 6.940871715545654, + "learning_rate": 6.509456774524812e-06, + "loss": 0.426, + "step": 3706 + }, + { + "epoch": 0.4084848484848485, + "grad_norm": 10.35230827331543, + "learning_rate": 6.5077898226972745e-06, + "loss": 0.4789, + "step": 3707 + }, + { + "epoch": 0.40859504132231406, + "grad_norm": 6.280311107635498, + "learning_rate": 6.506122686495149e-06, + "loss": 0.4017, + "step": 3708 + }, + { + "epoch": 0.4087052341597796, + "grad_norm": 12.406330108642578, + "learning_rate": 6.504455366122296e-06, + "loss": 0.622, + "step": 3709 + }, + { + "epoch": 0.40881542699724516, + "grad_norm": 8.450384140014648, + "learning_rate": 6.5027878617825955e-06, + "loss": 0.4474, + "step": 3710 + }, + { + "epoch": 0.40892561983471076, + "grad_norm": 11.929401397705078, + "learning_rate": 6.501120173679955e-06, + "loss": 0.5489, + "step": 3711 + }, + { + "epoch": 0.4090358126721763, + "grad_norm": 10.01643180847168, + "learning_rate": 6.499452302018302e-06, + "loss": 0.4799, + "step": 3712 + }, + { + "epoch": 0.40914600550964186, + "grad_norm": 7.758121490478516, + "learning_rate": 6.497784247001583e-06, + "loss": 0.4074, + "step": 3713 + }, + { + "epoch": 0.4092561983471074, + "grad_norm": 6.807549476623535, + "learning_rate": 6.496116008833773e-06, + "loss": 0.4317, + "step": 3714 + }, + { + "epoch": 0.409366391184573, + "grad_norm": 5.175261974334717, + "learning_rate": 6.494447587718864e-06, + "loss": 0.403, + "step": 3715 + }, + { + "epoch": 0.40947658402203857, + "grad_norm": 6.163679122924805, + "learning_rate": 6.492778983860873e-06, + "loss": 0.4818, + "step": 3716 + }, + { + "epoch": 0.4095867768595041, + "grad_norm": 11.304265022277832, + "learning_rate": 6.491110197463842e-06, + "loss": 0.4254, + "step": 3717 + }, + { + "epoch": 0.40969696969696967, + "grad_norm": 21.56818389892578, + "learning_rate": 6.48944122873183e-06, + "loss": 0.4987, + "step": 3718 + }, + { + "epoch": 0.4098071625344353, + "grad_norm": 5.828494548797607, + "learning_rate": 6.487772077868921e-06, + "loss": 0.4672, + "step": 3719 + }, + { + "epoch": 0.4099173553719008, + "grad_norm": 8.28695297241211, + "learning_rate": 6.486102745079223e-06, + "loss": 0.4679, + "step": 3720 + }, + { + "epoch": 0.4100275482093664, + "grad_norm": 6.2719268798828125, + "learning_rate": 6.484433230566861e-06, + "loss": 0.4214, + "step": 3721 + }, + { + "epoch": 0.410137741046832, + "grad_norm": 16.524568557739258, + "learning_rate": 6.4827635345359864e-06, + "loss": 0.5679, + "step": 3722 + }, + { + "epoch": 0.41024793388429753, + "grad_norm": 9.619969367980957, + "learning_rate": 6.4810936571907745e-06, + "loss": 0.4052, + "step": 3723 + }, + { + "epoch": 0.4103581267217631, + "grad_norm": 
19.248586654663086, + "learning_rate": 6.479423598735417e-06, + "loss": 0.3896, + "step": 3724 + }, + { + "epoch": 0.41046831955922863, + "grad_norm": 10.976968765258789, + "learning_rate": 6.4777533593741336e-06, + "loss": 0.445, + "step": 3725 + }, + { + "epoch": 0.41057851239669424, + "grad_norm": 7.226747512817383, + "learning_rate": 6.4760829393111615e-06, + "loss": 0.4403, + "step": 3726 + }, + { + "epoch": 0.4106887052341598, + "grad_norm": 5.23472785949707, + "learning_rate": 6.474412338750762e-06, + "loss": 0.4736, + "step": 3727 + }, + { + "epoch": 0.41079889807162534, + "grad_norm": 8.113616943359375, + "learning_rate": 6.472741557897219e-06, + "loss": 0.5052, + "step": 3728 + }, + { + "epoch": 0.4109090909090909, + "grad_norm": 7.333268165588379, + "learning_rate": 6.4710705969548385e-06, + "loss": 0.3841, + "step": 3729 + }, + { + "epoch": 0.4110192837465565, + "grad_norm": 6.259932041168213, + "learning_rate": 6.469399456127947e-06, + "loss": 0.4402, + "step": 3730 + }, + { + "epoch": 0.41112947658402205, + "grad_norm": 5.707891941070557, + "learning_rate": 6.467728135620892e-06, + "loss": 0.4271, + "step": 3731 + }, + { + "epoch": 0.4112396694214876, + "grad_norm": 5.641848087310791, + "learning_rate": 6.46605663563805e-06, + "loss": 0.4079, + "step": 3732 + }, + { + "epoch": 0.41134986225895315, + "grad_norm": 5.5837507247924805, + "learning_rate": 6.4643849563838105e-06, + "loss": 0.4462, + "step": 3733 + }, + { + "epoch": 0.41146005509641875, + "grad_norm": 3.8371849060058594, + "learning_rate": 6.462713098062587e-06, + "loss": 0.3939, + "step": 3734 + }, + { + "epoch": 0.4115702479338843, + "grad_norm": 18.237119674682617, + "learning_rate": 6.461041060878821e-06, + "loss": 0.4489, + "step": 3735 + }, + { + "epoch": 0.41168044077134985, + "grad_norm": 9.150449752807617, + "learning_rate": 6.4593688450369695e-06, + "loss": 0.5537, + "step": 3736 + }, + { + "epoch": 0.4117906336088154, + "grad_norm": 5.526957035064697, + "learning_rate": 6.457696450741512e-06, + "loss": 0.4463, + "step": 3737 + }, + { + "epoch": 0.411900826446281, + "grad_norm": 7.520010948181152, + "learning_rate": 6.456023878196953e-06, + "loss": 0.4618, + "step": 3738 + }, + { + "epoch": 0.41201101928374656, + "grad_norm": 4.8241424560546875, + "learning_rate": 6.454351127607817e-06, + "loss": 0.452, + "step": 3739 + }, + { + "epoch": 0.4121212121212121, + "grad_norm": 7.352964878082275, + "learning_rate": 6.452678199178649e-06, + "loss": 0.3904, + "step": 3740 + }, + { + "epoch": 0.41223140495867766, + "grad_norm": 5.8515753746032715, + "learning_rate": 6.451005093114018e-06, + "loss": 0.4451, + "step": 3741 + }, + { + "epoch": 0.41234159779614327, + "grad_norm": 8.789145469665527, + "learning_rate": 6.4493318096185135e-06, + "loss": 0.3964, + "step": 3742 + }, + { + "epoch": 0.4124517906336088, + "grad_norm": 10.589065551757812, + "learning_rate": 6.4476583488967455e-06, + "loss": 0.426, + "step": 3743 + }, + { + "epoch": 0.41256198347107437, + "grad_norm": 8.067483901977539, + "learning_rate": 6.445984711153348e-06, + "loss": 0.4201, + "step": 3744 + }, + { + "epoch": 0.4126721763085399, + "grad_norm": 8.293087005615234, + "learning_rate": 6.444310896592978e-06, + "loss": 0.4547, + "step": 3745 + }, + { + "epoch": 0.4127823691460055, + "grad_norm": 8.98918342590332, + "learning_rate": 6.442636905420307e-06, + "loss": 0.4654, + "step": 3746 + }, + { + "epoch": 0.4128925619834711, + "grad_norm": 6.305483341217041, + "learning_rate": 6.440962737840038e-06, + "loss": 0.501, + "step": 3747 + }, + { + 
"epoch": 0.4130027548209366, + "grad_norm": 6.1845502853393555, + "learning_rate": 6.439288394056886e-06, + "loss": 0.4221, + "step": 3748 + }, + { + "epoch": 0.41311294765840223, + "grad_norm": 9.436981201171875, + "learning_rate": 6.437613874275596e-06, + "loss": 0.4046, + "step": 3749 + }, + { + "epoch": 0.4132231404958678, + "grad_norm": 10.030034065246582, + "learning_rate": 6.435939178700926e-06, + "loss": 0.4202, + "step": 3750 + }, + { + "epoch": 0.41333333333333333, + "grad_norm": 6.672735214233398, + "learning_rate": 6.434264307537664e-06, + "loss": 0.4676, + "step": 3751 + }, + { + "epoch": 0.4134435261707989, + "grad_norm": 12.742806434631348, + "learning_rate": 6.432589260990614e-06, + "loss": 0.4334, + "step": 3752 + }, + { + "epoch": 0.4135537190082645, + "grad_norm": 6.345670223236084, + "learning_rate": 6.430914039264604e-06, + "loss": 0.4058, + "step": 3753 + }, + { + "epoch": 0.41366391184573004, + "grad_norm": 5.737221717834473, + "learning_rate": 6.429238642564482e-06, + "loss": 0.3692, + "step": 3754 + }, + { + "epoch": 0.4137741046831956, + "grad_norm": 8.014758110046387, + "learning_rate": 6.4275630710951155e-06, + "loss": 0.5336, + "step": 3755 + }, + { + "epoch": 0.41388429752066114, + "grad_norm": 5.608326435089111, + "learning_rate": 6.4258873250614e-06, + "loss": 0.3423, + "step": 3756 + }, + { + "epoch": 0.41399449035812674, + "grad_norm": 5.42350435256958, + "learning_rate": 6.4242114046682435e-06, + "loss": 0.4464, + "step": 3757 + }, + { + "epoch": 0.4141046831955923, + "grad_norm": 4.1355156898498535, + "learning_rate": 6.422535310120583e-06, + "loss": 0.4116, + "step": 3758 + }, + { + "epoch": 0.41421487603305784, + "grad_norm": 5.403732776641846, + "learning_rate": 6.420859041623371e-06, + "loss": 0.4441, + "step": 3759 + }, + { + "epoch": 0.4143250688705234, + "grad_norm": 6.440338134765625, + "learning_rate": 6.419182599381586e-06, + "loss": 0.4527, + "step": 3760 + }, + { + "epoch": 0.414435261707989, + "grad_norm": 12.544288635253906, + "learning_rate": 6.417505983600226e-06, + "loss": 0.5211, + "step": 3761 + }, + { + "epoch": 0.41454545454545455, + "grad_norm": 9.335394859313965, + "learning_rate": 6.4158291944843075e-06, + "loss": 0.4746, + "step": 3762 + }, + { + "epoch": 0.4146556473829201, + "grad_norm": 5.873477935791016, + "learning_rate": 6.4141522322388725e-06, + "loss": 0.4255, + "step": 3763 + }, + { + "epoch": 0.41476584022038565, + "grad_norm": 9.662142753601074, + "learning_rate": 6.412475097068982e-06, + "loss": 0.5154, + "step": 3764 + }, + { + "epoch": 0.41487603305785126, + "grad_norm": 4.658679485321045, + "learning_rate": 6.410797789179717e-06, + "loss": 0.4081, + "step": 3765 + }, + { + "epoch": 0.4149862258953168, + "grad_norm": 5.880003929138184, + "learning_rate": 6.409120308776182e-06, + "loss": 0.3921, + "step": 3766 + }, + { + "epoch": 0.41509641873278236, + "grad_norm": 5.686578273773193, + "learning_rate": 6.4074426560635025e-06, + "loss": 0.4204, + "step": 3767 + }, + { + "epoch": 0.4152066115702479, + "grad_norm": 5.895596027374268, + "learning_rate": 6.405764831246823e-06, + "loss": 0.4568, + "step": 3768 + }, + { + "epoch": 0.4153168044077135, + "grad_norm": 8.551445960998535, + "learning_rate": 6.404086834531309e-06, + "loss": 0.4531, + "step": 3769 + }, + { + "epoch": 0.41542699724517906, + "grad_norm": 5.391382694244385, + "learning_rate": 6.402408666122152e-06, + "loss": 0.3608, + "step": 3770 + }, + { + "epoch": 0.4155371900826446, + "grad_norm": 15.061893463134766, + "learning_rate": 6.4007303262245566e-06, + 
"loss": 0.407, + "step": 3771 + }, + { + "epoch": 0.4156473829201102, + "grad_norm": 8.456413269042969, + "learning_rate": 6.399051815043754e-06, + "loss": 0.4365, + "step": 3772 + }, + { + "epoch": 0.41575757575757577, + "grad_norm": 4.756795406341553, + "learning_rate": 6.397373132784995e-06, + "loss": 0.4001, + "step": 3773 + }, + { + "epoch": 0.4158677685950413, + "grad_norm": 7.5593342781066895, + "learning_rate": 6.395694279653553e-06, + "loss": 0.4053, + "step": 3774 + }, + { + "epoch": 0.41597796143250687, + "grad_norm": 7.961740493774414, + "learning_rate": 6.394015255854717e-06, + "loss": 0.4265, + "step": 3775 + }, + { + "epoch": 0.4160881542699725, + "grad_norm": 6.529910087585449, + "learning_rate": 6.392336061593802e-06, + "loss": 0.4168, + "step": 3776 + }, + { + "epoch": 0.416198347107438, + "grad_norm": 11.598061561584473, + "learning_rate": 6.390656697076143e-06, + "loss": 0.4635, + "step": 3777 + }, + { + "epoch": 0.4163085399449036, + "grad_norm": 7.832777976989746, + "learning_rate": 6.3889771625070925e-06, + "loss": 0.4787, + "step": 3778 + }, + { + "epoch": 0.4164187327823691, + "grad_norm": 6.925414085388184, + "learning_rate": 6.38729745809203e-06, + "loss": 0.5039, + "step": 3779 + }, + { + "epoch": 0.41652892561983473, + "grad_norm": 5.661531448364258, + "learning_rate": 6.385617584036348e-06, + "loss": 0.4309, + "step": 3780 + }, + { + "epoch": 0.4166391184573003, + "grad_norm": 7.859992027282715, + "learning_rate": 6.3839375405454666e-06, + "loss": 0.4091, + "step": 3781 + }, + { + "epoch": 0.41674931129476583, + "grad_norm": 6.426723480224609, + "learning_rate": 6.3822573278248235e-06, + "loss": 0.439, + "step": 3782 + }, + { + "epoch": 0.4168595041322314, + "grad_norm": 8.279650688171387, + "learning_rate": 6.380576946079875e-06, + "loss": 0.4073, + "step": 3783 + }, + { + "epoch": 0.416969696969697, + "grad_norm": 4.793420314788818, + "learning_rate": 6.3788963955161046e-06, + "loss": 0.4326, + "step": 3784 + }, + { + "epoch": 0.41707988980716254, + "grad_norm": 8.681497573852539, + "learning_rate": 6.377215676339007e-06, + "loss": 0.3881, + "step": 3785 + }, + { + "epoch": 0.4171900826446281, + "grad_norm": 7.378214359283447, + "learning_rate": 6.375534788754106e-06, + "loss": 0.4035, + "step": 3786 + }, + { + "epoch": 0.41730027548209364, + "grad_norm": 6.326870441436768, + "learning_rate": 6.373853732966944e-06, + "loss": 0.4634, + "step": 3787 + }, + { + "epoch": 0.41741046831955925, + "grad_norm": 9.407134056091309, + "learning_rate": 6.372172509183082e-06, + "loss": 0.4246, + "step": 3788 + }, + { + "epoch": 0.4175206611570248, + "grad_norm": 5.874998092651367, + "learning_rate": 6.370491117608101e-06, + "loss": 0.3218, + "step": 3789 + }, + { + "epoch": 0.41763085399449035, + "grad_norm": 6.414131164550781, + "learning_rate": 6.368809558447603e-06, + "loss": 0.5007, + "step": 3790 + }, + { + "epoch": 0.4177410468319559, + "grad_norm": 6.991122245788574, + "learning_rate": 6.367127831907214e-06, + "loss": 0.4957, + "step": 3791 + }, + { + "epoch": 0.4178512396694215, + "grad_norm": 4.127720832824707, + "learning_rate": 6.3654459381925785e-06, + "loss": 0.395, + "step": 3792 + }, + { + "epoch": 0.41796143250688705, + "grad_norm": 8.374381065368652, + "learning_rate": 6.363763877509355e-06, + "loss": 0.3983, + "step": 3793 + }, + { + "epoch": 0.4180716253443526, + "grad_norm": 4.763553142547607, + "learning_rate": 6.362081650063234e-06, + "loss": 0.3837, + "step": 3794 + }, + { + "epoch": 0.41818181818181815, + "grad_norm": 6.920061111450195, + 
"learning_rate": 6.360399256059919e-06, + "loss": 0.3299, + "step": 3795 + }, + { + "epoch": 0.41829201101928376, + "grad_norm": 6.663188457489014, + "learning_rate": 6.358716695705135e-06, + "loss": 0.3495, + "step": 3796 + }, + { + "epoch": 0.4184022038567493, + "grad_norm": 6.809614181518555, + "learning_rate": 6.357033969204628e-06, + "loss": 0.3879, + "step": 3797 + }, + { + "epoch": 0.41851239669421486, + "grad_norm": 7.749216556549072, + "learning_rate": 6.355351076764164e-06, + "loss": 0.4378, + "step": 3798 + }, + { + "epoch": 0.41862258953168047, + "grad_norm": 5.883305072784424, + "learning_rate": 6.353668018589527e-06, + "loss": 0.3725, + "step": 3799 + }, + { + "epoch": 0.418732782369146, + "grad_norm": 7.1863250732421875, + "learning_rate": 6.3519847948865284e-06, + "loss": 0.3712, + "step": 3800 + }, + { + "epoch": 0.41884297520661157, + "grad_norm": 11.583418846130371, + "learning_rate": 6.350301405860991e-06, + "loss": 0.4467, + "step": 3801 + }, + { + "epoch": 0.4189531680440771, + "grad_norm": 6.89532995223999, + "learning_rate": 6.348617851718766e-06, + "loss": 0.4527, + "step": 3802 + }, + { + "epoch": 0.4190633608815427, + "grad_norm": 6.568483352661133, + "learning_rate": 6.346934132665716e-06, + "loss": 0.3707, + "step": 3803 + }, + { + "epoch": 0.4191735537190083, + "grad_norm": 8.508829116821289, + "learning_rate": 6.345250248907731e-06, + "loss": 0.5115, + "step": 3804 + }, + { + "epoch": 0.4192837465564738, + "grad_norm": 6.044707775115967, + "learning_rate": 6.3435662006507194e-06, + "loss": 0.3654, + "step": 3805 + }, + { + "epoch": 0.4193939393939394, + "grad_norm": 6.409003734588623, + "learning_rate": 6.341881988100605e-06, + "loss": 0.4392, + "step": 3806 + }, + { + "epoch": 0.419504132231405, + "grad_norm": 8.158562660217285, + "learning_rate": 6.340197611463341e-06, + "loss": 0.4119, + "step": 3807 + }, + { + "epoch": 0.41961432506887053, + "grad_norm": 6.024563312530518, + "learning_rate": 6.338513070944891e-06, + "loss": 0.3328, + "step": 3808 + }, + { + "epoch": 0.4197245179063361, + "grad_norm": 9.549670219421387, + "learning_rate": 6.336828366751245e-06, + "loss": 0.4517, + "step": 3809 + }, + { + "epoch": 0.41983471074380163, + "grad_norm": 9.08705997467041, + "learning_rate": 6.335143499088412e-06, + "loss": 0.4371, + "step": 3810 + }, + { + "epoch": 0.41994490358126724, + "grad_norm": 15.335102081298828, + "learning_rate": 6.333458468162415e-06, + "loss": 0.3911, + "step": 3811 + }, + { + "epoch": 0.4200550964187328, + "grad_norm": 9.106855392456055, + "learning_rate": 6.33177327417931e-06, + "loss": 0.4405, + "step": 3812 + }, + { + "epoch": 0.42016528925619834, + "grad_norm": 6.226790904998779, + "learning_rate": 6.330087917345156e-06, + "loss": 0.4018, + "step": 3813 + }, + { + "epoch": 0.4202754820936639, + "grad_norm": 7.737644672393799, + "learning_rate": 6.328402397866045e-06, + "loss": 0.4866, + "step": 3814 + }, + { + "epoch": 0.4203856749311295, + "grad_norm": 6.267590522766113, + "learning_rate": 6.3267167159480845e-06, + "loss": 0.363, + "step": 3815 + }, + { + "epoch": 0.42049586776859504, + "grad_norm": 3.8961129188537598, + "learning_rate": 6.325030871797403e-06, + "loss": 0.4197, + "step": 3816 + }, + { + "epoch": 0.4206060606060606, + "grad_norm": 11.12862491607666, + "learning_rate": 6.323344865620147e-06, + "loss": 0.4586, + "step": 3817 + }, + { + "epoch": 0.42071625344352614, + "grad_norm": 4.104209899902344, + "learning_rate": 6.3216586976224815e-06, + "loss": 0.4427, + "step": 3818 + }, + { + "epoch": 0.42082644628099175, + 
"grad_norm": 8.506538391113281, + "learning_rate": 6.3199723680105966e-06, + "loss": 0.4049, + "step": 3819 + }, + { + "epoch": 0.4209366391184573, + "grad_norm": 8.029024124145508, + "learning_rate": 6.318285876990697e-06, + "loss": 0.4489, + "step": 3820 + }, + { + "epoch": 0.42104683195592285, + "grad_norm": 5.868107318878174, + "learning_rate": 6.316599224769008e-06, + "loss": 0.3651, + "step": 3821 + }, + { + "epoch": 0.42115702479338846, + "grad_norm": 6.64792013168335, + "learning_rate": 6.314912411551779e-06, + "loss": 0.4023, + "step": 3822 + }, + { + "epoch": 0.421267217630854, + "grad_norm": 6.665269374847412, + "learning_rate": 6.313225437545274e-06, + "loss": 0.3859, + "step": 3823 + }, + { + "epoch": 0.42137741046831956, + "grad_norm": 12.29218578338623, + "learning_rate": 6.311538302955778e-06, + "loss": 0.5835, + "step": 3824 + }, + { + "epoch": 0.4214876033057851, + "grad_norm": 5.8684539794921875, + "learning_rate": 6.309851007989598e-06, + "loss": 0.3665, + "step": 3825 + }, + { + "epoch": 0.4215977961432507, + "grad_norm": 4.377128601074219, + "learning_rate": 6.308163552853057e-06, + "loss": 0.374, + "step": 3826 + }, + { + "epoch": 0.42170798898071626, + "grad_norm": 4.379380702972412, + "learning_rate": 6.3064759377525e-06, + "loss": 0.3751, + "step": 3827 + }, + { + "epoch": 0.4218181818181818, + "grad_norm": 6.47163724899292, + "learning_rate": 6.304788162894291e-06, + "loss": 0.4254, + "step": 3828 + }, + { + "epoch": 0.42192837465564736, + "grad_norm": 7.115240573883057, + "learning_rate": 6.3031002284848106e-06, + "loss": 0.4518, + "step": 3829 + }, + { + "epoch": 0.42203856749311297, + "grad_norm": 5.516519069671631, + "learning_rate": 6.301412134730468e-06, + "loss": 0.4202, + "step": 3830 + }, + { + "epoch": 0.4221487603305785, + "grad_norm": 5.217955589294434, + "learning_rate": 6.299723881837678e-06, + "loss": 0.3879, + "step": 3831 + }, + { + "epoch": 0.42225895316804407, + "grad_norm": 8.340252876281738, + "learning_rate": 6.298035470012889e-06, + "loss": 0.4044, + "step": 3832 + }, + { + "epoch": 0.4223691460055096, + "grad_norm": 5.796931743621826, + "learning_rate": 6.296346899462559e-06, + "loss": 0.429, + "step": 3833 + }, + { + "epoch": 0.4224793388429752, + "grad_norm": 9.1382417678833, + "learning_rate": 6.294658170393169e-06, + "loss": 0.3954, + "step": 3834 + }, + { + "epoch": 0.4225895316804408, + "grad_norm": 8.257237434387207, + "learning_rate": 6.292969283011219e-06, + "loss": 0.3601, + "step": 3835 + }, + { + "epoch": 0.42269972451790633, + "grad_norm": 7.257051467895508, + "learning_rate": 6.291280237523227e-06, + "loss": 0.4497, + "step": 3836 + }, + { + "epoch": 0.4228099173553719, + "grad_norm": 4.841982364654541, + "learning_rate": 6.2895910341357355e-06, + "loss": 0.384, + "step": 3837 + }, + { + "epoch": 0.4229201101928375, + "grad_norm": 9.36284351348877, + "learning_rate": 6.287901673055301e-06, + "loss": 0.5032, + "step": 3838 + }, + { + "epoch": 0.42303030303030303, + "grad_norm": 6.163784503936768, + "learning_rate": 6.2862121544885e-06, + "loss": 0.4299, + "step": 3839 + }, + { + "epoch": 0.4231404958677686, + "grad_norm": 10.357873916625977, + "learning_rate": 6.28452247864193e-06, + "loss": 0.4065, + "step": 3840 + }, + { + "epoch": 0.42325068870523413, + "grad_norm": 6.996313571929932, + "learning_rate": 6.282832645722206e-06, + "loss": 0.3963, + "step": 3841 + }, + { + "epoch": 0.42336088154269974, + "grad_norm": 7.793999671936035, + "learning_rate": 6.281142655935963e-06, + "loss": 0.5049, + "step": 3842 + }, + { + 
"epoch": 0.4234710743801653, + "grad_norm": 5.729565143585205, + "learning_rate": 6.279452509489856e-06, + "loss": 0.3909, + "step": 3843 + }, + { + "epoch": 0.42358126721763084, + "grad_norm": 9.760586738586426, + "learning_rate": 6.277762206590559e-06, + "loss": 0.4008, + "step": 3844 + }, + { + "epoch": 0.4236914600550964, + "grad_norm": 5.855377674102783, + "learning_rate": 6.276071747444763e-06, + "loss": 0.4294, + "step": 3845 + }, + { + "epoch": 0.423801652892562, + "grad_norm": 5.786755561828613, + "learning_rate": 6.27438113225918e-06, + "loss": 0.4266, + "step": 3846 + }, + { + "epoch": 0.42391184573002755, + "grad_norm": 6.315412521362305, + "learning_rate": 6.272690361240542e-06, + "loss": 0.3893, + "step": 3847 + }, + { + "epoch": 0.4240220385674931, + "grad_norm": 7.395302772521973, + "learning_rate": 6.270999434595598e-06, + "loss": 0.4622, + "step": 3848 + }, + { + "epoch": 0.4241322314049587, + "grad_norm": 13.069974899291992, + "learning_rate": 6.269308352531116e-06, + "loss": 0.5344, + "step": 3849 + }, + { + "epoch": 0.42424242424242425, + "grad_norm": 5.600767135620117, + "learning_rate": 6.267617115253885e-06, + "loss": 0.4751, + "step": 3850 + }, + { + "epoch": 0.4243526170798898, + "grad_norm": 5.95487642288208, + "learning_rate": 6.26592572297071e-06, + "loss": 0.3983, + "step": 3851 + }, + { + "epoch": 0.42446280991735535, + "grad_norm": 7.793905258178711, + "learning_rate": 6.264234175888418e-06, + "loss": 0.4072, + "step": 3852 + }, + { + "epoch": 0.42457300275482096, + "grad_norm": 9.293682098388672, + "learning_rate": 6.262542474213855e-06, + "loss": 0.462, + "step": 3853 + }, + { + "epoch": 0.4246831955922865, + "grad_norm": 3.852151393890381, + "learning_rate": 6.260850618153883e-06, + "loss": 0.4374, + "step": 3854 + }, + { + "epoch": 0.42479338842975206, + "grad_norm": 6.262339115142822, + "learning_rate": 6.259158607915385e-06, + "loss": 0.3165, + "step": 3855 + }, + { + "epoch": 0.4249035812672176, + "grad_norm": 3.88421630859375, + "learning_rate": 6.257466443705261e-06, + "loss": 0.3914, + "step": 3856 + }, + { + "epoch": 0.4250137741046832, + "grad_norm": 5.283646583557129, + "learning_rate": 6.255774125730432e-06, + "loss": 0.3919, + "step": 3857 + }, + { + "epoch": 0.42512396694214877, + "grad_norm": 4.296477317810059, + "learning_rate": 6.254081654197839e-06, + "loss": 0.4203, + "step": 3858 + }, + { + "epoch": 0.4252341597796143, + "grad_norm": 5.580582141876221, + "learning_rate": 6.252389029314436e-06, + "loss": 0.4347, + "step": 3859 + }, + { + "epoch": 0.42534435261707987, + "grad_norm": 18.328691482543945, + "learning_rate": 6.2506962512872e-06, + "loss": 0.4846, + "step": 3860 + }, + { + "epoch": 0.4254545454545455, + "grad_norm": 9.269140243530273, + "learning_rate": 6.249003320323131e-06, + "loss": 0.5153, + "step": 3861 + }, + { + "epoch": 0.425564738292011, + "grad_norm": 3.998983144760132, + "learning_rate": 6.2473102366292385e-06, + "loss": 0.365, + "step": 3862 + }, + { + "epoch": 0.4256749311294766, + "grad_norm": 9.478604316711426, + "learning_rate": 6.245617000412555e-06, + "loss": 0.3878, + "step": 3863 + }, + { + "epoch": 0.4257851239669421, + "grad_norm": 10.491778373718262, + "learning_rate": 6.2439236118801314e-06, + "loss": 0.4833, + "step": 3864 + }, + { + "epoch": 0.42589531680440773, + "grad_norm": 4.075682163238525, + "learning_rate": 6.242230071239042e-06, + "loss": 0.4464, + "step": 3865 + }, + { + "epoch": 0.4260055096418733, + "grad_norm": 8.175837516784668, + "learning_rate": 6.240536378696371e-06, + "loss": 0.4427, 
+ "step": 3866 + }, + { + "epoch": 0.42611570247933883, + "grad_norm": 5.808559417724609, + "learning_rate": 6.238842534459224e-06, + "loss": 0.4144, + "step": 3867 + }, + { + "epoch": 0.4262258953168044, + "grad_norm": 8.47080135345459, + "learning_rate": 6.237148538734732e-06, + "loss": 0.4799, + "step": 3868 + }, + { + "epoch": 0.42633608815427, + "grad_norm": 6.882425308227539, + "learning_rate": 6.235454391730035e-06, + "loss": 0.491, + "step": 3869 + }, + { + "epoch": 0.42644628099173554, + "grad_norm": 10.453054428100586, + "learning_rate": 6.233760093652297e-06, + "loss": 0.3773, + "step": 3870 + }, + { + "epoch": 0.4265564738292011, + "grad_norm": 10.358419418334961, + "learning_rate": 6.232065644708698e-06, + "loss": 0.4692, + "step": 3871 + }, + { + "epoch": 0.4266666666666667, + "grad_norm": 4.853580474853516, + "learning_rate": 6.23037104510644e-06, + "loss": 0.4267, + "step": 3872 + }, + { + "epoch": 0.42677685950413224, + "grad_norm": 6.97660493850708, + "learning_rate": 6.228676295052738e-06, + "loss": 0.406, + "step": 3873 + }, + { + "epoch": 0.4268870523415978, + "grad_norm": 6.049506187438965, + "learning_rate": 6.22698139475483e-06, + "loss": 0.4359, + "step": 3874 + }, + { + "epoch": 0.42699724517906334, + "grad_norm": 11.104652404785156, + "learning_rate": 6.225286344419971e-06, + "loss": 0.42, + "step": 3875 + }, + { + "epoch": 0.42710743801652895, + "grad_norm": 5.031164646148682, + "learning_rate": 6.223591144255433e-06, + "loss": 0.428, + "step": 3876 + }, + { + "epoch": 0.4272176308539945, + "grad_norm": 5.761896133422852, + "learning_rate": 6.221895794468508e-06, + "loss": 0.3766, + "step": 3877 + }, + { + "epoch": 0.42732782369146005, + "grad_norm": 5.83917760848999, + "learning_rate": 6.2202002952665054e-06, + "loss": 0.3782, + "step": 3878 + }, + { + "epoch": 0.4274380165289256, + "grad_norm": 6.0388102531433105, + "learning_rate": 6.2185046468567535e-06, + "loss": 0.4874, + "step": 3879 + }, + { + "epoch": 0.4275482093663912, + "grad_norm": 4.570846080780029, + "learning_rate": 6.216808849446596e-06, + "loss": 0.3824, + "step": 3880 + }, + { + "epoch": 0.42765840220385676, + "grad_norm": 6.330661773681641, + "learning_rate": 6.2151129032434024e-06, + "loss": 0.3905, + "step": 3881 + }, + { + "epoch": 0.4277685950413223, + "grad_norm": 6.117094993591309, + "learning_rate": 6.2134168084545506e-06, + "loss": 0.4565, + "step": 3882 + }, + { + "epoch": 0.42787878787878786, + "grad_norm": 10.915534973144531, + "learning_rate": 6.211720565287443e-06, + "loss": 0.4279, + "step": 3883 + }, + { + "epoch": 0.42798898071625346, + "grad_norm": 9.58615779876709, + "learning_rate": 6.2100241739495e-06, + "loss": 0.4818, + "step": 3884 + }, + { + "epoch": 0.428099173553719, + "grad_norm": 4.800282955169678, + "learning_rate": 6.208327634648157e-06, + "loss": 0.4052, + "step": 3885 + }, + { + "epoch": 0.42820936639118456, + "grad_norm": 9.476829528808594, + "learning_rate": 6.2066309475908696e-06, + "loss": 0.4348, + "step": 3886 + }, + { + "epoch": 0.4283195592286501, + "grad_norm": 11.474186897277832, + "learning_rate": 6.20493411298511e-06, + "loss": 0.5148, + "step": 3887 + }, + { + "epoch": 0.4284297520661157, + "grad_norm": 8.32387638092041, + "learning_rate": 6.203237131038371e-06, + "loss": 0.4234, + "step": 3888 + }, + { + "epoch": 0.42853994490358127, + "grad_norm": 7.900044918060303, + "learning_rate": 6.201540001958163e-06, + "loss": 0.471, + "step": 3889 + }, + { + "epoch": 0.4286501377410468, + "grad_norm": 5.806334495544434, + "learning_rate": 
6.199842725952008e-06, + "loss": 0.4014, + "step": 3890 + }, + { + "epoch": 0.42876033057851237, + "grad_norm": 7.8750081062316895, + "learning_rate": 6.198145303227456e-06, + "loss": 0.4524, + "step": 3891 + }, + { + "epoch": 0.428870523415978, + "grad_norm": 5.8703436851501465, + "learning_rate": 6.1964477339920695e-06, + "loss": 0.433, + "step": 3892 + }, + { + "epoch": 0.42898071625344353, + "grad_norm": 8.025110244750977, + "learning_rate": 6.194750018453428e-06, + "loss": 0.3659, + "step": 3893 + }, + { + "epoch": 0.4290909090909091, + "grad_norm": 5.678608417510986, + "learning_rate": 6.193052156819132e-06, + "loss": 0.4667, + "step": 3894 + }, + { + "epoch": 0.42920110192837463, + "grad_norm": 4.612313270568848, + "learning_rate": 6.191354149296798e-06, + "loss": 0.4118, + "step": 3895 + }, + { + "epoch": 0.42931129476584023, + "grad_norm": 4.925156593322754, + "learning_rate": 6.189655996094059e-06, + "loss": 0.4305, + "step": 3896 + }, + { + "epoch": 0.4294214876033058, + "grad_norm": 5.6468586921691895, + "learning_rate": 6.187957697418571e-06, + "loss": 0.4594, + "step": 3897 + }, + { + "epoch": 0.42953168044077134, + "grad_norm": 9.24137020111084, + "learning_rate": 6.186259253478e-06, + "loss": 0.4387, + "step": 3898 + }, + { + "epoch": 0.42964187327823694, + "grad_norm": 5.610344409942627, + "learning_rate": 6.184560664480036e-06, + "loss": 0.4119, + "step": 3899 + }, + { + "epoch": 0.4297520661157025, + "grad_norm": 5.931612014770508, + "learning_rate": 6.182861930632387e-06, + "loss": 0.4762, + "step": 3900 + }, + { + "epoch": 0.42986225895316804, + "grad_norm": 7.208206653594971, + "learning_rate": 6.181163052142771e-06, + "loss": 0.3654, + "step": 3901 + }, + { + "epoch": 0.4299724517906336, + "grad_norm": 5.230598449707031, + "learning_rate": 6.179464029218936e-06, + "loss": 0.4699, + "step": 3902 + }, + { + "epoch": 0.4300826446280992, + "grad_norm": 7.095353126525879, + "learning_rate": 6.177764862068636e-06, + "loss": 0.4829, + "step": 3903 + }, + { + "epoch": 0.43019283746556475, + "grad_norm": 6.133082389831543, + "learning_rate": 6.176065550899648e-06, + "loss": 0.3976, + "step": 3904 + }, + { + "epoch": 0.4303030303030303, + "grad_norm": 5.761401176452637, + "learning_rate": 6.174366095919767e-06, + "loss": 0.4073, + "step": 3905 + }, + { + "epoch": 0.43041322314049585, + "grad_norm": 8.578031539916992, + "learning_rate": 6.172666497336804e-06, + "loss": 0.3761, + "step": 3906 + }, + { + "epoch": 0.43052341597796145, + "grad_norm": 7.943502902984619, + "learning_rate": 6.170966755358592e-06, + "loss": 0.3873, + "step": 3907 + }, + { + "epoch": 0.430633608815427, + "grad_norm": 4.049300670623779, + "learning_rate": 6.169266870192972e-06, + "loss": 0.4109, + "step": 3908 + }, + { + "epoch": 0.43074380165289256, + "grad_norm": 5.710516452789307, + "learning_rate": 6.1675668420478114e-06, + "loss": 0.4388, + "step": 3909 + }, + { + "epoch": 0.4308539944903581, + "grad_norm": 8.189253807067871, + "learning_rate": 6.165866671130992e-06, + "loss": 0.5286, + "step": 3910 + }, + { + "epoch": 0.4309641873278237, + "grad_norm": 6.19651985168457, + "learning_rate": 6.16416635765041e-06, + "loss": 0.3811, + "step": 3911 + }, + { + "epoch": 0.43107438016528926, + "grad_norm": 7.109768867492676, + "learning_rate": 6.162465901813987e-06, + "loss": 0.4204, + "step": 3912 + }, + { + "epoch": 0.4311845730027548, + "grad_norm": 4.812902450561523, + "learning_rate": 6.160765303829653e-06, + "loss": 0.4346, + "step": 3913 + }, + { + "epoch": 0.43129476584022036, + "grad_norm": 
12.155776023864746, + "learning_rate": 6.1590645639053625e-06, + "loss": 0.4088, + "step": 3914 + }, + { + "epoch": 0.43140495867768597, + "grad_norm": 9.650174140930176, + "learning_rate": 6.157363682249081e-06, + "loss": 0.3858, + "step": 3915 + }, + { + "epoch": 0.4315151515151515, + "grad_norm": 9.240128517150879, + "learning_rate": 6.155662659068797e-06, + "loss": 0.4106, + "step": 3916 + }, + { + "epoch": 0.43162534435261707, + "grad_norm": 5.920962333679199, + "learning_rate": 6.153961494572515e-06, + "loss": 0.4427, + "step": 3917 + }, + { + "epoch": 0.4317355371900826, + "grad_norm": 5.3680620193481445, + "learning_rate": 6.152260188968251e-06, + "loss": 0.3361, + "step": 3918 + }, + { + "epoch": 0.4318457300275482, + "grad_norm": 6.636050224304199, + "learning_rate": 6.150558742464047e-06, + "loss": 0.4551, + "step": 3919 + }, + { + "epoch": 0.4319559228650138, + "grad_norm": 6.805266380310059, + "learning_rate": 6.1488571552679566e-06, + "loss": 0.3813, + "step": 3920 + }, + { + "epoch": 0.4320661157024793, + "grad_norm": 5.0186285972595215, + "learning_rate": 6.147155427588054e-06, + "loss": 0.4109, + "step": 3921 + }, + { + "epoch": 0.43217630853994493, + "grad_norm": 5.153963565826416, + "learning_rate": 6.1454535596324275e-06, + "loss": 0.3662, + "step": 3922 + }, + { + "epoch": 0.4322865013774105, + "grad_norm": 10.273711204528809, + "learning_rate": 6.1437515516091815e-06, + "loss": 0.5663, + "step": 3923 + }, + { + "epoch": 0.43239669421487603, + "grad_norm": 5.787929534912109, + "learning_rate": 6.142049403726445e-06, + "loss": 0.3786, + "step": 3924 + }, + { + "epoch": 0.4325068870523416, + "grad_norm": 5.409564971923828, + "learning_rate": 6.140347116192354e-06, + "loss": 0.4063, + "step": 3925 + }, + { + "epoch": 0.4326170798898072, + "grad_norm": 5.356973648071289, + "learning_rate": 6.138644689215068e-06, + "loss": 0.385, + "step": 3926 + }, + { + "epoch": 0.43272727272727274, + "grad_norm": 4.582985877990723, + "learning_rate": 6.136942123002765e-06, + "loss": 0.3817, + "step": 3927 + }, + { + "epoch": 0.4328374655647383, + "grad_norm": 5.866171360015869, + "learning_rate": 6.135239417763634e-06, + "loss": 0.3633, + "step": 3928 + }, + { + "epoch": 0.43294765840220384, + "grad_norm": 11.208274841308594, + "learning_rate": 6.133536573705885e-06, + "loss": 0.4202, + "step": 3929 + }, + { + "epoch": 0.43305785123966944, + "grad_norm": 7.416287422180176, + "learning_rate": 6.131833591037744e-06, + "loss": 0.4555, + "step": 3930 + }, + { + "epoch": 0.433168044077135, + "grad_norm": 6.879089832305908, + "learning_rate": 6.130130469967453e-06, + "loss": 0.4877, + "step": 3931 + }, + { + "epoch": 0.43327823691460055, + "grad_norm": 6.2986931800842285, + "learning_rate": 6.1284272107032735e-06, + "loss": 0.4587, + "step": 3932 + }, + { + "epoch": 0.4333884297520661, + "grad_norm": 6.2256059646606445, + "learning_rate": 6.126723813453484e-06, + "loss": 0.449, + "step": 3933 + }, + { + "epoch": 0.4334986225895317, + "grad_norm": 6.348203182220459, + "learning_rate": 6.1250202784263725e-06, + "loss": 0.3939, + "step": 3934 + }, + { + "epoch": 0.43360881542699725, + "grad_norm": 5.402763843536377, + "learning_rate": 6.123316605830256e-06, + "loss": 0.4556, + "step": 3935 + }, + { + "epoch": 0.4337190082644628, + "grad_norm": 5.364984512329102, + "learning_rate": 6.121612795873457e-06, + "loss": 0.4256, + "step": 3936 + }, + { + "epoch": 0.43382920110192835, + "grad_norm": 9.048213958740234, + "learning_rate": 6.119908848764323e-06, + "loss": 0.4534, + "step": 3937 + }, + { + 
"epoch": 0.43393939393939396, + "grad_norm": 6.247734069824219, + "learning_rate": 6.118204764711214e-06, + "loss": 0.3265, + "step": 3938 + }, + { + "epoch": 0.4340495867768595, + "grad_norm": 5.927489757537842, + "learning_rate": 6.116500543922507e-06, + "loss": 0.402, + "step": 3939 + }, + { + "epoch": 0.43415977961432506, + "grad_norm": 6.853671073913574, + "learning_rate": 6.1147961866065965e-06, + "loss": 0.3861, + "step": 3940 + }, + { + "epoch": 0.4342699724517906, + "grad_norm": 10.130403518676758, + "learning_rate": 6.113091692971894e-06, + "loss": 0.4626, + "step": 3941 + }, + { + "epoch": 0.4343801652892562, + "grad_norm": 5.313806533813477, + "learning_rate": 6.111387063226828e-06, + "loss": 0.419, + "step": 3942 + }, + { + "epoch": 0.43449035812672177, + "grad_norm": 6.938016891479492, + "learning_rate": 6.109682297579842e-06, + "loss": 0.3902, + "step": 3943 + }, + { + "epoch": 0.4346005509641873, + "grad_norm": 7.420373439788818, + "learning_rate": 6.1079773962393965e-06, + "loss": 0.438, + "step": 3944 + }, + { + "epoch": 0.43471074380165287, + "grad_norm": 10.01765251159668, + "learning_rate": 6.106272359413972e-06, + "loss": 0.4614, + "step": 3945 + }, + { + "epoch": 0.43482093663911847, + "grad_norm": 5.062164306640625, + "learning_rate": 6.104567187312058e-06, + "loss": 0.4132, + "step": 3946 + }, + { + "epoch": 0.434931129476584, + "grad_norm": 5.256128311157227, + "learning_rate": 6.102861880142169e-06, + "loss": 0.455, + "step": 3947 + }, + { + "epoch": 0.43504132231404957, + "grad_norm": 13.423139572143555, + "learning_rate": 6.101156438112832e-06, + "loss": 0.4443, + "step": 3948 + }, + { + "epoch": 0.4351515151515152, + "grad_norm": 5.345340251922607, + "learning_rate": 6.09945086143259e-06, + "loss": 0.4338, + "step": 3949 + }, + { + "epoch": 0.43526170798898073, + "grad_norm": 4.3182477951049805, + "learning_rate": 6.097745150310002e-06, + "loss": 0.3966, + "step": 3950 + }, + { + "epoch": 0.4353719008264463, + "grad_norm": 8.046113967895508, + "learning_rate": 6.096039304953646e-06, + "loss": 0.3909, + "step": 3951 + }, + { + "epoch": 0.43548209366391183, + "grad_norm": 6.949949264526367, + "learning_rate": 6.094333325572116e-06, + "loss": 0.3925, + "step": 3952 + }, + { + "epoch": 0.43559228650137743, + "grad_norm": 8.936685562133789, + "learning_rate": 6.092627212374019e-06, + "loss": 0.4698, + "step": 3953 + }, + { + "epoch": 0.435702479338843, + "grad_norm": 8.110991477966309, + "learning_rate": 6.090920965567983e-06, + "loss": 0.4106, + "step": 3954 + }, + { + "epoch": 0.43581267217630854, + "grad_norm": 5.9730682373046875, + "learning_rate": 6.08921458536265e-06, + "loss": 0.3644, + "step": 3955 + }, + { + "epoch": 0.4359228650137741, + "grad_norm": 6.58652400970459, + "learning_rate": 6.087508071966678e-06, + "loss": 0.409, + "step": 3956 + }, + { + "epoch": 0.4360330578512397, + "grad_norm": 6.210163116455078, + "learning_rate": 6.085801425588741e-06, + "loss": 0.378, + "step": 3957 + }, + { + "epoch": 0.43614325068870524, + "grad_norm": 8.399636268615723, + "learning_rate": 6.084094646437531e-06, + "loss": 0.4283, + "step": 3958 + }, + { + "epoch": 0.4362534435261708, + "grad_norm": 4.2696404457092285, + "learning_rate": 6.082387734721755e-06, + "loss": 0.3491, + "step": 3959 + }, + { + "epoch": 0.43636363636363634, + "grad_norm": 5.721973419189453, + "learning_rate": 6.080680690650136e-06, + "loss": 0.4111, + "step": 3960 + }, + { + "epoch": 0.43647382920110195, + "grad_norm": 8.701297760009766, + "learning_rate": 6.078973514431415e-06, + "loss": 
0.4477, + "step": 3961 + }, + { + "epoch": 0.4365840220385675, + "grad_norm": 6.38549280166626, + "learning_rate": 6.077266206274346e-06, + "loss": 0.4517, + "step": 3962 + }, + { + "epoch": 0.43669421487603305, + "grad_norm": 9.18921947479248, + "learning_rate": 6.075558766387704e-06, + "loss": 0.4909, + "step": 3963 + }, + { + "epoch": 0.4368044077134986, + "grad_norm": 4.652480125427246, + "learning_rate": 6.073851194980274e-06, + "loss": 0.3808, + "step": 3964 + }, + { + "epoch": 0.4369146005509642, + "grad_norm": 7.874028205871582, + "learning_rate": 6.07214349226086e-06, + "loss": 0.394, + "step": 3965 + }, + { + "epoch": 0.43702479338842976, + "grad_norm": 4.699254989624023, + "learning_rate": 6.070435658438285e-06, + "loss": 0.4416, + "step": 3966 + }, + { + "epoch": 0.4371349862258953, + "grad_norm": 5.5780487060546875, + "learning_rate": 6.068727693721384e-06, + "loss": 0.4461, + "step": 3967 + }, + { + "epoch": 0.43724517906336086, + "grad_norm": 6.561910152435303, + "learning_rate": 6.067019598319007e-06, + "loss": 0.4214, + "step": 3968 + }, + { + "epoch": 0.43735537190082646, + "grad_norm": 6.581684589385986, + "learning_rate": 6.065311372440025e-06, + "loss": 0.3745, + "step": 3969 + }, + { + "epoch": 0.437465564738292, + "grad_norm": 14.880142211914062, + "learning_rate": 6.063603016293321e-06, + "loss": 0.4787, + "step": 3970 + }, + { + "epoch": 0.43757575757575756, + "grad_norm": 6.389845848083496, + "learning_rate": 6.0618945300877964e-06, + "loss": 0.4239, + "step": 3971 + }, + { + "epoch": 0.43768595041322317, + "grad_norm": 7.486594200134277, + "learning_rate": 6.060185914032365e-06, + "loss": 0.4018, + "step": 3972 + }, + { + "epoch": 0.4377961432506887, + "grad_norm": 21.0837459564209, + "learning_rate": 6.058477168335961e-06, + "loss": 0.521, + "step": 3973 + }, + { + "epoch": 0.43790633608815427, + "grad_norm": 13.081880569458008, + "learning_rate": 6.05676829320753e-06, + "loss": 0.398, + "step": 3974 + }, + { + "epoch": 0.4380165289256198, + "grad_norm": 4.662603378295898, + "learning_rate": 6.0550592888560365e-06, + "loss": 0.4188, + "step": 3975 + }, + { + "epoch": 0.4381267217630854, + "grad_norm": 5.609738826751709, + "learning_rate": 6.053350155490462e-06, + "loss": 0.4515, + "step": 3976 + }, + { + "epoch": 0.438236914600551, + "grad_norm": 8.3709135055542, + "learning_rate": 6.051640893319798e-06, + "loss": 0.4255, + "step": 3977 + }, + { + "epoch": 0.4383471074380165, + "grad_norm": 7.281825542449951, + "learning_rate": 6.049931502553058e-06, + "loss": 0.4306, + "step": 3978 + }, + { + "epoch": 0.4384573002754821, + "grad_norm": 3.8517487049102783, + "learning_rate": 6.0482219833992665e-06, + "loss": 0.3779, + "step": 3979 + }, + { + "epoch": 0.4385674931129477, + "grad_norm": 5.827889919281006, + "learning_rate": 6.046512336067467e-06, + "loss": 0.3965, + "step": 3980 + }, + { + "epoch": 0.43867768595041323, + "grad_norm": 13.511173248291016, + "learning_rate": 6.044802560766718e-06, + "loss": 0.5418, + "step": 3981 + }, + { + "epoch": 0.4387878787878788, + "grad_norm": 7.127382278442383, + "learning_rate": 6.043092657706092e-06, + "loss": 0.4385, + "step": 3982 + }, + { + "epoch": 0.43889807162534433, + "grad_norm": 5.266600608825684, + "learning_rate": 6.0413826270946806e-06, + "loss": 0.4144, + "step": 3983 + }, + { + "epoch": 0.43900826446280994, + "grad_norm": 12.07368278503418, + "learning_rate": 6.0396724691415866e-06, + "loss": 0.5229, + "step": 3984 + }, + { + "epoch": 0.4391184573002755, + "grad_norm": 7.233647346496582, + "learning_rate": 
6.037962184055928e-06, + "loss": 0.4541, + "step": 3985 + }, + { + "epoch": 0.43922865013774104, + "grad_norm": 10.408295631408691, + "learning_rate": 6.036251772046847e-06, + "loss": 0.4293, + "step": 3986 + }, + { + "epoch": 0.4393388429752066, + "grad_norm": 5.630067348480225, + "learning_rate": 6.034541233323491e-06, + "loss": 0.4029, + "step": 3987 + }, + { + "epoch": 0.4394490358126722, + "grad_norm": 5.803105354309082, + "learning_rate": 6.032830568095027e-06, + "loss": 0.4252, + "step": 3988 + }, + { + "epoch": 0.43955922865013775, + "grad_norm": 9.329225540161133, + "learning_rate": 6.031119776570639e-06, + "loss": 0.3607, + "step": 3989 + }, + { + "epoch": 0.4396694214876033, + "grad_norm": 6.282469272613525, + "learning_rate": 6.029408858959522e-06, + "loss": 0.4765, + "step": 3990 + }, + { + "epoch": 0.43977961432506885, + "grad_norm": 5.0228424072265625, + "learning_rate": 6.0276978154708945e-06, + "loss": 0.4113, + "step": 3991 + }, + { + "epoch": 0.43988980716253445, + "grad_norm": 6.176638126373291, + "learning_rate": 6.0259866463139795e-06, + "loss": 0.4589, + "step": 3992 + }, + { + "epoch": 0.44, + "grad_norm": 5.363448143005371, + "learning_rate": 6.024275351698024e-06, + "loss": 0.4199, + "step": 3993 + }, + { + "epoch": 0.44011019283746555, + "grad_norm": 5.49215841293335, + "learning_rate": 6.022563931832289e-06, + "loss": 0.3592, + "step": 3994 + }, + { + "epoch": 0.4402203856749311, + "grad_norm": 5.0736083984375, + "learning_rate": 6.020852386926046e-06, + "loss": 0.3744, + "step": 3995 + }, + { + "epoch": 0.4403305785123967, + "grad_norm": 6.1613850593566895, + "learning_rate": 6.0191407171885875e-06, + "loss": 0.4618, + "step": 3996 + }, + { + "epoch": 0.44044077134986226, + "grad_norm": 6.350560188293457, + "learning_rate": 6.017428922829216e-06, + "loss": 0.3987, + "step": 3997 + }, + { + "epoch": 0.4405509641873278, + "grad_norm": 6.247224807739258, + "learning_rate": 6.0157170040572545e-06, + "loss": 0.4572, + "step": 3998 + }, + { + "epoch": 0.4406611570247934, + "grad_norm": 6.238313674926758, + "learning_rate": 6.0140049610820386e-06, + "loss": 0.3761, + "step": 3999 + }, + { + "epoch": 0.44077134986225897, + "grad_norm": 8.729419708251953, + "learning_rate": 6.012292794112917e-06, + "loss": 0.4835, + "step": 4000 + }, + { + "epoch": 0.4408815426997245, + "grad_norm": 4.114143371582031, + "learning_rate": 6.01058050335926e-06, + "loss": 0.4602, + "step": 4001 + }, + { + "epoch": 0.44099173553719007, + "grad_norm": 6.828856945037842, + "learning_rate": 6.008868089030445e-06, + "loss": 0.4266, + "step": 4002 + }, + { + "epoch": 0.44110192837465567, + "grad_norm": 5.825451374053955, + "learning_rate": 6.007155551335869e-06, + "loss": 0.3877, + "step": 4003 + }, + { + "epoch": 0.4412121212121212, + "grad_norm": 19.42027473449707, + "learning_rate": 6.005442890484945e-06, + "loss": 0.5371, + "step": 4004 + }, + { + "epoch": 0.4413223140495868, + "grad_norm": 5.937627792358398, + "learning_rate": 6.003730106687099e-06, + "loss": 0.325, + "step": 4005 + }, + { + "epoch": 0.4414325068870523, + "grad_norm": 6.736237525939941, + "learning_rate": 6.0020172001517705e-06, + "loss": 0.3434, + "step": 4006 + }, + { + "epoch": 0.44154269972451793, + "grad_norm": 5.677096843719482, + "learning_rate": 6.00030417108842e-06, + "loss": 0.4118, + "step": 4007 + }, + { + "epoch": 0.4416528925619835, + "grad_norm": 4.171231269836426, + "learning_rate": 5.9985910197065154e-06, + "loss": 0.3589, + "step": 4008 + }, + { + "epoch": 0.44176308539944903, + "grad_norm": 
8.049046516418457, + "learning_rate": 5.996877746215545e-06, + "loss": 0.4109, + "step": 4009 + }, + { + "epoch": 0.4418732782369146, + "grad_norm": 11.192566871643066, + "learning_rate": 5.995164350825008e-06, + "loss": 0.4301, + "step": 4010 + }, + { + "epoch": 0.4419834710743802, + "grad_norm": 8.543044090270996, + "learning_rate": 5.993450833744424e-06, + "loss": 0.4741, + "step": 4011 + }, + { + "epoch": 0.44209366391184574, + "grad_norm": 7.122586727142334, + "learning_rate": 5.991737195183323e-06, + "loss": 0.3592, + "step": 4012 + }, + { + "epoch": 0.4422038567493113, + "grad_norm": 10.358314514160156, + "learning_rate": 5.990023435351249e-06, + "loss": 0.4666, + "step": 4013 + }, + { + "epoch": 0.44231404958677684, + "grad_norm": 9.374655723571777, + "learning_rate": 5.988309554457765e-06, + "loss": 0.4063, + "step": 4014 + }, + { + "epoch": 0.44242424242424244, + "grad_norm": 12.979504585266113, + "learning_rate": 5.9865955527124466e-06, + "loss": 0.5649, + "step": 4015 + }, + { + "epoch": 0.442534435261708, + "grad_norm": 5.430773735046387, + "learning_rate": 5.984881430324883e-06, + "loss": 0.4055, + "step": 4016 + }, + { + "epoch": 0.44264462809917354, + "grad_norm": 5.780434608459473, + "learning_rate": 5.983167187504681e-06, + "loss": 0.3895, + "step": 4017 + }, + { + "epoch": 0.4427548209366391, + "grad_norm": 8.516437530517578, + "learning_rate": 5.98145282446146e-06, + "loss": 0.3959, + "step": 4018 + }, + { + "epoch": 0.4428650137741047, + "grad_norm": 5.609032154083252, + "learning_rate": 5.9797383414048535e-06, + "loss": 0.4287, + "step": 4019 + }, + { + "epoch": 0.44297520661157025, + "grad_norm": 8.30771255493164, + "learning_rate": 5.978023738544514e-06, + "loss": 0.4973, + "step": 4020 + }, + { + "epoch": 0.4430853994490358, + "grad_norm": 5.688619136810303, + "learning_rate": 5.9763090160901e-06, + "loss": 0.4544, + "step": 4021 + }, + { + "epoch": 0.44319559228650135, + "grad_norm": 4.770142078399658, + "learning_rate": 5.974594174251297e-06, + "loss": 0.3571, + "step": 4022 + }, + { + "epoch": 0.44330578512396696, + "grad_norm": 11.235660552978516, + "learning_rate": 5.972879213237791e-06, + "loss": 0.48, + "step": 4023 + }, + { + "epoch": 0.4434159779614325, + "grad_norm": 5.956013202667236, + "learning_rate": 5.971164133259295e-06, + "loss": 0.4469, + "step": 4024 + }, + { + "epoch": 0.44352617079889806, + "grad_norm": 6.475219249725342, + "learning_rate": 5.96944893452553e-06, + "loss": 0.4272, + "step": 4025 + }, + { + "epoch": 0.44363636363636366, + "grad_norm": 5.644810676574707, + "learning_rate": 5.9677336172462316e-06, + "loss": 0.4083, + "step": 4026 + }, + { + "epoch": 0.4437465564738292, + "grad_norm": 8.617670059204102, + "learning_rate": 5.966018181631152e-06, + "loss": 0.4128, + "step": 4027 + }, + { + "epoch": 0.44385674931129476, + "grad_norm": 7.4977593421936035, + "learning_rate": 5.964302627890057e-06, + "loss": 0.4083, + "step": 4028 + }, + { + "epoch": 0.4439669421487603, + "grad_norm": 6.7644853591918945, + "learning_rate": 5.962586956232727e-06, + "loss": 0.4154, + "step": 4029 + }, + { + "epoch": 0.4440771349862259, + "grad_norm": 6.908641338348389, + "learning_rate": 5.9608711668689565e-06, + "loss": 0.4706, + "step": 4030 + }, + { + "epoch": 0.44418732782369147, + "grad_norm": 8.456871032714844, + "learning_rate": 5.959155260008554e-06, + "loss": 0.4842, + "step": 4031 + }, + { + "epoch": 0.444297520661157, + "grad_norm": 9.610672950744629, + "learning_rate": 5.9574392358613445e-06, + "loss": 0.3758, + "step": 4032 + }, + { + "epoch": 
0.44440771349862257, + "grad_norm": 13.6271333694458, + "learning_rate": 5.955723094637163e-06, + "loss": 0.5054, + "step": 4033 + }, + { + "epoch": 0.4445179063360882, + "grad_norm": 5.952396869659424, + "learning_rate": 5.954006836545864e-06, + "loss": 0.3694, + "step": 4034 + }, + { + "epoch": 0.4446280991735537, + "grad_norm": 5.047083854675293, + "learning_rate": 5.952290461797314e-06, + "loss": 0.4463, + "step": 4035 + }, + { + "epoch": 0.4447382920110193, + "grad_norm": 6.901847839355469, + "learning_rate": 5.950573970601392e-06, + "loss": 0.3828, + "step": 4036 + }, + { + "epoch": 0.4448484848484848, + "grad_norm": 4.849525451660156, + "learning_rate": 5.948857363167995e-06, + "loss": 0.413, + "step": 4037 + }, + { + "epoch": 0.44495867768595043, + "grad_norm": 7.4211745262146, + "learning_rate": 5.9471406397070285e-06, + "loss": 0.3549, + "step": 4038 + }, + { + "epoch": 0.445068870523416, + "grad_norm": 6.657160758972168, + "learning_rate": 5.945423800428419e-06, + "loss": 0.4122, + "step": 4039 + }, + { + "epoch": 0.44517906336088153, + "grad_norm": 4.9380388259887695, + "learning_rate": 5.943706845542103e-06, + "loss": 0.4272, + "step": 4040 + }, + { + "epoch": 0.4452892561983471, + "grad_norm": 6.010152339935303, + "learning_rate": 5.941989775258032e-06, + "loss": 0.3934, + "step": 4041 + }, + { + "epoch": 0.4453994490358127, + "grad_norm": 6.57681131362915, + "learning_rate": 5.940272589786172e-06, + "loss": 0.4489, + "step": 4042 + }, + { + "epoch": 0.44550964187327824, + "grad_norm": 8.034668922424316, + "learning_rate": 5.938555289336503e-06, + "loss": 0.4549, + "step": 4043 + }, + { + "epoch": 0.4456198347107438, + "grad_norm": 6.683331489562988, + "learning_rate": 5.936837874119017e-06, + "loss": 0.3653, + "step": 4044 + }, + { + "epoch": 0.44573002754820934, + "grad_norm": 7.716479301452637, + "learning_rate": 5.935120344343724e-06, + "loss": 0.437, + "step": 4045 + }, + { + "epoch": 0.44584022038567495, + "grad_norm": 6.004525661468506, + "learning_rate": 5.933402700220645e-06, + "loss": 0.4325, + "step": 4046 + }, + { + "epoch": 0.4459504132231405, + "grad_norm": 7.189431667327881, + "learning_rate": 5.931684941959814e-06, + "loss": 0.4682, + "step": 4047 + }, + { + "epoch": 0.44606060606060605, + "grad_norm": 6.102496147155762, + "learning_rate": 5.929967069771285e-06, + "loss": 0.43, + "step": 4048 + }, + { + "epoch": 0.44617079889807165, + "grad_norm": 5.6079511642456055, + "learning_rate": 5.9282490838651185e-06, + "loss": 0.4937, + "step": 4049 + }, + { + "epoch": 0.4462809917355372, + "grad_norm": 7.230101585388184, + "learning_rate": 5.926530984451395e-06, + "loss": 0.4118, + "step": 4050 + }, + { + "epoch": 0.44639118457300275, + "grad_norm": 5.412363529205322, + "learning_rate": 5.924812771740201e-06, + "loss": 0.4016, + "step": 4051 + }, + { + "epoch": 0.4465013774104683, + "grad_norm": 7.132895469665527, + "learning_rate": 5.9230944459416475e-06, + "loss": 0.4153, + "step": 4052 + }, + { + "epoch": 0.4466115702479339, + "grad_norm": 5.968504428863525, + "learning_rate": 5.921376007265851e-06, + "loss": 0.3692, + "step": 4053 + }, + { + "epoch": 0.44672176308539946, + "grad_norm": 7.628453254699707, + "learning_rate": 5.919657455922944e-06, + "loss": 0.4621, + "step": 4054 + }, + { + "epoch": 0.446831955922865, + "grad_norm": 8.419864654541016, + "learning_rate": 5.9179387921230745e-06, + "loss": 0.4084, + "step": 4055 + }, + { + "epoch": 0.44694214876033056, + "grad_norm": 7.555700778961182, + "learning_rate": 5.9162200160764015e-06, + "loss": 0.477, + 
"step": 4056 + }, + { + "epoch": 0.44705234159779617, + "grad_norm": 6.322028636932373, + "learning_rate": 5.914501127993102e-06, + "loss": 0.3713, + "step": 4057 + }, + { + "epoch": 0.4471625344352617, + "grad_norm": 6.326248645782471, + "learning_rate": 5.912782128083361e-06, + "loss": 0.4339, + "step": 4058 + }, + { + "epoch": 0.44727272727272727, + "grad_norm": 6.034976959228516, + "learning_rate": 5.911063016557381e-06, + "loss": 0.4335, + "step": 4059 + }, + { + "epoch": 0.4473829201101928, + "grad_norm": 8.301700592041016, + "learning_rate": 5.909343793625379e-06, + "loss": 0.3955, + "step": 4060 + }, + { + "epoch": 0.4474931129476584, + "grad_norm": 8.64501667022705, + "learning_rate": 5.907624459497584e-06, + "loss": 0.4309, + "step": 4061 + }, + { + "epoch": 0.447603305785124, + "grad_norm": 4.960073471069336, + "learning_rate": 5.905905014384235e-06, + "loss": 0.4167, + "step": 4062 + }, + { + "epoch": 0.4477134986225895, + "grad_norm": 10.823214530944824, + "learning_rate": 5.904185458495592e-06, + "loss": 0.4695, + "step": 4063 + }, + { + "epoch": 0.4478236914600551, + "grad_norm": 4.7425408363342285, + "learning_rate": 5.902465792041922e-06, + "loss": 0.4547, + "step": 4064 + }, + { + "epoch": 0.4479338842975207, + "grad_norm": 8.464041709899902, + "learning_rate": 5.900746015233507e-06, + "loss": 0.5264, + "step": 4065 + }, + { + "epoch": 0.44804407713498623, + "grad_norm": 9.00203800201416, + "learning_rate": 5.89902612828065e-06, + "loss": 0.3918, + "step": 4066 + }, + { + "epoch": 0.4481542699724518, + "grad_norm": 6.427272796630859, + "learning_rate": 5.897306131393654e-06, + "loss": 0.4185, + "step": 4067 + }, + { + "epoch": 0.44826446280991733, + "grad_norm": 5.438120365142822, + "learning_rate": 5.8955860247828465e-06, + "loss": 0.4098, + "step": 4068 + }, + { + "epoch": 0.44837465564738294, + "grad_norm": 3.8153905868530273, + "learning_rate": 5.893865808658562e-06, + "loss": 0.4449, + "step": 4069 + }, + { + "epoch": 0.4484848484848485, + "grad_norm": 6.233038902282715, + "learning_rate": 5.892145483231153e-06, + "loss": 0.3843, + "step": 4070 + }, + { + "epoch": 0.44859504132231404, + "grad_norm": 6.0744709968566895, + "learning_rate": 5.890425048710982e-06, + "loss": 0.4148, + "step": 4071 + }, + { + "epoch": 0.4487052341597796, + "grad_norm": 8.636625289916992, + "learning_rate": 5.8887045053084265e-06, + "loss": 0.4608, + "step": 4072 + }, + { + "epoch": 0.4488154269972452, + "grad_norm": 4.762270927429199, + "learning_rate": 5.886983853233879e-06, + "loss": 0.3427, + "step": 4073 + }, + { + "epoch": 0.44892561983471074, + "grad_norm": 6.674958229064941, + "learning_rate": 5.88526309269774e-06, + "loss": 0.4495, + "step": 4074 + }, + { + "epoch": 0.4490358126721763, + "grad_norm": 5.687108516693115, + "learning_rate": 5.883542223910426e-06, + "loss": 0.4071, + "step": 4075 + }, + { + "epoch": 0.4491460055096419, + "grad_norm": 8.375948905944824, + "learning_rate": 5.8818212470823696e-06, + "loss": 0.4192, + "step": 4076 + }, + { + "epoch": 0.44925619834710745, + "grad_norm": 4.817061901092529, + "learning_rate": 5.880100162424013e-06, + "loss": 0.3792, + "step": 4077 + }, + { + "epoch": 0.449366391184573, + "grad_norm": 5.390269756317139, + "learning_rate": 5.878378970145813e-06, + "loss": 0.3867, + "step": 4078 + }, + { + "epoch": 0.44947658402203855, + "grad_norm": 8.200571060180664, + "learning_rate": 5.87665767045824e-06, + "loss": 0.4738, + "step": 4079 + }, + { + "epoch": 0.44958677685950416, + "grad_norm": 8.864215850830078, + "learning_rate": 
5.874936263571775e-06, + "loss": 0.4256, + "step": 4080 + }, + { + "epoch": 0.4496969696969697, + "grad_norm": 8.43858814239502, + "learning_rate": 5.873214749696918e-06, + "loss": 0.4767, + "step": 4081 + }, + { + "epoch": 0.44980716253443526, + "grad_norm": 7.963834762573242, + "learning_rate": 5.871493129044172e-06, + "loss": 0.4067, + "step": 4082 + }, + { + "epoch": 0.4499173553719008, + "grad_norm": 8.85851001739502, + "learning_rate": 5.869771401824065e-06, + "loss": 0.4339, + "step": 4083 + }, + { + "epoch": 0.4500275482093664, + "grad_norm": 5.0043110847473145, + "learning_rate": 5.868049568247128e-06, + "loss": 0.408, + "step": 4084 + }, + { + "epoch": 0.45013774104683196, + "grad_norm": 10.033919334411621, + "learning_rate": 5.866327628523911e-06, + "loss": 0.4837, + "step": 4085 + }, + { + "epoch": 0.4502479338842975, + "grad_norm": 6.435369491577148, + "learning_rate": 5.864605582864975e-06, + "loss": 0.4075, + "step": 4086 + }, + { + "epoch": 0.45035812672176306, + "grad_norm": 8.440646171569824, + "learning_rate": 5.862883431480894e-06, + "loss": 0.4751, + "step": 4087 + }, + { + "epoch": 0.45046831955922867, + "grad_norm": 6.353564739227295, + "learning_rate": 5.861161174582254e-06, + "loss": 0.5333, + "step": 4088 + }, + { + "epoch": 0.4505785123966942, + "grad_norm": 7.430014610290527, + "learning_rate": 5.859438812379656e-06, + "loss": 0.4208, + "step": 4089 + }, + { + "epoch": 0.45068870523415977, + "grad_norm": 5.450559139251709, + "learning_rate": 5.857716345083712e-06, + "loss": 0.4442, + "step": 4090 + }, + { + "epoch": 0.4507988980716253, + "grad_norm": 7.066483497619629, + "learning_rate": 5.855993772905051e-06, + "loss": 0.3885, + "step": 4091 + }, + { + "epoch": 0.4509090909090909, + "grad_norm": 7.168652057647705, + "learning_rate": 5.854271096054307e-06, + "loss": 0.5197, + "step": 4092 + }, + { + "epoch": 0.4510192837465565, + "grad_norm": 6.303380966186523, + "learning_rate": 5.852548314742131e-06, + "loss": 0.4214, + "step": 4093 + }, + { + "epoch": 0.451129476584022, + "grad_norm": 5.621610164642334, + "learning_rate": 5.850825429179192e-06, + "loss": 0.3097, + "step": 4094 + }, + { + "epoch": 0.4512396694214876, + "grad_norm": 6.062560558319092, + "learning_rate": 5.849102439576163e-06, + "loss": 0.4208, + "step": 4095 + }, + { + "epoch": 0.4513498622589532, + "grad_norm": 7.787949085235596, + "learning_rate": 5.847379346143734e-06, + "loss": 0.5181, + "step": 4096 + }, + { + "epoch": 0.45146005509641873, + "grad_norm": 6.1522297859191895, + "learning_rate": 5.845656149092607e-06, + "loss": 0.3853, + "step": 4097 + }, + { + "epoch": 0.4515702479338843, + "grad_norm": 5.82921838760376, + "learning_rate": 5.843932848633497e-06, + "loss": 0.4241, + "step": 4098 + }, + { + "epoch": 0.4516804407713499, + "grad_norm": 4.390358924865723, + "learning_rate": 5.8422094449771335e-06, + "loss": 0.4511, + "step": 4099 + }, + { + "epoch": 0.45179063360881544, + "grad_norm": 5.280516147613525, + "learning_rate": 5.8404859383342534e-06, + "loss": 0.3394, + "step": 4100 + }, + { + "epoch": 0.451900826446281, + "grad_norm": 4.667827606201172, + "learning_rate": 5.838762328915613e-06, + "loss": 0.4015, + "step": 4101 + }, + { + "epoch": 0.45201101928374654, + "grad_norm": 5.000226974487305, + "learning_rate": 5.837038616931975e-06, + "loss": 0.3944, + "step": 4102 + }, + { + "epoch": 0.45212121212121215, + "grad_norm": 4.7028045654296875, + "learning_rate": 5.8353148025941165e-06, + "loss": 0.4614, + "step": 4103 + }, + { + "epoch": 0.4522314049586777, + "grad_norm": 
12.146053314208984, + "learning_rate": 5.833590886112831e-06, + "loss": 0.3995, + "step": 4104 + }, + { + "epoch": 0.45234159779614325, + "grad_norm": 7.792434215545654, + "learning_rate": 5.831866867698918e-06, + "loss": 0.4203, + "step": 4105 + }, + { + "epoch": 0.4524517906336088, + "grad_norm": 9.011277198791504, + "learning_rate": 5.830142747563195e-06, + "loss": 0.4962, + "step": 4106 + }, + { + "epoch": 0.4525619834710744, + "grad_norm": 8.330543518066406, + "learning_rate": 5.828418525916491e-06, + "loss": 0.3836, + "step": 4107 + }, + { + "epoch": 0.45267217630853995, + "grad_norm": 5.687896251678467, + "learning_rate": 5.826694202969641e-06, + "loss": 0.3428, + "step": 4108 + }, + { + "epoch": 0.4527823691460055, + "grad_norm": 6.441923141479492, + "learning_rate": 5.824969778933504e-06, + "loss": 0.4435, + "step": 4109 + }, + { + "epoch": 0.45289256198347105, + "grad_norm": 5.4831438064575195, + "learning_rate": 5.823245254018941e-06, + "loss": 0.3995, + "step": 4110 + }, + { + "epoch": 0.45300275482093666, + "grad_norm": 8.866961479187012, + "learning_rate": 5.82152062843683e-06, + "loss": 0.4861, + "step": 4111 + }, + { + "epoch": 0.4531129476584022, + "grad_norm": 5.998145580291748, + "learning_rate": 5.81979590239806e-06, + "loss": 0.4596, + "step": 4112 + }, + { + "epoch": 0.45322314049586776, + "grad_norm": 6.52558708190918, + "learning_rate": 5.818071076113534e-06, + "loss": 0.3853, + "step": 4113 + }, + { + "epoch": 0.4533333333333333, + "grad_norm": 4.526134014129639, + "learning_rate": 5.8163461497941655e-06, + "loss": 0.3947, + "step": 4114 + }, + { + "epoch": 0.4534435261707989, + "grad_norm": 11.384407043457031, + "learning_rate": 5.8146211236508794e-06, + "loss": 0.4874, + "step": 4115 + }, + { + "epoch": 0.45355371900826447, + "grad_norm": 9.417316436767578, + "learning_rate": 5.812895997894617e-06, + "loss": 0.4166, + "step": 4116 + }, + { + "epoch": 0.45366391184573, + "grad_norm": 9.920056343078613, + "learning_rate": 5.811170772736329e-06, + "loss": 0.3421, + "step": 4117 + }, + { + "epoch": 0.45377410468319557, + "grad_norm": 22.426549911499023, + "learning_rate": 5.809445448386976e-06, + "loss": 0.5263, + "step": 4118 + }, + { + "epoch": 0.4538842975206612, + "grad_norm": 6.549997806549072, + "learning_rate": 5.8077200250575334e-06, + "loss": 0.3492, + "step": 4119 + }, + { + "epoch": 0.4539944903581267, + "grad_norm": 8.752823829650879, + "learning_rate": 5.80599450295899e-06, + "loss": 0.4303, + "step": 4120 + }, + { + "epoch": 0.4541046831955923, + "grad_norm": 9.733783721923828, + "learning_rate": 5.804268882302343e-06, + "loss": 0.3882, + "step": 4121 + }, + { + "epoch": 0.4542148760330578, + "grad_norm": 4.425025939941406, + "learning_rate": 5.802543163298605e-06, + "loss": 0.3804, + "step": 4122 + }, + { + "epoch": 0.45432506887052343, + "grad_norm": 8.211994171142578, + "learning_rate": 5.800817346158799e-06, + "loss": 0.4269, + "step": 4123 + }, + { + "epoch": 0.454435261707989, + "grad_norm": 9.320842742919922, + "learning_rate": 5.7990914310939605e-06, + "loss": 0.4989, + "step": 4124 + }, + { + "epoch": 0.45454545454545453, + "grad_norm": 5.90273904800415, + "learning_rate": 5.7973654183151355e-06, + "loss": 0.3813, + "step": 4125 + }, + { + "epoch": 0.45465564738292014, + "grad_norm": 6.839073181152344, + "learning_rate": 5.795639308033383e-06, + "loss": 0.4971, + "step": 4126 + }, + { + "epoch": 0.4547658402203857, + "grad_norm": 4.444808483123779, + "learning_rate": 5.793913100459778e-06, + "loss": 0.4208, + "step": 4127 + }, + { + "epoch": 
0.45487603305785124, + "grad_norm": 8.782885551452637, + "learning_rate": 5.792186795805399e-06, + "loss": 0.4249, + "step": 4128 + }, + { + "epoch": 0.4549862258953168, + "grad_norm": 4.603721618652344, + "learning_rate": 5.790460394281343e-06, + "loss": 0.4146, + "step": 4129 + }, + { + "epoch": 0.4550964187327824, + "grad_norm": 9.251736640930176, + "learning_rate": 5.788733896098716e-06, + "loss": 0.4833, + "step": 4130 + }, + { + "epoch": 0.45520661157024794, + "grad_norm": 4.481989860534668, + "learning_rate": 5.787007301468637e-06, + "loss": 0.4099, + "step": 4131 + }, + { + "epoch": 0.4553168044077135, + "grad_norm": 5.976316928863525, + "learning_rate": 5.7852806106022354e-06, + "loss": 0.4435, + "step": 4132 + }, + { + "epoch": 0.45542699724517904, + "grad_norm": 5.873013019561768, + "learning_rate": 5.783553823710654e-06, + "loss": 0.4555, + "step": 4133 + }, + { + "epoch": 0.45553719008264465, + "grad_norm": 7.329885482788086, + "learning_rate": 5.781826941005048e-06, + "loss": 0.479, + "step": 4134 + }, + { + "epoch": 0.4556473829201102, + "grad_norm": 7.070761680603027, + "learning_rate": 5.78009996269658e-06, + "loss": 0.4293, + "step": 4135 + }, + { + "epoch": 0.45575757575757575, + "grad_norm": 8.647218704223633, + "learning_rate": 5.77837288899643e-06, + "loss": 0.4293, + "step": 4136 + }, + { + "epoch": 0.4558677685950413, + "grad_norm": 6.377957820892334, + "learning_rate": 5.776645720115787e-06, + "loss": 0.4042, + "step": 4137 + }, + { + "epoch": 0.4559779614325069, + "grad_norm": 5.826206207275391, + "learning_rate": 5.774918456265848e-06, + "loss": 0.3931, + "step": 4138 + }, + { + "epoch": 0.45608815426997246, + "grad_norm": 6.398508548736572, + "learning_rate": 5.773191097657827e-06, + "loss": 0.4291, + "step": 4139 + }, + { + "epoch": 0.456198347107438, + "grad_norm": 3.5530846118927, + "learning_rate": 5.771463644502951e-06, + "loss": 0.3721, + "step": 4140 + }, + { + "epoch": 0.45630853994490356, + "grad_norm": 7.899833679199219, + "learning_rate": 5.769736097012451e-06, + "loss": 0.3802, + "step": 4141 + }, + { + "epoch": 0.45641873278236916, + "grad_norm": 7.081460952758789, + "learning_rate": 5.7680084553975765e-06, + "loss": 0.3913, + "step": 4142 + }, + { + "epoch": 0.4565289256198347, + "grad_norm": 7.769568920135498, + "learning_rate": 5.766280719869584e-06, + "loss": 0.3456, + "step": 4143 + }, + { + "epoch": 0.45663911845730026, + "grad_norm": 7.003978252410889, + "learning_rate": 5.764552890639744e-06, + "loss": 0.4205, + "step": 4144 + }, + { + "epoch": 0.4567493112947658, + "grad_norm": 8.077564239501953, + "learning_rate": 5.76282496791934e-06, + "loss": 0.4132, + "step": 4145 + }, + { + "epoch": 0.4568595041322314, + "grad_norm": 11.824240684509277, + "learning_rate": 5.7610969519196595e-06, + "loss": 0.4437, + "step": 4146 + }, + { + "epoch": 0.45696969696969697, + "grad_norm": 12.949878692626953, + "learning_rate": 5.7593688428520115e-06, + "loss": 0.5133, + "step": 4147 + }, + { + "epoch": 0.4570798898071625, + "grad_norm": 4.608225345611572, + "learning_rate": 5.757640640927711e-06, + "loss": 0.3576, + "step": 4148 + }, + { + "epoch": 0.4571900826446281, + "grad_norm": 6.741049766540527, + "learning_rate": 5.755912346358081e-06, + "loss": 0.4643, + "step": 4149 + }, + { + "epoch": 0.4573002754820937, + "grad_norm": 4.472276210784912, + "learning_rate": 5.7541839593544645e-06, + "loss": 0.4487, + "step": 4150 + }, + { + "epoch": 0.4574104683195592, + "grad_norm": 7.069968223571777, + "learning_rate": 5.752455480128209e-06, + "loss": 0.4236, + 
"step": 4151 + }, + { + "epoch": 0.4575206611570248, + "grad_norm": 7.472801685333252, + "learning_rate": 5.750726908890675e-06, + "loss": 0.4297, + "step": 4152 + }, + { + "epoch": 0.4576308539944904, + "grad_norm": 6.52254056930542, + "learning_rate": 5.748998245853235e-06, + "loss": 0.4283, + "step": 4153 + }, + { + "epoch": 0.45774104683195593, + "grad_norm": 5.164422512054443, + "learning_rate": 5.747269491227271e-06, + "loss": 0.419, + "step": 4154 + }, + { + "epoch": 0.4578512396694215, + "grad_norm": 10.188724517822266, + "learning_rate": 5.74554064522418e-06, + "loss": 0.4748, + "step": 4155 + }, + { + "epoch": 0.45796143250688703, + "grad_norm": 9.821293830871582, + "learning_rate": 5.743811708055364e-06, + "loss": 0.3994, + "step": 4156 + }, + { + "epoch": 0.45807162534435264, + "grad_norm": 4.618627071380615, + "learning_rate": 5.7420826799322445e-06, + "loss": 0.4188, + "step": 4157 + }, + { + "epoch": 0.4581818181818182, + "grad_norm": 6.881746768951416, + "learning_rate": 5.740353561066246e-06, + "loss": 0.5055, + "step": 4158 + }, + { + "epoch": 0.45829201101928374, + "grad_norm": 5.866497993469238, + "learning_rate": 5.738624351668808e-06, + "loss": 0.415, + "step": 4159 + }, + { + "epoch": 0.4584022038567493, + "grad_norm": 5.360278606414795, + "learning_rate": 5.736895051951382e-06, + "loss": 0.4482, + "step": 4160 + }, + { + "epoch": 0.4585123966942149, + "grad_norm": 6.263329982757568, + "learning_rate": 5.735165662125426e-06, + "loss": 0.3392, + "step": 4161 + }, + { + "epoch": 0.45862258953168045, + "grad_norm": 6.2176313400268555, + "learning_rate": 5.733436182402416e-06, + "loss": 0.4368, + "step": 4162 + }, + { + "epoch": 0.458732782369146, + "grad_norm": 7.075268268585205, + "learning_rate": 5.7317066129938335e-06, + "loss": 0.402, + "step": 4163 + }, + { + "epoch": 0.45884297520661155, + "grad_norm": 11.643766403198242, + "learning_rate": 5.729976954111171e-06, + "loss": 0.4219, + "step": 4164 + }, + { + "epoch": 0.45895316804407715, + "grad_norm": 5.848720550537109, + "learning_rate": 5.728247205965936e-06, + "loss": 0.3966, + "step": 4165 + }, + { + "epoch": 0.4590633608815427, + "grad_norm": 10.475896835327148, + "learning_rate": 5.726517368769644e-06, + "loss": 0.496, + "step": 4166 + }, + { + "epoch": 0.45917355371900825, + "grad_norm": 6.278916358947754, + "learning_rate": 5.724787442733819e-06, + "loss": 0.3825, + "step": 4167 + }, + { + "epoch": 0.4592837465564738, + "grad_norm": 4.7572760581970215, + "learning_rate": 5.723057428070003e-06, + "loss": 0.3911, + "step": 4168 + }, + { + "epoch": 0.4593939393939394, + "grad_norm": 9.122440338134766, + "learning_rate": 5.721327324989743e-06, + "loss": 0.4123, + "step": 4169 + }, + { + "epoch": 0.45950413223140496, + "grad_norm": 4.627889156341553, + "learning_rate": 5.719597133704597e-06, + "loss": 0.4247, + "step": 4170 + }, + { + "epoch": 0.4596143250688705, + "grad_norm": 10.681262969970703, + "learning_rate": 5.717866854426135e-06, + "loss": 0.3771, + "step": 4171 + }, + { + "epoch": 0.45972451790633606, + "grad_norm": 10.229944229125977, + "learning_rate": 5.7161364873659395e-06, + "loss": 0.565, + "step": 4172 + }, + { + "epoch": 0.45983471074380167, + "grad_norm": 9.267426490783691, + "learning_rate": 5.714406032735602e-06, + "loss": 0.38, + "step": 4173 + }, + { + "epoch": 0.4599449035812672, + "grad_norm": 9.422104835510254, + "learning_rate": 5.712675490746724e-06, + "loss": 0.441, + "step": 4174 + }, + { + "epoch": 0.46005509641873277, + "grad_norm": 4.980631351470947, + "learning_rate": 
5.710944861610919e-06, + "loss": 0.3582, + "step": 4175 + }, + { + "epoch": 0.4601652892561984, + "grad_norm": 7.520429611206055, + "learning_rate": 5.709214145539811e-06, + "loss": 0.3643, + "step": 4176 + }, + { + "epoch": 0.4602754820936639, + "grad_norm": 8.083974838256836, + "learning_rate": 5.707483342745032e-06, + "loss": 0.4682, + "step": 4177 + }, + { + "epoch": 0.4603856749311295, + "grad_norm": 5.137016296386719, + "learning_rate": 5.705752453438231e-06, + "loss": 0.3894, + "step": 4178 + }, + { + "epoch": 0.460495867768595, + "grad_norm": 10.261585235595703, + "learning_rate": 5.704021477831062e-06, + "loss": 0.5258, + "step": 4179 + }, + { + "epoch": 0.46060606060606063, + "grad_norm": 6.102723598480225, + "learning_rate": 5.7022904161351886e-06, + "loss": 0.3917, + "step": 4180 + }, + { + "epoch": 0.4607162534435262, + "grad_norm": 4.687762260437012, + "learning_rate": 5.70055926856229e-06, + "loss": 0.4548, + "step": 4181 + }, + { + "epoch": 0.46082644628099173, + "grad_norm": 9.035147666931152, + "learning_rate": 5.698828035324051e-06, + "loss": 0.432, + "step": 4182 + }, + { + "epoch": 0.4609366391184573, + "grad_norm": 8.2828369140625, + "learning_rate": 5.697096716632173e-06, + "loss": 0.4721, + "step": 4183 + }, + { + "epoch": 0.4610468319559229, + "grad_norm": 10.927952766418457, + "learning_rate": 5.69536531269836e-06, + "loss": 0.4251, + "step": 4184 + }, + { + "epoch": 0.46115702479338844, + "grad_norm": 10.918237686157227, + "learning_rate": 5.693633823734331e-06, + "loss": 0.4486, + "step": 4185 + }, + { + "epoch": 0.461267217630854, + "grad_norm": 9.484095573425293, + "learning_rate": 5.69190224995182e-06, + "loss": 0.4049, + "step": 4186 + }, + { + "epoch": 0.46137741046831954, + "grad_norm": 5.832151412963867, + "learning_rate": 5.690170591562557e-06, + "loss": 0.4209, + "step": 4187 + }, + { + "epoch": 0.46148760330578514, + "grad_norm": 8.604377746582031, + "learning_rate": 5.6884388487782995e-06, + "loss": 0.4892, + "step": 4188 + }, + { + "epoch": 0.4615977961432507, + "grad_norm": 6.541306972503662, + "learning_rate": 5.686707021810802e-06, + "loss": 0.4282, + "step": 4189 + }, + { + "epoch": 0.46170798898071624, + "grad_norm": 5.040317058563232, + "learning_rate": 5.6849751108718395e-06, + "loss": 0.4091, + "step": 4190 + }, + { + "epoch": 0.4618181818181818, + "grad_norm": 6.780949592590332, + "learning_rate": 5.68324311617319e-06, + "loss": 0.4909, + "step": 4191 + }, + { + "epoch": 0.4619283746556474, + "grad_norm": 9.71782112121582, + "learning_rate": 5.681511037926643e-06, + "loss": 0.4777, + "step": 4192 + }, + { + "epoch": 0.46203856749311295, + "grad_norm": 12.609463691711426, + "learning_rate": 5.679778876344001e-06, + "loss": 0.4661, + "step": 4193 + }, + { + "epoch": 0.4621487603305785, + "grad_norm": 7.244626522064209, + "learning_rate": 5.678046631637074e-06, + "loss": 0.4415, + "step": 4194 + }, + { + "epoch": 0.46225895316804405, + "grad_norm": 8.343194007873535, + "learning_rate": 5.676314304017684e-06, + "loss": 0.3958, + "step": 4195 + }, + { + "epoch": 0.46236914600550966, + "grad_norm": 5.753417015075684, + "learning_rate": 5.674581893697663e-06, + "loss": 0.3926, + "step": 4196 + }, + { + "epoch": 0.4624793388429752, + "grad_norm": 6.718925952911377, + "learning_rate": 5.6728494008888516e-06, + "loss": 0.4558, + "step": 4197 + }, + { + "epoch": 0.46258953168044076, + "grad_norm": 7.640260696411133, + "learning_rate": 5.6711168258031e-06, + "loss": 0.3741, + "step": 4198 + }, + { + "epoch": 0.46269972451790636, + "grad_norm": 
9.677931785583496, + "learning_rate": 5.6693841686522734e-06, + "loss": 0.3595, + "step": 4199 + }, + { + "epoch": 0.4628099173553719, + "grad_norm": 6.4213948249816895, + "learning_rate": 5.66765142964824e-06, + "loss": 0.4104, + "step": 4200 + }, + { + "epoch": 0.46292011019283746, + "grad_norm": 4.397367000579834, + "learning_rate": 5.665918609002884e-06, + "loss": 0.3712, + "step": 4201 + }, + { + "epoch": 0.463030303030303, + "grad_norm": 7.3970746994018555, + "learning_rate": 5.664185706928094e-06, + "loss": 0.4252, + "step": 4202 + }, + { + "epoch": 0.4631404958677686, + "grad_norm": 11.200392723083496, + "learning_rate": 5.6624527236357754e-06, + "loss": 0.4969, + "step": 4203 + }, + { + "epoch": 0.46325068870523417, + "grad_norm": 6.623799800872803, + "learning_rate": 5.6607196593378375e-06, + "loss": 0.3434, + "step": 4204 + }, + { + "epoch": 0.4633608815426997, + "grad_norm": 8.250639915466309, + "learning_rate": 5.658986514246202e-06, + "loss": 0.4288, + "step": 4205 + }, + { + "epoch": 0.46347107438016527, + "grad_norm": 8.903459548950195, + "learning_rate": 5.6572532885728e-06, + "loss": 0.418, + "step": 4206 + }, + { + "epoch": 0.4635812672176309, + "grad_norm": 6.643313884735107, + "learning_rate": 5.655519982529574e-06, + "loss": 0.3834, + "step": 4207 + }, + { + "epoch": 0.4636914600550964, + "grad_norm": 6.704066276550293, + "learning_rate": 5.653786596328472e-06, + "loss": 0.3128, + "step": 4208 + }, + { + "epoch": 0.463801652892562, + "grad_norm": 6.213980674743652, + "learning_rate": 5.6520531301814595e-06, + "loss": 0.447, + "step": 4209 + }, + { + "epoch": 0.46391184573002753, + "grad_norm": 4.845897674560547, + "learning_rate": 5.650319584300503e-06, + "loss": 0.3957, + "step": 4210 + }, + { + "epoch": 0.46402203856749313, + "grad_norm": 6.639408111572266, + "learning_rate": 5.648585958897585e-06, + "loss": 0.4439, + "step": 4211 + }, + { + "epoch": 0.4641322314049587, + "grad_norm": 9.47696590423584, + "learning_rate": 5.646852254184695e-06, + "loss": 0.4725, + "step": 4212 + }, + { + "epoch": 0.46424242424242423, + "grad_norm": 5.440971851348877, + "learning_rate": 5.645118470373832e-06, + "loss": 0.4025, + "step": 4213 + }, + { + "epoch": 0.4643526170798898, + "grad_norm": 5.227582931518555, + "learning_rate": 5.643384607677007e-06, + "loss": 0.4273, + "step": 4214 + }, + { + "epoch": 0.4644628099173554, + "grad_norm": 7.226681232452393, + "learning_rate": 5.641650666306237e-06, + "loss": 0.4511, + "step": 4215 + }, + { + "epoch": 0.46457300275482094, + "grad_norm": 6.690056800842285, + "learning_rate": 5.639916646473554e-06, + "loss": 0.357, + "step": 4216 + }, + { + "epoch": 0.4646831955922865, + "grad_norm": 9.158060073852539, + "learning_rate": 5.6381825483909916e-06, + "loss": 0.4276, + "step": 4217 + }, + { + "epoch": 0.46479338842975204, + "grad_norm": 6.537561416625977, + "learning_rate": 5.636448372270602e-06, + "loss": 0.335, + "step": 4218 + }, + { + "epoch": 0.46490358126721765, + "grad_norm": 6.055027484893799, + "learning_rate": 5.634714118324442e-06, + "loss": 0.4111, + "step": 4219 + }, + { + "epoch": 0.4650137741046832, + "grad_norm": 18.14972496032715, + "learning_rate": 5.6329797867645746e-06, + "loss": 0.5513, + "step": 4220 + }, + { + "epoch": 0.46512396694214875, + "grad_norm": 6.206079006195068, + "learning_rate": 5.6312453778030806e-06, + "loss": 0.4031, + "step": 4221 + }, + { + "epoch": 0.4652341597796143, + "grad_norm": 7.26116943359375, + "learning_rate": 5.629510891652045e-06, + "loss": 0.4924, + "step": 4222 + }, + { + "epoch": 
0.4653443526170799, + "grad_norm": 5.4720377922058105, + "learning_rate": 5.62777632852356e-06, + "loss": 0.3969, + "step": 4223 + }, + { + "epoch": 0.46545454545454545, + "grad_norm": 6.136850833892822, + "learning_rate": 5.6260416886297356e-06, + "loss": 0.4204, + "step": 4224 + }, + { + "epoch": 0.465564738292011, + "grad_norm": 7.384100914001465, + "learning_rate": 5.624306972182681e-06, + "loss": 0.4294, + "step": 4225 + }, + { + "epoch": 0.4656749311294766, + "grad_norm": 11.933074951171875, + "learning_rate": 5.6225721793945235e-06, + "loss": 0.4586, + "step": 4226 + }, + { + "epoch": 0.46578512396694216, + "grad_norm": 11.574447631835938, + "learning_rate": 5.6208373104773925e-06, + "loss": 0.4163, + "step": 4227 + }, + { + "epoch": 0.4658953168044077, + "grad_norm": 7.786038875579834, + "learning_rate": 5.619102365643434e-06, + "loss": 0.4239, + "step": 4228 + }, + { + "epoch": 0.46600550964187326, + "grad_norm": 7.837298393249512, + "learning_rate": 5.617367345104796e-06, + "loss": 0.4201, + "step": 4229 + }, + { + "epoch": 0.46611570247933887, + "grad_norm": 5.018138408660889, + "learning_rate": 5.615632249073641e-06, + "loss": 0.4, + "step": 4230 + }, + { + "epoch": 0.4662258953168044, + "grad_norm": 7.820041179656982, + "learning_rate": 5.613897077762136e-06, + "loss": 0.3829, + "step": 4231 + }, + { + "epoch": 0.46633608815426997, + "grad_norm": 6.930209159851074, + "learning_rate": 5.612161831382465e-06, + "loss": 0.3486, + "step": 4232 + }, + { + "epoch": 0.4664462809917355, + "grad_norm": 5.2343974113464355, + "learning_rate": 5.610426510146814e-06, + "loss": 0.3918, + "step": 4233 + }, + { + "epoch": 0.4665564738292011, + "grad_norm": 5.706419467926025, + "learning_rate": 5.608691114267379e-06, + "loss": 0.3262, + "step": 4234 + }, + { + "epoch": 0.4666666666666667, + "grad_norm": 4.098259449005127, + "learning_rate": 5.606955643956368e-06, + "loss": 0.3802, + "step": 4235 + }, + { + "epoch": 0.4667768595041322, + "grad_norm": 6.685352802276611, + "learning_rate": 5.605220099425995e-06, + "loss": 0.4617, + "step": 4236 + }, + { + "epoch": 0.4668870523415978, + "grad_norm": 8.390470504760742, + "learning_rate": 5.603484480888488e-06, + "loss": 0.4822, + "step": 4237 + }, + { + "epoch": 0.4669972451790634, + "grad_norm": 5.249129295349121, + "learning_rate": 5.6017487885560784e-06, + "loss": 0.4082, + "step": 4238 + }, + { + "epoch": 0.46710743801652893, + "grad_norm": 5.53339958190918, + "learning_rate": 5.600013022641009e-06, + "loss": 0.4677, + "step": 4239 + }, + { + "epoch": 0.4672176308539945, + "grad_norm": 8.485630989074707, + "learning_rate": 5.598277183355533e-06, + "loss": 0.4946, + "step": 4240 + }, + { + "epoch": 0.46732782369146003, + "grad_norm": 4.0238189697265625, + "learning_rate": 5.5965412709119094e-06, + "loss": 0.3379, + "step": 4241 + }, + { + "epoch": 0.46743801652892564, + "grad_norm": 5.816349029541016, + "learning_rate": 5.594805285522411e-06, + "loss": 0.3848, + "step": 4242 + }, + { + "epoch": 0.4675482093663912, + "grad_norm": 13.12299633026123, + "learning_rate": 5.593069227399312e-06, + "loss": 0.5288, + "step": 4243 + }, + { + "epoch": 0.46765840220385674, + "grad_norm": 6.124564170837402, + "learning_rate": 5.591333096754903e-06, + "loss": 0.3349, + "step": 4244 + }, + { + "epoch": 0.4677685950413223, + "grad_norm": 5.709305286407471, + "learning_rate": 5.589596893801479e-06, + "loss": 0.398, + "step": 4245 + }, + { + "epoch": 0.4678787878787879, + "grad_norm": 7.738457202911377, + "learning_rate": 5.587860618751347e-06, + "loss": 0.4516, + 
"step": 4246 + }, + { + "epoch": 0.46798898071625344, + "grad_norm": 7.816864013671875, + "learning_rate": 5.58612427181682e-06, + "loss": 0.4699, + "step": 4247 + }, + { + "epoch": 0.468099173553719, + "grad_norm": 9.974682807922363, + "learning_rate": 5.58438785321022e-06, + "loss": 0.355, + "step": 4248 + }, + { + "epoch": 0.4682093663911846, + "grad_norm": 9.548007011413574, + "learning_rate": 5.58265136314388e-06, + "loss": 0.3784, + "step": 4249 + }, + { + "epoch": 0.46831955922865015, + "grad_norm": 8.929972648620605, + "learning_rate": 5.580914801830141e-06, + "loss": 0.4947, + "step": 4250 + }, + { + "epoch": 0.4684297520661157, + "grad_norm": 9.959912300109863, + "learning_rate": 5.579178169481348e-06, + "loss": 0.4054, + "step": 4251 + }, + { + "epoch": 0.46853994490358125, + "grad_norm": 7.277345180511475, + "learning_rate": 5.577441466309865e-06, + "loss": 0.4526, + "step": 4252 + }, + { + "epoch": 0.46865013774104686, + "grad_norm": 6.184348106384277, + "learning_rate": 5.575704692528053e-06, + "loss": 0.4462, + "step": 4253 + }, + { + "epoch": 0.4687603305785124, + "grad_norm": 5.845827579498291, + "learning_rate": 5.5739678483482895e-06, + "loss": 0.3542, + "step": 4254 + }, + { + "epoch": 0.46887052341597796, + "grad_norm": 5.441403865814209, + "learning_rate": 5.572230933982958e-06, + "loss": 0.3858, + "step": 4255 + }, + { + "epoch": 0.4689807162534435, + "grad_norm": 7.008370876312256, + "learning_rate": 5.570493949644452e-06, + "loss": 0.3452, + "step": 4256 + }, + { + "epoch": 0.4690909090909091, + "grad_norm": 9.76735782623291, + "learning_rate": 5.56875689554517e-06, + "loss": 0.4842, + "step": 4257 + }, + { + "epoch": 0.46920110192837466, + "grad_norm": 8.114686965942383, + "learning_rate": 5.56701977189752e-06, + "loss": 0.4102, + "step": 4258 + }, + { + "epoch": 0.4693112947658402, + "grad_norm": 7.080952167510986, + "learning_rate": 5.565282578913924e-06, + "loss": 0.4561, + "step": 4259 + }, + { + "epoch": 0.46942148760330576, + "grad_norm": 4.5083794593811035, + "learning_rate": 5.563545316806808e-06, + "loss": 0.3941, + "step": 4260 + }, + { + "epoch": 0.46953168044077137, + "grad_norm": 7.826779365539551, + "learning_rate": 5.561807985788603e-06, + "loss": 0.4642, + "step": 4261 + }, + { + "epoch": 0.4696418732782369, + "grad_norm": 8.584476470947266, + "learning_rate": 5.560070586071755e-06, + "loss": 0.5064, + "step": 4262 + }, + { + "epoch": 0.46975206611570247, + "grad_norm": 6.538086414337158, + "learning_rate": 5.558333117868715e-06, + "loss": 0.3951, + "step": 4263 + }, + { + "epoch": 0.469862258953168, + "grad_norm": 6.985914707183838, + "learning_rate": 5.556595581391941e-06, + "loss": 0.3504, + "step": 4264 + }, + { + "epoch": 0.4699724517906336, + "grad_norm": 5.304479122161865, + "learning_rate": 5.554857976853907e-06, + "loss": 0.3632, + "step": 4265 + }, + { + "epoch": 0.4700826446280992, + "grad_norm": 4.482560157775879, + "learning_rate": 5.553120304467082e-06, + "loss": 0.3665, + "step": 4266 + }, + { + "epoch": 0.47019283746556473, + "grad_norm": 8.54191780090332, + "learning_rate": 5.551382564443958e-06, + "loss": 0.4603, + "step": 4267 + }, + { + "epoch": 0.4703030303030303, + "grad_norm": 8.821993827819824, + "learning_rate": 5.549644756997023e-06, + "loss": 0.423, + "step": 4268 + }, + { + "epoch": 0.4704132231404959, + "grad_norm": 11.837578773498535, + "learning_rate": 5.547906882338782e-06, + "loss": 0.5689, + "step": 4269 + }, + { + "epoch": 0.47052341597796143, + "grad_norm": 8.223700523376465, + "learning_rate": 
5.546168940681743e-06, + "loss": 0.4629, + "step": 4270 + }, + { + "epoch": 0.470633608815427, + "grad_norm": 6.382428169250488, + "learning_rate": 5.544430932238423e-06, + "loss": 0.4461, + "step": 4271 + }, + { + "epoch": 0.47074380165289254, + "grad_norm": 8.722525596618652, + "learning_rate": 5.542692857221348e-06, + "loss": 0.3981, + "step": 4272 + }, + { + "epoch": 0.47085399449035814, + "grad_norm": 5.7968010902404785, + "learning_rate": 5.540954715843055e-06, + "loss": 0.4261, + "step": 4273 + }, + { + "epoch": 0.4709641873278237, + "grad_norm": 5.978255271911621, + "learning_rate": 5.539216508316085e-06, + "loss": 0.3708, + "step": 4274 + }, + { + "epoch": 0.47107438016528924, + "grad_norm": 3.8172013759613037, + "learning_rate": 5.537478234852988e-06, + "loss": 0.4553, + "step": 4275 + }, + { + "epoch": 0.47118457300275485, + "grad_norm": 7.099826335906982, + "learning_rate": 5.535739895666321e-06, + "loss": 0.3988, + "step": 4276 + }, + { + "epoch": 0.4712947658402204, + "grad_norm": 6.397146701812744, + "learning_rate": 5.5340014909686525e-06, + "loss": 0.391, + "step": 4277 + }, + { + "epoch": 0.47140495867768595, + "grad_norm": 6.066586494445801, + "learning_rate": 5.532263020972556e-06, + "loss": 0.3247, + "step": 4278 + }, + { + "epoch": 0.4715151515151515, + "grad_norm": 6.35976505279541, + "learning_rate": 5.530524485890614e-06, + "loss": 0.4055, + "step": 4279 + }, + { + "epoch": 0.4716253443526171, + "grad_norm": 4.074512958526611, + "learning_rate": 5.528785885935418e-06, + "loss": 0.3659, + "step": 4280 + }, + { + "epoch": 0.47173553719008265, + "grad_norm": 6.480283737182617, + "learning_rate": 5.527047221319566e-06, + "loss": 0.3655, + "step": 4281 + }, + { + "epoch": 0.4718457300275482, + "grad_norm": 3.880749225616455, + "learning_rate": 5.525308492255662e-06, + "loss": 0.4007, + "step": 4282 + }, + { + "epoch": 0.47195592286501376, + "grad_norm": 5.906920909881592, + "learning_rate": 5.523569698956324e-06, + "loss": 0.3596, + "step": 4283 + }, + { + "epoch": 0.47206611570247936, + "grad_norm": 4.660150527954102, + "learning_rate": 5.521830841634172e-06, + "loss": 0.386, + "step": 4284 + }, + { + "epoch": 0.4721763085399449, + "grad_norm": 7.394484996795654, + "learning_rate": 5.520091920501833e-06, + "loss": 0.422, + "step": 4285 + }, + { + "epoch": 0.47228650137741046, + "grad_norm": 7.227637767791748, + "learning_rate": 5.51835293577195e-06, + "loss": 0.4269, + "step": 4286 + }, + { + "epoch": 0.472396694214876, + "grad_norm": 5.268207550048828, + "learning_rate": 5.516613887657165e-06, + "loss": 0.4106, + "step": 4287 + }, + { + "epoch": 0.4725068870523416, + "grad_norm": 9.578835487365723, + "learning_rate": 5.514874776370133e-06, + "loss": 0.4801, + "step": 4288 + }, + { + "epoch": 0.47261707988980717, + "grad_norm": 4.544835090637207, + "learning_rate": 5.5131356021235135e-06, + "loss": 0.3645, + "step": 4289 + }, + { + "epoch": 0.4727272727272727, + "grad_norm": 8.72737979888916, + "learning_rate": 5.511396365129975e-06, + "loss": 0.4129, + "step": 4290 + }, + { + "epoch": 0.47283746556473827, + "grad_norm": 8.959371566772461, + "learning_rate": 5.509657065602197e-06, + "loss": 0.4078, + "step": 4291 + }, + { + "epoch": 0.4729476584022039, + "grad_norm": 5.4538702964782715, + "learning_rate": 5.507917703752856e-06, + "loss": 0.3698, + "step": 4292 + }, + { + "epoch": 0.4730578512396694, + "grad_norm": 18.069997787475586, + "learning_rate": 5.506178279794652e-06, + "loss": 0.5317, + "step": 4293 + }, + { + "epoch": 0.473168044077135, + "grad_norm": 
12.249960899353027, + "learning_rate": 5.5044387939402775e-06, + "loss": 0.4586, + "step": 4294 + }, + { + "epoch": 0.4732782369146005, + "grad_norm": 5.215136528015137, + "learning_rate": 5.502699246402444e-06, + "loss": 0.4257, + "step": 4295 + }, + { + "epoch": 0.47338842975206613, + "grad_norm": 9.20561695098877, + "learning_rate": 5.500959637393865e-06, + "loss": 0.3994, + "step": 4296 + }, + { + "epoch": 0.4734986225895317, + "grad_norm": 6.768822193145752, + "learning_rate": 5.499219967127258e-06, + "loss": 0.4311, + "step": 4297 + }, + { + "epoch": 0.47360881542699723, + "grad_norm": 7.707272529602051, + "learning_rate": 5.497480235815356e-06, + "loss": 0.5008, + "step": 4298 + }, + { + "epoch": 0.47371900826446284, + "grad_norm": 5.857189178466797, + "learning_rate": 5.4957404436708975e-06, + "loss": 0.3601, + "step": 4299 + }, + { + "epoch": 0.4738292011019284, + "grad_norm": 9.859488487243652, + "learning_rate": 5.494000590906622e-06, + "loss": 0.5111, + "step": 4300 + }, + { + "epoch": 0.47393939393939394, + "grad_norm": 6.382084369659424, + "learning_rate": 5.492260677735284e-06, + "loss": 0.427, + "step": 4301 + }, + { + "epoch": 0.4740495867768595, + "grad_norm": 8.878406524658203, + "learning_rate": 5.490520704369642e-06, + "loss": 0.4398, + "step": 4302 + }, + { + "epoch": 0.4741597796143251, + "grad_norm": 7.349577903747559, + "learning_rate": 5.488780671022461e-06, + "loss": 0.4158, + "step": 4303 + }, + { + "epoch": 0.47426997245179064, + "grad_norm": 7.9784064292907715, + "learning_rate": 5.487040577906515e-06, + "loss": 0.4526, + "step": 4304 + }, + { + "epoch": 0.4743801652892562, + "grad_norm": 13.937430381774902, + "learning_rate": 5.485300425234587e-06, + "loss": 0.4784, + "step": 4305 + }, + { + "epoch": 0.47449035812672175, + "grad_norm": 8.099891662597656, + "learning_rate": 5.483560213219464e-06, + "loss": 0.4843, + "step": 4306 + }, + { + "epoch": 0.47460055096418735, + "grad_norm": 5.307769298553467, + "learning_rate": 5.4818199420739395e-06, + "loss": 0.382, + "step": 4307 + }, + { + "epoch": 0.4747107438016529, + "grad_norm": 13.030486106872559, + "learning_rate": 5.480079612010819e-06, + "loss": 0.5393, + "step": 4308 + }, + { + "epoch": 0.47482093663911845, + "grad_norm": 6.650481700897217, + "learning_rate": 5.478339223242912e-06, + "loss": 0.4695, + "step": 4309 + }, + { + "epoch": 0.474931129476584, + "grad_norm": 8.314249038696289, + "learning_rate": 5.476598775983033e-06, + "loss": 0.4093, + "step": 4310 + }, + { + "epoch": 0.4750413223140496, + "grad_norm": 13.993112564086914, + "learning_rate": 5.47485827044401e-06, + "loss": 0.5179, + "step": 4311 + }, + { + "epoch": 0.47515151515151516, + "grad_norm": 5.659000396728516, + "learning_rate": 5.473117706838673e-06, + "loss": 0.3622, + "step": 4312 + }, + { + "epoch": 0.4752617079889807, + "grad_norm": 4.3064470291137695, + "learning_rate": 5.471377085379858e-06, + "loss": 0.3936, + "step": 4313 + }, + { + "epoch": 0.47537190082644626, + "grad_norm": 6.03898811340332, + "learning_rate": 5.469636406280416e-06, + "loss": 0.4387, + "step": 4314 + }, + { + "epoch": 0.47548209366391186, + "grad_norm": 4.539555549621582, + "learning_rate": 5.467895669753194e-06, + "loss": 0.2841, + "step": 4315 + }, + { + "epoch": 0.4755922865013774, + "grad_norm": 6.276854038238525, + "learning_rate": 5.466154876011055e-06, + "loss": 0.4324, + "step": 4316 + }, + { + "epoch": 0.47570247933884297, + "grad_norm": 9.656164169311523, + "learning_rate": 5.464414025266863e-06, + "loss": 0.4452, + "step": 4317 + }, + { + 
"epoch": 0.4758126721763085, + "grad_norm": 4.501316070556641, + "learning_rate": 5.462673117733493e-06, + "loss": 0.4771, + "step": 4318 + }, + { + "epoch": 0.4759228650137741, + "grad_norm": 9.54035758972168, + "learning_rate": 5.460932153623829e-06, + "loss": 0.3968, + "step": 4319 + }, + { + "epoch": 0.47603305785123967, + "grad_norm": 5.434879779815674, + "learning_rate": 5.459191133150753e-06, + "loss": 0.3633, + "step": 4320 + }, + { + "epoch": 0.4761432506887052, + "grad_norm": 5.331169128417969, + "learning_rate": 5.457450056527162e-06, + "loss": 0.432, + "step": 4321 + }, + { + "epoch": 0.47625344352617077, + "grad_norm": 6.110133647918701, + "learning_rate": 5.455708923965954e-06, + "loss": 0.3912, + "step": 4322 + }, + { + "epoch": 0.4763636363636364, + "grad_norm": 5.7163405418396, + "learning_rate": 5.453967735680044e-06, + "loss": 0.391, + "step": 4323 + }, + { + "epoch": 0.47647382920110193, + "grad_norm": 4.875898361206055, + "learning_rate": 5.4522264918823395e-06, + "loss": 0.4557, + "step": 4324 + }, + { + "epoch": 0.4765840220385675, + "grad_norm": 5.104773044586182, + "learning_rate": 5.4504851927857664e-06, + "loss": 0.4269, + "step": 4325 + }, + { + "epoch": 0.4766942148760331, + "grad_norm": 4.523794651031494, + "learning_rate": 5.448743838603252e-06, + "loss": 0.3935, + "step": 4326 + }, + { + "epoch": 0.47680440771349863, + "grad_norm": 7.606765270233154, + "learning_rate": 5.447002429547732e-06, + "loss": 0.4277, + "step": 4327 + }, + { + "epoch": 0.4769146005509642, + "grad_norm": 4.95655632019043, + "learning_rate": 5.445260965832146e-06, + "loss": 0.4097, + "step": 4328 + }, + { + "epoch": 0.47702479338842974, + "grad_norm": 4.617556571960449, + "learning_rate": 5.443519447669445e-06, + "loss": 0.443, + "step": 4329 + }, + { + "epoch": 0.47713498622589534, + "grad_norm": 6.024190902709961, + "learning_rate": 5.441777875272585e-06, + "loss": 0.4358, + "step": 4330 + }, + { + "epoch": 0.4772451790633609, + "grad_norm": 5.942342758178711, + "learning_rate": 5.440036248854525e-06, + "loss": 0.337, + "step": 4331 + }, + { + "epoch": 0.47735537190082644, + "grad_norm": 6.1515398025512695, + "learning_rate": 5.438294568628235e-06, + "loss": 0.4493, + "step": 4332 + }, + { + "epoch": 0.477465564738292, + "grad_norm": 5.799715042114258, + "learning_rate": 5.43655283480669e-06, + "loss": 0.3944, + "step": 4333 + }, + { + "epoch": 0.4775757575757576, + "grad_norm": 5.636984348297119, + "learning_rate": 5.4348110476028715e-06, + "loss": 0.4278, + "step": 4334 + }, + { + "epoch": 0.47768595041322315, + "grad_norm": 7.268592357635498, + "learning_rate": 5.4330692072297665e-06, + "loss": 0.3753, + "step": 4335 + }, + { + "epoch": 0.4777961432506887, + "grad_norm": 5.62819766998291, + "learning_rate": 5.431327313900371e-06, + "loss": 0.3748, + "step": 4336 + }, + { + "epoch": 0.47790633608815425, + "grad_norm": 5.308300971984863, + "learning_rate": 5.4295853678276855e-06, + "loss": 0.4211, + "step": 4337 + }, + { + "epoch": 0.47801652892561985, + "grad_norm": 4.91331148147583, + "learning_rate": 5.427843369224718e-06, + "loss": 0.4165, + "step": 4338 + }, + { + "epoch": 0.4781267217630854, + "grad_norm": 7.938823699951172, + "learning_rate": 5.426101318304482e-06, + "loss": 0.4165, + "step": 4339 + }, + { + "epoch": 0.47823691460055096, + "grad_norm": 8.069595336914062, + "learning_rate": 5.424359215279999e-06, + "loss": 0.3445, + "step": 4340 + }, + { + "epoch": 0.4783471074380165, + "grad_norm": 7.100213527679443, + "learning_rate": 5.422617060364293e-06, + "loss": 0.3952, 
+ "step": 4341 + }, + { + "epoch": 0.4784573002754821, + "grad_norm": 9.818364143371582, + "learning_rate": 5.4208748537703995e-06, + "loss": 0.3307, + "step": 4342 + }, + { + "epoch": 0.47856749311294766, + "grad_norm": 9.175378799438477, + "learning_rate": 5.419132595711357e-06, + "loss": 0.4935, + "step": 4343 + }, + { + "epoch": 0.4786776859504132, + "grad_norm": 6.158470630645752, + "learning_rate": 5.417390286400213e-06, + "loss": 0.3144, + "step": 4344 + }, + { + "epoch": 0.47878787878787876, + "grad_norm": 10.078022003173828, + "learning_rate": 5.415647926050016e-06, + "loss": 0.4913, + "step": 4345 + }, + { + "epoch": 0.47889807162534437, + "grad_norm": 6.890070915222168, + "learning_rate": 5.413905514873825e-06, + "loss": 0.4608, + "step": 4346 + }, + { + "epoch": 0.4790082644628099, + "grad_norm": 6.087070465087891, + "learning_rate": 5.412163053084709e-06, + "loss": 0.3743, + "step": 4347 + }, + { + "epoch": 0.47911845730027547, + "grad_norm": 6.422086238861084, + "learning_rate": 5.410420540895731e-06, + "loss": 0.4777, + "step": 4348 + }, + { + "epoch": 0.4792286501377411, + "grad_norm": 7.705680847167969, + "learning_rate": 5.408677978519975e-06, + "loss": 0.4404, + "step": 4349 + }, + { + "epoch": 0.4793388429752066, + "grad_norm": 8.673178672790527, + "learning_rate": 5.406935366170518e-06, + "loss": 0.5056, + "step": 4350 + }, + { + "epoch": 0.4794490358126722, + "grad_norm": 6.674211502075195, + "learning_rate": 5.405192704060454e-06, + "loss": 0.4401, + "step": 4351 + }, + { + "epoch": 0.4795592286501377, + "grad_norm": 11.007810592651367, + "learning_rate": 5.403449992402875e-06, + "loss": 0.3834, + "step": 4352 + }, + { + "epoch": 0.47966942148760333, + "grad_norm": 6.436962604522705, + "learning_rate": 5.401707231410881e-06, + "loss": 0.3686, + "step": 4353 + }, + { + "epoch": 0.4797796143250689, + "grad_norm": 8.79969596862793, + "learning_rate": 5.399964421297583e-06, + "loss": 0.4908, + "step": 4354 + }, + { + "epoch": 0.47988980716253443, + "grad_norm": 9.806441307067871, + "learning_rate": 5.398221562276092e-06, + "loss": 0.3788, + "step": 4355 + }, + { + "epoch": 0.48, + "grad_norm": 8.346846580505371, + "learning_rate": 5.396478654559527e-06, + "loss": 0.4135, + "step": 4356 + }, + { + "epoch": 0.4801101928374656, + "grad_norm": 9.595311164855957, + "learning_rate": 5.394735698361015e-06, + "loss": 0.3901, + "step": 4357 + }, + { + "epoch": 0.48022038567493114, + "grad_norm": 5.470604419708252, + "learning_rate": 5.392992693893684e-06, + "loss": 0.4604, + "step": 4358 + }, + { + "epoch": 0.4803305785123967, + "grad_norm": 9.696976661682129, + "learning_rate": 5.391249641370673e-06, + "loss": 0.4023, + "step": 4359 + }, + { + "epoch": 0.48044077134986224, + "grad_norm": 4.56251335144043, + "learning_rate": 5.389506541005125e-06, + "loss": 0.3888, + "step": 4360 + }, + { + "epoch": 0.48055096418732784, + "grad_norm": 11.087410926818848, + "learning_rate": 5.387763393010187e-06, + "loss": 0.3778, + "step": 4361 + }, + { + "epoch": 0.4806611570247934, + "grad_norm": 6.750298500061035, + "learning_rate": 5.386020197599016e-06, + "loss": 0.4241, + "step": 4362 + }, + { + "epoch": 0.48077134986225895, + "grad_norm": 4.112381458282471, + "learning_rate": 5.384276954984769e-06, + "loss": 0.2738, + "step": 4363 + }, + { + "epoch": 0.4808815426997245, + "grad_norm": 8.764153480529785, + "learning_rate": 5.3825336653806144e-06, + "loss": 0.4023, + "step": 4364 + }, + { + "epoch": 0.4809917355371901, + "grad_norm": 4.686344146728516, + "learning_rate": 
5.380790328999726e-06, + "loss": 0.3965, + "step": 4365 + }, + { + "epoch": 0.48110192837465565, + "grad_norm": 6.162919998168945, + "learning_rate": 5.379046946055276e-06, + "loss": 0.3832, + "step": 4366 + }, + { + "epoch": 0.4812121212121212, + "grad_norm": 8.729318618774414, + "learning_rate": 5.3773035167604516e-06, + "loss": 0.5119, + "step": 4367 + }, + { + "epoch": 0.48132231404958675, + "grad_norm": 6.998101711273193, + "learning_rate": 5.375560041328441e-06, + "loss": 0.4262, + "step": 4368 + }, + { + "epoch": 0.48143250688705236, + "grad_norm": 13.333380699157715, + "learning_rate": 5.373816519972438e-06, + "loss": 0.4887, + "step": 4369 + }, + { + "epoch": 0.4815426997245179, + "grad_norm": 5.41016960144043, + "learning_rate": 5.3720729529056425e-06, + "loss": 0.3149, + "step": 4370 + }, + { + "epoch": 0.48165289256198346, + "grad_norm": 11.839273452758789, + "learning_rate": 5.370329340341261e-06, + "loss": 0.5802, + "step": 4371 + }, + { + "epoch": 0.481763085399449, + "grad_norm": 9.279288291931152, + "learning_rate": 5.3685856824925066e-06, + "loss": 0.4462, + "step": 4372 + }, + { + "epoch": 0.4818732782369146, + "grad_norm": 5.733633041381836, + "learning_rate": 5.3668419795725925e-06, + "loss": 0.4337, + "step": 4373 + }, + { + "epoch": 0.48198347107438017, + "grad_norm": 6.996150016784668, + "learning_rate": 5.365098231794743e-06, + "loss": 0.4301, + "step": 4374 + }, + { + "epoch": 0.4820936639118457, + "grad_norm": 5.667147159576416, + "learning_rate": 5.363354439372188e-06, + "loss": 0.3247, + "step": 4375 + }, + { + "epoch": 0.4822038567493113, + "grad_norm": 5.164324760437012, + "learning_rate": 5.361610602518156e-06, + "loss": 0.3992, + "step": 4376 + }, + { + "epoch": 0.48231404958677687, + "grad_norm": 8.832839012145996, + "learning_rate": 5.3598667214458875e-06, + "loss": 0.4427, + "step": 4377 + }, + { + "epoch": 0.4824242424242424, + "grad_norm": 9.516709327697754, + "learning_rate": 5.35812279636863e-06, + "loss": 0.4116, + "step": 4378 + }, + { + "epoch": 0.482534435261708, + "grad_norm": 6.538336277008057, + "learning_rate": 5.35637882749963e-06, + "loss": 0.4716, + "step": 4379 + }, + { + "epoch": 0.4826446280991736, + "grad_norm": 8.16253662109375, + "learning_rate": 5.354634815052142e-06, + "loss": 0.3644, + "step": 4380 + }, + { + "epoch": 0.48275482093663913, + "grad_norm": 5.015005111694336, + "learning_rate": 5.3528907592394275e-06, + "loss": 0.3886, + "step": 4381 + }, + { + "epoch": 0.4828650137741047, + "grad_norm": 5.4582319259643555, + "learning_rate": 5.351146660274751e-06, + "loss": 0.4306, + "step": 4382 + }, + { + "epoch": 0.48297520661157023, + "grad_norm": 8.287875175476074, + "learning_rate": 5.349402518371385e-06, + "loss": 0.483, + "step": 4383 + }, + { + "epoch": 0.48308539944903583, + "grad_norm": 5.849573612213135, + "learning_rate": 5.347658333742604e-06, + "loss": 0.4077, + "step": 4384 + }, + { + "epoch": 0.4831955922865014, + "grad_norm": 7.98061466217041, + "learning_rate": 5.34591410660169e-06, + "loss": 0.3696, + "step": 4385 + }, + { + "epoch": 0.48330578512396694, + "grad_norm": 7.349559783935547, + "learning_rate": 5.344169837161929e-06, + "loss": 0.3966, + "step": 4386 + }, + { + "epoch": 0.4834159779614325, + "grad_norm": 13.274238586425781, + "learning_rate": 5.3424255256366105e-06, + "loss": 0.4603, + "step": 4387 + }, + { + "epoch": 0.4835261707988981, + "grad_norm": 8.205038070678711, + "learning_rate": 5.340681172239037e-06, + "loss": 0.43, + "step": 4388 + }, + { + "epoch": 0.48363636363636364, + "grad_norm": 
5.981983184814453, + "learning_rate": 5.3389367771825065e-06, + "loss": 0.38, + "step": 4389 + }, + { + "epoch": 0.4837465564738292, + "grad_norm": 7.728749752044678, + "learning_rate": 5.337192340680325e-06, + "loss": 0.4007, + "step": 4390 + }, + { + "epoch": 0.48385674931129474, + "grad_norm": 8.33792781829834, + "learning_rate": 5.335447862945806e-06, + "loss": 0.407, + "step": 4391 + }, + { + "epoch": 0.48396694214876035, + "grad_norm": 5.418024063110352, + "learning_rate": 5.333703344192267e-06, + "loss": 0.2849, + "step": 4392 + }, + { + "epoch": 0.4840771349862259, + "grad_norm": 7.042039394378662, + "learning_rate": 5.331958784633031e-06, + "loss": 0.4277, + "step": 4393 + }, + { + "epoch": 0.48418732782369145, + "grad_norm": 8.459491729736328, + "learning_rate": 5.330214184481422e-06, + "loss": 0.3693, + "step": 4394 + }, + { + "epoch": 0.484297520661157, + "grad_norm": 7.763633728027344, + "learning_rate": 5.328469543950776e-06, + "loss": 0.3237, + "step": 4395 + }, + { + "epoch": 0.4844077134986226, + "grad_norm": 9.964437484741211, + "learning_rate": 5.326724863254428e-06, + "loss": 0.414, + "step": 4396 + }, + { + "epoch": 0.48451790633608816, + "grad_norm": 8.47101879119873, + "learning_rate": 5.324980142605718e-06, + "loss": 0.4962, + "step": 4397 + }, + { + "epoch": 0.4846280991735537, + "grad_norm": 25.347455978393555, + "learning_rate": 5.323235382217995e-06, + "loss": 0.6602, + "step": 4398 + }, + { + "epoch": 0.48473829201101926, + "grad_norm": 7.682060718536377, + "learning_rate": 5.3214905823046106e-06, + "loss": 0.4277, + "step": 4399 + }, + { + "epoch": 0.48484848484848486, + "grad_norm": 12.011445999145508, + "learning_rate": 5.319745743078922e-06, + "loss": 0.4978, + "step": 4400 + }, + { + "epoch": 0.4849586776859504, + "grad_norm": 6.436575412750244, + "learning_rate": 5.318000864754289e-06, + "loss": 0.3284, + "step": 4401 + }, + { + "epoch": 0.48506887052341596, + "grad_norm": 8.122916221618652, + "learning_rate": 5.316255947544078e-06, + "loss": 0.3877, + "step": 4402 + }, + { + "epoch": 0.48517906336088157, + "grad_norm": 7.619511127471924, + "learning_rate": 5.314510991661662e-06, + "loss": 0.3095, + "step": 4403 + }, + { + "epoch": 0.4852892561983471, + "grad_norm": 4.5794997215271, + "learning_rate": 5.312765997320413e-06, + "loss": 0.4292, + "step": 4404 + }, + { + "epoch": 0.48539944903581267, + "grad_norm": 5.267623424530029, + "learning_rate": 5.311020964733712e-06, + "loss": 0.4613, + "step": 4405 + }, + { + "epoch": 0.4855096418732782, + "grad_norm": 7.349124431610107, + "learning_rate": 5.309275894114947e-06, + "loss": 0.3512, + "step": 4406 + }, + { + "epoch": 0.4856198347107438, + "grad_norm": 5.92105770111084, + "learning_rate": 5.307530785677505e-06, + "loss": 0.2938, + "step": 4407 + }, + { + "epoch": 0.4857300275482094, + "grad_norm": 6.2049360275268555, + "learning_rate": 5.30578563963478e-06, + "loss": 0.4101, + "step": 4408 + }, + { + "epoch": 0.4858402203856749, + "grad_norm": 8.178729057312012, + "learning_rate": 5.304040456200172e-06, + "loss": 0.3573, + "step": 4409 + }, + { + "epoch": 0.4859504132231405, + "grad_norm": 5.729934215545654, + "learning_rate": 5.302295235587085e-06, + "loss": 0.3723, + "step": 4410 + }, + { + "epoch": 0.4860606060606061, + "grad_norm": 7.468725204467773, + "learning_rate": 5.300549978008925e-06, + "loss": 0.4322, + "step": 4411 + }, + { + "epoch": 0.48617079889807163, + "grad_norm": 7.304121494293213, + "learning_rate": 5.298804683679105e-06, + "loss": 0.3723, + "step": 4412 + }, + { + "epoch": 
0.4862809917355372, + "grad_norm": 5.810751438140869, + "learning_rate": 5.297059352811044e-06, + "loss": 0.439, + "step": 4413 + }, + { + "epoch": 0.48639118457300273, + "grad_norm": 6.385992527008057, + "learning_rate": 5.29531398561816e-06, + "loss": 0.3924, + "step": 4414 + }, + { + "epoch": 0.48650137741046834, + "grad_norm": 8.34052562713623, + "learning_rate": 5.293568582313882e-06, + "loss": 0.4946, + "step": 4415 + }, + { + "epoch": 0.4866115702479339, + "grad_norm": 10.94664192199707, + "learning_rate": 5.291823143111639e-06, + "loss": 0.431, + "step": 4416 + }, + { + "epoch": 0.48672176308539944, + "grad_norm": 6.250720500946045, + "learning_rate": 5.290077668224865e-06, + "loss": 0.447, + "step": 4417 + }, + { + "epoch": 0.486831955922865, + "grad_norm": 9.11233901977539, + "learning_rate": 5.288332157866999e-06, + "loss": 0.4224, + "step": 4418 + }, + { + "epoch": 0.4869421487603306, + "grad_norm": 7.473569393157959, + "learning_rate": 5.286586612251485e-06, + "loss": 0.5118, + "step": 4419 + }, + { + "epoch": 0.48705234159779615, + "grad_norm": 5.473938465118408, + "learning_rate": 5.284841031591772e-06, + "loss": 0.4349, + "step": 4420 + }, + { + "epoch": 0.4871625344352617, + "grad_norm": 7.55420446395874, + "learning_rate": 5.283095416101312e-06, + "loss": 0.3926, + "step": 4421 + }, + { + "epoch": 0.48727272727272725, + "grad_norm": 6.22142219543457, + "learning_rate": 5.2813497659935575e-06, + "loss": 0.3938, + "step": 4422 + }, + { + "epoch": 0.48738292011019285, + "grad_norm": 4.999783515930176, + "learning_rate": 5.279604081481973e-06, + "loss": 0.436, + "step": 4423 + }, + { + "epoch": 0.4874931129476584, + "grad_norm": 6.7879533767700195, + "learning_rate": 5.27785836278002e-06, + "loss": 0.4117, + "step": 4424 + }, + { + "epoch": 0.48760330578512395, + "grad_norm": 3.9674224853515625, + "learning_rate": 5.27611261010117e-06, + "loss": 0.3627, + "step": 4425 + }, + { + "epoch": 0.48771349862258956, + "grad_norm": 7.521617889404297, + "learning_rate": 5.274366823658895e-06, + "loss": 0.4763, + "step": 4426 + }, + { + "epoch": 0.4878236914600551, + "grad_norm": 12.487852096557617, + "learning_rate": 5.272621003666671e-06, + "loss": 0.3876, + "step": 4427 + }, + { + "epoch": 0.48793388429752066, + "grad_norm": 10.022464752197266, + "learning_rate": 5.270875150337982e-06, + "loss": 0.4421, + "step": 4428 + }, + { + "epoch": 0.4880440771349862, + "grad_norm": 13.987154960632324, + "learning_rate": 5.269129263886312e-06, + "loss": 0.4146, + "step": 4429 + }, + { + "epoch": 0.4881542699724518, + "grad_norm": 5.220799922943115, + "learning_rate": 5.267383344525148e-06, + "loss": 0.4125, + "step": 4430 + }, + { + "epoch": 0.48826446280991737, + "grad_norm": 9.151785850524902, + "learning_rate": 5.265637392467986e-06, + "loss": 0.4678, + "step": 4431 + }, + { + "epoch": 0.4883746556473829, + "grad_norm": 7.962405681610107, + "learning_rate": 5.263891407928324e-06, + "loss": 0.3437, + "step": 4432 + }, + { + "epoch": 0.48848484848484847, + "grad_norm": 5.536421298980713, + "learning_rate": 5.26214539111966e-06, + "loss": 0.3515, + "step": 4433 + }, + { + "epoch": 0.48859504132231407, + "grad_norm": 8.647902488708496, + "learning_rate": 5.260399342255504e-06, + "loss": 0.502, + "step": 4434 + }, + { + "epoch": 0.4887052341597796, + "grad_norm": 5.995055198669434, + "learning_rate": 5.258653261549363e-06, + "loss": 0.4266, + "step": 4435 + }, + { + "epoch": 0.4888154269972452, + "grad_norm": 8.032722473144531, + "learning_rate": 5.2569071492147474e-06, + "loss": 0.4477, + 
"step": 4436 + }, + { + "epoch": 0.4889256198347107, + "grad_norm": 7.697789669036865, + "learning_rate": 5.255161005465177e-06, + "loss": 0.4765, + "step": 4437 + }, + { + "epoch": 0.48903581267217633, + "grad_norm": 6.443562030792236, + "learning_rate": 5.253414830514174e-06, + "loss": 0.3692, + "step": 4438 + }, + { + "epoch": 0.4891460055096419, + "grad_norm": 4.5295586585998535, + "learning_rate": 5.2516686245752605e-06, + "loss": 0.3371, + "step": 4439 + }, + { + "epoch": 0.48925619834710743, + "grad_norm": 8.297006607055664, + "learning_rate": 5.249922387861964e-06, + "loss": 0.4633, + "step": 4440 + }, + { + "epoch": 0.489366391184573, + "grad_norm": 4.661348342895508, + "learning_rate": 5.248176120587821e-06, + "loss": 0.3234, + "step": 4441 + }, + { + "epoch": 0.4894765840220386, + "grad_norm": 5.500077247619629, + "learning_rate": 5.246429822966363e-06, + "loss": 0.4505, + "step": 4442 + }, + { + "epoch": 0.48958677685950414, + "grad_norm": 6.7624006271362305, + "learning_rate": 5.244683495211132e-06, + "loss": 0.3474, + "step": 4443 + }, + { + "epoch": 0.4896969696969697, + "grad_norm": 7.952180862426758, + "learning_rate": 5.242937137535672e-06, + "loss": 0.415, + "step": 4444 + }, + { + "epoch": 0.48980716253443524, + "grad_norm": 8.214971542358398, + "learning_rate": 5.2411907501535285e-06, + "loss": 0.4879, + "step": 4445 + }, + { + "epoch": 0.48991735537190084, + "grad_norm": 7.120847702026367, + "learning_rate": 5.239444333278251e-06, + "loss": 0.5096, + "step": 4446 + }, + { + "epoch": 0.4900275482093664, + "grad_norm": 10.56857681274414, + "learning_rate": 5.237697887123396e-06, + "loss": 0.4134, + "step": 4447 + }, + { + "epoch": 0.49013774104683194, + "grad_norm": 7.67206335067749, + "learning_rate": 5.23595141190252e-06, + "loss": 0.5067, + "step": 4448 + }, + { + "epoch": 0.4902479338842975, + "grad_norm": 4.152846336364746, + "learning_rate": 5.234204907829187e-06, + "loss": 0.3521, + "step": 4449 + }, + { + "epoch": 0.4903581267217631, + "grad_norm": 11.548025131225586, + "learning_rate": 5.232458375116956e-06, + "loss": 0.472, + "step": 4450 + }, + { + "epoch": 0.49046831955922865, + "grad_norm": 14.106864929199219, + "learning_rate": 5.2307118139794015e-06, + "loss": 0.4055, + "step": 4451 + }, + { + "epoch": 0.4905785123966942, + "grad_norm": 7.866242408752441, + "learning_rate": 5.228965224630094e-06, + "loss": 0.4024, + "step": 4452 + }, + { + "epoch": 0.4906887052341598, + "grad_norm": 7.196980953216553, + "learning_rate": 5.227218607282606e-06, + "loss": 0.4326, + "step": 4453 + }, + { + "epoch": 0.49079889807162536, + "grad_norm": 9.129240989685059, + "learning_rate": 5.225471962150519e-06, + "loss": 0.4973, + "step": 4454 + }, + { + "epoch": 0.4909090909090909, + "grad_norm": 7.125060558319092, + "learning_rate": 5.223725289447413e-06, + "loss": 0.4516, + "step": 4455 + }, + { + "epoch": 0.49101928374655646, + "grad_norm": 6.00028657913208, + "learning_rate": 5.221978589386876e-06, + "loss": 0.4101, + "step": 4456 + }, + { + "epoch": 0.49112947658402206, + "grad_norm": 4.266380786895752, + "learning_rate": 5.220231862182495e-06, + "loss": 0.3677, + "step": 4457 + }, + { + "epoch": 0.4912396694214876, + "grad_norm": 6.081855297088623, + "learning_rate": 5.218485108047862e-06, + "loss": 0.2898, + "step": 4458 + }, + { + "epoch": 0.49134986225895316, + "grad_norm": 7.620930194854736, + "learning_rate": 5.2167383271965745e-06, + "loss": 0.5123, + "step": 4459 + }, + { + "epoch": 0.4914600550964187, + "grad_norm": 5.6011247634887695, + "learning_rate": 
5.21499151984223e-06, + "loss": 0.3554, + "step": 4460 + }, + { + "epoch": 0.4915702479338843, + "grad_norm": 9.127986907958984, + "learning_rate": 5.2132446861984285e-06, + "loss": 0.5321, + "step": 4461 + }, + { + "epoch": 0.49168044077134987, + "grad_norm": 9.018638610839844, + "learning_rate": 5.21149782647878e-06, + "loss": 0.496, + "step": 4462 + }, + { + "epoch": 0.4917906336088154, + "grad_norm": 5.467581272125244, + "learning_rate": 5.2097509408968884e-06, + "loss": 0.4314, + "step": 4463 + }, + { + "epoch": 0.49190082644628097, + "grad_norm": 3.5782599449157715, + "learning_rate": 5.208004029666366e-06, + "loss": 0.4047, + "step": 4464 + }, + { + "epoch": 0.4920110192837466, + "grad_norm": 6.758794784545898, + "learning_rate": 5.20625709300083e-06, + "loss": 0.4573, + "step": 4465 + }, + { + "epoch": 0.4921212121212121, + "grad_norm": 6.822628974914551, + "learning_rate": 5.204510131113896e-06, + "loss": 0.4063, + "step": 4466 + }, + { + "epoch": 0.4922314049586777, + "grad_norm": 7.487907409667969, + "learning_rate": 5.202763144219185e-06, + "loss": 0.4272, + "step": 4467 + }, + { + "epoch": 0.4923415977961432, + "grad_norm": 8.487168312072754, + "learning_rate": 5.20101613253032e-06, + "loss": 0.4357, + "step": 4468 + }, + { + "epoch": 0.49245179063360883, + "grad_norm": 11.44509506225586, + "learning_rate": 5.199269096260932e-06, + "loss": 0.5114, + "step": 4469 + }, + { + "epoch": 0.4925619834710744, + "grad_norm": 10.665582656860352, + "learning_rate": 5.197522035624647e-06, + "loss": 0.3898, + "step": 4470 + }, + { + "epoch": 0.49267217630853993, + "grad_norm": 6.364475727081299, + "learning_rate": 5.195774950835098e-06, + "loss": 0.4094, + "step": 4471 + }, + { + "epoch": 0.4927823691460055, + "grad_norm": 6.148874759674072, + "learning_rate": 5.194027842105923e-06, + "loss": 0.4306, + "step": 4472 + }, + { + "epoch": 0.4928925619834711, + "grad_norm": 11.46756362915039, + "learning_rate": 5.192280709650761e-06, + "loss": 0.4999, + "step": 4473 + }, + { + "epoch": 0.49300275482093664, + "grad_norm": 9.750408172607422, + "learning_rate": 5.19053355368325e-06, + "loss": 0.3532, + "step": 4474 + }, + { + "epoch": 0.4931129476584022, + "grad_norm": 5.653137683868408, + "learning_rate": 5.188786374417039e-06, + "loss": 0.3562, + "step": 4475 + }, + { + "epoch": 0.4932231404958678, + "grad_norm": 6.068719387054443, + "learning_rate": 5.187039172065773e-06, + "loss": 0.4124, + "step": 4476 + }, + { + "epoch": 0.49333333333333335, + "grad_norm": 7.310559272766113, + "learning_rate": 5.185291946843104e-06, + "loss": 0.4666, + "step": 4477 + }, + { + "epoch": 0.4934435261707989, + "grad_norm": 7.6926589012146, + "learning_rate": 5.183544698962685e-06, + "loss": 0.4709, + "step": 4478 + }, + { + "epoch": 0.49355371900826445, + "grad_norm": 4.070844650268555, + "learning_rate": 5.181797428638168e-06, + "loss": 0.4124, + "step": 4479 + }, + { + "epoch": 0.49366391184573005, + "grad_norm": 4.346336841583252, + "learning_rate": 5.180050136083218e-06, + "loss": 0.341, + "step": 4480 + }, + { + "epoch": 0.4937741046831956, + "grad_norm": 6.669896125793457, + "learning_rate": 5.178302821511489e-06, + "loss": 0.4988, + "step": 4481 + }, + { + "epoch": 0.49388429752066115, + "grad_norm": 11.357707023620605, + "learning_rate": 5.176555485136652e-06, + "loss": 0.4435, + "step": 4482 + }, + { + "epoch": 0.4939944903581267, + "grad_norm": 5.121276378631592, + "learning_rate": 5.174808127172367e-06, + "loss": 0.3446, + "step": 4483 + }, + { + "epoch": 0.4941046831955923, + "grad_norm": 
7.8835530281066895, + "learning_rate": 5.1730607478323095e-06, + "loss": 0.3309, + "step": 4484 + }, + { + "epoch": 0.49421487603305786, + "grad_norm": 4.873608589172363, + "learning_rate": 5.171313347330148e-06, + "loss": 0.4474, + "step": 4485 + }, + { + "epoch": 0.4943250688705234, + "grad_norm": 5.673107147216797, + "learning_rate": 5.169565925879557e-06, + "loss": 0.4362, + "step": 4486 + }, + { + "epoch": 0.49443526170798896, + "grad_norm": 9.590445518493652, + "learning_rate": 5.167818483694216e-06, + "loss": 0.428, + "step": 4487 + }, + { + "epoch": 0.49454545454545457, + "grad_norm": 5.7733001708984375, + "learning_rate": 5.166071020987802e-06, + "loss": 0.3635, + "step": 4488 + }, + { + "epoch": 0.4946556473829201, + "grad_norm": 11.6152925491333, + "learning_rate": 5.164323537973996e-06, + "loss": 0.3634, + "step": 4489 + }, + { + "epoch": 0.49476584022038567, + "grad_norm": 7.276978015899658, + "learning_rate": 5.162576034866486e-06, + "loss": 0.5251, + "step": 4490 + }, + { + "epoch": 0.4948760330578512, + "grad_norm": 9.533393859863281, + "learning_rate": 5.160828511878959e-06, + "loss": 0.459, + "step": 4491 + }, + { + "epoch": 0.4949862258953168, + "grad_norm": 6.698780536651611, + "learning_rate": 5.159080969225101e-06, + "loss": 0.432, + "step": 4492 + }, + { + "epoch": 0.4950964187327824, + "grad_norm": 5.994406223297119, + "learning_rate": 5.157333407118608e-06, + "loss": 0.4019, + "step": 4493 + }, + { + "epoch": 0.4952066115702479, + "grad_norm": 7.2543625831604, + "learning_rate": 5.155585825773172e-06, + "loss": 0.4129, + "step": 4494 + }, + { + "epoch": 0.4953168044077135, + "grad_norm": 8.09793472290039, + "learning_rate": 5.153838225402489e-06, + "loss": 0.4901, + "step": 4495 + }, + { + "epoch": 0.4954269972451791, + "grad_norm": 4.807717800140381, + "learning_rate": 5.152090606220258e-06, + "loss": 0.4073, + "step": 4496 + }, + { + "epoch": 0.49553719008264463, + "grad_norm": 7.383471965789795, + "learning_rate": 5.150342968440181e-06, + "loss": 0.4761, + "step": 4497 + }, + { + "epoch": 0.4956473829201102, + "grad_norm": 5.512248516082764, + "learning_rate": 5.148595312275964e-06, + "loss": 0.4348, + "step": 4498 + }, + { + "epoch": 0.49575757575757573, + "grad_norm": 7.76236629486084, + "learning_rate": 5.146847637941308e-06, + "loss": 0.3897, + "step": 4499 + }, + { + "epoch": 0.49586776859504134, + "grad_norm": 9.837713241577148, + "learning_rate": 5.145099945649925e-06, + "loss": 0.4142, + "step": 4500 + }, + { + "epoch": 0.4959779614325069, + "grad_norm": 6.959941864013672, + "learning_rate": 5.143352235615523e-06, + "loss": 0.4046, + "step": 4501 + }, + { + "epoch": 0.49608815426997244, + "grad_norm": 7.133990287780762, + "learning_rate": 5.141604508051814e-06, + "loss": 0.467, + "step": 4502 + }, + { + "epoch": 0.49619834710743804, + "grad_norm": 4.255684852600098, + "learning_rate": 5.139856763172515e-06, + "loss": 0.4422, + "step": 4503 + }, + { + "epoch": 0.4963085399449036, + "grad_norm": 7.363885402679443, + "learning_rate": 5.13810900119134e-06, + "loss": 0.4305, + "step": 4504 + }, + { + "epoch": 0.49641873278236914, + "grad_norm": 7.709430694580078, + "learning_rate": 5.136361222322009e-06, + "loss": 0.371, + "step": 4505 + }, + { + "epoch": 0.4965289256198347, + "grad_norm": 4.840433120727539, + "learning_rate": 5.134613426778242e-06, + "loss": 0.3803, + "step": 4506 + }, + { + "epoch": 0.4966391184573003, + "grad_norm": 7.726146697998047, + "learning_rate": 5.1328656147737625e-06, + "loss": 0.4255, + "step": 4507 + }, + { + "epoch": 
0.49674931129476585, + "grad_norm": 3.6370415687561035, + "learning_rate": 5.131117786522296e-06, + "loss": 0.386, + "step": 4508 + }, + { + "epoch": 0.4968595041322314, + "grad_norm": 5.870304107666016, + "learning_rate": 5.129369942237567e-06, + "loss": 0.481, + "step": 4509 + }, + { + "epoch": 0.49696969696969695, + "grad_norm": 6.260837554931641, + "learning_rate": 5.127622082133307e-06, + "loss": 0.3926, + "step": 4510 + }, + { + "epoch": 0.49707988980716256, + "grad_norm": 7.427427291870117, + "learning_rate": 5.125874206423245e-06, + "loss": 0.4743, + "step": 4511 + }, + { + "epoch": 0.4971900826446281, + "grad_norm": 4.559499263763428, + "learning_rate": 5.124126315321114e-06, + "loss": 0.3297, + "step": 4512 + }, + { + "epoch": 0.49730027548209366, + "grad_norm": 5.440631866455078, + "learning_rate": 5.122378409040649e-06, + "loss": 0.3712, + "step": 4513 + }, + { + "epoch": 0.4974104683195592, + "grad_norm": 6.5577712059021, + "learning_rate": 5.120630487795585e-06, + "loss": 0.399, + "step": 4514 + }, + { + "epoch": 0.4975206611570248, + "grad_norm": 4.975412845611572, + "learning_rate": 5.118882551799662e-06, + "loss": 0.3959, + "step": 4515 + }, + { + "epoch": 0.49763085399449036, + "grad_norm": 4.3721537590026855, + "learning_rate": 5.117134601266619e-06, + "loss": 0.42, + "step": 4516 + }, + { + "epoch": 0.4977410468319559, + "grad_norm": 5.379798412322998, + "learning_rate": 5.1153866364101964e-06, + "loss": 0.414, + "step": 4517 + }, + { + "epoch": 0.49785123966942146, + "grad_norm": 7.518587589263916, + "learning_rate": 5.113638657444141e-06, + "loss": 0.4546, + "step": 4518 + }, + { + "epoch": 0.49796143250688707, + "grad_norm": 8.444965362548828, + "learning_rate": 5.111890664582196e-06, + "loss": 0.4242, + "step": 4519 + }, + { + "epoch": 0.4980716253443526, + "grad_norm": 7.5318989753723145, + "learning_rate": 5.110142658038107e-06, + "loss": 0.4095, + "step": 4520 + }, + { + "epoch": 0.49818181818181817, + "grad_norm": 4.474093914031982, + "learning_rate": 5.108394638025626e-06, + "loss": 0.3704, + "step": 4521 + }, + { + "epoch": 0.4982920110192837, + "grad_norm": 5.056334495544434, + "learning_rate": 5.106646604758501e-06, + "loss": 0.3922, + "step": 4522 + }, + { + "epoch": 0.4984022038567493, + "grad_norm": 8.53176498413086, + "learning_rate": 5.104898558450484e-06, + "loss": 0.4607, + "step": 4523 + }, + { + "epoch": 0.4985123966942149, + "grad_norm": 6.229320526123047, + "learning_rate": 5.103150499315328e-06, + "loss": 0.3709, + "step": 4524 + }, + { + "epoch": 0.4986225895316804, + "grad_norm": 4.518656253814697, + "learning_rate": 5.101402427566789e-06, + "loss": 0.3943, + "step": 4525 + }, + { + "epoch": 0.49873278236914603, + "grad_norm": 6.313726425170898, + "learning_rate": 5.0996543434186254e-06, + "loss": 0.4078, + "step": 4526 + }, + { + "epoch": 0.4988429752066116, + "grad_norm": 7.320578575134277, + "learning_rate": 5.097906247084592e-06, + "loss": 0.4082, + "step": 4527 + }, + { + "epoch": 0.49895316804407713, + "grad_norm": 7.043966293334961, + "learning_rate": 5.0961581387784495e-06, + "loss": 0.3463, + "step": 4528 + }, + { + "epoch": 0.4990633608815427, + "grad_norm": 10.093509674072266, + "learning_rate": 5.09441001871396e-06, + "loss": 0.499, + "step": 4529 + }, + { + "epoch": 0.4991735537190083, + "grad_norm": 9.356034278869629, + "learning_rate": 5.092661887104883e-06, + "loss": 0.4536, + "step": 4530 + }, + { + "epoch": 0.49928374655647384, + "grad_norm": 5.936094284057617, + "learning_rate": 5.090913744164987e-06, + "loss": 0.3736, + 
"step": 4531 + }, + { + "epoch": 0.4993939393939394, + "grad_norm": 5.618441104888916, + "learning_rate": 5.0891655901080325e-06, + "loss": 0.408, + "step": 4532 + }, + { + "epoch": 0.49950413223140494, + "grad_norm": 5.921712875366211, + "learning_rate": 5.08741742514779e-06, + "loss": 0.4293, + "step": 4533 + }, + { + "epoch": 0.49961432506887055, + "grad_norm": 6.156184196472168, + "learning_rate": 5.085669249498027e-06, + "loss": 0.3517, + "step": 4534 + }, + { + "epoch": 0.4997245179063361, + "grad_norm": 6.982524871826172, + "learning_rate": 5.08392106337251e-06, + "loss": 0.3507, + "step": 4535 + }, + { + "epoch": 0.49983471074380165, + "grad_norm": 13.732891082763672, + "learning_rate": 5.082172866985014e-06, + "loss": 0.5273, + "step": 4536 + }, + { + "epoch": 0.4999449035812672, + "grad_norm": 5.866691589355469, + "learning_rate": 5.080424660549305e-06, + "loss": 0.449, + "step": 4537 + }, + { + "epoch": 0.5000550964187328, + "grad_norm": 6.930387496948242, + "learning_rate": 5.07867644427916e-06, + "loss": 0.4369, + "step": 4538 + }, + { + "epoch": 0.5001652892561983, + "grad_norm": 17.0075740814209, + "learning_rate": 5.076928218388353e-06, + "loss": 0.4197, + "step": 4539 + }, + { + "epoch": 0.5002754820936639, + "grad_norm": 6.532504081726074, + "learning_rate": 5.07517998309066e-06, + "loss": 0.3714, + "step": 4540 + }, + { + "epoch": 0.5002754820936639, + "eval_loss": 0.4173244833946228, + "eval_runtime": 41.943, + "eval_samples_per_second": 17.5, + "eval_steps_per_second": 2.193, + "step": 4540 + }, + { + "epoch": 0.5003856749311295, + "grad_norm": 6.452209949493408, + "learning_rate": 5.073431738599855e-06, + "loss": 0.3411, + "step": 4541 + }, + { + "epoch": 0.500495867768595, + "grad_norm": 6.462887763977051, + "learning_rate": 5.071683485129718e-06, + "loss": 0.3818, + "step": 4542 + }, + { + "epoch": 0.5006060606060606, + "grad_norm": 11.5089693069458, + "learning_rate": 5.069935222894027e-06, + "loss": 0.4426, + "step": 4543 + }, + { + "epoch": 0.5007162534435262, + "grad_norm": 9.659563064575195, + "learning_rate": 5.068186952106562e-06, + "loss": 0.4759, + "step": 4544 + }, + { + "epoch": 0.5008264462809917, + "grad_norm": 5.877937316894531, + "learning_rate": 5.066438672981103e-06, + "loss": 0.3361, + "step": 4545 + }, + { + "epoch": 0.5009366391184573, + "grad_norm": 6.2046074867248535, + "learning_rate": 5.064690385731434e-06, + "loss": 0.4354, + "step": 4546 + }, + { + "epoch": 0.5010468319559228, + "grad_norm": 4.185536861419678, + "learning_rate": 5.062942090571337e-06, + "loss": 0.4153, + "step": 4547 + }, + { + "epoch": 0.5011570247933884, + "grad_norm": 5.724506378173828, + "learning_rate": 5.061193787714595e-06, + "loss": 0.3996, + "step": 4548 + }, + { + "epoch": 0.501267217630854, + "grad_norm": 6.192051410675049, + "learning_rate": 5.059445477374992e-06, + "loss": 0.3987, + "step": 4549 + }, + { + "epoch": 0.5013774104683195, + "grad_norm": 8.699459075927734, + "learning_rate": 5.057697159766319e-06, + "loss": 0.4474, + "step": 4550 + }, + { + "epoch": 0.5014876033057851, + "grad_norm": 13.547306060791016, + "learning_rate": 5.055948835102354e-06, + "loss": 0.5094, + "step": 4551 + }, + { + "epoch": 0.5015977961432507, + "grad_norm": 7.65572452545166, + "learning_rate": 5.054200503596894e-06, + "loss": 0.4359, + "step": 4552 + }, + { + "epoch": 0.5017079889807162, + "grad_norm": 6.122793674468994, + "learning_rate": 5.0524521654637195e-06, + "loss": 0.4099, + "step": 4553 + }, + { + "epoch": 0.5018181818181818, + "grad_norm": 8.056415557861328, + 
"learning_rate": 5.050703820916626e-06, + "loss": 0.4108, + "step": 4554 + }, + { + "epoch": 0.5019283746556474, + "grad_norm": 6.604487895965576, + "learning_rate": 5.048955470169398e-06, + "loss": 0.3795, + "step": 4555 + }, + { + "epoch": 0.5020385674931129, + "grad_norm": 6.377932071685791, + "learning_rate": 5.04720711343583e-06, + "loss": 0.4065, + "step": 4556 + }, + { + "epoch": 0.5021487603305785, + "grad_norm": 6.709017276763916, + "learning_rate": 5.045458750929712e-06, + "loss": 0.3861, + "step": 4557 + }, + { + "epoch": 0.502258953168044, + "grad_norm": 7.791820049285889, + "learning_rate": 5.0437103828648345e-06, + "loss": 0.3611, + "step": 4558 + }, + { + "epoch": 0.5023691460055096, + "grad_norm": 3.928077220916748, + "learning_rate": 5.041962009454994e-06, + "loss": 0.3614, + "step": 4559 + }, + { + "epoch": 0.5024793388429752, + "grad_norm": 5.483081817626953, + "learning_rate": 5.04021363091398e-06, + "loss": 0.4157, + "step": 4560 + }, + { + "epoch": 0.5025895316804407, + "grad_norm": 5.403481960296631, + "learning_rate": 5.038465247455591e-06, + "loss": 0.4341, + "step": 4561 + }, + { + "epoch": 0.5026997245179063, + "grad_norm": 7.353564739227295, + "learning_rate": 5.036716859293618e-06, + "loss": 0.5812, + "step": 4562 + }, + { + "epoch": 0.502809917355372, + "grad_norm": 6.972169399261475, + "learning_rate": 5.034968466641856e-06, + "loss": 0.4056, + "step": 4563 + }, + { + "epoch": 0.5029201101928374, + "grad_norm": 6.7498955726623535, + "learning_rate": 5.033220069714104e-06, + "loss": 0.4436, + "step": 4564 + }, + { + "epoch": 0.503030303030303, + "grad_norm": 10.816003799438477, + "learning_rate": 5.031471668724158e-06, + "loss": 0.4928, + "step": 4565 + }, + { + "epoch": 0.5031404958677685, + "grad_norm": 7.438859939575195, + "learning_rate": 5.0297232638858115e-06, + "loss": 0.3682, + "step": 4566 + }, + { + "epoch": 0.5032506887052342, + "grad_norm": 7.910082817077637, + "learning_rate": 5.027974855412865e-06, + "loss": 0.4335, + "step": 4567 + }, + { + "epoch": 0.5033608815426998, + "grad_norm": 4.845508098602295, + "learning_rate": 5.026226443519116e-06, + "loss": 0.4097, + "step": 4568 + }, + { + "epoch": 0.5034710743801653, + "grad_norm": 11.199101448059082, + "learning_rate": 5.02447802841836e-06, + "loss": 0.4512, + "step": 4569 + }, + { + "epoch": 0.5035812672176309, + "grad_norm": 10.137066841125488, + "learning_rate": 5.022729610324398e-06, + "loss": 0.3955, + "step": 4570 + }, + { + "epoch": 0.5036914600550965, + "grad_norm": 5.9792938232421875, + "learning_rate": 5.020981189451028e-06, + "loss": 0.4271, + "step": 4571 + }, + { + "epoch": 0.503801652892562, + "grad_norm": 5.189902305603027, + "learning_rate": 5.019232766012052e-06, + "loss": 0.4371, + "step": 4572 + }, + { + "epoch": 0.5039118457300276, + "grad_norm": 7.720902919769287, + "learning_rate": 5.017484340221265e-06, + "loss": 0.3973, + "step": 4573 + }, + { + "epoch": 0.5040220385674931, + "grad_norm": 5.592770099639893, + "learning_rate": 5.015735912292469e-06, + "loss": 0.3591, + "step": 4574 + }, + { + "epoch": 0.5041322314049587, + "grad_norm": 8.448620796203613, + "learning_rate": 5.013987482439465e-06, + "loss": 0.4779, + "step": 4575 + }, + { + "epoch": 0.5042424242424243, + "grad_norm": 4.631404399871826, + "learning_rate": 5.012239050876052e-06, + "loss": 0.4248, + "step": 4576 + }, + { + "epoch": 0.5043526170798898, + "grad_norm": 10.913233757019043, + "learning_rate": 5.010490617816033e-06, + "loss": 0.3743, + "step": 4577 + }, + { + "epoch": 0.5044628099173554, + 
"grad_norm": 5.6780314445495605, + "learning_rate": 5.008742183473205e-06, + "loss": 0.4088, + "step": 4578 + }, + { + "epoch": 0.504573002754821, + "grad_norm": 7.929697513580322, + "learning_rate": 5.006993748061372e-06, + "loss": 0.3323, + "step": 4579 + }, + { + "epoch": 0.5046831955922865, + "grad_norm": 8.058935165405273, + "learning_rate": 5.005245311794334e-06, + "loss": 0.4651, + "step": 4580 + }, + { + "epoch": 0.5047933884297521, + "grad_norm": 4.103596210479736, + "learning_rate": 5.0034968748858905e-06, + "loss": 0.3993, + "step": 4581 + }, + { + "epoch": 0.5049035812672177, + "grad_norm": 5.556540012359619, + "learning_rate": 5.001748437549847e-06, + "loss": 0.3532, + "step": 4582 + }, + { + "epoch": 0.5050137741046832, + "grad_norm": 10.806697845458984, + "learning_rate": 5e-06, + "loss": 0.4152, + "step": 4583 + }, + { + "epoch": 0.5051239669421488, + "grad_norm": 5.479221820831299, + "learning_rate": 4.998251562450155e-06, + "loss": 0.4651, + "step": 4584 + }, + { + "epoch": 0.5052341597796143, + "grad_norm": 8.868670463562012, + "learning_rate": 4.9965031251141095e-06, + "loss": 0.3818, + "step": 4585 + }, + { + "epoch": 0.5053443526170799, + "grad_norm": 4.519674301147461, + "learning_rate": 4.994754688205667e-06, + "loss": 0.4236, + "step": 4586 + }, + { + "epoch": 0.5054545454545455, + "grad_norm": 7.915712833404541, + "learning_rate": 4.99300625193863e-06, + "loss": 0.4449, + "step": 4587 + }, + { + "epoch": 0.505564738292011, + "grad_norm": 7.476759910583496, + "learning_rate": 4.9912578165267955e-06, + "loss": 0.3903, + "step": 4588 + }, + { + "epoch": 0.5056749311294766, + "grad_norm": 6.582509517669678, + "learning_rate": 4.989509382183969e-06, + "loss": 0.4416, + "step": 4589 + }, + { + "epoch": 0.5057851239669422, + "grad_norm": 5.545627593994141, + "learning_rate": 4.98776094912395e-06, + "loss": 0.3882, + "step": 4590 + }, + { + "epoch": 0.5058953168044077, + "grad_norm": 6.2511749267578125, + "learning_rate": 4.986012517560536e-06, + "loss": 0.4118, + "step": 4591 + }, + { + "epoch": 0.5060055096418733, + "grad_norm": 6.856733798980713, + "learning_rate": 4.984264087707532e-06, + "loss": 0.4525, + "step": 4592 + }, + { + "epoch": 0.5061157024793388, + "grad_norm": 4.939981460571289, + "learning_rate": 4.982515659778736e-06, + "loss": 0.4167, + "step": 4593 + }, + { + "epoch": 0.5062258953168044, + "grad_norm": 6.003458499908447, + "learning_rate": 4.9807672339879494e-06, + "loss": 0.4306, + "step": 4594 + }, + { + "epoch": 0.50633608815427, + "grad_norm": 6.502118110656738, + "learning_rate": 4.9790188105489734e-06, + "loss": 0.3594, + "step": 4595 + }, + { + "epoch": 0.5064462809917355, + "grad_norm": 4.995030403137207, + "learning_rate": 4.977270389675603e-06, + "loss": 0.3798, + "step": 4596 + }, + { + "epoch": 0.5065564738292011, + "grad_norm": 7.406594276428223, + "learning_rate": 4.975521971581641e-06, + "loss": 0.4716, + "step": 4597 + }, + { + "epoch": 0.5066666666666667, + "grad_norm": 4.951489448547363, + "learning_rate": 4.973773556480887e-06, + "loss": 0.4043, + "step": 4598 + }, + { + "epoch": 0.5067768595041322, + "grad_norm": 5.4200239181518555, + "learning_rate": 4.972025144587135e-06, + "loss": 0.4161, + "step": 4599 + }, + { + "epoch": 0.5068870523415978, + "grad_norm": 4.892787456512451, + "learning_rate": 4.97027673611419e-06, + "loss": 0.3647, + "step": 4600 + }, + { + "epoch": 0.5069972451790634, + "grad_norm": 5.32407808303833, + "learning_rate": 4.968528331275845e-06, + "loss": 0.4264, + "step": 4601 + }, + { + "epoch": 
0.5071074380165289, + "grad_norm": 5.645662784576416, + "learning_rate": 4.966779930285897e-06, + "loss": 0.3807, + "step": 4602 + }, + { + "epoch": 0.5072176308539945, + "grad_norm": 8.279090881347656, + "learning_rate": 4.965031533358145e-06, + "loss": 0.4578, + "step": 4603 + }, + { + "epoch": 0.50732782369146, + "grad_norm": 6.70154333114624, + "learning_rate": 4.9632831407063855e-06, + "loss": 0.4058, + "step": 4604 + }, + { + "epoch": 0.5074380165289256, + "grad_norm": 7.6447248458862305, + "learning_rate": 4.9615347525444115e-06, + "loss": 0.3833, + "step": 4605 + }, + { + "epoch": 0.5075482093663912, + "grad_norm": 8.844406127929688, + "learning_rate": 4.959786369086022e-06, + "loss": 0.4539, + "step": 4606 + }, + { + "epoch": 0.5076584022038567, + "grad_norm": 8.162901878356934, + "learning_rate": 4.958037990545008e-06, + "loss": 0.4531, + "step": 4607 + }, + { + "epoch": 0.5077685950413223, + "grad_norm": 12.129911422729492, + "learning_rate": 4.956289617135166e-06, + "loss": 0.4316, + "step": 4608 + }, + { + "epoch": 0.5078787878787879, + "grad_norm": 6.629499912261963, + "learning_rate": 4.954541249070291e-06, + "loss": 0.4399, + "step": 4609 + }, + { + "epoch": 0.5079889807162534, + "grad_norm": 7.2819952964782715, + "learning_rate": 4.9527928865641714e-06, + "loss": 0.398, + "step": 4610 + }, + { + "epoch": 0.508099173553719, + "grad_norm": 5.846472263336182, + "learning_rate": 4.951044529830603e-06, + "loss": 0.4775, + "step": 4611 + }, + { + "epoch": 0.5082093663911845, + "grad_norm": 6.285477638244629, + "learning_rate": 4.949296179083377e-06, + "loss": 0.4359, + "step": 4612 + }, + { + "epoch": 0.5083195592286501, + "grad_norm": 4.833187580108643, + "learning_rate": 4.9475478345362804e-06, + "loss": 0.4372, + "step": 4613 + }, + { + "epoch": 0.5084297520661157, + "grad_norm": 7.467216491699219, + "learning_rate": 4.945799496403108e-06, + "loss": 0.3121, + "step": 4614 + }, + { + "epoch": 0.5085399449035812, + "grad_norm": 4.804136753082275, + "learning_rate": 4.944051164897647e-06, + "loss": 0.3873, + "step": 4615 + }, + { + "epoch": 0.5086501377410468, + "grad_norm": 5.1879472732543945, + "learning_rate": 4.942302840233684e-06, + "loss": 0.4423, + "step": 4616 + }, + { + "epoch": 0.5087603305785124, + "grad_norm": 11.147826194763184, + "learning_rate": 4.940554522625008e-06, + "loss": 0.5191, + "step": 4617 + }, + { + "epoch": 0.5088705234159779, + "grad_norm": 5.908621311187744, + "learning_rate": 4.938806212285408e-06, + "loss": 0.4041, + "step": 4618 + }, + { + "epoch": 0.5089807162534435, + "grad_norm": 5.038792610168457, + "learning_rate": 4.937057909428665e-06, + "loss": 0.428, + "step": 4619 + }, + { + "epoch": 0.509090909090909, + "grad_norm": 4.840710163116455, + "learning_rate": 4.935309614268567e-06, + "loss": 0.4918, + "step": 4620 + }, + { + "epoch": 0.5092011019283746, + "grad_norm": 9.819032669067383, + "learning_rate": 4.933561327018897e-06, + "loss": 0.4938, + "step": 4621 + }, + { + "epoch": 0.5093112947658403, + "grad_norm": 5.381676197052002, + "learning_rate": 4.93181304789344e-06, + "loss": 0.4242, + "step": 4622 + }, + { + "epoch": 0.5094214876033057, + "grad_norm": 7.184305191040039, + "learning_rate": 4.930064777105976e-06, + "loss": 0.4407, + "step": 4623 + }, + { + "epoch": 0.5095316804407714, + "grad_norm": 5.1411333084106445, + "learning_rate": 4.928316514870283e-06, + "loss": 0.3675, + "step": 4624 + }, + { + "epoch": 0.509641873278237, + "grad_norm": 6.193539142608643, + "learning_rate": 4.9265682614001455e-06, + "loss": 0.3727, + "step": 
4625 + }, + { + "epoch": 0.5097520661157025, + "grad_norm": 8.644025802612305, + "learning_rate": 4.924820016909343e-06, + "loss": 0.4193, + "step": 4626 + }, + { + "epoch": 0.5098622589531681, + "grad_norm": 5.382510185241699, + "learning_rate": 4.923071781611647e-06, + "loss": 0.3977, + "step": 4627 + }, + { + "epoch": 0.5099724517906337, + "grad_norm": 6.619536399841309, + "learning_rate": 4.921323555720842e-06, + "loss": 0.4694, + "step": 4628 + }, + { + "epoch": 0.5100826446280992, + "grad_norm": 10.856200218200684, + "learning_rate": 4.919575339450698e-06, + "loss": 0.4868, + "step": 4629 + }, + { + "epoch": 0.5101928374655648, + "grad_norm": 6.595972537994385, + "learning_rate": 4.917827133014988e-06, + "loss": 0.4259, + "step": 4630 + }, + { + "epoch": 0.5103030303030303, + "grad_norm": 6.2144904136657715, + "learning_rate": 4.916078936627492e-06, + "loss": 0.4727, + "step": 4631 + }, + { + "epoch": 0.5104132231404959, + "grad_norm": 4.708610534667969, + "learning_rate": 4.914330750501975e-06, + "loss": 0.3465, + "step": 4632 + }, + { + "epoch": 0.5105234159779615, + "grad_norm": 4.370340347290039, + "learning_rate": 4.912582574852211e-06, + "loss": 0.3769, + "step": 4633 + }, + { + "epoch": 0.510633608815427, + "grad_norm": 5.362609386444092, + "learning_rate": 4.910834409891968e-06, + "loss": 0.4057, + "step": 4634 + }, + { + "epoch": 0.5107438016528926, + "grad_norm": 5.957698345184326, + "learning_rate": 4.909086255835015e-06, + "loss": 0.4122, + "step": 4635 + }, + { + "epoch": 0.5108539944903582, + "grad_norm": 5.449244022369385, + "learning_rate": 4.907338112895118e-06, + "loss": 0.4174, + "step": 4636 + }, + { + "epoch": 0.5109641873278237, + "grad_norm": 4.519969940185547, + "learning_rate": 4.905589981286043e-06, + "loss": 0.3028, + "step": 4637 + }, + { + "epoch": 0.5110743801652893, + "grad_norm": 10.036328315734863, + "learning_rate": 4.903841861221552e-06, + "loss": 0.4588, + "step": 4638 + }, + { + "epoch": 0.5111845730027548, + "grad_norm": 7.442993640899658, + "learning_rate": 4.90209375291541e-06, + "loss": 0.4371, + "step": 4639 + }, + { + "epoch": 0.5112947658402204, + "grad_norm": 6.990756034851074, + "learning_rate": 4.900345656581377e-06, + "loss": 0.4603, + "step": 4640 + }, + { + "epoch": 0.511404958677686, + "grad_norm": 6.148458480834961, + "learning_rate": 4.898597572433212e-06, + "loss": 0.4531, + "step": 4641 + }, + { + "epoch": 0.5115151515151515, + "grad_norm": 6.159834861755371, + "learning_rate": 4.896849500684673e-06, + "loss": 0.43, + "step": 4642 + }, + { + "epoch": 0.5116253443526171, + "grad_norm": 5.136331558227539, + "learning_rate": 4.895101441549518e-06, + "loss": 0.4275, + "step": 4643 + }, + { + "epoch": 0.5117355371900827, + "grad_norm": 5.321721076965332, + "learning_rate": 4.8933533952415e-06, + "loss": 0.3909, + "step": 4644 + }, + { + "epoch": 0.5118457300275482, + "grad_norm": 3.884355306625366, + "learning_rate": 4.891605361974375e-06, + "loss": 0.3781, + "step": 4645 + }, + { + "epoch": 0.5119559228650138, + "grad_norm": 5.795078754425049, + "learning_rate": 4.889857341961894e-06, + "loss": 0.4706, + "step": 4646 + }, + { + "epoch": 0.5120661157024793, + "grad_norm": 7.165544033050537, + "learning_rate": 4.888109335417805e-06, + "loss": 0.5056, + "step": 4647 + }, + { + "epoch": 0.5121763085399449, + "grad_norm": 4.470412731170654, + "learning_rate": 4.88636134255586e-06, + "loss": 0.3842, + "step": 4648 + }, + { + "epoch": 0.5122865013774105, + "grad_norm": 5.512759208679199, + "learning_rate": 4.8846133635898035e-06, + "loss": 
0.4169, + "step": 4649 + }, + { + "epoch": 0.512396694214876, + "grad_norm": 4.522556781768799, + "learning_rate": 4.882865398733383e-06, + "loss": 0.3372, + "step": 4650 + }, + { + "epoch": 0.5125068870523416, + "grad_norm": 7.432344913482666, + "learning_rate": 4.881117448200339e-06, + "loss": 0.4508, + "step": 4651 + }, + { + "epoch": 0.5126170798898072, + "grad_norm": 5.732212066650391, + "learning_rate": 4.879369512204415e-06, + "loss": 0.4417, + "step": 4652 + }, + { + "epoch": 0.5127272727272727, + "grad_norm": 6.637563228607178, + "learning_rate": 4.877621590959352e-06, + "loss": 0.4064, + "step": 4653 + }, + { + "epoch": 0.5128374655647383, + "grad_norm": 8.682241439819336, + "learning_rate": 4.8758736846788885e-06, + "loss": 0.4144, + "step": 4654 + }, + { + "epoch": 0.5129476584022039, + "grad_norm": 6.463537216186523, + "learning_rate": 4.874125793576755e-06, + "loss": 0.4625, + "step": 4655 + }, + { + "epoch": 0.5130578512396694, + "grad_norm": 6.958169460296631, + "learning_rate": 4.872377917866695e-06, + "loss": 0.4418, + "step": 4656 + }, + { + "epoch": 0.513168044077135, + "grad_norm": 5.541899681091309, + "learning_rate": 4.870630057762435e-06, + "loss": 0.3645, + "step": 4657 + }, + { + "epoch": 0.5132782369146005, + "grad_norm": 3.5046939849853516, + "learning_rate": 4.868882213477704e-06, + "loss": 0.4038, + "step": 4658 + }, + { + "epoch": 0.5133884297520661, + "grad_norm": 10.378771781921387, + "learning_rate": 4.867134385226239e-06, + "loss": 0.3873, + "step": 4659 + }, + { + "epoch": 0.5134986225895317, + "grad_norm": 5.14128303527832, + "learning_rate": 4.86538657322176e-06, + "loss": 0.3369, + "step": 4660 + }, + { + "epoch": 0.5136088154269972, + "grad_norm": 11.744321823120117, + "learning_rate": 4.863638777677993e-06, + "loss": 0.4452, + "step": 4661 + }, + { + "epoch": 0.5137190082644628, + "grad_norm": 5.557402610778809, + "learning_rate": 4.861890998808662e-06, + "loss": 0.4282, + "step": 4662 + }, + { + "epoch": 0.5138292011019284, + "grad_norm": 4.960723400115967, + "learning_rate": 4.860143236827485e-06, + "loss": 0.3699, + "step": 4663 + }, + { + "epoch": 0.5139393939393939, + "grad_norm": 10.539837837219238, + "learning_rate": 4.858395491948187e-06, + "loss": 0.4571, + "step": 4664 + }, + { + "epoch": 0.5140495867768595, + "grad_norm": 5.027210712432861, + "learning_rate": 4.8566477643844795e-06, + "loss": 0.4328, + "step": 4665 + }, + { + "epoch": 0.514159779614325, + "grad_norm": 7.645861625671387, + "learning_rate": 4.8549000543500765e-06, + "loss": 0.4807, + "step": 4666 + }, + { + "epoch": 0.5142699724517906, + "grad_norm": 6.753965377807617, + "learning_rate": 4.853152362058693e-06, + "loss": 0.4608, + "step": 4667 + }, + { + "epoch": 0.5143801652892562, + "grad_norm": 6.281160354614258, + "learning_rate": 4.851404687724038e-06, + "loss": 0.4345, + "step": 4668 + }, + { + "epoch": 0.5144903581267217, + "grad_norm": 10.625886917114258, + "learning_rate": 4.849657031559819e-06, + "loss": 0.5259, + "step": 4669 + }, + { + "epoch": 0.5146005509641873, + "grad_norm": 7.434563159942627, + "learning_rate": 4.847909393779743e-06, + "loss": 0.3596, + "step": 4670 + }, + { + "epoch": 0.5147107438016529, + "grad_norm": 5.95920467376709, + "learning_rate": 4.846161774597514e-06, + "loss": 0.4121, + "step": 4671 + }, + { + "epoch": 0.5148209366391184, + "grad_norm": 7.267666816711426, + "learning_rate": 4.84441417422683e-06, + "loss": 0.4547, + "step": 4672 + }, + { + "epoch": 0.514931129476584, + "grad_norm": 4.895453453063965, + "learning_rate": 
4.842666592881394e-06, + "loss": 0.4458, + "step": 4673 + }, + { + "epoch": 0.5150413223140495, + "grad_norm": 7.7665205001831055, + "learning_rate": 4.8409190307749e-06, + "loss": 0.4, + "step": 4674 + }, + { + "epoch": 0.5151515151515151, + "grad_norm": 6.711982727050781, + "learning_rate": 4.839171488121042e-06, + "loss": 0.4023, + "step": 4675 + }, + { + "epoch": 0.5152617079889807, + "grad_norm": 6.800289630889893, + "learning_rate": 4.8374239651335145e-06, + "loss": 0.4279, + "step": 4676 + }, + { + "epoch": 0.5153719008264462, + "grad_norm": 6.929649353027344, + "learning_rate": 4.835676462026004e-06, + "loss": 0.4309, + "step": 4677 + }, + { + "epoch": 0.5154820936639118, + "grad_norm": 7.900669574737549, + "learning_rate": 4.8339289790122004e-06, + "loss": 0.3919, + "step": 4678 + }, + { + "epoch": 0.5155922865013775, + "grad_norm": 3.486950397491455, + "learning_rate": 4.832181516305786e-06, + "loss": 0.4277, + "step": 4679 + }, + { + "epoch": 0.515702479338843, + "grad_norm": 5.796440124511719, + "learning_rate": 4.830434074120443e-06, + "loss": 0.4239, + "step": 4680 + }, + { + "epoch": 0.5158126721763086, + "grad_norm": 6.509128570556641, + "learning_rate": 4.828686652669853e-06, + "loss": 0.4457, + "step": 4681 + }, + { + "epoch": 0.5159228650137742, + "grad_norm": 4.2097487449646, + "learning_rate": 4.826939252167693e-06, + "loss": 0.369, + "step": 4682 + }, + { + "epoch": 0.5160330578512397, + "grad_norm": 7.689331531524658, + "learning_rate": 4.825191872827633e-06, + "loss": 0.3942, + "step": 4683 + }, + { + "epoch": 0.5161432506887053, + "grad_norm": 7.9642839431762695, + "learning_rate": 4.82344451486335e-06, + "loss": 0.4656, + "step": 4684 + }, + { + "epoch": 0.5162534435261708, + "grad_norm": 10.46125602722168, + "learning_rate": 4.821697178488513e-06, + "loss": 0.5403, + "step": 4685 + }, + { + "epoch": 0.5163636363636364, + "grad_norm": 6.136967182159424, + "learning_rate": 4.819949863916784e-06, + "loss": 0.4271, + "step": 4686 + }, + { + "epoch": 0.516473829201102, + "grad_norm": 5.80336332321167, + "learning_rate": 4.818202571361834e-06, + "loss": 0.3986, + "step": 4687 + }, + { + "epoch": 0.5165840220385675, + "grad_norm": 7.430635929107666, + "learning_rate": 4.8164553010373185e-06, + "loss": 0.3748, + "step": 4688 + }, + { + "epoch": 0.5166942148760331, + "grad_norm": 4.121450424194336, + "learning_rate": 4.814708053156897e-06, + "loss": 0.3597, + "step": 4689 + }, + { + "epoch": 0.5168044077134987, + "grad_norm": 5.318500995635986, + "learning_rate": 4.812960827934228e-06, + "loss": 0.483, + "step": 4690 + }, + { + "epoch": 0.5169146005509642, + "grad_norm": 4.661088466644287, + "learning_rate": 4.811213625582961e-06, + "loss": 0.4194, + "step": 4691 + }, + { + "epoch": 0.5170247933884298, + "grad_norm": 5.725320816040039, + "learning_rate": 4.809466446316751e-06, + "loss": 0.3859, + "step": 4692 + }, + { + "epoch": 0.5171349862258953, + "grad_norm": 5.202004432678223, + "learning_rate": 4.807719290349242e-06, + "loss": 0.4025, + "step": 4693 + }, + { + "epoch": 0.5172451790633609, + "grad_norm": 5.964555740356445, + "learning_rate": 4.805972157894078e-06, + "loss": 0.3745, + "step": 4694 + }, + { + "epoch": 0.5173553719008265, + "grad_norm": 8.678040504455566, + "learning_rate": 4.804225049164903e-06, + "loss": 0.5042, + "step": 4695 + }, + { + "epoch": 0.517465564738292, + "grad_norm": 6.35433292388916, + "learning_rate": 4.802477964375356e-06, + "loss": 0.3602, + "step": 4696 + }, + { + "epoch": 0.5175757575757576, + "grad_norm": 7.269749164581299, + 
"learning_rate": 4.80073090373907e-06, + "loss": 0.455, + "step": 4697 + }, + { + "epoch": 0.5176859504132232, + "grad_norm": 4.803569316864014, + "learning_rate": 4.798983867469681e-06, + "loss": 0.4071, + "step": 4698 + }, + { + "epoch": 0.5177961432506887, + "grad_norm": 5.32893180847168, + "learning_rate": 4.797236855780818e-06, + "loss": 0.3967, + "step": 4699 + }, + { + "epoch": 0.5179063360881543, + "grad_norm": 5.919497013092041, + "learning_rate": 4.795489868886106e-06, + "loss": 0.3981, + "step": 4700 + }, + { + "epoch": 0.5180165289256199, + "grad_norm": 6.551672458648682, + "learning_rate": 4.793742906999172e-06, + "loss": 0.3516, + "step": 4701 + }, + { + "epoch": 0.5181267217630854, + "grad_norm": 13.528398513793945, + "learning_rate": 4.791995970333636e-06, + "loss": 0.4038, + "step": 4702 + }, + { + "epoch": 0.518236914600551, + "grad_norm": 6.5294318199157715, + "learning_rate": 4.790249059103113e-06, + "loss": 0.3319, + "step": 4703 + }, + { + "epoch": 0.5183471074380165, + "grad_norm": 6.170133113861084, + "learning_rate": 4.788502173521222e-06, + "loss": 0.4436, + "step": 4704 + }, + { + "epoch": 0.5184573002754821, + "grad_norm": 5.018182277679443, + "learning_rate": 4.7867553138015715e-06, + "loss": 0.3969, + "step": 4705 + }, + { + "epoch": 0.5185674931129477, + "grad_norm": 7.418249607086182, + "learning_rate": 4.785008480157772e-06, + "loss": 0.3985, + "step": 4706 + }, + { + "epoch": 0.5186776859504132, + "grad_norm": 5.478403568267822, + "learning_rate": 4.783261672803427e-06, + "loss": 0.3892, + "step": 4707 + }, + { + "epoch": 0.5187878787878788, + "grad_norm": 12.865488052368164, + "learning_rate": 4.7815148919521385e-06, + "loss": 0.5772, + "step": 4708 + }, + { + "epoch": 0.5188980716253444, + "grad_norm": 4.450212001800537, + "learning_rate": 4.7797681378175065e-06, + "loss": 0.3204, + "step": 4709 + }, + { + "epoch": 0.5190082644628099, + "grad_norm": 6.353028774261475, + "learning_rate": 4.778021410613126e-06, + "loss": 0.4429, + "step": 4710 + }, + { + "epoch": 0.5191184573002755, + "grad_norm": 4.610799312591553, + "learning_rate": 4.776274710552588e-06, + "loss": 0.353, + "step": 4711 + }, + { + "epoch": 0.519228650137741, + "grad_norm": 6.441452503204346, + "learning_rate": 4.774528037849482e-06, + "loss": 0.4051, + "step": 4712 + }, + { + "epoch": 0.5193388429752066, + "grad_norm": 14.416350364685059, + "learning_rate": 4.772781392717397e-06, + "loss": 0.3931, + "step": 4713 + }, + { + "epoch": 0.5194490358126722, + "grad_norm": 11.664390563964844, + "learning_rate": 4.771034775369907e-06, + "loss": 0.3659, + "step": 4714 + }, + { + "epoch": 0.5195592286501377, + "grad_norm": 5.152129173278809, + "learning_rate": 4.769288186020599e-06, + "loss": 0.3688, + "step": 4715 + }, + { + "epoch": 0.5196694214876033, + "grad_norm": 6.308830261230469, + "learning_rate": 4.767541624883045e-06, + "loss": 0.4702, + "step": 4716 + }, + { + "epoch": 0.5197796143250689, + "grad_norm": 10.4258394241333, + "learning_rate": 4.765795092170815e-06, + "loss": 0.4772, + "step": 4717 + }, + { + "epoch": 0.5198898071625344, + "grad_norm": 12.771700859069824, + "learning_rate": 4.764048588097482e-06, + "loss": 0.4561, + "step": 4718 + }, + { + "epoch": 0.52, + "grad_norm": 7.97894811630249, + "learning_rate": 4.762302112876605e-06, + "loss": 0.484, + "step": 4719 + }, + { + "epoch": 0.5201101928374655, + "grad_norm": 15.211674690246582, + "learning_rate": 4.7605556667217505e-06, + "loss": 0.5272, + "step": 4720 + }, + { + "epoch": 0.5202203856749311, + "grad_norm": 
7.436028480529785, + "learning_rate": 4.758809249846475e-06, + "loss": 0.4014, + "step": 4721 + }, + { + "epoch": 0.5203305785123967, + "grad_norm": 6.159117221832275, + "learning_rate": 4.757062862464328e-06, + "loss": 0.4912, + "step": 4722 + }, + { + "epoch": 0.5204407713498622, + "grad_norm": 6.888415336608887, + "learning_rate": 4.755316504788869e-06, + "loss": 0.381, + "step": 4723 + }, + { + "epoch": 0.5205509641873278, + "grad_norm": 9.91435718536377, + "learning_rate": 4.7535701770336385e-06, + "loss": 0.4835, + "step": 4724 + }, + { + "epoch": 0.5206611570247934, + "grad_norm": 8.647757530212402, + "learning_rate": 4.7518238794121805e-06, + "loss": 0.5128, + "step": 4725 + }, + { + "epoch": 0.5207713498622589, + "grad_norm": 7.840506076812744, + "learning_rate": 4.750077612138037e-06, + "loss": 0.4447, + "step": 4726 + }, + { + "epoch": 0.5208815426997245, + "grad_norm": 8.255101203918457, + "learning_rate": 4.748331375424742e-06, + "loss": 0.4715, + "step": 4727 + }, + { + "epoch": 0.5209917355371901, + "grad_norm": 6.768373489379883, + "learning_rate": 4.746585169485828e-06, + "loss": 0.4443, + "step": 4728 + }, + { + "epoch": 0.5211019283746556, + "grad_norm": 6.331600666046143, + "learning_rate": 4.744838994534824e-06, + "loss": 0.3851, + "step": 4729 + }, + { + "epoch": 0.5212121212121212, + "grad_norm": 6.041339874267578, + "learning_rate": 4.743092850785255e-06, + "loss": 0.4211, + "step": 4730 + }, + { + "epoch": 0.5213223140495867, + "grad_norm": 10.140803337097168, + "learning_rate": 4.74134673845064e-06, + "loss": 0.5106, + "step": 4731 + }, + { + "epoch": 0.5214325068870523, + "grad_norm": 5.4736456871032715, + "learning_rate": 4.7396006577444976e-06, + "loss": 0.4362, + "step": 4732 + }, + { + "epoch": 0.5215426997245179, + "grad_norm": 6.823335647583008, + "learning_rate": 4.73785460888034e-06, + "loss": 0.3611, + "step": 4733 + }, + { + "epoch": 0.5216528925619834, + "grad_norm": 6.919731140136719, + "learning_rate": 4.736108592071678e-06, + "loss": 0.4689, + "step": 4734 + }, + { + "epoch": 0.521763085399449, + "grad_norm": 6.117292404174805, + "learning_rate": 4.734362607532015e-06, + "loss": 0.3818, + "step": 4735 + }, + { + "epoch": 0.5218732782369147, + "grad_norm": 5.677313327789307, + "learning_rate": 4.732616655474853e-06, + "loss": 0.4574, + "step": 4736 + }, + { + "epoch": 0.5219834710743801, + "grad_norm": 14.314566612243652, + "learning_rate": 4.73087073611369e-06, + "loss": 0.3608, + "step": 4737 + }, + { + "epoch": 0.5220936639118458, + "grad_norm": 7.583749771118164, + "learning_rate": 4.72912484966202e-06, + "loss": 0.4606, + "step": 4738 + }, + { + "epoch": 0.5222038567493112, + "grad_norm": 4.8243865966796875, + "learning_rate": 4.72737899633333e-06, + "loss": 0.4029, + "step": 4739 + }, + { + "epoch": 0.5223140495867769, + "grad_norm": 5.845529079437256, + "learning_rate": 4.725633176341107e-06, + "loss": 0.425, + "step": 4740 + }, + { + "epoch": 0.5224242424242425, + "grad_norm": 6.174137115478516, + "learning_rate": 4.723887389898833e-06, + "loss": 0.4219, + "step": 4741 + }, + { + "epoch": 0.522534435261708, + "grad_norm": 8.151043891906738, + "learning_rate": 4.722141637219981e-06, + "loss": 0.4189, + "step": 4742 + }, + { + "epoch": 0.5226446280991736, + "grad_norm": 5.855828762054443, + "learning_rate": 4.72039591851803e-06, + "loss": 0.3719, + "step": 4743 + }, + { + "epoch": 0.5227548209366392, + "grad_norm": 5.095317363739014, + "learning_rate": 4.718650234006446e-06, + "loss": 0.422, + "step": 4744 + }, + { + "epoch": 
0.5228650137741047, + "grad_norm": 5.047654628753662, + "learning_rate": 4.71690458389869e-06, + "loss": 0.3451, + "step": 4745 + }, + { + "epoch": 0.5229752066115703, + "grad_norm": 5.244139671325684, + "learning_rate": 4.71515896840823e-06, + "loss": 0.3126, + "step": 4746 + }, + { + "epoch": 0.5230853994490358, + "grad_norm": 6.747472286224365, + "learning_rate": 4.7134133877485146e-06, + "loss": 0.4296, + "step": 4747 + }, + { + "epoch": 0.5231955922865014, + "grad_norm": 6.123372554779053, + "learning_rate": 4.711667842133003e-06, + "loss": 0.4063, + "step": 4748 + }, + { + "epoch": 0.523305785123967, + "grad_norm": 8.187582015991211, + "learning_rate": 4.709922331775138e-06, + "loss": 0.4062, + "step": 4749 + }, + { + "epoch": 0.5234159779614325, + "grad_norm": 11.04067325592041, + "learning_rate": 4.708176856888362e-06, + "loss": 0.4127, + "step": 4750 + }, + { + "epoch": 0.5235261707988981, + "grad_norm": 7.561423301696777, + "learning_rate": 4.706431417686121e-06, + "loss": 0.4394, + "step": 4751 + }, + { + "epoch": 0.5236363636363637, + "grad_norm": 5.916963577270508, + "learning_rate": 4.704686014381842e-06, + "loss": 0.4553, + "step": 4752 + }, + { + "epoch": 0.5237465564738292, + "grad_norm": 7.415162086486816, + "learning_rate": 4.702940647188958e-06, + "loss": 0.3999, + "step": 4753 + }, + { + "epoch": 0.5238567493112948, + "grad_norm": 3.885070323944092, + "learning_rate": 4.701195316320897e-06, + "loss": 0.3886, + "step": 4754 + }, + { + "epoch": 0.5239669421487604, + "grad_norm": 7.7326459884643555, + "learning_rate": 4.699450021991078e-06, + "loss": 0.4516, + "step": 4755 + }, + { + "epoch": 0.5240771349862259, + "grad_norm": 8.060406684875488, + "learning_rate": 4.697704764412917e-06, + "loss": 0.4573, + "step": 4756 + }, + { + "epoch": 0.5241873278236915, + "grad_norm": 5.327139854431152, + "learning_rate": 4.695959543799829e-06, + "loss": 0.4009, + "step": 4757 + }, + { + "epoch": 0.524297520661157, + "grad_norm": 4.9981513023376465, + "learning_rate": 4.6942143603652214e-06, + "loss": 0.3951, + "step": 4758 + }, + { + "epoch": 0.5244077134986226, + "grad_norm": 6.298994064331055, + "learning_rate": 4.692469214322497e-06, + "loss": 0.4261, + "step": 4759 + }, + { + "epoch": 0.5245179063360882, + "grad_norm": 8.435798645019531, + "learning_rate": 4.690724105885055e-06, + "loss": 0.4001, + "step": 4760 + }, + { + "epoch": 0.5246280991735537, + "grad_norm": 7.095319747924805, + "learning_rate": 4.688979035266288e-06, + "loss": 0.41, + "step": 4761 + }, + { + "epoch": 0.5247382920110193, + "grad_norm": 7.2349700927734375, + "learning_rate": 4.687234002679589e-06, + "loss": 0.461, + "step": 4762 + }, + { + "epoch": 0.5248484848484849, + "grad_norm": 9.026803970336914, + "learning_rate": 4.68548900833834e-06, + "loss": 0.4939, + "step": 4763 + }, + { + "epoch": 0.5249586776859504, + "grad_norm": 5.365535259246826, + "learning_rate": 4.683744052455922e-06, + "loss": 0.4438, + "step": 4764 + }, + { + "epoch": 0.525068870523416, + "grad_norm": 6.975589752197266, + "learning_rate": 4.681999135245712e-06, + "loss": 0.3798, + "step": 4765 + }, + { + "epoch": 0.5251790633608815, + "grad_norm": 11.416638374328613, + "learning_rate": 4.68025425692108e-06, + "loss": 0.4842, + "step": 4766 + }, + { + "epoch": 0.5252892561983471, + "grad_norm": 8.365035057067871, + "learning_rate": 4.678509417695389e-06, + "loss": 0.4478, + "step": 4767 + }, + { + "epoch": 0.5253994490358127, + "grad_norm": 5.908651351928711, + "learning_rate": 4.676764617782006e-06, + "loss": 0.4277, + "step": 4768 + 
}, + { + "epoch": 0.5255096418732782, + "grad_norm": 7.646467208862305, + "learning_rate": 4.675019857394285e-06, + "loss": 0.3445, + "step": 4769 + }, + { + "epoch": 0.5256198347107438, + "grad_norm": 5.816633701324463, + "learning_rate": 4.673275136745574e-06, + "loss": 0.4133, + "step": 4770 + }, + { + "epoch": 0.5257300275482094, + "grad_norm": 6.1945366859436035, + "learning_rate": 4.671530456049225e-06, + "loss": 0.3272, + "step": 4771 + }, + { + "epoch": 0.5258402203856749, + "grad_norm": 5.908159255981445, + "learning_rate": 4.66978581551858e-06, + "loss": 0.4196, + "step": 4772 + }, + { + "epoch": 0.5259504132231405, + "grad_norm": 5.871518611907959, + "learning_rate": 4.6680412153669695e-06, + "loss": 0.3546, + "step": 4773 + }, + { + "epoch": 0.526060606060606, + "grad_norm": 5.87644624710083, + "learning_rate": 4.666296655807735e-06, + "loss": 0.4596, + "step": 4774 + }, + { + "epoch": 0.5261707988980716, + "grad_norm": 6.668318271636963, + "learning_rate": 4.664552137054194e-06, + "loss": 0.3977, + "step": 4775 + }, + { + "epoch": 0.5262809917355372, + "grad_norm": 12.507335662841797, + "learning_rate": 4.662807659319676e-06, + "loss": 0.3817, + "step": 4776 + }, + { + "epoch": 0.5263911845730027, + "grad_norm": 5.290792465209961, + "learning_rate": 4.661063222817497e-06, + "loss": 0.3623, + "step": 4777 + }, + { + "epoch": 0.5265013774104683, + "grad_norm": 14.073418617248535, + "learning_rate": 4.659318827760964e-06, + "loss": 0.4862, + "step": 4778 + }, + { + "epoch": 0.5266115702479339, + "grad_norm": 6.559229850769043, + "learning_rate": 4.65757447436339e-06, + "loss": 0.4044, + "step": 4779 + }, + { + "epoch": 0.5267217630853994, + "grad_norm": 6.175151348114014, + "learning_rate": 4.655830162838074e-06, + "loss": 0.3925, + "step": 4780 + }, + { + "epoch": 0.526831955922865, + "grad_norm": 5.202744483947754, + "learning_rate": 4.654085893398312e-06, + "loss": 0.3941, + "step": 4781 + }, + { + "epoch": 0.5269421487603306, + "grad_norm": 10.19947338104248, + "learning_rate": 4.652341666257398e-06, + "loss": 0.496, + "step": 4782 + }, + { + "epoch": 0.5270523415977961, + "grad_norm": 9.504716873168945, + "learning_rate": 4.650597481628617e-06, + "loss": 0.4463, + "step": 4783 + }, + { + "epoch": 0.5271625344352617, + "grad_norm": 6.426805019378662, + "learning_rate": 4.64885333972525e-06, + "loss": 0.436, + "step": 4784 + }, + { + "epoch": 0.5272727272727272, + "grad_norm": 6.865482330322266, + "learning_rate": 4.647109240760574e-06, + "loss": 0.4941, + "step": 4785 + }, + { + "epoch": 0.5273829201101928, + "grad_norm": 5.157108306884766, + "learning_rate": 4.645365184947861e-06, + "loss": 0.3815, + "step": 4786 + }, + { + "epoch": 0.5274931129476584, + "grad_norm": 5.182690143585205, + "learning_rate": 4.643621172500372e-06, + "loss": 0.3838, + "step": 4787 + }, + { + "epoch": 0.5276033057851239, + "grad_norm": 10.033770561218262, + "learning_rate": 4.6418772036313716e-06, + "loss": 0.4114, + "step": 4788 + }, + { + "epoch": 0.5277134986225895, + "grad_norm": 5.148810863494873, + "learning_rate": 4.6401332785541125e-06, + "loss": 0.3935, + "step": 4789 + }, + { + "epoch": 0.5278236914600551, + "grad_norm": 4.2936482429504395, + "learning_rate": 4.6383893974818464e-06, + "loss": 0.4128, + "step": 4790 + }, + { + "epoch": 0.5279338842975206, + "grad_norm": 7.7514543533325195, + "learning_rate": 4.636645560627815e-06, + "loss": 0.4589, + "step": 4791 + }, + { + "epoch": 0.5280440771349862, + "grad_norm": 6.3132805824279785, + "learning_rate": 4.634901768205257e-06, + "loss": 
0.3303, + "step": 4792 + }, + { + "epoch": 0.5281542699724517, + "grad_norm": 6.861356258392334, + "learning_rate": 4.633158020427408e-06, + "loss": 0.3197, + "step": 4793 + }, + { + "epoch": 0.5282644628099173, + "grad_norm": 8.240154266357422, + "learning_rate": 4.631414317507495e-06, + "loss": 0.3735, + "step": 4794 + }, + { + "epoch": 0.528374655647383, + "grad_norm": 6.859533786773682, + "learning_rate": 4.629670659658739e-06, + "loss": 0.3824, + "step": 4795 + }, + { + "epoch": 0.5284848484848484, + "grad_norm": 11.968518257141113, + "learning_rate": 4.627927047094358e-06, + "loss": 0.4285, + "step": 4796 + }, + { + "epoch": 0.528595041322314, + "grad_norm": 10.671728134155273, + "learning_rate": 4.626183480027564e-06, + "loss": 0.5099, + "step": 4797 + }, + { + "epoch": 0.5287052341597797, + "grad_norm": 8.56689167022705, + "learning_rate": 4.62443995867156e-06, + "loss": 0.3508, + "step": 4798 + }, + { + "epoch": 0.5288154269972452, + "grad_norm": 5.919952392578125, + "learning_rate": 4.622696483239549e-06, + "loss": 0.4478, + "step": 4799 + }, + { + "epoch": 0.5289256198347108, + "grad_norm": 7.455764293670654, + "learning_rate": 4.6209530539447265e-06, + "loss": 0.4184, + "step": 4800 + }, + { + "epoch": 0.5290358126721763, + "grad_norm": 11.0386381149292, + "learning_rate": 4.619209671000276e-06, + "loss": 0.4672, + "step": 4801 + }, + { + "epoch": 0.5291460055096419, + "grad_norm": 7.714338779449463, + "learning_rate": 4.617466334619387e-06, + "loss": 0.3995, + "step": 4802 + }, + { + "epoch": 0.5292561983471075, + "grad_norm": 3.729965925216675, + "learning_rate": 4.6157230450152315e-06, + "loss": 0.3921, + "step": 4803 + }, + { + "epoch": 0.529366391184573, + "grad_norm": 8.325998306274414, + "learning_rate": 4.613979802400986e-06, + "loss": 0.412, + "step": 4804 + }, + { + "epoch": 0.5294765840220386, + "grad_norm": 6.081939697265625, + "learning_rate": 4.612236606989815e-06, + "loss": 0.3978, + "step": 4805 + }, + { + "epoch": 0.5295867768595042, + "grad_norm": 9.263617515563965, + "learning_rate": 4.610493458994876e-06, + "loss": 0.4348, + "step": 4806 + }, + { + "epoch": 0.5296969696969697, + "grad_norm": 8.411460876464844, + "learning_rate": 4.608750358629329e-06, + "loss": 0.4951, + "step": 4807 + }, + { + "epoch": 0.5298071625344353, + "grad_norm": 9.316654205322266, + "learning_rate": 4.607007306106318e-06, + "loss": 0.43, + "step": 4808 + }, + { + "epoch": 0.5299173553719009, + "grad_norm": 6.742702484130859, + "learning_rate": 4.605264301638986e-06, + "loss": 0.4105, + "step": 4809 + }, + { + "epoch": 0.5300275482093664, + "grad_norm": 6.03303861618042, + "learning_rate": 4.603521345440474e-06, + "loss": 0.457, + "step": 4810 + }, + { + "epoch": 0.530137741046832, + "grad_norm": 5.3249735832214355, + "learning_rate": 4.60177843772391e-06, + "loss": 0.384, + "step": 4811 + }, + { + "epoch": 0.5302479338842975, + "grad_norm": 10.849640846252441, + "learning_rate": 4.600035578702418e-06, + "loss": 0.3535, + "step": 4812 + }, + { + "epoch": 0.5303581267217631, + "grad_norm": 5.814062118530273, + "learning_rate": 4.5982927685891196e-06, + "loss": 0.4242, + "step": 4813 + }, + { + "epoch": 0.5304683195592287, + "grad_norm": 4.485311508178711, + "learning_rate": 4.596550007597128e-06, + "loss": 0.4173, + "step": 4814 + }, + { + "epoch": 0.5305785123966942, + "grad_norm": 9.369732856750488, + "learning_rate": 4.594807295939548e-06, + "loss": 0.4045, + "step": 4815 + }, + { + "epoch": 0.5306887052341598, + "grad_norm": 4.321104526519775, + "learning_rate": 
4.593064633829483e-06, + "loss": 0.3466, + "step": 4816 + }, + { + "epoch": 0.5307988980716254, + "grad_norm": 5.65838098526001, + "learning_rate": 4.591322021480027e-06, + "loss": 0.4088, + "step": 4817 + }, + { + "epoch": 0.5309090909090909, + "grad_norm": 7.255067348480225, + "learning_rate": 4.58957945910427e-06, + "loss": 0.3777, + "step": 4818 + }, + { + "epoch": 0.5310192837465565, + "grad_norm": 5.063383102416992, + "learning_rate": 4.587836946915294e-06, + "loss": 0.3763, + "step": 4819 + }, + { + "epoch": 0.531129476584022, + "grad_norm": 8.16020393371582, + "learning_rate": 4.586094485126175e-06, + "loss": 0.5009, + "step": 4820 + }, + { + "epoch": 0.5312396694214876, + "grad_norm": 5.291675567626953, + "learning_rate": 4.584352073949986e-06, + "loss": 0.4175, + "step": 4821 + }, + { + "epoch": 0.5313498622589532, + "grad_norm": 7.2242536544799805, + "learning_rate": 4.58260971359979e-06, + "loss": 0.3968, + "step": 4822 + }, + { + "epoch": 0.5314600550964187, + "grad_norm": 6.445919513702393, + "learning_rate": 4.580867404288644e-06, + "loss": 0.3964, + "step": 4823 + }, + { + "epoch": 0.5315702479338843, + "grad_norm": 8.131845474243164, + "learning_rate": 4.579125146229601e-06, + "loss": 0.4372, + "step": 4824 + }, + { + "epoch": 0.5316804407713499, + "grad_norm": 3.518427610397339, + "learning_rate": 4.577382939635709e-06, + "loss": 0.4025, + "step": 4825 + }, + { + "epoch": 0.5317906336088154, + "grad_norm": 6.3140082359313965, + "learning_rate": 4.575640784720003e-06, + "loss": 0.4022, + "step": 4826 + }, + { + "epoch": 0.531900826446281, + "grad_norm": 5.330982208251953, + "learning_rate": 4.573898681695519e-06, + "loss": 0.3742, + "step": 4827 + }, + { + "epoch": 0.5320110192837466, + "grad_norm": 10.08835220336914, + "learning_rate": 4.572156630775285e-06, + "loss": 0.4913, + "step": 4828 + }, + { + "epoch": 0.5321212121212121, + "grad_norm": 4.119600772857666, + "learning_rate": 4.570414632172315e-06, + "loss": 0.3523, + "step": 4829 + }, + { + "epoch": 0.5322314049586777, + "grad_norm": 6.530597686767578, + "learning_rate": 4.568672686099631e-06, + "loss": 0.3307, + "step": 4830 + }, + { + "epoch": 0.5323415977961432, + "grad_norm": 6.231881618499756, + "learning_rate": 4.566930792770234e-06, + "loss": 0.3645, + "step": 4831 + }, + { + "epoch": 0.5324517906336088, + "grad_norm": 5.70702600479126, + "learning_rate": 4.56518895239713e-06, + "loss": 0.4019, + "step": 4832 + }, + { + "epoch": 0.5325619834710744, + "grad_norm": 9.638153076171875, + "learning_rate": 4.5634471651933125e-06, + "loss": 0.4779, + "step": 4833 + }, + { + "epoch": 0.5326721763085399, + "grad_norm": 7.994751930236816, + "learning_rate": 4.561705431371766e-06, + "loss": 0.3918, + "step": 4834 + }, + { + "epoch": 0.5327823691460055, + "grad_norm": 9.847349166870117, + "learning_rate": 4.559963751145477e-06, + "loss": 0.4104, + "step": 4835 + }, + { + "epoch": 0.5328925619834711, + "grad_norm": 6.911519527435303, + "learning_rate": 4.5582221247274175e-06, + "loss": 0.4688, + "step": 4836 + }, + { + "epoch": 0.5330027548209366, + "grad_norm": 6.8758463859558105, + "learning_rate": 4.556480552330555e-06, + "loss": 0.3543, + "step": 4837 + }, + { + "epoch": 0.5331129476584022, + "grad_norm": 6.75565767288208, + "learning_rate": 4.554739034167855e-06, + "loss": 0.4163, + "step": 4838 + }, + { + "epoch": 0.5332231404958677, + "grad_norm": 6.917755126953125, + "learning_rate": 4.552997570452271e-06, + "loss": 0.4408, + "step": 4839 + }, + { + "epoch": 0.5333333333333333, + "grad_norm": 5.163933277130127, + 
"learning_rate": 4.551256161396749e-06, + "loss": 0.4325, + "step": 4840 + }, + { + "epoch": 0.5334435261707989, + "grad_norm": 9.991934776306152, + "learning_rate": 4.549514807214235e-06, + "loss": 0.4224, + "step": 4841 + }, + { + "epoch": 0.5335537190082644, + "grad_norm": 5.259696006774902, + "learning_rate": 4.547773508117663e-06, + "loss": 0.379, + "step": 4842 + }, + { + "epoch": 0.53366391184573, + "grad_norm": 4.730058193206787, + "learning_rate": 4.5460322643199586e-06, + "loss": 0.3877, + "step": 4843 + }, + { + "epoch": 0.5337741046831956, + "grad_norm": 7.9191412925720215, + "learning_rate": 4.5442910760340466e-06, + "loss": 0.424, + "step": 4844 + }, + { + "epoch": 0.5338842975206611, + "grad_norm": 5.3816704750061035, + "learning_rate": 4.542549943472841e-06, + "loss": 0.4266, + "step": 4845 + }, + { + "epoch": 0.5339944903581267, + "grad_norm": 5.254730224609375, + "learning_rate": 4.540808866849249e-06, + "loss": 0.3668, + "step": 4846 + }, + { + "epoch": 0.5341046831955922, + "grad_norm": 13.787521362304688, + "learning_rate": 4.539067846376173e-06, + "loss": 0.5494, + "step": 4847 + }, + { + "epoch": 0.5342148760330578, + "grad_norm": 8.365334510803223, + "learning_rate": 4.537326882266506e-06, + "loss": 0.3826, + "step": 4848 + }, + { + "epoch": 0.5343250688705234, + "grad_norm": 7.230132102966309, + "learning_rate": 4.535585974733138e-06, + "loss": 0.3738, + "step": 4849 + }, + { + "epoch": 0.5344352617079889, + "grad_norm": 6.887158393859863, + "learning_rate": 4.5338451239889465e-06, + "loss": 0.4763, + "step": 4850 + }, + { + "epoch": 0.5345454545454545, + "grad_norm": 8.59938907623291, + "learning_rate": 4.532104330246807e-06, + "loss": 0.4696, + "step": 4851 + }, + { + "epoch": 0.5346556473829202, + "grad_norm": 7.57608699798584, + "learning_rate": 4.530363593719585e-06, + "loss": 0.4282, + "step": 4852 + }, + { + "epoch": 0.5347658402203856, + "grad_norm": 6.073071479797363, + "learning_rate": 4.5286229146201425e-06, + "loss": 0.4261, + "step": 4853 + }, + { + "epoch": 0.5348760330578513, + "grad_norm": 5.064709186553955, + "learning_rate": 4.526882293161328e-06, + "loss": 0.4004, + "step": 4854 + }, + { + "epoch": 0.5349862258953169, + "grad_norm": 4.467386245727539, + "learning_rate": 4.525141729555991e-06, + "loss": 0.3556, + "step": 4855 + }, + { + "epoch": 0.5350964187327824, + "grad_norm": 6.954026699066162, + "learning_rate": 4.5234012240169686e-06, + "loss": 0.4263, + "step": 4856 + }, + { + "epoch": 0.535206611570248, + "grad_norm": 8.104146957397461, + "learning_rate": 4.521660776757089e-06, + "loss": 0.3759, + "step": 4857 + }, + { + "epoch": 0.5353168044077135, + "grad_norm": 6.06421422958374, + "learning_rate": 4.519920387989182e-06, + "loss": 0.3673, + "step": 4858 + }, + { + "epoch": 0.5354269972451791, + "grad_norm": 6.338195323944092, + "learning_rate": 4.518180057926061e-06, + "loss": 0.4627, + "step": 4859 + }, + { + "epoch": 0.5355371900826447, + "grad_norm": 7.764068603515625, + "learning_rate": 4.516439786780538e-06, + "loss": 0.3891, + "step": 4860 + }, + { + "epoch": 0.5356473829201102, + "grad_norm": 6.029236793518066, + "learning_rate": 4.514699574765415e-06, + "loss": 0.4353, + "step": 4861 + }, + { + "epoch": 0.5357575757575758, + "grad_norm": 6.852285385131836, + "learning_rate": 4.5129594220934856e-06, + "loss": 0.379, + "step": 4862 + }, + { + "epoch": 0.5358677685950414, + "grad_norm": 6.268723964691162, + "learning_rate": 4.511219328977541e-06, + "loss": 0.3621, + "step": 4863 + }, + { + "epoch": 0.5359779614325069, + "grad_norm": 
10.014451026916504, + "learning_rate": 4.509479295630362e-06, + "loss": 0.438, + "step": 4864 + }, + { + "epoch": 0.5360881542699725, + "grad_norm": 11.356203079223633, + "learning_rate": 4.507739322264717e-06, + "loss": 0.4089, + "step": 4865 + }, + { + "epoch": 0.536198347107438, + "grad_norm": 10.65330982208252, + "learning_rate": 4.50599940909338e-06, + "loss": 0.4433, + "step": 4866 + }, + { + "epoch": 0.5363085399449036, + "grad_norm": 9.675298690795898, + "learning_rate": 4.504259556329105e-06, + "loss": 0.4201, + "step": 4867 + }, + { + "epoch": 0.5364187327823692, + "grad_norm": 8.174824714660645, + "learning_rate": 4.5025197641846445e-06, + "loss": 0.4172, + "step": 4868 + }, + { + "epoch": 0.5365289256198347, + "grad_norm": 13.778491973876953, + "learning_rate": 4.5007800328727435e-06, + "loss": 0.5102, + "step": 4869 + }, + { + "epoch": 0.5366391184573003, + "grad_norm": 7.946699619293213, + "learning_rate": 4.499040362606139e-06, + "loss": 0.41, + "step": 4870 + }, + { + "epoch": 0.5367493112947659, + "grad_norm": 4.874997615814209, + "learning_rate": 4.497300753597557e-06, + "loss": 0.425, + "step": 4871 + }, + { + "epoch": 0.5368595041322314, + "grad_norm": 8.760504722595215, + "learning_rate": 4.495561206059723e-06, + "loss": 0.4714, + "step": 4872 + }, + { + "epoch": 0.536969696969697, + "grad_norm": 5.017094612121582, + "learning_rate": 4.49382172020535e-06, + "loss": 0.4442, + "step": 4873 + }, + { + "epoch": 0.5370798898071625, + "grad_norm": 5.726899147033691, + "learning_rate": 4.492082296247145e-06, + "loss": 0.4706, + "step": 4874 + }, + { + "epoch": 0.5371900826446281, + "grad_norm": 5.397989273071289, + "learning_rate": 4.490342934397807e-06, + "loss": 0.3746, + "step": 4875 + }, + { + "epoch": 0.5373002754820937, + "grad_norm": 4.507936477661133, + "learning_rate": 4.488603634870026e-06, + "loss": 0.3411, + "step": 4876 + }, + { + "epoch": 0.5374104683195592, + "grad_norm": 5.176347255706787, + "learning_rate": 4.486864397876488e-06, + "loss": 0.4165, + "step": 4877 + }, + { + "epoch": 0.5375206611570248, + "grad_norm": 12.070116996765137, + "learning_rate": 4.485125223629868e-06, + "loss": 0.5233, + "step": 4878 + }, + { + "epoch": 0.5376308539944904, + "grad_norm": 11.164689064025879, + "learning_rate": 4.4833861123428355e-06, + "loss": 0.4604, + "step": 4879 + }, + { + "epoch": 0.5377410468319559, + "grad_norm": 7.950995445251465, + "learning_rate": 4.481647064228051e-06, + "loss": 0.402, + "step": 4880 + }, + { + "epoch": 0.5378512396694215, + "grad_norm": 8.600005149841309, + "learning_rate": 4.479908079498168e-06, + "loss": 0.4309, + "step": 4881 + }, + { + "epoch": 0.5379614325068871, + "grad_norm": 6.681096076965332, + "learning_rate": 4.47816915836583e-06, + "loss": 0.4075, + "step": 4882 + }, + { + "epoch": 0.5380716253443526, + "grad_norm": 6.604003429412842, + "learning_rate": 4.476430301043678e-06, + "loss": 0.4222, + "step": 4883 + }, + { + "epoch": 0.5381818181818182, + "grad_norm": 5.344298362731934, + "learning_rate": 4.474691507744339e-06, + "loss": 0.4041, + "step": 4884 + }, + { + "epoch": 0.5382920110192837, + "grad_norm": 7.361419200897217, + "learning_rate": 4.472952778680436e-06, + "loss": 0.427, + "step": 4885 + }, + { + "epoch": 0.5384022038567493, + "grad_norm": 5.2496514320373535, + "learning_rate": 4.4712141140645835e-06, + "loss": 0.4253, + "step": 4886 + }, + { + "epoch": 0.5385123966942149, + "grad_norm": 6.846745491027832, + "learning_rate": 4.469475514109387e-06, + "loss": 0.3627, + "step": 4887 + }, + { + "epoch": 
0.5386225895316804, + "grad_norm": 5.707091808319092, + "learning_rate": 4.467736979027445e-06, + "loss": 0.4314, + "step": 4888 + }, + { + "epoch": 0.538732782369146, + "grad_norm": 5.808233737945557, + "learning_rate": 4.46599850903135e-06, + "loss": 0.4623, + "step": 4889 + }, + { + "epoch": 0.5388429752066116, + "grad_norm": 5.639697551727295, + "learning_rate": 4.46426010433368e-06, + "loss": 0.3758, + "step": 4890 + }, + { + "epoch": 0.5389531680440771, + "grad_norm": 7.901959419250488, + "learning_rate": 4.462521765147014e-06, + "loss": 0.341, + "step": 4891 + }, + { + "epoch": 0.5390633608815427, + "grad_norm": 5.190210819244385, + "learning_rate": 4.460783491683917e-06, + "loss": 0.4943, + "step": 4892 + }, + { + "epoch": 0.5391735537190082, + "grad_norm": 5.680449485778809, + "learning_rate": 4.4590452841569446e-06, + "loss": 0.383, + "step": 4893 + }, + { + "epoch": 0.5392837465564738, + "grad_norm": 6.7000250816345215, + "learning_rate": 4.457307142778653e-06, + "loss": 0.396, + "step": 4894 + }, + { + "epoch": 0.5393939393939394, + "grad_norm": 5.925076961517334, + "learning_rate": 4.45556906776158e-06, + "loss": 0.4445, + "step": 4895 + }, + { + "epoch": 0.5395041322314049, + "grad_norm": 8.168147087097168, + "learning_rate": 4.453831059318259e-06, + "loss": 0.327, + "step": 4896 + }, + { + "epoch": 0.5396143250688705, + "grad_norm": 5.0076165199279785, + "learning_rate": 4.452093117661221e-06, + "loss": 0.402, + "step": 4897 + }, + { + "epoch": 0.5397245179063361, + "grad_norm": 6.923157215118408, + "learning_rate": 4.450355243002979e-06, + "loss": 0.4016, + "step": 4898 + }, + { + "epoch": 0.5398347107438016, + "grad_norm": 4.7264838218688965, + "learning_rate": 4.448617435556044e-06, + "loss": 0.3793, + "step": 4899 + }, + { + "epoch": 0.5399449035812672, + "grad_norm": 5.012410640716553, + "learning_rate": 4.446879695532919e-06, + "loss": 0.3114, + "step": 4900 + }, + { + "epoch": 0.5400550964187327, + "grad_norm": 9.185056686401367, + "learning_rate": 4.445142023146095e-06, + "loss": 0.4989, + "step": 4901 + }, + { + "epoch": 0.5401652892561983, + "grad_norm": 7.158074855804443, + "learning_rate": 4.44340441860806e-06, + "loss": 0.4307, + "step": 4902 + }, + { + "epoch": 0.5402754820936639, + "grad_norm": 5.751270771026611, + "learning_rate": 4.441666882131288e-06, + "loss": 0.3339, + "step": 4903 + }, + { + "epoch": 0.5403856749311294, + "grad_norm": 4.243267059326172, + "learning_rate": 4.439929413928247e-06, + "loss": 0.401, + "step": 4904 + }, + { + "epoch": 0.540495867768595, + "grad_norm": 5.914028167724609, + "learning_rate": 4.438192014211398e-06, + "loss": 0.3814, + "step": 4905 + }, + { + "epoch": 0.5406060606060606, + "grad_norm": 6.544396877288818, + "learning_rate": 4.4364546831931945e-06, + "loss": 0.3849, + "step": 4906 + }, + { + "epoch": 0.5407162534435261, + "grad_norm": 6.030860424041748, + "learning_rate": 4.434717421086076e-06, + "loss": 0.4617, + "step": 4907 + }, + { + "epoch": 0.5408264462809917, + "grad_norm": 5.7662248611450195, + "learning_rate": 4.43298022810248e-06, + "loss": 0.425, + "step": 4908 + }, + { + "epoch": 0.5409366391184574, + "grad_norm": 4.316056251525879, + "learning_rate": 4.431243104454833e-06, + "loss": 0.3816, + "step": 4909 + }, + { + "epoch": 0.5410468319559228, + "grad_norm": 10.407288551330566, + "learning_rate": 4.42950605035555e-06, + "loss": 0.4001, + "step": 4910 + }, + { + "epoch": 0.5411570247933885, + "grad_norm": 8.184249877929688, + "learning_rate": 4.427769066017043e-06, + "loss": 0.4545, + "step": 4911 + }, + 
{ + "epoch": 0.541267217630854, + "grad_norm": 7.201406478881836, + "learning_rate": 4.426032151651712e-06, + "loss": 0.5168, + "step": 4912 + }, + { + "epoch": 0.5413774104683196, + "grad_norm": 6.947605133056641, + "learning_rate": 4.424295307471948e-06, + "loss": 0.413, + "step": 4913 + }, + { + "epoch": 0.5414876033057852, + "grad_norm": 5.5525102615356445, + "learning_rate": 4.422558533690136e-06, + "loss": 0.392, + "step": 4914 + }, + { + "epoch": 0.5415977961432507, + "grad_norm": 7.134677410125732, + "learning_rate": 4.420821830518652e-06, + "loss": 0.4661, + "step": 4915 + }, + { + "epoch": 0.5417079889807163, + "grad_norm": 6.866796016693115, + "learning_rate": 4.419085198169861e-06, + "loss": 0.4207, + "step": 4916 + }, + { + "epoch": 0.5418181818181819, + "grad_norm": 6.162034511566162, + "learning_rate": 4.417348636856121e-06, + "loss": 0.4213, + "step": 4917 + }, + { + "epoch": 0.5419283746556474, + "grad_norm": 11.50009536743164, + "learning_rate": 4.415612146789781e-06, + "loss": 0.4765, + "step": 4918 + }, + { + "epoch": 0.542038567493113, + "grad_norm": 6.429296016693115, + "learning_rate": 4.413875728183181e-06, + "loss": 0.4699, + "step": 4919 + }, + { + "epoch": 0.5421487603305785, + "grad_norm": 8.378180503845215, + "learning_rate": 4.412139381248655e-06, + "loss": 0.5049, + "step": 4920 + }, + { + "epoch": 0.5422589531680441, + "grad_norm": 5.784651279449463, + "learning_rate": 4.410403106198521e-06, + "loss": 0.3182, + "step": 4921 + }, + { + "epoch": 0.5423691460055097, + "grad_norm": 8.84086799621582, + "learning_rate": 4.408666903245098e-06, + "loss": 0.4641, + "step": 4922 + }, + { + "epoch": 0.5424793388429752, + "grad_norm": 7.057933807373047, + "learning_rate": 4.406930772600691e-06, + "loss": 0.4934, + "step": 4923 + }, + { + "epoch": 0.5425895316804408, + "grad_norm": 6.431195259094238, + "learning_rate": 4.40519471447759e-06, + "loss": 0.3925, + "step": 4924 + }, + { + "epoch": 0.5426997245179064, + "grad_norm": 6.534265041351318, + "learning_rate": 4.403458729088092e-06, + "loss": 0.3092, + "step": 4925 + }, + { + "epoch": 0.5428099173553719, + "grad_norm": 7.719228744506836, + "learning_rate": 4.4017228166444696e-06, + "loss": 0.3748, + "step": 4926 + }, + { + "epoch": 0.5429201101928375, + "grad_norm": 8.462837219238281, + "learning_rate": 4.399986977358992e-06, + "loss": 0.4354, + "step": 4927 + }, + { + "epoch": 0.5430303030303031, + "grad_norm": 5.98094367980957, + "learning_rate": 4.398251211443923e-06, + "loss": 0.3639, + "step": 4928 + }, + { + "epoch": 0.5431404958677686, + "grad_norm": 6.449747085571289, + "learning_rate": 4.396515519111512e-06, + "loss": 0.471, + "step": 4929 + }, + { + "epoch": 0.5432506887052342, + "grad_norm": 4.866857051849365, + "learning_rate": 4.3947799005740065e-06, + "loss": 0.3987, + "step": 4930 + }, + { + "epoch": 0.5433608815426997, + "grad_norm": 5.388254642486572, + "learning_rate": 4.3930443560436346e-06, + "loss": 0.4468, + "step": 4931 + }, + { + "epoch": 0.5434710743801653, + "grad_norm": 5.047433376312256, + "learning_rate": 4.391308885732622e-06, + "loss": 0.406, + "step": 4932 + }, + { + "epoch": 0.5435812672176309, + "grad_norm": 5.697027683258057, + "learning_rate": 4.3895734898531885e-06, + "loss": 0.4061, + "step": 4933 + }, + { + "epoch": 0.5436914600550964, + "grad_norm": 6.802773475646973, + "learning_rate": 4.387838168617536e-06, + "loss": 0.3757, + "step": 4934 + }, + { + "epoch": 0.543801652892562, + "grad_norm": 5.0467705726623535, + "learning_rate": 4.386102922237864e-06, + "loss": 0.4016, + 
"step": 4935 + }, + { + "epoch": 0.5439118457300276, + "grad_norm": 4.891822814941406, + "learning_rate": 4.384367750926362e-06, + "loss": 0.3805, + "step": 4936 + }, + { + "epoch": 0.5440220385674931, + "grad_norm": 6.10658597946167, + "learning_rate": 4.382632654895206e-06, + "loss": 0.4592, + "step": 4937 + }, + { + "epoch": 0.5441322314049587, + "grad_norm": 13.864315032958984, + "learning_rate": 4.380897634356567e-06, + "loss": 0.4804, + "step": 4938 + }, + { + "epoch": 0.5442424242424242, + "grad_norm": 5.945678234100342, + "learning_rate": 4.379162689522608e-06, + "loss": 0.3606, + "step": 4939 + }, + { + "epoch": 0.5443526170798898, + "grad_norm": 8.377779960632324, + "learning_rate": 4.377427820605479e-06, + "loss": 0.3898, + "step": 4940 + }, + { + "epoch": 0.5444628099173554, + "grad_norm": 6.678970813751221, + "learning_rate": 4.3756930278173196e-06, + "loss": 0.4064, + "step": 4941 + }, + { + "epoch": 0.5445730027548209, + "grad_norm": 4.300466060638428, + "learning_rate": 4.373958311370266e-06, + "loss": 0.4041, + "step": 4942 + }, + { + "epoch": 0.5446831955922865, + "grad_norm": 6.053644180297852, + "learning_rate": 4.37222367147644e-06, + "loss": 0.3838, + "step": 4943 + }, + { + "epoch": 0.5447933884297521, + "grad_norm": 5.004225730895996, + "learning_rate": 4.3704891083479575e-06, + "loss": 0.4211, + "step": 4944 + }, + { + "epoch": 0.5449035812672176, + "grad_norm": 8.41515064239502, + "learning_rate": 4.368754622196921e-06, + "loss": 0.416, + "step": 4945 + }, + { + "epoch": 0.5450137741046832, + "grad_norm": 7.956199645996094, + "learning_rate": 4.367020213235426e-06, + "loss": 0.4295, + "step": 4946 + }, + { + "epoch": 0.5451239669421487, + "grad_norm": 5.374857425689697, + "learning_rate": 4.365285881675561e-06, + "loss": 0.4256, + "step": 4947 + }, + { + "epoch": 0.5452341597796143, + "grad_norm": 7.09722900390625, + "learning_rate": 4.3635516277294e-06, + "loss": 0.3855, + "step": 4948 + }, + { + "epoch": 0.5453443526170799, + "grad_norm": 4.6613030433654785, + "learning_rate": 4.361817451609008e-06, + "loss": 0.3554, + "step": 4949 + }, + { + "epoch": 0.5454545454545454, + "grad_norm": 4.342301368713379, + "learning_rate": 4.360083353526447e-06, + "loss": 0.4192, + "step": 4950 + }, + { + "epoch": 0.545564738292011, + "grad_norm": 5.399786949157715, + "learning_rate": 4.358349333693765e-06, + "loss": 0.4142, + "step": 4951 + }, + { + "epoch": 0.5456749311294766, + "grad_norm": 8.767022132873535, + "learning_rate": 4.356615392322994e-06, + "loss": 0.411, + "step": 4952 + }, + { + "epoch": 0.5457851239669421, + "grad_norm": 9.512101173400879, + "learning_rate": 4.35488152962617e-06, + "loss": 0.4414, + "step": 4953 + }, + { + "epoch": 0.5458953168044077, + "grad_norm": 8.008020401000977, + "learning_rate": 4.353147745815308e-06, + "loss": 0.3263, + "step": 4954 + }, + { + "epoch": 0.5460055096418733, + "grad_norm": 6.651403903961182, + "learning_rate": 4.3514140411024156e-06, + "loss": 0.3493, + "step": 4955 + }, + { + "epoch": 0.5461157024793388, + "grad_norm": 6.7860331535339355, + "learning_rate": 4.349680415699499e-06, + "loss": 0.402, + "step": 4956 + }, + { + "epoch": 0.5462258953168044, + "grad_norm": 5.144118785858154, + "learning_rate": 4.347946869818541e-06, + "loss": 0.411, + "step": 4957 + }, + { + "epoch": 0.5463360881542699, + "grad_norm": 12.622493743896484, + "learning_rate": 4.346213403671529e-06, + "loss": 0.501, + "step": 4958 + }, + { + "epoch": 0.5464462809917355, + "grad_norm": 8.2738618850708, + "learning_rate": 4.344480017470429e-06, + 
"loss": 0.3893, + "step": 4959 + }, + { + "epoch": 0.5465564738292011, + "grad_norm": 6.189448356628418, + "learning_rate": 4.342746711427202e-06, + "loss": 0.388, + "step": 4960 + }, + { + "epoch": 0.5466666666666666, + "grad_norm": 6.918076515197754, + "learning_rate": 4.3410134857538e-06, + "loss": 0.4298, + "step": 4961 + }, + { + "epoch": 0.5467768595041322, + "grad_norm": 5.94843053817749, + "learning_rate": 4.339280340662165e-06, + "loss": 0.3492, + "step": 4962 + }, + { + "epoch": 0.5468870523415978, + "grad_norm": 12.531344413757324, + "learning_rate": 4.337547276364225e-06, + "loss": 0.5248, + "step": 4963 + }, + { + "epoch": 0.5469972451790633, + "grad_norm": 5.890338897705078, + "learning_rate": 4.335814293071907e-06, + "loss": 0.3161, + "step": 4964 + }, + { + "epoch": 0.547107438016529, + "grad_norm": 5.140368938446045, + "learning_rate": 4.334081390997119e-06, + "loss": 0.4451, + "step": 4965 + }, + { + "epoch": 0.5472176308539944, + "grad_norm": 5.555690288543701, + "learning_rate": 4.332348570351761e-06, + "loss": 0.4427, + "step": 4966 + }, + { + "epoch": 0.54732782369146, + "grad_norm": 6.685448169708252, + "learning_rate": 4.330615831347729e-06, + "loss": 0.3855, + "step": 4967 + }, + { + "epoch": 0.5474380165289257, + "grad_norm": 5.776802062988281, + "learning_rate": 4.328883174196901e-06, + "loss": 0.3634, + "step": 4968 + }, + { + "epoch": 0.5475482093663911, + "grad_norm": 6.580652713775635, + "learning_rate": 4.32715059911115e-06, + "loss": 0.4399, + "step": 4969 + }, + { + "epoch": 0.5476584022038568, + "grad_norm": 5.1589813232421875, + "learning_rate": 4.325418106302339e-06, + "loss": 0.3577, + "step": 4970 + }, + { + "epoch": 0.5477685950413224, + "grad_norm": 5.44032621383667, + "learning_rate": 4.323685695982316e-06, + "loss": 0.4031, + "step": 4971 + }, + { + "epoch": 0.5478787878787879, + "grad_norm": 8.49659252166748, + "learning_rate": 4.321953368362927e-06, + "loss": 0.4534, + "step": 4972 + }, + { + "epoch": 0.5479889807162535, + "grad_norm": 7.460810661315918, + "learning_rate": 4.320221123656001e-06, + "loss": 0.4431, + "step": 4973 + }, + { + "epoch": 0.548099173553719, + "grad_norm": 9.8892183303833, + "learning_rate": 4.318488962073358e-06, + "loss": 0.4066, + "step": 4974 + }, + { + "epoch": 0.5482093663911846, + "grad_norm": 6.912549018859863, + "learning_rate": 4.316756883826811e-06, + "loss": 0.4044, + "step": 4975 + }, + { + "epoch": 0.5483195592286502, + "grad_norm": 4.6018757820129395, + "learning_rate": 4.315024889128161e-06, + "loss": 0.3947, + "step": 4976 + }, + { + "epoch": 0.5484297520661157, + "grad_norm": 4.4889726638793945, + "learning_rate": 4.313292978189197e-06, + "loss": 0.4163, + "step": 4977 + }, + { + "epoch": 0.5485399449035813, + "grad_norm": 5.36277437210083, + "learning_rate": 4.311561151221702e-06, + "loss": 0.416, + "step": 4978 + }, + { + "epoch": 0.5486501377410469, + "grad_norm": 6.793168067932129, + "learning_rate": 4.309829408437446e-06, + "loss": 0.3331, + "step": 4979 + }, + { + "epoch": 0.5487603305785124, + "grad_norm": 6.599664688110352, + "learning_rate": 4.308097750048183e-06, + "loss": 0.3934, + "step": 4980 + }, + { + "epoch": 0.548870523415978, + "grad_norm": 7.5874714851379395, + "learning_rate": 4.30636617626567e-06, + "loss": 0.4842, + "step": 4981 + }, + { + "epoch": 0.5489807162534436, + "grad_norm": 4.51835298538208, + "learning_rate": 4.304634687301642e-06, + "loss": 0.3492, + "step": 4982 + }, + { + "epoch": 0.5490909090909091, + "grad_norm": 5.6787519454956055, + "learning_rate": 
4.302903283367828e-06, + "loss": 0.4191, + "step": 4983 + }, + { + "epoch": 0.5492011019283747, + "grad_norm": 7.3988542556762695, + "learning_rate": 4.30117196467595e-06, + "loss": 0.3969, + "step": 4984 + }, + { + "epoch": 0.5493112947658402, + "grad_norm": 7.988712787628174, + "learning_rate": 4.2994407314377105e-06, + "loss": 0.5196, + "step": 4985 + }, + { + "epoch": 0.5494214876033058, + "grad_norm": 4.772488594055176, + "learning_rate": 4.297709583864813e-06, + "loss": 0.3467, + "step": 4986 + }, + { + "epoch": 0.5495316804407714, + "grad_norm": 4.936898231506348, + "learning_rate": 4.2959785221689406e-06, + "loss": 0.3985, + "step": 4987 + }, + { + "epoch": 0.5496418732782369, + "grad_norm": 8.552613258361816, + "learning_rate": 4.294247546561768e-06, + "loss": 0.3919, + "step": 4988 + }, + { + "epoch": 0.5497520661157025, + "grad_norm": 4.918736457824707, + "learning_rate": 4.2925166572549685e-06, + "loss": 0.3695, + "step": 4989 + }, + { + "epoch": 0.5498622589531681, + "grad_norm": 13.8289213180542, + "learning_rate": 4.290785854460191e-06, + "loss": 0.4322, + "step": 4990 + }, + { + "epoch": 0.5499724517906336, + "grad_norm": 7.58026647567749, + "learning_rate": 4.289055138389082e-06, + "loss": 0.3489, + "step": 4991 + }, + { + "epoch": 0.5500826446280992, + "grad_norm": 9.824568748474121, + "learning_rate": 4.2873245092532776e-06, + "loss": 0.435, + "step": 4992 + }, + { + "epoch": 0.5501928374655647, + "grad_norm": 7.381153583526611, + "learning_rate": 4.2855939672644e-06, + "loss": 0.4866, + "step": 4993 + }, + { + "epoch": 0.5503030303030303, + "grad_norm": 12.453740119934082, + "learning_rate": 4.283863512634061e-06, + "loss": 0.4905, + "step": 4994 + }, + { + "epoch": 0.5504132231404959, + "grad_norm": 6.226518154144287, + "learning_rate": 4.282133145573867e-06, + "loss": 0.39, + "step": 4995 + }, + { + "epoch": 0.5505234159779614, + "grad_norm": 6.088111400604248, + "learning_rate": 4.280402866295406e-06, + "loss": 0.3693, + "step": 4996 + }, + { + "epoch": 0.550633608815427, + "grad_norm": 6.662594318389893, + "learning_rate": 4.2786726750102596e-06, + "loss": 0.3761, + "step": 4997 + }, + { + "epoch": 0.5507438016528926, + "grad_norm": 4.674248695373535, + "learning_rate": 4.276942571929998e-06, + "loss": 0.3635, + "step": 4998 + }, + { + "epoch": 0.5508539944903581, + "grad_norm": 11.55190658569336, + "learning_rate": 4.275212557266181e-06, + "loss": 0.4362, + "step": 4999 + }, + { + "epoch": 0.5509641873278237, + "grad_norm": 4.43834924697876, + "learning_rate": 4.273482631230358e-06, + "loss": 0.3354, + "step": 5000 + }, + { + "epoch": 0.5510743801652892, + "grad_norm": 9.896891593933105, + "learning_rate": 4.271752794034065e-06, + "loss": 0.4346, + "step": 5001 + }, + { + "epoch": 0.5511845730027548, + "grad_norm": 5.692206382751465, + "learning_rate": 4.270023045888829e-06, + "loss": 0.3872, + "step": 5002 + }, + { + "epoch": 0.5512947658402204, + "grad_norm": 4.49644660949707, + "learning_rate": 4.268293387006168e-06, + "loss": 0.4012, + "step": 5003 + }, + { + "epoch": 0.5514049586776859, + "grad_norm": 6.0624518394470215, + "learning_rate": 4.2665638175975854e-06, + "loss": 0.4274, + "step": 5004 + }, + { + "epoch": 0.5515151515151515, + "grad_norm": 5.555050373077393, + "learning_rate": 4.264834337874574e-06, + "loss": 0.389, + "step": 5005 + }, + { + "epoch": 0.5516253443526171, + "grad_norm": 7.829788684844971, + "learning_rate": 4.263104948048619e-06, + "loss": 0.3417, + "step": 5006 + }, + { + "epoch": 0.5517355371900826, + "grad_norm": 5.923528671264648, 
+ "learning_rate": 4.261375648331194e-06, + "loss": 0.3599, + "step": 5007 + }, + { + "epoch": 0.5518457300275482, + "grad_norm": 9.991193771362305, + "learning_rate": 4.259646438933755e-06, + "loss": 0.4564, + "step": 5008 + }, + { + "epoch": 0.5519559228650138, + "grad_norm": 4.449314594268799, + "learning_rate": 4.257917320067756e-06, + "loss": 0.3436, + "step": 5009 + }, + { + "epoch": 0.5520661157024793, + "grad_norm": 5.419429779052734, + "learning_rate": 4.256188291944637e-06, + "loss": 0.4088, + "step": 5010 + }, + { + "epoch": 0.5521763085399449, + "grad_norm": 7.611926555633545, + "learning_rate": 4.2544593547758214e-06, + "loss": 0.3403, + "step": 5011 + }, + { + "epoch": 0.5522865013774104, + "grad_norm": 7.113373279571533, + "learning_rate": 4.252730508772731e-06, + "loss": 0.3637, + "step": 5012 + }, + { + "epoch": 0.552396694214876, + "grad_norm": 5.663134574890137, + "learning_rate": 4.251001754146766e-06, + "loss": 0.4462, + "step": 5013 + }, + { + "epoch": 0.5525068870523416, + "grad_norm": 6.283078193664551, + "learning_rate": 4.249273091109327e-06, + "loss": 0.3731, + "step": 5014 + }, + { + "epoch": 0.5526170798898071, + "grad_norm": 7.763297080993652, + "learning_rate": 4.247544519871793e-06, + "loss": 0.4351, + "step": 5015 + }, + { + "epoch": 0.5527272727272727, + "grad_norm": 4.79278564453125, + "learning_rate": 4.2458160406455355e-06, + "loss": 0.3428, + "step": 5016 + }, + { + "epoch": 0.5528374655647383, + "grad_norm": 8.19612979888916, + "learning_rate": 4.24408765364192e-06, + "loss": 0.366, + "step": 5017 + }, + { + "epoch": 0.5529476584022038, + "grad_norm": 8.124463081359863, + "learning_rate": 4.2423593590722925e-06, + "loss": 0.3977, + "step": 5018 + }, + { + "epoch": 0.5530578512396694, + "grad_norm": 7.299890041351318, + "learning_rate": 4.240631157147989e-06, + "loss": 0.3687, + "step": 5019 + }, + { + "epoch": 0.5531680440771349, + "grad_norm": 6.376744747161865, + "learning_rate": 4.238903048080342e-06, + "loss": 0.3768, + "step": 5020 + }, + { + "epoch": 0.5532782369146005, + "grad_norm": 10.836827278137207, + "learning_rate": 4.237175032080664e-06, + "loss": 0.4017, + "step": 5021 + }, + { + "epoch": 0.5533884297520661, + "grad_norm": 5.046148777008057, + "learning_rate": 4.235447109360257e-06, + "loss": 0.3483, + "step": 5022 + }, + { + "epoch": 0.5534986225895316, + "grad_norm": 5.973788261413574, + "learning_rate": 4.233719280130418e-06, + "loss": 0.4942, + "step": 5023 + }, + { + "epoch": 0.5536088154269972, + "grad_norm": 9.961180686950684, + "learning_rate": 4.231991544602426e-06, + "loss": 0.3557, + "step": 5024 + }, + { + "epoch": 0.5537190082644629, + "grad_norm": 9.15896224975586, + "learning_rate": 4.23026390298755e-06, + "loss": 0.3715, + "step": 5025 + }, + { + "epoch": 0.5538292011019283, + "grad_norm": 5.100748062133789, + "learning_rate": 4.228536355497051e-06, + "loss": 0.4171, + "step": 5026 + }, + { + "epoch": 0.553939393939394, + "grad_norm": 7.906282424926758, + "learning_rate": 4.226808902342174e-06, + "loss": 0.4004, + "step": 5027 + }, + { + "epoch": 0.5540495867768596, + "grad_norm": 6.108539581298828, + "learning_rate": 4.225081543734153e-06, + "loss": 0.3568, + "step": 5028 + }, + { + "epoch": 0.554159779614325, + "grad_norm": 9.712579727172852, + "learning_rate": 4.223354279884216e-06, + "loss": 0.4134, + "step": 5029 + }, + { + "epoch": 0.5542699724517907, + "grad_norm": 4.735379219055176, + "learning_rate": 4.221627111003571e-06, + "loss": 0.4288, + "step": 5030 + }, + { + "epoch": 0.5543801652892562, + "grad_norm": 
7.070766925811768, + "learning_rate": 4.219900037303421e-06, + "loss": 0.3591, + "step": 5031 + }, + { + "epoch": 0.5544903581267218, + "grad_norm": 5.8401031494140625, + "learning_rate": 4.2181730589949546e-06, + "loss": 0.4314, + "step": 5032 + }, + { + "epoch": 0.5546005509641874, + "grad_norm": 7.939309597015381, + "learning_rate": 4.216446176289346e-06, + "loss": 0.4832, + "step": 5033 + }, + { + "epoch": 0.5547107438016529, + "grad_norm": 5.133927345275879, + "learning_rate": 4.214719389397766e-06, + "loss": 0.3338, + "step": 5034 + }, + { + "epoch": 0.5548209366391185, + "grad_norm": 8.18437671661377, + "learning_rate": 4.212992698531366e-06, + "loss": 0.4691, + "step": 5035 + }, + { + "epoch": 0.5549311294765841, + "grad_norm": 9.384045600891113, + "learning_rate": 4.211266103901286e-06, + "loss": 0.4659, + "step": 5036 + }, + { + "epoch": 0.5550413223140496, + "grad_norm": 23.223127365112305, + "learning_rate": 4.209539605718659e-06, + "loss": 0.412, + "step": 5037 + }, + { + "epoch": 0.5551515151515152, + "grad_norm": 6.536337375640869, + "learning_rate": 4.207813204194604e-06, + "loss": 0.3964, + "step": 5038 + }, + { + "epoch": 0.5552617079889807, + "grad_norm": 10.770186424255371, + "learning_rate": 4.2060868995402235e-06, + "loss": 0.5334, + "step": 5039 + }, + { + "epoch": 0.5553719008264463, + "grad_norm": 6.0237040519714355, + "learning_rate": 4.204360691966618e-06, + "loss": 0.4517, + "step": 5040 + }, + { + "epoch": 0.5554820936639119, + "grad_norm": 7.980887413024902, + "learning_rate": 4.202634581684865e-06, + "loss": 0.4244, + "step": 5041 + }, + { + "epoch": 0.5555922865013774, + "grad_norm": 5.497826099395752, + "learning_rate": 4.200908568906041e-06, + "loss": 0.4697, + "step": 5042 + }, + { + "epoch": 0.555702479338843, + "grad_norm": 7.60978364944458, + "learning_rate": 4.199182653841203e-06, + "loss": 0.3848, + "step": 5043 + }, + { + "epoch": 0.5558126721763086, + "grad_norm": 5.508256435394287, + "learning_rate": 4.1974568367013955e-06, + "loss": 0.3393, + "step": 5044 + }, + { + "epoch": 0.5559228650137741, + "grad_norm": 9.76912784576416, + "learning_rate": 4.195731117697659e-06, + "loss": 0.4678, + "step": 5045 + }, + { + "epoch": 0.5560330578512397, + "grad_norm": 11.024624824523926, + "learning_rate": 4.194005497041012e-06, + "loss": 0.4008, + "step": 5046 + }, + { + "epoch": 0.5561432506887052, + "grad_norm": 9.174602508544922, + "learning_rate": 4.192279974942468e-06, + "loss": 0.4359, + "step": 5047 + }, + { + "epoch": 0.5562534435261708, + "grad_norm": 4.278285503387451, + "learning_rate": 4.190554551613027e-06, + "loss": 0.3259, + "step": 5048 + }, + { + "epoch": 0.5563636363636364, + "grad_norm": 5.89907169342041, + "learning_rate": 4.188829227263674e-06, + "loss": 0.4437, + "step": 5049 + }, + { + "epoch": 0.5564738292011019, + "grad_norm": 5.54419469833374, + "learning_rate": 4.187104002105384e-06, + "loss": 0.3755, + "step": 5050 + }, + { + "epoch": 0.5565840220385675, + "grad_norm": 7.27056884765625, + "learning_rate": 4.185378876349121e-06, + "loss": 0.3301, + "step": 5051 + }, + { + "epoch": 0.5566942148760331, + "grad_norm": 4.663144111633301, + "learning_rate": 4.183653850205837e-06, + "loss": 0.48, + "step": 5052 + }, + { + "epoch": 0.5568044077134986, + "grad_norm": 7.8643975257873535, + "learning_rate": 4.181928923886468e-06, + "loss": 0.4575, + "step": 5053 + }, + { + "epoch": 0.5569146005509642, + "grad_norm": 10.424306869506836, + "learning_rate": 4.1802040976019424e-06, + "loss": 0.4388, + "step": 5054 + }, + { + "epoch": 
0.5570247933884298, + "grad_norm": 4.337207794189453, + "learning_rate": 4.178479371563172e-06, + "loss": 0.468, + "step": 5055 + }, + { + "epoch": 0.5571349862258953, + "grad_norm": 9.239445686340332, + "learning_rate": 4.176754745981061e-06, + "loss": 0.4442, + "step": 5056 + }, + { + "epoch": 0.5572451790633609, + "grad_norm": 6.625512599945068, + "learning_rate": 4.175030221066497e-06, + "loss": 0.3327, + "step": 5057 + }, + { + "epoch": 0.5573553719008264, + "grad_norm": 6.2668070793151855, + "learning_rate": 4.173305797030359e-06, + "loss": 0.4486, + "step": 5058 + }, + { + "epoch": 0.557465564738292, + "grad_norm": 5.827762603759766, + "learning_rate": 4.171581474083511e-06, + "loss": 0.3978, + "step": 5059 + }, + { + "epoch": 0.5575757575757576, + "grad_norm": 6.255558490753174, + "learning_rate": 4.169857252436806e-06, + "loss": 0.367, + "step": 5060 + }, + { + "epoch": 0.5576859504132231, + "grad_norm": 9.605900764465332, + "learning_rate": 4.168133132301082e-06, + "loss": 0.5043, + "step": 5061 + }, + { + "epoch": 0.5577961432506887, + "grad_norm": 5.620938301086426, + "learning_rate": 4.16640911388717e-06, + "loss": 0.3788, + "step": 5062 + }, + { + "epoch": 0.5579063360881543, + "grad_norm": 4.651265621185303, + "learning_rate": 4.164685197405884e-06, + "loss": 0.3796, + "step": 5063 + }, + { + "epoch": 0.5580165289256198, + "grad_norm": 5.623221397399902, + "learning_rate": 4.162961383068027e-06, + "loss": 0.3577, + "step": 5064 + }, + { + "epoch": 0.5581267217630854, + "grad_norm": 9.15078067779541, + "learning_rate": 4.161237671084388e-06, + "loss": 0.45, + "step": 5065 + }, + { + "epoch": 0.5582369146005509, + "grad_norm": 4.805917739868164, + "learning_rate": 4.159514061665748e-06, + "loss": 0.3958, + "step": 5066 + }, + { + "epoch": 0.5583471074380165, + "grad_norm": 6.629087448120117, + "learning_rate": 4.157790555022867e-06, + "loss": 0.3458, + "step": 5067 + }, + { + "epoch": 0.5584573002754821, + "grad_norm": 10.288884162902832, + "learning_rate": 4.156067151366504e-06, + "loss": 0.4493, + "step": 5068 + }, + { + "epoch": 0.5585674931129476, + "grad_norm": 4.732473850250244, + "learning_rate": 4.154343850907393e-06, + "loss": 0.4328, + "step": 5069 + }, + { + "epoch": 0.5586776859504132, + "grad_norm": 5.083303928375244, + "learning_rate": 4.152620653856267e-06, + "loss": 0.3709, + "step": 5070 + }, + { + "epoch": 0.5587878787878788, + "grad_norm": 7.439194202423096, + "learning_rate": 4.150897560423839e-06, + "loss": 0.4592, + "step": 5071 + }, + { + "epoch": 0.5588980716253443, + "grad_norm": 6.688822269439697, + "learning_rate": 4.149174570820809e-06, + "loss": 0.3108, + "step": 5072 + }, + { + "epoch": 0.5590082644628099, + "grad_norm": 5.189765453338623, + "learning_rate": 4.1474516852578695e-06, + "loss": 0.3649, + "step": 5073 + }, + { + "epoch": 0.5591184573002754, + "grad_norm": 6.02034330368042, + "learning_rate": 4.145728903945696e-06, + "loss": 0.4075, + "step": 5074 + }, + { + "epoch": 0.559228650137741, + "grad_norm": 6.01516056060791, + "learning_rate": 4.14400622709495e-06, + "loss": 0.3907, + "step": 5075 + }, + { + "epoch": 0.5593388429752066, + "grad_norm": 8.235849380493164, + "learning_rate": 4.142283654916288e-06, + "loss": 0.4458, + "step": 5076 + }, + { + "epoch": 0.5594490358126721, + "grad_norm": 6.125243186950684, + "learning_rate": 4.1405611876203455e-06, + "loss": 0.456, + "step": 5077 + }, + { + "epoch": 0.5595592286501377, + "grad_norm": 5.977057456970215, + "learning_rate": 4.138838825417747e-06, + "loss": 0.3886, + "step": 5078 + }, + 
{ + "epoch": 0.5596694214876033, + "grad_norm": 6.936330795288086, + "learning_rate": 4.137116568519108e-06, + "loss": 0.3994, + "step": 5079 + }, + { + "epoch": 0.5597796143250688, + "grad_norm": 6.488701820373535, + "learning_rate": 4.135394417135027e-06, + "loss": 0.4766, + "step": 5080 + }, + { + "epoch": 0.5598898071625344, + "grad_norm": 6.146881103515625, + "learning_rate": 4.133672371476091e-06, + "loss": 0.4505, + "step": 5081 + }, + { + "epoch": 0.56, + "grad_norm": 11.271879196166992, + "learning_rate": 4.131950431752873e-06, + "loss": 0.3688, + "step": 5082 + }, + { + "epoch": 0.5601101928374655, + "grad_norm": 5.499881744384766, + "learning_rate": 4.130228598175936e-06, + "loss": 0.405, + "step": 5083 + }, + { + "epoch": 0.5602203856749312, + "grad_norm": 5.5053391456604, + "learning_rate": 4.1285068709558285e-06, + "loss": 0.4246, + "step": 5084 + }, + { + "epoch": 0.5603305785123966, + "grad_norm": 5.651018142700195, + "learning_rate": 4.126785250303084e-06, + "loss": 0.4212, + "step": 5085 + }, + { + "epoch": 0.5604407713498623, + "grad_norm": 5.410347938537598, + "learning_rate": 4.1250637364282246e-06, + "loss": 0.4234, + "step": 5086 + }, + { + "epoch": 0.5605509641873279, + "grad_norm": 4.950188159942627, + "learning_rate": 4.123342329541761e-06, + "loss": 0.4005, + "step": 5087 + }, + { + "epoch": 0.5606611570247934, + "grad_norm": 4.54240608215332, + "learning_rate": 4.121621029854188e-06, + "loss": 0.4118, + "step": 5088 + }, + { + "epoch": 0.560771349862259, + "grad_norm": 4.574641227722168, + "learning_rate": 4.119899837575988e-06, + "loss": 0.3916, + "step": 5089 + }, + { + "epoch": 0.5608815426997246, + "grad_norm": 6.007846832275391, + "learning_rate": 4.118178752917632e-06, + "loss": 0.4032, + "step": 5090 + }, + { + "epoch": 0.5609917355371901, + "grad_norm": 4.737065315246582, + "learning_rate": 4.116457776089576e-06, + "loss": 0.3332, + "step": 5091 + }, + { + "epoch": 0.5611019283746557, + "grad_norm": 14.039338111877441, + "learning_rate": 4.114736907302263e-06, + "loss": 0.5713, + "step": 5092 + }, + { + "epoch": 0.5612121212121212, + "grad_norm": 4.842670440673828, + "learning_rate": 4.113016146766124e-06, + "loss": 0.4207, + "step": 5093 + }, + { + "epoch": 0.5613223140495868, + "grad_norm": 5.0246806144714355, + "learning_rate": 4.111295494691575e-06, + "loss": 0.3918, + "step": 5094 + }, + { + "epoch": 0.5614325068870524, + "grad_norm": 7.352241516113281, + "learning_rate": 4.1095749512890185e-06, + "loss": 0.3944, + "step": 5095 + }, + { + "epoch": 0.5615426997245179, + "grad_norm": 6.591299057006836, + "learning_rate": 4.107854516768848e-06, + "loss": 0.3938, + "step": 5096 + }, + { + "epoch": 0.5616528925619835, + "grad_norm": 8.282949447631836, + "learning_rate": 4.1061341913414386e-06, + "loss": 0.383, + "step": 5097 + }, + { + "epoch": 0.5617630853994491, + "grad_norm": 10.040361404418945, + "learning_rate": 4.104413975217155e-06, + "loss": 0.4855, + "step": 5098 + }, + { + "epoch": 0.5618732782369146, + "grad_norm": 5.768685340881348, + "learning_rate": 4.102693868606349e-06, + "loss": 0.3668, + "step": 5099 + }, + { + "epoch": 0.5619834710743802, + "grad_norm": 7.725187301635742, + "learning_rate": 4.100973871719351e-06, + "loss": 0.4449, + "step": 5100 + }, + { + "epoch": 0.5620936639118457, + "grad_norm": 6.909238815307617, + "learning_rate": 4.0992539847664935e-06, + "loss": 0.3774, + "step": 5101 + }, + { + "epoch": 0.5622038567493113, + "grad_norm": 5.824862480163574, + "learning_rate": 4.097534207958081e-06, + "loss": 0.3557, + "step": 
5102 + }, + { + "epoch": 0.5623140495867769, + "grad_norm": 6.053623199462891, + "learning_rate": 4.095814541504409e-06, + "loss": 0.4114, + "step": 5103 + }, + { + "epoch": 0.5624242424242424, + "grad_norm": 9.661008834838867, + "learning_rate": 4.094094985615766e-06, + "loss": 0.4756, + "step": 5104 + }, + { + "epoch": 0.562534435261708, + "grad_norm": 7.061156272888184, + "learning_rate": 4.092375540502418e-06, + "loss": 0.4518, + "step": 5105 + }, + { + "epoch": 0.5626446280991736, + "grad_norm": 8.996638298034668, + "learning_rate": 4.090656206374622e-06, + "loss": 0.4032, + "step": 5106 + }, + { + "epoch": 0.5627548209366391, + "grad_norm": 4.997591495513916, + "learning_rate": 4.0889369834426195e-06, + "loss": 0.3713, + "step": 5107 + }, + { + "epoch": 0.5628650137741047, + "grad_norm": 9.249066352844238, + "learning_rate": 4.087217871916641e-06, + "loss": 0.4829, + "step": 5108 + }, + { + "epoch": 0.5629752066115703, + "grad_norm": 8.250449180603027, + "learning_rate": 4.0854988720069e-06, + "loss": 0.4207, + "step": 5109 + }, + { + "epoch": 0.5630853994490358, + "grad_norm": 5.6974406242370605, + "learning_rate": 4.0837799839236e-06, + "loss": 0.4235, + "step": 5110 + }, + { + "epoch": 0.5631955922865014, + "grad_norm": 6.328078746795654, + "learning_rate": 4.082061207876927e-06, + "loss": 0.3676, + "step": 5111 + }, + { + "epoch": 0.5633057851239669, + "grad_norm": 9.134344100952148, + "learning_rate": 4.080342544077058e-06, + "loss": 0.4886, + "step": 5112 + }, + { + "epoch": 0.5634159779614325, + "grad_norm": 7.8390960693359375, + "learning_rate": 4.078623992734151e-06, + "loss": 0.4518, + "step": 5113 + }, + { + "epoch": 0.5635261707988981, + "grad_norm": 8.834000587463379, + "learning_rate": 4.076905554058353e-06, + "loss": 0.4235, + "step": 5114 + }, + { + "epoch": 0.5636363636363636, + "grad_norm": 6.361852645874023, + "learning_rate": 4.0751872282598e-06, + "loss": 0.3848, + "step": 5115 + }, + { + "epoch": 0.5637465564738292, + "grad_norm": 6.1839094161987305, + "learning_rate": 4.073469015548608e-06, + "loss": 0.3905, + "step": 5116 + }, + { + "epoch": 0.5638567493112948, + "grad_norm": 6.1756391525268555, + "learning_rate": 4.0717509161348815e-06, + "loss": 0.4582, + "step": 5117 + }, + { + "epoch": 0.5639669421487603, + "grad_norm": 4.953059196472168, + "learning_rate": 4.0700329302287165e-06, + "loss": 0.3591, + "step": 5118 + }, + { + "epoch": 0.5640771349862259, + "grad_norm": 5.793675422668457, + "learning_rate": 4.068315058040187e-06, + "loss": 0.3357, + "step": 5119 + }, + { + "epoch": 0.5641873278236914, + "grad_norm": 11.507821083068848, + "learning_rate": 4.0665972997793565e-06, + "loss": 0.5176, + "step": 5120 + }, + { + "epoch": 0.564297520661157, + "grad_norm": 9.467365264892578, + "learning_rate": 4.064879655656278e-06, + "loss": 0.4334, + "step": 5121 + }, + { + "epoch": 0.5644077134986226, + "grad_norm": 7.418822288513184, + "learning_rate": 4.063162125880986e-06, + "loss": 0.5048, + "step": 5122 + }, + { + "epoch": 0.5645179063360881, + "grad_norm": 9.262983322143555, + "learning_rate": 4.061444710663498e-06, + "loss": 0.3923, + "step": 5123 + }, + { + "epoch": 0.5646280991735537, + "grad_norm": 8.48256778717041, + "learning_rate": 4.05972741021383e-06, + "loss": 0.456, + "step": 5124 + }, + { + "epoch": 0.5647382920110193, + "grad_norm": 5.7215094566345215, + "learning_rate": 4.0580102247419684e-06, + "loss": 0.3397, + "step": 5125 + }, + { + "epoch": 0.5648484848484848, + "grad_norm": 5.982723712921143, + "learning_rate": 4.0562931544578975e-06, + 
"loss": 0.4135, + "step": 5126 + }, + { + "epoch": 0.5649586776859504, + "grad_norm": 9.416303634643555, + "learning_rate": 4.054576199571584e-06, + "loss": 0.3751, + "step": 5127 + }, + { + "epoch": 0.5650688705234159, + "grad_norm": 5.361845970153809, + "learning_rate": 4.0528593602929715e-06, + "loss": 0.3938, + "step": 5128 + }, + { + "epoch": 0.5651790633608815, + "grad_norm": 14.400737762451172, + "learning_rate": 4.051142636832007e-06, + "loss": 0.4752, + "step": 5129 + }, + { + "epoch": 0.5652892561983471, + "grad_norm": 6.782834053039551, + "learning_rate": 4.0494260293986095e-06, + "loss": 0.3934, + "step": 5130 + }, + { + "epoch": 0.5653994490358126, + "grad_norm": 5.719883441925049, + "learning_rate": 4.047709538202686e-06, + "loss": 0.3518, + "step": 5131 + }, + { + "epoch": 0.5655096418732782, + "grad_norm": 6.53598165512085, + "learning_rate": 4.045993163454137e-06, + "loss": 0.3836, + "step": 5132 + }, + { + "epoch": 0.5656198347107438, + "grad_norm": 5.460474014282227, + "learning_rate": 4.044276905362838e-06, + "loss": 0.4103, + "step": 5133 + }, + { + "epoch": 0.5657300275482093, + "grad_norm": 9.956212997436523, + "learning_rate": 4.042560764138657e-06, + "loss": 0.4957, + "step": 5134 + }, + { + "epoch": 0.5658402203856749, + "grad_norm": 8.960071563720703, + "learning_rate": 4.040844739991447e-06, + "loss": 0.4983, + "step": 5135 + }, + { + "epoch": 0.5659504132231405, + "grad_norm": 8.893988609313965, + "learning_rate": 4.039128833131046e-06, + "loss": 0.4494, + "step": 5136 + }, + { + "epoch": 0.566060606060606, + "grad_norm": 4.337855339050293, + "learning_rate": 4.037413043767274e-06, + "loss": 0.3394, + "step": 5137 + }, + { + "epoch": 0.5661707988980716, + "grad_norm": 7.404200553894043, + "learning_rate": 4.035697372109944e-06, + "loss": 0.4619, + "step": 5138 + }, + { + "epoch": 0.5662809917355371, + "grad_norm": 6.283228397369385, + "learning_rate": 4.033981818368849e-06, + "loss": 0.3573, + "step": 5139 + }, + { + "epoch": 0.5663911845730027, + "grad_norm": 4.769010543823242, + "learning_rate": 4.03226638275377e-06, + "loss": 0.3701, + "step": 5140 + }, + { + "epoch": 0.5665013774104684, + "grad_norm": 8.625991821289062, + "learning_rate": 4.030551065474472e-06, + "loss": 0.3806, + "step": 5141 + }, + { + "epoch": 0.5666115702479338, + "grad_norm": 6.381789207458496, + "learning_rate": 4.0288358667407055e-06, + "loss": 0.3414, + "step": 5142 + }, + { + "epoch": 0.5667217630853995, + "grad_norm": 8.321535110473633, + "learning_rate": 4.02712078676221e-06, + "loss": 0.3744, + "step": 5143 + }, + { + "epoch": 0.5668319559228651, + "grad_norm": 5.784425258636475, + "learning_rate": 4.025405825748706e-06, + "loss": 0.3542, + "step": 5144 + }, + { + "epoch": 0.5669421487603306, + "grad_norm": 5.705422401428223, + "learning_rate": 4.023690983909901e-06, + "loss": 0.4205, + "step": 5145 + }, + { + "epoch": 0.5670523415977962, + "grad_norm": 7.45026159286499, + "learning_rate": 4.021976261455488e-06, + "loss": 0.468, + "step": 5146 + }, + { + "epoch": 0.5671625344352617, + "grad_norm": 6.045354843139648, + "learning_rate": 4.020261658595147e-06, + "loss": 0.4172, + "step": 5147 + }, + { + "epoch": 0.5672727272727273, + "grad_norm": 11.95577621459961, + "learning_rate": 4.0185471755385404e-06, + "loss": 0.3981, + "step": 5148 + }, + { + "epoch": 0.5673829201101929, + "grad_norm": 6.558634281158447, + "learning_rate": 4.01683281249532e-06, + "loss": 0.4548, + "step": 5149 + }, + { + "epoch": 0.5674931129476584, + "grad_norm": 5.374818801879883, + "learning_rate": 
4.015118569675118e-06, + "loss": 0.4464, + "step": 5150 + }, + { + "epoch": 0.567603305785124, + "grad_norm": 5.898446083068848, + "learning_rate": 4.013404447287554e-06, + "loss": 0.434, + "step": 5151 + }, + { + "epoch": 0.5677134986225896, + "grad_norm": 6.746851444244385, + "learning_rate": 4.011690445542237e-06, + "loss": 0.405, + "step": 5152 + }, + { + "epoch": 0.5678236914600551, + "grad_norm": 5.899205207824707, + "learning_rate": 4.009976564648752e-06, + "loss": 0.3967, + "step": 5153 + }, + { + "epoch": 0.5679338842975207, + "grad_norm": 5.740797996520996, + "learning_rate": 4.008262804816679e-06, + "loss": 0.4142, + "step": 5154 + }, + { + "epoch": 0.5680440771349863, + "grad_norm": 9.104641914367676, + "learning_rate": 4.006549166255577e-06, + "loss": 0.3933, + "step": 5155 + }, + { + "epoch": 0.5681542699724518, + "grad_norm": 8.068328857421875, + "learning_rate": 4.004835649174992e-06, + "loss": 0.3886, + "step": 5156 + }, + { + "epoch": 0.5682644628099174, + "grad_norm": 7.784996509552002, + "learning_rate": 4.003122253784457e-06, + "loss": 0.377, + "step": 5157 + }, + { + "epoch": 0.5683746556473829, + "grad_norm": 5.714048862457275, + "learning_rate": 4.001408980293487e-06, + "loss": 0.4052, + "step": 5158 + }, + { + "epoch": 0.5684848484848485, + "grad_norm": 7.091597080230713, + "learning_rate": 3.999695828911581e-06, + "loss": 0.428, + "step": 5159 + }, + { + "epoch": 0.5685950413223141, + "grad_norm": 7.845519065856934, + "learning_rate": 3.99798279984823e-06, + "loss": 0.3705, + "step": 5160 + }, + { + "epoch": 0.5687052341597796, + "grad_norm": 10.018808364868164, + "learning_rate": 3.9962698933129026e-06, + "loss": 0.4184, + "step": 5161 + }, + { + "epoch": 0.5688154269972452, + "grad_norm": 18.566343307495117, + "learning_rate": 3.9945571095150545e-06, + "loss": 0.4997, + "step": 5162 + }, + { + "epoch": 0.5689256198347108, + "grad_norm": 4.974708080291748, + "learning_rate": 3.992844448664132e-06, + "loss": 0.4585, + "step": 5163 + }, + { + "epoch": 0.5690358126721763, + "grad_norm": 9.041829109191895, + "learning_rate": 3.991131910969558e-06, + "loss": 0.3548, + "step": 5164 + }, + { + "epoch": 0.5691460055096419, + "grad_norm": 5.789597034454346, + "learning_rate": 3.989419496640742e-06, + "loss": 0.4318, + "step": 5165 + }, + { + "epoch": 0.5692561983471074, + "grad_norm": 8.98617935180664, + "learning_rate": 3.987707205887084e-06, + "loss": 0.3519, + "step": 5166 + }, + { + "epoch": 0.569366391184573, + "grad_norm": 4.784662246704102, + "learning_rate": 3.985995038917961e-06, + "loss": 0.3853, + "step": 5167 + }, + { + "epoch": 0.5694765840220386, + "grad_norm": 8.933760643005371, + "learning_rate": 3.984282995942746e-06, + "loss": 0.4976, + "step": 5168 + }, + { + "epoch": 0.5695867768595041, + "grad_norm": 4.532029151916504, + "learning_rate": 3.982571077170786e-06, + "loss": 0.4092, + "step": 5169 + }, + { + "epoch": 0.5696969696969697, + "grad_norm": 4.453444004058838, + "learning_rate": 3.980859282811414e-06, + "loss": 0.398, + "step": 5170 + }, + { + "epoch": 0.5698071625344353, + "grad_norm": 4.47035026550293, + "learning_rate": 3.979147613073956e-06, + "loss": 0.3952, + "step": 5171 + }, + { + "epoch": 0.5699173553719008, + "grad_norm": 5.4785990715026855, + "learning_rate": 3.977436068167714e-06, + "loss": 0.3992, + "step": 5172 + }, + { + "epoch": 0.5700275482093664, + "grad_norm": 7.377060890197754, + "learning_rate": 3.975724648301976e-06, + "loss": 0.4763, + "step": 5173 + }, + { + "epoch": 0.5701377410468319, + "grad_norm": 8.86286449432373, + 
"learning_rate": 3.974013353686022e-06, + "loss": 0.4504, + "step": 5174 + }, + { + "epoch": 0.5702479338842975, + "grad_norm": 8.21593952178955, + "learning_rate": 3.972302184529108e-06, + "loss": 0.4363, + "step": 5175 + }, + { + "epoch": 0.5703581267217631, + "grad_norm": 4.344210147857666, + "learning_rate": 3.9705911410404785e-06, + "loss": 0.4251, + "step": 5176 + }, + { + "epoch": 0.5704683195592286, + "grad_norm": 4.425926685333252, + "learning_rate": 3.968880223429364e-06, + "loss": 0.4166, + "step": 5177 + }, + { + "epoch": 0.5705785123966942, + "grad_norm": 6.354044437408447, + "learning_rate": 3.967169431904975e-06, + "loss": 0.3828, + "step": 5178 + }, + { + "epoch": 0.5706887052341598, + "grad_norm": 5.4605278968811035, + "learning_rate": 3.96545876667651e-06, + "loss": 0.4334, + "step": 5179 + }, + { + "epoch": 0.5707988980716253, + "grad_norm": 8.027145385742188, + "learning_rate": 3.963748227953154e-06, + "loss": 0.4629, + "step": 5180 + }, + { + "epoch": 0.5709090909090909, + "grad_norm": 5.73824405670166, + "learning_rate": 3.962037815944071e-06, + "loss": 0.4121, + "step": 5181 + }, + { + "epoch": 0.5710192837465565, + "grad_norm": 9.135799407958984, + "learning_rate": 3.960327530858415e-06, + "loss": 0.4432, + "step": 5182 + }, + { + "epoch": 0.571129476584022, + "grad_norm": 5.127810478210449, + "learning_rate": 3.95861737290532e-06, + "loss": 0.3297, + "step": 5183 + }, + { + "epoch": 0.5712396694214876, + "grad_norm": 5.376129150390625, + "learning_rate": 3.956907342293908e-06, + "loss": 0.4374, + "step": 5184 + }, + { + "epoch": 0.5713498622589531, + "grad_norm": 8.083807945251465, + "learning_rate": 3.955197439233283e-06, + "loss": 0.4545, + "step": 5185 + }, + { + "epoch": 0.5714600550964187, + "grad_norm": 6.188045501708984, + "learning_rate": 3.953487663932535e-06, + "loss": 0.403, + "step": 5186 + }, + { + "epoch": 0.5715702479338843, + "grad_norm": 5.2420125007629395, + "learning_rate": 3.951778016600734e-06, + "loss": 0.3563, + "step": 5187 + }, + { + "epoch": 0.5716804407713498, + "grad_norm": 5.644331455230713, + "learning_rate": 3.950068497446944e-06, + "loss": 0.4251, + "step": 5188 + }, + { + "epoch": 0.5717906336088154, + "grad_norm": 7.3118157386779785, + "learning_rate": 3.948359106680205e-06, + "loss": 0.4561, + "step": 5189 + }, + { + "epoch": 0.571900826446281, + "grad_norm": 6.968026161193848, + "learning_rate": 3.946649844509539e-06, + "loss": 0.4548, + "step": 5190 + }, + { + "epoch": 0.5720110192837465, + "grad_norm": 5.8956193923950195, + "learning_rate": 3.944940711143964e-06, + "loss": 0.4194, + "step": 5191 + }, + { + "epoch": 0.5721212121212121, + "grad_norm": 5.4817986488342285, + "learning_rate": 3.9432317067924716e-06, + "loss": 0.4281, + "step": 5192 + }, + { + "epoch": 0.5722314049586776, + "grad_norm": 7.703038215637207, + "learning_rate": 3.941522831664041e-06, + "loss": 0.4371, + "step": 5193 + }, + { + "epoch": 0.5723415977961432, + "grad_norm": 5.166708946228027, + "learning_rate": 3.939814085967636e-06, + "loss": 0.4106, + "step": 5194 + }, + { + "epoch": 0.5724517906336088, + "grad_norm": 7.860630035400391, + "learning_rate": 3.938105469912204e-06, + "loss": 0.4284, + "step": 5195 + }, + { + "epoch": 0.5725619834710743, + "grad_norm": 8.984552383422852, + "learning_rate": 3.93639698370668e-06, + "loss": 0.4867, + "step": 5196 + }, + { + "epoch": 0.57267217630854, + "grad_norm": 6.660092353820801, + "learning_rate": 3.934688627559977e-06, + "loss": 0.4048, + "step": 5197 + }, + { + "epoch": 0.5727823691460056, + "grad_norm": 
5.779437065124512, + "learning_rate": 3.932980401680994e-06, + "loss": 0.3702, + "step": 5198 + }, + { + "epoch": 0.572892561983471, + "grad_norm": 9.79520320892334, + "learning_rate": 3.931272306278619e-06, + "loss": 0.4939, + "step": 5199 + }, + { + "epoch": 0.5730027548209367, + "grad_norm": 5.133498191833496, + "learning_rate": 3.9295643415617164e-06, + "loss": 0.3741, + "step": 5200 + }, + { + "epoch": 0.5731129476584021, + "grad_norm": 3.9077324867248535, + "learning_rate": 3.9278565077391404e-06, + "loss": 0.4111, + "step": 5201 + }, + { + "epoch": 0.5732231404958678, + "grad_norm": 5.43770170211792, + "learning_rate": 3.926148805019728e-06, + "loss": 0.4027, + "step": 5202 + }, + { + "epoch": 0.5733333333333334, + "grad_norm": 7.171450138092041, + "learning_rate": 3.924441233612298e-06, + "loss": 0.3585, + "step": 5203 + }, + { + "epoch": 0.5734435261707989, + "grad_norm": 4.4244303703308105, + "learning_rate": 3.922733793725654e-06, + "loss": 0.3767, + "step": 5204 + }, + { + "epoch": 0.5735537190082645, + "grad_norm": 6.719311714172363, + "learning_rate": 3.921026485568587e-06, + "loss": 0.3987, + "step": 5205 + }, + { + "epoch": 0.5736639118457301, + "grad_norm": 5.38864803314209, + "learning_rate": 3.919319309349865e-06, + "loss": 0.3972, + "step": 5206 + }, + { + "epoch": 0.5737741046831956, + "grad_norm": 6.519610404968262, + "learning_rate": 3.917612265278246e-06, + "loss": 0.3909, + "step": 5207 + }, + { + "epoch": 0.5738842975206612, + "grad_norm": 5.475235462188721, + "learning_rate": 3.91590535356247e-06, + "loss": 0.3649, + "step": 5208 + }, + { + "epoch": 0.5739944903581268, + "grad_norm": 9.819671630859375, + "learning_rate": 3.91419857441126e-06, + "loss": 0.4592, + "step": 5209 + }, + { + "epoch": 0.5741046831955923, + "grad_norm": 6.266026020050049, + "learning_rate": 3.912491928033324e-06, + "loss": 0.3988, + "step": 5210 + }, + { + "epoch": 0.5742148760330579, + "grad_norm": 9.54811954498291, + "learning_rate": 3.910785414637351e-06, + "loss": 0.4041, + "step": 5211 + }, + { + "epoch": 0.5743250688705234, + "grad_norm": 8.638733863830566, + "learning_rate": 3.909079034432018e-06, + "loss": 0.3596, + "step": 5212 + }, + { + "epoch": 0.574435261707989, + "grad_norm": 7.72868013381958, + "learning_rate": 3.907372787625982e-06, + "loss": 0.3528, + "step": 5213 + }, + { + "epoch": 0.5745454545454546, + "grad_norm": 11.883960723876953, + "learning_rate": 3.905666674427887e-06, + "loss": 0.4519, + "step": 5214 + }, + { + "epoch": 0.5746556473829201, + "grad_norm": 14.226672172546387, + "learning_rate": 3.903960695046354e-06, + "loss": 0.4518, + "step": 5215 + }, + { + "epoch": 0.5747658402203857, + "grad_norm": 13.8538236618042, + "learning_rate": 3.902254849689999e-06, + "loss": 0.4335, + "step": 5216 + }, + { + "epoch": 0.5748760330578513, + "grad_norm": 7.674587726593018, + "learning_rate": 3.900549138567413e-06, + "loss": 0.4194, + "step": 5217 + }, + { + "epoch": 0.5749862258953168, + "grad_norm": 4.918647289276123, + "learning_rate": 3.8988435618871685e-06, + "loss": 0.3704, + "step": 5218 + }, + { + "epoch": 0.5750964187327824, + "grad_norm": 10.45202922821045, + "learning_rate": 3.897138119857833e-06, + "loss": 0.4117, + "step": 5219 + }, + { + "epoch": 0.5752066115702479, + "grad_norm": 4.533913612365723, + "learning_rate": 3.895432812687944e-06, + "loss": 0.3856, + "step": 5220 + }, + { + "epoch": 0.5753168044077135, + "grad_norm": 4.864469051361084, + "learning_rate": 3.89372764058603e-06, + "loss": 0.3646, + "step": 5221 + }, + { + "epoch": 
0.5754269972451791, + "grad_norm": 7.552703857421875, + "learning_rate": 3.892022603760605e-06, + "loss": 0.3838, + "step": 5222 + }, + { + "epoch": 0.5755371900826446, + "grad_norm": 8.9181489944458, + "learning_rate": 3.890317702420158e-06, + "loss": 0.386, + "step": 5223 + }, + { + "epoch": 0.5756473829201102, + "grad_norm": 10.64404010772705, + "learning_rate": 3.888612936773173e-06, + "loss": 0.3579, + "step": 5224 + }, + { + "epoch": 0.5757575757575758, + "grad_norm": 5.714846611022949, + "learning_rate": 3.886908307028108e-06, + "loss": 0.3722, + "step": 5225 + }, + { + "epoch": 0.5758677685950413, + "grad_norm": 8.370100975036621, + "learning_rate": 3.885203813393404e-06, + "loss": 0.5137, + "step": 5226 + }, + { + "epoch": 0.5759779614325069, + "grad_norm": 6.253170490264893, + "learning_rate": 3.883499456077495e-06, + "loss": 0.3986, + "step": 5227 + }, + { + "epoch": 0.5760881542699724, + "grad_norm": 7.653555393218994, + "learning_rate": 3.881795235288788e-06, + "loss": 0.3785, + "step": 5228 + }, + { + "epoch": 0.576198347107438, + "grad_norm": 4.932460784912109, + "learning_rate": 3.880091151235678e-06, + "loss": 0.3807, + "step": 5229 + }, + { + "epoch": 0.5763085399449036, + "grad_norm": 8.360217094421387, + "learning_rate": 3.878387204126544e-06, + "loss": 0.4467, + "step": 5230 + }, + { + "epoch": 0.5764187327823691, + "grad_norm": 4.929142951965332, + "learning_rate": 3.8766833941697464e-06, + "loss": 0.3762, + "step": 5231 + }, + { + "epoch": 0.5765289256198347, + "grad_norm": 7.1862688064575195, + "learning_rate": 3.874979721573628e-06, + "loss": 0.3917, + "step": 5232 + }, + { + "epoch": 0.5766391184573003, + "grad_norm": 11.578612327575684, + "learning_rate": 3.873276186546519e-06, + "loss": 0.3606, + "step": 5233 + }, + { + "epoch": 0.5767493112947658, + "grad_norm": 8.164372444152832, + "learning_rate": 3.871572789296727e-06, + "loss": 0.4254, + "step": 5234 + }, + { + "epoch": 0.5768595041322314, + "grad_norm": 6.481094837188721, + "learning_rate": 3.8698695300325475e-06, + "loss": 0.429, + "step": 5235 + }, + { + "epoch": 0.576969696969697, + "grad_norm": 4.183745861053467, + "learning_rate": 3.868166408962258e-06, + "loss": 0.3469, + "step": 5236 + }, + { + "epoch": 0.5770798898071625, + "grad_norm": 8.76431941986084, + "learning_rate": 3.8664634262941155e-06, + "loss": 0.4076, + "step": 5237 + }, + { + "epoch": 0.5771900826446281, + "grad_norm": 7.580943584442139, + "learning_rate": 3.864760582236367e-06, + "loss": 0.4, + "step": 5238 + }, + { + "epoch": 0.5773002754820936, + "grad_norm": 9.020663261413574, + "learning_rate": 3.863057876997236e-06, + "loss": 0.3657, + "step": 5239 + }, + { + "epoch": 0.5774104683195592, + "grad_norm": 10.859776496887207, + "learning_rate": 3.861355310784932e-06, + "loss": 0.4231, + "step": 5240 + }, + { + "epoch": 0.5775206611570248, + "grad_norm": 7.436887264251709, + "learning_rate": 3.8596528838076476e-06, + "loss": 0.4406, + "step": 5241 + }, + { + "epoch": 0.5776308539944903, + "grad_norm": 10.0736722946167, + "learning_rate": 3.857950596273558e-06, + "loss": 0.4015, + "step": 5242 + }, + { + "epoch": 0.5777410468319559, + "grad_norm": 5.721993446350098, + "learning_rate": 3.8562484483908185e-06, + "loss": 0.3466, + "step": 5243 + }, + { + "epoch": 0.5778512396694215, + "grad_norm": 6.503122329711914, + "learning_rate": 3.854546440367575e-06, + "loss": 0.3774, + "step": 5244 + }, + { + "epoch": 0.577961432506887, + "grad_norm": 6.205786228179932, + "learning_rate": 3.852844572411949e-06, + "loss": 0.3698, + "step": 5245 + 
}, + { + "epoch": 0.5780716253443526, + "grad_norm": 6.713566303253174, + "learning_rate": 3.851142844732043e-06, + "loss": 0.4425, + "step": 5246 + }, + { + "epoch": 0.5781818181818181, + "grad_norm": 6.699551105499268, + "learning_rate": 3.849441257535955e-06, + "loss": 0.4607, + "step": 5247 + }, + { + "epoch": 0.5782920110192837, + "grad_norm": 3.772110939025879, + "learning_rate": 3.847739811031751e-06, + "loss": 0.3504, + "step": 5248 + }, + { + "epoch": 0.5784022038567493, + "grad_norm": 7.316378593444824, + "learning_rate": 3.846038505427487e-06, + "loss": 0.3804, + "step": 5249 + }, + { + "epoch": 0.5785123966942148, + "grad_norm": 11.61896800994873, + "learning_rate": 3.844337340931204e-06, + "loss": 0.658, + "step": 5250 + }, + { + "epoch": 0.5786225895316804, + "grad_norm": 12.600423812866211, + "learning_rate": 3.842636317750918e-06, + "loss": 0.4261, + "step": 5251 + }, + { + "epoch": 0.578732782369146, + "grad_norm": 4.730981826782227, + "learning_rate": 3.840935436094639e-06, + "loss": 0.4067, + "step": 5252 + }, + { + "epoch": 0.5788429752066115, + "grad_norm": 5.613015651702881, + "learning_rate": 3.839234696170348e-06, + "loss": 0.4097, + "step": 5253 + }, + { + "epoch": 0.5789531680440771, + "grad_norm": 10.097566604614258, + "learning_rate": 3.8375340981860134e-06, + "loss": 0.4101, + "step": 5254 + }, + { + "epoch": 0.5790633608815428, + "grad_norm": 6.698999404907227, + "learning_rate": 3.8358336423495904e-06, + "loss": 0.3602, + "step": 5255 + }, + { + "epoch": 0.5791735537190082, + "grad_norm": 6.215074062347412, + "learning_rate": 3.834133328869011e-06, + "loss": 0.4701, + "step": 5256 + }, + { + "epoch": 0.5792837465564739, + "grad_norm": 10.790955543518066, + "learning_rate": 3.832433157952189e-06, + "loss": 0.3983, + "step": 5257 + }, + { + "epoch": 0.5793939393939394, + "grad_norm": 6.542206764221191, + "learning_rate": 3.830733129807029e-06, + "loss": 0.3291, + "step": 5258 + }, + { + "epoch": 0.579504132231405, + "grad_norm": 10.732033729553223, + "learning_rate": 3.829033244641411e-06, + "loss": 0.4505, + "step": 5259 + }, + { + "epoch": 0.5796143250688706, + "grad_norm": 8.658686637878418, + "learning_rate": 3.827333502663195e-06, + "loss": 0.4709, + "step": 5260 + }, + { + "epoch": 0.5797245179063361, + "grad_norm": 4.506829261779785, + "learning_rate": 3.825633904080234e-06, + "loss": 0.3502, + "step": 5261 + }, + { + "epoch": 0.5798347107438017, + "grad_norm": 7.29030704498291, + "learning_rate": 3.823934449100354e-06, + "loss": 0.4117, + "step": 5262 + }, + { + "epoch": 0.5799449035812673, + "grad_norm": 6.036261558532715, + "learning_rate": 3.822235137931366e-06, + "loss": 0.4133, + "step": 5263 + }, + { + "epoch": 0.5800550964187328, + "grad_norm": 7.769625663757324, + "learning_rate": 3.820535970781066e-06, + "loss": 0.4837, + "step": 5264 + }, + { + "epoch": 0.5801652892561984, + "grad_norm": 5.557866096496582, + "learning_rate": 3.818836947857229e-06, + "loss": 0.462, + "step": 5265 + }, + { + "epoch": 0.5802754820936639, + "grad_norm": 5.770900726318359, + "learning_rate": 3.817138069367615e-06, + "loss": 0.4381, + "step": 5266 + }, + { + "epoch": 0.5803856749311295, + "grad_norm": 9.156088829040527, + "learning_rate": 3.8154393355199656e-06, + "loss": 0.4591, + "step": 5267 + }, + { + "epoch": 0.5804958677685951, + "grad_norm": 5.883398532867432, + "learning_rate": 3.8137407465220012e-06, + "loss": 0.4588, + "step": 5268 + }, + { + "epoch": 0.5806060606060606, + "grad_norm": 6.452761173248291, + "learning_rate": 3.8120423025814314e-06, + "loss": 
0.415, + "step": 5269 + }, + { + "epoch": 0.5807162534435262, + "grad_norm": 7.139335632324219, + "learning_rate": 3.8103440039059418e-06, + "loss": 0.3779, + "step": 5270 + }, + { + "epoch": 0.5808264462809918, + "grad_norm": 6.148501873016357, + "learning_rate": 3.8086458507032033e-06, + "loss": 0.467, + "step": 5271 + }, + { + "epoch": 0.5809366391184573, + "grad_norm": 5.470493793487549, + "learning_rate": 3.8069478431808686e-06, + "loss": 0.435, + "step": 5272 + }, + { + "epoch": 0.5810468319559229, + "grad_norm": 5.511624336242676, + "learning_rate": 3.8052499815465738e-06, + "loss": 0.3664, + "step": 5273 + }, + { + "epoch": 0.5811570247933884, + "grad_norm": 6.3985371589660645, + "learning_rate": 3.803552266007931e-06, + "loss": 0.4445, + "step": 5274 + }, + { + "epoch": 0.581267217630854, + "grad_norm": 6.291167736053467, + "learning_rate": 3.8018546967725444e-06, + "loss": 0.404, + "step": 5275 + }, + { + "epoch": 0.5813774104683196, + "grad_norm": 10.149051666259766, + "learning_rate": 3.800157274047994e-06, + "loss": 0.4318, + "step": 5276 + }, + { + "epoch": 0.5814876033057851, + "grad_norm": 11.656339645385742, + "learning_rate": 3.7984599980418393e-06, + "loss": 0.4842, + "step": 5277 + }, + { + "epoch": 0.5815977961432507, + "grad_norm": 7.794707298278809, + "learning_rate": 3.7967628689616304e-06, + "loss": 0.4812, + "step": 5278 + }, + { + "epoch": 0.5817079889807163, + "grad_norm": 5.070430278778076, + "learning_rate": 3.79506588701489e-06, + "loss": 0.3897, + "step": 5279 + }, + { + "epoch": 0.5818181818181818, + "grad_norm": 4.962933540344238, + "learning_rate": 3.793369052409132e-06, + "loss": 0.3878, + "step": 5280 + }, + { + "epoch": 0.5819283746556474, + "grad_norm": 6.511288166046143, + "learning_rate": 3.791672365351845e-06, + "loss": 0.4635, + "step": 5281 + }, + { + "epoch": 0.582038567493113, + "grad_norm": 6.625985622406006, + "learning_rate": 3.7899758260505e-06, + "loss": 0.4141, + "step": 5282 + }, + { + "epoch": 0.5821487603305785, + "grad_norm": 5.150052547454834, + "learning_rate": 3.788279434712558e-06, + "loss": 0.3894, + "step": 5283 + }, + { + "epoch": 0.5822589531680441, + "grad_norm": 8.391870498657227, + "learning_rate": 3.7865831915454515e-06, + "loss": 0.4382, + "step": 5284 + }, + { + "epoch": 0.5823691460055096, + "grad_norm": 8.36882495880127, + "learning_rate": 3.7848870967565996e-06, + "loss": 0.3315, + "step": 5285 + }, + { + "epoch": 0.5824793388429752, + "grad_norm": 6.879951000213623, + "learning_rate": 3.783191150553405e-06, + "loss": 0.4199, + "step": 5286 + }, + { + "epoch": 0.5825895316804408, + "grad_norm": 5.488747596740723, + "learning_rate": 3.7814953531432495e-06, + "loss": 0.4037, + "step": 5287 + }, + { + "epoch": 0.5826997245179063, + "grad_norm": 4.865351676940918, + "learning_rate": 3.7797997047334966e-06, + "loss": 0.3363, + "step": 5288 + }, + { + "epoch": 0.5828099173553719, + "grad_norm": 4.42156457901001, + "learning_rate": 3.7781042055314943e-06, + "loss": 0.3398, + "step": 5289 + }, + { + "epoch": 0.5829201101928375, + "grad_norm": 8.863175392150879, + "learning_rate": 3.7764088557445686e-06, + "loss": 0.4587, + "step": 5290 + }, + { + "epoch": 0.583030303030303, + "grad_norm": 9.259750366210938, + "learning_rate": 3.77471365558003e-06, + "loss": 0.4136, + "step": 5291 + }, + { + "epoch": 0.5831404958677686, + "grad_norm": 8.834726333618164, + "learning_rate": 3.7730186052451713e-06, + "loss": 0.4891, + "step": 5292 + }, + { + "epoch": 0.5832506887052341, + "grad_norm": 6.8883280754089355, + "learning_rate": 
3.771323704947263e-06, + "loss": 0.4081, + "step": 5293 + }, + { + "epoch": 0.5833608815426997, + "grad_norm": 5.524451732635498, + "learning_rate": 3.769628954893562e-06, + "loss": 0.3943, + "step": 5294 + }, + { + "epoch": 0.5834710743801653, + "grad_norm": 6.886898994445801, + "learning_rate": 3.767934355291303e-06, + "loss": 0.4264, + "step": 5295 + }, + { + "epoch": 0.5835812672176308, + "grad_norm": 6.704071998596191, + "learning_rate": 3.766239906347704e-06, + "loss": 0.4009, + "step": 5296 + }, + { + "epoch": 0.5836914600550964, + "grad_norm": 6.265433311462402, + "learning_rate": 3.764545608269966e-06, + "loss": 0.3758, + "step": 5297 + }, + { + "epoch": 0.583801652892562, + "grad_norm": 6.239924907684326, + "learning_rate": 3.76285146126527e-06, + "loss": 0.3858, + "step": 5298 + }, + { + "epoch": 0.5839118457300275, + "grad_norm": 8.371216773986816, + "learning_rate": 3.761157465540776e-06, + "loss": 0.4182, + "step": 5299 + }, + { + "epoch": 0.5840220385674931, + "grad_norm": 9.449613571166992, + "learning_rate": 3.759463621303631e-06, + "loss": 0.428, + "step": 5300 + }, + { + "epoch": 0.5841322314049586, + "grad_norm": 6.359887599945068, + "learning_rate": 3.7577699287609613e-06, + "loss": 0.376, + "step": 5301 + }, + { + "epoch": 0.5842424242424242, + "grad_norm": 4.445738792419434, + "learning_rate": 3.756076388119868e-06, + "loss": 0.3728, + "step": 5302 + }, + { + "epoch": 0.5843526170798898, + "grad_norm": 6.449606418609619, + "learning_rate": 3.7543829995874464e-06, + "loss": 0.3856, + "step": 5303 + }, + { + "epoch": 0.5844628099173553, + "grad_norm": 7.634322643280029, + "learning_rate": 3.752689763370765e-06, + "loss": 0.3719, + "step": 5304 + }, + { + "epoch": 0.5845730027548209, + "grad_norm": 5.385597229003906, + "learning_rate": 3.750996679676869e-06, + "loss": 0.4238, + "step": 5305 + }, + { + "epoch": 0.5846831955922865, + "grad_norm": 8.28349494934082, + "learning_rate": 3.7493037487128005e-06, + "loss": 0.4034, + "step": 5306 + }, + { + "epoch": 0.584793388429752, + "grad_norm": 8.161641120910645, + "learning_rate": 3.7476109706855644e-06, + "loss": 0.3451, + "step": 5307 + }, + { + "epoch": 0.5849035812672176, + "grad_norm": 7.038846015930176, + "learning_rate": 3.745918345802162e-06, + "loss": 0.3661, + "step": 5308 + }, + { + "epoch": 0.5850137741046832, + "grad_norm": 13.376177787780762, + "learning_rate": 3.7442258742695692e-06, + "loss": 0.5265, + "step": 5309 + }, + { + "epoch": 0.5851239669421487, + "grad_norm": 6.555881023406982, + "learning_rate": 3.7425335562947394e-06, + "loss": 0.4477, + "step": 5310 + }, + { + "epoch": 0.5852341597796143, + "grad_norm": 6.7689642906188965, + "learning_rate": 3.740841392084618e-06, + "loss": 0.41, + "step": 5311 + }, + { + "epoch": 0.5853443526170798, + "grad_norm": 6.448429584503174, + "learning_rate": 3.7391493818461188e-06, + "loss": 0.4429, + "step": 5312 + }, + { + "epoch": 0.5854545454545454, + "grad_norm": 5.073526382446289, + "learning_rate": 3.7374575257861454e-06, + "loss": 0.4066, + "step": 5313 + }, + { + "epoch": 0.5855647382920111, + "grad_norm": 10.34382438659668, + "learning_rate": 3.735765824111583e-06, + "loss": 0.4164, + "step": 5314 + }, + { + "epoch": 0.5856749311294766, + "grad_norm": 10.397635459899902, + "learning_rate": 3.7340742770292922e-06, + "loss": 0.4666, + "step": 5315 + }, + { + "epoch": 0.5857851239669422, + "grad_norm": 5.910182952880859, + "learning_rate": 3.7323828847461172e-06, + "loss": 0.3822, + "step": 5316 + }, + { + "epoch": 0.5858953168044078, + "grad_norm": 
9.688176155090332, + "learning_rate": 3.730691647468886e-06, + "loss": 0.3534, + "step": 5317 + }, + { + "epoch": 0.5860055096418733, + "grad_norm": 10.011178970336914, + "learning_rate": 3.729000565404405e-06, + "loss": 0.4479, + "step": 5318 + }, + { + "epoch": 0.5861157024793389, + "grad_norm": 6.989460468292236, + "learning_rate": 3.7273096387594585e-06, + "loss": 0.3823, + "step": 5319 + }, + { + "epoch": 0.5862258953168044, + "grad_norm": 5.006401538848877, + "learning_rate": 3.7256188677408213e-06, + "loss": 0.4027, + "step": 5320 + }, + { + "epoch": 0.58633608815427, + "grad_norm": 10.467499732971191, + "learning_rate": 3.7239282525552378e-06, + "loss": 0.4665, + "step": 5321 + }, + { + "epoch": 0.5864462809917356, + "grad_norm": 6.434284687042236, + "learning_rate": 3.722237793409442e-06, + "loss": 0.428, + "step": 5322 + }, + { + "epoch": 0.5865564738292011, + "grad_norm": 5.784710884094238, + "learning_rate": 3.7205474905101454e-06, + "loss": 0.3429, + "step": 5323 + }, + { + "epoch": 0.5866666666666667, + "grad_norm": 7.143020153045654, + "learning_rate": 3.7188573440640373e-06, + "loss": 0.3901, + "step": 5324 + }, + { + "epoch": 0.5867768595041323, + "grad_norm": 6.1825761795043945, + "learning_rate": 3.717167354277795e-06, + "loss": 0.4052, + "step": 5325 + }, + { + "epoch": 0.5868870523415978, + "grad_norm": 5.028227806091309, + "learning_rate": 3.7154775213580717e-06, + "loss": 0.3879, + "step": 5326 + }, + { + "epoch": 0.5869972451790634, + "grad_norm": 7.450397968292236, + "learning_rate": 3.7137878455115005e-06, + "loss": 0.4181, + "step": 5327 + }, + { + "epoch": 0.5871074380165289, + "grad_norm": 6.3655290603637695, + "learning_rate": 3.7120983269446997e-06, + "loss": 0.4269, + "step": 5328 + }, + { + "epoch": 0.5872176308539945, + "grad_norm": 6.551460266113281, + "learning_rate": 3.710408965864265e-06, + "loss": 0.3654, + "step": 5329 + }, + { + "epoch": 0.5873278236914601, + "grad_norm": 5.877065181732178, + "learning_rate": 3.7087197624767725e-06, + "loss": 0.4024, + "step": 5330 + }, + { + "epoch": 0.5874380165289256, + "grad_norm": 6.857378959655762, + "learning_rate": 3.707030716988783e-06, + "loss": 0.4336, + "step": 5331 + }, + { + "epoch": 0.5875482093663912, + "grad_norm": 9.565370559692383, + "learning_rate": 3.7053418296068342e-06, + "loss": 0.4952, + "step": 5332 + }, + { + "epoch": 0.5876584022038568, + "grad_norm": 8.197833061218262, + "learning_rate": 3.703653100537442e-06, + "loss": 0.4648, + "step": 5333 + }, + { + "epoch": 0.5877685950413223, + "grad_norm": 10.442931175231934, + "learning_rate": 3.701964529987113e-06, + "loss": 0.4563, + "step": 5334 + }, + { + "epoch": 0.5878787878787879, + "grad_norm": 5.644998550415039, + "learning_rate": 3.7002761181623215e-06, + "loss": 0.4125, + "step": 5335 + }, + { + "epoch": 0.5879889807162535, + "grad_norm": 7.055531978607178, + "learning_rate": 3.698587865269534e-06, + "loss": 0.4379, + "step": 5336 + }, + { + "epoch": 0.588099173553719, + "grad_norm": 6.240884304046631, + "learning_rate": 3.6968997715151907e-06, + "loss": 0.427, + "step": 5337 + }, + { + "epoch": 0.5882093663911846, + "grad_norm": 5.399500846862793, + "learning_rate": 3.69521183710571e-06, + "loss": 0.3934, + "step": 5338 + }, + { + "epoch": 0.5883195592286501, + "grad_norm": 7.054924964904785, + "learning_rate": 3.6935240622475023e-06, + "loss": 0.4656, + "step": 5339 + }, + { + "epoch": 0.5884297520661157, + "grad_norm": 5.625728607177734, + "learning_rate": 3.6918364471469447e-06, + "loss": 0.3853, + "step": 5340 + }, + { + "epoch": 
0.5885399449035813, + "grad_norm": 9.67719554901123, + "learning_rate": 3.6901489920104023e-06, + "loss": 0.465, + "step": 5341 + }, + { + "epoch": 0.5886501377410468, + "grad_norm": 6.909407138824463, + "learning_rate": 3.6884616970442234e-06, + "loss": 0.3759, + "step": 5342 + }, + { + "epoch": 0.5887603305785124, + "grad_norm": 6.184091567993164, + "learning_rate": 3.6867745624547278e-06, + "loss": 0.463, + "step": 5343 + }, + { + "epoch": 0.588870523415978, + "grad_norm": 6.955829620361328, + "learning_rate": 3.6850875884482223e-06, + "loss": 0.4659, + "step": 5344 + }, + { + "epoch": 0.5889807162534435, + "grad_norm": 7.018570423126221, + "learning_rate": 3.6834007752309936e-06, + "loss": 0.3883, + "step": 5345 + }, + { + "epoch": 0.5890909090909091, + "grad_norm": 9.0138578414917, + "learning_rate": 3.6817141230093067e-06, + "loss": 0.4157, + "step": 5346 + }, + { + "epoch": 0.5892011019283746, + "grad_norm": 4.636580944061279, + "learning_rate": 3.6800276319894055e-06, + "loss": 0.4195, + "step": 5347 + }, + { + "epoch": 0.5893112947658402, + "grad_norm": 7.094888210296631, + "learning_rate": 3.6783413023775206e-06, + "loss": 0.4772, + "step": 5348 + }, + { + "epoch": 0.5894214876033058, + "grad_norm": 8.197665214538574, + "learning_rate": 3.6766551343798553e-06, + "loss": 0.3932, + "step": 5349 + }, + { + "epoch": 0.5895316804407713, + "grad_norm": 6.349987506866455, + "learning_rate": 3.6749691282025986e-06, + "loss": 0.4241, + "step": 5350 + }, + { + "epoch": 0.5896418732782369, + "grad_norm": 5.951522350311279, + "learning_rate": 3.6732832840519167e-06, + "loss": 0.3908, + "step": 5351 + }, + { + "epoch": 0.5897520661157025, + "grad_norm": 10.102296829223633, + "learning_rate": 3.6715976021339563e-06, + "loss": 0.3268, + "step": 5352 + }, + { + "epoch": 0.589862258953168, + "grad_norm": 8.515700340270996, + "learning_rate": 3.669912082654846e-06, + "loss": 0.4486, + "step": 5353 + }, + { + "epoch": 0.5899724517906336, + "grad_norm": 4.492301940917969, + "learning_rate": 3.6682267258206938e-06, + "loss": 0.4722, + "step": 5354 + }, + { + "epoch": 0.5900826446280992, + "grad_norm": 4.916325092315674, + "learning_rate": 3.666541531837585e-06, + "loss": 0.4163, + "step": 5355 + }, + { + "epoch": 0.5901928374655647, + "grad_norm": 9.863926887512207, + "learning_rate": 3.66485650091159e-06, + "loss": 0.5008, + "step": 5356 + }, + { + "epoch": 0.5903030303030303, + "grad_norm": 6.5478291511535645, + "learning_rate": 3.6631716332487556e-06, + "loss": 0.4371, + "step": 5357 + }, + { + "epoch": 0.5904132231404958, + "grad_norm": 12.289795875549316, + "learning_rate": 3.661486929055109e-06, + "loss": 0.4448, + "step": 5358 + }, + { + "epoch": 0.5905234159779614, + "grad_norm": 4.799616813659668, + "learning_rate": 3.65980238853666e-06, + "loss": 0.3825, + "step": 5359 + }, + { + "epoch": 0.590633608815427, + "grad_norm": 8.292051315307617, + "learning_rate": 3.6581180118993965e-06, + "loss": 0.4481, + "step": 5360 + }, + { + "epoch": 0.5907438016528925, + "grad_norm": 8.100136756896973, + "learning_rate": 3.6564337993492822e-06, + "loss": 0.4519, + "step": 5361 + }, + { + "epoch": 0.5908539944903581, + "grad_norm": 6.405386924743652, + "learning_rate": 3.6547497510922703e-06, + "loss": 0.3976, + "step": 5362 + }, + { + "epoch": 0.5909641873278237, + "grad_norm": 21.084047317504883, + "learning_rate": 3.6530658673342843e-06, + "loss": 0.3932, + "step": 5363 + }, + { + "epoch": 0.5910743801652892, + "grad_norm": 8.751500129699707, + "learning_rate": 3.651382148281235e-06, + "loss": 0.4776, + 
"step": 5364 + }, + { + "epoch": 0.5911845730027548, + "grad_norm": 5.426620006561279, + "learning_rate": 3.64969859413901e-06, + "loss": 0.4031, + "step": 5365 + }, + { + "epoch": 0.5912947658402203, + "grad_norm": 10.462101936340332, + "learning_rate": 3.6480152051134715e-06, + "loss": 0.4014, + "step": 5366 + }, + { + "epoch": 0.5914049586776859, + "grad_norm": 6.521287441253662, + "learning_rate": 3.6463319814104734e-06, + "loss": 0.5193, + "step": 5367 + }, + { + "epoch": 0.5915151515151515, + "grad_norm": 4.917036056518555, + "learning_rate": 3.6446489232358385e-06, + "loss": 0.4223, + "step": 5368 + }, + { + "epoch": 0.591625344352617, + "grad_norm": 8.916620254516602, + "learning_rate": 3.6429660307953723e-06, + "loss": 0.419, + "step": 5369 + }, + { + "epoch": 0.5917355371900826, + "grad_norm": 5.543242931365967, + "learning_rate": 3.6412833042948663e-06, + "loss": 0.4031, + "step": 5370 + }, + { + "epoch": 0.5918457300275483, + "grad_norm": 7.404268741607666, + "learning_rate": 3.6396007439400826e-06, + "loss": 0.3445, + "step": 5371 + }, + { + "epoch": 0.5919559228650138, + "grad_norm": 9.950736999511719, + "learning_rate": 3.6379183499367667e-06, + "loss": 0.451, + "step": 5372 + }, + { + "epoch": 0.5920661157024794, + "grad_norm": 5.994956970214844, + "learning_rate": 3.6362361224906463e-06, + "loss": 0.4803, + "step": 5373 + }, + { + "epoch": 0.5921763085399449, + "grad_norm": 4.3853230476379395, + "learning_rate": 3.634554061807425e-06, + "loss": 0.3754, + "step": 5374 + }, + { + "epoch": 0.5922865013774105, + "grad_norm": 8.13461971282959, + "learning_rate": 3.6328721680927868e-06, + "loss": 0.5147, + "step": 5375 + }, + { + "epoch": 0.5923966942148761, + "grad_norm": 4.483111381530762, + "learning_rate": 3.631190441552398e-06, + "loss": 0.3936, + "step": 5376 + }, + { + "epoch": 0.5925068870523416, + "grad_norm": 5.281414985656738, + "learning_rate": 3.6295088823919005e-06, + "loss": 0.4366, + "step": 5377 + }, + { + "epoch": 0.5926170798898072, + "grad_norm": 10.191956520080566, + "learning_rate": 3.62782749081692e-06, + "loss": 0.5176, + "step": 5378 + }, + { + "epoch": 0.5927272727272728, + "grad_norm": 13.563871383666992, + "learning_rate": 3.6261462670330573e-06, + "loss": 0.4426, + "step": 5379 + }, + { + "epoch": 0.5928374655647383, + "grad_norm": 9.997957229614258, + "learning_rate": 3.624465211245894e-06, + "loss": 0.3575, + "step": 5380 + }, + { + "epoch": 0.5929476584022039, + "grad_norm": 5.238548278808594, + "learning_rate": 3.622784323660994e-06, + "loss": 0.3669, + "step": 5381 + }, + { + "epoch": 0.5930578512396695, + "grad_norm": 4.02475118637085, + "learning_rate": 3.621103604483899e-06, + "loss": 0.3908, + "step": 5382 + }, + { + "epoch": 0.593168044077135, + "grad_norm": 11.147467613220215, + "learning_rate": 3.6194230539201256e-06, + "loss": 0.4198, + "step": 5383 + }, + { + "epoch": 0.5932782369146006, + "grad_norm": 9.837335586547852, + "learning_rate": 3.6177426721751786e-06, + "loss": 0.4558, + "step": 5384 + }, + { + "epoch": 0.5933884297520661, + "grad_norm": 5.696954250335693, + "learning_rate": 3.6160624594545347e-06, + "loss": 0.4837, + "step": 5385 + }, + { + "epoch": 0.5934986225895317, + "grad_norm": 6.0080084800720215, + "learning_rate": 3.614382415963652e-06, + "loss": 0.3908, + "step": 5386 + }, + { + "epoch": 0.5936088154269973, + "grad_norm": 5.469796657562256, + "learning_rate": 3.6127025419079714e-06, + "loss": 0.3436, + "step": 5387 + }, + { + "epoch": 0.5937190082644628, + "grad_norm": 5.619692802429199, + "learning_rate": 
3.611022837492908e-06, + "loss": 0.4052, + "step": 5388 + }, + { + "epoch": 0.5938292011019284, + "grad_norm": 4.303910255432129, + "learning_rate": 3.6093433029238576e-06, + "loss": 0.3026, + "step": 5389 + }, + { + "epoch": 0.593939393939394, + "grad_norm": 4.820587158203125, + "learning_rate": 3.6076639384061985e-06, + "loss": 0.3967, + "step": 5390 + }, + { + "epoch": 0.5940495867768595, + "grad_norm": 6.786526679992676, + "learning_rate": 3.6059847441452835e-06, + "loss": 0.3944, + "step": 5391 + }, + { + "epoch": 0.5941597796143251, + "grad_norm": 6.68752384185791, + "learning_rate": 3.6043057203464483e-06, + "loss": 0.452, + "step": 5392 + }, + { + "epoch": 0.5942699724517906, + "grad_norm": 8.34605598449707, + "learning_rate": 3.602626867215006e-06, + "loss": 0.4646, + "step": 5393 + }, + { + "epoch": 0.5943801652892562, + "grad_norm": 10.381217002868652, + "learning_rate": 3.600948184956246e-06, + "loss": 0.3274, + "step": 5394 + }, + { + "epoch": 0.5944903581267218, + "grad_norm": 7.704305648803711, + "learning_rate": 3.599269673775444e-06, + "loss": 0.3973, + "step": 5395 + }, + { + "epoch": 0.5946005509641873, + "grad_norm": 4.4532623291015625, + "learning_rate": 3.5975913338778513e-06, + "loss": 0.415, + "step": 5396 + }, + { + "epoch": 0.5947107438016529, + "grad_norm": 3.834787130355835, + "learning_rate": 3.595913165468691e-06, + "loss": 0.3236, + "step": 5397 + }, + { + "epoch": 0.5948209366391185, + "grad_norm": 8.35515308380127, + "learning_rate": 3.5942351687531795e-06, + "loss": 0.4118, + "step": 5398 + }, + { + "epoch": 0.594931129476584, + "grad_norm": 11.47801399230957, + "learning_rate": 3.5925573439364996e-06, + "loss": 0.4985, + "step": 5399 + }, + { + "epoch": 0.5950413223140496, + "grad_norm": 23.297760009765625, + "learning_rate": 3.5908796912238174e-06, + "loss": 0.4167, + "step": 5400 + }, + { + "epoch": 0.5951515151515151, + "grad_norm": 5.2379937171936035, + "learning_rate": 3.589202210820285e-06, + "loss": 0.4222, + "step": 5401 + }, + { + "epoch": 0.5952617079889807, + "grad_norm": 10.262834548950195, + "learning_rate": 3.5875249029310204e-06, + "loss": 0.4393, + "step": 5402 + }, + { + "epoch": 0.5953719008264463, + "grad_norm": 8.890426635742188, + "learning_rate": 3.585847767761129e-06, + "loss": 0.3535, + "step": 5403 + }, + { + "epoch": 0.5954820936639118, + "grad_norm": 5.533564567565918, + "learning_rate": 3.584170805515694e-06, + "loss": 0.3808, + "step": 5404 + }, + { + "epoch": 0.5955922865013774, + "grad_norm": 8.540761947631836, + "learning_rate": 3.5824940163997757e-06, + "loss": 0.4659, + "step": 5405 + }, + { + "epoch": 0.595702479338843, + "grad_norm": 10.449670791625977, + "learning_rate": 3.580817400618415e-06, + "loss": 0.4835, + "step": 5406 + }, + { + "epoch": 0.5958126721763085, + "grad_norm": 4.463833808898926, + "learning_rate": 3.57914095837663e-06, + "loss": 0.4041, + "step": 5407 + }, + { + "epoch": 0.5959228650137741, + "grad_norm": 5.430807113647461, + "learning_rate": 3.5774646898794186e-06, + "loss": 0.433, + "step": 5408 + }, + { + "epoch": 0.5960330578512397, + "grad_norm": 6.412762641906738, + "learning_rate": 3.5757885953317578e-06, + "loss": 0.3562, + "step": 5409 + }, + { + "epoch": 0.5961432506887052, + "grad_norm": 4.589330673217773, + "learning_rate": 3.5741126749386025e-06, + "loss": 0.3971, + "step": 5410 + }, + { + "epoch": 0.5962534435261708, + "grad_norm": 6.2130045890808105, + "learning_rate": 3.5724369289048845e-06, + "loss": 0.4398, + "step": 5411 + }, + { + "epoch": 0.5963636363636363, + "grad_norm": 
8.505125999450684, + "learning_rate": 3.5707613574355194e-06, + "loss": 0.3777, + "step": 5412 + }, + { + "epoch": 0.5964738292011019, + "grad_norm": 6.111608505249023, + "learning_rate": 3.569085960735397e-06, + "loss": 0.4284, + "step": 5413 + }, + { + "epoch": 0.5965840220385675, + "grad_norm": 6.80982780456543, + "learning_rate": 3.567410739009386e-06, + "loss": 0.4572, + "step": 5414 + }, + { + "epoch": 0.596694214876033, + "grad_norm": 7.65366268157959, + "learning_rate": 3.5657356924623367e-06, + "loss": 0.4361, + "step": 5415 + }, + { + "epoch": 0.5968044077134986, + "grad_norm": 7.172601699829102, + "learning_rate": 3.564060821299076e-06, + "loss": 0.4988, + "step": 5416 + }, + { + "epoch": 0.5969146005509642, + "grad_norm": 5.870329856872559, + "learning_rate": 3.5623861257244062e-06, + "loss": 0.3892, + "step": 5417 + }, + { + "epoch": 0.5970247933884297, + "grad_norm": 10.348566055297852, + "learning_rate": 3.560711605943116e-06, + "loss": 0.3357, + "step": 5418 + }, + { + "epoch": 0.5971349862258953, + "grad_norm": 4.690185546875, + "learning_rate": 3.5590372621599634e-06, + "loss": 0.4008, + "step": 5419 + }, + { + "epoch": 0.5972451790633608, + "grad_norm": 4.000380039215088, + "learning_rate": 3.5573630945796934e-06, + "loss": 0.3698, + "step": 5420 + }, + { + "epoch": 0.5973553719008264, + "grad_norm": 5.777688026428223, + "learning_rate": 3.555689103407024e-06, + "loss": 0.3873, + "step": 5421 + }, + { + "epoch": 0.597465564738292, + "grad_norm": 13.219320297241211, + "learning_rate": 3.5540152888466515e-06, + "loss": 0.5007, + "step": 5422 + }, + { + "epoch": 0.5975757575757575, + "grad_norm": 7.085648059844971, + "learning_rate": 3.552341651103255e-06, + "loss": 0.449, + "step": 5423 + }, + { + "epoch": 0.5976859504132231, + "grad_norm": 8.671232223510742, + "learning_rate": 3.550668190381489e-06, + "loss": 0.4129, + "step": 5424 + }, + { + "epoch": 0.5977961432506887, + "grad_norm": 10.42537784576416, + "learning_rate": 3.548994906885982e-06, + "loss": 0.4701, + "step": 5425 + }, + { + "epoch": 0.5979063360881542, + "grad_norm": 4.7576494216918945, + "learning_rate": 3.547321800821353e-06, + "loss": 0.3749, + "step": 5426 + }, + { + "epoch": 0.5980165289256199, + "grad_norm": 9.288346290588379, + "learning_rate": 3.545648872392185e-06, + "loss": 0.3779, + "step": 5427 + }, + { + "epoch": 0.5981267217630853, + "grad_norm": 7.297476768493652, + "learning_rate": 3.5439761218030465e-06, + "loss": 0.3754, + "step": 5428 + }, + { + "epoch": 0.598236914600551, + "grad_norm": 7.915855884552002, + "learning_rate": 3.542303549258489e-06, + "loss": 0.3809, + "step": 5429 + }, + { + "epoch": 0.5983471074380166, + "grad_norm": 9.086050033569336, + "learning_rate": 3.540631154963033e-06, + "loss": 0.3856, + "step": 5430 + }, + { + "epoch": 0.598457300275482, + "grad_norm": 6.687013149261475, + "learning_rate": 3.5389589391211805e-06, + "loss": 0.4279, + "step": 5431 + }, + { + "epoch": 0.5985674931129477, + "grad_norm": 4.62927770614624, + "learning_rate": 3.5372869019374146e-06, + "loss": 0.3981, + "step": 5432 + }, + { + "epoch": 0.5986776859504133, + "grad_norm": 6.236980438232422, + "learning_rate": 3.5356150436161908e-06, + "loss": 0.3737, + "step": 5433 + }, + { + "epoch": 0.5987878787878788, + "grad_norm": 8.671568870544434, + "learning_rate": 3.5339433643619515e-06, + "loss": 0.4959, + "step": 5434 + }, + { + "epoch": 0.5988980716253444, + "grad_norm": 4.198315620422363, + "learning_rate": 3.5322718643791087e-06, + "loss": 0.3807, + "step": 5435 + }, + { + "epoch": 
0.59900826446281, + "grad_norm": 5.558796405792236, + "learning_rate": 3.530600543872055e-06, + "loss": 0.426, + "step": 5436 + }, + { + "epoch": 0.5991184573002755, + "grad_norm": 5.832391262054443, + "learning_rate": 3.528929403045163e-06, + "loss": 0.42, + "step": 5437 + }, + { + "epoch": 0.5992286501377411, + "grad_norm": 5.3307929039001465, + "learning_rate": 3.5272584421027823e-06, + "loss": 0.3636, + "step": 5438 + }, + { + "epoch": 0.5993388429752066, + "grad_norm": 6.033764362335205, + "learning_rate": 3.525587661249239e-06, + "loss": 0.3113, + "step": 5439 + }, + { + "epoch": 0.5994490358126722, + "grad_norm": 4.488221168518066, + "learning_rate": 3.5239170606888405e-06, + "loss": 0.3893, + "step": 5440 + }, + { + "epoch": 0.5995592286501378, + "grad_norm": 4.295153617858887, + "learning_rate": 3.522246640625868e-06, + "loss": 0.375, + "step": 5441 + }, + { + "epoch": 0.5996694214876033, + "grad_norm": 4.770776748657227, + "learning_rate": 3.520576401264584e-06, + "loss": 0.4244, + "step": 5442 + }, + { + "epoch": 0.5997796143250689, + "grad_norm": 7.971227645874023, + "learning_rate": 3.5189063428092276e-06, + "loss": 0.4109, + "step": 5443 + }, + { + "epoch": 0.5998898071625345, + "grad_norm": 11.983476638793945, + "learning_rate": 3.5172364654640144e-06, + "loss": 0.5333, + "step": 5444 + }, + { + "epoch": 0.6, + "grad_norm": 6.334736347198486, + "learning_rate": 3.5155667694331408e-06, + "loss": 0.4252, + "step": 5445 + }, + { + "epoch": 0.6001101928374656, + "grad_norm": 4.475926876068115, + "learning_rate": 3.5138972549207794e-06, + "loss": 0.3493, + "step": 5446 + }, + { + "epoch": 0.6002203856749311, + "grad_norm": 4.665364742279053, + "learning_rate": 3.5122279221310795e-06, + "loss": 0.4304, + "step": 5447 + }, + { + "epoch": 0.6003305785123967, + "grad_norm": 7.550339221954346, + "learning_rate": 3.510558771268171e-06, + "loss": 0.4307, + "step": 5448 + }, + { + "epoch": 0.6003305785123967, + "eval_loss": 0.4078207314014435, + "eval_runtime": 41.9468, + "eval_samples_per_second": 17.498, + "eval_steps_per_second": 2.193, + "step": 5448 + }, + { + "epoch": 0.6004407713498623, + "grad_norm": 7.081995487213135, + "learning_rate": 3.5088898025361596e-06, + "loss": 0.3884, + "step": 5449 + }, + { + "epoch": 0.6005509641873278, + "grad_norm": 7.735726356506348, + "learning_rate": 3.5072210161391273e-06, + "loss": 0.5048, + "step": 5450 + }, + { + "epoch": 0.6006611570247934, + "grad_norm": 6.870943069458008, + "learning_rate": 3.505552412281138e-06, + "loss": 0.4516, + "step": 5451 + }, + { + "epoch": 0.600771349862259, + "grad_norm": 6.664737701416016, + "learning_rate": 3.5038839911662303e-06, + "loss": 0.3871, + "step": 5452 + }, + { + "epoch": 0.6008815426997245, + "grad_norm": 7.977838516235352, + "learning_rate": 3.502215752998418e-06, + "loss": 0.4709, + "step": 5453 + }, + { + "epoch": 0.6009917355371901, + "grad_norm": 8.963409423828125, + "learning_rate": 3.5005476979816992e-06, + "loss": 0.4825, + "step": 5454 + }, + { + "epoch": 0.6011019283746556, + "grad_norm": 9.17885684967041, + "learning_rate": 3.4988798263200463e-06, + "loss": 0.4628, + "step": 5455 + }, + { + "epoch": 0.6012121212121212, + "grad_norm": 7.315991401672363, + "learning_rate": 3.497212138217404e-06, + "loss": 0.3959, + "step": 5456 + }, + { + "epoch": 0.6013223140495868, + "grad_norm": 4.886915683746338, + "learning_rate": 3.4955446338777064e-06, + "loss": 0.3892, + "step": 5457 + }, + { + "epoch": 0.6014325068870523, + "grad_norm": 5.84981107711792, + "learning_rate": 3.493877313504853e-06, + 
"loss": 0.3855, + "step": 5458 + }, + { + "epoch": 0.6015426997245179, + "grad_norm": 6.625370502471924, + "learning_rate": 3.492210177302727e-06, + "loss": 0.4021, + "step": 5459 + }, + { + "epoch": 0.6016528925619835, + "grad_norm": 10.513236999511719, + "learning_rate": 3.490543225475189e-06, + "loss": 0.5396, + "step": 5460 + }, + { + "epoch": 0.601763085399449, + "grad_norm": 14.246024131774902, + "learning_rate": 3.488876458226075e-06, + "loss": 0.4531, + "step": 5461 + }, + { + "epoch": 0.6018732782369146, + "grad_norm": 6.541625499725342, + "learning_rate": 3.487209875759202e-06, + "loss": 0.3737, + "step": 5462 + }, + { + "epoch": 0.6019834710743802, + "grad_norm": 10.214615821838379, + "learning_rate": 3.4855434782783603e-06, + "loss": 0.441, + "step": 5463 + }, + { + "epoch": 0.6020936639118457, + "grad_norm": 8.126017570495605, + "learning_rate": 3.4838772659873175e-06, + "loss": 0.3262, + "step": 5464 + }, + { + "epoch": 0.6022038567493113, + "grad_norm": 9.761160850524902, + "learning_rate": 3.4822112390898233e-06, + "loss": 0.3774, + "step": 5465 + }, + { + "epoch": 0.6023140495867768, + "grad_norm": 5.821922779083252, + "learning_rate": 3.480545397789601e-06, + "loss": 0.4295, + "step": 5466 + }, + { + "epoch": 0.6024242424242424, + "grad_norm": 5.46732234954834, + "learning_rate": 3.4788797422903496e-06, + "loss": 0.3349, + "step": 5467 + }, + { + "epoch": 0.602534435261708, + "grad_norm": 6.44102144241333, + "learning_rate": 3.4772142727957515e-06, + "loss": 0.4165, + "step": 5468 + }, + { + "epoch": 0.6026446280991735, + "grad_norm": 5.899956703186035, + "learning_rate": 3.4755489895094596e-06, + "loss": 0.4426, + "step": 5469 + }, + { + "epoch": 0.6027548209366391, + "grad_norm": 4.245342254638672, + "learning_rate": 3.4738838926351078e-06, + "loss": 0.3504, + "step": 5470 + }, + { + "epoch": 0.6028650137741047, + "grad_norm": 5.119142055511475, + "learning_rate": 3.4722189823763075e-06, + "loss": 0.3728, + "step": 5471 + }, + { + "epoch": 0.6029752066115702, + "grad_norm": 5.943778038024902, + "learning_rate": 3.470554258936645e-06, + "loss": 0.3109, + "step": 5472 + }, + { + "epoch": 0.6030853994490358, + "grad_norm": 5.078598976135254, + "learning_rate": 3.4688897225196845e-06, + "loss": 0.3263, + "step": 5473 + }, + { + "epoch": 0.6031955922865013, + "grad_norm": 6.279187202453613, + "learning_rate": 3.4672253733289694e-06, + "loss": 0.4356, + "step": 5474 + }, + { + "epoch": 0.6033057851239669, + "grad_norm": 5.126062393188477, + "learning_rate": 3.4655612115680172e-06, + "loss": 0.3613, + "step": 5475 + }, + { + "epoch": 0.6034159779614325, + "grad_norm": 5.338726043701172, + "learning_rate": 3.4638972374403246e-06, + "loss": 0.3471, + "step": 5476 + }, + { + "epoch": 0.603526170798898, + "grad_norm": 10.157291412353516, + "learning_rate": 3.462233451149365e-06, + "loss": 0.4231, + "step": 5477 + }, + { + "epoch": 0.6036363636363636, + "grad_norm": 5.979432582855225, + "learning_rate": 3.4605698528985866e-06, + "loss": 0.3649, + "step": 5478 + }, + { + "epoch": 0.6037465564738292, + "grad_norm": 6.608692169189453, + "learning_rate": 3.4589064428914186e-06, + "loss": 0.4855, + "step": 5479 + }, + { + "epoch": 0.6038567493112947, + "grad_norm": 15.17431640625, + "learning_rate": 3.457243221331266e-06, + "loss": 0.5176, + "step": 5480 + }, + { + "epoch": 0.6039669421487603, + "grad_norm": 5.733303070068359, + "learning_rate": 3.4555801884215036e-06, + "loss": 0.3628, + "step": 5481 + }, + { + "epoch": 0.604077134986226, + "grad_norm": 9.653572082519531, + 
"learning_rate": 3.453917344365496e-06, + "loss": 0.4355, + "step": 5482 + }, + { + "epoch": 0.6041873278236914, + "grad_norm": 6.833795070648193, + "learning_rate": 3.452254689366577e-06, + "loss": 0.3556, + "step": 5483 + }, + { + "epoch": 0.604297520661157, + "grad_norm": 5.048872470855713, + "learning_rate": 3.450592223628054e-06, + "loss": 0.3802, + "step": 5484 + }, + { + "epoch": 0.6044077134986225, + "grad_norm": 6.3752546310424805, + "learning_rate": 3.4489299473532212e-06, + "loss": 0.3629, + "step": 5485 + }, + { + "epoch": 0.6045179063360882, + "grad_norm": 15.690077781677246, + "learning_rate": 3.4472678607453406e-06, + "loss": 0.444, + "step": 5486 + }, + { + "epoch": 0.6046280991735538, + "grad_norm": 5.63066291809082, + "learning_rate": 3.4456059640076535e-06, + "loss": 0.438, + "step": 5487 + }, + { + "epoch": 0.6047382920110193, + "grad_norm": 11.27439022064209, + "learning_rate": 3.4439442573433834e-06, + "loss": 0.425, + "step": 5488 + }, + { + "epoch": 0.6048484848484849, + "grad_norm": 4.574676990509033, + "learning_rate": 3.4422827409557197e-06, + "loss": 0.3495, + "step": 5489 + }, + { + "epoch": 0.6049586776859505, + "grad_norm": 12.163301467895508, + "learning_rate": 3.4406214150478417e-06, + "loss": 0.4243, + "step": 5490 + }, + { + "epoch": 0.605068870523416, + "grad_norm": 5.973365306854248, + "learning_rate": 3.4389602798228942e-06, + "loss": 0.3563, + "step": 5491 + }, + { + "epoch": 0.6051790633608816, + "grad_norm": 6.112514019012451, + "learning_rate": 3.4372993354840034e-06, + "loss": 0.4301, + "step": 5492 + }, + { + "epoch": 0.6052892561983471, + "grad_norm": 6.745209217071533, + "learning_rate": 3.4356385822342734e-06, + "loss": 0.4964, + "step": 5493 + }, + { + "epoch": 0.6053994490358127, + "grad_norm": 6.515758991241455, + "learning_rate": 3.4339780202767824e-06, + "loss": 0.4525, + "step": 5494 + }, + { + "epoch": 0.6055096418732783, + "grad_norm": 7.227394104003906, + "learning_rate": 3.432317649814586e-06, + "loss": 0.3632, + "step": 5495 + }, + { + "epoch": 0.6056198347107438, + "grad_norm": 6.263385772705078, + "learning_rate": 3.430657471050717e-06, + "loss": 0.3573, + "step": 5496 + }, + { + "epoch": 0.6057300275482094, + "grad_norm": 7.537135601043701, + "learning_rate": 3.4289974841881848e-06, + "loss": 0.423, + "step": 5497 + }, + { + "epoch": 0.605840220385675, + "grad_norm": 7.491336822509766, + "learning_rate": 3.4273376894299726e-06, + "loss": 0.4863, + "step": 5498 + }, + { + "epoch": 0.6059504132231405, + "grad_norm": 6.500481128692627, + "learning_rate": 3.4256780869790456e-06, + "loss": 0.4532, + "step": 5499 + }, + { + "epoch": 0.6060606060606061, + "grad_norm": 7.756498336791992, + "learning_rate": 3.4240186770383406e-06, + "loss": 0.3945, + "step": 5500 + }, + { + "epoch": 0.6061707988980716, + "grad_norm": 7.27528715133667, + "learning_rate": 3.422359459810771e-06, + "loss": 0.4888, + "step": 5501 + }, + { + "epoch": 0.6062809917355372, + "grad_norm": 6.987998008728027, + "learning_rate": 3.420700435499231e-06, + "loss": 0.3375, + "step": 5502 + }, + { + "epoch": 0.6063911845730028, + "grad_norm": 7.22456169128418, + "learning_rate": 3.419041604306586e-06, + "loss": 0.4189, + "step": 5503 + }, + { + "epoch": 0.6065013774104683, + "grad_norm": 18.85123062133789, + "learning_rate": 3.4173829664356823e-06, + "loss": 0.5285, + "step": 5504 + }, + { + "epoch": 0.6066115702479339, + "grad_norm": 4.069933891296387, + "learning_rate": 3.4157245220893387e-06, + "loss": 0.4348, + "step": 5505 + }, + { + "epoch": 0.6067217630853995, + 
"grad_norm": 6.977851867675781, + "learning_rate": 3.414066271470352e-06, + "loss": 0.415, + "step": 5506 + }, + { + "epoch": 0.606831955922865, + "grad_norm": 5.772762775421143, + "learning_rate": 3.4124082147814972e-06, + "loss": 0.4229, + "step": 5507 + }, + { + "epoch": 0.6069421487603306, + "grad_norm": 4.402000904083252, + "learning_rate": 3.410750352225522e-06, + "loss": 0.4296, + "step": 5508 + }, + { + "epoch": 0.6070523415977962, + "grad_norm": 5.187463760375977, + "learning_rate": 3.409092684005152e-06, + "loss": 0.3901, + "step": 5509 + }, + { + "epoch": 0.6071625344352617, + "grad_norm": 9.41743278503418, + "learning_rate": 3.40743521032309e-06, + "loss": 0.525, + "step": 5510 + }, + { + "epoch": 0.6072727272727273, + "grad_norm": 10.870046615600586, + "learning_rate": 3.405777931382015e-06, + "loss": 0.5082, + "step": 5511 + }, + { + "epoch": 0.6073829201101928, + "grad_norm": 9.105496406555176, + "learning_rate": 3.404120847384577e-06, + "loss": 0.4436, + "step": 5512 + }, + { + "epoch": 0.6074931129476584, + "grad_norm": 4.680967330932617, + "learning_rate": 3.402463958533413e-06, + "loss": 0.3457, + "step": 5513 + }, + { + "epoch": 0.607603305785124, + "grad_norm": 7.194051265716553, + "learning_rate": 3.4008072650311248e-06, + "loss": 0.3418, + "step": 5514 + }, + { + "epoch": 0.6077134986225895, + "grad_norm": 3.9928011894226074, + "learning_rate": 3.3991507670802943e-06, + "loss": 0.4321, + "step": 5515 + }, + { + "epoch": 0.6078236914600551, + "grad_norm": 8.524062156677246, + "learning_rate": 3.397494464883486e-06, + "loss": 0.4718, + "step": 5516 + }, + { + "epoch": 0.6079338842975207, + "grad_norm": 5.69117546081543, + "learning_rate": 3.395838358643228e-06, + "loss": 0.3543, + "step": 5517 + }, + { + "epoch": 0.6080440771349862, + "grad_norm": 7.059926509857178, + "learning_rate": 3.394182448562038e-06, + "loss": 0.3468, + "step": 5518 + }, + { + "epoch": 0.6081542699724518, + "grad_norm": 5.54630708694458, + "learning_rate": 3.392526734842398e-06, + "loss": 0.3813, + "step": 5519 + }, + { + "epoch": 0.6082644628099173, + "grad_norm": 5.180455684661865, + "learning_rate": 3.39087121768677e-06, + "loss": 0.4141, + "step": 5520 + }, + { + "epoch": 0.6083746556473829, + "grad_norm": 10.610570907592773, + "learning_rate": 3.3892158972975996e-06, + "loss": 0.4915, + "step": 5521 + }, + { + "epoch": 0.6084848484848485, + "grad_norm": 7.216747760772705, + "learning_rate": 3.387560773877295e-06, + "loss": 0.3115, + "step": 5522 + }, + { + "epoch": 0.608595041322314, + "grad_norm": 4.480962753295898, + "learning_rate": 3.385905847628249e-06, + "loss": 0.3949, + "step": 5523 + }, + { + "epoch": 0.6087052341597796, + "grad_norm": 8.370291709899902, + "learning_rate": 3.384251118752829e-06, + "loss": 0.4791, + "step": 5524 + }, + { + "epoch": 0.6088154269972452, + "grad_norm": 5.0554656982421875, + "learning_rate": 3.382596587453378e-06, + "loss": 0.3885, + "step": 5525 + }, + { + "epoch": 0.6089256198347107, + "grad_norm": 7.361545085906982, + "learning_rate": 3.3809422539322114e-06, + "loss": 0.392, + "step": 5526 + }, + { + "epoch": 0.6090358126721763, + "grad_norm": 5.282715320587158, + "learning_rate": 3.3792881183916264e-06, + "loss": 0.3994, + "step": 5527 + }, + { + "epoch": 0.6091460055096418, + "grad_norm": 7.640480995178223, + "learning_rate": 3.3776341810338918e-06, + "loss": 0.4252, + "step": 5528 + }, + { + "epoch": 0.6092561983471074, + "grad_norm": 11.432671546936035, + "learning_rate": 3.3759804420612523e-06, + "loss": 0.4544, + "step": 5529 + }, + { + 
"epoch": 0.609366391184573, + "grad_norm": 4.013760566711426, + "learning_rate": 3.3743269016759315e-06, + "loss": 0.3352, + "step": 5530 + }, + { + "epoch": 0.6094765840220385, + "grad_norm": 6.269103050231934, + "learning_rate": 3.372673560080124e-06, + "loss": 0.4492, + "step": 5531 + }, + { + "epoch": 0.6095867768595041, + "grad_norm": 10.109606742858887, + "learning_rate": 3.3710204174760057e-06, + "loss": 0.4854, + "step": 5532 + }, + { + "epoch": 0.6096969696969697, + "grad_norm": 7.672402858734131, + "learning_rate": 3.3693674740657232e-06, + "loss": 0.395, + "step": 5533 + }, + { + "epoch": 0.6098071625344352, + "grad_norm": 5.026207447052002, + "learning_rate": 3.3677147300514003e-06, + "loss": 0.3911, + "step": 5534 + }, + { + "epoch": 0.6099173553719008, + "grad_norm": 6.89813232421875, + "learning_rate": 3.366062185635138e-06, + "loss": 0.3677, + "step": 5535 + }, + { + "epoch": 0.6100275482093664, + "grad_norm": 9.764766693115234, + "learning_rate": 3.3644098410190116e-06, + "loss": 0.4638, + "step": 5536 + }, + { + "epoch": 0.6101377410468319, + "grad_norm": 12.982881546020508, + "learning_rate": 3.3627576964050703e-06, + "loss": 0.4426, + "step": 5537 + }, + { + "epoch": 0.6102479338842975, + "grad_norm": 6.899441719055176, + "learning_rate": 3.3611057519953426e-06, + "loss": 0.4431, + "step": 5538 + }, + { + "epoch": 0.610358126721763, + "grad_norm": 7.647733688354492, + "learning_rate": 3.3594540079918314e-06, + "loss": 0.4726, + "step": 5539 + }, + { + "epoch": 0.6104683195592286, + "grad_norm": 5.014935493469238, + "learning_rate": 3.357802464596509e-06, + "loss": 0.4366, + "step": 5540 + }, + { + "epoch": 0.6105785123966943, + "grad_norm": 13.708841323852539, + "learning_rate": 3.3561511220113342e-06, + "loss": 0.3356, + "step": 5541 + }, + { + "epoch": 0.6106887052341597, + "grad_norm": 5.648785591125488, + "learning_rate": 3.354499980438234e-06, + "loss": 0.3524, + "step": 5542 + }, + { + "epoch": 0.6107988980716254, + "grad_norm": 7.1190667152404785, + "learning_rate": 3.352849040079108e-06, + "loss": 0.4991, + "step": 5543 + }, + { + "epoch": 0.610909090909091, + "grad_norm": 5.480741500854492, + "learning_rate": 3.3511983011358423e-06, + "loss": 0.4242, + "step": 5544 + }, + { + "epoch": 0.6110192837465565, + "grad_norm": 5.00535249710083, + "learning_rate": 3.349547763810285e-06, + "loss": 0.4734, + "step": 5545 + }, + { + "epoch": 0.6111294765840221, + "grad_norm": 7.967171669006348, + "learning_rate": 3.347897428304272e-06, + "loss": 0.3367, + "step": 5546 + }, + { + "epoch": 0.6112396694214876, + "grad_norm": 10.291219711303711, + "learning_rate": 3.3462472948196044e-06, + "loss": 0.4168, + "step": 5547 + }, + { + "epoch": 0.6113498622589532, + "grad_norm": 10.36643123626709, + "learning_rate": 3.3445973635580626e-06, + "loss": 0.4614, + "step": 5548 + }, + { + "epoch": 0.6114600550964188, + "grad_norm": 8.678853034973145, + "learning_rate": 3.342947634721406e-06, + "loss": 0.3642, + "step": 5549 + }, + { + "epoch": 0.6115702479338843, + "grad_norm": 6.127782821655273, + "learning_rate": 3.341298108511363e-06, + "loss": 0.3883, + "step": 5550 + }, + { + "epoch": 0.6116804407713499, + "grad_norm": 7.922155380249023, + "learning_rate": 3.33964878512964e-06, + "loss": 0.4173, + "step": 5551 + }, + { + "epoch": 0.6117906336088155, + "grad_norm": 7.354754447937012, + "learning_rate": 3.337999664777919e-06, + "loss": 0.3638, + "step": 5552 + }, + { + "epoch": 0.611900826446281, + "grad_norm": 5.784652233123779, + "learning_rate": 3.336350747657857e-06, + "loss": 
0.4165, + "step": 5553 + }, + { + "epoch": 0.6120110192837466, + "grad_norm": 10.026205062866211, + "learning_rate": 3.3347020339710844e-06, + "loss": 0.4651, + "step": 5554 + }, + { + "epoch": 0.6121212121212121, + "grad_norm": 9.611244201660156, + "learning_rate": 3.33305352391921e-06, + "loss": 0.4452, + "step": 5555 + }, + { + "epoch": 0.6122314049586777, + "grad_norm": 8.440895080566406, + "learning_rate": 3.3314052177038147e-06, + "loss": 0.413, + "step": 5556 + }, + { + "epoch": 0.6123415977961433, + "grad_norm": 5.659351348876953, + "learning_rate": 3.329757115526456e-06, + "loss": 0.4292, + "step": 5557 + }, + { + "epoch": 0.6124517906336088, + "grad_norm": 14.748259544372559, + "learning_rate": 3.3281092175886665e-06, + "loss": 0.4598, + "step": 5558 + }, + { + "epoch": 0.6125619834710744, + "grad_norm": 5.136844635009766, + "learning_rate": 3.326461524091952e-06, + "loss": 0.3272, + "step": 5559 + }, + { + "epoch": 0.61267217630854, + "grad_norm": 5.436107158660889, + "learning_rate": 3.3248140352377957e-06, + "loss": 0.3365, + "step": 5560 + }, + { + "epoch": 0.6127823691460055, + "grad_norm": 5.402589797973633, + "learning_rate": 3.3231667512276553e-06, + "loss": 0.3975, + "step": 5561 + }, + { + "epoch": 0.6128925619834711, + "grad_norm": 5.3792595863342285, + "learning_rate": 3.321519672262962e-06, + "loss": 0.363, + "step": 5562 + }, + { + "epoch": 0.6130027548209367, + "grad_norm": 8.311089515686035, + "learning_rate": 3.319872798545123e-06, + "loss": 0.4721, + "step": 5563 + }, + { + "epoch": 0.6131129476584022, + "grad_norm": 6.57785701751709, + "learning_rate": 3.3182261302755216e-06, + "loss": 0.4175, + "step": 5564 + }, + { + "epoch": 0.6132231404958678, + "grad_norm": 7.138577461242676, + "learning_rate": 3.3165796676555118e-06, + "loss": 0.4104, + "step": 5565 + }, + { + "epoch": 0.6133333333333333, + "grad_norm": 5.004648685455322, + "learning_rate": 3.3149334108864273e-06, + "loss": 0.423, + "step": 5566 + }, + { + "epoch": 0.6134435261707989, + "grad_norm": 10.52160930633545, + "learning_rate": 3.3132873601695764e-06, + "loss": 0.5053, + "step": 5567 + }, + { + "epoch": 0.6135537190082645, + "grad_norm": 7.567537784576416, + "learning_rate": 3.311641515706234e-06, + "loss": 0.542, + "step": 5568 + }, + { + "epoch": 0.61366391184573, + "grad_norm": 4.513036727905273, + "learning_rate": 3.3099958776976636e-06, + "loss": 0.4161, + "step": 5569 + }, + { + "epoch": 0.6137741046831956, + "grad_norm": 5.943395137786865, + "learning_rate": 3.3083504463450943e-06, + "loss": 0.3304, + "step": 5570 + }, + { + "epoch": 0.6138842975206612, + "grad_norm": 6.845271110534668, + "learning_rate": 3.3067052218497263e-06, + "loss": 0.3635, + "step": 5571 + }, + { + "epoch": 0.6139944903581267, + "grad_norm": 7.157235622406006, + "learning_rate": 3.3050602044127473e-06, + "loss": 0.4635, + "step": 5572 + }, + { + "epoch": 0.6141046831955923, + "grad_norm": 5.039437770843506, + "learning_rate": 3.3034153942353055e-06, + "loss": 0.3877, + "step": 5573 + }, + { + "epoch": 0.6142148760330578, + "grad_norm": 5.90138578414917, + "learning_rate": 3.301770791518536e-06, + "loss": 0.4233, + "step": 5574 + }, + { + "epoch": 0.6143250688705234, + "grad_norm": 5.284835338592529, + "learning_rate": 3.300126396463542e-06, + "loss": 0.4305, + "step": 5575 + }, + { + "epoch": 0.614435261707989, + "grad_norm": 6.650893211364746, + "learning_rate": 3.2984822092713987e-06, + "loss": 0.4265, + "step": 5576 + }, + { + "epoch": 0.6145454545454545, + "grad_norm": 8.513042449951172, + "learning_rate": 
3.2968382301431646e-06, + "loss": 0.4179, + "step": 5577 + }, + { + "epoch": 0.6146556473829201, + "grad_norm": 6.663072109222412, + "learning_rate": 3.2951944592798645e-06, + "loss": 0.4126, + "step": 5578 + }, + { + "epoch": 0.6147658402203857, + "grad_norm": 3.8044331073760986, + "learning_rate": 3.2935508968825e-06, + "loss": 0.3906, + "step": 5579 + }, + { + "epoch": 0.6148760330578512, + "grad_norm": 5.1406097412109375, + "learning_rate": 3.291907543152052e-06, + "loss": 0.4087, + "step": 5580 + }, + { + "epoch": 0.6149862258953168, + "grad_norm": 7.120194435119629, + "learning_rate": 3.2902643982894696e-06, + "loss": 0.378, + "step": 5581 + }, + { + "epoch": 0.6150964187327824, + "grad_norm": 7.6343183517456055, + "learning_rate": 3.2886214624956776e-06, + "loss": 0.4943, + "step": 5582 + }, + { + "epoch": 0.6152066115702479, + "grad_norm": 5.776088237762451, + "learning_rate": 3.2869787359715786e-06, + "loss": 0.3461, + "step": 5583 + }, + { + "epoch": 0.6153168044077135, + "grad_norm": 5.77926778793335, + "learning_rate": 3.285336218918047e-06, + "loss": 0.3914, + "step": 5584 + }, + { + "epoch": 0.615426997245179, + "grad_norm": 5.3097243309021, + "learning_rate": 3.2836939115359313e-06, + "loss": 0.4227, + "step": 5585 + }, + { + "epoch": 0.6155371900826446, + "grad_norm": 4.755492687225342, + "learning_rate": 3.2820518140260554e-06, + "loss": 0.3989, + "step": 5586 + }, + { + "epoch": 0.6156473829201102, + "grad_norm": 5.929659366607666, + "learning_rate": 3.280409926589216e-06, + "loss": 0.3955, + "step": 5587 + }, + { + "epoch": 0.6157575757575757, + "grad_norm": 6.221659183502197, + "learning_rate": 3.278768249426189e-06, + "loss": 0.3877, + "step": 5588 + }, + { + "epoch": 0.6158677685950413, + "grad_norm": 5.330348968505859, + "learning_rate": 3.2771267827377177e-06, + "loss": 0.3422, + "step": 5589 + }, + { + "epoch": 0.6159779614325069, + "grad_norm": 12.225643157958984, + "learning_rate": 3.2754855267245232e-06, + "loss": 0.4134, + "step": 5590 + }, + { + "epoch": 0.6160881542699724, + "grad_norm": 4.399528980255127, + "learning_rate": 3.2738444815873015e-06, + "loss": 0.4353, + "step": 5591 + }, + { + "epoch": 0.616198347107438, + "grad_norm": 8.618428230285645, + "learning_rate": 3.2722036475267215e-06, + "loss": 0.3784, + "step": 5592 + }, + { + "epoch": 0.6163085399449035, + "grad_norm": 9.162566184997559, + "learning_rate": 3.2705630247434258e-06, + "loss": 0.4122, + "step": 5593 + }, + { + "epoch": 0.6164187327823691, + "grad_norm": 5.548257827758789, + "learning_rate": 3.2689226134380333e-06, + "loss": 0.3735, + "step": 5594 + }, + { + "epoch": 0.6165289256198347, + "grad_norm": 6.5701446533203125, + "learning_rate": 3.267282413811135e-06, + "loss": 0.4465, + "step": 5595 + }, + { + "epoch": 0.6166391184573002, + "grad_norm": 6.101266384124756, + "learning_rate": 3.265642426063296e-06, + "loss": 0.4459, + "step": 5596 + }, + { + "epoch": 0.6167493112947658, + "grad_norm": 6.442191123962402, + "learning_rate": 3.264002650395058e-06, + "loss": 0.3938, + "step": 5597 + }, + { + "epoch": 0.6168595041322315, + "grad_norm": 5.264941692352295, + "learning_rate": 3.2623630870069346e-06, + "loss": 0.3521, + "step": 5598 + }, + { + "epoch": 0.616969696969697, + "grad_norm": 8.560416221618652, + "learning_rate": 3.260723736099411e-06, + "loss": 0.4031, + "step": 5599 + }, + { + "epoch": 0.6170798898071626, + "grad_norm": 7.50861120223999, + "learning_rate": 3.2590845978729528e-06, + "loss": 0.3334, + "step": 5600 + }, + { + "epoch": 0.617190082644628, + "grad_norm": 
5.766119003295898, + "learning_rate": 3.2574456725279946e-06, + "loss": 0.4062, + "step": 5601 + }, + { + "epoch": 0.6173002754820937, + "grad_norm": 7.3114705085754395, + "learning_rate": 3.255806960264947e-06, + "loss": 0.4019, + "step": 5602 + }, + { + "epoch": 0.6174104683195593, + "grad_norm": 6.906582355499268, + "learning_rate": 3.2541684612841943e-06, + "loss": 0.4805, + "step": 5603 + }, + { + "epoch": 0.6175206611570248, + "grad_norm": 14.328627586364746, + "learning_rate": 3.2525301757860915e-06, + "loss": 0.4274, + "step": 5604 + }, + { + "epoch": 0.6176308539944904, + "grad_norm": 8.022102355957031, + "learning_rate": 3.250892103970975e-06, + "loss": 0.3747, + "step": 5605 + }, + { + "epoch": 0.617741046831956, + "grad_norm": 8.918832778930664, + "learning_rate": 3.2492542460391467e-06, + "loss": 0.4359, + "step": 5606 + }, + { + "epoch": 0.6178512396694215, + "grad_norm": 6.053462505340576, + "learning_rate": 3.2476166021908856e-06, + "loss": 0.4104, + "step": 5607 + }, + { + "epoch": 0.6179614325068871, + "grad_norm": 7.203765392303467, + "learning_rate": 3.24597917262645e-06, + "loss": 0.4289, + "step": 5608 + }, + { + "epoch": 0.6180716253443527, + "grad_norm": 5.25016975402832, + "learning_rate": 3.2443419575460623e-06, + "loss": 0.3791, + "step": 5609 + }, + { + "epoch": 0.6181818181818182, + "grad_norm": 7.548518657684326, + "learning_rate": 3.242704957149925e-06, + "loss": 0.4507, + "step": 5610 + }, + { + "epoch": 0.6182920110192838, + "grad_norm": 11.34952449798584, + "learning_rate": 3.241068171638212e-06, + "loss": 0.5606, + "step": 5611 + }, + { + "epoch": 0.6184022038567493, + "grad_norm": 12.98247241973877, + "learning_rate": 3.2394316012110726e-06, + "loss": 0.5672, + "step": 5612 + }, + { + "epoch": 0.6185123966942149, + "grad_norm": 5.332764625549316, + "learning_rate": 3.237795246068628e-06, + "loss": 0.3759, + "step": 5613 + }, + { + "epoch": 0.6186225895316805, + "grad_norm": 6.132878303527832, + "learning_rate": 3.2361591064109754e-06, + "loss": 0.4482, + "step": 5614 + }, + { + "epoch": 0.618732782369146, + "grad_norm": 6.75705623626709, + "learning_rate": 3.234523182438182e-06, + "loss": 0.3936, + "step": 5615 + }, + { + "epoch": 0.6188429752066116, + "grad_norm": 5.98726224899292, + "learning_rate": 3.2328874743502935e-06, + "loss": 0.3458, + "step": 5616 + }, + { + "epoch": 0.6189531680440772, + "grad_norm": 7.1376166343688965, + "learning_rate": 3.231251982347324e-06, + "loss": 0.4255, + "step": 5617 + }, + { + "epoch": 0.6190633608815427, + "grad_norm": 5.139072895050049, + "learning_rate": 3.229616706629265e-06, + "loss": 0.4557, + "step": 5618 + }, + { + "epoch": 0.6191735537190083, + "grad_norm": 6.690254211425781, + "learning_rate": 3.22798164739608e-06, + "loss": 0.4788, + "step": 5619 + }, + { + "epoch": 0.6192837465564738, + "grad_norm": 7.521590232849121, + "learning_rate": 3.2263468048477066e-06, + "loss": 0.4368, + "step": 5620 + }, + { + "epoch": 0.6193939393939394, + "grad_norm": 10.254018783569336, + "learning_rate": 3.224712179184054e-06, + "loss": 0.3912, + "step": 5621 + }, + { + "epoch": 0.619504132231405, + "grad_norm": 8.72264289855957, + "learning_rate": 3.2230777706050087e-06, + "loss": 0.4295, + "step": 5622 + }, + { + "epoch": 0.6196143250688705, + "grad_norm": 3.8662004470825195, + "learning_rate": 3.221443579310428e-06, + "loss": 0.3662, + "step": 5623 + }, + { + "epoch": 0.6197245179063361, + "grad_norm": 4.6445722579956055, + "learning_rate": 3.2198096055001404e-06, + "loss": 0.4469, + "step": 5624 + }, + { + "epoch": 
0.6198347107438017, + "grad_norm": 4.952694416046143, + "learning_rate": 3.2181758493739535e-06, + "loss": 0.3441, + "step": 5625 + }, + { + "epoch": 0.6199449035812672, + "grad_norm": 6.553282260894775, + "learning_rate": 3.2165423111316453e-06, + "loss": 0.3735, + "step": 5626 + }, + { + "epoch": 0.6200550964187328, + "grad_norm": 7.485846042633057, + "learning_rate": 3.2149089909729623e-06, + "loss": 0.3838, + "step": 5627 + }, + { + "epoch": 0.6201652892561983, + "grad_norm": 4.6056389808654785, + "learning_rate": 3.2132758890976357e-06, + "loss": 0.3903, + "step": 5628 + }, + { + "epoch": 0.6202754820936639, + "grad_norm": 6.660373210906982, + "learning_rate": 3.2116430057053594e-06, + "loss": 0.4082, + "step": 5629 + }, + { + "epoch": 0.6203856749311295, + "grad_norm": 6.841281890869141, + "learning_rate": 3.2100103409958062e-06, + "loss": 0.4608, + "step": 5630 + }, + { + "epoch": 0.620495867768595, + "grad_norm": 12.513483047485352, + "learning_rate": 3.2083778951686206e-06, + "loss": 0.5196, + "step": 5631 + }, + { + "epoch": 0.6206060606060606, + "grad_norm": 6.187821388244629, + "learning_rate": 3.2067456684234167e-06, + "loss": 0.4053, + "step": 5632 + }, + { + "epoch": 0.6207162534435262, + "grad_norm": 6.212988376617432, + "learning_rate": 3.205113660959791e-06, + "loss": 0.3763, + "step": 5633 + }, + { + "epoch": 0.6208264462809917, + "grad_norm": 7.4557623863220215, + "learning_rate": 3.2034818729773056e-06, + "loss": 0.4272, + "step": 5634 + }, + { + "epoch": 0.6209366391184573, + "grad_norm": 5.5205206871032715, + "learning_rate": 3.201850304675494e-06, + "loss": 0.4393, + "step": 5635 + }, + { + "epoch": 0.6210468319559229, + "grad_norm": 7.638397216796875, + "learning_rate": 3.200218956253873e-06, + "loss": 0.457, + "step": 5636 + }, + { + "epoch": 0.6211570247933884, + "grad_norm": 3.999941110610962, + "learning_rate": 3.1985878279119213e-06, + "loss": 0.4008, + "step": 5637 + }, + { + "epoch": 0.621267217630854, + "grad_norm": 7.912863731384277, + "learning_rate": 3.196956919849097e-06, + "loss": 0.3695, + "step": 5638 + }, + { + "epoch": 0.6213774104683195, + "grad_norm": 4.004800796508789, + "learning_rate": 3.195326232264829e-06, + "loss": 0.3662, + "step": 5639 + }, + { + "epoch": 0.6214876033057851, + "grad_norm": 5.542474746704102, + "learning_rate": 3.193695765358522e-06, + "loss": 0.3078, + "step": 5640 + }, + { + "epoch": 0.6215977961432507, + "grad_norm": 7.421055316925049, + "learning_rate": 3.192065519329549e-06, + "loss": 0.3615, + "step": 5641 + }, + { + "epoch": 0.6217079889807162, + "grad_norm": 6.017780780792236, + "learning_rate": 3.190435494377262e-06, + "loss": 0.332, + "step": 5642 + }, + { + "epoch": 0.6218181818181818, + "grad_norm": 8.54306697845459, + "learning_rate": 3.188805690700979e-06, + "loss": 0.3651, + "step": 5643 + }, + { + "epoch": 0.6219283746556474, + "grad_norm": 9.726669311523438, + "learning_rate": 3.1871761084999975e-06, + "loss": 0.4086, + "step": 5644 + }, + { + "epoch": 0.6220385674931129, + "grad_norm": 5.96169376373291, + "learning_rate": 3.1855467479735836e-06, + "loss": 0.3899, + "step": 5645 + }, + { + "epoch": 0.6221487603305785, + "grad_norm": 4.833744049072266, + "learning_rate": 3.183917609320978e-06, + "loss": 0.3217, + "step": 5646 + }, + { + "epoch": 0.622258953168044, + "grad_norm": 6.837959289550781, + "learning_rate": 3.1822886927413945e-06, + "loss": 0.3345, + "step": 5647 + }, + { + "epoch": 0.6223691460055096, + "grad_norm": 8.42690372467041, + "learning_rate": 3.1806599984340182e-06, + "loss": 0.4644, + 
"step": 5648 + }, + { + "epoch": 0.6224793388429752, + "grad_norm": 10.13464641571045, + "learning_rate": 3.179031526598008e-06, + "loss": 0.3748, + "step": 5649 + }, + { + "epoch": 0.6225895316804407, + "grad_norm": 13.409700393676758, + "learning_rate": 3.1774032774324973e-06, + "loss": 0.523, + "step": 5650 + }, + { + "epoch": 0.6226997245179063, + "grad_norm": 7.18253755569458, + "learning_rate": 3.1757752511365903e-06, + "loss": 0.3761, + "step": 5651 + }, + { + "epoch": 0.622809917355372, + "grad_norm": 6.525432109832764, + "learning_rate": 3.1741474479093615e-06, + "loss": 0.3809, + "step": 5652 + }, + { + "epoch": 0.6229201101928374, + "grad_norm": 12.289591789245605, + "learning_rate": 3.1725198679498647e-06, + "loss": 0.5713, + "step": 5653 + }, + { + "epoch": 0.623030303030303, + "grad_norm": 6.035792350769043, + "learning_rate": 3.17089251145712e-06, + "loss": 0.3966, + "step": 5654 + }, + { + "epoch": 0.6231404958677685, + "grad_norm": 9.94541072845459, + "learning_rate": 3.169265378630123e-06, + "loss": 0.3731, + "step": 5655 + }, + { + "epoch": 0.6232506887052341, + "grad_norm": 7.422900199890137, + "learning_rate": 3.1676384696678436e-06, + "loss": 0.3012, + "step": 5656 + }, + { + "epoch": 0.6233608815426998, + "grad_norm": 5.478695392608643, + "learning_rate": 3.16601178476922e-06, + "loss": 0.3706, + "step": 5657 + }, + { + "epoch": 0.6234710743801652, + "grad_norm": 4.026650905609131, + "learning_rate": 3.164385324133168e-06, + "loss": 0.3987, + "step": 5658 + }, + { + "epoch": 0.6235812672176309, + "grad_norm": 5.533215522766113, + "learning_rate": 3.1627590879585723e-06, + "loss": 0.2778, + "step": 5659 + }, + { + "epoch": 0.6236914600550965, + "grad_norm": 5.62987756729126, + "learning_rate": 3.161133076444288e-06, + "loss": 0.4016, + "step": 5660 + }, + { + "epoch": 0.623801652892562, + "grad_norm": 5.299964427947998, + "learning_rate": 3.159507289789151e-06, + "loss": 0.4233, + "step": 5661 + }, + { + "epoch": 0.6239118457300276, + "grad_norm": 5.24252462387085, + "learning_rate": 3.1578817281919644e-06, + "loss": 0.2786, + "step": 5662 + }, + { + "epoch": 0.6240220385674932, + "grad_norm": 6.27246618270874, + "learning_rate": 3.156256391851499e-06, + "loss": 0.3504, + "step": 5663 + }, + { + "epoch": 0.6241322314049587, + "grad_norm": 7.873795032501221, + "learning_rate": 3.1546312809665093e-06, + "loss": 0.4294, + "step": 5664 + }, + { + "epoch": 0.6242424242424243, + "grad_norm": 6.295063495635986, + "learning_rate": 3.153006395735712e-06, + "loss": 0.4972, + "step": 5665 + }, + { + "epoch": 0.6243526170798898, + "grad_norm": 6.084031581878662, + "learning_rate": 3.1513817363577997e-06, + "loss": 0.4003, + "step": 5666 + }, + { + "epoch": 0.6244628099173554, + "grad_norm": 6.049535751342773, + "learning_rate": 3.1497573030314433e-06, + "loss": 0.4134, + "step": 5667 + }, + { + "epoch": 0.624573002754821, + "grad_norm": 4.959476947784424, + "learning_rate": 3.148133095955276e-06, + "loss": 0.3097, + "step": 5668 + }, + { + "epoch": 0.6246831955922865, + "grad_norm": 4.211620330810547, + "learning_rate": 3.146509115327907e-06, + "loss": 0.4245, + "step": 5669 + }, + { + "epoch": 0.6247933884297521, + "grad_norm": 10.23599624633789, + "learning_rate": 3.1448853613479238e-06, + "loss": 0.5412, + "step": 5670 + }, + { + "epoch": 0.6249035812672177, + "grad_norm": 4.663244724273682, + "learning_rate": 3.1432618342138765e-06, + "loss": 0.3413, + "step": 5671 + }, + { + "epoch": 0.6250137741046832, + "grad_norm": 5.570018768310547, + "learning_rate": 
3.1416385341242957e-06, + "loss": 0.3754, + "step": 5672 + }, + { + "epoch": 0.6251239669421488, + "grad_norm": 7.070735931396484, + "learning_rate": 3.140015461277679e-06, + "loss": 0.4864, + "step": 5673 + }, + { + "epoch": 0.6252341597796143, + "grad_norm": 11.128875732421875, + "learning_rate": 3.1383926158724976e-06, + "loss": 0.4269, + "step": 5674 + }, + { + "epoch": 0.6253443526170799, + "grad_norm": 6.50960111618042, + "learning_rate": 3.1367699981071962e-06, + "loss": 0.4417, + "step": 5675 + }, + { + "epoch": 0.6254545454545455, + "grad_norm": 9.919767379760742, + "learning_rate": 3.135147608180191e-06, + "loss": 0.4222, + "step": 5676 + }, + { + "epoch": 0.625564738292011, + "grad_norm": 7.701219081878662, + "learning_rate": 3.1335254462898686e-06, + "loss": 0.435, + "step": 5677 + }, + { + "epoch": 0.6256749311294766, + "grad_norm": 9.599321365356445, + "learning_rate": 3.131903512634591e-06, + "loss": 0.4074, + "step": 5678 + }, + { + "epoch": 0.6257851239669422, + "grad_norm": 18.00543975830078, + "learning_rate": 3.1302818074126885e-06, + "loss": 0.5061, + "step": 5679 + }, + { + "epoch": 0.6258953168044077, + "grad_norm": 4.5613603591918945, + "learning_rate": 3.128660330822466e-06, + "loss": 0.3742, + "step": 5680 + }, + { + "epoch": 0.6260055096418733, + "grad_norm": 5.788570880889893, + "learning_rate": 3.127039083062201e-06, + "loss": 0.349, + "step": 5681 + }, + { + "epoch": 0.6261157024793389, + "grad_norm": 9.021220207214355, + "learning_rate": 3.1254180643301413e-06, + "loss": 0.3669, + "step": 5682 + }, + { + "epoch": 0.6262258953168044, + "grad_norm": 8.388569831848145, + "learning_rate": 3.123797274824506e-06, + "loss": 0.3591, + "step": 5683 + }, + { + "epoch": 0.62633608815427, + "grad_norm": 5.5817718505859375, + "learning_rate": 3.12217671474349e-06, + "loss": 0.4128, + "step": 5684 + }, + { + "epoch": 0.6264462809917355, + "grad_norm": 5.523597717285156, + "learning_rate": 3.1205563842852544e-06, + "loss": 0.3976, + "step": 5685 + }, + { + "epoch": 0.6265564738292011, + "grad_norm": 9.741737365722656, + "learning_rate": 3.1189362836479386e-06, + "loss": 0.3773, + "step": 5686 + }, + { + "epoch": 0.6266666666666667, + "grad_norm": 3.5196845531463623, + "learning_rate": 3.1173164130296486e-06, + "loss": 0.3505, + "step": 5687 + }, + { + "epoch": 0.6267768595041322, + "grad_norm": 6.845584869384766, + "learning_rate": 3.1156967726284644e-06, + "loss": 0.4427, + "step": 5688 + }, + { + "epoch": 0.6268870523415978, + "grad_norm": 4.908968448638916, + "learning_rate": 3.114077362642439e-06, + "loss": 0.3494, + "step": 5689 + }, + { + "epoch": 0.6269972451790634, + "grad_norm": 8.34274959564209, + "learning_rate": 3.1124581832695973e-06, + "loss": 0.3322, + "step": 5690 + }, + { + "epoch": 0.6271074380165289, + "grad_norm": 11.896194458007812, + "learning_rate": 3.110839234707929e-06, + "loss": 0.5003, + "step": 5691 + }, + { + "epoch": 0.6272176308539945, + "grad_norm": 4.664172172546387, + "learning_rate": 3.109220517155409e-06, + "loss": 0.3848, + "step": 5692 + }, + { + "epoch": 0.62732782369146, + "grad_norm": 6.579066276550293, + "learning_rate": 3.1076020308099707e-06, + "loss": 0.3774, + "step": 5693 + }, + { + "epoch": 0.6274380165289256, + "grad_norm": 4.803685188293457, + "learning_rate": 3.1059837758695256e-06, + "loss": 0.4275, + "step": 5694 + }, + { + "epoch": 0.6275482093663912, + "grad_norm": 5.253371238708496, + "learning_rate": 3.1043657525319597e-06, + "loss": 0.4377, + "step": 5695 + }, + { + "epoch": 0.6276584022038567, + "grad_norm": 
9.491144180297852, + "learning_rate": 3.102747960995124e-06, + "loss": 0.4879, + "step": 5696 + }, + { + "epoch": 0.6277685950413223, + "grad_norm": 6.025481224060059, + "learning_rate": 3.1011304014568433e-06, + "loss": 0.3644, + "step": 5697 + }, + { + "epoch": 0.6278787878787879, + "grad_norm": 5.618700981140137, + "learning_rate": 3.099513074114917e-06, + "loss": 0.4425, + "step": 5698 + }, + { + "epoch": 0.6279889807162534, + "grad_norm": 4.917667388916016, + "learning_rate": 3.0978959791671128e-06, + "loss": 0.3253, + "step": 5699 + }, + { + "epoch": 0.628099173553719, + "grad_norm": 9.240817070007324, + "learning_rate": 3.0962791168111738e-06, + "loss": 0.3723, + "step": 5700 + }, + { + "epoch": 0.6282093663911845, + "grad_norm": 6.664559841156006, + "learning_rate": 3.0946624872448096e-06, + "loss": 0.3735, + "step": 5701 + }, + { + "epoch": 0.6283195592286501, + "grad_norm": 5.612759113311768, + "learning_rate": 3.0930460906657043e-06, + "loss": 0.3224, + "step": 5702 + }, + { + "epoch": 0.6284297520661157, + "grad_norm": 10.004939079284668, + "learning_rate": 3.091429927271514e-06, + "loss": 0.4557, + "step": 5703 + }, + { + "epoch": 0.6285399449035812, + "grad_norm": 7.886843681335449, + "learning_rate": 3.0898139972598645e-06, + "loss": 0.4325, + "step": 5704 + }, + { + "epoch": 0.6286501377410468, + "grad_norm": 4.917661190032959, + "learning_rate": 3.0881983008283534e-06, + "loss": 0.3939, + "step": 5705 + }, + { + "epoch": 0.6287603305785124, + "grad_norm": 9.051050186157227, + "learning_rate": 3.0865828381745515e-06, + "loss": 0.4182, + "step": 5706 + }, + { + "epoch": 0.6288705234159779, + "grad_norm": 7.852141380310059, + "learning_rate": 3.0849676094960003e-06, + "loss": 0.3589, + "step": 5707 + }, + { + "epoch": 0.6289807162534435, + "grad_norm": 4.527191638946533, + "learning_rate": 3.0833526149902093e-06, + "loss": 0.3663, + "step": 5708 + }, + { + "epoch": 0.6290909090909091, + "grad_norm": 5.753724575042725, + "learning_rate": 3.081737854854665e-06, + "loss": 0.386, + "step": 5709 + }, + { + "epoch": 0.6292011019283746, + "grad_norm": 7.792405128479004, + "learning_rate": 3.0801233292868216e-06, + "loss": 0.4654, + "step": 5710 + }, + { + "epoch": 0.6293112947658402, + "grad_norm": 11.35915470123291, + "learning_rate": 3.078509038484105e-06, + "loss": 0.3484, + "step": 5711 + }, + { + "epoch": 0.6294214876033057, + "grad_norm": 5.1471357345581055, + "learning_rate": 3.0768949826439135e-06, + "loss": 0.4525, + "step": 5712 + }, + { + "epoch": 0.6295316804407713, + "grad_norm": 6.306460857391357, + "learning_rate": 3.0752811619636175e-06, + "loss": 0.362, + "step": 5713 + }, + { + "epoch": 0.629641873278237, + "grad_norm": 14.662524223327637, + "learning_rate": 3.073667576640552e-06, + "loss": 0.4656, + "step": 5714 + }, + { + "epoch": 0.6297520661157024, + "grad_norm": 6.419051647186279, + "learning_rate": 3.0720542268720344e-06, + "loss": 0.4059, + "step": 5715 + }, + { + "epoch": 0.629862258953168, + "grad_norm": 4.525341033935547, + "learning_rate": 3.070441112855343e-06, + "loss": 0.4006, + "step": 5716 + }, + { + "epoch": 0.6299724517906337, + "grad_norm": 8.255001068115234, + "learning_rate": 3.0688282347877346e-06, + "loss": 0.4168, + "step": 5717 + }, + { + "epoch": 0.6300826446280992, + "grad_norm": 4.753320693969727, + "learning_rate": 3.0672155928664345e-06, + "loss": 0.3782, + "step": 5718 + }, + { + "epoch": 0.6301928374655648, + "grad_norm": 6.017674922943115, + "learning_rate": 3.065603187288634e-06, + "loss": 0.4393, + "step": 5719 + }, + { + "epoch": 
0.6303030303030303, + "grad_norm": 6.011033535003662, + "learning_rate": 3.0639910182515045e-06, + "loss": 0.3328, + "step": 5720 + }, + { + "epoch": 0.6304132231404959, + "grad_norm": 4.837926864624023, + "learning_rate": 3.0623790859521853e-06, + "loss": 0.3611, + "step": 5721 + }, + { + "epoch": 0.6305234159779615, + "grad_norm": 9.484621047973633, + "learning_rate": 3.0607673905877787e-06, + "loss": 0.4761, + "step": 5722 + }, + { + "epoch": 0.630633608815427, + "grad_norm": 6.29550838470459, + "learning_rate": 3.0591559323553745e-06, + "loss": 0.3861, + "step": 5723 + }, + { + "epoch": 0.6307438016528926, + "grad_norm": 6.089902400970459, + "learning_rate": 3.0575447114520175e-06, + "loss": 0.3484, + "step": 5724 + }, + { + "epoch": 0.6308539944903582, + "grad_norm": 5.730831146240234, + "learning_rate": 3.0559337280747314e-06, + "loss": 0.4115, + "step": 5725 + }, + { + "epoch": 0.6309641873278237, + "grad_norm": 5.556731224060059, + "learning_rate": 3.05432298242051e-06, + "loss": 0.3475, + "step": 5726 + }, + { + "epoch": 0.6310743801652893, + "grad_norm": 11.543536186218262, + "learning_rate": 3.052712474686318e-06, + "loss": 0.4169, + "step": 5727 + }, + { + "epoch": 0.6311845730027548, + "grad_norm": 5.412355899810791, + "learning_rate": 3.051102205069088e-06, + "loss": 0.4294, + "step": 5728 + }, + { + "epoch": 0.6312947658402204, + "grad_norm": 7.309114933013916, + "learning_rate": 3.049492173765729e-06, + "loss": 0.327, + "step": 5729 + }, + { + "epoch": 0.631404958677686, + "grad_norm": 5.162247657775879, + "learning_rate": 3.047882380973115e-06, + "loss": 0.3876, + "step": 5730 + }, + { + "epoch": 0.6315151515151515, + "grad_norm": 12.657859802246094, + "learning_rate": 3.046272826888097e-06, + "loss": 0.4236, + "step": 5731 + }, + { + "epoch": 0.6316253443526171, + "grad_norm": 7.963898181915283, + "learning_rate": 3.044663511707491e-06, + "loss": 0.4182, + "step": 5732 + }, + { + "epoch": 0.6317355371900827, + "grad_norm": 5.893122673034668, + "learning_rate": 3.0430544356280865e-06, + "loss": 0.4278, + "step": 5733 + }, + { + "epoch": 0.6318457300275482, + "grad_norm": 5.781898021697998, + "learning_rate": 3.041445598846644e-06, + "loss": 0.3831, + "step": 5734 + }, + { + "epoch": 0.6319559228650138, + "grad_norm": 13.247063636779785, + "learning_rate": 3.039837001559895e-06, + "loss": 0.5004, + "step": 5735 + }, + { + "epoch": 0.6320661157024794, + "grad_norm": 7.441000938415527, + "learning_rate": 3.0382286439645382e-06, + "loss": 0.3866, + "step": 5736 + }, + { + "epoch": 0.6321763085399449, + "grad_norm": 5.362796783447266, + "learning_rate": 3.036620526257249e-06, + "loss": 0.4517, + "step": 5737 + }, + { + "epoch": 0.6322865013774105, + "grad_norm": 6.049092769622803, + "learning_rate": 3.0350126486346694e-06, + "loss": 0.4513, + "step": 5738 + }, + { + "epoch": 0.632396694214876, + "grad_norm": 8.4575777053833, + "learning_rate": 3.0334050112934106e-06, + "loss": 0.3578, + "step": 5739 + }, + { + "epoch": 0.6325068870523416, + "grad_norm": 3.689763069152832, + "learning_rate": 3.0317976144300598e-06, + "loss": 0.3682, + "step": 5740 + }, + { + "epoch": 0.6326170798898072, + "grad_norm": 4.175183296203613, + "learning_rate": 3.03019045824117e-06, + "loss": 0.4013, + "step": 5741 + }, + { + "epoch": 0.6327272727272727, + "grad_norm": 5.330626487731934, + "learning_rate": 3.028583542923266e-06, + "loss": 0.4065, + "step": 5742 + }, + { + "epoch": 0.6328374655647383, + "grad_norm": 7.9115777015686035, + "learning_rate": 3.026976868672844e-06, + "loss": 0.4795, + 
"step": 5743 + }, + { + "epoch": 0.6329476584022039, + "grad_norm": 8.067718505859375, + "learning_rate": 3.025370435686371e-06, + "loss": 0.3546, + "step": 5744 + }, + { + "epoch": 0.6330578512396694, + "grad_norm": 8.21186351776123, + "learning_rate": 3.0237642441602837e-06, + "loss": 0.28, + "step": 5745 + }, + { + "epoch": 0.633168044077135, + "grad_norm": 8.001065254211426, + "learning_rate": 3.0221582942909903e-06, + "loss": 0.4416, + "step": 5746 + }, + { + "epoch": 0.6332782369146005, + "grad_norm": 5.936722278594971, + "learning_rate": 3.020552586274865e-06, + "loss": 0.4107, + "step": 5747 + }, + { + "epoch": 0.6333884297520661, + "grad_norm": 17.30539321899414, + "learning_rate": 3.0189471203082593e-06, + "loss": 0.5181, + "step": 5748 + }, + { + "epoch": 0.6334986225895317, + "grad_norm": 7.969041347503662, + "learning_rate": 3.017341896587492e-06, + "loss": 0.3932, + "step": 5749 + }, + { + "epoch": 0.6336088154269972, + "grad_norm": 5.316683769226074, + "learning_rate": 3.0157369153088477e-06, + "loss": 0.4158, + "step": 5750 + }, + { + "epoch": 0.6337190082644628, + "grad_norm": 11.801061630249023, + "learning_rate": 3.0141321766685914e-06, + "loss": 0.471, + "step": 5751 + }, + { + "epoch": 0.6338292011019284, + "grad_norm": 8.095755577087402, + "learning_rate": 3.01252768086295e-06, + "loss": 0.4333, + "step": 5752 + }, + { + "epoch": 0.6339393939393939, + "grad_norm": 4.716119766235352, + "learning_rate": 3.010923428088121e-06, + "loss": 0.4192, + "step": 5753 + }, + { + "epoch": 0.6340495867768595, + "grad_norm": 6.167051315307617, + "learning_rate": 3.00931941854028e-06, + "loss": 0.3824, + "step": 5754 + }, + { + "epoch": 0.634159779614325, + "grad_norm": 5.537266254425049, + "learning_rate": 3.0077156524155637e-06, + "loss": 0.4078, + "step": 5755 + }, + { + "epoch": 0.6342699724517906, + "grad_norm": 3.6839773654937744, + "learning_rate": 3.0061121299100824e-06, + "loss": 0.3859, + "step": 5756 + }, + { + "epoch": 0.6343801652892562, + "grad_norm": 7.0088043212890625, + "learning_rate": 3.00450885121992e-06, + "loss": 0.3354, + "step": 5757 + }, + { + "epoch": 0.6344903581267217, + "grad_norm": 5.540916442871094, + "learning_rate": 3.0029058165411245e-06, + "loss": 0.3837, + "step": 5758 + }, + { + "epoch": 0.6346005509641873, + "grad_norm": 8.917105674743652, + "learning_rate": 3.0013030260697194e-06, + "loss": 0.4784, + "step": 5759 + }, + { + "epoch": 0.6347107438016529, + "grad_norm": 8.26288890838623, + "learning_rate": 2.9997004800016956e-06, + "loss": 0.4377, + "step": 5760 + }, + { + "epoch": 0.6348209366391184, + "grad_norm": 6.514774322509766, + "learning_rate": 2.9980981785330125e-06, + "loss": 0.3802, + "step": 5761 + }, + { + "epoch": 0.634931129476584, + "grad_norm": 9.283097267150879, + "learning_rate": 2.996496121859605e-06, + "loss": 0.4863, + "step": 5762 + }, + { + "epoch": 0.6350413223140496, + "grad_norm": 6.459282875061035, + "learning_rate": 2.994894310177373e-06, + "loss": 0.3799, + "step": 5763 + }, + { + "epoch": 0.6351515151515151, + "grad_norm": 8.162826538085938, + "learning_rate": 2.993292743682188e-06, + "loss": 0.3887, + "step": 5764 + }, + { + "epoch": 0.6352617079889807, + "grad_norm": 6.5035576820373535, + "learning_rate": 2.9916914225698923e-06, + "loss": 0.4283, + "step": 5765 + }, + { + "epoch": 0.6353719008264462, + "grad_norm": 4.4118733406066895, + "learning_rate": 2.990090347036298e-06, + "loss": 0.3896, + "step": 5766 + }, + { + "epoch": 0.6354820936639118, + "grad_norm": 6.801360607147217, + "learning_rate": 
2.9884895172771854e-06, + "loss": 0.3955, + "step": 5767 + }, + { + "epoch": 0.6355922865013774, + "grad_norm": 7.458156108856201, + "learning_rate": 2.986888933488308e-06, + "loss": 0.4166, + "step": 5768 + }, + { + "epoch": 0.6357024793388429, + "grad_norm": 7.173938274383545, + "learning_rate": 2.9852885958653855e-06, + "loss": 0.4162, + "step": 5769 + }, + { + "epoch": 0.6358126721763085, + "grad_norm": 6.3631486892700195, + "learning_rate": 2.9836885046041095e-06, + "loss": 0.4605, + "step": 5770 + }, + { + "epoch": 0.6359228650137742, + "grad_norm": 5.616750240325928, + "learning_rate": 2.9820886599001434e-06, + "loss": 0.4108, + "step": 5771 + }, + { + "epoch": 0.6360330578512396, + "grad_norm": 4.267113208770752, + "learning_rate": 2.980489061949116e-06, + "loss": 0.284, + "step": 5772 + }, + { + "epoch": 0.6361432506887053, + "grad_norm": 4.92551851272583, + "learning_rate": 2.978889710946631e-06, + "loss": 0.4409, + "step": 5773 + }, + { + "epoch": 0.6362534435261707, + "grad_norm": 18.585920333862305, + "learning_rate": 2.977290607088257e-06, + "loss": 0.5592, + "step": 5774 + }, + { + "epoch": 0.6363636363636364, + "grad_norm": 7.982060432434082, + "learning_rate": 2.9756917505695336e-06, + "loss": 0.4083, + "step": 5775 + }, + { + "epoch": 0.636473829201102, + "grad_norm": 7.351476669311523, + "learning_rate": 2.974093141585974e-06, + "loss": 0.42, + "step": 5776 + }, + { + "epoch": 0.6365840220385675, + "grad_norm": 14.6085844039917, + "learning_rate": 2.9724947803330585e-06, + "loss": 0.4173, + "step": 5777 + }, + { + "epoch": 0.6366942148760331, + "grad_norm": 8.948366165161133, + "learning_rate": 2.9708966670062313e-06, + "loss": 0.374, + "step": 5778 + }, + { + "epoch": 0.6368044077134987, + "grad_norm": 6.008039951324463, + "learning_rate": 2.9692988018009195e-06, + "loss": 0.4181, + "step": 5779 + }, + { + "epoch": 0.6369146005509642, + "grad_norm": 5.3842902183532715, + "learning_rate": 2.967701184912508e-06, + "loss": 0.4204, + "step": 5780 + }, + { + "epoch": 0.6370247933884298, + "grad_norm": 8.259960174560547, + "learning_rate": 2.9661038165363537e-06, + "loss": 0.38, + "step": 5781 + }, + { + "epoch": 0.6371349862258954, + "grad_norm": 7.987982749938965, + "learning_rate": 2.9645066968677906e-06, + "loss": 0.3577, + "step": 5782 + }, + { + "epoch": 0.6372451790633609, + "grad_norm": 8.198049545288086, + "learning_rate": 2.9629098261021127e-06, + "loss": 0.4439, + "step": 5783 + }, + { + "epoch": 0.6373553719008265, + "grad_norm": 6.748956203460693, + "learning_rate": 2.961313204434587e-06, + "loss": 0.3691, + "step": 5784 + }, + { + "epoch": 0.637465564738292, + "grad_norm": 6.4396162033081055, + "learning_rate": 2.9597168320604543e-06, + "loss": 0.3588, + "step": 5785 + }, + { + "epoch": 0.6375757575757576, + "grad_norm": 8.94629192352295, + "learning_rate": 2.9581207091749154e-06, + "loss": 0.431, + "step": 5786 + }, + { + "epoch": 0.6376859504132232, + "grad_norm": 3.996971845626831, + "learning_rate": 2.956524835973153e-06, + "loss": 0.3787, + "step": 5787 + }, + { + "epoch": 0.6377961432506887, + "grad_norm": 14.571632385253906, + "learning_rate": 2.9549292126503086e-06, + "loss": 0.4889, + "step": 5788 + }, + { + "epoch": 0.6379063360881543, + "grad_norm": 8.187118530273438, + "learning_rate": 2.9533338394014976e-06, + "loss": 0.3969, + "step": 5789 + }, + { + "epoch": 0.6380165289256199, + "grad_norm": 7.629312515258789, + "learning_rate": 2.951738716421805e-06, + "loss": 0.3848, + "step": 5790 + }, + { + "epoch": 0.6381267217630854, + "grad_norm": 
15.203229904174805, + "learning_rate": 2.9501438439062844e-06, + "loss": 0.49, + "step": 5791 + }, + { + "epoch": 0.638236914600551, + "grad_norm": 5.426725387573242, + "learning_rate": 2.948549222049959e-06, + "loss": 0.3781, + "step": 5792 + }, + { + "epoch": 0.6383471074380165, + "grad_norm": 8.121671676635742, + "learning_rate": 2.946954851047822e-06, + "loss": 0.3217, + "step": 5793 + }, + { + "epoch": 0.6384573002754821, + "grad_norm": 9.843147277832031, + "learning_rate": 2.945360731094834e-06, + "loss": 0.3904, + "step": 5794 + }, + { + "epoch": 0.6385674931129477, + "grad_norm": 7.5617523193359375, + "learning_rate": 2.943766862385926e-06, + "loss": 0.4492, + "step": 5795 + }, + { + "epoch": 0.6386776859504132, + "grad_norm": 4.36297082901001, + "learning_rate": 2.942173245116e-06, + "loss": 0.3465, + "step": 5796 + }, + { + "epoch": 0.6387878787878788, + "grad_norm": 8.202534675598145, + "learning_rate": 2.9405798794799257e-06, + "loss": 0.314, + "step": 5797 + }, + { + "epoch": 0.6388980716253444, + "grad_norm": 5.054417133331299, + "learning_rate": 2.938986765672539e-06, + "loss": 0.4032, + "step": 5798 + }, + { + "epoch": 0.6390082644628099, + "grad_norm": 4.384051322937012, + "learning_rate": 2.9373939038886524e-06, + "loss": 0.368, + "step": 5799 + }, + { + "epoch": 0.6391184573002755, + "grad_norm": 8.412352561950684, + "learning_rate": 2.9358012943230395e-06, + "loss": 0.4504, + "step": 5800 + }, + { + "epoch": 0.639228650137741, + "grad_norm": 28.05645179748535, + "learning_rate": 2.934208937170449e-06, + "loss": 0.4941, + "step": 5801 + }, + { + "epoch": 0.6393388429752066, + "grad_norm": 8.390089988708496, + "learning_rate": 2.9326168326255963e-06, + "loss": 0.3966, + "step": 5802 + }, + { + "epoch": 0.6394490358126722, + "grad_norm": 6.950432777404785, + "learning_rate": 2.9310249808831635e-06, + "loss": 0.3803, + "step": 5803 + }, + { + "epoch": 0.6395592286501377, + "grad_norm": 9.502593040466309, + "learning_rate": 2.9294333821378085e-06, + "loss": 0.422, + "step": 5804 + }, + { + "epoch": 0.6396694214876033, + "grad_norm": 13.224533081054688, + "learning_rate": 2.927842036584153e-06, + "loss": 0.5388, + "step": 5805 + }, + { + "epoch": 0.6397796143250689, + "grad_norm": 7.394783973693848, + "learning_rate": 2.9262509444167853e-06, + "loss": 0.3838, + "step": 5806 + }, + { + "epoch": 0.6398898071625344, + "grad_norm": 8.313453674316406, + "learning_rate": 2.9246601058302703e-06, + "loss": 0.4582, + "step": 5807 + }, + { + "epoch": 0.64, + "grad_norm": 8.038554191589355, + "learning_rate": 2.9230695210191377e-06, + "loss": 0.4205, + "step": 5808 + }, + { + "epoch": 0.6401101928374656, + "grad_norm": 5.241087436676025, + "learning_rate": 2.921479190177883e-06, + "loss": 0.4286, + "step": 5809 + }, + { + "epoch": 0.6402203856749311, + "grad_norm": 6.424064636230469, + "learning_rate": 2.919889113500979e-06, + "loss": 0.4076, + "step": 5810 + }, + { + "epoch": 0.6403305785123967, + "grad_norm": 5.438494682312012, + "learning_rate": 2.9182992911828585e-06, + "loss": 0.3951, + "step": 5811 + }, + { + "epoch": 0.6404407713498622, + "grad_norm": 5.5653886795043945, + "learning_rate": 2.9167097234179275e-06, + "loss": 0.4086, + "step": 5812 + }, + { + "epoch": 0.6405509641873278, + "grad_norm": 5.504902362823486, + "learning_rate": 2.9151204104005614e-06, + "loss": 0.3923, + "step": 5813 + }, + { + "epoch": 0.6406611570247934, + "grad_norm": 8.266979217529297, + "learning_rate": 2.913531352325103e-06, + "loss": 0.442, + "step": 5814 + }, + { + "epoch": 0.6407713498622589, + 
"grad_norm": 6.890100002288818, + "learning_rate": 2.9119425493858677e-06, + "loss": 0.3856, + "step": 5815 + }, + { + "epoch": 0.6408815426997245, + "grad_norm": 7.708473205566406, + "learning_rate": 2.9103540017771316e-06, + "loss": 0.4208, + "step": 5816 + }, + { + "epoch": 0.6409917355371901, + "grad_norm": 5.167667388916016, + "learning_rate": 2.908765709693147e-06, + "loss": 0.3485, + "step": 5817 + }, + { + "epoch": 0.6411019283746556, + "grad_norm": 5.596955299377441, + "learning_rate": 2.907177673328134e-06, + "loss": 0.4133, + "step": 5818 + }, + { + "epoch": 0.6412121212121212, + "grad_norm": 6.119378566741943, + "learning_rate": 2.9055898928762775e-06, + "loss": 0.4351, + "step": 5819 + }, + { + "epoch": 0.6413223140495867, + "grad_norm": 8.178197860717773, + "learning_rate": 2.9040023685317298e-06, + "loss": 0.4298, + "step": 5820 + }, + { + "epoch": 0.6414325068870523, + "grad_norm": 4.932050704956055, + "learning_rate": 2.902415100488624e-06, + "loss": 0.3811, + "step": 5821 + }, + { + "epoch": 0.6415426997245179, + "grad_norm": 5.623518943786621, + "learning_rate": 2.900828088941049e-06, + "loss": 0.4268, + "step": 5822 + }, + { + "epoch": 0.6416528925619834, + "grad_norm": 4.286214351654053, + "learning_rate": 2.899241334083063e-06, + "loss": 0.3584, + "step": 5823 + }, + { + "epoch": 0.641763085399449, + "grad_norm": 5.3854851722717285, + "learning_rate": 2.8976548361087043e-06, + "loss": 0.3922, + "step": 5824 + }, + { + "epoch": 0.6418732782369146, + "grad_norm": 7.0205793380737305, + "learning_rate": 2.8960685952119672e-06, + "loss": 0.4221, + "step": 5825 + }, + { + "epoch": 0.6419834710743801, + "grad_norm": 6.966156005859375, + "learning_rate": 2.8944826115868165e-06, + "loss": 0.4168, + "step": 5826 + }, + { + "epoch": 0.6420936639118457, + "grad_norm": 6.308687686920166, + "learning_rate": 2.8928968854271967e-06, + "loss": 0.3444, + "step": 5827 + }, + { + "epoch": 0.6422038567493112, + "grad_norm": 11.661874771118164, + "learning_rate": 2.8913114169270052e-06, + "loss": 0.4469, + "step": 5828 + }, + { + "epoch": 0.6423140495867768, + "grad_norm": 5.717617511749268, + "learning_rate": 2.88972620628012e-06, + "loss": 0.3863, + "step": 5829 + }, + { + "epoch": 0.6424242424242425, + "grad_norm": 6.405765056610107, + "learning_rate": 2.888141253680379e-06, + "loss": 0.3678, + "step": 5830 + }, + { + "epoch": 0.642534435261708, + "grad_norm": 7.72976541519165, + "learning_rate": 2.886556559321595e-06, + "loss": 0.4918, + "step": 5831 + }, + { + "epoch": 0.6426446280991736, + "grad_norm": 12.591797828674316, + "learning_rate": 2.884972123397547e-06, + "loss": 0.465, + "step": 5832 + }, + { + "epoch": 0.6427548209366392, + "grad_norm": 7.761502265930176, + "learning_rate": 2.883387946101979e-06, + "loss": 0.4708, + "step": 5833 + }, + { + "epoch": 0.6428650137741047, + "grad_norm": 5.839196681976318, + "learning_rate": 2.8818040276286073e-06, + "loss": 0.4671, + "step": 5834 + }, + { + "epoch": 0.6429752066115703, + "grad_norm": 4.984602451324463, + "learning_rate": 2.8802203681711195e-06, + "loss": 0.3963, + "step": 5835 + }, + { + "epoch": 0.6430853994490359, + "grad_norm": 5.926804065704346, + "learning_rate": 2.878636967923162e-06, + "loss": 0.3643, + "step": 5836 + }, + { + "epoch": 0.6431955922865014, + "grad_norm": 8.573546409606934, + "learning_rate": 2.8770538270783576e-06, + "loss": 0.4617, + "step": 5837 + }, + { + "epoch": 0.643305785123967, + "grad_norm": 9.431220054626465, + "learning_rate": 2.8754709458302966e-06, + "loss": 0.4977, + "step": 5838 + }, + { 
+ "epoch": 0.6434159779614325, + "grad_norm": 5.257115364074707, + "learning_rate": 2.8738883243725324e-06, + "loss": 0.3939, + "step": 5839 + }, + { + "epoch": 0.6435261707988981, + "grad_norm": 5.686413764953613, + "learning_rate": 2.872305962898593e-06, + "loss": 0.4053, + "step": 5840 + }, + { + "epoch": 0.6436363636363637, + "grad_norm": 6.769815444946289, + "learning_rate": 2.870723861601972e-06, + "loss": 0.3597, + "step": 5841 + }, + { + "epoch": 0.6437465564738292, + "grad_norm": 5.612032413482666, + "learning_rate": 2.869142020676127e-06, + "loss": 0.3669, + "step": 5842 + }, + { + "epoch": 0.6438567493112948, + "grad_norm": 7.373021125793457, + "learning_rate": 2.86756044031449e-06, + "loss": 0.3263, + "step": 5843 + }, + { + "epoch": 0.6439669421487604, + "grad_norm": 5.95004940032959, + "learning_rate": 2.865979120710462e-06, + "loss": 0.4416, + "step": 5844 + }, + { + "epoch": 0.6440771349862259, + "grad_norm": 5.859914302825928, + "learning_rate": 2.864398062057403e-06, + "loss": 0.4089, + "step": 5845 + }, + { + "epoch": 0.6441873278236915, + "grad_norm": 5.572892665863037, + "learning_rate": 2.8628172645486506e-06, + "loss": 0.4197, + "step": 5846 + }, + { + "epoch": 0.644297520661157, + "grad_norm": 6.591333389282227, + "learning_rate": 2.861236728377508e-06, + "loss": 0.3455, + "step": 5847 + }, + { + "epoch": 0.6444077134986226, + "grad_norm": 5.3536553382873535, + "learning_rate": 2.8596564537372416e-06, + "loss": 0.3486, + "step": 5848 + }, + { + "epoch": 0.6445179063360882, + "grad_norm": 9.382023811340332, + "learning_rate": 2.8580764408210916e-06, + "loss": 0.4225, + "step": 5849 + }, + { + "epoch": 0.6446280991735537, + "grad_norm": 8.87354564666748, + "learning_rate": 2.856496689822265e-06, + "loss": 0.4025, + "step": 5850 + }, + { + "epoch": 0.6447382920110193, + "grad_norm": 5.720553874969482, + "learning_rate": 2.854917200933933e-06, + "loss": 0.386, + "step": 5851 + }, + { + "epoch": 0.6448484848484849, + "grad_norm": 6.788865089416504, + "learning_rate": 2.8533379743492424e-06, + "loss": 0.431, + "step": 5852 + }, + { + "epoch": 0.6449586776859504, + "grad_norm": 5.583028316497803, + "learning_rate": 2.851759010261298e-06, + "loss": 0.4201, + "step": 5853 + }, + { + "epoch": 0.645068870523416, + "grad_norm": 8.681328773498535, + "learning_rate": 2.8501803088631795e-06, + "loss": 0.4024, + "step": 5854 + }, + { + "epoch": 0.6451790633608815, + "grad_norm": 5.74639368057251, + "learning_rate": 2.8486018703479344e-06, + "loss": 0.362, + "step": 5855 + }, + { + "epoch": 0.6452892561983471, + "grad_norm": 8.127897262573242, + "learning_rate": 2.8470236949085722e-06, + "loss": 0.3904, + "step": 5856 + }, + { + "epoch": 0.6453994490358127, + "grad_norm": 12.584700584411621, + "learning_rate": 2.845445782738081e-06, + "loss": 0.4496, + "step": 5857 + }, + { + "epoch": 0.6455096418732782, + "grad_norm": 7.233221530914307, + "learning_rate": 2.8438681340294063e-06, + "loss": 0.4184, + "step": 5858 + }, + { + "epoch": 0.6456198347107438, + "grad_norm": 4.437359809875488, + "learning_rate": 2.8422907489754603e-06, + "loss": 0.4101, + "step": 5859 + }, + { + "epoch": 0.6457300275482094, + "grad_norm": 11.709005355834961, + "learning_rate": 2.840713627769136e-06, + "loss": 0.4677, + "step": 5860 + }, + { + "epoch": 0.6458402203856749, + "grad_norm": 6.178116321563721, + "learning_rate": 2.8391367706032834e-06, + "loss": 0.3714, + "step": 5861 + }, + { + "epoch": 0.6459504132231405, + "grad_norm": 8.382662773132324, + "learning_rate": 2.8375601776707197e-06, + "loss": 
0.4539, + "step": 5862 + }, + { + "epoch": 0.6460606060606061, + "grad_norm": 5.94052791595459, + "learning_rate": 2.8359838491642344e-06, + "loss": 0.3629, + "step": 5863 + }, + { + "epoch": 0.6461707988980716, + "grad_norm": 8.428426742553711, + "learning_rate": 2.834407785276586e-06, + "loss": 0.4307, + "step": 5864 + }, + { + "epoch": 0.6462809917355372, + "grad_norm": 5.8498759269714355, + "learning_rate": 2.8328319862004927e-06, + "loss": 0.3718, + "step": 5865 + }, + { + "epoch": 0.6463911845730027, + "grad_norm": 6.83188533782959, + "learning_rate": 2.831256452128649e-06, + "loss": 0.3799, + "step": 5866 + }, + { + "epoch": 0.6465013774104683, + "grad_norm": 6.298277378082275, + "learning_rate": 2.829681183253713e-06, + "loss": 0.3998, + "step": 5867 + }, + { + "epoch": 0.6466115702479339, + "grad_norm": 6.411405086517334, + "learning_rate": 2.8281061797683086e-06, + "loss": 0.4445, + "step": 5868 + }, + { + "epoch": 0.6467217630853994, + "grad_norm": 7.376020431518555, + "learning_rate": 2.8265314418650315e-06, + "loss": 0.3643, + "step": 5869 + }, + { + "epoch": 0.646831955922865, + "grad_norm": 5.0421319007873535, + "learning_rate": 2.824956969736441e-06, + "loss": 0.4507, + "step": 5870 + }, + { + "epoch": 0.6469421487603306, + "grad_norm": 5.176947593688965, + "learning_rate": 2.8233827635750687e-06, + "loss": 0.4157, + "step": 5871 + }, + { + "epoch": 0.6470523415977961, + "grad_norm": 9.123809814453125, + "learning_rate": 2.8218088235734076e-06, + "loss": 0.478, + "step": 5872 + }, + { + "epoch": 0.6471625344352617, + "grad_norm": 7.190445423126221, + "learning_rate": 2.820235149923921e-06, + "loss": 0.406, + "step": 5873 + }, + { + "epoch": 0.6472727272727272, + "grad_norm": 8.6001558303833, + "learning_rate": 2.8186617428190446e-06, + "loss": 0.4365, + "step": 5874 + }, + { + "epoch": 0.6473829201101928, + "grad_norm": 7.1687445640563965, + "learning_rate": 2.8170886024511705e-06, + "loss": 0.4479, + "step": 5875 + }, + { + "epoch": 0.6474931129476584, + "grad_norm": 5.2592854499816895, + "learning_rate": 2.815515729012668e-06, + "loss": 0.3346, + "step": 5876 + }, + { + "epoch": 0.6476033057851239, + "grad_norm": 7.434048175811768, + "learning_rate": 2.813943122695871e-06, + "loss": 0.3995, + "step": 5877 + }, + { + "epoch": 0.6477134986225895, + "grad_norm": 7.412423610687256, + "learning_rate": 2.812370783693078e-06, + "loss": 0.5136, + "step": 5878 + }, + { + "epoch": 0.6478236914600551, + "grad_norm": 5.2689056396484375, + "learning_rate": 2.8107987121965542e-06, + "loss": 0.3406, + "step": 5879 + }, + { + "epoch": 0.6479338842975206, + "grad_norm": 4.5380988121032715, + "learning_rate": 2.8092269083985404e-06, + "loss": 0.396, + "step": 5880 + }, + { + "epoch": 0.6480440771349862, + "grad_norm": 5.02008581161499, + "learning_rate": 2.807655372491237e-06, + "loss": 0.4053, + "step": 5881 + }, + { + "epoch": 0.6481542699724517, + "grad_norm": 4.435138702392578, + "learning_rate": 2.8060841046668085e-06, + "loss": 0.4039, + "step": 5882 + }, + { + "epoch": 0.6482644628099173, + "grad_norm": 6.067267894744873, + "learning_rate": 2.8045131051173996e-06, + "loss": 0.3952, + "step": 5883 + }, + { + "epoch": 0.648374655647383, + "grad_norm": 6.425743103027344, + "learning_rate": 2.8029423740351087e-06, + "loss": 0.4121, + "step": 5884 + }, + { + "epoch": 0.6484848484848484, + "grad_norm": 8.314208030700684, + "learning_rate": 2.8013719116120104e-06, + "loss": 0.4324, + "step": 5885 + }, + { + "epoch": 0.648595041322314, + "grad_norm": 7.725162506103516, + "learning_rate": 
2.799801718040139e-06, + "loss": 0.3912, + "step": 5886 + }, + { + "epoch": 0.6487052341597797, + "grad_norm": 6.990640163421631, + "learning_rate": 2.7982317935115035e-06, + "loss": 0.3516, + "step": 5887 + }, + { + "epoch": 0.6488154269972451, + "grad_norm": 7.705892562866211, + "learning_rate": 2.7966621382180758e-06, + "loss": 0.4825, + "step": 5888 + }, + { + "epoch": 0.6489256198347108, + "grad_norm": 12.551563262939453, + "learning_rate": 2.7950927523517936e-06, + "loss": 0.4183, + "step": 5889 + }, + { + "epoch": 0.6490358126721764, + "grad_norm": 6.314427852630615, + "learning_rate": 2.7935236361045643e-06, + "loss": 0.3639, + "step": 5890 + }, + { + "epoch": 0.6491460055096419, + "grad_norm": 9.067972183227539, + "learning_rate": 2.791954789668264e-06, + "loss": 0.5317, + "step": 5891 + }, + { + "epoch": 0.6492561983471075, + "grad_norm": 4.194504737854004, + "learning_rate": 2.790386213234729e-06, + "loss": 0.3472, + "step": 5892 + }, + { + "epoch": 0.649366391184573, + "grad_norm": 8.523187637329102, + "learning_rate": 2.78881790699577e-06, + "loss": 0.3514, + "step": 5893 + }, + { + "epoch": 0.6494765840220386, + "grad_norm": 7.889522552490234, + "learning_rate": 2.787249871143163e-06, + "loss": 0.463, + "step": 5894 + }, + { + "epoch": 0.6495867768595042, + "grad_norm": 6.233395099639893, + "learning_rate": 2.785682105868645e-06, + "loss": 0.4157, + "step": 5895 + }, + { + "epoch": 0.6496969696969697, + "grad_norm": 5.324563503265381, + "learning_rate": 2.784114611363927e-06, + "loss": 0.318, + "step": 5896 + }, + { + "epoch": 0.6498071625344353, + "grad_norm": 6.411282539367676, + "learning_rate": 2.7825473878206865e-06, + "loss": 0.3624, + "step": 5897 + }, + { + "epoch": 0.6499173553719009, + "grad_norm": 7.308784484863281, + "learning_rate": 2.7809804354305612e-06, + "loss": 0.4053, + "step": 5898 + }, + { + "epoch": 0.6500275482093664, + "grad_norm": 8.107905387878418, + "learning_rate": 2.779413754385163e-06, + "loss": 0.401, + "step": 5899 + }, + { + "epoch": 0.650137741046832, + "grad_norm": 8.488053321838379, + "learning_rate": 2.777847344876069e-06, + "loss": 0.3543, + "step": 5900 + }, + { + "epoch": 0.6502479338842975, + "grad_norm": 4.050614356994629, + "learning_rate": 2.7762812070948183e-06, + "loss": 0.43, + "step": 5901 + }, + { + "epoch": 0.6503581267217631, + "grad_norm": 5.892617702484131, + "learning_rate": 2.774715341232922e-06, + "loss": 0.3606, + "step": 5902 + }, + { + "epoch": 0.6504683195592287, + "grad_norm": 10.13648796081543, + "learning_rate": 2.7731497474818587e-06, + "loss": 0.3664, + "step": 5903 + }, + { + "epoch": 0.6505785123966942, + "grad_norm": 5.645157337188721, + "learning_rate": 2.7715844260330672e-06, + "loss": 0.4184, + "step": 5904 + }, + { + "epoch": 0.6506887052341598, + "grad_norm": 5.31406831741333, + "learning_rate": 2.770019377077959e-06, + "loss": 0.3485, + "step": 5905 + }, + { + "epoch": 0.6507988980716254, + "grad_norm": 9.741802215576172, + "learning_rate": 2.768454600807912e-06, + "loss": 0.4814, + "step": 5906 + }, + { + "epoch": 0.6509090909090909, + "grad_norm": 7.9445905685424805, + "learning_rate": 2.7668900974142666e-06, + "loss": 0.4288, + "step": 5907 + }, + { + "epoch": 0.6510192837465565, + "grad_norm": 7.178967475891113, + "learning_rate": 2.765325867088333e-06, + "loss": 0.3788, + "step": 5908 + }, + { + "epoch": 0.6511294765840221, + "grad_norm": 5.582773685455322, + "learning_rate": 2.76376191002139e-06, + "loss": 0.4043, + "step": 5909 + }, + { + "epoch": 0.6512396694214876, + "grad_norm": 
5.5358476638793945, + "learning_rate": 2.762198226404676e-06, + "loss": 0.4202, + "step": 5910 + }, + { + "epoch": 0.6513498622589532, + "grad_norm": 9.965605735778809, + "learning_rate": 2.760634816429405e-06, + "loss": 0.401, + "step": 5911 + }, + { + "epoch": 0.6514600550964187, + "grad_norm": 7.121329307556152, + "learning_rate": 2.7590716802867462e-06, + "loss": 0.3202, + "step": 5912 + }, + { + "epoch": 0.6515702479338843, + "grad_norm": 6.912757873535156, + "learning_rate": 2.75750881816785e-06, + "loss": 0.3622, + "step": 5913 + }, + { + "epoch": 0.6516804407713499, + "grad_norm": 8.120623588562012, + "learning_rate": 2.7559462302638223e-06, + "loss": 0.3756, + "step": 5914 + }, + { + "epoch": 0.6517906336088154, + "grad_norm": 4.33920431137085, + "learning_rate": 2.754383916765734e-06, + "loss": 0.3147, + "step": 5915 + }, + { + "epoch": 0.651900826446281, + "grad_norm": 5.247227191925049, + "learning_rate": 2.7528218778646345e-06, + "loss": 0.3487, + "step": 5916 + }, + { + "epoch": 0.6520110192837466, + "grad_norm": 6.693728446960449, + "learning_rate": 2.7512601137515277e-06, + "loss": 0.4002, + "step": 5917 + }, + { + "epoch": 0.6521212121212121, + "grad_norm": 12.273859024047852, + "learning_rate": 2.7496986246173873e-06, + "loss": 0.5216, + "step": 5918 + }, + { + "epoch": 0.6522314049586777, + "grad_norm": 10.704306602478027, + "learning_rate": 2.7481374106531555e-06, + "loss": 0.4146, + "step": 5919 + }, + { + "epoch": 0.6523415977961432, + "grad_norm": 5.9599456787109375, + "learning_rate": 2.7465764720497423e-06, + "loss": 0.3649, + "step": 5920 + }, + { + "epoch": 0.6524517906336088, + "grad_norm": 8.873656272888184, + "learning_rate": 2.745015808998017e-06, + "loss": 0.4255, + "step": 5921 + }, + { + "epoch": 0.6525619834710744, + "grad_norm": 5.334264278411865, + "learning_rate": 2.743455421688822e-06, + "loss": 0.4006, + "step": 5922 + }, + { + "epoch": 0.6526721763085399, + "grad_norm": 6.818568706512451, + "learning_rate": 2.741895310312965e-06, + "loss": 0.4139, + "step": 5923 + }, + { + "epoch": 0.6527823691460055, + "grad_norm": 6.1837358474731445, + "learning_rate": 2.7403354750612145e-06, + "loss": 0.3791, + "step": 5924 + }, + { + "epoch": 0.6528925619834711, + "grad_norm": 7.931142330169678, + "learning_rate": 2.7387759161243116e-06, + "loss": 0.4047, + "step": 5925 + }, + { + "epoch": 0.6530027548209366, + "grad_norm": 7.647500038146973, + "learning_rate": 2.737216633692962e-06, + "loss": 0.4004, + "step": 5926 + }, + { + "epoch": 0.6531129476584022, + "grad_norm": 6.578307151794434, + "learning_rate": 2.735657627957837e-06, + "loss": 0.2985, + "step": 5927 + }, + { + "epoch": 0.6532231404958677, + "grad_norm": 4.906883239746094, + "learning_rate": 2.734098899109572e-06, + "loss": 0.3725, + "step": 5928 + }, + { + "epoch": 0.6533333333333333, + "grad_norm": 7.8243913650512695, + "learning_rate": 2.732540447338771e-06, + "loss": 0.4206, + "step": 5929 + }, + { + "epoch": 0.6534435261707989, + "grad_norm": 8.364114761352539, + "learning_rate": 2.7309822728360057e-06, + "loss": 0.4373, + "step": 5930 + }, + { + "epoch": 0.6535537190082644, + "grad_norm": 7.875283718109131, + "learning_rate": 2.7294243757918094e-06, + "loss": 0.3778, + "step": 5931 + }, + { + "epoch": 0.65366391184573, + "grad_norm": 12.430623054504395, + "learning_rate": 2.7278667563966836e-06, + "loss": 0.4091, + "step": 5932 + }, + { + "epoch": 0.6537741046831956, + "grad_norm": 7.027017593383789, + "learning_rate": 2.7263094148410996e-06, + "loss": 0.4117, + "step": 5933 + }, + { + "epoch": 
0.6538842975206611, + "grad_norm": 7.087715148925781, + "learning_rate": 2.7247523513154874e-06, + "loss": 0.3524, + "step": 5934 + }, + { + "epoch": 0.6539944903581267, + "grad_norm": 7.618420124053955, + "learning_rate": 2.723195566010248e-06, + "loss": 0.432, + "step": 5935 + }, + { + "epoch": 0.6541046831955923, + "grad_norm": 4.888277530670166, + "learning_rate": 2.7216390591157494e-06, + "loss": 0.3826, + "step": 5936 + }, + { + "epoch": 0.6542148760330578, + "grad_norm": 8.381621360778809, + "learning_rate": 2.7200828308223214e-06, + "loss": 0.4517, + "step": 5937 + }, + { + "epoch": 0.6543250688705234, + "grad_norm": 7.997573375701904, + "learning_rate": 2.718526881320258e-06, + "loss": 0.4573, + "step": 5938 + }, + { + "epoch": 0.6544352617079889, + "grad_norm": 11.966435432434082, + "learning_rate": 2.7169712107998303e-06, + "loss": 0.4254, + "step": 5939 + }, + { + "epoch": 0.6545454545454545, + "grad_norm": 9.864051818847656, + "learning_rate": 2.7154158194512625e-06, + "loss": 0.4535, + "step": 5940 + }, + { + "epoch": 0.6546556473829201, + "grad_norm": 10.945318222045898, + "learning_rate": 2.7138607074647516e-06, + "loss": 0.3582, + "step": 5941 + }, + { + "epoch": 0.6547658402203856, + "grad_norm": 4.697882175445557, + "learning_rate": 2.712305875030461e-06, + "loss": 0.4087, + "step": 5942 + }, + { + "epoch": 0.6548760330578512, + "grad_norm": 8.20838451385498, + "learning_rate": 2.710751322338513e-06, + "loss": 0.4489, + "step": 5943 + }, + { + "epoch": 0.6549862258953169, + "grad_norm": 4.127697467803955, + "learning_rate": 2.709197049579005e-06, + "loss": 0.3989, + "step": 5944 + }, + { + "epoch": 0.6550964187327823, + "grad_norm": 4.515989780426025, + "learning_rate": 2.707643056941992e-06, + "loss": 0.3887, + "step": 5945 + }, + { + "epoch": 0.655206611570248, + "grad_norm": 5.887902736663818, + "learning_rate": 2.7060893446174994e-06, + "loss": 0.4113, + "step": 5946 + }, + { + "epoch": 0.6553168044077134, + "grad_norm": 6.794983863830566, + "learning_rate": 2.7045359127955197e-06, + "loss": 0.3861, + "step": 5947 + }, + { + "epoch": 0.655426997245179, + "grad_norm": 5.16549015045166, + "learning_rate": 2.702982761666005e-06, + "loss": 0.3808, + "step": 5948 + }, + { + "epoch": 0.6555371900826447, + "grad_norm": 6.333678722381592, + "learning_rate": 2.701429891418878e-06, + "loss": 0.3983, + "step": 5949 + }, + { + "epoch": 0.6556473829201102, + "grad_norm": 7.293991565704346, + "learning_rate": 2.6998773022440283e-06, + "loss": 0.3974, + "step": 5950 + }, + { + "epoch": 0.6557575757575758, + "grad_norm": 8.225202560424805, + "learning_rate": 2.698324994331305e-06, + "loss": 0.4225, + "step": 5951 + }, + { + "epoch": 0.6558677685950414, + "grad_norm": 5.183380603790283, + "learning_rate": 2.696772967870527e-06, + "loss": 0.3583, + "step": 5952 + }, + { + "epoch": 0.6559779614325069, + "grad_norm": 8.09007453918457, + "learning_rate": 2.695221223051482e-06, + "loss": 0.4006, + "step": 5953 + }, + { + "epoch": 0.6560881542699725, + "grad_norm": 5.152414321899414, + "learning_rate": 2.693669760063914e-06, + "loss": 0.4265, + "step": 5954 + }, + { + "epoch": 0.656198347107438, + "grad_norm": 15.005812644958496, + "learning_rate": 2.692118579097541e-06, + "loss": 0.4978, + "step": 5955 + }, + { + "epoch": 0.6563085399449036, + "grad_norm": 7.070697784423828, + "learning_rate": 2.6905676803420444e-06, + "loss": 0.3571, + "step": 5956 + }, + { + "epoch": 0.6564187327823692, + "grad_norm": 13.157657623291016, + "learning_rate": 2.6890170639870676e-06, + "loss": 0.4747, + 
"step": 5957 + }, + { + "epoch": 0.6565289256198347, + "grad_norm": 5.683324813842773, + "learning_rate": 2.6874667302222237e-06, + "loss": 0.3628, + "step": 5958 + }, + { + "epoch": 0.6566391184573003, + "grad_norm": 5.332111358642578, + "learning_rate": 2.6859166792370905e-06, + "loss": 0.3987, + "step": 5959 + }, + { + "epoch": 0.6567493112947659, + "grad_norm": 6.43701171875, + "learning_rate": 2.6843669112212073e-06, + "loss": 0.4387, + "step": 5960 + }, + { + "epoch": 0.6568595041322314, + "grad_norm": 7.0052714347839355, + "learning_rate": 2.682817426364084e-06, + "loss": 0.3602, + "step": 5961 + }, + { + "epoch": 0.656969696969697, + "grad_norm": 7.186418533325195, + "learning_rate": 2.6812682248551945e-06, + "loss": 0.4391, + "step": 5962 + }, + { + "epoch": 0.6570798898071626, + "grad_norm": 4.473569393157959, + "learning_rate": 2.6797193068839753e-06, + "loss": 0.3754, + "step": 5963 + }, + { + "epoch": 0.6571900826446281, + "grad_norm": 4.460716247558594, + "learning_rate": 2.6781706726398304e-06, + "loss": 0.3265, + "step": 5964 + }, + { + "epoch": 0.6573002754820937, + "grad_norm": 6.775029182434082, + "learning_rate": 2.676622322312132e-06, + "loss": 0.3858, + "step": 5965 + }, + { + "epoch": 0.6574104683195592, + "grad_norm": 4.9734673500061035, + "learning_rate": 2.67507425609021e-06, + "loss": 0.3462, + "step": 5966 + }, + { + "epoch": 0.6575206611570248, + "grad_norm": 5.981369495391846, + "learning_rate": 2.6735264741633656e-06, + "loss": 0.4071, + "step": 5967 + }, + { + "epoch": 0.6576308539944904, + "grad_norm": 5.739418029785156, + "learning_rate": 2.6719789767208635e-06, + "loss": 0.3575, + "step": 5968 + }, + { + "epoch": 0.6577410468319559, + "grad_norm": 5.659451007843018, + "learning_rate": 2.670431763951938e-06, + "loss": 0.3853, + "step": 5969 + }, + { + "epoch": 0.6578512396694215, + "grad_norm": 5.818564414978027, + "learning_rate": 2.6688848360457796e-06, + "loss": 0.3174, + "step": 5970 + }, + { + "epoch": 0.6579614325068871, + "grad_norm": 5.7039361000061035, + "learning_rate": 2.6673381931915466e-06, + "loss": 0.3567, + "step": 5971 + }, + { + "epoch": 0.6580716253443526, + "grad_norm": 6.77376127243042, + "learning_rate": 2.665791835578372e-06, + "loss": 0.3836, + "step": 5972 + }, + { + "epoch": 0.6581818181818182, + "grad_norm": 11.987671852111816, + "learning_rate": 2.6642457633953424e-06, + "loss": 0.4798, + "step": 5973 + }, + { + "epoch": 0.6582920110192837, + "grad_norm": 4.910957336425781, + "learning_rate": 2.6626999768315092e-06, + "loss": 0.3534, + "step": 5974 + }, + { + "epoch": 0.6584022038567493, + "grad_norm": 5.625916481018066, + "learning_rate": 2.6611544760759023e-06, + "loss": 0.3831, + "step": 5975 + }, + { + "epoch": 0.6585123966942149, + "grad_norm": 5.2465739250183105, + "learning_rate": 2.659609261317503e-06, + "loss": 0.4074, + "step": 5976 + }, + { + "epoch": 0.6586225895316804, + "grad_norm": 5.320181369781494, + "learning_rate": 2.65806433274526e-06, + "loss": 0.4268, + "step": 5977 + }, + { + "epoch": 0.658732782369146, + "grad_norm": 4.958621025085449, + "learning_rate": 2.6565196905480917e-06, + "loss": 0.3769, + "step": 5978 + }, + { + "epoch": 0.6588429752066116, + "grad_norm": 4.646249771118164, + "learning_rate": 2.6549753349148812e-06, + "loss": 0.416, + "step": 5979 + }, + { + "epoch": 0.6589531680440771, + "grad_norm": 7.4992876052856445, + "learning_rate": 2.6534312660344696e-06, + "loss": 0.4483, + "step": 5980 + }, + { + "epoch": 0.6590633608815427, + "grad_norm": 5.924458980560303, + "learning_rate": 
2.651887484095671e-06, + "loss": 0.3781, + "step": 5981 + }, + { + "epoch": 0.6591735537190082, + "grad_norm": 4.546374320983887, + "learning_rate": 2.6503439892872594e-06, + "loss": 0.4083, + "step": 5982 + }, + { + "epoch": 0.6592837465564738, + "grad_norm": 9.89686107635498, + "learning_rate": 2.6488007817979793e-06, + "loss": 0.3721, + "step": 5983 + }, + { + "epoch": 0.6593939393939394, + "grad_norm": 7.717336177825928, + "learning_rate": 2.6472578618165313e-06, + "loss": 0.4094, + "step": 5984 + }, + { + "epoch": 0.6595041322314049, + "grad_norm": 6.707863807678223, + "learning_rate": 2.645715229531588e-06, + "loss": 0.4327, + "step": 5985 + }, + { + "epoch": 0.6596143250688705, + "grad_norm": 6.7714314460754395, + "learning_rate": 2.644172885131786e-06, + "loss": 0.3257, + "step": 5986 + }, + { + "epoch": 0.6597245179063361, + "grad_norm": 11.701902389526367, + "learning_rate": 2.6426308288057222e-06, + "loss": 0.5138, + "step": 5987 + }, + { + "epoch": 0.6598347107438016, + "grad_norm": 12.116415977478027, + "learning_rate": 2.6410890607419625e-06, + "loss": 0.4811, + "step": 5988 + }, + { + "epoch": 0.6599449035812672, + "grad_norm": 5.822449207305908, + "learning_rate": 2.6395475811290383e-06, + "loss": 0.4029, + "step": 5989 + }, + { + "epoch": 0.6600550964187328, + "grad_norm": 5.782731533050537, + "learning_rate": 2.638006390155441e-06, + "loss": 0.4049, + "step": 5990 + }, + { + "epoch": 0.6601652892561983, + "grad_norm": 7.295383930206299, + "learning_rate": 2.6364654880096306e-06, + "loss": 0.3913, + "step": 5991 + }, + { + "epoch": 0.6602754820936639, + "grad_norm": 6.990212440490723, + "learning_rate": 2.6349248748800327e-06, + "loss": 0.4191, + "step": 5992 + }, + { + "epoch": 0.6603856749311294, + "grad_norm": 7.561840534210205, + "learning_rate": 2.6333845509550315e-06, + "loss": 0.3883, + "step": 5993 + }, + { + "epoch": 0.660495867768595, + "grad_norm": 8.130766868591309, + "learning_rate": 2.631844516422983e-06, + "loss": 0.4768, + "step": 5994 + }, + { + "epoch": 0.6606060606060606, + "grad_norm": 5.095539569854736, + "learning_rate": 2.6303047714722053e-06, + "loss": 0.3954, + "step": 5995 + }, + { + "epoch": 0.6607162534435261, + "grad_norm": 5.467076301574707, + "learning_rate": 2.6287653162909767e-06, + "loss": 0.3641, + "step": 5996 + }, + { + "epoch": 0.6608264462809917, + "grad_norm": 8.240248680114746, + "learning_rate": 2.6272261510675468e-06, + "loss": 0.3504, + "step": 5997 + }, + { + "epoch": 0.6609366391184573, + "grad_norm": 4.505929946899414, + "learning_rate": 2.6256872759901275e-06, + "loss": 0.3668, + "step": 5998 + }, + { + "epoch": 0.6610468319559228, + "grad_norm": 8.998342514038086, + "learning_rate": 2.6241486912468916e-06, + "loss": 0.3551, + "step": 5999 + }, + { + "epoch": 0.6611570247933884, + "grad_norm": 9.616366386413574, + "learning_rate": 2.622610397025982e-06, + "loss": 0.461, + "step": 6000 + }, + { + "epoch": 0.6612672176308539, + "grad_norm": 6.253893852233887, + "learning_rate": 2.621072393515503e-06, + "loss": 0.42, + "step": 6001 + }, + { + "epoch": 0.6613774104683195, + "grad_norm": 6.103658199310303, + "learning_rate": 2.6195346809035217e-06, + "loss": 0.4098, + "step": 6002 + }, + { + "epoch": 0.6614876033057852, + "grad_norm": 4.5206499099731445, + "learning_rate": 2.6179972593780758e-06, + "loss": 0.4353, + "step": 6003 + }, + { + "epoch": 0.6615977961432506, + "grad_norm": 4.7155022621154785, + "learning_rate": 2.6164601291271574e-06, + "loss": 0.308, + "step": 6004 + }, + { + "epoch": 0.6617079889807163, + "grad_norm": 
4.543933391571045, + "learning_rate": 2.6149232903387333e-06, + "loss": 0.4354, + "step": 6005 + }, + { + "epoch": 0.6618181818181819, + "grad_norm": 8.870038986206055, + "learning_rate": 2.6133867432007304e-06, + "loss": 0.4134, + "step": 6006 + }, + { + "epoch": 0.6619283746556474, + "grad_norm": 7.9077301025390625, + "learning_rate": 2.6118504879010364e-06, + "loss": 0.3229, + "step": 6007 + }, + { + "epoch": 0.662038567493113, + "grad_norm": 5.109295845031738, + "learning_rate": 2.6103145246275095e-06, + "loss": 0.3921, + "step": 6008 + }, + { + "epoch": 0.6621487603305786, + "grad_norm": 7.649944305419922, + "learning_rate": 2.6087788535679696e-06, + "loss": 0.3764, + "step": 6009 + }, + { + "epoch": 0.6622589531680441, + "grad_norm": 6.6462812423706055, + "learning_rate": 2.607243474910198e-06, + "loss": 0.3619, + "step": 6010 + }, + { + "epoch": 0.6623691460055097, + "grad_norm": 4.974469184875488, + "learning_rate": 2.605708388841945e-06, + "loss": 0.3555, + "step": 6011 + }, + { + "epoch": 0.6624793388429752, + "grad_norm": 5.443259239196777, + "learning_rate": 2.604173595550924e-06, + "loss": 0.397, + "step": 6012 + }, + { + "epoch": 0.6625895316804408, + "grad_norm": 7.249265193939209, + "learning_rate": 2.6026390952248084e-06, + "loss": 0.4894, + "step": 6013 + }, + { + "epoch": 0.6626997245179064, + "grad_norm": 6.921874523162842, + "learning_rate": 2.6011048880512407e-06, + "loss": 0.3643, + "step": 6014 + }, + { + "epoch": 0.6628099173553719, + "grad_norm": 7.4875054359436035, + "learning_rate": 2.5995709742178277e-06, + "loss": 0.3984, + "step": 6015 + }, + { + "epoch": 0.6629201101928375, + "grad_norm": 4.925077438354492, + "learning_rate": 2.598037353912135e-06, + "loss": 0.3666, + "step": 6016 + }, + { + "epoch": 0.6630303030303031, + "grad_norm": 4.4653639793396, + "learning_rate": 2.5965040273216967e-06, + "loss": 0.3581, + "step": 6017 + }, + { + "epoch": 0.6631404958677686, + "grad_norm": 10.477150917053223, + "learning_rate": 2.5949709946340136e-06, + "loss": 0.4583, + "step": 6018 + }, + { + "epoch": 0.6632506887052342, + "grad_norm": 5.215625286102295, + "learning_rate": 2.5934382560365417e-06, + "loss": 0.3667, + "step": 6019 + }, + { + "epoch": 0.6633608815426997, + "grad_norm": 3.861738443374634, + "learning_rate": 2.591905811716709e-06, + "loss": 0.4071, + "step": 6020 + }, + { + "epoch": 0.6634710743801653, + "grad_norm": 6.306840896606445, + "learning_rate": 2.5903736618619067e-06, + "loss": 0.3782, + "step": 6021 + }, + { + "epoch": 0.6635812672176309, + "grad_norm": 4.3274078369140625, + "learning_rate": 2.5888418066594845e-06, + "loss": 0.375, + "step": 6022 + }, + { + "epoch": 0.6636914600550964, + "grad_norm": 6.996295928955078, + "learning_rate": 2.5873102462967604e-06, + "loss": 0.3847, + "step": 6023 + }, + { + "epoch": 0.663801652892562, + "grad_norm": 10.424880027770996, + "learning_rate": 2.585778980961018e-06, + "loss": 0.4013, + "step": 6024 + }, + { + "epoch": 0.6639118457300276, + "grad_norm": 6.157477855682373, + "learning_rate": 2.584248010839502e-06, + "loss": 0.4322, + "step": 6025 + }, + { + "epoch": 0.6640220385674931, + "grad_norm": 9.453383445739746, + "learning_rate": 2.582717336119419e-06, + "loss": 0.4269, + "step": 6026 + }, + { + "epoch": 0.6641322314049587, + "grad_norm": 15.271723747253418, + "learning_rate": 2.5811869569879446e-06, + "loss": 0.3975, + "step": 6027 + }, + { + "epoch": 0.6642424242424242, + "grad_norm": 7.037370681762695, + "learning_rate": 2.579656873632216e-06, + "loss": 0.4087, + "step": 6028 + }, + { + 
"epoch": 0.6643526170798898, + "grad_norm": 9.355405807495117, + "learning_rate": 2.5781270862393327e-06, + "loss": 0.4038, + "step": 6029 + }, + { + "epoch": 0.6644628099173554, + "grad_norm": 8.743352890014648, + "learning_rate": 2.576597594996355e-06, + "loss": 0.3952, + "step": 6030 + }, + { + "epoch": 0.6645730027548209, + "grad_norm": 4.763707637786865, + "learning_rate": 2.5750684000903194e-06, + "loss": 0.4084, + "step": 6031 + }, + { + "epoch": 0.6646831955922865, + "grad_norm": 12.654218673706055, + "learning_rate": 2.5735395017082136e-06, + "loss": 0.3919, + "step": 6032 + }, + { + "epoch": 0.6647933884297521, + "grad_norm": 8.429920196533203, + "learning_rate": 2.5720109000369898e-06, + "loss": 0.3137, + "step": 6033 + }, + { + "epoch": 0.6649035812672176, + "grad_norm": 6.514552116394043, + "learning_rate": 2.5704825952635753e-06, + "loss": 0.3254, + "step": 6034 + }, + { + "epoch": 0.6650137741046832, + "grad_norm": 5.537646293640137, + "learning_rate": 2.568954587574849e-06, + "loss": 0.305, + "step": 6035 + }, + { + "epoch": 0.6651239669421488, + "grad_norm": 8.543807029724121, + "learning_rate": 2.567426877157656e-06, + "loss": 0.4117, + "step": 6036 + }, + { + "epoch": 0.6652341597796143, + "grad_norm": 10.421896934509277, + "learning_rate": 2.565899464198809e-06, + "loss": 0.5015, + "step": 6037 + }, + { + "epoch": 0.6653443526170799, + "grad_norm": 5.44912052154541, + "learning_rate": 2.5643723488850813e-06, + "loss": 0.3334, + "step": 6038 + }, + { + "epoch": 0.6654545454545454, + "grad_norm": 6.181161880493164, + "learning_rate": 2.5628455314032143e-06, + "loss": 0.4497, + "step": 6039 + }, + { + "epoch": 0.665564738292011, + "grad_norm": 9.51405143737793, + "learning_rate": 2.5613190119399033e-06, + "loss": 0.4369, + "step": 6040 + }, + { + "epoch": 0.6656749311294766, + "grad_norm": 5.872657299041748, + "learning_rate": 2.5597927906818166e-06, + "loss": 0.358, + "step": 6041 + }, + { + "epoch": 0.6657851239669421, + "grad_norm": 11.218254089355469, + "learning_rate": 2.5582668678155842e-06, + "loss": 0.4721, + "step": 6042 + }, + { + "epoch": 0.6658953168044077, + "grad_norm": 8.075851440429688, + "learning_rate": 2.5567412435277937e-06, + "loss": 0.2994, + "step": 6043 + }, + { + "epoch": 0.6660055096418733, + "grad_norm": 5.535632133483887, + "learning_rate": 2.555215918005003e-06, + "loss": 0.3541, + "step": 6044 + }, + { + "epoch": 0.6661157024793388, + "grad_norm": 5.90950345993042, + "learning_rate": 2.553690891433733e-06, + "loss": 0.4333, + "step": 6045 + }, + { + "epoch": 0.6662258953168044, + "grad_norm": 5.168428421020508, + "learning_rate": 2.552166164000461e-06, + "loss": 0.481, + "step": 6046 + }, + { + "epoch": 0.6663360881542699, + "grad_norm": 10.744282722473145, + "learning_rate": 2.5506417358916365e-06, + "loss": 0.4343, + "step": 6047 + }, + { + "epoch": 0.6664462809917355, + "grad_norm": 5.804863452911377, + "learning_rate": 2.5491176072936683e-06, + "loss": 0.4045, + "step": 6048 + }, + { + "epoch": 0.6665564738292011, + "grad_norm": 7.872255802154541, + "learning_rate": 2.5475937783929276e-06, + "loss": 0.4052, + "step": 6049 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 6.6324687004089355, + "learning_rate": 2.5460702493757506e-06, + "loss": 0.3889, + "step": 6050 + }, + { + "epoch": 0.6667768595041322, + "grad_norm": 14.796236038208008, + "learning_rate": 2.5445470204284384e-06, + "loss": 0.4138, + "step": 6051 + }, + { + "epoch": 0.6668870523415978, + "grad_norm": 8.73970890045166, + "learning_rate": 2.5430240917372506e-06, + 
"loss": 0.4081, + "step": 6052 + }, + { + "epoch": 0.6669972451790633, + "grad_norm": 8.211316108703613, + "learning_rate": 2.541501463488414e-06, + "loss": 0.4085, + "step": 6053 + }, + { + "epoch": 0.6671074380165289, + "grad_norm": 5.342787265777588, + "learning_rate": 2.5399791358681203e-06, + "loss": 0.3759, + "step": 6054 + }, + { + "epoch": 0.6672176308539944, + "grad_norm": 7.133078575134277, + "learning_rate": 2.5384571090625166e-06, + "loss": 0.424, + "step": 6055 + }, + { + "epoch": 0.66732782369146, + "grad_norm": 6.504353046417236, + "learning_rate": 2.5369353832577224e-06, + "loss": 0.391, + "step": 6056 + }, + { + "epoch": 0.6674380165289256, + "grad_norm": 10.371742248535156, + "learning_rate": 2.5354139586398164e-06, + "loss": 0.4737, + "step": 6057 + }, + { + "epoch": 0.6675482093663911, + "grad_norm": 6.145484447479248, + "learning_rate": 2.5338928353948376e-06, + "loss": 0.3838, + "step": 6058 + }, + { + "epoch": 0.6676584022038567, + "grad_norm": 3.5295302867889404, + "learning_rate": 2.532372013708793e-06, + "loss": 0.3551, + "step": 6059 + }, + { + "epoch": 0.6677685950413224, + "grad_norm": 7.3660478591918945, + "learning_rate": 2.530851493767652e-06, + "loss": 0.415, + "step": 6060 + }, + { + "epoch": 0.6678787878787878, + "grad_norm": 7.272532939910889, + "learning_rate": 2.529331275757343e-06, + "loss": 0.2895, + "step": 6061 + }, + { + "epoch": 0.6679889807162535, + "grad_norm": 4.882059574127197, + "learning_rate": 2.527811359863763e-06, + "loss": 0.3936, + "step": 6062 + }, + { + "epoch": 0.6680991735537191, + "grad_norm": 7.911858558654785, + "learning_rate": 2.5262917462727664e-06, + "loss": 0.424, + "step": 6063 + }, + { + "epoch": 0.6682093663911846, + "grad_norm": 10.215099334716797, + "learning_rate": 2.5247724351701757e-06, + "loss": 0.4689, + "step": 6064 + }, + { + "epoch": 0.6683195592286502, + "grad_norm": 5.460385799407959, + "learning_rate": 2.523253426741775e-06, + "loss": 0.3514, + "step": 6065 + }, + { + "epoch": 0.6684297520661157, + "grad_norm": 6.721065044403076, + "learning_rate": 2.5217347211733067e-06, + "loss": 0.4132, + "step": 6066 + }, + { + "epoch": 0.6685399449035813, + "grad_norm": 8.900752067565918, + "learning_rate": 2.5202163186504867e-06, + "loss": 0.4125, + "step": 6067 + }, + { + "epoch": 0.6686501377410469, + "grad_norm": 5.635085105895996, + "learning_rate": 2.5186982193589833e-06, + "loss": 0.3659, + "step": 6068 + }, + { + "epoch": 0.6687603305785124, + "grad_norm": 11.600287437438965, + "learning_rate": 2.51718042348443e-06, + "loss": 0.3947, + "step": 6069 + }, + { + "epoch": 0.668870523415978, + "grad_norm": 4.45723819732666, + "learning_rate": 2.515662931212428e-06, + "loss": 0.3916, + "step": 6070 + }, + { + "epoch": 0.6689807162534436, + "grad_norm": 5.884436130523682, + "learning_rate": 2.514145742728539e-06, + "loss": 0.4151, + "step": 6071 + }, + { + "epoch": 0.6690909090909091, + "grad_norm": 4.856365203857422, + "learning_rate": 2.5126288582182827e-06, + "loss": 0.3855, + "step": 6072 + }, + { + "epoch": 0.6692011019283747, + "grad_norm": 8.981498718261719, + "learning_rate": 2.5111122778671495e-06, + "loss": 0.4045, + "step": 6073 + }, + { + "epoch": 0.6693112947658402, + "grad_norm": 9.999101638793945, + "learning_rate": 2.5095960018605887e-06, + "loss": 0.4695, + "step": 6074 + }, + { + "epoch": 0.6694214876033058, + "grad_norm": 6.24324893951416, + "learning_rate": 2.5080800303840104e-06, + "loss": 0.3786, + "step": 6075 + }, + { + "epoch": 0.6695316804407714, + "grad_norm": 5.593008518218994, + 
"learning_rate": 2.5065643636227897e-06, + "loss": 0.2895, + "step": 6076 + }, + { + "epoch": 0.6696418732782369, + "grad_norm": 5.002102375030518, + "learning_rate": 2.5050490017622686e-06, + "loss": 0.3435, + "step": 6077 + }, + { + "epoch": 0.6697520661157025, + "grad_norm": 5.277883529663086, + "learning_rate": 2.5035339449877426e-06, + "loss": 0.3893, + "step": 6078 + }, + { + "epoch": 0.6698622589531681, + "grad_norm": 4.154916763305664, + "learning_rate": 2.5020191934844774e-06, + "loss": 0.3005, + "step": 6079 + }, + { + "epoch": 0.6699724517906336, + "grad_norm": 7.4428486824035645, + "learning_rate": 2.5005047474376975e-06, + "loss": 0.4769, + "step": 6080 + }, + { + "epoch": 0.6700826446280992, + "grad_norm": 13.213733673095703, + "learning_rate": 2.4989906070325947e-06, + "loss": 0.4506, + "step": 6081 + }, + { + "epoch": 0.6701928374655647, + "grad_norm": 4.562686443328857, + "learning_rate": 2.4974767724543157e-06, + "loss": 0.418, + "step": 6082 + }, + { + "epoch": 0.6703030303030303, + "grad_norm": 12.098550796508789, + "learning_rate": 2.4959632438879765e-06, + "loss": 0.4256, + "step": 6083 + }, + { + "epoch": 0.6704132231404959, + "grad_norm": 10.366657257080078, + "learning_rate": 2.494450021518655e-06, + "loss": 0.3768, + "step": 6084 + }, + { + "epoch": 0.6705234159779614, + "grad_norm": 9.916677474975586, + "learning_rate": 2.4929371055313884e-06, + "loss": 0.5525, + "step": 6085 + }, + { + "epoch": 0.670633608815427, + "grad_norm": 6.2902374267578125, + "learning_rate": 2.4914244961111742e-06, + "loss": 0.4118, + "step": 6086 + }, + { + "epoch": 0.6707438016528926, + "grad_norm": 4.70620584487915, + "learning_rate": 2.4899121934429836e-06, + "loss": 0.3298, + "step": 6087 + }, + { + "epoch": 0.6708539944903581, + "grad_norm": 4.512409687042236, + "learning_rate": 2.4884001977117406e-06, + "loss": 0.3745, + "step": 6088 + }, + { + "epoch": 0.6709641873278237, + "grad_norm": 4.995675563812256, + "learning_rate": 2.4868885091023284e-06, + "loss": 0.3612, + "step": 6089 + }, + { + "epoch": 0.6710743801652893, + "grad_norm": 7.682406902313232, + "learning_rate": 2.485377127799607e-06, + "loss": 0.3819, + "step": 6090 + }, + { + "epoch": 0.6711845730027548, + "grad_norm": 9.726338386535645, + "learning_rate": 2.4838660539883863e-06, + "loss": 0.4186, + "step": 6091 + }, + { + "epoch": 0.6712947658402204, + "grad_norm": 7.077490329742432, + "learning_rate": 2.4823552878534385e-06, + "loss": 0.4742, + "step": 6092 + }, + { + "epoch": 0.6714049586776859, + "grad_norm": 6.823843002319336, + "learning_rate": 2.48084482957951e-06, + "loss": 0.4916, + "step": 6093 + }, + { + "epoch": 0.6715151515151515, + "grad_norm": 6.0614728927612305, + "learning_rate": 2.4793346793512957e-06, + "loss": 0.3698, + "step": 6094 + }, + { + "epoch": 0.6716253443526171, + "grad_norm": 8.090015411376953, + "learning_rate": 2.4778248373534626e-06, + "loss": 0.4433, + "step": 6095 + }, + { + "epoch": 0.6717355371900826, + "grad_norm": 9.832079887390137, + "learning_rate": 2.4763153037706323e-06, + "loss": 0.4631, + "step": 6096 + }, + { + "epoch": 0.6718457300275482, + "grad_norm": 6.708822250366211, + "learning_rate": 2.4748060787873953e-06, + "loss": 0.3908, + "step": 6097 + }, + { + "epoch": 0.6719559228650138, + "grad_norm": 7.857901573181152, + "learning_rate": 2.4732971625883023e-06, + "loss": 0.3987, + "step": 6098 + }, + { + "epoch": 0.6720661157024793, + "grad_norm": 7.758438587188721, + "learning_rate": 2.471788555357863e-06, + "loss": 0.3449, + "step": 6099 + }, + { + "epoch": 
0.6721763085399449, + "grad_norm": 7.877081871032715, + "learning_rate": 2.4702802572805536e-06, + "loss": 0.4052, + "step": 6100 + }, + { + "epoch": 0.6722865013774104, + "grad_norm": 7.760099411010742, + "learning_rate": 2.468772268540812e-06, + "loss": 0.3716, + "step": 6101 + }, + { + "epoch": 0.672396694214876, + "grad_norm": 5.632152557373047, + "learning_rate": 2.467264589323034e-06, + "loss": 0.3336, + "step": 6102 + }, + { + "epoch": 0.6725068870523416, + "grad_norm": 5.384594917297363, + "learning_rate": 2.4657572198115826e-06, + "loss": 0.343, + "step": 6103 + }, + { + "epoch": 0.6726170798898071, + "grad_norm": 6.8102827072143555, + "learning_rate": 2.4642501601907826e-06, + "loss": 0.4512, + "step": 6104 + }, + { + "epoch": 0.6727272727272727, + "grad_norm": 8.789267539978027, + "learning_rate": 2.4627434106449155e-06, + "loss": 0.3556, + "step": 6105 + }, + { + "epoch": 0.6728374655647383, + "grad_norm": 4.4546098709106445, + "learning_rate": 2.461236971358231e-06, + "loss": 0.3245, + "step": 6106 + }, + { + "epoch": 0.6729476584022038, + "grad_norm": 5.878533363342285, + "learning_rate": 2.4597308425149395e-06, + "loss": 0.3935, + "step": 6107 + }, + { + "epoch": 0.6730578512396694, + "grad_norm": 6.608068466186523, + "learning_rate": 2.45822502429921e-06, + "loss": 0.4419, + "step": 6108 + }, + { + "epoch": 0.673168044077135, + "grad_norm": 5.274361610412598, + "learning_rate": 2.456719516895177e-06, + "loss": 0.3739, + "step": 6109 + }, + { + "epoch": 0.6732782369146005, + "grad_norm": 6.25014591217041, + "learning_rate": 2.4552143204869377e-06, + "loss": 0.4178, + "step": 6110 + }, + { + "epoch": 0.6733884297520661, + "grad_norm": 6.15048885345459, + "learning_rate": 2.4537094352585466e-06, + "loss": 0.3752, + "step": 6111 + }, + { + "epoch": 0.6734986225895316, + "grad_norm": 5.450191497802734, + "learning_rate": 2.4522048613940242e-06, + "loss": 0.3514, + "step": 6112 + }, + { + "epoch": 0.6736088154269972, + "grad_norm": 6.499011993408203, + "learning_rate": 2.4507005990773543e-06, + "loss": 0.4243, + "step": 6113 + }, + { + "epoch": 0.6737190082644628, + "grad_norm": 7.088918209075928, + "learning_rate": 2.4491966484924763e-06, + "loss": 0.413, + "step": 6114 + }, + { + "epoch": 0.6738292011019283, + "grad_norm": 9.151850700378418, + "learning_rate": 2.4476930098232964e-06, + "loss": 0.4335, + "step": 6115 + }, + { + "epoch": 0.673939393939394, + "grad_norm": 8.162264823913574, + "learning_rate": 2.4461896832536846e-06, + "loss": 0.3641, + "step": 6116 + }, + { + "epoch": 0.6740495867768596, + "grad_norm": 8.007330894470215, + "learning_rate": 2.4446866689674654e-06, + "loss": 0.5077, + "step": 6117 + }, + { + "epoch": 0.674159779614325, + "grad_norm": 5.151622772216797, + "learning_rate": 2.443183967148433e-06, + "loss": 0.4177, + "step": 6118 + }, + { + "epoch": 0.6742699724517907, + "grad_norm": 6.72404146194458, + "learning_rate": 2.4416815779803367e-06, + "loss": 0.3665, + "step": 6119 + }, + { + "epoch": 0.6743801652892562, + "grad_norm": 5.265436172485352, + "learning_rate": 2.440179501646892e-06, + "loss": 0.4831, + "step": 6120 + }, + { + "epoch": 0.6744903581267218, + "grad_norm": 6.513745307922363, + "learning_rate": 2.4386777383317773e-06, + "loss": 0.3932, + "step": 6121 + }, + { + "epoch": 0.6746005509641874, + "grad_norm": 6.415255069732666, + "learning_rate": 2.4371762882186235e-06, + "loss": 0.4229, + "step": 6122 + }, + { + "epoch": 0.6747107438016529, + "grad_norm": 4.8438286781311035, + "learning_rate": 2.4356751514910385e-06, + "loss": 0.3149, + 
"step": 6123 + }, + { + "epoch": 0.6748209366391185, + "grad_norm": 9.928213119506836, + "learning_rate": 2.434174328332579e-06, + "loss": 0.4716, + "step": 6124 + }, + { + "epoch": 0.6749311294765841, + "grad_norm": 6.0422773361206055, + "learning_rate": 2.4326738189267647e-06, + "loss": 0.3704, + "step": 6125 + }, + { + "epoch": 0.6750413223140496, + "grad_norm": 8.048456192016602, + "learning_rate": 2.431173623457087e-06, + "loss": 0.4681, + "step": 6126 + }, + { + "epoch": 0.6751515151515152, + "grad_norm": 5.316216468811035, + "learning_rate": 2.4296737421069875e-06, + "loss": 0.4271, + "step": 6127 + }, + { + "epoch": 0.6752617079889807, + "grad_norm": 10.065850257873535, + "learning_rate": 2.428174175059873e-06, + "loss": 0.3364, + "step": 6128 + }, + { + "epoch": 0.6753719008264463, + "grad_norm": 11.229357719421387, + "learning_rate": 2.426674922499113e-06, + "loss": 0.4465, + "step": 6129 + }, + { + "epoch": 0.6754820936639119, + "grad_norm": 15.492269515991211, + "learning_rate": 2.425175984608042e-06, + "loss": 0.5522, + "step": 6130 + }, + { + "epoch": 0.6755922865013774, + "grad_norm": 7.198604106903076, + "learning_rate": 2.4236773615699466e-06, + "loss": 0.379, + "step": 6131 + }, + { + "epoch": 0.675702479338843, + "grad_norm": 8.281965255737305, + "learning_rate": 2.422179053568083e-06, + "loss": 0.3656, + "step": 6132 + }, + { + "epoch": 0.6758126721763086, + "grad_norm": 8.31258773803711, + "learning_rate": 2.420681060785668e-06, + "loss": 0.4397, + "step": 6133 + }, + { + "epoch": 0.6759228650137741, + "grad_norm": 7.666805744171143, + "learning_rate": 2.4191833834058753e-06, + "loss": 0.4572, + "step": 6134 + }, + { + "epoch": 0.6760330578512397, + "grad_norm": 7.042750835418701, + "learning_rate": 2.417686021611844e-06, + "loss": 0.3219, + "step": 6135 + }, + { + "epoch": 0.6761432506887053, + "grad_norm": 9.428544998168945, + "learning_rate": 2.416188975586673e-06, + "loss": 0.3839, + "step": 6136 + }, + { + "epoch": 0.6762534435261708, + "grad_norm": 9.478087425231934, + "learning_rate": 2.4146922455134266e-06, + "loss": 0.4228, + "step": 6137 + }, + { + "epoch": 0.6763636363636364, + "grad_norm": 4.778660297393799, + "learning_rate": 2.413195831575122e-06, + "loss": 0.3969, + "step": 6138 + }, + { + "epoch": 0.6764738292011019, + "grad_norm": 6.882425785064697, + "learning_rate": 2.411699733954745e-06, + "loss": 0.4432, + "step": 6139 + }, + { + "epoch": 0.6765840220385675, + "grad_norm": 5.772907257080078, + "learning_rate": 2.4102039528352424e-06, + "loss": 0.4684, + "step": 6140 + }, + { + "epoch": 0.6766942148760331, + "grad_norm": 6.315256118774414, + "learning_rate": 2.408708488399516e-06, + "loss": 0.3635, + "step": 6141 + }, + { + "epoch": 0.6768044077134986, + "grad_norm": 6.439986705780029, + "learning_rate": 2.407213340830436e-06, + "loss": 0.3949, + "step": 6142 + }, + { + "epoch": 0.6769146005509642, + "grad_norm": 7.057758808135986, + "learning_rate": 2.405718510310832e-06, + "loss": 0.475, + "step": 6143 + }, + { + "epoch": 0.6770247933884298, + "grad_norm": 8.165635108947754, + "learning_rate": 2.404223997023493e-06, + "loss": 0.4448, + "step": 6144 + }, + { + "epoch": 0.6771349862258953, + "grad_norm": 6.370461940765381, + "learning_rate": 2.4027298011511656e-06, + "loss": 0.4336, + "step": 6145 + }, + { + "epoch": 0.6772451790633609, + "grad_norm": 6.985379219055176, + "learning_rate": 2.4012359228765703e-06, + "loss": 0.4041, + "step": 6146 + }, + { + "epoch": 0.6773553719008264, + "grad_norm": 7.5502214431762695, + "learning_rate": 
2.3997423623823763e-06, + "loss": 0.4298, + "step": 6147 + }, + { + "epoch": 0.677465564738292, + "grad_norm": 7.182582378387451, + "learning_rate": 2.398249119851215e-06, + "loss": 0.3973, + "step": 6148 + }, + { + "epoch": 0.6775757575757576, + "grad_norm": 5.377007961273193, + "learning_rate": 2.3967561954656882e-06, + "loss": 0.3949, + "step": 6149 + }, + { + "epoch": 0.6776859504132231, + "grad_norm": 4.182321548461914, + "learning_rate": 2.3952635894083488e-06, + "loss": 0.3896, + "step": 6150 + }, + { + "epoch": 0.6777961432506887, + "grad_norm": 8.063091278076172, + "learning_rate": 2.3937713018617178e-06, + "loss": 0.4259, + "step": 6151 + }, + { + "epoch": 0.6779063360881543, + "grad_norm": 7.529224395751953, + "learning_rate": 2.39227933300827e-06, + "loss": 0.4392, + "step": 6152 + }, + { + "epoch": 0.6780165289256198, + "grad_norm": 6.323045253753662, + "learning_rate": 2.390787683030448e-06, + "loss": 0.4528, + "step": 6153 + }, + { + "epoch": 0.6781267217630854, + "grad_norm": 10.965755462646484, + "learning_rate": 2.389296352110654e-06, + "loss": 0.5396, + "step": 6154 + }, + { + "epoch": 0.6782369146005509, + "grad_norm": 7.637385845184326, + "learning_rate": 2.387805340431246e-06, + "loss": 0.4123, + "step": 6155 + }, + { + "epoch": 0.6783471074380165, + "grad_norm": 6.656174659729004, + "learning_rate": 2.38631464817455e-06, + "loss": 0.492, + "step": 6156 + }, + { + "epoch": 0.6784573002754821, + "grad_norm": 5.587393283843994, + "learning_rate": 2.3848242755228507e-06, + "loss": 0.4045, + "step": 6157 + }, + { + "epoch": 0.6785674931129476, + "grad_norm": 5.630966663360596, + "learning_rate": 2.3833342226583893e-06, + "loss": 0.3986, + "step": 6158 + }, + { + "epoch": 0.6786776859504132, + "grad_norm": 5.846369743347168, + "learning_rate": 2.381844489763374e-06, + "loss": 0.4388, + "step": 6159 + }, + { + "epoch": 0.6787878787878788, + "grad_norm": 6.487740516662598, + "learning_rate": 2.3803550770199723e-06, + "loss": 0.3781, + "step": 6160 + }, + { + "epoch": 0.6788980716253443, + "grad_norm": 6.642772674560547, + "learning_rate": 2.378865984610309e-06, + "loss": 0.4177, + "step": 6161 + }, + { + "epoch": 0.6790082644628099, + "grad_norm": 6.779574394226074, + "learning_rate": 2.377377212716473e-06, + "loss": 0.415, + "step": 6162 + }, + { + "epoch": 0.6791184573002755, + "grad_norm": 10.08366584777832, + "learning_rate": 2.3758887615205163e-06, + "loss": 0.4536, + "step": 6163 + }, + { + "epoch": 0.679228650137741, + "grad_norm": 7.583836555480957, + "learning_rate": 2.3744006312044445e-06, + "loss": 0.4071, + "step": 6164 + }, + { + "epoch": 0.6793388429752066, + "grad_norm": 4.750543117523193, + "learning_rate": 2.3729128219502295e-06, + "loss": 0.3305, + "step": 6165 + }, + { + "epoch": 0.6794490358126721, + "grad_norm": 6.158310890197754, + "learning_rate": 2.3714253339398052e-06, + "loss": 0.3796, + "step": 6166 + }, + { + "epoch": 0.6795592286501377, + "grad_norm": 5.552796840667725, + "learning_rate": 2.3699381673550597e-06, + "loss": 0.3541, + "step": 6167 + }, + { + "epoch": 0.6796694214876033, + "grad_norm": 5.88632345199585, + "learning_rate": 2.3684513223778475e-06, + "loss": 0.3827, + "step": 6168 + }, + { + "epoch": 0.6797796143250688, + "grad_norm": 8.556108474731445, + "learning_rate": 2.3669647991899847e-06, + "loss": 0.511, + "step": 6169 + }, + { + "epoch": 0.6798898071625344, + "grad_norm": 5.67225980758667, + "learning_rate": 2.3654785979732407e-06, + "loss": 0.4465, + "step": 6170 + }, + { + "epoch": 0.68, + "grad_norm": 4.995718002319336, + 
"learning_rate": 2.3639927189093528e-06, + "loss": 0.3814, + "step": 6171 + }, + { + "epoch": 0.6801101928374655, + "grad_norm": 7.998754978179932, + "learning_rate": 2.362507162180017e-06, + "loss": 0.4225, + "step": 6172 + }, + { + "epoch": 0.6802203856749311, + "grad_norm": 5.867599010467529, + "learning_rate": 2.361021927966887e-06, + "loss": 0.4606, + "step": 6173 + }, + { + "epoch": 0.6803305785123966, + "grad_norm": 9.48109245300293, + "learning_rate": 2.3595370164515796e-06, + "loss": 0.3874, + "step": 6174 + }, + { + "epoch": 0.6804407713498623, + "grad_norm": 9.793745994567871, + "learning_rate": 2.3580524278156748e-06, + "loss": 0.4474, + "step": 6175 + }, + { + "epoch": 0.6805509641873279, + "grad_norm": 4.540402412414551, + "learning_rate": 2.356568162240706e-06, + "loss": 0.4161, + "step": 6176 + }, + { + "epoch": 0.6806611570247934, + "grad_norm": 6.409126281738281, + "learning_rate": 2.355084219908175e-06, + "loss": 0.4146, + "step": 6177 + }, + { + "epoch": 0.680771349862259, + "grad_norm": 8.314022064208984, + "learning_rate": 2.3536006009995343e-06, + "loss": 0.401, + "step": 6178 + }, + { + "epoch": 0.6808815426997246, + "grad_norm": 4.711172103881836, + "learning_rate": 2.352117305696211e-06, + "loss": 0.3854, + "step": 6179 + }, + { + "epoch": 0.6809917355371901, + "grad_norm": 9.868963241577148, + "learning_rate": 2.35063433417958e-06, + "loss": 0.3401, + "step": 6180 + }, + { + "epoch": 0.6811019283746557, + "grad_norm": 7.214323997497559, + "learning_rate": 2.349151686630978e-06, + "loss": 0.3826, + "step": 6181 + }, + { + "epoch": 0.6812121212121212, + "grad_norm": 5.444054126739502, + "learning_rate": 2.347669363231712e-06, + "loss": 0.3377, + "step": 6182 + }, + { + "epoch": 0.6813223140495868, + "grad_norm": 7.184004783630371, + "learning_rate": 2.3461873641630394e-06, + "loss": 0.4225, + "step": 6183 + }, + { + "epoch": 0.6814325068870524, + "grad_norm": 5.51975154876709, + "learning_rate": 2.3447056896061765e-06, + "loss": 0.3937, + "step": 6184 + }, + { + "epoch": 0.6815426997245179, + "grad_norm": 6.245907306671143, + "learning_rate": 2.343224339742313e-06, + "loss": 0.4114, + "step": 6185 + }, + { + "epoch": 0.6816528925619835, + "grad_norm": 4.655656814575195, + "learning_rate": 2.3417433147525864e-06, + "loss": 0.4115, + "step": 6186 + }, + { + "epoch": 0.6817630853994491, + "grad_norm": 10.695345878601074, + "learning_rate": 2.3402626148180957e-06, + "loss": 0.5083, + "step": 6187 + }, + { + "epoch": 0.6818732782369146, + "grad_norm": 4.769364833831787, + "learning_rate": 2.3387822401199055e-06, + "loss": 0.3928, + "step": 6188 + }, + { + "epoch": 0.6819834710743802, + "grad_norm": 10.957562446594238, + "learning_rate": 2.3373021908390397e-06, + "loss": 0.4721, + "step": 6189 + }, + { + "epoch": 0.6820936639118458, + "grad_norm": 7.979918479919434, + "learning_rate": 2.335822467156477e-06, + "loss": 0.3685, + "step": 6190 + }, + { + "epoch": 0.6822038567493113, + "grad_norm": 4.247445106506348, + "learning_rate": 2.334343069253162e-06, + "loss": 0.4332, + "step": 6191 + }, + { + "epoch": 0.6823140495867769, + "grad_norm": 8.737055778503418, + "learning_rate": 2.3328639973099983e-06, + "loss": 0.3704, + "step": 6192 + }, + { + "epoch": 0.6824242424242424, + "grad_norm": 6.368855953216553, + "learning_rate": 2.331385251507849e-06, + "loss": 0.3423, + "step": 6193 + }, + { + "epoch": 0.682534435261708, + "grad_norm": 8.097928047180176, + "learning_rate": 2.3299068320275342e-06, + "loss": 0.4046, + "step": 6194 + }, + { + "epoch": 0.6826446280991736, + 
"grad_norm": 5.629691123962402, + "learning_rate": 2.3284287390498388e-06, + "loss": 0.432, + "step": 6195 + }, + { + "epoch": 0.6827548209366391, + "grad_norm": 5.790268898010254, + "learning_rate": 2.3269509727555084e-06, + "loss": 0.3309, + "step": 6196 + }, + { + "epoch": 0.6828650137741047, + "grad_norm": 5.701327323913574, + "learning_rate": 2.325473533325242e-06, + "loss": 0.4051, + "step": 6197 + }, + { + "epoch": 0.6829752066115703, + "grad_norm": 10.208582878112793, + "learning_rate": 2.323996420939705e-06, + "loss": 0.4147, + "step": 6198 + }, + { + "epoch": 0.6830853994490358, + "grad_norm": 6.084228515625, + "learning_rate": 2.3225196357795227e-06, + "loss": 0.4382, + "step": 6199 + }, + { + "epoch": 0.6831955922865014, + "grad_norm": 6.657327175140381, + "learning_rate": 2.3210431780252742e-06, + "loss": 0.4025, + "step": 6200 + }, + { + "epoch": 0.6833057851239669, + "grad_norm": 5.073029041290283, + "learning_rate": 2.3195670478575046e-06, + "loss": 0.3901, + "step": 6201 + }, + { + "epoch": 0.6834159779614325, + "grad_norm": 8.293045997619629, + "learning_rate": 2.3180912454567195e-06, + "loss": 0.432, + "step": 6202 + }, + { + "epoch": 0.6835261707988981, + "grad_norm": 8.201218605041504, + "learning_rate": 2.3166157710033806e-06, + "loss": 0.4196, + "step": 6203 + }, + { + "epoch": 0.6836363636363636, + "grad_norm": 6.732716083526611, + "learning_rate": 2.3151406246779055e-06, + "loss": 0.408, + "step": 6204 + }, + { + "epoch": 0.6837465564738292, + "grad_norm": 6.923007488250732, + "learning_rate": 2.313665806660686e-06, + "loss": 0.3639, + "step": 6205 + }, + { + "epoch": 0.6838567493112948, + "grad_norm": 5.650262832641602, + "learning_rate": 2.3121913171320586e-06, + "loss": 0.386, + "step": 6206 + }, + { + "epoch": 0.6839669421487603, + "grad_norm": 4.500156402587891, + "learning_rate": 2.3107171562723284e-06, + "loss": 0.3851, + "step": 6207 + }, + { + "epoch": 0.6840771349862259, + "grad_norm": 5.889590263366699, + "learning_rate": 2.309243324261759e-06, + "loss": 0.3609, + "step": 6208 + }, + { + "epoch": 0.6841873278236914, + "grad_norm": 5.953109264373779, + "learning_rate": 2.3077698212805694e-06, + "loss": 0.3453, + "step": 6209 + }, + { + "epoch": 0.684297520661157, + "grad_norm": 6.79673433303833, + "learning_rate": 2.3062966475089445e-06, + "loss": 0.3652, + "step": 6210 + }, + { + "epoch": 0.6844077134986226, + "grad_norm": 8.246170997619629, + "learning_rate": 2.304823803127023e-06, + "loss": 0.4856, + "step": 6211 + }, + { + "epoch": 0.6845179063360881, + "grad_norm": 6.314090728759766, + "learning_rate": 2.303351288314908e-06, + "loss": 0.4042, + "step": 6212 + }, + { + "epoch": 0.6846280991735537, + "grad_norm": 13.75493049621582, + "learning_rate": 2.3018791032526615e-06, + "loss": 0.3992, + "step": 6213 + }, + { + "epoch": 0.6847382920110193, + "grad_norm": 8.827356338500977, + "learning_rate": 2.300407248120302e-06, + "loss": 0.3295, + "step": 6214 + }, + { + "epoch": 0.6848484848484848, + "grad_norm": 8.514598846435547, + "learning_rate": 2.2989357230978114e-06, + "loss": 0.4799, + "step": 6215 + }, + { + "epoch": 0.6849586776859504, + "grad_norm": 6.385679721832275, + "learning_rate": 2.2974645283651314e-06, + "loss": 0.4168, + "step": 6216 + }, + { + "epoch": 0.685068870523416, + "grad_norm": 5.92914342880249, + "learning_rate": 2.2959936641021585e-06, + "loss": 0.4095, + "step": 6217 + }, + { + "epoch": 0.6851790633608815, + "grad_norm": 6.143651962280273, + "learning_rate": 2.294523130488753e-06, + "loss": 0.4403, + "step": 6218 + }, + { + 
"epoch": 0.6852892561983471, + "grad_norm": 5.817147254943848, + "learning_rate": 2.293052927704736e-06, + "loss": 0.4051, + "step": 6219 + }, + { + "epoch": 0.6853994490358126, + "grad_norm": 4.9564714431762695, + "learning_rate": 2.291583055929882e-06, + "loss": 0.3457, + "step": 6220 + }, + { + "epoch": 0.6855096418732782, + "grad_norm": 14.586307525634766, + "learning_rate": 2.290113515343931e-06, + "loss": 0.5367, + "step": 6221 + }, + { + "epoch": 0.6856198347107438, + "grad_norm": 5.410526752471924, + "learning_rate": 2.288644306126582e-06, + "loss": 0.3927, + "step": 6222 + }, + { + "epoch": 0.6857300275482093, + "grad_norm": 4.907934188842773, + "learning_rate": 2.2871754284574885e-06, + "loss": 0.3944, + "step": 6223 + }, + { + "epoch": 0.6858402203856749, + "grad_norm": 7.13827657699585, + "learning_rate": 2.285706882516269e-06, + "loss": 0.3411, + "step": 6224 + }, + { + "epoch": 0.6859504132231405, + "grad_norm": 7.94610071182251, + "learning_rate": 2.2842386684825e-06, + "loss": 0.416, + "step": 6225 + }, + { + "epoch": 0.686060606060606, + "grad_norm": 4.437480926513672, + "learning_rate": 2.2827707865357146e-06, + "loss": 0.4489, + "step": 6226 + }, + { + "epoch": 0.6861707988980716, + "grad_norm": 4.621830463409424, + "learning_rate": 2.2813032368554084e-06, + "loss": 0.388, + "step": 6227 + }, + { + "epoch": 0.6862809917355371, + "grad_norm": 4.54764986038208, + "learning_rate": 2.2798360196210366e-06, + "loss": 0.3375, + "step": 6228 + }, + { + "epoch": 0.6863911845730027, + "grad_norm": 7.10068416595459, + "learning_rate": 2.27836913501201e-06, + "loss": 0.3517, + "step": 6229 + }, + { + "epoch": 0.6865013774104683, + "grad_norm": 10.14759635925293, + "learning_rate": 2.2769025832077026e-06, + "loss": 0.308, + "step": 6230 + }, + { + "epoch": 0.6866115702479338, + "grad_norm": 4.241217613220215, + "learning_rate": 2.2754363643874477e-06, + "loss": 0.4252, + "step": 6231 + }, + { + "epoch": 0.6867217630853995, + "grad_norm": 7.68984842300415, + "learning_rate": 2.2739704787305333e-06, + "loss": 0.4598, + "step": 6232 + }, + { + "epoch": 0.6868319559228651, + "grad_norm": 7.011338233947754, + "learning_rate": 2.272504926416212e-06, + "loss": 0.3465, + "step": 6233 + }, + { + "epoch": 0.6869421487603306, + "grad_norm": 5.404778003692627, + "learning_rate": 2.271039707623693e-06, + "loss": 0.4343, + "step": 6234 + }, + { + "epoch": 0.6870523415977962, + "grad_norm": 5.496755123138428, + "learning_rate": 2.2695748225321474e-06, + "loss": 0.4764, + "step": 6235 + }, + { + "epoch": 0.6871625344352618, + "grad_norm": 7.6246867179870605, + "learning_rate": 2.2681102713207015e-06, + "loss": 0.4154, + "step": 6236 + }, + { + "epoch": 0.6872727272727273, + "grad_norm": 4.848220348358154, + "learning_rate": 2.266646054168439e-06, + "loss": 0.3688, + "step": 6237 + }, + { + "epoch": 0.6873829201101929, + "grad_norm": 5.936487197875977, + "learning_rate": 2.2651821712544133e-06, + "loss": 0.4215, + "step": 6238 + }, + { + "epoch": 0.6874931129476584, + "grad_norm": 6.518362045288086, + "learning_rate": 2.2637186227576265e-06, + "loss": 0.3588, + "step": 6239 + }, + { + "epoch": 0.687603305785124, + "grad_norm": 9.76972770690918, + "learning_rate": 2.2622554088570397e-06, + "loss": 0.4332, + "step": 6240 + }, + { + "epoch": 0.6877134986225896, + "grad_norm": 5.921270847320557, + "learning_rate": 2.260792529731584e-06, + "loss": 0.335, + "step": 6241 + }, + { + "epoch": 0.6878236914600551, + "grad_norm": 3.6244571208953857, + "learning_rate": 2.259329985560139e-06, + "loss": 0.3482, + 
"step": 6242 + }, + { + "epoch": 0.6879338842975207, + "grad_norm": 8.61227035522461, + "learning_rate": 2.257867776521544e-06, + "loss": 0.3787, + "step": 6243 + }, + { + "epoch": 0.6880440771349863, + "grad_norm": 5.597924709320068, + "learning_rate": 2.256405902794602e-06, + "loss": 0.3788, + "step": 6244 + }, + { + "epoch": 0.6881542699724518, + "grad_norm": 7.333438396453857, + "learning_rate": 2.2549443645580755e-06, + "loss": 0.3755, + "step": 6245 + }, + { + "epoch": 0.6882644628099174, + "grad_norm": 8.690994262695312, + "learning_rate": 2.253483161990679e-06, + "loss": 0.4823, + "step": 6246 + }, + { + "epoch": 0.6883746556473829, + "grad_norm": 7.609672546386719, + "learning_rate": 2.252022295271092e-06, + "loss": 0.4277, + "step": 6247 + }, + { + "epoch": 0.6884848484848485, + "grad_norm": 7.242532253265381, + "learning_rate": 2.250561764577951e-06, + "loss": 0.3676, + "step": 6248 + }, + { + "epoch": 0.6885950413223141, + "grad_norm": 8.508557319641113, + "learning_rate": 2.2491015700898543e-06, + "loss": 0.4702, + "step": 6249 + }, + { + "epoch": 0.6887052341597796, + "grad_norm": 14.057016372680664, + "learning_rate": 2.2476417119853527e-06, + "loss": 0.5032, + "step": 6250 + }, + { + "epoch": 0.6888154269972452, + "grad_norm": 5.928760528564453, + "learning_rate": 2.2461821904429616e-06, + "loss": 0.4208, + "step": 6251 + }, + { + "epoch": 0.6889256198347108, + "grad_norm": 7.277674198150635, + "learning_rate": 2.244723005641154e-06, + "loss": 0.4078, + "step": 6252 + }, + { + "epoch": 0.6890358126721763, + "grad_norm": 7.040360450744629, + "learning_rate": 2.2432641577583584e-06, + "loss": 0.4071, + "step": 6253 + }, + { + "epoch": 0.6891460055096419, + "grad_norm": 4.986152648925781, + "learning_rate": 2.2418056469729666e-06, + "loss": 0.3865, + "step": 6254 + }, + { + "epoch": 0.6892561983471074, + "grad_norm": 8.636344909667969, + "learning_rate": 2.2403474734633284e-06, + "loss": 0.4156, + "step": 6255 + }, + { + "epoch": 0.689366391184573, + "grad_norm": 8.572731018066406, + "learning_rate": 2.238889637407748e-06, + "loss": 0.459, + "step": 6256 + }, + { + "epoch": 0.6894765840220386, + "grad_norm": 10.21874713897705, + "learning_rate": 2.237432138984493e-06, + "loss": 0.4493, + "step": 6257 + }, + { + "epoch": 0.6895867768595041, + "grad_norm": 4.540192127227783, + "learning_rate": 2.235974978371791e-06, + "loss": 0.3693, + "step": 6258 + }, + { + "epoch": 0.6896969696969697, + "grad_norm": 6.4375176429748535, + "learning_rate": 2.234518155747821e-06, + "loss": 0.379, + "step": 6259 + }, + { + "epoch": 0.6898071625344353, + "grad_norm": 5.030807971954346, + "learning_rate": 2.233061671290728e-06, + "loss": 0.3979, + "step": 6260 + }, + { + "epoch": 0.6899173553719008, + "grad_norm": 7.331912994384766, + "learning_rate": 2.231605525178614e-06, + "loss": 0.4685, + "step": 6261 + }, + { + "epoch": 0.6900275482093664, + "grad_norm": 11.071955680847168, + "learning_rate": 2.230149717589535e-06, + "loss": 0.4517, + "step": 6262 + }, + { + "epoch": 0.690137741046832, + "grad_norm": 5.575560569763184, + "learning_rate": 2.228694248701511e-06, + "loss": 0.3085, + "step": 6263 + }, + { + "epoch": 0.6902479338842975, + "grad_norm": 5.732341766357422, + "learning_rate": 2.2272391186925196e-06, + "loss": 0.343, + "step": 6264 + }, + { + "epoch": 0.6903581267217631, + "grad_norm": 12.872480392456055, + "learning_rate": 2.2257843277404944e-06, + "loss": 0.4617, + "step": 6265 + }, + { + "epoch": 0.6904683195592286, + "grad_norm": 5.646525859832764, + "learning_rate": 
2.2243298760233302e-06, + "loss": 0.3853, + "step": 6266 + }, + { + "epoch": 0.6905785123966942, + "grad_norm": 8.56628704071045, + "learning_rate": 2.2228757637188805e-06, + "loss": 0.3555, + "step": 6267 + }, + { + "epoch": 0.6906887052341598, + "grad_norm": 6.781405925750732, + "learning_rate": 2.221421991004953e-06, + "loss": 0.4026, + "step": 6268 + }, + { + "epoch": 0.6907988980716253, + "grad_norm": 5.899105072021484, + "learning_rate": 2.2199685580593207e-06, + "loss": 0.3981, + "step": 6269 + }, + { + "epoch": 0.6909090909090909, + "grad_norm": 5.597877502441406, + "learning_rate": 2.218515465059707e-06, + "loss": 0.3396, + "step": 6270 + }, + { + "epoch": 0.6910192837465565, + "grad_norm": 9.017989158630371, + "learning_rate": 2.2170627121838012e-06, + "loss": 0.3915, + "step": 6271 + }, + { + "epoch": 0.691129476584022, + "grad_norm": 5.523779392242432, + "learning_rate": 2.215610299609249e-06, + "loss": 0.4337, + "step": 6272 + }, + { + "epoch": 0.6912396694214876, + "grad_norm": 10.373221397399902, + "learning_rate": 2.2141582275136494e-06, + "loss": 0.4425, + "step": 6273 + }, + { + "epoch": 0.6913498622589531, + "grad_norm": 4.727308750152588, + "learning_rate": 2.212706496074566e-06, + "loss": 0.4192, + "step": 6274 + }, + { + "epoch": 0.6914600550964187, + "grad_norm": 6.766533374786377, + "learning_rate": 2.21125510546952e-06, + "loss": 0.4341, + "step": 6275 + }, + { + "epoch": 0.6915702479338843, + "grad_norm": 6.907382011413574, + "learning_rate": 2.209804055875987e-06, + "loss": 0.3731, + "step": 6276 + }, + { + "epoch": 0.6916804407713498, + "grad_norm": 5.283140182495117, + "learning_rate": 2.2083533474714032e-06, + "loss": 0.3699, + "step": 6277 + }, + { + "epoch": 0.6917906336088154, + "grad_norm": 6.515407085418701, + "learning_rate": 2.2069029804331665e-06, + "loss": 0.4355, + "step": 6278 + }, + { + "epoch": 0.691900826446281, + "grad_norm": 5.4836649894714355, + "learning_rate": 2.2054529549386257e-06, + "loss": 0.3901, + "step": 6279 + }, + { + "epoch": 0.6920110192837465, + "grad_norm": 8.09799575805664, + "learning_rate": 2.2040032711650928e-06, + "loss": 0.4018, + "step": 6280 + }, + { + "epoch": 0.6921212121212121, + "grad_norm": 10.193501472473145, + "learning_rate": 2.2025539292898402e-06, + "loss": 0.4296, + "step": 6281 + }, + { + "epoch": 0.6922314049586776, + "grad_norm": 5.739263534545898, + "learning_rate": 2.2011049294900915e-06, + "loss": 0.3733, + "step": 6282 + }, + { + "epoch": 0.6923415977961432, + "grad_norm": 10.090827941894531, + "learning_rate": 2.1996562719430337e-06, + "loss": 0.4062, + "step": 6283 + }, + { + "epoch": 0.6924517906336088, + "grad_norm": 7.952030658721924, + "learning_rate": 2.1982079568258123e-06, + "loss": 0.3899, + "step": 6284 + }, + { + "epoch": 0.6925619834710743, + "grad_norm": 6.2705488204956055, + "learning_rate": 2.196759984315527e-06, + "loss": 0.3744, + "step": 6285 + }, + { + "epoch": 0.6926721763085399, + "grad_norm": 9.91977596282959, + "learning_rate": 2.1953123545892373e-06, + "loss": 0.503, + "step": 6286 + }, + { + "epoch": 0.6927823691460055, + "grad_norm": 9.509714126586914, + "learning_rate": 2.193865067823965e-06, + "loss": 0.3961, + "step": 6287 + }, + { + "epoch": 0.692892561983471, + "grad_norm": 8.341251373291016, + "learning_rate": 2.192418124196683e-06, + "loss": 0.3947, + "step": 6288 + }, + { + "epoch": 0.6930027548209367, + "grad_norm": 5.873130798339844, + "learning_rate": 2.1909715238843253e-06, + "loss": 0.4422, + "step": 6289 + }, + { + "epoch": 0.6931129476584023, + "grad_norm": 
8.459653854370117, + "learning_rate": 2.189525267063786e-06, + "loss": 0.4469, + "step": 6290 + }, + { + "epoch": 0.6932231404958678, + "grad_norm": 5.0621490478515625, + "learning_rate": 2.1880793539119168e-06, + "loss": 0.4427, + "step": 6291 + }, + { + "epoch": 0.6933333333333334, + "grad_norm": 21.41733741760254, + "learning_rate": 2.1866337846055215e-06, + "loss": 0.4542, + "step": 6292 + }, + { + "epoch": 0.6934435261707989, + "grad_norm": 11.350135803222656, + "learning_rate": 2.185188559321369e-06, + "loss": 0.3635, + "step": 6293 + }, + { + "epoch": 0.6935537190082645, + "grad_norm": 7.1028337478637695, + "learning_rate": 2.1837436782361843e-06, + "loss": 0.4029, + "step": 6294 + }, + { + "epoch": 0.6936639118457301, + "grad_norm": 4.9193902015686035, + "learning_rate": 2.1822991415266487e-06, + "loss": 0.389, + "step": 6295 + }, + { + "epoch": 0.6937741046831956, + "grad_norm": 6.254681587219238, + "learning_rate": 2.1808549493693975e-06, + "loss": 0.44, + "step": 6296 + }, + { + "epoch": 0.6938842975206612, + "grad_norm": 9.382376670837402, + "learning_rate": 2.1794111019410364e-06, + "loss": 0.4691, + "step": 6297 + }, + { + "epoch": 0.6939944903581268, + "grad_norm": 5.008315563201904, + "learning_rate": 2.1779675994181167e-06, + "loss": 0.3961, + "step": 6298 + }, + { + "epoch": 0.6941046831955923, + "grad_norm": 6.2896552085876465, + "learning_rate": 2.1765244419771494e-06, + "loss": 0.3416, + "step": 6299 + }, + { + "epoch": 0.6942148760330579, + "grad_norm": 8.428092956542969, + "learning_rate": 2.1750816297946117e-06, + "loss": 0.4795, + "step": 6300 + }, + { + "epoch": 0.6943250688705234, + "grad_norm": 8.456198692321777, + "learning_rate": 2.1736391630469296e-06, + "loss": 0.4117, + "step": 6301 + }, + { + "epoch": 0.694435261707989, + "grad_norm": 14.213221549987793, + "learning_rate": 2.1721970419104883e-06, + "loss": 0.4622, + "step": 6302 + }, + { + "epoch": 0.6945454545454546, + "grad_norm": 12.13443660736084, + "learning_rate": 2.170755266561634e-06, + "loss": 0.4652, + "step": 6303 + }, + { + "epoch": 0.6946556473829201, + "grad_norm": 6.786264419555664, + "learning_rate": 2.169313837176668e-06, + "loss": 0.3888, + "step": 6304 + }, + { + "epoch": 0.6947658402203857, + "grad_norm": 6.256235599517822, + "learning_rate": 2.1678727539318537e-06, + "loss": 0.4624, + "step": 6305 + }, + { + "epoch": 0.6948760330578513, + "grad_norm": 5.675997257232666, + "learning_rate": 2.1664320170034043e-06, + "loss": 0.3924, + "step": 6306 + }, + { + "epoch": 0.6949862258953168, + "grad_norm": 7.280065536499023, + "learning_rate": 2.1649916265674968e-06, + "loss": 0.3564, + "step": 6307 + }, + { + "epoch": 0.6950964187327824, + "grad_norm": 5.219188213348389, + "learning_rate": 2.1635515828002655e-06, + "loss": 0.3962, + "step": 6308 + }, + { + "epoch": 0.6952066115702479, + "grad_norm": 6.4076409339904785, + "learning_rate": 2.1621118858777983e-06, + "loss": 0.4154, + "step": 6309 + }, + { + "epoch": 0.6953168044077135, + "grad_norm": 5.759407997131348, + "learning_rate": 2.160672535976145e-06, + "loss": 0.4566, + "step": 6310 + }, + { + "epoch": 0.6954269972451791, + "grad_norm": 8.507913589477539, + "learning_rate": 2.1592335332713123e-06, + "loss": 0.3917, + "step": 6311 + }, + { + "epoch": 0.6955371900826446, + "grad_norm": 6.995584964752197, + "learning_rate": 2.15779487793926e-06, + "loss": 0.3178, + "step": 6312 + }, + { + "epoch": 0.6956473829201102, + "grad_norm": 6.555243015289307, + "learning_rate": 2.156356570155911e-06, + "loss": 0.3329, + "step": 6313 + }, + { + 
"epoch": 0.6957575757575758, + "grad_norm": 9.83935546875, + "learning_rate": 2.154918610097145e-06, + "loss": 0.4674, + "step": 6314 + }, + { + "epoch": 0.6958677685950413, + "grad_norm": 11.09785270690918, + "learning_rate": 2.153480997938794e-06, + "loss": 0.4431, + "step": 6315 + }, + { + "epoch": 0.6959779614325069, + "grad_norm": 8.9512300491333, + "learning_rate": 2.152043733856653e-06, + "loss": 0.4768, + "step": 6316 + }, + { + "epoch": 0.6960881542699725, + "grad_norm": 6.576215744018555, + "learning_rate": 2.150606818026475e-06, + "loss": 0.3314, + "step": 6317 + }, + { + "epoch": 0.696198347107438, + "grad_norm": 4.43838357925415, + "learning_rate": 2.149170250623964e-06, + "loss": 0.3905, + "step": 6318 + }, + { + "epoch": 0.6963085399449036, + "grad_norm": 6.3059282302856445, + "learning_rate": 2.147734031824787e-06, + "loss": 0.4542, + "step": 6319 + }, + { + "epoch": 0.6964187327823691, + "grad_norm": 11.818520545959473, + "learning_rate": 2.146298161804569e-06, + "loss": 0.4935, + "step": 6320 + }, + { + "epoch": 0.6965289256198347, + "grad_norm": 5.9157280921936035, + "learning_rate": 2.1448626407388863e-06, + "loss": 0.3611, + "step": 6321 + }, + { + "epoch": 0.6966391184573003, + "grad_norm": 19.772029876708984, + "learning_rate": 2.1434274688032784e-06, + "loss": 0.474, + "step": 6322 + }, + { + "epoch": 0.6967493112947658, + "grad_norm": 7.780381202697754, + "learning_rate": 2.1419926461732417e-06, + "loss": 0.3949, + "step": 6323 + }, + { + "epoch": 0.6968595041322314, + "grad_norm": 4.548693656921387, + "learning_rate": 2.1405581730242244e-06, + "loss": 0.4388, + "step": 6324 + }, + { + "epoch": 0.696969696969697, + "grad_norm": 7.220384120941162, + "learning_rate": 2.139124049531638e-06, + "loss": 0.3578, + "step": 6325 + }, + { + "epoch": 0.6970798898071625, + "grad_norm": 5.096642971038818, + "learning_rate": 2.1376902758708505e-06, + "loss": 0.391, + "step": 6326 + }, + { + "epoch": 0.6971900826446281, + "grad_norm": 9.401430130004883, + "learning_rate": 2.136256852217183e-06, + "loss": 0.3521, + "step": 6327 + }, + { + "epoch": 0.6973002754820936, + "grad_norm": 5.316445827484131, + "learning_rate": 2.1348237787459188e-06, + "loss": 0.3578, + "step": 6328 + }, + { + "epoch": 0.6974104683195592, + "grad_norm": 4.958508491516113, + "learning_rate": 2.1333910556322928e-06, + "loss": 0.4235, + "step": 6329 + }, + { + "epoch": 0.6975206611570248, + "grad_norm": 8.173996925354004, + "learning_rate": 2.1319586830515032e-06, + "loss": 0.4319, + "step": 6330 + }, + { + "epoch": 0.6976308539944903, + "grad_norm": 4.869649887084961, + "learning_rate": 2.130526661178703e-06, + "loss": 0.3519, + "step": 6331 + }, + { + "epoch": 0.6977410468319559, + "grad_norm": 6.183595657348633, + "learning_rate": 2.1290949901889967e-06, + "loss": 0.3677, + "step": 6332 + }, + { + "epoch": 0.6978512396694215, + "grad_norm": 6.6458635330200195, + "learning_rate": 2.1276636702574587e-06, + "loss": 0.4512, + "step": 6333 + }, + { + "epoch": 0.697961432506887, + "grad_norm": 3.9340484142303467, + "learning_rate": 2.126232701559109e-06, + "loss": 0.4129, + "step": 6334 + }, + { + "epoch": 0.6980716253443526, + "grad_norm": 4.056954383850098, + "learning_rate": 2.124802084268926e-06, + "loss": 0.3039, + "step": 6335 + }, + { + "epoch": 0.6981818181818182, + "grad_norm": 13.909092903137207, + "learning_rate": 2.12337181856185e-06, + "loss": 0.2881, + "step": 6336 + }, + { + "epoch": 0.6982920110192837, + "grad_norm": 5.884676933288574, + "learning_rate": 2.121941904612777e-06, + "loss": 0.4296, 
+ "step": 6337 + }, + { + "epoch": 0.6984022038567493, + "grad_norm": 4.607361793518066, + "learning_rate": 2.1205123425965555e-06, + "loss": 0.3976, + "step": 6338 + }, + { + "epoch": 0.6985123966942148, + "grad_norm": 5.883996486663818, + "learning_rate": 2.119083132687997e-06, + "loss": 0.371, + "step": 6339 + }, + { + "epoch": 0.6986225895316804, + "grad_norm": 6.691327095031738, + "learning_rate": 2.117654275061867e-06, + "loss": 0.4552, + "step": 6340 + }, + { + "epoch": 0.698732782369146, + "grad_norm": 5.95828914642334, + "learning_rate": 2.1162257698928866e-06, + "loss": 0.3003, + "step": 6341 + }, + { + "epoch": 0.6988429752066115, + "grad_norm": 5.617248058319092, + "learning_rate": 2.1147976173557363e-06, + "loss": 0.4259, + "step": 6342 + }, + { + "epoch": 0.6989531680440771, + "grad_norm": 5.405389785766602, + "learning_rate": 2.113369817625054e-06, + "loss": 0.3632, + "step": 6343 + }, + { + "epoch": 0.6990633608815428, + "grad_norm": 9.319952011108398, + "learning_rate": 2.1119423708754295e-06, + "loss": 0.4152, + "step": 6344 + }, + { + "epoch": 0.6991735537190082, + "grad_norm": 7.254077434539795, + "learning_rate": 2.110515277281415e-06, + "loss": 0.4259, + "step": 6345 + }, + { + "epoch": 0.6992837465564739, + "grad_norm": 9.147215843200684, + "learning_rate": 2.1090885370175176e-06, + "loss": 0.4393, + "step": 6346 + }, + { + "epoch": 0.6993939393939393, + "grad_norm": 6.482443332672119, + "learning_rate": 2.107662150258202e-06, + "loss": 0.3583, + "step": 6347 + }, + { + "epoch": 0.699504132231405, + "grad_norm": 4.191505432128906, + "learning_rate": 2.1062361171778865e-06, + "loss": 0.3227, + "step": 6348 + }, + { + "epoch": 0.6996143250688706, + "grad_norm": 7.555344104766846, + "learning_rate": 2.1048104379509493e-06, + "loss": 0.4246, + "step": 6349 + }, + { + "epoch": 0.699724517906336, + "grad_norm": 10.89263916015625, + "learning_rate": 2.1033851127517263e-06, + "loss": 0.4577, + "step": 6350 + }, + { + "epoch": 0.6998347107438017, + "grad_norm": 4.815510272979736, + "learning_rate": 2.101960141754506e-06, + "loss": 0.3936, + "step": 6351 + }, + { + "epoch": 0.6999449035812673, + "grad_norm": 8.367965698242188, + "learning_rate": 2.100535525133533e-06, + "loss": 0.4262, + "step": 6352 + }, + { + "epoch": 0.7000550964187328, + "grad_norm": 4.15157413482666, + "learning_rate": 2.099111263063018e-06, + "loss": 0.4028, + "step": 6353 + }, + { + "epoch": 0.7001652892561984, + "grad_norm": 6.863077163696289, + "learning_rate": 2.097687355717118e-06, + "loss": 0.372, + "step": 6354 + }, + { + "epoch": 0.7002754820936639, + "grad_norm": 5.519453048706055, + "learning_rate": 2.0962638032699467e-06, + "loss": 0.3509, + "step": 6355 + }, + { + "epoch": 0.7003856749311295, + "grad_norm": 5.474221706390381, + "learning_rate": 2.094840605895586e-06, + "loss": 0.34, + "step": 6356 + }, + { + "epoch": 0.7003856749311295, + "eval_loss": 0.40236836671829224, + "eval_runtime": 41.9485, + "eval_samples_per_second": 17.498, + "eval_steps_per_second": 2.193, + "step": 6356 + }, + { + "epoch": 0.7004958677685951, + "grad_norm": 5.4912519454956055, + "learning_rate": 2.093417763768062e-06, + "loss": 0.4118, + "step": 6357 + }, + { + "epoch": 0.7006060606060606, + "grad_norm": 10.394960403442383, + "learning_rate": 2.0919952770613584e-06, + "loss": 0.5167, + "step": 6358 + }, + { + "epoch": 0.7007162534435262, + "grad_norm": 4.946646690368652, + "learning_rate": 2.090573145949426e-06, + "loss": 0.3794, + "step": 6359 + }, + { + "epoch": 0.7008264462809918, + "grad_norm": 
6.778929233551025, + "learning_rate": 2.08915137060616e-06, + "loss": 0.3765, + "step": 6360 + }, + { + "epoch": 0.7009366391184573, + "grad_norm": 10.721000671386719, + "learning_rate": 2.08772995120542e-06, + "loss": 0.4748, + "step": 6361 + }, + { + "epoch": 0.7010468319559229, + "grad_norm": 5.147397518157959, + "learning_rate": 2.0863088879210158e-06, + "loss": 0.3863, + "step": 6362 + }, + { + "epoch": 0.7011570247933885, + "grad_norm": 5.916102409362793, + "learning_rate": 2.0848881809267185e-06, + "loss": 0.4713, + "step": 6363 + }, + { + "epoch": 0.701267217630854, + "grad_norm": 5.3361735343933105, + "learning_rate": 2.0834678303962556e-06, + "loss": 0.3733, + "step": 6364 + }, + { + "epoch": 0.7013774104683196, + "grad_norm": 6.816646099090576, + "learning_rate": 2.082047836503307e-06, + "loss": 0.3823, + "step": 6365 + }, + { + "epoch": 0.7014876033057851, + "grad_norm": 8.116055488586426, + "learning_rate": 2.0806281994215128e-06, + "loss": 0.4014, + "step": 6366 + }, + { + "epoch": 0.7015977961432507, + "grad_norm": 5.204188346862793, + "learning_rate": 2.0792089193244693e-06, + "loss": 0.3623, + "step": 6367 + }, + { + "epoch": 0.7017079889807163, + "grad_norm": 5.562734127044678, + "learning_rate": 2.0777899963857244e-06, + "loss": 0.3095, + "step": 6368 + }, + { + "epoch": 0.7018181818181818, + "grad_norm": 9.468295097351074, + "learning_rate": 2.0763714307787893e-06, + "loss": 0.432, + "step": 6369 + }, + { + "epoch": 0.7019283746556474, + "grad_norm": 9.8388671875, + "learning_rate": 2.074953222677128e-06, + "loss": 0.4079, + "step": 6370 + }, + { + "epoch": 0.702038567493113, + "grad_norm": 11.453285217285156, + "learning_rate": 2.073535372254158e-06, + "loss": 0.4765, + "step": 6371 + }, + { + "epoch": 0.7021487603305785, + "grad_norm": 5.276843547821045, + "learning_rate": 2.072117879683258e-06, + "loss": 0.421, + "step": 6372 + }, + { + "epoch": 0.7022589531680441, + "grad_norm": 4.76207160949707, + "learning_rate": 2.070700745137763e-06, + "loss": 0.4099, + "step": 6373 + }, + { + "epoch": 0.7023691460055096, + "grad_norm": 6.459706783294678, + "learning_rate": 2.0692839687909578e-06, + "loss": 0.3702, + "step": 6374 + }, + { + "epoch": 0.7024793388429752, + "grad_norm": 4.18793249130249, + "learning_rate": 2.06786755081609e-06, + "loss": 0.465, + "step": 6375 + }, + { + "epoch": 0.7025895316804408, + "grad_norm": 10.031018257141113, + "learning_rate": 2.066451491386363e-06, + "loss": 0.3843, + "step": 6376 + }, + { + "epoch": 0.7026997245179063, + "grad_norm": 5.30232572555542, + "learning_rate": 2.0650357906749304e-06, + "loss": 0.3887, + "step": 6377 + }, + { + "epoch": 0.7028099173553719, + "grad_norm": 5.033413887023926, + "learning_rate": 2.0636204488549083e-06, + "loss": 0.4034, + "step": 6378 + }, + { + "epoch": 0.7029201101928375, + "grad_norm": 5.639863967895508, + "learning_rate": 2.062205466099368e-06, + "loss": 0.3672, + "step": 6379 + }, + { + "epoch": 0.703030303030303, + "grad_norm": 10.517995834350586, + "learning_rate": 2.060790842581332e-06, + "loss": 0.4371, + "step": 6380 + }, + { + "epoch": 0.7031404958677686, + "grad_norm": 9.898099899291992, + "learning_rate": 2.0593765784737846e-06, + "loss": 0.3787, + "step": 6381 + }, + { + "epoch": 0.7032506887052341, + "grad_norm": 5.050538063049316, + "learning_rate": 2.057962673949665e-06, + "loss": 0.4103, + "step": 6382 + }, + { + "epoch": 0.7033608815426997, + "grad_norm": 6.702361583709717, + "learning_rate": 2.0565491291818647e-06, + "loss": 0.4268, + "step": 6383 + }, + { + "epoch": 
0.7034710743801653, + "grad_norm": 9.471160888671875, + "learning_rate": 2.0551359443432347e-06, + "loss": 0.3962, + "step": 6384 + }, + { + "epoch": 0.7035812672176308, + "grad_norm": 8.269988059997559, + "learning_rate": 2.0537231196065836e-06, + "loss": 0.4447, + "step": 6385 + }, + { + "epoch": 0.7036914600550964, + "grad_norm": 7.267406940460205, + "learning_rate": 2.05231065514467e-06, + "loss": 0.4116, + "step": 6386 + }, + { + "epoch": 0.703801652892562, + "grad_norm": 8.130721092224121, + "learning_rate": 2.050898551130215e-06, + "loss": 0.446, + "step": 6387 + }, + { + "epoch": 0.7039118457300275, + "grad_norm": 8.853819847106934, + "learning_rate": 2.0494868077358875e-06, + "loss": 0.3528, + "step": 6388 + }, + { + "epoch": 0.7040220385674931, + "grad_norm": 6.168217658996582, + "learning_rate": 2.048075425134325e-06, + "loss": 0.3955, + "step": 6389 + }, + { + "epoch": 0.7041322314049587, + "grad_norm": 5.140215873718262, + "learning_rate": 2.046664403498109e-06, + "loss": 0.4235, + "step": 6390 + }, + { + "epoch": 0.7042424242424242, + "grad_norm": 7.185565948486328, + "learning_rate": 2.0452537429997782e-06, + "loss": 0.4495, + "step": 6391 + }, + { + "epoch": 0.7043526170798898, + "grad_norm": 6.418686866760254, + "learning_rate": 2.0438434438118366e-06, + "loss": 0.3828, + "step": 6392 + }, + { + "epoch": 0.7044628099173553, + "grad_norm": 5.245448589324951, + "learning_rate": 2.0424335061067345e-06, + "loss": 0.4428, + "step": 6393 + }, + { + "epoch": 0.7045730027548209, + "grad_norm": 5.457551956176758, + "learning_rate": 2.0410239300568785e-06, + "loss": 0.4255, + "step": 6394 + }, + { + "epoch": 0.7046831955922865, + "grad_norm": 5.4160943031311035, + "learning_rate": 2.0396147158346362e-06, + "loss": 0.3421, + "step": 6395 + }, + { + "epoch": 0.704793388429752, + "grad_norm": 7.206965923309326, + "learning_rate": 2.038205863612329e-06, + "loss": 0.3551, + "step": 6396 + }, + { + "epoch": 0.7049035812672176, + "grad_norm": 7.743720531463623, + "learning_rate": 2.036797373562231e-06, + "loss": 0.4317, + "step": 6397 + }, + { + "epoch": 0.7050137741046832, + "grad_norm": 5.294939041137695, + "learning_rate": 2.0353892458565742e-06, + "loss": 0.4029, + "step": 6398 + }, + { + "epoch": 0.7051239669421487, + "grad_norm": 4.676883697509766, + "learning_rate": 2.03398148066755e-06, + "loss": 0.4449, + "step": 6399 + }, + { + "epoch": 0.7052341597796143, + "grad_norm": 6.229726314544678, + "learning_rate": 2.0325740781672975e-06, + "loss": 0.3878, + "step": 6400 + }, + { + "epoch": 0.7053443526170798, + "grad_norm": 7.104625701904297, + "learning_rate": 2.0311670385279177e-06, + "loss": 0.4034, + "step": 6401 + }, + { + "epoch": 0.7054545454545454, + "grad_norm": 5.908341407775879, + "learning_rate": 2.0297603619214644e-06, + "loss": 0.3683, + "step": 6402 + }, + { + "epoch": 0.705564738292011, + "grad_norm": 4.865956783294678, + "learning_rate": 2.028354048519951e-06, + "loss": 0.4301, + "step": 6403 + }, + { + "epoch": 0.7056749311294765, + "grad_norm": 9.047842025756836, + "learning_rate": 2.026948098495339e-06, + "loss": 0.4413, + "step": 6404 + }, + { + "epoch": 0.7057851239669422, + "grad_norm": 5.223608493804932, + "learning_rate": 2.0255425120195533e-06, + "loss": 0.4028, + "step": 6405 + }, + { + "epoch": 0.7058953168044078, + "grad_norm": 5.946341514587402, + "learning_rate": 2.0241372892644702e-06, + "loss": 0.4659, + "step": 6406 + }, + { + "epoch": 0.7060055096418733, + "grad_norm": 6.7283501625061035, + "learning_rate": 2.0227324304019203e-06, + "loss": 0.4225, + 
"step": 6407 + }, + { + "epoch": 0.7061157024793389, + "grad_norm": 6.779466152191162, + "learning_rate": 2.0213279356036925e-06, + "loss": 0.4736, + "step": 6408 + }, + { + "epoch": 0.7062258953168044, + "grad_norm": 8.905494689941406, + "learning_rate": 2.019923805041533e-06, + "loss": 0.368, + "step": 6409 + }, + { + "epoch": 0.70633608815427, + "grad_norm": 9.43920612335205, + "learning_rate": 2.018520038887139e-06, + "loss": 0.4197, + "step": 6410 + }, + { + "epoch": 0.7064462809917356, + "grad_norm": 7.749484539031982, + "learning_rate": 2.017116637312161e-06, + "loss": 0.4171, + "step": 6411 + }, + { + "epoch": 0.7065564738292011, + "grad_norm": 10.579435348510742, + "learning_rate": 2.0157136004882156e-06, + "loss": 0.4442, + "step": 6412 + }, + { + "epoch": 0.7066666666666667, + "grad_norm": 4.212276935577393, + "learning_rate": 2.0143109285868653e-06, + "loss": 0.3502, + "step": 6413 + }, + { + "epoch": 0.7067768595041323, + "grad_norm": 6.677924156188965, + "learning_rate": 2.012908621779626e-06, + "loss": 0.3678, + "step": 6414 + }, + { + "epoch": 0.7068870523415978, + "grad_norm": 4.494370460510254, + "learning_rate": 2.0115066802379818e-06, + "loss": 0.3759, + "step": 6415 + }, + { + "epoch": 0.7069972451790634, + "grad_norm": 6.205049991607666, + "learning_rate": 2.0101051041333593e-06, + "loss": 0.4057, + "step": 6416 + }, + { + "epoch": 0.707107438016529, + "grad_norm": 5.39734411239624, + "learning_rate": 2.008703893637145e-06, + "loss": 0.3744, + "step": 6417 + }, + { + "epoch": 0.7072176308539945, + "grad_norm": 6.977585315704346, + "learning_rate": 2.007303048920684e-06, + "loss": 0.3501, + "step": 6418 + }, + { + "epoch": 0.7073278236914601, + "grad_norm": 7.957043647766113, + "learning_rate": 2.00590257015527e-06, + "loss": 0.4709, + "step": 6419 + }, + { + "epoch": 0.7074380165289256, + "grad_norm": 3.9455745220184326, + "learning_rate": 2.004502457512158e-06, + "loss": 0.3972, + "step": 6420 + }, + { + "epoch": 0.7075482093663912, + "grad_norm": 5.484201431274414, + "learning_rate": 2.003102711162553e-06, + "loss": 0.3926, + "step": 6421 + }, + { + "epoch": 0.7076584022038568, + "grad_norm": 5.1551594734191895, + "learning_rate": 2.001703331277619e-06, + "loss": 0.4625, + "step": 6422 + }, + { + "epoch": 0.7077685950413223, + "grad_norm": 6.230329990386963, + "learning_rate": 2.0003043180284763e-06, + "loss": 0.4086, + "step": 6423 + }, + { + "epoch": 0.7078787878787879, + "grad_norm": 7.448983192443848, + "learning_rate": 1.998905671586195e-06, + "loss": 0.4738, + "step": 6424 + }, + { + "epoch": 0.7079889807162535, + "grad_norm": 9.219151496887207, + "learning_rate": 1.9975073921218043e-06, + "loss": 0.3801, + "step": 6425 + }, + { + "epoch": 0.708099173553719, + "grad_norm": 4.740880966186523, + "learning_rate": 1.9961094798062903e-06, + "loss": 0.305, + "step": 6426 + }, + { + "epoch": 0.7082093663911846, + "grad_norm": 7.525774955749512, + "learning_rate": 1.9947119348105877e-06, + "loss": 0.455, + "step": 6427 + }, + { + "epoch": 0.7083195592286501, + "grad_norm": 7.525498390197754, + "learning_rate": 1.993314757305592e-06, + "loss": 0.4045, + "step": 6428 + }, + { + "epoch": 0.7084297520661157, + "grad_norm": 5.27136754989624, + "learning_rate": 1.991917947462153e-06, + "loss": 0.3178, + "step": 6429 + }, + { + "epoch": 0.7085399449035813, + "grad_norm": 4.343788146972656, + "learning_rate": 1.9905215054510724e-06, + "loss": 0.2995, + "step": 6430 + }, + { + "epoch": 0.7086501377410468, + "grad_norm": 12.280888557434082, + "learning_rate": 
1.98912543144311e-06, + "loss": 0.516, + "step": 6431 + }, + { + "epoch": 0.7087603305785124, + "grad_norm": 4.431636333465576, + "learning_rate": 1.9877297256089813e-06, + "loss": 0.3228, + "step": 6432 + }, + { + "epoch": 0.708870523415978, + "grad_norm": 5.902378559112549, + "learning_rate": 1.9863343881193513e-06, + "loss": 0.4118, + "step": 6433 + }, + { + "epoch": 0.7089807162534435, + "grad_norm": 9.444318771362305, + "learning_rate": 1.9849394191448467e-06, + "loss": 0.4742, + "step": 6434 + }, + { + "epoch": 0.7090909090909091, + "grad_norm": 4.91517448425293, + "learning_rate": 1.9835448188560474e-06, + "loss": 0.3875, + "step": 6435 + }, + { + "epoch": 0.7092011019283747, + "grad_norm": 5.268383026123047, + "learning_rate": 1.9821505874234833e-06, + "loss": 0.4068, + "step": 6436 + }, + { + "epoch": 0.7093112947658402, + "grad_norm": 4.532389163970947, + "learning_rate": 1.980756725017644e-06, + "loss": 0.3579, + "step": 6437 + }, + { + "epoch": 0.7094214876033058, + "grad_norm": 7.421934127807617, + "learning_rate": 1.9793632318089755e-06, + "loss": 0.3508, + "step": 6438 + }, + { + "epoch": 0.7095316804407713, + "grad_norm": 6.017141819000244, + "learning_rate": 1.9779701079678732e-06, + "loss": 0.4149, + "step": 6439 + }, + { + "epoch": 0.7096418732782369, + "grad_norm": 4.674948692321777, + "learning_rate": 1.9765773536646902e-06, + "loss": 0.368, + "step": 6440 + }, + { + "epoch": 0.7097520661157025, + "grad_norm": 9.248699188232422, + "learning_rate": 1.9751849690697377e-06, + "loss": 0.2843, + "step": 6441 + }, + { + "epoch": 0.709862258953168, + "grad_norm": 3.4135935306549072, + "learning_rate": 1.9737929543532743e-06, + "loss": 0.346, + "step": 6442 + }, + { + "epoch": 0.7099724517906336, + "grad_norm": 4.658424377441406, + "learning_rate": 1.9724013096855206e-06, + "loss": 0.4154, + "step": 6443 + }, + { + "epoch": 0.7100826446280992, + "grad_norm": 5.710280418395996, + "learning_rate": 1.9710100352366436e-06, + "loss": 0.435, + "step": 6444 + }, + { + "epoch": 0.7101928374655647, + "grad_norm": 7.507155895233154, + "learning_rate": 1.9696191311767777e-06, + "loss": 0.4263, + "step": 6445 + }, + { + "epoch": 0.7103030303030303, + "grad_norm": 5.260646343231201, + "learning_rate": 1.968228597676001e-06, + "loss": 0.3965, + "step": 6446 + }, + { + "epoch": 0.7104132231404958, + "grad_norm": 6.585782051086426, + "learning_rate": 1.9668384349043456e-06, + "loss": 0.3621, + "step": 6447 + }, + { + "epoch": 0.7105234159779614, + "grad_norm": 7.680531978607178, + "learning_rate": 1.965448643031811e-06, + "loss": 0.4604, + "step": 6448 + }, + { + "epoch": 0.710633608815427, + "grad_norm": 7.507516860961914, + "learning_rate": 1.9640592222283373e-06, + "loss": 0.4089, + "step": 6449 + }, + { + "epoch": 0.7107438016528925, + "grad_norm": 6.0743021965026855, + "learning_rate": 1.962670172663823e-06, + "loss": 0.3646, + "step": 6450 + }, + { + "epoch": 0.7108539944903581, + "grad_norm": 8.247718811035156, + "learning_rate": 1.961281494508129e-06, + "loss": 0.4097, + "step": 6451 + }, + { + "epoch": 0.7109641873278237, + "grad_norm": 5.167181015014648, + "learning_rate": 1.9598931879310616e-06, + "loss": 0.4324, + "step": 6452 + }, + { + "epoch": 0.7110743801652892, + "grad_norm": 5.881624221801758, + "learning_rate": 1.958505253102383e-06, + "loss": 0.3745, + "step": 6453 + }, + { + "epoch": 0.7111845730027548, + "grad_norm": 5.003997325897217, + "learning_rate": 1.957117690191814e-06, + "loss": 0.4214, + "step": 6454 + }, + { + "epoch": 0.7112947658402203, + "grad_norm": 
8.985822677612305, + "learning_rate": 1.955730499369028e-06, + "loss": 0.4729, + "step": 6455 + }, + { + "epoch": 0.7114049586776859, + "grad_norm": 10.160449981689453, + "learning_rate": 1.95434368080365e-06, + "loss": 0.4224, + "step": 6456 + }, + { + "epoch": 0.7115151515151515, + "grad_norm": 9.159520149230957, + "learning_rate": 1.9529572346652646e-06, + "loss": 0.3845, + "step": 6457 + }, + { + "epoch": 0.711625344352617, + "grad_norm": 5.77703332901001, + "learning_rate": 1.951571161123408e-06, + "loss": 0.3817, + "step": 6458 + }, + { + "epoch": 0.7117355371900826, + "grad_norm": 6.839252948760986, + "learning_rate": 1.9501854603475713e-06, + "loss": 0.4795, + "step": 6459 + }, + { + "epoch": 0.7118457300275483, + "grad_norm": 5.172279357910156, + "learning_rate": 1.948800132507198e-06, + "loss": 0.3201, + "step": 6460 + }, + { + "epoch": 0.7119559228650137, + "grad_norm": 5.667455196380615, + "learning_rate": 1.9474151777716895e-06, + "loss": 0.3929, + "step": 6461 + }, + { + "epoch": 0.7120661157024794, + "grad_norm": 6.601551532745361, + "learning_rate": 1.9460305963104004e-06, + "loss": 0.4091, + "step": 6462 + }, + { + "epoch": 0.712176308539945, + "grad_norm": 3.548739194869995, + "learning_rate": 1.9446463882926377e-06, + "loss": 0.3791, + "step": 6463 + }, + { + "epoch": 0.7122865013774105, + "grad_norm": 8.532393455505371, + "learning_rate": 1.9432625538876644e-06, + "loss": 0.2892, + "step": 6464 + }, + { + "epoch": 0.7123966942148761, + "grad_norm": 8.71661376953125, + "learning_rate": 1.9418790932646998e-06, + "loss": 0.4074, + "step": 6465 + }, + { + "epoch": 0.7125068870523416, + "grad_norm": 8.869766235351562, + "learning_rate": 1.9404960065929116e-06, + "loss": 0.3689, + "step": 6466 + }, + { + "epoch": 0.7126170798898072, + "grad_norm": 6.914660453796387, + "learning_rate": 1.9391132940414287e-06, + "loss": 0.415, + "step": 6467 + }, + { + "epoch": 0.7127272727272728, + "grad_norm": 4.588566780090332, + "learning_rate": 1.937730955779331e-06, + "loss": 0.4176, + "step": 6468 + }, + { + "epoch": 0.7128374655647383, + "grad_norm": 5.332058429718018, + "learning_rate": 1.936348991975652e-06, + "loss": 0.4324, + "step": 6469 + }, + { + "epoch": 0.7129476584022039, + "grad_norm": 5.654483318328857, + "learning_rate": 1.9349674027993766e-06, + "loss": 0.305, + "step": 6470 + }, + { + "epoch": 0.7130578512396695, + "grad_norm": 5.828920364379883, + "learning_rate": 1.9335861884194536e-06, + "loss": 0.3845, + "step": 6471 + }, + { + "epoch": 0.713168044077135, + "grad_norm": 4.670661449432373, + "learning_rate": 1.932205349004775e-06, + "loss": 0.26, + "step": 6472 + }, + { + "epoch": 0.7132782369146006, + "grad_norm": 5.2829909324646, + "learning_rate": 1.930824884724194e-06, + "loss": 0.417, + "step": 6473 + }, + { + "epoch": 0.7133884297520661, + "grad_norm": 6.3276686668396, + "learning_rate": 1.929444795746517e-06, + "loss": 0.4473, + "step": 6474 + }, + { + "epoch": 0.7134986225895317, + "grad_norm": 11.624544143676758, + "learning_rate": 1.928065082240499e-06, + "loss": 0.3489, + "step": 6475 + }, + { + "epoch": 0.7136088154269973, + "grad_norm": 8.43012523651123, + "learning_rate": 1.926685744374857e-06, + "loss": 0.4578, + "step": 6476 + }, + { + "epoch": 0.7137190082644628, + "grad_norm": 6.978827476501465, + "learning_rate": 1.925306782318256e-06, + "loss": 0.3079, + "step": 6477 + }, + { + "epoch": 0.7138292011019284, + "grad_norm": 10.418410301208496, + "learning_rate": 1.923928196239318e-06, + "loss": 0.2941, + "step": 6478 + }, + { + "epoch": 
0.713939393939394, + "grad_norm": 7.191355228424072, + "learning_rate": 1.92254998630662e-06, + "loss": 0.4816, + "step": 6479 + }, + { + "epoch": 0.7140495867768595, + "grad_norm": 9.33836841583252, + "learning_rate": 1.9211721526886883e-06, + "loss": 0.4029, + "step": 6480 + }, + { + "epoch": 0.7141597796143251, + "grad_norm": 5.70156717300415, + "learning_rate": 1.919794695554008e-06, + "loss": 0.4368, + "step": 6481 + }, + { + "epoch": 0.7142699724517906, + "grad_norm": 10.257763862609863, + "learning_rate": 1.9184176150710184e-06, + "loss": 0.4192, + "step": 6482 + }, + { + "epoch": 0.7143801652892562, + "grad_norm": 4.961391448974609, + "learning_rate": 1.9170409114081067e-06, + "loss": 0.414, + "step": 6483 + }, + { + "epoch": 0.7144903581267218, + "grad_norm": 6.4829630851745605, + "learning_rate": 1.9156645847336203e-06, + "loss": 0.4496, + "step": 6484 + }, + { + "epoch": 0.7146005509641873, + "grad_norm": 6.972085952758789, + "learning_rate": 1.91428863521586e-06, + "loss": 0.3441, + "step": 6485 + }, + { + "epoch": 0.7147107438016529, + "grad_norm": 10.662055969238281, + "learning_rate": 1.9129130630230753e-06, + "loss": 0.4387, + "step": 6486 + }, + { + "epoch": 0.7148209366391185, + "grad_norm": 5.661942958831787, + "learning_rate": 1.9115378683234742e-06, + "loss": 0.2902, + "step": 6487 + }, + { + "epoch": 0.714931129476584, + "grad_norm": 4.665764331817627, + "learning_rate": 1.910163051285219e-06, + "loss": 0.3051, + "step": 6488 + }, + { + "epoch": 0.7150413223140496, + "grad_norm": 5.3506574630737305, + "learning_rate": 1.9087886120764227e-06, + "loss": 0.3345, + "step": 6489 + }, + { + "epoch": 0.7151515151515152, + "grad_norm": 6.438321590423584, + "learning_rate": 1.9074145508651533e-06, + "loss": 0.38, + "step": 6490 + }, + { + "epoch": 0.7152617079889807, + "grad_norm": 11.505531311035156, + "learning_rate": 1.9060408678194347e-06, + "loss": 0.4181, + "step": 6491 + }, + { + "epoch": 0.7153719008264463, + "grad_norm": 5.2197442054748535, + "learning_rate": 1.9046675631072404e-06, + "loss": 0.4098, + "step": 6492 + }, + { + "epoch": 0.7154820936639118, + "grad_norm": 7.508244037628174, + "learning_rate": 1.903294636896501e-06, + "loss": 0.4202, + "step": 6493 + }, + { + "epoch": 0.7155922865013774, + "grad_norm": 9.025035858154297, + "learning_rate": 1.9019220893551016e-06, + "loss": 0.4205, + "step": 6494 + }, + { + "epoch": 0.715702479338843, + "grad_norm": 9.53231430053711, + "learning_rate": 1.9005499206508755e-06, + "loss": 0.4843, + "step": 6495 + }, + { + "epoch": 0.7158126721763085, + "grad_norm": 8.423941612243652, + "learning_rate": 1.8991781309516155e-06, + "loss": 0.3284, + "step": 6496 + }, + { + "epoch": 0.7159228650137741, + "grad_norm": 5.7038469314575195, + "learning_rate": 1.8978067204250673e-06, + "loss": 0.3674, + "step": 6497 + }, + { + "epoch": 0.7160330578512397, + "grad_norm": 7.418968200683594, + "learning_rate": 1.8964356892389253e-06, + "loss": 0.4702, + "step": 6498 + }, + { + "epoch": 0.7161432506887052, + "grad_norm": 5.119117259979248, + "learning_rate": 1.8950650375608432e-06, + "loss": 0.4034, + "step": 6499 + }, + { + "epoch": 0.7162534435261708, + "grad_norm": 6.179746627807617, + "learning_rate": 1.8936947655584259e-06, + "loss": 0.3843, + "step": 6500 + }, + { + "epoch": 0.7163636363636363, + "grad_norm": 4.594128608703613, + "learning_rate": 1.8923248733992344e-06, + "loss": 0.3719, + "step": 6501 + }, + { + "epoch": 0.7164738292011019, + "grad_norm": 5.928335189819336, + "learning_rate": 1.890955361250778e-06, + "loss": 0.4113, 
+ "step": 6502 + }, + { + "epoch": 0.7165840220385675, + "grad_norm": 7.248212814331055, + "learning_rate": 1.88958622928052e-06, + "loss": 0.4168, + "step": 6503 + }, + { + "epoch": 0.716694214876033, + "grad_norm": 6.2532830238342285, + "learning_rate": 1.8882174776558866e-06, + "loss": 0.3212, + "step": 6504 + }, + { + "epoch": 0.7168044077134986, + "grad_norm": 4.506024360656738, + "learning_rate": 1.8868491065442468e-06, + "loss": 0.4239, + "step": 6505 + }, + { + "epoch": 0.7169146005509642, + "grad_norm": 9.820466041564941, + "learning_rate": 1.8854811161129238e-06, + "loss": 0.3794, + "step": 6506 + }, + { + "epoch": 0.7170247933884297, + "grad_norm": 9.815295219421387, + "learning_rate": 1.8841135065292043e-06, + "loss": 0.4058, + "step": 6507 + }, + { + "epoch": 0.7171349862258953, + "grad_norm": 8.686479568481445, + "learning_rate": 1.8827462779603173e-06, + "loss": 0.4168, + "step": 6508 + }, + { + "epoch": 0.7172451790633608, + "grad_norm": 8.61069107055664, + "learning_rate": 1.881379430573448e-06, + "loss": 0.3631, + "step": 6509 + }, + { + "epoch": 0.7173553719008264, + "grad_norm": 5.897222995758057, + "learning_rate": 1.8800129645357384e-06, + "loss": 0.4198, + "step": 6510 + }, + { + "epoch": 0.717465564738292, + "grad_norm": 9.235694885253906, + "learning_rate": 1.8786468800142832e-06, + "loss": 0.3956, + "step": 6511 + }, + { + "epoch": 0.7175757575757575, + "grad_norm": 4.371232509613037, + "learning_rate": 1.8772811771761257e-06, + "loss": 0.4216, + "step": 6512 + }, + { + "epoch": 0.7176859504132231, + "grad_norm": 4.844611167907715, + "learning_rate": 1.875915856188268e-06, + "loss": 0.3879, + "step": 6513 + }, + { + "epoch": 0.7177961432506887, + "grad_norm": 7.7547101974487305, + "learning_rate": 1.8745509172176624e-06, + "loss": 0.4434, + "step": 6514 + }, + { + "epoch": 0.7179063360881542, + "grad_norm": 9.482111930847168, + "learning_rate": 1.8731863604312183e-06, + "loss": 0.5215, + "step": 6515 + }, + { + "epoch": 0.7180165289256198, + "grad_norm": 4.813543796539307, + "learning_rate": 1.871822185995792e-06, + "loss": 0.3946, + "step": 6516 + }, + { + "epoch": 0.7181267217630855, + "grad_norm": 8.135725975036621, + "learning_rate": 1.8704583940781972e-06, + "loss": 0.4224, + "step": 6517 + }, + { + "epoch": 0.718236914600551, + "grad_norm": 7.12558126449585, + "learning_rate": 1.8690949848452034e-06, + "loss": 0.3971, + "step": 6518 + }, + { + "epoch": 0.7183471074380166, + "grad_norm": 4.535747528076172, + "learning_rate": 1.8677319584635257e-06, + "loss": 0.334, + "step": 6519 + }, + { + "epoch": 0.718457300275482, + "grad_norm": 5.1335272789001465, + "learning_rate": 1.8663693150998391e-06, + "loss": 0.3054, + "step": 6520 + }, + { + "epoch": 0.7185674931129477, + "grad_norm": 4.701351165771484, + "learning_rate": 1.865007054920771e-06, + "loss": 0.4164, + "step": 6521 + }, + { + "epoch": 0.7186776859504133, + "grad_norm": 7.359562397003174, + "learning_rate": 1.8636451780928967e-06, + "loss": 0.4328, + "step": 6522 + }, + { + "epoch": 0.7187878787878788, + "grad_norm": 5.535491943359375, + "learning_rate": 1.8622836847827508e-06, + "loss": 0.4366, + "step": 6523 + }, + { + "epoch": 0.7188980716253444, + "grad_norm": 9.683491706848145, + "learning_rate": 1.8609225751568193e-06, + "loss": 0.5647, + "step": 6524 + }, + { + "epoch": 0.71900826446281, + "grad_norm": 9.465622901916504, + "learning_rate": 1.8595618493815377e-06, + "loss": 0.3914, + "step": 6525 + }, + { + "epoch": 0.7191184573002755, + "grad_norm": 6.814040184020996, + "learning_rate": 
1.8582015076232995e-06, + "loss": 0.4837, + "step": 6526 + }, + { + "epoch": 0.7192286501377411, + "grad_norm": 7.456498622894287, + "learning_rate": 1.85684155004845e-06, + "loss": 0.3526, + "step": 6527 + }, + { + "epoch": 0.7193388429752066, + "grad_norm": 8.250312805175781, + "learning_rate": 1.8554819768232835e-06, + "loss": 0.3609, + "step": 6528 + }, + { + "epoch": 0.7194490358126722, + "grad_norm": 5.8619537353515625, + "learning_rate": 1.8541227881140528e-06, + "loss": 0.3902, + "step": 6529 + }, + { + "epoch": 0.7195592286501378, + "grad_norm": 9.79912281036377, + "learning_rate": 1.8527639840869622e-06, + "loss": 0.4381, + "step": 6530 + }, + { + "epoch": 0.7196694214876033, + "grad_norm": 5.200512886047363, + "learning_rate": 1.8514055649081646e-06, + "loss": 0.4007, + "step": 6531 + }, + { + "epoch": 0.7197796143250689, + "grad_norm": 6.89130973815918, + "learning_rate": 1.8500475307437721e-06, + "loss": 0.4139, + "step": 6532 + }, + { + "epoch": 0.7198898071625345, + "grad_norm": 5.5149664878845215, + "learning_rate": 1.8486898817598474e-06, + "loss": 0.34, + "step": 6533 + }, + { + "epoch": 0.72, + "grad_norm": 8.045257568359375, + "learning_rate": 1.8473326181224033e-06, + "loss": 0.4186, + "step": 6534 + }, + { + "epoch": 0.7201101928374656, + "grad_norm": 7.109325885772705, + "learning_rate": 1.845975739997411e-06, + "loss": 0.3819, + "step": 6535 + }, + { + "epoch": 0.7202203856749311, + "grad_norm": 6.589534282684326, + "learning_rate": 1.844619247550788e-06, + "loss": 0.4972, + "step": 6536 + }, + { + "epoch": 0.7203305785123967, + "grad_norm": 6.8329243659973145, + "learning_rate": 1.8432631409484091e-06, + "loss": 0.4131, + "step": 6537 + }, + { + "epoch": 0.7204407713498623, + "grad_norm": 8.504344940185547, + "learning_rate": 1.8419074203561034e-06, + "loss": 0.4125, + "step": 6538 + }, + { + "epoch": 0.7205509641873278, + "grad_norm": 6.963781833648682, + "learning_rate": 1.8405520859396458e-06, + "loss": 0.4397, + "step": 6539 + }, + { + "epoch": 0.7206611570247934, + "grad_norm": 9.599796295166016, + "learning_rate": 1.8391971378647715e-06, + "loss": 0.4355, + "step": 6540 + }, + { + "epoch": 0.720771349862259, + "grad_norm": 9.471633911132812, + "learning_rate": 1.8378425762971657e-06, + "loss": 0.4078, + "step": 6541 + }, + { + "epoch": 0.7208815426997245, + "grad_norm": 5.994978904724121, + "learning_rate": 1.8364884014024642e-06, + "loss": 0.483, + "step": 6542 + }, + { + "epoch": 0.7209917355371901, + "grad_norm": 6.151260852813721, + "learning_rate": 1.8351346133462579e-06, + "loss": 0.3639, + "step": 6543 + }, + { + "epoch": 0.7211019283746557, + "grad_norm": 8.746085166931152, + "learning_rate": 1.8337812122940917e-06, + "loss": 0.4144, + "step": 6544 + }, + { + "epoch": 0.7212121212121212, + "grad_norm": 4.571780681610107, + "learning_rate": 1.8324281984114577e-06, + "loss": 0.4102, + "step": 6545 + }, + { + "epoch": 0.7213223140495868, + "grad_norm": 7.68917989730835, + "learning_rate": 1.8310755718638069e-06, + "loss": 0.4475, + "step": 6546 + }, + { + "epoch": 0.7214325068870523, + "grad_norm": 5.956671714782715, + "learning_rate": 1.8297233328165414e-06, + "loss": 0.3835, + "step": 6547 + }, + { + "epoch": 0.7215426997245179, + "grad_norm": 4.739256858825684, + "learning_rate": 1.8283714814350113e-06, + "loss": 0.4433, + "step": 6548 + }, + { + "epoch": 0.7216528925619835, + "grad_norm": 6.6831464767456055, + "learning_rate": 1.8270200178845242e-06, + "loss": 0.4079, + "step": 6549 + }, + { + "epoch": 0.721763085399449, + "grad_norm": 
6.049987316131592, + "learning_rate": 1.825668942330342e-06, + "loss": 0.4344, + "step": 6550 + }, + { + "epoch": 0.7218732782369146, + "grad_norm": 8.413738250732422, + "learning_rate": 1.8243182549376714e-06, + "loss": 0.4136, + "step": 6551 + }, + { + "epoch": 0.7219834710743802, + "grad_norm": 6.3152008056640625, + "learning_rate": 1.8229679558716779e-06, + "loss": 0.3579, + "step": 6552 + }, + { + "epoch": 0.7220936639118457, + "grad_norm": 5.228076457977295, + "learning_rate": 1.8216180452974807e-06, + "loss": 0.4032, + "step": 6553 + }, + { + "epoch": 0.7222038567493113, + "grad_norm": 5.5786848068237305, + "learning_rate": 1.8202685233801442e-06, + "loss": 0.462, + "step": 6554 + }, + { + "epoch": 0.7223140495867768, + "grad_norm": 8.232794761657715, + "learning_rate": 1.818919390284692e-06, + "loss": 0.4015, + "step": 6555 + }, + { + "epoch": 0.7224242424242424, + "grad_norm": 6.3292365074157715, + "learning_rate": 1.8175706461760977e-06, + "loss": 0.4662, + "step": 6556 + }, + { + "epoch": 0.722534435261708, + "grad_norm": 7.013743877410889, + "learning_rate": 1.8162222912192896e-06, + "loss": 0.3822, + "step": 6557 + }, + { + "epoch": 0.7226446280991735, + "grad_norm": 7.752472400665283, + "learning_rate": 1.8148743255791428e-06, + "loss": 0.405, + "step": 6558 + }, + { + "epoch": 0.7227548209366391, + "grad_norm": 6.220157623291016, + "learning_rate": 1.81352674942049e-06, + "loss": 0.3573, + "step": 6559 + }, + { + "epoch": 0.7228650137741047, + "grad_norm": 6.291469097137451, + "learning_rate": 1.8121795629081163e-06, + "loss": 0.4659, + "step": 6560 + }, + { + "epoch": 0.7229752066115702, + "grad_norm": 8.126771926879883, + "learning_rate": 1.8108327662067554e-06, + "loss": 0.4463, + "step": 6561 + }, + { + "epoch": 0.7230853994490358, + "grad_norm": 9.852230072021484, + "learning_rate": 1.809486359481093e-06, + "loss": 0.4787, + "step": 6562 + }, + { + "epoch": 0.7231955922865014, + "grad_norm": 8.47900676727295, + "learning_rate": 1.8081403428957762e-06, + "loss": 0.384, + "step": 6563 + }, + { + "epoch": 0.7233057851239669, + "grad_norm": 5.633147239685059, + "learning_rate": 1.8067947166153937e-06, + "loss": 0.3449, + "step": 6564 + }, + { + "epoch": 0.7234159779614325, + "grad_norm": 6.953232765197754, + "learning_rate": 1.805449480804487e-06, + "loss": 0.4464, + "step": 6565 + }, + { + "epoch": 0.723526170798898, + "grad_norm": 5.364905834197998, + "learning_rate": 1.804104635627561e-06, + "loss": 0.3573, + "step": 6566 + }, + { + "epoch": 0.7236363636363636, + "grad_norm": 8.890564918518066, + "learning_rate": 1.8027601812490614e-06, + "loss": 0.4907, + "step": 6567 + }, + { + "epoch": 0.7237465564738292, + "grad_norm": 4.464534759521484, + "learning_rate": 1.8014161178333878e-06, + "loss": 0.4317, + "step": 6568 + }, + { + "epoch": 0.7238567493112947, + "grad_norm": 6.534988880157471, + "learning_rate": 1.8000724455448965e-06, + "loss": 0.4224, + "step": 6569 + }, + { + "epoch": 0.7239669421487603, + "grad_norm": 7.258755207061768, + "learning_rate": 1.7987291645478926e-06, + "loss": 0.4113, + "step": 6570 + }, + { + "epoch": 0.724077134986226, + "grad_norm": 7.645383358001709, + "learning_rate": 1.7973862750066374e-06, + "loss": 0.3435, + "step": 6571 + }, + { + "epoch": 0.7241873278236914, + "grad_norm": 6.104715347290039, + "learning_rate": 1.7960437770853368e-06, + "loss": 0.4122, + "step": 6572 + }, + { + "epoch": 0.724297520661157, + "grad_norm": 6.718868732452393, + "learning_rate": 1.7947016709481552e-06, + "loss": 0.3377, + "step": 6573 + }, + { + "epoch": 
0.7244077134986225, + "grad_norm": 6.723759174346924, + "learning_rate": 1.7933599567592092e-06, + "loss": 0.3965, + "step": 6574 + }, + { + "epoch": 0.7245179063360881, + "grad_norm": 8.80811882019043, + "learning_rate": 1.7920186346825618e-06, + "loss": 0.4579, + "step": 6575 + }, + { + "epoch": 0.7246280991735538, + "grad_norm": 5.257187843322754, + "learning_rate": 1.7906777048822332e-06, + "loss": 0.3238, + "step": 6576 + }, + { + "epoch": 0.7247382920110192, + "grad_norm": 4.086373805999756, + "learning_rate": 1.789337167522196e-06, + "loss": 0.3268, + "step": 6577 + }, + { + "epoch": 0.7248484848484849, + "grad_norm": 8.623800277709961, + "learning_rate": 1.7879970227663696e-06, + "loss": 0.4486, + "step": 6578 + }, + { + "epoch": 0.7249586776859505, + "grad_norm": 9.160906791687012, + "learning_rate": 1.7866572707786301e-06, + "loss": 0.5127, + "step": 6579 + }, + { + "epoch": 0.725068870523416, + "grad_norm": 6.842070579528809, + "learning_rate": 1.7853179117228064e-06, + "loss": 0.4273, + "step": 6580 + }, + { + "epoch": 0.7251790633608816, + "grad_norm": 5.782288551330566, + "learning_rate": 1.7839789457626733e-06, + "loss": 0.3604, + "step": 6581 + }, + { + "epoch": 0.725289256198347, + "grad_norm": 6.150672435760498, + "learning_rate": 1.7826403730619635e-06, + "loss": 0.3961, + "step": 6582 + }, + { + "epoch": 0.7253994490358127, + "grad_norm": 6.725852012634277, + "learning_rate": 1.7813021937843606e-06, + "loss": 0.3989, + "step": 6583 + }, + { + "epoch": 0.7255096418732783, + "grad_norm": 6.893726825714111, + "learning_rate": 1.7799644080934959e-06, + "loss": 0.4057, + "step": 6584 + }, + { + "epoch": 0.7256198347107438, + "grad_norm": 4.0173115730285645, + "learning_rate": 1.7786270161529578e-06, + "loss": 0.4538, + "step": 6585 + }, + { + "epoch": 0.7257300275482094, + "grad_norm": 5.072638988494873, + "learning_rate": 1.7772900181262853e-06, + "loss": 0.3996, + "step": 6586 + }, + { + "epoch": 0.725840220385675, + "grad_norm": 3.620347738265991, + "learning_rate": 1.775953414176965e-06, + "loss": 0.3186, + "step": 6587 + }, + { + "epoch": 0.7259504132231405, + "grad_norm": 4.775448799133301, + "learning_rate": 1.7746172044684413e-06, + "loss": 0.4033, + "step": 6588 + }, + { + "epoch": 0.7260606060606061, + "grad_norm": 7.6348772048950195, + "learning_rate": 1.7732813891641088e-06, + "loss": 0.4258, + "step": 6589 + }, + { + "epoch": 0.7261707988980717, + "grad_norm": 6.827914714813232, + "learning_rate": 1.7719459684273089e-06, + "loss": 0.4086, + "step": 6590 + }, + { + "epoch": 0.7262809917355372, + "grad_norm": 5.695342063903809, + "learning_rate": 1.7706109424213414e-06, + "loss": 0.3584, + "step": 6591 + }, + { + "epoch": 0.7263911845730028, + "grad_norm": 5.97652530670166, + "learning_rate": 1.7692763113094557e-06, + "loss": 0.4056, + "step": 6592 + }, + { + "epoch": 0.7265013774104683, + "grad_norm": 6.561317443847656, + "learning_rate": 1.7679420752548499e-06, + "loss": 0.4263, + "step": 6593 + }, + { + "epoch": 0.7266115702479339, + "grad_norm": 4.876357555389404, + "learning_rate": 1.7666082344206787e-06, + "loss": 0.3326, + "step": 6594 + }, + { + "epoch": 0.7267217630853995, + "grad_norm": 6.124058723449707, + "learning_rate": 1.7652747889700434e-06, + "loss": 0.3464, + "step": 6595 + }, + { + "epoch": 0.726831955922865, + "grad_norm": 5.761227607727051, + "learning_rate": 1.7639417390660007e-06, + "loss": 0.2909, + "step": 6596 + }, + { + "epoch": 0.7269421487603306, + "grad_norm": 5.419153690338135, + "learning_rate": 1.7626090848715598e-06, + "loss": 
0.4528, + "step": 6597 + }, + { + "epoch": 0.7270523415977962, + "grad_norm": 6.617321968078613, + "learning_rate": 1.7612768265496738e-06, + "loss": 0.412, + "step": 6598 + }, + { + "epoch": 0.7271625344352617, + "grad_norm": 4.691646099090576, + "learning_rate": 1.7599449642632605e-06, + "loss": 0.4754, + "step": 6599 + }, + { + "epoch": 0.7272727272727273, + "grad_norm": 6.649336814880371, + "learning_rate": 1.7586134981751785e-06, + "loss": 0.4625, + "step": 6600 + }, + { + "epoch": 0.7273829201101928, + "grad_norm": 5.788405418395996, + "learning_rate": 1.7572824284482387e-06, + "loss": 0.4402, + "step": 6601 + }, + { + "epoch": 0.7274931129476584, + "grad_norm": 6.544713973999023, + "learning_rate": 1.7559517552452082e-06, + "loss": 0.3637, + "step": 6602 + }, + { + "epoch": 0.727603305785124, + "grad_norm": 5.603353023529053, + "learning_rate": 1.7546214787288057e-06, + "loss": 0.4017, + "step": 6603 + }, + { + "epoch": 0.7277134986225895, + "grad_norm": 10.790420532226562, + "learning_rate": 1.7532915990616955e-06, + "loss": 0.4249, + "step": 6604 + }, + { + "epoch": 0.7278236914600551, + "grad_norm": 5.181728839874268, + "learning_rate": 1.7519621164064987e-06, + "loss": 0.4056, + "step": 6605 + }, + { + "epoch": 0.7279338842975207, + "grad_norm": 4.5847930908203125, + "learning_rate": 1.750633030925788e-06, + "loss": 0.3647, + "step": 6606 + }, + { + "epoch": 0.7280440771349862, + "grad_norm": 4.745823383331299, + "learning_rate": 1.7493043427820827e-06, + "loss": 0.3545, + "step": 6607 + }, + { + "epoch": 0.7281542699724518, + "grad_norm": 5.319917678833008, + "learning_rate": 1.7479760521378576e-06, + "loss": 0.4041, + "step": 6608 + }, + { + "epoch": 0.7282644628099173, + "grad_norm": 8.163823127746582, + "learning_rate": 1.74664815915554e-06, + "loss": 0.3588, + "step": 6609 + }, + { + "epoch": 0.7283746556473829, + "grad_norm": 8.125265121459961, + "learning_rate": 1.7453206639975034e-06, + "loss": 0.4947, + "step": 6610 + }, + { + "epoch": 0.7284848484848485, + "grad_norm": 9.334182739257812, + "learning_rate": 1.743993566826077e-06, + "loss": 0.4697, + "step": 6611 + }, + { + "epoch": 0.728595041322314, + "grad_norm": 5.022836685180664, + "learning_rate": 1.7426668678035402e-06, + "loss": 0.4174, + "step": 6612 + }, + { + "epoch": 0.7287052341597796, + "grad_norm": 5.568726062774658, + "learning_rate": 1.7413405670921246e-06, + "loss": 0.3476, + "step": 6613 + }, + { + "epoch": 0.7288154269972452, + "grad_norm": 6.706714630126953, + "learning_rate": 1.7400146648540094e-06, + "loss": 0.3886, + "step": 6614 + }, + { + "epoch": 0.7289256198347107, + "grad_norm": 6.909881114959717, + "learning_rate": 1.7386891612513296e-06, + "loss": 0.4205, + "step": 6615 + }, + { + "epoch": 0.7290358126721763, + "grad_norm": 6.599411487579346, + "learning_rate": 1.7373640564461707e-06, + "loss": 0.4318, + "step": 6616 + }, + { + "epoch": 0.7291460055096419, + "grad_norm": 5.470994472503662, + "learning_rate": 1.7360393506005652e-06, + "loss": 0.41, + "step": 6617 + }, + { + "epoch": 0.7292561983471074, + "grad_norm": 8.121482849121094, + "learning_rate": 1.7347150438765016e-06, + "loss": 0.4368, + "step": 6618 + }, + { + "epoch": 0.729366391184573, + "grad_norm": 7.104863166809082, + "learning_rate": 1.733391136435919e-06, + "loss": 0.4655, + "step": 6619 + }, + { + "epoch": 0.7294765840220385, + "grad_norm": 8.662788391113281, + "learning_rate": 1.7320676284407062e-06, + "loss": 0.5028, + "step": 6620 + }, + { + "epoch": 0.7295867768595041, + "grad_norm": 11.40149211883545, + 
"learning_rate": 1.730744520052699e-06, + "loss": 0.3573, + "step": 6621 + }, + { + "epoch": 0.7296969696969697, + "grad_norm": 5.609067916870117, + "learning_rate": 1.7294218114336963e-06, + "loss": 0.3833, + "step": 6622 + }, + { + "epoch": 0.7298071625344352, + "grad_norm": 6.620344161987305, + "learning_rate": 1.7280995027454372e-06, + "loss": 0.4063, + "step": 6623 + }, + { + "epoch": 0.7299173553719008, + "grad_norm": 4.553619861602783, + "learning_rate": 1.7267775941496122e-06, + "loss": 0.4281, + "step": 6624 + }, + { + "epoch": 0.7300275482093664, + "grad_norm": 4.575986385345459, + "learning_rate": 1.7254560858078724e-06, + "loss": 0.3539, + "step": 6625 + }, + { + "epoch": 0.7301377410468319, + "grad_norm": 5.355452060699463, + "learning_rate": 1.7241349778818084e-06, + "loss": 0.4154, + "step": 6626 + }, + { + "epoch": 0.7302479338842975, + "grad_norm": 7.630439281463623, + "learning_rate": 1.7228142705329715e-06, + "loss": 0.4742, + "step": 6627 + }, + { + "epoch": 0.730358126721763, + "grad_norm": 4.496499061584473, + "learning_rate": 1.721493963922855e-06, + "loss": 0.3786, + "step": 6628 + }, + { + "epoch": 0.7304683195592286, + "grad_norm": 3.9322595596313477, + "learning_rate": 1.72017405821291e-06, + "loss": 0.339, + "step": 6629 + }, + { + "epoch": 0.7305785123966942, + "grad_norm": 10.261990547180176, + "learning_rate": 1.7188545535645385e-06, + "loss": 0.4321, + "step": 6630 + }, + { + "epoch": 0.7306887052341597, + "grad_norm": 6.136670112609863, + "learning_rate": 1.7175354501390874e-06, + "loss": 0.3624, + "step": 6631 + }, + { + "epoch": 0.7307988980716253, + "grad_norm": 6.162241458892822, + "learning_rate": 1.7162167480978598e-06, + "loss": 0.4412, + "step": 6632 + }, + { + "epoch": 0.730909090909091, + "grad_norm": 11.787130355834961, + "learning_rate": 1.7148984476021107e-06, + "loss": 0.4973, + "step": 6633 + }, + { + "epoch": 0.7310192837465564, + "grad_norm": 4.376728534698486, + "learning_rate": 1.7135805488130402e-06, + "loss": 0.4117, + "step": 6634 + }, + { + "epoch": 0.731129476584022, + "grad_norm": 6.36915397644043, + "learning_rate": 1.7122630518918044e-06, + "loss": 0.3593, + "step": 6635 + }, + { + "epoch": 0.7312396694214875, + "grad_norm": 6.285597801208496, + "learning_rate": 1.7109459569995102e-06, + "loss": 0.4451, + "step": 6636 + }, + { + "epoch": 0.7313498622589532, + "grad_norm": 6.95416259765625, + "learning_rate": 1.7096292642972107e-06, + "loss": 0.4067, + "step": 6637 + }, + { + "epoch": 0.7314600550964188, + "grad_norm": 4.071789264678955, + "learning_rate": 1.7083129739459136e-06, + "loss": 0.3959, + "step": 6638 + }, + { + "epoch": 0.7315702479338843, + "grad_norm": 6.4609456062316895, + "learning_rate": 1.70699708610658e-06, + "loss": 0.3832, + "step": 6639 + }, + { + "epoch": 0.7316804407713499, + "grad_norm": 8.448399543762207, + "learning_rate": 1.7056816009401134e-06, + "loss": 0.4338, + "step": 6640 + }, + { + "epoch": 0.7317906336088155, + "grad_norm": 8.745491981506348, + "learning_rate": 1.7043665186073754e-06, + "loss": 0.3957, + "step": 6641 + }, + { + "epoch": 0.731900826446281, + "grad_norm": 7.603783130645752, + "learning_rate": 1.7030518392691785e-06, + "loss": 0.4717, + "step": 6642 + }, + { + "epoch": 0.7320110192837466, + "grad_norm": 5.770527362823486, + "learning_rate": 1.7017375630862791e-06, + "loss": 0.4157, + "step": 6643 + }, + { + "epoch": 0.7321212121212122, + "grad_norm": 4.718587875366211, + "learning_rate": 1.700423690219391e-06, + "loss": 0.4002, + "step": 6644 + }, + { + "epoch": 0.7322314049586777, 
+ "grad_norm": 5.69619607925415, + "learning_rate": 1.6991102208291777e-06, + "loss": 0.4224, + "step": 6645 + }, + { + "epoch": 0.7323415977961433, + "grad_norm": 4.270539283752441, + "learning_rate": 1.6977971550762484e-06, + "loss": 0.3277, + "step": 6646 + }, + { + "epoch": 0.7324517906336088, + "grad_norm": 8.055335998535156, + "learning_rate": 1.6964844931211689e-06, + "loss": 0.3926, + "step": 6647 + }, + { + "epoch": 0.7325619834710744, + "grad_norm": 4.8165812492370605, + "learning_rate": 1.6951722351244542e-06, + "loss": 0.3115, + "step": 6648 + }, + { + "epoch": 0.73267217630854, + "grad_norm": 7.0699310302734375, + "learning_rate": 1.6938603812465666e-06, + "loss": 0.4445, + "step": 6649 + }, + { + "epoch": 0.7327823691460055, + "grad_norm": 8.66176986694336, + "learning_rate": 1.6925489316479226e-06, + "loss": 0.457, + "step": 6650 + }, + { + "epoch": 0.7328925619834711, + "grad_norm": 4.818056106567383, + "learning_rate": 1.691237886488889e-06, + "loss": 0.4217, + "step": 6651 + }, + { + "epoch": 0.7330027548209367, + "grad_norm": 7.170517444610596, + "learning_rate": 1.68992724592978e-06, + "loss": 0.4272, + "step": 6652 + }, + { + "epoch": 0.7331129476584022, + "grad_norm": 5.2569169998168945, + "learning_rate": 1.6886170101308652e-06, + "loss": 0.3411, + "step": 6653 + }, + { + "epoch": 0.7332231404958678, + "grad_norm": 4.8793110847473145, + "learning_rate": 1.6873071792523572e-06, + "loss": 0.4019, + "step": 6654 + }, + { + "epoch": 0.7333333333333333, + "grad_norm": 4.878707408905029, + "learning_rate": 1.6859977534544302e-06, + "loss": 0.3145, + "step": 6655 + }, + { + "epoch": 0.7334435261707989, + "grad_norm": 9.079961776733398, + "learning_rate": 1.6846887328972e-06, + "loss": 0.4538, + "step": 6656 + }, + { + "epoch": 0.7335537190082645, + "grad_norm": 6.583092212677002, + "learning_rate": 1.6833801177407316e-06, + "loss": 0.4319, + "step": 6657 + }, + { + "epoch": 0.73366391184573, + "grad_norm": 9.224050521850586, + "learning_rate": 1.6820719081450505e-06, + "loss": 0.4834, + "step": 6658 + }, + { + "epoch": 0.7337741046831956, + "grad_norm": 5.355169773101807, + "learning_rate": 1.680764104270124e-06, + "loss": 0.3627, + "step": 6659 + }, + { + "epoch": 0.7338842975206612, + "grad_norm": 10.7400484085083, + "learning_rate": 1.6794567062758694e-06, + "loss": 0.4701, + "step": 6660 + }, + { + "epoch": 0.7339944903581267, + "grad_norm": 6.861835956573486, + "learning_rate": 1.6781497143221592e-06, + "loss": 0.297, + "step": 6661 + }, + { + "epoch": 0.7341046831955923, + "grad_norm": 7.219264507293701, + "learning_rate": 1.6768431285688164e-06, + "loss": 0.381, + "step": 6662 + }, + { + "epoch": 0.7342148760330579, + "grad_norm": 4.299060821533203, + "learning_rate": 1.675536949175608e-06, + "loss": 0.3768, + "step": 6663 + }, + { + "epoch": 0.7343250688705234, + "grad_norm": 6.239497184753418, + "learning_rate": 1.6742311763022574e-06, + "loss": 0.3646, + "step": 6664 + }, + { + "epoch": 0.734435261707989, + "grad_norm": 14.268431663513184, + "learning_rate": 1.6729258101084377e-06, + "loss": 0.4536, + "step": 6665 + }, + { + "epoch": 0.7345454545454545, + "grad_norm": 5.3448991775512695, + "learning_rate": 1.6716208507537673e-06, + "loss": 0.3741, + "step": 6666 + }, + { + "epoch": 0.7346556473829201, + "grad_norm": 5.006895065307617, + "learning_rate": 1.670316298397821e-06, + "loss": 0.4255, + "step": 6667 + }, + { + "epoch": 0.7347658402203857, + "grad_norm": 12.607826232910156, + "learning_rate": 1.6690121532001202e-06, + "loss": 0.4501, + "step": 6668 + }, + 
{ + "epoch": 0.7348760330578512, + "grad_norm": 6.037322998046875, + "learning_rate": 1.66770841532014e-06, + "loss": 0.3252, + "step": 6669 + }, + { + "epoch": 0.7349862258953168, + "grad_norm": 7.4917097091674805, + "learning_rate": 1.6664050849172997e-06, + "loss": 0.4251, + "step": 6670 + }, + { + "epoch": 0.7350964187327824, + "grad_norm": 6.970833778381348, + "learning_rate": 1.6651021621509738e-06, + "loss": 0.3062, + "step": 6671 + }, + { + "epoch": 0.7352066115702479, + "grad_norm": 13.247237205505371, + "learning_rate": 1.6637996471804868e-06, + "loss": 0.4118, + "step": 6672 + }, + { + "epoch": 0.7353168044077135, + "grad_norm": 7.240694522857666, + "learning_rate": 1.6624975401651095e-06, + "loss": 0.4215, + "step": 6673 + }, + { + "epoch": 0.735426997245179, + "grad_norm": 5.27970027923584, + "learning_rate": 1.6611958412640667e-06, + "loss": 0.3615, + "step": 6674 + }, + { + "epoch": 0.7355371900826446, + "grad_norm": 13.812773704528809, + "learning_rate": 1.6598945506365327e-06, + "loss": 0.4767, + "step": 6675 + }, + { + "epoch": 0.7356473829201102, + "grad_norm": 12.22442626953125, + "learning_rate": 1.6585936684416305e-06, + "loss": 0.3072, + "step": 6676 + }, + { + "epoch": 0.7357575757575757, + "grad_norm": 4.643481731414795, + "learning_rate": 1.6572931948384301e-06, + "loss": 0.3702, + "step": 6677 + }, + { + "epoch": 0.7358677685950413, + "grad_norm": 4.2442779541015625, + "learning_rate": 1.6559931299859617e-06, + "loss": 0.3646, + "step": 6678 + }, + { + "epoch": 0.7359779614325069, + "grad_norm": 4.7474799156188965, + "learning_rate": 1.6546934740431958e-06, + "loss": 0.4153, + "step": 6679 + }, + { + "epoch": 0.7360881542699724, + "grad_norm": 4.34524393081665, + "learning_rate": 1.6533942271690528e-06, + "loss": 0.3845, + "step": 6680 + }, + { + "epoch": 0.736198347107438, + "grad_norm": 8.834986686706543, + "learning_rate": 1.6520953895224128e-06, + "loss": 0.4496, + "step": 6681 + }, + { + "epoch": 0.7363085399449035, + "grad_norm": 5.882931232452393, + "learning_rate": 1.6507969612620949e-06, + "loss": 0.3919, + "step": 6682 + }, + { + "epoch": 0.7364187327823691, + "grad_norm": 5.51677942276001, + "learning_rate": 1.6494989425468737e-06, + "loss": 0.3354, + "step": 6683 + }, + { + "epoch": 0.7365289256198347, + "grad_norm": 5.615901470184326, + "learning_rate": 1.6482013335354746e-06, + "loss": 0.3891, + "step": 6684 + }, + { + "epoch": 0.7366391184573002, + "grad_norm": 12.293294906616211, + "learning_rate": 1.6469041343865683e-06, + "loss": 0.5426, + "step": 6685 + }, + { + "epoch": 0.7367493112947658, + "grad_norm": 11.403398513793945, + "learning_rate": 1.64560734525878e-06, + "loss": 0.3571, + "step": 6686 + }, + { + "epoch": 0.7368595041322314, + "grad_norm": 4.734846115112305, + "learning_rate": 1.644310966310681e-06, + "loss": 0.3627, + "step": 6687 + }, + { + "epoch": 0.7369696969696969, + "grad_norm": 8.784213066101074, + "learning_rate": 1.6430149977007953e-06, + "loss": 0.4197, + "step": 6688 + }, + { + "epoch": 0.7370798898071625, + "grad_norm": 9.416754722595215, + "learning_rate": 1.641719439587597e-06, + "loss": 0.4473, + "step": 6689 + }, + { + "epoch": 0.7371900826446282, + "grad_norm": 12.03082275390625, + "learning_rate": 1.640424292129506e-06, + "loss": 0.4932, + "step": 6690 + }, + { + "epoch": 0.7373002754820936, + "grad_norm": 7.539887428283691, + "learning_rate": 1.6391295554848957e-06, + "loss": 0.4544, + "step": 6691 + }, + { + "epoch": 0.7374104683195593, + "grad_norm": 8.073715209960938, + "learning_rate": 1.637835229812091e-06, 
+ "loss": 0.3384, + "step": 6692 + }, + { + "epoch": 0.7375206611570247, + "grad_norm": 5.894067764282227, + "learning_rate": 1.6365413152693594e-06, + "loss": 0.4213, + "step": 6693 + }, + { + "epoch": 0.7376308539944904, + "grad_norm": 5.787919521331787, + "learning_rate": 1.6352478120149245e-06, + "loss": 0.3866, + "step": 6694 + }, + { + "epoch": 0.737741046831956, + "grad_norm": 5.080339431762695, + "learning_rate": 1.6339547202069594e-06, + "loss": 0.4116, + "step": 6695 + }, + { + "epoch": 0.7378512396694215, + "grad_norm": 4.942465782165527, + "learning_rate": 1.6326620400035819e-06, + "loss": 0.3744, + "step": 6696 + }, + { + "epoch": 0.7379614325068871, + "grad_norm": 4.5060834884643555, + "learning_rate": 1.631369771562864e-06, + "loss": 0.3505, + "step": 6697 + }, + { + "epoch": 0.7380716253443527, + "grad_norm": 8.70274829864502, + "learning_rate": 1.630077915042828e-06, + "loss": 0.406, + "step": 6698 + }, + { + "epoch": 0.7381818181818182, + "grad_norm": 6.8086934089660645, + "learning_rate": 1.6287864706014406e-06, + "loss": 0.4207, + "step": 6699 + }, + { + "epoch": 0.7382920110192838, + "grad_norm": 5.991214752197266, + "learning_rate": 1.627495438396623e-06, + "loss": 0.4243, + "step": 6700 + }, + { + "epoch": 0.7384022038567493, + "grad_norm": 5.16525936126709, + "learning_rate": 1.6262048185862456e-06, + "loss": 0.4154, + "step": 6701 + }, + { + "epoch": 0.7385123966942149, + "grad_norm": 4.822534084320068, + "learning_rate": 1.6249146113281245e-06, + "loss": 0.3613, + "step": 6702 + }, + { + "epoch": 0.7386225895316805, + "grad_norm": 20.175262451171875, + "learning_rate": 1.6236248167800295e-06, + "loss": 0.5235, + "step": 6703 + }, + { + "epoch": 0.738732782369146, + "grad_norm": 15.914359092712402, + "learning_rate": 1.6223354350996795e-06, + "loss": 0.4973, + "step": 6704 + }, + { + "epoch": 0.7388429752066116, + "grad_norm": 6.135737895965576, + "learning_rate": 1.621046466444739e-06, + "loss": 0.4133, + "step": 6705 + }, + { + "epoch": 0.7389531680440772, + "grad_norm": 12.328907012939453, + "learning_rate": 1.6197579109728268e-06, + "loss": 0.5264, + "step": 6706 + }, + { + "epoch": 0.7390633608815427, + "grad_norm": 8.680732727050781, + "learning_rate": 1.6184697688415102e-06, + "loss": 0.4007, + "step": 6707 + }, + { + "epoch": 0.7391735537190083, + "grad_norm": 9.753934860229492, + "learning_rate": 1.6171820402083022e-06, + "loss": 0.3916, + "step": 6708 + }, + { + "epoch": 0.7392837465564738, + "grad_norm": 12.979923248291016, + "learning_rate": 1.6158947252306707e-06, + "loss": 0.4962, + "step": 6709 + }, + { + "epoch": 0.7393939393939394, + "grad_norm": 10.228775978088379, + "learning_rate": 1.6146078240660258e-06, + "loss": 0.4981, + "step": 6710 + }, + { + "epoch": 0.739504132231405, + "grad_norm": 8.834919929504395, + "learning_rate": 1.6133213368717381e-06, + "loss": 0.4441, + "step": 6711 + }, + { + "epoch": 0.7396143250688705, + "grad_norm": 6.35507345199585, + "learning_rate": 1.6120352638051178e-06, + "loss": 0.3639, + "step": 6712 + }, + { + "epoch": 0.7397245179063361, + "grad_norm": 8.341972351074219, + "learning_rate": 1.6107496050234244e-06, + "loss": 0.4281, + "step": 6713 + }, + { + "epoch": 0.7398347107438017, + "grad_norm": 5.324977874755859, + "learning_rate": 1.609464360683876e-06, + "loss": 0.385, + "step": 6714 + }, + { + "epoch": 0.7399449035812672, + "grad_norm": 6.933832168579102, + "learning_rate": 1.6081795309436315e-06, + "loss": 0.3917, + "step": 6715 + }, + { + "epoch": 0.7400550964187328, + "grad_norm": 11.22779655456543, + 
"learning_rate": 1.6068951159597984e-06, + "loss": 0.5071, + "step": 6716 + }, + { + "epoch": 0.7401652892561984, + "grad_norm": 11.70495891571045, + "learning_rate": 1.605611115889442e-06, + "loss": 0.429, + "step": 6717 + }, + { + "epoch": 0.7402754820936639, + "grad_norm": 5.684927463531494, + "learning_rate": 1.60432753088957e-06, + "loss": 0.4069, + "step": 6718 + }, + { + "epoch": 0.7403856749311295, + "grad_norm": 4.83610725402832, + "learning_rate": 1.6030443611171381e-06, + "loss": 0.3182, + "step": 6719 + }, + { + "epoch": 0.740495867768595, + "grad_norm": 6.312492847442627, + "learning_rate": 1.601761606729056e-06, + "loss": 0.3978, + "step": 6720 + }, + { + "epoch": 0.7406060606060606, + "grad_norm": 7.136549949645996, + "learning_rate": 1.6004792678821823e-06, + "loss": 0.4525, + "step": 6721 + }, + { + "epoch": 0.7407162534435262, + "grad_norm": 7.6813859939575195, + "learning_rate": 1.5991973447333198e-06, + "loss": 0.3168, + "step": 6722 + }, + { + "epoch": 0.7408264462809917, + "grad_norm": 8.03650188446045, + "learning_rate": 1.5979158374392257e-06, + "loss": 0.4422, + "step": 6723 + }, + { + "epoch": 0.7409366391184573, + "grad_norm": 8.230238914489746, + "learning_rate": 1.596634746156604e-06, + "loss": 0.4155, + "step": 6724 + }, + { + "epoch": 0.7410468319559229, + "grad_norm": 4.2329630851745605, + "learning_rate": 1.5953540710421106e-06, + "loss": 0.3601, + "step": 6725 + }, + { + "epoch": 0.7411570247933884, + "grad_norm": 5.544435024261475, + "learning_rate": 1.5940738122523442e-06, + "loss": 0.3858, + "step": 6726 + }, + { + "epoch": 0.741267217630854, + "grad_norm": 9.27285099029541, + "learning_rate": 1.5927939699438588e-06, + "loss": 0.3882, + "step": 6727 + }, + { + "epoch": 0.7413774104683195, + "grad_norm": 7.3773956298828125, + "learning_rate": 1.5915145442731566e-06, + "loss": 0.432, + "step": 6728 + }, + { + "epoch": 0.7414876033057851, + "grad_norm": 15.690343856811523, + "learning_rate": 1.5902355353966843e-06, + "loss": 0.5766, + "step": 6729 + }, + { + "epoch": 0.7415977961432507, + "grad_norm": 5.725202560424805, + "learning_rate": 1.5889569434708418e-06, + "loss": 0.3884, + "step": 6730 + }, + { + "epoch": 0.7417079889807162, + "grad_norm": 8.445252418518066, + "learning_rate": 1.58767876865198e-06, + "loss": 0.4343, + "step": 6731 + }, + { + "epoch": 0.7418181818181818, + "grad_norm": 9.029854774475098, + "learning_rate": 1.5864010110963919e-06, + "loss": 0.4192, + "step": 6732 + }, + { + "epoch": 0.7419283746556474, + "grad_norm": 5.040360927581787, + "learning_rate": 1.5851236709603246e-06, + "loss": 0.3588, + "step": 6733 + }, + { + "epoch": 0.7420385674931129, + "grad_norm": 6.786897659301758, + "learning_rate": 1.5838467483999753e-06, + "loss": 0.3826, + "step": 6734 + }, + { + "epoch": 0.7421487603305785, + "grad_norm": 5.710162162780762, + "learning_rate": 1.5825702435714862e-06, + "loss": 0.3959, + "step": 6735 + }, + { + "epoch": 0.742258953168044, + "grad_norm": 4.672428607940674, + "learning_rate": 1.5812941566309464e-06, + "loss": 0.3819, + "step": 6736 + }, + { + "epoch": 0.7423691460055096, + "grad_norm": 9.552863121032715, + "learning_rate": 1.5800184877344044e-06, + "loss": 0.4042, + "step": 6737 + }, + { + "epoch": 0.7424793388429752, + "grad_norm": 7.347883224487305, + "learning_rate": 1.578743237037846e-06, + "loss": 0.3994, + "step": 6738 + }, + { + "epoch": 0.7425895316804407, + "grad_norm": 5.481398105621338, + "learning_rate": 1.5774684046972111e-06, + "loss": 0.375, + "step": 6739 + }, + { + "epoch": 0.7426997245179063, + 
"grad_norm": 5.58358907699585, + "learning_rate": 1.576193990868391e-06, + "loss": 0.4026, + "step": 6740 + }, + { + "epoch": 0.7428099173553719, + "grad_norm": 4.587759971618652, + "learning_rate": 1.5749199957072187e-06, + "loss": 0.3508, + "step": 6741 + }, + { + "epoch": 0.7429201101928374, + "grad_norm": 6.659691333770752, + "learning_rate": 1.5736464193694834e-06, + "loss": 0.4087, + "step": 6742 + }, + { + "epoch": 0.743030303030303, + "grad_norm": 6.995134353637695, + "learning_rate": 1.5723732620109167e-06, + "loss": 0.3273, + "step": 6743 + }, + { + "epoch": 0.7431404958677686, + "grad_norm": 5.064960479736328, + "learning_rate": 1.571100523787203e-06, + "loss": 0.4108, + "step": 6744 + }, + { + "epoch": 0.7432506887052341, + "grad_norm": 6.673768520355225, + "learning_rate": 1.569828204853977e-06, + "loss": 0.3932, + "step": 6745 + }, + { + "epoch": 0.7433608815426997, + "grad_norm": 5.711964130401611, + "learning_rate": 1.5685563053668158e-06, + "loss": 0.3903, + "step": 6746 + }, + { + "epoch": 0.7434710743801652, + "grad_norm": 5.0760297775268555, + "learning_rate": 1.5672848254812506e-06, + "loss": 0.3683, + "step": 6747 + }, + { + "epoch": 0.7435812672176308, + "grad_norm": 5.213315010070801, + "learning_rate": 1.5660137653527619e-06, + "loss": 0.3581, + "step": 6748 + }, + { + "epoch": 0.7436914600550965, + "grad_norm": 4.705984115600586, + "learning_rate": 1.5647431251367728e-06, + "loss": 0.371, + "step": 6749 + }, + { + "epoch": 0.743801652892562, + "grad_norm": 6.9832587242126465, + "learning_rate": 1.5634729049886604e-06, + "loss": 0.4204, + "step": 6750 + }, + { + "epoch": 0.7439118457300276, + "grad_norm": 5.852503299713135, + "learning_rate": 1.5622031050637509e-06, + "loss": 0.4339, + "step": 6751 + }, + { + "epoch": 0.7440220385674932, + "grad_norm": 9.679864883422852, + "learning_rate": 1.560933725517314e-06, + "loss": 0.3648, + "step": 6752 + }, + { + "epoch": 0.7441322314049587, + "grad_norm": 9.233565330505371, + "learning_rate": 1.5596647665045728e-06, + "loss": 0.4009, + "step": 6753 + }, + { + "epoch": 0.7442424242424243, + "grad_norm": 9.235274314880371, + "learning_rate": 1.5583962281806987e-06, + "loss": 0.4592, + "step": 6754 + }, + { + "epoch": 0.7443526170798898, + "grad_norm": 7.1648850440979, + "learning_rate": 1.5571281107008073e-06, + "loss": 0.459, + "step": 6755 + }, + { + "epoch": 0.7444628099173554, + "grad_norm": 8.624975204467773, + "learning_rate": 1.5558604142199668e-06, + "loss": 0.4312, + "step": 6756 + }, + { + "epoch": 0.744573002754821, + "grad_norm": 13.39470386505127, + "learning_rate": 1.554593138893195e-06, + "loss": 0.3769, + "step": 6757 + }, + { + "epoch": 0.7446831955922865, + "grad_norm": 5.642722129821777, + "learning_rate": 1.5533262848754533e-06, + "loss": 0.3878, + "step": 6758 + }, + { + "epoch": 0.7447933884297521, + "grad_norm": 6.0908203125, + "learning_rate": 1.5520598523216546e-06, + "loss": 0.4078, + "step": 6759 + }, + { + "epoch": 0.7449035812672177, + "grad_norm": 9.905556678771973, + "learning_rate": 1.5507938413866625e-06, + "loss": 0.438, + "step": 6760 + }, + { + "epoch": 0.7450137741046832, + "grad_norm": 7.522475719451904, + "learning_rate": 1.549528252225283e-06, + "loss": 0.387, + "step": 6761 + }, + { + "epoch": 0.7451239669421488, + "grad_norm": 7.384159088134766, + "learning_rate": 1.5482630849922764e-06, + "loss": 0.4033, + "step": 6762 + }, + { + "epoch": 0.7452341597796144, + "grad_norm": 7.429068088531494, + "learning_rate": 1.5469983398423499e-06, + "loss": 0.3876, + "step": 6763 + }, + { + 
"epoch": 0.7453443526170799, + "grad_norm": 7.473761081695557, + "learning_rate": 1.5457340169301549e-06, + "loss": 0.4082, + "step": 6764 + }, + { + "epoch": 0.7454545454545455, + "grad_norm": 6.6018967628479, + "learning_rate": 1.5444701164102966e-06, + "loss": 0.3422, + "step": 6765 + }, + { + "epoch": 0.745564738292011, + "grad_norm": 6.712680816650391, + "learning_rate": 1.5432066384373261e-06, + "loss": 0.3877, + "step": 6766 + }, + { + "epoch": 0.7456749311294766, + "grad_norm": 13.06218433380127, + "learning_rate": 1.541943583165746e-06, + "loss": 0.5377, + "step": 6767 + }, + { + "epoch": 0.7457851239669422, + "grad_norm": 7.596041679382324, + "learning_rate": 1.540680950750001e-06, + "loss": 0.4444, + "step": 6768 + }, + { + "epoch": 0.7458953168044077, + "grad_norm": 6.0458455085754395, + "learning_rate": 1.539418741344485e-06, + "loss": 0.3408, + "step": 6769 + }, + { + "epoch": 0.7460055096418733, + "grad_norm": 6.200343608856201, + "learning_rate": 1.5381569551035497e-06, + "loss": 0.3376, + "step": 6770 + }, + { + "epoch": 0.7461157024793389, + "grad_norm": 6.153450012207031, + "learning_rate": 1.5368955921814844e-06, + "loss": 0.2715, + "step": 6771 + }, + { + "epoch": 0.7462258953168044, + "grad_norm": 4.4604878425598145, + "learning_rate": 1.5356346527325273e-06, + "loss": 0.4098, + "step": 6772 + }, + { + "epoch": 0.74633608815427, + "grad_norm": 10.651001930236816, + "learning_rate": 1.5343741369108733e-06, + "loss": 0.5051, + "step": 6773 + }, + { + "epoch": 0.7464462809917355, + "grad_norm": 5.188691139221191, + "learning_rate": 1.5331140448706576e-06, + "loss": 0.3197, + "step": 6774 + }, + { + "epoch": 0.7465564738292011, + "grad_norm": 5.437318325042725, + "learning_rate": 1.5318543767659645e-06, + "loss": 0.3382, + "step": 6775 + }, + { + "epoch": 0.7466666666666667, + "grad_norm": 7.409314155578613, + "learning_rate": 1.530595132750829e-06, + "loss": 0.3382, + "step": 6776 + }, + { + "epoch": 0.7467768595041322, + "grad_norm": 10.222969055175781, + "learning_rate": 1.5293363129792348e-06, + "loss": 0.3694, + "step": 6777 + }, + { + "epoch": 0.7468870523415978, + "grad_norm": 17.013992309570312, + "learning_rate": 1.5280779176051096e-06, + "loss": 0.4434, + "step": 6778 + }, + { + "epoch": 0.7469972451790634, + "grad_norm": 11.93150520324707, + "learning_rate": 1.5268199467823324e-06, + "loss": 0.4627, + "step": 6779 + }, + { + "epoch": 0.7471074380165289, + "grad_norm": 6.485506534576416, + "learning_rate": 1.52556240066473e-06, + "loss": 0.3605, + "step": 6780 + }, + { + "epoch": 0.7472176308539945, + "grad_norm": 8.435442924499512, + "learning_rate": 1.5243052794060785e-06, + "loss": 0.481, + "step": 6781 + }, + { + "epoch": 0.74732782369146, + "grad_norm": 5.366856098175049, + "learning_rate": 1.523048583160097e-06, + "loss": 0.2738, + "step": 6782 + }, + { + "epoch": 0.7474380165289256, + "grad_norm": 8.27634334564209, + "learning_rate": 1.5217923120804578e-06, + "loss": 0.4658, + "step": 6783 + }, + { + "epoch": 0.7475482093663912, + "grad_norm": 7.089442253112793, + "learning_rate": 1.5205364663207811e-06, + "loss": 0.414, + "step": 6784 + }, + { + "epoch": 0.7476584022038567, + "grad_norm": 4.029046535491943, + "learning_rate": 1.5192810460346302e-06, + "loss": 0.2938, + "step": 6785 + }, + { + "epoch": 0.7477685950413223, + "grad_norm": 3.518596887588501, + "learning_rate": 1.5180260513755207e-06, + "loss": 0.3626, + "step": 6786 + }, + { + "epoch": 0.7478787878787879, + "grad_norm": 5.997016906738281, + "learning_rate": 1.516771482496917e-06, + "loss": 
0.3698, + "step": 6787 + }, + { + "epoch": 0.7479889807162534, + "grad_norm": 5.003194332122803, + "learning_rate": 1.5155173395522266e-06, + "loss": 0.4407, + "step": 6788 + }, + { + "epoch": 0.748099173553719, + "grad_norm": 6.588343143463135, + "learning_rate": 1.5142636226948087e-06, + "loss": 0.3232, + "step": 6789 + }, + { + "epoch": 0.7482093663911846, + "grad_norm": 4.953454971313477, + "learning_rate": 1.513010332077972e-06, + "loss": 0.395, + "step": 6790 + }, + { + "epoch": 0.7483195592286501, + "grad_norm": 5.947750091552734, + "learning_rate": 1.5117574678549667e-06, + "loss": 0.3361, + "step": 6791 + }, + { + "epoch": 0.7484297520661157, + "grad_norm": 6.818279266357422, + "learning_rate": 1.5105050301789965e-06, + "loss": 0.4242, + "step": 6792 + }, + { + "epoch": 0.7485399449035812, + "grad_norm": 5.5788493156433105, + "learning_rate": 1.509253019203213e-06, + "loss": 0.3882, + "step": 6793 + }, + { + "epoch": 0.7486501377410468, + "grad_norm": 6.290981769561768, + "learning_rate": 1.5080014350807104e-06, + "loss": 0.378, + "step": 6794 + }, + { + "epoch": 0.7487603305785124, + "grad_norm": 7.344674110412598, + "learning_rate": 1.5067502779645353e-06, + "loss": 0.3328, + "step": 6795 + }, + { + "epoch": 0.7488705234159779, + "grad_norm": 7.661288261413574, + "learning_rate": 1.5054995480076833e-06, + "loss": 0.4539, + "step": 6796 + }, + { + "epoch": 0.7489807162534435, + "grad_norm": 8.083108901977539, + "learning_rate": 1.5042492453630918e-06, + "loss": 0.4032, + "step": 6797 + }, + { + "epoch": 0.7490909090909091, + "grad_norm": 6.651820659637451, + "learning_rate": 1.5029993701836514e-06, + "loss": 0.3634, + "step": 6798 + }, + { + "epoch": 0.7492011019283746, + "grad_norm": 8.187838554382324, + "learning_rate": 1.5017499226221993e-06, + "loss": 0.4247, + "step": 6799 + }, + { + "epoch": 0.7493112947658402, + "grad_norm": 3.927823781967163, + "learning_rate": 1.500500902831517e-06, + "loss": 0.354, + "step": 6800 + }, + { + "epoch": 0.7494214876033057, + "grad_norm": 4.890981674194336, + "learning_rate": 1.4992523109643398e-06, + "loss": 0.3147, + "step": 6801 + }, + { + "epoch": 0.7495316804407713, + "grad_norm": 6.409554958343506, + "learning_rate": 1.4980041471733436e-06, + "loss": 0.2929, + "step": 6802 + }, + { + "epoch": 0.749641873278237, + "grad_norm": 9.22906494140625, + "learning_rate": 1.4967564116111571e-06, + "loss": 0.4626, + "step": 6803 + }, + { + "epoch": 0.7497520661157024, + "grad_norm": 5.69516134262085, + "learning_rate": 1.4955091044303572e-06, + "loss": 0.4332, + "step": 6804 + }, + { + "epoch": 0.749862258953168, + "grad_norm": 8.6701078414917, + "learning_rate": 1.4942622257834626e-06, + "loss": 0.3755, + "step": 6805 + }, + { + "epoch": 0.7499724517906337, + "grad_norm": 6.685476303100586, + "learning_rate": 1.4930157758229451e-06, + "loss": 0.3849, + "step": 6806 + }, + { + "epoch": 0.7500826446280991, + "grad_norm": 5.954440593719482, + "learning_rate": 1.4917697547012239e-06, + "loss": 0.4499, + "step": 6807 + }, + { + "epoch": 0.7501928374655648, + "grad_norm": 6.628360748291016, + "learning_rate": 1.4905241625706613e-06, + "loss": 0.4078, + "step": 6808 + }, + { + "epoch": 0.7503030303030302, + "grad_norm": 5.020409107208252, + "learning_rate": 1.4892789995835706e-06, + "loss": 0.4012, + "step": 6809 + }, + { + "epoch": 0.7504132231404959, + "grad_norm": 6.6440253257751465, + "learning_rate": 1.4880342658922148e-06, + "loss": 0.4026, + "step": 6810 + }, + { + "epoch": 0.7505234159779615, + "grad_norm": 4.822587966918945, + "learning_rate": 
1.4867899616487974e-06, + "loss": 0.4146, + "step": 6811 + }, + { + "epoch": 0.750633608815427, + "grad_norm": 6.03671407699585, + "learning_rate": 1.485546087005476e-06, + "loss": 0.3777, + "step": 6812 + }, + { + "epoch": 0.7507438016528926, + "grad_norm": 13.897258758544922, + "learning_rate": 1.4843026421143547e-06, + "loss": 0.557, + "step": 6813 + }, + { + "epoch": 0.7508539944903582, + "grad_norm": 8.15986156463623, + "learning_rate": 1.4830596271274806e-06, + "loss": 0.3939, + "step": 6814 + }, + { + "epoch": 0.7509641873278237, + "grad_norm": 5.397086143493652, + "learning_rate": 1.4818170421968519e-06, + "loss": 0.4204, + "step": 6815 + }, + { + "epoch": 0.7510743801652893, + "grad_norm": 8.237540245056152, + "learning_rate": 1.4805748874744163e-06, + "loss": 0.3773, + "step": 6816 + }, + { + "epoch": 0.7511845730027549, + "grad_norm": 4.589722633361816, + "learning_rate": 1.4793331631120628e-06, + "loss": 0.3714, + "step": 6817 + }, + { + "epoch": 0.7512947658402204, + "grad_norm": 4.802692890167236, + "learning_rate": 1.4780918692616319e-06, + "loss": 0.3222, + "step": 6818 + }, + { + "epoch": 0.751404958677686, + "grad_norm": 16.29314422607422, + "learning_rate": 1.476851006074913e-06, + "loss": 0.5529, + "step": 6819 + }, + { + "epoch": 0.7515151515151515, + "grad_norm": 10.048425674438477, + "learning_rate": 1.4756105737036375e-06, + "loss": 0.415, + "step": 6820 + }, + { + "epoch": 0.7516253443526171, + "grad_norm": 5.574573516845703, + "learning_rate": 1.4743705722994884e-06, + "loss": 0.3419, + "step": 6821 + }, + { + "epoch": 0.7517355371900827, + "grad_norm": 7.75213623046875, + "learning_rate": 1.4731310020140944e-06, + "loss": 0.3342, + "step": 6822 + }, + { + "epoch": 0.7518457300275482, + "grad_norm": 8.48292350769043, + "learning_rate": 1.4718918629990342e-06, + "loss": 0.4073, + "step": 6823 + }, + { + "epoch": 0.7519559228650138, + "grad_norm": 9.394196510314941, + "learning_rate": 1.4706531554058278e-06, + "loss": 0.4546, + "step": 6824 + }, + { + "epoch": 0.7520661157024794, + "grad_norm": 9.676992416381836, + "learning_rate": 1.4694148793859475e-06, + "loss": 0.3651, + "step": 6825 + }, + { + "epoch": 0.7521763085399449, + "grad_norm": 5.157181262969971, + "learning_rate": 1.4681770350908136e-06, + "loss": 0.3479, + "step": 6826 + }, + { + "epoch": 0.7522865013774105, + "grad_norm": 5.338240623474121, + "learning_rate": 1.466939622671789e-06, + "loss": 0.3968, + "step": 6827 + }, + { + "epoch": 0.752396694214876, + "grad_norm": 6.495171070098877, + "learning_rate": 1.4657026422801835e-06, + "loss": 0.4372, + "step": 6828 + }, + { + "epoch": 0.7525068870523416, + "grad_norm": 5.0497636795043945, + "learning_rate": 1.4644660940672628e-06, + "loss": 0.3542, + "step": 6829 + }, + { + "epoch": 0.7526170798898072, + "grad_norm": 11.376801490783691, + "learning_rate": 1.4632299781842307e-06, + "loss": 0.4453, + "step": 6830 + }, + { + "epoch": 0.7527272727272727, + "grad_norm": 4.18069314956665, + "learning_rate": 1.4619942947822379e-06, + "loss": 0.3305, + "step": 6831 + }, + { + "epoch": 0.7528374655647383, + "grad_norm": 9.020021438598633, + "learning_rate": 1.460759044012392e-06, + "loss": 0.5502, + "step": 6832 + }, + { + "epoch": 0.7529476584022039, + "grad_norm": 6.928681373596191, + "learning_rate": 1.4595242260257381e-06, + "loss": 0.4342, + "step": 6833 + }, + { + "epoch": 0.7530578512396694, + "grad_norm": 5.69641637802124, + "learning_rate": 1.4582898409732687e-06, + "loss": 0.4027, + "step": 6834 + }, + { + "epoch": 0.753168044077135, + "grad_norm": 
5.6361985206604, + "learning_rate": 1.4570558890059288e-06, + "loss": 0.3301, + "step": 6835 + }, + { + "epoch": 0.7532782369146005, + "grad_norm": 5.400755405426025, + "learning_rate": 1.4558223702746093e-06, + "loss": 0.2976, + "step": 6836 + }, + { + "epoch": 0.7533884297520661, + "grad_norm": 15.730646133422852, + "learning_rate": 1.4545892849301429e-06, + "loss": 0.4039, + "step": 6837 + }, + { + "epoch": 0.7534986225895317, + "grad_norm": 4.789764881134033, + "learning_rate": 1.4533566331233145e-06, + "loss": 0.3851, + "step": 6838 + }, + { + "epoch": 0.7536088154269972, + "grad_norm": 5.8264336585998535, + "learning_rate": 1.4521244150048552e-06, + "loss": 0.4435, + "step": 6839 + }, + { + "epoch": 0.7537190082644628, + "grad_norm": 6.7339043617248535, + "learning_rate": 1.4508926307254427e-06, + "loss": 0.3985, + "step": 6840 + }, + { + "epoch": 0.7538292011019284, + "grad_norm": 7.948024272918701, + "learning_rate": 1.4496612804356991e-06, + "loss": 0.3228, + "step": 6841 + }, + { + "epoch": 0.7539393939393939, + "grad_norm": 13.498283386230469, + "learning_rate": 1.448430364286197e-06, + "loss": 0.4703, + "step": 6842 + }, + { + "epoch": 0.7540495867768595, + "grad_norm": 5.53570556640625, + "learning_rate": 1.4471998824274553e-06, + "loss": 0.4111, + "step": 6843 + }, + { + "epoch": 0.7541597796143251, + "grad_norm": 4.883918762207031, + "learning_rate": 1.4459698350099365e-06, + "loss": 0.3549, + "step": 6844 + }, + { + "epoch": 0.7542699724517906, + "grad_norm": 5.303241729736328, + "learning_rate": 1.444740222184054e-06, + "loss": 0.4668, + "step": 6845 + }, + { + "epoch": 0.7543801652892562, + "grad_norm": 7.4251532554626465, + "learning_rate": 1.4435110441001683e-06, + "loss": 0.401, + "step": 6846 + }, + { + "epoch": 0.7544903581267217, + "grad_norm": 14.293661117553711, + "learning_rate": 1.4422823009085812e-06, + "loss": 0.5331, + "step": 6847 + }, + { + "epoch": 0.7546005509641873, + "grad_norm": 6.829413414001465, + "learning_rate": 1.4410539927595474e-06, + "loss": 0.3936, + "step": 6848 + }, + { + "epoch": 0.7547107438016529, + "grad_norm": 5.234549522399902, + "learning_rate": 1.4398261198032671e-06, + "loss": 0.3203, + "step": 6849 + }, + { + "epoch": 0.7548209366391184, + "grad_norm": 15.444355010986328, + "learning_rate": 1.4385986821898834e-06, + "loss": 0.4065, + "step": 6850 + }, + { + "epoch": 0.754931129476584, + "grad_norm": 5.0838303565979, + "learning_rate": 1.437371680069491e-06, + "loss": 0.4106, + "step": 6851 + }, + { + "epoch": 0.7550413223140496, + "grad_norm": 5.2294816970825195, + "learning_rate": 1.4361451135921296e-06, + "loss": 0.305, + "step": 6852 + }, + { + "epoch": 0.7551515151515151, + "grad_norm": 6.445193290710449, + "learning_rate": 1.4349189829077837e-06, + "loss": 0.318, + "step": 6853 + }, + { + "epoch": 0.7552617079889807, + "grad_norm": 8.633602142333984, + "learning_rate": 1.4336932881663868e-06, + "loss": 0.3874, + "step": 6854 + }, + { + "epoch": 0.7553719008264462, + "grad_norm": 7.066654682159424, + "learning_rate": 1.4324680295178211e-06, + "loss": 0.4553, + "step": 6855 + }, + { + "epoch": 0.7554820936639118, + "grad_norm": 3.980142831802368, + "learning_rate": 1.4312432071119086e-06, + "loss": 0.3781, + "step": 6856 + }, + { + "epoch": 0.7555922865013774, + "grad_norm": 6.00819206237793, + "learning_rate": 1.430018821098425e-06, + "loss": 0.3591, + "step": 6857 + }, + { + "epoch": 0.7557024793388429, + "grad_norm": 18.058963775634766, + "learning_rate": 1.4287948716270906e-06, + "loss": 0.4094, + "step": 6858 + }, + { + 
"epoch": 0.7558126721763085, + "grad_norm": 5.2950825691223145, + "learning_rate": 1.4275713588475687e-06, + "loss": 0.3766, + "step": 6859 + }, + { + "epoch": 0.7559228650137741, + "grad_norm": 5.6798810958862305, + "learning_rate": 1.4263482829094754e-06, + "loss": 0.4094, + "step": 6860 + }, + { + "epoch": 0.7560330578512396, + "grad_norm": 3.7791309356689453, + "learning_rate": 1.4251256439623667e-06, + "loss": 0.3591, + "step": 6861 + }, + { + "epoch": 0.7561432506887052, + "grad_norm": 10.203629493713379, + "learning_rate": 1.4239034421557501e-06, + "loss": 0.4784, + "step": 6862 + }, + { + "epoch": 0.7562534435261709, + "grad_norm": 4.767711639404297, + "learning_rate": 1.42268167763908e-06, + "loss": 0.351, + "step": 6863 + }, + { + "epoch": 0.7563636363636363, + "grad_norm": 6.482807159423828, + "learning_rate": 1.4214603505617525e-06, + "loss": 0.3775, + "step": 6864 + }, + { + "epoch": 0.756473829201102, + "grad_norm": 5.963954925537109, + "learning_rate": 1.420239461073114e-06, + "loss": 0.3416, + "step": 6865 + }, + { + "epoch": 0.7565840220385674, + "grad_norm": 5.087156295776367, + "learning_rate": 1.4190190093224582e-06, + "loss": 0.4816, + "step": 6866 + }, + { + "epoch": 0.7566942148760331, + "grad_norm": 15.355376243591309, + "learning_rate": 1.417798995459021e-06, + "loss": 0.436, + "step": 6867 + }, + { + "epoch": 0.7568044077134987, + "grad_norm": 4.720404148101807, + "learning_rate": 1.4165794196319881e-06, + "loss": 0.3455, + "step": 6868 + }, + { + "epoch": 0.7569146005509642, + "grad_norm": 5.853130340576172, + "learning_rate": 1.4153602819904933e-06, + "loss": 0.3464, + "step": 6869 + }, + { + "epoch": 0.7570247933884298, + "grad_norm": 7.015380382537842, + "learning_rate": 1.4141415826836109e-06, + "loss": 0.3958, + "step": 6870 + }, + { + "epoch": 0.7571349862258954, + "grad_norm": 5.533911228179932, + "learning_rate": 1.4129233218603666e-06, + "loss": 0.3602, + "step": 6871 + }, + { + "epoch": 0.7572451790633609, + "grad_norm": 7.721105575561523, + "learning_rate": 1.4117054996697321e-06, + "loss": 0.415, + "step": 6872 + }, + { + "epoch": 0.7573553719008265, + "grad_norm": 5.225645542144775, + "learning_rate": 1.4104881162606227e-06, + "loss": 0.3553, + "step": 6873 + }, + { + "epoch": 0.757465564738292, + "grad_norm": 7.83046293258667, + "learning_rate": 1.409271171781902e-06, + "loss": 0.337, + "step": 6874 + }, + { + "epoch": 0.7575757575757576, + "grad_norm": 6.9218902587890625, + "learning_rate": 1.4080546663823814e-06, + "loss": 0.3887, + "step": 6875 + }, + { + "epoch": 0.7576859504132232, + "grad_norm": 4.234103679656982, + "learning_rate": 1.4068386002108137e-06, + "loss": 0.3772, + "step": 6876 + }, + { + "epoch": 0.7577961432506887, + "grad_norm": 5.8084001541137695, + "learning_rate": 1.4056229734159026e-06, + "loss": 0.406, + "step": 6877 + }, + { + "epoch": 0.7579063360881543, + "grad_norm": 6.415094375610352, + "learning_rate": 1.4044077861462984e-06, + "loss": 0.406, + "step": 6878 + }, + { + "epoch": 0.7580165289256199, + "grad_norm": 11.293000221252441, + "learning_rate": 1.4031930385505931e-06, + "loss": 0.4479, + "step": 6879 + }, + { + "epoch": 0.7581267217630854, + "grad_norm": 4.440645217895508, + "learning_rate": 1.4019787307773285e-06, + "loss": 0.3946, + "step": 6880 + }, + { + "epoch": 0.758236914600551, + "grad_norm": 7.0031819343566895, + "learning_rate": 1.4007648629749925e-06, + "loss": 0.4065, + "step": 6881 + }, + { + "epoch": 0.7583471074380165, + "grad_norm": 5.336536884307861, + "learning_rate": 1.3995514352920197e-06, + 
"loss": 0.3794, + "step": 6882 + }, + { + "epoch": 0.7584573002754821, + "grad_norm": 8.416298866271973, + "learning_rate": 1.3983384478767865e-06, + "loss": 0.4651, + "step": 6883 + }, + { + "epoch": 0.7585674931129477, + "grad_norm": 6.324009895324707, + "learning_rate": 1.39712590087762e-06, + "loss": 0.3646, + "step": 6884 + }, + { + "epoch": 0.7586776859504132, + "grad_norm": 9.452508926391602, + "learning_rate": 1.3959137944427942e-06, + "loss": 0.4228, + "step": 6885 + }, + { + "epoch": 0.7587878787878788, + "grad_norm": 9.684531211853027, + "learning_rate": 1.3947021287205248e-06, + "loss": 0.4493, + "step": 6886 + }, + { + "epoch": 0.7588980716253444, + "grad_norm": 6.790139675140381, + "learning_rate": 1.3934909038589738e-06, + "loss": 0.4968, + "step": 6887 + }, + { + "epoch": 0.7590082644628099, + "grad_norm": 5.268446445465088, + "learning_rate": 1.392280120006257e-06, + "loss": 0.3925, + "step": 6888 + }, + { + "epoch": 0.7591184573002755, + "grad_norm": 4.70731258392334, + "learning_rate": 1.3910697773104275e-06, + "loss": 0.3949, + "step": 6889 + }, + { + "epoch": 0.7592286501377411, + "grad_norm": 11.086029052734375, + "learning_rate": 1.3898598759194848e-06, + "loss": 0.486, + "step": 6890 + }, + { + "epoch": 0.7593388429752066, + "grad_norm": 6.695804595947266, + "learning_rate": 1.388650415981384e-06, + "loss": 0.3652, + "step": 6891 + }, + { + "epoch": 0.7594490358126722, + "grad_norm": 7.074174404144287, + "learning_rate": 1.3874413976440154e-06, + "loss": 0.4741, + "step": 6892 + }, + { + "epoch": 0.7595592286501377, + "grad_norm": 7.051998138427734, + "learning_rate": 1.3862328210552184e-06, + "loss": 0.4229, + "step": 6893 + }, + { + "epoch": 0.7596694214876033, + "grad_norm": 6.784617900848389, + "learning_rate": 1.3850246863627809e-06, + "loss": 0.4003, + "step": 6894 + }, + { + "epoch": 0.7597796143250689, + "grad_norm": 13.098021507263184, + "learning_rate": 1.3838169937144351e-06, + "loss": 0.3385, + "step": 6895 + }, + { + "epoch": 0.7598898071625344, + "grad_norm": 6.4965033531188965, + "learning_rate": 1.3826097432578612e-06, + "loss": 0.4186, + "step": 6896 + }, + { + "epoch": 0.76, + "grad_norm": 4.233058452606201, + "learning_rate": 1.3814029351406799e-06, + "loss": 0.3435, + "step": 6897 + }, + { + "epoch": 0.7601101928374656, + "grad_norm": 4.8349609375, + "learning_rate": 1.3801965695104636e-06, + "loss": 0.3597, + "step": 6898 + }, + { + "epoch": 0.7602203856749311, + "grad_norm": 7.645735740661621, + "learning_rate": 1.3789906465147284e-06, + "loss": 0.4443, + "step": 6899 + }, + { + "epoch": 0.7603305785123967, + "grad_norm": 6.398595809936523, + "learning_rate": 1.3777851663009344e-06, + "loss": 0.3722, + "step": 6900 + }, + { + "epoch": 0.7604407713498622, + "grad_norm": 8.116183280944824, + "learning_rate": 1.376580129016491e-06, + "loss": 0.4389, + "step": 6901 + }, + { + "epoch": 0.7605509641873278, + "grad_norm": 4.306082248687744, + "learning_rate": 1.3753755348087527e-06, + "loss": 0.4354, + "step": 6902 + }, + { + "epoch": 0.7606611570247934, + "grad_norm": 5.3213300704956055, + "learning_rate": 1.374171383825016e-06, + "loss": 0.3615, + "step": 6903 + }, + { + "epoch": 0.7607713498622589, + "grad_norm": 8.165018081665039, + "learning_rate": 1.3729676762125276e-06, + "loss": 0.3677, + "step": 6904 + }, + { + "epoch": 0.7608815426997245, + "grad_norm": 9.73409366607666, + "learning_rate": 1.3717644121184802e-06, + "loss": 0.4883, + "step": 6905 + }, + { + "epoch": 0.7609917355371901, + "grad_norm": 10.60772705078125, + "learning_rate": 
1.3705615916900072e-06, + "loss": 0.3772, + "step": 6906 + }, + { + "epoch": 0.7611019283746556, + "grad_norm": 6.223022937774658, + "learning_rate": 1.369359215074193e-06, + "loss": 0.3599, + "step": 6907 + }, + { + "epoch": 0.7612121212121212, + "grad_norm": 5.9954986572265625, + "learning_rate": 1.3681572824180679e-06, + "loss": 0.3362, + "step": 6908 + }, + { + "epoch": 0.7613223140495867, + "grad_norm": 4.7047882080078125, + "learning_rate": 1.3669557938686012e-06, + "loss": 0.3118, + "step": 6909 + }, + { + "epoch": 0.7614325068870523, + "grad_norm": 6.071364879608154, + "learning_rate": 1.3657547495727152e-06, + "loss": 0.3369, + "step": 6910 + }, + { + "epoch": 0.7615426997245179, + "grad_norm": 6.846749305725098, + "learning_rate": 1.3645541496772768e-06, + "loss": 0.3902, + "step": 6911 + }, + { + "epoch": 0.7616528925619834, + "grad_norm": 13.499283790588379, + "learning_rate": 1.363353994329094e-06, + "loss": 0.4028, + "step": 6912 + }, + { + "epoch": 0.761763085399449, + "grad_norm": 6.2844038009643555, + "learning_rate": 1.362154283674924e-06, + "loss": 0.3668, + "step": 6913 + }, + { + "epoch": 0.7618732782369146, + "grad_norm": 28.452960968017578, + "learning_rate": 1.3609550178614716e-06, + "loss": 0.485, + "step": 6914 + }, + { + "epoch": 0.7619834710743801, + "grad_norm": 5.711501598358154, + "learning_rate": 1.3597561970353817e-06, + "loss": 0.3805, + "step": 6915 + }, + { + "epoch": 0.7620936639118457, + "grad_norm": 8.279010772705078, + "learning_rate": 1.3585578213432482e-06, + "loss": 0.4057, + "step": 6916 + }, + { + "epoch": 0.7622038567493113, + "grad_norm": 5.910313129425049, + "learning_rate": 1.3573598909316127e-06, + "loss": 0.4284, + "step": 6917 + }, + { + "epoch": 0.7623140495867768, + "grad_norm": 9.110838890075684, + "learning_rate": 1.3561624059469559e-06, + "loss": 0.4227, + "step": 6918 + }, + { + "epoch": 0.7624242424242424, + "grad_norm": 5.608441352844238, + "learning_rate": 1.354965366535711e-06, + "loss": 0.4149, + "step": 6919 + }, + { + "epoch": 0.7625344352617079, + "grad_norm": 5.326509952545166, + "learning_rate": 1.3537687728442516e-06, + "loss": 0.314, + "step": 6920 + }, + { + "epoch": 0.7626446280991735, + "grad_norm": 12.109729766845703, + "learning_rate": 1.352572625018899e-06, + "loss": 0.4072, + "step": 6921 + }, + { + "epoch": 0.7627548209366392, + "grad_norm": 7.114477634429932, + "learning_rate": 1.351376923205922e-06, + "loss": 0.4047, + "step": 6922 + }, + { + "epoch": 0.7628650137741046, + "grad_norm": 11.916533470153809, + "learning_rate": 1.3501816675515285e-06, + "loss": 0.4535, + "step": 6923 + }, + { + "epoch": 0.7629752066115703, + "grad_norm": 7.836859703063965, + "learning_rate": 1.3489868582018807e-06, + "loss": 0.4061, + "step": 6924 + }, + { + "epoch": 0.7630853994490359, + "grad_norm": 3.2886035442352295, + "learning_rate": 1.3477924953030796e-06, + "loss": 0.3259, + "step": 6925 + }, + { + "epoch": 0.7631955922865014, + "grad_norm": 4.548503398895264, + "learning_rate": 1.346598579001172e-06, + "loss": 0.347, + "step": 6926 + }, + { + "epoch": 0.763305785123967, + "grad_norm": 5.381287574768066, + "learning_rate": 1.3454051094421521e-06, + "loss": 0.4272, + "step": 6927 + }, + { + "epoch": 0.7634159779614325, + "grad_norm": 6.1449995040893555, + "learning_rate": 1.344212086771962e-06, + "loss": 0.4816, + "step": 6928 + }, + { + "epoch": 0.7635261707988981, + "grad_norm": 5.375181198120117, + "learning_rate": 1.3430195111364818e-06, + "loss": 0.4253, + "step": 6929 + }, + { + "epoch": 0.7636363636363637, + 
"grad_norm": 4.251392841339111, + "learning_rate": 1.3418273826815437e-06, + "loss": 0.3694, + "step": 6930 + }, + { + "epoch": 0.7637465564738292, + "grad_norm": 5.311394691467285, + "learning_rate": 1.3406357015529236e-06, + "loss": 0.3869, + "step": 6931 + }, + { + "epoch": 0.7638567493112948, + "grad_norm": 6.5930914878845215, + "learning_rate": 1.3394444678963393e-06, + "loss": 0.3366, + "step": 6932 + }, + { + "epoch": 0.7639669421487604, + "grad_norm": 9.378667831420898, + "learning_rate": 1.3382536818574576e-06, + "loss": 0.4152, + "step": 6933 + }, + { + "epoch": 0.7640771349862259, + "grad_norm": 7.3711771965026855, + "learning_rate": 1.3370633435818913e-06, + "loss": 0.445, + "step": 6934 + }, + { + "epoch": 0.7641873278236915, + "grad_norm": 5.618210315704346, + "learning_rate": 1.335873453215194e-06, + "loss": 0.3235, + "step": 6935 + }, + { + "epoch": 0.764297520661157, + "grad_norm": 5.0322184562683105, + "learning_rate": 1.3346840109028674e-06, + "loss": 0.4211, + "step": 6936 + }, + { + "epoch": 0.7644077134986226, + "grad_norm": 6.689839839935303, + "learning_rate": 1.3334950167903588e-06, + "loss": 0.3492, + "step": 6937 + }, + { + "epoch": 0.7645179063360882, + "grad_norm": 9.82962703704834, + "learning_rate": 1.3323064710230622e-06, + "loss": 0.4579, + "step": 6938 + }, + { + "epoch": 0.7646280991735537, + "grad_norm": 6.276013374328613, + "learning_rate": 1.3311183737463102e-06, + "loss": 0.4161, + "step": 6939 + }, + { + "epoch": 0.7647382920110193, + "grad_norm": 6.1520304679870605, + "learning_rate": 1.3299307251053871e-06, + "loss": 0.4232, + "step": 6940 + }, + { + "epoch": 0.7648484848484849, + "grad_norm": 5.002037048339844, + "learning_rate": 1.3287435252455221e-06, + "loss": 0.3991, + "step": 6941 + }, + { + "epoch": 0.7649586776859504, + "grad_norm": 7.803005695343018, + "learning_rate": 1.3275567743118855e-06, + "loss": 0.4935, + "step": 6942 + }, + { + "epoch": 0.765068870523416, + "grad_norm": 5.232184410095215, + "learning_rate": 1.3263704724495923e-06, + "loss": 0.3817, + "step": 6943 + }, + { + "epoch": 0.7651790633608816, + "grad_norm": 5.051543235778809, + "learning_rate": 1.3251846198037104e-06, + "loss": 0.3642, + "step": 6944 + }, + { + "epoch": 0.7652892561983471, + "grad_norm": 9.887868881225586, + "learning_rate": 1.3239992165192457e-06, + "loss": 0.4108, + "step": 6945 + }, + { + "epoch": 0.7653994490358127, + "grad_norm": 11.279367446899414, + "learning_rate": 1.3228142627411468e-06, + "loss": 0.4468, + "step": 6946 + }, + { + "epoch": 0.7655096418732782, + "grad_norm": 6.569183826446533, + "learning_rate": 1.3216297586143173e-06, + "loss": 0.3612, + "step": 6947 + }, + { + "epoch": 0.7656198347107438, + "grad_norm": 9.783895492553711, + "learning_rate": 1.3204457042835984e-06, + "loss": 0.4023, + "step": 6948 + }, + { + "epoch": 0.7657300275482094, + "grad_norm": 10.846197128295898, + "learning_rate": 1.3192620998937734e-06, + "loss": 0.3935, + "step": 6949 + }, + { + "epoch": 0.7658402203856749, + "grad_norm": 5.138780117034912, + "learning_rate": 1.3180789455895814e-06, + "loss": 0.3837, + "step": 6950 + }, + { + "epoch": 0.7659504132231405, + "grad_norm": 10.842103958129883, + "learning_rate": 1.3168962415156966e-06, + "loss": 0.5067, + "step": 6951 + }, + { + "epoch": 0.7660606060606061, + "grad_norm": 7.0031819343566895, + "learning_rate": 1.3157139878167435e-06, + "loss": 0.3931, + "step": 6952 + }, + { + "epoch": 0.7661707988980716, + "grad_norm": 6.632589817047119, + "learning_rate": 1.3145321846372866e-06, + "loss": 0.4373, + 
"step": 6953 + }, + { + "epoch": 0.7662809917355372, + "grad_norm": 4.488170146942139, + "learning_rate": 1.3133508321218408e-06, + "loss": 0.3398, + "step": 6954 + }, + { + "epoch": 0.7663911845730027, + "grad_norm": 5.35597038269043, + "learning_rate": 1.312169930414865e-06, + "loss": 0.3409, + "step": 6955 + }, + { + "epoch": 0.7665013774104683, + "grad_norm": 5.265271186828613, + "learning_rate": 1.3109894796607576e-06, + "loss": 0.377, + "step": 6956 + }, + { + "epoch": 0.7666115702479339, + "grad_norm": 8.279088973999023, + "learning_rate": 1.3098094800038674e-06, + "loss": 0.4185, + "step": 6957 + }, + { + "epoch": 0.7667217630853994, + "grad_norm": 4.7576904296875, + "learning_rate": 1.3086299315884887e-06, + "loss": 0.3353, + "step": 6958 + }, + { + "epoch": 0.766831955922865, + "grad_norm": 6.927733421325684, + "learning_rate": 1.3074508345588543e-06, + "loss": 0.3909, + "step": 6959 + }, + { + "epoch": 0.7669421487603306, + "grad_norm": 4.825605869293213, + "learning_rate": 1.3062721890591478e-06, + "loss": 0.3701, + "step": 6960 + }, + { + "epoch": 0.7670523415977961, + "grad_norm": 7.008309364318848, + "learning_rate": 1.3050939952334968e-06, + "loss": 0.2961, + "step": 6961 + }, + { + "epoch": 0.7671625344352617, + "grad_norm": 4.5859761238098145, + "learning_rate": 1.3039162532259697e-06, + "loss": 0.4035, + "step": 6962 + }, + { + "epoch": 0.7672727272727272, + "grad_norm": 4.477420330047607, + "learning_rate": 1.3027389631805836e-06, + "loss": 0.3313, + "step": 6963 + }, + { + "epoch": 0.7673829201101928, + "grad_norm": 11.143181800842285, + "learning_rate": 1.3015621252413014e-06, + "loss": 0.3517, + "step": 6964 + }, + { + "epoch": 0.7674931129476584, + "grad_norm": 8.078117370605469, + "learning_rate": 1.3003857395520242e-06, + "loss": 0.3019, + "step": 6965 + }, + { + "epoch": 0.7676033057851239, + "grad_norm": 6.918178081512451, + "learning_rate": 1.2992098062566044e-06, + "loss": 0.4217, + "step": 6966 + }, + { + "epoch": 0.7677134986225895, + "grad_norm": 8.825976371765137, + "learning_rate": 1.2980343254988387e-06, + "loss": 0.4189, + "step": 6967 + }, + { + "epoch": 0.7678236914600551, + "grad_norm": 6.583045482635498, + "learning_rate": 1.296859297422462e-06, + "loss": 0.4003, + "step": 6968 + }, + { + "epoch": 0.7679338842975206, + "grad_norm": 8.6136474609375, + "learning_rate": 1.295684722171161e-06, + "loss": 0.4759, + "step": 6969 + }, + { + "epoch": 0.7680440771349862, + "grad_norm": 6.4836344718933105, + "learning_rate": 1.2945105998885654e-06, + "loss": 0.4326, + "step": 6970 + }, + { + "epoch": 0.7681542699724518, + "grad_norm": 6.296912670135498, + "learning_rate": 1.2933369307182453e-06, + "loss": 0.3223, + "step": 6971 + }, + { + "epoch": 0.7682644628099173, + "grad_norm": 12.304940223693848, + "learning_rate": 1.2921637148037203e-06, + "loss": 0.4472, + "step": 6972 + }, + { + "epoch": 0.7683746556473829, + "grad_norm": 6.3861799240112305, + "learning_rate": 1.290990952288455e-06, + "loss": 0.3862, + "step": 6973 + }, + { + "epoch": 0.7684848484848484, + "grad_norm": 6.812122821807861, + "learning_rate": 1.2898186433158521e-06, + "loss": 0.425, + "step": 6974 + }, + { + "epoch": 0.768595041322314, + "grad_norm": 3.9157824516296387, + "learning_rate": 1.2886467880292668e-06, + "loss": 0.3687, + "step": 6975 + }, + { + "epoch": 0.7687052341597796, + "grad_norm": 4.501402378082275, + "learning_rate": 1.2874753865719925e-06, + "loss": 0.3208, + "step": 6976 + }, + { + "epoch": 0.7688154269972451, + "grad_norm": 8.66434383392334, + "learning_rate": 
1.2863044390872708e-06, + "loss": 0.4822, + "step": 6977 + }, + { + "epoch": 0.7689256198347107, + "grad_norm": 7.586112976074219, + "learning_rate": 1.2851339457182882e-06, + "loss": 0.4505, + "step": 6978 + }, + { + "epoch": 0.7690358126721764, + "grad_norm": 10.623414039611816, + "learning_rate": 1.28396390660817e-06, + "loss": 0.4421, + "step": 6979 + }, + { + "epoch": 0.7691460055096419, + "grad_norm": 8.026878356933594, + "learning_rate": 1.282794321899996e-06, + "loss": 0.4319, + "step": 6980 + }, + { + "epoch": 0.7692561983471075, + "grad_norm": 9.683146476745605, + "learning_rate": 1.2816251917367816e-06, + "loss": 0.506, + "step": 6981 + }, + { + "epoch": 0.769366391184573, + "grad_norm": 7.421623706817627, + "learning_rate": 1.2804565162614868e-06, + "loss": 0.4425, + "step": 6982 + }, + { + "epoch": 0.7694765840220386, + "grad_norm": 4.457887172698975, + "learning_rate": 1.279288295617025e-06, + "loss": 0.3624, + "step": 6983 + }, + { + "epoch": 0.7695867768595042, + "grad_norm": 4.602921009063721, + "learning_rate": 1.278120529946244e-06, + "loss": 0.3022, + "step": 6984 + }, + { + "epoch": 0.7696969696969697, + "grad_norm": 9.712858200073242, + "learning_rate": 1.2769532193919387e-06, + "loss": 0.3575, + "step": 6985 + }, + { + "epoch": 0.7698071625344353, + "grad_norm": 6.121043682098389, + "learning_rate": 1.2757863640968515e-06, + "loss": 0.3384, + "step": 6986 + }, + { + "epoch": 0.7699173553719009, + "grad_norm": 5.888635158538818, + "learning_rate": 1.2746199642036676e-06, + "loss": 0.4432, + "step": 6987 + }, + { + "epoch": 0.7700275482093664, + "grad_norm": 6.5923309326171875, + "learning_rate": 1.2734540198550132e-06, + "loss": 0.3614, + "step": 6988 + }, + { + "epoch": 0.770137741046832, + "grad_norm": 7.518980026245117, + "learning_rate": 1.2722885311934641e-06, + "loss": 0.297, + "step": 6989 + }, + { + "epoch": 0.7702479338842976, + "grad_norm": 5.579894065856934, + "learning_rate": 1.271123498361538e-06, + "loss": 0.3988, + "step": 6990 + }, + { + "epoch": 0.7703581267217631, + "grad_norm": 5.087518215179443, + "learning_rate": 1.2699589215016939e-06, + "loss": 0.401, + "step": 6991 + }, + { + "epoch": 0.7704683195592287, + "grad_norm": 5.216823577880859, + "learning_rate": 1.26879480075634e-06, + "loss": 0.3954, + "step": 6992 + }, + { + "epoch": 0.7705785123966942, + "grad_norm": 5.857227802276611, + "learning_rate": 1.2676311362678261e-06, + "loss": 0.3368, + "step": 6993 + }, + { + "epoch": 0.7706887052341598, + "grad_norm": 5.294259548187256, + "learning_rate": 1.2664679281784487e-06, + "loss": 0.3901, + "step": 6994 + }, + { + "epoch": 0.7707988980716254, + "grad_norm": 7.392274856567383, + "learning_rate": 1.2653051766304425e-06, + "loss": 0.4224, + "step": 6995 + }, + { + "epoch": 0.7709090909090909, + "grad_norm": 9.018160820007324, + "learning_rate": 1.2641428817659928e-06, + "loss": 0.4405, + "step": 6996 + }, + { + "epoch": 0.7710192837465565, + "grad_norm": 10.305521965026855, + "learning_rate": 1.262981043727227e-06, + "loss": 0.3653, + "step": 6997 + }, + { + "epoch": 0.7711294765840221, + "grad_norm": 10.165287017822266, + "learning_rate": 1.2618196626562145e-06, + "loss": 0.4528, + "step": 6998 + }, + { + "epoch": 0.7712396694214876, + "grad_norm": 6.014214038848877, + "learning_rate": 1.2606587386949714e-06, + "loss": 0.4665, + "step": 6999 + }, + { + "epoch": 0.7713498622589532, + "grad_norm": 6.473193645477295, + "learning_rate": 1.2594982719854586e-06, + "loss": 0.4118, + "step": 7000 + }, + { + "epoch": 0.7714600550964187, + "grad_norm": 
6.1697821617126465, + "learning_rate": 1.2583382626695785e-06, + "loss": 0.366, + "step": 7001 + }, + { + "epoch": 0.7715702479338843, + "grad_norm": 7.235289573669434, + "learning_rate": 1.2571787108891748e-06, + "loss": 0.3543, + "step": 7002 + }, + { + "epoch": 0.7716804407713499, + "grad_norm": 6.012856960296631, + "learning_rate": 1.256019616786045e-06, + "loss": 0.4914, + "step": 7003 + }, + { + "epoch": 0.7717906336088154, + "grad_norm": 7.119135856628418, + "learning_rate": 1.2548609805019229e-06, + "loss": 0.4519, + "step": 7004 + }, + { + "epoch": 0.771900826446281, + "grad_norm": 6.916644096374512, + "learning_rate": 1.2537028021784836e-06, + "loss": 0.4306, + "step": 7005 + }, + { + "epoch": 0.7720110192837466, + "grad_norm": 5.378720283508301, + "learning_rate": 1.2525450819573582e-06, + "loss": 0.4267, + "step": 7006 + }, + { + "epoch": 0.7721212121212121, + "grad_norm": 6.772036075592041, + "learning_rate": 1.2513878199801088e-06, + "loss": 0.3534, + "step": 7007 + }, + { + "epoch": 0.7722314049586777, + "grad_norm": 5.722408771514893, + "learning_rate": 1.2502310163882502e-06, + "loss": 0.4124, + "step": 7008 + }, + { + "epoch": 0.7723415977961432, + "grad_norm": 5.215071678161621, + "learning_rate": 1.2490746713232356e-06, + "loss": 0.4087, + "step": 7009 + }, + { + "epoch": 0.7724517906336088, + "grad_norm": 12.780637741088867, + "learning_rate": 1.2479187849264646e-06, + "loss": 0.4359, + "step": 7010 + }, + { + "epoch": 0.7725619834710744, + "grad_norm": 9.537915229797363, + "learning_rate": 1.2467633573392829e-06, + "loss": 0.4041, + "step": 7011 + }, + { + "epoch": 0.7726721763085399, + "grad_norm": 6.791706085205078, + "learning_rate": 1.2456083887029746e-06, + "loss": 0.3762, + "step": 7012 + }, + { + "epoch": 0.7727823691460055, + "grad_norm": 9.5051908493042, + "learning_rate": 1.2444538791587723e-06, + "loss": 0.4262, + "step": 7013 + }, + { + "epoch": 0.7728925619834711, + "grad_norm": 7.05399751663208, + "learning_rate": 1.2432998288478531e-06, + "loss": 0.4192, + "step": 7014 + }, + { + "epoch": 0.7730027548209366, + "grad_norm": 5.819000720977783, + "learning_rate": 1.242146237911332e-06, + "loss": 0.4417, + "step": 7015 + }, + { + "epoch": 0.7731129476584022, + "grad_norm": 6.9604997634887695, + "learning_rate": 1.240993106490273e-06, + "loss": 0.5471, + "step": 7016 + }, + { + "epoch": 0.7732231404958678, + "grad_norm": 3.621635675430298, + "learning_rate": 1.2398404347256854e-06, + "loss": 0.3538, + "step": 7017 + }, + { + "epoch": 0.7733333333333333, + "grad_norm": 16.079496383666992, + "learning_rate": 1.238688222758515e-06, + "loss": 0.3974, + "step": 7018 + }, + { + "epoch": 0.7734435261707989, + "grad_norm": 6.103856086730957, + "learning_rate": 1.2375364707296583e-06, + "loss": 0.433, + "step": 7019 + }, + { + "epoch": 0.7735537190082644, + "grad_norm": 6.0294647216796875, + "learning_rate": 1.236385178779954e-06, + "loss": 0.3412, + "step": 7020 + }, + { + "epoch": 0.77366391184573, + "grad_norm": 5.790236949920654, + "learning_rate": 1.235234347050181e-06, + "loss": 0.3971, + "step": 7021 + }, + { + "epoch": 0.7737741046831956, + "grad_norm": 4.867669105529785, + "learning_rate": 1.2340839756810657e-06, + "loss": 0.4326, + "step": 7022 + }, + { + "epoch": 0.7738842975206611, + "grad_norm": 6.277116298675537, + "learning_rate": 1.2329340648132793e-06, + "loss": 0.4039, + "step": 7023 + }, + { + "epoch": 0.7739944903581267, + "grad_norm": 9.411334037780762, + "learning_rate": 1.2317846145874308e-06, + "loss": 0.4242, + "step": 7024 + }, + { + 
"epoch": 0.7741046831955923, + "grad_norm": 5.751246929168701, + "learning_rate": 1.2306356251440787e-06, + "loss": 0.3294, + "step": 7025 + }, + { + "epoch": 0.7742148760330578, + "grad_norm": 5.511191368103027, + "learning_rate": 1.2294870966237233e-06, + "loss": 0.3936, + "step": 7026 + }, + { + "epoch": 0.7743250688705234, + "grad_norm": 9.842085838317871, + "learning_rate": 1.2283390291668062e-06, + "loss": 0.4709, + "step": 7027 + }, + { + "epoch": 0.7744352617079889, + "grad_norm": 7.187948703765869, + "learning_rate": 1.2271914229137161e-06, + "loss": 0.3535, + "step": 7028 + }, + { + "epoch": 0.7745454545454545, + "grad_norm": 7.193229675292969, + "learning_rate": 1.2260442780047854e-06, + "loss": 0.4727, + "step": 7029 + }, + { + "epoch": 0.7746556473829201, + "grad_norm": 5.977843284606934, + "learning_rate": 1.224897594580285e-06, + "loss": 0.3024, + "step": 7030 + }, + { + "epoch": 0.7747658402203856, + "grad_norm": 7.264575958251953, + "learning_rate": 1.2237513727804346e-06, + "loss": 0.4059, + "step": 7031 + }, + { + "epoch": 0.7748760330578512, + "grad_norm": 6.463776111602783, + "learning_rate": 1.2226056127453972e-06, + "loss": 0.4138, + "step": 7032 + }, + { + "epoch": 0.7749862258953168, + "grad_norm": 6.193376541137695, + "learning_rate": 1.2214603146152753e-06, + "loss": 0.386, + "step": 7033 + }, + { + "epoch": 0.7750964187327823, + "grad_norm": 6.836053371429443, + "learning_rate": 1.2203154785301202e-06, + "loss": 0.3675, + "step": 7034 + }, + { + "epoch": 0.775206611570248, + "grad_norm": 7.610666751861572, + "learning_rate": 1.2191711046299199e-06, + "loss": 0.3901, + "step": 7035 + }, + { + "epoch": 0.7753168044077134, + "grad_norm": 4.446081638336182, + "learning_rate": 1.2180271930546155e-06, + "loss": 0.4249, + "step": 7036 + }, + { + "epoch": 0.775426997245179, + "grad_norm": 7.6869425773620605, + "learning_rate": 1.2168837439440834e-06, + "loss": 0.3503, + "step": 7037 + }, + { + "epoch": 0.7755371900826447, + "grad_norm": 5.408324241638184, + "learning_rate": 1.2157407574381424e-06, + "loss": 0.371, + "step": 7038 + }, + { + "epoch": 0.7756473829201102, + "grad_norm": 6.367844581604004, + "learning_rate": 1.2145982336765655e-06, + "loss": 0.3458, + "step": 7039 + }, + { + "epoch": 0.7757575757575758, + "grad_norm": 10.609964370727539, + "learning_rate": 1.2134561727990584e-06, + "loss": 0.3174, + "step": 7040 + }, + { + "epoch": 0.7758677685950414, + "grad_norm": 6.0939507484436035, + "learning_rate": 1.2123145749452724e-06, + "loss": 0.3861, + "step": 7041 + }, + { + "epoch": 0.7759779614325069, + "grad_norm": 4.9223198890686035, + "learning_rate": 1.2111734402548052e-06, + "loss": 0.4554, + "step": 7042 + }, + { + "epoch": 0.7760881542699725, + "grad_norm": 7.670893669128418, + "learning_rate": 1.2100327688671982e-06, + "loss": 0.4497, + "step": 7043 + }, + { + "epoch": 0.7761983471074381, + "grad_norm": 6.00565767288208, + "learning_rate": 1.2088925609219304e-06, + "loss": 0.3969, + "step": 7044 + }, + { + "epoch": 0.7763085399449036, + "grad_norm": 8.241581916809082, + "learning_rate": 1.20775281655843e-06, + "loss": 0.4465, + "step": 7045 + }, + { + "epoch": 0.7764187327823692, + "grad_norm": 6.670886993408203, + "learning_rate": 1.2066135359160686e-06, + "loss": 0.3624, + "step": 7046 + }, + { + "epoch": 0.7765289256198347, + "grad_norm": 7.617499828338623, + "learning_rate": 1.2054747191341548e-06, + "loss": 0.3823, + "step": 7047 + }, + { + "epoch": 0.7766391184573003, + "grad_norm": 6.233975887298584, + "learning_rate": 1.204336366351947e-06, + 
"loss": 0.4049, + "step": 7048 + }, + { + "epoch": 0.7767493112947659, + "grad_norm": 9.559657096862793, + "learning_rate": 1.2031984777086437e-06, + "loss": 0.4493, + "step": 7049 + }, + { + "epoch": 0.7768595041322314, + "grad_norm": 5.948649883270264, + "learning_rate": 1.20206105334339e-06, + "loss": 0.4542, + "step": 7050 + }, + { + "epoch": 0.776969696969697, + "grad_norm": 7.301677227020264, + "learning_rate": 1.2009240933952682e-06, + "loss": 0.4378, + "step": 7051 + }, + { + "epoch": 0.7770798898071626, + "grad_norm": 3.9794509410858154, + "learning_rate": 1.1997875980033086e-06, + "loss": 0.3574, + "step": 7052 + }, + { + "epoch": 0.7771900826446281, + "grad_norm": 6.005545139312744, + "learning_rate": 1.1986515673064847e-06, + "loss": 0.3983, + "step": 7053 + }, + { + "epoch": 0.7773002754820937, + "grad_norm": 4.148007869720459, + "learning_rate": 1.1975160014437098e-06, + "loss": 0.3678, + "step": 7054 + }, + { + "epoch": 0.7774104683195592, + "grad_norm": 7.430401802062988, + "learning_rate": 1.1963809005538436e-06, + "loss": 0.4654, + "step": 7055 + }, + { + "epoch": 0.7775206611570248, + "grad_norm": 7.025285720825195, + "learning_rate": 1.1952462647756885e-06, + "loss": 0.3726, + "step": 7056 + }, + { + "epoch": 0.7776308539944904, + "grad_norm": 4.826577663421631, + "learning_rate": 1.1941120942479873e-06, + "loss": 0.4262, + "step": 7057 + }, + { + "epoch": 0.7777410468319559, + "grad_norm": 5.657707691192627, + "learning_rate": 1.1929783891094287e-06, + "loss": 0.4076, + "step": 7058 + }, + { + "epoch": 0.7778512396694215, + "grad_norm": 6.577704906463623, + "learning_rate": 1.1918451494986461e-06, + "loss": 0.3889, + "step": 7059 + }, + { + "epoch": 0.7779614325068871, + "grad_norm": 8.449810028076172, + "learning_rate": 1.190712375554211e-06, + "loss": 0.3033, + "step": 7060 + }, + { + "epoch": 0.7780716253443526, + "grad_norm": 4.806299209594727, + "learning_rate": 1.189580067414638e-06, + "loss": 0.4073, + "step": 7061 + }, + { + "epoch": 0.7781818181818182, + "grad_norm": 4.187784671783447, + "learning_rate": 1.1884482252183933e-06, + "loss": 0.3368, + "step": 7062 + }, + { + "epoch": 0.7782920110192837, + "grad_norm": 8.73049259185791, + "learning_rate": 1.1873168491038762e-06, + "loss": 0.4421, + "step": 7063 + }, + { + "epoch": 0.7784022038567493, + "grad_norm": 5.850810527801514, + "learning_rate": 1.1861859392094332e-06, + "loss": 0.3799, + "step": 7064 + }, + { + "epoch": 0.7785123966942149, + "grad_norm": 13.17557430267334, + "learning_rate": 1.1850554956733557e-06, + "loss": 0.451, + "step": 7065 + }, + { + "epoch": 0.7786225895316804, + "grad_norm": 7.426393508911133, + "learning_rate": 1.1839255186338727e-06, + "loss": 0.4401, + "step": 7066 + }, + { + "epoch": 0.778732782369146, + "grad_norm": 5.190055847167969, + "learning_rate": 1.1827960082291623e-06, + "loss": 0.4143, + "step": 7067 + }, + { + "epoch": 0.7788429752066116, + "grad_norm": 8.469324111938477, + "learning_rate": 1.18166696459734e-06, + "loss": 0.4056, + "step": 7068 + }, + { + "epoch": 0.7789531680440771, + "grad_norm": 9.043612480163574, + "learning_rate": 1.1805383878764682e-06, + "loss": 0.4203, + "step": 7069 + }, + { + "epoch": 0.7790633608815427, + "grad_norm": 8.03549575805664, + "learning_rate": 1.1794102782045514e-06, + "loss": 0.3586, + "step": 7070 + }, + { + "epoch": 0.7791735537190083, + "grad_norm": 7.550805568695068, + "learning_rate": 1.1782826357195348e-06, + "loss": 0.4397, + "step": 7071 + }, + { + "epoch": 0.7792837465564738, + "grad_norm": 11.737250328063965, + 
"learning_rate": 1.1771554605593083e-06, + "loss": 0.4463, + "step": 7072 + }, + { + "epoch": 0.7793939393939394, + "grad_norm": 6.674030303955078, + "learning_rate": 1.1760287528617065e-06, + "loss": 0.4025, + "step": 7073 + }, + { + "epoch": 0.7795041322314049, + "grad_norm": 8.523613929748535, + "learning_rate": 1.1749025127645014e-06, + "loss": 0.34, + "step": 7074 + }, + { + "epoch": 0.7796143250688705, + "grad_norm": 8.995020866394043, + "learning_rate": 1.1737767404054135e-06, + "loss": 0.4579, + "step": 7075 + }, + { + "epoch": 0.7797245179063361, + "grad_norm": 5.88915491104126, + "learning_rate": 1.1726514359221041e-06, + "loss": 0.4213, + "step": 7076 + }, + { + "epoch": 0.7798347107438016, + "grad_norm": 4.900761127471924, + "learning_rate": 1.1715265994521745e-06, + "loss": 0.4118, + "step": 7077 + }, + { + "epoch": 0.7799449035812672, + "grad_norm": 7.442956924438477, + "learning_rate": 1.1704022311331737e-06, + "loss": 0.3528, + "step": 7078 + }, + { + "epoch": 0.7800550964187328, + "grad_norm": 7.432154178619385, + "learning_rate": 1.1692783311025908e-06, + "loss": 0.4494, + "step": 7079 + }, + { + "epoch": 0.7801652892561983, + "grad_norm": 7.1108717918396, + "learning_rate": 1.168154899497856e-06, + "loss": 0.3928, + "step": 7080 + }, + { + "epoch": 0.7802754820936639, + "grad_norm": 6.365954875946045, + "learning_rate": 1.1670319364563447e-06, + "loss": 0.3425, + "step": 7081 + }, + { + "epoch": 0.7803856749311294, + "grad_norm": 13.459142684936523, + "learning_rate": 1.1659094421153766e-06, + "loss": 0.471, + "step": 7082 + }, + { + "epoch": 0.780495867768595, + "grad_norm": 8.00849437713623, + "learning_rate": 1.1647874166122087e-06, + "loss": 0.3829, + "step": 7083 + }, + { + "epoch": 0.7806060606060606, + "grad_norm": 10.074445724487305, + "learning_rate": 1.1636658600840445e-06, + "loss": 0.3996, + "step": 7084 + }, + { + "epoch": 0.7807162534435261, + "grad_norm": 7.83542537689209, + "learning_rate": 1.1625447726680317e-06, + "loss": 0.3902, + "step": 7085 + }, + { + "epoch": 0.7808264462809917, + "grad_norm": 11.84658432006836, + "learning_rate": 1.1614241545012556e-06, + "loss": 0.4302, + "step": 7086 + }, + { + "epoch": 0.7809366391184573, + "grad_norm": 5.272179126739502, + "learning_rate": 1.1603040057207481e-06, + "loss": 0.4253, + "step": 7087 + }, + { + "epoch": 0.7810468319559228, + "grad_norm": 4.778642654418945, + "learning_rate": 1.1591843264634839e-06, + "loss": 0.4388, + "step": 7088 + }, + { + "epoch": 0.7811570247933884, + "grad_norm": 8.891243934631348, + "learning_rate": 1.1580651168663759e-06, + "loss": 0.3542, + "step": 7089 + }, + { + "epoch": 0.781267217630854, + "grad_norm": 4.417768478393555, + "learning_rate": 1.1569463770662842e-06, + "loss": 0.3298, + "step": 7090 + }, + { + "epoch": 0.7813774104683195, + "grad_norm": 7.110738277435303, + "learning_rate": 1.15582810720001e-06, + "loss": 0.3986, + "step": 7091 + }, + { + "epoch": 0.7814876033057852, + "grad_norm": 6.492128372192383, + "learning_rate": 1.154710307404298e-06, + "loss": 0.3744, + "step": 7092 + }, + { + "epoch": 0.7815977961432506, + "grad_norm": 7.248207092285156, + "learning_rate": 1.1535929778158328e-06, + "loss": 0.3587, + "step": 7093 + }, + { + "epoch": 0.7817079889807163, + "grad_norm": 5.013510704040527, + "learning_rate": 1.1524761185712402e-06, + "loss": 0.3996, + "step": 7094 + }, + { + "epoch": 0.7818181818181819, + "grad_norm": 5.109880447387695, + "learning_rate": 1.1513597298070973e-06, + "loss": 0.3952, + "step": 7095 + }, + { + "epoch": 0.7819283746556474, + 
"grad_norm": 6.290883541107178, + "learning_rate": 1.150243811659914e-06, + "loss": 0.291, + "step": 7096 + }, + { + "epoch": 0.782038567493113, + "grad_norm": 6.792827129364014, + "learning_rate": 1.1491283642661444e-06, + "loss": 0.3487, + "step": 7097 + }, + { + "epoch": 0.7821487603305786, + "grad_norm": 8.456485748291016, + "learning_rate": 1.1480133877621925e-06, + "loss": 0.4565, + "step": 7098 + }, + { + "epoch": 0.7822589531680441, + "grad_norm": 7.435070037841797, + "learning_rate": 1.146898882284395e-06, + "loss": 0.3485, + "step": 7099 + }, + { + "epoch": 0.7823691460055097, + "grad_norm": 5.633684158325195, + "learning_rate": 1.1457848479690354e-06, + "loss": 0.3553, + "step": 7100 + }, + { + "epoch": 0.7824793388429752, + "grad_norm": 9.572943687438965, + "learning_rate": 1.1446712849523395e-06, + "loss": 0.4794, + "step": 7101 + }, + { + "epoch": 0.7825895316804408, + "grad_norm": 7.53718900680542, + "learning_rate": 1.1435581933704776e-06, + "loss": 0.3371, + "step": 7102 + }, + { + "epoch": 0.7826997245179064, + "grad_norm": 7.918816566467285, + "learning_rate": 1.142445573359557e-06, + "loss": 0.3127, + "step": 7103 + }, + { + "epoch": 0.7828099173553719, + "grad_norm": 6.695704460144043, + "learning_rate": 1.1413334250556312e-06, + "loss": 0.3769, + "step": 7104 + }, + { + "epoch": 0.7829201101928375, + "grad_norm": 4.887105464935303, + "learning_rate": 1.140221748594696e-06, + "loss": 0.3407, + "step": 7105 + }, + { + "epoch": 0.7830303030303031, + "grad_norm": 6.283191204071045, + "learning_rate": 1.1391105441126898e-06, + "loss": 0.3842, + "step": 7106 + }, + { + "epoch": 0.7831404958677686, + "grad_norm": 7.616218090057373, + "learning_rate": 1.1379998117454894e-06, + "loss": 0.472, + "step": 7107 + }, + { + "epoch": 0.7832506887052342, + "grad_norm": 10.6012601852417, + "learning_rate": 1.136889551628918e-06, + "loss": 0.4453, + "step": 7108 + }, + { + "epoch": 0.7833608815426997, + "grad_norm": 6.9253010749816895, + "learning_rate": 1.1357797638987407e-06, + "loss": 0.3683, + "step": 7109 + }, + { + "epoch": 0.7834710743801653, + "grad_norm": 10.118677139282227, + "learning_rate": 1.1346704486906618e-06, + "loss": 0.3754, + "step": 7110 + }, + { + "epoch": 0.7835812672176309, + "grad_norm": 7.475785732269287, + "learning_rate": 1.133561606140331e-06, + "loss": 0.4331, + "step": 7111 + }, + { + "epoch": 0.7836914600550964, + "grad_norm": 7.108438014984131, + "learning_rate": 1.1324532363833408e-06, + "loss": 0.3855, + "step": 7112 + }, + { + "epoch": 0.783801652892562, + "grad_norm": 6.726047039031982, + "learning_rate": 1.1313453395552205e-06, + "loss": 0.4521, + "step": 7113 + }, + { + "epoch": 0.7839118457300276, + "grad_norm": 5.010704517364502, + "learning_rate": 1.1302379157914473e-06, + "loss": 0.386, + "step": 7114 + }, + { + "epoch": 0.7840220385674931, + "grad_norm": 7.332189083099365, + "learning_rate": 1.1291309652274397e-06, + "loss": 0.5025, + "step": 7115 + }, + { + "epoch": 0.7841322314049587, + "grad_norm": 7.428362846374512, + "learning_rate": 1.128024487998554e-06, + "loss": 0.3753, + "step": 7116 + }, + { + "epoch": 0.7842424242424243, + "grad_norm": 6.251130104064941, + "learning_rate": 1.1269184842400943e-06, + "loss": 0.3963, + "step": 7117 + }, + { + "epoch": 0.7843526170798898, + "grad_norm": 5.5299577713012695, + "learning_rate": 1.1258129540873042e-06, + "loss": 0.4352, + "step": 7118 + }, + { + "epoch": 0.7844628099173554, + "grad_norm": 8.869359970092773, + "learning_rate": 1.1247078976753673e-06, + "loss": 0.3957, + "step": 7119 + }, + 
{ + "epoch": 0.7845730027548209, + "grad_norm": 6.53255558013916, + "learning_rate": 1.1236033151394127e-06, + "loss": 0.3271, + "step": 7120 + }, + { + "epoch": 0.7846831955922865, + "grad_norm": 7.609908103942871, + "learning_rate": 1.1224992066145117e-06, + "loss": 0.4235, + "step": 7121 + }, + { + "epoch": 0.7847933884297521, + "grad_norm": 9.996800422668457, + "learning_rate": 1.121395572235674e-06, + "loss": 0.3917, + "step": 7122 + }, + { + "epoch": 0.7849035812672176, + "grad_norm": 7.006301403045654, + "learning_rate": 1.1202924121378532e-06, + "loss": 0.4124, + "step": 7123 + }, + { + "epoch": 0.7850137741046832, + "grad_norm": 6.177187919616699, + "learning_rate": 1.1191897264559487e-06, + "loss": 0.382, + "step": 7124 + }, + { + "epoch": 0.7851239669421488, + "grad_norm": 8.534271240234375, + "learning_rate": 1.1180875153247938e-06, + "loss": 0.3238, + "step": 7125 + }, + { + "epoch": 0.7852341597796143, + "grad_norm": 9.306317329406738, + "learning_rate": 1.1169857788791727e-06, + "loss": 0.3449, + "step": 7126 + }, + { + "epoch": 0.7853443526170799, + "grad_norm": 6.282261848449707, + "learning_rate": 1.1158845172538035e-06, + "loss": 0.3807, + "step": 7127 + }, + { + "epoch": 0.7854545454545454, + "grad_norm": 4.840741157531738, + "learning_rate": 1.1147837305833513e-06, + "loss": 0.376, + "step": 7128 + }, + { + "epoch": 0.785564738292011, + "grad_norm": 9.918661117553711, + "learning_rate": 1.1136834190024237e-06, + "loss": 0.3863, + "step": 7129 + }, + { + "epoch": 0.7856749311294766, + "grad_norm": 6.0856828689575195, + "learning_rate": 1.112583582645565e-06, + "loss": 0.4311, + "step": 7130 + }, + { + "epoch": 0.7857851239669421, + "grad_norm": 5.408751487731934, + "learning_rate": 1.1114842216472665e-06, + "loss": 0.3258, + "step": 7131 + }, + { + "epoch": 0.7858953168044077, + "grad_norm": 6.2737274169921875, + "learning_rate": 1.1103853361419608e-06, + "loss": 0.385, + "step": 7132 + }, + { + "epoch": 0.7860055096418733, + "grad_norm": 9.425874710083008, + "learning_rate": 1.1092869262640188e-06, + "loss": 0.3745, + "step": 7133 + }, + { + "epoch": 0.7861157024793388, + "grad_norm": 5.71481466293335, + "learning_rate": 1.1081889921477561e-06, + "loss": 0.4265, + "step": 7134 + }, + { + "epoch": 0.7862258953168044, + "grad_norm": 8.225992202758789, + "learning_rate": 1.1070915339274312e-06, + "loss": 0.4703, + "step": 7135 + }, + { + "epoch": 0.7863360881542699, + "grad_norm": 8.603610038757324, + "learning_rate": 1.10599455173724e-06, + "loss": 0.4142, + "step": 7136 + }, + { + "epoch": 0.7864462809917355, + "grad_norm": 12.46169376373291, + "learning_rate": 1.1048980457113251e-06, + "loss": 0.3765, + "step": 7137 + }, + { + "epoch": 0.7865564738292011, + "grad_norm": 7.737566947937012, + "learning_rate": 1.1038020159837692e-06, + "loss": 0.3685, + "step": 7138 + }, + { + "epoch": 0.7866666666666666, + "grad_norm": 5.6223931312561035, + "learning_rate": 1.1027064626885935e-06, + "loss": 0.3964, + "step": 7139 + }, + { + "epoch": 0.7867768595041322, + "grad_norm": 5.277441501617432, + "learning_rate": 1.1016113859597661e-06, + "loss": 0.2842, + "step": 7140 + }, + { + "epoch": 0.7868870523415978, + "grad_norm": 6.701341152191162, + "learning_rate": 1.1005167859311949e-06, + "loss": 0.4404, + "step": 7141 + }, + { + "epoch": 0.7869972451790633, + "grad_norm": 6.160130500793457, + "learning_rate": 1.0994226627367267e-06, + "loss": 0.4521, + "step": 7142 + }, + { + "epoch": 0.7871074380165289, + "grad_norm": 6.142246246337891, + "learning_rate": 1.098329016510154e-06, + 
"loss": 0.3734, + "step": 7143 + }, + { + "epoch": 0.7872176308539945, + "grad_norm": 6.93201208114624, + "learning_rate": 1.0972358473852102e-06, + "loss": 0.32, + "step": 7144 + }, + { + "epoch": 0.78732782369146, + "grad_norm": 10.67005729675293, + "learning_rate": 1.0961431554955671e-06, + "loss": 0.4239, + "step": 7145 + }, + { + "epoch": 0.7874380165289256, + "grad_norm": 6.718689441680908, + "learning_rate": 1.0950509409748416e-06, + "loss": 0.4462, + "step": 7146 + }, + { + "epoch": 0.7875482093663911, + "grad_norm": 5.778476715087891, + "learning_rate": 1.0939592039565915e-06, + "loss": 0.4016, + "step": 7147 + }, + { + "epoch": 0.7876584022038567, + "grad_norm": 7.173189163208008, + "learning_rate": 1.0928679445743168e-06, + "loss": 0.2673, + "step": 7148 + }, + { + "epoch": 0.7877685950413224, + "grad_norm": 5.864417552947998, + "learning_rate": 1.0917771629614565e-06, + "loss": 0.364, + "step": 7149 + }, + { + "epoch": 0.7878787878787878, + "grad_norm": 5.162528038024902, + "learning_rate": 1.090686859251393e-06, + "loss": 0.3032, + "step": 7150 + }, + { + "epoch": 0.7879889807162535, + "grad_norm": 6.525598049163818, + "learning_rate": 1.0895970335774518e-06, + "loss": 0.4942, + "step": 7151 + }, + { + "epoch": 0.7880991735537191, + "grad_norm": 7.823827743530273, + "learning_rate": 1.0885076860728977e-06, + "loss": 0.4232, + "step": 7152 + }, + { + "epoch": 0.7882093663911846, + "grad_norm": 6.679292678833008, + "learning_rate": 1.0874188168709343e-06, + "loss": 0.4352, + "step": 7153 + }, + { + "epoch": 0.7883195592286502, + "grad_norm": 6.845866680145264, + "learning_rate": 1.0863304261047148e-06, + "loss": 0.3552, + "step": 7154 + }, + { + "epoch": 0.7884297520661157, + "grad_norm": 8.466389656066895, + "learning_rate": 1.0852425139073275e-06, + "loss": 0.3649, + "step": 7155 + }, + { + "epoch": 0.7885399449035813, + "grad_norm": 6.212552070617676, + "learning_rate": 1.0841550804118001e-06, + "loss": 0.3685, + "step": 7156 + }, + { + "epoch": 0.7886501377410469, + "grad_norm": 6.205574035644531, + "learning_rate": 1.0830681257511117e-06, + "loss": 0.3829, + "step": 7157 + }, + { + "epoch": 0.7887603305785124, + "grad_norm": 7.429014205932617, + "learning_rate": 1.0819816500581737e-06, + "loss": 0.3488, + "step": 7158 + }, + { + "epoch": 0.788870523415978, + "grad_norm": 4.995081424713135, + "learning_rate": 1.0808956534658399e-06, + "loss": 0.4144, + "step": 7159 + }, + { + "epoch": 0.7889807162534436, + "grad_norm": 6.158295631408691, + "learning_rate": 1.079810136106909e-06, + "loss": 0.3498, + "step": 7160 + }, + { + "epoch": 0.7890909090909091, + "grad_norm": 5.911240100860596, + "learning_rate": 1.07872509811412e-06, + "loss": 0.3704, + "step": 7161 + }, + { + "epoch": 0.7892011019283747, + "grad_norm": 7.6627326011657715, + "learning_rate": 1.077640539620154e-06, + "loss": 0.4462, + "step": 7162 + }, + { + "epoch": 0.7893112947658402, + "grad_norm": 7.624194622039795, + "learning_rate": 1.0765564607576295e-06, + "loss": 0.3291, + "step": 7163 + }, + { + "epoch": 0.7894214876033058, + "grad_norm": 9.946186065673828, + "learning_rate": 1.0754728616591103e-06, + "loss": 0.3983, + "step": 7164 + }, + { + "epoch": 0.7895316804407714, + "grad_norm": 9.451842308044434, + "learning_rate": 1.074389742457102e-06, + "loss": 0.3839, + "step": 7165 + }, + { + "epoch": 0.7896418732782369, + "grad_norm": 8.298590660095215, + "learning_rate": 1.0733071032840464e-06, + "loss": 0.3419, + "step": 7166 + }, + { + "epoch": 0.7897520661157025, + "grad_norm": 5.167027473449707, + 
"learning_rate": 1.072224944272333e-06, + "loss": 0.4013, + "step": 7167 + }, + { + "epoch": 0.7898622589531681, + "grad_norm": 7.933115005493164, + "learning_rate": 1.0711432655542898e-06, + "loss": 0.4792, + "step": 7168 + }, + { + "epoch": 0.7899724517906336, + "grad_norm": 6.259268760681152, + "learning_rate": 1.070062067262183e-06, + "loss": 0.3645, + "step": 7169 + }, + { + "epoch": 0.7900826446280992, + "grad_norm": 4.871016025543213, + "learning_rate": 1.068981349528226e-06, + "loss": 0.3459, + "step": 7170 + }, + { + "epoch": 0.7901928374655648, + "grad_norm": 9.617240905761719, + "learning_rate": 1.0679011124845702e-06, + "loss": 0.4071, + "step": 7171 + }, + { + "epoch": 0.7903030303030303, + "grad_norm": 5.08292293548584, + "learning_rate": 1.0668213562633056e-06, + "loss": 0.3782, + "step": 7172 + }, + { + "epoch": 0.7904132231404959, + "grad_norm": 8.37671184539795, + "learning_rate": 1.0657420809964692e-06, + "loss": 0.4441, + "step": 7173 + }, + { + "epoch": 0.7905234159779614, + "grad_norm": 4.549337387084961, + "learning_rate": 1.0646632868160362e-06, + "loss": 0.2986, + "step": 7174 + }, + { + "epoch": 0.790633608815427, + "grad_norm": 7.388365745544434, + "learning_rate": 1.0635849738539205e-06, + "loss": 0.3397, + "step": 7175 + }, + { + "epoch": 0.7907438016528926, + "grad_norm": 6.887531757354736, + "learning_rate": 1.062507142241982e-06, + "loss": 0.4043, + "step": 7176 + }, + { + "epoch": 0.7908539944903581, + "grad_norm": 7.719393730163574, + "learning_rate": 1.0614297921120199e-06, + "loss": 0.3595, + "step": 7177 + }, + { + "epoch": 0.7909641873278237, + "grad_norm": 10.925946235656738, + "learning_rate": 1.060352923595771e-06, + "loss": 0.4671, + "step": 7178 + }, + { + "epoch": 0.7910743801652893, + "grad_norm": 6.271305561065674, + "learning_rate": 1.0592765368249186e-06, + "loss": 0.3992, + "step": 7179 + }, + { + "epoch": 0.7911845730027548, + "grad_norm": 9.810705184936523, + "learning_rate": 1.058200631931085e-06, + "loss": 0.4578, + "step": 7180 + }, + { + "epoch": 0.7912947658402204, + "grad_norm": 6.939256191253662, + "learning_rate": 1.0571252090458318e-06, + "loss": 0.4364, + "step": 7181 + }, + { + "epoch": 0.7914049586776859, + "grad_norm": 4.0625319480896, + "learning_rate": 1.0560502683006634e-06, + "loss": 0.4313, + "step": 7182 + }, + { + "epoch": 0.7915151515151515, + "grad_norm": 5.395636081695557, + "learning_rate": 1.0549758098270274e-06, + "loss": 0.3478, + "step": 7183 + }, + { + "epoch": 0.7916253443526171, + "grad_norm": 12.50815486907959, + "learning_rate": 1.0539018337563061e-06, + "loss": 0.5145, + "step": 7184 + }, + { + "epoch": 0.7917355371900826, + "grad_norm": 6.146428108215332, + "learning_rate": 1.0528283402198309e-06, + "loss": 0.3582, + "step": 7185 + }, + { + "epoch": 0.7918457300275482, + "grad_norm": 9.114592552185059, + "learning_rate": 1.0517553293488663e-06, + "loss": 0.4028, + "step": 7186 + }, + { + "epoch": 0.7919559228650138, + "grad_norm": 7.628662586212158, + "learning_rate": 1.0506828012746228e-06, + "loss": 0.426, + "step": 7187 + }, + { + "epoch": 0.7920661157024793, + "grad_norm": 5.638787746429443, + "learning_rate": 1.0496107561282532e-06, + "loss": 0.4132, + "step": 7188 + }, + { + "epoch": 0.7921763085399449, + "grad_norm": 5.9043779373168945, + "learning_rate": 1.048539194040843e-06, + "loss": 0.4129, + "step": 7189 + }, + { + "epoch": 0.7922865013774105, + "grad_norm": 4.690010070800781, + "learning_rate": 1.0474681151434306e-06, + "loss": 0.4328, + "step": 7190 + }, + { + "epoch": 0.792396694214876, + 
"grad_norm": 10.724125862121582, + "learning_rate": 1.0463975195669861e-06, + "loss": 0.3054, + "step": 7191 + }, + { + "epoch": 0.7925068870523416, + "grad_norm": 4.3964338302612305, + "learning_rate": 1.0453274074424218e-06, + "loss": 0.3724, + "step": 7192 + }, + { + "epoch": 0.7926170798898071, + "grad_norm": 6.200281620025635, + "learning_rate": 1.0442577789005943e-06, + "loss": 0.3352, + "step": 7193 + }, + { + "epoch": 0.7927272727272727, + "grad_norm": 7.095250606536865, + "learning_rate": 1.0431886340723003e-06, + "loss": 0.4533, + "step": 7194 + }, + { + "epoch": 0.7928374655647383, + "grad_norm": 6.526980400085449, + "learning_rate": 1.0421199730882736e-06, + "loss": 0.4564, + "step": 7195 + }, + { + "epoch": 0.7929476584022038, + "grad_norm": 10.025382041931152, + "learning_rate": 1.0410517960791926e-06, + "loss": 0.4522, + "step": 7196 + }, + { + "epoch": 0.7930578512396694, + "grad_norm": 5.875022888183594, + "learning_rate": 1.0399841031756774e-06, + "loss": 0.3377, + "step": 7197 + }, + { + "epoch": 0.793168044077135, + "grad_norm": 4.792463302612305, + "learning_rate": 1.0389168945082833e-06, + "loss": 0.3394, + "step": 7198 + }, + { + "epoch": 0.7932782369146005, + "grad_norm": 6.474595069885254, + "learning_rate": 1.0378501702075122e-06, + "loss": 0.3626, + "step": 7199 + }, + { + "epoch": 0.7933884297520661, + "grad_norm": 4.841497898101807, + "learning_rate": 1.0367839304038057e-06, + "loss": 0.3657, + "step": 7200 + }, + { + "epoch": 0.7934986225895316, + "grad_norm": 5.467446804046631, + "learning_rate": 1.0357181752275425e-06, + "loss": 0.3866, + "step": 7201 + }, + { + "epoch": 0.7936088154269972, + "grad_norm": 7.529897689819336, + "learning_rate": 1.034652904809046e-06, + "loss": 0.3415, + "step": 7202 + }, + { + "epoch": 0.7937190082644628, + "grad_norm": 8.403196334838867, + "learning_rate": 1.0335881192785778e-06, + "loss": 0.3787, + "step": 7203 + }, + { + "epoch": 0.7938292011019283, + "grad_norm": 11.709247589111328, + "learning_rate": 1.0325238187663444e-06, + "loss": 0.2872, + "step": 7204 + }, + { + "epoch": 0.793939393939394, + "grad_norm": 4.930643081665039, + "learning_rate": 1.0314600034024864e-06, + "loss": 0.322, + "step": 7205 + }, + { + "epoch": 0.7940495867768596, + "grad_norm": 7.090351581573486, + "learning_rate": 1.0303966733170896e-06, + "loss": 0.42, + "step": 7206 + }, + { + "epoch": 0.794159779614325, + "grad_norm": 6.290981769561768, + "learning_rate": 1.029333828640181e-06, + "loss": 0.4351, + "step": 7207 + }, + { + "epoch": 0.7942699724517907, + "grad_norm": 6.4861369132995605, + "learning_rate": 1.0282714695017255e-06, + "loss": 0.3989, + "step": 7208 + }, + { + "epoch": 0.7943801652892561, + "grad_norm": 5.802674770355225, + "learning_rate": 1.027209596031627e-06, + "loss": 0.4497, + "step": 7209 + }, + { + "epoch": 0.7944903581267218, + "grad_norm": 5.004255771636963, + "learning_rate": 1.0261482083597385e-06, + "loss": 0.3229, + "step": 7210 + }, + { + "epoch": 0.7946005509641874, + "grad_norm": 10.453167915344238, + "learning_rate": 1.025087306615845e-06, + "loss": 0.3638, + "step": 7211 + }, + { + "epoch": 0.7947107438016529, + "grad_norm": 8.389795303344727, + "learning_rate": 1.0240268909296724e-06, + "loss": 0.4663, + "step": 7212 + }, + { + "epoch": 0.7948209366391185, + "grad_norm": 7.390760898590088, + "learning_rate": 1.022966961430895e-06, + "loss": 0.3642, + "step": 7213 + }, + { + "epoch": 0.7949311294765841, + "grad_norm": 4.972836971282959, + "learning_rate": 1.02190751824912e-06, + "loss": 0.4278, + "step": 7214 + 
}, + { + "epoch": 0.7950413223140496, + "grad_norm": 20.41846466064453, + "learning_rate": 1.0208485615138946e-06, + "loss": 0.463, + "step": 7215 + }, + { + "epoch": 0.7951515151515152, + "grad_norm": 7.295198440551758, + "learning_rate": 1.0197900913547149e-06, + "loss": 0.3614, + "step": 7216 + }, + { + "epoch": 0.7952617079889808, + "grad_norm": 9.109318733215332, + "learning_rate": 1.0187321079010082e-06, + "loss": 0.359, + "step": 7217 + }, + { + "epoch": 0.7953719008264463, + "grad_norm": 5.5986857414245605, + "learning_rate": 1.0176746112821483e-06, + "loss": 0.3293, + "step": 7218 + }, + { + "epoch": 0.7954820936639119, + "grad_norm": 7.371835708618164, + "learning_rate": 1.0166176016274453e-06, + "loss": 0.333, + "step": 7219 + }, + { + "epoch": 0.7955922865013774, + "grad_norm": 7.072951793670654, + "learning_rate": 1.015561079066153e-06, + "loss": 0.4177, + "step": 7220 + }, + { + "epoch": 0.795702479338843, + "grad_norm": 7.765138626098633, + "learning_rate": 1.0145050437274655e-06, + "loss": 0.4301, + "step": 7221 + }, + { + "epoch": 0.7958126721763086, + "grad_norm": 8.221468925476074, + "learning_rate": 1.0134494957405139e-06, + "loss": 0.3963, + "step": 7222 + }, + { + "epoch": 0.7959228650137741, + "grad_norm": 6.8744072914123535, + "learning_rate": 1.0123944352343728e-06, + "loss": 0.3925, + "step": 7223 + }, + { + "epoch": 0.7960330578512397, + "grad_norm": 5.482988357543945, + "learning_rate": 1.0113398623380582e-06, + "loss": 0.3516, + "step": 7224 + }, + { + "epoch": 0.7961432506887053, + "grad_norm": 8.589863777160645, + "learning_rate": 1.0102857771805218e-06, + "loss": 0.415, + "step": 7225 + }, + { + "epoch": 0.7962534435261708, + "grad_norm": 7.051527500152588, + "learning_rate": 1.0092321798906606e-06, + "loss": 0.3998, + "step": 7226 + }, + { + "epoch": 0.7963636363636364, + "grad_norm": 4.680861473083496, + "learning_rate": 1.0081790705973105e-06, + "loss": 0.4115, + "step": 7227 + }, + { + "epoch": 0.7964738292011019, + "grad_norm": 4.199980735778809, + "learning_rate": 1.0071264494292443e-06, + "loss": 0.3424, + "step": 7228 + }, + { + "epoch": 0.7965840220385675, + "grad_norm": 4.739099502563477, + "learning_rate": 1.0060743165151798e-06, + "loss": 0.3158, + "step": 7229 + }, + { + "epoch": 0.7966942148760331, + "grad_norm": 5.168888092041016, + "learning_rate": 1.0050226719837746e-06, + "loss": 0.4125, + "step": 7230 + }, + { + "epoch": 0.7968044077134986, + "grad_norm": 6.180441379547119, + "learning_rate": 1.0039715159636225e-06, + "loss": 0.409, + "step": 7231 + }, + { + "epoch": 0.7969146005509642, + "grad_norm": 8.566987037658691, + "learning_rate": 1.0029208485832614e-06, + "loss": 0.4531, + "step": 7232 + }, + { + "epoch": 0.7970247933884298, + "grad_norm": 7.466010570526123, + "learning_rate": 1.0018706699711695e-06, + "loss": 0.3514, + "step": 7233 + }, + { + "epoch": 0.7971349862258953, + "grad_norm": 6.98629903793335, + "learning_rate": 1.0008209802557617e-06, + "loss": 0.3916, + "step": 7234 + }, + { + "epoch": 0.7972451790633609, + "grad_norm": 6.633272171020508, + "learning_rate": 9.997717795653972e-07, + "loss": 0.4642, + "step": 7235 + }, + { + "epoch": 0.7973553719008264, + "grad_norm": 19.752498626708984, + "learning_rate": 9.987230680283744e-07, + "loss": 0.5274, + "step": 7236 + }, + { + "epoch": 0.797465564738292, + "grad_norm": 8.323124885559082, + "learning_rate": 9.976748457729285e-07, + "loss": 0.4353, + "step": 7237 + }, + { + "epoch": 0.7975757575757576, + "grad_norm": 8.583427429199219, + "learning_rate": 9.966271129272391e-07, 
+ "loss": 0.513, + "step": 7238 + }, + { + "epoch": 0.7976859504132231, + "grad_norm": 6.132208347320557, + "learning_rate": 9.955798696194259e-07, + "loss": 0.3029, + "step": 7239 + }, + { + "epoch": 0.7977961432506887, + "grad_norm": 5.390061855316162, + "learning_rate": 9.945331159775445e-07, + "loss": 0.3564, + "step": 7240 + }, + { + "epoch": 0.7979063360881543, + "grad_norm": 7.2979607582092285, + "learning_rate": 9.934868521295955e-07, + "loss": 0.4015, + "step": 7241 + }, + { + "epoch": 0.7980165289256198, + "grad_norm": 9.862425804138184, + "learning_rate": 9.924410782035155e-07, + "loss": 0.368, + "step": 7242 + }, + { + "epoch": 0.7981267217630854, + "grad_norm": 9.85254955291748, + "learning_rate": 9.91395794327184e-07, + "loss": 0.3668, + "step": 7243 + }, + { + "epoch": 0.798236914600551, + "grad_norm": 7.688636779785156, + "learning_rate": 9.903510006284218e-07, + "loss": 0.3777, + "step": 7244 + }, + { + "epoch": 0.7983471074380165, + "grad_norm": 7.272444725036621, + "learning_rate": 9.893066972349824e-07, + "loss": 0.3122, + "step": 7245 + }, + { + "epoch": 0.7984573002754821, + "grad_norm": 11.556692123413086, + "learning_rate": 9.882628842745712e-07, + "loss": 0.3954, + "step": 7246 + }, + { + "epoch": 0.7985674931129476, + "grad_norm": 7.959048748016357, + "learning_rate": 9.872195618748236e-07, + "loss": 0.5244, + "step": 7247 + }, + { + "epoch": 0.7986776859504132, + "grad_norm": 6.023469924926758, + "learning_rate": 9.861767301633163e-07, + "loss": 0.4132, + "step": 7248 + }, + { + "epoch": 0.7987878787878788, + "grad_norm": 6.7346510887146, + "learning_rate": 9.851343892675735e-07, + "loss": 0.3649, + "step": 7249 + }, + { + "epoch": 0.7988980716253443, + "grad_norm": 7.217971324920654, + "learning_rate": 9.840925393150507e-07, + "loss": 0.4012, + "step": 7250 + }, + { + "epoch": 0.7990082644628099, + "grad_norm": 6.179969787597656, + "learning_rate": 9.830511804331467e-07, + "loss": 0.3586, + "step": 7251 + }, + { + "epoch": 0.7991184573002755, + "grad_norm": 5.323660373687744, + "learning_rate": 9.820103127492002e-07, + "loss": 0.3714, + "step": 7252 + }, + { + "epoch": 0.799228650137741, + "grad_norm": 6.620593070983887, + "learning_rate": 9.809699363904924e-07, + "loss": 0.3783, + "step": 7253 + }, + { + "epoch": 0.7993388429752066, + "grad_norm": 7.338545799255371, + "learning_rate": 9.799300514842386e-07, + "loss": 0.3991, + "step": 7254 + }, + { + "epoch": 0.7994490358126721, + "grad_norm": 5.968478202819824, + "learning_rate": 9.788906581575986e-07, + "loss": 0.4115, + "step": 7255 + }, + { + "epoch": 0.7995592286501377, + "grad_norm": 16.298006057739258, + "learning_rate": 9.778517565376727e-07, + "loss": 0.4783, + "step": 7256 + }, + { + "epoch": 0.7996694214876033, + "grad_norm": 12.101306915283203, + "learning_rate": 9.768133467514961e-07, + "loss": 0.4822, + "step": 7257 + }, + { + "epoch": 0.7997796143250688, + "grad_norm": 11.499910354614258, + "learning_rate": 9.757754289260485e-07, + "loss": 0.4332, + "step": 7258 + }, + { + "epoch": 0.7998898071625344, + "grad_norm": 7.933718681335449, + "learning_rate": 9.747380031882474e-07, + "loss": 0.4092, + "step": 7259 + }, + { + "epoch": 0.8, + "grad_norm": 7.823580741882324, + "learning_rate": 9.73701069664953e-07, + "loss": 0.3825, + "step": 7260 + }, + { + "epoch": 0.8001101928374655, + "grad_norm": 9.790545463562012, + "learning_rate": 9.726646284829594e-07, + "loss": 0.4438, + "step": 7261 + }, + { + "epoch": 0.8002203856749311, + "grad_norm": 8.749502182006836, + "learning_rate": 
9.716286797690056e-07, + "loss": 0.3701, + "step": 7262 + }, + { + "epoch": 0.8003305785123966, + "grad_norm": 6.369805812835693, + "learning_rate": 9.705932236497701e-07, + "loss": 0.4131, + "step": 7263 + }, + { + "epoch": 0.8004407713498622, + "grad_norm": 5.037927150726318, + "learning_rate": 9.695582602518671e-07, + "loss": 0.4315, + "step": 7264 + }, + { + "epoch": 0.8004407713498622, + "eval_loss": 0.39739754796028137, + "eval_runtime": 41.9639, + "eval_samples_per_second": 17.491, + "eval_steps_per_second": 2.192, + "step": 7264 + }, + { + "epoch": 0.8005509641873279, + "grad_norm": 6.549569129943848, + "learning_rate": 9.68523789701855e-07, + "loss": 0.4189, + "step": 7265 + }, + { + "epoch": 0.8006611570247933, + "grad_norm": 8.137191772460938, + "learning_rate": 9.674898121262322e-07, + "loss": 0.4373, + "step": 7266 + }, + { + "epoch": 0.800771349862259, + "grad_norm": 4.936229228973389, + "learning_rate": 9.664563276514321e-07, + "loss": 0.3285, + "step": 7267 + }, + { + "epoch": 0.8008815426997246, + "grad_norm": 5.178045749664307, + "learning_rate": 9.654233364038285e-07, + "loss": 0.417, + "step": 7268 + }, + { + "epoch": 0.80099173553719, + "grad_norm": 13.220976829528809, + "learning_rate": 9.643908385097428e-07, + "loss": 0.5043, + "step": 7269 + }, + { + "epoch": 0.8011019283746557, + "grad_norm": 4.701597213745117, + "learning_rate": 9.633588340954269e-07, + "loss": 0.3783, + "step": 7270 + }, + { + "epoch": 0.8012121212121213, + "grad_norm": 7.403231143951416, + "learning_rate": 9.623273232870734e-07, + "loss": 0.3556, + "step": 7271 + }, + { + "epoch": 0.8013223140495868, + "grad_norm": 7.546553134918213, + "learning_rate": 9.612963062108222e-07, + "loss": 0.4229, + "step": 7272 + }, + { + "epoch": 0.8014325068870524, + "grad_norm": 4.353821754455566, + "learning_rate": 9.60265782992743e-07, + "loss": 0.3123, + "step": 7273 + }, + { + "epoch": 0.8015426997245179, + "grad_norm": 5.343846321105957, + "learning_rate": 9.59235753758853e-07, + "loss": 0.4493, + "step": 7274 + }, + { + "epoch": 0.8016528925619835, + "grad_norm": 10.174478530883789, + "learning_rate": 9.582062186351027e-07, + "loss": 0.4762, + "step": 7275 + }, + { + "epoch": 0.8017630853994491, + "grad_norm": 5.632132053375244, + "learning_rate": 9.57177177747386e-07, + "loss": 0.2683, + "step": 7276 + }, + { + "epoch": 0.8018732782369146, + "grad_norm": 5.7510504722595215, + "learning_rate": 9.561486312215374e-07, + "loss": 0.4303, + "step": 7277 + }, + { + "epoch": 0.8019834710743802, + "grad_norm": 4.070655345916748, + "learning_rate": 9.551205791833252e-07, + "loss": 0.4219, + "step": 7278 + }, + { + "epoch": 0.8020936639118458, + "grad_norm": 6.352983474731445, + "learning_rate": 9.540930217584633e-07, + "loss": 0.4086, + "step": 7279 + }, + { + "epoch": 0.8022038567493113, + "grad_norm": 12.105496406555176, + "learning_rate": 9.530659590726037e-07, + "loss": 0.4251, + "step": 7280 + }, + { + "epoch": 0.8023140495867769, + "grad_norm": 5.1941680908203125, + "learning_rate": 9.520393912513348e-07, + "loss": 0.3371, + "step": 7281 + }, + { + "epoch": 0.8024242424242424, + "grad_norm": 6.544575214385986, + "learning_rate": 9.510133184201881e-07, + "loss": 0.3963, + "step": 7282 + }, + { + "epoch": 0.802534435261708, + "grad_norm": 5.792811870574951, + "learning_rate": 9.499877407046332e-07, + "loss": 0.4255, + "step": 7283 + }, + { + "epoch": 0.8026446280991736, + "grad_norm": 7.323215484619141, + "learning_rate": 9.489626582300782e-07, + "loss": 0.3708, + "step": 7284 + }, + { + "epoch": 
0.8027548209366391, + "grad_norm": 9.369105339050293, + "learning_rate": 9.479380711218716e-07, + "loss": 0.4317, + "step": 7285 + }, + { + "epoch": 0.8028650137741047, + "grad_norm": 8.165177345275879, + "learning_rate": 9.469139795053034e-07, + "loss": 0.2629, + "step": 7286 + }, + { + "epoch": 0.8029752066115703, + "grad_norm": 8.255684852600098, + "learning_rate": 9.458903835055983e-07, + "loss": 0.3928, + "step": 7287 + }, + { + "epoch": 0.8030853994490358, + "grad_norm": 5.622321605682373, + "learning_rate": 9.448672832479239e-07, + "loss": 0.371, + "step": 7288 + }, + { + "epoch": 0.8031955922865014, + "grad_norm": 4.412442684173584, + "learning_rate": 9.43844678857388e-07, + "loss": 0.3221, + "step": 7289 + }, + { + "epoch": 0.8033057851239669, + "grad_norm": 4.750947952270508, + "learning_rate": 9.428225704590327e-07, + "loss": 0.3824, + "step": 7290 + }, + { + "epoch": 0.8034159779614325, + "grad_norm": 6.953831195831299, + "learning_rate": 9.418009581778447e-07, + "loss": 0.3953, + "step": 7291 + }, + { + "epoch": 0.8035261707988981, + "grad_norm": 6.322239398956299, + "learning_rate": 9.407798421387498e-07, + "loss": 0.3995, + "step": 7292 + }, + { + "epoch": 0.8036363636363636, + "grad_norm": 5.172215938568115, + "learning_rate": 9.39759222466608e-07, + "loss": 0.4628, + "step": 7293 + }, + { + "epoch": 0.8037465564738292, + "grad_norm": 7.184489727020264, + "learning_rate": 9.387390992862238e-07, + "loss": 0.373, + "step": 7294 + }, + { + "epoch": 0.8038567493112948, + "grad_norm": 5.020264148712158, + "learning_rate": 9.37719472722341e-07, + "loss": 0.3731, + "step": 7295 + }, + { + "epoch": 0.8039669421487603, + "grad_norm": 7.934032440185547, + "learning_rate": 9.367003428996374e-07, + "loss": 0.4368, + "step": 7296 + }, + { + "epoch": 0.8040771349862259, + "grad_norm": 7.1293134689331055, + "learning_rate": 9.356817099427351e-07, + "loss": 0.473, + "step": 7297 + }, + { + "epoch": 0.8041873278236915, + "grad_norm": 7.446048259735107, + "learning_rate": 9.346635739761955e-07, + "loss": 0.4487, + "step": 7298 + }, + { + "epoch": 0.804297520661157, + "grad_norm": 5.76701545715332, + "learning_rate": 9.336459351245152e-07, + "loss": 0.3696, + "step": 7299 + }, + { + "epoch": 0.8044077134986226, + "grad_norm": 5.016604900360107, + "learning_rate": 9.326287935121353e-07, + "loss": 0.3991, + "step": 7300 + }, + { + "epoch": 0.8045179063360881, + "grad_norm": 4.685810089111328, + "learning_rate": 9.316121492634283e-07, + "loss": 0.3693, + "step": 7301 + }, + { + "epoch": 0.8046280991735537, + "grad_norm": 7.337115287780762, + "learning_rate": 9.305960025027172e-07, + "loss": 0.3524, + "step": 7302 + }, + { + "epoch": 0.8047382920110193, + "grad_norm": 7.880723476409912, + "learning_rate": 9.295803533542541e-07, + "loss": 0.4536, + "step": 7303 + }, + { + "epoch": 0.8048484848484848, + "grad_norm": 10.412185668945312, + "learning_rate": 9.28565201942232e-07, + "loss": 0.4296, + "step": 7304 + }, + { + "epoch": 0.8049586776859504, + "grad_norm": 8.625537872314453, + "learning_rate": 9.275505483907904e-07, + "loss": 0.3921, + "step": 7305 + }, + { + "epoch": 0.805068870523416, + "grad_norm": 10.102866172790527, + "learning_rate": 9.265363928239995e-07, + "loss": 0.3705, + "step": 7306 + }, + { + "epoch": 0.8051790633608815, + "grad_norm": 7.82152795791626, + "learning_rate": 9.255227353658691e-07, + "loss": 0.4334, + "step": 7307 + }, + { + "epoch": 0.8052892561983471, + "grad_norm": 4.74699592590332, + "learning_rate": 9.245095761403555e-07, + "loss": 0.4471, + "step": 7308 + }, + 
{ + "epoch": 0.8053994490358126, + "grad_norm": 9.94813346862793, + "learning_rate": 9.234969152713475e-07, + "loss": 0.3688, + "step": 7309 + }, + { + "epoch": 0.8055096418732782, + "grad_norm": 12.886780738830566, + "learning_rate": 9.224847528826725e-07, + "loss": 0.4914, + "step": 7310 + }, + { + "epoch": 0.8056198347107438, + "grad_norm": 7.919636249542236, + "learning_rate": 9.214730890981005e-07, + "loss": 0.3961, + "step": 7311 + }, + { + "epoch": 0.8057300275482093, + "grad_norm": 11.858296394348145, + "learning_rate": 9.204619240413409e-07, + "loss": 0.435, + "step": 7312 + }, + { + "epoch": 0.8058402203856749, + "grad_norm": 6.879891395568848, + "learning_rate": 9.194512578360377e-07, + "loss": 0.443, + "step": 7313 + }, + { + "epoch": 0.8059504132231405, + "grad_norm": 10.284538269042969, + "learning_rate": 9.184410906057773e-07, + "loss": 0.421, + "step": 7314 + }, + { + "epoch": 0.806060606060606, + "grad_norm": 10.9752836227417, + "learning_rate": 9.174314224740844e-07, + "loss": 0.5039, + "step": 7315 + }, + { + "epoch": 0.8061707988980716, + "grad_norm": 11.572310447692871, + "learning_rate": 9.164222535644241e-07, + "loss": 0.417, + "step": 7316 + }, + { + "epoch": 0.8062809917355372, + "grad_norm": 6.825654029846191, + "learning_rate": 9.154135840001965e-07, + "loss": 0.3913, + "step": 7317 + }, + { + "epoch": 0.8063911845730027, + "grad_norm": 9.148991584777832, + "learning_rate": 9.144054139047442e-07, + "loss": 0.4296, + "step": 7318 + }, + { + "epoch": 0.8065013774104683, + "grad_norm": 4.8619890213012695, + "learning_rate": 9.133977434013485e-07, + "loss": 0.3764, + "step": 7319 + }, + { + "epoch": 0.8066115702479338, + "grad_norm": 5.3864030838012695, + "learning_rate": 9.12390572613227e-07, + "loss": 0.3388, + "step": 7320 + }, + { + "epoch": 0.8067217630853994, + "grad_norm": 4.868517875671387, + "learning_rate": 9.113839016635389e-07, + "loss": 0.345, + "step": 7321 + }, + { + "epoch": 0.806831955922865, + "grad_norm": 5.957769393920898, + "learning_rate": 9.103777306753825e-07, + "loss": 0.4002, + "step": 7322 + }, + { + "epoch": 0.8069421487603305, + "grad_norm": 6.74638032913208, + "learning_rate": 9.093720597717909e-07, + "loss": 0.4176, + "step": 7323 + }, + { + "epoch": 0.8070523415977962, + "grad_norm": 4.691101551055908, + "learning_rate": 9.083668890757402e-07, + "loss": 0.3389, + "step": 7324 + }, + { + "epoch": 0.8071625344352618, + "grad_norm": 15.070783615112305, + "learning_rate": 9.073622187101455e-07, + "loss": 0.4583, + "step": 7325 + }, + { + "epoch": 0.8072727272727273, + "grad_norm": 12.757047653198242, + "learning_rate": 9.063580487978579e-07, + "loss": 0.3861, + "step": 7326 + }, + { + "epoch": 0.8073829201101929, + "grad_norm": 6.557346343994141, + "learning_rate": 9.053543794616665e-07, + "loss": 0.4441, + "step": 7327 + }, + { + "epoch": 0.8074931129476584, + "grad_norm": 6.969200134277344, + "learning_rate": 9.043512108243063e-07, + "loss": 0.4289, + "step": 7328 + }, + { + "epoch": 0.807603305785124, + "grad_norm": 7.515108108520508, + "learning_rate": 9.033485430084421e-07, + "loss": 0.3323, + "step": 7329 + }, + { + "epoch": 0.8077134986225896, + "grad_norm": 4.961164951324463, + "learning_rate": 9.023463761366824e-07, + "loss": 0.3961, + "step": 7330 + }, + { + "epoch": 0.8078236914600551, + "grad_norm": 7.038287162780762, + "learning_rate": 9.013447103315758e-07, + "loss": 0.4241, + "step": 7331 + }, + { + "epoch": 0.8079338842975207, + "grad_norm": 5.465232849121094, + "learning_rate": 9.00343545715604e-07, + "loss": 0.3273, + 
"step": 7332 + }, + { + "epoch": 0.8080440771349863, + "grad_norm": 8.099470138549805, + "learning_rate": 8.993428824111932e-07, + "loss": 0.3854, + "step": 7333 + }, + { + "epoch": 0.8081542699724518, + "grad_norm": 3.8248982429504395, + "learning_rate": 8.983427205407041e-07, + "loss": 0.4089, + "step": 7334 + }, + { + "epoch": 0.8082644628099174, + "grad_norm": 7.873836517333984, + "learning_rate": 8.973430602264388e-07, + "loss": 0.3808, + "step": 7335 + }, + { + "epoch": 0.8083746556473829, + "grad_norm": 6.351470470428467, + "learning_rate": 8.963439015906378e-07, + "loss": 0.4302, + "step": 7336 + }, + { + "epoch": 0.8084848484848485, + "grad_norm": 9.827178001403809, + "learning_rate": 8.953452447554778e-07, + "loss": 0.335, + "step": 7337 + }, + { + "epoch": 0.8085950413223141, + "grad_norm": 5.265390396118164, + "learning_rate": 8.943470898430768e-07, + "loss": 0.3765, + "step": 7338 + }, + { + "epoch": 0.8087052341597796, + "grad_norm": 5.931137561798096, + "learning_rate": 8.933494369754919e-07, + "loss": 0.3746, + "step": 7339 + }, + { + "epoch": 0.8088154269972452, + "grad_norm": 5.3239359855651855, + "learning_rate": 8.923522862747148e-07, + "loss": 0.39, + "step": 7340 + }, + { + "epoch": 0.8089256198347108, + "grad_norm": 7.077722072601318, + "learning_rate": 8.913556378626804e-07, + "loss": 0.4648, + "step": 7341 + }, + { + "epoch": 0.8090358126721763, + "grad_norm": 4.245509147644043, + "learning_rate": 8.903594918612601e-07, + "loss": 0.3343, + "step": 7342 + }, + { + "epoch": 0.8091460055096419, + "grad_norm": 6.912594795227051, + "learning_rate": 8.893638483922628e-07, + "loss": 0.4431, + "step": 7343 + }, + { + "epoch": 0.8092561983471075, + "grad_norm": 6.49102258682251, + "learning_rate": 8.883687075774377e-07, + "loss": 0.3481, + "step": 7344 + }, + { + "epoch": 0.809366391184573, + "grad_norm": 4.468546390533447, + "learning_rate": 8.873740695384736e-07, + "loss": 0.3958, + "step": 7345 + }, + { + "epoch": 0.8094765840220386, + "grad_norm": 6.101900577545166, + "learning_rate": 8.863799343969931e-07, + "loss": 0.4225, + "step": 7346 + }, + { + "epoch": 0.8095867768595041, + "grad_norm": 8.90814208984375, + "learning_rate": 8.853863022745623e-07, + "loss": 0.3228, + "step": 7347 + }, + { + "epoch": 0.8096969696969697, + "grad_norm": 5.18488883972168, + "learning_rate": 8.843931732926847e-07, + "loss": 0.3955, + "step": 7348 + }, + { + "epoch": 0.8098071625344353, + "grad_norm": 8.424071311950684, + "learning_rate": 8.834005475727991e-07, + "loss": 0.3924, + "step": 7349 + }, + { + "epoch": 0.8099173553719008, + "grad_norm": 14.261128425598145, + "learning_rate": 8.824084252362864e-07, + "loss": 0.3871, + "step": 7350 + }, + { + "epoch": 0.8100275482093664, + "grad_norm": 5.683298110961914, + "learning_rate": 8.814168064044659e-07, + "loss": 0.4796, + "step": 7351 + }, + { + "epoch": 0.810137741046832, + "grad_norm": 8.727130889892578, + "learning_rate": 8.80425691198592e-07, + "loss": 0.3456, + "step": 7352 + }, + { + "epoch": 0.8102479338842975, + "grad_norm": 7.0580339431762695, + "learning_rate": 8.794350797398604e-07, + "loss": 0.3149, + "step": 7353 + }, + { + "epoch": 0.8103581267217631, + "grad_norm": 5.936559677124023, + "learning_rate": 8.784449721494054e-07, + "loss": 0.4401, + "step": 7354 + }, + { + "epoch": 0.8104683195592286, + "grad_norm": 6.004831790924072, + "learning_rate": 8.774553685482968e-07, + "loss": 0.423, + "step": 7355 + }, + { + "epoch": 0.8105785123966942, + "grad_norm": 7.186921119689941, + "learning_rate": 8.764662690575454e-07, + 
"loss": 0.3428, + "step": 7356 + }, + { + "epoch": 0.8106887052341598, + "grad_norm": 8.847982406616211, + "learning_rate": 8.754776737981002e-07, + "loss": 0.4349, + "step": 7357 + }, + { + "epoch": 0.8107988980716253, + "grad_norm": 7.745877265930176, + "learning_rate": 8.744895828908484e-07, + "loss": 0.4443, + "step": 7358 + }, + { + "epoch": 0.8109090909090909, + "grad_norm": 9.938824653625488, + "learning_rate": 8.735019964566149e-07, + "loss": 0.4282, + "step": 7359 + }, + { + "epoch": 0.8110192837465565, + "grad_norm": 7.234878063201904, + "learning_rate": 8.725149146161599e-07, + "loss": 0.4152, + "step": 7360 + }, + { + "epoch": 0.811129476584022, + "grad_norm": 11.086160659790039, + "learning_rate": 8.715283374901901e-07, + "loss": 0.3772, + "step": 7361 + }, + { + "epoch": 0.8112396694214876, + "grad_norm": 11.494916915893555, + "learning_rate": 8.705422651993434e-07, + "loss": 0.4294, + "step": 7362 + }, + { + "epoch": 0.8113498622589531, + "grad_norm": 13.305935859680176, + "learning_rate": 8.69556697864195e-07, + "loss": 0.4527, + "step": 7363 + }, + { + "epoch": 0.8114600550964187, + "grad_norm": 5.638788223266602, + "learning_rate": 8.68571635605267e-07, + "loss": 0.3419, + "step": 7364 + }, + { + "epoch": 0.8115702479338843, + "grad_norm": 7.193055629730225, + "learning_rate": 8.675870785430113e-07, + "loss": 0.3688, + "step": 7365 + }, + { + "epoch": 0.8116804407713498, + "grad_norm": 4.8958048820495605, + "learning_rate": 8.666030267978199e-07, + "loss": 0.4236, + "step": 7366 + }, + { + "epoch": 0.8117906336088154, + "grad_norm": 7.572325229644775, + "learning_rate": 8.656194804900254e-07, + "loss": 0.3654, + "step": 7367 + }, + { + "epoch": 0.811900826446281, + "grad_norm": 8.064297676086426, + "learning_rate": 8.64636439739897e-07, + "loss": 0.4437, + "step": 7368 + }, + { + "epoch": 0.8120110192837465, + "grad_norm": 9.213482856750488, + "learning_rate": 8.636539046676418e-07, + "loss": 0.3727, + "step": 7369 + }, + { + "epoch": 0.8121212121212121, + "grad_norm": 7.8315653800964355, + "learning_rate": 8.626718753934055e-07, + "loss": 0.3821, + "step": 7370 + }, + { + "epoch": 0.8122314049586777, + "grad_norm": 7.829998970031738, + "learning_rate": 8.616903520372721e-07, + "loss": 0.3698, + "step": 7371 + }, + { + "epoch": 0.8123415977961432, + "grad_norm": 8.769179344177246, + "learning_rate": 8.607093347192652e-07, + "loss": 0.3621, + "step": 7372 + }, + { + "epoch": 0.8124517906336088, + "grad_norm": 8.211814880371094, + "learning_rate": 8.597288235593426e-07, + "loss": 0.5154, + "step": 7373 + }, + { + "epoch": 0.8125619834710743, + "grad_norm": 7.744908332824707, + "learning_rate": 8.587488186774029e-07, + "loss": 0.4354, + "step": 7374 + }, + { + "epoch": 0.8126721763085399, + "grad_norm": 5.910334587097168, + "learning_rate": 8.577693201932846e-07, + "loss": 0.4104, + "step": 7375 + }, + { + "epoch": 0.8127823691460055, + "grad_norm": 6.860529899597168, + "learning_rate": 8.567903282267593e-07, + "loss": 0.3656, + "step": 7376 + }, + { + "epoch": 0.812892561983471, + "grad_norm": 5.221512317657471, + "learning_rate": 8.558118428975404e-07, + "loss": 0.3476, + "step": 7377 + }, + { + "epoch": 0.8130027548209366, + "grad_norm": 6.213238716125488, + "learning_rate": 8.548338643252796e-07, + "loss": 0.3832, + "step": 7378 + }, + { + "epoch": 0.8131129476584023, + "grad_norm": 4.635070323944092, + "learning_rate": 8.53856392629564e-07, + "loss": 0.3872, + "step": 7379 + }, + { + "epoch": 0.8132231404958677, + "grad_norm": 6.566573143005371, + "learning_rate": 
8.528794279299201e-07, + "loss": 0.3391, + "step": 7380 + }, + { + "epoch": 0.8133333333333334, + "grad_norm": 5.124295234680176, + "learning_rate": 8.519029703458148e-07, + "loss": 0.3496, + "step": 7381 + }, + { + "epoch": 0.8134435261707988, + "grad_norm": 6.284134864807129, + "learning_rate": 8.509270199966474e-07, + "loss": 0.3601, + "step": 7382 + }, + { + "epoch": 0.8135537190082645, + "grad_norm": 4.887261390686035, + "learning_rate": 8.499515770017603e-07, + "loss": 0.3419, + "step": 7383 + }, + { + "epoch": 0.8136639118457301, + "grad_norm": 9.922983169555664, + "learning_rate": 8.489766414804323e-07, + "loss": 0.4235, + "step": 7384 + }, + { + "epoch": 0.8137741046831956, + "grad_norm": 5.538818359375, + "learning_rate": 8.480022135518784e-07, + "loss": 0.3737, + "step": 7385 + }, + { + "epoch": 0.8138842975206612, + "grad_norm": 4.689999580383301, + "learning_rate": 8.470282933352536e-07, + "loss": 0.3563, + "step": 7386 + }, + { + "epoch": 0.8139944903581268, + "grad_norm": 6.984789848327637, + "learning_rate": 8.460548809496516e-07, + "loss": 0.3945, + "step": 7387 + }, + { + "epoch": 0.8141046831955923, + "grad_norm": 6.579164505004883, + "learning_rate": 8.450819765141e-07, + "loss": 0.3862, + "step": 7388 + }, + { + "epoch": 0.8142148760330579, + "grad_norm": 7.291015625, + "learning_rate": 8.441095801475685e-07, + "loss": 0.4248, + "step": 7389 + }, + { + "epoch": 0.8143250688705234, + "grad_norm": 7.309769630432129, + "learning_rate": 8.431376919689638e-07, + "loss": 0.3647, + "step": 7390 + }, + { + "epoch": 0.814435261707989, + "grad_norm": 7.58947229385376, + "learning_rate": 8.421663120971274e-07, + "loss": 0.435, + "step": 7391 + }, + { + "epoch": 0.8145454545454546, + "grad_norm": 5.637911796569824, + "learning_rate": 8.411954406508438e-07, + "loss": 0.2922, + "step": 7392 + }, + { + "epoch": 0.8146556473829201, + "grad_norm": 7.525227069854736, + "learning_rate": 8.402250777488291e-07, + "loss": 0.3679, + "step": 7393 + }, + { + "epoch": 0.8147658402203857, + "grad_norm": 17.968149185180664, + "learning_rate": 8.392552235097429e-07, + "loss": 0.3763, + "step": 7394 + }, + { + "epoch": 0.8148760330578513, + "grad_norm": 3.8819832801818848, + "learning_rate": 8.382858780521807e-07, + "loss": 0.4402, + "step": 7395 + }, + { + "epoch": 0.8149862258953168, + "grad_norm": 5.026881694793701, + "learning_rate": 8.373170414946735e-07, + "loss": 0.4128, + "step": 7396 + }, + { + "epoch": 0.8150964187327824, + "grad_norm": 5.453049659729004, + "learning_rate": 8.363487139556925e-07, + "loss": 0.4167, + "step": 7397 + }, + { + "epoch": 0.815206611570248, + "grad_norm": 11.708805084228516, + "learning_rate": 8.35380895553648e-07, + "loss": 0.4392, + "step": 7398 + }, + { + "epoch": 0.8153168044077135, + "grad_norm": 7.500401496887207, + "learning_rate": 8.344135864068837e-07, + "loss": 0.3301, + "step": 7399 + }, + { + "epoch": 0.8154269972451791, + "grad_norm": 6.323413372039795, + "learning_rate": 8.334467866336843e-07, + "loss": 0.4216, + "step": 7400 + }, + { + "epoch": 0.8155371900826446, + "grad_norm": 8.722508430480957, + "learning_rate": 8.324804963522726e-07, + "loss": 0.4156, + "step": 7401 + }, + { + "epoch": 0.8156473829201102, + "grad_norm": 10.26689624786377, + "learning_rate": 8.315147156808057e-07, + "loss": 0.3739, + "step": 7402 + }, + { + "epoch": 0.8157575757575758, + "grad_norm": 4.217180252075195, + "learning_rate": 8.305494447373813e-07, + "loss": 0.3765, + "step": 7403 + }, + { + "epoch": 0.8158677685950413, + "grad_norm": 4.736955165863037, + 
"learning_rate": 8.295846836400362e-07, + "loss": 0.4242, + "step": 7404 + }, + { + "epoch": 0.8159779614325069, + "grad_norm": 7.2362799644470215, + "learning_rate": 8.286204325067393e-07, + "loss": 0.5049, + "step": 7405 + }, + { + "epoch": 0.8160881542699725, + "grad_norm": 8.120038032531738, + "learning_rate": 8.276566914554023e-07, + "loss": 0.3996, + "step": 7406 + }, + { + "epoch": 0.816198347107438, + "grad_norm": 7.081818103790283, + "learning_rate": 8.26693460603874e-07, + "loss": 0.3563, + "step": 7407 + }, + { + "epoch": 0.8163085399449036, + "grad_norm": 5.156095504760742, + "learning_rate": 8.257307400699372e-07, + "loss": 0.3981, + "step": 7408 + }, + { + "epoch": 0.8164187327823691, + "grad_norm": 13.11861515045166, + "learning_rate": 8.247685299713154e-07, + "loss": 0.3828, + "step": 7409 + }, + { + "epoch": 0.8165289256198347, + "grad_norm": 13.410127639770508, + "learning_rate": 8.238068304256707e-07, + "loss": 0.4772, + "step": 7410 + }, + { + "epoch": 0.8166391184573003, + "grad_norm": 5.385002613067627, + "learning_rate": 8.22845641550598e-07, + "loss": 0.4295, + "step": 7411 + }, + { + "epoch": 0.8167493112947658, + "grad_norm": 11.29184627532959, + "learning_rate": 8.218849634636345e-07, + "loss": 0.4429, + "step": 7412 + }, + { + "epoch": 0.8168595041322314, + "grad_norm": 11.415976524353027, + "learning_rate": 8.209247962822531e-07, + "loss": 0.4586, + "step": 7413 + }, + { + "epoch": 0.816969696969697, + "grad_norm": 5.033871650695801, + "learning_rate": 8.199651401238656e-07, + "loss": 0.4098, + "step": 7414 + }, + { + "epoch": 0.8170798898071625, + "grad_norm": 7.05573034286499, + "learning_rate": 8.190059951058177e-07, + "loss": 0.4453, + "step": 7415 + }, + { + "epoch": 0.8171900826446281, + "grad_norm": 7.5459160804748535, + "learning_rate": 8.18047361345396e-07, + "loss": 0.4227, + "step": 7416 + }, + { + "epoch": 0.8173002754820937, + "grad_norm": 11.135947227478027, + "learning_rate": 8.170892389598245e-07, + "loss": 0.4486, + "step": 7417 + }, + { + "epoch": 0.8174104683195592, + "grad_norm": 9.077187538146973, + "learning_rate": 8.161316280662629e-07, + "loss": 0.4363, + "step": 7418 + }, + { + "epoch": 0.8175206611570248, + "grad_norm": 8.63121223449707, + "learning_rate": 8.151745287818069e-07, + "loss": 0.4338, + "step": 7419 + }, + { + "epoch": 0.8176308539944903, + "grad_norm": 5.781494140625, + "learning_rate": 8.142179412234963e-07, + "loss": 0.3854, + "step": 7420 + }, + { + "epoch": 0.8177410468319559, + "grad_norm": 6.21343994140625, + "learning_rate": 8.132618655083014e-07, + "loss": 0.4058, + "step": 7421 + }, + { + "epoch": 0.8178512396694215, + "grad_norm": 6.685483455657959, + "learning_rate": 8.123063017531308e-07, + "loss": 0.3797, + "step": 7422 + }, + { + "epoch": 0.817961432506887, + "grad_norm": 6.716452598571777, + "learning_rate": 8.113512500748361e-07, + "loss": 0.4155, + "step": 7423 + }, + { + "epoch": 0.8180716253443526, + "grad_norm": 10.566768646240234, + "learning_rate": 8.103967105902e-07, + "loss": 0.4461, + "step": 7424 + }, + { + "epoch": 0.8181818181818182, + "grad_norm": 6.36700963973999, + "learning_rate": 8.094426834159447e-07, + "loss": 0.4079, + "step": 7425 + }, + { + "epoch": 0.8182920110192837, + "grad_norm": 6.516602516174316, + "learning_rate": 8.084891686687296e-07, + "loss": 0.4678, + "step": 7426 + }, + { + "epoch": 0.8184022038567493, + "grad_norm": 7.448019981384277, + "learning_rate": 8.075361664651532e-07, + "loss": 0.3733, + "step": 7427 + }, + { + "epoch": 0.8185123966942148, + "grad_norm": 
6.607315540313721, + "learning_rate": 8.065836769217499e-07, + "loss": 0.4473, + "step": 7428 + }, + { + "epoch": 0.8186225895316804, + "grad_norm": 8.278419494628906, + "learning_rate": 8.056317001549902e-07, + "loss": 0.3997, + "step": 7429 + }, + { + "epoch": 0.818732782369146, + "grad_norm": 4.643522262573242, + "learning_rate": 8.046802362812833e-07, + "loss": 0.4545, + "step": 7430 + }, + { + "epoch": 0.8188429752066115, + "grad_norm": 4.523182392120361, + "learning_rate": 8.03729285416977e-07, + "loss": 0.3383, + "step": 7431 + }, + { + "epoch": 0.8189531680440771, + "grad_norm": 9.416790962219238, + "learning_rate": 8.027788476783527e-07, + "loss": 0.4083, + "step": 7432 + }, + { + "epoch": 0.8190633608815427, + "grad_norm": 4.700539588928223, + "learning_rate": 8.018289231816323e-07, + "loss": 0.3711, + "step": 7433 + }, + { + "epoch": 0.8191735537190082, + "grad_norm": 5.3888750076293945, + "learning_rate": 8.008795120429752e-07, + "loss": 0.3789, + "step": 7434 + }, + { + "epoch": 0.8192837465564738, + "grad_norm": 4.854864120483398, + "learning_rate": 7.999306143784741e-07, + "loss": 0.4277, + "step": 7435 + }, + { + "epoch": 0.8193939393939393, + "grad_norm": 5.642134666442871, + "learning_rate": 7.989822303041622e-07, + "loss": 0.4036, + "step": 7436 + }, + { + "epoch": 0.819504132231405, + "grad_norm": 6.250295639038086, + "learning_rate": 7.980343599360113e-07, + "loss": 0.3542, + "step": 7437 + }, + { + "epoch": 0.8196143250688706, + "grad_norm": 9.971394538879395, + "learning_rate": 7.970870033899253e-07, + "loss": 0.4408, + "step": 7438 + }, + { + "epoch": 0.819724517906336, + "grad_norm": 5.384455680847168, + "learning_rate": 7.961401607817499e-07, + "loss": 0.4375, + "step": 7439 + }, + { + "epoch": 0.8198347107438017, + "grad_norm": 5.423167705535889, + "learning_rate": 7.951938322272673e-07, + "loss": 0.3207, + "step": 7440 + }, + { + "epoch": 0.8199449035812673, + "grad_norm": 8.072731018066406, + "learning_rate": 7.94248017842193e-07, + "loss": 0.4668, + "step": 7441 + }, + { + "epoch": 0.8200550964187328, + "grad_norm": 4.293138027191162, + "learning_rate": 7.933027177421842e-07, + "loss": 0.4011, + "step": 7442 + }, + { + "epoch": 0.8201652892561984, + "grad_norm": 10.118760108947754, + "learning_rate": 7.923579320428342e-07, + "loss": 0.4398, + "step": 7443 + }, + { + "epoch": 0.820275482093664, + "grad_norm": 6.136846542358398, + "learning_rate": 7.914136608596712e-07, + "loss": 0.4212, + "step": 7444 + }, + { + "epoch": 0.8203856749311295, + "grad_norm": 5.816466808319092, + "learning_rate": 7.904699043081621e-07, + "loss": 0.3057, + "step": 7445 + }, + { + "epoch": 0.8204958677685951, + "grad_norm": 4.639345169067383, + "learning_rate": 7.895266625037124e-07, + "loss": 0.3567, + "step": 7446 + }, + { + "epoch": 0.8206060606060606, + "grad_norm": 7.182546138763428, + "learning_rate": 7.885839355616609e-07, + "loss": 0.3723, + "step": 7447 + }, + { + "epoch": 0.8207162534435262, + "grad_norm": 4.012908935546875, + "learning_rate": 7.876417235972861e-07, + "loss": 0.3462, + "step": 7448 + }, + { + "epoch": 0.8208264462809918, + "grad_norm": 5.623358249664307, + "learning_rate": 7.867000267258045e-07, + "loss": 0.4108, + "step": 7449 + }, + { + "epoch": 0.8209366391184573, + "grad_norm": 4.929952144622803, + "learning_rate": 7.857588450623654e-07, + "loss": 0.3243, + "step": 7450 + }, + { + "epoch": 0.8210468319559229, + "grad_norm": 8.155301094055176, + "learning_rate": 7.84818178722061e-07, + "loss": 0.3344, + "step": 7451 + }, + { + "epoch": 
0.8211570247933885, + "grad_norm": 6.135559558868408, + "learning_rate": 7.838780278199137e-07, + "loss": 0.3263, + "step": 7452 + }, + { + "epoch": 0.821267217630854, + "grad_norm": 7.614686965942383, + "learning_rate": 7.829383924708889e-07, + "loss": 0.3218, + "step": 7453 + }, + { + "epoch": 0.8213774104683196, + "grad_norm": 7.434074401855469, + "learning_rate": 7.819992727898862e-07, + "loss": 0.403, + "step": 7454 + }, + { + "epoch": 0.8214876033057851, + "grad_norm": 9.409198760986328, + "learning_rate": 7.8106066889174e-07, + "loss": 0.3881, + "step": 7455 + }, + { + "epoch": 0.8215977961432507, + "grad_norm": 4.896434307098389, + "learning_rate": 7.801225808912288e-07, + "loss": 0.402, + "step": 7456 + }, + { + "epoch": 0.8217079889807163, + "grad_norm": 6.69950008392334, + "learning_rate": 7.791850089030601e-07, + "loss": 0.3975, + "step": 7457 + }, + { + "epoch": 0.8218181818181818, + "grad_norm": 6.383553981781006, + "learning_rate": 7.782479530418807e-07, + "loss": 0.3964, + "step": 7458 + }, + { + "epoch": 0.8219283746556474, + "grad_norm": 6.325447082519531, + "learning_rate": 7.773114134222765e-07, + "loss": 0.3689, + "step": 7459 + }, + { + "epoch": 0.822038567493113, + "grad_norm": 7.699170112609863, + "learning_rate": 7.763753901587695e-07, + "loss": 0.3063, + "step": 7460 + }, + { + "epoch": 0.8221487603305785, + "grad_norm": 6.865559101104736, + "learning_rate": 7.754398833658161e-07, + "loss": 0.3655, + "step": 7461 + }, + { + "epoch": 0.8222589531680441, + "grad_norm": 5.2361369132995605, + "learning_rate": 7.745048931578125e-07, + "loss": 0.3859, + "step": 7462 + }, + { + "epoch": 0.8223691460055096, + "grad_norm": 9.958680152893066, + "learning_rate": 7.735704196490911e-07, + "loss": 0.4336, + "step": 7463 + }, + { + "epoch": 0.8224793388429752, + "grad_norm": 5.9230570793151855, + "learning_rate": 7.72636462953919e-07, + "loss": 0.3634, + "step": 7464 + }, + { + "epoch": 0.8225895316804408, + "grad_norm": 9.550772666931152, + "learning_rate": 7.71703023186502e-07, + "loss": 0.4396, + "step": 7465 + }, + { + "epoch": 0.8226997245179063, + "grad_norm": 5.828458309173584, + "learning_rate": 7.707701004609846e-07, + "loss": 0.3809, + "step": 7466 + }, + { + "epoch": 0.8228099173553719, + "grad_norm": 6.5030059814453125, + "learning_rate": 7.698376948914426e-07, + "loss": 0.37, + "step": 7467 + }, + { + "epoch": 0.8229201101928375, + "grad_norm": 5.650920867919922, + "learning_rate": 7.689058065918937e-07, + "loss": 0.3444, + "step": 7468 + }, + { + "epoch": 0.823030303030303, + "grad_norm": 4.907332897186279, + "learning_rate": 7.679744356762897e-07, + "loss": 0.3816, + "step": 7469 + }, + { + "epoch": 0.8231404958677686, + "grad_norm": 11.659539222717285, + "learning_rate": 7.67043582258522e-07, + "loss": 0.4082, + "step": 7470 + }, + { + "epoch": 0.8232506887052342, + "grad_norm": 5.233342170715332, + "learning_rate": 7.661132464524135e-07, + "loss": 0.4045, + "step": 7471 + }, + { + "epoch": 0.8233608815426997, + "grad_norm": 5.3335161209106445, + "learning_rate": 7.651834283717286e-07, + "loss": 0.3777, + "step": 7472 + }, + { + "epoch": 0.8234710743801653, + "grad_norm": 6.997351169586182, + "learning_rate": 7.642541281301674e-07, + "loss": 0.3797, + "step": 7473 + }, + { + "epoch": 0.8235812672176308, + "grad_norm": 6.346808910369873, + "learning_rate": 7.633253458413653e-07, + "loss": 0.39, + "step": 7474 + }, + { + "epoch": 0.8236914600550964, + "grad_norm": 6.893392562866211, + "learning_rate": 7.623970816188925e-07, + "loss": 0.3167, + "step": 7475 + }, + { 
+ "epoch": 0.823801652892562, + "grad_norm": 6.692793369293213, + "learning_rate": 7.614693355762632e-07, + "loss": 0.404, + "step": 7476 + }, + { + "epoch": 0.8239118457300275, + "grad_norm": 8.373234748840332, + "learning_rate": 7.605421078269209e-07, + "loss": 0.4487, + "step": 7477 + }, + { + "epoch": 0.8240220385674931, + "grad_norm": 16.04227066040039, + "learning_rate": 7.596153984842464e-07, + "loss": 0.5009, + "step": 7478 + }, + { + "epoch": 0.8241322314049587, + "grad_norm": 8.250227928161621, + "learning_rate": 7.586892076615632e-07, + "loss": 0.4597, + "step": 7479 + }, + { + "epoch": 0.8242424242424242, + "grad_norm": 6.846471309661865, + "learning_rate": 7.577635354721247e-07, + "loss": 0.386, + "step": 7480 + }, + { + "epoch": 0.8243526170798898, + "grad_norm": 5.904123783111572, + "learning_rate": 7.568383820291214e-07, + "loss": 0.4399, + "step": 7481 + }, + { + "epoch": 0.8244628099173553, + "grad_norm": 7.321972846984863, + "learning_rate": 7.559137474456868e-07, + "loss": 0.4062, + "step": 7482 + }, + { + "epoch": 0.8245730027548209, + "grad_norm": 4.483993053436279, + "learning_rate": 7.549896318348826e-07, + "loss": 0.3509, + "step": 7483 + }, + { + "epoch": 0.8246831955922865, + "grad_norm": 8.56092643737793, + "learning_rate": 7.540660353097146e-07, + "loss": 0.3823, + "step": 7484 + }, + { + "epoch": 0.824793388429752, + "grad_norm": 5.048705577850342, + "learning_rate": 7.531429579831173e-07, + "loss": 0.4145, + "step": 7485 + }, + { + "epoch": 0.8249035812672176, + "grad_norm": 7.269402503967285, + "learning_rate": 7.522203999679684e-07, + "loss": 0.4043, + "step": 7486 + }, + { + "epoch": 0.8250137741046832, + "grad_norm": 6.417334079742432, + "learning_rate": 7.512983613770797e-07, + "loss": 0.4195, + "step": 7487 + }, + { + "epoch": 0.8251239669421487, + "grad_norm": 5.412443161010742, + "learning_rate": 7.503768423231983e-07, + "loss": 0.3367, + "step": 7488 + }, + { + "epoch": 0.8252341597796143, + "grad_norm": 6.628684997558594, + "learning_rate": 7.494558429190085e-07, + "loss": 0.3736, + "step": 7489 + }, + { + "epoch": 0.8253443526170798, + "grad_norm": 7.438686370849609, + "learning_rate": 7.485353632771336e-07, + "loss": 0.3753, + "step": 7490 + }, + { + "epoch": 0.8254545454545454, + "grad_norm": 5.179122447967529, + "learning_rate": 7.476154035101279e-07, + "loss": 0.3946, + "step": 7491 + }, + { + "epoch": 0.825564738292011, + "grad_norm": 6.319139003753662, + "learning_rate": 7.466959637304871e-07, + "loss": 0.4079, + "step": 7492 + }, + { + "epoch": 0.8256749311294765, + "grad_norm": 6.583497047424316, + "learning_rate": 7.457770440506429e-07, + "loss": 0.3306, + "step": 7493 + }, + { + "epoch": 0.8257851239669421, + "grad_norm": 4.10272741317749, + "learning_rate": 7.448586445829592e-07, + "loss": 0.3673, + "step": 7494 + }, + { + "epoch": 0.8258953168044078, + "grad_norm": 8.826009750366211, + "learning_rate": 7.439407654397402e-07, + "loss": 0.3622, + "step": 7495 + }, + { + "epoch": 0.8260055096418732, + "grad_norm": 6.662688255310059, + "learning_rate": 7.43023406733227e-07, + "loss": 0.4372, + "step": 7496 + }, + { + "epoch": 0.8261157024793389, + "grad_norm": 16.362524032592773, + "learning_rate": 7.421065685755935e-07, + "loss": 0.3762, + "step": 7497 + }, + { + "epoch": 0.8262258953168045, + "grad_norm": 7.434336185455322, + "learning_rate": 7.41190251078952e-07, + "loss": 0.4018, + "step": 7498 + }, + { + "epoch": 0.82633608815427, + "grad_norm": 6.320937156677246, + "learning_rate": 7.402744543553531e-07, + "loss": 0.3997, + "step": 
7499 + }, + { + "epoch": 0.8264462809917356, + "grad_norm": 5.576345443725586, + "learning_rate": 7.393591785167786e-07, + "loss": 0.3733, + "step": 7500 + }, + { + "epoch": 0.826556473829201, + "grad_norm": 15.954998970031738, + "learning_rate": 7.384444236751514e-07, + "loss": 0.5084, + "step": 7501 + }, + { + "epoch": 0.8266666666666667, + "grad_norm": 7.5200886726379395, + "learning_rate": 7.375301899423304e-07, + "loss": 0.4141, + "step": 7502 + }, + { + "epoch": 0.8267768595041323, + "grad_norm": 7.538036346435547, + "learning_rate": 7.36616477430106e-07, + "loss": 0.3081, + "step": 7503 + }, + { + "epoch": 0.8268870523415978, + "grad_norm": 9.011994361877441, + "learning_rate": 7.357032862502106e-07, + "loss": 0.4073, + "step": 7504 + }, + { + "epoch": 0.8269972451790634, + "grad_norm": 6.264695167541504, + "learning_rate": 7.34790616514311e-07, + "loss": 0.4056, + "step": 7505 + }, + { + "epoch": 0.827107438016529, + "grad_norm": 6.546352863311768, + "learning_rate": 7.338784683340067e-07, + "loss": 0.3476, + "step": 7506 + }, + { + "epoch": 0.8272176308539945, + "grad_norm": 7.9584197998046875, + "learning_rate": 7.329668418208386e-07, + "loss": 0.383, + "step": 7507 + }, + { + "epoch": 0.8273278236914601, + "grad_norm": 6.149013996124268, + "learning_rate": 7.320557370862824e-07, + "loss": 0.4527, + "step": 7508 + }, + { + "epoch": 0.8274380165289256, + "grad_norm": 7.031678199768066, + "learning_rate": 7.311451542417469e-07, + "loss": 0.3523, + "step": 7509 + }, + { + "epoch": 0.8275482093663912, + "grad_norm": 5.1706624031066895, + "learning_rate": 7.302350933985819e-07, + "loss": 0.4124, + "step": 7510 + }, + { + "epoch": 0.8276584022038568, + "grad_norm": 5.587656497955322, + "learning_rate": 7.293255546680678e-07, + "loss": 0.4128, + "step": 7511 + }, + { + "epoch": 0.8277685950413223, + "grad_norm": 6.654507637023926, + "learning_rate": 7.284165381614278e-07, + "loss": 0.4302, + "step": 7512 + }, + { + "epoch": 0.8278787878787879, + "grad_norm": 11.286688804626465, + "learning_rate": 7.275080439898158e-07, + "loss": 0.402, + "step": 7513 + }, + { + "epoch": 0.8279889807162535, + "grad_norm": 7.159271717071533, + "learning_rate": 7.266000722643213e-07, + "loss": 0.4005, + "step": 7514 + }, + { + "epoch": 0.828099173553719, + "grad_norm": 6.855350017547607, + "learning_rate": 7.256926230959776e-07, + "loss": 0.3725, + "step": 7515 + }, + { + "epoch": 0.8282093663911846, + "grad_norm": 7.471316814422607, + "learning_rate": 7.247856965957456e-07, + "loss": 0.4115, + "step": 7516 + }, + { + "epoch": 0.8283195592286502, + "grad_norm": 9.73602294921875, + "learning_rate": 7.238792928745247e-07, + "loss": 0.4839, + "step": 7517 + }, + { + "epoch": 0.8284297520661157, + "grad_norm": 5.6931471824646, + "learning_rate": 7.229734120431531e-07, + "loss": 0.3588, + "step": 7518 + }, + { + "epoch": 0.8285399449035813, + "grad_norm": 5.699717998504639, + "learning_rate": 7.220680542124031e-07, + "loss": 0.3021, + "step": 7519 + }, + { + "epoch": 0.8286501377410468, + "grad_norm": 8.98326587677002, + "learning_rate": 7.211632194929813e-07, + "loss": 0.3568, + "step": 7520 + }, + { + "epoch": 0.8287603305785124, + "grad_norm": 5.933770179748535, + "learning_rate": 7.20258907995533e-07, + "loss": 0.4108, + "step": 7521 + }, + { + "epoch": 0.828870523415978, + "grad_norm": 5.496316432952881, + "learning_rate": 7.193551198306408e-07, + "loss": 0.3757, + "step": 7522 + }, + { + "epoch": 0.8289807162534435, + "grad_norm": 9.473560333251953, + "learning_rate": 7.184518551088176e-07, + "loss": 
0.4775, + "step": 7523 + }, + { + "epoch": 0.8290909090909091, + "grad_norm": 6.112762451171875, + "learning_rate": 7.175491139405172e-07, + "loss": 0.3568, + "step": 7524 + }, + { + "epoch": 0.8292011019283747, + "grad_norm": 9.144979476928711, + "learning_rate": 7.166468964361289e-07, + "loss": 0.3841, + "step": 7525 + }, + { + "epoch": 0.8293112947658402, + "grad_norm": 6.507180690765381, + "learning_rate": 7.157452027059769e-07, + "loss": 0.3955, + "step": 7526 + }, + { + "epoch": 0.8294214876033058, + "grad_norm": 9.648252487182617, + "learning_rate": 7.148440328603206e-07, + "loss": 0.3757, + "step": 7527 + }, + { + "epoch": 0.8295316804407713, + "grad_norm": 5.539919376373291, + "learning_rate": 7.139433870093565e-07, + "loss": 0.414, + "step": 7528 + }, + { + "epoch": 0.8296418732782369, + "grad_norm": 6.198495864868164, + "learning_rate": 7.130432652632179e-07, + "loss": 0.3635, + "step": 7529 + }, + { + "epoch": 0.8297520661157025, + "grad_norm": 10.395768165588379, + "learning_rate": 7.121436677319715e-07, + "loss": 0.4508, + "step": 7530 + }, + { + "epoch": 0.829862258953168, + "grad_norm": 6.952027797698975, + "learning_rate": 7.112445945256219e-07, + "loss": 0.4318, + "step": 7531 + }, + { + "epoch": 0.8299724517906336, + "grad_norm": 6.353927135467529, + "learning_rate": 7.1034604575411e-07, + "loss": 0.4063, + "step": 7532 + }, + { + "epoch": 0.8300826446280992, + "grad_norm": 7.425856590270996, + "learning_rate": 7.094480215273103e-07, + "loss": 0.4616, + "step": 7533 + }, + { + "epoch": 0.8301928374655647, + "grad_norm": 6.667521953582764, + "learning_rate": 7.085505219550326e-07, + "loss": 0.323, + "step": 7534 + }, + { + "epoch": 0.8303030303030303, + "grad_norm": 9.246989250183105, + "learning_rate": 7.076535471470286e-07, + "loss": 0.3977, + "step": 7535 + }, + { + "epoch": 0.8304132231404958, + "grad_norm": 7.972851753234863, + "learning_rate": 7.067570972129795e-07, + "loss": 0.4488, + "step": 7536 + }, + { + "epoch": 0.8305234159779614, + "grad_norm": 6.29607629776001, + "learning_rate": 7.058611722625019e-07, + "loss": 0.3144, + "step": 7537 + }, + { + "epoch": 0.830633608815427, + "grad_norm": 8.219892501831055, + "learning_rate": 7.049657724051556e-07, + "loss": 0.4169, + "step": 7538 + }, + { + "epoch": 0.8307438016528925, + "grad_norm": 5.537420749664307, + "learning_rate": 7.040708977504279e-07, + "loss": 0.3664, + "step": 7539 + }, + { + "epoch": 0.8308539944903581, + "grad_norm": 6.648195743560791, + "learning_rate": 7.031765484077463e-07, + "loss": 0.4035, + "step": 7540 + }, + { + "epoch": 0.8309641873278237, + "grad_norm": 9.343284606933594, + "learning_rate": 7.022827244864738e-07, + "loss": 0.443, + "step": 7541 + }, + { + "epoch": 0.8310743801652892, + "grad_norm": 4.213461875915527, + "learning_rate": 7.013894260959064e-07, + "loss": 0.3329, + "step": 7542 + }, + { + "epoch": 0.8311845730027548, + "grad_norm": 8.926543235778809, + "learning_rate": 7.004966533452806e-07, + "loss": 0.4323, + "step": 7543 + }, + { + "epoch": 0.8312947658402204, + "grad_norm": 7.958025932312012, + "learning_rate": 6.99604406343763e-07, + "loss": 0.4226, + "step": 7544 + }, + { + "epoch": 0.8314049586776859, + "grad_norm": 8.788220405578613, + "learning_rate": 6.987126852004606e-07, + "loss": 0.3323, + "step": 7545 + }, + { + "epoch": 0.8315151515151515, + "grad_norm": 5.215331554412842, + "learning_rate": 6.97821490024414e-07, + "loss": 0.3899, + "step": 7546 + }, + { + "epoch": 0.831625344352617, + "grad_norm": 10.267518043518066, + "learning_rate": 
6.96930820924599e-07, + "loss": 0.4865, + "step": 7547 + }, + { + "epoch": 0.8317355371900826, + "grad_norm": 5.965144157409668, + "learning_rate": 6.960406780099282e-07, + "loss": 0.3882, + "step": 7548 + }, + { + "epoch": 0.8318457300275482, + "grad_norm": 12.689521789550781, + "learning_rate": 6.951510613892509e-07, + "loss": 0.3665, + "step": 7549 + }, + { + "epoch": 0.8319559228650137, + "grad_norm": 7.58388090133667, + "learning_rate": 6.942619711713483e-07, + "loss": 0.4159, + "step": 7550 + }, + { + "epoch": 0.8320661157024793, + "grad_norm": 9.022119522094727, + "learning_rate": 6.933734074649406e-07, + "loss": 0.481, + "step": 7551 + }, + { + "epoch": 0.832176308539945, + "grad_norm": 7.578518390655518, + "learning_rate": 6.924853703786838e-07, + "loss": 0.3811, + "step": 7552 + }, + { + "epoch": 0.8322865013774104, + "grad_norm": 4.457272052764893, + "learning_rate": 6.915978600211654e-07, + "loss": 0.3322, + "step": 7553 + }, + { + "epoch": 0.832396694214876, + "grad_norm": 7.8011064529418945, + "learning_rate": 6.907108765009136e-07, + "loss": 0.4161, + "step": 7554 + }, + { + "epoch": 0.8325068870523415, + "grad_norm": 6.120491981506348, + "learning_rate": 6.898244199263904e-07, + "loss": 0.3595, + "step": 7555 + }, + { + "epoch": 0.8326170798898072, + "grad_norm": 9.96679401397705, + "learning_rate": 6.889384904059909e-07, + "loss": 0.4497, + "step": 7556 + }, + { + "epoch": 0.8327272727272728, + "grad_norm": 6.320555686950684, + "learning_rate": 6.88053088048049e-07, + "loss": 0.3224, + "step": 7557 + }, + { + "epoch": 0.8328374655647383, + "grad_norm": 6.006103515625, + "learning_rate": 6.871682129608332e-07, + "loss": 0.3396, + "step": 7558 + }, + { + "epoch": 0.8329476584022039, + "grad_norm": 6.8017683029174805, + "learning_rate": 6.862838652525461e-07, + "loss": 0.357, + "step": 7559 + }, + { + "epoch": 0.8330578512396695, + "grad_norm": 19.439645767211914, + "learning_rate": 6.854000450313275e-07, + "loss": 0.4138, + "step": 7560 + }, + { + "epoch": 0.833168044077135, + "grad_norm": 10.255168914794922, + "learning_rate": 6.845167524052531e-07, + "loss": 0.4535, + "step": 7561 + }, + { + "epoch": 0.8332782369146006, + "grad_norm": 5.926719665527344, + "learning_rate": 6.836339874823311e-07, + "loss": 0.342, + "step": 7562 + }, + { + "epoch": 0.8333884297520661, + "grad_norm": 9.165303230285645, + "learning_rate": 6.827517503705089e-07, + "loss": 0.404, + "step": 7563 + }, + { + "epoch": 0.8334986225895317, + "grad_norm": 4.787303924560547, + "learning_rate": 6.818700411776674e-07, + "loss": 0.3747, + "step": 7564 + }, + { + "epoch": 0.8336088154269973, + "grad_norm": 9.279829025268555, + "learning_rate": 6.809888600116221e-07, + "loss": 0.3478, + "step": 7565 + }, + { + "epoch": 0.8337190082644628, + "grad_norm": 9.262317657470703, + "learning_rate": 6.801082069801268e-07, + "loss": 0.4606, + "step": 7566 + }, + { + "epoch": 0.8338292011019284, + "grad_norm": 6.749006271362305, + "learning_rate": 6.792280821908659e-07, + "loss": 0.3707, + "step": 7567 + }, + { + "epoch": 0.833939393939394, + "grad_norm": 7.549883842468262, + "learning_rate": 6.783484857514666e-07, + "loss": 0.4643, + "step": 7568 + }, + { + "epoch": 0.8340495867768595, + "grad_norm": 5.3465399742126465, + "learning_rate": 6.774694177694846e-07, + "loss": 0.3999, + "step": 7569 + }, + { + "epoch": 0.8341597796143251, + "grad_norm": 3.93556547164917, + "learning_rate": 6.765908783524116e-07, + "loss": 0.4035, + "step": 7570 + }, + { + "epoch": 0.8342699724517907, + "grad_norm": 5.638111114501953, + 
"learning_rate": 6.757128676076813e-07, + "loss": 0.3956, + "step": 7571 + }, + { + "epoch": 0.8343801652892562, + "grad_norm": 12.791842460632324, + "learning_rate": 6.748353856426553e-07, + "loss": 0.4195, + "step": 7572 + }, + { + "epoch": 0.8344903581267218, + "grad_norm": 7.686941623687744, + "learning_rate": 6.739584325646314e-07, + "loss": 0.4666, + "step": 7573 + }, + { + "epoch": 0.8346005509641873, + "grad_norm": 4.8445611000061035, + "learning_rate": 6.730820084808487e-07, + "loss": 0.408, + "step": 7574 + }, + { + "epoch": 0.8347107438016529, + "grad_norm": 12.405566215515137, + "learning_rate": 6.722061134984759e-07, + "loss": 0.4039, + "step": 7575 + }, + { + "epoch": 0.8348209366391185, + "grad_norm": 9.464533805847168, + "learning_rate": 6.713307477246168e-07, + "loss": 0.2975, + "step": 7576 + }, + { + "epoch": 0.834931129476584, + "grad_norm": 6.935895919799805, + "learning_rate": 6.70455911266314e-07, + "loss": 0.3776, + "step": 7577 + }, + { + "epoch": 0.8350413223140496, + "grad_norm": 6.494793891906738, + "learning_rate": 6.695816042305441e-07, + "loss": 0.3186, + "step": 7578 + }, + { + "epoch": 0.8351515151515152, + "grad_norm": 7.261117935180664, + "learning_rate": 6.687078267242176e-07, + "loss": 0.3739, + "step": 7579 + }, + { + "epoch": 0.8352617079889807, + "grad_norm": 5.465384006500244, + "learning_rate": 6.678345788541807e-07, + "loss": 0.3734, + "step": 7580 + }, + { + "epoch": 0.8353719008264463, + "grad_norm": 8.419529914855957, + "learning_rate": 6.669618607272166e-07, + "loss": 0.4178, + "step": 7581 + }, + { + "epoch": 0.8354820936639118, + "grad_norm": 6.401289463043213, + "learning_rate": 6.660896724500432e-07, + "loss": 0.3982, + "step": 7582 + }, + { + "epoch": 0.8355922865013774, + "grad_norm": 6.065435886383057, + "learning_rate": 6.652180141293107e-07, + "loss": 0.4255, + "step": 7583 + }, + { + "epoch": 0.835702479338843, + "grad_norm": 5.099431037902832, + "learning_rate": 6.643468858716074e-07, + "loss": 0.4164, + "step": 7584 + }, + { + "epoch": 0.8358126721763085, + "grad_norm": 7.421755790710449, + "learning_rate": 6.634762877834578e-07, + "loss": 0.4291, + "step": 7585 + }, + { + "epoch": 0.8359228650137741, + "grad_norm": 4.8282318115234375, + "learning_rate": 6.626062199713168e-07, + "loss": 0.3984, + "step": 7586 + }, + { + "epoch": 0.8360330578512397, + "grad_norm": 7.492334365844727, + "learning_rate": 6.617366825415788e-07, + "loss": 0.4239, + "step": 7587 + }, + { + "epoch": 0.8361432506887052, + "grad_norm": 6.620091438293457, + "learning_rate": 6.608676756005738e-07, + "loss": 0.2626, + "step": 7588 + }, + { + "epoch": 0.8362534435261708, + "grad_norm": 13.550325393676758, + "learning_rate": 6.599991992545624e-07, + "loss": 0.5648, + "step": 7589 + }, + { + "epoch": 0.8363636363636363, + "grad_norm": 6.486863136291504, + "learning_rate": 6.591312536097438e-07, + "loss": 0.403, + "step": 7590 + }, + { + "epoch": 0.8364738292011019, + "grad_norm": 7.239338397979736, + "learning_rate": 6.582638387722534e-07, + "loss": 0.4628, + "step": 7591 + }, + { + "epoch": 0.8365840220385675, + "grad_norm": 6.446137428283691, + "learning_rate": 6.573969548481585e-07, + "loss": 0.3246, + "step": 7592 + }, + { + "epoch": 0.836694214876033, + "grad_norm": 8.6100435256958, + "learning_rate": 6.565306019434603e-07, + "loss": 0.419, + "step": 7593 + }, + { + "epoch": 0.8368044077134986, + "grad_norm": 18.886690139770508, + "learning_rate": 6.556647801641031e-07, + "loss": 0.4786, + "step": 7594 + }, + { + "epoch": 0.8369146005509642, + "grad_norm": 
3.9173879623413086, + "learning_rate": 6.547994896159559e-07, + "loss": 0.3588, + "step": 7595 + }, + { + "epoch": 0.8370247933884297, + "grad_norm": 6.038689613342285, + "learning_rate": 6.5393473040483e-07, + "loss": 0.3917, + "step": 7596 + }, + { + "epoch": 0.8371349862258953, + "grad_norm": 8.612103462219238, + "learning_rate": 6.530705026364692e-07, + "loss": 0.3718, + "step": 7597 + }, + { + "epoch": 0.8372451790633609, + "grad_norm": 5.947027683258057, + "learning_rate": 6.522068064165515e-07, + "loss": 0.4106, + "step": 7598 + }, + { + "epoch": 0.8373553719008264, + "grad_norm": 8.922974586486816, + "learning_rate": 6.513436418506925e-07, + "loss": 0.3402, + "step": 7599 + }, + { + "epoch": 0.837465564738292, + "grad_norm": 13.584858894348145, + "learning_rate": 6.504810090444392e-07, + "loss": 0.4067, + "step": 7600 + }, + { + "epoch": 0.8375757575757575, + "grad_norm": 8.478438377380371, + "learning_rate": 6.496189081032755e-07, + "loss": 0.4709, + "step": 7601 + }, + { + "epoch": 0.8376859504132231, + "grad_norm": 6.562189102172852, + "learning_rate": 6.48757339132623e-07, + "loss": 0.3826, + "step": 7602 + }, + { + "epoch": 0.8377961432506887, + "grad_norm": 5.741882801055908, + "learning_rate": 6.478963022378327e-07, + "loss": 0.4329, + "step": 7603 + }, + { + "epoch": 0.8379063360881542, + "grad_norm": 6.273601055145264, + "learning_rate": 6.470357975241937e-07, + "loss": 0.3931, + "step": 7604 + }, + { + "epoch": 0.8380165289256198, + "grad_norm": 5.691573143005371, + "learning_rate": 6.461758250969313e-07, + "loss": 0.3401, + "step": 7605 + }, + { + "epoch": 0.8381267217630854, + "grad_norm": 5.817183971405029, + "learning_rate": 6.453163850612026e-07, + "loss": 0.3951, + "step": 7606 + }, + { + "epoch": 0.8382369146005509, + "grad_norm": 6.3230156898498535, + "learning_rate": 6.444574775221013e-07, + "loss": 0.3436, + "step": 7607 + }, + { + "epoch": 0.8383471074380165, + "grad_norm": 4.935028076171875, + "learning_rate": 6.435991025846572e-07, + "loss": 0.4212, + "step": 7608 + }, + { + "epoch": 0.838457300275482, + "grad_norm": 10.081740379333496, + "learning_rate": 6.427412603538314e-07, + "loss": 0.4168, + "step": 7609 + }, + { + "epoch": 0.8385674931129476, + "grad_norm": 5.62360954284668, + "learning_rate": 6.418839509345231e-07, + "loss": 0.4195, + "step": 7610 + }, + { + "epoch": 0.8386776859504133, + "grad_norm": 6.882262229919434, + "learning_rate": 6.41027174431566e-07, + "loss": 0.386, + "step": 7611 + }, + { + "epoch": 0.8387878787878787, + "grad_norm": 6.527801036834717, + "learning_rate": 6.401709309497262e-07, + "loss": 0.4452, + "step": 7612 + }, + { + "epoch": 0.8388980716253444, + "grad_norm": 15.232413291931152, + "learning_rate": 6.39315220593707e-07, + "loss": 0.5078, + "step": 7613 + }, + { + "epoch": 0.83900826446281, + "grad_norm": 7.460157871246338, + "learning_rate": 6.384600434681476e-07, + "loss": 0.4059, + "step": 7614 + }, + { + "epoch": 0.8391184573002755, + "grad_norm": 11.996729850769043, + "learning_rate": 6.376053996776172e-07, + "loss": 0.4352, + "step": 7615 + }, + { + "epoch": 0.8392286501377411, + "grad_norm": 5.507315635681152, + "learning_rate": 6.367512893266243e-07, + "loss": 0.391, + "step": 7616 + }, + { + "epoch": 0.8393388429752067, + "grad_norm": 7.622223377227783, + "learning_rate": 6.358977125196114e-07, + "loss": 0.4086, + "step": 7617 + }, + { + "epoch": 0.8394490358126722, + "grad_norm": 5.924062252044678, + "learning_rate": 6.350446693609536e-07, + "loss": 0.4142, + "step": 7618 + }, + { + "epoch": 0.8395592286501378, 
+ "grad_norm": 4.628000259399414, + "learning_rate": 6.341921599549628e-07, + "loss": 0.3287, + "step": 7619 + }, + { + "epoch": 0.8396694214876033, + "grad_norm": 6.6283369064331055, + "learning_rate": 6.333401844058862e-07, + "loss": 0.3357, + "step": 7620 + }, + { + "epoch": 0.8397796143250689, + "grad_norm": 6.074338912963867, + "learning_rate": 6.324887428179022e-07, + "loss": 0.3197, + "step": 7621 + }, + { + "epoch": 0.8398898071625345, + "grad_norm": 5.312293529510498, + "learning_rate": 6.316378352951275e-07, + "loss": 0.3066, + "step": 7622 + }, + { + "epoch": 0.84, + "grad_norm": 4.839400291442871, + "learning_rate": 6.307874619416116e-07, + "loss": 0.4367, + "step": 7623 + }, + { + "epoch": 0.8401101928374656, + "grad_norm": 6.882898330688477, + "learning_rate": 6.299376228613413e-07, + "loss": 0.3652, + "step": 7624 + }, + { + "epoch": 0.8402203856749312, + "grad_norm": 5.810752868652344, + "learning_rate": 6.290883181582347e-07, + "loss": 0.4103, + "step": 7625 + }, + { + "epoch": 0.8403305785123967, + "grad_norm": 8.484546661376953, + "learning_rate": 6.282395479361442e-07, + "loss": 0.427, + "step": 7626 + }, + { + "epoch": 0.8404407713498623, + "grad_norm": 7.992834091186523, + "learning_rate": 6.273913122988618e-07, + "loss": 0.3979, + "step": 7627 + }, + { + "epoch": 0.8405509641873278, + "grad_norm": 4.217305660247803, + "learning_rate": 6.265436113501094e-07, + "loss": 0.3736, + "step": 7628 + }, + { + "epoch": 0.8406611570247934, + "grad_norm": 5.164590358734131, + "learning_rate": 6.256964451935427e-07, + "loss": 0.3923, + "step": 7629 + }, + { + "epoch": 0.840771349862259, + "grad_norm": 9.06873607635498, + "learning_rate": 6.248498139327586e-07, + "loss": 0.4647, + "step": 7630 + }, + { + "epoch": 0.8408815426997245, + "grad_norm": 6.276998996734619, + "learning_rate": 6.240037176712826e-07, + "loss": 0.4112, + "step": 7631 + }, + { + "epoch": 0.8409917355371901, + "grad_norm": 6.069005012512207, + "learning_rate": 6.23158156512575e-07, + "loss": 0.3844, + "step": 7632 + }, + { + "epoch": 0.8411019283746557, + "grad_norm": 9.613306999206543, + "learning_rate": 6.223131305600339e-07, + "loss": 0.369, + "step": 7633 + }, + { + "epoch": 0.8412121212121212, + "grad_norm": 5.842223167419434, + "learning_rate": 6.2146863991699e-07, + "loss": 0.4157, + "step": 7634 + }, + { + "epoch": 0.8413223140495868, + "grad_norm": 4.464301586151123, + "learning_rate": 6.206246846867081e-07, + "loss": 0.4021, + "step": 7635 + }, + { + "epoch": 0.8414325068870523, + "grad_norm": 11.78282356262207, + "learning_rate": 6.197812649723878e-07, + "loss": 0.3999, + "step": 7636 + }, + { + "epoch": 0.8415426997245179, + "grad_norm": 6.033527374267578, + "learning_rate": 6.189383808771649e-07, + "loss": 0.3789, + "step": 7637 + }, + { + "epoch": 0.8416528925619835, + "grad_norm": 5.16497278213501, + "learning_rate": 6.180960325041085e-07, + "loss": 0.4042, + "step": 7638 + }, + { + "epoch": 0.841763085399449, + "grad_norm": 11.83187484741211, + "learning_rate": 6.17254219956221e-07, + "loss": 0.4211, + "step": 7639 + }, + { + "epoch": 0.8418732782369146, + "grad_norm": 6.822471618652344, + "learning_rate": 6.164129433364407e-07, + "loss": 0.3986, + "step": 7640 + }, + { + "epoch": 0.8419834710743802, + "grad_norm": 8.842621803283691, + "learning_rate": 6.155722027476408e-07, + "loss": 0.4551, + "step": 7641 + }, + { + "epoch": 0.8420936639118457, + "grad_norm": 8.396306991577148, + "learning_rate": 6.14731998292627e-07, + "loss": 0.5187, + "step": 7642 + }, + { + "epoch": 0.8422038567493113, + 
"grad_norm": 6.149595260620117, + "learning_rate": 6.138923300741412e-07, + "loss": 0.3448, + "step": 7643 + }, + { + "epoch": 0.8423140495867769, + "grad_norm": 9.125263214111328, + "learning_rate": 6.130531981948601e-07, + "loss": 0.4397, + "step": 7644 + }, + { + "epoch": 0.8424242424242424, + "grad_norm": 5.493200778961182, + "learning_rate": 6.122146027573922e-07, + "loss": 0.364, + "step": 7645 + }, + { + "epoch": 0.842534435261708, + "grad_norm": 5.831855773925781, + "learning_rate": 6.113765438642827e-07, + "loss": 0.3772, + "step": 7646 + }, + { + "epoch": 0.8426446280991735, + "grad_norm": 6.851219177246094, + "learning_rate": 6.105390216180119e-07, + "loss": 0.2942, + "step": 7647 + }, + { + "epoch": 0.8427548209366391, + "grad_norm": 4.491634845733643, + "learning_rate": 6.09702036120991e-07, + "loss": 0.3646, + "step": 7648 + }, + { + "epoch": 0.8428650137741047, + "grad_norm": 6.463755130767822, + "learning_rate": 6.088655874755689e-07, + "loss": 0.4057, + "step": 7649 + }, + { + "epoch": 0.8429752066115702, + "grad_norm": 7.555401802062988, + "learning_rate": 6.080296757840282e-07, + "loss": 0.393, + "step": 7650 + }, + { + "epoch": 0.8430853994490358, + "grad_norm": 5.10014533996582, + "learning_rate": 6.071943011485837e-07, + "loss": 0.3135, + "step": 7651 + }, + { + "epoch": 0.8431955922865014, + "grad_norm": 6.354140758514404, + "learning_rate": 6.063594636713877e-07, + "loss": 0.4231, + "step": 7652 + }, + { + "epoch": 0.8433057851239669, + "grad_norm": 5.660528659820557, + "learning_rate": 6.05525163454525e-07, + "loss": 0.3039, + "step": 7653 + }, + { + "epoch": 0.8434159779614325, + "grad_norm": 6.830688953399658, + "learning_rate": 6.046914006000137e-07, + "loss": 0.4382, + "step": 7654 + }, + { + "epoch": 0.843526170798898, + "grad_norm": 6.061576843261719, + "learning_rate": 6.038581752098083e-07, + "loss": 0.3374, + "step": 7655 + }, + { + "epoch": 0.8436363636363636, + "grad_norm": 5.584253311157227, + "learning_rate": 6.030254873857982e-07, + "loss": 0.3632, + "step": 7656 + }, + { + "epoch": 0.8437465564738292, + "grad_norm": 9.06962776184082, + "learning_rate": 6.021933372298028e-07, + "loss": 0.492, + "step": 7657 + }, + { + "epoch": 0.8438567493112947, + "grad_norm": 8.123315811157227, + "learning_rate": 6.013617248435815e-07, + "loss": 0.4305, + "step": 7658 + }, + { + "epoch": 0.8439669421487603, + "grad_norm": 7.666255950927734, + "learning_rate": 6.005306503288222e-07, + "loss": 0.4583, + "step": 7659 + }, + { + "epoch": 0.8440771349862259, + "grad_norm": 11.121870994567871, + "learning_rate": 5.997001137871505e-07, + "loss": 0.5033, + "step": 7660 + }, + { + "epoch": 0.8441873278236914, + "grad_norm": 5.500019550323486, + "learning_rate": 5.988701153201276e-07, + "loss": 0.3998, + "step": 7661 + }, + { + "epoch": 0.844297520661157, + "grad_norm": 6.658538341522217, + "learning_rate": 5.980406550292445e-07, + "loss": 0.3212, + "step": 7662 + }, + { + "epoch": 0.8444077134986225, + "grad_norm": 4.301170349121094, + "learning_rate": 5.972117330159294e-07, + "loss": 0.3336, + "step": 7663 + }, + { + "epoch": 0.8445179063360881, + "grad_norm": 6.08860445022583, + "learning_rate": 5.963833493815452e-07, + "loss": 0.441, + "step": 7664 + }, + { + "epoch": 0.8446280991735537, + "grad_norm": 5.657111644744873, + "learning_rate": 5.955555042273858e-07, + "loss": 0.3926, + "step": 7665 + }, + { + "epoch": 0.8447382920110192, + "grad_norm": 10.227325439453125, + "learning_rate": 5.94728197654682e-07, + "loss": 0.4637, + "step": 7666 + }, + { + "epoch": 
0.8448484848484848, + "grad_norm": 6.199921131134033, + "learning_rate": 5.939014297645995e-07, + "loss": 0.3628, + "step": 7667 + }, + { + "epoch": 0.8449586776859505, + "grad_norm": 5.123288631439209, + "learning_rate": 5.930752006582341e-07, + "loss": 0.408, + "step": 7668 + }, + { + "epoch": 0.845068870523416, + "grad_norm": 8.856583595275879, + "learning_rate": 5.922495104366194e-07, + "loss": 0.4075, + "step": 7669 + }, + { + "epoch": 0.8451790633608816, + "grad_norm": 5.787606716156006, + "learning_rate": 5.914243592007229e-07, + "loss": 0.4321, + "step": 7670 + }, + { + "epoch": 0.8452892561983472, + "grad_norm": 10.660080909729004, + "learning_rate": 5.90599747051443e-07, + "loss": 0.4141, + "step": 7671 + }, + { + "epoch": 0.8453994490358127, + "grad_norm": 7.40358304977417, + "learning_rate": 5.89775674089616e-07, + "loss": 0.3404, + "step": 7672 + }, + { + "epoch": 0.8455096418732783, + "grad_norm": 6.4439005851745605, + "learning_rate": 5.889521404160109e-07, + "loss": 0.3808, + "step": 7673 + }, + { + "epoch": 0.8456198347107438, + "grad_norm": 7.696054458618164, + "learning_rate": 5.881291461313293e-07, + "loss": 0.3735, + "step": 7674 + }, + { + "epoch": 0.8457300275482094, + "grad_norm": 4.064366340637207, + "learning_rate": 5.87306691336208e-07, + "loss": 0.3548, + "step": 7675 + }, + { + "epoch": 0.845840220385675, + "grad_norm": 5.888460159301758, + "learning_rate": 5.864847761312204e-07, + "loss": 0.3777, + "step": 7676 + }, + { + "epoch": 0.8459504132231405, + "grad_norm": 7.175257682800293, + "learning_rate": 5.856634006168677e-07, + "loss": 0.4329, + "step": 7677 + }, + { + "epoch": 0.8460606060606061, + "grad_norm": 4.705997943878174, + "learning_rate": 5.848425648935913e-07, + "loss": 0.3432, + "step": 7678 + }, + { + "epoch": 0.8461707988980717, + "grad_norm": 5.8356242179870605, + "learning_rate": 5.84022269061763e-07, + "loss": 0.3829, + "step": 7679 + }, + { + "epoch": 0.8462809917355372, + "grad_norm": 7.494500160217285, + "learning_rate": 5.832025132216917e-07, + "loss": 0.389, + "step": 7680 + }, + { + "epoch": 0.8463911845730028, + "grad_norm": 7.917182922363281, + "learning_rate": 5.823832974736154e-07, + "loss": 0.3599, + "step": 7681 + }, + { + "epoch": 0.8465013774104683, + "grad_norm": 4.986794471740723, + "learning_rate": 5.815646219177102e-07, + "loss": 0.3944, + "step": 7682 + }, + { + "epoch": 0.8466115702479339, + "grad_norm": 6.412942409515381, + "learning_rate": 5.807464866540857e-07, + "loss": 0.3049, + "step": 7683 + }, + { + "epoch": 0.8467217630853995, + "grad_norm": 7.922013282775879, + "learning_rate": 5.799288917827838e-07, + "loss": 0.4079, + "step": 7684 + }, + { + "epoch": 0.846831955922865, + "grad_norm": 8.984867095947266, + "learning_rate": 5.791118374037796e-07, + "loss": 0.4134, + "step": 7685 + }, + { + "epoch": 0.8469421487603306, + "grad_norm": 4.974648952484131, + "learning_rate": 5.78295323616987e-07, + "loss": 0.4181, + "step": 7686 + }, + { + "epoch": 0.8470523415977962, + "grad_norm": 9.432533264160156, + "learning_rate": 5.774793505222481e-07, + "loss": 0.4887, + "step": 7687 + }, + { + "epoch": 0.8471625344352617, + "grad_norm": 8.944128036499023, + "learning_rate": 5.766639182193395e-07, + "loss": 0.4455, + "step": 7688 + }, + { + "epoch": 0.8472727272727273, + "grad_norm": 6.501738548278809, + "learning_rate": 5.758490268079781e-07, + "loss": 0.3665, + "step": 7689 + }, + { + "epoch": 0.8473829201101928, + "grad_norm": 4.6713385581970215, + "learning_rate": 5.750346763878073e-07, + "loss": 0.3188, + "step": 7690 + }, 
+ { + "epoch": 0.8474931129476584, + "grad_norm": 3.850214958190918, + "learning_rate": 5.742208670584054e-07, + "loss": 0.3162, + "step": 7691 + }, + { + "epoch": 0.847603305785124, + "grad_norm": 10.417648315429688, + "learning_rate": 5.734075989192884e-07, + "loss": 0.4206, + "step": 7692 + }, + { + "epoch": 0.8477134986225895, + "grad_norm": 6.745625972747803, + "learning_rate": 5.725948720699026e-07, + "loss": 0.4001, + "step": 7693 + }, + { + "epoch": 0.8478236914600551, + "grad_norm": 6.785290241241455, + "learning_rate": 5.71782686609631e-07, + "loss": 0.3319, + "step": 7694 + }, + { + "epoch": 0.8479338842975207, + "grad_norm": 7.850511074066162, + "learning_rate": 5.709710426377868e-07, + "loss": 0.4087, + "step": 7695 + }, + { + "epoch": 0.8480440771349862, + "grad_norm": 8.589800834655762, + "learning_rate": 5.701599402536196e-07, + "loss": 0.3473, + "step": 7696 + }, + { + "epoch": 0.8481542699724518, + "grad_norm": 8.957038879394531, + "learning_rate": 5.693493795563132e-07, + "loss": 0.4608, + "step": 7697 + }, + { + "epoch": 0.8482644628099174, + "grad_norm": 12.42385482788086, + "learning_rate": 5.685393606449824e-07, + "loss": 0.5595, + "step": 7698 + }, + { + "epoch": 0.8483746556473829, + "grad_norm": 6.1786298751831055, + "learning_rate": 5.677298836186779e-07, + "loss": 0.3342, + "step": 7699 + }, + { + "epoch": 0.8484848484848485, + "grad_norm": 7.499631881713867, + "learning_rate": 5.66920948576385e-07, + "loss": 0.3894, + "step": 7700 + }, + { + "epoch": 0.848595041322314, + "grad_norm": 5.315148830413818, + "learning_rate": 5.661125556170188e-07, + "loss": 0.37, + "step": 7701 + }, + { + "epoch": 0.8487052341597796, + "grad_norm": 7.056180477142334, + "learning_rate": 5.653047048394328e-07, + "loss": 0.3748, + "step": 7702 + }, + { + "epoch": 0.8488154269972452, + "grad_norm": 6.130964279174805, + "learning_rate": 5.644973963424122e-07, + "loss": 0.3858, + "step": 7703 + }, + { + "epoch": 0.8489256198347107, + "grad_norm": 9.253560066223145, + "learning_rate": 5.636906302246736e-07, + "loss": 0.3755, + "step": 7704 + }, + { + "epoch": 0.8490358126721763, + "grad_norm": 6.5167927742004395, + "learning_rate": 5.628844065848715e-07, + "loss": 0.4288, + "step": 7705 + }, + { + "epoch": 0.8491460055096419, + "grad_norm": 5.824051856994629, + "learning_rate": 5.620787255215921e-07, + "loss": 0.3397, + "step": 7706 + }, + { + "epoch": 0.8492561983471074, + "grad_norm": 4.285006523132324, + "learning_rate": 5.612735871333535e-07, + "loss": 0.3016, + "step": 7707 + }, + { + "epoch": 0.849366391184573, + "grad_norm": 9.04549789428711, + "learning_rate": 5.604689915186101e-07, + "loss": 0.4501, + "step": 7708 + }, + { + "epoch": 0.8494765840220385, + "grad_norm": 6.156450271606445, + "learning_rate": 5.596649387757502e-07, + "loss": 0.4291, + "step": 7709 + }, + { + "epoch": 0.8495867768595041, + "grad_norm": 5.915395259857178, + "learning_rate": 5.588614290030919e-07, + "loss": 0.3578, + "step": 7710 + }, + { + "epoch": 0.8496969696969697, + "grad_norm": 8.087713241577148, + "learning_rate": 5.580584622988905e-07, + "loss": 0.4435, + "step": 7711 + }, + { + "epoch": 0.8498071625344352, + "grad_norm": 8.039299011230469, + "learning_rate": 5.572560387613352e-07, + "loss": 0.3567, + "step": 7712 + }, + { + "epoch": 0.8499173553719008, + "grad_norm": 7.934005260467529, + "learning_rate": 5.564541584885458e-07, + "loss": 0.3357, + "step": 7713 + }, + { + "epoch": 0.8500275482093664, + "grad_norm": 4.658009052276611, + "learning_rate": 5.556528215785778e-07, + "loss": 0.3597, + 
"step": 7714 + }, + { + "epoch": 0.8501377410468319, + "grad_norm": 4.118621826171875, + "learning_rate": 5.548520281294206e-07, + "loss": 0.3772, + "step": 7715 + }, + { + "epoch": 0.8502479338842975, + "grad_norm": 7.155452251434326, + "learning_rate": 5.540517782389943e-07, + "loss": 0.4349, + "step": 7716 + }, + { + "epoch": 0.850358126721763, + "grad_norm": 9.435413360595703, + "learning_rate": 5.532520720051571e-07, + "loss": 0.4621, + "step": 7717 + }, + { + "epoch": 0.8504683195592286, + "grad_norm": 5.139011859893799, + "learning_rate": 5.524529095256958e-07, + "loss": 0.4654, + "step": 7718 + }, + { + "epoch": 0.8505785123966942, + "grad_norm": 3.8253042697906494, + "learning_rate": 5.516542908983341e-07, + "loss": 0.3474, + "step": 7719 + }, + { + "epoch": 0.8506887052341597, + "grad_norm": 4.634027004241943, + "learning_rate": 5.508562162207293e-07, + "loss": 0.416, + "step": 7720 + }, + { + "epoch": 0.8507988980716253, + "grad_norm": 9.43326473236084, + "learning_rate": 5.500586855904677e-07, + "loss": 0.3912, + "step": 7721 + }, + { + "epoch": 0.850909090909091, + "grad_norm": 9.88778018951416, + "learning_rate": 5.49261699105077e-07, + "loss": 0.4034, + "step": 7722 + }, + { + "epoch": 0.8510192837465564, + "grad_norm": 7.362020969390869, + "learning_rate": 5.484652568620113e-07, + "loss": 0.3774, + "step": 7723 + }, + { + "epoch": 0.851129476584022, + "grad_norm": 6.79647159576416, + "learning_rate": 5.476693589586596e-07, + "loss": 0.392, + "step": 7724 + }, + { + "epoch": 0.8512396694214877, + "grad_norm": 10.27442455291748, + "learning_rate": 5.468740054923472e-07, + "loss": 0.4702, + "step": 7725 + }, + { + "epoch": 0.8513498622589531, + "grad_norm": 4.888890266418457, + "learning_rate": 5.460791965603307e-07, + "loss": 0.3662, + "step": 7726 + }, + { + "epoch": 0.8514600550964188, + "grad_norm": 5.989506721496582, + "learning_rate": 5.452849322597997e-07, + "loss": 0.3734, + "step": 7727 + }, + { + "epoch": 0.8515702479338843, + "grad_norm": 10.52830696105957, + "learning_rate": 5.444912126878776e-07, + "loss": 0.429, + "step": 7728 + }, + { + "epoch": 0.8516804407713499, + "grad_norm": 5.60654354095459, + "learning_rate": 5.43698037941624e-07, + "loss": 0.4441, + "step": 7729 + }, + { + "epoch": 0.8517906336088155, + "grad_norm": 5.992086410522461, + "learning_rate": 5.429054081180263e-07, + "loss": 0.4182, + "step": 7730 + }, + { + "epoch": 0.851900826446281, + "grad_norm": 12.801902770996094, + "learning_rate": 5.421133233140096e-07, + "loss": 0.5226, + "step": 7731 + }, + { + "epoch": 0.8520110192837466, + "grad_norm": 9.182729721069336, + "learning_rate": 5.413217836264317e-07, + "loss": 0.4883, + "step": 7732 + }, + { + "epoch": 0.8521212121212122, + "grad_norm": 14.285962104797363, + "learning_rate": 5.405307891520823e-07, + "loss": 0.3842, + "step": 7733 + }, + { + "epoch": 0.8522314049586777, + "grad_norm": 12.276472091674805, + "learning_rate": 5.397403399876855e-07, + "loss": 0.3837, + "step": 7734 + }, + { + "epoch": 0.8523415977961433, + "grad_norm": 5.377422332763672, + "learning_rate": 5.389504362298987e-07, + "loss": 0.3916, + "step": 7735 + }, + { + "epoch": 0.8524517906336088, + "grad_norm": 6.361353397369385, + "learning_rate": 5.381610779753127e-07, + "loss": 0.4305, + "step": 7736 + }, + { + "epoch": 0.8525619834710744, + "grad_norm": 9.844764709472656, + "learning_rate": 5.373722653204505e-07, + "loss": 0.3797, + "step": 7737 + }, + { + "epoch": 0.85267217630854, + "grad_norm": 6.1600117683410645, + "learning_rate": 5.365839983617693e-07, + 
"loss": 0.3293, + "step": 7738 + }, + { + "epoch": 0.8527823691460055, + "grad_norm": 6.187741279602051, + "learning_rate": 5.35796277195661e-07, + "loss": 0.4407, + "step": 7739 + }, + { + "epoch": 0.8528925619834711, + "grad_norm": 16.504772186279297, + "learning_rate": 5.350091019184467e-07, + "loss": 0.6067, + "step": 7740 + }, + { + "epoch": 0.8530027548209367, + "grad_norm": 9.01555061340332, + "learning_rate": 5.342224726263845e-07, + "loss": 0.4012, + "step": 7741 + }, + { + "epoch": 0.8531129476584022, + "grad_norm": 6.932723522186279, + "learning_rate": 5.334363894156658e-07, + "loss": 0.3988, + "step": 7742 + }, + { + "epoch": 0.8532231404958678, + "grad_norm": 8.647543907165527, + "learning_rate": 5.32650852382412e-07, + "loss": 0.4217, + "step": 7743 + }, + { + "epoch": 0.8533333333333334, + "grad_norm": 7.07531213760376, + "learning_rate": 5.318658616226791e-07, + "loss": 0.4471, + "step": 7744 + }, + { + "epoch": 0.8534435261707989, + "grad_norm": 6.896085739135742, + "learning_rate": 5.310814172324596e-07, + "loss": 0.3559, + "step": 7745 + }, + { + "epoch": 0.8535537190082645, + "grad_norm": 11.441871643066406, + "learning_rate": 5.302975193076748e-07, + "loss": 0.5203, + "step": 7746 + }, + { + "epoch": 0.85366391184573, + "grad_norm": 4.805006980895996, + "learning_rate": 5.295141679441784e-07, + "loss": 0.3604, + "step": 7747 + }, + { + "epoch": 0.8537741046831956, + "grad_norm": 4.587762832641602, + "learning_rate": 5.28731363237765e-07, + "loss": 0.3518, + "step": 7748 + }, + { + "epoch": 0.8538842975206612, + "grad_norm": 8.714046478271484, + "learning_rate": 5.279491052841523e-07, + "loss": 0.3746, + "step": 7749 + }, + { + "epoch": 0.8539944903581267, + "grad_norm": 8.611539840698242, + "learning_rate": 5.271673941789996e-07, + "loss": 0.3367, + "step": 7750 + }, + { + "epoch": 0.8541046831955923, + "grad_norm": 5.320642948150635, + "learning_rate": 5.263862300178917e-07, + "loss": 0.4176, + "step": 7751 + }, + { + "epoch": 0.8542148760330579, + "grad_norm": 7.6433539390563965, + "learning_rate": 5.256056128963533e-07, + "loss": 0.3874, + "step": 7752 + }, + { + "epoch": 0.8543250688705234, + "grad_norm": 5.494566440582275, + "learning_rate": 5.248255429098387e-07, + "loss": 0.3558, + "step": 7753 + }, + { + "epoch": 0.854435261707989, + "grad_norm": 8.333560943603516, + "learning_rate": 5.24046020153735e-07, + "loss": 0.4335, + "step": 7754 + }, + { + "epoch": 0.8545454545454545, + "grad_norm": 5.336777210235596, + "learning_rate": 5.232670447233639e-07, + "loss": 0.3522, + "step": 7755 + }, + { + "epoch": 0.8546556473829201, + "grad_norm": 4.345096111297607, + "learning_rate": 5.224886167139803e-07, + "loss": 0.3992, + "step": 7756 + }, + { + "epoch": 0.8547658402203857, + "grad_norm": 6.853146553039551, + "learning_rate": 5.217107362207701e-07, + "loss": 0.4191, + "step": 7757 + }, + { + "epoch": 0.8548760330578512, + "grad_norm": 5.39180850982666, + "learning_rate": 5.209334033388542e-07, + "loss": 0.4003, + "step": 7758 + }, + { + "epoch": 0.8549862258953168, + "grad_norm": 7.58671760559082, + "learning_rate": 5.201566181632872e-07, + "loss": 0.3832, + "step": 7759 + }, + { + "epoch": 0.8550964187327824, + "grad_norm": 5.2194976806640625, + "learning_rate": 5.193803807890529e-07, + "loss": 0.3714, + "step": 7760 + }, + { + "epoch": 0.8552066115702479, + "grad_norm": 9.42974853515625, + "learning_rate": 5.186046913110721e-07, + "loss": 0.3761, + "step": 7761 + }, + { + "epoch": 0.8553168044077135, + "grad_norm": 5.1489081382751465, + "learning_rate": 
5.178295498241976e-07, + "loss": 0.3207, + "step": 7762 + }, + { + "epoch": 0.855426997245179, + "grad_norm": 6.776419639587402, + "learning_rate": 5.170549564232135e-07, + "loss": 0.3667, + "step": 7763 + }, + { + "epoch": 0.8555371900826446, + "grad_norm": 13.21877384185791, + "learning_rate": 5.162809112028388e-07, + "loss": 0.4848, + "step": 7764 + }, + { + "epoch": 0.8556473829201102, + "grad_norm": 4.820184230804443, + "learning_rate": 5.15507414257726e-07, + "loss": 0.358, + "step": 7765 + }, + { + "epoch": 0.8557575757575757, + "grad_norm": 5.9227705001831055, + "learning_rate": 5.147344656824566e-07, + "loss": 0.4157, + "step": 7766 + }, + { + "epoch": 0.8558677685950413, + "grad_norm": 7.291881084442139, + "learning_rate": 5.139620655715499e-07, + "loss": 0.364, + "step": 7767 + }, + { + "epoch": 0.8559779614325069, + "grad_norm": 5.570192337036133, + "learning_rate": 5.131902140194561e-07, + "loss": 0.3772, + "step": 7768 + }, + { + "epoch": 0.8560881542699724, + "grad_norm": 6.814751148223877, + "learning_rate": 5.124189111205574e-07, + "loss": 0.4392, + "step": 7769 + }, + { + "epoch": 0.856198347107438, + "grad_norm": 17.650371551513672, + "learning_rate": 5.116481569691695e-07, + "loss": 0.54, + "step": 7770 + }, + { + "epoch": 0.8563085399449036, + "grad_norm": 4.790437698364258, + "learning_rate": 5.108779516595424e-07, + "loss": 0.375, + "step": 7771 + }, + { + "epoch": 0.8564187327823691, + "grad_norm": 7.293126106262207, + "learning_rate": 5.101082952858571e-07, + "loss": 0.4796, + "step": 7772 + }, + { + "epoch": 0.8565289256198347, + "grad_norm": 7.757967472076416, + "learning_rate": 5.093391879422277e-07, + "loss": 0.403, + "step": 7773 + }, + { + "epoch": 0.8566391184573002, + "grad_norm": 4.683879852294922, + "learning_rate": 5.085706297227039e-07, + "loss": 0.363, + "step": 7774 + }, + { + "epoch": 0.8567493112947658, + "grad_norm": 5.255395889282227, + "learning_rate": 5.078026207212633e-07, + "loss": 0.3157, + "step": 7775 + }, + { + "epoch": 0.8568595041322314, + "grad_norm": 6.1586384773254395, + "learning_rate": 5.070351610318208e-07, + "loss": 0.4242, + "step": 7776 + }, + { + "epoch": 0.8569696969696969, + "grad_norm": 8.594871520996094, + "learning_rate": 5.062682507482203e-07, + "loss": 0.4221, + "step": 7777 + }, + { + "epoch": 0.8570798898071625, + "grad_norm": 5.6082587242126465, + "learning_rate": 5.055018899642445e-07, + "loss": 0.3667, + "step": 7778 + }, + { + "epoch": 0.8571900826446281, + "grad_norm": 7.998793125152588, + "learning_rate": 5.047360787736027e-07, + "loss": 0.3374, + "step": 7779 + }, + { + "epoch": 0.8573002754820936, + "grad_norm": 6.855576038360596, + "learning_rate": 5.03970817269937e-07, + "loss": 0.4028, + "step": 7780 + }, + { + "epoch": 0.8574104683195592, + "grad_norm": 7.873417854309082, + "learning_rate": 5.03206105546829e-07, + "loss": 0.4002, + "step": 7781 + }, + { + "epoch": 0.8575206611570247, + "grad_norm": 6.745034217834473, + "learning_rate": 5.024419436977873e-07, + "loss": 0.3676, + "step": 7782 + }, + { + "epoch": 0.8576308539944903, + "grad_norm": 7.215935707092285, + "learning_rate": 5.016783318162527e-07, + "loss": 0.4505, + "step": 7783 + }, + { + "epoch": 0.857741046831956, + "grad_norm": 13.45441722869873, + "learning_rate": 5.009152699956022e-07, + "loss": 0.4353, + "step": 7784 + }, + { + "epoch": 0.8578512396694215, + "grad_norm": 11.69281005859375, + "learning_rate": 5.001527583291443e-07, + "loss": 0.4109, + "step": 7785 + }, + { + "epoch": 0.8579614325068871, + "grad_norm": 6.876806735992432, + 
"learning_rate": 4.993907969101191e-07, + "loss": 0.334, + "step": 7786 + }, + { + "epoch": 0.8580716253443527, + "grad_norm": 6.825343132019043, + "learning_rate": 4.98629385831701e-07, + "loss": 0.3764, + "step": 7787 + }, + { + "epoch": 0.8581818181818182, + "grad_norm": 6.715329647064209, + "learning_rate": 4.978685251869964e-07, + "loss": 0.362, + "step": 7788 + }, + { + "epoch": 0.8582920110192838, + "grad_norm": 5.902563095092773, + "learning_rate": 4.971082150690437e-07, + "loss": 0.4258, + "step": 7789 + }, + { + "epoch": 0.8584022038567493, + "grad_norm": 5.445431232452393, + "learning_rate": 4.963484555708148e-07, + "loss": 0.3971, + "step": 7790 + }, + { + "epoch": 0.8585123966942149, + "grad_norm": 9.014820098876953, + "learning_rate": 4.955892467852142e-07, + "loss": 0.4358, + "step": 7791 + }, + { + "epoch": 0.8586225895316805, + "grad_norm": 10.790130615234375, + "learning_rate": 4.948305888050803e-07, + "loss": 0.3901, + "step": 7792 + }, + { + "epoch": 0.858732782369146, + "grad_norm": 6.64311408996582, + "learning_rate": 4.940724817231807e-07, + "loss": 0.3852, + "step": 7793 + }, + { + "epoch": 0.8588429752066116, + "grad_norm": 6.426881313323975, + "learning_rate": 4.93314925632219e-07, + "loss": 0.4101, + "step": 7794 + }, + { + "epoch": 0.8589531680440772, + "grad_norm": 6.748328685760498, + "learning_rate": 4.925579206248305e-07, + "loss": 0.3288, + "step": 7795 + }, + { + "epoch": 0.8590633608815427, + "grad_norm": 5.245304584503174, + "learning_rate": 4.918014667935811e-07, + "loss": 0.3531, + "step": 7796 + }, + { + "epoch": 0.8591735537190083, + "grad_norm": 5.381857872009277, + "learning_rate": 4.910455642309725e-07, + "loss": 0.4286, + "step": 7797 + }, + { + "epoch": 0.8592837465564739, + "grad_norm": 9.686392784118652, + "learning_rate": 4.90290213029438e-07, + "loss": 0.4856, + "step": 7798 + }, + { + "epoch": 0.8593939393939394, + "grad_norm": 4.616172790527344, + "learning_rate": 4.895354132813418e-07, + "loss": 0.4519, + "step": 7799 + }, + { + "epoch": 0.859504132231405, + "grad_norm": 6.30689001083374, + "learning_rate": 4.887811650789809e-07, + "loss": 0.3234, + "step": 7800 + }, + { + "epoch": 0.8596143250688705, + "grad_norm": 7.747769832611084, + "learning_rate": 4.880274685145886e-07, + "loss": 0.39, + "step": 7801 + }, + { + "epoch": 0.8597245179063361, + "grad_norm": 5.953192710876465, + "learning_rate": 4.872743236803263e-07, + "loss": 0.3754, + "step": 7802 + }, + { + "epoch": 0.8598347107438017, + "grad_norm": 6.571876525878906, + "learning_rate": 4.865217306682874e-07, + "loss": 0.3665, + "step": 7803 + }, + { + "epoch": 0.8599449035812672, + "grad_norm": 6.302215099334717, + "learning_rate": 4.857696895705045e-07, + "loss": 0.3479, + "step": 7804 + }, + { + "epoch": 0.8600550964187328, + "grad_norm": 5.455636501312256, + "learning_rate": 4.85018200478935e-07, + "loss": 0.4147, + "step": 7805 + }, + { + "epoch": 0.8601652892561984, + "grad_norm": 7.550479888916016, + "learning_rate": 4.842672634854728e-07, + "loss": 0.4079, + "step": 7806 + }, + { + "epoch": 0.8602754820936639, + "grad_norm": 4.89521598815918, + "learning_rate": 4.835168786819445e-07, + "loss": 0.3678, + "step": 7807 + }, + { + "epoch": 0.8603856749311295, + "grad_norm": 8.000993728637695, + "learning_rate": 4.827670461601064e-07, + "loss": 0.4827, + "step": 7808 + }, + { + "epoch": 0.860495867768595, + "grad_norm": 4.615668296813965, + "learning_rate": 4.820177660116515e-07, + "loss": 0.386, + "step": 7809 + }, + { + "epoch": 0.8606060606060606, + "grad_norm": 
8.023161888122559, + "learning_rate": 4.812690383281998e-07, + "loss": 0.4274, + "step": 7810 + }, + { + "epoch": 0.8607162534435262, + "grad_norm": 6.460952281951904, + "learning_rate": 4.80520863201308e-07, + "loss": 0.362, + "step": 7811 + }, + { + "epoch": 0.8608264462809917, + "grad_norm": 4.9097676277160645, + "learning_rate": 4.797732407224654e-07, + "loss": 0.3567, + "step": 7812 + }, + { + "epoch": 0.8609366391184573, + "grad_norm": 9.771832466125488, + "learning_rate": 4.790261709830901e-07, + "loss": 0.3936, + "step": 7813 + }, + { + "epoch": 0.8610468319559229, + "grad_norm": 10.23644733428955, + "learning_rate": 4.782796540745354e-07, + "loss": 0.4066, + "step": 7814 + }, + { + "epoch": 0.8611570247933884, + "grad_norm": 7.400659561157227, + "learning_rate": 4.775336900880884e-07, + "loss": 0.4473, + "step": 7815 + }, + { + "epoch": 0.861267217630854, + "grad_norm": 5.771268367767334, + "learning_rate": 4.7678827911496304e-07, + "loss": 0.4521, + "step": 7816 + }, + { + "epoch": 0.8613774104683195, + "grad_norm": 4.279479026794434, + "learning_rate": 4.7604342124631166e-07, + "loss": 0.342, + "step": 7817 + }, + { + "epoch": 0.8614876033057851, + "grad_norm": 7.7825541496276855, + "learning_rate": 4.752991165732168e-07, + "loss": 0.4103, + "step": 7818 + }, + { + "epoch": 0.8615977961432507, + "grad_norm": 8.169404983520508, + "learning_rate": 4.745553651866913e-07, + "loss": 0.323, + "step": 7819 + }, + { + "epoch": 0.8617079889807162, + "grad_norm": 5.612525939941406, + "learning_rate": 4.7381216717768295e-07, + "loss": 0.331, + "step": 7820 + }, + { + "epoch": 0.8618181818181818, + "grad_norm": 4.310781955718994, + "learning_rate": 4.730695226370724e-07, + "loss": 0.2813, + "step": 7821 + }, + { + "epoch": 0.8619283746556474, + "grad_norm": 10.757055282592773, + "learning_rate": 4.723274316556681e-07, + "loss": 0.4415, + "step": 7822 + }, + { + "epoch": 0.8620385674931129, + "grad_norm": 5.158672332763672, + "learning_rate": 4.715858943242163e-07, + "loss": 0.3846, + "step": 7823 + }, + { + "epoch": 0.8621487603305785, + "grad_norm": 5.668391227722168, + "learning_rate": 4.708449107333929e-07, + "loss": 0.322, + "step": 7824 + }, + { + "epoch": 0.8622589531680441, + "grad_norm": 8.147780418395996, + "learning_rate": 4.701044809738059e-07, + "loss": 0.3339, + "step": 7825 + }, + { + "epoch": 0.8623691460055096, + "grad_norm": 5.757749080657959, + "learning_rate": 4.693646051359957e-07, + "loss": 0.3206, + "step": 7826 + }, + { + "epoch": 0.8624793388429752, + "grad_norm": 7.330891132354736, + "learning_rate": 4.6862528331043654e-07, + "loss": 0.4199, + "step": 7827 + }, + { + "epoch": 0.8625895316804407, + "grad_norm": 5.04267692565918, + "learning_rate": 4.6788651558753286e-07, + "loss": 0.3396, + "step": 7828 + }, + { + "epoch": 0.8626997245179063, + "grad_norm": 6.792194366455078, + "learning_rate": 4.671483020576217e-07, + "loss": 0.3301, + "step": 7829 + }, + { + "epoch": 0.8628099173553719, + "grad_norm": 8.746522903442383, + "learning_rate": 4.664106428109744e-07, + "loss": 0.3467, + "step": 7830 + }, + { + "epoch": 0.8629201101928374, + "grad_norm": 6.639345645904541, + "learning_rate": 4.6567353793779134e-07, + "loss": 0.3339, + "step": 7831 + }, + { + "epoch": 0.863030303030303, + "grad_norm": 5.252315521240234, + "learning_rate": 4.649369875282084e-07, + "loss": 0.4206, + "step": 7832 + }, + { + "epoch": 0.8631404958677686, + "grad_norm": 4.119190692901611, + "learning_rate": 4.642009916722884e-07, + "loss": 0.3193, + "step": 7833 + }, + { + "epoch": 
0.8632506887052341, + "grad_norm": 6.538342475891113, + "learning_rate": 4.6346555046003493e-07, + "loss": 0.411, + "step": 7834 + }, + { + "epoch": 0.8633608815426997, + "grad_norm": 7.538941860198975, + "learning_rate": 4.627306639813761e-07, + "loss": 0.3699, + "step": 7835 + }, + { + "epoch": 0.8634710743801652, + "grad_norm": 5.973537445068359, + "learning_rate": 4.619963323261728e-07, + "loss": 0.4105, + "step": 7836 + }, + { + "epoch": 0.8635812672176308, + "grad_norm": 6.2615437507629395, + "learning_rate": 4.612625555842243e-07, + "loss": 0.3151, + "step": 7837 + }, + { + "epoch": 0.8636914600550964, + "grad_norm": 8.059074401855469, + "learning_rate": 4.605293338452554e-07, + "loss": 0.458, + "step": 7838 + }, + { + "epoch": 0.8638016528925619, + "grad_norm": 5.59768533706665, + "learning_rate": 4.597966671989246e-07, + "loss": 0.3664, + "step": 7839 + }, + { + "epoch": 0.8639118457300275, + "grad_norm": 7.597044944763184, + "learning_rate": 4.590645557348261e-07, + "loss": 0.3609, + "step": 7840 + }, + { + "epoch": 0.8640220385674932, + "grad_norm": 6.077411651611328, + "learning_rate": 4.5833299954248233e-07, + "loss": 0.3579, + "step": 7841 + }, + { + "epoch": 0.8641322314049587, + "grad_norm": 5.302631855010986, + "learning_rate": 4.5760199871134723e-07, + "loss": 0.3454, + "step": 7842 + }, + { + "epoch": 0.8642424242424243, + "grad_norm": 10.610418319702148, + "learning_rate": 4.568715533308099e-07, + "loss": 0.4745, + "step": 7843 + }, + { + "epoch": 0.8643526170798899, + "grad_norm": 6.243701457977295, + "learning_rate": 4.5614166349019163e-07, + "loss": 0.422, + "step": 7844 + }, + { + "epoch": 0.8644628099173554, + "grad_norm": 13.345518112182617, + "learning_rate": 4.5541232927874155e-07, + "loss": 0.3868, + "step": 7845 + }, + { + "epoch": 0.864573002754821, + "grad_norm": 4.608929634094238, + "learning_rate": 4.546835507856456e-07, + "loss": 0.3389, + "step": 7846 + }, + { + "epoch": 0.8646831955922865, + "grad_norm": 6.194828033447266, + "learning_rate": 4.539553281000192e-07, + "loss": 0.4143, + "step": 7847 + }, + { + "epoch": 0.8647933884297521, + "grad_norm": 8.774277687072754, + "learning_rate": 4.53227661310911e-07, + "loss": 0.4575, + "step": 7848 + }, + { + "epoch": 0.8649035812672177, + "grad_norm": 6.736591815948486, + "learning_rate": 4.525005505072999e-07, + "loss": 0.351, + "step": 7849 + }, + { + "epoch": 0.8650137741046832, + "grad_norm": 7.460677623748779, + "learning_rate": 4.5177399577809867e-07, + "loss": 0.3125, + "step": 7850 + }, + { + "epoch": 0.8651239669421488, + "grad_norm": 5.065041542053223, + "learning_rate": 4.510479972121523e-07, + "loss": 0.39, + "step": 7851 + }, + { + "epoch": 0.8652341597796144, + "grad_norm": 9.561484336853027, + "learning_rate": 4.5032255489823484e-07, + "loss": 0.4045, + "step": 7852 + }, + { + "epoch": 0.8653443526170799, + "grad_norm": 6.36940336227417, + "learning_rate": 4.4959766892505587e-07, + "loss": 0.3412, + "step": 7853 + }, + { + "epoch": 0.8654545454545455, + "grad_norm": 4.763881206512451, + "learning_rate": 4.488733393812555e-07, + "loss": 0.4107, + "step": 7854 + }, + { + "epoch": 0.865564738292011, + "grad_norm": 7.719789028167725, + "learning_rate": 4.4814956635540477e-07, + "loss": 0.3691, + "step": 7855 + }, + { + "epoch": 0.8656749311294766, + "grad_norm": 7.469186305999756, + "learning_rate": 4.474263499360082e-07, + "loss": 0.4382, + "step": 7856 + }, + { + "epoch": 0.8657851239669422, + "grad_norm": 4.7013840675354, + "learning_rate": 4.4670369021150237e-07, + "loss": 0.4005, + "step": 
7857 + }, + { + "epoch": 0.8658953168044077, + "grad_norm": 7.680978775024414, + "learning_rate": 4.459815872702544e-07, + "loss": 0.3761, + "step": 7858 + }, + { + "epoch": 0.8660055096418733, + "grad_norm": 7.623222351074219, + "learning_rate": 4.45260041200562e-07, + "loss": 0.4246, + "step": 7859 + }, + { + "epoch": 0.8661157024793389, + "grad_norm": 7.7285895347595215, + "learning_rate": 4.445390520906606e-07, + "loss": 0.4048, + "step": 7860 + }, + { + "epoch": 0.8662258953168044, + "grad_norm": 4.648258686065674, + "learning_rate": 4.4381862002871144e-07, + "loss": 0.3657, + "step": 7861 + }, + { + "epoch": 0.86633608815427, + "grad_norm": 6.883289813995361, + "learning_rate": 4.4309874510280957e-07, + "loss": 0.4015, + "step": 7862 + }, + { + "epoch": 0.8664462809917355, + "grad_norm": 9.443902969360352, + "learning_rate": 4.423794274009846e-07, + "loss": 0.3514, + "step": 7863 + }, + { + "epoch": 0.8665564738292011, + "grad_norm": 7.527348518371582, + "learning_rate": 4.4166066701119336e-07, + "loss": 0.3888, + "step": 7864 + }, + { + "epoch": 0.8666666666666667, + "grad_norm": 7.373073101043701, + "learning_rate": 4.4094246402132836e-07, + "loss": 0.4161, + "step": 7865 + }, + { + "epoch": 0.8667768595041322, + "grad_norm": 4.945640563964844, + "learning_rate": 4.402248185192104e-07, + "loss": 0.399, + "step": 7866 + }, + { + "epoch": 0.8668870523415978, + "grad_norm": 6.406610012054443, + "learning_rate": 4.3950773059259597e-07, + "loss": 0.4355, + "step": 7867 + }, + { + "epoch": 0.8669972451790634, + "grad_norm": 5.226853370666504, + "learning_rate": 4.3879120032917224e-07, + "loss": 0.3961, + "step": 7868 + }, + { + "epoch": 0.8671074380165289, + "grad_norm": 8.117792129516602, + "learning_rate": 4.3807522781655454e-07, + "loss": 0.4156, + "step": 7869 + }, + { + "epoch": 0.8672176308539945, + "grad_norm": 16.762847900390625, + "learning_rate": 4.373598131422957e-07, + "loss": 0.511, + "step": 7870 + }, + { + "epoch": 0.8673278236914601, + "grad_norm": 16.404571533203125, + "learning_rate": 4.3664495639387683e-07, + "loss": 0.5058, + "step": 7871 + }, + { + "epoch": 0.8674380165289256, + "grad_norm": 10.145781517028809, + "learning_rate": 4.359306576587108e-07, + "loss": 0.5076, + "step": 7872 + }, + { + "epoch": 0.8675482093663912, + "grad_norm": 9.53577709197998, + "learning_rate": 4.352169170241438e-07, + "loss": 0.3837, + "step": 7873 + }, + { + "epoch": 0.8676584022038567, + "grad_norm": 5.142624378204346, + "learning_rate": 4.345037345774533e-07, + "loss": 0.3095, + "step": 7874 + }, + { + "epoch": 0.8677685950413223, + "grad_norm": 5.64030122756958, + "learning_rate": 4.3379111040584734e-07, + "loss": 0.433, + "step": 7875 + }, + { + "epoch": 0.8678787878787879, + "grad_norm": 3.860023260116577, + "learning_rate": 4.3307904459646666e-07, + "loss": 0.3126, + "step": 7876 + }, + { + "epoch": 0.8679889807162534, + "grad_norm": 5.460316181182861, + "learning_rate": 4.3236753723638446e-07, + "loss": 0.3417, + "step": 7877 + }, + { + "epoch": 0.868099173553719, + "grad_norm": 5.342752456665039, + "learning_rate": 4.3165658841260394e-07, + "loss": 0.3704, + "step": 7878 + }, + { + "epoch": 0.8682093663911846, + "grad_norm": 12.695240020751953, + "learning_rate": 4.3094619821206164e-07, + "loss": 0.4667, + "step": 7879 + }, + { + "epoch": 0.8683195592286501, + "grad_norm": 6.243031978607178, + "learning_rate": 4.302363667216253e-07, + "loss": 0.4304, + "step": 7880 + }, + { + "epoch": 0.8684297520661157, + "grad_norm": 8.409862518310547, + "learning_rate": 
4.295270940280921e-07, + "loss": 0.4571, + "step": 7881 + }, + { + "epoch": 0.8685399449035812, + "grad_norm": 8.675850868225098, + "learning_rate": 4.2881838021819447e-07, + "loss": 0.4461, + "step": 7882 + }, + { + "epoch": 0.8686501377410468, + "grad_norm": 7.134649276733398, + "learning_rate": 4.281102253785957e-07, + "loss": 0.3419, + "step": 7883 + }, + { + "epoch": 0.8687603305785124, + "grad_norm": 8.466950416564941, + "learning_rate": 4.2740262959588777e-07, + "loss": 0.4195, + "step": 7884 + }, + { + "epoch": 0.8688705234159779, + "grad_norm": 6.0013227462768555, + "learning_rate": 4.26695592956598e-07, + "loss": 0.3899, + "step": 7885 + }, + { + "epoch": 0.8689807162534435, + "grad_norm": 3.945721387863159, + "learning_rate": 4.259891155471835e-07, + "loss": 0.392, + "step": 7886 + }, + { + "epoch": 0.8690909090909091, + "grad_norm": 6.7099175453186035, + "learning_rate": 4.252831974540328e-07, + "loss": 0.3565, + "step": 7887 + }, + { + "epoch": 0.8692011019283746, + "grad_norm": 9.24994945526123, + "learning_rate": 4.245778387634669e-07, + "loss": 0.4214, + "step": 7888 + }, + { + "epoch": 0.8693112947658402, + "grad_norm": 8.128684997558594, + "learning_rate": 4.2387303956173744e-07, + "loss": 0.3915, + "step": 7889 + }, + { + "epoch": 0.8694214876033057, + "grad_norm": 6.961113452911377, + "learning_rate": 4.2316879993503033e-07, + "loss": 0.4557, + "step": 7890 + }, + { + "epoch": 0.8695316804407713, + "grad_norm": 7.837036609649658, + "learning_rate": 4.2246511996945904e-07, + "loss": 0.4634, + "step": 7891 + }, + { + "epoch": 0.8696418732782369, + "grad_norm": 5.132203102111816, + "learning_rate": 4.2176199975106913e-07, + "loss": 0.3386, + "step": 7892 + }, + { + "epoch": 0.8697520661157024, + "grad_norm": 7.128950595855713, + "learning_rate": 4.210594393658424e-07, + "loss": 0.4544, + "step": 7893 + }, + { + "epoch": 0.869862258953168, + "grad_norm": 8.016288757324219, + "learning_rate": 4.203574388996873e-07, + "loss": 0.3735, + "step": 7894 + }, + { + "epoch": 0.8699724517906336, + "grad_norm": 4.295894622802734, + "learning_rate": 4.196559984384441e-07, + "loss": 0.4244, + "step": 7895 + }, + { + "epoch": 0.8700826446280991, + "grad_norm": 6.6320719718933105, + "learning_rate": 4.189551180678886e-07, + "loss": 0.3294, + "step": 7896 + }, + { + "epoch": 0.8701928374655648, + "grad_norm": 7.069095134735107, + "learning_rate": 4.182547978737239e-07, + "loss": 0.4646, + "step": 7897 + }, + { + "epoch": 0.8703030303030304, + "grad_norm": 6.976655960083008, + "learning_rate": 4.1755503794158547e-07, + "loss": 0.4672, + "step": 7898 + }, + { + "epoch": 0.8704132231404959, + "grad_norm": 6.343747138977051, + "learning_rate": 4.16855838357042e-07, + "loss": 0.3327, + "step": 7899 + }, + { + "epoch": 0.8705234159779615, + "grad_norm": 8.259794235229492, + "learning_rate": 4.161571992055924e-07, + "loss": 0.3866, + "step": 7900 + }, + { + "epoch": 0.870633608815427, + "grad_norm": 7.544325351715088, + "learning_rate": 4.1545912057266656e-07, + "loss": 0.4435, + "step": 7901 + }, + { + "epoch": 0.8707438016528926, + "grad_norm": 5.710932731628418, + "learning_rate": 4.1476160254362683e-07, + "loss": 0.3952, + "step": 7902 + }, + { + "epoch": 0.8708539944903582, + "grad_norm": 6.750667572021484, + "learning_rate": 4.1406464520376664e-07, + "loss": 0.3688, + "step": 7903 + }, + { + "epoch": 0.8709641873278237, + "grad_norm": 5.663279056549072, + "learning_rate": 4.133682486383123e-07, + "loss": 0.3955, + "step": 7904 + }, + { + "epoch": 0.8710743801652893, + "grad_norm": 
4.945156574249268, + "learning_rate": 4.126724129324178e-07, + "loss": 0.4232, + "step": 7905 + }, + { + "epoch": 0.8711845730027549, + "grad_norm": 11.367006301879883, + "learning_rate": 4.119771381711718e-07, + "loss": 0.3749, + "step": 7906 + }, + { + "epoch": 0.8712947658402204, + "grad_norm": 8.525832176208496, + "learning_rate": 4.1128242443959466e-07, + "loss": 0.382, + "step": 7907 + }, + { + "epoch": 0.871404958677686, + "grad_norm": 5.626183032989502, + "learning_rate": 4.105882718226345e-07, + "loss": 0.392, + "step": 7908 + }, + { + "epoch": 0.8715151515151515, + "grad_norm": 7.996572971343994, + "learning_rate": 4.098946804051751e-07, + "loss": 0.429, + "step": 7909 + }, + { + "epoch": 0.8716253443526171, + "grad_norm": 6.027618885040283, + "learning_rate": 4.0920165027202975e-07, + "loss": 0.3688, + "step": 7910 + }, + { + "epoch": 0.8717355371900827, + "grad_norm": 4.738654136657715, + "learning_rate": 4.085091815079417e-07, + "loss": 0.3237, + "step": 7911 + }, + { + "epoch": 0.8718457300275482, + "grad_norm": 5.284733772277832, + "learning_rate": 4.0781727419758777e-07, + "loss": 0.379, + "step": 7912 + }, + { + "epoch": 0.8719559228650138, + "grad_norm": 7.230706214904785, + "learning_rate": 4.0712592842557685e-07, + "loss": 0.3235, + "step": 7913 + }, + { + "epoch": 0.8720661157024794, + "grad_norm": 8.302559852600098, + "learning_rate": 4.064351442764447e-07, + "loss": 0.393, + "step": 7914 + }, + { + "epoch": 0.8721763085399449, + "grad_norm": 10.283626556396484, + "learning_rate": 4.057449218346632e-07, + "loss": 0.4482, + "step": 7915 + }, + { + "epoch": 0.8722865013774105, + "grad_norm": 4.496822357177734, + "learning_rate": 4.0505526118463425e-07, + "loss": 0.375, + "step": 7916 + }, + { + "epoch": 0.872396694214876, + "grad_norm": 6.420034408569336, + "learning_rate": 4.043661624106887e-07, + "loss": 0.3895, + "step": 7917 + }, + { + "epoch": 0.8725068870523416, + "grad_norm": 11.06975269317627, + "learning_rate": 4.036776255970909e-07, + "loss": 0.4565, + "step": 7918 + }, + { + "epoch": 0.8726170798898072, + "grad_norm": 6.550425052642822, + "learning_rate": 4.0298965082803785e-07, + "loss": 0.4343, + "step": 7919 + }, + { + "epoch": 0.8727272727272727, + "grad_norm": 6.285310745239258, + "learning_rate": 4.02302238187654e-07, + "loss": 0.3874, + "step": 7920 + }, + { + "epoch": 0.8728374655647383, + "grad_norm": 4.321930408477783, + "learning_rate": 4.016153877599976e-07, + "loss": 0.394, + "step": 7921 + }, + { + "epoch": 0.8729476584022039, + "grad_norm": 5.822112083435059, + "learning_rate": 4.009290996290588e-07, + "loss": 0.3841, + "step": 7922 + }, + { + "epoch": 0.8730578512396694, + "grad_norm": 11.433794975280762, + "learning_rate": 4.002433738787559e-07, + "loss": 0.4247, + "step": 7923 + }, + { + "epoch": 0.873168044077135, + "grad_norm": 9.85711669921875, + "learning_rate": 3.995582105929424e-07, + "loss": 0.3715, + "step": 7924 + }, + { + "epoch": 0.8732782369146006, + "grad_norm": 4.688582897186279, + "learning_rate": 3.9887360985539913e-07, + "loss": 0.3918, + "step": 7925 + }, + { + "epoch": 0.8733884297520661, + "grad_norm": 5.9062957763671875, + "learning_rate": 3.9818957174984076e-07, + "loss": 0.439, + "step": 7926 + }, + { + "epoch": 0.8734986225895317, + "grad_norm": 7.6765570640563965, + "learning_rate": 3.9750609635991313e-07, + "loss": 0.3475, + "step": 7927 + }, + { + "epoch": 0.8736088154269972, + "grad_norm": 6.803735733032227, + "learning_rate": 3.968231837691916e-07, + "loss": 0.3725, + "step": 7928 + }, + { + "epoch": 
0.8737190082644628, + "grad_norm": 5.69181489944458, + "learning_rate": 3.9614083406118384e-07, + "loss": 0.3447, + "step": 7929 + }, + { + "epoch": 0.8738292011019284, + "grad_norm": 7.410181522369385, + "learning_rate": 3.9545904731932926e-07, + "loss": 0.4619, + "step": 7930 + }, + { + "epoch": 0.8739393939393939, + "grad_norm": 6.334578990936279, + "learning_rate": 3.947778236269961e-07, + "loss": 0.4537, + "step": 7931 + }, + { + "epoch": 0.8740495867768595, + "grad_norm": 11.030654907226562, + "learning_rate": 3.940971630674867e-07, + "loss": 0.3301, + "step": 7932 + }, + { + "epoch": 0.8741597796143251, + "grad_norm": 8.924234390258789, + "learning_rate": 3.9341706572403326e-07, + "loss": 0.4461, + "step": 7933 + }, + { + "epoch": 0.8742699724517906, + "grad_norm": 5.870466709136963, + "learning_rate": 3.927375316797971e-07, + "loss": 0.2781, + "step": 7934 + }, + { + "epoch": 0.8743801652892562, + "grad_norm": 8.122753143310547, + "learning_rate": 3.9205856101787457e-07, + "loss": 0.4622, + "step": 7935 + }, + { + "epoch": 0.8744903581267217, + "grad_norm": 9.385433197021484, + "learning_rate": 3.913801538212914e-07, + "loss": 0.4746, + "step": 7936 + }, + { + "epoch": 0.8746005509641873, + "grad_norm": 8.047636032104492, + "learning_rate": 3.907023101730023e-07, + "loss": 0.3346, + "step": 7937 + }, + { + "epoch": 0.8747107438016529, + "grad_norm": 5.601955890655518, + "learning_rate": 3.9002503015589554e-07, + "loss": 0.4383, + "step": 7938 + }, + { + "epoch": 0.8748209366391184, + "grad_norm": 6.140141010284424, + "learning_rate": 3.893483138527909e-07, + "loss": 0.4151, + "step": 7939 + }, + { + "epoch": 0.874931129476584, + "grad_norm": 5.186720848083496, + "learning_rate": 3.886721613464367e-07, + "loss": 0.3644, + "step": 7940 + }, + { + "epoch": 0.8750413223140496, + "grad_norm": 8.273353576660156, + "learning_rate": 3.879965727195145e-07, + "loss": 0.4391, + "step": 7941 + }, + { + "epoch": 0.8751515151515151, + "grad_norm": 6.724390029907227, + "learning_rate": 3.873215480546372e-07, + "loss": 0.3852, + "step": 7942 + }, + { + "epoch": 0.8752617079889807, + "grad_norm": 6.30220365524292, + "learning_rate": 3.8664708743434585e-07, + "loss": 0.3941, + "step": 7943 + }, + { + "epoch": 0.8753719008264463, + "grad_norm": 8.863051414489746, + "learning_rate": 3.8597319094111516e-07, + "loss": 0.384, + "step": 7944 + }, + { + "epoch": 0.8754820936639118, + "grad_norm": 7.784864902496338, + "learning_rate": 3.852998586573503e-07, + "loss": 0.3233, + "step": 7945 + }, + { + "epoch": 0.8755922865013774, + "grad_norm": 7.160646438598633, + "learning_rate": 3.8462709066538763e-07, + "loss": 0.3986, + "step": 7946 + }, + { + "epoch": 0.8757024793388429, + "grad_norm": 9.844958305358887, + "learning_rate": 3.839548870474935e-07, + "loss": 0.4987, + "step": 7947 + }, + { + "epoch": 0.8758126721763085, + "grad_norm": 4.915763854980469, + "learning_rate": 3.832832478858656e-07, + "loss": 0.4543, + "step": 7948 + }, + { + "epoch": 0.8759228650137741, + "grad_norm": 9.974266052246094, + "learning_rate": 3.826121732626342e-07, + "loss": 0.4292, + "step": 7949 + }, + { + "epoch": 0.8760330578512396, + "grad_norm": 5.884228229522705, + "learning_rate": 3.8194166325985826e-07, + "loss": 0.4142, + "step": 7950 + }, + { + "epoch": 0.8761432506887052, + "grad_norm": 9.149958610534668, + "learning_rate": 3.8127171795952766e-07, + "loss": 0.4498, + "step": 7951 + }, + { + "epoch": 0.8762534435261708, + "grad_norm": 7.31388521194458, + "learning_rate": 3.8060233744356634e-07, + "loss": 0.4345, + 
"step": 7952 + }, + { + "epoch": 0.8763636363636363, + "grad_norm": 6.551784038543701, + "learning_rate": 3.799335217938266e-07, + "loss": 0.4088, + "step": 7953 + }, + { + "epoch": 0.876473829201102, + "grad_norm": 4.9210662841796875, + "learning_rate": 3.7926527109208967e-07, + "loss": 0.3613, + "step": 7954 + }, + { + "epoch": 0.8765840220385674, + "grad_norm": 9.949579238891602, + "learning_rate": 3.7859758542007354e-07, + "loss": 0.3757, + "step": 7955 + }, + { + "epoch": 0.876694214876033, + "grad_norm": 6.613851070404053, + "learning_rate": 3.779304648594223e-07, + "loss": 0.4321, + "step": 7956 + }, + { + "epoch": 0.8768044077134987, + "grad_norm": 7.689001560211182, + "learning_rate": 3.7726390949171133e-07, + "loss": 0.3359, + "step": 7957 + }, + { + "epoch": 0.8769146005509642, + "grad_norm": 7.808323383331299, + "learning_rate": 3.765979193984487e-07, + "loss": 0.3436, + "step": 7958 + }, + { + "epoch": 0.8770247933884298, + "grad_norm": 8.902508735656738, + "learning_rate": 3.759324946610743e-07, + "loss": 0.4354, + "step": 7959 + }, + { + "epoch": 0.8771349862258954, + "grad_norm": 5.330629825592041, + "learning_rate": 3.7526763536095414e-07, + "loss": 0.4416, + "step": 7960 + }, + { + "epoch": 0.8772451790633609, + "grad_norm": 9.913277626037598, + "learning_rate": 3.7460334157938983e-07, + "loss": 0.4213, + "step": 7961 + }, + { + "epoch": 0.8773553719008265, + "grad_norm": 5.145281791687012, + "learning_rate": 3.73939613397612e-07, + "loss": 0.3298, + "step": 7962 + }, + { + "epoch": 0.877465564738292, + "grad_norm": 7.522690296173096, + "learning_rate": 3.732764508967829e-07, + "loss": 0.4063, + "step": 7963 + }, + { + "epoch": 0.8775757575757576, + "grad_norm": 5.090415954589844, + "learning_rate": 3.7261385415799325e-07, + "loss": 0.3964, + "step": 7964 + }, + { + "epoch": 0.8776859504132232, + "grad_norm": 14.363067626953125, + "learning_rate": 3.7195182326226765e-07, + "loss": 0.431, + "step": 7965 + }, + { + "epoch": 0.8777961432506887, + "grad_norm": 6.970553398132324, + "learning_rate": 3.7129035829056027e-07, + "loss": 0.3938, + "step": 7966 + }, + { + "epoch": 0.8779063360881543, + "grad_norm": 8.049395561218262, + "learning_rate": 3.706294593237542e-07, + "loss": 0.4911, + "step": 7967 + }, + { + "epoch": 0.8780165289256199, + "grad_norm": 6.71792459487915, + "learning_rate": 3.699691264426664e-07, + "loss": 0.4038, + "step": 7968 + }, + { + "epoch": 0.8781267217630854, + "grad_norm": 4.56690788269043, + "learning_rate": 3.6930935972804395e-07, + "loss": 0.3587, + "step": 7969 + }, + { + "epoch": 0.878236914600551, + "grad_norm": 3.843503475189209, + "learning_rate": 3.6865015926056237e-07, + "loss": 0.4201, + "step": 7970 + }, + { + "epoch": 0.8783471074380166, + "grad_norm": 6.373324394226074, + "learning_rate": 3.679915251208305e-07, + "loss": 0.3347, + "step": 7971 + }, + { + "epoch": 0.8784573002754821, + "grad_norm": 5.485124111175537, + "learning_rate": 3.6733345738938776e-07, + "loss": 0.3936, + "step": 7972 + }, + { + "epoch": 0.8785674931129477, + "grad_norm": 5.12364387512207, + "learning_rate": 3.666759561467015e-07, + "loss": 0.3765, + "step": 7973 + }, + { + "epoch": 0.8786776859504132, + "grad_norm": 7.823952674865723, + "learning_rate": 3.6601902147317345e-07, + "loss": 0.4422, + "step": 7974 + }, + { + "epoch": 0.8787878787878788, + "grad_norm": 5.81721305847168, + "learning_rate": 3.653626534491345e-07, + "loss": 0.4155, + "step": 7975 + }, + { + "epoch": 0.8788980716253444, + "grad_norm": 8.598370552062988, + "learning_rate": 
3.6470685215484525e-07, + "loss": 0.4148, + "step": 7976 + }, + { + "epoch": 0.8790082644628099, + "grad_norm": 5.4404826164245605, + "learning_rate": 3.640516176704989e-07, + "loss": 0.4311, + "step": 7977 + }, + { + "epoch": 0.8791184573002755, + "grad_norm": 5.446481227874756, + "learning_rate": 3.6339695007621855e-07, + "loss": 0.4092, + "step": 7978 + }, + { + "epoch": 0.8792286501377411, + "grad_norm": 6.2118401527404785, + "learning_rate": 3.627428494520563e-07, + "loss": 0.3892, + "step": 7979 + }, + { + "epoch": 0.8793388429752066, + "grad_norm": 5.770524024963379, + "learning_rate": 3.6208931587799813e-07, + "loss": 0.2954, + "step": 7980 + }, + { + "epoch": 0.8794490358126722, + "grad_norm": 4.975718021392822, + "learning_rate": 3.6143634943395846e-07, + "loss": 0.3841, + "step": 7981 + }, + { + "epoch": 0.8795592286501377, + "grad_norm": 12.857072830200195, + "learning_rate": 3.607839501997823e-07, + "loss": 0.5214, + "step": 7982 + }, + { + "epoch": 0.8796694214876033, + "grad_norm": 10.470033645629883, + "learning_rate": 3.6013211825524754e-07, + "loss": 0.4818, + "step": 7983 + }, + { + "epoch": 0.8797796143250689, + "grad_norm": 7.853455543518066, + "learning_rate": 3.5948085368005926e-07, + "loss": 0.4785, + "step": 7984 + }, + { + "epoch": 0.8798898071625344, + "grad_norm": 16.928417205810547, + "learning_rate": 3.5883015655385544e-07, + "loss": 0.4825, + "step": 7985 + }, + { + "epoch": 0.88, + "grad_norm": 5.691904544830322, + "learning_rate": 3.5818002695620526e-07, + "loss": 0.3282, + "step": 7986 + }, + { + "epoch": 0.8801101928374656, + "grad_norm": 6.92352294921875, + "learning_rate": 3.5753046496660614e-07, + "loss": 0.3857, + "step": 7987 + }, + { + "epoch": 0.8802203856749311, + "grad_norm": 9.776923179626465, + "learning_rate": 3.5688147066448744e-07, + "loss": 0.4823, + "step": 7988 + }, + { + "epoch": 0.8803305785123967, + "grad_norm": 6.693477153778076, + "learning_rate": 3.562330441292111e-07, + "loss": 0.3555, + "step": 7989 + }, + { + "epoch": 0.8804407713498622, + "grad_norm": 12.216975212097168, + "learning_rate": 3.5558518544006493e-07, + "loss": 0.5197, + "step": 7990 + }, + { + "epoch": 0.8805509641873278, + "grad_norm": 8.715909004211426, + "learning_rate": 3.549378946762705e-07, + "loss": 0.3948, + "step": 7991 + }, + { + "epoch": 0.8806611570247934, + "grad_norm": 8.513862609863281, + "learning_rate": 3.542911719169817e-07, + "loss": 0.4894, + "step": 7992 + }, + { + "epoch": 0.8807713498622589, + "grad_norm": 8.399560928344727, + "learning_rate": 3.536450172412775e-07, + "loss": 0.5353, + "step": 7993 + }, + { + "epoch": 0.8808815426997245, + "grad_norm": 6.549276351928711, + "learning_rate": 3.5299943072817257e-07, + "loss": 0.4302, + "step": 7994 + }, + { + "epoch": 0.8809917355371901, + "grad_norm": 6.469832420349121, + "learning_rate": 3.523544124566103e-07, + "loss": 0.3995, + "step": 7995 + }, + { + "epoch": 0.8811019283746556, + "grad_norm": 5.7896246910095215, + "learning_rate": 3.517099625054626e-07, + "loss": 0.3088, + "step": 7996 + }, + { + "epoch": 0.8812121212121212, + "grad_norm": 6.694138526916504, + "learning_rate": 3.510660809535349e-07, + "loss": 0.364, + "step": 7997 + }, + { + "epoch": 0.8813223140495868, + "grad_norm": 6.127065181732178, + "learning_rate": 3.504227678795624e-07, + "loss": 0.3851, + "step": 7998 + }, + { + "epoch": 0.8814325068870523, + "grad_norm": 6.639011859893799, + "learning_rate": 3.4978002336220953e-07, + "loss": 0.4902, + "step": 7999 + }, + { + "epoch": 0.8815426997245179, + "grad_norm": 
7.459198951721191, + "learning_rate": 3.4913784748007163e-07, + "loss": 0.3475, + "step": 8000 + }, + { + "epoch": 0.8816528925619834, + "grad_norm": 5.273727893829346, + "learning_rate": 3.4849624031167593e-07, + "loss": 0.4321, + "step": 8001 + }, + { + "epoch": 0.881763085399449, + "grad_norm": 8.887303352355957, + "learning_rate": 3.4785520193547806e-07, + "loss": 0.3721, + "step": 8002 + }, + { + "epoch": 0.8818732782369146, + "grad_norm": 8.102290153503418, + "learning_rate": 3.472147324298647e-07, + "loss": 0.3801, + "step": 8003 + }, + { + "epoch": 0.8819834710743801, + "grad_norm": 6.330785274505615, + "learning_rate": 3.465748318731549e-07, + "loss": 0.3685, + "step": 8004 + }, + { + "epoch": 0.8820936639118457, + "grad_norm": 5.924086570739746, + "learning_rate": 3.45935500343596e-07, + "loss": 0.4131, + "step": 8005 + }, + { + "epoch": 0.8822038567493113, + "grad_norm": 6.860537052154541, + "learning_rate": 3.4529673791936556e-07, + "loss": 0.4818, + "step": 8006 + }, + { + "epoch": 0.8823140495867768, + "grad_norm": 7.429711818695068, + "learning_rate": 3.446585446785722e-07, + "loss": 0.315, + "step": 8007 + }, + { + "epoch": 0.8824242424242424, + "grad_norm": 4.881139755249023, + "learning_rate": 3.440209206992573e-07, + "loss": 0.3868, + "step": 8008 + }, + { + "epoch": 0.8825344352617079, + "grad_norm": 8.522668838500977, + "learning_rate": 3.43383866059388e-07, + "loss": 0.4128, + "step": 8009 + }, + { + "epoch": 0.8826446280991735, + "grad_norm": 9.63332462310791, + "learning_rate": 3.427473808368637e-07, + "loss": 0.4244, + "step": 8010 + }, + { + "epoch": 0.8827548209366392, + "grad_norm": 9.569814682006836, + "learning_rate": 3.4211146510951755e-07, + "loss": 0.3674, + "step": 8011 + }, + { + "epoch": 0.8828650137741046, + "grad_norm": 5.571114540100098, + "learning_rate": 3.414761189551086e-07, + "loss": 0.3177, + "step": 8012 + }, + { + "epoch": 0.8829752066115703, + "grad_norm": 11.988824844360352, + "learning_rate": 3.408413424513263e-07, + "loss": 0.5143, + "step": 8013 + }, + { + "epoch": 0.8830853994490359, + "grad_norm": 7.931176662445068, + "learning_rate": 3.402071356757947e-07, + "loss": 0.4564, + "step": 8014 + }, + { + "epoch": 0.8831955922865014, + "grad_norm": 8.800257682800293, + "learning_rate": 3.395734987060645e-07, + "loss": 0.3842, + "step": 8015 + }, + { + "epoch": 0.883305785123967, + "grad_norm": 4.346943378448486, + "learning_rate": 3.3894043161961653e-07, + "loss": 0.2961, + "step": 8016 + }, + { + "epoch": 0.8834159779614325, + "grad_norm": 4.912277698516846, + "learning_rate": 3.383079344938639e-07, + "loss": 0.3357, + "step": 8017 + }, + { + "epoch": 0.8835261707988981, + "grad_norm": 5.641799449920654, + "learning_rate": 3.376760074061497e-07, + "loss": 0.4385, + "step": 8018 + }, + { + "epoch": 0.8836363636363637, + "grad_norm": 7.622751712799072, + "learning_rate": 3.3704465043374714e-07, + "loss": 0.4138, + "step": 8019 + }, + { + "epoch": 0.8837465564738292, + "grad_norm": 6.354327201843262, + "learning_rate": 3.3641386365385773e-07, + "loss": 0.4326, + "step": 8020 + }, + { + "epoch": 0.8838567493112948, + "grad_norm": 6.589369297027588, + "learning_rate": 3.3578364714361597e-07, + "loss": 0.481, + "step": 8021 + }, + { + "epoch": 0.8839669421487604, + "grad_norm": 5.158421039581299, + "learning_rate": 3.351540009800869e-07, + "loss": 0.434, + "step": 8022 + }, + { + "epoch": 0.8840771349862259, + "grad_norm": 7.01084041595459, + "learning_rate": 3.3452492524026156e-07, + "loss": 0.3777, + "step": 8023 + }, + { + "epoch": 
0.8841873278236915, + "grad_norm": 5.994622230529785, + "learning_rate": 3.338964200010664e-07, + "loss": 0.4062, + "step": 8024 + }, + { + "epoch": 0.8842975206611571, + "grad_norm": 7.221744537353516, + "learning_rate": 3.3326848533935584e-07, + "loss": 0.3739, + "step": 8025 + }, + { + "epoch": 0.8844077134986226, + "grad_norm": 5.481081962585449, + "learning_rate": 3.3264112133191307e-07, + "loss": 0.3611, + "step": 8026 + }, + { + "epoch": 0.8845179063360882, + "grad_norm": 8.957030296325684, + "learning_rate": 3.3201432805545387e-07, + "loss": 0.3442, + "step": 8027 + }, + { + "epoch": 0.8846280991735537, + "grad_norm": 10.126974105834961, + "learning_rate": 3.313881055866247e-07, + "loss": 0.4113, + "step": 8028 + }, + { + "epoch": 0.8847382920110193, + "grad_norm": 6.311191082000732, + "learning_rate": 3.307624540019988e-07, + "loss": 0.4409, + "step": 8029 + }, + { + "epoch": 0.8848484848484849, + "grad_norm": 5.306412220001221, + "learning_rate": 3.3013737337808217e-07, + "loss": 0.4359, + "step": 8030 + }, + { + "epoch": 0.8849586776859504, + "grad_norm": 6.894516468048096, + "learning_rate": 3.29512863791312e-07, + "loss": 0.4272, + "step": 8031 + }, + { + "epoch": 0.885068870523416, + "grad_norm": 5.035217761993408, + "learning_rate": 3.288889253180522e-07, + "loss": 0.4076, + "step": 8032 + }, + { + "epoch": 0.8851790633608816, + "grad_norm": 8.280344009399414, + "learning_rate": 3.282655580346e-07, + "loss": 0.4444, + "step": 8033 + }, + { + "epoch": 0.8852892561983471, + "grad_norm": 8.162149429321289, + "learning_rate": 3.276427620171818e-07, + "loss": 0.3521, + "step": 8034 + }, + { + "epoch": 0.8853994490358127, + "grad_norm": 12.015260696411133, + "learning_rate": 3.270205373419527e-07, + "loss": 0.5101, + "step": 8035 + }, + { + "epoch": 0.8855096418732782, + "grad_norm": 10.396943092346191, + "learning_rate": 3.2639888408499964e-07, + "loss": 0.4122, + "step": 8036 + }, + { + "epoch": 0.8856198347107438, + "grad_norm": 6.952391624450684, + "learning_rate": 3.257778023223407e-07, + "loss": 0.3836, + "step": 8037 + }, + { + "epoch": 0.8857300275482094, + "grad_norm": 6.282987117767334, + "learning_rate": 3.251572921299206e-07, + "loss": 0.4001, + "step": 8038 + }, + { + "epoch": 0.8858402203856749, + "grad_norm": 8.051639556884766, + "learning_rate": 3.245373535836166e-07, + "loss": 0.4535, + "step": 8039 + }, + { + "epoch": 0.8859504132231405, + "grad_norm": 7.128776550292969, + "learning_rate": 3.2391798675923735e-07, + "loss": 0.3403, + "step": 8040 + }, + { + "epoch": 0.8860606060606061, + "grad_norm": 6.599671363830566, + "learning_rate": 3.2329919173251734e-07, + "loss": 0.3496, + "step": 8041 + }, + { + "epoch": 0.8861707988980716, + "grad_norm": 5.977092742919922, + "learning_rate": 3.2268096857912547e-07, + "loss": 0.356, + "step": 8042 + }, + { + "epoch": 0.8862809917355372, + "grad_norm": 7.180810451507568, + "learning_rate": 3.220633173746579e-07, + "loss": 0.4067, + "step": 8043 + }, + { + "epoch": 0.8863911845730027, + "grad_norm": 8.043085098266602, + "learning_rate": 3.2144623819464205e-07, + "loss": 0.3187, + "step": 8044 + }, + { + "epoch": 0.8865013774104683, + "grad_norm": 7.067012310028076, + "learning_rate": 3.2082973111453587e-07, + "loss": 0.4156, + "step": 8045 + }, + { + "epoch": 0.8866115702479339, + "grad_norm": 11.196881294250488, + "learning_rate": 3.2021379620972513e-07, + "loss": 0.4194, + "step": 8046 + }, + { + "epoch": 0.8867217630853994, + "grad_norm": 8.067953109741211, + "learning_rate": 3.1959843355552964e-07, + "loss": 0.4077, + 
"step": 8047 + }, + { + "epoch": 0.886831955922865, + "grad_norm": 4.846611499786377, + "learning_rate": 3.189836432271953e-07, + "loss": 0.3907, + "step": 8048 + }, + { + "epoch": 0.8869421487603306, + "grad_norm": 13.707871437072754, + "learning_rate": 3.183694252998987e-07, + "loss": 0.4561, + "step": 8049 + }, + { + "epoch": 0.8870523415977961, + "grad_norm": 4.391310691833496, + "learning_rate": 3.177557798487485e-07, + "loss": 0.4253, + "step": 8050 + }, + { + "epoch": 0.8871625344352617, + "grad_norm": 7.767878532409668, + "learning_rate": 3.17142706948782e-07, + "loss": 0.3253, + "step": 8051 + }, + { + "epoch": 0.8872727272727273, + "grad_norm": 5.446445941925049, + "learning_rate": 3.1653020667496593e-07, + "loss": 0.42, + "step": 8052 + }, + { + "epoch": 0.8873829201101928, + "grad_norm": 6.431450366973877, + "learning_rate": 3.1591827910219806e-07, + "loss": 0.4241, + "step": 8053 + }, + { + "epoch": 0.8874931129476584, + "grad_norm": 9.028473854064941, + "learning_rate": 3.153069243053064e-07, + "loss": 0.4122, + "step": 8054 + }, + { + "epoch": 0.8876033057851239, + "grad_norm": 8.524662017822266, + "learning_rate": 3.146961423590472e-07, + "loss": 0.4029, + "step": 8055 + }, + { + "epoch": 0.8877134986225895, + "grad_norm": 5.424405097961426, + "learning_rate": 3.1408593333810743e-07, + "loss": 0.3575, + "step": 8056 + }, + { + "epoch": 0.8878236914600551, + "grad_norm": 12.417449951171875, + "learning_rate": 3.1347629731710625e-07, + "loss": 0.4773, + "step": 8057 + }, + { + "epoch": 0.8879338842975206, + "grad_norm": 12.965645790100098, + "learning_rate": 3.128672343705885e-07, + "loss": 0.5651, + "step": 8058 + }, + { + "epoch": 0.8880440771349862, + "grad_norm": 4.561319351196289, + "learning_rate": 3.1225874457303183e-07, + "loss": 0.3918, + "step": 8059 + }, + { + "epoch": 0.8881542699724518, + "grad_norm": 6.537079811096191, + "learning_rate": 3.1165082799884396e-07, + "loss": 0.4639, + "step": 8060 + }, + { + "epoch": 0.8882644628099173, + "grad_norm": 4.994674205780029, + "learning_rate": 3.1104348472236203e-07, + "loss": 0.3408, + "step": 8061 + }, + { + "epoch": 0.8883746556473829, + "grad_norm": 6.84326696395874, + "learning_rate": 3.104367148178511e-07, + "loss": 0.3811, + "step": 8062 + }, + { + "epoch": 0.8884848484848484, + "grad_norm": 14.176671981811523, + "learning_rate": 3.0983051835950904e-07, + "loss": 0.5579, + "step": 8063 + }, + { + "epoch": 0.888595041322314, + "grad_norm": 7.461105823516846, + "learning_rate": 3.092248954214627e-07, + "loss": 0.4269, + "step": 8064 + }, + { + "epoch": 0.8887052341597796, + "grad_norm": 8.84874153137207, + "learning_rate": 3.086198460777684e-07, + "loss": 0.3993, + "step": 8065 + }, + { + "epoch": 0.8888154269972451, + "grad_norm": 6.398798942565918, + "learning_rate": 3.080153704024097e-07, + "loss": 0.4355, + "step": 8066 + }, + { + "epoch": 0.8889256198347107, + "grad_norm": 19.387798309326172, + "learning_rate": 3.074114684693069e-07, + "loss": 0.4238, + "step": 8067 + }, + { + "epoch": 0.8890358126721764, + "grad_norm": 11.25837230682373, + "learning_rate": 3.068081403523043e-07, + "loss": 0.4366, + "step": 8068 + }, + { + "epoch": 0.8891460055096418, + "grad_norm": 6.566999912261963, + "learning_rate": 3.0620538612517567e-07, + "loss": 0.3861, + "step": 8069 + }, + { + "epoch": 0.8892561983471075, + "grad_norm": 5.205772876739502, + "learning_rate": 3.056032058616293e-07, + "loss": 0.3086, + "step": 8070 + }, + { + "epoch": 0.8893663911845731, + "grad_norm": 16.4765567779541, + "learning_rate": 
3.050015996353001e-07, + "loss": 0.4317, + "step": 8071 + }, + { + "epoch": 0.8894765840220386, + "grad_norm": 7.251389503479004, + "learning_rate": 3.044005675197514e-07, + "loss": 0.41, + "step": 8072 + }, + { + "epoch": 0.8895867768595042, + "grad_norm": 5.62153959274292, + "learning_rate": 3.0380010958848125e-07, + "loss": 0.3659, + "step": 8073 + }, + { + "epoch": 0.8896969696969697, + "grad_norm": 4.930037498474121, + "learning_rate": 3.0320022591491193e-07, + "loss": 0.4092, + "step": 8074 + }, + { + "epoch": 0.8898071625344353, + "grad_norm": 7.107131004333496, + "learning_rate": 3.026009165723992e-07, + "loss": 0.4444, + "step": 8075 + }, + { + "epoch": 0.8899173553719009, + "grad_norm": 5.48328161239624, + "learning_rate": 3.0200218163422725e-07, + "loss": 0.4075, + "step": 8076 + }, + { + "epoch": 0.8900275482093664, + "grad_norm": 10.584638595581055, + "learning_rate": 3.014040211736097e-07, + "loss": 0.3669, + "step": 8077 + }, + { + "epoch": 0.890137741046832, + "grad_norm": 8.460695266723633, + "learning_rate": 3.0080643526369135e-07, + "loss": 0.4176, + "step": 8078 + }, + { + "epoch": 0.8902479338842976, + "grad_norm": 7.722398281097412, + "learning_rate": 3.0020942397754493e-07, + "loss": 0.3903, + "step": 8079 + }, + { + "epoch": 0.8903581267217631, + "grad_norm": 8.00770092010498, + "learning_rate": 2.9961298738817424e-07, + "loss": 0.3857, + "step": 8080 + }, + { + "epoch": 0.8904683195592287, + "grad_norm": 5.820324420928955, + "learning_rate": 2.9901712556851315e-07, + "loss": 0.4006, + "step": 8081 + }, + { + "epoch": 0.8905785123966942, + "grad_norm": 5.2179975509643555, + "learning_rate": 2.984218385914228e-07, + "loss": 0.3543, + "step": 8082 + }, + { + "epoch": 0.8906887052341598, + "grad_norm": 5.526549339294434, + "learning_rate": 2.9782712652969615e-07, + "loss": 0.4097, + "step": 8083 + }, + { + "epoch": 0.8907988980716254, + "grad_norm": 5.95803165435791, + "learning_rate": 2.9723298945605663e-07, + "loss": 0.3751, + "step": 8084 + }, + { + "epoch": 0.8909090909090909, + "grad_norm": 8.43634033203125, + "learning_rate": 2.9663942744315443e-07, + "loss": 0.3998, + "step": 8085 + }, + { + "epoch": 0.8910192837465565, + "grad_norm": 9.699858665466309, + "learning_rate": 2.960464405635721e-07, + "loss": 0.3533, + "step": 8086 + }, + { + "epoch": 0.8911294765840221, + "grad_norm": 7.270328998565674, + "learning_rate": 2.954540288898217e-07, + "loss": 0.3432, + "step": 8087 + }, + { + "epoch": 0.8912396694214876, + "grad_norm": 8.234623908996582, + "learning_rate": 2.9486219249434234e-07, + "loss": 0.3458, + "step": 8088 + }, + { + "epoch": 0.8913498622589532, + "grad_norm": 8.299948692321777, + "learning_rate": 2.9427093144950513e-07, + "loss": 0.3965, + "step": 8089 + }, + { + "epoch": 0.8914600550964187, + "grad_norm": 4.60298490524292, + "learning_rate": 2.936802458276111e-07, + "loss": 0.3638, + "step": 8090 + }, + { + "epoch": 0.8915702479338843, + "grad_norm": 7.8577165603637695, + "learning_rate": 2.9309013570088916e-07, + "loss": 0.4339, + "step": 8091 + }, + { + "epoch": 0.8916804407713499, + "grad_norm": 6.670377254486084, + "learning_rate": 2.9250060114149883e-07, + "loss": 0.4261, + "step": 8092 + }, + { + "epoch": 0.8917906336088154, + "grad_norm": 4.505085468292236, + "learning_rate": 2.9191164222153014e-07, + "loss": 0.3706, + "step": 8093 + }, + { + "epoch": 0.891900826446281, + "grad_norm": 4.422532558441162, + "learning_rate": 2.9132325901300106e-07, + "loss": 0.3651, + "step": 8094 + }, + { + "epoch": 0.8920110192837466, + "grad_norm": 
8.972837448120117, + "learning_rate": 2.907354515878591e-07, + "loss": 0.41, + "step": 8095 + }, + { + "epoch": 0.8921212121212121, + "grad_norm": 7.192322254180908, + "learning_rate": 2.9014822001798446e-07, + "loss": 0.3484, + "step": 8096 + }, + { + "epoch": 0.8922314049586777, + "grad_norm": 8.343283653259277, + "learning_rate": 2.8956156437518204e-07, + "loss": 0.4628, + "step": 8097 + }, + { + "epoch": 0.8923415977961433, + "grad_norm": 8.42356014251709, + "learning_rate": 2.88975484731191e-07, + "loss": 0.495, + "step": 8098 + }, + { + "epoch": 0.8924517906336088, + "grad_norm": 4.533227443695068, + "learning_rate": 2.8838998115767623e-07, + "loss": 0.2961, + "step": 8099 + }, + { + "epoch": 0.8925619834710744, + "grad_norm": 9.942359924316406, + "learning_rate": 2.8780505372623444e-07, + "loss": 0.4207, + "step": 8100 + }, + { + "epoch": 0.8926721763085399, + "grad_norm": 7.865906715393066, + "learning_rate": 2.8722070250839283e-07, + "loss": 0.393, + "step": 8101 + }, + { + "epoch": 0.8927823691460055, + "grad_norm": 8.467253684997559, + "learning_rate": 2.866369275756037e-07, + "loss": 0.4613, + "step": 8102 + }, + { + "epoch": 0.8928925619834711, + "grad_norm": 14.772082328796387, + "learning_rate": 2.860537289992549e-07, + "loss": 0.5226, + "step": 8103 + }, + { + "epoch": 0.8930027548209366, + "grad_norm": 4.3217997550964355, + "learning_rate": 2.854711068506594e-07, + "loss": 0.3545, + "step": 8104 + }, + { + "epoch": 0.8931129476584022, + "grad_norm": 4.720042705535889, + "learning_rate": 2.848890612010591e-07, + "loss": 0.4085, + "step": 8105 + }, + { + "epoch": 0.8932231404958678, + "grad_norm": 8.005223274230957, + "learning_rate": 2.843075921216315e-07, + "loss": 0.3778, + "step": 8106 + }, + { + "epoch": 0.8933333333333333, + "grad_norm": 6.306997299194336, + "learning_rate": 2.8372669968347645e-07, + "loss": 0.3506, + "step": 8107 + }, + { + "epoch": 0.8934435261707989, + "grad_norm": 6.133072376251221, + "learning_rate": 2.831463839576271e-07, + "loss": 0.3684, + "step": 8108 + }, + { + "epoch": 0.8935537190082644, + "grad_norm": 4.7656989097595215, + "learning_rate": 2.825666450150444e-07, + "loss": 0.3681, + "step": 8109 + }, + { + "epoch": 0.89366391184573, + "grad_norm": 5.7682013511657715, + "learning_rate": 2.8198748292662114e-07, + "loss": 0.3956, + "step": 8110 + }, + { + "epoch": 0.8937741046831956, + "grad_norm": 10.335726737976074, + "learning_rate": 2.814088977631768e-07, + "loss": 0.4726, + "step": 8111 + }, + { + "epoch": 0.8938842975206611, + "grad_norm": 5.817956924438477, + "learning_rate": 2.80830889595462e-07, + "loss": 0.4033, + "step": 8112 + }, + { + "epoch": 0.8939944903581267, + "grad_norm": 5.217440128326416, + "learning_rate": 2.802534584941569e-07, + "loss": 0.3323, + "step": 8113 + }, + { + "epoch": 0.8941046831955923, + "grad_norm": 6.813904285430908, + "learning_rate": 2.7967660452986987e-07, + "loss": 0.4036, + "step": 8114 + }, + { + "epoch": 0.8942148760330578, + "grad_norm": 8.03448486328125, + "learning_rate": 2.791003277731391e-07, + "loss": 0.3333, + "step": 8115 + }, + { + "epoch": 0.8943250688705234, + "grad_norm": 6.301514148712158, + "learning_rate": 2.7852462829443314e-07, + "loss": 0.3822, + "step": 8116 + }, + { + "epoch": 0.8944352617079889, + "grad_norm": 6.421783447265625, + "learning_rate": 2.779495061641496e-07, + "loss": 0.454, + "step": 8117 + }, + { + "epoch": 0.8945454545454545, + "grad_norm": 5.428205966949463, + "learning_rate": 2.773749614526139e-07, + "loss": 0.3974, + "step": 8118 + }, + { + "epoch": 
0.8946556473829201, + "grad_norm": 8.008651733398438, + "learning_rate": 2.768009942300831e-07, + "loss": 0.4237, + "step": 8119 + }, + { + "epoch": 0.8947658402203856, + "grad_norm": 5.937148094177246, + "learning_rate": 2.762276045667439e-07, + "loss": 0.3997, + "step": 8120 + }, + { + "epoch": 0.8948760330578512, + "grad_norm": 6.53151798248291, + "learning_rate": 2.756547925327085e-07, + "loss": 0.3953, + "step": 8121 + }, + { + "epoch": 0.8949862258953168, + "grad_norm": 6.310358047485352, + "learning_rate": 2.7508255819802266e-07, + "loss": 0.3669, + "step": 8122 + }, + { + "epoch": 0.8950964187327823, + "grad_norm": 6.95485782623291, + "learning_rate": 2.7451090163266026e-07, + "loss": 0.3781, + "step": 8123 + }, + { + "epoch": 0.895206611570248, + "grad_norm": 5.507017612457275, + "learning_rate": 2.7393982290652374e-07, + "loss": 0.417, + "step": 8124 + }, + { + "epoch": 0.8953168044077136, + "grad_norm": 4.009406566619873, + "learning_rate": 2.7336932208944403e-07, + "loss": 0.3, + "step": 8125 + }, + { + "epoch": 0.895426997245179, + "grad_norm": 8.58454418182373, + "learning_rate": 2.727993992511857e-07, + "loss": 0.4231, + "step": 8126 + }, + { + "epoch": 0.8955371900826447, + "grad_norm": 7.0145769119262695, + "learning_rate": 2.722300544614381e-07, + "loss": 0.4203, + "step": 8127 + }, + { + "epoch": 0.8956473829201101, + "grad_norm": 9.251009941101074, + "learning_rate": 2.7166128778982005e-07, + "loss": 0.4215, + "step": 8128 + }, + { + "epoch": 0.8957575757575758, + "grad_norm": 5.557025909423828, + "learning_rate": 2.7109309930588413e-07, + "loss": 0.3281, + "step": 8129 + }, + { + "epoch": 0.8958677685950414, + "grad_norm": 8.478955268859863, + "learning_rate": 2.705254890791065e-07, + "loss": 0.5145, + "step": 8130 + }, + { + "epoch": 0.8959779614325069, + "grad_norm": 7.705807209014893, + "learning_rate": 2.6995845717889715e-07, + "loss": 0.4013, + "step": 8131 + }, + { + "epoch": 0.8960881542699725, + "grad_norm": 8.202071189880371, + "learning_rate": 2.693920036745923e-07, + "loss": 0.4038, + "step": 8132 + }, + { + "epoch": 0.8961983471074381, + "grad_norm": 3.9696667194366455, + "learning_rate": 2.688261286354593e-07, + "loss": 0.3579, + "step": 8133 + }, + { + "epoch": 0.8963085399449036, + "grad_norm": 10.224541664123535, + "learning_rate": 2.6826083213069453e-07, + "loss": 0.4634, + "step": 8134 + }, + { + "epoch": 0.8964187327823692, + "grad_norm": 10.307765007019043, + "learning_rate": 2.6769611422942155e-07, + "loss": 0.4637, + "step": 8135 + }, + { + "epoch": 0.8965289256198347, + "grad_norm": 3.828857898712158, + "learning_rate": 2.671319750006962e-07, + "loss": 0.3909, + "step": 8136 + }, + { + "epoch": 0.8966391184573003, + "grad_norm": 7.539666175842285, + "learning_rate": 2.6656841451350277e-07, + "loss": 0.4047, + "step": 8137 + }, + { + "epoch": 0.8967493112947659, + "grad_norm": 7.9426398277282715, + "learning_rate": 2.660054328367523e-07, + "loss": 0.4304, + "step": 8138 + }, + { + "epoch": 0.8968595041322314, + "grad_norm": 5.386150360107422, + "learning_rate": 2.654430300392885e-07, + "loss": 0.3594, + "step": 8139 + }, + { + "epoch": 0.896969696969697, + "grad_norm": 4.974326133728027, + "learning_rate": 2.6488120618988256e-07, + "loss": 0.322, + "step": 8140 + }, + { + "epoch": 0.8970798898071626, + "grad_norm": 7.355337619781494, + "learning_rate": 2.643199613572345e-07, + "loss": 0.4132, + "step": 8141 + }, + { + "epoch": 0.8971900826446281, + "grad_norm": 7.0208916664123535, + "learning_rate": 2.637592956099738e-07, + "loss": 0.3688, + 
"step": 8142 + }, + { + "epoch": 0.8973002754820937, + "grad_norm": 5.85609245300293, + "learning_rate": 2.6319920901666073e-07, + "loss": 0.3767, + "step": 8143 + }, + { + "epoch": 0.8974104683195592, + "grad_norm": 9.025439262390137, + "learning_rate": 2.626397016457827e-07, + "loss": 0.4982, + "step": 8144 + }, + { + "epoch": 0.8975206611570248, + "grad_norm": 5.851327896118164, + "learning_rate": 2.62080773565756e-07, + "loss": 0.4023, + "step": 8145 + }, + { + "epoch": 0.8976308539944904, + "grad_norm": 6.238762855529785, + "learning_rate": 2.6152242484492943e-07, + "loss": 0.416, + "step": 8146 + }, + { + "epoch": 0.8977410468319559, + "grad_norm": 5.96334981918335, + "learning_rate": 2.6096465555157655e-07, + "loss": 0.4034, + "step": 8147 + }, + { + "epoch": 0.8978512396694215, + "grad_norm": 8.227760314941406, + "learning_rate": 2.6040746575390295e-07, + "loss": 0.3429, + "step": 8148 + }, + { + "epoch": 0.8979614325068871, + "grad_norm": 9.91084098815918, + "learning_rate": 2.598508555200435e-07, + "loss": 0.4176, + "step": 8149 + }, + { + "epoch": 0.8980716253443526, + "grad_norm": 5.673585891723633, + "learning_rate": 2.592948249180594e-07, + "loss": 0.3466, + "step": 8150 + }, + { + "epoch": 0.8981818181818182, + "grad_norm": 5.9524641036987305, + "learning_rate": 2.587393740159433e-07, + "loss": 0.3574, + "step": 8151 + }, + { + "epoch": 0.8982920110192838, + "grad_norm": 8.94787883758545, + "learning_rate": 2.5818450288161823e-07, + "loss": 0.5022, + "step": 8152 + }, + { + "epoch": 0.8984022038567493, + "grad_norm": 7.699998378753662, + "learning_rate": 2.5763021158293213e-07, + "loss": 0.4089, + "step": 8153 + }, + { + "epoch": 0.8985123966942149, + "grad_norm": 3.6122827529907227, + "learning_rate": 2.5707650018766627e-07, + "loss": 0.3893, + "step": 8154 + }, + { + "epoch": 0.8986225895316804, + "grad_norm": 4.201280117034912, + "learning_rate": 2.565233687635288e-07, + "loss": 0.3328, + "step": 8155 + }, + { + "epoch": 0.898732782369146, + "grad_norm": 10.086000442504883, + "learning_rate": 2.559708173781561e-07, + "loss": 0.4042, + "step": 8156 + }, + { + "epoch": 0.8988429752066116, + "grad_norm": 10.268120765686035, + "learning_rate": 2.554188460991175e-07, + "loss": 0.4516, + "step": 8157 + }, + { + "epoch": 0.8989531680440771, + "grad_norm": 5.0997700691223145, + "learning_rate": 2.5486745499390564e-07, + "loss": 0.3556, + "step": 8158 + }, + { + "epoch": 0.8990633608815427, + "grad_norm": 8.162371635437012, + "learning_rate": 2.5431664412994774e-07, + "loss": 0.3871, + "step": 8159 + }, + { + "epoch": 0.8991735537190083, + "grad_norm": 3.4119391441345215, + "learning_rate": 2.5376641357459765e-07, + "loss": 0.3638, + "step": 8160 + }, + { + "epoch": 0.8992837465564738, + "grad_norm": 6.416200160980225, + "learning_rate": 2.53216763395136e-07, + "loss": 0.3328, + "step": 8161 + }, + { + "epoch": 0.8993939393939394, + "grad_norm": 5.656872272491455, + "learning_rate": 2.5266769365877796e-07, + "loss": 0.3552, + "step": 8162 + }, + { + "epoch": 0.8995041322314049, + "grad_norm": 5.099634647369385, + "learning_rate": 2.5211920443266314e-07, + "loss": 0.3377, + "step": 8163 + }, + { + "epoch": 0.8996143250688705, + "grad_norm": 5.004369735717773, + "learning_rate": 2.5157129578386007e-07, + "loss": 0.301, + "step": 8164 + }, + { + "epoch": 0.8997245179063361, + "grad_norm": 7.1246724128723145, + "learning_rate": 2.5102396777936965e-07, + "loss": 0.3258, + "step": 8165 + }, + { + "epoch": 0.8998347107438016, + "grad_norm": 5.042446136474609, + "learning_rate": 
2.5047722048611944e-07, + "loss": 0.424, + "step": 8166 + }, + { + "epoch": 0.8999449035812672, + "grad_norm": 7.238094806671143, + "learning_rate": 2.4993105397096596e-07, + "loss": 0.4319, + "step": 8167 + }, + { + "epoch": 0.9000550964187328, + "grad_norm": 6.154951095581055, + "learning_rate": 2.4938546830069575e-07, + "loss": 0.4727, + "step": 8168 + }, + { + "epoch": 0.9001652892561983, + "grad_norm": 5.778834342956543, + "learning_rate": 2.4884046354202383e-07, + "loss": 0.3655, + "step": 8169 + }, + { + "epoch": 0.9002754820936639, + "grad_norm": 5.150241851806641, + "learning_rate": 2.482960397615936e-07, + "loss": 0.4015, + "step": 8170 + }, + { + "epoch": 0.9003856749311295, + "grad_norm": 8.055282592773438, + "learning_rate": 2.4775219702597777e-07, + "loss": 0.4048, + "step": 8171 + }, + { + "epoch": 0.900495867768595, + "grad_norm": 8.553960800170898, + "learning_rate": 2.472089354016788e-07, + "loss": 0.4267, + "step": 8172 + }, + { + "epoch": 0.900495867768595, + "eval_loss": 0.39516323804855347, + "eval_runtime": 41.972, + "eval_samples_per_second": 17.488, + "eval_steps_per_second": 2.192, + "step": 8172 + }, + { + "epoch": 0.9006060606060606, + "grad_norm": 5.766805648803711, + "learning_rate": 2.466662549551274e-07, + "loss": 0.3505, + "step": 8173 + }, + { + "epoch": 0.9007162534435261, + "grad_norm": 7.682420253753662, + "learning_rate": 2.4612415575268276e-07, + "loss": 0.396, + "step": 8174 + }, + { + "epoch": 0.9008264462809917, + "grad_norm": 4.986112594604492, + "learning_rate": 2.4558263786063406e-07, + "loss": 0.3932, + "step": 8175 + }, + { + "epoch": 0.9009366391184573, + "grad_norm": 7.434338092803955, + "learning_rate": 2.4504170134519944e-07, + "loss": 0.4025, + "step": 8176 + }, + { + "epoch": 0.9010468319559228, + "grad_norm": 8.345190048217773, + "learning_rate": 2.4450134627252376e-07, + "loss": 0.395, + "step": 8177 + }, + { + "epoch": 0.9011570247933884, + "grad_norm": 6.5656633377075195, + "learning_rate": 2.4396157270868304e-07, + "loss": 0.421, + "step": 8178 + }, + { + "epoch": 0.901267217630854, + "grad_norm": 10.848407745361328, + "learning_rate": 2.434223807196823e-07, + "loss": 0.4032, + "step": 8179 + }, + { + "epoch": 0.9013774104683195, + "grad_norm": 5.726838111877441, + "learning_rate": 2.4288377037145315e-07, + "loss": 0.4159, + "step": 8180 + }, + { + "epoch": 0.9014876033057851, + "grad_norm": 9.377209663391113, + "learning_rate": 2.423457417298591e-07, + "loss": 0.4314, + "step": 8181 + }, + { + "epoch": 0.9015977961432506, + "grad_norm": 6.047981262207031, + "learning_rate": 2.4180829486069037e-07, + "loss": 0.4007, + "step": 8182 + }, + { + "epoch": 0.9017079889807162, + "grad_norm": 5.022491455078125, + "learning_rate": 2.41271429829667e-07, + "loss": 0.3669, + "step": 8183 + }, + { + "epoch": 0.9018181818181819, + "grad_norm": 7.403291702270508, + "learning_rate": 2.4073514670243605e-07, + "loss": 0.4, + "step": 8184 + }, + { + "epoch": 0.9019283746556473, + "grad_norm": 4.81556510925293, + "learning_rate": 2.4019944554457775e-07, + "loss": 0.3806, + "step": 8185 + }, + { + "epoch": 0.902038567493113, + "grad_norm": 6.117400646209717, + "learning_rate": 2.3966432642159587e-07, + "loss": 0.4074, + "step": 8186 + }, + { + "epoch": 0.9021487603305786, + "grad_norm": 12.024847984313965, + "learning_rate": 2.391297893989264e-07, + "loss": 0.4172, + "step": 8187 + }, + { + "epoch": 0.902258953168044, + "grad_norm": 6.242091655731201, + "learning_rate": 2.385958345419337e-07, + "loss": 0.3194, + "step": 8188 + }, + { + "epoch": 
0.9023691460055097, + "grad_norm": 6.490644454956055, + "learning_rate": 2.3806246191590941e-07, + "loss": 0.3924, + "step": 8189 + }, + { + "epoch": 0.9024793388429752, + "grad_norm": 12.245840072631836, + "learning_rate": 2.3752967158607698e-07, + "loss": 0.5208, + "step": 8190 + }, + { + "epoch": 0.9025895316804408, + "grad_norm": 5.594180583953857, + "learning_rate": 2.3699746361758424e-07, + "loss": 0.4115, + "step": 8191 + }, + { + "epoch": 0.9026997245179064, + "grad_norm": 4.443387508392334, + "learning_rate": 2.3646583807551194e-07, + "loss": 0.3965, + "step": 8192 + }, + { + "epoch": 0.9028099173553719, + "grad_norm": 6.190323829650879, + "learning_rate": 2.3593479502486804e-07, + "loss": 0.3864, + "step": 8193 + }, + { + "epoch": 0.9029201101928375, + "grad_norm": 7.716813564300537, + "learning_rate": 2.3540433453058843e-07, + "loss": 0.4035, + "step": 8194 + }, + { + "epoch": 0.9030303030303031, + "grad_norm": 6.573265075683594, + "learning_rate": 2.348744566575384e-07, + "loss": 0.3777, + "step": 8195 + }, + { + "epoch": 0.9031404958677686, + "grad_norm": 7.373490810394287, + "learning_rate": 2.3434516147051389e-07, + "loss": 0.3437, + "step": 8196 + }, + { + "epoch": 0.9032506887052342, + "grad_norm": 5.659514904022217, + "learning_rate": 2.338164490342354e-07, + "loss": 0.3805, + "step": 8197 + }, + { + "epoch": 0.9033608815426998, + "grad_norm": 4.987745761871338, + "learning_rate": 2.332883194133556e-07, + "loss": 0.3703, + "step": 8198 + }, + { + "epoch": 0.9034710743801653, + "grad_norm": 6.0765838623046875, + "learning_rate": 2.3276077267245567e-07, + "loss": 0.3111, + "step": 8199 + }, + { + "epoch": 0.9035812672176309, + "grad_norm": 5.776496410369873, + "learning_rate": 2.3223380887604396e-07, + "loss": 0.4141, + "step": 8200 + }, + { + "epoch": 0.9036914600550964, + "grad_norm": 4.796311378479004, + "learning_rate": 2.317074280885584e-07, + "loss": 0.3587, + "step": 8201 + }, + { + "epoch": 0.903801652892562, + "grad_norm": 10.538880348205566, + "learning_rate": 2.3118163037436582e-07, + "loss": 0.4016, + "step": 8202 + }, + { + "epoch": 0.9039118457300276, + "grad_norm": 7.131097316741943, + "learning_rate": 2.3065641579776088e-07, + "loss": 0.4957, + "step": 8203 + }, + { + "epoch": 0.9040220385674931, + "grad_norm": 6.796650409698486, + "learning_rate": 2.301317844229678e-07, + "loss": 0.3976, + "step": 8204 + }, + { + "epoch": 0.9041322314049587, + "grad_norm": 4.019796848297119, + "learning_rate": 2.2960773631414024e-07, + "loss": 0.3696, + "step": 8205 + }, + { + "epoch": 0.9042424242424243, + "grad_norm": 16.750940322875977, + "learning_rate": 2.2908427153535806e-07, + "loss": 0.4196, + "step": 8206 + }, + { + "epoch": 0.9043526170798898, + "grad_norm": 6.668462753295898, + "learning_rate": 2.2856139015063172e-07, + "loss": 0.4145, + "step": 8207 + }, + { + "epoch": 0.9044628099173554, + "grad_norm": 6.95205545425415, + "learning_rate": 2.2803909222390065e-07, + "loss": 0.3372, + "step": 8208 + }, + { + "epoch": 0.9045730027548209, + "grad_norm": 5.908565998077393, + "learning_rate": 2.2751737781903038e-07, + "loss": 0.4251, + "step": 8209 + }, + { + "epoch": 0.9046831955922865, + "grad_norm": 7.492595672607422, + "learning_rate": 2.2699624699981826e-07, + "loss": 0.3483, + "step": 8210 + }, + { + "epoch": 0.9047933884297521, + "grad_norm": 9.232844352722168, + "learning_rate": 2.2647569982998942e-07, + "loss": 0.4344, + "step": 8211 + }, + { + "epoch": 0.9049035812672176, + "grad_norm": 13.05223560333252, + "learning_rate": 2.2595573637319513e-07, + "loss": 
0.5699, + "step": 8212 + }, + { + "epoch": 0.9050137741046832, + "grad_norm": 9.177809715270996, + "learning_rate": 2.2543635669301843e-07, + "loss": 0.4088, + "step": 8213 + }, + { + "epoch": 0.9051239669421488, + "grad_norm": 7.9022955894470215, + "learning_rate": 2.2491756085296966e-07, + "loss": 0.4845, + "step": 8214 + }, + { + "epoch": 0.9052341597796143, + "grad_norm": 5.8695831298828125, + "learning_rate": 2.2439934891648863e-07, + "loss": 0.2941, + "step": 8215 + }, + { + "epoch": 0.9053443526170799, + "grad_norm": 8.41268253326416, + "learning_rate": 2.2388172094694237e-07, + "loss": 0.3745, + "step": 8216 + }, + { + "epoch": 0.9054545454545454, + "grad_norm": 8.964435577392578, + "learning_rate": 2.2336467700762532e-07, + "loss": 0.4527, + "step": 8217 + }, + { + "epoch": 0.905564738292011, + "grad_norm": 8.264138221740723, + "learning_rate": 2.228482171617652e-07, + "loss": 0.3942, + "step": 8218 + }, + { + "epoch": 0.9056749311294766, + "grad_norm": 5.133756637573242, + "learning_rate": 2.2233234147251482e-07, + "loss": 0.3861, + "step": 8219 + }, + { + "epoch": 0.9057851239669421, + "grad_norm": 6.964199542999268, + "learning_rate": 2.2181705000295374e-07, + "loss": 0.4232, + "step": 8220 + }, + { + "epoch": 0.9058953168044077, + "grad_norm": 6.645883560180664, + "learning_rate": 2.2130234281609541e-07, + "loss": 0.3692, + "step": 8221 + }, + { + "epoch": 0.9060055096418733, + "grad_norm": 8.911177635192871, + "learning_rate": 2.2078821997487841e-07, + "loss": 0.4156, + "step": 8222 + }, + { + "epoch": 0.9061157024793388, + "grad_norm": 7.476542949676514, + "learning_rate": 2.2027468154216857e-07, + "loss": 0.4474, + "step": 8223 + }, + { + "epoch": 0.9062258953168044, + "grad_norm": 7.22313117980957, + "learning_rate": 2.1976172758076398e-07, + "loss": 0.4365, + "step": 8224 + }, + { + "epoch": 0.90633608815427, + "grad_norm": 11.649321556091309, + "learning_rate": 2.192493581533889e-07, + "loss": 0.4786, + "step": 8225 + }, + { + "epoch": 0.9064462809917355, + "grad_norm": 6.162008285522461, + "learning_rate": 2.187375733226954e-07, + "loss": 0.3448, + "step": 8226 + }, + { + "epoch": 0.9065564738292011, + "grad_norm": 10.098167419433594, + "learning_rate": 2.182263731512668e-07, + "loss": 0.4494, + "step": 8227 + }, + { + "epoch": 0.9066666666666666, + "grad_norm": 5.976385116577148, + "learning_rate": 2.17715757701612e-07, + "loss": 0.4257, + "step": 8228 + }, + { + "epoch": 0.9067768595041322, + "grad_norm": 10.715227127075195, + "learning_rate": 2.172057270361716e-07, + "loss": 0.4498, + "step": 8229 + }, + { + "epoch": 0.9068870523415978, + "grad_norm": 10.081153869628906, + "learning_rate": 2.1669628121731068e-07, + "loss": 0.3486, + "step": 8230 + }, + { + "epoch": 0.9069972451790633, + "grad_norm": 7.819251537322998, + "learning_rate": 2.161874203073261e-07, + "loss": 0.4109, + "step": 8231 + }, + { + "epoch": 0.9071074380165289, + "grad_norm": 6.477564334869385, + "learning_rate": 2.1567914436844305e-07, + "loss": 0.3086, + "step": 8232 + }, + { + "epoch": 0.9072176308539945, + "grad_norm": 5.197267055511475, + "learning_rate": 2.1517145346281188e-07, + "loss": 0.4296, + "step": 8233 + }, + { + "epoch": 0.90732782369146, + "grad_norm": 10.985544204711914, + "learning_rate": 2.146643476525151e-07, + "loss": 0.4504, + "step": 8234 + }, + { + "epoch": 0.9074380165289256, + "grad_norm": 5.30659294128418, + "learning_rate": 2.1415782699956255e-07, + "loss": 0.3781, + "step": 8235 + }, + { + "epoch": 0.9075482093663911, + "grad_norm": 5.9461894035339355, + 
"learning_rate": 2.136518915658914e-07, + "loss": 0.3664, + "step": 8236 + }, + { + "epoch": 0.9076584022038567, + "grad_norm": 6.129619121551514, + "learning_rate": 2.1314654141336878e-07, + "loss": 0.3769, + "step": 8237 + }, + { + "epoch": 0.9077685950413223, + "grad_norm": 6.220506191253662, + "learning_rate": 2.1264177660378972e-07, + "loss": 0.4206, + "step": 8238 + }, + { + "epoch": 0.9078787878787878, + "grad_norm": 10.714601516723633, + "learning_rate": 2.121375971988765e-07, + "loss": 0.3468, + "step": 8239 + }, + { + "epoch": 0.9079889807162534, + "grad_norm": 6.325436115264893, + "learning_rate": 2.1163400326028204e-07, + "loss": 0.3695, + "step": 8240 + }, + { + "epoch": 0.908099173553719, + "grad_norm": 8.421700477600098, + "learning_rate": 2.11130994849586e-07, + "loss": 0.3944, + "step": 8241 + }, + { + "epoch": 0.9082093663911845, + "grad_norm": 5.89466667175293, + "learning_rate": 2.106285720282969e-07, + "loss": 0.3764, + "step": 8242 + }, + { + "epoch": 0.9083195592286502, + "grad_norm": 5.76004695892334, + "learning_rate": 2.1012673485785173e-07, + "loss": 0.4143, + "step": 8243 + }, + { + "epoch": 0.9084297520661156, + "grad_norm": 6.659377098083496, + "learning_rate": 2.0962548339961586e-07, + "loss": 0.3692, + "step": 8244 + }, + { + "epoch": 0.9085399449035813, + "grad_norm": 6.028273582458496, + "learning_rate": 2.09124817714883e-07, + "loss": 0.3318, + "step": 8245 + }, + { + "epoch": 0.9086501377410469, + "grad_norm": 5.327060222625732, + "learning_rate": 2.086247378648748e-07, + "loss": 0.3372, + "step": 8246 + }, + { + "epoch": 0.9087603305785124, + "grad_norm": 11.709897994995117, + "learning_rate": 2.0812524391074285e-07, + "loss": 0.4626, + "step": 8247 + }, + { + "epoch": 0.908870523415978, + "grad_norm": 7.009006023406982, + "learning_rate": 2.076263359135644e-07, + "loss": 0.3802, + "step": 8248 + }, + { + "epoch": 0.9089807162534436, + "grad_norm": 6.697656154632568, + "learning_rate": 2.071280139343479e-07, + "loss": 0.3865, + "step": 8249 + }, + { + "epoch": 0.9090909090909091, + "grad_norm": 7.680671691894531, + "learning_rate": 2.066302780340279e-07, + "loss": 0.3975, + "step": 8250 + }, + { + "epoch": 0.9092011019283747, + "grad_norm": 15.613241195678711, + "learning_rate": 2.0613312827346908e-07, + "loss": 0.3827, + "step": 8251 + }, + { + "epoch": 0.9093112947658403, + "grad_norm": 10.426063537597656, + "learning_rate": 2.0563656471346338e-07, + "loss": 0.4593, + "step": 8252 + }, + { + "epoch": 0.9094214876033058, + "grad_norm": 8.83703899383545, + "learning_rate": 2.0514058741473053e-07, + "loss": 0.4043, + "step": 8253 + }, + { + "epoch": 0.9095316804407714, + "grad_norm": 5.748766899108887, + "learning_rate": 2.046451964379198e-07, + "loss": 0.3937, + "step": 8254 + }, + { + "epoch": 0.9096418732782369, + "grad_norm": 5.267103672027588, + "learning_rate": 2.0415039184360884e-07, + "loss": 0.356, + "step": 8255 + }, + { + "epoch": 0.9097520661157025, + "grad_norm": 9.736839294433594, + "learning_rate": 2.0365617369230205e-07, + "loss": 0.4508, + "step": 8256 + }, + { + "epoch": 0.9098622589531681, + "grad_norm": 6.641547679901123, + "learning_rate": 2.0316254204443332e-07, + "loss": 0.4355, + "step": 8257 + }, + { + "epoch": 0.9099724517906336, + "grad_norm": 6.056273460388184, + "learning_rate": 2.0266949696036543e-07, + "loss": 0.3821, + "step": 8258 + }, + { + "epoch": 0.9100826446280992, + "grad_norm": 6.898715496063232, + "learning_rate": 2.021770385003874e-07, + "loss": 0.4678, + "step": 8259 + }, + { + "epoch": 0.9101928374655648, + 
"grad_norm": 4.9509172439575195, + "learning_rate": 2.0168516672471828e-07, + "loss": 0.4205, + "step": 8260 + }, + { + "epoch": 0.9103030303030303, + "grad_norm": 15.809843063354492, + "learning_rate": 2.011938816935055e-07, + "loss": 0.5226, + "step": 8261 + }, + { + "epoch": 0.9104132231404959, + "grad_norm": 4.871439456939697, + "learning_rate": 2.0070318346682272e-07, + "loss": 0.3979, + "step": 8262 + }, + { + "epoch": 0.9105234159779614, + "grad_norm": 8.149961471557617, + "learning_rate": 2.002130721046741e-07, + "loss": 0.38, + "step": 8263 + }, + { + "epoch": 0.910633608815427, + "grad_norm": 7.782983303070068, + "learning_rate": 1.997235476669912e-07, + "loss": 0.4124, + "step": 8264 + }, + { + "epoch": 0.9107438016528926, + "grad_norm": 7.894191741943359, + "learning_rate": 1.9923461021363334e-07, + "loss": 0.4377, + "step": 8265 + }, + { + "epoch": 0.9108539944903581, + "grad_norm": 6.815659523010254, + "learning_rate": 1.987462598043882e-07, + "loss": 0.4063, + "step": 8266 + }, + { + "epoch": 0.9109641873278237, + "grad_norm": 7.375637531280518, + "learning_rate": 1.9825849649897255e-07, + "loss": 0.366, + "step": 8267 + }, + { + "epoch": 0.9110743801652893, + "grad_norm": 9.88588809967041, + "learning_rate": 1.977713203570303e-07, + "loss": 0.4498, + "step": 8268 + }, + { + "epoch": 0.9111845730027548, + "grad_norm": 7.845414638519287, + "learning_rate": 1.9728473143813432e-07, + "loss": 0.3645, + "step": 8269 + }, + { + "epoch": 0.9112947658402204, + "grad_norm": 8.823968887329102, + "learning_rate": 1.9679872980178483e-07, + "loss": 0.4212, + "step": 8270 + }, + { + "epoch": 0.911404958677686, + "grad_norm": 6.635942459106445, + "learning_rate": 1.9631331550741207e-07, + "loss": 0.3881, + "step": 8271 + }, + { + "epoch": 0.9115151515151515, + "grad_norm": 5.339030742645264, + "learning_rate": 1.958284886143713e-07, + "loss": 0.3459, + "step": 8272 + }, + { + "epoch": 0.9116253443526171, + "grad_norm": 7.035585880279541, + "learning_rate": 1.9534424918194906e-07, + "loss": 0.4295, + "step": 8273 + }, + { + "epoch": 0.9117355371900826, + "grad_norm": 4.605419635772705, + "learning_rate": 1.9486059726935903e-07, + "loss": 0.3746, + "step": 8274 + }, + { + "epoch": 0.9118457300275482, + "grad_norm": 8.786975860595703, + "learning_rate": 1.9437753293574225e-07, + "loss": 0.4245, + "step": 8275 + }, + { + "epoch": 0.9119559228650138, + "grad_norm": 8.458885192871094, + "learning_rate": 1.9389505624016758e-07, + "loss": 0.4328, + "step": 8276 + }, + { + "epoch": 0.9120661157024793, + "grad_norm": 5.858531951904297, + "learning_rate": 1.9341316724163506e-07, + "loss": 0.4012, + "step": 8277 + }, + { + "epoch": 0.9121763085399449, + "grad_norm": 8.922369003295898, + "learning_rate": 1.929318659990692e-07, + "loss": 0.4947, + "step": 8278 + }, + { + "epoch": 0.9122865013774105, + "grad_norm": 5.19242000579834, + "learning_rate": 1.9245115257132351e-07, + "loss": 0.3481, + "step": 8279 + }, + { + "epoch": 0.912396694214876, + "grad_norm": 8.85990047454834, + "learning_rate": 1.9197102701718263e-07, + "loss": 0.4135, + "step": 8280 + }, + { + "epoch": 0.9125068870523416, + "grad_norm": 5.6612420082092285, + "learning_rate": 1.9149148939535568e-07, + "loss": 0.3619, + "step": 8281 + }, + { + "epoch": 0.9126170798898071, + "grad_norm": 4.2848381996154785, + "learning_rate": 1.910125397644802e-07, + "loss": 0.3566, + "step": 8282 + }, + { + "epoch": 0.9127272727272727, + "grad_norm": 11.075757026672363, + "learning_rate": 1.905341781831238e-07, + "loss": 0.4707, + "step": 8283 + }, + { 
+ "epoch": 0.9128374655647383, + "grad_norm": 10.635125160217285, + "learning_rate": 1.9005640470978137e-07, + "loss": 0.4806, + "step": 8284 + }, + { + "epoch": 0.9129476584022038, + "grad_norm": 7.729781627655029, + "learning_rate": 1.895792194028756e-07, + "loss": 0.3553, + "step": 8285 + }, + { + "epoch": 0.9130578512396694, + "grad_norm": 5.757815361022949, + "learning_rate": 1.8910262232075706e-07, + "loss": 0.4058, + "step": 8286 + }, + { + "epoch": 0.913168044077135, + "grad_norm": 9.050127983093262, + "learning_rate": 1.8862661352170465e-07, + "loss": 0.3369, + "step": 8287 + }, + { + "epoch": 0.9132782369146005, + "grad_norm": 16.06497573852539, + "learning_rate": 1.8815119306392625e-07, + "loss": 0.4096, + "step": 8288 + }, + { + "epoch": 0.9133884297520661, + "grad_norm": 5.088180065155029, + "learning_rate": 1.8767636100555543e-07, + "loss": 0.3421, + "step": 8289 + }, + { + "epoch": 0.9134986225895316, + "grad_norm": 7.645425796508789, + "learning_rate": 1.872021174046562e-07, + "loss": 0.4557, + "step": 8290 + }, + { + "epoch": 0.9136088154269972, + "grad_norm": 11.184514045715332, + "learning_rate": 1.8672846231922005e-07, + "loss": 0.4466, + "step": 8291 + }, + { + "epoch": 0.9137190082644628, + "grad_norm": 6.974379539489746, + "learning_rate": 1.86255395807165e-07, + "loss": 0.4206, + "step": 8292 + }, + { + "epoch": 0.9138292011019283, + "grad_norm": 8.65231990814209, + "learning_rate": 1.857829179263393e-07, + "loss": 0.4017, + "step": 8293 + }, + { + "epoch": 0.9139393939393939, + "grad_norm": 6.22916841506958, + "learning_rate": 1.8531102873451834e-07, + "loss": 0.4092, + "step": 8294 + }, + { + "epoch": 0.9140495867768595, + "grad_norm": 7.040229797363281, + "learning_rate": 1.8483972828940434e-07, + "loss": 0.3962, + "step": 8295 + }, + { + "epoch": 0.914159779614325, + "grad_norm": 14.772039413452148, + "learning_rate": 1.843690166486295e-07, + "loss": 0.5156, + "step": 8296 + }, + { + "epoch": 0.9142699724517906, + "grad_norm": 6.841568470001221, + "learning_rate": 1.8389889386975279e-07, + "loss": 0.4296, + "step": 8297 + }, + { + "epoch": 0.9143801652892563, + "grad_norm": 4.574252128601074, + "learning_rate": 1.8342936001026101e-07, + "loss": 0.3238, + "step": 8298 + }, + { + "epoch": 0.9144903581267217, + "grad_norm": 12.451244354248047, + "learning_rate": 1.8296041512756934e-07, + "loss": 0.4644, + "step": 8299 + }, + { + "epoch": 0.9146005509641874, + "grad_norm": 5.818710803985596, + "learning_rate": 1.8249205927902247e-07, + "loss": 0.3551, + "step": 8300 + }, + { + "epoch": 0.9147107438016528, + "grad_norm": 5.565225124359131, + "learning_rate": 1.8202429252188957e-07, + "loss": 0.3945, + "step": 8301 + }, + { + "epoch": 0.9148209366391185, + "grad_norm": 8.565671920776367, + "learning_rate": 1.81557114913371e-07, + "loss": 0.4937, + "step": 8302 + }, + { + "epoch": 0.9149311294765841, + "grad_norm": 6.482603549957275, + "learning_rate": 1.8109052651059444e-07, + "loss": 0.4699, + "step": 8303 + }, + { + "epoch": 0.9150413223140496, + "grad_norm": 5.407090663909912, + "learning_rate": 1.806245273706131e-07, + "loss": 0.4133, + "step": 8304 + }, + { + "epoch": 0.9151515151515152, + "grad_norm": 10.727090835571289, + "learning_rate": 1.8015911755041137e-07, + "loss": 0.4985, + "step": 8305 + }, + { + "epoch": 0.9152617079889808, + "grad_norm": 7.957470417022705, + "learning_rate": 1.7969429710689989e-07, + "loss": 0.3101, + "step": 8306 + }, + { + "epoch": 0.9153719008264463, + "grad_norm": 7.423542022705078, + "learning_rate": 1.7923006609691761e-07, + 
"loss": 0.3959, + "step": 8307 + }, + { + "epoch": 0.9154820936639119, + "grad_norm": 5.17576265335083, + "learning_rate": 1.7876642457723136e-07, + "loss": 0.3963, + "step": 8308 + }, + { + "epoch": 0.9155922865013774, + "grad_norm": 8.73119831085205, + "learning_rate": 1.7830337260453523e-07, + "loss": 0.3828, + "step": 8309 + }, + { + "epoch": 0.915702479338843, + "grad_norm": 5.12293815612793, + "learning_rate": 1.778409102354528e-07, + "loss": 0.3701, + "step": 8310 + }, + { + "epoch": 0.9158126721763086, + "grad_norm": 8.923873901367188, + "learning_rate": 1.7737903752653386e-07, + "loss": 0.4444, + "step": 8311 + }, + { + "epoch": 0.9159228650137741, + "grad_norm": 24.163288116455078, + "learning_rate": 1.7691775453425653e-07, + "loss": 0.423, + "step": 8312 + }, + { + "epoch": 0.9160330578512397, + "grad_norm": 4.271527290344238, + "learning_rate": 1.7645706131502904e-07, + "loss": 0.3025, + "step": 8313 + }, + { + "epoch": 0.9161432506887053, + "grad_norm": 8.020672798156738, + "learning_rate": 1.7599695792518356e-07, + "loss": 0.4385, + "step": 8314 + }, + { + "epoch": 0.9162534435261708, + "grad_norm": 4.3559393882751465, + "learning_rate": 1.7553744442098285e-07, + "loss": 0.3572, + "step": 8315 + }, + { + "epoch": 0.9163636363636364, + "grad_norm": 6.599991321563721, + "learning_rate": 1.7507852085861642e-07, + "loss": 0.4171, + "step": 8316 + }, + { + "epoch": 0.9164738292011019, + "grad_norm": 5.609654903411865, + "learning_rate": 1.7462018729420326e-07, + "loss": 0.3031, + "step": 8317 + }, + { + "epoch": 0.9165840220385675, + "grad_norm": 8.524760246276855, + "learning_rate": 1.7416244378378745e-07, + "loss": 0.4674, + "step": 8318 + }, + { + "epoch": 0.9166942148760331, + "grad_norm": 7.917247772216797, + "learning_rate": 1.737052903833436e-07, + "loss": 0.3783, + "step": 8319 + }, + { + "epoch": 0.9168044077134986, + "grad_norm": 5.46023416519165, + "learning_rate": 1.7324872714877317e-07, + "loss": 0.2716, + "step": 8320 + }, + { + "epoch": 0.9169146005509642, + "grad_norm": 7.115887641906738, + "learning_rate": 1.7279275413590425e-07, + "loss": 0.4426, + "step": 8321 + }, + { + "epoch": 0.9170247933884298, + "grad_norm": 5.3829874992370605, + "learning_rate": 1.7233737140049445e-07, + "loss": 0.3992, + "step": 8322 + }, + { + "epoch": 0.9171349862258953, + "grad_norm": 7.402966022491455, + "learning_rate": 1.7188257899822868e-07, + "loss": 0.4134, + "step": 8323 + }, + { + "epoch": 0.9172451790633609, + "grad_norm": 5.992671489715576, + "learning_rate": 1.714283769847197e-07, + "loss": 0.3665, + "step": 8324 + }, + { + "epoch": 0.9173553719008265, + "grad_norm": 8.363330841064453, + "learning_rate": 1.7097476541550751e-07, + "loss": 0.4608, + "step": 8325 + }, + { + "epoch": 0.917465564738292, + "grad_norm": 13.325737953186035, + "learning_rate": 1.705217443460605e-07, + "loss": 0.4234, + "step": 8326 + }, + { + "epoch": 0.9175757575757576, + "grad_norm": 5.575732707977295, + "learning_rate": 1.7006931383177548e-07, + "loss": 0.3845, + "step": 8327 + }, + { + "epoch": 0.9176859504132231, + "grad_norm": 7.288289546966553, + "learning_rate": 1.6961747392797488e-07, + "loss": 0.4315, + "step": 8328 + }, + { + "epoch": 0.9177961432506887, + "grad_norm": 6.849255084991455, + "learning_rate": 1.6916622468991118e-07, + "loss": 0.3797, + "step": 8329 + }, + { + "epoch": 0.9179063360881543, + "grad_norm": 8.203755378723145, + "learning_rate": 1.6871556617276407e-07, + "loss": 0.3878, + "step": 8330 + }, + { + "epoch": 0.9180165289256198, + "grad_norm": 7.336549282073975, + 
"learning_rate": 1.682654984316401e-07, + "loss": 0.3361, + "step": 8331 + }, + { + "epoch": 0.9181267217630854, + "grad_norm": 4.305370330810547, + "learning_rate": 1.678160215215735e-07, + "loss": 0.3609, + "step": 8332 + }, + { + "epoch": 0.918236914600551, + "grad_norm": 6.776370048522949, + "learning_rate": 1.6736713549752815e-07, + "loss": 0.4287, + "step": 8333 + }, + { + "epoch": 0.9183471074380165, + "grad_norm": 4.999423980712891, + "learning_rate": 1.6691884041439455e-07, + "loss": 0.4315, + "step": 8334 + }, + { + "epoch": 0.9184573002754821, + "grad_norm": 20.000083923339844, + "learning_rate": 1.6647113632698886e-07, + "loss": 0.458, + "step": 8335 + }, + { + "epoch": 0.9185674931129476, + "grad_norm": 6.919802188873291, + "learning_rate": 1.660240232900595e-07, + "loss": 0.4631, + "step": 8336 + }, + { + "epoch": 0.9186776859504132, + "grad_norm": 11.445610046386719, + "learning_rate": 1.6557750135827833e-07, + "loss": 0.4419, + "step": 8337 + }, + { + "epoch": 0.9187878787878788, + "grad_norm": 12.312527656555176, + "learning_rate": 1.6513157058624662e-07, + "loss": 0.439, + "step": 8338 + }, + { + "epoch": 0.9188980716253443, + "grad_norm": 6.393393516540527, + "learning_rate": 1.6468623102849523e-07, + "loss": 0.41, + "step": 8339 + }, + { + "epoch": 0.9190082644628099, + "grad_norm": 6.301146984100342, + "learning_rate": 1.6424148273947892e-07, + "loss": 0.4067, + "step": 8340 + }, + { + "epoch": 0.9191184573002755, + "grad_norm": 16.659996032714844, + "learning_rate": 1.6379732577358366e-07, + "loss": 0.439, + "step": 8341 + }, + { + "epoch": 0.919228650137741, + "grad_norm": 4.91457462310791, + "learning_rate": 1.6335376018511984e-07, + "loss": 0.3293, + "step": 8342 + }, + { + "epoch": 0.9193388429752066, + "grad_norm": 7.265914440155029, + "learning_rate": 1.62910786028328e-07, + "loss": 0.4245, + "step": 8343 + }, + { + "epoch": 0.9194490358126721, + "grad_norm": 8.63418197631836, + "learning_rate": 1.6246840335737646e-07, + "loss": 0.433, + "step": 8344 + }, + { + "epoch": 0.9195592286501377, + "grad_norm": 6.614242076873779, + "learning_rate": 1.6202661222635917e-07, + "loss": 0.3203, + "step": 8345 + }, + { + "epoch": 0.9196694214876033, + "grad_norm": 8.162196159362793, + "learning_rate": 1.6158541268929962e-07, + "loss": 0.4583, + "step": 8346 + }, + { + "epoch": 0.9197796143250688, + "grad_norm": 7.244357109069824, + "learning_rate": 1.6114480480014905e-07, + "loss": 0.3071, + "step": 8347 + }, + { + "epoch": 0.9198898071625344, + "grad_norm": 7.062908172607422, + "learning_rate": 1.6070478861278327e-07, + "loss": 0.4074, + "step": 8348 + }, + { + "epoch": 0.92, + "grad_norm": 9.556879997253418, + "learning_rate": 1.6026536418101034e-07, + "loss": 0.5522, + "step": 8349 + }, + { + "epoch": 0.9201101928374655, + "grad_norm": 6.477383613586426, + "learning_rate": 1.5982653155856287e-07, + "loss": 0.3977, + "step": 8350 + }, + { + "epoch": 0.9202203856749311, + "grad_norm": 4.523920059204102, + "learning_rate": 1.5938829079910122e-07, + "loss": 0.3486, + "step": 8351 + }, + { + "epoch": 0.9203305785123967, + "grad_norm": 4.436920166015625, + "learning_rate": 1.5895064195621478e-07, + "loss": 0.405, + "step": 8352 + }, + { + "epoch": 0.9204407713498622, + "grad_norm": 7.761414051055908, + "learning_rate": 1.5851358508342074e-07, + "loss": 0.4385, + "step": 8353 + }, + { + "epoch": 0.9205509641873278, + "grad_norm": 8.559017181396484, + "learning_rate": 1.5807712023416078e-07, + "loss": 0.4268, + "step": 8354 + }, + { + "epoch": 0.9206611570247933, + "grad_norm": 
6.970315933227539, + "learning_rate": 1.5764124746180832e-07, + "loss": 0.3736, + "step": 8355 + }, + { + "epoch": 0.920771349862259, + "grad_norm": 6.862329006195068, + "learning_rate": 1.572059668196618e-07, + "loss": 0.3573, + "step": 8356 + }, + { + "epoch": 0.9208815426997246, + "grad_norm": 6.494803428649902, + "learning_rate": 1.5677127836094763e-07, + "loss": 0.4031, + "step": 8357 + }, + { + "epoch": 0.92099173553719, + "grad_norm": 8.824837684631348, + "learning_rate": 1.5633718213882097e-07, + "loss": 0.4843, + "step": 8358 + }, + { + "epoch": 0.9211019283746557, + "grad_norm": 5.581061840057373, + "learning_rate": 1.5590367820636276e-07, + "loss": 0.4371, + "step": 8359 + }, + { + "epoch": 0.9212121212121213, + "grad_norm": 8.021149635314941, + "learning_rate": 1.5547076661658279e-07, + "loss": 0.4531, + "step": 8360 + }, + { + "epoch": 0.9213223140495868, + "grad_norm": 4.6611127853393555, + "learning_rate": 1.5503844742241813e-07, + "loss": 0.3755, + "step": 8361 + }, + { + "epoch": 0.9214325068870524, + "grad_norm": 6.004344463348389, + "learning_rate": 1.5460672067673376e-07, + "loss": 0.3695, + "step": 8362 + }, + { + "epoch": 0.9215426997245179, + "grad_norm": 6.207559585571289, + "learning_rate": 1.5417558643232077e-07, + "loss": 0.3423, + "step": 8363 + }, + { + "epoch": 0.9216528925619835, + "grad_norm": 7.601280212402344, + "learning_rate": 1.5374504474190033e-07, + "loss": 0.4347, + "step": 8364 + }, + { + "epoch": 0.9217630853994491, + "grad_norm": 4.724337577819824, + "learning_rate": 1.533150956581181e-07, + "loss": 0.3406, + "step": 8365 + }, + { + "epoch": 0.9218732782369146, + "grad_norm": 8.186506271362305, + "learning_rate": 1.5288573923354976e-07, + "loss": 0.432, + "step": 8366 + }, + { + "epoch": 0.9219834710743802, + "grad_norm": 7.820041656494141, + "learning_rate": 1.5245697552069782e-07, + "loss": 0.3995, + "step": 8367 + }, + { + "epoch": 0.9220936639118458, + "grad_norm": 4.091879844665527, + "learning_rate": 1.520288045719903e-07, + "loss": 0.3506, + "step": 8368 + }, + { + "epoch": 0.9222038567493113, + "grad_norm": 4.473921775817871, + "learning_rate": 1.5160122643978703e-07, + "loss": 0.3674, + "step": 8369 + }, + { + "epoch": 0.9223140495867769, + "grad_norm": 6.545633792877197, + "learning_rate": 1.511742411763717e-07, + "loss": 0.2819, + "step": 8370 + }, + { + "epoch": 0.9224242424242424, + "grad_norm": 11.863525390625, + "learning_rate": 1.5074784883395587e-07, + "loss": 0.4053, + "step": 8371 + }, + { + "epoch": 0.922534435261708, + "grad_norm": 4.6853814125061035, + "learning_rate": 1.503220494646812e-07, + "loss": 0.3621, + "step": 8372 + }, + { + "epoch": 0.9226446280991736, + "grad_norm": 8.450872421264648, + "learning_rate": 1.498968431206138e-07, + "loss": 0.3981, + "step": 8373 + }, + { + "epoch": 0.9227548209366391, + "grad_norm": 5.5612359046936035, + "learning_rate": 1.494722298537482e-07, + "loss": 0.3576, + "step": 8374 + }, + { + "epoch": 0.9228650137741047, + "grad_norm": 10.44948959350586, + "learning_rate": 1.4904820971600676e-07, + "loss": 0.4276, + "step": 8375 + }, + { + "epoch": 0.9229752066115703, + "grad_norm": 4.357065200805664, + "learning_rate": 1.486247827592402e-07, + "loss": 0.3567, + "step": 8376 + }, + { + "epoch": 0.9230853994490358, + "grad_norm": 6.663888931274414, + "learning_rate": 1.482019490352249e-07, + "loss": 0.3808, + "step": 8377 + }, + { + "epoch": 0.9231955922865014, + "grad_norm": 6.435294151306152, + "learning_rate": 1.477797085956656e-07, + "loss": 0.3919, + "step": 8378 + }, + { + "epoch": 
0.923305785123967, + "grad_norm": 5.487656593322754, + "learning_rate": 1.4735806149219544e-07, + "loss": 0.386, + "step": 8379 + }, + { + "epoch": 0.9234159779614325, + "grad_norm": 8.581289291381836, + "learning_rate": 1.4693700777637265e-07, + "loss": 0.4092, + "step": 8380 + }, + { + "epoch": 0.9235261707988981, + "grad_norm": 8.062307357788086, + "learning_rate": 1.4651654749968436e-07, + "loss": 0.4001, + "step": 8381 + }, + { + "epoch": 0.9236363636363636, + "grad_norm": 4.797517776489258, + "learning_rate": 1.460966807135461e-07, + "loss": 0.3722, + "step": 8382 + }, + { + "epoch": 0.9237465564738292, + "grad_norm": 6.919247150421143, + "learning_rate": 1.4567740746929904e-07, + "loss": 0.3781, + "step": 8383 + }, + { + "epoch": 0.9238567493112948, + "grad_norm": 7.119813919067383, + "learning_rate": 1.4525872781821215e-07, + "loss": 0.3846, + "step": 8384 + }, + { + "epoch": 0.9239669421487603, + "grad_norm": 6.6954498291015625, + "learning_rate": 1.4484064181148283e-07, + "loss": 0.3339, + "step": 8385 + }, + { + "epoch": 0.9240771349862259, + "grad_norm": 6.583959579467773, + "learning_rate": 1.4442314950023517e-07, + "loss": 0.3822, + "step": 8386 + }, + { + "epoch": 0.9241873278236915, + "grad_norm": 6.830016136169434, + "learning_rate": 1.4400625093552e-07, + "loss": 0.4408, + "step": 8387 + }, + { + "epoch": 0.924297520661157, + "grad_norm": 6.698701858520508, + "learning_rate": 1.4358994616831656e-07, + "loss": 0.3444, + "step": 8388 + }, + { + "epoch": 0.9244077134986226, + "grad_norm": 4.193092346191406, + "learning_rate": 1.431742352495319e-07, + "loss": 0.4458, + "step": 8389 + }, + { + "epoch": 0.9245179063360881, + "grad_norm": 7.711377143859863, + "learning_rate": 1.4275911822999922e-07, + "loss": 0.4317, + "step": 8390 + }, + { + "epoch": 0.9246280991735537, + "grad_norm": 7.1049370765686035, + "learning_rate": 1.423445951604785e-07, + "loss": 0.3267, + "step": 8391 + }, + { + "epoch": 0.9247382920110193, + "grad_norm": 6.219496250152588, + "learning_rate": 1.4193066609165972e-07, + "loss": 0.3452, + "step": 8392 + }, + { + "epoch": 0.9248484848484848, + "grad_norm": 4.745777130126953, + "learning_rate": 1.4151733107415855e-07, + "loss": 0.3863, + "step": 8393 + }, + { + "epoch": 0.9249586776859504, + "grad_norm": 6.9581990242004395, + "learning_rate": 1.4110459015851675e-07, + "loss": 0.4266, + "step": 8394 + }, + { + "epoch": 0.925068870523416, + "grad_norm": 6.962114334106445, + "learning_rate": 1.4069244339520672e-07, + "loss": 0.4283, + "step": 8395 + }, + { + "epoch": 0.9251790633608815, + "grad_norm": 7.494615077972412, + "learning_rate": 1.4028089083462482e-07, + "loss": 0.3962, + "step": 8396 + }, + { + "epoch": 0.9252892561983471, + "grad_norm": 7.1831560134887695, + "learning_rate": 1.3986993252709747e-07, + "loss": 0.4074, + "step": 8397 + }, + { + "epoch": 0.9253994490358127, + "grad_norm": 5.726095676422119, + "learning_rate": 1.394595685228761e-07, + "loss": 0.3096, + "step": 8398 + }, + { + "epoch": 0.9255096418732782, + "grad_norm": 6.874561309814453, + "learning_rate": 1.3904979887214064e-07, + "loss": 0.3626, + "step": 8399 + }, + { + "epoch": 0.9256198347107438, + "grad_norm": 6.559597015380859, + "learning_rate": 1.3864062362499987e-07, + "loss": 0.3821, + "step": 8400 + }, + { + "epoch": 0.9257300275482093, + "grad_norm": 6.934940814971924, + "learning_rate": 1.3823204283148651e-07, + "loss": 0.3554, + "step": 8401 + }, + { + "epoch": 0.9258402203856749, + "grad_norm": 9.219505310058594, + "learning_rate": 1.3782405654156284e-07, + "loss": 
0.4705, + "step": 8402 + }, + { + "epoch": 0.9259504132231405, + "grad_norm": 7.8724870681762695, + "learning_rate": 1.3741666480511894e-07, + "loss": 0.392, + "step": 8403 + }, + { + "epoch": 0.926060606060606, + "grad_norm": 4.987101078033447, + "learning_rate": 1.3700986767197e-07, + "loss": 0.4414, + "step": 8404 + }, + { + "epoch": 0.9261707988980716, + "grad_norm": 7.214572429656982, + "learning_rate": 1.3660366519185953e-07, + "loss": 0.3922, + "step": 8405 + }, + { + "epoch": 0.9262809917355372, + "grad_norm": 6.101369380950928, + "learning_rate": 1.3619805741446e-07, + "loss": 0.3524, + "step": 8406 + }, + { + "epoch": 0.9263911845730027, + "grad_norm": 8.496451377868652, + "learning_rate": 1.3579304438936848e-07, + "loss": 0.4005, + "step": 8407 + }, + { + "epoch": 0.9265013774104683, + "grad_norm": 6.085446834564209, + "learning_rate": 1.3538862616611083e-07, + "loss": 0.3692, + "step": 8408 + }, + { + "epoch": 0.9266115702479338, + "grad_norm": 7.8222150802612305, + "learning_rate": 1.3498480279414028e-07, + "loss": 0.4491, + "step": 8409 + }, + { + "epoch": 0.9267217630853994, + "grad_norm": 12.749969482421875, + "learning_rate": 1.3458157432283626e-07, + "loss": 0.4216, + "step": 8410 + }, + { + "epoch": 0.926831955922865, + "grad_norm": 8.5120849609375, + "learning_rate": 1.3417894080150595e-07, + "loss": 0.4171, + "step": 8411 + }, + { + "epoch": 0.9269421487603305, + "grad_norm": 7.881687164306641, + "learning_rate": 1.33776902279385e-07, + "loss": 0.3602, + "step": 8412 + }, + { + "epoch": 0.9270523415977961, + "grad_norm": 14.683112144470215, + "learning_rate": 1.3337545880563462e-07, + "loss": 0.4831, + "step": 8413 + }, + { + "epoch": 0.9271625344352618, + "grad_norm": 6.600551128387451, + "learning_rate": 1.329746104293428e-07, + "loss": 0.394, + "step": 8414 + }, + { + "epoch": 0.9272727272727272, + "grad_norm": 11.075981140136719, + "learning_rate": 1.3257435719952804e-07, + "loss": 0.3995, + "step": 8415 + }, + { + "epoch": 0.9273829201101929, + "grad_norm": 6.9011664390563965, + "learning_rate": 1.3217469916513182e-07, + "loss": 0.4076, + "step": 8416 + }, + { + "epoch": 0.9274931129476583, + "grad_norm": 7.484837055206299, + "learning_rate": 1.3177563637502612e-07, + "loss": 0.43, + "step": 8417 + }, + { + "epoch": 0.927603305785124, + "grad_norm": 10.550387382507324, + "learning_rate": 1.3137716887800854e-07, + "loss": 0.4981, + "step": 8418 + }, + { + "epoch": 0.9277134986225896, + "grad_norm": 7.676487445831299, + "learning_rate": 1.309792967228035e-07, + "loss": 0.3883, + "step": 8419 + }, + { + "epoch": 0.9278236914600551, + "grad_norm": 6.712424278259277, + "learning_rate": 1.305820199580643e-07, + "loss": 0.4221, + "step": 8420 + }, + { + "epoch": 0.9279338842975207, + "grad_norm": 4.638515472412109, + "learning_rate": 1.3018533863237037e-07, + "loss": 0.4026, + "step": 8421 + }, + { + "epoch": 0.9280440771349863, + "grad_norm": 7.031803131103516, + "learning_rate": 1.2978925279422795e-07, + "loss": 0.3845, + "step": 8422 + }, + { + "epoch": 0.9281542699724518, + "grad_norm": 3.9012720584869385, + "learning_rate": 1.2939376249207157e-07, + "loss": 0.2964, + "step": 8423 + }, + { + "epoch": 0.9282644628099174, + "grad_norm": 6.953731536865234, + "learning_rate": 1.2899886777426096e-07, + "loss": 0.3968, + "step": 8424 + }, + { + "epoch": 0.928374655647383, + "grad_norm": 6.745138645172119, + "learning_rate": 1.2860456868908632e-07, + "loss": 0.4183, + "step": 8425 + }, + { + "epoch": 0.9284848484848485, + "grad_norm": 8.751028060913086, + "learning_rate": 
1.2821086528476244e-07, + "loss": 0.4627, + "step": 8426 + }, + { + "epoch": 0.9285950413223141, + "grad_norm": 9.254036903381348, + "learning_rate": 1.2781775760943026e-07, + "loss": 0.3513, + "step": 8427 + }, + { + "epoch": 0.9287052341597796, + "grad_norm": 6.881036758422852, + "learning_rate": 1.2742524571116244e-07, + "loss": 0.4159, + "step": 8428 + }, + { + "epoch": 0.9288154269972452, + "grad_norm": 8.426811218261719, + "learning_rate": 1.270333296379539e-07, + "loss": 0.4034, + "step": 8429 + }, + { + "epoch": 0.9289256198347108, + "grad_norm": 14.932004928588867, + "learning_rate": 1.2664200943772853e-07, + "loss": 0.442, + "step": 8430 + }, + { + "epoch": 0.9290358126721763, + "grad_norm": 5.104497909545898, + "learning_rate": 1.2625128515833863e-07, + "loss": 0.326, + "step": 8431 + }, + { + "epoch": 0.9291460055096419, + "grad_norm": 4.100868225097656, + "learning_rate": 1.2586115684756205e-07, + "loss": 0.3718, + "step": 8432 + }, + { + "epoch": 0.9292561983471075, + "grad_norm": 5.32192850112915, + "learning_rate": 1.2547162455310347e-07, + "loss": 0.3859, + "step": 8433 + }, + { + "epoch": 0.929366391184573, + "grad_norm": 5.072726249694824, + "learning_rate": 1.2508268832259585e-07, + "loss": 0.449, + "step": 8434 + }, + { + "epoch": 0.9294765840220386, + "grad_norm": 8.323136329650879, + "learning_rate": 1.2469434820360005e-07, + "loss": 0.4656, + "step": 8435 + }, + { + "epoch": 0.9295867768595041, + "grad_norm": 5.533694744110107, + "learning_rate": 1.2430660424360085e-07, + "loss": 0.3922, + "step": 8436 + }, + { + "epoch": 0.9296969696969697, + "grad_norm": 6.415494918823242, + "learning_rate": 1.2391945649001314e-07, + "loss": 0.3965, + "step": 8437 + }, + { + "epoch": 0.9298071625344353, + "grad_norm": 4.350618362426758, + "learning_rate": 1.2353290499017788e-07, + "loss": 0.3766, + "step": 8438 + }, + { + "epoch": 0.9299173553719008, + "grad_norm": 6.249704360961914, + "learning_rate": 1.23146949791364e-07, + "loss": 0.3584, + "step": 8439 + }, + { + "epoch": 0.9300275482093664, + "grad_norm": 9.35655403137207, + "learning_rate": 1.2276159094076479e-07, + "loss": 0.3968, + "step": 8440 + }, + { + "epoch": 0.930137741046832, + "grad_norm": 8.002750396728516, + "learning_rate": 1.2237682848550313e-07, + "loss": 0.3365, + "step": 8441 + }, + { + "epoch": 0.9302479338842975, + "grad_norm": 5.87313985824585, + "learning_rate": 1.219926624726292e-07, + "loss": 0.3905, + "step": 8442 + }, + { + "epoch": 0.9303581267217631, + "grad_norm": 7.536591053009033, + "learning_rate": 1.216090929491176e-07, + "loss": 0.3819, + "step": 8443 + }, + { + "epoch": 0.9304683195592286, + "grad_norm": 4.820857524871826, + "learning_rate": 1.212261199618736e-07, + "loss": 0.3978, + "step": 8444 + }, + { + "epoch": 0.9305785123966942, + "grad_norm": 7.339576721191406, + "learning_rate": 1.2084374355772642e-07, + "loss": 0.3865, + "step": 8445 + }, + { + "epoch": 0.9306887052341598, + "grad_norm": 9.985773086547852, + "learning_rate": 1.2046196378343423e-07, + "loss": 0.4125, + "step": 8446 + }, + { + "epoch": 0.9307988980716253, + "grad_norm": 8.914613723754883, + "learning_rate": 1.2008078068568074e-07, + "loss": 0.4161, + "step": 8447 + }, + { + "epoch": 0.9309090909090909, + "grad_norm": 9.997947692871094, + "learning_rate": 1.1970019431107926e-07, + "loss": 0.4691, + "step": 8448 + }, + { + "epoch": 0.9310192837465565, + "grad_norm": 7.667982578277588, + "learning_rate": 1.1932020470616646e-07, + "loss": 0.4159, + "step": 8449 + }, + { + "epoch": 0.931129476584022, + "grad_norm": 
5.538841247558594, + "learning_rate": 1.1894081191740848e-07, + "loss": 0.3165, + "step": 8450 + }, + { + "epoch": 0.9312396694214876, + "grad_norm": 7.691782474517822, + "learning_rate": 1.1856201599119876e-07, + "loss": 0.3869, + "step": 8451 + }, + { + "epoch": 0.9313498622589532, + "grad_norm": 5.27807092666626, + "learning_rate": 1.1818381697385639e-07, + "loss": 0.4573, + "step": 8452 + }, + { + "epoch": 0.9314600550964187, + "grad_norm": 8.917466163635254, + "learning_rate": 1.1780621491162825e-07, + "loss": 0.3827, + "step": 8453 + }, + { + "epoch": 0.9315702479338843, + "grad_norm": 8.276079177856445, + "learning_rate": 1.1742920985068795e-07, + "loss": 0.2961, + "step": 8454 + }, + { + "epoch": 0.9316804407713498, + "grad_norm": 7.216142654418945, + "learning_rate": 1.1705280183713641e-07, + "loss": 0.4257, + "step": 8455 + }, + { + "epoch": 0.9317906336088154, + "grad_norm": 5.993567943572998, + "learning_rate": 1.1667699091700068e-07, + "loss": 0.3158, + "step": 8456 + }, + { + "epoch": 0.931900826446281, + "grad_norm": 13.740479469299316, + "learning_rate": 1.1630177713623625e-07, + "loss": 0.5898, + "step": 8457 + }, + { + "epoch": 0.9320110192837465, + "grad_norm": 8.478474617004395, + "learning_rate": 1.1592716054072361e-07, + "loss": 0.427, + "step": 8458 + }, + { + "epoch": 0.9321212121212121, + "grad_norm": 10.227319717407227, + "learning_rate": 1.1555314117627336e-07, + "loss": 0.355, + "step": 8459 + }, + { + "epoch": 0.9322314049586777, + "grad_norm": 5.869755268096924, + "learning_rate": 1.1517971908861892e-07, + "loss": 0.3748, + "step": 8460 + }, + { + "epoch": 0.9323415977961432, + "grad_norm": 5.438831329345703, + "learning_rate": 1.1480689432342373e-07, + "loss": 0.4089, + "step": 8461 + }, + { + "epoch": 0.9324517906336088, + "grad_norm": 6.387732028961182, + "learning_rate": 1.1443466692627803e-07, + "loss": 0.3713, + "step": 8462 + }, + { + "epoch": 0.9325619834710743, + "grad_norm": 5.7738728523254395, + "learning_rate": 1.1406303694269705e-07, + "loss": 0.3228, + "step": 8463 + }, + { + "epoch": 0.9326721763085399, + "grad_norm": 5.7311296463012695, + "learning_rate": 1.13692004418125e-07, + "loss": 0.3703, + "step": 8464 + }, + { + "epoch": 0.9327823691460055, + "grad_norm": 21.89881134033203, + "learning_rate": 1.1332156939793282e-07, + "loss": 0.4693, + "step": 8465 + }, + { + "epoch": 0.932892561983471, + "grad_norm": 12.989684104919434, + "learning_rate": 1.1295173192741593e-07, + "loss": 0.4476, + "step": 8466 + }, + { + "epoch": 0.9330027548209366, + "grad_norm": 7.674446105957031, + "learning_rate": 1.1258249205179983e-07, + "loss": 0.3345, + "step": 8467 + }, + { + "epoch": 0.9331129476584022, + "grad_norm": 5.347735404968262, + "learning_rate": 1.1221384981623618e-07, + "loss": 0.3376, + "step": 8468 + }, + { + "epoch": 0.9332231404958677, + "grad_norm": 4.392724514007568, + "learning_rate": 1.118458052658017e-07, + "loss": 0.349, + "step": 8469 + }, + { + "epoch": 0.9333333333333333, + "grad_norm": 6.3683857917785645, + "learning_rate": 1.1147835844550204e-07, + "loss": 0.3807, + "step": 8470 + }, + { + "epoch": 0.9334435261707988, + "grad_norm": 12.318037033081055, + "learning_rate": 1.1111150940027016e-07, + "loss": 0.4266, + "step": 8471 + }, + { + "epoch": 0.9335537190082644, + "grad_norm": 9.726642608642578, + "learning_rate": 1.107452581749624e-07, + "loss": 0.3922, + "step": 8472 + }, + { + "epoch": 0.93366391184573, + "grad_norm": 7.582922458648682, + "learning_rate": 1.1037960481436682e-07, + "loss": 0.389, + "step": 8473 + }, + { + 
"epoch": 0.9337741046831955, + "grad_norm": 7.151773929595947, + "learning_rate": 1.1001454936319489e-07, + "loss": 0.4179, + "step": 8474 + }, + { + "epoch": 0.9338842975206612, + "grad_norm": 6.1926774978637695, + "learning_rate": 1.0965009186608589e-07, + "loss": 0.404, + "step": 8475 + }, + { + "epoch": 0.9339944903581268, + "grad_norm": 6.94793701171875, + "learning_rate": 1.0928623236760694e-07, + "loss": 0.3677, + "step": 8476 + }, + { + "epoch": 0.9341046831955923, + "grad_norm": 8.575023651123047, + "learning_rate": 1.0892297091225079e-07, + "loss": 0.4431, + "step": 8477 + }, + { + "epoch": 0.9342148760330579, + "grad_norm": 5.1859130859375, + "learning_rate": 1.0856030754443747e-07, + "loss": 0.3697, + "step": 8478 + }, + { + "epoch": 0.9343250688705235, + "grad_norm": 11.38362979888916, + "learning_rate": 1.0819824230851373e-07, + "loss": 0.4817, + "step": 8479 + }, + { + "epoch": 0.934435261707989, + "grad_norm": 6.110439300537109, + "learning_rate": 1.0783677524875413e-07, + "loss": 0.3407, + "step": 8480 + }, + { + "epoch": 0.9345454545454546, + "grad_norm": 4.999442100524902, + "learning_rate": 1.0747590640935945e-07, + "loss": 0.387, + "step": 8481 + }, + { + "epoch": 0.9346556473829201, + "grad_norm": 7.006037712097168, + "learning_rate": 1.0711563583445717e-07, + "loss": 0.3847, + "step": 8482 + }, + { + "epoch": 0.9347658402203857, + "grad_norm": 9.513243675231934, + "learning_rate": 1.067559635680998e-07, + "loss": 0.4652, + "step": 8483 + }, + { + "epoch": 0.9348760330578513, + "grad_norm": 4.773813247680664, + "learning_rate": 1.0639688965427108e-07, + "loss": 0.3606, + "step": 8484 + }, + { + "epoch": 0.9349862258953168, + "grad_norm": 7.91803503036499, + "learning_rate": 1.060384141368781e-07, + "loss": 0.387, + "step": 8485 + }, + { + "epoch": 0.9350964187327824, + "grad_norm": 6.70215368270874, + "learning_rate": 1.0568053705975467e-07, + "loss": 0.4408, + "step": 8486 + }, + { + "epoch": 0.935206611570248, + "grad_norm": 4.688905239105225, + "learning_rate": 1.0532325846666414e-07, + "loss": 0.3805, + "step": 8487 + }, + { + "epoch": 0.9353168044077135, + "grad_norm": 9.817471504211426, + "learning_rate": 1.0496657840129432e-07, + "loss": 0.4929, + "step": 8488 + }, + { + "epoch": 0.9354269972451791, + "grad_norm": 6.609936237335205, + "learning_rate": 1.0461049690726033e-07, + "loss": 0.3759, + "step": 8489 + }, + { + "epoch": 0.9355371900826446, + "grad_norm": 7.818711280822754, + "learning_rate": 1.0425501402810457e-07, + "loss": 0.4189, + "step": 8490 + }, + { + "epoch": 0.9356473829201102, + "grad_norm": 8.710646629333496, + "learning_rate": 1.0390012980729613e-07, + "loss": 0.372, + "step": 8491 + }, + { + "epoch": 0.9357575757575758, + "grad_norm": 6.422230243682861, + "learning_rate": 1.0354584428823034e-07, + "loss": 0.3829, + "step": 8492 + }, + { + "epoch": 0.9358677685950413, + "grad_norm": 12.990571022033691, + "learning_rate": 1.0319215751422973e-07, + "loss": 0.4292, + "step": 8493 + }, + { + "epoch": 0.9359779614325069, + "grad_norm": 13.805790901184082, + "learning_rate": 1.0283906952854361e-07, + "loss": 0.4625, + "step": 8494 + }, + { + "epoch": 0.9360881542699725, + "grad_norm": 10.73592758178711, + "learning_rate": 1.024865803743491e-07, + "loss": 0.5072, + "step": 8495 + }, + { + "epoch": 0.936198347107438, + "grad_norm": 6.469753265380859, + "learning_rate": 1.021346900947473e-07, + "loss": 0.3863, + "step": 8496 + }, + { + "epoch": 0.9363085399449036, + "grad_norm": 5.712851047515869, + "learning_rate": 1.0178339873276877e-07, + "loss": 
0.4021, + "step": 8497 + }, + { + "epoch": 0.9364187327823692, + "grad_norm": 5.719577312469482, + "learning_rate": 1.0143270633137026e-07, + "loss": 0.3985, + "step": 8498 + }, + { + "epoch": 0.9365289256198347, + "grad_norm": 11.244417190551758, + "learning_rate": 1.0108261293343413e-07, + "loss": 0.4558, + "step": 8499 + }, + { + "epoch": 0.9366391184573003, + "grad_norm": 9.082850456237793, + "learning_rate": 1.007331185817706e-07, + "loss": 0.3955, + "step": 8500 + }, + { + "epoch": 0.9367493112947658, + "grad_norm": 6.717166900634766, + "learning_rate": 1.0038422331911657e-07, + "loss": 0.4204, + "step": 8501 + }, + { + "epoch": 0.9368595041322314, + "grad_norm": 10.005170822143555, + "learning_rate": 1.0003592718813515e-07, + "loss": 0.4111, + "step": 8502 + }, + { + "epoch": 0.936969696969697, + "grad_norm": 5.7031145095825195, + "learning_rate": 9.968823023141616e-08, + "loss": 0.4185, + "step": 8503 + }, + { + "epoch": 0.9370798898071625, + "grad_norm": 5.498226165771484, + "learning_rate": 9.934113249147725e-08, + "loss": 0.4084, + "step": 8504 + }, + { + "epoch": 0.9371900826446281, + "grad_norm": 5.361534118652344, + "learning_rate": 9.899463401076115e-08, + "loss": 0.3802, + "step": 8505 + }, + { + "epoch": 0.9373002754820937, + "grad_norm": 7.721748352050781, + "learning_rate": 9.864873483163839e-08, + "loss": 0.3692, + "step": 8506 + }, + { + "epoch": 0.9374104683195592, + "grad_norm": 8.12588882446289, + "learning_rate": 9.830343499640683e-08, + "loss": 0.383, + "step": 8507 + }, + { + "epoch": 0.9375206611570248, + "grad_norm": 6.769551753997803, + "learning_rate": 9.795873454728932e-08, + "loss": 0.3858, + "step": 8508 + }, + { + "epoch": 0.9376308539944903, + "grad_norm": 9.868959426879883, + "learning_rate": 9.761463352643608e-08, + "loss": 0.4937, + "step": 8509 + }, + { + "epoch": 0.9377410468319559, + "grad_norm": 6.203073501586914, + "learning_rate": 9.727113197592564e-08, + "loss": 0.4053, + "step": 8510 + }, + { + "epoch": 0.9378512396694215, + "grad_norm": 10.969325065612793, + "learning_rate": 9.692822993775996e-08, + "loss": 0.3893, + "step": 8511 + }, + { + "epoch": 0.937961432506887, + "grad_norm": 4.758317470550537, + "learning_rate": 9.658592745387108e-08, + "loss": 0.4115, + "step": 8512 + }, + { + "epoch": 0.9380716253443526, + "grad_norm": 4.448451519012451, + "learning_rate": 9.624422456611548e-08, + "loss": 0.3227, + "step": 8513 + }, + { + "epoch": 0.9381818181818182, + "grad_norm": 4.2206645011901855, + "learning_rate": 9.590312131627699e-08, + "loss": 0.3886, + "step": 8514 + }, + { + "epoch": 0.9382920110192837, + "grad_norm": 6.712493419647217, + "learning_rate": 9.556261774606668e-08, + "loss": 0.3623, + "step": 8515 + }, + { + "epoch": 0.9384022038567493, + "grad_norm": 7.380617141723633, + "learning_rate": 9.522271389712123e-08, + "loss": 0.3771, + "step": 8516 + }, + { + "epoch": 0.9385123966942148, + "grad_norm": 11.71654224395752, + "learning_rate": 9.488340981100463e-08, + "loss": 0.5992, + "step": 8517 + }, + { + "epoch": 0.9386225895316804, + "grad_norm": 6.216311454772949, + "learning_rate": 9.454470552920814e-08, + "loss": 0.3657, + "step": 8518 + }, + { + "epoch": 0.938732782369146, + "grad_norm": 7.123051643371582, + "learning_rate": 9.420660109314805e-08, + "loss": 0.3396, + "step": 8519 + }, + { + "epoch": 0.9388429752066115, + "grad_norm": 9.15761661529541, + "learning_rate": 9.386909654416853e-08, + "loss": 0.3531, + "step": 8520 + }, + { + "epoch": 0.9389531680440771, + "grad_norm": 5.481376647949219, + "learning_rate": 
9.353219192354101e-08, + "loss": 0.3669, + "step": 8521 + }, + { + "epoch": 0.9390633608815427, + "grad_norm": 4.637485027313232, + "learning_rate": 9.319588727246143e-08, + "loss": 0.338, + "step": 8522 + }, + { + "epoch": 0.9391735537190082, + "grad_norm": 12.680817604064941, + "learning_rate": 9.286018263205355e-08, + "loss": 0.4836, + "step": 8523 + }, + { + "epoch": 0.9392837465564738, + "grad_norm": 4.465548992156982, + "learning_rate": 9.252507804336897e-08, + "loss": 0.4173, + "step": 8524 + }, + { + "epoch": 0.9393939393939394, + "grad_norm": 4.898728370666504, + "learning_rate": 9.219057354738326e-08, + "loss": 0.3305, + "step": 8525 + }, + { + "epoch": 0.9395041322314049, + "grad_norm": 4.7653727531433105, + "learning_rate": 9.18566691850009e-08, + "loss": 0.4154, + "step": 8526 + }, + { + "epoch": 0.9396143250688705, + "grad_norm": 5.312608242034912, + "learning_rate": 9.152336499705261e-08, + "loss": 0.3742, + "step": 8527 + }, + { + "epoch": 0.939724517906336, + "grad_norm": 8.836138725280762, + "learning_rate": 9.119066102429464e-08, + "loss": 0.3333, + "step": 8528 + }, + { + "epoch": 0.9398347107438016, + "grad_norm": 7.88871431350708, + "learning_rate": 9.085855730741e-08, + "loss": 0.4661, + "step": 8529 + }, + { + "epoch": 0.9399449035812673, + "grad_norm": 8.838181495666504, + "learning_rate": 9.05270538870101e-08, + "loss": 0.3756, + "step": 8530 + }, + { + "epoch": 0.9400550964187327, + "grad_norm": 4.735797882080078, + "learning_rate": 9.019615080363087e-08, + "loss": 0.3795, + "step": 8531 + }, + { + "epoch": 0.9401652892561984, + "grad_norm": 3.989769458770752, + "learning_rate": 8.98658480977349e-08, + "loss": 0.3852, + "step": 8532 + }, + { + "epoch": 0.940275482093664, + "grad_norm": 7.275123119354248, + "learning_rate": 8.953614580971381e-08, + "loss": 0.4019, + "step": 8533 + }, + { + "epoch": 0.9403856749311295, + "grad_norm": 5.2555718421936035, + "learning_rate": 8.920704397988256e-08, + "loss": 0.3471, + "step": 8534 + }, + { + "epoch": 0.9404958677685951, + "grad_norm": 5.258383274078369, + "learning_rate": 8.88785426484845e-08, + "loss": 0.3768, + "step": 8535 + }, + { + "epoch": 0.9406060606060606, + "grad_norm": 18.62480926513672, + "learning_rate": 8.855064185568918e-08, + "loss": 0.5789, + "step": 8536 + }, + { + "epoch": 0.9407162534435262, + "grad_norm": 4.384313106536865, + "learning_rate": 8.82233416415934e-08, + "loss": 0.3795, + "step": 8537 + }, + { + "epoch": 0.9408264462809918, + "grad_norm": 6.902232646942139, + "learning_rate": 8.789664204621906e-08, + "loss": 0.4671, + "step": 8538 + }, + { + "epoch": 0.9409366391184573, + "grad_norm": 5.78926420211792, + "learning_rate": 8.757054310951585e-08, + "loss": 0.35, + "step": 8539 + }, + { + "epoch": 0.9410468319559229, + "grad_norm": 6.472806930541992, + "learning_rate": 8.724504487135965e-08, + "loss": 0.4257, + "step": 8540 + }, + { + "epoch": 0.9411570247933885, + "grad_norm": 7.819636344909668, + "learning_rate": 8.692014737155307e-08, + "loss": 0.4085, + "step": 8541 + }, + { + "epoch": 0.941267217630854, + "grad_norm": 9.186422348022461, + "learning_rate": 8.659585064982323e-08, + "loss": 0.3484, + "step": 8542 + }, + { + "epoch": 0.9413774104683196, + "grad_norm": 4.838017463684082, + "learning_rate": 8.62721547458284e-08, + "loss": 0.3866, + "step": 8543 + }, + { + "epoch": 0.9414876033057851, + "grad_norm": 5.862768173217773, + "learning_rate": 8.594905969914858e-08, + "loss": 0.3511, + "step": 8544 + }, + { + "epoch": 0.9415977961432507, + "grad_norm": 9.558072090148926, + 
"learning_rate": 8.562656554929271e-08, + "loss": 0.3256, + "step": 8545 + }, + { + "epoch": 0.9417079889807163, + "grad_norm": 7.147973537445068, + "learning_rate": 8.530467233569595e-08, + "loss": 0.4119, + "step": 8546 + }, + { + "epoch": 0.9418181818181818, + "grad_norm": 9.320735931396484, + "learning_rate": 8.498338009772067e-08, + "loss": 0.4266, + "step": 8547 + }, + { + "epoch": 0.9419283746556474, + "grad_norm": 4.809484958648682, + "learning_rate": 8.466268887465268e-08, + "loss": 0.333, + "step": 8548 + }, + { + "epoch": 0.942038567493113, + "grad_norm": 5.545483112335205, + "learning_rate": 8.434259870570893e-08, + "loss": 0.3773, + "step": 8549 + }, + { + "epoch": 0.9421487603305785, + "grad_norm": 4.454220294952393, + "learning_rate": 8.402310963002869e-08, + "loss": 0.3499, + "step": 8550 + }, + { + "epoch": 0.9422589531680441, + "grad_norm": 7.443580627441406, + "learning_rate": 8.370422168668125e-08, + "loss": 0.3727, + "step": 8551 + }, + { + "epoch": 0.9423691460055097, + "grad_norm": 5.809983253479004, + "learning_rate": 8.338593491465874e-08, + "loss": 0.3427, + "step": 8552 + }, + { + "epoch": 0.9424793388429752, + "grad_norm": 6.981957912445068, + "learning_rate": 8.306824935288338e-08, + "loss": 0.3758, + "step": 8553 + }, + { + "epoch": 0.9425895316804408, + "grad_norm": 5.555224895477295, + "learning_rate": 8.275116504020131e-08, + "loss": 0.4115, + "step": 8554 + }, + { + "epoch": 0.9426997245179063, + "grad_norm": 5.965600967407227, + "learning_rate": 8.243468201538596e-08, + "loss": 0.3928, + "step": 8555 + }, + { + "epoch": 0.9428099173553719, + "grad_norm": 4.40320348739624, + "learning_rate": 8.211880031713748e-08, + "loss": 0.3729, + "step": 8556 + }, + { + "epoch": 0.9429201101928375, + "grad_norm": 4.800597190856934, + "learning_rate": 8.180351998408331e-08, + "loss": 0.3673, + "step": 8557 + }, + { + "epoch": 0.943030303030303, + "grad_norm": 5.584104061126709, + "learning_rate": 8.148884105477429e-08, + "loss": 0.3483, + "step": 8558 + }, + { + "epoch": 0.9431404958677686, + "grad_norm": 10.89968490600586, + "learning_rate": 8.117476356769127e-08, + "loss": 0.4443, + "step": 8559 + }, + { + "epoch": 0.9432506887052342, + "grad_norm": 7.249094009399414, + "learning_rate": 8.086128756124023e-08, + "loss": 0.3651, + "step": 8560 + }, + { + "epoch": 0.9433608815426997, + "grad_norm": 7.102390766143799, + "learning_rate": 8.054841307375217e-08, + "loss": 0.4313, + "step": 8561 + }, + { + "epoch": 0.9434710743801653, + "grad_norm": 4.627991199493408, + "learning_rate": 8.023614014348702e-08, + "loss": 0.3556, + "step": 8562 + }, + { + "epoch": 0.9435812672176308, + "grad_norm": 7.0684309005737305, + "learning_rate": 7.992446880862981e-08, + "loss": 0.4176, + "step": 8563 + }, + { + "epoch": 0.9436914600550964, + "grad_norm": 4.0046186447143555, + "learning_rate": 7.961339910729115e-08, + "loss": 0.3443, + "step": 8564 + }, + { + "epoch": 0.943801652892562, + "grad_norm": 7.933088779449463, + "learning_rate": 7.930293107751009e-08, + "loss": 0.4305, + "step": 8565 + }, + { + "epoch": 0.9439118457300275, + "grad_norm": 4.772091865539551, + "learning_rate": 7.899306475725066e-08, + "loss": 0.4325, + "step": 8566 + }, + { + "epoch": 0.9440220385674931, + "grad_norm": 4.680997848510742, + "learning_rate": 7.868380018440369e-08, + "loss": 0.3798, + "step": 8567 + }, + { + "epoch": 0.9441322314049587, + "grad_norm": 8.505746841430664, + "learning_rate": 7.83751373967867e-08, + "loss": 0.3907, + "step": 8568 + }, + { + "epoch": 0.9442424242424242, + "grad_norm": 
5.709012985229492, + "learning_rate": 7.806707643214395e-08, + "loss": 0.4498, + "step": 8569 + }, + { + "epoch": 0.9443526170798898, + "grad_norm": 8.944786071777344, + "learning_rate": 7.775961732814364e-08, + "loss": 0.3283, + "step": 8570 + }, + { + "epoch": 0.9444628099173553, + "grad_norm": 6.441588401794434, + "learning_rate": 7.745276012238401e-08, + "loss": 0.4687, + "step": 8571 + }, + { + "epoch": 0.9445730027548209, + "grad_norm": 7.572641849517822, + "learning_rate": 7.714650485238783e-08, + "loss": 0.4429, + "step": 8572 + }, + { + "epoch": 0.9446831955922865, + "grad_norm": 5.6551833152771, + "learning_rate": 7.684085155560406e-08, + "loss": 0.368, + "step": 8573 + }, + { + "epoch": 0.944793388429752, + "grad_norm": 7.429037570953369, + "learning_rate": 7.653580026940833e-08, + "loss": 0.4332, + "step": 8574 + }, + { + "epoch": 0.9449035812672176, + "grad_norm": 7.719753742218018, + "learning_rate": 7.623135103110246e-08, + "loss": 0.4786, + "step": 8575 + }, + { + "epoch": 0.9450137741046832, + "grad_norm": 7.220445156097412, + "learning_rate": 7.592750387791558e-08, + "loss": 0.4455, + "step": 8576 + }, + { + "epoch": 0.9451239669421487, + "grad_norm": 6.1490654945373535, + "learning_rate": 7.562425884700241e-08, + "loss": 0.3882, + "step": 8577 + }, + { + "epoch": 0.9452341597796143, + "grad_norm": 7.9718804359436035, + "learning_rate": 7.53216159754433e-08, + "loss": 0.4391, + "step": 8578 + }, + { + "epoch": 0.9453443526170799, + "grad_norm": 5.425424098968506, + "learning_rate": 7.501957530024695e-08, + "loss": 0.3879, + "step": 8579 + }, + { + "epoch": 0.9454545454545454, + "grad_norm": 4.844546318054199, + "learning_rate": 7.471813685834716e-08, + "loss": 0.3323, + "step": 8580 + }, + { + "epoch": 0.945564738292011, + "grad_norm": 6.59128999710083, + "learning_rate": 7.441730068660336e-08, + "loss": 0.3366, + "step": 8581 + }, + { + "epoch": 0.9456749311294765, + "grad_norm": 5.869863986968994, + "learning_rate": 7.41170668218033e-08, + "loss": 0.455, + "step": 8582 + }, + { + "epoch": 0.9457851239669421, + "grad_norm": 9.603310585021973, + "learning_rate": 7.381743530065933e-08, + "loss": 0.4076, + "step": 8583 + }, + { + "epoch": 0.9458953168044077, + "grad_norm": 6.4055495262146, + "learning_rate": 7.351840615981043e-08, + "loss": 0.2913, + "step": 8584 + }, + { + "epoch": 0.9460055096418732, + "grad_norm": 9.786589622497559, + "learning_rate": 7.321997943582293e-08, + "loss": 0.4402, + "step": 8585 + }, + { + "epoch": 0.9461157024793388, + "grad_norm": 11.6978178024292, + "learning_rate": 7.292215516518931e-08, + "loss": 0.5475, + "step": 8586 + }, + { + "epoch": 0.9462258953168045, + "grad_norm": 7.274374008178711, + "learning_rate": 7.262493338432708e-08, + "loss": 0.4702, + "step": 8587 + }, + { + "epoch": 0.94633608815427, + "grad_norm": 4.72409725189209, + "learning_rate": 7.232831412958053e-08, + "loss": 0.4482, + "step": 8588 + }, + { + "epoch": 0.9464462809917356, + "grad_norm": 6.25150203704834, + "learning_rate": 7.203229743722229e-08, + "loss": 0.3769, + "step": 8589 + }, + { + "epoch": 0.946556473829201, + "grad_norm": 5.5450639724731445, + "learning_rate": 7.173688334344841e-08, + "loss": 0.3712, + "step": 8590 + }, + { + "epoch": 0.9466666666666667, + "grad_norm": 5.280126571655273, + "learning_rate": 7.144207188438223e-08, + "loss": 0.3382, + "step": 8591 + }, + { + "epoch": 0.9467768595041323, + "grad_norm": 7.815299034118652, + "learning_rate": 7.114786309607491e-08, + "loss": 0.3563, + "step": 8592 + }, + { + "epoch": 0.9468870523415978, + 
"grad_norm": 7.206793785095215, + "learning_rate": 7.08542570145021e-08, + "loss": 0.3989, + "step": 8593 + }, + { + "epoch": 0.9469972451790634, + "grad_norm": 7.927333354949951, + "learning_rate": 7.056125367556566e-08, + "loss": 0.3615, + "step": 8594 + }, + { + "epoch": 0.947107438016529, + "grad_norm": 6.645603656768799, + "learning_rate": 7.026885311509579e-08, + "loss": 0.4112, + "step": 8595 + }, + { + "epoch": 0.9472176308539945, + "grad_norm": 6.320545196533203, + "learning_rate": 6.99770553688467e-08, + "loss": 0.4128, + "step": 8596 + }, + { + "epoch": 0.9473278236914601, + "grad_norm": 5.7397074699401855, + "learning_rate": 6.968586047250036e-08, + "loss": 0.3748, + "step": 8597 + }, + { + "epoch": 0.9474380165289257, + "grad_norm": 6.543806076049805, + "learning_rate": 6.939526846166333e-08, + "loss": 0.3366, + "step": 8598 + }, + { + "epoch": 0.9475482093663912, + "grad_norm": 7.952162742614746, + "learning_rate": 6.910527937187162e-08, + "loss": 0.5369, + "step": 8599 + }, + { + "epoch": 0.9476584022038568, + "grad_norm": 7.1026716232299805, + "learning_rate": 6.881589323858407e-08, + "loss": 0.3817, + "step": 8600 + }, + { + "epoch": 0.9477685950413223, + "grad_norm": 7.128777027130127, + "learning_rate": 6.852711009718626e-08, + "loss": 0.2803, + "step": 8601 + }, + { + "epoch": 0.9478787878787879, + "grad_norm": 4.653260231018066, + "learning_rate": 6.82389299829933e-08, + "loss": 0.3445, + "step": 8602 + }, + { + "epoch": 0.9479889807162535, + "grad_norm": 4.707342624664307, + "learning_rate": 6.795135293124311e-08, + "loss": 0.3503, + "step": 8603 + }, + { + "epoch": 0.948099173553719, + "grad_norm": 5.059284687042236, + "learning_rate": 6.766437897710032e-08, + "loss": 0.3551, + "step": 8604 + }, + { + "epoch": 0.9482093663911846, + "grad_norm": 7.666423320770264, + "learning_rate": 6.737800815565799e-08, + "loss": 0.4054, + "step": 8605 + }, + { + "epoch": 0.9483195592286502, + "grad_norm": 4.72752046585083, + "learning_rate": 6.709224050193252e-08, + "loss": 0.3762, + "step": 8606 + }, + { + "epoch": 0.9484297520661157, + "grad_norm": 6.7642316818237305, + "learning_rate": 6.680707605086878e-08, + "loss": 0.4859, + "step": 8607 + }, + { + "epoch": 0.9485399449035813, + "grad_norm": 6.059367656707764, + "learning_rate": 6.652251483733718e-08, + "loss": 0.3812, + "step": 8608 + }, + { + "epoch": 0.9486501377410468, + "grad_norm": 5.486311435699463, + "learning_rate": 6.623855689613323e-08, + "loss": 0.3266, + "step": 8609 + }, + { + "epoch": 0.9487603305785124, + "grad_norm": 7.317872524261475, + "learning_rate": 6.595520226198138e-08, + "loss": 0.3442, + "step": 8610 + }, + { + "epoch": 0.948870523415978, + "grad_norm": 4.8540778160095215, + "learning_rate": 6.56724509695289e-08, + "loss": 0.402, + "step": 8611 + }, + { + "epoch": 0.9489807162534435, + "grad_norm": 4.781094074249268, + "learning_rate": 6.539030305335147e-08, + "loss": 0.403, + "step": 8612 + }, + { + "epoch": 0.9490909090909091, + "grad_norm": 7.0046067237854, + "learning_rate": 6.510875854795152e-08, + "loss": 0.2694, + "step": 8613 + }, + { + "epoch": 0.9492011019283747, + "grad_norm": 6.1152825355529785, + "learning_rate": 6.48278174877559e-08, + "loss": 0.4012, + "step": 8614 + }, + { + "epoch": 0.9493112947658402, + "grad_norm": 6.957250595092773, + "learning_rate": 6.454747990711774e-08, + "loss": 0.4294, + "step": 8615 + }, + { + "epoch": 0.9494214876033058, + "grad_norm": 8.402409553527832, + "learning_rate": 6.426774584031902e-08, + "loss": 0.4305, + "step": 8616 + }, + { + "epoch": 
0.9495316804407713, + "grad_norm": 8.449647903442383, + "learning_rate": 6.398861532156408e-08, + "loss": 0.5149, + "step": 8617 + }, + { + "epoch": 0.9496418732782369, + "grad_norm": 8.074153900146484, + "learning_rate": 6.371008838498616e-08, + "loss": 0.3902, + "step": 8618 + }, + { + "epoch": 0.9497520661157025, + "grad_norm": 6.5540876388549805, + "learning_rate": 6.343216506464467e-08, + "loss": 0.4076, + "step": 8619 + }, + { + "epoch": 0.949862258953168, + "grad_norm": 6.830234527587891, + "learning_rate": 6.315484539452299e-08, + "loss": 0.3625, + "step": 8620 + }, + { + "epoch": 0.9499724517906336, + "grad_norm": 7.6607584953308105, + "learning_rate": 6.287812940853288e-08, + "loss": 0.422, + "step": 8621 + }, + { + "epoch": 0.9500826446280992, + "grad_norm": 6.200892925262451, + "learning_rate": 6.260201714051229e-08, + "loss": 0.3613, + "step": 8622 + }, + { + "epoch": 0.9501928374655647, + "grad_norm": 12.884931564331055, + "learning_rate": 6.232650862422308e-08, + "loss": 0.4733, + "step": 8623 + }, + { + "epoch": 0.9503030303030303, + "grad_norm": 5.730234622955322, + "learning_rate": 6.205160389335552e-08, + "loss": 0.4097, + "step": 8624 + }, + { + "epoch": 0.9504132231404959, + "grad_norm": 4.848020076751709, + "learning_rate": 6.177730298152606e-08, + "loss": 0.3682, + "step": 8625 + }, + { + "epoch": 0.9505234159779614, + "grad_norm": 5.113597869873047, + "learning_rate": 6.150360592227511e-08, + "loss": 0.41, + "step": 8626 + }, + { + "epoch": 0.950633608815427, + "grad_norm": 8.638497352600098, + "learning_rate": 6.123051274907199e-08, + "loss": 0.4268, + "step": 8627 + }, + { + "epoch": 0.9507438016528925, + "grad_norm": 7.427926063537598, + "learning_rate": 6.095802349531055e-08, + "loss": 0.3286, + "step": 8628 + }, + { + "epoch": 0.9508539944903581, + "grad_norm": 5.209235191345215, + "learning_rate": 6.068613819431079e-08, + "loss": 0.308, + "step": 8629 + }, + { + "epoch": 0.9509641873278237, + "grad_norm": 5.912085056304932, + "learning_rate": 6.041485687931891e-08, + "loss": 0.3675, + "step": 8630 + }, + { + "epoch": 0.9510743801652892, + "grad_norm": 5.423869609832764, + "learning_rate": 6.014417958350893e-08, + "loss": 0.4198, + "step": 8631 + }, + { + "epoch": 0.9511845730027548, + "grad_norm": 9.339936256408691, + "learning_rate": 5.987410633997881e-08, + "loss": 0.4165, + "step": 8632 + }, + { + "epoch": 0.9512947658402204, + "grad_norm": 5.6121439933776855, + "learning_rate": 5.960463718175324e-08, + "loss": 0.395, + "step": 8633 + }, + { + "epoch": 0.9514049586776859, + "grad_norm": 8.349647521972656, + "learning_rate": 5.93357721417831e-08, + "loss": 0.349, + "step": 8634 + }, + { + "epoch": 0.9515151515151515, + "grad_norm": 9.9574556350708, + "learning_rate": 5.906751125294652e-08, + "loss": 0.4834, + "step": 8635 + }, + { + "epoch": 0.951625344352617, + "grad_norm": 4.9349799156188965, + "learning_rate": 5.8799854548046156e-08, + "loss": 0.3591, + "step": 8636 + }, + { + "epoch": 0.9517355371900826, + "grad_norm": 6.009603023529053, + "learning_rate": 5.8532802059810825e-08, + "loss": 0.3637, + "step": 8637 + }, + { + "epoch": 0.9518457300275482, + "grad_norm": 7.522815227508545, + "learning_rate": 5.8266353820897736e-08, + "loss": 0.4404, + "step": 8638 + }, + { + "epoch": 0.9519559228650137, + "grad_norm": 10.83249282836914, + "learning_rate": 5.80005098638875e-08, + "loss": 0.3346, + "step": 8639 + }, + { + "epoch": 0.9520661157024793, + "grad_norm": 7.943075656890869, + "learning_rate": 5.7735270221287444e-08, + "loss": 0.4483, + "step": 8640 + 
}, + { + "epoch": 0.952176308539945, + "grad_norm": 6.577495098114014, + "learning_rate": 5.747063492553218e-08, + "loss": 0.3587, + "step": 8641 + }, + { + "epoch": 0.9522865013774104, + "grad_norm": 6.013138771057129, + "learning_rate": 5.720660400898193e-08, + "loss": 0.3738, + "step": 8642 + }, + { + "epoch": 0.952396694214876, + "grad_norm": 5.224336624145508, + "learning_rate": 5.694317750392142e-08, + "loss": 0.408, + "step": 8643 + }, + { + "epoch": 0.9525068870523415, + "grad_norm": 7.12232780456543, + "learning_rate": 5.668035544256434e-08, + "loss": 0.3908, + "step": 8644 + }, + { + "epoch": 0.9526170798898072, + "grad_norm": 12.983247756958008, + "learning_rate": 5.641813785704831e-08, + "loss": 0.4187, + "step": 8645 + }, + { + "epoch": 0.9527272727272728, + "grad_norm": 7.4847893714904785, + "learning_rate": 5.6156524779437116e-08, + "loss": 0.4703, + "step": 8646 + }, + { + "epoch": 0.9528374655647383, + "grad_norm": 6.619725704193115, + "learning_rate": 5.589551624172129e-08, + "loss": 0.4126, + "step": 8647 + }, + { + "epoch": 0.9529476584022039, + "grad_norm": 10.35739517211914, + "learning_rate": 5.563511227581808e-08, + "loss": 0.4818, + "step": 8648 + }, + { + "epoch": 0.9530578512396695, + "grad_norm": 11.51596450805664, + "learning_rate": 5.537531291356979e-08, + "loss": 0.3502, + "step": 8649 + }, + { + "epoch": 0.953168044077135, + "grad_norm": 5.162397384643555, + "learning_rate": 5.511611818674434e-08, + "loss": 0.383, + "step": 8650 + }, + { + "epoch": 0.9532782369146006, + "grad_norm": 4.714761734008789, + "learning_rate": 5.485752812703749e-08, + "loss": 0.362, + "step": 8651 + }, + { + "epoch": 0.9533884297520662, + "grad_norm": 10.624504089355469, + "learning_rate": 5.4599542766069494e-08, + "loss": 0.4279, + "step": 8652 + }, + { + "epoch": 0.9534986225895317, + "grad_norm": 9.877367973327637, + "learning_rate": 5.4342162135386236e-08, + "loss": 0.4233, + "step": 8653 + }, + { + "epoch": 0.9536088154269973, + "grad_norm": 5.578636646270752, + "learning_rate": 5.408538626646198e-08, + "loss": 0.3959, + "step": 8654 + }, + { + "epoch": 0.9537190082644628, + "grad_norm": 8.448080062866211, + "learning_rate": 5.38292151906955e-08, + "loss": 0.3512, + "step": 8655 + }, + { + "epoch": 0.9538292011019284, + "grad_norm": 5.63498592376709, + "learning_rate": 5.357364893941064e-08, + "loss": 0.3658, + "step": 8656 + }, + { + "epoch": 0.953939393939394, + "grad_norm": 5.705527305603027, + "learning_rate": 5.331868754385905e-08, + "loss": 0.4013, + "step": 8657 + }, + { + "epoch": 0.9540495867768595, + "grad_norm": 5.283731937408447, + "learning_rate": 5.306433103521802e-08, + "loss": 0.3825, + "step": 8658 + }, + { + "epoch": 0.9541597796143251, + "grad_norm": 6.6954216957092285, + "learning_rate": 5.2810579444590445e-08, + "loss": 0.4422, + "step": 8659 + }, + { + "epoch": 0.9542699724517907, + "grad_norm": 6.549747943878174, + "learning_rate": 5.255743280300485e-08, + "loss": 0.3703, + "step": 8660 + }, + { + "epoch": 0.9543801652892562, + "grad_norm": 6.241518974304199, + "learning_rate": 5.2304891141417014e-08, + "loss": 0.3691, + "step": 8661 + }, + { + "epoch": 0.9544903581267218, + "grad_norm": 7.21035623550415, + "learning_rate": 5.205295449070835e-08, + "loss": 0.4171, + "step": 8662 + }, + { + "epoch": 0.9546005509641873, + "grad_norm": 6.325603485107422, + "learning_rate": 5.1801622881684775e-08, + "loss": 0.4035, + "step": 8663 + }, + { + "epoch": 0.9547107438016529, + "grad_norm": 7.280940055847168, + "learning_rate": 5.155089634508059e-08, + "loss": 
0.4243, + "step": 8664 + }, + { + "epoch": 0.9548209366391185, + "grad_norm": 4.8911824226379395, + "learning_rate": 5.130077491155461e-08, + "loss": 0.3151, + "step": 8665 + }, + { + "epoch": 0.954931129476584, + "grad_norm": 8.103285789489746, + "learning_rate": 5.1051258611692355e-08, + "loss": 0.4047, + "step": 8666 + }, + { + "epoch": 0.9550413223140496, + "grad_norm": 5.535149097442627, + "learning_rate": 5.0802347476004434e-08, + "loss": 0.3804, + "step": 8667 + }, + { + "epoch": 0.9551515151515152, + "grad_norm": 14.667068481445312, + "learning_rate": 5.05540415349276e-08, + "loss": 0.5715, + "step": 8668 + }, + { + "epoch": 0.9552617079889807, + "grad_norm": 5.581683158874512, + "learning_rate": 5.030634081882702e-08, + "loss": 0.4177, + "step": 8669 + }, + { + "epoch": 0.9553719008264463, + "grad_norm": 9.997817039489746, + "learning_rate": 5.005924535798956e-08, + "loss": 0.4213, + "step": 8670 + }, + { + "epoch": 0.9554820936639118, + "grad_norm": 7.072441101074219, + "learning_rate": 4.981275518263162e-08, + "loss": 0.4222, + "step": 8671 + }, + { + "epoch": 0.9555922865013774, + "grad_norm": 7.401980400085449, + "learning_rate": 4.9566870322894645e-08, + "loss": 0.3962, + "step": 8672 + }, + { + "epoch": 0.955702479338843, + "grad_norm": 10.443068504333496, + "learning_rate": 4.932159080884458e-08, + "loss": 0.3976, + "step": 8673 + }, + { + "epoch": 0.9558126721763085, + "grad_norm": 6.550655841827393, + "learning_rate": 4.9076916670475206e-08, + "loss": 0.3847, + "step": 8674 + }, + { + "epoch": 0.9559228650137741, + "grad_norm": 10.696511268615723, + "learning_rate": 4.8832847937706486e-08, + "loss": 0.4171, + "step": 8675 + }, + { + "epoch": 0.9560330578512397, + "grad_norm": 5.649347305297852, + "learning_rate": 4.8589384640381766e-08, + "loss": 0.3848, + "step": 8676 + }, + { + "epoch": 0.9561432506887052, + "grad_norm": 5.724981307983398, + "learning_rate": 4.834652680827334e-08, + "loss": 0.4056, + "step": 8677 + }, + { + "epoch": 0.9562534435261708, + "grad_norm": 7.857694625854492, + "learning_rate": 4.8104274471078015e-08, + "loss": 0.3987, + "step": 8678 + }, + { + "epoch": 0.9563636363636364, + "grad_norm": 4.164583683013916, + "learning_rate": 4.786262765841765e-08, + "loss": 0.372, + "step": 8679 + }, + { + "epoch": 0.9564738292011019, + "grad_norm": 14.193947792053223, + "learning_rate": 4.7621586399842487e-08, + "loss": 0.463, + "step": 8680 + }, + { + "epoch": 0.9565840220385675, + "grad_norm": 3.8538706302642822, + "learning_rate": 4.7381150724827296e-08, + "loss": 0.3822, + "step": 8681 + }, + { + "epoch": 0.956694214876033, + "grad_norm": 4.835615634918213, + "learning_rate": 4.714132066277188e-08, + "loss": 0.3613, + "step": 8682 + }, + { + "epoch": 0.9568044077134986, + "grad_norm": 7.823811054229736, + "learning_rate": 4.69020962430039e-08, + "loss": 0.3714, + "step": 8683 + }, + { + "epoch": 0.9569146005509642, + "grad_norm": 8.162225723266602, + "learning_rate": 4.666347749477551e-08, + "loss": 0.4376, + "step": 8684 + }, + { + "epoch": 0.9570247933884297, + "grad_norm": 8.022966384887695, + "learning_rate": 4.6425464447265586e-08, + "loss": 0.3877, + "step": 8685 + }, + { + "epoch": 0.9571349862258953, + "grad_norm": 8.46652603149414, + "learning_rate": 4.6188057129578635e-08, + "loss": 0.3787, + "step": 8686 + }, + { + "epoch": 0.9572451790633609, + "grad_norm": 4.681698322296143, + "learning_rate": 4.595125557074531e-08, + "loss": 0.3885, + "step": 8687 + }, + { + "epoch": 0.9573553719008264, + "grad_norm": 7.8477935791015625, + "learning_rate": 
4.571505979972191e-08, + "loss": 0.4589, + "step": 8688 + }, + { + "epoch": 0.957465564738292, + "grad_norm": 5.692086696624756, + "learning_rate": 4.547946984539031e-08, + "loss": 0.4306, + "step": 8689 + }, + { + "epoch": 0.9575757575757575, + "grad_norm": 5.172653675079346, + "learning_rate": 4.524448573655915e-08, + "loss": 0.3984, + "step": 8690 + }, + { + "epoch": 0.9576859504132231, + "grad_norm": 5.916308403015137, + "learning_rate": 4.501010750196322e-08, + "loss": 0.3823, + "step": 8691 + }, + { + "epoch": 0.9577961432506887, + "grad_norm": 7.113374710083008, + "learning_rate": 4.477633517026181e-08, + "loss": 0.3866, + "step": 8692 + }, + { + "epoch": 0.9579063360881542, + "grad_norm": 4.070824146270752, + "learning_rate": 4.4543168770040946e-08, + "loss": 0.3483, + "step": 8693 + }, + { + "epoch": 0.9580165289256198, + "grad_norm": 5.743619441986084, + "learning_rate": 4.431060832981282e-08, + "loss": 0.3443, + "step": 8694 + }, + { + "epoch": 0.9581267217630854, + "grad_norm": 6.238353252410889, + "learning_rate": 4.40786538780158e-08, + "loss": 0.3864, + "step": 8695 + }, + { + "epoch": 0.9582369146005509, + "grad_norm": 4.780889987945557, + "learning_rate": 4.3847305443011635e-08, + "loss": 0.3364, + "step": 8696 + }, + { + "epoch": 0.9583471074380165, + "grad_norm": 7.004602432250977, + "learning_rate": 4.361656305309214e-08, + "loss": 0.3869, + "step": 8697 + }, + { + "epoch": 0.9584573002754821, + "grad_norm": 6.1174116134643555, + "learning_rate": 4.338642673647198e-08, + "loss": 0.3435, + "step": 8698 + }, + { + "epoch": 0.9585674931129476, + "grad_norm": 5.435125827789307, + "learning_rate": 4.3156896521291956e-08, + "loss": 0.3154, + "step": 8699 + }, + { + "epoch": 0.9586776859504132, + "grad_norm": 8.105121612548828, + "learning_rate": 4.2927972435620194e-08, + "loss": 0.3504, + "step": 8700 + }, + { + "epoch": 0.9587878787878787, + "grad_norm": 8.5179443359375, + "learning_rate": 4.2699654507449836e-08, + "loss": 0.3894, + "step": 8701 + }, + { + "epoch": 0.9588980716253444, + "grad_norm": 6.650347709655762, + "learning_rate": 4.247194276469857e-08, + "loss": 0.3452, + "step": 8702 + }, + { + "epoch": 0.95900826446281, + "grad_norm": 7.823209285736084, + "learning_rate": 4.2244837235213e-08, + "loss": 0.4156, + "step": 8703 + }, + { + "epoch": 0.9591184573002755, + "grad_norm": 4.450412273406982, + "learning_rate": 4.201833794676258e-08, + "loss": 0.3676, + "step": 8704 + }, + { + "epoch": 0.9592286501377411, + "grad_norm": 7.199307441711426, + "learning_rate": 4.179244492704515e-08, + "loss": 0.4141, + "step": 8705 + }, + { + "epoch": 0.9593388429752067, + "grad_norm": 6.122735500335693, + "learning_rate": 4.1567158203682514e-08, + "loss": 0.4061, + "step": 8706 + }, + { + "epoch": 0.9594490358126722, + "grad_norm": 6.911500930786133, + "learning_rate": 4.134247780422318e-08, + "loss": 0.4044, + "step": 8707 + }, + { + "epoch": 0.9595592286501378, + "grad_norm": 5.078444957733154, + "learning_rate": 4.111840375614129e-08, + "loss": 0.3493, + "step": 8708 + }, + { + "epoch": 0.9596694214876033, + "grad_norm": 6.058871269226074, + "learning_rate": 4.089493608683659e-08, + "loss": 0.4757, + "step": 8709 + }, + { + "epoch": 0.9597796143250689, + "grad_norm": 6.7376532554626465, + "learning_rate": 4.0672074823635554e-08, + "loss": 0.4169, + "step": 8710 + }, + { + "epoch": 0.9598898071625345, + "grad_norm": 7.444267272949219, + "learning_rate": 4.044981999379027e-08, + "loss": 0.4361, + "step": 8711 + }, + { + "epoch": 0.96, + "grad_norm": 10.424495697021484, + 
"learning_rate": 4.022817162447734e-08, + "loss": 0.3745, + "step": 8712 + }, + { + "epoch": 0.9601101928374656, + "grad_norm": 17.904897689819336, + "learning_rate": 4.000712974280119e-08, + "loss": 0.4437, + "step": 8713 + }, + { + "epoch": 0.9602203856749312, + "grad_norm": 7.5280022621154785, + "learning_rate": 3.978669437579019e-08, + "loss": 0.4471, + "step": 8714 + }, + { + "epoch": 0.9603305785123967, + "grad_norm": 6.035186290740967, + "learning_rate": 3.9566865550400566e-08, + "loss": 0.3758, + "step": 8715 + }, + { + "epoch": 0.9604407713498623, + "grad_norm": 5.6610918045043945, + "learning_rate": 3.9347643293512485e-08, + "loss": 0.4303, + "step": 8716 + }, + { + "epoch": 0.9605509641873278, + "grad_norm": 5.898346900939941, + "learning_rate": 3.9129027631932826e-08, + "loss": 0.4415, + "step": 8717 + }, + { + "epoch": 0.9606611570247934, + "grad_norm": 7.7965240478515625, + "learning_rate": 3.89110185923941e-08, + "loss": 0.362, + "step": 8718 + }, + { + "epoch": 0.960771349862259, + "grad_norm": 4.916289329528809, + "learning_rate": 3.869361620155554e-08, + "loss": 0.4284, + "step": 8719 + }, + { + "epoch": 0.9608815426997245, + "grad_norm": 8.657247543334961, + "learning_rate": 3.847682048600088e-08, + "loss": 0.4117, + "step": 8720 + }, + { + "epoch": 0.9609917355371901, + "grad_norm": 4.531099319458008, + "learning_rate": 3.826063147224002e-08, + "loss": 0.4181, + "step": 8721 + }, + { + "epoch": 0.9611019283746557, + "grad_norm": 6.3142523765563965, + "learning_rate": 3.804504918670904e-08, + "loss": 0.4041, + "step": 8722 + }, + { + "epoch": 0.9612121212121212, + "grad_norm": 5.790463447570801, + "learning_rate": 3.783007365576907e-08, + "loss": 0.3302, + "step": 8723 + }, + { + "epoch": 0.9613223140495868, + "grad_norm": 4.8118133544921875, + "learning_rate": 3.7615704905708537e-08, + "loss": 0.3303, + "step": 8724 + }, + { + "epoch": 0.9614325068870524, + "grad_norm": 6.230093955993652, + "learning_rate": 3.740194296274091e-08, + "loss": 0.3689, + "step": 8725 + }, + { + "epoch": 0.9615426997245179, + "grad_norm": 7.357944011688232, + "learning_rate": 3.7188787853003614e-08, + "loss": 0.4025, + "step": 8726 + }, + { + "epoch": 0.9616528925619835, + "grad_norm": 5.621352672576904, + "learning_rate": 3.697623960256358e-08, + "loss": 0.3632, + "step": 8727 + }, + { + "epoch": 0.961763085399449, + "grad_norm": 4.002316951751709, + "learning_rate": 3.6764298237410014e-08, + "loss": 0.377, + "step": 8728 + }, + { + "epoch": 0.9618732782369146, + "grad_norm": 7.255180835723877, + "learning_rate": 3.655296378346052e-08, + "loss": 0.4069, + "step": 8729 + }, + { + "epoch": 0.9619834710743802, + "grad_norm": 6.317800521850586, + "learning_rate": 3.6342236266556085e-08, + "loss": 0.3946, + "step": 8730 + }, + { + "epoch": 0.9620936639118457, + "grad_norm": 12.31389331817627, + "learning_rate": 3.613211571246611e-08, + "loss": 0.388, + "step": 8731 + }, + { + "epoch": 0.9622038567493113, + "grad_norm": 7.282164573669434, + "learning_rate": 3.592260214688337e-08, + "loss": 0.3554, + "step": 8732 + }, + { + "epoch": 0.9623140495867769, + "grad_norm": 5.954392433166504, + "learning_rate": 3.571369559542792e-08, + "loss": 0.4262, + "step": 8733 + }, + { + "epoch": 0.9624242424242424, + "grad_norm": 10.04466724395752, + "learning_rate": 3.550539608364545e-08, + "loss": 0.4081, + "step": 8734 + }, + { + "epoch": 0.962534435261708, + "grad_norm": 5.788437843322754, + "learning_rate": 3.529770363700613e-08, + "loss": 0.3278, + "step": 8735 + }, + { + "epoch": 0.9626446280991735, + 
"grad_norm": 7.020529270172119, + "learning_rate": 3.5090618280907985e-08, + "loss": 0.4064, + "step": 8736 + }, + { + "epoch": 0.9627548209366391, + "grad_norm": 6.224966526031494, + "learning_rate": 3.4884140040672975e-08, + "loss": 0.3332, + "step": 8737 + }, + { + "epoch": 0.9628650137741047, + "grad_norm": 5.927913188934326, + "learning_rate": 3.4678268941549794e-08, + "loss": 0.4083, + "step": 8738 + }, + { + "epoch": 0.9629752066115702, + "grad_norm": 10.509246826171875, + "learning_rate": 3.447300500871276e-08, + "loss": 0.4871, + "step": 8739 + }, + { + "epoch": 0.9630853994490358, + "grad_norm": 5.570017337799072, + "learning_rate": 3.4268348267261776e-08, + "loss": 0.4285, + "step": 8740 + }, + { + "epoch": 0.9631955922865014, + "grad_norm": 10.697563171386719, + "learning_rate": 3.406429874222239e-08, + "loss": 0.3668, + "step": 8741 + }, + { + "epoch": 0.9633057851239669, + "grad_norm": 4.3002424240112305, + "learning_rate": 3.3860856458545754e-08, + "loss": 0.3056, + "step": 8742 + }, + { + "epoch": 0.9634159779614325, + "grad_norm": 5.99077033996582, + "learning_rate": 3.3658021441109744e-08, + "loss": 0.3781, + "step": 8743 + }, + { + "epoch": 0.963526170798898, + "grad_norm": 7.563045024871826, + "learning_rate": 3.345579371471674e-08, + "loss": 0.4507, + "step": 8744 + }, + { + "epoch": 0.9636363636363636, + "grad_norm": 7.128659725189209, + "learning_rate": 3.325417330409586e-08, + "loss": 0.3879, + "step": 8745 + }, + { + "epoch": 0.9637465564738292, + "grad_norm": 5.0663909912109375, + "learning_rate": 3.305316023390126e-08, + "loss": 0.4553, + "step": 8746 + }, + { + "epoch": 0.9638567493112947, + "grad_norm": 8.34847354888916, + "learning_rate": 3.2852754528713285e-08, + "loss": 0.4624, + "step": 8747 + }, + { + "epoch": 0.9639669421487603, + "grad_norm": 6.316442966461182, + "learning_rate": 3.265295621303788e-08, + "loss": 0.3036, + "step": 8748 + }, + { + "epoch": 0.9640771349862259, + "grad_norm": 6.613633155822754, + "learning_rate": 3.245376531130551e-08, + "loss": 0.3491, + "step": 8749 + }, + { + "epoch": 0.9641873278236914, + "grad_norm": 6.894759178161621, + "learning_rate": 3.2255181847875574e-08, + "loss": 0.4662, + "step": 8750 + }, + { + "epoch": 0.964297520661157, + "grad_norm": 7.324357032775879, + "learning_rate": 3.205720584702976e-08, + "loss": 0.3483, + "step": 8751 + }, + { + "epoch": 0.9644077134986226, + "grad_norm": 5.790113925933838, + "learning_rate": 3.1859837332976486e-08, + "loss": 0.367, + "step": 8752 + }, + { + "epoch": 0.9645179063360881, + "grad_norm": 6.1464715003967285, + "learning_rate": 3.166307632985144e-08, + "loss": 0.3834, + "step": 8753 + }, + { + "epoch": 0.9646280991735537, + "grad_norm": 9.115376472473145, + "learning_rate": 3.1466922861714266e-08, + "loss": 0.4463, + "step": 8754 + }, + { + "epoch": 0.9647382920110192, + "grad_norm": 10.096417427062988, + "learning_rate": 3.127137695255078e-08, + "loss": 0.3869, + "step": 8755 + }, + { + "epoch": 0.9648484848484848, + "grad_norm": 5.323825836181641, + "learning_rate": 3.107643862627241e-08, + "loss": 0.4285, + "step": 8756 + }, + { + "epoch": 0.9649586776859504, + "grad_norm": 7.107351303100586, + "learning_rate": 3.0882107906717307e-08, + "loss": 0.4017, + "step": 8757 + }, + { + "epoch": 0.965068870523416, + "grad_norm": 9.90488338470459, + "learning_rate": 3.0688384817647574e-08, + "loss": 0.3584, + "step": 8758 + }, + { + "epoch": 0.9651790633608816, + "grad_norm": 8.946452140808105, + "learning_rate": 3.0495269382752046e-08, + "loss": 0.3719, + "step": 8759 + }, + { 
+ "epoch": 0.9652892561983472, + "grad_norm": 8.5913667678833, + "learning_rate": 3.0302761625645716e-08, + "loss": 0.4018, + "step": 8760 + }, + { + "epoch": 0.9653994490358127, + "grad_norm": 8.09288215637207, + "learning_rate": 3.011086156986864e-08, + "loss": 0.453, + "step": 8761 + }, + { + "epoch": 0.9655096418732783, + "grad_norm": 5.370037078857422, + "learning_rate": 2.991956923888539e-08, + "loss": 0.3169, + "step": 8762 + }, + { + "epoch": 0.9656198347107438, + "grad_norm": 5.912369251251221, + "learning_rate": 2.9728884656088918e-08, + "loss": 0.3846, + "step": 8763 + }, + { + "epoch": 0.9657300275482094, + "grad_norm": 6.833310127258301, + "learning_rate": 2.9538807844796124e-08, + "loss": 0.3688, + "step": 8764 + }, + { + "epoch": 0.965840220385675, + "grad_norm": 6.207882404327393, + "learning_rate": 2.934933882824953e-08, + "loss": 0.373, + "step": 8765 + }, + { + "epoch": 0.9659504132231405, + "grad_norm": 4.079768657684326, + "learning_rate": 2.916047762961782e-08, + "loss": 0.438, + "step": 8766 + }, + { + "epoch": 0.9660606060606061, + "grad_norm": 10.41829776763916, + "learning_rate": 2.8972224271994755e-08, + "loss": 0.4724, + "step": 8767 + }, + { + "epoch": 0.9661707988980717, + "grad_norm": 7.421224594116211, + "learning_rate": 2.8784578778400796e-08, + "loss": 0.4165, + "step": 8768 + }, + { + "epoch": 0.9662809917355372, + "grad_norm": 6.185629367828369, + "learning_rate": 2.8597541171781483e-08, + "loss": 0.4181, + "step": 8769 + }, + { + "epoch": 0.9663911845730028, + "grad_norm": 5.071274757385254, + "learning_rate": 2.8411111475007968e-08, + "loss": 0.322, + "step": 8770 + }, + { + "epoch": 0.9665013774104683, + "grad_norm": 5.575862884521484, + "learning_rate": 2.8225289710876457e-08, + "loss": 0.331, + "step": 8771 + }, + { + "epoch": 0.9666115702479339, + "grad_norm": 6.873533248901367, + "learning_rate": 2.8040075902109887e-08, + "loss": 0.423, + "step": 8772 + }, + { + "epoch": 0.9667217630853995, + "grad_norm": 5.264482021331787, + "learning_rate": 2.785547007135736e-08, + "loss": 0.3141, + "step": 8773 + }, + { + "epoch": 0.966831955922865, + "grad_norm": 9.443242073059082, + "learning_rate": 2.7671472241191376e-08, + "loss": 0.3916, + "step": 8774 + }, + { + "epoch": 0.9669421487603306, + "grad_norm": 8.357752799987793, + "learning_rate": 2.7488082434111718e-08, + "loss": 0.4151, + "step": 8775 + }, + { + "epoch": 0.9670523415977962, + "grad_norm": 7.899710178375244, + "learning_rate": 2.7305300672544334e-08, + "loss": 0.3292, + "step": 8776 + }, + { + "epoch": 0.9671625344352617, + "grad_norm": 6.588962554931641, + "learning_rate": 2.7123126978839676e-08, + "loss": 0.3844, + "step": 8777 + }, + { + "epoch": 0.9672727272727273, + "grad_norm": 8.38394832611084, + "learning_rate": 2.6941561375273818e-08, + "loss": 0.4229, + "step": 8778 + }, + { + "epoch": 0.9673829201101929, + "grad_norm": 13.35795783996582, + "learning_rate": 2.6760603884048998e-08, + "loss": 0.4688, + "step": 8779 + }, + { + "epoch": 0.9674931129476584, + "grad_norm": 9.614317893981934, + "learning_rate": 2.658025452729307e-08, + "loss": 0.3995, + "step": 8780 + }, + { + "epoch": 0.967603305785124, + "grad_norm": 7.807085990905762, + "learning_rate": 2.6400513327059508e-08, + "loss": 0.4585, + "step": 8781 + }, + { + "epoch": 0.9677134986225895, + "grad_norm": 5.789939880371094, + "learning_rate": 2.622138030532684e-08, + "loss": 0.3661, + "step": 8782 + }, + { + "epoch": 0.9678236914600551, + "grad_norm": 5.294877529144287, + "learning_rate": 2.604285548399976e-08, + "loss": 0.3886, 
+ "step": 8783 + }, + { + "epoch": 0.9679338842975207, + "grad_norm": 4.9963812828063965, + "learning_rate": 2.5864938884909707e-08, + "loss": 0.4206, + "step": 8784 + }, + { + "epoch": 0.9680440771349862, + "grad_norm": 4.177475452423096, + "learning_rate": 2.5687630529810935e-08, + "loss": 0.3957, + "step": 8785 + }, + { + "epoch": 0.9681542699724518, + "grad_norm": 7.786369323730469, + "learning_rate": 2.5510930440385552e-08, + "loss": 0.4177, + "step": 8786 + }, + { + "epoch": 0.9682644628099174, + "grad_norm": 5.515124320983887, + "learning_rate": 2.5334838638241268e-08, + "loss": 0.3151, + "step": 8787 + }, + { + "epoch": 0.9683746556473829, + "grad_norm": 8.18659496307373, + "learning_rate": 2.5159355144909746e-08, + "loss": 0.4789, + "step": 8788 + }, + { + "epoch": 0.9684848484848485, + "grad_norm": 4.32504940032959, + "learning_rate": 2.4984479981850494e-08, + "loss": 0.3506, + "step": 8789 + }, + { + "epoch": 0.968595041322314, + "grad_norm": 6.653140068054199, + "learning_rate": 2.481021317044696e-08, + "loss": 0.3292, + "step": 8790 + }, + { + "epoch": 0.9687052341597796, + "grad_norm": 5.733898639678955, + "learning_rate": 2.463655473200821e-08, + "loss": 0.3428, + "step": 8791 + }, + { + "epoch": 0.9688154269972452, + "grad_norm": 5.614349365234375, + "learning_rate": 2.4463504687770035e-08, + "loss": 0.3711, + "step": 8792 + }, + { + "epoch": 0.9689256198347107, + "grad_norm": 6.361004829406738, + "learning_rate": 2.4291063058893283e-08, + "loss": 0.4352, + "step": 8793 + }, + { + "epoch": 0.9690358126721763, + "grad_norm": 5.7507100105285645, + "learning_rate": 2.4119229866463866e-08, + "loss": 0.333, + "step": 8794 + }, + { + "epoch": 0.9691460055096419, + "grad_norm": 8.004241943359375, + "learning_rate": 2.3948005131494422e-08, + "loss": 0.4431, + "step": 8795 + }, + { + "epoch": 0.9692561983471074, + "grad_norm": 5.039779186248779, + "learning_rate": 2.3777388874922092e-08, + "loss": 0.3062, + "step": 8796 + }, + { + "epoch": 0.969366391184573, + "grad_norm": 10.546436309814453, + "learning_rate": 2.3607381117610184e-08, + "loss": 0.4645, + "step": 8797 + }, + { + "epoch": 0.9694765840220385, + "grad_norm": 4.041770935058594, + "learning_rate": 2.3437981880347628e-08, + "loss": 0.3388, + "step": 8798 + }, + { + "epoch": 0.9695867768595041, + "grad_norm": 4.304748058319092, + "learning_rate": 2.3269191183848405e-08, + "loss": 0.3828, + "step": 8799 + }, + { + "epoch": 0.9696969696969697, + "grad_norm": 5.550631046295166, + "learning_rate": 2.3101009048752678e-08, + "loss": 0.3803, + "step": 8800 + }, + { + "epoch": 0.9698071625344352, + "grad_norm": 7.604842662811279, + "learning_rate": 2.2933435495626212e-08, + "loss": 0.3725, + "step": 8801 + }, + { + "epoch": 0.9699173553719008, + "grad_norm": 5.92302131652832, + "learning_rate": 2.276647054495984e-08, + "loss": 0.415, + "step": 8802 + }, + { + "epoch": 0.9700275482093664, + "grad_norm": 5.738622665405273, + "learning_rate": 2.2600114217170566e-08, + "loss": 0.3844, + "step": 8803 + }, + { + "epoch": 0.9701377410468319, + "grad_norm": 5.6382646560668945, + "learning_rate": 2.243436653260045e-08, + "loss": 0.4277, + "step": 8804 + }, + { + "epoch": 0.9702479338842975, + "grad_norm": 5.22743558883667, + "learning_rate": 2.2269227511517167e-08, + "loss": 0.3433, + "step": 8805 + }, + { + "epoch": 0.9703581267217631, + "grad_norm": 4.088083267211914, + "learning_rate": 2.210469717411401e-08, + "loss": 0.2806, + "step": 8806 + }, + { + "epoch": 0.9704683195592286, + "grad_norm": 6.981813907623291, + "learning_rate": 
2.1940775540510996e-08, + "loss": 0.4285, + "step": 8807 + }, + { + "epoch": 0.9705785123966942, + "grad_norm": 11.983031272888184, + "learning_rate": 2.1777462630751533e-08, + "loss": 0.4332, + "step": 8808 + }, + { + "epoch": 0.9706887052341597, + "grad_norm": 12.438445091247559, + "learning_rate": 2.1614758464806316e-08, + "loss": 0.4587, + "step": 8809 + }, + { + "epoch": 0.9707988980716253, + "grad_norm": 8.856440544128418, + "learning_rate": 2.14526630625711e-08, + "loss": 0.5133, + "step": 8810 + }, + { + "epoch": 0.9709090909090909, + "grad_norm": 6.0965399742126465, + "learning_rate": 2.1291176443866134e-08, + "loss": 0.3325, + "step": 8811 + }, + { + "epoch": 0.9710192837465564, + "grad_norm": 9.841634750366211, + "learning_rate": 2.113029862844007e-08, + "loss": 0.4037, + "step": 8812 + }, + { + "epoch": 0.971129476584022, + "grad_norm": 5.323287487030029, + "learning_rate": 2.097002963596384e-08, + "loss": 0.3911, + "step": 8813 + }, + { + "epoch": 0.9712396694214877, + "grad_norm": 6.11491060256958, + "learning_rate": 2.0810369486035652e-08, + "loss": 0.3484, + "step": 8814 + }, + { + "epoch": 0.9713498622589531, + "grad_norm": 8.195962905883789, + "learning_rate": 2.0651318198178783e-08, + "loss": 0.3972, + "step": 8815 + }, + { + "epoch": 0.9714600550964188, + "grad_norm": 9.117520332336426, + "learning_rate": 2.049287579184267e-08, + "loss": 0.3708, + "step": 8816 + }, + { + "epoch": 0.9715702479338842, + "grad_norm": 7.001363277435303, + "learning_rate": 2.0335042286401817e-08, + "loss": 0.3454, + "step": 8817 + }, + { + "epoch": 0.9716804407713499, + "grad_norm": 8.596649169921875, + "learning_rate": 2.0177817701156342e-08, + "loss": 0.387, + "step": 8818 + }, + { + "epoch": 0.9717906336088155, + "grad_norm": 3.672086477279663, + "learning_rate": 2.0021202055331424e-08, + "loss": 0.3942, + "step": 8819 + }, + { + "epoch": 0.971900826446281, + "grad_norm": 8.945923805236816, + "learning_rate": 1.986519536807896e-08, + "loss": 0.3429, + "step": 8820 + }, + { + "epoch": 0.9720110192837466, + "grad_norm": 5.04570198059082, + "learning_rate": 1.9709797658474805e-08, + "loss": 0.4346, + "step": 8821 + }, + { + "epoch": 0.9721212121212122, + "grad_norm": 10.192368507385254, + "learning_rate": 1.9555008945521536e-08, + "loss": 0.4353, + "step": 8822 + }, + { + "epoch": 0.9722314049586777, + "grad_norm": 6.014553546905518, + "learning_rate": 1.9400829248147902e-08, + "loss": 0.4043, + "step": 8823 + }, + { + "epoch": 0.9723415977961433, + "grad_norm": 13.046184539794922, + "learning_rate": 1.9247258585205488e-08, + "loss": 0.504, + "step": 8824 + }, + { + "epoch": 0.9724517906336089, + "grad_norm": 8.074289321899414, + "learning_rate": 1.9094296975474268e-08, + "loss": 0.3953, + "step": 8825 + }, + { + "epoch": 0.9725619834710744, + "grad_norm": 7.343116283416748, + "learning_rate": 1.894194443765873e-08, + "loss": 0.4039, + "step": 8826 + }, + { + "epoch": 0.97267217630854, + "grad_norm": 5.112274169921875, + "learning_rate": 1.8790200990387863e-08, + "loss": 0.4343, + "step": 8827 + }, + { + "epoch": 0.9727823691460055, + "grad_norm": 4.739718437194824, + "learning_rate": 1.8639066652217375e-08, + "loss": 0.387, + "step": 8828 + }, + { + "epoch": 0.9728925619834711, + "grad_norm": 4.407110691070557, + "learning_rate": 1.8488541441628593e-08, + "loss": 0.2905, + "step": 8829 + }, + { + "epoch": 0.9730027548209367, + "grad_norm": 10.422987937927246, + "learning_rate": 1.8338625377027907e-08, + "loss": 0.5067, + "step": 8830 + }, + { + "epoch": 0.9731129476584022, + "grad_norm": 
7.39470911026001, + "learning_rate": 1.8189318476746764e-08, + "loss": 0.3879, + "step": 8831 + }, + { + "epoch": 0.9732231404958678, + "grad_norm": 9.564068794250488, + "learning_rate": 1.804062075904278e-08, + "loss": 0.4198, + "step": 8832 + }, + { + "epoch": 0.9733333333333334, + "grad_norm": 5.889505863189697, + "learning_rate": 1.7892532242099192e-08, + "loss": 0.3871, + "step": 8833 + }, + { + "epoch": 0.9734435261707989, + "grad_norm": 9.963391304016113, + "learning_rate": 1.774505294402429e-08, + "loss": 0.3672, + "step": 8834 + }, + { + "epoch": 0.9735537190082645, + "grad_norm": 9.059921264648438, + "learning_rate": 1.7598182882851977e-08, + "loss": 0.3408, + "step": 8835 + }, + { + "epoch": 0.97366391184573, + "grad_norm": 6.291049480438232, + "learning_rate": 1.7451922076541783e-08, + "loss": 0.3148, + "step": 8836 + }, + { + "epoch": 0.9737741046831956, + "grad_norm": 5.076037883758545, + "learning_rate": 1.7306270542978288e-08, + "loss": 0.4341, + "step": 8837 + }, + { + "epoch": 0.9738842975206612, + "grad_norm": 6.406747341156006, + "learning_rate": 1.7161228299973354e-08, + "loss": 0.3532, + "step": 8838 + }, + { + "epoch": 0.9739944903581267, + "grad_norm": 6.340034008026123, + "learning_rate": 1.701679536526113e-08, + "loss": 0.4543, + "step": 8839 + }, + { + "epoch": 0.9741046831955923, + "grad_norm": 7.013032913208008, + "learning_rate": 1.6872971756504153e-08, + "loss": 0.3869, + "step": 8840 + }, + { + "epoch": 0.9742148760330579, + "grad_norm": 5.976755142211914, + "learning_rate": 1.672975749128891e-08, + "loss": 0.4204, + "step": 8841 + }, + { + "epoch": 0.9743250688705234, + "grad_norm": 6.865273952484131, + "learning_rate": 1.6587152587128064e-08, + "loss": 0.4589, + "step": 8842 + }, + { + "epoch": 0.974435261707989, + "grad_norm": 7.500924587249756, + "learning_rate": 1.6445157061459883e-08, + "loss": 0.4156, + "step": 8843 + }, + { + "epoch": 0.9745454545454545, + "grad_norm": 15.058416366577148, + "learning_rate": 1.6303770931647146e-08, + "loss": 0.3318, + "step": 8844 + }, + { + "epoch": 0.9746556473829201, + "grad_norm": 5.671866416931152, + "learning_rate": 1.61629942149788e-08, + "loss": 0.3541, + "step": 8845 + }, + { + "epoch": 0.9747658402203857, + "grad_norm": 5.84348726272583, + "learning_rate": 1.6022826928669964e-08, + "loss": 0.4222, + "step": 8846 + }, + { + "epoch": 0.9748760330578512, + "grad_norm": 5.512253284454346, + "learning_rate": 1.5883269089859155e-08, + "loss": 0.3553, + "step": 8847 + }, + { + "epoch": 0.9749862258953168, + "grad_norm": 6.648393154144287, + "learning_rate": 1.5744320715612716e-08, + "loss": 0.4339, + "step": 8848 + }, + { + "epoch": 0.9750964187327824, + "grad_norm": 5.175326347351074, + "learning_rate": 1.56059818229215e-08, + "loss": 0.3948, + "step": 8849 + }, + { + "epoch": 0.9752066115702479, + "grad_norm": 7.846436977386475, + "learning_rate": 1.5468252428701425e-08, + "loss": 0.3716, + "step": 8850 + }, + { + "epoch": 0.9753168044077135, + "grad_norm": 4.911636829376221, + "learning_rate": 1.5331132549794014e-08, + "loss": 0.3354, + "step": 8851 + }, + { + "epoch": 0.9754269972451791, + "grad_norm": 8.698405265808105, + "learning_rate": 1.5194622202966968e-08, + "loss": 0.3774, + "step": 8852 + }, + { + "epoch": 0.9755371900826446, + "grad_norm": 5.8944501876831055, + "learning_rate": 1.505872140491249e-08, + "loss": 0.4486, + "step": 8853 + }, + { + "epoch": 0.9756473829201102, + "grad_norm": 6.094091415405273, + "learning_rate": 1.4923430172248953e-08, + "loss": 0.4023, + "step": 8854 + }, + { + "epoch": 
0.9757575757575757, + "grad_norm": 10.681119918823242, + "learning_rate": 1.4788748521519792e-08, + "loss": 0.4874, + "step": 8855 + }, + { + "epoch": 0.9758677685950413, + "grad_norm": 5.61772346496582, + "learning_rate": 1.4654676469194607e-08, + "loss": 0.3438, + "step": 8856 + }, + { + "epoch": 0.9759779614325069, + "grad_norm": 4.298542499542236, + "learning_rate": 1.4521214031666952e-08, + "loss": 0.374, + "step": 8857 + }, + { + "epoch": 0.9760881542699724, + "grad_norm": 6.038506031036377, + "learning_rate": 1.4388361225257663e-08, + "loss": 0.3459, + "step": 8858 + }, + { + "epoch": 0.976198347107438, + "grad_norm": 5.182193756103516, + "learning_rate": 1.4256118066212077e-08, + "loss": 0.3743, + "step": 8859 + }, + { + "epoch": 0.9763085399449036, + "grad_norm": 8.860308647155762, + "learning_rate": 1.4124484570700591e-08, + "loss": 0.4173, + "step": 8860 + }, + { + "epoch": 0.9764187327823691, + "grad_norm": 7.584855556488037, + "learning_rate": 1.3993460754819777e-08, + "loss": 0.3871, + "step": 8861 + }, + { + "epoch": 0.9765289256198347, + "grad_norm": 10.860026359558105, + "learning_rate": 1.3863046634591815e-08, + "loss": 0.3235, + "step": 8862 + }, + { + "epoch": 0.9766391184573002, + "grad_norm": 5.567005634307861, + "learning_rate": 1.373324222596284e-08, + "loss": 0.3589, + "step": 8863 + }, + { + "epoch": 0.9767493112947658, + "grad_norm": 8.486008644104004, + "learning_rate": 1.360404754480682e-08, + "loss": 0.4191, + "step": 8864 + }, + { + "epoch": 0.9768595041322314, + "grad_norm": 10.701818466186523, + "learning_rate": 1.347546260692112e-08, + "loss": 0.5152, + "step": 8865 + }, + { + "epoch": 0.9769696969696969, + "grad_norm": 10.07162857055664, + "learning_rate": 1.3347487428029272e-08, + "loss": 0.4344, + "step": 8866 + }, + { + "epoch": 0.9770798898071625, + "grad_norm": 6.04201078414917, + "learning_rate": 1.3220122023779869e-08, + "loss": 0.3542, + "step": 8867 + }, + { + "epoch": 0.9771900826446281, + "grad_norm": 8.38823127746582, + "learning_rate": 1.3093366409748232e-08, + "loss": 0.4219, + "step": 8868 + }, + { + "epoch": 0.9773002754820936, + "grad_norm": 8.828341484069824, + "learning_rate": 1.2967220601434183e-08, + "loss": 0.4607, + "step": 8869 + }, + { + "epoch": 0.9774104683195592, + "grad_norm": 8.483080863952637, + "learning_rate": 1.2841684614262052e-08, + "loss": 0.4064, + "step": 8870 + }, + { + "epoch": 0.9775206611570247, + "grad_norm": 5.314237594604492, + "learning_rate": 1.2716758463583444e-08, + "loss": 0.3547, + "step": 8871 + }, + { + "epoch": 0.9776308539944903, + "grad_norm": 5.820374011993408, + "learning_rate": 1.2592442164673923e-08, + "loss": 0.3678, + "step": 8872 + }, + { + "epoch": 0.977741046831956, + "grad_norm": 5.87931489944458, + "learning_rate": 1.2468735732735215e-08, + "loss": 0.3456, + "step": 8873 + }, + { + "epoch": 0.9778512396694214, + "grad_norm": 4.359767913818359, + "learning_rate": 1.2345639182894664e-08, + "loss": 0.4238, + "step": 8874 + }, + { + "epoch": 0.977961432506887, + "grad_norm": 6.766221523284912, + "learning_rate": 1.2223152530204118e-08, + "loss": 0.3743, + "step": 8875 + }, + { + "epoch": 0.9780716253443527, + "grad_norm": 8.950994491577148, + "learning_rate": 1.2101275789642152e-08, + "loss": 0.4784, + "step": 8876 + }, + { + "epoch": 0.9781818181818182, + "grad_norm": 5.21338415145874, + "learning_rate": 1.198000897611129e-08, + "loss": 0.398, + "step": 8877 + }, + { + "epoch": 0.9782920110192838, + "grad_norm": 8.412769317626953, + "learning_rate": 1.1859352104440225e-08, + "loss": 0.4103, 
+ "step": 8878 + }, + { + "epoch": 0.9784022038567494, + "grad_norm": 5.464070796966553, + "learning_rate": 1.173930518938382e-08, + "loss": 0.3167, + "step": 8879 + }, + { + "epoch": 0.9785123966942149, + "grad_norm": 6.9967803955078125, + "learning_rate": 1.161986824562089e-08, + "loss": 0.3065, + "step": 8880 + }, + { + "epoch": 0.9786225895316805, + "grad_norm": 11.746569633483887, + "learning_rate": 1.150104128775642e-08, + "loss": 0.3764, + "step": 8881 + }, + { + "epoch": 0.978732782369146, + "grad_norm": 5.9719648361206055, + "learning_rate": 1.1382824330321007e-08, + "loss": 0.3483, + "step": 8882 + }, + { + "epoch": 0.9788429752066116, + "grad_norm": 5.727097511291504, + "learning_rate": 1.126521738777031e-08, + "loss": 0.3614, + "step": 8883 + }, + { + "epoch": 0.9789531680440772, + "grad_norm": 6.904728412628174, + "learning_rate": 1.1148220474485049e-08, + "loss": 0.3706, + "step": 8884 + }, + { + "epoch": 0.9790633608815427, + "grad_norm": 5.867455959320068, + "learning_rate": 1.1031833604772113e-08, + "loss": 0.3982, + "step": 8885 + }, + { + "epoch": 0.9791735537190083, + "grad_norm": 11.182619094848633, + "learning_rate": 1.091605679286345e-08, + "loss": 0.5638, + "step": 8886 + }, + { + "epoch": 0.9792837465564739, + "grad_norm": 5.756790637969971, + "learning_rate": 1.0800890052916623e-08, + "loss": 0.3807, + "step": 8887 + }, + { + "epoch": 0.9793939393939394, + "grad_norm": 4.721381187438965, + "learning_rate": 1.0686333399013704e-08, + "loss": 0.394, + "step": 8888 + }, + { + "epoch": 0.979504132231405, + "grad_norm": 5.011298179626465, + "learning_rate": 1.0572386845163485e-08, + "loss": 0.4153, + "step": 8889 + }, + { + "epoch": 0.9796143250688705, + "grad_norm": 5.2296142578125, + "learning_rate": 1.0459050405299265e-08, + "loss": 0.3156, + "step": 8890 + }, + { + "epoch": 0.9797245179063361, + "grad_norm": 5.576880931854248, + "learning_rate": 1.0346324093279958e-08, + "loss": 0.3834, + "step": 8891 + }, + { + "epoch": 0.9798347107438017, + "grad_norm": 8.7191162109375, + "learning_rate": 1.0234207922890094e-08, + "loss": 0.4275, + "step": 8892 + }, + { + "epoch": 0.9799449035812672, + "grad_norm": 6.978935241699219, + "learning_rate": 1.012270190783926e-08, + "loss": 0.4068, + "step": 8893 + }, + { + "epoch": 0.9800550964187328, + "grad_norm": 6.131322383880615, + "learning_rate": 1.0011806061762109e-08, + "loss": 0.3177, + "step": 8894 + }, + { + "epoch": 0.9801652892561984, + "grad_norm": 5.406473636627197, + "learning_rate": 9.90152039822001e-09, + "loss": 0.3968, + "step": 8895 + }, + { + "epoch": 0.9802754820936639, + "grad_norm": 11.630650520324707, + "learning_rate": 9.791844930697736e-09, + "loss": 0.4068, + "step": 8896 + }, + { + "epoch": 0.9803856749311295, + "grad_norm": 3.7398698329925537, + "learning_rate": 9.68277967260789e-09, + "loss": 0.3398, + "step": 8897 + }, + { + "epoch": 0.980495867768595, + "grad_norm": 5.201544284820557, + "learning_rate": 9.574324637285915e-09, + "loss": 0.4375, + "step": 8898 + }, + { + "epoch": 0.9806060606060606, + "grad_norm": 15.65612506866455, + "learning_rate": 9.466479837994536e-09, + "loss": 0.407, + "step": 8899 + }, + { + "epoch": 0.9807162534435262, + "grad_norm": 5.909888744354248, + "learning_rate": 9.359245287920981e-09, + "loss": 0.4024, + "step": 8900 + }, + { + "epoch": 0.9808264462809917, + "grad_norm": 5.173218727111816, + "learning_rate": 9.252621000178097e-09, + "loss": 0.4007, + "step": 8901 + }, + { + "epoch": 0.9809366391184573, + "grad_norm": 4.32283353805542, + "learning_rate": 
9.146606987804341e-09, + "loss": 0.3089, + "step": 8902 + }, + { + "epoch": 0.9810468319559229, + "grad_norm": 10.846911430358887, + "learning_rate": 9.041203263762122e-09, + "loss": 0.4318, + "step": 8903 + }, + { + "epoch": 0.9811570247933884, + "grad_norm": 6.127249717712402, + "learning_rate": 8.936409840941685e-09, + "loss": 0.4559, + "step": 8904 + }, + { + "epoch": 0.981267217630854, + "grad_norm": 6.813745975494385, + "learning_rate": 8.832226732156668e-09, + "loss": 0.4043, + "step": 8905 + }, + { + "epoch": 0.9813774104683196, + "grad_norm": 5.514586925506592, + "learning_rate": 8.728653950146326e-09, + "loss": 0.4176, + "step": 8906 + }, + { + "epoch": 0.9814876033057851, + "grad_norm": 7.442978858947754, + "learning_rate": 8.625691507576638e-09, + "loss": 0.4868, + "step": 8907 + }, + { + "epoch": 0.9815977961432507, + "grad_norm": 6.240289211273193, + "learning_rate": 8.523339417037535e-09, + "loss": 0.4179, + "step": 8908 + }, + { + "epoch": 0.9817079889807162, + "grad_norm": 5.748809814453125, + "learning_rate": 8.421597691044559e-09, + "loss": 0.4019, + "step": 8909 + }, + { + "epoch": 0.9818181818181818, + "grad_norm": 6.198007106781006, + "learning_rate": 8.320466342038868e-09, + "loss": 0.3816, + "step": 8910 + }, + { + "epoch": 0.9819283746556474, + "grad_norm": 4.0680928230285645, + "learning_rate": 8.21994538238724e-09, + "loss": 0.3458, + "step": 8911 + }, + { + "epoch": 0.9820385674931129, + "grad_norm": 7.397459030151367, + "learning_rate": 8.120034824381506e-09, + "loss": 0.4114, + "step": 8912 + }, + { + "epoch": 0.9821487603305785, + "grad_norm": 5.5714430809021, + "learning_rate": 8.020734680238562e-09, + "loss": 0.3178, + "step": 8913 + }, + { + "epoch": 0.9822589531680441, + "grad_norm": 5.759981632232666, + "learning_rate": 7.922044962100916e-09, + "loss": 0.3357, + "step": 8914 + }, + { + "epoch": 0.9823691460055096, + "grad_norm": 8.685111045837402, + "learning_rate": 7.823965682037249e-09, + "loss": 0.4521, + "step": 8915 + }, + { + "epoch": 0.9824793388429752, + "grad_norm": 6.009195327758789, + "learning_rate": 7.726496852039633e-09, + "loss": 0.3254, + "step": 8916 + }, + { + "epoch": 0.9825895316804407, + "grad_norm": 6.0467143058776855, + "learning_rate": 7.629638484027424e-09, + "loss": 0.2981, + "step": 8917 + }, + { + "epoch": 0.9826997245179063, + "grad_norm": 6.970085144042969, + "learning_rate": 7.533390589845035e-09, + "loss": 0.3373, + "step": 8918 + }, + { + "epoch": 0.9828099173553719, + "grad_norm": 8.984983444213867, + "learning_rate": 7.437753181260831e-09, + "loss": 0.3906, + "step": 8919 + }, + { + "epoch": 0.9829201101928374, + "grad_norm": 12.746138572692871, + "learning_rate": 7.342726269969902e-09, + "loss": 0.513, + "step": 8920 + }, + { + "epoch": 0.983030303030303, + "grad_norm": 7.483966827392578, + "learning_rate": 7.2483098675923955e-09, + "loss": 0.4206, + "step": 8921 + }, + { + "epoch": 0.9831404958677686, + "grad_norm": 11.717601776123047, + "learning_rate": 7.154503985673522e-09, + "loss": 0.4531, + "step": 8922 + }, + { + "epoch": 0.9832506887052341, + "grad_norm": 5.9319353103637695, + "learning_rate": 7.061308635684105e-09, + "loss": 0.3361, + "step": 8923 + }, + { + "epoch": 0.9833608815426997, + "grad_norm": 8.37967586517334, + "learning_rate": 6.96872382902003e-09, + "loss": 0.3765, + "step": 8924 + }, + { + "epoch": 0.9834710743801653, + "grad_norm": 5.437576770782471, + "learning_rate": 6.876749577002795e-09, + "loss": 0.3946, + "step": 8925 + }, + { + "epoch": 0.9835812672176308, + "grad_norm": 
4.543797492980957, + "learning_rate": 6.7853858908784046e-09, + "loss": 0.4174, + "step": 8926 + }, + { + "epoch": 0.9836914600550964, + "grad_norm": 5.815567970275879, + "learning_rate": 6.694632781820698e-09, + "loss": 0.4073, + "step": 8927 + }, + { + "epoch": 0.9838016528925619, + "grad_norm": 9.523588180541992, + "learning_rate": 6.604490260925245e-09, + "loss": 0.4364, + "step": 8928 + }, + { + "epoch": 0.9839118457300275, + "grad_norm": 4.060024261474609, + "learning_rate": 6.514958339216004e-09, + "loss": 0.3589, + "step": 8929 + }, + { + "epoch": 0.9840220385674932, + "grad_norm": 6.048622131347656, + "learning_rate": 6.4260370276408855e-09, + "loss": 0.3405, + "step": 8930 + }, + { + "epoch": 0.9841322314049586, + "grad_norm": 5.9720282554626465, + "learning_rate": 6.3377263370728585e-09, + "loss": 0.4164, + "step": 8931 + }, + { + "epoch": 0.9842424242424243, + "grad_norm": 5.029387474060059, + "learning_rate": 6.250026278310506e-09, + "loss": 0.3852, + "step": 8932 + }, + { + "epoch": 0.9843526170798899, + "grad_norm": 4.894454479217529, + "learning_rate": 6.162936862078583e-09, + "loss": 0.337, + "step": 8933 + }, + { + "epoch": 0.9844628099173554, + "grad_norm": 6.65284538269043, + "learning_rate": 6.07645809902635e-09, + "loss": 0.3496, + "step": 8934 + }, + { + "epoch": 0.984573002754821, + "grad_norm": 5.625543594360352, + "learning_rate": 5.9905899997281244e-09, + "loss": 0.3761, + "step": 8935 + }, + { + "epoch": 0.9846831955922865, + "grad_norm": 11.028755187988281, + "learning_rate": 5.905332574684397e-09, + "loss": 0.5301, + "step": 8936 + }, + { + "epoch": 0.9847933884297521, + "grad_norm": 8.856451988220215, + "learning_rate": 5.820685834320161e-09, + "loss": 0.4884, + "step": 8937 + }, + { + "epoch": 0.9849035812672177, + "grad_norm": 8.534035682678223, + "learning_rate": 5.736649788986581e-09, + "loss": 0.4377, + "step": 8938 + }, + { + "epoch": 0.9850137741046832, + "grad_norm": 6.711327075958252, + "learning_rate": 5.653224448959882e-09, + "loss": 0.3666, + "step": 8939 + }, + { + "epoch": 0.9851239669421488, + "grad_norm": 7.608486175537109, + "learning_rate": 5.570409824440237e-09, + "loss": 0.3626, + "step": 8940 + }, + { + "epoch": 0.9852341597796144, + "grad_norm": 6.2303786277771, + "learning_rate": 5.488205925555656e-09, + "loss": 0.342, + "step": 8941 + }, + { + "epoch": 0.9853443526170799, + "grad_norm": 9.190738677978516, + "learning_rate": 5.406612762357544e-09, + "loss": 0.445, + "step": 8942 + }, + { + "epoch": 0.9854545454545455, + "grad_norm": 6.4787278175354, + "learning_rate": 5.325630344823474e-09, + "loss": 0.4057, + "step": 8943 + }, + { + "epoch": 0.985564738292011, + "grad_norm": 5.341314792633057, + "learning_rate": 5.24525868285608e-09, + "loss": 0.4303, + "step": 8944 + }, + { + "epoch": 0.9856749311294766, + "grad_norm": 6.1412129402160645, + "learning_rate": 5.1654977862825025e-09, + "loss": 0.4085, + "step": 8945 + }, + { + "epoch": 0.9857851239669422, + "grad_norm": 5.3331732749938965, + "learning_rate": 5.0863476648571605e-09, + "loss": 0.3269, + "step": 8946 + }, + { + "epoch": 0.9858953168044077, + "grad_norm": 6.190547466278076, + "learning_rate": 5.007808328258423e-09, + "loss": 0.4189, + "step": 8947 + }, + { + "epoch": 0.9860055096418733, + "grad_norm": 13.57688045501709, + "learning_rate": 4.9298797860891645e-09, + "loss": 0.5056, + "step": 8948 + }, + { + "epoch": 0.9861157024793389, + "grad_norm": 6.407979488372803, + "learning_rate": 4.852562047879539e-09, + "loss": 0.2966, + "step": 8949 + }, + { + "epoch": 
0.9862258953168044, + "grad_norm": 5.984018325805664, + "learning_rate": 4.775855123084206e-09, + "loss": 0.3656, + "step": 8950 + }, + { + "epoch": 0.98633608815427, + "grad_norm": 8.00353717803955, + "learning_rate": 4.699759021082328e-09, + "loss": 0.4132, + "step": 8951 + }, + { + "epoch": 0.9864462809917356, + "grad_norm": 6.895720958709717, + "learning_rate": 4.624273751179797e-09, + "loss": 0.4028, + "step": 8952 + }, + { + "epoch": 0.9865564738292011, + "grad_norm": 9.572421073913574, + "learning_rate": 4.549399322606451e-09, + "loss": 0.4381, + "step": 8953 + }, + { + "epoch": 0.9866666666666667, + "grad_norm": 5.052114963531494, + "learning_rate": 4.4751357445177445e-09, + "loss": 0.3868, + "step": 8954 + }, + { + "epoch": 0.9867768595041322, + "grad_norm": 5.700187683105469, + "learning_rate": 4.4014830259958565e-09, + "loss": 0.4035, + "step": 8955 + }, + { + "epoch": 0.9868870523415978, + "grad_norm": 4.689455509185791, + "learning_rate": 4.328441176045806e-09, + "loss": 0.3585, + "step": 8956 + }, + { + "epoch": 0.9869972451790634, + "grad_norm": 5.007065296173096, + "learning_rate": 4.256010203600447e-09, + "loss": 0.3385, + "step": 8957 + }, + { + "epoch": 0.9871074380165289, + "grad_norm": 8.176401138305664, + "learning_rate": 4.184190117516029e-09, + "loss": 0.4321, + "step": 8958 + }, + { + "epoch": 0.9872176308539945, + "grad_norm": 9.480592727661133, + "learning_rate": 4.112980926574972e-09, + "loss": 0.37, + "step": 8959 + }, + { + "epoch": 0.9873278236914601, + "grad_norm": 5.8697733879089355, + "learning_rate": 4.04238263948531e-09, + "loss": 0.3528, + "step": 8960 + }, + { + "epoch": 0.9874380165289256, + "grad_norm": 8.033915519714355, + "learning_rate": 3.972395264879026e-09, + "loss": 0.3632, + "step": 8961 + }, + { + "epoch": 0.9875482093663912, + "grad_norm": 5.839441776275635, + "learning_rate": 3.9030188113142745e-09, + "loss": 0.3946, + "step": 8962 + }, + { + "epoch": 0.9876584022038567, + "grad_norm": 7.490885257720947, + "learning_rate": 3.83425328727538e-09, + "loss": 0.3771, + "step": 8963 + }, + { + "epoch": 0.9877685950413223, + "grad_norm": 5.09083890914917, + "learning_rate": 3.766098701170617e-09, + "loss": 0.3528, + "step": 8964 + }, + { + "epoch": 0.9878787878787879, + "grad_norm": 4.408278942108154, + "learning_rate": 3.69855506133443e-09, + "loss": 0.3541, + "step": 8965 + }, + { + "epoch": 0.9879889807162534, + "grad_norm": 13.632762908935547, + "learning_rate": 3.6316223760252125e-09, + "loss": 0.3984, + "step": 8966 + }, + { + "epoch": 0.988099173553719, + "grad_norm": 10.03503704071045, + "learning_rate": 3.5653006534280833e-09, + "loss": 0.4507, + "step": 8967 + }, + { + "epoch": 0.9882093663911846, + "grad_norm": 11.511576652526855, + "learning_rate": 3.499589901653222e-09, + "loss": 0.5408, + "step": 8968 + }, + { + "epoch": 0.9883195592286501, + "grad_norm": 5.304442405700684, + "learning_rate": 3.4344901287353126e-09, + "loss": 0.3568, + "step": 8969 + }, + { + "epoch": 0.9884297520661157, + "grad_norm": 68.96772766113281, + "learning_rate": 3.3700013426352096e-09, + "loss": 0.4214, + "step": 8970 + }, + { + "epoch": 0.9885399449035812, + "grad_norm": 6.820242404937744, + "learning_rate": 3.3061235512388267e-09, + "loss": 0.3921, + "step": 8971 + }, + { + "epoch": 0.9886501377410468, + "grad_norm": 4.5322442054748535, + "learning_rate": 3.242856762356583e-09, + "loss": 0.4278, + "step": 8972 + }, + { + "epoch": 0.9887603305785124, + "grad_norm": 8.196949005126953, + "learning_rate": 3.180200983725623e-09, + "loss": 0.4112, + "step": 
8973 + }, + { + "epoch": 0.9888705234159779, + "grad_norm": 8.82950496673584, + "learning_rate": 3.1181562230070406e-09, + "loss": 0.5031, + "step": 8974 + }, + { + "epoch": 0.9889807162534435, + "grad_norm": 7.549552917480469, + "learning_rate": 3.0567224877875445e-09, + "loss": 0.4718, + "step": 8975 + }, + { + "epoch": 0.9890909090909091, + "grad_norm": 7.072907447814941, + "learning_rate": 2.9958997855805693e-09, + "loss": 0.3899, + "step": 8976 + }, + { + "epoch": 0.9892011019283746, + "grad_norm": 4.553850173950195, + "learning_rate": 2.935688123821834e-09, + "loss": 0.3323, + "step": 8977 + }, + { + "epoch": 0.9893112947658402, + "grad_norm": 6.005707263946533, + "learning_rate": 2.876087509875447e-09, + "loss": 0.3726, + "step": 8978 + }, + { + "epoch": 0.9894214876033058, + "grad_norm": 6.978837013244629, + "learning_rate": 2.8170979510289133e-09, + "loss": 0.3745, + "step": 8979 + }, + { + "epoch": 0.9895316804407713, + "grad_norm": 15.969650268554688, + "learning_rate": 2.758719454495906e-09, + "loss": 0.3653, + "step": 8980 + }, + { + "epoch": 0.9896418732782369, + "grad_norm": 11.831104278564453, + "learning_rate": 2.7009520274146052e-09, + "loss": 0.4508, + "step": 8981 + }, + { + "epoch": 0.9897520661157024, + "grad_norm": 5.531377792358398, + "learning_rate": 2.643795676848804e-09, + "loss": 0.4591, + "step": 8982 + }, + { + "epoch": 0.989862258953168, + "grad_norm": 7.935631275177002, + "learning_rate": 2.5872504097884664e-09, + "loss": 0.3249, + "step": 8983 + }, + { + "epoch": 0.9899724517906336, + "grad_norm": 7.079875469207764, + "learning_rate": 2.5313162331469522e-09, + "loss": 0.4057, + "step": 8984 + }, + { + "epoch": 0.9900826446280991, + "grad_norm": 5.866529941558838, + "learning_rate": 2.4759931537648998e-09, + "loss": 0.4286, + "step": 8985 + }, + { + "epoch": 0.9901928374655647, + "grad_norm": 4.35883092880249, + "learning_rate": 2.4212811784063427e-09, + "loss": 0.4128, + "step": 8986 + }, + { + "epoch": 0.9903030303030304, + "grad_norm": 8.639993667602539, + "learning_rate": 2.367180313762596e-09, + "loss": 0.4323, + "step": 8987 + }, + { + "epoch": 0.9904132231404958, + "grad_norm": 7.270481586456299, + "learning_rate": 2.3136905664483676e-09, + "loss": 0.3641, + "step": 8988 + }, + { + "epoch": 0.9905234159779615, + "grad_norm": 5.585358142852783, + "learning_rate": 2.2608119430045373e-09, + "loss": 0.4149, + "step": 8989 + }, + { + "epoch": 0.990633608815427, + "grad_norm": 5.513559341430664, + "learning_rate": 2.2085444498975984e-09, + "loss": 0.4228, + "step": 8990 + }, + { + "epoch": 0.9907438016528926, + "grad_norm": 9.119502067565918, + "learning_rate": 2.156888093519105e-09, + "loss": 0.4291, + "step": 8991 + }, + { + "epoch": 0.9908539944903582, + "grad_norm": 5.719584941864014, + "learning_rate": 2.105842880184561e-09, + "loss": 0.3416, + "step": 8992 + }, + { + "epoch": 0.9909641873278237, + "grad_norm": 5.899689674377441, + "learning_rate": 2.0554088161367503e-09, + "loss": 0.3913, + "step": 8993 + }, + { + "epoch": 0.9910743801652893, + "grad_norm": 14.344022750854492, + "learning_rate": 2.005585907542962e-09, + "loss": 0.3655, + "step": 8994 + }, + { + "epoch": 0.9911845730027549, + "grad_norm": 6.823244571685791, + "learning_rate": 1.9563741604949893e-09, + "loss": 0.3427, + "step": 8995 + }, + { + "epoch": 0.9912947658402204, + "grad_norm": 4.921267509460449, + "learning_rate": 1.9077735810107967e-09, + "loss": 0.3867, + "step": 8996 + }, + { + "epoch": 0.991404958677686, + "grad_norm": 5.286651134490967, + "learning_rate": 
1.8597841750328528e-09, + "loss": 0.4136, + "step": 8997 + }, + { + "epoch": 0.9915151515151515, + "grad_norm": 8.979911804199219, + "learning_rate": 1.8124059484303513e-09, + "loss": 0.4196, + "step": 8998 + }, + { + "epoch": 0.9916253443526171, + "grad_norm": 5.602459907531738, + "learning_rate": 1.7656389069958812e-09, + "loss": 0.4028, + "step": 8999 + }, + { + "epoch": 0.9917355371900827, + "grad_norm": 7.817157745361328, + "learning_rate": 1.7194830564487564e-09, + "loss": 0.3055, + "step": 9000 + }, + { + "epoch": 0.9918457300275482, + "grad_norm": 6.696260929107666, + "learning_rate": 1.6739384024327953e-09, + "loss": 0.3815, + "step": 9001 + }, + { + "epoch": 0.9919559228650138, + "grad_norm": 5.939305782318115, + "learning_rate": 1.629004950516877e-09, + "loss": 0.3609, + "step": 9002 + }, + { + "epoch": 0.9920661157024794, + "grad_norm": 5.6286797523498535, + "learning_rate": 1.5846827061960501e-09, + "loss": 0.3164, + "step": 9003 + }, + { + "epoch": 0.9921763085399449, + "grad_norm": 7.445435047149658, + "learning_rate": 1.5409716748898684e-09, + "loss": 0.4076, + "step": 9004 + }, + { + "epoch": 0.9922865013774105, + "grad_norm": 4.20958137512207, + "learning_rate": 1.4978718619435007e-09, + "loss": 0.3721, + "step": 9005 + }, + { + "epoch": 0.9923966942148761, + "grad_norm": 5.760605812072754, + "learning_rate": 1.4553832726271755e-09, + "loss": 0.3617, + "step": 9006 + }, + { + "epoch": 0.9925068870523416, + "grad_norm": 4.79775333404541, + "learning_rate": 1.4135059121361815e-09, + "loss": 0.3678, + "step": 9007 + }, + { + "epoch": 0.9926170798898072, + "grad_norm": 5.942553997039795, + "learning_rate": 1.3722397855919778e-09, + "loss": 0.4377, + "step": 9008 + }, + { + "epoch": 0.9927272727272727, + "grad_norm": 8.425210952758789, + "learning_rate": 1.3315848980399726e-09, + "loss": 0.4446, + "step": 9009 + }, + { + "epoch": 0.9928374655647383, + "grad_norm": 7.769210338592529, + "learning_rate": 1.2915412544517447e-09, + "loss": 0.4648, + "step": 9010 + }, + { + "epoch": 0.9929476584022039, + "grad_norm": 4.9520134925842285, + "learning_rate": 1.2521088597239328e-09, + "loss": 0.3496, + "step": 9011 + }, + { + "epoch": 0.9930578512396694, + "grad_norm": 5.601205348968506, + "learning_rate": 1.2132877186787906e-09, + "loss": 0.3704, + "step": 9012 + }, + { + "epoch": 0.993168044077135, + "grad_norm": 7.059554576873779, + "learning_rate": 1.1750778360625214e-09, + "loss": 0.3442, + "step": 9013 + }, + { + "epoch": 0.9932782369146006, + "grad_norm": 5.612921714782715, + "learning_rate": 1.1374792165486093e-09, + "loss": 0.3001, + "step": 9014 + }, + { + "epoch": 0.9933884297520661, + "grad_norm": 9.377395629882812, + "learning_rate": 1.1004918647333773e-09, + "loss": 0.4434, + "step": 9015 + }, + { + "epoch": 0.9934986225895317, + "grad_norm": 5.162177562713623, + "learning_rate": 1.0641157851404294e-09, + "loss": 0.3526, + "step": 9016 + }, + { + "epoch": 0.9936088154269972, + "grad_norm": 6.366061210632324, + "learning_rate": 1.028350982217874e-09, + "loss": 0.367, + "step": 9017 + }, + { + "epoch": 0.9937190082644628, + "grad_norm": 6.468604564666748, + "learning_rate": 9.931974603394345e-10, + "loss": 0.3281, + "step": 9018 + }, + { + "epoch": 0.9938292011019284, + "grad_norm": 5.354187965393066, + "learning_rate": 9.586552238027847e-10, + "loss": 0.3617, + "step": 9019 + }, + { + "epoch": 0.9939393939393939, + "grad_norm": 8.523183822631836, + "learning_rate": 9.247242768323228e-10, + "loss": 0.469, + "step": 9020 + }, + { + "epoch": 0.9940495867768595, + "grad_norm": 
7.622811317443848, + "learning_rate": 8.914046235775076e-10, + "loss": 0.3906, + "step": 9021 + }, + { + "epoch": 0.9941597796143251, + "grad_norm": 7.261738300323486, + "learning_rate": 8.586962681117472e-10, + "loss": 0.4557, + "step": 9022 + }, + { + "epoch": 0.9942699724517906, + "grad_norm": 5.034419059753418, + "learning_rate": 8.265992144357304e-10, + "loss": 0.356, + "step": 9023 + }, + { + "epoch": 0.9943801652892562, + "grad_norm": 5.880802631378174, + "learning_rate": 7.951134664740956e-10, + "loss": 0.3586, + "step": 9024 + }, + { + "epoch": 0.9944903581267218, + "grad_norm": 11.561360359191895, + "learning_rate": 7.642390280759859e-10, + "loss": 0.3061, + "step": 9025 + }, + { + "epoch": 0.9946005509641873, + "grad_norm": 7.9841766357421875, + "learning_rate": 7.339759030183802e-10, + "loss": 0.3877, + "step": 9026 + }, + { + "epoch": 0.9947107438016529, + "grad_norm": 8.438712120056152, + "learning_rate": 7.043240950005414e-10, + "loss": 0.3889, + "step": 9027 + }, + { + "epoch": 0.9948209366391184, + "grad_norm": 12.29823112487793, + "learning_rate": 6.752836076484581e-10, + "loss": 0.4049, + "step": 9028 + }, + { + "epoch": 0.994931129476584, + "grad_norm": 8.459811210632324, + "learning_rate": 6.468544445142888e-10, + "loss": 0.4202, + "step": 9029 + }, + { + "epoch": 0.9950413223140496, + "grad_norm": 7.202066421508789, + "learning_rate": 6.190366090735866e-10, + "loss": 0.3433, + "step": 9030 + }, + { + "epoch": 0.9951515151515151, + "grad_norm": 5.43256950378418, + "learning_rate": 5.91830104728075e-10, + "loss": 0.287, + "step": 9031 + }, + { + "epoch": 0.9952617079889807, + "grad_norm": 6.569697380065918, + "learning_rate": 5.652349348045372e-10, + "loss": 0.3686, + "step": 9032 + }, + { + "epoch": 0.9953719008264463, + "grad_norm": 5.745114326477051, + "learning_rate": 5.392511025548164e-10, + "loss": 0.3929, + "step": 9033 + }, + { + "epoch": 0.9954820936639118, + "grad_norm": 5.564207553863525, + "learning_rate": 5.13878611156926e-10, + "loss": 0.3982, + "step": 9034 + }, + { + "epoch": 0.9955922865013774, + "grad_norm": 4.55740213394165, + "learning_rate": 4.891174637128294e-10, + "loss": 0.3603, + "step": 9035 + }, + { + "epoch": 0.9957024793388429, + "grad_norm": 6.228816986083984, + "learning_rate": 4.6496766325121454e-10, + "loss": 0.3726, + "step": 9036 + }, + { + "epoch": 0.9958126721763085, + "grad_norm": 7.958637237548828, + "learning_rate": 4.414292127241648e-10, + "loss": 0.3639, + "step": 9037 + }, + { + "epoch": 0.9959228650137741, + "grad_norm": 8.71097183227539, + "learning_rate": 4.185021150099333e-10, + "loss": 0.451, + "step": 9038 + }, + { + "epoch": 0.9960330578512396, + "grad_norm": 4.552064418792725, + "learning_rate": 3.9618637291349846e-10, + "loss": 0.3394, + "step": 9039 + }, + { + "epoch": 0.9961432506887052, + "grad_norm": 4.1845855712890625, + "learning_rate": 3.7448198916212317e-10, + "loss": 0.3896, + "step": 9040 + }, + { + "epoch": 0.9962534435261708, + "grad_norm": 7.433754920959473, + "learning_rate": 3.533889664103507e-10, + "loss": 0.427, + "step": 9041 + }, + { + "epoch": 0.9963636363636363, + "grad_norm": 14.062396049499512, + "learning_rate": 3.329073072377842e-10, + "loss": 0.4165, + "step": 9042 + }, + { + "epoch": 0.996473829201102, + "grad_norm": 6.495471954345703, + "learning_rate": 3.1303701414853174e-10, + "loss": 0.3897, + "step": 9043 + }, + { + "epoch": 0.9965840220385674, + "grad_norm": 5.049517631530762, + "learning_rate": 2.9377808957231636e-10, + "loss": 0.3929, + "step": 9044 + }, + { + "epoch": 
0.996694214876033, + "grad_norm": 5.869344234466553, + "learning_rate": 2.7513053586447624e-10, + "loss": 0.388, + "step": 9045 + }, + { + "epoch": 0.9968044077134987, + "grad_norm": 8.759241104125977, + "learning_rate": 2.570943553054095e-10, + "loss": 0.3316, + "step": 9046 + }, + { + "epoch": 0.9969146005509641, + "grad_norm": 5.056518077850342, + "learning_rate": 2.3966955010001903e-10, + "loss": 0.3831, + "step": 9047 + }, + { + "epoch": 0.9970247933884298, + "grad_norm": 5.458576679229736, + "learning_rate": 2.2285612237937793e-10, + "loss": 0.3921, + "step": 9048 + }, + { + "epoch": 0.9971349862258954, + "grad_norm": 6.452542781829834, + "learning_rate": 2.0665407419961925e-10, + "loss": 0.4179, + "step": 9049 + }, + { + "epoch": 0.9972451790633609, + "grad_norm": 7.256555080413818, + "learning_rate": 1.9106340754138086e-10, + "loss": 0.357, + "step": 9050 + }, + { + "epoch": 0.9973553719008265, + "grad_norm": 9.529255867004395, + "learning_rate": 1.7608412431202592e-10, + "loss": 0.439, + "step": 9051 + }, + { + "epoch": 0.9974655647382921, + "grad_norm": 7.108154296875, + "learning_rate": 1.6171622634231222e-10, + "loss": 0.3922, + "step": 9052 + }, + { + "epoch": 0.9975757575757576, + "grad_norm": 5.342614650726318, + "learning_rate": 1.4795971538972276e-10, + "loss": 0.4049, + "step": 9053 + }, + { + "epoch": 0.9976859504132232, + "grad_norm": 5.520908355712891, + "learning_rate": 1.3481459313624544e-10, + "loss": 0.3641, + "step": 9054 + }, + { + "epoch": 0.9977961432506887, + "grad_norm": 7.096041202545166, + "learning_rate": 1.2228086118892812e-10, + "loss": 0.4094, + "step": 9055 + }, + { + "epoch": 0.9979063360881543, + "grad_norm": 5.964447021484375, + "learning_rate": 1.1035852108154388e-10, + "loss": 0.3797, + "step": 9056 + }, + { + "epoch": 0.9980165289256199, + "grad_norm": 7.609488487243652, + "learning_rate": 9.904757427070533e-11, + "loss": 0.2776, + "step": 9057 + }, + { + "epoch": 0.9981267217630854, + "grad_norm": 7.307365894317627, + "learning_rate": 8.834802213975036e-11, + "loss": 0.4681, + "step": 9058 + }, + { + "epoch": 0.998236914600551, + "grad_norm": 18.94178009033203, + "learning_rate": 7.825986599763191e-11, + "loss": 0.5068, + "step": 9059 + }, + { + "epoch": 0.9983471074380166, + "grad_norm": 12.599600791931152, + "learning_rate": 6.878310707780777e-11, + "loss": 0.3579, + "step": 9060 + }, + { + "epoch": 0.9984573002754821, + "grad_norm": 6.1359639167785645, + "learning_rate": 5.991774653879568e-11, + "loss": 0.3525, + "step": 9061 + }, + { + "epoch": 0.9985674931129477, + "grad_norm": 6.206957817077637, + "learning_rate": 5.166378546472839e-11, + "loss": 0.4174, + "step": 9062 + }, + { + "epoch": 0.9986776859504132, + "grad_norm": 4.911308288574219, + "learning_rate": 4.402122486479865e-11, + "loss": 0.3078, + "step": 9063 + }, + { + "epoch": 0.9987878787878788, + "grad_norm": 5.652713775634766, + "learning_rate": 3.699006567381425e-11, + "loss": 0.3946, + "step": 9064 + }, + { + "epoch": 0.9988980716253444, + "grad_norm": 9.284574508666992, + "learning_rate": 3.0570308751642906e-11, + "loss": 0.4012, + "step": 9065 + }, + { + "epoch": 0.9990082644628099, + "grad_norm": 10.509775161743164, + "learning_rate": 2.4761954883212312e-11, + "loss": 0.5294, + "step": 9066 + }, + { + "epoch": 0.9991184573002755, + "grad_norm": 10.87788200378418, + "learning_rate": 1.956500477851009e-11, + "loss": 0.3956, + "step": 9067 + }, + { + "epoch": 0.9992286501377411, + "grad_norm": 7.179844856262207, + "learning_rate": 1.4979459073138913e-11, + "loss": 0.3698, + 
"step": 9068 + }, + { + "epoch": 0.9993388429752066, + "grad_norm": 6.903147220611572, + "learning_rate": 1.1005318327761417e-11, + "loss": 0.3662, + "step": 9069 + }, + { + "epoch": 0.9994490358126722, + "grad_norm": 8.5108642578125, + "learning_rate": 7.642583028655282e-12, + "loss": 0.4146, + "step": 9070 + }, + { + "epoch": 0.9995592286501377, + "grad_norm": 20.710254669189453, + "learning_rate": 4.89125358715814e-12, + "loss": 0.4132, + "step": 9071 + }, + { + "epoch": 0.9996694214876033, + "grad_norm": 7.372518062591553, + "learning_rate": 2.751330339112457e-12, + "loss": 0.3995, + "step": 9072 + }, + { + "epoch": 0.9997796143250689, + "grad_norm": 4.217400550842285, + "learning_rate": 1.2228135465308654e-12, + "loss": 0.3204, + "step": 9073 + }, + { + "epoch": 0.9998898071625344, + "grad_norm": 35.320533752441406, + "learning_rate": 3.057033959308342e-13, + "loss": 0.4807, + "step": 9074 + }, + { + "epoch": 1.0, + "grad_norm": 7.732905864715576, + "learning_rate": 0.0, + "loss": 0.4039, + "step": 9075 + }, + { + "epoch": 1.0, + "step": 9075, + "total_flos": 2.806207392566477e+18, + "train_loss": 0.43036781519897715, + "train_runtime": 19668.4772, + "train_samples_per_second": 3.691, + "train_steps_per_second": 0.461 + } + ], + "logging_steps": 1, + "max_steps": 9075, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 9075, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 2.806207392566477e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}