diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,82084 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 11722, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 8.530967411704488e-05, + "grad_norm": 6.629314529173358, + "learning_rate": 2.840909090909091e-07, + "loss": 0.6284, + "step": 1 + }, + { + "epoch": 0.00017061934823408976, + "grad_norm": 5.932271803860879, + "learning_rate": 5.681818181818182e-07, + "loss": 0.5329, + "step": 2 + }, + { + "epoch": 0.0002559290223511346, + "grad_norm": 7.1147730193722625, + "learning_rate": 8.522727272727273e-07, + "loss": 0.6039, + "step": 3 + }, + { + "epoch": 0.0003412386964681795, + "grad_norm": 7.096217864707822, + "learning_rate": 1.1363636363636364e-06, + "loss": 0.5143, + "step": 4 + }, + { + "epoch": 0.00042654837058522436, + "grad_norm": 5.912930378966415, + "learning_rate": 1.4204545454545456e-06, + "loss": 0.5325, + "step": 5 + }, + { + "epoch": 0.0005118580447022692, + "grad_norm": 7.2287544184246935, + "learning_rate": 1.7045454545454546e-06, + "loss": 0.6042, + "step": 6 + }, + { + "epoch": 0.0005971677188193141, + "grad_norm": 6.21850922851663, + "learning_rate": 1.9886363636363638e-06, + "loss": 0.5608, + "step": 7 + }, + { + "epoch": 0.000682477392936359, + "grad_norm": 7.053744399862654, + "learning_rate": 2.2727272727272728e-06, + "loss": 0.565, + "step": 8 + }, + { + "epoch": 0.0007677870670534039, + "grad_norm": 7.493507626951199, + "learning_rate": 2.556818181818182e-06, + "loss": 0.5962, + "step": 9 + }, + { + "epoch": 0.0008530967411704487, + "grad_norm": 6.192947301807031, + "learning_rate": 2.840909090909091e-06, + "loss": 0.5282, + "step": 10 + }, + { + "epoch": 0.0009384064152874935, + "grad_norm": 6.684495197372885, + "learning_rate": 3.125e-06, + "loss": 0.596, + "step": 11 + }, + { + "epoch": 0.0010237160894045385, + "grad_norm": 6.875621420778997, + "learning_rate": 3.409090909090909e-06, + "loss": 0.6384, + "step": 12 + }, + { + "epoch": 0.0011090257635215834, + "grad_norm": 5.34180401622009, + "learning_rate": 3.6931818181818186e-06, + "loss": 0.5144, + "step": 13 + }, + { + "epoch": 0.0011943354376386282, + "grad_norm": 5.761761122725548, + "learning_rate": 3.9772727272727275e-06, + "loss": 0.4941, + "step": 14 + }, + { + "epoch": 0.0012796451117556731, + "grad_norm": 6.087771255747434, + "learning_rate": 4.2613636363636365e-06, + "loss": 0.5588, + "step": 15 + }, + { + "epoch": 0.001364954785872718, + "grad_norm": 6.210298095035311, + "learning_rate": 4.5454545454545455e-06, + "loss": 0.5241, + "step": 16 + }, + { + "epoch": 0.0014502644599897628, + "grad_norm": 4.096960774214127, + "learning_rate": 4.8295454545454545e-06, + "loss": 0.5402, + "step": 17 + }, + { + "epoch": 0.0015355741341068077, + "grad_norm": 5.166936908340425, + "learning_rate": 5.113636363636364e-06, + "loss": 0.5525, + "step": 18 + }, + { + "epoch": 0.0016208838082238527, + "grad_norm": 4.522643867933041, + "learning_rate": 5.397727272727273e-06, + "loss": 0.5006, + "step": 19 + }, + { + "epoch": 0.0017061934823408974, + "grad_norm": 5.4051509026592015, + "learning_rate": 5.681818181818182e-06, + "loss": 0.5601, + "step": 20 + }, + { + "epoch": 0.0017915031564579424, + "grad_norm": 4.2363985863924665, + "learning_rate": 5.965909090909091e-06, + "loss": 0.4607, + "step": 21 + }, + { + "epoch": 0.001876812830574987, + "grad_norm": 3.10201538776686, + "learning_rate": 6.25e-06, + "loss": 0.4585, + "step": 22 + }, + { + "epoch": 0.0019621225046920323, + "grad_norm": 3.816799303662491, + "learning_rate": 6.534090909090909e-06, + "loss": 0.4631, + "step": 23 + }, + { + "epoch": 0.002047432178809077, + "grad_norm": 3.6358645508468124, + "learning_rate": 6.818181818181818e-06, + "loss": 0.46, + "step": 24 + }, + { + "epoch": 0.0021327418529261217, + "grad_norm": 4.04790751714191, + "learning_rate": 7.102272727272728e-06, + "loss": 0.4904, + "step": 25 + }, + { + "epoch": 0.002218051527043167, + "grad_norm": 4.332183147274963, + "learning_rate": 7.386363636363637e-06, + "loss": 0.5344, + "step": 26 + }, + { + "epoch": 0.0023033612011602116, + "grad_norm": 3.4435144693517357, + "learning_rate": 7.670454545454545e-06, + "loss": 0.5198, + "step": 27 + }, + { + "epoch": 0.0023886708752772563, + "grad_norm": 2.999834373988388, + "learning_rate": 7.954545454545455e-06, + "loss": 0.4124, + "step": 28 + }, + { + "epoch": 0.0024739805493943015, + "grad_norm": 2.7988314472186757, + "learning_rate": 8.238636363636363e-06, + "loss": 0.4453, + "step": 29 + }, + { + "epoch": 0.0025592902235113462, + "grad_norm": 3.5867232914062677, + "learning_rate": 8.522727272727273e-06, + "loss": 0.5019, + "step": 30 + }, + { + "epoch": 0.002644599897628391, + "grad_norm": 3.568366590971768, + "learning_rate": 8.806818181818183e-06, + "loss": 0.503, + "step": 31 + }, + { + "epoch": 0.002729909571745436, + "grad_norm": 3.576671988301675, + "learning_rate": 9.090909090909091e-06, + "loss": 0.4749, + "step": 32 + }, + { + "epoch": 0.002815219245862481, + "grad_norm": 3.4684813799249006, + "learning_rate": 9.375000000000001e-06, + "loss": 0.4596, + "step": 33 + }, + { + "epoch": 0.0029005289199795256, + "grad_norm": 2.7005813184761975, + "learning_rate": 9.659090909090909e-06, + "loss": 0.445, + "step": 34 + }, + { + "epoch": 0.0029858385940965708, + "grad_norm": 3.461783346659491, + "learning_rate": 9.943181818181819e-06, + "loss": 0.5361, + "step": 35 + }, + { + "epoch": 0.0030711482682136155, + "grad_norm": 2.85171566904558, + "learning_rate": 1.0227272727272729e-05, + "loss": 0.4596, + "step": 36 + }, + { + "epoch": 0.0031564579423306602, + "grad_norm": 2.9458556142465286, + "learning_rate": 1.0511363636363637e-05, + "loss": 0.4231, + "step": 37 + }, + { + "epoch": 0.0032417676164477054, + "grad_norm": 2.7769807774198187, + "learning_rate": 1.0795454545454547e-05, + "loss": 0.437, + "step": 38 + }, + { + "epoch": 0.00332707729056475, + "grad_norm": 2.7680138222784776, + "learning_rate": 1.1079545454545455e-05, + "loss": 0.5033, + "step": 39 + }, + { + "epoch": 0.003412386964681795, + "grad_norm": 2.878143582196053, + "learning_rate": 1.1363636363636365e-05, + "loss": 0.4611, + "step": 40 + }, + { + "epoch": 0.00349769663879884, + "grad_norm": 2.6572662204753765, + "learning_rate": 1.1647727272727273e-05, + "loss": 0.4371, + "step": 41 + }, + { + "epoch": 0.0035830063129158847, + "grad_norm": 2.6815801670624753, + "learning_rate": 1.1931818181818183e-05, + "loss": 0.4843, + "step": 42 + }, + { + "epoch": 0.0036683159870329295, + "grad_norm": 2.562583549812194, + "learning_rate": 1.2215909090909092e-05, + "loss": 0.432, + "step": 43 + }, + { + "epoch": 0.003753625661149974, + "grad_norm": 2.739143964880151, + "learning_rate": 1.25e-05, + "loss": 0.4844, + "step": 44 + }, + { + "epoch": 0.0038389353352670194, + "grad_norm": 2.780612315249518, + "learning_rate": 1.2784090909090909e-05, + "loss": 0.431, + "step": 45 + }, + { + "epoch": 0.0039242450093840645, + "grad_norm": 2.8748069366647395, + "learning_rate": 1.3068181818181819e-05, + "loss": 0.4636, + "step": 46 + }, + { + "epoch": 0.004009554683501109, + "grad_norm": 2.696568823769511, + "learning_rate": 1.3352272727272727e-05, + "loss": 0.4669, + "step": 47 + }, + { + "epoch": 0.004094864357618154, + "grad_norm": 3.0441281187587963, + "learning_rate": 1.3636363636363637e-05, + "loss": 0.523, + "step": 48 + }, + { + "epoch": 0.004180174031735199, + "grad_norm": 2.7147998394605377, + "learning_rate": 1.3920454545454545e-05, + "loss": 0.4549, + "step": 49 + }, + { + "epoch": 0.0042654837058522434, + "grad_norm": 2.957650405786925, + "learning_rate": 1.4204545454545456e-05, + "loss": 0.49, + "step": 50 + }, + { + "epoch": 0.004350793379969288, + "grad_norm": 2.905629409652124, + "learning_rate": 1.4488636363636366e-05, + "loss": 0.4072, + "step": 51 + }, + { + "epoch": 0.004436103054086334, + "grad_norm": 2.601004234190255, + "learning_rate": 1.4772727272727274e-05, + "loss": 0.3886, + "step": 52 + }, + { + "epoch": 0.0045214127282033785, + "grad_norm": 2.6294151595784374, + "learning_rate": 1.5056818181818182e-05, + "loss": 0.4422, + "step": 53 + }, + { + "epoch": 0.004606722402320423, + "grad_norm": 2.619601374869889, + "learning_rate": 1.534090909090909e-05, + "loss": 0.4018, + "step": 54 + }, + { + "epoch": 0.004692032076437468, + "grad_norm": 2.510556345070487, + "learning_rate": 1.5625e-05, + "loss": 0.3996, + "step": 55 + }, + { + "epoch": 0.004777341750554513, + "grad_norm": 2.2963729621139355, + "learning_rate": 1.590909090909091e-05, + "loss": 0.4232, + "step": 56 + }, + { + "epoch": 0.004862651424671557, + "grad_norm": 2.4640898360125156, + "learning_rate": 1.619318181818182e-05, + "loss": 0.4405, + "step": 57 + }, + { + "epoch": 0.004947961098788603, + "grad_norm": 2.9500789373149257, + "learning_rate": 1.6477272727272726e-05, + "loss": 0.4912, + "step": 58 + }, + { + "epoch": 0.005033270772905648, + "grad_norm": 3.1716084485733753, + "learning_rate": 1.6761363636363636e-05, + "loss": 0.4228, + "step": 59 + }, + { + "epoch": 0.0051185804470226925, + "grad_norm": 2.4454277425794544, + "learning_rate": 1.7045454545454546e-05, + "loss": 0.4536, + "step": 60 + }, + { + "epoch": 0.005203890121139737, + "grad_norm": 2.8109240885187194, + "learning_rate": 1.7329545454545456e-05, + "loss": 0.4177, + "step": 61 + }, + { + "epoch": 0.005289199795256782, + "grad_norm": 2.6953534551632554, + "learning_rate": 1.7613636363636366e-05, + "loss": 0.4327, + "step": 62 + }, + { + "epoch": 0.005374509469373827, + "grad_norm": 3.3716333339801574, + "learning_rate": 1.7897727272727276e-05, + "loss": 0.4671, + "step": 63 + }, + { + "epoch": 0.005459819143490872, + "grad_norm": 2.8254756425797627, + "learning_rate": 1.8181818181818182e-05, + "loss": 0.4411, + "step": 64 + }, + { + "epoch": 0.005545128817607917, + "grad_norm": 2.1775193454658286, + "learning_rate": 1.8465909090909092e-05, + "loss": 0.4334, + "step": 65 + }, + { + "epoch": 0.005630438491724962, + "grad_norm": 2.887497978705157, + "learning_rate": 1.8750000000000002e-05, + "loss": 0.3884, + "step": 66 + }, + { + "epoch": 0.0057157481658420065, + "grad_norm": 3.0250875223912845, + "learning_rate": 1.9034090909090908e-05, + "loss": 0.4735, + "step": 67 + }, + { + "epoch": 0.005801057839959051, + "grad_norm": 1.9973627226139639, + "learning_rate": 1.9318181818181818e-05, + "loss": 0.4331, + "step": 68 + }, + { + "epoch": 0.005886367514076096, + "grad_norm": 2.6540890149824823, + "learning_rate": 1.9602272727272728e-05, + "loss": 0.411, + "step": 69 + }, + { + "epoch": 0.0059716771881931415, + "grad_norm": 2.7941918316018985, + "learning_rate": 1.9886363636363638e-05, + "loss": 0.3893, + "step": 70 + }, + { + "epoch": 0.006056986862310186, + "grad_norm": 2.468703740326332, + "learning_rate": 2.0170454545454544e-05, + "loss": 0.4396, + "step": 71 + }, + { + "epoch": 0.006142296536427231, + "grad_norm": 2.2721514128346905, + "learning_rate": 2.0454545454545457e-05, + "loss": 0.4542, + "step": 72 + }, + { + "epoch": 0.006227606210544276, + "grad_norm": 2.5257997587064462, + "learning_rate": 2.0738636363636367e-05, + "loss": 0.4239, + "step": 73 + }, + { + "epoch": 0.0063129158846613204, + "grad_norm": 2.391049415961601, + "learning_rate": 2.1022727272727274e-05, + "loss": 0.4455, + "step": 74 + }, + { + "epoch": 0.006398225558778365, + "grad_norm": 2.2439387590072073, + "learning_rate": 2.1306818181818183e-05, + "loss": 0.4087, + "step": 75 + }, + { + "epoch": 0.006483535232895411, + "grad_norm": 2.7118541915598113, + "learning_rate": 2.1590909090909093e-05, + "loss": 0.399, + "step": 76 + }, + { + "epoch": 0.0065688449070124555, + "grad_norm": 2.4380295007186668, + "learning_rate": 2.1875e-05, + "loss": 0.3821, + "step": 77 + }, + { + "epoch": 0.0066541545811295, + "grad_norm": 2.2267576968645812, + "learning_rate": 2.215909090909091e-05, + "loss": 0.4067, + "step": 78 + }, + { + "epoch": 0.006739464255246545, + "grad_norm": 2.3983794463149195, + "learning_rate": 2.244318181818182e-05, + "loss": 0.4176, + "step": 79 + }, + { + "epoch": 0.00682477392936359, + "grad_norm": 2.701910374463013, + "learning_rate": 2.272727272727273e-05, + "loss": 0.4055, + "step": 80 + }, + { + "epoch": 0.006910083603480634, + "grad_norm": 2.438091890901868, + "learning_rate": 2.3011363636363636e-05, + "loss": 0.4329, + "step": 81 + }, + { + "epoch": 0.00699539327759768, + "grad_norm": 3.0371775138055743, + "learning_rate": 2.3295454545454546e-05, + "loss": 0.4478, + "step": 82 + }, + { + "epoch": 0.007080702951714725, + "grad_norm": 2.9669190231602607, + "learning_rate": 2.3579545454545455e-05, + "loss": 0.4853, + "step": 83 + }, + { + "epoch": 0.0071660126258317695, + "grad_norm": 2.859139354563873, + "learning_rate": 2.3863636363636365e-05, + "loss": 0.4051, + "step": 84 + }, + { + "epoch": 0.007251322299948814, + "grad_norm": 2.285670974014553, + "learning_rate": 2.4147727272727275e-05, + "loss": 0.4564, + "step": 85 + }, + { + "epoch": 0.007336631974065859, + "grad_norm": 2.2653677958800316, + "learning_rate": 2.4431818181818185e-05, + "loss": 0.4419, + "step": 86 + }, + { + "epoch": 0.007421941648182904, + "grad_norm": 2.7272967467550795, + "learning_rate": 2.471590909090909e-05, + "loss": 0.3965, + "step": 87 + }, + { + "epoch": 0.007507251322299948, + "grad_norm": 2.3618557248284207, + "learning_rate": 2.5e-05, + "loss": 0.3885, + "step": 88 + }, + { + "epoch": 0.007592560996416994, + "grad_norm": 2.844057506661265, + "learning_rate": 2.5284090909090914e-05, + "loss": 0.4297, + "step": 89 + }, + { + "epoch": 0.007677870670534039, + "grad_norm": 3.2240440016268805, + "learning_rate": 2.5568181818181817e-05, + "loss": 0.479, + "step": 90 + }, + { + "epoch": 0.0077631803446510834, + "grad_norm": 2.5809231456501065, + "learning_rate": 2.585227272727273e-05, + "loss": 0.39, + "step": 91 + }, + { + "epoch": 0.007848490018768129, + "grad_norm": 2.1449746818204254, + "learning_rate": 2.6136363636363637e-05, + "loss": 0.398, + "step": 92 + }, + { + "epoch": 0.007933799692885173, + "grad_norm": 2.6658356881814447, + "learning_rate": 2.6420454545454547e-05, + "loss": 0.4163, + "step": 93 + }, + { + "epoch": 0.008019109367002219, + "grad_norm": 2.816925720475951, + "learning_rate": 2.6704545454545453e-05, + "loss": 0.4579, + "step": 94 + }, + { + "epoch": 0.008104419041119262, + "grad_norm": 3.1909705323353306, + "learning_rate": 2.6988636363636367e-05, + "loss": 0.4421, + "step": 95 + }, + { + "epoch": 0.008189728715236308, + "grad_norm": 2.6903140394552256, + "learning_rate": 2.7272727272727273e-05, + "loss": 0.4332, + "step": 96 + }, + { + "epoch": 0.008275038389353352, + "grad_norm": 2.261479656646674, + "learning_rate": 2.7556818181818183e-05, + "loss": 0.4383, + "step": 97 + }, + { + "epoch": 0.008360348063470397, + "grad_norm": 2.4101742538200126, + "learning_rate": 2.784090909090909e-05, + "loss": 0.4747, + "step": 98 + }, + { + "epoch": 0.008445657737587443, + "grad_norm": 3.189572875746799, + "learning_rate": 2.8125000000000003e-05, + "loss": 0.4188, + "step": 99 + }, + { + "epoch": 0.008530967411704487, + "grad_norm": 2.8665742556144713, + "learning_rate": 2.8409090909090912e-05, + "loss": 0.4555, + "step": 100 + }, + { + "epoch": 0.008616277085821532, + "grad_norm": 3.107580692781122, + "learning_rate": 2.869318181818182e-05, + "loss": 0.4371, + "step": 101 + }, + { + "epoch": 0.008701586759938576, + "grad_norm": 2.6561859123128118, + "learning_rate": 2.8977272727272732e-05, + "loss": 0.4273, + "step": 102 + }, + { + "epoch": 0.008786896434055622, + "grad_norm": 2.4387686924533307, + "learning_rate": 2.9261363636363635e-05, + "loss": 0.4319, + "step": 103 + }, + { + "epoch": 0.008872206108172668, + "grad_norm": 2.367498497000744, + "learning_rate": 2.954545454545455e-05, + "loss": 0.4138, + "step": 104 + }, + { + "epoch": 0.008957515782289711, + "grad_norm": 2.399062482191123, + "learning_rate": 2.9829545454545455e-05, + "loss": 0.3934, + "step": 105 + }, + { + "epoch": 0.009042825456406757, + "grad_norm": 2.2303430418275383, + "learning_rate": 3.0113636363636365e-05, + "loss": 0.3882, + "step": 106 + }, + { + "epoch": 0.0091281351305238, + "grad_norm": 2.7945423308364234, + "learning_rate": 3.039772727272727e-05, + "loss": 0.4444, + "step": 107 + }, + { + "epoch": 0.009213444804640846, + "grad_norm": 2.33809641200874, + "learning_rate": 3.068181818181818e-05, + "loss": 0.4412, + "step": 108 + }, + { + "epoch": 0.00929875447875789, + "grad_norm": 2.890506721344916, + "learning_rate": 3.096590909090909e-05, + "loss": 0.4226, + "step": 109 + }, + { + "epoch": 0.009384064152874936, + "grad_norm": 2.756478654368825, + "learning_rate": 3.125e-05, + "loss": 0.3945, + "step": 110 + }, + { + "epoch": 0.009469373826991982, + "grad_norm": 2.347354609300478, + "learning_rate": 3.153409090909091e-05, + "loss": 0.3965, + "step": 111 + }, + { + "epoch": 0.009554683501109025, + "grad_norm": 2.7855857742625565, + "learning_rate": 3.181818181818182e-05, + "loss": 0.3814, + "step": 112 + }, + { + "epoch": 0.009639993175226071, + "grad_norm": 2.810281853709702, + "learning_rate": 3.210227272727273e-05, + "loss": 0.4418, + "step": 113 + }, + { + "epoch": 0.009725302849343115, + "grad_norm": 2.4997447837258604, + "learning_rate": 3.238636363636364e-05, + "loss": 0.3808, + "step": 114 + }, + { + "epoch": 0.00981061252346016, + "grad_norm": 2.224526635725398, + "learning_rate": 3.267045454545455e-05, + "loss": 0.4399, + "step": 115 + }, + { + "epoch": 0.009895922197577206, + "grad_norm": 2.239553572582664, + "learning_rate": 3.295454545454545e-05, + "loss": 0.3651, + "step": 116 + }, + { + "epoch": 0.00998123187169425, + "grad_norm": 2.558876639299239, + "learning_rate": 3.323863636363637e-05, + "loss": 0.4254, + "step": 117 + }, + { + "epoch": 0.010066541545811295, + "grad_norm": 2.562609600421485, + "learning_rate": 3.352272727272727e-05, + "loss": 0.4547, + "step": 118 + }, + { + "epoch": 0.01015185121992834, + "grad_norm": 2.329367491697468, + "learning_rate": 3.380681818181818e-05, + "loss": 0.4442, + "step": 119 + }, + { + "epoch": 0.010237160894045385, + "grad_norm": 2.34291611141999, + "learning_rate": 3.409090909090909e-05, + "loss": 0.435, + "step": 120 + }, + { + "epoch": 0.010322470568162429, + "grad_norm": 2.341304876903332, + "learning_rate": 3.4375e-05, + "loss": 0.4276, + "step": 121 + }, + { + "epoch": 0.010407780242279474, + "grad_norm": 2.388019413394707, + "learning_rate": 3.465909090909091e-05, + "loss": 0.3708, + "step": 122 + }, + { + "epoch": 0.01049308991639652, + "grad_norm": 2.301087959713805, + "learning_rate": 3.494318181818182e-05, + "loss": 0.4625, + "step": 123 + }, + { + "epoch": 0.010578399590513564, + "grad_norm": 2.4122638606589377, + "learning_rate": 3.522727272727273e-05, + "loss": 0.41, + "step": 124 + }, + { + "epoch": 0.01066370926463061, + "grad_norm": 2.9180038838141718, + "learning_rate": 3.5511363636363635e-05, + "loss": 0.44, + "step": 125 + }, + { + "epoch": 0.010749018938747653, + "grad_norm": 2.8819515938012685, + "learning_rate": 3.579545454545455e-05, + "loss": 0.4522, + "step": 126 + }, + { + "epoch": 0.010834328612864699, + "grad_norm": 2.3626546738465866, + "learning_rate": 3.6079545454545454e-05, + "loss": 0.3913, + "step": 127 + }, + { + "epoch": 0.010919638286981745, + "grad_norm": 2.3690482402252093, + "learning_rate": 3.6363636363636364e-05, + "loss": 0.4064, + "step": 128 + }, + { + "epoch": 0.011004947961098788, + "grad_norm": 2.324628521355312, + "learning_rate": 3.6647727272727274e-05, + "loss": 0.4163, + "step": 129 + }, + { + "epoch": 0.011090257635215834, + "grad_norm": 2.3571162242953627, + "learning_rate": 3.6931818181818184e-05, + "loss": 0.3537, + "step": 130 + }, + { + "epoch": 0.011175567309332878, + "grad_norm": 2.6333834880768654, + "learning_rate": 3.721590909090909e-05, + "loss": 0.4197, + "step": 131 + }, + { + "epoch": 0.011260876983449923, + "grad_norm": 2.62791444567765, + "learning_rate": 3.7500000000000003e-05, + "loss": 0.3753, + "step": 132 + }, + { + "epoch": 0.011346186657566967, + "grad_norm": 2.6389534077927514, + "learning_rate": 3.778409090909091e-05, + "loss": 0.3776, + "step": 133 + }, + { + "epoch": 0.011431496331684013, + "grad_norm": 2.7011152083603265, + "learning_rate": 3.8068181818181816e-05, + "loss": 0.4444, + "step": 134 + }, + { + "epoch": 0.011516806005801059, + "grad_norm": 2.467105172675951, + "learning_rate": 3.835227272727273e-05, + "loss": 0.4716, + "step": 135 + }, + { + "epoch": 0.011602115679918102, + "grad_norm": 2.567534641377614, + "learning_rate": 3.8636363636363636e-05, + "loss": 0.4099, + "step": 136 + }, + { + "epoch": 0.011687425354035148, + "grad_norm": 2.6657081510537193, + "learning_rate": 3.8920454545454546e-05, + "loss": 0.4344, + "step": 137 + }, + { + "epoch": 0.011772735028152192, + "grad_norm": 3.0208726200749925, + "learning_rate": 3.9204545454545456e-05, + "loss": 0.4845, + "step": 138 + }, + { + "epoch": 0.011858044702269237, + "grad_norm": 3.148781580143838, + "learning_rate": 3.9488636363636366e-05, + "loss": 0.4329, + "step": 139 + }, + { + "epoch": 0.011943354376386283, + "grad_norm": 2.4785205308942286, + "learning_rate": 3.9772727272727275e-05, + "loss": 0.5395, + "step": 140 + }, + { + "epoch": 0.012028664050503327, + "grad_norm": 2.701888843635922, + "learning_rate": 4.0056818181818185e-05, + "loss": 0.41, + "step": 141 + }, + { + "epoch": 0.012113973724620372, + "grad_norm": 2.244890026465999, + "learning_rate": 4.034090909090909e-05, + "loss": 0.4041, + "step": 142 + }, + { + "epoch": 0.012199283398737416, + "grad_norm": 2.4629690352927773, + "learning_rate": 4.0625000000000005e-05, + "loss": 0.4514, + "step": 143 + }, + { + "epoch": 0.012284593072854462, + "grad_norm": 2.606842182312283, + "learning_rate": 4.0909090909090915e-05, + "loss": 0.4666, + "step": 144 + }, + { + "epoch": 0.012369902746971506, + "grad_norm": 2.437699138625212, + "learning_rate": 4.119318181818182e-05, + "loss": 0.4337, + "step": 145 + }, + { + "epoch": 0.012455212421088551, + "grad_norm": 2.3627027071420614, + "learning_rate": 4.1477272727272734e-05, + "loss": 0.3842, + "step": 146 + }, + { + "epoch": 0.012540522095205597, + "grad_norm": 2.3157634904763573, + "learning_rate": 4.176136363636364e-05, + "loss": 0.4732, + "step": 147 + }, + { + "epoch": 0.012625831769322641, + "grad_norm": 2.0017455351593814, + "learning_rate": 4.204545454545455e-05, + "loss": 0.3461, + "step": 148 + }, + { + "epoch": 0.012711141443439686, + "grad_norm": 2.56314757933644, + "learning_rate": 4.232954545454546e-05, + "loss": 0.4096, + "step": 149 + }, + { + "epoch": 0.01279645111755673, + "grad_norm": 2.1440659315234414, + "learning_rate": 4.261363636363637e-05, + "loss": 0.4548, + "step": 150 + }, + { + "epoch": 0.012881760791673776, + "grad_norm": 2.3080827951175045, + "learning_rate": 4.289772727272727e-05, + "loss": 0.3785, + "step": 151 + }, + { + "epoch": 0.012967070465790822, + "grad_norm": 2.013942400634266, + "learning_rate": 4.318181818181819e-05, + "loss": 0.3873, + "step": 152 + }, + { + "epoch": 0.013052380139907865, + "grad_norm": 2.7874178271305814, + "learning_rate": 4.346590909090909e-05, + "loss": 0.4876, + "step": 153 + }, + { + "epoch": 0.013137689814024911, + "grad_norm": 2.1405483705490416, + "learning_rate": 4.375e-05, + "loss": 0.4122, + "step": 154 + }, + { + "epoch": 0.013222999488141955, + "grad_norm": 2.632660722744027, + "learning_rate": 4.4034090909090916e-05, + "loss": 0.4822, + "step": 155 + }, + { + "epoch": 0.013308309162259, + "grad_norm": 2.10643863455855, + "learning_rate": 4.431818181818182e-05, + "loss": 0.3805, + "step": 156 + }, + { + "epoch": 0.013393618836376044, + "grad_norm": 2.414338031340607, + "learning_rate": 4.460227272727273e-05, + "loss": 0.4072, + "step": 157 + }, + { + "epoch": 0.01347892851049309, + "grad_norm": 3.2263062308376123, + "learning_rate": 4.488636363636364e-05, + "loss": 0.4084, + "step": 158 + }, + { + "epoch": 0.013564238184610136, + "grad_norm": 2.2316863416051835, + "learning_rate": 4.517045454545455e-05, + "loss": 0.3704, + "step": 159 + }, + { + "epoch": 0.01364954785872718, + "grad_norm": 2.4844122350049687, + "learning_rate": 4.545454545454546e-05, + "loss": 0.404, + "step": 160 + }, + { + "epoch": 0.013734857532844225, + "grad_norm": 2.4101932467151324, + "learning_rate": 4.573863636363637e-05, + "loss": 0.4523, + "step": 161 + }, + { + "epoch": 0.013820167206961269, + "grad_norm": 2.3242541398631826, + "learning_rate": 4.602272727272727e-05, + "loss": 0.4241, + "step": 162 + }, + { + "epoch": 0.013905476881078314, + "grad_norm": 2.6891403515023393, + "learning_rate": 4.630681818181818e-05, + "loss": 0.451, + "step": 163 + }, + { + "epoch": 0.01399078655519536, + "grad_norm": 2.8190388432127422, + "learning_rate": 4.659090909090909e-05, + "loss": 0.4069, + "step": 164 + }, + { + "epoch": 0.014076096229312404, + "grad_norm": 2.4510707150440365, + "learning_rate": 4.6875e-05, + "loss": 0.3978, + "step": 165 + }, + { + "epoch": 0.01416140590342945, + "grad_norm": 2.4501163961048196, + "learning_rate": 4.715909090909091e-05, + "loss": 0.4643, + "step": 166 + }, + { + "epoch": 0.014246715577546493, + "grad_norm": 3.5419141047190177, + "learning_rate": 4.744318181818182e-05, + "loss": 0.4195, + "step": 167 + }, + { + "epoch": 0.014332025251663539, + "grad_norm": 2.4948616628576574, + "learning_rate": 4.772727272727273e-05, + "loss": 0.4084, + "step": 168 + }, + { + "epoch": 0.014417334925780583, + "grad_norm": 2.5015318945514182, + "learning_rate": 4.801136363636364e-05, + "loss": 0.4598, + "step": 169 + }, + { + "epoch": 0.014502644599897628, + "grad_norm": 2.406353737712857, + "learning_rate": 4.829545454545455e-05, + "loss": 0.3859, + "step": 170 + }, + { + "epoch": 0.014587954274014674, + "grad_norm": 2.370111353065027, + "learning_rate": 4.857954545454545e-05, + "loss": 0.3997, + "step": 171 + }, + { + "epoch": 0.014673263948131718, + "grad_norm": 2.514239196068343, + "learning_rate": 4.886363636363637e-05, + "loss": 0.3698, + "step": 172 + }, + { + "epoch": 0.014758573622248763, + "grad_norm": 2.6636046074830406, + "learning_rate": 4.914772727272727e-05, + "loss": 0.415, + "step": 173 + }, + { + "epoch": 0.014843883296365807, + "grad_norm": 2.374428730572856, + "learning_rate": 4.943181818181818e-05, + "loss": 0.3849, + "step": 174 + }, + { + "epoch": 0.014929192970482853, + "grad_norm": 2.433557696863759, + "learning_rate": 4.971590909090909e-05, + "loss": 0.4298, + "step": 175 + }, + { + "epoch": 0.015014502644599897, + "grad_norm": 1.9580069601385302, + "learning_rate": 5e-05, + "loss": 0.3604, + "step": 176 + }, + { + "epoch": 0.015099812318716942, + "grad_norm": 2.288392681770112, + "learning_rate": 5.0284090909090905e-05, + "loss": 0.4093, + "step": 177 + }, + { + "epoch": 0.015185121992833988, + "grad_norm": 2.9733852139324513, + "learning_rate": 5.056818181818183e-05, + "loss": 0.3911, + "step": 178 + }, + { + "epoch": 0.015270431666951032, + "grad_norm": 2.6770649969954836, + "learning_rate": 5.085227272727273e-05, + "loss": 0.4516, + "step": 179 + }, + { + "epoch": 0.015355741341068077, + "grad_norm": 2.4377382602152724, + "learning_rate": 5.1136363636363635e-05, + "loss": 0.4476, + "step": 180 + }, + { + "epoch": 0.015441051015185121, + "grad_norm": 2.5861934293198994, + "learning_rate": 5.1420454545454545e-05, + "loss": 0.4685, + "step": 181 + }, + { + "epoch": 0.015526360689302167, + "grad_norm": 2.896575741088906, + "learning_rate": 5.170454545454546e-05, + "loss": 0.4433, + "step": 182 + }, + { + "epoch": 0.015611670363419212, + "grad_norm": 2.6599358510764954, + "learning_rate": 5.1988636363636364e-05, + "loss": 0.4251, + "step": 183 + }, + { + "epoch": 0.015696980037536258, + "grad_norm": 2.438004759341845, + "learning_rate": 5.2272727272727274e-05, + "loss": 0.4284, + "step": 184 + }, + { + "epoch": 0.0157822897116533, + "grad_norm": 2.227061326215854, + "learning_rate": 5.255681818181818e-05, + "loss": 0.4044, + "step": 185 + }, + { + "epoch": 0.015867599385770346, + "grad_norm": 2.380804698087815, + "learning_rate": 5.2840909090909094e-05, + "loss": 0.4194, + "step": 186 + }, + { + "epoch": 0.01595290905988739, + "grad_norm": 2.279491426063942, + "learning_rate": 5.3125000000000004e-05, + "loss": 0.431, + "step": 187 + }, + { + "epoch": 0.016038218734004437, + "grad_norm": 2.6766441568811343, + "learning_rate": 5.340909090909091e-05, + "loss": 0.3904, + "step": 188 + }, + { + "epoch": 0.016123528408121483, + "grad_norm": 1.9764639486841136, + "learning_rate": 5.3693181818181823e-05, + "loss": 0.3652, + "step": 189 + }, + { + "epoch": 0.016208838082238525, + "grad_norm": 2.5636267627128007, + "learning_rate": 5.397727272727273e-05, + "loss": 0.4372, + "step": 190 + }, + { + "epoch": 0.01629414775635557, + "grad_norm": 2.298835073766845, + "learning_rate": 5.4261363636363636e-05, + "loss": 0.4377, + "step": 191 + }, + { + "epoch": 0.016379457430472616, + "grad_norm": 2.186628549561476, + "learning_rate": 5.4545454545454546e-05, + "loss": 0.4144, + "step": 192 + }, + { + "epoch": 0.01646476710458966, + "grad_norm": 2.692304707619826, + "learning_rate": 5.482954545454546e-05, + "loss": 0.48, + "step": 193 + }, + { + "epoch": 0.016550076778706704, + "grad_norm": 2.2535190613327503, + "learning_rate": 5.5113636363636366e-05, + "loss": 0.4124, + "step": 194 + }, + { + "epoch": 0.01663538645282375, + "grad_norm": 2.0387039746317264, + "learning_rate": 5.5397727272727276e-05, + "loss": 0.3935, + "step": 195 + }, + { + "epoch": 0.016720696126940795, + "grad_norm": 2.4423187747752397, + "learning_rate": 5.568181818181818e-05, + "loss": 0.4202, + "step": 196 + }, + { + "epoch": 0.01680600580105784, + "grad_norm": 2.269094612284175, + "learning_rate": 5.5965909090909095e-05, + "loss": 0.3815, + "step": 197 + }, + { + "epoch": 0.016891315475174886, + "grad_norm": 2.2772363339670623, + "learning_rate": 5.6250000000000005e-05, + "loss": 0.4252, + "step": 198 + }, + { + "epoch": 0.016976625149291928, + "grad_norm": 2.493049496425207, + "learning_rate": 5.653409090909091e-05, + "loss": 0.4346, + "step": 199 + }, + { + "epoch": 0.017061934823408974, + "grad_norm": 2.7503155613957935, + "learning_rate": 5.6818181818181825e-05, + "loss": 0.4178, + "step": 200 + }, + { + "epoch": 0.01714724449752602, + "grad_norm": 2.2266195724935978, + "learning_rate": 5.7102272727272735e-05, + "loss": 0.412, + "step": 201 + }, + { + "epoch": 0.017232554171643065, + "grad_norm": 2.8407013029923203, + "learning_rate": 5.738636363636364e-05, + "loss": 0.4639, + "step": 202 + }, + { + "epoch": 0.01731786384576011, + "grad_norm": 2.617368777245781, + "learning_rate": 5.767045454545454e-05, + "loss": 0.4439, + "step": 203 + }, + { + "epoch": 0.017403173519877153, + "grad_norm": 2.0892848898637983, + "learning_rate": 5.7954545454545464e-05, + "loss": 0.402, + "step": 204 + }, + { + "epoch": 0.0174884831939942, + "grad_norm": 2.436676864533104, + "learning_rate": 5.823863636363637e-05, + "loss": 0.4087, + "step": 205 + }, + { + "epoch": 0.017573792868111244, + "grad_norm": 2.5845663348109458, + "learning_rate": 5.852272727272727e-05, + "loss": 0.4186, + "step": 206 + }, + { + "epoch": 0.01765910254222829, + "grad_norm": 2.3642438961408896, + "learning_rate": 5.880681818181818e-05, + "loss": 0.4086, + "step": 207 + }, + { + "epoch": 0.017744412216345335, + "grad_norm": 2.1335219995201653, + "learning_rate": 5.90909090909091e-05, + "loss": 0.3728, + "step": 208 + }, + { + "epoch": 0.017829721890462377, + "grad_norm": 2.6074355907293754, + "learning_rate": 5.9375e-05, + "loss": 0.4244, + "step": 209 + }, + { + "epoch": 0.017915031564579423, + "grad_norm": 3.25477205735282, + "learning_rate": 5.965909090909091e-05, + "loss": 0.4111, + "step": 210 + }, + { + "epoch": 0.01800034123869647, + "grad_norm": 2.454606696048798, + "learning_rate": 5.9943181818181826e-05, + "loss": 0.4215, + "step": 211 + }, + { + "epoch": 0.018085650912813514, + "grad_norm": 2.1353496509984127, + "learning_rate": 6.022727272727273e-05, + "loss": 0.419, + "step": 212 + }, + { + "epoch": 0.01817096058693056, + "grad_norm": 2.500536479608336, + "learning_rate": 6.051136363636364e-05, + "loss": 0.4496, + "step": 213 + }, + { + "epoch": 0.0182562702610476, + "grad_norm": 2.9987885890320753, + "learning_rate": 6.079545454545454e-05, + "loss": 0.4507, + "step": 214 + }, + { + "epoch": 0.018341579935164647, + "grad_norm": 2.247094928021411, + "learning_rate": 6.107954545454547e-05, + "loss": 0.4054, + "step": 215 + }, + { + "epoch": 0.018426889609281693, + "grad_norm": 2.402179892309456, + "learning_rate": 6.136363636363636e-05, + "loss": 0.4485, + "step": 216 + }, + { + "epoch": 0.01851219928339874, + "grad_norm": 1.7680762500588991, + "learning_rate": 6.164772727272727e-05, + "loss": 0.3624, + "step": 217 + }, + { + "epoch": 0.01859750895751578, + "grad_norm": 2.6790701374642354, + "learning_rate": 6.193181818181818e-05, + "loss": 0.4293, + "step": 218 + }, + { + "epoch": 0.018682818631632826, + "grad_norm": 2.4588525083784476, + "learning_rate": 6.221590909090909e-05, + "loss": 0.433, + "step": 219 + }, + { + "epoch": 0.018768128305749872, + "grad_norm": 2.0963968655678356, + "learning_rate": 6.25e-05, + "loss": 0.3591, + "step": 220 + }, + { + "epoch": 0.018853437979866917, + "grad_norm": 2.794223146247957, + "learning_rate": 6.278409090909091e-05, + "loss": 0.4308, + "step": 221 + }, + { + "epoch": 0.018938747653983963, + "grad_norm": 2.2081393270624314, + "learning_rate": 6.306818181818182e-05, + "loss": 0.3923, + "step": 222 + }, + { + "epoch": 0.019024057328101005, + "grad_norm": 2.12053817863608, + "learning_rate": 6.335227272727273e-05, + "loss": 0.4308, + "step": 223 + }, + { + "epoch": 0.01910936700221805, + "grad_norm": 2.3852945377008883, + "learning_rate": 6.363636363636364e-05, + "loss": 0.4389, + "step": 224 + }, + { + "epoch": 0.019194676676335096, + "grad_norm": 2.1586937808351845, + "learning_rate": 6.392045454545455e-05, + "loss": 0.4132, + "step": 225 + }, + { + "epoch": 0.019279986350452142, + "grad_norm": 2.079858042934605, + "learning_rate": 6.420454545454546e-05, + "loss": 0.4216, + "step": 226 + }, + { + "epoch": 0.019365296024569188, + "grad_norm": 1.9909462446488657, + "learning_rate": 6.448863636363637e-05, + "loss": 0.35, + "step": 227 + }, + { + "epoch": 0.01945060569868623, + "grad_norm": 2.0339874601624626, + "learning_rate": 6.477272727272728e-05, + "loss": 0.3714, + "step": 228 + }, + { + "epoch": 0.019535915372803275, + "grad_norm": 1.601909544025692, + "learning_rate": 6.505681818181818e-05, + "loss": 0.3724, + "step": 229 + }, + { + "epoch": 0.01962122504692032, + "grad_norm": 1.9721372015929162, + "learning_rate": 6.53409090909091e-05, + "loss": 0.4024, + "step": 230 + }, + { + "epoch": 0.019706534721037366, + "grad_norm": 2.4817827243793587, + "learning_rate": 6.562500000000001e-05, + "loss": 0.4502, + "step": 231 + }, + { + "epoch": 0.019791844395154412, + "grad_norm": 2.0375358157140164, + "learning_rate": 6.59090909090909e-05, + "loss": 0.3958, + "step": 232 + }, + { + "epoch": 0.019877154069271454, + "grad_norm": 2.3366166470335035, + "learning_rate": 6.619318181818183e-05, + "loss": 0.4338, + "step": 233 + }, + { + "epoch": 0.0199624637433885, + "grad_norm": 1.8930909020170057, + "learning_rate": 6.647727272727274e-05, + "loss": 0.3942, + "step": 234 + }, + { + "epoch": 0.020047773417505545, + "grad_norm": 2.694706857057218, + "learning_rate": 6.676136363636364e-05, + "loss": 0.4838, + "step": 235 + }, + { + "epoch": 0.02013308309162259, + "grad_norm": 2.4290366328285473, + "learning_rate": 6.704545454545455e-05, + "loss": 0.4179, + "step": 236 + }, + { + "epoch": 0.020218392765739637, + "grad_norm": 2.3474409414059116, + "learning_rate": 6.732954545454547e-05, + "loss": 0.4183, + "step": 237 + }, + { + "epoch": 0.02030370243985668, + "grad_norm": 2.2787788279640795, + "learning_rate": 6.761363636363636e-05, + "loss": 0.4061, + "step": 238 + }, + { + "epoch": 0.020389012113973724, + "grad_norm": 2.373541534735438, + "learning_rate": 6.789772727272727e-05, + "loss": 0.4365, + "step": 239 + }, + { + "epoch": 0.02047432178809077, + "grad_norm": 2.3081579942821513, + "learning_rate": 6.818181818181818e-05, + "loss": 0.3781, + "step": 240 + }, + { + "epoch": 0.020559631462207816, + "grad_norm": 2.5766238379363178, + "learning_rate": 6.84659090909091e-05, + "loss": 0.4276, + "step": 241 + }, + { + "epoch": 0.020644941136324858, + "grad_norm": 2.38974830658828, + "learning_rate": 6.875e-05, + "loss": 0.436, + "step": 242 + }, + { + "epoch": 0.020730250810441903, + "grad_norm": 2.5043939124493533, + "learning_rate": 6.903409090909091e-05, + "loss": 0.4271, + "step": 243 + }, + { + "epoch": 0.02081556048455895, + "grad_norm": 2.2455676184058695, + "learning_rate": 6.931818181818182e-05, + "loss": 0.3763, + "step": 244 + }, + { + "epoch": 0.020900870158675994, + "grad_norm": 2.0126630918393773, + "learning_rate": 6.960227272727273e-05, + "loss": 0.4216, + "step": 245 + }, + { + "epoch": 0.02098617983279304, + "grad_norm": 2.4314806046744857, + "learning_rate": 6.988636363636364e-05, + "loss": 0.3868, + "step": 246 + }, + { + "epoch": 0.021071489506910082, + "grad_norm": 2.683584332623123, + "learning_rate": 7.017045454545454e-05, + "loss": 0.4434, + "step": 247 + }, + { + "epoch": 0.021156799181027128, + "grad_norm": 2.522366039397641, + "learning_rate": 7.045454545454546e-05, + "loss": 0.4322, + "step": 248 + }, + { + "epoch": 0.021242108855144173, + "grad_norm": 2.4736583541232173, + "learning_rate": 7.073863636363637e-05, + "loss": 0.4598, + "step": 249 + }, + { + "epoch": 0.02132741852926122, + "grad_norm": 1.9516230797508671, + "learning_rate": 7.102272727272727e-05, + "loss": 0.4696, + "step": 250 + }, + { + "epoch": 0.021412728203378265, + "grad_norm": 1.8823724129490143, + "learning_rate": 7.130681818181818e-05, + "loss": 0.3884, + "step": 251 + }, + { + "epoch": 0.021498037877495307, + "grad_norm": 2.302613336878057, + "learning_rate": 7.15909090909091e-05, + "loss": 0.4278, + "step": 252 + }, + { + "epoch": 0.021583347551612352, + "grad_norm": 2.064295363014741, + "learning_rate": 7.1875e-05, + "loss": 0.4239, + "step": 253 + }, + { + "epoch": 0.021668657225729398, + "grad_norm": 2.090826242109761, + "learning_rate": 7.215909090909091e-05, + "loss": 0.4356, + "step": 254 + }, + { + "epoch": 0.021753966899846443, + "grad_norm": 1.9675281911378966, + "learning_rate": 7.244318181818183e-05, + "loss": 0.403, + "step": 255 + }, + { + "epoch": 0.02183927657396349, + "grad_norm": 2.2684160601544314, + "learning_rate": 7.272727272727273e-05, + "loss": 0.4429, + "step": 256 + }, + { + "epoch": 0.02192458624808053, + "grad_norm": 1.8001039104138832, + "learning_rate": 7.301136363636364e-05, + "loss": 0.3878, + "step": 257 + }, + { + "epoch": 0.022009895922197577, + "grad_norm": 2.824482294768277, + "learning_rate": 7.329545454545455e-05, + "loss": 0.4372, + "step": 258 + }, + { + "epoch": 0.022095205596314622, + "grad_norm": 2.114413763554116, + "learning_rate": 7.357954545454546e-05, + "loss": 0.4565, + "step": 259 + }, + { + "epoch": 0.022180515270431668, + "grad_norm": 2.410582765897125, + "learning_rate": 7.386363636363637e-05, + "loss": 0.4433, + "step": 260 + }, + { + "epoch": 0.02226582494454871, + "grad_norm": 2.2112227077320132, + "learning_rate": 7.414772727272728e-05, + "loss": 0.4087, + "step": 261 + }, + { + "epoch": 0.022351134618665756, + "grad_norm": 1.6947096354821942, + "learning_rate": 7.443181818181817e-05, + "loss": 0.3613, + "step": 262 + }, + { + "epoch": 0.0224364442927828, + "grad_norm": 2.301747765112511, + "learning_rate": 7.47159090909091e-05, + "loss": 0.4189, + "step": 263 + }, + { + "epoch": 0.022521753966899847, + "grad_norm": 2.30536468384744, + "learning_rate": 7.500000000000001e-05, + "loss": 0.4359, + "step": 264 + }, + { + "epoch": 0.022607063641016893, + "grad_norm": 1.9573892658039487, + "learning_rate": 7.52840909090909e-05, + "loss": 0.4206, + "step": 265 + }, + { + "epoch": 0.022692373315133935, + "grad_norm": 2.129782959889996, + "learning_rate": 7.556818181818183e-05, + "loss": 0.3936, + "step": 266 + }, + { + "epoch": 0.02277768298925098, + "grad_norm": 2.1100318168663894, + "learning_rate": 7.585227272727274e-05, + "loss": 0.432, + "step": 267 + }, + { + "epoch": 0.022862992663368026, + "grad_norm": 2.5995322430271646, + "learning_rate": 7.613636363636363e-05, + "loss": 0.4807, + "step": 268 + }, + { + "epoch": 0.02294830233748507, + "grad_norm": 2.0337347238971604, + "learning_rate": 7.642045454545454e-05, + "loss": 0.3795, + "step": 269 + }, + { + "epoch": 0.023033612011602117, + "grad_norm": 2.190910840942133, + "learning_rate": 7.670454545454547e-05, + "loss": 0.4297, + "step": 270 + }, + { + "epoch": 0.02311892168571916, + "grad_norm": 2.419609141982105, + "learning_rate": 7.698863636363636e-05, + "loss": 0.458, + "step": 271 + }, + { + "epoch": 0.023204231359836205, + "grad_norm": 2.168031103938609, + "learning_rate": 7.727272727272727e-05, + "loss": 0.41, + "step": 272 + }, + { + "epoch": 0.02328954103395325, + "grad_norm": 2.3960852006847397, + "learning_rate": 7.755681818181818e-05, + "loss": 0.4996, + "step": 273 + }, + { + "epoch": 0.023374850708070296, + "grad_norm": 2.2335931437045566, + "learning_rate": 7.784090909090909e-05, + "loss": 0.4293, + "step": 274 + }, + { + "epoch": 0.02346016038218734, + "grad_norm": 2.041486096344372, + "learning_rate": 7.8125e-05, + "loss": 0.3993, + "step": 275 + }, + { + "epoch": 0.023545470056304384, + "grad_norm": 2.2405684204619236, + "learning_rate": 7.840909090909091e-05, + "loss": 0.3819, + "step": 276 + }, + { + "epoch": 0.02363077973042143, + "grad_norm": 2.2809759524096678, + "learning_rate": 7.869318181818182e-05, + "loss": 0.4047, + "step": 277 + }, + { + "epoch": 0.023716089404538475, + "grad_norm": 1.8354656505940115, + "learning_rate": 7.897727272727273e-05, + "loss": 0.4128, + "step": 278 + }, + { + "epoch": 0.02380139907865552, + "grad_norm": 2.1087050892966253, + "learning_rate": 7.926136363636364e-05, + "loss": 0.4591, + "step": 279 + }, + { + "epoch": 0.023886708752772566, + "grad_norm": 1.9526127257879837, + "learning_rate": 7.954545454545455e-05, + "loss": 0.3926, + "step": 280 + }, + { + "epoch": 0.023972018426889608, + "grad_norm": 2.231732065820366, + "learning_rate": 7.982954545454546e-05, + "loss": 0.4224, + "step": 281 + }, + { + "epoch": 0.024057328101006654, + "grad_norm": 2.3258765711422074, + "learning_rate": 8.011363636363637e-05, + "loss": 0.4486, + "step": 282 + }, + { + "epoch": 0.0241426377751237, + "grad_norm": 2.097291598600573, + "learning_rate": 8.039772727272728e-05, + "loss": 0.4555, + "step": 283 + }, + { + "epoch": 0.024227947449240745, + "grad_norm": 2.0576139431037546, + "learning_rate": 8.068181818181818e-05, + "loss": 0.4242, + "step": 284 + }, + { + "epoch": 0.024313257123357787, + "grad_norm": 2.1073030821911694, + "learning_rate": 8.09659090909091e-05, + "loss": 0.4592, + "step": 285 + }, + { + "epoch": 0.024398566797474833, + "grad_norm": 2.1904471027227888, + "learning_rate": 8.125000000000001e-05, + "loss": 0.4315, + "step": 286 + }, + { + "epoch": 0.02448387647159188, + "grad_norm": 2.39197317766697, + "learning_rate": 8.15340909090909e-05, + "loss": 0.4379, + "step": 287 + }, + { + "epoch": 0.024569186145708924, + "grad_norm": 2.1958233113455274, + "learning_rate": 8.181818181818183e-05, + "loss": 0.4387, + "step": 288 + }, + { + "epoch": 0.02465449581982597, + "grad_norm": 2.1177392219608735, + "learning_rate": 8.210227272727274e-05, + "loss": 0.4244, + "step": 289 + }, + { + "epoch": 0.02473980549394301, + "grad_norm": 2.163896244424515, + "learning_rate": 8.238636363636364e-05, + "loss": 0.4233, + "step": 290 + }, + { + "epoch": 0.024825115168060057, + "grad_norm": 2.321665051750211, + "learning_rate": 8.267045454545455e-05, + "loss": 0.4715, + "step": 291 + }, + { + "epoch": 0.024910424842177103, + "grad_norm": 1.8350982695831577, + "learning_rate": 8.295454545454547e-05, + "loss": 0.4368, + "step": 292 + }, + { + "epoch": 0.02499573451629415, + "grad_norm": 2.5456302151812715, + "learning_rate": 8.323863636363637e-05, + "loss": 0.4511, + "step": 293 + }, + { + "epoch": 0.025081044190411194, + "grad_norm": 2.0739950573088883, + "learning_rate": 8.352272727272727e-05, + "loss": 0.4007, + "step": 294 + }, + { + "epoch": 0.025166353864528236, + "grad_norm": 2.5200214707504403, + "learning_rate": 8.380681818181818e-05, + "loss": 0.4289, + "step": 295 + }, + { + "epoch": 0.025251663538645282, + "grad_norm": 2.1590787604323283, + "learning_rate": 8.40909090909091e-05, + "loss": 0.419, + "step": 296 + }, + { + "epoch": 0.025336973212762327, + "grad_norm": 2.8775934877959846, + "learning_rate": 8.4375e-05, + "loss": 0.4577, + "step": 297 + }, + { + "epoch": 0.025422282886879373, + "grad_norm": 1.9942457150856439, + "learning_rate": 8.465909090909091e-05, + "loss": 0.3722, + "step": 298 + }, + { + "epoch": 0.02550759256099642, + "grad_norm": 2.259699893875751, + "learning_rate": 8.494318181818182e-05, + "loss": 0.4225, + "step": 299 + }, + { + "epoch": 0.02559290223511346, + "grad_norm": 2.1876651156643416, + "learning_rate": 8.522727272727273e-05, + "loss": 0.4391, + "step": 300 + }, + { + "epoch": 0.025678211909230506, + "grad_norm": 2.239210005997261, + "learning_rate": 8.551136363636364e-05, + "loss": 0.4622, + "step": 301 + }, + { + "epoch": 0.025763521583347552, + "grad_norm": 1.9997042198811548, + "learning_rate": 8.579545454545454e-05, + "loss": 0.4283, + "step": 302 + }, + { + "epoch": 0.025848831257464597, + "grad_norm": 1.9990316669423305, + "learning_rate": 8.607954545454546e-05, + "loss": 0.392, + "step": 303 + }, + { + "epoch": 0.025934140931581643, + "grad_norm": 2.1912395665496494, + "learning_rate": 8.636363636363637e-05, + "loss": 0.4888, + "step": 304 + }, + { + "epoch": 0.026019450605698685, + "grad_norm": 1.9437906831365956, + "learning_rate": 8.664772727272727e-05, + "loss": 0.4438, + "step": 305 + }, + { + "epoch": 0.02610476027981573, + "grad_norm": 2.2897292067825545, + "learning_rate": 8.693181818181818e-05, + "loss": 0.4411, + "step": 306 + }, + { + "epoch": 0.026190069953932776, + "grad_norm": 1.8667725936035295, + "learning_rate": 8.72159090909091e-05, + "loss": 0.3792, + "step": 307 + }, + { + "epoch": 0.026275379628049822, + "grad_norm": 2.001616539921716, + "learning_rate": 8.75e-05, + "loss": 0.3993, + "step": 308 + }, + { + "epoch": 0.026360689302166864, + "grad_norm": 1.9698526836636576, + "learning_rate": 8.778409090909091e-05, + "loss": 0.4038, + "step": 309 + }, + { + "epoch": 0.02644599897628391, + "grad_norm": 1.984745141298858, + "learning_rate": 8.806818181818183e-05, + "loss": 0.4541, + "step": 310 + }, + { + "epoch": 0.026531308650400955, + "grad_norm": 1.884512865580798, + "learning_rate": 8.835227272727273e-05, + "loss": 0.4381, + "step": 311 + }, + { + "epoch": 0.026616618324518, + "grad_norm": 2.4831388265060133, + "learning_rate": 8.863636363636364e-05, + "loss": 0.424, + "step": 312 + }, + { + "epoch": 0.026701927998635046, + "grad_norm": 2.0911087913288613, + "learning_rate": 8.892045454545455e-05, + "loss": 0.4585, + "step": 313 + }, + { + "epoch": 0.02678723767275209, + "grad_norm": 2.099454709056239, + "learning_rate": 8.920454545454546e-05, + "loss": 0.4413, + "step": 314 + }, + { + "epoch": 0.026872547346869134, + "grad_norm": 1.857717849463511, + "learning_rate": 8.948863636363637e-05, + "loss": 0.4365, + "step": 315 + }, + { + "epoch": 0.02695785702098618, + "grad_norm": 2.2091446471822693, + "learning_rate": 8.977272727272728e-05, + "loss": 0.3983, + "step": 316 + }, + { + "epoch": 0.027043166695103225, + "grad_norm": 1.8647576007353133, + "learning_rate": 9.005681818181819e-05, + "loss": 0.3884, + "step": 317 + }, + { + "epoch": 0.02712847636922027, + "grad_norm": 1.9841011639383348, + "learning_rate": 9.03409090909091e-05, + "loss": 0.4302, + "step": 318 + }, + { + "epoch": 0.027213786043337313, + "grad_norm": 1.859998710590859, + "learning_rate": 9.062500000000001e-05, + "loss": 0.4028, + "step": 319 + }, + { + "epoch": 0.02729909571745436, + "grad_norm": 2.023175903848956, + "learning_rate": 9.090909090909092e-05, + "loss": 0.3932, + "step": 320 + }, + { + "epoch": 0.027384405391571404, + "grad_norm": 2.2051830013395652, + "learning_rate": 9.119318181818183e-05, + "loss": 0.4163, + "step": 321 + }, + { + "epoch": 0.02746971506568845, + "grad_norm": 2.235208955934696, + "learning_rate": 9.147727272727274e-05, + "loss": 0.4609, + "step": 322 + }, + { + "epoch": 0.027555024739805496, + "grad_norm": 2.1994964803679227, + "learning_rate": 9.176136363636363e-05, + "loss": 0.4547, + "step": 323 + }, + { + "epoch": 0.027640334413922538, + "grad_norm": 2.0965232133333056, + "learning_rate": 9.204545454545454e-05, + "loss": 0.4096, + "step": 324 + }, + { + "epoch": 0.027725644088039583, + "grad_norm": 1.9607746922967504, + "learning_rate": 9.232954545454547e-05, + "loss": 0.4394, + "step": 325 + }, + { + "epoch": 0.02781095376215663, + "grad_norm": 2.1625424772291324, + "learning_rate": 9.261363636363636e-05, + "loss": 0.4258, + "step": 326 + }, + { + "epoch": 0.027896263436273674, + "grad_norm": 2.0981794731243633, + "learning_rate": 9.289772727272727e-05, + "loss": 0.4054, + "step": 327 + }, + { + "epoch": 0.02798157311039072, + "grad_norm": 2.1449846854919206, + "learning_rate": 9.318181818181818e-05, + "loss": 0.3627, + "step": 328 + }, + { + "epoch": 0.028066882784507762, + "grad_norm": 2.014754351274066, + "learning_rate": 9.346590909090909e-05, + "loss": 0.4327, + "step": 329 + }, + { + "epoch": 0.028152192458624808, + "grad_norm": 1.9631347504440135, + "learning_rate": 9.375e-05, + "loss": 0.37, + "step": 330 + }, + { + "epoch": 0.028237502132741853, + "grad_norm": 2.3106651243647502, + "learning_rate": 9.403409090909091e-05, + "loss": 0.4704, + "step": 331 + }, + { + "epoch": 0.0283228118068589, + "grad_norm": 1.8457726198061013, + "learning_rate": 9.431818181818182e-05, + "loss": 0.4334, + "step": 332 + }, + { + "epoch": 0.02840812148097594, + "grad_norm": 2.2932677715747793, + "learning_rate": 9.460227272727273e-05, + "loss": 0.4389, + "step": 333 + }, + { + "epoch": 0.028493431155092987, + "grad_norm": 1.9535980872354977, + "learning_rate": 9.488636363636364e-05, + "loss": 0.4002, + "step": 334 + }, + { + "epoch": 0.028578740829210032, + "grad_norm": 2.082547751768784, + "learning_rate": 9.517045454545455e-05, + "loss": 0.4885, + "step": 335 + }, + { + "epoch": 0.028664050503327078, + "grad_norm": 1.8107495240807714, + "learning_rate": 9.545454545454546e-05, + "loss": 0.3809, + "step": 336 + }, + { + "epoch": 0.028749360177444123, + "grad_norm": 2.1616497154228065, + "learning_rate": 9.573863636363637e-05, + "loss": 0.4312, + "step": 337 + }, + { + "epoch": 0.028834669851561166, + "grad_norm": 1.862738508275819, + "learning_rate": 9.602272727272728e-05, + "loss": 0.4555, + "step": 338 + }, + { + "epoch": 0.02891997952567821, + "grad_norm": 1.9347831381781693, + "learning_rate": 9.630681818181818e-05, + "loss": 0.4439, + "step": 339 + }, + { + "epoch": 0.029005289199795257, + "grad_norm": 2.127971030783837, + "learning_rate": 9.65909090909091e-05, + "loss": 0.466, + "step": 340 + }, + { + "epoch": 0.029090598873912302, + "grad_norm": 2.091898312856188, + "learning_rate": 9.687500000000001e-05, + "loss": 0.4355, + "step": 341 + }, + { + "epoch": 0.029175908548029348, + "grad_norm": 2.1397767996925383, + "learning_rate": 9.71590909090909e-05, + "loss": 0.4852, + "step": 342 + }, + { + "epoch": 0.02926121822214639, + "grad_norm": 2.0469305598748244, + "learning_rate": 9.744318181818183e-05, + "loss": 0.4438, + "step": 343 + }, + { + "epoch": 0.029346527896263436, + "grad_norm": 2.070087348643038, + "learning_rate": 9.772727272727274e-05, + "loss": 0.4791, + "step": 344 + }, + { + "epoch": 0.02943183757038048, + "grad_norm": 2.08017096513764, + "learning_rate": 9.801136363636364e-05, + "loss": 0.3927, + "step": 345 + }, + { + "epoch": 0.029517147244497527, + "grad_norm": 1.7758976345632271, + "learning_rate": 9.829545454545455e-05, + "loss": 0.4143, + "step": 346 + }, + { + "epoch": 0.029602456918614573, + "grad_norm": 2.2037809220696207, + "learning_rate": 9.857954545454547e-05, + "loss": 0.4435, + "step": 347 + }, + { + "epoch": 0.029687766592731615, + "grad_norm": 1.9215947388782628, + "learning_rate": 9.886363636363637e-05, + "loss": 0.3775, + "step": 348 + }, + { + "epoch": 0.02977307626684866, + "grad_norm": 2.0231537491078866, + "learning_rate": 9.914772727272728e-05, + "loss": 0.4202, + "step": 349 + }, + { + "epoch": 0.029858385940965706, + "grad_norm": 2.1361756131349754, + "learning_rate": 9.943181818181819e-05, + "loss": 0.3969, + "step": 350 + }, + { + "epoch": 0.02994369561508275, + "grad_norm": 1.9714897710470474, + "learning_rate": 9.97159090909091e-05, + "loss": 0.4254, + "step": 351 + }, + { + "epoch": 0.030029005289199794, + "grad_norm": 1.9625615541560266, + "learning_rate": 0.0001, + "loss": 0.446, + "step": 352 + }, + { + "epoch": 0.03011431496331684, + "grad_norm": 1.9589553480954867, + "learning_rate": 9.999999809138285e-05, + "loss": 0.4354, + "step": 353 + }, + { + "epoch": 0.030199624637433885, + "grad_norm": 1.9948155440718336, + "learning_rate": 9.999999236553155e-05, + "loss": 0.4147, + "step": 354 + }, + { + "epoch": 0.03028493431155093, + "grad_norm": 2.6046457892581096, + "learning_rate": 9.999998282244653e-05, + "loss": 0.4593, + "step": 355 + }, + { + "epoch": 0.030370243985667976, + "grad_norm": 2.279216957815973, + "learning_rate": 9.999996946212851e-05, + "loss": 0.4585, + "step": 356 + }, + { + "epoch": 0.030455553659785018, + "grad_norm": 2.1956250385704656, + "learning_rate": 9.999995228457853e-05, + "loss": 0.4435, + "step": 357 + }, + { + "epoch": 0.030540863333902064, + "grad_norm": 1.7413102066574255, + "learning_rate": 9.99999312897979e-05, + "loss": 0.4181, + "step": 358 + }, + { + "epoch": 0.03062617300801911, + "grad_norm": 1.7067792693383537, + "learning_rate": 9.99999064777882e-05, + "loss": 0.4008, + "step": 359 + }, + { + "epoch": 0.030711482682136155, + "grad_norm": 1.9150008412252226, + "learning_rate": 9.999987784855135e-05, + "loss": 0.4394, + "step": 360 + }, + { + "epoch": 0.0307967923562532, + "grad_norm": 2.18074059426832, + "learning_rate": 9.999984540208954e-05, + "loss": 0.3823, + "step": 361 + }, + { + "epoch": 0.030882102030370243, + "grad_norm": 2.0830388306044387, + "learning_rate": 9.99998091384052e-05, + "loss": 0.4554, + "step": 362 + }, + { + "epoch": 0.030967411704487288, + "grad_norm": 2.0196632327369417, + "learning_rate": 9.999976905750114e-05, + "loss": 0.4342, + "step": 363 + }, + { + "epoch": 0.031052721378604334, + "grad_norm": 2.0923935780870955, + "learning_rate": 9.999972515938044e-05, + "loss": 0.4247, + "step": 364 + }, + { + "epoch": 0.03113803105272138, + "grad_norm": 1.9031563341289148, + "learning_rate": 9.999967744404639e-05, + "loss": 0.3841, + "step": 365 + }, + { + "epoch": 0.031223340726838425, + "grad_norm": 2.1459903721304556, + "learning_rate": 9.999962591150267e-05, + "loss": 0.4739, + "step": 366 + }, + { + "epoch": 0.03130865040095547, + "grad_norm": 2.3628572950694235, + "learning_rate": 9.999957056175321e-05, + "loss": 0.5259, + "step": 367 + }, + { + "epoch": 0.031393960075072516, + "grad_norm": 1.7150793934676076, + "learning_rate": 9.999951139480224e-05, + "loss": 0.3993, + "step": 368 + }, + { + "epoch": 0.031479269749189555, + "grad_norm": 1.6935744275524314, + "learning_rate": 9.999944841065427e-05, + "loss": 0.4182, + "step": 369 + }, + { + "epoch": 0.0315645794233066, + "grad_norm": 2.1951437237029383, + "learning_rate": 9.999938160931412e-05, + "loss": 0.4445, + "step": 370 + }, + { + "epoch": 0.031649889097423646, + "grad_norm": 1.8465669739779238, + "learning_rate": 9.999931099078688e-05, + "loss": 0.4114, + "step": 371 + }, + { + "epoch": 0.03173519877154069, + "grad_norm": 2.0087053859197708, + "learning_rate": 9.999923655507792e-05, + "loss": 0.4389, + "step": 372 + }, + { + "epoch": 0.03182050844565774, + "grad_norm": 1.8213840326906363, + "learning_rate": 9.999915830219296e-05, + "loss": 0.4106, + "step": 373 + }, + { + "epoch": 0.03190581811977478, + "grad_norm": 2.053223875501273, + "learning_rate": 9.999907623213796e-05, + "loss": 0.463, + "step": 374 + }, + { + "epoch": 0.03199112779389183, + "grad_norm": 1.899658940472844, + "learning_rate": 9.999899034491919e-05, + "loss": 0.4402, + "step": 375 + }, + { + "epoch": 0.032076437468008874, + "grad_norm": 1.7400649616567545, + "learning_rate": 9.999890064054318e-05, + "loss": 0.4165, + "step": 376 + }, + { + "epoch": 0.03216174714212592, + "grad_norm": 1.9470843476577795, + "learning_rate": 9.999880711901682e-05, + "loss": 0.4461, + "step": 377 + }, + { + "epoch": 0.032247056816242965, + "grad_norm": 1.574740794211691, + "learning_rate": 9.999870978034722e-05, + "loss": 0.3808, + "step": 378 + }, + { + "epoch": 0.032332366490360004, + "grad_norm": 1.6731729193213831, + "learning_rate": 9.999860862454182e-05, + "loss": 0.3732, + "step": 379 + }, + { + "epoch": 0.03241767616447705, + "grad_norm": 2.3072586459056654, + "learning_rate": 9.999850365160836e-05, + "loss": 0.4685, + "step": 380 + }, + { + "epoch": 0.032502985838594095, + "grad_norm": 2.4653633162533937, + "learning_rate": 9.999839486155482e-05, + "loss": 0.4687, + "step": 381 + }, + { + "epoch": 0.03258829551271114, + "grad_norm": 2.1617605587880204, + "learning_rate": 9.999828225438954e-05, + "loss": 0.4416, + "step": 382 + }, + { + "epoch": 0.032673605186828186, + "grad_norm": 1.8432090741654392, + "learning_rate": 9.999816583012109e-05, + "loss": 0.38, + "step": 383 + }, + { + "epoch": 0.03275891486094523, + "grad_norm": 1.7255542970270525, + "learning_rate": 9.999804558875835e-05, + "loss": 0.3897, + "step": 384 + }, + { + "epoch": 0.03284422453506228, + "grad_norm": 2.2437285865888232, + "learning_rate": 9.999792153031055e-05, + "loss": 0.5137, + "step": 385 + }, + { + "epoch": 0.03292953420917932, + "grad_norm": 1.8928977609785098, + "learning_rate": 9.999779365478712e-05, + "loss": 0.4257, + "step": 386 + }, + { + "epoch": 0.03301484388329637, + "grad_norm": 1.9243978053863642, + "learning_rate": 9.999766196219784e-05, + "loss": 0.4317, + "step": 387 + }, + { + "epoch": 0.03310015355741341, + "grad_norm": 2.462422821849337, + "learning_rate": 9.999752645255273e-05, + "loss": 0.478, + "step": 388 + }, + { + "epoch": 0.03318546323153045, + "grad_norm": 2.213257559701024, + "learning_rate": 9.99973871258622e-05, + "loss": 0.4339, + "step": 389 + }, + { + "epoch": 0.0332707729056475, + "grad_norm": 1.8182274660969233, + "learning_rate": 9.99972439821368e-05, + "loss": 0.4266, + "step": 390 + }, + { + "epoch": 0.033356082579764544, + "grad_norm": 2.277835223654649, + "learning_rate": 9.999709702138756e-05, + "loss": 0.4718, + "step": 391 + }, + { + "epoch": 0.03344139225388159, + "grad_norm": 1.7945496481391174, + "learning_rate": 9.99969462436256e-05, + "loss": 0.423, + "step": 392 + }, + { + "epoch": 0.033526701927998635, + "grad_norm": 1.8387622286703071, + "learning_rate": 9.999679164886251e-05, + "loss": 0.424, + "step": 393 + }, + { + "epoch": 0.03361201160211568, + "grad_norm": 2.2155144969248077, + "learning_rate": 9.999663323711004e-05, + "loss": 0.4615, + "step": 394 + }, + { + "epoch": 0.033697321276232726, + "grad_norm": 2.1710132014269887, + "learning_rate": 9.999647100838032e-05, + "loss": 0.4169, + "step": 395 + }, + { + "epoch": 0.03378263095034977, + "grad_norm": 1.708057505352017, + "learning_rate": 9.999630496268572e-05, + "loss": 0.3975, + "step": 396 + }, + { + "epoch": 0.03386794062446682, + "grad_norm": 1.7820872965707593, + "learning_rate": 9.999613510003891e-05, + "loss": 0.4088, + "step": 397 + }, + { + "epoch": 0.033953250298583856, + "grad_norm": 1.7093968531242085, + "learning_rate": 9.999596142045286e-05, + "loss": 0.4073, + "step": 398 + }, + { + "epoch": 0.0340385599727009, + "grad_norm": 1.7906389911630942, + "learning_rate": 9.999578392394085e-05, + "loss": 0.4295, + "step": 399 + }, + { + "epoch": 0.03412386964681795, + "grad_norm": 2.029904198476538, + "learning_rate": 9.99956026105164e-05, + "loss": 0.4461, + "step": 400 + }, + { + "epoch": 0.03420917932093499, + "grad_norm": 2.509424944958969, + "learning_rate": 9.999541748019337e-05, + "loss": 0.46, + "step": 401 + }, + { + "epoch": 0.03429448899505204, + "grad_norm": 1.9037876172021797, + "learning_rate": 9.999522853298589e-05, + "loss": 0.4192, + "step": 402 + }, + { + "epoch": 0.034379798669169084, + "grad_norm": 1.5985656985186654, + "learning_rate": 9.999503576890838e-05, + "loss": 0.4096, + "step": 403 + }, + { + "epoch": 0.03446510834328613, + "grad_norm": 2.1697783744601433, + "learning_rate": 9.999483918797556e-05, + "loss": 0.4264, + "step": 404 + }, + { + "epoch": 0.034550418017403176, + "grad_norm": 2.249842850177404, + "learning_rate": 9.999463879020246e-05, + "loss": 0.4737, + "step": 405 + }, + { + "epoch": 0.03463572769152022, + "grad_norm": 1.8537895233406445, + "learning_rate": 9.999443457560434e-05, + "loss": 0.4518, + "step": 406 + }, + { + "epoch": 0.03472103736563727, + "grad_norm": 1.6848283033875446, + "learning_rate": 9.999422654419682e-05, + "loss": 0.3914, + "step": 407 + }, + { + "epoch": 0.034806347039754305, + "grad_norm": 1.9247505571330115, + "learning_rate": 9.999401469599577e-05, + "loss": 0.4369, + "step": 408 + }, + { + "epoch": 0.03489165671387135, + "grad_norm": 2.0357216545679098, + "learning_rate": 9.999379903101735e-05, + "loss": 0.4292, + "step": 409 + }, + { + "epoch": 0.0349769663879884, + "grad_norm": 1.7980703607186028, + "learning_rate": 9.999357954927808e-05, + "loss": 0.3672, + "step": 410 + }, + { + "epoch": 0.03506227606210544, + "grad_norm": 2.0037425310801225, + "learning_rate": 9.999335625079464e-05, + "loss": 0.4021, + "step": 411 + }, + { + "epoch": 0.03514758573622249, + "grad_norm": 1.9993254000684548, + "learning_rate": 9.999312913558413e-05, + "loss": 0.3912, + "step": 412 + }, + { + "epoch": 0.03523289541033953, + "grad_norm": 1.862805959673738, + "learning_rate": 9.999289820366387e-05, + "loss": 0.3825, + "step": 413 + }, + { + "epoch": 0.03531820508445658, + "grad_norm": 1.8277655354057292, + "learning_rate": 9.999266345505149e-05, + "loss": 0.4321, + "step": 414 + }, + { + "epoch": 0.035403514758573625, + "grad_norm": 1.9668139671939164, + "learning_rate": 9.999242488976493e-05, + "loss": 0.4225, + "step": 415 + }, + { + "epoch": 0.03548882443269067, + "grad_norm": 1.7374065127024416, + "learning_rate": 9.999218250782239e-05, + "loss": 0.4075, + "step": 416 + }, + { + "epoch": 0.03557413410680771, + "grad_norm": 1.783508408351506, + "learning_rate": 9.999193630924236e-05, + "loss": 0.4371, + "step": 417 + }, + { + "epoch": 0.035659443780924754, + "grad_norm": 1.842813156823864, + "learning_rate": 9.999168629404365e-05, + "loss": 0.4322, + "step": 418 + }, + { + "epoch": 0.0357447534550418, + "grad_norm": 1.7963075488168845, + "learning_rate": 9.999143246224536e-05, + "loss": 0.4051, + "step": 419 + }, + { + "epoch": 0.035830063129158846, + "grad_norm": 1.92854701637278, + "learning_rate": 9.999117481386684e-05, + "loss": 0.3736, + "step": 420 + }, + { + "epoch": 0.03591537280327589, + "grad_norm": 1.9874747988464672, + "learning_rate": 9.999091334892779e-05, + "loss": 0.4192, + "step": 421 + }, + { + "epoch": 0.03600068247739294, + "grad_norm": 1.8661637951512327, + "learning_rate": 9.999064806744816e-05, + "loss": 0.4531, + "step": 422 + }, + { + "epoch": 0.03608599215150998, + "grad_norm": 1.8672563228931698, + "learning_rate": 9.999037896944819e-05, + "loss": 0.3954, + "step": 423 + }, + { + "epoch": 0.03617130182562703, + "grad_norm": 1.9541365178540537, + "learning_rate": 9.999010605494843e-05, + "loss": 0.4929, + "step": 424 + }, + { + "epoch": 0.036256611499744074, + "grad_norm": 1.8289543943981572, + "learning_rate": 9.998982932396972e-05, + "loss": 0.388, + "step": 425 + }, + { + "epoch": 0.03634192117386112, + "grad_norm": 2.289052188576658, + "learning_rate": 9.998954877653319e-05, + "loss": 0.5287, + "step": 426 + }, + { + "epoch": 0.03642723084797816, + "grad_norm": 1.743389860857203, + "learning_rate": 9.998926441266026e-05, + "loss": 0.3843, + "step": 427 + }, + { + "epoch": 0.0365125405220952, + "grad_norm": 1.6717398045147527, + "learning_rate": 9.998897623237263e-05, + "loss": 0.427, + "step": 428 + }, + { + "epoch": 0.03659785019621225, + "grad_norm": 2.0092337361420687, + "learning_rate": 9.998868423569231e-05, + "loss": 0.4794, + "step": 429 + }, + { + "epoch": 0.036683159870329295, + "grad_norm": 2.1469976975917686, + "learning_rate": 9.998838842264158e-05, + "loss": 0.3978, + "step": 430 + }, + { + "epoch": 0.03676846954444634, + "grad_norm": 2.02892992595597, + "learning_rate": 9.998808879324304e-05, + "loss": 0.4603, + "step": 431 + }, + { + "epoch": 0.036853779218563386, + "grad_norm": 1.9340274825249049, + "learning_rate": 9.998778534751956e-05, + "loss": 0.4586, + "step": 432 + }, + { + "epoch": 0.03693908889268043, + "grad_norm": 2.045793669938324, + "learning_rate": 9.998747808549429e-05, + "loss": 0.4316, + "step": 433 + }, + { + "epoch": 0.03702439856679748, + "grad_norm": 1.9406671786677268, + "learning_rate": 9.998716700719071e-05, + "loss": 0.3865, + "step": 434 + }, + { + "epoch": 0.03710970824091452, + "grad_norm": 1.7564992110166182, + "learning_rate": 9.998685211263257e-05, + "loss": 0.448, + "step": 435 + }, + { + "epoch": 0.03719501791503156, + "grad_norm": 1.944023164646452, + "learning_rate": 9.998653340184387e-05, + "loss": 0.4768, + "step": 436 + }, + { + "epoch": 0.03728032758914861, + "grad_norm": 1.9590477823814212, + "learning_rate": 9.998621087484901e-05, + "loss": 0.4118, + "step": 437 + }, + { + "epoch": 0.03736563726326565, + "grad_norm": 1.9193376368193606, + "learning_rate": 9.998588453167256e-05, + "loss": 0.427, + "step": 438 + }, + { + "epoch": 0.0374509469373827, + "grad_norm": 1.775878033599765, + "learning_rate": 9.998555437233946e-05, + "loss": 0.3688, + "step": 439 + }, + { + "epoch": 0.037536256611499744, + "grad_norm": 1.7138817941418163, + "learning_rate": 9.998522039687488e-05, + "loss": 0.3891, + "step": 440 + }, + { + "epoch": 0.03762156628561679, + "grad_norm": 1.8589044825088654, + "learning_rate": 9.998488260530436e-05, + "loss": 0.4357, + "step": 441 + }, + { + "epoch": 0.037706875959733835, + "grad_norm": 1.8775306789568564, + "learning_rate": 9.998454099765368e-05, + "loss": 0.4409, + "step": 442 + }, + { + "epoch": 0.03779218563385088, + "grad_norm": 1.734729696209684, + "learning_rate": 9.99841955739489e-05, + "loss": 0.388, + "step": 443 + }, + { + "epoch": 0.037877495307967926, + "grad_norm": 1.7025289629966738, + "learning_rate": 9.998384633421641e-05, + "loss": 0.3993, + "step": 444 + }, + { + "epoch": 0.03796280498208497, + "grad_norm": 1.9216380399682018, + "learning_rate": 9.998349327848286e-05, + "loss": 0.4663, + "step": 445 + }, + { + "epoch": 0.03804811465620201, + "grad_norm": 2.16444002569889, + "learning_rate": 9.998313640677522e-05, + "loss": 0.4499, + "step": 446 + }, + { + "epoch": 0.038133424330319056, + "grad_norm": 2.0420511493534943, + "learning_rate": 9.998277571912073e-05, + "loss": 0.4324, + "step": 447 + }, + { + "epoch": 0.0382187340044361, + "grad_norm": 1.9204375931646704, + "learning_rate": 9.998241121554692e-05, + "loss": 0.4707, + "step": 448 + }, + { + "epoch": 0.03830404367855315, + "grad_norm": 1.8202521187259524, + "learning_rate": 9.99820428960816e-05, + "loss": 0.4616, + "step": 449 + }, + { + "epoch": 0.03838935335267019, + "grad_norm": 2.9452914533191987, + "learning_rate": 9.998167076075293e-05, + "loss": 0.4195, + "step": 450 + }, + { + "epoch": 0.03847466302678724, + "grad_norm": 1.9652133237635931, + "learning_rate": 9.998129480958929e-05, + "loss": 0.431, + "step": 451 + }, + { + "epoch": 0.038559972700904284, + "grad_norm": 1.928231682445954, + "learning_rate": 9.99809150426194e-05, + "loss": 0.4129, + "step": 452 + }, + { + "epoch": 0.03864528237502133, + "grad_norm": 1.8863727892124031, + "learning_rate": 9.998053145987223e-05, + "loss": 0.4236, + "step": 453 + }, + { + "epoch": 0.038730592049138375, + "grad_norm": 2.029140844578974, + "learning_rate": 9.998014406137709e-05, + "loss": 0.4914, + "step": 454 + }, + { + "epoch": 0.038815901723255414, + "grad_norm": 1.8084088554374762, + "learning_rate": 9.997975284716354e-05, + "loss": 0.4323, + "step": 455 + }, + { + "epoch": 0.03890121139737246, + "grad_norm": 1.9739399274170912, + "learning_rate": 9.997935781726147e-05, + "loss": 0.4625, + "step": 456 + }, + { + "epoch": 0.038986521071489505, + "grad_norm": 1.7282298740896487, + "learning_rate": 9.9978958971701e-05, + "loss": 0.4361, + "step": 457 + }, + { + "epoch": 0.03907183074560655, + "grad_norm": 1.6874368443856747, + "learning_rate": 9.99785563105126e-05, + "loss": 0.3953, + "step": 458 + }, + { + "epoch": 0.039157140419723596, + "grad_norm": 1.670781713942468, + "learning_rate": 9.997814983372702e-05, + "loss": 0.4435, + "step": 459 + }, + { + "epoch": 0.03924245009384064, + "grad_norm": 1.7607385314801893, + "learning_rate": 9.997773954137528e-05, + "loss": 0.4634, + "step": 460 + }, + { + "epoch": 0.03932775976795769, + "grad_norm": 2.1285991482196147, + "learning_rate": 9.99773254334887e-05, + "loss": 0.451, + "step": 461 + }, + { + "epoch": 0.03941306944207473, + "grad_norm": 2.0539464738403295, + "learning_rate": 9.997690751009892e-05, + "loss": 0.4879, + "step": 462 + }, + { + "epoch": 0.03949837911619178, + "grad_norm": 1.877152860019274, + "learning_rate": 9.997648577123782e-05, + "loss": 0.4216, + "step": 463 + }, + { + "epoch": 0.039583688790308824, + "grad_norm": 1.8978055679292676, + "learning_rate": 9.99760602169376e-05, + "loss": 0.3862, + "step": 464 + }, + { + "epoch": 0.03966899846442586, + "grad_norm": 1.6654749503907371, + "learning_rate": 9.997563084723077e-05, + "loss": 0.4125, + "step": 465 + }, + { + "epoch": 0.03975430813854291, + "grad_norm": 1.7587102335410838, + "learning_rate": 9.997519766215009e-05, + "loss": 0.4306, + "step": 466 + }, + { + "epoch": 0.039839617812659954, + "grad_norm": 1.6985023128211911, + "learning_rate": 9.997476066172863e-05, + "loss": 0.4102, + "step": 467 + }, + { + "epoch": 0.039924927486777, + "grad_norm": 2.1099651500119943, + "learning_rate": 9.997431984599976e-05, + "loss": 0.4818, + "step": 468 + }, + { + "epoch": 0.040010237160894045, + "grad_norm": 1.5380006796806829, + "learning_rate": 9.997387521499714e-05, + "loss": 0.3916, + "step": 469 + }, + { + "epoch": 0.04009554683501109, + "grad_norm": 1.7876247975993698, + "learning_rate": 9.99734267687547e-05, + "loss": 0.4408, + "step": 470 + }, + { + "epoch": 0.040180856509128136, + "grad_norm": 1.668714584374819, + "learning_rate": 9.997297450730669e-05, + "loss": 0.4599, + "step": 471 + }, + { + "epoch": 0.04026616618324518, + "grad_norm": 1.7888597931611698, + "learning_rate": 9.997251843068762e-05, + "loss": 0.3982, + "step": 472 + }, + { + "epoch": 0.04035147585736223, + "grad_norm": 1.831909305767981, + "learning_rate": 9.997205853893234e-05, + "loss": 0.419, + "step": 473 + }, + { + "epoch": 0.04043678553147927, + "grad_norm": 2.05642278809104, + "learning_rate": 9.997159483207594e-05, + "loss": 0.4107, + "step": 474 + }, + { + "epoch": 0.04052209520559631, + "grad_norm": 1.8964990208666261, + "learning_rate": 9.997112731015382e-05, + "loss": 0.4196, + "step": 475 + }, + { + "epoch": 0.04060740487971336, + "grad_norm": 1.6556136150263872, + "learning_rate": 9.997065597320165e-05, + "loss": 0.3846, + "step": 476 + }, + { + "epoch": 0.0406927145538304, + "grad_norm": 1.785570226035507, + "learning_rate": 9.997018082125546e-05, + "loss": 0.4453, + "step": 477 + }, + { + "epoch": 0.04077802422794745, + "grad_norm": 2.1324823703909197, + "learning_rate": 9.996970185435152e-05, + "loss": 0.444, + "step": 478 + }, + { + "epoch": 0.040863333902064494, + "grad_norm": 1.8015351610185433, + "learning_rate": 9.996921907252636e-05, + "loss": 0.4458, + "step": 479 + }, + { + "epoch": 0.04094864357618154, + "grad_norm": 1.8162515553876672, + "learning_rate": 9.996873247581689e-05, + "loss": 0.4437, + "step": 480 + }, + { + "epoch": 0.041033953250298585, + "grad_norm": 1.7954509897493156, + "learning_rate": 9.996824206426021e-05, + "loss": 0.3887, + "step": 481 + }, + { + "epoch": 0.04111926292441563, + "grad_norm": 2.009533333954896, + "learning_rate": 9.996774783789377e-05, + "loss": 0.4562, + "step": 482 + }, + { + "epoch": 0.04120457259853268, + "grad_norm": 1.6419429753074182, + "learning_rate": 9.996724979675533e-05, + "loss": 0.4176, + "step": 483 + }, + { + "epoch": 0.041289882272649715, + "grad_norm": 1.7105855362019193, + "learning_rate": 9.996674794088288e-05, + "loss": 0.4719, + "step": 484 + }, + { + "epoch": 0.04137519194676676, + "grad_norm": 2.0389576150512156, + "learning_rate": 9.996624227031474e-05, + "loss": 0.4272, + "step": 485 + }, + { + "epoch": 0.041460501620883806, + "grad_norm": 1.9720316587386792, + "learning_rate": 9.996573278508953e-05, + "loss": 0.5048, + "step": 486 + }, + { + "epoch": 0.04154581129500085, + "grad_norm": 1.955573050765085, + "learning_rate": 9.996521948524615e-05, + "loss": 0.4689, + "step": 487 + }, + { + "epoch": 0.0416311209691179, + "grad_norm": 1.8340900622253422, + "learning_rate": 9.996470237082378e-05, + "loss": 0.4552, + "step": 488 + }, + { + "epoch": 0.04171643064323494, + "grad_norm": 1.515981101449214, + "learning_rate": 9.996418144186188e-05, + "loss": 0.392, + "step": 489 + }, + { + "epoch": 0.04180174031735199, + "grad_norm": 1.8807489634080876, + "learning_rate": 9.996365669840024e-05, + "loss": 0.4006, + "step": 490 + }, + { + "epoch": 0.041887049991469034, + "grad_norm": 2.2866320938670124, + "learning_rate": 9.996312814047892e-05, + "loss": 0.4432, + "step": 491 + }, + { + "epoch": 0.04197235966558608, + "grad_norm": 1.5705188976172262, + "learning_rate": 9.996259576813828e-05, + "loss": 0.4681, + "step": 492 + }, + { + "epoch": 0.042057669339703126, + "grad_norm": 2.2225209816799407, + "learning_rate": 9.996205958141894e-05, + "loss": 0.5061, + "step": 493 + }, + { + "epoch": 0.042142979013820164, + "grad_norm": 2.0396317312273657, + "learning_rate": 9.996151958036186e-05, + "loss": 0.4875, + "step": 494 + }, + { + "epoch": 0.04222828868793721, + "grad_norm": 1.9566085876778616, + "learning_rate": 9.996097576500825e-05, + "loss": 0.4598, + "step": 495 + }, + { + "epoch": 0.042313598362054256, + "grad_norm": 1.797909513468858, + "learning_rate": 9.996042813539964e-05, + "loss": 0.4057, + "step": 496 + }, + { + "epoch": 0.0423989080361713, + "grad_norm": 1.991157115678473, + "learning_rate": 9.995987669157781e-05, + "loss": 0.453, + "step": 497 + }, + { + "epoch": 0.04248421771028835, + "grad_norm": 1.4267809563756437, + "learning_rate": 9.99593214335849e-05, + "loss": 0.4081, + "step": 498 + }, + { + "epoch": 0.04256952738440539, + "grad_norm": 1.9387760882217988, + "learning_rate": 9.995876236146327e-05, + "loss": 0.403, + "step": 499 + }, + { + "epoch": 0.04265483705852244, + "grad_norm": 1.5274801754328375, + "learning_rate": 9.995819947525563e-05, + "loss": 0.3929, + "step": 500 + }, + { + "epoch": 0.042740146732639483, + "grad_norm": 1.6270046340463735, + "learning_rate": 9.995763277500493e-05, + "loss": 0.4367, + "step": 501 + }, + { + "epoch": 0.04282545640675653, + "grad_norm": 1.5348653347824193, + "learning_rate": 9.995706226075445e-05, + "loss": 0.3822, + "step": 502 + }, + { + "epoch": 0.04291076608087357, + "grad_norm": 1.7936442646403976, + "learning_rate": 9.995648793254772e-05, + "loss": 0.4047, + "step": 503 + }, + { + "epoch": 0.04299607575499061, + "grad_norm": 1.7599888193252196, + "learning_rate": 9.995590979042861e-05, + "loss": 0.4401, + "step": 504 + }, + { + "epoch": 0.04308138542910766, + "grad_norm": 1.9607489141840362, + "learning_rate": 9.995532783444126e-05, + "loss": 0.4356, + "step": 505 + }, + { + "epoch": 0.043166695103224705, + "grad_norm": 2.145043928501278, + "learning_rate": 9.995474206463009e-05, + "loss": 0.4305, + "step": 506 + }, + { + "epoch": 0.04325200477734175, + "grad_norm": 1.6572994290339318, + "learning_rate": 9.995415248103982e-05, + "loss": 0.3654, + "step": 507 + }, + { + "epoch": 0.043337314451458796, + "grad_norm": 2.007720826427349, + "learning_rate": 9.995355908371546e-05, + "loss": 0.4444, + "step": 508 + }, + { + "epoch": 0.04342262412557584, + "grad_norm": 1.6977641577379594, + "learning_rate": 9.995296187270233e-05, + "loss": 0.4443, + "step": 509 + }, + { + "epoch": 0.04350793379969289, + "grad_norm": 1.8624156804807661, + "learning_rate": 9.9952360848046e-05, + "loss": 0.4124, + "step": 510 + }, + { + "epoch": 0.04359324347380993, + "grad_norm": 2.0336253435961242, + "learning_rate": 9.995175600979236e-05, + "loss": 0.4193, + "step": 511 + }, + { + "epoch": 0.04367855314792698, + "grad_norm": 1.863073565346736, + "learning_rate": 9.995114735798761e-05, + "loss": 0.4529, + "step": 512 + }, + { + "epoch": 0.04376386282204402, + "grad_norm": 1.5486438540583327, + "learning_rate": 9.99505348926782e-05, + "loss": 0.4217, + "step": 513 + }, + { + "epoch": 0.04384917249616106, + "grad_norm": 1.7909336216914036, + "learning_rate": 9.994991861391088e-05, + "loss": 0.4431, + "step": 514 + }, + { + "epoch": 0.04393448217027811, + "grad_norm": 1.664011063474776, + "learning_rate": 9.99492985217327e-05, + "loss": 0.4209, + "step": 515 + }, + { + "epoch": 0.044019791844395154, + "grad_norm": 1.6830434231655653, + "learning_rate": 9.994867461619101e-05, + "loss": 0.4088, + "step": 516 + }, + { + "epoch": 0.0441051015185122, + "grad_norm": 1.6184268938252164, + "learning_rate": 9.994804689733344e-05, + "loss": 0.4377, + "step": 517 + }, + { + "epoch": 0.044190411192629245, + "grad_norm": 2.0369541771088304, + "learning_rate": 9.994741536520792e-05, + "loss": 0.457, + "step": 518 + }, + { + "epoch": 0.04427572086674629, + "grad_norm": 1.9305325406880656, + "learning_rate": 9.994678001986265e-05, + "loss": 0.4623, + "step": 519 + }, + { + "epoch": 0.044361030540863336, + "grad_norm": 1.7573249649423603, + "learning_rate": 9.994614086134616e-05, + "loss": 0.4328, + "step": 520 + }, + { + "epoch": 0.04444634021498038, + "grad_norm": 2.176255234433211, + "learning_rate": 9.994549788970721e-05, + "loss": 0.4343, + "step": 521 + }, + { + "epoch": 0.04453164988909742, + "grad_norm": 1.5776720057808933, + "learning_rate": 9.99448511049949e-05, + "loss": 0.3983, + "step": 522 + }, + { + "epoch": 0.044616959563214466, + "grad_norm": 1.977000553607622, + "learning_rate": 9.994420050725863e-05, + "loss": 0.4057, + "step": 523 + }, + { + "epoch": 0.04470226923733151, + "grad_norm": 1.7487069529974277, + "learning_rate": 9.994354609654806e-05, + "loss": 0.4294, + "step": 524 + }, + { + "epoch": 0.04478757891144856, + "grad_norm": 1.6073724824865063, + "learning_rate": 9.994288787291313e-05, + "loss": 0.4342, + "step": 525 + }, + { + "epoch": 0.0448728885855656, + "grad_norm": 2.064182751002176, + "learning_rate": 9.994222583640412e-05, + "loss": 0.4617, + "step": 526 + }, + { + "epoch": 0.04495819825968265, + "grad_norm": 2.005432142838248, + "learning_rate": 9.994155998707155e-05, + "loss": 0.4528, + "step": 527 + }, + { + "epoch": 0.045043507933799694, + "grad_norm": 1.6786739396328332, + "learning_rate": 9.994089032496627e-05, + "loss": 0.4335, + "step": 528 + }, + { + "epoch": 0.04512881760791674, + "grad_norm": 1.5681755746810122, + "learning_rate": 9.994021685013939e-05, + "loss": 0.3518, + "step": 529 + }, + { + "epoch": 0.045214127282033785, + "grad_norm": 1.7847948772849345, + "learning_rate": 9.993953956264235e-05, + "loss": 0.4301, + "step": 530 + }, + { + "epoch": 0.04529943695615083, + "grad_norm": 1.81006122812224, + "learning_rate": 9.993885846252682e-05, + "loss": 0.4555, + "step": 531 + }, + { + "epoch": 0.04538474663026787, + "grad_norm": 2.0485164670759537, + "learning_rate": 9.993817354984486e-05, + "loss": 0.4285, + "step": 532 + }, + { + "epoch": 0.045470056304384915, + "grad_norm": 1.8557016969314828, + "learning_rate": 9.993748482464868e-05, + "loss": 0.4561, + "step": 533 + }, + { + "epoch": 0.04555536597850196, + "grad_norm": 1.9225785743123498, + "learning_rate": 9.993679228699091e-05, + "loss": 0.4395, + "step": 534 + }, + { + "epoch": 0.045640675652619006, + "grad_norm": 2.003022056469577, + "learning_rate": 9.993609593692442e-05, + "loss": 0.484, + "step": 535 + }, + { + "epoch": 0.04572598532673605, + "grad_norm": 1.5764848098289237, + "learning_rate": 9.993539577450237e-05, + "loss": 0.3652, + "step": 536 + }, + { + "epoch": 0.0458112950008531, + "grad_norm": 1.8432079746924845, + "learning_rate": 9.993469179977821e-05, + "loss": 0.4036, + "step": 537 + }, + { + "epoch": 0.04589660467497014, + "grad_norm": 1.7259818779445988, + "learning_rate": 9.993398401280567e-05, + "loss": 0.4005, + "step": 538 + }, + { + "epoch": 0.04598191434908719, + "grad_norm": 1.772212205989413, + "learning_rate": 9.993327241363881e-05, + "loss": 0.4143, + "step": 539 + }, + { + "epoch": 0.046067224023204234, + "grad_norm": 1.863213687847433, + "learning_rate": 9.993255700233194e-05, + "loss": 0.3589, + "step": 540 + }, + { + "epoch": 0.04615253369732128, + "grad_norm": 2.106266698572994, + "learning_rate": 9.99318377789397e-05, + "loss": 0.4102, + "step": 541 + }, + { + "epoch": 0.04623784337143832, + "grad_norm": 1.7667209970769158, + "learning_rate": 9.993111474351698e-05, + "loss": 0.4535, + "step": 542 + }, + { + "epoch": 0.046323153045555364, + "grad_norm": 1.883366673379803, + "learning_rate": 9.993038789611897e-05, + "loss": 0.4883, + "step": 543 + }, + { + "epoch": 0.04640846271967241, + "grad_norm": 1.9041348588931537, + "learning_rate": 9.992965723680117e-05, + "loss": 0.406, + "step": 544 + }, + { + "epoch": 0.046493772393789455, + "grad_norm": 1.897527217443621, + "learning_rate": 9.992892276561938e-05, + "loss": 0.4507, + "step": 545 + }, + { + "epoch": 0.0465790820679065, + "grad_norm": 1.821830280468326, + "learning_rate": 9.992818448262965e-05, + "loss": 0.4342, + "step": 546 + }, + { + "epoch": 0.046664391742023546, + "grad_norm": 1.9887043740669104, + "learning_rate": 9.992744238788836e-05, + "loss": 0.4759, + "step": 547 + }, + { + "epoch": 0.04674970141614059, + "grad_norm": 1.9065217621744515, + "learning_rate": 9.992669648145215e-05, + "loss": 0.4771, + "step": 548 + }, + { + "epoch": 0.04683501109025764, + "grad_norm": 1.7387867396166499, + "learning_rate": 9.992594676337797e-05, + "loss": 0.4246, + "step": 549 + }, + { + "epoch": 0.04692032076437468, + "grad_norm": 1.6872756243698814, + "learning_rate": 9.992519323372307e-05, + "loss": 0.4017, + "step": 550 + }, + { + "epoch": 0.04700563043849172, + "grad_norm": 1.6476389320154368, + "learning_rate": 9.992443589254496e-05, + "loss": 0.4187, + "step": 551 + }, + { + "epoch": 0.04709094011260877, + "grad_norm": 1.6445618397819353, + "learning_rate": 9.99236747399015e-05, + "loss": 0.4144, + "step": 552 + }, + { + "epoch": 0.04717624978672581, + "grad_norm": 1.980788887829494, + "learning_rate": 9.992290977585072e-05, + "loss": 0.4644, + "step": 553 + }, + { + "epoch": 0.04726155946084286, + "grad_norm": 1.6482974892406739, + "learning_rate": 9.99221410004511e-05, + "loss": 0.4687, + "step": 554 + }, + { + "epoch": 0.047346869134959904, + "grad_norm": 1.7027941741209258, + "learning_rate": 9.99213684137613e-05, + "loss": 0.4141, + "step": 555 + }, + { + "epoch": 0.04743217880907695, + "grad_norm": 1.7904192847940812, + "learning_rate": 9.99205920158403e-05, + "loss": 0.4302, + "step": 556 + }, + { + "epoch": 0.047517488483193995, + "grad_norm": 1.6913486673539364, + "learning_rate": 9.991981180674737e-05, + "loss": 0.4185, + "step": 557 + }, + { + "epoch": 0.04760279815731104, + "grad_norm": 1.737653503223708, + "learning_rate": 9.991902778654207e-05, + "loss": 0.3884, + "step": 558 + }, + { + "epoch": 0.047688107831428087, + "grad_norm": 1.6587591059189746, + "learning_rate": 9.991823995528428e-05, + "loss": 0.3838, + "step": 559 + }, + { + "epoch": 0.04777341750554513, + "grad_norm": 1.9741398133233161, + "learning_rate": 9.991744831303416e-05, + "loss": 0.4409, + "step": 560 + }, + { + "epoch": 0.04785872717966217, + "grad_norm": 1.5490685525283021, + "learning_rate": 9.991665285985209e-05, + "loss": 0.3878, + "step": 561 + }, + { + "epoch": 0.047944036853779216, + "grad_norm": 1.6516440783719395, + "learning_rate": 9.991585359579884e-05, + "loss": 0.3828, + "step": 562 + }, + { + "epoch": 0.04802934652789626, + "grad_norm": 2.068359605538821, + "learning_rate": 9.991505052093541e-05, + "loss": 0.4306, + "step": 563 + }, + { + "epoch": 0.04811465620201331, + "grad_norm": 1.7979381567362094, + "learning_rate": 9.991424363532314e-05, + "loss": 0.4099, + "step": 564 + }, + { + "epoch": 0.04819996587613035, + "grad_norm": 1.7529154062353483, + "learning_rate": 9.991343293902361e-05, + "loss": 0.4068, + "step": 565 + }, + { + "epoch": 0.0482852755502474, + "grad_norm": 2.0502111605307616, + "learning_rate": 9.991261843209872e-05, + "loss": 0.4285, + "step": 566 + }, + { + "epoch": 0.048370585224364444, + "grad_norm": 1.8357865941139286, + "learning_rate": 9.991180011461063e-05, + "loss": 0.41, + "step": 567 + }, + { + "epoch": 0.04845589489848149, + "grad_norm": 1.4585383498178595, + "learning_rate": 9.991097798662183e-05, + "loss": 0.3725, + "step": 568 + }, + { + "epoch": 0.048541204572598536, + "grad_norm": 1.9377198863617282, + "learning_rate": 9.99101520481951e-05, + "loss": 0.4036, + "step": 569 + }, + { + "epoch": 0.048626514246715574, + "grad_norm": 1.9609964673420446, + "learning_rate": 9.99093222993935e-05, + "loss": 0.4594, + "step": 570 + }, + { + "epoch": 0.04871182392083262, + "grad_norm": 1.92798338482848, + "learning_rate": 9.990848874028032e-05, + "loss": 0.4216, + "step": 571 + }, + { + "epoch": 0.048797133594949665, + "grad_norm": 1.9774213892458463, + "learning_rate": 9.990765137091927e-05, + "loss": 0.4127, + "step": 572 + }, + { + "epoch": 0.04888244326906671, + "grad_norm": 1.7822954556461847, + "learning_rate": 9.990681019137424e-05, + "loss": 0.4474, + "step": 573 + }, + { + "epoch": 0.04896775294318376, + "grad_norm": 1.6003575700321564, + "learning_rate": 9.990596520170945e-05, + "loss": 0.4175, + "step": 574 + }, + { + "epoch": 0.0490530626173008, + "grad_norm": 1.7745673659783288, + "learning_rate": 9.99051164019894e-05, + "loss": 0.4284, + "step": 575 + }, + { + "epoch": 0.04913837229141785, + "grad_norm": 1.9852321905978498, + "learning_rate": 9.990426379227894e-05, + "loss": 0.4105, + "step": 576 + }, + { + "epoch": 0.04922368196553489, + "grad_norm": 1.8535749870096034, + "learning_rate": 9.990340737264311e-05, + "loss": 0.4431, + "step": 577 + }, + { + "epoch": 0.04930899163965194, + "grad_norm": 1.6749429095203776, + "learning_rate": 9.990254714314732e-05, + "loss": 0.4179, + "step": 578 + }, + { + "epoch": 0.049394301313768985, + "grad_norm": 2.265222447905475, + "learning_rate": 9.990168310385726e-05, + "loss": 0.4605, + "step": 579 + }, + { + "epoch": 0.04947961098788602, + "grad_norm": 1.8783480951270148, + "learning_rate": 9.990081525483885e-05, + "loss": 0.4486, + "step": 580 + }, + { + "epoch": 0.04956492066200307, + "grad_norm": 2.0766327742029986, + "learning_rate": 9.989994359615836e-05, + "loss": 0.4747, + "step": 581 + }, + { + "epoch": 0.049650230336120114, + "grad_norm": 1.7237823335023126, + "learning_rate": 9.989906812788235e-05, + "loss": 0.478, + "step": 582 + }, + { + "epoch": 0.04973554001023716, + "grad_norm": 1.5445648822586222, + "learning_rate": 9.989818885007766e-05, + "loss": 0.453, + "step": 583 + }, + { + "epoch": 0.049820849684354206, + "grad_norm": 1.9257722742208105, + "learning_rate": 9.98973057628114e-05, + "loss": 0.4403, + "step": 584 + }, + { + "epoch": 0.04990615935847125, + "grad_norm": 1.8812706676881346, + "learning_rate": 9.989641886615101e-05, + "loss": 0.4751, + "step": 585 + }, + { + "epoch": 0.0499914690325883, + "grad_norm": 1.6575149887580998, + "learning_rate": 9.989552816016418e-05, + "loss": 0.4307, + "step": 586 + }, + { + "epoch": 0.05007677870670534, + "grad_norm": 1.6603679925859307, + "learning_rate": 9.989463364491893e-05, + "loss": 0.3732, + "step": 587 + }, + { + "epoch": 0.05016208838082239, + "grad_norm": 1.5386976400185586, + "learning_rate": 9.989373532048353e-05, + "loss": 0.3842, + "step": 588 + }, + { + "epoch": 0.05024739805493943, + "grad_norm": 1.7476727133168597, + "learning_rate": 9.989283318692657e-05, + "loss": 0.459, + "step": 589 + }, + { + "epoch": 0.05033270772905647, + "grad_norm": 1.4539096313634023, + "learning_rate": 9.989192724431694e-05, + "loss": 0.3551, + "step": 590 + }, + { + "epoch": 0.05041801740317352, + "grad_norm": 1.8311567679878624, + "learning_rate": 9.989101749272378e-05, + "loss": 0.4704, + "step": 591 + }, + { + "epoch": 0.050503327077290563, + "grad_norm": 1.674485921969712, + "learning_rate": 9.989010393221656e-05, + "loss": 0.4231, + "step": 592 + }, + { + "epoch": 0.05058863675140761, + "grad_norm": 1.5810920847569117, + "learning_rate": 9.988918656286503e-05, + "loss": 0.4378, + "step": 593 + }, + { + "epoch": 0.050673946425524655, + "grad_norm": 1.788070203627527, + "learning_rate": 9.98882653847392e-05, + "loss": 0.4168, + "step": 594 + }, + { + "epoch": 0.0507592560996417, + "grad_norm": 1.3916835399373817, + "learning_rate": 9.988734039790942e-05, + "loss": 0.416, + "step": 595 + }, + { + "epoch": 0.050844565773758746, + "grad_norm": 1.6962849161803455, + "learning_rate": 9.98864116024463e-05, + "loss": 0.4042, + "step": 596 + }, + { + "epoch": 0.05092987544787579, + "grad_norm": 1.7238480301189036, + "learning_rate": 9.988547899842076e-05, + "loss": 0.426, + "step": 597 + }, + { + "epoch": 0.05101518512199284, + "grad_norm": 1.7432537835803728, + "learning_rate": 9.988454258590398e-05, + "loss": 0.4885, + "step": 598 + }, + { + "epoch": 0.051100494796109876, + "grad_norm": 1.580743863971221, + "learning_rate": 9.988360236496745e-05, + "loss": 0.4259, + "step": 599 + }, + { + "epoch": 0.05118580447022692, + "grad_norm": 2.032638896592715, + "learning_rate": 9.988265833568298e-05, + "loss": 0.4687, + "step": 600 + }, + { + "epoch": 0.05127111414434397, + "grad_norm": 1.7424026151176844, + "learning_rate": 9.98817104981226e-05, + "loss": 0.3844, + "step": 601 + }, + { + "epoch": 0.05135642381846101, + "grad_norm": 1.6243691320192233, + "learning_rate": 9.988075885235873e-05, + "loss": 0.3535, + "step": 602 + }, + { + "epoch": 0.05144173349257806, + "grad_norm": 1.9235100962545184, + "learning_rate": 9.987980339846395e-05, + "loss": 0.452, + "step": 603 + }, + { + "epoch": 0.051527043166695104, + "grad_norm": 1.9567644314275794, + "learning_rate": 9.987884413651127e-05, + "loss": 0.4421, + "step": 604 + }, + { + "epoch": 0.05161235284081215, + "grad_norm": 1.8302624588245187, + "learning_rate": 9.987788106657387e-05, + "loss": 0.4096, + "step": 605 + }, + { + "epoch": 0.051697662514929195, + "grad_norm": 2.018242253086773, + "learning_rate": 9.987691418872532e-05, + "loss": 0.4417, + "step": 606 + }, + { + "epoch": 0.05178297218904624, + "grad_norm": 1.8376379700847743, + "learning_rate": 9.987594350303941e-05, + "loss": 0.43, + "step": 607 + }, + { + "epoch": 0.051868281863163286, + "grad_norm": 1.9614028682419764, + "learning_rate": 9.987496900959026e-05, + "loss": 0.4157, + "step": 608 + }, + { + "epoch": 0.051953591537280325, + "grad_norm": 1.8620728226873853, + "learning_rate": 9.987399070845226e-05, + "loss": 0.4577, + "step": 609 + }, + { + "epoch": 0.05203890121139737, + "grad_norm": 1.423864879578551, + "learning_rate": 9.98730085997001e-05, + "loss": 0.4333, + "step": 610 + }, + { + "epoch": 0.052124210885514416, + "grad_norm": 1.727476404583737, + "learning_rate": 9.987202268340876e-05, + "loss": 0.4493, + "step": 611 + }, + { + "epoch": 0.05220952055963146, + "grad_norm": 1.7847592770118559, + "learning_rate": 9.98710329596535e-05, + "loss": 0.4631, + "step": 612 + }, + { + "epoch": 0.05229483023374851, + "grad_norm": 1.8576834541592206, + "learning_rate": 9.987003942850989e-05, + "loss": 0.461, + "step": 613 + }, + { + "epoch": 0.05238013990786555, + "grad_norm": 1.8219587879314543, + "learning_rate": 9.986904209005378e-05, + "loss": 0.433, + "step": 614 + }, + { + "epoch": 0.0524654495819826, + "grad_norm": 1.9570009149267584, + "learning_rate": 9.98680409443613e-05, + "loss": 0.3849, + "step": 615 + }, + { + "epoch": 0.052550759256099644, + "grad_norm": 1.8562457254790852, + "learning_rate": 9.986703599150891e-05, + "loss": 0.4703, + "step": 616 + }, + { + "epoch": 0.05263606893021669, + "grad_norm": 1.426696650703955, + "learning_rate": 9.986602723157332e-05, + "loss": 0.4023, + "step": 617 + }, + { + "epoch": 0.05272137860433373, + "grad_norm": 1.8172716929775283, + "learning_rate": 9.986501466463152e-05, + "loss": 0.37, + "step": 618 + }, + { + "epoch": 0.052806688278450774, + "grad_norm": 2.1353723164736773, + "learning_rate": 9.986399829076084e-05, + "loss": 0.4564, + "step": 619 + }, + { + "epoch": 0.05289199795256782, + "grad_norm": 1.5738562836332872, + "learning_rate": 9.986297811003886e-05, + "loss": 0.3926, + "step": 620 + }, + { + "epoch": 0.052977307626684865, + "grad_norm": 1.9655220574864012, + "learning_rate": 9.986195412254349e-05, + "loss": 0.4247, + "step": 621 + }, + { + "epoch": 0.05306261730080191, + "grad_norm": 1.7257172599868804, + "learning_rate": 9.986092632835286e-05, + "loss": 0.4088, + "step": 622 + }, + { + "epoch": 0.053147926974918956, + "grad_norm": 1.9636922385634132, + "learning_rate": 9.985989472754549e-05, + "loss": 0.438, + "step": 623 + }, + { + "epoch": 0.053233236649036, + "grad_norm": 1.5309201099274732, + "learning_rate": 9.985885932020011e-05, + "loss": 0.4287, + "step": 624 + }, + { + "epoch": 0.05331854632315305, + "grad_norm": 1.622793900919288, + "learning_rate": 9.985782010639577e-05, + "loss": 0.4248, + "step": 625 + }, + { + "epoch": 0.05340385599727009, + "grad_norm": 1.4829005899582366, + "learning_rate": 9.98567770862118e-05, + "loss": 0.404, + "step": 626 + }, + { + "epoch": 0.05348916567138714, + "grad_norm": 1.6558352976868793, + "learning_rate": 9.985573025972785e-05, + "loss": 0.426, + "step": 627 + }, + { + "epoch": 0.05357447534550418, + "grad_norm": 1.880196491179506, + "learning_rate": 9.985467962702382e-05, + "loss": 0.4172, + "step": 628 + }, + { + "epoch": 0.05365978501962122, + "grad_norm": 1.4922713011532882, + "learning_rate": 9.985362518817993e-05, + "loss": 0.3653, + "step": 629 + }, + { + "epoch": 0.05374509469373827, + "grad_norm": 1.7043214847345076, + "learning_rate": 9.985256694327669e-05, + "loss": 0.3725, + "step": 630 + }, + { + "epoch": 0.053830404367855314, + "grad_norm": 1.8786667256735436, + "learning_rate": 9.985150489239486e-05, + "loss": 0.4031, + "step": 631 + }, + { + "epoch": 0.05391571404197236, + "grad_norm": 2.037920986751495, + "learning_rate": 9.985043903561555e-05, + "loss": 0.4325, + "step": 632 + }, + { + "epoch": 0.054001023716089405, + "grad_norm": 1.6646395117200277, + "learning_rate": 9.984936937302013e-05, + "loss": 0.4986, + "step": 633 + }, + { + "epoch": 0.05408633339020645, + "grad_norm": 1.9408354812463864, + "learning_rate": 9.984829590469025e-05, + "loss": 0.4124, + "step": 634 + }, + { + "epoch": 0.054171643064323496, + "grad_norm": 1.9568076853107614, + "learning_rate": 9.984721863070788e-05, + "loss": 0.4499, + "step": 635 + }, + { + "epoch": 0.05425695273844054, + "grad_norm": 1.8127518840585877, + "learning_rate": 9.984613755115525e-05, + "loss": 0.3865, + "step": 636 + }, + { + "epoch": 0.05434226241255758, + "grad_norm": 1.9867568252041108, + "learning_rate": 9.984505266611491e-05, + "loss": 0.4375, + "step": 637 + }, + { + "epoch": 0.054427572086674626, + "grad_norm": 1.4679024056571948, + "learning_rate": 9.984396397566965e-05, + "loss": 0.4237, + "step": 638 + }, + { + "epoch": 0.05451288176079167, + "grad_norm": 1.686093839479765, + "learning_rate": 9.984287147990263e-05, + "loss": 0.3952, + "step": 639 + }, + { + "epoch": 0.05459819143490872, + "grad_norm": 1.565691625582272, + "learning_rate": 9.984177517889724e-05, + "loss": 0.406, + "step": 640 + }, + { + "epoch": 0.05468350110902576, + "grad_norm": 1.7322620890147777, + "learning_rate": 9.984067507273715e-05, + "loss": 0.4011, + "step": 641 + }, + { + "epoch": 0.05476881078314281, + "grad_norm": 1.5636645745500795, + "learning_rate": 9.98395711615064e-05, + "loss": 0.3824, + "step": 642 + }, + { + "epoch": 0.054854120457259854, + "grad_norm": 1.8386579770336298, + "learning_rate": 9.983846344528923e-05, + "loss": 0.4574, + "step": 643 + }, + { + "epoch": 0.0549394301313769, + "grad_norm": 1.9048548110942412, + "learning_rate": 9.983735192417021e-05, + "loss": 0.4624, + "step": 644 + }, + { + "epoch": 0.055024739805493945, + "grad_norm": 2.044961631993242, + "learning_rate": 9.983623659823422e-05, + "loss": 0.4143, + "step": 645 + }, + { + "epoch": 0.05511004947961099, + "grad_norm": 1.8062537856540806, + "learning_rate": 9.983511746756638e-05, + "loss": 0.437, + "step": 646 + }, + { + "epoch": 0.05519535915372803, + "grad_norm": 1.5146183721729956, + "learning_rate": 9.983399453225216e-05, + "loss": 0.4035, + "step": 647 + }, + { + "epoch": 0.055280668827845075, + "grad_norm": 1.9668894862443678, + "learning_rate": 9.983286779237727e-05, + "loss": 0.4834, + "step": 648 + }, + { + "epoch": 0.05536597850196212, + "grad_norm": 1.861829596071821, + "learning_rate": 9.983173724802772e-05, + "loss": 0.4225, + "step": 649 + }, + { + "epoch": 0.055451288176079166, + "grad_norm": 1.8725588483947024, + "learning_rate": 9.983060289928984e-05, + "loss": 0.4628, + "step": 650 + }, + { + "epoch": 0.05553659785019621, + "grad_norm": 1.754375437487421, + "learning_rate": 9.982946474625024e-05, + "loss": 0.4096, + "step": 651 + }, + { + "epoch": 0.05562190752431326, + "grad_norm": 1.7151826771238572, + "learning_rate": 9.982832278899582e-05, + "loss": 0.4226, + "step": 652 + }, + { + "epoch": 0.0557072171984303, + "grad_norm": 1.7291879154722833, + "learning_rate": 9.982717702761371e-05, + "loss": 0.4568, + "step": 653 + }, + { + "epoch": 0.05579252687254735, + "grad_norm": 1.5367950156776902, + "learning_rate": 9.982602746219142e-05, + "loss": 0.4073, + "step": 654 + }, + { + "epoch": 0.055877836546664394, + "grad_norm": 1.7327844698318313, + "learning_rate": 9.982487409281671e-05, + "loss": 0.4243, + "step": 655 + }, + { + "epoch": 0.05596314622078144, + "grad_norm": 1.60194943103676, + "learning_rate": 9.982371691957764e-05, + "loss": 0.4268, + "step": 656 + }, + { + "epoch": 0.05604845589489848, + "grad_norm": 1.4197391775517751, + "learning_rate": 9.982255594256253e-05, + "loss": 0.4085, + "step": 657 + }, + { + "epoch": 0.056133765569015524, + "grad_norm": 1.8696801055807888, + "learning_rate": 9.982139116186004e-05, + "loss": 0.4509, + "step": 658 + }, + { + "epoch": 0.05621907524313257, + "grad_norm": 1.760451171657228, + "learning_rate": 9.98202225775591e-05, + "loss": 0.3803, + "step": 659 + }, + { + "epoch": 0.056304384917249616, + "grad_norm": 1.7266064806553745, + "learning_rate": 9.981905018974888e-05, + "loss": 0.4099, + "step": 660 + }, + { + "epoch": 0.05638969459136666, + "grad_norm": 2.029331063798562, + "learning_rate": 9.981787399851894e-05, + "loss": 0.5058, + "step": 661 + }, + { + "epoch": 0.05647500426548371, + "grad_norm": 1.9639903465740756, + "learning_rate": 9.981669400395906e-05, + "loss": 0.4188, + "step": 662 + }, + { + "epoch": 0.05656031393960075, + "grad_norm": 1.5458439945699876, + "learning_rate": 9.98155102061593e-05, + "loss": 0.4375, + "step": 663 + }, + { + "epoch": 0.0566456236137178, + "grad_norm": 1.6156104317751672, + "learning_rate": 9.981432260521006e-05, + "loss": 0.4734, + "step": 664 + }, + { + "epoch": 0.056730933287834844, + "grad_norm": 1.9446740998780048, + "learning_rate": 9.981313120120199e-05, + "loss": 0.376, + "step": 665 + }, + { + "epoch": 0.05681624296195188, + "grad_norm": 1.580613544221983, + "learning_rate": 9.981193599422608e-05, + "loss": 0.4064, + "step": 666 + }, + { + "epoch": 0.05690155263606893, + "grad_norm": 1.9939371840283002, + "learning_rate": 9.981073698437355e-05, + "loss": 0.4597, + "step": 667 + }, + { + "epoch": 0.05698686231018597, + "grad_norm": 1.627673077968262, + "learning_rate": 9.980953417173594e-05, + "loss": 0.392, + "step": 668 + }, + { + "epoch": 0.05707217198430302, + "grad_norm": 1.811943560721299, + "learning_rate": 9.980832755640509e-05, + "loss": 0.421, + "step": 669 + }, + { + "epoch": 0.057157481658420065, + "grad_norm": 1.508057016104302, + "learning_rate": 9.98071171384731e-05, + "loss": 0.3473, + "step": 670 + }, + { + "epoch": 0.05724279133253711, + "grad_norm": 1.706799663835291, + "learning_rate": 9.980590291803241e-05, + "loss": 0.4089, + "step": 671 + }, + { + "epoch": 0.057328101006654156, + "grad_norm": 1.9110688757788352, + "learning_rate": 9.98046848951757e-05, + "loss": 0.4346, + "step": 672 + }, + { + "epoch": 0.0574134106807712, + "grad_norm": 1.5839432829807862, + "learning_rate": 9.980346306999596e-05, + "loss": 0.417, + "step": 673 + }, + { + "epoch": 0.05749872035488825, + "grad_norm": 2.150819103435792, + "learning_rate": 9.980223744258644e-05, + "loss": 0.4435, + "step": 674 + }, + { + "epoch": 0.05758403002900529, + "grad_norm": 1.8969270953545716, + "learning_rate": 9.980100801304077e-05, + "loss": 0.4885, + "step": 675 + }, + { + "epoch": 0.05766933970312233, + "grad_norm": 1.4218275460509366, + "learning_rate": 9.979977478145276e-05, + "loss": 0.3559, + "step": 676 + }, + { + "epoch": 0.05775464937723938, + "grad_norm": 1.771348033230026, + "learning_rate": 9.97985377479166e-05, + "loss": 0.4363, + "step": 677 + }, + { + "epoch": 0.05783995905135642, + "grad_norm": 1.5391672941054115, + "learning_rate": 9.97972969125267e-05, + "loss": 0.3595, + "step": 678 + }, + { + "epoch": 0.05792526872547347, + "grad_norm": 2.0226092318731266, + "learning_rate": 9.979605227537781e-05, + "loss": 0.4473, + "step": 679 + }, + { + "epoch": 0.058010578399590514, + "grad_norm": 1.5755657935755187, + "learning_rate": 9.979480383656494e-05, + "loss": 0.3948, + "step": 680 + }, + { + "epoch": 0.05809588807370756, + "grad_norm": 1.7941827258737757, + "learning_rate": 9.979355159618343e-05, + "loss": 0.4114, + "step": 681 + }, + { + "epoch": 0.058181197747824605, + "grad_norm": 1.983599293397938, + "learning_rate": 9.979229555432882e-05, + "loss": 0.3877, + "step": 682 + }, + { + "epoch": 0.05826650742194165, + "grad_norm": 1.8565561421004084, + "learning_rate": 9.979103571109706e-05, + "loss": 0.3932, + "step": 683 + }, + { + "epoch": 0.058351817096058696, + "grad_norm": 1.5969606858050187, + "learning_rate": 9.97897720665843e-05, + "loss": 0.4063, + "step": 684 + }, + { + "epoch": 0.058437126770175735, + "grad_norm": 1.5477181650916576, + "learning_rate": 9.978850462088704e-05, + "loss": 0.3773, + "step": 685 + }, + { + "epoch": 0.05852243644429278, + "grad_norm": 1.9571922982169399, + "learning_rate": 9.978723337410202e-05, + "loss": 0.4324, + "step": 686 + }, + { + "epoch": 0.058607746118409826, + "grad_norm": 1.4798391810239806, + "learning_rate": 9.978595832632632e-05, + "loss": 0.402, + "step": 687 + }, + { + "epoch": 0.05869305579252687, + "grad_norm": 1.7406158200472204, + "learning_rate": 9.978467947765724e-05, + "loss": 0.4159, + "step": 688 + }, + { + "epoch": 0.05877836546664392, + "grad_norm": 1.7907633463356754, + "learning_rate": 9.978339682819246e-05, + "loss": 0.3977, + "step": 689 + }, + { + "epoch": 0.05886367514076096, + "grad_norm": 1.9310916625856633, + "learning_rate": 9.978211037802986e-05, + "loss": 0.5006, + "step": 690 + }, + { + "epoch": 0.05894898481487801, + "grad_norm": 1.869786197863332, + "learning_rate": 9.978082012726768e-05, + "loss": 0.4278, + "step": 691 + }, + { + "epoch": 0.059034294488995054, + "grad_norm": 1.4471769936381407, + "learning_rate": 9.977952607600442e-05, + "loss": 0.4572, + "step": 692 + }, + { + "epoch": 0.0591196041631121, + "grad_norm": 1.767667820799123, + "learning_rate": 9.977822822433886e-05, + "loss": 0.407, + "step": 693 + }, + { + "epoch": 0.059204913837229145, + "grad_norm": 1.658400237428728, + "learning_rate": 9.977692657237013e-05, + "loss": 0.4301, + "step": 694 + }, + { + "epoch": 0.059290223511346184, + "grad_norm": 1.9816144830598752, + "learning_rate": 9.977562112019754e-05, + "loss": 0.4262, + "step": 695 + }, + { + "epoch": 0.05937553318546323, + "grad_norm": 1.6016108807955403, + "learning_rate": 9.97743118679208e-05, + "loss": 0.4183, + "step": 696 + }, + { + "epoch": 0.059460842859580275, + "grad_norm": 1.8153864140762759, + "learning_rate": 9.977299881563984e-05, + "loss": 0.3544, + "step": 697 + }, + { + "epoch": 0.05954615253369732, + "grad_norm": 1.5616597777506123, + "learning_rate": 9.977168196345492e-05, + "loss": 0.4283, + "step": 698 + }, + { + "epoch": 0.059631462207814366, + "grad_norm": 1.7108400918065563, + "learning_rate": 9.977036131146656e-05, + "loss": 0.3845, + "step": 699 + }, + { + "epoch": 0.05971677188193141, + "grad_norm": 1.740349728307172, + "learning_rate": 9.97690368597756e-05, + "loss": 0.3917, + "step": 700 + }, + { + "epoch": 0.05980208155604846, + "grad_norm": 1.6789937560582797, + "learning_rate": 9.976770860848315e-05, + "loss": 0.4141, + "step": 701 + }, + { + "epoch": 0.0598873912301655, + "grad_norm": 1.9257531463862783, + "learning_rate": 9.976637655769061e-05, + "loss": 0.5062, + "step": 702 + }, + { + "epoch": 0.05997270090428255, + "grad_norm": 1.4936235318455626, + "learning_rate": 9.976504070749969e-05, + "loss": 0.4378, + "step": 703 + }, + { + "epoch": 0.06005801057839959, + "grad_norm": 1.741272827349332, + "learning_rate": 9.976370105801234e-05, + "loss": 0.4518, + "step": 704 + }, + { + "epoch": 0.06014332025251663, + "grad_norm": 1.8145108085472286, + "learning_rate": 9.976235760933086e-05, + "loss": 0.4071, + "step": 705 + }, + { + "epoch": 0.06022862992663368, + "grad_norm": 1.658808549366544, + "learning_rate": 9.976101036155783e-05, + "loss": 0.4207, + "step": 706 + }, + { + "epoch": 0.060313939600750724, + "grad_norm": 1.653848238194744, + "learning_rate": 9.975965931479607e-05, + "loss": 0.3834, + "step": 707 + }, + { + "epoch": 0.06039924927486777, + "grad_norm": 1.796642056168611, + "learning_rate": 9.975830446914876e-05, + "loss": 0.407, + "step": 708 + }, + { + "epoch": 0.060484558948984815, + "grad_norm": 1.8764388603498008, + "learning_rate": 9.975694582471932e-05, + "loss": 0.3788, + "step": 709 + }, + { + "epoch": 0.06056986862310186, + "grad_norm": 1.713628594726069, + "learning_rate": 9.975558338161146e-05, + "loss": 0.4424, + "step": 710 + }, + { + "epoch": 0.060655178297218906, + "grad_norm": 1.5238020240721182, + "learning_rate": 9.975421713992923e-05, + "loss": 0.3971, + "step": 711 + }, + { + "epoch": 0.06074048797133595, + "grad_norm": 1.4240721609873204, + "learning_rate": 9.97528470997769e-05, + "loss": 0.3728, + "step": 712 + }, + { + "epoch": 0.060825797645453, + "grad_norm": 1.671246990009984, + "learning_rate": 9.975147326125908e-05, + "loss": 0.3764, + "step": 713 + }, + { + "epoch": 0.060911107319570036, + "grad_norm": 1.6296077836784801, + "learning_rate": 9.975009562448066e-05, + "loss": 0.4083, + "step": 714 + }, + { + "epoch": 0.06099641699368708, + "grad_norm": 1.7082977136139423, + "learning_rate": 9.974871418954681e-05, + "loss": 0.3684, + "step": 715 + }, + { + "epoch": 0.06108172666780413, + "grad_norm": 1.9369928249921435, + "learning_rate": 9.9747328956563e-05, + "loss": 0.4308, + "step": 716 + }, + { + "epoch": 0.06116703634192117, + "grad_norm": 2.2116112642839973, + "learning_rate": 9.974593992563498e-05, + "loss": 0.4336, + "step": 717 + }, + { + "epoch": 0.06125234601603822, + "grad_norm": 1.9965355789105876, + "learning_rate": 9.974454709686878e-05, + "loss": 0.4791, + "step": 718 + }, + { + "epoch": 0.061337655690155264, + "grad_norm": 1.8203665270994214, + "learning_rate": 9.974315047037077e-05, + "loss": 0.4443, + "step": 719 + }, + { + "epoch": 0.06142296536427231, + "grad_norm": 1.7504232439757355, + "learning_rate": 9.974175004624756e-05, + "loss": 0.4516, + "step": 720 + }, + { + "epoch": 0.061508275038389355, + "grad_norm": 1.9204218262978112, + "learning_rate": 9.974034582460606e-05, + "loss": 0.3909, + "step": 721 + }, + { + "epoch": 0.0615935847125064, + "grad_norm": 1.8106615674214386, + "learning_rate": 9.973893780555346e-05, + "loss": 0.4174, + "step": 722 + }, + { + "epoch": 0.06167889438662345, + "grad_norm": 1.6481255692678836, + "learning_rate": 9.973752598919728e-05, + "loss": 0.4045, + "step": 723 + }, + { + "epoch": 0.061764204060740485, + "grad_norm": 1.8095463989456402, + "learning_rate": 9.973611037564529e-05, + "loss": 0.4089, + "step": 724 + }, + { + "epoch": 0.06184951373485753, + "grad_norm": 1.4843783729916447, + "learning_rate": 9.973469096500558e-05, + "loss": 0.4075, + "step": 725 + }, + { + "epoch": 0.061934823408974576, + "grad_norm": 1.483033788989333, + "learning_rate": 9.97332677573865e-05, + "loss": 0.3781, + "step": 726 + }, + { + "epoch": 0.06202013308309162, + "grad_norm": 1.6702006860175476, + "learning_rate": 9.973184075289672e-05, + "loss": 0.38, + "step": 727 + }, + { + "epoch": 0.06210544275720867, + "grad_norm": 1.8546676673246565, + "learning_rate": 9.973040995164515e-05, + "loss": 0.4095, + "step": 728 + }, + { + "epoch": 0.06219075243132571, + "grad_norm": 2.0802081000325225, + "learning_rate": 9.972897535374106e-05, + "loss": 0.4388, + "step": 729 + }, + { + "epoch": 0.06227606210544276, + "grad_norm": 2.0643990761314117, + "learning_rate": 9.972753695929397e-05, + "loss": 0.4426, + "step": 730 + }, + { + "epoch": 0.062361371779559804, + "grad_norm": 1.5342278621693195, + "learning_rate": 9.972609476841367e-05, + "loss": 0.4353, + "step": 731 + }, + { + "epoch": 0.06244668145367685, + "grad_norm": 1.5758553982539307, + "learning_rate": 9.972464878121028e-05, + "loss": 0.3547, + "step": 732 + }, + { + "epoch": 0.0625319911277939, + "grad_norm": 1.3762799721069183, + "learning_rate": 9.972319899779422e-05, + "loss": 0.3908, + "step": 733 + }, + { + "epoch": 0.06261730080191094, + "grad_norm": 1.6035432801550966, + "learning_rate": 9.97217454182761e-05, + "loss": 0.3907, + "step": 734 + }, + { + "epoch": 0.06270261047602799, + "grad_norm": 1.7161712721780478, + "learning_rate": 9.972028804276697e-05, + "loss": 0.4628, + "step": 735 + }, + { + "epoch": 0.06278792015014503, + "grad_norm": 1.7743157048276244, + "learning_rate": 9.971882687137805e-05, + "loss": 0.4329, + "step": 736 + }, + { + "epoch": 0.06287322982426208, + "grad_norm": 1.6373236000618754, + "learning_rate": 9.97173619042209e-05, + "loss": 0.4325, + "step": 737 + }, + { + "epoch": 0.06295853949837911, + "grad_norm": 1.8875907320358087, + "learning_rate": 9.971589314140738e-05, + "loss": 0.3883, + "step": 738 + }, + { + "epoch": 0.06304384917249616, + "grad_norm": 1.9402580319692888, + "learning_rate": 9.97144205830496e-05, + "loss": 0.4356, + "step": 739 + }, + { + "epoch": 0.0631291588466132, + "grad_norm": 1.829180551300995, + "learning_rate": 9.971294422925999e-05, + "loss": 0.4227, + "step": 740 + }, + { + "epoch": 0.06321446852073025, + "grad_norm": 1.5307310353782888, + "learning_rate": 9.971146408015126e-05, + "loss": 0.3852, + "step": 741 + }, + { + "epoch": 0.06329977819484729, + "grad_norm": 1.7781374992910104, + "learning_rate": 9.970998013583643e-05, + "loss": 0.4087, + "step": 742 + }, + { + "epoch": 0.06338508786896434, + "grad_norm": 1.6321747186763973, + "learning_rate": 9.970849239642875e-05, + "loss": 0.369, + "step": 743 + }, + { + "epoch": 0.06347039754308138, + "grad_norm": 1.7753962652034996, + "learning_rate": 9.970700086204184e-05, + "loss": 0.4346, + "step": 744 + }, + { + "epoch": 0.06355570721719843, + "grad_norm": 1.5127150138424117, + "learning_rate": 9.970550553278956e-05, + "loss": 0.4206, + "step": 745 + }, + { + "epoch": 0.06364101689131547, + "grad_norm": 1.7462342480964155, + "learning_rate": 9.970400640878605e-05, + "loss": 0.4002, + "step": 746 + }, + { + "epoch": 0.06372632656543252, + "grad_norm": 1.9813579541948918, + "learning_rate": 9.97025034901458e-05, + "loss": 0.3988, + "step": 747 + }, + { + "epoch": 0.06381163623954957, + "grad_norm": 1.7700899096999652, + "learning_rate": 9.970099677698353e-05, + "loss": 0.3815, + "step": 748 + }, + { + "epoch": 0.06389694591366661, + "grad_norm": 1.828453099730764, + "learning_rate": 9.969948626941426e-05, + "loss": 0.4163, + "step": 749 + }, + { + "epoch": 0.06398225558778366, + "grad_norm": 1.7354645614456705, + "learning_rate": 9.969797196755331e-05, + "loss": 0.4387, + "step": 750 + }, + { + "epoch": 0.0640675652619007, + "grad_norm": 1.9481513210130315, + "learning_rate": 9.969645387151629e-05, + "loss": 0.431, + "step": 751 + }, + { + "epoch": 0.06415287493601775, + "grad_norm": 1.972405566915169, + "learning_rate": 9.96949319814191e-05, + "loss": 0.3986, + "step": 752 + }, + { + "epoch": 0.0642381846101348, + "grad_norm": 1.5148859483449306, + "learning_rate": 9.969340629737794e-05, + "loss": 0.4384, + "step": 753 + }, + { + "epoch": 0.06432349428425184, + "grad_norm": 1.7965166480941033, + "learning_rate": 9.969187681950928e-05, + "loss": 0.4262, + "step": 754 + }, + { + "epoch": 0.06440880395836888, + "grad_norm": 1.7650395325726247, + "learning_rate": 9.96903435479299e-05, + "loss": 0.4456, + "step": 755 + }, + { + "epoch": 0.06449411363248593, + "grad_norm": 1.672672901311217, + "learning_rate": 9.968880648275682e-05, + "loss": 0.3911, + "step": 756 + }, + { + "epoch": 0.06457942330660296, + "grad_norm": 2.093351525492899, + "learning_rate": 9.968726562410744e-05, + "loss": 0.4478, + "step": 757 + }, + { + "epoch": 0.06466473298072001, + "grad_norm": 1.5839073077737664, + "learning_rate": 9.968572097209934e-05, + "loss": 0.4166, + "step": 758 + }, + { + "epoch": 0.06475004265483705, + "grad_norm": 2.127672198237853, + "learning_rate": 9.968417252685049e-05, + "loss": 0.4647, + "step": 759 + }, + { + "epoch": 0.0648353523289541, + "grad_norm": 1.8028922791929458, + "learning_rate": 9.968262028847908e-05, + "loss": 0.4237, + "step": 760 + }, + { + "epoch": 0.06492066200307114, + "grad_norm": 1.547818599313833, + "learning_rate": 9.968106425710364e-05, + "loss": 0.3278, + "step": 761 + }, + { + "epoch": 0.06500597167718819, + "grad_norm": 2.002709460787104, + "learning_rate": 9.967950443284293e-05, + "loss": 0.4704, + "step": 762 + }, + { + "epoch": 0.06509128135130524, + "grad_norm": 1.6936164493625083, + "learning_rate": 9.967794081581606e-05, + "loss": 0.4133, + "step": 763 + }, + { + "epoch": 0.06517659102542228, + "grad_norm": 1.3878576625320678, + "learning_rate": 9.96763734061424e-05, + "loss": 0.3611, + "step": 764 + }, + { + "epoch": 0.06526190069953933, + "grad_norm": 1.6748449737774969, + "learning_rate": 9.96748022039416e-05, + "loss": 0.3931, + "step": 765 + }, + { + "epoch": 0.06534721037365637, + "grad_norm": 1.4758210111707109, + "learning_rate": 9.967322720933363e-05, + "loss": 0.4096, + "step": 766 + }, + { + "epoch": 0.06543252004777342, + "grad_norm": 1.9596742003057535, + "learning_rate": 9.967164842243872e-05, + "loss": 0.4294, + "step": 767 + }, + { + "epoch": 0.06551782972189046, + "grad_norm": 1.8062267262368437, + "learning_rate": 9.967006584337741e-05, + "loss": 0.407, + "step": 768 + }, + { + "epoch": 0.06560313939600751, + "grad_norm": 2.028697361289656, + "learning_rate": 9.966847947227054e-05, + "loss": 0.49, + "step": 769 + }, + { + "epoch": 0.06568844907012455, + "grad_norm": 1.5384473781667618, + "learning_rate": 9.966688930923917e-05, + "loss": 0.3758, + "step": 770 + }, + { + "epoch": 0.0657737587442416, + "grad_norm": 1.6759639186935664, + "learning_rate": 9.966529535440475e-05, + "loss": 0.426, + "step": 771 + }, + { + "epoch": 0.06585906841835865, + "grad_norm": 1.900145653110966, + "learning_rate": 9.966369760788895e-05, + "loss": 0.4584, + "step": 772 + }, + { + "epoch": 0.06594437809247569, + "grad_norm": 1.69237013348179, + "learning_rate": 9.966209606981373e-05, + "loss": 0.4171, + "step": 773 + }, + { + "epoch": 0.06602968776659274, + "grad_norm": 1.8458526389301464, + "learning_rate": 9.966049074030141e-05, + "loss": 0.4165, + "step": 774 + }, + { + "epoch": 0.06611499744070978, + "grad_norm": 1.672360643816351, + "learning_rate": 9.96588816194745e-05, + "loss": 0.3889, + "step": 775 + }, + { + "epoch": 0.06620030711482681, + "grad_norm": 2.1442795346923385, + "learning_rate": 9.965726870745586e-05, + "loss": 0.4473, + "step": 776 + }, + { + "epoch": 0.06628561678894386, + "grad_norm": 1.5987477200083995, + "learning_rate": 9.965565200436865e-05, + "loss": 0.438, + "step": 777 + }, + { + "epoch": 0.0663709264630609, + "grad_norm": 1.5985370623623631, + "learning_rate": 9.965403151033628e-05, + "loss": 0.4019, + "step": 778 + }, + { + "epoch": 0.06645623613717795, + "grad_norm": 1.621407719736719, + "learning_rate": 9.965240722548245e-05, + "loss": 0.4483, + "step": 779 + }, + { + "epoch": 0.066541545811295, + "grad_norm": 1.3318707769390519, + "learning_rate": 9.965077914993119e-05, + "loss": 0.4301, + "step": 780 + }, + { + "epoch": 0.06662685548541204, + "grad_norm": 1.4576918508112018, + "learning_rate": 9.964914728380677e-05, + "loss": 0.3954, + "step": 781 + }, + { + "epoch": 0.06671216515952909, + "grad_norm": 1.5354488181375885, + "learning_rate": 9.96475116272338e-05, + "loss": 0.3983, + "step": 782 + }, + { + "epoch": 0.06679747483364613, + "grad_norm": 1.768767674226376, + "learning_rate": 9.964587218033715e-05, + "loss": 0.4189, + "step": 783 + }, + { + "epoch": 0.06688278450776318, + "grad_norm": 1.5955260609285875, + "learning_rate": 9.964422894324197e-05, + "loss": 0.3875, + "step": 784 + }, + { + "epoch": 0.06696809418188023, + "grad_norm": 1.570512218016105, + "learning_rate": 9.964258191607372e-05, + "loss": 0.3878, + "step": 785 + }, + { + "epoch": 0.06705340385599727, + "grad_norm": 1.5519721313511008, + "learning_rate": 9.964093109895816e-05, + "loss": 0.4308, + "step": 786 + }, + { + "epoch": 0.06713871353011432, + "grad_norm": 1.4750880392923433, + "learning_rate": 9.963927649202127e-05, + "loss": 0.3928, + "step": 787 + }, + { + "epoch": 0.06722402320423136, + "grad_norm": 1.5314689012658513, + "learning_rate": 9.963761809538943e-05, + "loss": 0.4001, + "step": 788 + }, + { + "epoch": 0.06730933287834841, + "grad_norm": 1.5760632634705454, + "learning_rate": 9.963595590918921e-05, + "loss": 0.4013, + "step": 789 + }, + { + "epoch": 0.06739464255246545, + "grad_norm": 1.7537355743489207, + "learning_rate": 9.963428993354751e-05, + "loss": 0.4113, + "step": 790 + }, + { + "epoch": 0.0674799522265825, + "grad_norm": 1.8104378380395372, + "learning_rate": 9.963262016859154e-05, + "loss": 0.4171, + "step": 791 + }, + { + "epoch": 0.06756526190069954, + "grad_norm": 1.3351155033222046, + "learning_rate": 9.963094661444878e-05, + "loss": 0.3501, + "step": 792 + }, + { + "epoch": 0.06765057157481659, + "grad_norm": 1.5681311796453534, + "learning_rate": 9.962926927124697e-05, + "loss": 0.4002, + "step": 793 + }, + { + "epoch": 0.06773588124893364, + "grad_norm": 1.7074925129001612, + "learning_rate": 9.962758813911419e-05, + "loss": 0.4493, + "step": 794 + }, + { + "epoch": 0.06782119092305067, + "grad_norm": 1.5223582822240553, + "learning_rate": 9.962590321817878e-05, + "loss": 0.3961, + "step": 795 + }, + { + "epoch": 0.06790650059716771, + "grad_norm": 2.066416287893566, + "learning_rate": 9.962421450856936e-05, + "loss": 0.5521, + "step": 796 + }, + { + "epoch": 0.06799181027128476, + "grad_norm": 1.6601151764220508, + "learning_rate": 9.962252201041486e-05, + "loss": 0.4907, + "step": 797 + }, + { + "epoch": 0.0680771199454018, + "grad_norm": 1.6354872410928762, + "learning_rate": 9.96208257238445e-05, + "loss": 0.4299, + "step": 798 + }, + { + "epoch": 0.06816242961951885, + "grad_norm": 1.7317840193244083, + "learning_rate": 9.961912564898779e-05, + "loss": 0.3806, + "step": 799 + }, + { + "epoch": 0.0682477392936359, + "grad_norm": 1.533140378020366, + "learning_rate": 9.96174217859745e-05, + "loss": 0.3975, + "step": 800 + }, + { + "epoch": 0.06833304896775294, + "grad_norm": 1.6066696984247963, + "learning_rate": 9.961571413493474e-05, + "loss": 0.393, + "step": 801 + }, + { + "epoch": 0.06841835864186999, + "grad_norm": 1.3395906882234951, + "learning_rate": 9.961400269599886e-05, + "loss": 0.3388, + "step": 802 + }, + { + "epoch": 0.06850366831598703, + "grad_norm": 1.8338195271693283, + "learning_rate": 9.961228746929752e-05, + "loss": 0.44, + "step": 803 + }, + { + "epoch": 0.06858897799010408, + "grad_norm": 1.6121721865981693, + "learning_rate": 9.961056845496167e-05, + "loss": 0.4034, + "step": 804 + }, + { + "epoch": 0.06867428766422112, + "grad_norm": 1.9502288097242129, + "learning_rate": 9.960884565312255e-05, + "loss": 0.4171, + "step": 805 + }, + { + "epoch": 0.06875959733833817, + "grad_norm": 2.065080819753614, + "learning_rate": 9.960711906391167e-05, + "loss": 0.4814, + "step": 806 + }, + { + "epoch": 0.06884490701245521, + "grad_norm": 1.7157795987539437, + "learning_rate": 9.960538868746087e-05, + "loss": 0.4213, + "step": 807 + }, + { + "epoch": 0.06893021668657226, + "grad_norm": 1.7273950425101938, + "learning_rate": 9.960365452390226e-05, + "loss": 0.4027, + "step": 808 + }, + { + "epoch": 0.0690155263606893, + "grad_norm": 1.8160994075715577, + "learning_rate": 9.960191657336821e-05, + "loss": 0.4404, + "step": 809 + }, + { + "epoch": 0.06910083603480635, + "grad_norm": 1.8604421678641454, + "learning_rate": 9.960017483599142e-05, + "loss": 0.4306, + "step": 810 + }, + { + "epoch": 0.0691861457089234, + "grad_norm": 1.7108569539937606, + "learning_rate": 9.959842931190485e-05, + "loss": 0.3651, + "step": 811 + }, + { + "epoch": 0.06927145538304044, + "grad_norm": 1.6826427632677905, + "learning_rate": 9.959668000124177e-05, + "loss": 0.4278, + "step": 812 + }, + { + "epoch": 0.06935676505715749, + "grad_norm": 1.5020084445324864, + "learning_rate": 9.959492690413573e-05, + "loss": 0.4216, + "step": 813 + }, + { + "epoch": 0.06944207473127453, + "grad_norm": 1.7381082170067803, + "learning_rate": 9.959317002072057e-05, + "loss": 0.4279, + "step": 814 + }, + { + "epoch": 0.06952738440539157, + "grad_norm": 1.8988422229261541, + "learning_rate": 9.95914093511304e-05, + "loss": 0.4126, + "step": 815 + }, + { + "epoch": 0.06961269407950861, + "grad_norm": 1.603104904673485, + "learning_rate": 9.958964489549968e-05, + "loss": 0.3818, + "step": 816 + }, + { + "epoch": 0.06969800375362566, + "grad_norm": 1.4444876301659328, + "learning_rate": 9.958787665396308e-05, + "loss": 0.3948, + "step": 817 + }, + { + "epoch": 0.0697833134277427, + "grad_norm": 1.7364559577082002, + "learning_rate": 9.958610462665561e-05, + "loss": 0.4178, + "step": 818 + }, + { + "epoch": 0.06986862310185975, + "grad_norm": 1.7727928838614577, + "learning_rate": 9.958432881371253e-05, + "loss": 0.4731, + "step": 819 + }, + { + "epoch": 0.0699539327759768, + "grad_norm": 1.6411653627883929, + "learning_rate": 9.958254921526946e-05, + "loss": 0.4093, + "step": 820 + }, + { + "epoch": 0.07003924245009384, + "grad_norm": 1.5903632137812598, + "learning_rate": 9.958076583146224e-05, + "loss": 0.3813, + "step": 821 + }, + { + "epoch": 0.07012455212421088, + "grad_norm": 1.6717520695350219, + "learning_rate": 9.957897866242702e-05, + "loss": 0.403, + "step": 822 + }, + { + "epoch": 0.07020986179832793, + "grad_norm": 1.6821384734170965, + "learning_rate": 9.957718770830022e-05, + "loss": 0.4502, + "step": 823 + }, + { + "epoch": 0.07029517147244498, + "grad_norm": 1.5958567140358806, + "learning_rate": 9.957539296921862e-05, + "loss": 0.3937, + "step": 824 + }, + { + "epoch": 0.07038048114656202, + "grad_norm": 1.5158406526970096, + "learning_rate": 9.95735944453192e-05, + "loss": 0.372, + "step": 825 + }, + { + "epoch": 0.07046579082067907, + "grad_norm": 1.8311654263388588, + "learning_rate": 9.957179213673926e-05, + "loss": 0.397, + "step": 826 + }, + { + "epoch": 0.07055110049479611, + "grad_norm": 1.699377784740752, + "learning_rate": 9.956998604361644e-05, + "loss": 0.4464, + "step": 827 + }, + { + "epoch": 0.07063641016891316, + "grad_norm": 1.8911148571635248, + "learning_rate": 9.956817616608857e-05, + "loss": 0.4565, + "step": 828 + }, + { + "epoch": 0.0707217198430302, + "grad_norm": 1.445494356828545, + "learning_rate": 9.956636250429389e-05, + "loss": 0.4068, + "step": 829 + }, + { + "epoch": 0.07080702951714725, + "grad_norm": 1.7614699228162274, + "learning_rate": 9.95645450583708e-05, + "loss": 0.4759, + "step": 830 + }, + { + "epoch": 0.0708923391912643, + "grad_norm": 1.4079481679766486, + "learning_rate": 9.95627238284581e-05, + "loss": 0.4135, + "step": 831 + }, + { + "epoch": 0.07097764886538134, + "grad_norm": 1.8147319337224361, + "learning_rate": 9.956089881469482e-05, + "loss": 0.4389, + "step": 832 + }, + { + "epoch": 0.07106295853949839, + "grad_norm": 1.4168203027479986, + "learning_rate": 9.955907001722025e-05, + "loss": 0.3573, + "step": 833 + }, + { + "epoch": 0.07114826821361542, + "grad_norm": 1.696448863775742, + "learning_rate": 9.955723743617407e-05, + "loss": 0.4214, + "step": 834 + }, + { + "epoch": 0.07123357788773246, + "grad_norm": 1.4369650965083927, + "learning_rate": 9.955540107169614e-05, + "loss": 0.4278, + "step": 835 + }, + { + "epoch": 0.07131888756184951, + "grad_norm": 1.6617398237820984, + "learning_rate": 9.955356092392668e-05, + "loss": 0.4181, + "step": 836 + }, + { + "epoch": 0.07140419723596655, + "grad_norm": 1.553726512703792, + "learning_rate": 9.955171699300617e-05, + "loss": 0.393, + "step": 837 + }, + { + "epoch": 0.0714895069100836, + "grad_norm": 1.572362520063593, + "learning_rate": 9.954986927907539e-05, + "loss": 0.3599, + "step": 838 + }, + { + "epoch": 0.07157481658420065, + "grad_norm": 1.650089177698375, + "learning_rate": 9.95480177822754e-05, + "loss": 0.3725, + "step": 839 + }, + { + "epoch": 0.07166012625831769, + "grad_norm": 1.490219656095398, + "learning_rate": 9.954616250274754e-05, + "loss": 0.4052, + "step": 840 + }, + { + "epoch": 0.07174543593243474, + "grad_norm": 1.5664394308557068, + "learning_rate": 9.954430344063347e-05, + "loss": 0.4184, + "step": 841 + }, + { + "epoch": 0.07183074560655178, + "grad_norm": 1.4774834722636927, + "learning_rate": 9.95424405960751e-05, + "loss": 0.3963, + "step": 842 + }, + { + "epoch": 0.07191605528066883, + "grad_norm": 1.5085276750726058, + "learning_rate": 9.954057396921467e-05, + "loss": 0.4077, + "step": 843 + }, + { + "epoch": 0.07200136495478587, + "grad_norm": 1.8681920114790531, + "learning_rate": 9.953870356019466e-05, + "loss": 0.4168, + "step": 844 + }, + { + "epoch": 0.07208667462890292, + "grad_norm": 1.6623221596270337, + "learning_rate": 9.953682936915791e-05, + "loss": 0.4083, + "step": 845 + }, + { + "epoch": 0.07217198430301996, + "grad_norm": 1.3297857335670609, + "learning_rate": 9.953495139624744e-05, + "loss": 0.4112, + "step": 846 + }, + { + "epoch": 0.07225729397713701, + "grad_norm": 1.5688661965427195, + "learning_rate": 9.953306964160669e-05, + "loss": 0.4289, + "step": 847 + }, + { + "epoch": 0.07234260365125406, + "grad_norm": 1.753138996687177, + "learning_rate": 9.953118410537928e-05, + "loss": 0.4013, + "step": 848 + }, + { + "epoch": 0.0724279133253711, + "grad_norm": 1.5223926580263065, + "learning_rate": 9.952929478770916e-05, + "loss": 0.3671, + "step": 849 + }, + { + "epoch": 0.07251322299948815, + "grad_norm": 1.517479931417427, + "learning_rate": 9.952740168874059e-05, + "loss": 0.4124, + "step": 850 + }, + { + "epoch": 0.07259853267360519, + "grad_norm": 1.611307409904723, + "learning_rate": 9.95255048086181e-05, + "loss": 0.3898, + "step": 851 + }, + { + "epoch": 0.07268384234772224, + "grad_norm": 1.455565669159701, + "learning_rate": 9.952360414748649e-05, + "loss": 0.3649, + "step": 852 + }, + { + "epoch": 0.07276915202183927, + "grad_norm": 1.6649097560219357, + "learning_rate": 9.952169970549088e-05, + "loss": 0.4284, + "step": 853 + }, + { + "epoch": 0.07285446169595632, + "grad_norm": 1.872216638329716, + "learning_rate": 9.951979148277664e-05, + "loss": 0.4314, + "step": 854 + }, + { + "epoch": 0.07293977137007336, + "grad_norm": 1.2739952597747002, + "learning_rate": 9.951787947948947e-05, + "loss": 0.3456, + "step": 855 + }, + { + "epoch": 0.0730250810441904, + "grad_norm": 1.827941102779828, + "learning_rate": 9.951596369577535e-05, + "loss": 0.4185, + "step": 856 + }, + { + "epoch": 0.07311039071830745, + "grad_norm": 2.0090824371027454, + "learning_rate": 9.951404413178052e-05, + "loss": 0.4551, + "step": 857 + }, + { + "epoch": 0.0731957003924245, + "grad_norm": 1.5864202957786435, + "learning_rate": 9.951212078765155e-05, + "loss": 0.3764, + "step": 858 + }, + { + "epoch": 0.07328101006654154, + "grad_norm": 1.425126769467023, + "learning_rate": 9.951019366353524e-05, + "loss": 0.3928, + "step": 859 + }, + { + "epoch": 0.07336631974065859, + "grad_norm": 1.9067345925660544, + "learning_rate": 9.950826275957877e-05, + "loss": 0.4583, + "step": 860 + }, + { + "epoch": 0.07345162941477563, + "grad_norm": 1.5230793849922206, + "learning_rate": 9.950632807592951e-05, + "loss": 0.3857, + "step": 861 + }, + { + "epoch": 0.07353693908889268, + "grad_norm": 1.7489930389335564, + "learning_rate": 9.950438961273517e-05, + "loss": 0.396, + "step": 862 + }, + { + "epoch": 0.07362224876300973, + "grad_norm": 1.8098799085030821, + "learning_rate": 9.950244737014376e-05, + "loss": 0.3724, + "step": 863 + }, + { + "epoch": 0.07370755843712677, + "grad_norm": 1.6872481405429087, + "learning_rate": 9.950050134830355e-05, + "loss": 0.3844, + "step": 864 + }, + { + "epoch": 0.07379286811124382, + "grad_norm": 1.5512414821049696, + "learning_rate": 9.94985515473631e-05, + "loss": 0.4108, + "step": 865 + }, + { + "epoch": 0.07387817778536086, + "grad_norm": 1.3910940000912388, + "learning_rate": 9.949659796747129e-05, + "loss": 0.3812, + "step": 866 + }, + { + "epoch": 0.07396348745947791, + "grad_norm": 1.6244988769170436, + "learning_rate": 9.949464060877723e-05, + "loss": 0.4056, + "step": 867 + }, + { + "epoch": 0.07404879713359495, + "grad_norm": 1.554041282498229, + "learning_rate": 9.949267947143038e-05, + "loss": 0.4036, + "step": 868 + }, + { + "epoch": 0.074134106807712, + "grad_norm": 1.6257391495686464, + "learning_rate": 9.949071455558046e-05, + "loss": 0.4229, + "step": 869 + }, + { + "epoch": 0.07421941648182905, + "grad_norm": 1.8976478347183405, + "learning_rate": 9.948874586137747e-05, + "loss": 0.4045, + "step": 870 + }, + { + "epoch": 0.07430472615594609, + "grad_norm": 1.5924332358888864, + "learning_rate": 9.948677338897172e-05, + "loss": 0.4074, + "step": 871 + }, + { + "epoch": 0.07439003583006312, + "grad_norm": 1.389861036325033, + "learning_rate": 9.948479713851379e-05, + "loss": 0.3431, + "step": 872 + }, + { + "epoch": 0.07447534550418017, + "grad_norm": 1.5992253067187814, + "learning_rate": 9.948281711015458e-05, + "loss": 0.3875, + "step": 873 + }, + { + "epoch": 0.07456065517829721, + "grad_norm": 1.8158786458627252, + "learning_rate": 9.948083330404522e-05, + "loss": 0.4464, + "step": 874 + }, + { + "epoch": 0.07464596485241426, + "grad_norm": 1.7525795589569202, + "learning_rate": 9.947884572033717e-05, + "loss": 0.4039, + "step": 875 + }, + { + "epoch": 0.0747312745265313, + "grad_norm": 1.5980749351109855, + "learning_rate": 9.947685435918219e-05, + "loss": 0.3322, + "step": 876 + }, + { + "epoch": 0.07481658420064835, + "grad_norm": 1.4224825389038447, + "learning_rate": 9.947485922073231e-05, + "loss": 0.3688, + "step": 877 + }, + { + "epoch": 0.0749018938747654, + "grad_norm": 1.3936716203076156, + "learning_rate": 9.947286030513983e-05, + "loss": 0.4074, + "step": 878 + }, + { + "epoch": 0.07498720354888244, + "grad_norm": 2.045672114197119, + "learning_rate": 9.947085761255735e-05, + "loss": 0.4267, + "step": 879 + }, + { + "epoch": 0.07507251322299949, + "grad_norm": 1.7085090019652056, + "learning_rate": 9.94688511431378e-05, + "loss": 0.3809, + "step": 880 + }, + { + "epoch": 0.07515782289711653, + "grad_norm": 2.0479668187791304, + "learning_rate": 9.946684089703434e-05, + "loss": 0.4752, + "step": 881 + }, + { + "epoch": 0.07524313257123358, + "grad_norm": 1.4061678544529634, + "learning_rate": 9.946482687440042e-05, + "loss": 0.3936, + "step": 882 + }, + { + "epoch": 0.07532844224535062, + "grad_norm": 1.597122886812165, + "learning_rate": 9.946280907538985e-05, + "loss": 0.3786, + "step": 883 + }, + { + "epoch": 0.07541375191946767, + "grad_norm": 1.554040131859552, + "learning_rate": 9.946078750015664e-05, + "loss": 0.3981, + "step": 884 + }, + { + "epoch": 0.07549906159358472, + "grad_norm": 1.756019932892268, + "learning_rate": 9.945876214885513e-05, + "loss": 0.4367, + "step": 885 + }, + { + "epoch": 0.07558437126770176, + "grad_norm": 1.4751623872496558, + "learning_rate": 9.945673302163997e-05, + "loss": 0.3971, + "step": 886 + }, + { + "epoch": 0.0756696809418188, + "grad_norm": 1.477371639995077, + "learning_rate": 9.945470011866604e-05, + "loss": 0.3826, + "step": 887 + }, + { + "epoch": 0.07575499061593585, + "grad_norm": 1.4438786172756286, + "learning_rate": 9.945266344008857e-05, + "loss": 0.4266, + "step": 888 + }, + { + "epoch": 0.0758403002900529, + "grad_norm": 1.5147279267202027, + "learning_rate": 9.945062298606305e-05, + "loss": 0.3798, + "step": 889 + }, + { + "epoch": 0.07592560996416994, + "grad_norm": 1.9949974438643752, + "learning_rate": 9.944857875674522e-05, + "loss": 0.4973, + "step": 890 + }, + { + "epoch": 0.07601091963828698, + "grad_norm": 2.0277055773952624, + "learning_rate": 9.94465307522912e-05, + "loss": 0.5057, + "step": 891 + }, + { + "epoch": 0.07609622931240402, + "grad_norm": 1.434747714817044, + "learning_rate": 9.944447897285729e-05, + "loss": 0.3779, + "step": 892 + }, + { + "epoch": 0.07618153898652107, + "grad_norm": 1.680964991012642, + "learning_rate": 9.944242341860016e-05, + "loss": 0.3885, + "step": 893 + }, + { + "epoch": 0.07626684866063811, + "grad_norm": 1.6874224327062108, + "learning_rate": 9.944036408967674e-05, + "loss": 0.4701, + "step": 894 + }, + { + "epoch": 0.07635215833475516, + "grad_norm": 2.102312939608402, + "learning_rate": 9.943830098624426e-05, + "loss": 0.4663, + "step": 895 + }, + { + "epoch": 0.0764374680088722, + "grad_norm": 1.5505283255156348, + "learning_rate": 9.943623410846021e-05, + "loss": 0.3868, + "step": 896 + }, + { + "epoch": 0.07652277768298925, + "grad_norm": 1.7846137960955586, + "learning_rate": 9.943416345648238e-05, + "loss": 0.4214, + "step": 897 + }, + { + "epoch": 0.0766080873571063, + "grad_norm": 1.7010477679910776, + "learning_rate": 9.943208903046888e-05, + "loss": 0.4608, + "step": 898 + }, + { + "epoch": 0.07669339703122334, + "grad_norm": 1.402390078148613, + "learning_rate": 9.943001083057805e-05, + "loss": 0.3096, + "step": 899 + }, + { + "epoch": 0.07677870670534039, + "grad_norm": 1.5524866830251585, + "learning_rate": 9.942792885696856e-05, + "loss": 0.3926, + "step": 900 + }, + { + "epoch": 0.07686401637945743, + "grad_norm": 1.6487801001449627, + "learning_rate": 9.942584310979939e-05, + "loss": 0.3803, + "step": 901 + }, + { + "epoch": 0.07694932605357448, + "grad_norm": 1.9104782488377685, + "learning_rate": 9.942375358922971e-05, + "loss": 0.4275, + "step": 902 + }, + { + "epoch": 0.07703463572769152, + "grad_norm": 1.8667932197836743, + "learning_rate": 9.94216602954191e-05, + "loss": 0.4328, + "step": 903 + }, + { + "epoch": 0.07711994540180857, + "grad_norm": 1.4881630197255973, + "learning_rate": 9.941956322852735e-05, + "loss": 0.4024, + "step": 904 + }, + { + "epoch": 0.07720525507592561, + "grad_norm": 1.5590928122425167, + "learning_rate": 9.941746238871457e-05, + "loss": 0.4154, + "step": 905 + }, + { + "epoch": 0.07729056475004266, + "grad_norm": 1.519331107405912, + "learning_rate": 9.941535777614112e-05, + "loss": 0.3777, + "step": 906 + }, + { + "epoch": 0.0773758744241597, + "grad_norm": 1.5344739955812983, + "learning_rate": 9.941324939096772e-05, + "loss": 0.4009, + "step": 907 + }, + { + "epoch": 0.07746118409827675, + "grad_norm": 1.7731666526164567, + "learning_rate": 9.941113723335531e-05, + "loss": 0.3811, + "step": 908 + }, + { + "epoch": 0.0775464937723938, + "grad_norm": 1.3621317217005404, + "learning_rate": 9.940902130346513e-05, + "loss": 0.3969, + "step": 909 + }, + { + "epoch": 0.07763180344651083, + "grad_norm": 1.6877891857879768, + "learning_rate": 9.940690160145874e-05, + "loss": 0.4359, + "step": 910 + }, + { + "epoch": 0.07771711312062787, + "grad_norm": 1.4189524535533058, + "learning_rate": 9.940477812749795e-05, + "loss": 0.3851, + "step": 911 + }, + { + "epoch": 0.07780242279474492, + "grad_norm": 1.6472573055557493, + "learning_rate": 9.94026508817449e-05, + "loss": 0.3833, + "step": 912 + }, + { + "epoch": 0.07788773246886196, + "grad_norm": 1.7339434387503534, + "learning_rate": 9.940051986436198e-05, + "loss": 0.4093, + "step": 913 + }, + { + "epoch": 0.07797304214297901, + "grad_norm": 1.6406845808563764, + "learning_rate": 9.939838507551188e-05, + "loss": 0.3531, + "step": 914 + }, + { + "epoch": 0.07805835181709606, + "grad_norm": 1.6379536997547528, + "learning_rate": 9.939624651535757e-05, + "loss": 0.4152, + "step": 915 + }, + { + "epoch": 0.0781436614912131, + "grad_norm": 1.3937606605840542, + "learning_rate": 9.939410418406234e-05, + "loss": 0.3626, + "step": 916 + }, + { + "epoch": 0.07822897116533015, + "grad_norm": 1.3210644809541832, + "learning_rate": 9.939195808178974e-05, + "loss": 0.394, + "step": 917 + }, + { + "epoch": 0.07831428083944719, + "grad_norm": 1.608640484642789, + "learning_rate": 9.938980820870361e-05, + "loss": 0.3775, + "step": 918 + }, + { + "epoch": 0.07839959051356424, + "grad_norm": 2.0557496549134475, + "learning_rate": 9.938765456496808e-05, + "loss": 0.4419, + "step": 919 + }, + { + "epoch": 0.07848490018768128, + "grad_norm": 1.5819830239563057, + "learning_rate": 9.938549715074757e-05, + "loss": 0.3605, + "step": 920 + }, + { + "epoch": 0.07857020986179833, + "grad_norm": 1.5182668725942867, + "learning_rate": 9.938333596620677e-05, + "loss": 0.3646, + "step": 921 + }, + { + "epoch": 0.07865551953591537, + "grad_norm": 1.4535549717216238, + "learning_rate": 9.93811710115107e-05, + "loss": 0.3816, + "step": 922 + }, + { + "epoch": 0.07874082921003242, + "grad_norm": 1.9238440498428777, + "learning_rate": 9.937900228682465e-05, + "loss": 0.4419, + "step": 923 + }, + { + "epoch": 0.07882613888414947, + "grad_norm": 1.702658423112307, + "learning_rate": 9.937682979231416e-05, + "loss": 0.3985, + "step": 924 + }, + { + "epoch": 0.07891144855826651, + "grad_norm": 1.4350917380114725, + "learning_rate": 9.93746535281451e-05, + "loss": 0.3599, + "step": 925 + }, + { + "epoch": 0.07899675823238356, + "grad_norm": 1.5024821090267786, + "learning_rate": 9.937247349448362e-05, + "loss": 0.3671, + "step": 926 + }, + { + "epoch": 0.0790820679065006, + "grad_norm": 1.7560829300338785, + "learning_rate": 9.937028969149617e-05, + "loss": 0.4359, + "step": 927 + }, + { + "epoch": 0.07916737758061765, + "grad_norm": 1.8567038832084304, + "learning_rate": 9.936810211934944e-05, + "loss": 0.4298, + "step": 928 + }, + { + "epoch": 0.07925268725473468, + "grad_norm": 1.8759775792227853, + "learning_rate": 9.936591077821045e-05, + "loss": 0.4406, + "step": 929 + }, + { + "epoch": 0.07933799692885173, + "grad_norm": 1.459218937046371, + "learning_rate": 9.936371566824651e-05, + "loss": 0.3663, + "step": 930 + }, + { + "epoch": 0.07942330660296877, + "grad_norm": 1.6614193408500193, + "learning_rate": 9.936151678962523e-05, + "loss": 0.4242, + "step": 931 + }, + { + "epoch": 0.07950861627708582, + "grad_norm": 1.3331618447092377, + "learning_rate": 9.93593141425144e-05, + "loss": 0.3685, + "step": 932 + }, + { + "epoch": 0.07959392595120286, + "grad_norm": 1.6156127929226338, + "learning_rate": 9.935710772708225e-05, + "loss": 0.4048, + "step": 933 + }, + { + "epoch": 0.07967923562531991, + "grad_norm": 1.76858030118462, + "learning_rate": 9.93548975434972e-05, + "loss": 0.4054, + "step": 934 + }, + { + "epoch": 0.07976454529943695, + "grad_norm": 1.777548574865013, + "learning_rate": 9.935268359192802e-05, + "loss": 0.4314, + "step": 935 + }, + { + "epoch": 0.079849854973554, + "grad_norm": 1.5316126646969312, + "learning_rate": 9.935046587254368e-05, + "loss": 0.3974, + "step": 936 + }, + { + "epoch": 0.07993516464767104, + "grad_norm": 1.7106172444812717, + "learning_rate": 9.934824438551353e-05, + "loss": 0.3985, + "step": 937 + }, + { + "epoch": 0.08002047432178809, + "grad_norm": 1.707210574998789, + "learning_rate": 9.934601913100716e-05, + "loss": 0.397, + "step": 938 + }, + { + "epoch": 0.08010578399590514, + "grad_norm": 1.5681650841866768, + "learning_rate": 9.934379010919446e-05, + "loss": 0.4235, + "step": 939 + }, + { + "epoch": 0.08019109367002218, + "grad_norm": 1.5493426436430673, + "learning_rate": 9.934155732024557e-05, + "loss": 0.3884, + "step": 940 + }, + { + "epoch": 0.08027640334413923, + "grad_norm": 1.7369232702531296, + "learning_rate": 9.933932076433101e-05, + "loss": 0.4151, + "step": 941 + }, + { + "epoch": 0.08036171301825627, + "grad_norm": 1.6376699810764517, + "learning_rate": 9.933708044162149e-05, + "loss": 0.4585, + "step": 942 + }, + { + "epoch": 0.08044702269237332, + "grad_norm": 1.7226191579410883, + "learning_rate": 9.933483635228804e-05, + "loss": 0.3886, + "step": 943 + }, + { + "epoch": 0.08053233236649036, + "grad_norm": 2.163295348215504, + "learning_rate": 9.933258849650202e-05, + "loss": 0.4314, + "step": 944 + }, + { + "epoch": 0.08061764204060741, + "grad_norm": 2.1622896610344675, + "learning_rate": 9.9330336874435e-05, + "loss": 0.4333, + "step": 945 + }, + { + "epoch": 0.08070295171472446, + "grad_norm": 1.7243527041048436, + "learning_rate": 9.932808148625891e-05, + "loss": 0.4377, + "step": 946 + }, + { + "epoch": 0.0807882613888415, + "grad_norm": 1.590156843019075, + "learning_rate": 9.932582233214593e-05, + "loss": 0.4057, + "step": 947 + }, + { + "epoch": 0.08087357106295855, + "grad_norm": 1.7529260151859165, + "learning_rate": 9.932355941226854e-05, + "loss": 0.3875, + "step": 948 + }, + { + "epoch": 0.08095888073707558, + "grad_norm": 1.6521807651083074, + "learning_rate": 9.932129272679949e-05, + "loss": 0.3834, + "step": 949 + }, + { + "epoch": 0.08104419041119262, + "grad_norm": 1.5230304663322867, + "learning_rate": 9.931902227591183e-05, + "loss": 0.3491, + "step": 950 + }, + { + "epoch": 0.08112950008530967, + "grad_norm": 1.8186539064171319, + "learning_rate": 9.93167480597789e-05, + "loss": 0.3894, + "step": 951 + }, + { + "epoch": 0.08121480975942671, + "grad_norm": 1.6622873786972638, + "learning_rate": 9.931447007857432e-05, + "loss": 0.3661, + "step": 952 + }, + { + "epoch": 0.08130011943354376, + "grad_norm": 1.6468036382802227, + "learning_rate": 9.931218833247203e-05, + "loss": 0.416, + "step": 953 + }, + { + "epoch": 0.0813854291076608, + "grad_norm": 1.6972524205650052, + "learning_rate": 9.930990282164617e-05, + "loss": 0.4439, + "step": 954 + }, + { + "epoch": 0.08147073878177785, + "grad_norm": 1.7626942980649782, + "learning_rate": 9.930761354627129e-05, + "loss": 0.3822, + "step": 955 + }, + { + "epoch": 0.0815560484558949, + "grad_norm": 1.7519823835262383, + "learning_rate": 9.930532050652212e-05, + "loss": 0.4457, + "step": 956 + }, + { + "epoch": 0.08164135813001194, + "grad_norm": 1.530050469607025, + "learning_rate": 9.930302370257374e-05, + "loss": 0.4138, + "step": 957 + }, + { + "epoch": 0.08172666780412899, + "grad_norm": 1.6239593548268876, + "learning_rate": 9.93007231346015e-05, + "loss": 0.4023, + "step": 958 + }, + { + "epoch": 0.08181197747824603, + "grad_norm": 1.6215288796195353, + "learning_rate": 9.929841880278104e-05, + "loss": 0.3921, + "step": 959 + }, + { + "epoch": 0.08189728715236308, + "grad_norm": 1.5372116655876398, + "learning_rate": 9.929611070728826e-05, + "loss": 0.3739, + "step": 960 + }, + { + "epoch": 0.08198259682648013, + "grad_norm": 1.7176908524153687, + "learning_rate": 9.929379884829939e-05, + "loss": 0.4127, + "step": 961 + }, + { + "epoch": 0.08206790650059717, + "grad_norm": 1.7384728486840317, + "learning_rate": 9.929148322599093e-05, + "loss": 0.4104, + "step": 962 + }, + { + "epoch": 0.08215321617471422, + "grad_norm": 1.4674845986800955, + "learning_rate": 9.928916384053965e-05, + "loss": 0.3564, + "step": 963 + }, + { + "epoch": 0.08223852584883126, + "grad_norm": 1.5796299170196961, + "learning_rate": 9.928684069212264e-05, + "loss": 0.3841, + "step": 964 + }, + { + "epoch": 0.08232383552294831, + "grad_norm": 1.960962667721647, + "learning_rate": 9.928451378091726e-05, + "loss": 0.4391, + "step": 965 + }, + { + "epoch": 0.08240914519706535, + "grad_norm": 1.323450726619773, + "learning_rate": 9.928218310710115e-05, + "loss": 0.3329, + "step": 966 + }, + { + "epoch": 0.0824944548711824, + "grad_norm": 1.126890977438387, + "learning_rate": 9.927984867085224e-05, + "loss": 0.3167, + "step": 967 + }, + { + "epoch": 0.08257976454529943, + "grad_norm": 1.6888137224826205, + "learning_rate": 9.927751047234875e-05, + "loss": 0.4243, + "step": 968 + }, + { + "epoch": 0.08266507421941648, + "grad_norm": 1.7334523539150009, + "learning_rate": 9.927516851176921e-05, + "loss": 0.3877, + "step": 969 + }, + { + "epoch": 0.08275038389353352, + "grad_norm": 1.9393448199162233, + "learning_rate": 9.92728227892924e-05, + "loss": 0.4617, + "step": 970 + }, + { + "epoch": 0.08283569356765057, + "grad_norm": 1.6531613899364932, + "learning_rate": 9.92704733050974e-05, + "loss": 0.3988, + "step": 971 + }, + { + "epoch": 0.08292100324176761, + "grad_norm": 1.6326109839388958, + "learning_rate": 9.926812005936359e-05, + "loss": 0.4279, + "step": 972 + }, + { + "epoch": 0.08300631291588466, + "grad_norm": 1.667556890520914, + "learning_rate": 9.926576305227063e-05, + "loss": 0.4067, + "step": 973 + }, + { + "epoch": 0.0830916225900017, + "grad_norm": 1.8090461840662317, + "learning_rate": 9.926340228399845e-05, + "loss": 0.4325, + "step": 974 + }, + { + "epoch": 0.08317693226411875, + "grad_norm": 1.5629815694658298, + "learning_rate": 9.926103775472728e-05, + "loss": 0.4121, + "step": 975 + }, + { + "epoch": 0.0832622419382358, + "grad_norm": 1.4291245649863116, + "learning_rate": 9.925866946463766e-05, + "loss": 0.3706, + "step": 976 + }, + { + "epoch": 0.08334755161235284, + "grad_norm": 1.5995724106687563, + "learning_rate": 9.925629741391038e-05, + "loss": 0.3909, + "step": 977 + }, + { + "epoch": 0.08343286128646989, + "grad_norm": 1.4782008210105717, + "learning_rate": 9.925392160272655e-05, + "loss": 0.4415, + "step": 978 + }, + { + "epoch": 0.08351817096058693, + "grad_norm": 1.3095293031511217, + "learning_rate": 9.925154203126754e-05, + "loss": 0.3964, + "step": 979 + }, + { + "epoch": 0.08360348063470398, + "grad_norm": 1.7113070154270404, + "learning_rate": 9.924915869971503e-05, + "loss": 0.3867, + "step": 980 + }, + { + "epoch": 0.08368879030882102, + "grad_norm": 1.5976306568138428, + "learning_rate": 9.924677160825094e-05, + "loss": 0.3896, + "step": 981 + }, + { + "epoch": 0.08377409998293807, + "grad_norm": 1.426009486494571, + "learning_rate": 9.924438075705756e-05, + "loss": 0.3929, + "step": 982 + }, + { + "epoch": 0.08385940965705511, + "grad_norm": 1.4266695782521803, + "learning_rate": 9.924198614631736e-05, + "loss": 0.3729, + "step": 983 + }, + { + "epoch": 0.08394471933117216, + "grad_norm": 2.246026026492598, + "learning_rate": 9.92395877762132e-05, + "loss": 0.4402, + "step": 984 + }, + { + "epoch": 0.0840300290052892, + "grad_norm": 1.5353601528966365, + "learning_rate": 9.92371856469282e-05, + "loss": 0.3632, + "step": 985 + }, + { + "epoch": 0.08411533867940625, + "grad_norm": 2.1849212705626555, + "learning_rate": 9.92347797586457e-05, + "loss": 0.4491, + "step": 986 + }, + { + "epoch": 0.08420064835352328, + "grad_norm": 1.539180073386258, + "learning_rate": 9.92323701115494e-05, + "loss": 0.4718, + "step": 987 + }, + { + "epoch": 0.08428595802764033, + "grad_norm": 1.3549210378999164, + "learning_rate": 9.922995670582325e-05, + "loss": 0.3542, + "step": 988 + }, + { + "epoch": 0.08437126770175737, + "grad_norm": 1.7466439355336911, + "learning_rate": 9.922753954165154e-05, + "loss": 0.4268, + "step": 989 + }, + { + "epoch": 0.08445657737587442, + "grad_norm": 1.5277756180410822, + "learning_rate": 9.922511861921878e-05, + "loss": 0.4446, + "step": 990 + }, + { + "epoch": 0.08454188704999147, + "grad_norm": 1.4524606806411642, + "learning_rate": 9.922269393870976e-05, + "loss": 0.3923, + "step": 991 + }, + { + "epoch": 0.08462719672410851, + "grad_norm": 1.6235109255710627, + "learning_rate": 9.922026550030965e-05, + "loss": 0.4238, + "step": 992 + }, + { + "epoch": 0.08471250639822556, + "grad_norm": 1.5380084305868262, + "learning_rate": 9.921783330420383e-05, + "loss": 0.4412, + "step": 993 + }, + { + "epoch": 0.0847978160723426, + "grad_norm": 1.5170293377860202, + "learning_rate": 9.921539735057798e-05, + "loss": 0.4138, + "step": 994 + }, + { + "epoch": 0.08488312574645965, + "grad_norm": 1.5693752851223886, + "learning_rate": 9.921295763961806e-05, + "loss": 0.4232, + "step": 995 + }, + { + "epoch": 0.0849684354205767, + "grad_norm": 1.5846192843123548, + "learning_rate": 9.921051417151035e-05, + "loss": 0.3814, + "step": 996 + }, + { + "epoch": 0.08505374509469374, + "grad_norm": 1.832724564077769, + "learning_rate": 9.92080669464414e-05, + "loss": 0.4544, + "step": 997 + }, + { + "epoch": 0.08513905476881078, + "grad_norm": 1.8473920895416924, + "learning_rate": 9.920561596459801e-05, + "loss": 0.43, + "step": 998 + }, + { + "epoch": 0.08522436444292783, + "grad_norm": 1.4406206838414872, + "learning_rate": 9.920316122616732e-05, + "loss": 0.3599, + "step": 999 + }, + { + "epoch": 0.08530967411704488, + "grad_norm": 1.4872798404133472, + "learning_rate": 9.920070273133674e-05, + "loss": 0.4043, + "step": 1000 + }, + { + "epoch": 0.08539498379116192, + "grad_norm": 1.7160227552474645, + "learning_rate": 9.919824048029397e-05, + "loss": 0.3566, + "step": 1001 + }, + { + "epoch": 0.08548029346527897, + "grad_norm": 1.451563088013496, + "learning_rate": 9.919577447322697e-05, + "loss": 0.4176, + "step": 1002 + }, + { + "epoch": 0.08556560313939601, + "grad_norm": 1.6625469631359495, + "learning_rate": 9.919330471032401e-05, + "loss": 0.4186, + "step": 1003 + }, + { + "epoch": 0.08565091281351306, + "grad_norm": 1.6576207085063346, + "learning_rate": 9.919083119177366e-05, + "loss": 0.4013, + "step": 1004 + }, + { + "epoch": 0.0857362224876301, + "grad_norm": 1.509768941266167, + "learning_rate": 9.918835391776474e-05, + "loss": 0.3862, + "step": 1005 + }, + { + "epoch": 0.08582153216174714, + "grad_norm": 1.4240339049364124, + "learning_rate": 9.918587288848638e-05, + "loss": 0.3902, + "step": 1006 + }, + { + "epoch": 0.08590684183586418, + "grad_norm": 1.5366954124862304, + "learning_rate": 9.918338810412801e-05, + "loss": 0.3574, + "step": 1007 + }, + { + "epoch": 0.08599215150998123, + "grad_norm": 2.1244654263551386, + "learning_rate": 9.91808995648793e-05, + "loss": 0.39, + "step": 1008 + }, + { + "epoch": 0.08607746118409827, + "grad_norm": 1.7710669045586542, + "learning_rate": 9.917840727093028e-05, + "loss": 0.3513, + "step": 1009 + }, + { + "epoch": 0.08616277085821532, + "grad_norm": 1.860610679543165, + "learning_rate": 9.917591122247119e-05, + "loss": 0.4803, + "step": 1010 + }, + { + "epoch": 0.08624808053233236, + "grad_norm": 1.5647940008337091, + "learning_rate": 9.917341141969258e-05, + "loss": 0.4285, + "step": 1011 + }, + { + "epoch": 0.08633339020644941, + "grad_norm": 1.8128510496936157, + "learning_rate": 9.917090786278533e-05, + "loss": 0.444, + "step": 1012 + }, + { + "epoch": 0.08641869988056645, + "grad_norm": 1.8520976391795119, + "learning_rate": 9.916840055194057e-05, + "loss": 0.4153, + "step": 1013 + }, + { + "epoch": 0.0865040095546835, + "grad_norm": 1.492804115276654, + "learning_rate": 9.91658894873497e-05, + "loss": 0.3822, + "step": 1014 + }, + { + "epoch": 0.08658931922880055, + "grad_norm": 1.802817031767372, + "learning_rate": 9.916337466920443e-05, + "loss": 0.4125, + "step": 1015 + }, + { + "epoch": 0.08667462890291759, + "grad_norm": 1.8392670652949337, + "learning_rate": 9.916085609769677e-05, + "loss": 0.4009, + "step": 1016 + }, + { + "epoch": 0.08675993857703464, + "grad_norm": 1.4804431440006092, + "learning_rate": 9.915833377301898e-05, + "loss": 0.3876, + "step": 1017 + }, + { + "epoch": 0.08684524825115168, + "grad_norm": 1.4741682390657833, + "learning_rate": 9.915580769536362e-05, + "loss": 0.4192, + "step": 1018 + }, + { + "epoch": 0.08693055792526873, + "grad_norm": 1.5700872458981556, + "learning_rate": 9.915327786492357e-05, + "loss": 0.3859, + "step": 1019 + }, + { + "epoch": 0.08701586759938577, + "grad_norm": 1.3552529110957288, + "learning_rate": 9.915074428189195e-05, + "loss": 0.3401, + "step": 1020 + }, + { + "epoch": 0.08710117727350282, + "grad_norm": 1.495064961862393, + "learning_rate": 9.91482069464622e-05, + "loss": 0.4151, + "step": 1021 + }, + { + "epoch": 0.08718648694761987, + "grad_norm": 2.100037656173963, + "learning_rate": 9.914566585882801e-05, + "loss": 0.4525, + "step": 1022 + }, + { + "epoch": 0.08727179662173691, + "grad_norm": 1.5408126252541476, + "learning_rate": 9.91431210191834e-05, + "loss": 0.3732, + "step": 1023 + }, + { + "epoch": 0.08735710629585396, + "grad_norm": 1.6693361203126729, + "learning_rate": 9.914057242772266e-05, + "loss": 0.4283, + "step": 1024 + }, + { + "epoch": 0.08744241596997099, + "grad_norm": 1.6234628302733933, + "learning_rate": 9.913802008464033e-05, + "loss": 0.4522, + "step": 1025 + }, + { + "epoch": 0.08752772564408803, + "grad_norm": 1.6754695006588272, + "learning_rate": 9.913546399013127e-05, + "loss": 0.3877, + "step": 1026 + }, + { + "epoch": 0.08761303531820508, + "grad_norm": 1.8914325345153844, + "learning_rate": 9.913290414439068e-05, + "loss": 0.3737, + "step": 1027 + }, + { + "epoch": 0.08769834499232212, + "grad_norm": 2.0629846408066994, + "learning_rate": 9.913034054761392e-05, + "loss": 0.415, + "step": 1028 + }, + { + "epoch": 0.08778365466643917, + "grad_norm": 1.7163253327553218, + "learning_rate": 9.912777319999675e-05, + "loss": 0.4216, + "step": 1029 + }, + { + "epoch": 0.08786896434055622, + "grad_norm": 1.6523081097016943, + "learning_rate": 9.912520210173515e-05, + "loss": 0.4181, + "step": 1030 + }, + { + "epoch": 0.08795427401467326, + "grad_norm": 1.5346185650157314, + "learning_rate": 9.912262725302543e-05, + "loss": 0.3902, + "step": 1031 + }, + { + "epoch": 0.08803958368879031, + "grad_norm": 1.6811868053031946, + "learning_rate": 9.912004865406415e-05, + "loss": 0.4457, + "step": 1032 + }, + { + "epoch": 0.08812489336290735, + "grad_norm": 1.5684970210992253, + "learning_rate": 9.911746630504818e-05, + "loss": 0.4554, + "step": 1033 + }, + { + "epoch": 0.0882102030370244, + "grad_norm": 1.5903704846201183, + "learning_rate": 9.911488020617467e-05, + "loss": 0.4074, + "step": 1034 + }, + { + "epoch": 0.08829551271114144, + "grad_norm": 1.454215430213596, + "learning_rate": 9.911229035764106e-05, + "loss": 0.3906, + "step": 1035 + }, + { + "epoch": 0.08838082238525849, + "grad_norm": 1.4879976738327056, + "learning_rate": 9.910969675964506e-05, + "loss": 0.4066, + "step": 1036 + }, + { + "epoch": 0.08846613205937554, + "grad_norm": 1.6579680259299645, + "learning_rate": 9.910709941238467e-05, + "loss": 0.3805, + "step": 1037 + }, + { + "epoch": 0.08855144173349258, + "grad_norm": 1.6002155873967299, + "learning_rate": 9.910449831605821e-05, + "loss": 0.408, + "step": 1038 + }, + { + "epoch": 0.08863675140760963, + "grad_norm": 1.6985540383372362, + "learning_rate": 9.910189347086423e-05, + "loss": 0.3787, + "step": 1039 + }, + { + "epoch": 0.08872206108172667, + "grad_norm": 1.7142880871165975, + "learning_rate": 9.909928487700162e-05, + "loss": 0.4317, + "step": 1040 + }, + { + "epoch": 0.08880737075584372, + "grad_norm": 1.7859913147889228, + "learning_rate": 9.909667253466952e-05, + "loss": 0.4137, + "step": 1041 + }, + { + "epoch": 0.08889268042996076, + "grad_norm": 1.9552960345828005, + "learning_rate": 9.909405644406738e-05, + "loss": 0.4606, + "step": 1042 + }, + { + "epoch": 0.08897799010407781, + "grad_norm": 1.7600755831354542, + "learning_rate": 9.909143660539491e-05, + "loss": 0.4318, + "step": 1043 + }, + { + "epoch": 0.08906329977819484, + "grad_norm": 1.5755435490087217, + "learning_rate": 9.908881301885212e-05, + "loss": 0.3671, + "step": 1044 + }, + { + "epoch": 0.08914860945231189, + "grad_norm": 1.575248286946329, + "learning_rate": 9.908618568463932e-05, + "loss": 0.409, + "step": 1045 + }, + { + "epoch": 0.08923391912642893, + "grad_norm": 1.5931405790935835, + "learning_rate": 9.908355460295708e-05, + "loss": 0.3748, + "step": 1046 + }, + { + "epoch": 0.08931922880054598, + "grad_norm": 1.4993942150110713, + "learning_rate": 9.908091977400629e-05, + "loss": 0.3486, + "step": 1047 + }, + { + "epoch": 0.08940453847466302, + "grad_norm": 2.1220722224810205, + "learning_rate": 9.907828119798807e-05, + "loss": 0.4525, + "step": 1048 + }, + { + "epoch": 0.08948984814878007, + "grad_norm": 1.569125813610626, + "learning_rate": 9.90756388751039e-05, + "loss": 0.4133, + "step": 1049 + }, + { + "epoch": 0.08957515782289711, + "grad_norm": 1.3632042106131326, + "learning_rate": 9.907299280555549e-05, + "loss": 0.3431, + "step": 1050 + }, + { + "epoch": 0.08966046749701416, + "grad_norm": 1.5988150500320493, + "learning_rate": 9.907034298954485e-05, + "loss": 0.4283, + "step": 1051 + }, + { + "epoch": 0.0897457771711312, + "grad_norm": 1.470254492755597, + "learning_rate": 9.906768942727427e-05, + "loss": 0.4077, + "step": 1052 + }, + { + "epoch": 0.08983108684524825, + "grad_norm": 1.555915845669338, + "learning_rate": 9.906503211894635e-05, + "loss": 0.4007, + "step": 1053 + }, + { + "epoch": 0.0899163965193653, + "grad_norm": 1.7254064495733312, + "learning_rate": 9.906237106476397e-05, + "loss": 0.3762, + "step": 1054 + }, + { + "epoch": 0.09000170619348234, + "grad_norm": 1.5656357389831763, + "learning_rate": 9.905970626493029e-05, + "loss": 0.418, + "step": 1055 + }, + { + "epoch": 0.09008701586759939, + "grad_norm": 1.3060219615853983, + "learning_rate": 9.905703771964872e-05, + "loss": 0.3476, + "step": 1056 + }, + { + "epoch": 0.09017232554171643, + "grad_norm": 1.5550532078609287, + "learning_rate": 9.905436542912301e-05, + "loss": 0.4301, + "step": 1057 + }, + { + "epoch": 0.09025763521583348, + "grad_norm": 1.5778565886713063, + "learning_rate": 9.905168939355717e-05, + "loss": 0.342, + "step": 1058 + }, + { + "epoch": 0.09034294488995052, + "grad_norm": 1.5104082129230916, + "learning_rate": 9.90490096131555e-05, + "loss": 0.4198, + "step": 1059 + }, + { + "epoch": 0.09042825456406757, + "grad_norm": 1.3957969841091087, + "learning_rate": 9.904632608812261e-05, + "loss": 0.3572, + "step": 1060 + }, + { + "epoch": 0.09051356423818462, + "grad_norm": 1.7691411478888746, + "learning_rate": 9.904363881866334e-05, + "loss": 0.3713, + "step": 1061 + }, + { + "epoch": 0.09059887391230166, + "grad_norm": 1.7050065312861753, + "learning_rate": 9.904094780498288e-05, + "loss": 0.4063, + "step": 1062 + }, + { + "epoch": 0.0906841835864187, + "grad_norm": 1.4737981522760653, + "learning_rate": 9.903825304728664e-05, + "loss": 0.3834, + "step": 1063 + }, + { + "epoch": 0.09076949326053574, + "grad_norm": 1.483115454899053, + "learning_rate": 9.903555454578038e-05, + "loss": 0.3886, + "step": 1064 + }, + { + "epoch": 0.09085480293465278, + "grad_norm": 1.4788564795549235, + "learning_rate": 9.903285230067011e-05, + "loss": 0.3872, + "step": 1065 + }, + { + "epoch": 0.09094011260876983, + "grad_norm": 1.5319753991684157, + "learning_rate": 9.903014631216213e-05, + "loss": 0.3589, + "step": 1066 + }, + { + "epoch": 0.09102542228288688, + "grad_norm": 1.6564863504157736, + "learning_rate": 9.902743658046301e-05, + "loss": 0.3989, + "step": 1067 + }, + { + "epoch": 0.09111073195700392, + "grad_norm": 1.693743141976862, + "learning_rate": 9.902472310577963e-05, + "loss": 0.4293, + "step": 1068 + }, + { + "epoch": 0.09119604163112097, + "grad_norm": 1.5194523258845558, + "learning_rate": 9.902200588831918e-05, + "loss": 0.4046, + "step": 1069 + }, + { + "epoch": 0.09128135130523801, + "grad_norm": 1.7269375078814895, + "learning_rate": 9.901928492828907e-05, + "loss": 0.4226, + "step": 1070 + }, + { + "epoch": 0.09136666097935506, + "grad_norm": 1.521525978626275, + "learning_rate": 9.901656022589705e-05, + "loss": 0.3932, + "step": 1071 + }, + { + "epoch": 0.0914519706534721, + "grad_norm": 1.919615557150131, + "learning_rate": 9.901383178135113e-05, + "loss": 0.4149, + "step": 1072 + }, + { + "epoch": 0.09153728032758915, + "grad_norm": 1.6133622340826805, + "learning_rate": 9.901109959485961e-05, + "loss": 0.3713, + "step": 1073 + }, + { + "epoch": 0.0916225900017062, + "grad_norm": 1.4541133677744429, + "learning_rate": 9.900836366663108e-05, + "loss": 0.389, + "step": 1074 + }, + { + "epoch": 0.09170789967582324, + "grad_norm": 1.3994029032015374, + "learning_rate": 9.900562399687443e-05, + "loss": 0.3841, + "step": 1075 + }, + { + "epoch": 0.09179320934994029, + "grad_norm": 1.5215401596400473, + "learning_rate": 9.900288058579879e-05, + "loss": 0.4038, + "step": 1076 + }, + { + "epoch": 0.09187851902405733, + "grad_norm": 1.8865868816534737, + "learning_rate": 9.900013343361361e-05, + "loss": 0.4415, + "step": 1077 + }, + { + "epoch": 0.09196382869817438, + "grad_norm": 1.6944788979673775, + "learning_rate": 9.899738254052863e-05, + "loss": 0.4326, + "step": 1078 + }, + { + "epoch": 0.09204913837229142, + "grad_norm": 1.589216703811722, + "learning_rate": 9.899462790675389e-05, + "loss": 0.3788, + "step": 1079 + }, + { + "epoch": 0.09213444804640847, + "grad_norm": 1.6845066224373106, + "learning_rate": 9.899186953249965e-05, + "loss": 0.3978, + "step": 1080 + }, + { + "epoch": 0.09221975772052551, + "grad_norm": 1.5364348156524648, + "learning_rate": 9.898910741797652e-05, + "loss": 0.4035, + "step": 1081 + }, + { + "epoch": 0.09230506739464256, + "grad_norm": 1.6268229896183486, + "learning_rate": 9.898634156339534e-05, + "loss": 0.3581, + "step": 1082 + }, + { + "epoch": 0.09239037706875959, + "grad_norm": 1.4574031406365084, + "learning_rate": 9.898357196896733e-05, + "loss": 0.3842, + "step": 1083 + }, + { + "epoch": 0.09247568674287664, + "grad_norm": 1.4712451275507412, + "learning_rate": 9.898079863490389e-05, + "loss": 0.3898, + "step": 1084 + }, + { + "epoch": 0.09256099641699368, + "grad_norm": 1.4833253060170108, + "learning_rate": 9.897802156141673e-05, + "loss": 0.3747, + "step": 1085 + }, + { + "epoch": 0.09264630609111073, + "grad_norm": 1.7434297930773963, + "learning_rate": 9.897524074871792e-05, + "loss": 0.4327, + "step": 1086 + }, + { + "epoch": 0.09273161576522777, + "grad_norm": 1.4671203012890899, + "learning_rate": 9.897245619701972e-05, + "loss": 0.3948, + "step": 1087 + }, + { + "epoch": 0.09281692543934482, + "grad_norm": 1.5519702878742814, + "learning_rate": 9.896966790653475e-05, + "loss": 0.4525, + "step": 1088 + }, + { + "epoch": 0.09290223511346186, + "grad_norm": 1.509922113241415, + "learning_rate": 9.896687587747584e-05, + "loss": 0.3953, + "step": 1089 + }, + { + "epoch": 0.09298754478757891, + "grad_norm": 1.5969285123634192, + "learning_rate": 9.896408011005617e-05, + "loss": 0.4278, + "step": 1090 + }, + { + "epoch": 0.09307285446169596, + "grad_norm": 1.4260223602853117, + "learning_rate": 9.896128060448917e-05, + "loss": 0.3696, + "step": 1091 + }, + { + "epoch": 0.093158164135813, + "grad_norm": 1.948090802193288, + "learning_rate": 9.89584773609886e-05, + "loss": 0.4142, + "step": 1092 + }, + { + "epoch": 0.09324347380993005, + "grad_norm": 1.650198623955504, + "learning_rate": 9.895567037976842e-05, + "loss": 0.4049, + "step": 1093 + }, + { + "epoch": 0.09332878348404709, + "grad_norm": 1.2563500283875293, + "learning_rate": 9.895285966104298e-05, + "loss": 0.3523, + "step": 1094 + }, + { + "epoch": 0.09341409315816414, + "grad_norm": 1.6418263714301033, + "learning_rate": 9.895004520502683e-05, + "loss": 0.4137, + "step": 1095 + }, + { + "epoch": 0.09349940283228118, + "grad_norm": 1.3887513214434664, + "learning_rate": 9.894722701193486e-05, + "loss": 0.37, + "step": 1096 + }, + { + "epoch": 0.09358471250639823, + "grad_norm": 1.632794320767332, + "learning_rate": 9.894440508198219e-05, + "loss": 0.3961, + "step": 1097 + }, + { + "epoch": 0.09367002218051527, + "grad_norm": 1.6558447288014835, + "learning_rate": 9.894157941538428e-05, + "loss": 0.3836, + "step": 1098 + }, + { + "epoch": 0.09375533185463232, + "grad_norm": 1.4728806221025965, + "learning_rate": 9.893875001235689e-05, + "loss": 0.3982, + "step": 1099 + }, + { + "epoch": 0.09384064152874937, + "grad_norm": 1.7992775155863208, + "learning_rate": 9.893591687311598e-05, + "loss": 0.4274, + "step": 1100 + }, + { + "epoch": 0.09392595120286641, + "grad_norm": 1.6533621316883294, + "learning_rate": 9.893307999787787e-05, + "loss": 0.3966, + "step": 1101 + }, + { + "epoch": 0.09401126087698344, + "grad_norm": 1.9266611685008936, + "learning_rate": 9.893023938685911e-05, + "loss": 0.3717, + "step": 1102 + }, + { + "epoch": 0.09409657055110049, + "grad_norm": 1.6508890155616975, + "learning_rate": 9.89273950402766e-05, + "loss": 0.4307, + "step": 1103 + }, + { + "epoch": 0.09418188022521753, + "grad_norm": 1.5506256563553502, + "learning_rate": 9.892454695834747e-05, + "loss": 0.4003, + "step": 1104 + }, + { + "epoch": 0.09426718989933458, + "grad_norm": 1.7875866528828586, + "learning_rate": 9.892169514128919e-05, + "loss": 0.3843, + "step": 1105 + }, + { + "epoch": 0.09435249957345163, + "grad_norm": 1.5571259205198327, + "learning_rate": 9.891883958931943e-05, + "loss": 0.4094, + "step": 1106 + }, + { + "epoch": 0.09443780924756867, + "grad_norm": 1.6311131445525264, + "learning_rate": 9.891598030265623e-05, + "loss": 0.3992, + "step": 1107 + }, + { + "epoch": 0.09452311892168572, + "grad_norm": 1.8431034570924505, + "learning_rate": 9.891311728151789e-05, + "loss": 0.3912, + "step": 1108 + }, + { + "epoch": 0.09460842859580276, + "grad_norm": 1.6368297457329255, + "learning_rate": 9.891025052612295e-05, + "loss": 0.4616, + "step": 1109 + }, + { + "epoch": 0.09469373826991981, + "grad_norm": 1.5346020190699587, + "learning_rate": 9.890738003669029e-05, + "loss": 0.3699, + "step": 1110 + }, + { + "epoch": 0.09477904794403685, + "grad_norm": 1.8251247729145925, + "learning_rate": 9.890450581343907e-05, + "loss": 0.4143, + "step": 1111 + }, + { + "epoch": 0.0948643576181539, + "grad_norm": 1.5400596525355503, + "learning_rate": 9.89016278565887e-05, + "loss": 0.4, + "step": 1112 + }, + { + "epoch": 0.09494966729227095, + "grad_norm": 1.750011716530951, + "learning_rate": 9.88987461663589e-05, + "loss": 0.4156, + "step": 1113 + }, + { + "epoch": 0.09503497696638799, + "grad_norm": 1.3417097171645092, + "learning_rate": 9.889586074296968e-05, + "loss": 0.4015, + "step": 1114 + }, + { + "epoch": 0.09512028664050504, + "grad_norm": 1.8260464059086732, + "learning_rate": 9.889297158664134e-05, + "loss": 0.4168, + "step": 1115 + }, + { + "epoch": 0.09520559631462208, + "grad_norm": 1.6398348404132068, + "learning_rate": 9.889007869759442e-05, + "loss": 0.4124, + "step": 1116 + }, + { + "epoch": 0.09529090598873913, + "grad_norm": 1.5294689018377727, + "learning_rate": 9.88871820760498e-05, + "loss": 0.4227, + "step": 1117 + }, + { + "epoch": 0.09537621566285617, + "grad_norm": 1.47828839872781, + "learning_rate": 9.88842817222286e-05, + "loss": 0.4052, + "step": 1118 + }, + { + "epoch": 0.09546152533697322, + "grad_norm": 1.9175464297247529, + "learning_rate": 9.88813776363523e-05, + "loss": 0.4136, + "step": 1119 + }, + { + "epoch": 0.09554683501109026, + "grad_norm": 1.5384366849579048, + "learning_rate": 9.887846981864255e-05, + "loss": 0.3643, + "step": 1120 + }, + { + "epoch": 0.0956321446852073, + "grad_norm": 1.8326630306191638, + "learning_rate": 9.887555826932136e-05, + "loss": 0.4405, + "step": 1121 + }, + { + "epoch": 0.09571745435932434, + "grad_norm": 1.507893614489566, + "learning_rate": 9.887264298861103e-05, + "loss": 0.3678, + "step": 1122 + }, + { + "epoch": 0.09580276403344139, + "grad_norm": 1.5183016550895867, + "learning_rate": 9.88697239767341e-05, + "loss": 0.411, + "step": 1123 + }, + { + "epoch": 0.09588807370755843, + "grad_norm": 1.7459468225025743, + "learning_rate": 9.886680123391347e-05, + "loss": 0.4344, + "step": 1124 + }, + { + "epoch": 0.09597338338167548, + "grad_norm": 1.62865513757338, + "learning_rate": 9.886387476037222e-05, + "loss": 0.4144, + "step": 1125 + }, + { + "epoch": 0.09605869305579252, + "grad_norm": 1.5695072975438684, + "learning_rate": 9.886094455633381e-05, + "loss": 0.3678, + "step": 1126 + }, + { + "epoch": 0.09614400272990957, + "grad_norm": 1.3191018959349314, + "learning_rate": 9.88580106220219e-05, + "loss": 0.4078, + "step": 1127 + }, + { + "epoch": 0.09622931240402662, + "grad_norm": 1.5473825411983755, + "learning_rate": 9.885507295766054e-05, + "loss": 0.3629, + "step": 1128 + }, + { + "epoch": 0.09631462207814366, + "grad_norm": 1.4698459513386102, + "learning_rate": 9.885213156347398e-05, + "loss": 0.3643, + "step": 1129 + }, + { + "epoch": 0.0963999317522607, + "grad_norm": 1.4540536845710843, + "learning_rate": 9.884918643968676e-05, + "loss": 0.3825, + "step": 1130 + }, + { + "epoch": 0.09648524142637775, + "grad_norm": 1.6116368963154155, + "learning_rate": 9.884623758652373e-05, + "loss": 0.3739, + "step": 1131 + }, + { + "epoch": 0.0965705511004948, + "grad_norm": 1.2001706300064445, + "learning_rate": 9.884328500421005e-05, + "loss": 0.3662, + "step": 1132 + }, + { + "epoch": 0.09665586077461184, + "grad_norm": 2.0876537791733516, + "learning_rate": 9.884032869297111e-05, + "loss": 0.3898, + "step": 1133 + }, + { + "epoch": 0.09674117044872889, + "grad_norm": 1.7509255005539186, + "learning_rate": 9.88373686530326e-05, + "loss": 0.3524, + "step": 1134 + }, + { + "epoch": 0.09682648012284593, + "grad_norm": 1.6995866048484765, + "learning_rate": 9.883440488462051e-05, + "loss": 0.4097, + "step": 1135 + }, + { + "epoch": 0.09691178979696298, + "grad_norm": 1.7312260815452045, + "learning_rate": 9.883143738796113e-05, + "loss": 0.4218, + "step": 1136 + }, + { + "epoch": 0.09699709947108003, + "grad_norm": 1.7035071268024957, + "learning_rate": 9.882846616328099e-05, + "loss": 0.4213, + "step": 1137 + }, + { + "epoch": 0.09708240914519707, + "grad_norm": 1.5778563620170611, + "learning_rate": 9.882549121080694e-05, + "loss": 0.3801, + "step": 1138 + }, + { + "epoch": 0.09716771881931412, + "grad_norm": 1.4849994349879978, + "learning_rate": 9.882251253076606e-05, + "loss": 0.3667, + "step": 1139 + }, + { + "epoch": 0.09725302849343115, + "grad_norm": 1.616663013860502, + "learning_rate": 9.881953012338583e-05, + "loss": 0.4065, + "step": 1140 + }, + { + "epoch": 0.0973383381675482, + "grad_norm": 1.581303709422227, + "learning_rate": 9.881654398889389e-05, + "loss": 0.3948, + "step": 1141 + }, + { + "epoch": 0.09742364784166524, + "grad_norm": 1.598247241557984, + "learning_rate": 9.881355412751822e-05, + "loss": 0.4098, + "step": 1142 + }, + { + "epoch": 0.09750895751578229, + "grad_norm": 1.5234990034159601, + "learning_rate": 9.88105605394871e-05, + "loss": 0.4214, + "step": 1143 + }, + { + "epoch": 0.09759426718989933, + "grad_norm": 1.5790295416379894, + "learning_rate": 9.880756322502904e-05, + "loss": 0.4273, + "step": 1144 + }, + { + "epoch": 0.09767957686401638, + "grad_norm": 1.985062007175787, + "learning_rate": 9.88045621843729e-05, + "loss": 0.4206, + "step": 1145 + }, + { + "epoch": 0.09776488653813342, + "grad_norm": 1.4718279140578403, + "learning_rate": 9.880155741774779e-05, + "loss": 0.4045, + "step": 1146 + }, + { + "epoch": 0.09785019621225047, + "grad_norm": 1.7200293894408203, + "learning_rate": 9.87985489253831e-05, + "loss": 0.4221, + "step": 1147 + }, + { + "epoch": 0.09793550588636751, + "grad_norm": 1.3375900487801116, + "learning_rate": 9.879553670750852e-05, + "loss": 0.4364, + "step": 1148 + }, + { + "epoch": 0.09802081556048456, + "grad_norm": 1.4401269213055474, + "learning_rate": 9.879252076435402e-05, + "loss": 0.4302, + "step": 1149 + }, + { + "epoch": 0.0981061252346016, + "grad_norm": 1.5858898437555806, + "learning_rate": 9.878950109614982e-05, + "loss": 0.3944, + "step": 1150 + }, + { + "epoch": 0.09819143490871865, + "grad_norm": 1.6086530825477052, + "learning_rate": 9.878647770312649e-05, + "loss": 0.4261, + "step": 1151 + }, + { + "epoch": 0.0982767445828357, + "grad_norm": 1.4579444457565012, + "learning_rate": 9.878345058551484e-05, + "loss": 0.3774, + "step": 1152 + }, + { + "epoch": 0.09836205425695274, + "grad_norm": 1.6876734891649727, + "learning_rate": 9.878041974354598e-05, + "loss": 0.4191, + "step": 1153 + }, + { + "epoch": 0.09844736393106979, + "grad_norm": 1.4242374789313832, + "learning_rate": 9.877738517745127e-05, + "loss": 0.4332, + "step": 1154 + }, + { + "epoch": 0.09853267360518683, + "grad_norm": 1.5677374988094595, + "learning_rate": 9.877434688746241e-05, + "loss": 0.4124, + "step": 1155 + }, + { + "epoch": 0.09861798327930388, + "grad_norm": 1.5289290608228245, + "learning_rate": 9.877130487381137e-05, + "loss": 0.3929, + "step": 1156 + }, + { + "epoch": 0.09870329295342092, + "grad_norm": 1.776384769832928, + "learning_rate": 9.876825913673036e-05, + "loss": 0.439, + "step": 1157 + }, + { + "epoch": 0.09878860262753797, + "grad_norm": 1.8766999485775469, + "learning_rate": 9.876520967645191e-05, + "loss": 0.4472, + "step": 1158 + }, + { + "epoch": 0.098873912301655, + "grad_norm": 1.6100449880391952, + "learning_rate": 9.876215649320885e-05, + "loss": 0.431, + "step": 1159 + }, + { + "epoch": 0.09895922197577205, + "grad_norm": 1.6265385020425152, + "learning_rate": 9.875909958723426e-05, + "loss": 0.4094, + "step": 1160 + }, + { + "epoch": 0.09904453164988909, + "grad_norm": 1.2372097831043276, + "learning_rate": 9.875603895876154e-05, + "loss": 0.3572, + "step": 1161 + }, + { + "epoch": 0.09912984132400614, + "grad_norm": 1.67016242899159, + "learning_rate": 9.875297460802431e-05, + "loss": 0.386, + "step": 1162 + }, + { + "epoch": 0.09921515099812318, + "grad_norm": 1.7102329423419294, + "learning_rate": 9.874990653525656e-05, + "loss": 0.378, + "step": 1163 + }, + { + "epoch": 0.09930046067224023, + "grad_norm": 1.3530464184785223, + "learning_rate": 9.874683474069248e-05, + "loss": 0.3678, + "step": 1164 + }, + { + "epoch": 0.09938577034635727, + "grad_norm": 1.6979773886964495, + "learning_rate": 9.874375922456662e-05, + "loss": 0.4041, + "step": 1165 + }, + { + "epoch": 0.09947108002047432, + "grad_norm": 1.598814006177353, + "learning_rate": 9.874067998711378e-05, + "loss": 0.4096, + "step": 1166 + }, + { + "epoch": 0.09955638969459137, + "grad_norm": 1.8828590988787421, + "learning_rate": 9.873759702856901e-05, + "loss": 0.3822, + "step": 1167 + }, + { + "epoch": 0.09964169936870841, + "grad_norm": 1.5416888759491065, + "learning_rate": 9.873451034916772e-05, + "loss": 0.4037, + "step": 1168 + }, + { + "epoch": 0.09972700904282546, + "grad_norm": 1.5233080686704237, + "learning_rate": 9.873141994914553e-05, + "loss": 0.3681, + "step": 1169 + }, + { + "epoch": 0.0998123187169425, + "grad_norm": 1.5702742386896573, + "learning_rate": 9.872832582873837e-05, + "loss": 0.4108, + "step": 1170 + }, + { + "epoch": 0.09989762839105955, + "grad_norm": 1.5870196779758186, + "learning_rate": 9.87252279881825e-05, + "loss": 0.388, + "step": 1171 + }, + { + "epoch": 0.0999829380651766, + "grad_norm": 1.5261349004011449, + "learning_rate": 9.872212642771439e-05, + "loss": 0.3727, + "step": 1172 + }, + { + "epoch": 0.10006824773929364, + "grad_norm": 1.7874588754898342, + "learning_rate": 9.871902114757084e-05, + "loss": 0.45, + "step": 1173 + }, + { + "epoch": 0.10015355741341068, + "grad_norm": 1.3776359001595844, + "learning_rate": 9.87159121479889e-05, + "loss": 0.4098, + "step": 1174 + }, + { + "epoch": 0.10023886708752773, + "grad_norm": 1.609574907349762, + "learning_rate": 9.871279942920595e-05, + "loss": 0.355, + "step": 1175 + }, + { + "epoch": 0.10032417676164478, + "grad_norm": 1.7237507982369746, + "learning_rate": 9.870968299145965e-05, + "loss": 0.3967, + "step": 1176 + }, + { + "epoch": 0.10040948643576182, + "grad_norm": 1.397749960899182, + "learning_rate": 9.870656283498786e-05, + "loss": 0.3615, + "step": 1177 + }, + { + "epoch": 0.10049479610987885, + "grad_norm": 1.4835170474922745, + "learning_rate": 9.870343896002884e-05, + "loss": 0.3363, + "step": 1178 + }, + { + "epoch": 0.1005801057839959, + "grad_norm": 1.8112041839849518, + "learning_rate": 9.870031136682107e-05, + "loss": 0.3984, + "step": 1179 + }, + { + "epoch": 0.10066541545811294, + "grad_norm": 1.7234056015036947, + "learning_rate": 9.869718005560331e-05, + "loss": 0.3869, + "step": 1180 + }, + { + "epoch": 0.10075072513222999, + "grad_norm": 1.8164199664510667, + "learning_rate": 9.869404502661464e-05, + "loss": 0.4562, + "step": 1181 + }, + { + "epoch": 0.10083603480634704, + "grad_norm": 1.8665875863740202, + "learning_rate": 9.869090628009438e-05, + "loss": 0.4412, + "step": 1182 + }, + { + "epoch": 0.10092134448046408, + "grad_norm": 1.6523345153406248, + "learning_rate": 9.868776381628218e-05, + "loss": 0.3965, + "step": 1183 + }, + { + "epoch": 0.10100665415458113, + "grad_norm": 1.7243462747446727, + "learning_rate": 9.868461763541791e-05, + "loss": 0.4083, + "step": 1184 + }, + { + "epoch": 0.10109196382869817, + "grad_norm": 1.8541724244217646, + "learning_rate": 9.868146773774183e-05, + "loss": 0.3999, + "step": 1185 + }, + { + "epoch": 0.10117727350281522, + "grad_norm": 1.589616763995373, + "learning_rate": 9.867831412349438e-05, + "loss": 0.3402, + "step": 1186 + }, + { + "epoch": 0.10126258317693226, + "grad_norm": 1.7469538334782408, + "learning_rate": 9.86751567929163e-05, + "loss": 0.4134, + "step": 1187 + }, + { + "epoch": 0.10134789285104931, + "grad_norm": 1.1933070794117626, + "learning_rate": 9.867199574624867e-05, + "loss": 0.3199, + "step": 1188 + }, + { + "epoch": 0.10143320252516635, + "grad_norm": 1.4790102737204052, + "learning_rate": 9.86688309837328e-05, + "loss": 0.3539, + "step": 1189 + }, + { + "epoch": 0.1015185121992834, + "grad_norm": 1.6499019824883077, + "learning_rate": 9.866566250561033e-05, + "loss": 0.4077, + "step": 1190 + }, + { + "epoch": 0.10160382187340045, + "grad_norm": 1.780945601303489, + "learning_rate": 9.866249031212311e-05, + "loss": 0.4102, + "step": 1191 + }, + { + "epoch": 0.10168913154751749, + "grad_norm": 1.545558099158801, + "learning_rate": 9.865931440351337e-05, + "loss": 0.355, + "step": 1192 + }, + { + "epoch": 0.10177444122163454, + "grad_norm": 1.3645443898630554, + "learning_rate": 9.865613478002354e-05, + "loss": 0.3488, + "step": 1193 + }, + { + "epoch": 0.10185975089575158, + "grad_norm": 1.4253241655490843, + "learning_rate": 9.865295144189638e-05, + "loss": 0.3726, + "step": 1194 + }, + { + "epoch": 0.10194506056986863, + "grad_norm": 1.6206468816501767, + "learning_rate": 9.864976438937493e-05, + "loss": 0.4445, + "step": 1195 + }, + { + "epoch": 0.10203037024398567, + "grad_norm": 1.5960701901960548, + "learning_rate": 9.864657362270247e-05, + "loss": 0.4029, + "step": 1196 + }, + { + "epoch": 0.10211567991810272, + "grad_norm": 1.8306916957907553, + "learning_rate": 9.864337914212263e-05, + "loss": 0.4351, + "step": 1197 + }, + { + "epoch": 0.10220098959221975, + "grad_norm": 1.576321090196278, + "learning_rate": 9.864018094787928e-05, + "loss": 0.4124, + "step": 1198 + }, + { + "epoch": 0.1022862992663368, + "grad_norm": 1.7461625668066525, + "learning_rate": 9.863697904021661e-05, + "loss": 0.4567, + "step": 1199 + }, + { + "epoch": 0.10237160894045384, + "grad_norm": 1.6362002332097125, + "learning_rate": 9.863377341937903e-05, + "loss": 0.4255, + "step": 1200 + }, + { + "epoch": 0.10245691861457089, + "grad_norm": 1.51766751491278, + "learning_rate": 9.863056408561129e-05, + "loss": 0.3907, + "step": 1201 + }, + { + "epoch": 0.10254222828868793, + "grad_norm": 1.9369109704270973, + "learning_rate": 9.86273510391584e-05, + "loss": 0.4045, + "step": 1202 + }, + { + "epoch": 0.10262753796280498, + "grad_norm": 1.5247763203908735, + "learning_rate": 9.862413428026567e-05, + "loss": 0.3816, + "step": 1203 + }, + { + "epoch": 0.10271284763692203, + "grad_norm": 1.657787760730636, + "learning_rate": 9.862091380917868e-05, + "loss": 0.4283, + "step": 1204 + }, + { + "epoch": 0.10279815731103907, + "grad_norm": 1.6339007679749813, + "learning_rate": 9.861768962614328e-05, + "loss": 0.4271, + "step": 1205 + }, + { + "epoch": 0.10288346698515612, + "grad_norm": 1.476235489737626, + "learning_rate": 9.861446173140563e-05, + "loss": 0.4041, + "step": 1206 + }, + { + "epoch": 0.10296877665927316, + "grad_norm": 1.6481990551359238, + "learning_rate": 9.861123012521219e-05, + "loss": 0.4019, + "step": 1207 + }, + { + "epoch": 0.10305408633339021, + "grad_norm": 1.6030342596894294, + "learning_rate": 9.860799480780963e-05, + "loss": 0.4216, + "step": 1208 + }, + { + "epoch": 0.10313939600750725, + "grad_norm": 1.6063339983280471, + "learning_rate": 9.860475577944497e-05, + "loss": 0.4122, + "step": 1209 + }, + { + "epoch": 0.1032247056816243, + "grad_norm": 1.44555859919143, + "learning_rate": 9.86015130403655e-05, + "loss": 0.3574, + "step": 1210 + }, + { + "epoch": 0.10331001535574134, + "grad_norm": 1.5948460867591736, + "learning_rate": 9.859826659081875e-05, + "loss": 0.3932, + "step": 1211 + }, + { + "epoch": 0.10339532502985839, + "grad_norm": 1.7040489560289802, + "learning_rate": 9.859501643105262e-05, + "loss": 0.4171, + "step": 1212 + }, + { + "epoch": 0.10348063470397544, + "grad_norm": 1.49418338888966, + "learning_rate": 9.859176256131522e-05, + "loss": 0.3896, + "step": 1213 + }, + { + "epoch": 0.10356594437809248, + "grad_norm": 1.6102877500717088, + "learning_rate": 9.858850498185496e-05, + "loss": 0.3851, + "step": 1214 + }, + { + "epoch": 0.10365125405220953, + "grad_norm": 1.4004155666199827, + "learning_rate": 9.858524369292054e-05, + "loss": 0.3586, + "step": 1215 + }, + { + "epoch": 0.10373656372632657, + "grad_norm": 1.5642049261558193, + "learning_rate": 9.858197869476096e-05, + "loss": 0.3681, + "step": 1216 + }, + { + "epoch": 0.1038218734004436, + "grad_norm": 1.6491179304871362, + "learning_rate": 9.857870998762544e-05, + "loss": 0.4156, + "step": 1217 + }, + { + "epoch": 0.10390718307456065, + "grad_norm": 1.4927825540146684, + "learning_rate": 9.85754375717636e-05, + "loss": 0.4352, + "step": 1218 + }, + { + "epoch": 0.1039924927486777, + "grad_norm": 1.601007579406677, + "learning_rate": 9.85721614474252e-05, + "loss": 0.4239, + "step": 1219 + }, + { + "epoch": 0.10407780242279474, + "grad_norm": 1.4816717328105489, + "learning_rate": 9.85688816148604e-05, + "loss": 0.3902, + "step": 1220 + }, + { + "epoch": 0.10416311209691179, + "grad_norm": 1.5008161231958201, + "learning_rate": 9.856559807431958e-05, + "loss": 0.3427, + "step": 1221 + }, + { + "epoch": 0.10424842177102883, + "grad_norm": 1.5248566106704817, + "learning_rate": 9.856231082605342e-05, + "loss": 0.3898, + "step": 1222 + }, + { + "epoch": 0.10433373144514588, + "grad_norm": 1.5842109640404005, + "learning_rate": 9.855901987031289e-05, + "loss": 0.4067, + "step": 1223 + }, + { + "epoch": 0.10441904111926292, + "grad_norm": 1.396698877893307, + "learning_rate": 9.855572520734923e-05, + "loss": 0.3882, + "step": 1224 + }, + { + "epoch": 0.10450435079337997, + "grad_norm": 1.5758987435910594, + "learning_rate": 9.8552426837414e-05, + "loss": 0.395, + "step": 1225 + }, + { + "epoch": 0.10458966046749701, + "grad_norm": 1.5757710491979335, + "learning_rate": 9.854912476075897e-05, + "loss": 0.3997, + "step": 1226 + }, + { + "epoch": 0.10467497014161406, + "grad_norm": 1.7864627522969065, + "learning_rate": 9.854581897763626e-05, + "loss": 0.37, + "step": 1227 + }, + { + "epoch": 0.1047602798157311, + "grad_norm": 1.5659695061629322, + "learning_rate": 9.854250948829824e-05, + "loss": 0.4159, + "step": 1228 + }, + { + "epoch": 0.10484558948984815, + "grad_norm": 1.4632729207793176, + "learning_rate": 9.853919629299758e-05, + "loss": 0.3841, + "step": 1229 + }, + { + "epoch": 0.1049308991639652, + "grad_norm": 1.5297284254263077, + "learning_rate": 9.853587939198721e-05, + "loss": 0.4282, + "step": 1230 + }, + { + "epoch": 0.10501620883808224, + "grad_norm": 1.9485977804402128, + "learning_rate": 9.853255878552036e-05, + "loss": 0.4081, + "step": 1231 + }, + { + "epoch": 0.10510151851219929, + "grad_norm": 1.4308508637112918, + "learning_rate": 9.852923447385056e-05, + "loss": 0.4181, + "step": 1232 + }, + { + "epoch": 0.10518682818631633, + "grad_norm": 1.6101219147672696, + "learning_rate": 9.85259064572316e-05, + "loss": 0.3784, + "step": 1233 + }, + { + "epoch": 0.10527213786043338, + "grad_norm": 1.4463636545216476, + "learning_rate": 9.852257473591754e-05, + "loss": 0.3869, + "step": 1234 + }, + { + "epoch": 0.10535744753455042, + "grad_norm": 1.326822786889586, + "learning_rate": 9.851923931016275e-05, + "loss": 0.3678, + "step": 1235 + }, + { + "epoch": 0.10544275720866746, + "grad_norm": 1.5155002503327226, + "learning_rate": 9.851590018022187e-05, + "loss": 0.425, + "step": 1236 + }, + { + "epoch": 0.1055280668827845, + "grad_norm": 1.4388646821318285, + "learning_rate": 9.851255734634983e-05, + "loss": 0.349, + "step": 1237 + }, + { + "epoch": 0.10561337655690155, + "grad_norm": 1.6247457158598184, + "learning_rate": 9.850921080880183e-05, + "loss": 0.41, + "step": 1238 + }, + { + "epoch": 0.1056986862310186, + "grad_norm": 1.6010424258003428, + "learning_rate": 9.850586056783334e-05, + "loss": 0.4321, + "step": 1239 + }, + { + "epoch": 0.10578399590513564, + "grad_norm": 1.3526174584539745, + "learning_rate": 9.850250662370017e-05, + "loss": 0.3773, + "step": 1240 + }, + { + "epoch": 0.10586930557925268, + "grad_norm": 1.6244951344209075, + "learning_rate": 9.849914897665837e-05, + "loss": 0.4125, + "step": 1241 + }, + { + "epoch": 0.10595461525336973, + "grad_norm": 1.7072075026106859, + "learning_rate": 9.849578762696426e-05, + "loss": 0.4521, + "step": 1242 + }, + { + "epoch": 0.10603992492748678, + "grad_norm": 1.85888819372561, + "learning_rate": 9.849242257487447e-05, + "loss": 0.3828, + "step": 1243 + }, + { + "epoch": 0.10612523460160382, + "grad_norm": 1.4227107180580392, + "learning_rate": 9.848905382064591e-05, + "loss": 0.396, + "step": 1244 + }, + { + "epoch": 0.10621054427572087, + "grad_norm": 1.48960781299599, + "learning_rate": 9.848568136453577e-05, + "loss": 0.4042, + "step": 1245 + }, + { + "epoch": 0.10629585394983791, + "grad_norm": 1.5669292994107318, + "learning_rate": 9.84823052068015e-05, + "loss": 0.4538, + "step": 1246 + }, + { + "epoch": 0.10638116362395496, + "grad_norm": 1.672856408249695, + "learning_rate": 9.847892534770086e-05, + "loss": 0.4006, + "step": 1247 + }, + { + "epoch": 0.106466473298072, + "grad_norm": 1.9015568979975386, + "learning_rate": 9.84755417874919e-05, + "loss": 0.4345, + "step": 1248 + }, + { + "epoch": 0.10655178297218905, + "grad_norm": 1.6684366682114486, + "learning_rate": 9.84721545264329e-05, + "loss": 0.4623, + "step": 1249 + }, + { + "epoch": 0.1066370926463061, + "grad_norm": 1.6426271846307634, + "learning_rate": 9.84687635647825e-05, + "loss": 0.3938, + "step": 1250 + }, + { + "epoch": 0.10672240232042314, + "grad_norm": 1.478091139939872, + "learning_rate": 9.846536890279956e-05, + "loss": 0.351, + "step": 1251 + }, + { + "epoch": 0.10680771199454019, + "grad_norm": 1.4979277442801942, + "learning_rate": 9.846197054074325e-05, + "loss": 0.5531, + "step": 1252 + }, + { + "epoch": 0.10689302166865723, + "grad_norm": 1.4097373107620492, + "learning_rate": 9.845856847887302e-05, + "loss": 0.3603, + "step": 1253 + }, + { + "epoch": 0.10697833134277428, + "grad_norm": 1.5448990348738334, + "learning_rate": 9.84551627174486e-05, + "loss": 0.3629, + "step": 1254 + }, + { + "epoch": 0.10706364101689131, + "grad_norm": 1.7209932426927315, + "learning_rate": 9.845175325672998e-05, + "loss": 0.4615, + "step": 1255 + }, + { + "epoch": 0.10714895069100835, + "grad_norm": 1.5232230793614938, + "learning_rate": 9.844834009697748e-05, + "loss": 0.3495, + "step": 1256 + }, + { + "epoch": 0.1072342603651254, + "grad_norm": 2.080378636963071, + "learning_rate": 9.844492323845167e-05, + "loss": 0.3665, + "step": 1257 + }, + { + "epoch": 0.10731957003924245, + "grad_norm": 1.5691315874556921, + "learning_rate": 9.844150268141338e-05, + "loss": 0.3874, + "step": 1258 + }, + { + "epoch": 0.10740487971335949, + "grad_norm": 1.830554879702474, + "learning_rate": 9.843807842612383e-05, + "loss": 0.4361, + "step": 1259 + }, + { + "epoch": 0.10749018938747654, + "grad_norm": 1.3373991348879906, + "learning_rate": 9.843465047284434e-05, + "loss": 0.3539, + "step": 1260 + }, + { + "epoch": 0.10757549906159358, + "grad_norm": 1.6267925059520882, + "learning_rate": 9.84312188218367e-05, + "loss": 0.4305, + "step": 1261 + }, + { + "epoch": 0.10766080873571063, + "grad_norm": 1.483530948991082, + "learning_rate": 9.842778347336286e-05, + "loss": 0.3691, + "step": 1262 + }, + { + "epoch": 0.10774611840982767, + "grad_norm": 1.6209282848066056, + "learning_rate": 9.84243444276851e-05, + "loss": 0.4131, + "step": 1263 + }, + { + "epoch": 0.10783142808394472, + "grad_norm": 1.3782218985540946, + "learning_rate": 9.842090168506596e-05, + "loss": 0.4179, + "step": 1264 + }, + { + "epoch": 0.10791673775806176, + "grad_norm": 1.626571482286378, + "learning_rate": 9.841745524576829e-05, + "loss": 0.4032, + "step": 1265 + }, + { + "epoch": 0.10800204743217881, + "grad_norm": 1.5944011330490948, + "learning_rate": 9.84140051100552e-05, + "loss": 0.3961, + "step": 1266 + }, + { + "epoch": 0.10808735710629586, + "grad_norm": 1.1376499936487545, + "learning_rate": 9.841055127819009e-05, + "loss": 0.3187, + "step": 1267 + }, + { + "epoch": 0.1081726667804129, + "grad_norm": 1.7387678172599317, + "learning_rate": 9.840709375043663e-05, + "loss": 0.4055, + "step": 1268 + }, + { + "epoch": 0.10825797645452995, + "grad_norm": 1.32277986114486, + "learning_rate": 9.840363252705882e-05, + "loss": 0.3478, + "step": 1269 + }, + { + "epoch": 0.10834328612864699, + "grad_norm": 1.7420258725037387, + "learning_rate": 9.840016760832088e-05, + "loss": 0.3884, + "step": 1270 + }, + { + "epoch": 0.10842859580276404, + "grad_norm": 1.779872696428237, + "learning_rate": 9.839669899448733e-05, + "loss": 0.3929, + "step": 1271 + }, + { + "epoch": 0.10851390547688108, + "grad_norm": 1.834160646980156, + "learning_rate": 9.8393226685823e-05, + "loss": 0.4124, + "step": 1272 + }, + { + "epoch": 0.10859921515099813, + "grad_norm": 1.621495870330285, + "learning_rate": 9.838975068259297e-05, + "loss": 0.3922, + "step": 1273 + }, + { + "epoch": 0.10868452482511516, + "grad_norm": 1.5903085689904624, + "learning_rate": 9.838627098506264e-05, + "loss": 0.3317, + "step": 1274 + }, + { + "epoch": 0.1087698344992322, + "grad_norm": 1.5102437238229727, + "learning_rate": 9.838278759349762e-05, + "loss": 0.3669, + "step": 1275 + }, + { + "epoch": 0.10885514417334925, + "grad_norm": 1.9501843096365878, + "learning_rate": 9.837930050816387e-05, + "loss": 0.4103, + "step": 1276 + }, + { + "epoch": 0.1089404538474663, + "grad_norm": 1.6191298020832865, + "learning_rate": 9.837580972932762e-05, + "loss": 0.4041, + "step": 1277 + }, + { + "epoch": 0.10902576352158334, + "grad_norm": 1.9215272418236897, + "learning_rate": 9.837231525725537e-05, + "loss": 0.4508, + "step": 1278 + }, + { + "epoch": 0.10911107319570039, + "grad_norm": 1.4453831011050189, + "learning_rate": 9.836881709221391e-05, + "loss": 0.4119, + "step": 1279 + }, + { + "epoch": 0.10919638286981743, + "grad_norm": 1.6616362314041955, + "learning_rate": 9.836531523447028e-05, + "loss": 0.3843, + "step": 1280 + }, + { + "epoch": 0.10928169254393448, + "grad_norm": 1.7282234591533776, + "learning_rate": 9.836180968429185e-05, + "loss": 0.4232, + "step": 1281 + }, + { + "epoch": 0.10936700221805153, + "grad_norm": 1.6753903802332095, + "learning_rate": 9.835830044194625e-05, + "loss": 0.3911, + "step": 1282 + }, + { + "epoch": 0.10945231189216857, + "grad_norm": 1.5600835807124547, + "learning_rate": 9.835478750770137e-05, + "loss": 0.4044, + "step": 1283 + }, + { + "epoch": 0.10953762156628562, + "grad_norm": 1.633122902893285, + "learning_rate": 9.835127088182543e-05, + "loss": 0.3767, + "step": 1284 + }, + { + "epoch": 0.10962293124040266, + "grad_norm": 1.6126620603045914, + "learning_rate": 9.834775056458691e-05, + "loss": 0.4242, + "step": 1285 + }, + { + "epoch": 0.10970824091451971, + "grad_norm": 1.4246066336870817, + "learning_rate": 9.834422655625454e-05, + "loss": 0.3948, + "step": 1286 + }, + { + "epoch": 0.10979355058863675, + "grad_norm": 1.5370305779120086, + "learning_rate": 9.834069885709738e-05, + "loss": 0.3835, + "step": 1287 + }, + { + "epoch": 0.1098788602627538, + "grad_norm": 1.568087619595953, + "learning_rate": 9.833716746738474e-05, + "loss": 0.3756, + "step": 1288 + }, + { + "epoch": 0.10996416993687085, + "grad_norm": 1.6278975402916556, + "learning_rate": 9.833363238738623e-05, + "loss": 0.4073, + "step": 1289 + }, + { + "epoch": 0.11004947961098789, + "grad_norm": 1.9149982889630912, + "learning_rate": 9.833009361737174e-05, + "loss": 0.4147, + "step": 1290 + }, + { + "epoch": 0.11013478928510494, + "grad_norm": 1.5711263948195051, + "learning_rate": 9.83265511576114e-05, + "loss": 0.3836, + "step": 1291 + }, + { + "epoch": 0.11022009895922198, + "grad_norm": 1.7813469040595487, + "learning_rate": 9.83230050083757e-05, + "loss": 0.4483, + "step": 1292 + }, + { + "epoch": 0.11030540863333901, + "grad_norm": 1.429942540835023, + "learning_rate": 9.831945516993537e-05, + "loss": 0.4145, + "step": 1293 + }, + { + "epoch": 0.11039071830745606, + "grad_norm": 1.7153658756312815, + "learning_rate": 9.831590164256139e-05, + "loss": 0.4321, + "step": 1294 + }, + { + "epoch": 0.1104760279815731, + "grad_norm": 1.2645690657404158, + "learning_rate": 9.831234442652508e-05, + "loss": 0.332, + "step": 1295 + }, + { + "epoch": 0.11056133765569015, + "grad_norm": 2.0952090972487323, + "learning_rate": 9.8308783522098e-05, + "loss": 0.4872, + "step": 1296 + }, + { + "epoch": 0.1106466473298072, + "grad_norm": 1.690779572082313, + "learning_rate": 9.830521892955202e-05, + "loss": 0.4693, + "step": 1297 + }, + { + "epoch": 0.11073195700392424, + "grad_norm": 1.3825371247147757, + "learning_rate": 9.830165064915926e-05, + "loss": 0.3593, + "step": 1298 + }, + { + "epoch": 0.11081726667804129, + "grad_norm": 1.5723969398773467, + "learning_rate": 9.829807868119214e-05, + "loss": 0.4269, + "step": 1299 + }, + { + "epoch": 0.11090257635215833, + "grad_norm": 1.5077870421162862, + "learning_rate": 9.829450302592338e-05, + "loss": 0.3697, + "step": 1300 + }, + { + "epoch": 0.11098788602627538, + "grad_norm": 1.5145973575717588, + "learning_rate": 9.829092368362596e-05, + "loss": 0.4178, + "step": 1301 + }, + { + "epoch": 0.11107319570039242, + "grad_norm": 1.4921497120491412, + "learning_rate": 9.828734065457313e-05, + "loss": 0.3778, + "step": 1302 + }, + { + "epoch": 0.11115850537450947, + "grad_norm": 1.5286776676361635, + "learning_rate": 9.828375393903842e-05, + "loss": 0.4323, + "step": 1303 + }, + { + "epoch": 0.11124381504862652, + "grad_norm": 2.1955338227375747, + "learning_rate": 9.828016353729569e-05, + "loss": 0.4829, + "step": 1304 + }, + { + "epoch": 0.11132912472274356, + "grad_norm": 1.3011428284645377, + "learning_rate": 9.827656944961903e-05, + "loss": 0.4151, + "step": 1305 + }, + { + "epoch": 0.1114144343968606, + "grad_norm": 1.4524720068120107, + "learning_rate": 9.827297167628283e-05, + "loss": 0.3663, + "step": 1306 + }, + { + "epoch": 0.11149974407097765, + "grad_norm": 1.5432733548966495, + "learning_rate": 9.826937021756177e-05, + "loss": 0.4006, + "step": 1307 + }, + { + "epoch": 0.1115850537450947, + "grad_norm": 1.2655838853727643, + "learning_rate": 9.82657650737308e-05, + "loss": 0.3665, + "step": 1308 + }, + { + "epoch": 0.11167036341921174, + "grad_norm": 1.3456348684244057, + "learning_rate": 9.826215624506516e-05, + "loss": 0.376, + "step": 1309 + }, + { + "epoch": 0.11175567309332879, + "grad_norm": 1.6014489110347983, + "learning_rate": 9.825854373184033e-05, + "loss": 0.366, + "step": 1310 + }, + { + "epoch": 0.11184098276744583, + "grad_norm": 1.7080189795589777, + "learning_rate": 9.825492753433215e-05, + "loss": 0.4203, + "step": 1311 + }, + { + "epoch": 0.11192629244156288, + "grad_norm": 1.4676621649334691, + "learning_rate": 9.825130765281668e-05, + "loss": 0.3927, + "step": 1312 + }, + { + "epoch": 0.11201160211567991, + "grad_norm": 1.2565992676412976, + "learning_rate": 9.824768408757028e-05, + "loss": 0.377, + "step": 1313 + }, + { + "epoch": 0.11209691178979696, + "grad_norm": 1.4671182699394456, + "learning_rate": 9.824405683886957e-05, + "loss": 0.4233, + "step": 1314 + }, + { + "epoch": 0.112182221463914, + "grad_norm": 1.7965780925090677, + "learning_rate": 9.824042590699151e-05, + "loss": 0.4421, + "step": 1315 + }, + { + "epoch": 0.11226753113803105, + "grad_norm": 1.5845552631586517, + "learning_rate": 9.823679129221326e-05, + "loss": 0.4125, + "step": 1316 + }, + { + "epoch": 0.1123528408121481, + "grad_norm": 1.7729428311429618, + "learning_rate": 9.823315299481235e-05, + "loss": 0.4303, + "step": 1317 + }, + { + "epoch": 0.11243815048626514, + "grad_norm": 1.5337183772948835, + "learning_rate": 9.82295110150665e-05, + "loss": 0.3794, + "step": 1318 + }, + { + "epoch": 0.11252346016038219, + "grad_norm": 1.906214791895175, + "learning_rate": 9.822586535325378e-05, + "loss": 0.4701, + "step": 1319 + }, + { + "epoch": 0.11260876983449923, + "grad_norm": 1.5446220714704637, + "learning_rate": 9.822221600965252e-05, + "loss": 0.3735, + "step": 1320 + }, + { + "epoch": 0.11269407950861628, + "grad_norm": 1.3881394536598284, + "learning_rate": 9.821856298454131e-05, + "loss": 0.3974, + "step": 1321 + }, + { + "epoch": 0.11277938918273332, + "grad_norm": 1.5055643824326006, + "learning_rate": 9.821490627819904e-05, + "loss": 0.3758, + "step": 1322 + }, + { + "epoch": 0.11286469885685037, + "grad_norm": 1.3416855500750358, + "learning_rate": 9.821124589090491e-05, + "loss": 0.3732, + "step": 1323 + }, + { + "epoch": 0.11295000853096741, + "grad_norm": 1.5067982004499716, + "learning_rate": 9.820758182293834e-05, + "loss": 0.4057, + "step": 1324 + }, + { + "epoch": 0.11303531820508446, + "grad_norm": 1.4089252879394418, + "learning_rate": 9.820391407457907e-05, + "loss": 0.3883, + "step": 1325 + }, + { + "epoch": 0.1131206278792015, + "grad_norm": 1.2497344688675995, + "learning_rate": 9.820024264610713e-05, + "loss": 0.3774, + "step": 1326 + }, + { + "epoch": 0.11320593755331855, + "grad_norm": 1.6150411469499941, + "learning_rate": 9.81965675378028e-05, + "loss": 0.373, + "step": 1327 + }, + { + "epoch": 0.1132912472274356, + "grad_norm": 1.577665054222673, + "learning_rate": 9.819288874994663e-05, + "loss": 0.4036, + "step": 1328 + }, + { + "epoch": 0.11337655690155264, + "grad_norm": 2.0629577128837333, + "learning_rate": 9.818920628281953e-05, + "loss": 0.5256, + "step": 1329 + }, + { + "epoch": 0.11346186657566969, + "grad_norm": 1.7705437666561015, + "learning_rate": 9.818552013670258e-05, + "loss": 0.4064, + "step": 1330 + }, + { + "epoch": 0.11354717624978673, + "grad_norm": 1.2225871203687202, + "learning_rate": 9.818183031187724e-05, + "loss": 0.3702, + "step": 1331 + }, + { + "epoch": 0.11363248592390376, + "grad_norm": 1.6248428562182757, + "learning_rate": 9.81781368086252e-05, + "loss": 0.3859, + "step": 1332 + }, + { + "epoch": 0.11371779559802081, + "grad_norm": 1.9829095550663949, + "learning_rate": 9.817443962722843e-05, + "loss": 0.4094, + "step": 1333 + }, + { + "epoch": 0.11380310527213786, + "grad_norm": 1.6569095234132327, + "learning_rate": 9.817073876796918e-05, + "loss": 0.3798, + "step": 1334 + }, + { + "epoch": 0.1138884149462549, + "grad_norm": 1.4432439899910845, + "learning_rate": 9.816703423113001e-05, + "loss": 0.3956, + "step": 1335 + }, + { + "epoch": 0.11397372462037195, + "grad_norm": 1.5501731622013872, + "learning_rate": 9.816332601699374e-05, + "loss": 0.4374, + "step": 1336 + }, + { + "epoch": 0.11405903429448899, + "grad_norm": 1.5937735611913058, + "learning_rate": 9.815961412584347e-05, + "loss": 0.3575, + "step": 1337 + }, + { + "epoch": 0.11414434396860604, + "grad_norm": 1.5248041527484413, + "learning_rate": 9.815589855796259e-05, + "loss": 0.3712, + "step": 1338 + }, + { + "epoch": 0.11422965364272308, + "grad_norm": 1.5531794455980519, + "learning_rate": 9.815217931363475e-05, + "loss": 0.4478, + "step": 1339 + }, + { + "epoch": 0.11431496331684013, + "grad_norm": 1.887019415803533, + "learning_rate": 9.814845639314387e-05, + "loss": 0.4183, + "step": 1340 + }, + { + "epoch": 0.11440027299095717, + "grad_norm": 1.2627957592491499, + "learning_rate": 9.814472979677424e-05, + "loss": 0.3484, + "step": 1341 + }, + { + "epoch": 0.11448558266507422, + "grad_norm": 1.7990233473818644, + "learning_rate": 9.814099952481032e-05, + "loss": 0.433, + "step": 1342 + }, + { + "epoch": 0.11457089233919127, + "grad_norm": 1.5176635089748975, + "learning_rate": 9.81372655775369e-05, + "loss": 0.3927, + "step": 1343 + }, + { + "epoch": 0.11465620201330831, + "grad_norm": 1.6394191852044078, + "learning_rate": 9.813352795523907e-05, + "loss": 0.455, + "step": 1344 + }, + { + "epoch": 0.11474151168742536, + "grad_norm": 1.7374117959227313, + "learning_rate": 9.812978665820216e-05, + "loss": 0.4564, + "step": 1345 + }, + { + "epoch": 0.1148268213615424, + "grad_norm": 1.7858034792831694, + "learning_rate": 9.812604168671178e-05, + "loss": 0.3965, + "step": 1346 + }, + { + "epoch": 0.11491213103565945, + "grad_norm": 1.846893217510088, + "learning_rate": 9.812229304105387e-05, + "loss": 0.4022, + "step": 1347 + }, + { + "epoch": 0.1149974407097765, + "grad_norm": 1.256660502534179, + "learning_rate": 9.811854072151461e-05, + "loss": 0.3399, + "step": 1348 + }, + { + "epoch": 0.11508275038389354, + "grad_norm": 1.6968271547319982, + "learning_rate": 9.811478472838046e-05, + "loss": 0.4116, + "step": 1349 + }, + { + "epoch": 0.11516806005801059, + "grad_norm": 1.6642709520902645, + "learning_rate": 9.811102506193818e-05, + "loss": 0.3786, + "step": 1350 + }, + { + "epoch": 0.11525336973212762, + "grad_norm": 1.3958334946513675, + "learning_rate": 9.810726172247482e-05, + "loss": 0.3523, + "step": 1351 + }, + { + "epoch": 0.11533867940624466, + "grad_norm": 1.373098315473216, + "learning_rate": 9.810349471027765e-05, + "loss": 0.3804, + "step": 1352 + }, + { + "epoch": 0.11542398908036171, + "grad_norm": 1.516880734326338, + "learning_rate": 9.809972402563427e-05, + "loss": 0.3788, + "step": 1353 + }, + { + "epoch": 0.11550929875447875, + "grad_norm": 1.55027397564645, + "learning_rate": 9.809594966883259e-05, + "loss": 0.4251, + "step": 1354 + }, + { + "epoch": 0.1155946084285958, + "grad_norm": 1.3693968441995499, + "learning_rate": 9.809217164016071e-05, + "loss": 0.3769, + "step": 1355 + }, + { + "epoch": 0.11567991810271284, + "grad_norm": 1.2341805316954158, + "learning_rate": 9.80883899399071e-05, + "loss": 0.3276, + "step": 1356 + }, + { + "epoch": 0.11576522777682989, + "grad_norm": 1.5374970971056818, + "learning_rate": 9.808460456836047e-05, + "loss": 0.3999, + "step": 1357 + }, + { + "epoch": 0.11585053745094694, + "grad_norm": 1.5645725999947389, + "learning_rate": 9.808081552580978e-05, + "loss": 0.3875, + "step": 1358 + }, + { + "epoch": 0.11593584712506398, + "grad_norm": 1.7284793483569694, + "learning_rate": 9.807702281254432e-05, + "loss": 0.3936, + "step": 1359 + }, + { + "epoch": 0.11602115679918103, + "grad_norm": 1.5586725277220796, + "learning_rate": 9.807322642885369e-05, + "loss": 0.3567, + "step": 1360 + }, + { + "epoch": 0.11610646647329807, + "grad_norm": 1.6864249018967754, + "learning_rate": 9.806942637502764e-05, + "loss": 0.4268, + "step": 1361 + }, + { + "epoch": 0.11619177614741512, + "grad_norm": 1.45904091517369, + "learning_rate": 9.806562265135635e-05, + "loss": 0.3764, + "step": 1362 + }, + { + "epoch": 0.11627708582153216, + "grad_norm": 2.0317401001291127, + "learning_rate": 9.806181525813019e-05, + "loss": 0.4017, + "step": 1363 + }, + { + "epoch": 0.11636239549564921, + "grad_norm": 1.4815695502453996, + "learning_rate": 9.805800419563982e-05, + "loss": 0.4218, + "step": 1364 + }, + { + "epoch": 0.11644770516976626, + "grad_norm": 1.5189653534343968, + "learning_rate": 9.805418946417622e-05, + "loss": 0.3713, + "step": 1365 + }, + { + "epoch": 0.1165330148438833, + "grad_norm": 1.7889270981103493, + "learning_rate": 9.805037106403062e-05, + "loss": 0.4397, + "step": 1366 + }, + { + "epoch": 0.11661832451800035, + "grad_norm": 1.6257278572931584, + "learning_rate": 9.804654899549451e-05, + "loss": 0.367, + "step": 1367 + }, + { + "epoch": 0.11670363419211739, + "grad_norm": 1.6412421382765152, + "learning_rate": 9.804272325885971e-05, + "loss": 0.4149, + "step": 1368 + }, + { + "epoch": 0.11678894386623444, + "grad_norm": 1.584753562349142, + "learning_rate": 9.80388938544183e-05, + "loss": 0.3904, + "step": 1369 + }, + { + "epoch": 0.11687425354035147, + "grad_norm": 1.5889911286746807, + "learning_rate": 9.803506078246262e-05, + "loss": 0.4006, + "step": 1370 + }, + { + "epoch": 0.11695956321446851, + "grad_norm": 1.667042387891971, + "learning_rate": 9.803122404328529e-05, + "loss": 0.3638, + "step": 1371 + }, + { + "epoch": 0.11704487288858556, + "grad_norm": 1.4141709091191823, + "learning_rate": 9.802738363717928e-05, + "loss": 0.3669, + "step": 1372 + }, + { + "epoch": 0.1171301825627026, + "grad_norm": 1.3680804143326395, + "learning_rate": 9.80235395644377e-05, + "loss": 0.3411, + "step": 1373 + }, + { + "epoch": 0.11721549223681965, + "grad_norm": 1.7187270423049488, + "learning_rate": 9.80196918253541e-05, + "loss": 0.4354, + "step": 1374 + }, + { + "epoch": 0.1173008019109367, + "grad_norm": 1.5847461152938551, + "learning_rate": 9.80158404202222e-05, + "loss": 0.3928, + "step": 1375 + }, + { + "epoch": 0.11738611158505374, + "grad_norm": 1.4948959934455248, + "learning_rate": 9.801198534933603e-05, + "loss": 0.3393, + "step": 1376 + }, + { + "epoch": 0.11747142125917079, + "grad_norm": 1.5565831880969498, + "learning_rate": 9.800812661298992e-05, + "loss": 0.3798, + "step": 1377 + }, + { + "epoch": 0.11755673093328783, + "grad_norm": 2.070211731959517, + "learning_rate": 9.800426421147845e-05, + "loss": 0.4498, + "step": 1378 + }, + { + "epoch": 0.11764204060740488, + "grad_norm": 1.563526426779397, + "learning_rate": 9.800039814509653e-05, + "loss": 0.4104, + "step": 1379 + }, + { + "epoch": 0.11772735028152193, + "grad_norm": 1.8858104596652112, + "learning_rate": 9.799652841413927e-05, + "loss": 0.3933, + "step": 1380 + }, + { + "epoch": 0.11781265995563897, + "grad_norm": 1.7562107502634245, + "learning_rate": 9.799265501890211e-05, + "loss": 0.3926, + "step": 1381 + }, + { + "epoch": 0.11789796962975602, + "grad_norm": 1.5138740256899084, + "learning_rate": 9.798877795968078e-05, + "loss": 0.3721, + "step": 1382 + }, + { + "epoch": 0.11798327930387306, + "grad_norm": 1.5016082248612883, + "learning_rate": 9.798489723677126e-05, + "loss": 0.3616, + "step": 1383 + }, + { + "epoch": 0.11806858897799011, + "grad_norm": 1.3448811582794293, + "learning_rate": 9.798101285046983e-05, + "loss": 0.3442, + "step": 1384 + }, + { + "epoch": 0.11815389865210715, + "grad_norm": 1.5657330443359392, + "learning_rate": 9.797712480107304e-05, + "loss": 0.3917, + "step": 1385 + }, + { + "epoch": 0.1182392083262242, + "grad_norm": 1.3038337335369345, + "learning_rate": 9.797323308887773e-05, + "loss": 0.3936, + "step": 1386 + }, + { + "epoch": 0.11832451800034124, + "grad_norm": 1.25113435772954, + "learning_rate": 9.796933771418098e-05, + "loss": 0.3694, + "step": 1387 + }, + { + "epoch": 0.11840982767445829, + "grad_norm": 1.4249662395292801, + "learning_rate": 9.796543867728023e-05, + "loss": 0.4161, + "step": 1388 + }, + { + "epoch": 0.11849513734857532, + "grad_norm": 1.6731172028945198, + "learning_rate": 9.79615359784731e-05, + "loss": 0.4024, + "step": 1389 + }, + { + "epoch": 0.11858044702269237, + "grad_norm": 1.549849992076673, + "learning_rate": 9.795762961805758e-05, + "loss": 0.4639, + "step": 1390 + }, + { + "epoch": 0.11866575669680941, + "grad_norm": 1.533167669773171, + "learning_rate": 9.795371959633189e-05, + "loss": 0.3811, + "step": 1391 + }, + { + "epoch": 0.11875106637092646, + "grad_norm": 1.7096656008339188, + "learning_rate": 9.794980591359453e-05, + "loss": 0.3541, + "step": 1392 + }, + { + "epoch": 0.1188363760450435, + "grad_norm": 1.480641377633408, + "learning_rate": 9.79458885701443e-05, + "loss": 0.3706, + "step": 1393 + }, + { + "epoch": 0.11892168571916055, + "grad_norm": 1.4680215368276182, + "learning_rate": 9.794196756628025e-05, + "loss": 0.4078, + "step": 1394 + }, + { + "epoch": 0.1190069953932776, + "grad_norm": 2.0333404348072217, + "learning_rate": 9.793804290230176e-05, + "loss": 0.4307, + "step": 1395 + }, + { + "epoch": 0.11909230506739464, + "grad_norm": 1.5056631947538524, + "learning_rate": 9.793411457850841e-05, + "loss": 0.3994, + "step": 1396 + }, + { + "epoch": 0.11917761474151169, + "grad_norm": 1.5480438594321893, + "learning_rate": 9.793018259520014e-05, + "loss": 0.3985, + "step": 1397 + }, + { + "epoch": 0.11926292441562873, + "grad_norm": 1.5129375423895168, + "learning_rate": 9.792624695267714e-05, + "loss": 0.4052, + "step": 1398 + }, + { + "epoch": 0.11934823408974578, + "grad_norm": 1.323289843612551, + "learning_rate": 9.792230765123987e-05, + "loss": 0.3458, + "step": 1399 + }, + { + "epoch": 0.11943354376386282, + "grad_norm": 1.5912265621835522, + "learning_rate": 9.791836469118905e-05, + "loss": 0.3862, + "step": 1400 + }, + { + "epoch": 0.11951885343797987, + "grad_norm": 1.4382672128401612, + "learning_rate": 9.791441807282573e-05, + "loss": 0.337, + "step": 1401 + }, + { + "epoch": 0.11960416311209691, + "grad_norm": 1.7203557229913673, + "learning_rate": 9.791046779645121e-05, + "loss": 0.4037, + "step": 1402 + }, + { + "epoch": 0.11968947278621396, + "grad_norm": 1.7282535332724058, + "learning_rate": 9.790651386236707e-05, + "loss": 0.381, + "step": 1403 + }, + { + "epoch": 0.119774782460331, + "grad_norm": 1.5214418300022805, + "learning_rate": 9.790255627087517e-05, + "loss": 0.4411, + "step": 1404 + }, + { + "epoch": 0.11986009213444805, + "grad_norm": 1.4565843812206998, + "learning_rate": 9.789859502227766e-05, + "loss": 0.4098, + "step": 1405 + }, + { + "epoch": 0.1199454018085651, + "grad_norm": 1.5294891665256767, + "learning_rate": 9.789463011687694e-05, + "loss": 0.3994, + "step": 1406 + }, + { + "epoch": 0.12003071148268214, + "grad_norm": 1.7070665857773644, + "learning_rate": 9.789066155497573e-05, + "loss": 0.3829, + "step": 1407 + }, + { + "epoch": 0.12011602115679917, + "grad_norm": 1.3438002111235776, + "learning_rate": 9.788668933687699e-05, + "loss": 0.3306, + "step": 1408 + }, + { + "epoch": 0.12020133083091622, + "grad_norm": 1.2411904802294773, + "learning_rate": 9.7882713462884e-05, + "loss": 0.3644, + "step": 1409 + }, + { + "epoch": 0.12028664050503327, + "grad_norm": 1.3312865382091927, + "learning_rate": 9.78787339333003e-05, + "loss": 0.3601, + "step": 1410 + }, + { + "epoch": 0.12037195017915031, + "grad_norm": 1.7941095051723244, + "learning_rate": 9.787475074842967e-05, + "loss": 0.4069, + "step": 1411 + }, + { + "epoch": 0.12045725985326736, + "grad_norm": 1.5671632987347308, + "learning_rate": 9.787076390857623e-05, + "loss": 0.3886, + "step": 1412 + }, + { + "epoch": 0.1205425695273844, + "grad_norm": 1.6347027656018862, + "learning_rate": 9.786677341404436e-05, + "loss": 0.4117, + "step": 1413 + }, + { + "epoch": 0.12062787920150145, + "grad_norm": 1.7290905702078447, + "learning_rate": 9.78627792651387e-05, + "loss": 0.4242, + "step": 1414 + }, + { + "epoch": 0.1207131888756185, + "grad_norm": 1.392462298911892, + "learning_rate": 9.785878146216417e-05, + "loss": 0.3774, + "step": 1415 + }, + { + "epoch": 0.12079849854973554, + "grad_norm": 1.4643828213621637, + "learning_rate": 9.7854780005426e-05, + "loss": 0.3564, + "step": 1416 + }, + { + "epoch": 0.12088380822385258, + "grad_norm": 1.4858958434033112, + "learning_rate": 9.78507748952297e-05, + "loss": 0.3103, + "step": 1417 + }, + { + "epoch": 0.12096911789796963, + "grad_norm": 1.5458534026921007, + "learning_rate": 9.7846766131881e-05, + "loss": 0.3479, + "step": 1418 + }, + { + "epoch": 0.12105442757208668, + "grad_norm": 1.8892902044561743, + "learning_rate": 9.784275371568596e-05, + "loss": 0.4358, + "step": 1419 + }, + { + "epoch": 0.12113973724620372, + "grad_norm": 1.5890666740310146, + "learning_rate": 9.783873764695091e-05, + "loss": 0.3982, + "step": 1420 + }, + { + "epoch": 0.12122504692032077, + "grad_norm": 1.604010149103148, + "learning_rate": 9.783471792598247e-05, + "loss": 0.4008, + "step": 1421 + }, + { + "epoch": 0.12131035659443781, + "grad_norm": 1.8727600705455663, + "learning_rate": 9.783069455308749e-05, + "loss": 0.4557, + "step": 1422 + }, + { + "epoch": 0.12139566626855486, + "grad_norm": 1.6510737162306268, + "learning_rate": 9.782666752857317e-05, + "loss": 0.4079, + "step": 1423 + }, + { + "epoch": 0.1214809759426719, + "grad_norm": 1.5009278765434824, + "learning_rate": 9.782263685274692e-05, + "loss": 0.3328, + "step": 1424 + }, + { + "epoch": 0.12156628561678895, + "grad_norm": 1.5649615924741378, + "learning_rate": 9.781860252591648e-05, + "loss": 0.4275, + "step": 1425 + }, + { + "epoch": 0.121651595290906, + "grad_norm": 1.5671154519099313, + "learning_rate": 9.781456454838986e-05, + "loss": 0.3866, + "step": 1426 + }, + { + "epoch": 0.12173690496502303, + "grad_norm": 1.6897110758639955, + "learning_rate": 9.78105229204753e-05, + "loss": 0.4132, + "step": 1427 + }, + { + "epoch": 0.12182221463914007, + "grad_norm": 1.4904269872657692, + "learning_rate": 9.780647764248139e-05, + "loss": 0.3743, + "step": 1428 + }, + { + "epoch": 0.12190752431325712, + "grad_norm": 1.5501675484453845, + "learning_rate": 9.780242871471696e-05, + "loss": 0.4233, + "step": 1429 + }, + { + "epoch": 0.12199283398737416, + "grad_norm": 1.4752503069547778, + "learning_rate": 9.779837613749111e-05, + "loss": 0.3641, + "step": 1430 + }, + { + "epoch": 0.12207814366149121, + "grad_norm": 1.6140711146195872, + "learning_rate": 9.779431991111326e-05, + "loss": 0.3929, + "step": 1431 + }, + { + "epoch": 0.12216345333560825, + "grad_norm": 2.040490012501748, + "learning_rate": 9.779026003589304e-05, + "loss": 0.4819, + "step": 1432 + }, + { + "epoch": 0.1222487630097253, + "grad_norm": 1.6620806860773185, + "learning_rate": 9.778619651214042e-05, + "loss": 0.4326, + "step": 1433 + }, + { + "epoch": 0.12233407268384235, + "grad_norm": 1.6001500297777866, + "learning_rate": 9.778212934016566e-05, + "loss": 0.3999, + "step": 1434 + }, + { + "epoch": 0.12241938235795939, + "grad_norm": 1.5982144227145807, + "learning_rate": 9.777805852027922e-05, + "loss": 0.3907, + "step": 1435 + }, + { + "epoch": 0.12250469203207644, + "grad_norm": 1.3395235885815728, + "learning_rate": 9.777398405279192e-05, + "loss": 0.3718, + "step": 1436 + }, + { + "epoch": 0.12259000170619348, + "grad_norm": 1.4839040711904956, + "learning_rate": 9.77699059380148e-05, + "loss": 0.351, + "step": 1437 + }, + { + "epoch": 0.12267531138031053, + "grad_norm": 1.3512848585433523, + "learning_rate": 9.77658241762592e-05, + "loss": 0.4055, + "step": 1438 + }, + { + "epoch": 0.12276062105442757, + "grad_norm": 1.865034578018158, + "learning_rate": 9.776173876783677e-05, + "loss": 0.4125, + "step": 1439 + }, + { + "epoch": 0.12284593072854462, + "grad_norm": 1.5832017877376847, + "learning_rate": 9.775764971305936e-05, + "loss": 0.4029, + "step": 1440 + }, + { + "epoch": 0.12293124040266167, + "grad_norm": 1.447282098632627, + "learning_rate": 9.77535570122392e-05, + "loss": 0.4195, + "step": 1441 + }, + { + "epoch": 0.12301655007677871, + "grad_norm": 1.7733736404819216, + "learning_rate": 9.774946066568873e-05, + "loss": 0.3934, + "step": 1442 + }, + { + "epoch": 0.12310185975089576, + "grad_norm": 1.5119936986267137, + "learning_rate": 9.774536067372066e-05, + "loss": 0.3577, + "step": 1443 + }, + { + "epoch": 0.1231871694250128, + "grad_norm": 1.7515478781805298, + "learning_rate": 9.774125703664805e-05, + "loss": 0.3861, + "step": 1444 + }, + { + "epoch": 0.12327247909912985, + "grad_norm": 1.9438946317483903, + "learning_rate": 9.773714975478414e-05, + "loss": 0.4221, + "step": 1445 + }, + { + "epoch": 0.1233577887732469, + "grad_norm": 1.6469404464544188, + "learning_rate": 9.773303882844253e-05, + "loss": 0.4408, + "step": 1446 + }, + { + "epoch": 0.12344309844736392, + "grad_norm": 1.746491866841178, + "learning_rate": 9.772892425793705e-05, + "loss": 0.393, + "step": 1447 + }, + { + "epoch": 0.12352840812148097, + "grad_norm": 1.4388684932060216, + "learning_rate": 9.772480604358183e-05, + "loss": 0.349, + "step": 1448 + }, + { + "epoch": 0.12361371779559802, + "grad_norm": 1.2782041119479068, + "learning_rate": 9.772068418569129e-05, + "loss": 0.3875, + "step": 1449 + }, + { + "epoch": 0.12369902746971506, + "grad_norm": 1.2713470619229221, + "learning_rate": 9.77165586845801e-05, + "loss": 0.3234, + "step": 1450 + }, + { + "epoch": 0.12378433714383211, + "grad_norm": 1.429246344537149, + "learning_rate": 9.771242954056321e-05, + "loss": 0.4058, + "step": 1451 + }, + { + "epoch": 0.12386964681794915, + "grad_norm": 1.6173095657057255, + "learning_rate": 9.770829675395587e-05, + "loss": 0.4018, + "step": 1452 + }, + { + "epoch": 0.1239549564920662, + "grad_norm": 1.5435682460030622, + "learning_rate": 9.770416032507361e-05, + "loss": 0.4136, + "step": 1453 + }, + { + "epoch": 0.12404026616618324, + "grad_norm": 1.596134421660813, + "learning_rate": 9.77000202542322e-05, + "loss": 0.3907, + "step": 1454 + }, + { + "epoch": 0.12412557584030029, + "grad_norm": 1.1978802993340332, + "learning_rate": 9.769587654174772e-05, + "loss": 0.3193, + "step": 1455 + }, + { + "epoch": 0.12421088551441734, + "grad_norm": 1.6306147774565631, + "learning_rate": 9.769172918793652e-05, + "loss": 0.3818, + "step": 1456 + }, + { + "epoch": 0.12429619518853438, + "grad_norm": 1.5478235284330377, + "learning_rate": 9.768757819311523e-05, + "loss": 0.3722, + "step": 1457 + }, + { + "epoch": 0.12438150486265143, + "grad_norm": 1.5035739283804668, + "learning_rate": 9.768342355760076e-05, + "loss": 0.3329, + "step": 1458 + }, + { + "epoch": 0.12446681453676847, + "grad_norm": 1.285413536591431, + "learning_rate": 9.767926528171028e-05, + "loss": 0.3727, + "step": 1459 + }, + { + "epoch": 0.12455212421088552, + "grad_norm": 1.6558369535353157, + "learning_rate": 9.767510336576127e-05, + "loss": 0.4485, + "step": 1460 + }, + { + "epoch": 0.12463743388500256, + "grad_norm": 1.5067199543725904, + "learning_rate": 9.767093781007147e-05, + "loss": 0.3998, + "step": 1461 + }, + { + "epoch": 0.12472274355911961, + "grad_norm": 1.5887459378674527, + "learning_rate": 9.766676861495888e-05, + "loss": 0.3748, + "step": 1462 + }, + { + "epoch": 0.12480805323323665, + "grad_norm": 1.5760560779034518, + "learning_rate": 9.766259578074181e-05, + "loss": 0.401, + "step": 1463 + }, + { + "epoch": 0.1248933629073537, + "grad_norm": 1.6180061074461092, + "learning_rate": 9.765841930773883e-05, + "loss": 0.4405, + "step": 1464 + }, + { + "epoch": 0.12497867258147075, + "grad_norm": 1.8237025574127537, + "learning_rate": 9.76542391962688e-05, + "loss": 0.4425, + "step": 1465 + }, + { + "epoch": 0.1250639822555878, + "grad_norm": 1.7074011220829344, + "learning_rate": 9.765005544665084e-05, + "loss": 0.4195, + "step": 1466 + }, + { + "epoch": 0.12514929192970484, + "grad_norm": 1.7611288702891998, + "learning_rate": 9.764586805920434e-05, + "loss": 0.3624, + "step": 1467 + }, + { + "epoch": 0.12523460160382188, + "grad_norm": 1.4342176031051357, + "learning_rate": 9.764167703424904e-05, + "loss": 0.3708, + "step": 1468 + }, + { + "epoch": 0.12531991127793893, + "grad_norm": 1.5893757949209082, + "learning_rate": 9.763748237210484e-05, + "loss": 0.4177, + "step": 1469 + }, + { + "epoch": 0.12540522095205597, + "grad_norm": 1.2936634606060615, + "learning_rate": 9.763328407309201e-05, + "loss": 0.3636, + "step": 1470 + }, + { + "epoch": 0.12549053062617302, + "grad_norm": 1.4438267675029803, + "learning_rate": 9.762908213753107e-05, + "loss": 0.3984, + "step": 1471 + }, + { + "epoch": 0.12557584030029006, + "grad_norm": 1.3709420231609104, + "learning_rate": 9.76248765657428e-05, + "loss": 0.3751, + "step": 1472 + }, + { + "epoch": 0.1256611499744071, + "grad_norm": 1.824257827764997, + "learning_rate": 9.762066735804829e-05, + "loss": 0.4407, + "step": 1473 + }, + { + "epoch": 0.12574645964852416, + "grad_norm": 1.5246516158096788, + "learning_rate": 9.761645451476889e-05, + "loss": 0.41, + "step": 1474 + }, + { + "epoch": 0.12583176932264117, + "grad_norm": 1.361712933613824, + "learning_rate": 9.761223803622621e-05, + "loss": 0.3395, + "step": 1475 + }, + { + "epoch": 0.12591707899675822, + "grad_norm": 1.4738651241670113, + "learning_rate": 9.760801792274217e-05, + "loss": 0.3761, + "step": 1476 + }, + { + "epoch": 0.12600238867087526, + "grad_norm": 1.6480749373342263, + "learning_rate": 9.760379417463894e-05, + "loss": 0.3703, + "step": 1477 + }, + { + "epoch": 0.1260876983449923, + "grad_norm": 1.8161841472073759, + "learning_rate": 9.759956679223901e-05, + "loss": 0.4654, + "step": 1478 + }, + { + "epoch": 0.12617300801910936, + "grad_norm": 1.6333503384905121, + "learning_rate": 9.759533577586508e-05, + "loss": 0.4029, + "step": 1479 + }, + { + "epoch": 0.1262583176932264, + "grad_norm": 1.4446982238226294, + "learning_rate": 9.75911011258402e-05, + "loss": 0.3604, + "step": 1480 + }, + { + "epoch": 0.12634362736734345, + "grad_norm": 1.5024968664863396, + "learning_rate": 9.758686284248764e-05, + "loss": 0.3595, + "step": 1481 + }, + { + "epoch": 0.1264289370414605, + "grad_norm": 1.263981682418331, + "learning_rate": 9.758262092613099e-05, + "loss": 0.3151, + "step": 1482 + }, + { + "epoch": 0.12651424671557754, + "grad_norm": 1.4856943792015616, + "learning_rate": 9.757837537709407e-05, + "loss": 0.4196, + "step": 1483 + }, + { + "epoch": 0.12659955638969458, + "grad_norm": 1.4825089621313197, + "learning_rate": 9.757412619570104e-05, + "loss": 0.4018, + "step": 1484 + }, + { + "epoch": 0.12668486606381163, + "grad_norm": 1.681969658077868, + "learning_rate": 9.756987338227626e-05, + "loss": 0.44, + "step": 1485 + }, + { + "epoch": 0.12677017573792868, + "grad_norm": 1.6115365928119423, + "learning_rate": 9.756561693714446e-05, + "loss": 0.4438, + "step": 1486 + }, + { + "epoch": 0.12685548541204572, + "grad_norm": 1.6005427393690164, + "learning_rate": 9.756135686063055e-05, + "loss": 0.357, + "step": 1487 + }, + { + "epoch": 0.12694079508616277, + "grad_norm": 1.4131844676689043, + "learning_rate": 9.755709315305978e-05, + "loss": 0.3402, + "step": 1488 + }, + { + "epoch": 0.1270261047602798, + "grad_norm": 1.3849568524163902, + "learning_rate": 9.755282581475769e-05, + "loss": 0.4218, + "step": 1489 + }, + { + "epoch": 0.12711141443439686, + "grad_norm": 1.5737764480222753, + "learning_rate": 9.754855484605003e-05, + "loss": 0.3872, + "step": 1490 + }, + { + "epoch": 0.1271967241085139, + "grad_norm": 1.7885521573125818, + "learning_rate": 9.754428024726288e-05, + "loss": 0.4186, + "step": 1491 + }, + { + "epoch": 0.12728203378263095, + "grad_norm": 1.6499210569974947, + "learning_rate": 9.754000201872258e-05, + "loss": 0.376, + "step": 1492 + }, + { + "epoch": 0.127367343456748, + "grad_norm": 1.5879211798211474, + "learning_rate": 9.753572016075576e-05, + "loss": 0.4299, + "step": 1493 + }, + { + "epoch": 0.12745265313086504, + "grad_norm": 1.567251914194325, + "learning_rate": 9.753143467368931e-05, + "loss": 0.3762, + "step": 1494 + }, + { + "epoch": 0.12753796280498209, + "grad_norm": 1.3961765616970476, + "learning_rate": 9.75271455578504e-05, + "loss": 0.3412, + "step": 1495 + }, + { + "epoch": 0.12762327247909913, + "grad_norm": 1.4381261374126315, + "learning_rate": 9.752285281356648e-05, + "loss": 0.3891, + "step": 1496 + }, + { + "epoch": 0.12770858215321618, + "grad_norm": 1.5721309627890916, + "learning_rate": 9.75185564411653e-05, + "loss": 0.3701, + "step": 1497 + }, + { + "epoch": 0.12779389182733322, + "grad_norm": 1.5504362169301111, + "learning_rate": 9.751425644097482e-05, + "loss": 0.3562, + "step": 1498 + }, + { + "epoch": 0.12787920150145027, + "grad_norm": 1.367048768770485, + "learning_rate": 9.750995281332338e-05, + "loss": 0.4052, + "step": 1499 + }, + { + "epoch": 0.1279645111755673, + "grad_norm": 1.4070903386529137, + "learning_rate": 9.750564555853951e-05, + "loss": 0.3999, + "step": 1500 + }, + { + "epoch": 0.12804982084968436, + "grad_norm": 1.3398583859014075, + "learning_rate": 9.750133467695203e-05, + "loss": 0.3647, + "step": 1501 + }, + { + "epoch": 0.1281351305238014, + "grad_norm": 1.3337591157010007, + "learning_rate": 9.749702016889008e-05, + "loss": 0.4257, + "step": 1502 + }, + { + "epoch": 0.12822044019791845, + "grad_norm": 1.9198342630259528, + "learning_rate": 9.749270203468304e-05, + "loss": 0.4227, + "step": 1503 + }, + { + "epoch": 0.1283057498720355, + "grad_norm": 1.5473648990948863, + "learning_rate": 9.748838027466057e-05, + "loss": 0.3908, + "step": 1504 + }, + { + "epoch": 0.12839105954615254, + "grad_norm": 1.5833449195973293, + "learning_rate": 9.748405488915262e-05, + "loss": 0.3693, + "step": 1505 + }, + { + "epoch": 0.1284763692202696, + "grad_norm": 1.342533092836798, + "learning_rate": 9.747972587848942e-05, + "loss": 0.353, + "step": 1506 + }, + { + "epoch": 0.12856167889438663, + "grad_norm": 1.5022365267970412, + "learning_rate": 9.747539324300143e-05, + "loss": 0.392, + "step": 1507 + }, + { + "epoch": 0.12864698856850368, + "grad_norm": 1.5777930486563907, + "learning_rate": 9.747105698301949e-05, + "loss": 0.3399, + "step": 1508 + }, + { + "epoch": 0.12873229824262072, + "grad_norm": 1.5824866122241452, + "learning_rate": 9.746671709887458e-05, + "loss": 0.3955, + "step": 1509 + }, + { + "epoch": 0.12881760791673777, + "grad_norm": 1.5697293693400625, + "learning_rate": 9.746237359089805e-05, + "loss": 0.3539, + "step": 1510 + }, + { + "epoch": 0.12890291759085482, + "grad_norm": 1.571500565997564, + "learning_rate": 9.745802645942153e-05, + "loss": 0.3563, + "step": 1511 + }, + { + "epoch": 0.12898822726497186, + "grad_norm": 1.5906725323662974, + "learning_rate": 9.745367570477688e-05, + "loss": 0.4021, + "step": 1512 + }, + { + "epoch": 0.1290735369390889, + "grad_norm": 1.5922063195610967, + "learning_rate": 9.744932132729625e-05, + "loss": 0.354, + "step": 1513 + }, + { + "epoch": 0.12915884661320592, + "grad_norm": 1.5012171098979705, + "learning_rate": 9.744496332731208e-05, + "loss": 0.3795, + "step": 1514 + }, + { + "epoch": 0.12924415628732297, + "grad_norm": 1.4660436979757423, + "learning_rate": 9.74406017051571e-05, + "loss": 0.3711, + "step": 1515 + }, + { + "epoch": 0.12932946596144002, + "grad_norm": 1.4756413568658944, + "learning_rate": 9.743623646116427e-05, + "loss": 0.4036, + "step": 1516 + }, + { + "epoch": 0.12941477563555706, + "grad_norm": 1.785309298776367, + "learning_rate": 9.743186759566685e-05, + "loss": 0.441, + "step": 1517 + }, + { + "epoch": 0.1295000853096741, + "grad_norm": 1.7423199658861144, + "learning_rate": 9.742749510899841e-05, + "loss": 0.3893, + "step": 1518 + }, + { + "epoch": 0.12958539498379115, + "grad_norm": 1.483659832985735, + "learning_rate": 9.742311900149275e-05, + "loss": 0.3683, + "step": 1519 + }, + { + "epoch": 0.1296707046579082, + "grad_norm": 1.491168248253157, + "learning_rate": 9.741873927348394e-05, + "loss": 0.3911, + "step": 1520 + }, + { + "epoch": 0.12975601433202524, + "grad_norm": 1.6730510105079273, + "learning_rate": 9.741435592530638e-05, + "loss": 0.4091, + "step": 1521 + }, + { + "epoch": 0.1298413240061423, + "grad_norm": 1.8453110374296304, + "learning_rate": 9.74099689572947e-05, + "loss": 0.4428, + "step": 1522 + }, + { + "epoch": 0.12992663368025933, + "grad_norm": 1.768646760225816, + "learning_rate": 9.740557836978384e-05, + "loss": 0.3531, + "step": 1523 + }, + { + "epoch": 0.13001194335437638, + "grad_norm": 1.761215374879008, + "learning_rate": 9.740118416310897e-05, + "loss": 0.3612, + "step": 1524 + }, + { + "epoch": 0.13009725302849343, + "grad_norm": 1.5741509737707076, + "learning_rate": 9.739678633760559e-05, + "loss": 0.3971, + "step": 1525 + }, + { + "epoch": 0.13018256270261047, + "grad_norm": 1.2696393245765147, + "learning_rate": 9.739238489360942e-05, + "loss": 0.3709, + "step": 1526 + }, + { + "epoch": 0.13026787237672752, + "grad_norm": 1.4781102540649655, + "learning_rate": 9.738797983145654e-05, + "loss": 0.3876, + "step": 1527 + }, + { + "epoch": 0.13035318205084456, + "grad_norm": 1.5952274729518061, + "learning_rate": 9.738357115148319e-05, + "loss": 0.365, + "step": 1528 + }, + { + "epoch": 0.1304384917249616, + "grad_norm": 1.7382878335549485, + "learning_rate": 9.737915885402599e-05, + "loss": 0.4067, + "step": 1529 + }, + { + "epoch": 0.13052380139907865, + "grad_norm": 1.3814086684840199, + "learning_rate": 9.737474293942177e-05, + "loss": 0.3739, + "step": 1530 + }, + { + "epoch": 0.1306091110731957, + "grad_norm": 1.7928153577026764, + "learning_rate": 9.737032340800769e-05, + "loss": 0.4148, + "step": 1531 + }, + { + "epoch": 0.13069442074731275, + "grad_norm": 1.5636450577246201, + "learning_rate": 9.736590026012114e-05, + "loss": 0.3615, + "step": 1532 + }, + { + "epoch": 0.1307797304214298, + "grad_norm": 1.5841369931677496, + "learning_rate": 9.736147349609981e-05, + "loss": 0.4126, + "step": 1533 + }, + { + "epoch": 0.13086504009554684, + "grad_norm": 1.6114706080469992, + "learning_rate": 9.735704311628166e-05, + "loss": 0.3413, + "step": 1534 + }, + { + "epoch": 0.13095034976966388, + "grad_norm": 1.47692398409785, + "learning_rate": 9.735260912100492e-05, + "loss": 0.3962, + "step": 1535 + }, + { + "epoch": 0.13103565944378093, + "grad_norm": 1.4724535410470485, + "learning_rate": 9.73481715106081e-05, + "loss": 0.3407, + "step": 1536 + }, + { + "epoch": 0.13112096911789797, + "grad_norm": 1.2301186684219814, + "learning_rate": 9.734373028543001e-05, + "loss": 0.3667, + "step": 1537 + }, + { + "epoch": 0.13120627879201502, + "grad_norm": 1.5605454999896164, + "learning_rate": 9.733928544580967e-05, + "loss": 0.3916, + "step": 1538 + }, + { + "epoch": 0.13129158846613206, + "grad_norm": 1.4132023508328218, + "learning_rate": 9.733483699208645e-05, + "loss": 0.3506, + "step": 1539 + }, + { + "epoch": 0.1313768981402491, + "grad_norm": 1.6374237799923463, + "learning_rate": 9.733038492459998e-05, + "loss": 0.3884, + "step": 1540 + }, + { + "epoch": 0.13146220781436616, + "grad_norm": 1.871951677028812, + "learning_rate": 9.732592924369013e-05, + "loss": 0.3579, + "step": 1541 + }, + { + "epoch": 0.1315475174884832, + "grad_norm": 1.5069780165316182, + "learning_rate": 9.732146994969706e-05, + "loss": 0.4181, + "step": 1542 + }, + { + "epoch": 0.13163282716260025, + "grad_norm": 1.829945497378669, + "learning_rate": 9.731700704296126e-05, + "loss": 0.3979, + "step": 1543 + }, + { + "epoch": 0.1317181368367173, + "grad_norm": 1.4623005266360931, + "learning_rate": 9.731254052382337e-05, + "loss": 0.3763, + "step": 1544 + }, + { + "epoch": 0.13180344651083434, + "grad_norm": 1.888701918953774, + "learning_rate": 9.730807039262447e-05, + "loss": 0.4246, + "step": 1545 + }, + { + "epoch": 0.13188875618495138, + "grad_norm": 1.5003550427186978, + "learning_rate": 9.730359664970576e-05, + "loss": 0.3891, + "step": 1546 + }, + { + "epoch": 0.13197406585906843, + "grad_norm": 1.5920284173049009, + "learning_rate": 9.729911929540883e-05, + "loss": 0.4094, + "step": 1547 + }, + { + "epoch": 0.13205937553318547, + "grad_norm": 1.594850347308505, + "learning_rate": 9.729463833007548e-05, + "loss": 0.4157, + "step": 1548 + }, + { + "epoch": 0.13214468520730252, + "grad_norm": 1.488308963868031, + "learning_rate": 9.729015375404782e-05, + "loss": 0.4131, + "step": 1549 + }, + { + "epoch": 0.13222999488141957, + "grad_norm": 1.4786718731941308, + "learning_rate": 9.728566556766823e-05, + "loss": 0.4447, + "step": 1550 + }, + { + "epoch": 0.1323153045555366, + "grad_norm": 1.534318302140307, + "learning_rate": 9.728117377127933e-05, + "loss": 0.3615, + "step": 1551 + }, + { + "epoch": 0.13240061422965363, + "grad_norm": 1.2244956223801744, + "learning_rate": 9.727667836522407e-05, + "loss": 0.3359, + "step": 1552 + }, + { + "epoch": 0.13248592390377067, + "grad_norm": 1.450340145598042, + "learning_rate": 9.727217934984566e-05, + "loss": 0.3769, + "step": 1553 + }, + { + "epoch": 0.13257123357788772, + "grad_norm": 1.6672014173303975, + "learning_rate": 9.726767672548755e-05, + "loss": 0.4607, + "step": 1554 + }, + { + "epoch": 0.13265654325200477, + "grad_norm": 1.4017748940755728, + "learning_rate": 9.72631704924935e-05, + "loss": 0.3915, + "step": 1555 + }, + { + "epoch": 0.1327418529261218, + "grad_norm": 1.4735602480425447, + "learning_rate": 9.725866065120755e-05, + "loss": 0.3916, + "step": 1556 + }, + { + "epoch": 0.13282716260023886, + "grad_norm": 1.6625291090403116, + "learning_rate": 9.725414720197399e-05, + "loss": 0.3961, + "step": 1557 + }, + { + "epoch": 0.1329124722743559, + "grad_norm": 1.4465492524492045, + "learning_rate": 9.72496301451374e-05, + "loss": 0.4073, + "step": 1558 + }, + { + "epoch": 0.13299778194847295, + "grad_norm": 1.498129234387553, + "learning_rate": 9.724510948104262e-05, + "loss": 0.3665, + "step": 1559 + }, + { + "epoch": 0.13308309162259, + "grad_norm": 1.5714869116815533, + "learning_rate": 9.72405852100348e-05, + "loss": 0.4128, + "step": 1560 + }, + { + "epoch": 0.13316840129670704, + "grad_norm": 1.573304774757937, + "learning_rate": 9.723605733245933e-05, + "loss": 0.3679, + "step": 1561 + }, + { + "epoch": 0.13325371097082409, + "grad_norm": 1.6077316189692306, + "learning_rate": 9.72315258486619e-05, + "loss": 0.3689, + "step": 1562 + }, + { + "epoch": 0.13333902064494113, + "grad_norm": 1.697039168760585, + "learning_rate": 9.722699075898846e-05, + "loss": 0.4172, + "step": 1563 + }, + { + "epoch": 0.13342433031905818, + "grad_norm": 1.607776255148098, + "learning_rate": 9.722245206378524e-05, + "loss": 0.3814, + "step": 1564 + }, + { + "epoch": 0.13350963999317522, + "grad_norm": 1.7161995431216364, + "learning_rate": 9.721790976339874e-05, + "loss": 0.4171, + "step": 1565 + }, + { + "epoch": 0.13359494966729227, + "grad_norm": 1.4952024190118773, + "learning_rate": 9.721336385817575e-05, + "loss": 0.3882, + "step": 1566 + }, + { + "epoch": 0.1336802593414093, + "grad_norm": 1.4891292529039406, + "learning_rate": 9.720881434846332e-05, + "loss": 0.3755, + "step": 1567 + }, + { + "epoch": 0.13376556901552636, + "grad_norm": 1.7163639499207755, + "learning_rate": 9.720426123460877e-05, + "loss": 0.4413, + "step": 1568 + }, + { + "epoch": 0.1338508786896434, + "grad_norm": 1.5465474408060744, + "learning_rate": 9.719970451695973e-05, + "loss": 0.3535, + "step": 1569 + }, + { + "epoch": 0.13393618836376045, + "grad_norm": 1.947138163260366, + "learning_rate": 9.719514419586406e-05, + "loss": 0.4453, + "step": 1570 + }, + { + "epoch": 0.1340214980378775, + "grad_norm": 1.515007758545602, + "learning_rate": 9.719058027166994e-05, + "loss": 0.3522, + "step": 1571 + }, + { + "epoch": 0.13410680771199454, + "grad_norm": 1.7937304638586276, + "learning_rate": 9.718601274472578e-05, + "loss": 0.3663, + "step": 1572 + }, + { + "epoch": 0.1341921173861116, + "grad_norm": 1.6592282562601417, + "learning_rate": 9.71814416153803e-05, + "loss": 0.4191, + "step": 1573 + }, + { + "epoch": 0.13427742706022863, + "grad_norm": 1.561381206512026, + "learning_rate": 9.717686688398246e-05, + "loss": 0.3897, + "step": 1574 + }, + { + "epoch": 0.13436273673434568, + "grad_norm": 1.6200932736094864, + "learning_rate": 9.717228855088154e-05, + "loss": 0.4082, + "step": 1575 + }, + { + "epoch": 0.13444804640846272, + "grad_norm": 1.486219364553145, + "learning_rate": 9.716770661642707e-05, + "loss": 0.3711, + "step": 1576 + }, + { + "epoch": 0.13453335608257977, + "grad_norm": 1.5782930643462578, + "learning_rate": 9.716312108096884e-05, + "loss": 0.4289, + "step": 1577 + }, + { + "epoch": 0.13461866575669681, + "grad_norm": 1.7026244661652268, + "learning_rate": 9.715853194485693e-05, + "loss": 0.3886, + "step": 1578 + }, + { + "epoch": 0.13470397543081386, + "grad_norm": 1.6934863684178536, + "learning_rate": 9.715393920844171e-05, + "loss": 0.428, + "step": 1579 + }, + { + "epoch": 0.1347892851049309, + "grad_norm": 1.3443702774284125, + "learning_rate": 9.714934287207382e-05, + "loss": 0.4021, + "step": 1580 + }, + { + "epoch": 0.13487459477904795, + "grad_norm": 1.5960870698778773, + "learning_rate": 9.714474293610415e-05, + "loss": 0.3655, + "step": 1581 + }, + { + "epoch": 0.134959904453165, + "grad_norm": 1.4121312394302725, + "learning_rate": 9.714013940088388e-05, + "loss": 0.4129, + "step": 1582 + }, + { + "epoch": 0.13504521412728204, + "grad_norm": 1.5280788048000664, + "learning_rate": 9.713553226676446e-05, + "loss": 0.4115, + "step": 1583 + }, + { + "epoch": 0.1351305238013991, + "grad_norm": 1.581400879910264, + "learning_rate": 9.713092153409765e-05, + "loss": 0.4302, + "step": 1584 + }, + { + "epoch": 0.13521583347551613, + "grad_norm": 1.5942429172980066, + "learning_rate": 9.712630720323542e-05, + "loss": 0.4381, + "step": 1585 + }, + { + "epoch": 0.13530114314963318, + "grad_norm": 1.3782079295423924, + "learning_rate": 9.712168927453007e-05, + "loss": 0.3542, + "step": 1586 + }, + { + "epoch": 0.13538645282375023, + "grad_norm": 1.5804443689371208, + "learning_rate": 9.711706774833414e-05, + "loss": 0.3953, + "step": 1587 + }, + { + "epoch": 0.13547176249786727, + "grad_norm": 1.5370988275924657, + "learning_rate": 9.711244262500048e-05, + "loss": 0.3804, + "step": 1588 + }, + { + "epoch": 0.13555707217198432, + "grad_norm": 1.4338352924974864, + "learning_rate": 9.710781390488216e-05, + "loss": 0.374, + "step": 1589 + }, + { + "epoch": 0.13564238184610133, + "grad_norm": 1.5930805670317492, + "learning_rate": 9.710318158833261e-05, + "loss": 0.3501, + "step": 1590 + }, + { + "epoch": 0.13572769152021838, + "grad_norm": 1.5877368660180724, + "learning_rate": 9.709854567570542e-05, + "loss": 0.4376, + "step": 1591 + }, + { + "epoch": 0.13581300119433543, + "grad_norm": 1.0334568685603185, + "learning_rate": 9.709390616735456e-05, + "loss": 0.3566, + "step": 1592 + }, + { + "epoch": 0.13589831086845247, + "grad_norm": 1.5402392231310809, + "learning_rate": 9.708926306363422e-05, + "loss": 0.3672, + "step": 1593 + }, + { + "epoch": 0.13598362054256952, + "grad_norm": 1.413782334366891, + "learning_rate": 9.708461636489889e-05, + "loss": 0.3586, + "step": 1594 + }, + { + "epoch": 0.13606893021668656, + "grad_norm": 1.677039041190965, + "learning_rate": 9.70799660715033e-05, + "loss": 0.4089, + "step": 1595 + }, + { + "epoch": 0.1361542398908036, + "grad_norm": 1.4222034452278338, + "learning_rate": 9.707531218380248e-05, + "loss": 0.3866, + "step": 1596 + }, + { + "epoch": 0.13623954956492065, + "grad_norm": 1.664371157393566, + "learning_rate": 9.707065470215174e-05, + "loss": 0.3903, + "step": 1597 + }, + { + "epoch": 0.1363248592390377, + "grad_norm": 1.7475394935129018, + "learning_rate": 9.706599362690663e-05, + "loss": 0.3706, + "step": 1598 + }, + { + "epoch": 0.13641016891315474, + "grad_norm": 1.612262099111962, + "learning_rate": 9.706132895842304e-05, + "loss": 0.4023, + "step": 1599 + }, + { + "epoch": 0.1364954785872718, + "grad_norm": 1.5461674623039467, + "learning_rate": 9.705666069705704e-05, + "loss": 0.3791, + "step": 1600 + }, + { + "epoch": 0.13658078826138884, + "grad_norm": 1.6907313456202153, + "learning_rate": 9.705198884316507e-05, + "loss": 0.3877, + "step": 1601 + }, + { + "epoch": 0.13666609793550588, + "grad_norm": 1.525447438997784, + "learning_rate": 9.70473133971038e-05, + "loss": 0.4148, + "step": 1602 + }, + { + "epoch": 0.13675140760962293, + "grad_norm": 1.5611736008312378, + "learning_rate": 9.704263435923014e-05, + "loss": 0.4306, + "step": 1603 + }, + { + "epoch": 0.13683671728373997, + "grad_norm": 1.449287433487613, + "learning_rate": 9.703795172990134e-05, + "loss": 0.4418, + "step": 1604 + }, + { + "epoch": 0.13692202695785702, + "grad_norm": 1.177079608652529, + "learning_rate": 9.703326550947487e-05, + "loss": 0.3532, + "step": 1605 + }, + { + "epoch": 0.13700733663197406, + "grad_norm": 1.7926195257497117, + "learning_rate": 9.702857569830852e-05, + "loss": 0.4104, + "step": 1606 + }, + { + "epoch": 0.1370926463060911, + "grad_norm": 1.4820402556343328, + "learning_rate": 9.702388229676033e-05, + "loss": 0.415, + "step": 1607 + }, + { + "epoch": 0.13717795598020815, + "grad_norm": 1.5030486597038368, + "learning_rate": 9.701918530518861e-05, + "loss": 0.3911, + "step": 1608 + }, + { + "epoch": 0.1372632656543252, + "grad_norm": 1.2380458475660985, + "learning_rate": 9.701448472395197e-05, + "loss": 0.3332, + "step": 1609 + }, + { + "epoch": 0.13734857532844225, + "grad_norm": 1.405281326664139, + "learning_rate": 9.700978055340923e-05, + "loss": 0.3698, + "step": 1610 + }, + { + "epoch": 0.1374338850025593, + "grad_norm": 1.557745910874254, + "learning_rate": 9.700507279391956e-05, + "loss": 0.4017, + "step": 1611 + }, + { + "epoch": 0.13751919467667634, + "grad_norm": 1.4181388143486113, + "learning_rate": 9.700036144584237e-05, + "loss": 0.3906, + "step": 1612 + }, + { + "epoch": 0.13760450435079338, + "grad_norm": 1.4680458166368218, + "learning_rate": 9.699564650953734e-05, + "loss": 0.3921, + "step": 1613 + }, + { + "epoch": 0.13768981402491043, + "grad_norm": 1.5542180104846908, + "learning_rate": 9.699092798536445e-05, + "loss": 0.3987, + "step": 1614 + }, + { + "epoch": 0.13777512369902747, + "grad_norm": 1.479456573158392, + "learning_rate": 9.698620587368389e-05, + "loss": 0.4284, + "step": 1615 + }, + { + "epoch": 0.13786043337314452, + "grad_norm": 1.4117523206256366, + "learning_rate": 9.698148017485621e-05, + "loss": 0.3511, + "step": 1616 + }, + { + "epoch": 0.13794574304726157, + "grad_norm": 1.7409387055836059, + "learning_rate": 9.697675088924218e-05, + "loss": 0.4472, + "step": 1617 + }, + { + "epoch": 0.1380310527213786, + "grad_norm": 1.8808194450346787, + "learning_rate": 9.697201801720286e-05, + "loss": 0.4465, + "step": 1618 + }, + { + "epoch": 0.13811636239549566, + "grad_norm": 1.775310048707822, + "learning_rate": 9.696728155909956e-05, + "loss": 0.3565, + "step": 1619 + }, + { + "epoch": 0.1382016720696127, + "grad_norm": 1.4005306498433059, + "learning_rate": 9.69625415152939e-05, + "loss": 0.3694, + "step": 1620 + }, + { + "epoch": 0.13828698174372975, + "grad_norm": 1.381158906858611, + "learning_rate": 9.695779788614776e-05, + "loss": 0.3608, + "step": 1621 + }, + { + "epoch": 0.1383722914178468, + "grad_norm": 1.832833315835978, + "learning_rate": 9.695305067202328e-05, + "loss": 0.4196, + "step": 1622 + }, + { + "epoch": 0.13845760109196384, + "grad_norm": 1.7449584957094701, + "learning_rate": 9.694829987328288e-05, + "loss": 0.3868, + "step": 1623 + }, + { + "epoch": 0.13854291076608088, + "grad_norm": 1.4890825812635886, + "learning_rate": 9.694354549028927e-05, + "loss": 0.4003, + "step": 1624 + }, + { + "epoch": 0.13862822044019793, + "grad_norm": 1.4389838148509788, + "learning_rate": 9.693878752340544e-05, + "loss": 0.3557, + "step": 1625 + }, + { + "epoch": 0.13871353011431498, + "grad_norm": 1.67151331331575, + "learning_rate": 9.69340259729946e-05, + "loss": 0.3995, + "step": 1626 + }, + { + "epoch": 0.13879883978843202, + "grad_norm": 1.2361005480216145, + "learning_rate": 9.692926083942029e-05, + "loss": 0.3371, + "step": 1627 + }, + { + "epoch": 0.13888414946254907, + "grad_norm": 1.5116859452189984, + "learning_rate": 9.692449212304629e-05, + "loss": 0.3891, + "step": 1628 + }, + { + "epoch": 0.13896945913666608, + "grad_norm": 1.138804511431061, + "learning_rate": 9.691971982423669e-05, + "loss": 0.3532, + "step": 1629 + }, + { + "epoch": 0.13905476881078313, + "grad_norm": 1.5043253525203089, + "learning_rate": 9.691494394335579e-05, + "loss": 0.3904, + "step": 1630 + }, + { + "epoch": 0.13914007848490018, + "grad_norm": 1.73904861485781, + "learning_rate": 9.691016448076824e-05, + "loss": 0.3932, + "step": 1631 + }, + { + "epoch": 0.13922538815901722, + "grad_norm": 1.2704258979586254, + "learning_rate": 9.690538143683891e-05, + "loss": 0.3418, + "step": 1632 + }, + { + "epoch": 0.13931069783313427, + "grad_norm": 1.6981197617962145, + "learning_rate": 9.690059481193295e-05, + "loss": 0.3788, + "step": 1633 + }, + { + "epoch": 0.1393960075072513, + "grad_norm": 1.5150205842435813, + "learning_rate": 9.689580460641581e-05, + "loss": 0.3809, + "step": 1634 + }, + { + "epoch": 0.13948131718136836, + "grad_norm": 1.3116340505348978, + "learning_rate": 9.68910108206532e-05, + "loss": 0.4094, + "step": 1635 + }, + { + "epoch": 0.1395666268554854, + "grad_norm": 1.6424958234527764, + "learning_rate": 9.688621345501109e-05, + "loss": 0.383, + "step": 1636 + }, + { + "epoch": 0.13965193652960245, + "grad_norm": 1.4290833577749193, + "learning_rate": 9.688141250985574e-05, + "loss": 0.3187, + "step": 1637 + }, + { + "epoch": 0.1397372462037195, + "grad_norm": 1.640384765475006, + "learning_rate": 9.687660798555367e-05, + "loss": 0.3707, + "step": 1638 + }, + { + "epoch": 0.13982255587783654, + "grad_norm": 1.5952529552380332, + "learning_rate": 9.687179988247167e-05, + "loss": 0.3988, + "step": 1639 + }, + { + "epoch": 0.1399078655519536, + "grad_norm": 1.6646528316244216, + "learning_rate": 9.686698820097684e-05, + "loss": 0.3842, + "step": 1640 + }, + { + "epoch": 0.13999317522607063, + "grad_norm": 1.3820627194924846, + "learning_rate": 9.686217294143652e-05, + "loss": 0.3952, + "step": 1641 + }, + { + "epoch": 0.14007848490018768, + "grad_norm": 1.1956011355102751, + "learning_rate": 9.68573541042183e-05, + "loss": 0.3292, + "step": 1642 + }, + { + "epoch": 0.14016379457430472, + "grad_norm": 1.8223178933713358, + "learning_rate": 9.68525316896901e-05, + "loss": 0.4261, + "step": 1643 + }, + { + "epoch": 0.14024910424842177, + "grad_norm": 1.5863897120495658, + "learning_rate": 9.684770569822008e-05, + "loss": 0.3831, + "step": 1644 + }, + { + "epoch": 0.14033441392253881, + "grad_norm": 1.6616413968280979, + "learning_rate": 9.684287613017669e-05, + "loss": 0.4377, + "step": 1645 + }, + { + "epoch": 0.14041972359665586, + "grad_norm": 1.6190177401489059, + "learning_rate": 9.683804298592862e-05, + "loss": 0.3691, + "step": 1646 + }, + { + "epoch": 0.1405050332707729, + "grad_norm": 1.5264711350711628, + "learning_rate": 9.683320626584486e-05, + "loss": 0.3523, + "step": 1647 + }, + { + "epoch": 0.14059034294488995, + "grad_norm": 1.561883728564702, + "learning_rate": 9.682836597029468e-05, + "loss": 0.4053, + "step": 1648 + }, + { + "epoch": 0.140675652619007, + "grad_norm": 1.4997118037250283, + "learning_rate": 9.68235220996476e-05, + "loss": 0.3447, + "step": 1649 + }, + { + "epoch": 0.14076096229312404, + "grad_norm": 1.367225777226452, + "learning_rate": 9.681867465427344e-05, + "loss": 0.3324, + "step": 1650 + }, + { + "epoch": 0.1408462719672411, + "grad_norm": 1.5050431351167353, + "learning_rate": 9.681382363454224e-05, + "loss": 0.3792, + "step": 1651 + }, + { + "epoch": 0.14093158164135813, + "grad_norm": 1.5880713928855237, + "learning_rate": 9.680896904082439e-05, + "loss": 0.348, + "step": 1652 + }, + { + "epoch": 0.14101689131547518, + "grad_norm": 1.5057708674642218, + "learning_rate": 9.68041108734905e-05, + "loss": 0.3379, + "step": 1653 + }, + { + "epoch": 0.14110220098959222, + "grad_norm": 1.7301400520436687, + "learning_rate": 9.679924913291145e-05, + "loss": 0.3727, + "step": 1654 + }, + { + "epoch": 0.14118751066370927, + "grad_norm": 1.3814422370081434, + "learning_rate": 9.679438381945843e-05, + "loss": 0.3718, + "step": 1655 + }, + { + "epoch": 0.14127282033782632, + "grad_norm": 1.659390477110099, + "learning_rate": 9.678951493350286e-05, + "loss": 0.3357, + "step": 1656 + }, + { + "epoch": 0.14135813001194336, + "grad_norm": 1.8907876063007454, + "learning_rate": 9.678464247541648e-05, + "loss": 0.4025, + "step": 1657 + }, + { + "epoch": 0.1414434396860604, + "grad_norm": 1.5118949846553016, + "learning_rate": 9.677976644557125e-05, + "loss": 0.3498, + "step": 1658 + }, + { + "epoch": 0.14152874936017745, + "grad_norm": 1.6497480055532063, + "learning_rate": 9.677488684433944e-05, + "loss": 0.381, + "step": 1659 + }, + { + "epoch": 0.1416140590342945, + "grad_norm": 1.4297140984067351, + "learning_rate": 9.677000367209356e-05, + "loss": 0.3534, + "step": 1660 + }, + { + "epoch": 0.14169936870841154, + "grad_norm": 1.3458052154926359, + "learning_rate": 9.676511692920647e-05, + "loss": 0.4, + "step": 1661 + }, + { + "epoch": 0.1417846783825286, + "grad_norm": 1.646494366489924, + "learning_rate": 9.67602266160512e-05, + "loss": 0.3945, + "step": 1662 + }, + { + "epoch": 0.14186998805664564, + "grad_norm": 1.6223821093456916, + "learning_rate": 9.675533273300111e-05, + "loss": 0.398, + "step": 1663 + }, + { + "epoch": 0.14195529773076268, + "grad_norm": 1.5667024175474185, + "learning_rate": 9.675043528042982e-05, + "loss": 0.3729, + "step": 1664 + }, + { + "epoch": 0.14204060740487973, + "grad_norm": 1.2793758921207807, + "learning_rate": 9.674553425871123e-05, + "loss": 0.3531, + "step": 1665 + }, + { + "epoch": 0.14212591707899677, + "grad_norm": 1.6327573045956492, + "learning_rate": 9.67406296682195e-05, + "loss": 0.419, + "step": 1666 + }, + { + "epoch": 0.1422112267531138, + "grad_norm": 1.6668238406774813, + "learning_rate": 9.673572150932909e-05, + "loss": 0.3788, + "step": 1667 + }, + { + "epoch": 0.14229653642723084, + "grad_norm": 1.5758329308299346, + "learning_rate": 9.673080978241468e-05, + "loss": 0.3755, + "step": 1668 + }, + { + "epoch": 0.14238184610134788, + "grad_norm": 1.6088291464708675, + "learning_rate": 9.672589448785128e-05, + "loss": 0.3768, + "step": 1669 + }, + { + "epoch": 0.14246715577546493, + "grad_norm": 1.5091788470519045, + "learning_rate": 9.672097562601414e-05, + "loss": 0.3577, + "step": 1670 + }, + { + "epoch": 0.14255246544958197, + "grad_norm": 1.5052430391985423, + "learning_rate": 9.671605319727876e-05, + "loss": 0.3385, + "step": 1671 + }, + { + "epoch": 0.14263777512369902, + "grad_norm": 1.334734815912357, + "learning_rate": 9.6711127202021e-05, + "loss": 0.3836, + "step": 1672 + }, + { + "epoch": 0.14272308479781606, + "grad_norm": 1.6075240667033628, + "learning_rate": 9.670619764061688e-05, + "loss": 0.3529, + "step": 1673 + }, + { + "epoch": 0.1428083944719331, + "grad_norm": 1.7703707221642828, + "learning_rate": 9.670126451344277e-05, + "loss": 0.3997, + "step": 1674 + }, + { + "epoch": 0.14289370414605015, + "grad_norm": 1.533964126177392, + "learning_rate": 9.66963278208753e-05, + "loss": 0.4162, + "step": 1675 + }, + { + "epoch": 0.1429790138201672, + "grad_norm": 1.476583813710259, + "learning_rate": 9.669138756329133e-05, + "loss": 0.4067, + "step": 1676 + }, + { + "epoch": 0.14306432349428425, + "grad_norm": 1.4193113922016716, + "learning_rate": 9.668644374106805e-05, + "loss": 0.3244, + "step": 1677 + }, + { + "epoch": 0.1431496331684013, + "grad_norm": 1.6095376534348946, + "learning_rate": 9.668149635458287e-05, + "loss": 0.3656, + "step": 1678 + }, + { + "epoch": 0.14323494284251834, + "grad_norm": 1.6445687260287902, + "learning_rate": 9.667654540421351e-05, + "loss": 0.4261, + "step": 1679 + }, + { + "epoch": 0.14332025251663538, + "grad_norm": 1.6519379534580574, + "learning_rate": 9.667159089033794e-05, + "loss": 0.3484, + "step": 1680 + }, + { + "epoch": 0.14340556219075243, + "grad_norm": 1.4769887157672892, + "learning_rate": 9.666663281333443e-05, + "loss": 0.3702, + "step": 1681 + }, + { + "epoch": 0.14349087186486947, + "grad_norm": 1.550751654585159, + "learning_rate": 9.666167117358149e-05, + "loss": 0.3688, + "step": 1682 + }, + { + "epoch": 0.14357618153898652, + "grad_norm": 1.7811706257585727, + "learning_rate": 9.66567059714579e-05, + "loss": 0.4493, + "step": 1683 + }, + { + "epoch": 0.14366149121310356, + "grad_norm": 1.6598863090510807, + "learning_rate": 9.665173720734277e-05, + "loss": 0.3824, + "step": 1684 + }, + { + "epoch": 0.1437468008872206, + "grad_norm": 1.6450085887278902, + "learning_rate": 9.66467648816154e-05, + "loss": 0.4062, + "step": 1685 + }, + { + "epoch": 0.14383211056133766, + "grad_norm": 1.2081004718593613, + "learning_rate": 9.66417889946554e-05, + "loss": 0.3916, + "step": 1686 + }, + { + "epoch": 0.1439174202354547, + "grad_norm": 1.517122060226807, + "learning_rate": 9.663680954684268e-05, + "loss": 0.381, + "step": 1687 + }, + { + "epoch": 0.14400272990957175, + "grad_norm": 1.4202328096197847, + "learning_rate": 9.663182653855737e-05, + "loss": 0.3432, + "step": 1688 + }, + { + "epoch": 0.1440880395836888, + "grad_norm": 1.353979562402305, + "learning_rate": 9.662683997017991e-05, + "loss": 0.3655, + "step": 1689 + }, + { + "epoch": 0.14417334925780584, + "grad_norm": 1.2832574237732859, + "learning_rate": 9.6621849842091e-05, + "loss": 0.3568, + "step": 1690 + }, + { + "epoch": 0.14425865893192288, + "grad_norm": 1.759166416521776, + "learning_rate": 9.661685615467157e-05, + "loss": 0.421, + "step": 1691 + }, + { + "epoch": 0.14434396860603993, + "grad_norm": 1.4712657080421907, + "learning_rate": 9.661185890830293e-05, + "loss": 0.3836, + "step": 1692 + }, + { + "epoch": 0.14442927828015698, + "grad_norm": 1.3180180764818294, + "learning_rate": 9.660685810336654e-05, + "loss": 0.3775, + "step": 1693 + }, + { + "epoch": 0.14451458795427402, + "grad_norm": 1.6689204553992965, + "learning_rate": 9.660185374024421e-05, + "loss": 0.4319, + "step": 1694 + }, + { + "epoch": 0.14459989762839107, + "grad_norm": 1.8767671522721343, + "learning_rate": 9.659684581931798e-05, + "loss": 0.4769, + "step": 1695 + }, + { + "epoch": 0.1446852073025081, + "grad_norm": 1.1487441918492973, + "learning_rate": 9.65918343409702e-05, + "loss": 0.3641, + "step": 1696 + }, + { + "epoch": 0.14477051697662516, + "grad_norm": 1.426492174134447, + "learning_rate": 9.658681930558345e-05, + "loss": 0.3907, + "step": 1697 + }, + { + "epoch": 0.1448558266507422, + "grad_norm": 1.343007436997308, + "learning_rate": 9.658180071354061e-05, + "loss": 0.3415, + "step": 1698 + }, + { + "epoch": 0.14494113632485925, + "grad_norm": 1.2960709066832559, + "learning_rate": 9.657677856522483e-05, + "loss": 0.3703, + "step": 1699 + }, + { + "epoch": 0.1450264459989763, + "grad_norm": 1.4444001488536842, + "learning_rate": 9.657175286101949e-05, + "loss": 0.3942, + "step": 1700 + }, + { + "epoch": 0.14511175567309334, + "grad_norm": 1.75717680562683, + "learning_rate": 9.656672360130832e-05, + "loss": 0.4542, + "step": 1701 + }, + { + "epoch": 0.14519706534721039, + "grad_norm": 1.4568337314280622, + "learning_rate": 9.656169078647526e-05, + "loss": 0.3509, + "step": 1702 + }, + { + "epoch": 0.14528237502132743, + "grad_norm": 1.3374519909950235, + "learning_rate": 9.655665441690453e-05, + "loss": 0.3435, + "step": 1703 + }, + { + "epoch": 0.14536768469544448, + "grad_norm": 1.4683551663447267, + "learning_rate": 9.655161449298062e-05, + "loss": 0.3478, + "step": 1704 + }, + { + "epoch": 0.1454529943695615, + "grad_norm": 1.5266726841592815, + "learning_rate": 9.654657101508836e-05, + "loss": 0.4242, + "step": 1705 + }, + { + "epoch": 0.14553830404367854, + "grad_norm": 1.473775584975126, + "learning_rate": 9.654152398361271e-05, + "loss": 0.3427, + "step": 1706 + }, + { + "epoch": 0.14562361371779559, + "grad_norm": 1.3406683605802416, + "learning_rate": 9.653647339893905e-05, + "loss": 0.4031, + "step": 1707 + }, + { + "epoch": 0.14570892339191263, + "grad_norm": 1.667479753843185, + "learning_rate": 9.653141926145292e-05, + "loss": 0.3689, + "step": 1708 + }, + { + "epoch": 0.14579423306602968, + "grad_norm": 2.777221333613233, + "learning_rate": 9.652636157154022e-05, + "loss": 0.4743, + "step": 1709 + }, + { + "epoch": 0.14587954274014672, + "grad_norm": 1.8237932186744834, + "learning_rate": 9.652130032958704e-05, + "loss": 0.3778, + "step": 1710 + }, + { + "epoch": 0.14596485241426377, + "grad_norm": 1.6145583448219605, + "learning_rate": 9.651623553597981e-05, + "loss": 0.4175, + "step": 1711 + }, + { + "epoch": 0.1460501620883808, + "grad_norm": 1.5018988352712577, + "learning_rate": 9.651116719110517e-05, + "loss": 0.3816, + "step": 1712 + }, + { + "epoch": 0.14613547176249786, + "grad_norm": 1.5178615153273747, + "learning_rate": 9.650609529535008e-05, + "loss": 0.4001, + "step": 1713 + }, + { + "epoch": 0.1462207814366149, + "grad_norm": 1.2054417091051284, + "learning_rate": 9.650101984910174e-05, + "loss": 0.3503, + "step": 1714 + }, + { + "epoch": 0.14630609111073195, + "grad_norm": 1.7486335324051907, + "learning_rate": 9.649594085274764e-05, + "loss": 0.3849, + "step": 1715 + }, + { + "epoch": 0.146391400784849, + "grad_norm": 1.8563102018824313, + "learning_rate": 9.649085830667555e-05, + "loss": 0.4299, + "step": 1716 + }, + { + "epoch": 0.14647671045896604, + "grad_norm": 1.5228097257974407, + "learning_rate": 9.648577221127346e-05, + "loss": 0.3526, + "step": 1717 + }, + { + "epoch": 0.1465620201330831, + "grad_norm": 1.4413907843693086, + "learning_rate": 9.64806825669297e-05, + "loss": 0.4098, + "step": 1718 + }, + { + "epoch": 0.14664732980720013, + "grad_norm": 1.3238663556163661, + "learning_rate": 9.647558937403283e-05, + "loss": 0.4141, + "step": 1719 + }, + { + "epoch": 0.14673263948131718, + "grad_norm": 1.3132578160670438, + "learning_rate": 9.647049263297168e-05, + "loss": 0.416, + "step": 1720 + }, + { + "epoch": 0.14681794915543422, + "grad_norm": 1.549874835959702, + "learning_rate": 9.646539234413535e-05, + "loss": 0.4165, + "step": 1721 + }, + { + "epoch": 0.14690325882955127, + "grad_norm": 1.3768410062121659, + "learning_rate": 9.646028850791325e-05, + "loss": 0.3991, + "step": 1722 + }, + { + "epoch": 0.14698856850366832, + "grad_norm": 1.4470703981749367, + "learning_rate": 9.645518112469498e-05, + "loss": 0.3556, + "step": 1723 + }, + { + "epoch": 0.14707387817778536, + "grad_norm": 1.5076693926481615, + "learning_rate": 9.645007019487052e-05, + "loss": 0.3705, + "step": 1724 + }, + { + "epoch": 0.1471591878519024, + "grad_norm": 1.508918791196701, + "learning_rate": 9.644495571883003e-05, + "loss": 0.3791, + "step": 1725 + }, + { + "epoch": 0.14724449752601945, + "grad_norm": 1.3324806148986974, + "learning_rate": 9.643983769696398e-05, + "loss": 0.3285, + "step": 1726 + }, + { + "epoch": 0.1473298072001365, + "grad_norm": 1.3555679504856217, + "learning_rate": 9.64347161296631e-05, + "loss": 0.3561, + "step": 1727 + }, + { + "epoch": 0.14741511687425354, + "grad_norm": 1.6622513778756545, + "learning_rate": 9.64295910173184e-05, + "loss": 0.3974, + "step": 1728 + }, + { + "epoch": 0.1475004265483706, + "grad_norm": 1.593313419159069, + "learning_rate": 9.642446236032114e-05, + "loss": 0.4114, + "step": 1729 + }, + { + "epoch": 0.14758573622248763, + "grad_norm": 1.870824360418856, + "learning_rate": 9.64193301590629e-05, + "loss": 0.4599, + "step": 1730 + }, + { + "epoch": 0.14767104589660468, + "grad_norm": 1.8072247524864626, + "learning_rate": 9.641419441393546e-05, + "loss": 0.385, + "step": 1731 + }, + { + "epoch": 0.14775635557072173, + "grad_norm": 1.5420268213732446, + "learning_rate": 9.640905512533091e-05, + "loss": 0.4024, + "step": 1732 + }, + { + "epoch": 0.14784166524483877, + "grad_norm": 1.6640824151078697, + "learning_rate": 9.640391229364165e-05, + "loss": 0.4239, + "step": 1733 + }, + { + "epoch": 0.14792697491895582, + "grad_norm": 1.3745951489999428, + "learning_rate": 9.639876591926026e-05, + "loss": 0.3413, + "step": 1734 + }, + { + "epoch": 0.14801228459307286, + "grad_norm": 1.4852889610205458, + "learning_rate": 9.639361600257966e-05, + "loss": 0.3701, + "step": 1735 + }, + { + "epoch": 0.1480975942671899, + "grad_norm": 1.6142375042440416, + "learning_rate": 9.6388462543993e-05, + "loss": 0.398, + "step": 1736 + }, + { + "epoch": 0.14818290394130695, + "grad_norm": 1.520588641753117, + "learning_rate": 9.638330554389374e-05, + "loss": 0.3728, + "step": 1737 + }, + { + "epoch": 0.148268213615424, + "grad_norm": 1.3499538131159847, + "learning_rate": 9.637814500267559e-05, + "loss": 0.3325, + "step": 1738 + }, + { + "epoch": 0.14835352328954104, + "grad_norm": 1.737518074435624, + "learning_rate": 9.63729809207325e-05, + "loss": 0.4795, + "step": 1739 + }, + { + "epoch": 0.1484388329636581, + "grad_norm": 1.4407734295887524, + "learning_rate": 9.636781329845877e-05, + "loss": 0.4074, + "step": 1740 + }, + { + "epoch": 0.14852414263777514, + "grad_norm": 1.6469455855876347, + "learning_rate": 9.636264213624889e-05, + "loss": 0.3681, + "step": 1741 + }, + { + "epoch": 0.14860945231189218, + "grad_norm": 1.6471412950091706, + "learning_rate": 9.635746743449763e-05, + "loss": 0.3806, + "step": 1742 + }, + { + "epoch": 0.1486947619860092, + "grad_norm": 1.4311581744592927, + "learning_rate": 9.635228919360009e-05, + "loss": 0.3198, + "step": 1743 + }, + { + "epoch": 0.14878007166012625, + "grad_norm": 1.4545755017493271, + "learning_rate": 9.634710741395158e-05, + "loss": 0.3289, + "step": 1744 + }, + { + "epoch": 0.1488653813342433, + "grad_norm": 1.5985590615385246, + "learning_rate": 9.634192209594773e-05, + "loss": 0.3824, + "step": 1745 + }, + { + "epoch": 0.14895069100836034, + "grad_norm": 1.3794190315163781, + "learning_rate": 9.633673323998436e-05, + "loss": 0.3005, + "step": 1746 + }, + { + "epoch": 0.14903600068247738, + "grad_norm": 1.8436476468916945, + "learning_rate": 9.633154084645766e-05, + "loss": 0.4571, + "step": 1747 + }, + { + "epoch": 0.14912131035659443, + "grad_norm": 1.651139490095605, + "learning_rate": 9.6326344915764e-05, + "loss": 0.4404, + "step": 1748 + }, + { + "epoch": 0.14920662003071147, + "grad_norm": 1.6451433723373943, + "learning_rate": 9.632114544830011e-05, + "loss": 0.3688, + "step": 1749 + }, + { + "epoch": 0.14929192970482852, + "grad_norm": 1.4586588995695942, + "learning_rate": 9.631594244446289e-05, + "loss": 0.4106, + "step": 1750 + }, + { + "epoch": 0.14937723937894556, + "grad_norm": 1.5093414776793508, + "learning_rate": 9.63107359046496e-05, + "loss": 0.3227, + "step": 1751 + }, + { + "epoch": 0.1494625490530626, + "grad_norm": 1.5598998751664908, + "learning_rate": 9.630552582925772e-05, + "loss": 0.3802, + "step": 1752 + }, + { + "epoch": 0.14954785872717966, + "grad_norm": 1.4887288539355181, + "learning_rate": 9.630031221868501e-05, + "loss": 0.3677, + "step": 1753 + }, + { + "epoch": 0.1496331684012967, + "grad_norm": 1.2235965221043843, + "learning_rate": 9.62950950733295e-05, + "loss": 0.3674, + "step": 1754 + }, + { + "epoch": 0.14971847807541375, + "grad_norm": 1.515255833921545, + "learning_rate": 9.62898743935895e-05, + "loss": 0.4021, + "step": 1755 + }, + { + "epoch": 0.1498037877495308, + "grad_norm": 1.4305101539707445, + "learning_rate": 9.628465017986356e-05, + "loss": 0.3064, + "step": 1756 + }, + { + "epoch": 0.14988909742364784, + "grad_norm": 1.6500262518441646, + "learning_rate": 9.627942243255055e-05, + "loss": 0.3701, + "step": 1757 + }, + { + "epoch": 0.14997440709776488, + "grad_norm": 1.4100404728836626, + "learning_rate": 9.627419115204956e-05, + "loss": 0.4161, + "step": 1758 + }, + { + "epoch": 0.15005971677188193, + "grad_norm": 1.4655644385995943, + "learning_rate": 9.626895633875997e-05, + "loss": 0.4098, + "step": 1759 + }, + { + "epoch": 0.15014502644599897, + "grad_norm": 1.713293743653325, + "learning_rate": 9.626371799308144e-05, + "loss": 0.3595, + "step": 1760 + }, + { + "epoch": 0.15023033612011602, + "grad_norm": 1.6830835830686528, + "learning_rate": 9.625847611541388e-05, + "loss": 0.3947, + "step": 1761 + }, + { + "epoch": 0.15031564579423307, + "grad_norm": 2.0044790419722056, + "learning_rate": 9.625323070615751e-05, + "loss": 0.3541, + "step": 1762 + }, + { + "epoch": 0.1504009554683501, + "grad_norm": 1.3750529712530464, + "learning_rate": 9.624798176571274e-05, + "loss": 0.3518, + "step": 1763 + }, + { + "epoch": 0.15048626514246716, + "grad_norm": 1.7924677662099286, + "learning_rate": 9.624272929448033e-05, + "loss": 0.359, + "step": 1764 + }, + { + "epoch": 0.1505715748165842, + "grad_norm": 1.2978652023390904, + "learning_rate": 9.623747329286126e-05, + "loss": 0.3828, + "step": 1765 + }, + { + "epoch": 0.15065688449070125, + "grad_norm": 1.5247784312897292, + "learning_rate": 9.623221376125683e-05, + "loss": 0.3843, + "step": 1766 + }, + { + "epoch": 0.1507421941648183, + "grad_norm": 1.70749558477543, + "learning_rate": 9.622695070006855e-05, + "loss": 0.3822, + "step": 1767 + }, + { + "epoch": 0.15082750383893534, + "grad_norm": 1.198449002724245, + "learning_rate": 9.622168410969824e-05, + "loss": 0.3519, + "step": 1768 + }, + { + "epoch": 0.15091281351305239, + "grad_norm": 1.3043516720493946, + "learning_rate": 9.621641399054797e-05, + "loss": 0.4024, + "step": 1769 + }, + { + "epoch": 0.15099812318716943, + "grad_norm": 1.716999063748175, + "learning_rate": 9.621114034302007e-05, + "loss": 0.479, + "step": 1770 + }, + { + "epoch": 0.15108343286128648, + "grad_norm": 1.6873057218262109, + "learning_rate": 9.620586316751719e-05, + "loss": 0.3614, + "step": 1771 + }, + { + "epoch": 0.15116874253540352, + "grad_norm": 1.200797964065371, + "learning_rate": 9.620058246444218e-05, + "loss": 0.3291, + "step": 1772 + }, + { + "epoch": 0.15125405220952057, + "grad_norm": 1.4484056520715738, + "learning_rate": 9.619529823419821e-05, + "loss": 0.3586, + "step": 1773 + }, + { + "epoch": 0.1513393618836376, + "grad_norm": 1.2269084466853473, + "learning_rate": 9.619001047718871e-05, + "loss": 0.3339, + "step": 1774 + }, + { + "epoch": 0.15142467155775466, + "grad_norm": 1.3722119241311757, + "learning_rate": 9.618471919381735e-05, + "loss": 0.3553, + "step": 1775 + }, + { + "epoch": 0.1515099812318717, + "grad_norm": 1.6037306089708154, + "learning_rate": 9.617942438448812e-05, + "loss": 0.3295, + "step": 1776 + }, + { + "epoch": 0.15159529090598875, + "grad_norm": 1.4003300992902172, + "learning_rate": 9.617412604960523e-05, + "loss": 0.3681, + "step": 1777 + }, + { + "epoch": 0.1516806005801058, + "grad_norm": 1.7640789318417542, + "learning_rate": 9.616882418957318e-05, + "loss": 0.4051, + "step": 1778 + }, + { + "epoch": 0.15176591025422284, + "grad_norm": 1.169260898652667, + "learning_rate": 9.616351880479675e-05, + "loss": 0.324, + "step": 1779 + }, + { + "epoch": 0.1518512199283399, + "grad_norm": 1.2750751753691671, + "learning_rate": 9.615820989568098e-05, + "loss": 0.3355, + "step": 1780 + }, + { + "epoch": 0.15193652960245693, + "grad_norm": 1.4719317447552593, + "learning_rate": 9.615289746263116e-05, + "loss": 0.3695, + "step": 1781 + }, + { + "epoch": 0.15202183927657395, + "grad_norm": 1.4220472797361745, + "learning_rate": 9.614758150605286e-05, + "loss": 0.3708, + "step": 1782 + }, + { + "epoch": 0.152107148950691, + "grad_norm": 1.5917697646751967, + "learning_rate": 9.614226202635195e-05, + "loss": 0.3793, + "step": 1783 + }, + { + "epoch": 0.15219245862480804, + "grad_norm": 1.5759938268298548, + "learning_rate": 9.613693902393455e-05, + "loss": 0.4172, + "step": 1784 + }, + { + "epoch": 0.1522777682989251, + "grad_norm": 1.5676166680828407, + "learning_rate": 9.613161249920701e-05, + "loss": 0.3738, + "step": 1785 + }, + { + "epoch": 0.15236307797304213, + "grad_norm": 1.3826927521144667, + "learning_rate": 9.6126282452576e-05, + "loss": 0.3848, + "step": 1786 + }, + { + "epoch": 0.15244838764715918, + "grad_norm": 1.7691077258150025, + "learning_rate": 9.612094888444845e-05, + "loss": 0.4031, + "step": 1787 + }, + { + "epoch": 0.15253369732127622, + "grad_norm": 1.1394484735968708, + "learning_rate": 9.611561179523152e-05, + "loss": 0.3697, + "step": 1788 + }, + { + "epoch": 0.15261900699539327, + "grad_norm": 1.4672035029536938, + "learning_rate": 9.611027118533271e-05, + "loss": 0.3741, + "step": 1789 + }, + { + "epoch": 0.15270431666951031, + "grad_norm": 1.5877232011693645, + "learning_rate": 9.610492705515972e-05, + "loss": 0.3742, + "step": 1790 + }, + { + "epoch": 0.15278962634362736, + "grad_norm": 1.3683755127186337, + "learning_rate": 9.609957940512054e-05, + "loss": 0.3988, + "step": 1791 + }, + { + "epoch": 0.1528749360177444, + "grad_norm": 1.472683124651167, + "learning_rate": 9.609422823562345e-05, + "loss": 0.4032, + "step": 1792 + }, + { + "epoch": 0.15296024569186145, + "grad_norm": 1.3209454525955682, + "learning_rate": 9.608887354707699e-05, + "loss": 0.3413, + "step": 1793 + }, + { + "epoch": 0.1530455553659785, + "grad_norm": 1.792818549353607, + "learning_rate": 9.608351533988992e-05, + "loss": 0.3989, + "step": 1794 + }, + { + "epoch": 0.15313086504009554, + "grad_norm": 1.6625799461015265, + "learning_rate": 9.607815361447136e-05, + "loss": 0.3779, + "step": 1795 + }, + { + "epoch": 0.1532161747142126, + "grad_norm": 1.7639910133548111, + "learning_rate": 9.607278837123064e-05, + "loss": 0.4184, + "step": 1796 + }, + { + "epoch": 0.15330148438832963, + "grad_norm": 1.6711128137472173, + "learning_rate": 9.606741961057736e-05, + "loss": 0.3744, + "step": 1797 + }, + { + "epoch": 0.15338679406244668, + "grad_norm": 1.5126975978688757, + "learning_rate": 9.606204733292139e-05, + "loss": 0.3529, + "step": 1798 + }, + { + "epoch": 0.15347210373656373, + "grad_norm": 1.7457346024132616, + "learning_rate": 9.605667153867286e-05, + "loss": 0.3636, + "step": 1799 + }, + { + "epoch": 0.15355741341068077, + "grad_norm": 1.2638887811784152, + "learning_rate": 9.605129222824223e-05, + "loss": 0.3727, + "step": 1800 + }, + { + "epoch": 0.15364272308479782, + "grad_norm": 1.4243685477739585, + "learning_rate": 9.604590940204013e-05, + "loss": 0.3613, + "step": 1801 + }, + { + "epoch": 0.15372803275891486, + "grad_norm": 1.3146758436407582, + "learning_rate": 9.604052306047755e-05, + "loss": 0.3022, + "step": 1802 + }, + { + "epoch": 0.1538133424330319, + "grad_norm": 1.4733512715647232, + "learning_rate": 9.603513320396569e-05, + "loss": 0.3432, + "step": 1803 + }, + { + "epoch": 0.15389865210714895, + "grad_norm": 1.7525916663560606, + "learning_rate": 9.602973983291604e-05, + "loss": 0.341, + "step": 1804 + }, + { + "epoch": 0.153983961781266, + "grad_norm": 1.5942813511123175, + "learning_rate": 9.602434294774037e-05, + "loss": 0.3396, + "step": 1805 + }, + { + "epoch": 0.15406927145538304, + "grad_norm": 1.7481253664945806, + "learning_rate": 9.601894254885067e-05, + "loss": 0.3472, + "step": 1806 + }, + { + "epoch": 0.1541545811295001, + "grad_norm": 1.5993125928991831, + "learning_rate": 9.601353863665925e-05, + "loss": 0.417, + "step": 1807 + }, + { + "epoch": 0.15423989080361714, + "grad_norm": 1.4397825280853813, + "learning_rate": 9.600813121157868e-05, + "loss": 0.3458, + "step": 1808 + }, + { + "epoch": 0.15432520047773418, + "grad_norm": 1.3972929207746092, + "learning_rate": 9.600272027402178e-05, + "loss": 0.4074, + "step": 1809 + }, + { + "epoch": 0.15441051015185123, + "grad_norm": 1.2668024872859318, + "learning_rate": 9.599730582440163e-05, + "loss": 0.3325, + "step": 1810 + }, + { + "epoch": 0.15449581982596827, + "grad_norm": 1.6039052808101828, + "learning_rate": 9.599188786313162e-05, + "loss": 0.3699, + "step": 1811 + }, + { + "epoch": 0.15458112950008532, + "grad_norm": 1.5346076120856722, + "learning_rate": 9.598646639062538e-05, + "loss": 0.3955, + "step": 1812 + }, + { + "epoch": 0.15466643917420236, + "grad_norm": 1.6340080885109143, + "learning_rate": 9.59810414072968e-05, + "loss": 0.4158, + "step": 1813 + }, + { + "epoch": 0.1547517488483194, + "grad_norm": 1.5695878059979012, + "learning_rate": 9.597561291356004e-05, + "loss": 0.3773, + "step": 1814 + }, + { + "epoch": 0.15483705852243645, + "grad_norm": 1.466002552717206, + "learning_rate": 9.597018090982956e-05, + "loss": 0.3832, + "step": 1815 + }, + { + "epoch": 0.1549223681965535, + "grad_norm": 1.254091433341745, + "learning_rate": 9.596474539652005e-05, + "loss": 0.359, + "step": 1816 + }, + { + "epoch": 0.15500767787067055, + "grad_norm": 1.4213354742941797, + "learning_rate": 9.595930637404649e-05, + "loss": 0.3661, + "step": 1817 + }, + { + "epoch": 0.1550929875447876, + "grad_norm": 1.330105819077737, + "learning_rate": 9.59538638428241e-05, + "loss": 0.3946, + "step": 1818 + }, + { + "epoch": 0.15517829721890464, + "grad_norm": 1.4036141169638505, + "learning_rate": 9.594841780326842e-05, + "loss": 0.3244, + "step": 1819 + }, + { + "epoch": 0.15526360689302166, + "grad_norm": 1.303698776131246, + "learning_rate": 9.59429682557952e-05, + "loss": 0.3521, + "step": 1820 + }, + { + "epoch": 0.1553489165671387, + "grad_norm": 1.4410373453440648, + "learning_rate": 9.59375152008205e-05, + "loss": 0.3277, + "step": 1821 + }, + { + "epoch": 0.15543422624125575, + "grad_norm": 1.3475386858537444, + "learning_rate": 9.593205863876062e-05, + "loss": 0.4049, + "step": 1822 + }, + { + "epoch": 0.1555195359153728, + "grad_norm": 1.4593369799609488, + "learning_rate": 9.592659857003214e-05, + "loss": 0.3505, + "step": 1823 + }, + { + "epoch": 0.15560484558948984, + "grad_norm": 1.5656568299368185, + "learning_rate": 9.592113499505193e-05, + "loss": 0.3679, + "step": 1824 + }, + { + "epoch": 0.15569015526360688, + "grad_norm": 1.679516592867654, + "learning_rate": 9.591566791423708e-05, + "loss": 0.3855, + "step": 1825 + }, + { + "epoch": 0.15577546493772393, + "grad_norm": 1.6455988737090415, + "learning_rate": 9.591019732800499e-05, + "loss": 0.3675, + "step": 1826 + }, + { + "epoch": 0.15586077461184097, + "grad_norm": 1.4259535593921322, + "learning_rate": 9.590472323677328e-05, + "loss": 0.3635, + "step": 1827 + }, + { + "epoch": 0.15594608428595802, + "grad_norm": 1.5206448512226005, + "learning_rate": 9.589924564095991e-05, + "loss": 0.3685, + "step": 1828 + }, + { + "epoch": 0.15603139396007507, + "grad_norm": 1.6547382581092218, + "learning_rate": 9.589376454098304e-05, + "loss": 0.3603, + "step": 1829 + }, + { + "epoch": 0.1561167036341921, + "grad_norm": 1.4780365383138192, + "learning_rate": 9.58882799372611e-05, + "loss": 0.3628, + "step": 1830 + }, + { + "epoch": 0.15620201330830916, + "grad_norm": 1.5656247746206642, + "learning_rate": 9.588279183021288e-05, + "loss": 0.4144, + "step": 1831 + }, + { + "epoch": 0.1562873229824262, + "grad_norm": 1.895526302194404, + "learning_rate": 9.58773002202573e-05, + "loss": 0.4114, + "step": 1832 + }, + { + "epoch": 0.15637263265654325, + "grad_norm": 1.5216362893260724, + "learning_rate": 9.587180510781363e-05, + "loss": 0.3848, + "step": 1833 + }, + { + "epoch": 0.1564579423306603, + "grad_norm": 1.275557549434821, + "learning_rate": 9.586630649330142e-05, + "loss": 0.3665, + "step": 1834 + }, + { + "epoch": 0.15654325200477734, + "grad_norm": 1.657339637329028, + "learning_rate": 9.586080437714044e-05, + "loss": 0.435, + "step": 1835 + }, + { + "epoch": 0.15662856167889438, + "grad_norm": 1.7698956718517342, + "learning_rate": 9.585529875975074e-05, + "loss": 0.4315, + "step": 1836 + }, + { + "epoch": 0.15671387135301143, + "grad_norm": 1.5718661541244323, + "learning_rate": 9.584978964155266e-05, + "loss": 0.3674, + "step": 1837 + }, + { + "epoch": 0.15679918102712848, + "grad_norm": 1.488605774268017, + "learning_rate": 9.58442770229668e-05, + "loss": 0.3896, + "step": 1838 + }, + { + "epoch": 0.15688449070124552, + "grad_norm": 1.6015441893484523, + "learning_rate": 9.583876090441398e-05, + "loss": 0.4582, + "step": 1839 + }, + { + "epoch": 0.15696980037536257, + "grad_norm": 1.414752433606239, + "learning_rate": 9.583324128631537e-05, + "loss": 0.419, + "step": 1840 + }, + { + "epoch": 0.1570551100494796, + "grad_norm": 1.34897873714535, + "learning_rate": 9.582771816909234e-05, + "loss": 0.3896, + "step": 1841 + }, + { + "epoch": 0.15714041972359666, + "grad_norm": 1.3296950540653014, + "learning_rate": 9.582219155316656e-05, + "loss": 0.4392, + "step": 1842 + }, + { + "epoch": 0.1572257293977137, + "grad_norm": 1.1926690085423441, + "learning_rate": 9.581666143895994e-05, + "loss": 0.3672, + "step": 1843 + }, + { + "epoch": 0.15731103907183075, + "grad_norm": 1.295380524067418, + "learning_rate": 9.58111278268947e-05, + "loss": 0.3516, + "step": 1844 + }, + { + "epoch": 0.1573963487459478, + "grad_norm": 1.4439316210103457, + "learning_rate": 9.580559071739329e-05, + "loss": 0.3323, + "step": 1845 + }, + { + "epoch": 0.15748165842006484, + "grad_norm": 1.4219762745097242, + "learning_rate": 9.580005011087844e-05, + "loss": 0.3759, + "step": 1846 + }, + { + "epoch": 0.1575669680941819, + "grad_norm": 1.614302415918115, + "learning_rate": 9.579450600777314e-05, + "loss": 0.4064, + "step": 1847 + }, + { + "epoch": 0.15765227776829893, + "grad_norm": 1.7286542416611643, + "learning_rate": 9.578895840850066e-05, + "loss": 0.4124, + "step": 1848 + }, + { + "epoch": 0.15773758744241598, + "grad_norm": 1.4105163283567599, + "learning_rate": 9.578340731348454e-05, + "loss": 0.3605, + "step": 1849 + }, + { + "epoch": 0.15782289711653302, + "grad_norm": 1.4377166750605601, + "learning_rate": 9.577785272314854e-05, + "loss": 0.4112, + "step": 1850 + }, + { + "epoch": 0.15790820679065007, + "grad_norm": 1.5296013968591018, + "learning_rate": 9.577229463791677e-05, + "loss": 0.4608, + "step": 1851 + }, + { + "epoch": 0.15799351646476711, + "grad_norm": 1.6306203335760683, + "learning_rate": 9.576673305821353e-05, + "loss": 0.4, + "step": 1852 + }, + { + "epoch": 0.15807882613888416, + "grad_norm": 1.5714337345756821, + "learning_rate": 9.576116798446342e-05, + "loss": 0.3796, + "step": 1853 + }, + { + "epoch": 0.1581641358130012, + "grad_norm": 1.4518948341883604, + "learning_rate": 9.575559941709131e-05, + "loss": 0.4089, + "step": 1854 + }, + { + "epoch": 0.15824944548711825, + "grad_norm": 1.3384995460067737, + "learning_rate": 9.575002735652234e-05, + "loss": 0.3748, + "step": 1855 + }, + { + "epoch": 0.1583347551612353, + "grad_norm": 1.2148542495715386, + "learning_rate": 9.57444518031819e-05, + "loss": 0.3273, + "step": 1856 + }, + { + "epoch": 0.15842006483535234, + "grad_norm": 1.7143298098024615, + "learning_rate": 9.573887275749564e-05, + "loss": 0.4324, + "step": 1857 + }, + { + "epoch": 0.15850537450946936, + "grad_norm": 1.276832408222686, + "learning_rate": 9.573329021988949e-05, + "loss": 0.3495, + "step": 1858 + }, + { + "epoch": 0.1585906841835864, + "grad_norm": 1.3173591040421309, + "learning_rate": 9.572770419078966e-05, + "loss": 0.3829, + "step": 1859 + }, + { + "epoch": 0.15867599385770345, + "grad_norm": 1.4148952497928433, + "learning_rate": 9.572211467062264e-05, + "loss": 0.3513, + "step": 1860 + }, + { + "epoch": 0.1587613035318205, + "grad_norm": 1.5818611712927055, + "learning_rate": 9.57165216598151e-05, + "loss": 0.3626, + "step": 1861 + }, + { + "epoch": 0.15884661320593754, + "grad_norm": 1.4571290993241361, + "learning_rate": 9.57109251587941e-05, + "loss": 0.3605, + "step": 1862 + }, + { + "epoch": 0.1589319228800546, + "grad_norm": 1.5544672671037723, + "learning_rate": 9.570532516798685e-05, + "loss": 0.3663, + "step": 1863 + }, + { + "epoch": 0.15901723255417163, + "grad_norm": 1.5049698672652363, + "learning_rate": 9.56997216878209e-05, + "loss": 0.3807, + "step": 1864 + }, + { + "epoch": 0.15910254222828868, + "grad_norm": 1.41633069972578, + "learning_rate": 9.569411471872404e-05, + "loss": 0.3576, + "step": 1865 + }, + { + "epoch": 0.15918785190240572, + "grad_norm": 1.587760065958907, + "learning_rate": 9.568850426112436e-05, + "loss": 0.3111, + "step": 1866 + }, + { + "epoch": 0.15927316157652277, + "grad_norm": 1.3097719269147807, + "learning_rate": 9.568289031545017e-05, + "loss": 0.3019, + "step": 1867 + }, + { + "epoch": 0.15935847125063982, + "grad_norm": 1.6203138566632234, + "learning_rate": 9.567727288213005e-05, + "loss": 0.3679, + "step": 1868 + }, + { + "epoch": 0.15944378092475686, + "grad_norm": 1.2860025326865545, + "learning_rate": 9.567165196159288e-05, + "loss": 0.353, + "step": 1869 + }, + { + "epoch": 0.1595290905988739, + "grad_norm": 1.5200721542900675, + "learning_rate": 9.566602755426776e-05, + "loss": 0.4107, + "step": 1870 + }, + { + "epoch": 0.15961440027299095, + "grad_norm": 1.476225153417582, + "learning_rate": 9.566039966058414e-05, + "loss": 0.3631, + "step": 1871 + }, + { + "epoch": 0.159699709947108, + "grad_norm": 1.607185652329586, + "learning_rate": 9.565476828097163e-05, + "loss": 0.4038, + "step": 1872 + }, + { + "epoch": 0.15978501962122504, + "grad_norm": 1.5163932504936912, + "learning_rate": 9.564913341586017e-05, + "loss": 0.385, + "step": 1873 + }, + { + "epoch": 0.1598703292953421, + "grad_norm": 1.4986182842646667, + "learning_rate": 9.564349506567996e-05, + "loss": 0.3332, + "step": 1874 + }, + { + "epoch": 0.15995563896945914, + "grad_norm": 2.3769041760058527, + "learning_rate": 9.563785323086143e-05, + "loss": 0.312, + "step": 1875 + }, + { + "epoch": 0.16004094864357618, + "grad_norm": 1.6666128467771033, + "learning_rate": 9.563220791183535e-05, + "loss": 0.3998, + "step": 1876 + }, + { + "epoch": 0.16012625831769323, + "grad_norm": 1.742512899211497, + "learning_rate": 9.562655910903267e-05, + "loss": 0.4209, + "step": 1877 + }, + { + "epoch": 0.16021156799181027, + "grad_norm": 1.3649889462061866, + "learning_rate": 9.562090682288467e-05, + "loss": 0.3736, + "step": 1878 + }, + { + "epoch": 0.16029687766592732, + "grad_norm": 1.6153965122532383, + "learning_rate": 9.561525105382286e-05, + "loss": 0.3388, + "step": 1879 + }, + { + "epoch": 0.16038218734004436, + "grad_norm": 1.4602591015558704, + "learning_rate": 9.560959180227902e-05, + "loss": 0.3718, + "step": 1880 + }, + { + "epoch": 0.1604674970141614, + "grad_norm": 1.3068630331431983, + "learning_rate": 9.560392906868522e-05, + "loss": 0.3473, + "step": 1881 + }, + { + "epoch": 0.16055280668827845, + "grad_norm": 1.4672763005400398, + "learning_rate": 9.55982628534738e-05, + "loss": 0.3722, + "step": 1882 + }, + { + "epoch": 0.1606381163623955, + "grad_norm": 1.3205055682531819, + "learning_rate": 9.559259315707729e-05, + "loss": 0.3526, + "step": 1883 + }, + { + "epoch": 0.16072342603651255, + "grad_norm": 1.5156657419922688, + "learning_rate": 9.558691997992858e-05, + "loss": 0.4008, + "step": 1884 + }, + { + "epoch": 0.1608087357106296, + "grad_norm": 1.5963796724539772, + "learning_rate": 9.558124332246078e-05, + "loss": 0.3475, + "step": 1885 + }, + { + "epoch": 0.16089404538474664, + "grad_norm": 1.3398050463900162, + "learning_rate": 9.557556318510728e-05, + "loss": 0.386, + "step": 1886 + }, + { + "epoch": 0.16097935505886368, + "grad_norm": 1.4352413345391606, + "learning_rate": 9.55698795683017e-05, + "loss": 0.3612, + "step": 1887 + }, + { + "epoch": 0.16106466473298073, + "grad_norm": 1.6608449427455243, + "learning_rate": 9.556419247247799e-05, + "loss": 0.3772, + "step": 1888 + }, + { + "epoch": 0.16114997440709777, + "grad_norm": 1.3761962541970234, + "learning_rate": 9.55585018980703e-05, + "loss": 0.3545, + "step": 1889 + }, + { + "epoch": 0.16123528408121482, + "grad_norm": 1.3104686457883066, + "learning_rate": 9.555280784551308e-05, + "loss": 0.3422, + "step": 1890 + }, + { + "epoch": 0.16132059375533186, + "grad_norm": 1.438931871547079, + "learning_rate": 9.554711031524107e-05, + "loss": 0.368, + "step": 1891 + }, + { + "epoch": 0.1614059034294489, + "grad_norm": 1.830286948423147, + "learning_rate": 9.554140930768922e-05, + "loss": 0.3997, + "step": 1892 + }, + { + "epoch": 0.16149121310356596, + "grad_norm": 1.470565485497824, + "learning_rate": 9.553570482329277e-05, + "loss": 0.3638, + "step": 1893 + }, + { + "epoch": 0.161576522777683, + "grad_norm": 1.4154990470718631, + "learning_rate": 9.552999686248722e-05, + "loss": 0.3842, + "step": 1894 + }, + { + "epoch": 0.16166183245180005, + "grad_norm": 1.922638098093566, + "learning_rate": 9.552428542570838e-05, + "loss": 0.4605, + "step": 1895 + }, + { + "epoch": 0.1617471421259171, + "grad_norm": 1.6114139418435518, + "learning_rate": 9.551857051339225e-05, + "loss": 0.3604, + "step": 1896 + }, + { + "epoch": 0.1618324518000341, + "grad_norm": 1.6308829117407115, + "learning_rate": 9.551285212597516e-05, + "loss": 0.4207, + "step": 1897 + }, + { + "epoch": 0.16191776147415116, + "grad_norm": 1.6920064869839742, + "learning_rate": 9.550713026389366e-05, + "loss": 0.4286, + "step": 1898 + }, + { + "epoch": 0.1620030711482682, + "grad_norm": 1.595518738868818, + "learning_rate": 9.550140492758457e-05, + "loss": 0.3979, + "step": 1899 + }, + { + "epoch": 0.16208838082238525, + "grad_norm": 1.3995563996382685, + "learning_rate": 9.549567611748503e-05, + "loss": 0.3363, + "step": 1900 + }, + { + "epoch": 0.1621736904965023, + "grad_norm": 1.8677251193442368, + "learning_rate": 9.548994383403238e-05, + "loss": 0.3814, + "step": 1901 + }, + { + "epoch": 0.16225900017061934, + "grad_norm": 1.2499863623828815, + "learning_rate": 9.548420807766425e-05, + "loss": 0.3538, + "step": 1902 + }, + { + "epoch": 0.16234430984473638, + "grad_norm": 1.5057369830591596, + "learning_rate": 9.547846884881853e-05, + "loss": 0.3974, + "step": 1903 + }, + { + "epoch": 0.16242961951885343, + "grad_norm": 1.4486077761019904, + "learning_rate": 9.547272614793339e-05, + "loss": 0.3904, + "step": 1904 + }, + { + "epoch": 0.16251492919297048, + "grad_norm": 1.2552633102006092, + "learning_rate": 9.546697997544725e-05, + "loss": 0.354, + "step": 1905 + }, + { + "epoch": 0.16260023886708752, + "grad_norm": 1.3700055534361615, + "learning_rate": 9.546123033179879e-05, + "loss": 0.3211, + "step": 1906 + }, + { + "epoch": 0.16268554854120457, + "grad_norm": 1.178256347781605, + "learning_rate": 9.545547721742698e-05, + "loss": 0.3716, + "step": 1907 + }, + { + "epoch": 0.1627708582153216, + "grad_norm": 1.6359036767299517, + "learning_rate": 9.544972063277104e-05, + "loss": 0.4403, + "step": 1908 + }, + { + "epoch": 0.16285616788943866, + "grad_norm": 1.671079927862571, + "learning_rate": 9.544396057827045e-05, + "loss": 0.4025, + "step": 1909 + }, + { + "epoch": 0.1629414775635557, + "grad_norm": 1.7543583820291182, + "learning_rate": 9.543819705436496e-05, + "loss": 0.4178, + "step": 1910 + }, + { + "epoch": 0.16302678723767275, + "grad_norm": 1.6320234518563321, + "learning_rate": 9.543243006149459e-05, + "loss": 0.3869, + "step": 1911 + }, + { + "epoch": 0.1631120969117898, + "grad_norm": 1.3509923801927257, + "learning_rate": 9.542665960009959e-05, + "loss": 0.3269, + "step": 1912 + }, + { + "epoch": 0.16319740658590684, + "grad_norm": 1.448202512047144, + "learning_rate": 9.542088567062055e-05, + "loss": 0.4043, + "step": 1913 + }, + { + "epoch": 0.16328271626002389, + "grad_norm": 1.6246689679350357, + "learning_rate": 9.541510827349823e-05, + "loss": 0.3736, + "step": 1914 + }, + { + "epoch": 0.16336802593414093, + "grad_norm": 1.6026162007495455, + "learning_rate": 9.540932740917374e-05, + "loss": 0.355, + "step": 1915 + }, + { + "epoch": 0.16345333560825798, + "grad_norm": 1.5214137794266787, + "learning_rate": 9.540354307808841e-05, + "loss": 0.3675, + "step": 1916 + }, + { + "epoch": 0.16353864528237502, + "grad_norm": 1.527359047737488, + "learning_rate": 9.539775528068384e-05, + "loss": 0.3929, + "step": 1917 + }, + { + "epoch": 0.16362395495649207, + "grad_norm": 1.3638203164494591, + "learning_rate": 9.53919640174019e-05, + "loss": 0.3719, + "step": 1918 + }, + { + "epoch": 0.1637092646306091, + "grad_norm": 1.7213555440353754, + "learning_rate": 9.538616928868473e-05, + "loss": 0.3115, + "step": 1919 + }, + { + "epoch": 0.16379457430472616, + "grad_norm": 1.490046058045333, + "learning_rate": 9.53803710949747e-05, + "loss": 0.3493, + "step": 1920 + }, + { + "epoch": 0.1638798839788432, + "grad_norm": 1.5242243373187732, + "learning_rate": 9.53745694367145e-05, + "loss": 0.3396, + "step": 1921 + }, + { + "epoch": 0.16396519365296025, + "grad_norm": 1.6386968999504534, + "learning_rate": 9.536876431434703e-05, + "loss": 0.3548, + "step": 1922 + }, + { + "epoch": 0.1640505033270773, + "grad_norm": 1.3515409787620016, + "learning_rate": 9.53629557283155e-05, + "loss": 0.3635, + "step": 1923 + }, + { + "epoch": 0.16413581300119434, + "grad_norm": 1.269756966112304, + "learning_rate": 9.535714367906336e-05, + "loss": 0.3563, + "step": 1924 + }, + { + "epoch": 0.1642211226753114, + "grad_norm": 1.8087833049794988, + "learning_rate": 9.535132816703432e-05, + "loss": 0.3984, + "step": 1925 + }, + { + "epoch": 0.16430643234942843, + "grad_norm": 1.4961853796087667, + "learning_rate": 9.534550919267238e-05, + "loss": 0.4283, + "step": 1926 + }, + { + "epoch": 0.16439174202354548, + "grad_norm": 1.3417772405484392, + "learning_rate": 9.533968675642178e-05, + "loss": 0.3747, + "step": 1927 + }, + { + "epoch": 0.16447705169766252, + "grad_norm": 1.4152247252465662, + "learning_rate": 9.533386085872703e-05, + "loss": 0.3624, + "step": 1928 + }, + { + "epoch": 0.16456236137177957, + "grad_norm": 1.56261695424111, + "learning_rate": 9.53280315000329e-05, + "loss": 0.3667, + "step": 1929 + }, + { + "epoch": 0.16464767104589662, + "grad_norm": 1.4431763405173679, + "learning_rate": 9.532219868078445e-05, + "loss": 0.3983, + "step": 1930 + }, + { + "epoch": 0.16473298072001366, + "grad_norm": 1.6622442063041163, + "learning_rate": 9.531636240142696e-05, + "loss": 0.3548, + "step": 1931 + }, + { + "epoch": 0.1648182903941307, + "grad_norm": 1.3615859894371232, + "learning_rate": 9.531052266240601e-05, + "loss": 0.3663, + "step": 1932 + }, + { + "epoch": 0.16490360006824775, + "grad_norm": 1.3166543517361726, + "learning_rate": 9.530467946416745e-05, + "loss": 0.3512, + "step": 1933 + }, + { + "epoch": 0.1649889097423648, + "grad_norm": 1.239041453082941, + "learning_rate": 9.529883280715735e-05, + "loss": 0.3846, + "step": 1934 + }, + { + "epoch": 0.16507421941648182, + "grad_norm": 1.1925324167288014, + "learning_rate": 9.529298269182209e-05, + "loss": 0.3299, + "step": 1935 + }, + { + "epoch": 0.16515952909059886, + "grad_norm": 1.362618446451218, + "learning_rate": 9.528712911860829e-05, + "loss": 0.3666, + "step": 1936 + }, + { + "epoch": 0.1652448387647159, + "grad_norm": 1.384878436444713, + "learning_rate": 9.528127208796282e-05, + "loss": 0.3809, + "step": 1937 + }, + { + "epoch": 0.16533014843883295, + "grad_norm": 1.348572880920801, + "learning_rate": 9.527541160033286e-05, + "loss": 0.4209, + "step": 1938 + }, + { + "epoch": 0.16541545811295, + "grad_norm": 1.5952522826902542, + "learning_rate": 9.526954765616583e-05, + "loss": 0.3868, + "step": 1939 + }, + { + "epoch": 0.16550076778706704, + "grad_norm": 1.8113470685707431, + "learning_rate": 9.526368025590938e-05, + "loss": 0.3842, + "step": 1940 + }, + { + "epoch": 0.1655860774611841, + "grad_norm": 1.5844576842272216, + "learning_rate": 9.525780940001148e-05, + "loss": 0.383, + "step": 1941 + }, + { + "epoch": 0.16567138713530113, + "grad_norm": 1.3340186453978917, + "learning_rate": 9.525193508892034e-05, + "loss": 0.3531, + "step": 1942 + }, + { + "epoch": 0.16575669680941818, + "grad_norm": 1.6727453087835131, + "learning_rate": 9.524605732308442e-05, + "loss": 0.4205, + "step": 1943 + }, + { + "epoch": 0.16584200648353523, + "grad_norm": 1.4807083620409418, + "learning_rate": 9.524017610295245e-05, + "loss": 0.3581, + "step": 1944 + }, + { + "epoch": 0.16592731615765227, + "grad_norm": 1.508919976243008, + "learning_rate": 9.523429142897346e-05, + "loss": 0.3935, + "step": 1945 + }, + { + "epoch": 0.16601262583176932, + "grad_norm": 1.980347518671471, + "learning_rate": 9.522840330159669e-05, + "loss": 0.5012, + "step": 1946 + }, + { + "epoch": 0.16609793550588636, + "grad_norm": 1.326389794851441, + "learning_rate": 9.522251172127166e-05, + "loss": 0.3495, + "step": 1947 + }, + { + "epoch": 0.1661832451800034, + "grad_norm": 2.0305766236615024, + "learning_rate": 9.521661668844817e-05, + "loss": 0.3906, + "step": 1948 + }, + { + "epoch": 0.16626855485412045, + "grad_norm": 1.4005438003936128, + "learning_rate": 9.521071820357627e-05, + "loss": 0.4327, + "step": 1949 + }, + { + "epoch": 0.1663538645282375, + "grad_norm": 1.2269886029933654, + "learning_rate": 9.520481626710631e-05, + "loss": 0.3716, + "step": 1950 + }, + { + "epoch": 0.16643917420235455, + "grad_norm": 1.2090205616997054, + "learning_rate": 9.519891087948884e-05, + "loss": 0.3712, + "step": 1951 + }, + { + "epoch": 0.1665244838764716, + "grad_norm": 1.5279518731462065, + "learning_rate": 9.519300204117469e-05, + "loss": 0.3206, + "step": 1952 + }, + { + "epoch": 0.16660979355058864, + "grad_norm": 1.4493951818616395, + "learning_rate": 9.518708975261502e-05, + "loss": 0.3715, + "step": 1953 + }, + { + "epoch": 0.16669510322470568, + "grad_norm": 1.867100988961649, + "learning_rate": 9.518117401426115e-05, + "loss": 0.4197, + "step": 1954 + }, + { + "epoch": 0.16678041289882273, + "grad_norm": 1.5852036305016588, + "learning_rate": 9.517525482656475e-05, + "loss": 0.3669, + "step": 1955 + }, + { + "epoch": 0.16686572257293977, + "grad_norm": 1.3905643921406787, + "learning_rate": 9.51693321899777e-05, + "loss": 0.365, + "step": 1956 + }, + { + "epoch": 0.16695103224705682, + "grad_norm": 1.4459994771225697, + "learning_rate": 9.516340610495215e-05, + "loss": 0.393, + "step": 1957 + }, + { + "epoch": 0.16703634192117386, + "grad_norm": 1.5083783921032674, + "learning_rate": 9.515747657194056e-05, + "loss": 0.4657, + "step": 1958 + }, + { + "epoch": 0.1671216515952909, + "grad_norm": 1.514685742861431, + "learning_rate": 9.51515435913956e-05, + "loss": 0.3848, + "step": 1959 + }, + { + "epoch": 0.16720696126940796, + "grad_norm": 1.4146436477110251, + "learning_rate": 9.514560716377023e-05, + "loss": 0.362, + "step": 1960 + }, + { + "epoch": 0.167292270943525, + "grad_norm": 1.3574481330362744, + "learning_rate": 9.513966728951764e-05, + "loss": 0.3546, + "step": 1961 + }, + { + "epoch": 0.16737758061764205, + "grad_norm": 1.4527296123134235, + "learning_rate": 9.513372396909133e-05, + "loss": 0.3305, + "step": 1962 + }, + { + "epoch": 0.1674628902917591, + "grad_norm": 1.5155710427029596, + "learning_rate": 9.512777720294504e-05, + "loss": 0.3812, + "step": 1963 + }, + { + "epoch": 0.16754819996587614, + "grad_norm": 1.6072407616963769, + "learning_rate": 9.512182699153276e-05, + "loss": 0.4183, + "step": 1964 + }, + { + "epoch": 0.16763350963999318, + "grad_norm": 1.4193646414549708, + "learning_rate": 9.511587333530877e-05, + "loss": 0.4076, + "step": 1965 + }, + { + "epoch": 0.16771881931411023, + "grad_norm": 1.7561358105879774, + "learning_rate": 9.51099162347276e-05, + "loss": 0.3919, + "step": 1966 + }, + { + "epoch": 0.16780412898822727, + "grad_norm": 1.5484298444110227, + "learning_rate": 9.510395569024404e-05, + "loss": 0.3444, + "step": 1967 + }, + { + "epoch": 0.16788943866234432, + "grad_norm": 1.5954295265846423, + "learning_rate": 9.509799170231314e-05, + "loss": 0.4207, + "step": 1968 + }, + { + "epoch": 0.16797474833646137, + "grad_norm": 1.416650585008485, + "learning_rate": 9.509202427139023e-05, + "loss": 0.349, + "step": 1969 + }, + { + "epoch": 0.1680600580105784, + "grad_norm": 1.5646460482053284, + "learning_rate": 9.508605339793087e-05, + "loss": 0.3922, + "step": 1970 + }, + { + "epoch": 0.16814536768469546, + "grad_norm": 1.56693842877801, + "learning_rate": 9.508007908239094e-05, + "loss": 0.4232, + "step": 1971 + }, + { + "epoch": 0.1682306773588125, + "grad_norm": 1.4906920284019969, + "learning_rate": 9.507410132522652e-05, + "loss": 0.3683, + "step": 1972 + }, + { + "epoch": 0.16831598703292952, + "grad_norm": 1.1634214543484371, + "learning_rate": 9.506812012689399e-05, + "loss": 0.3437, + "step": 1973 + }, + { + "epoch": 0.16840129670704657, + "grad_norm": 1.416588566067115, + "learning_rate": 9.506213548784996e-05, + "loss": 0.4123, + "step": 1974 + }, + { + "epoch": 0.1684866063811636, + "grad_norm": 1.4456044495041753, + "learning_rate": 9.505614740855138e-05, + "loss": 0.3667, + "step": 1975 + }, + { + "epoch": 0.16857191605528066, + "grad_norm": 1.3769619988771018, + "learning_rate": 9.505015588945534e-05, + "loss": 0.3469, + "step": 1976 + }, + { + "epoch": 0.1686572257293977, + "grad_norm": 1.610110438924628, + "learning_rate": 9.50441609310193e-05, + "loss": 0.4288, + "step": 1977 + }, + { + "epoch": 0.16874253540351475, + "grad_norm": 1.4264110271680157, + "learning_rate": 9.503816253370097e-05, + "loss": 0.3654, + "step": 1978 + }, + { + "epoch": 0.1688278450776318, + "grad_norm": 1.3930720968628294, + "learning_rate": 9.503216069795824e-05, + "loss": 0.3428, + "step": 1979 + }, + { + "epoch": 0.16891315475174884, + "grad_norm": 1.6184226216841973, + "learning_rate": 9.502615542424933e-05, + "loss": 0.3701, + "step": 1980 + }, + { + "epoch": 0.16899846442586589, + "grad_norm": 1.3913098481253559, + "learning_rate": 9.502014671303275e-05, + "loss": 0.3523, + "step": 1981 + }, + { + "epoch": 0.16908377409998293, + "grad_norm": 1.396789602788483, + "learning_rate": 9.501413456476717e-05, + "loss": 0.3763, + "step": 1982 + }, + { + "epoch": 0.16916908377409998, + "grad_norm": 1.6872634721931155, + "learning_rate": 9.500811897991164e-05, + "loss": 0.4412, + "step": 1983 + }, + { + "epoch": 0.16925439344821702, + "grad_norm": 1.663980687955517, + "learning_rate": 9.500209995892541e-05, + "loss": 0.4963, + "step": 1984 + }, + { + "epoch": 0.16933970312233407, + "grad_norm": 1.3861634768012483, + "learning_rate": 9.499607750226797e-05, + "loss": 0.3429, + "step": 1985 + }, + { + "epoch": 0.1694250127964511, + "grad_norm": 1.5270449450960053, + "learning_rate": 9.499005161039914e-05, + "loss": 0.3507, + "step": 1986 + }, + { + "epoch": 0.16951032247056816, + "grad_norm": 1.6017181320866514, + "learning_rate": 9.498402228377892e-05, + "loss": 0.3392, + "step": 1987 + }, + { + "epoch": 0.1695956321446852, + "grad_norm": 1.4465822158429134, + "learning_rate": 9.497798952286767e-05, + "loss": 0.3808, + "step": 1988 + }, + { + "epoch": 0.16968094181880225, + "grad_norm": 1.5496939203091256, + "learning_rate": 9.497195332812592e-05, + "loss": 0.3526, + "step": 1989 + }, + { + "epoch": 0.1697662514929193, + "grad_norm": 1.4224963664459322, + "learning_rate": 9.49659137000145e-05, + "loss": 0.4149, + "step": 1990 + }, + { + "epoch": 0.16985156116703634, + "grad_norm": 1.7796148189878567, + "learning_rate": 9.495987063899454e-05, + "loss": 0.3964, + "step": 1991 + }, + { + "epoch": 0.1699368708411534, + "grad_norm": 1.436033911057561, + "learning_rate": 9.495382414552737e-05, + "loss": 0.3897, + "step": 1992 + }, + { + "epoch": 0.17002218051527043, + "grad_norm": 1.6045434517544594, + "learning_rate": 9.494777422007462e-05, + "loss": 0.3629, + "step": 1993 + }, + { + "epoch": 0.17010749018938748, + "grad_norm": 1.8689485170183626, + "learning_rate": 9.494172086309813e-05, + "loss": 0.3982, + "step": 1994 + }, + { + "epoch": 0.17019279986350452, + "grad_norm": 1.6164578620853254, + "learning_rate": 9.493566407506009e-05, + "loss": 0.3853, + "step": 1995 + }, + { + "epoch": 0.17027810953762157, + "grad_norm": 1.4874631380274121, + "learning_rate": 9.492960385642288e-05, + "loss": 0.332, + "step": 1996 + }, + { + "epoch": 0.17036341921173861, + "grad_norm": 1.2198479426319129, + "learning_rate": 9.492354020764919e-05, + "loss": 0.33, + "step": 1997 + }, + { + "epoch": 0.17044872888585566, + "grad_norm": 1.768381313544331, + "learning_rate": 9.491747312920191e-05, + "loss": 0.3724, + "step": 1998 + }, + { + "epoch": 0.1705340385599727, + "grad_norm": 1.8577899748621323, + "learning_rate": 9.491140262154426e-05, + "loss": 0.3783, + "step": 1999 + }, + { + "epoch": 0.17061934823408975, + "grad_norm": 1.3978784818403818, + "learning_rate": 9.490532868513967e-05, + "loss": 0.3611, + "step": 2000 + }, + { + "epoch": 0.1707046579082068, + "grad_norm": 1.4197414446218812, + "learning_rate": 9.489925132045185e-05, + "loss": 0.3273, + "step": 2001 + }, + { + "epoch": 0.17078996758232384, + "grad_norm": 1.3955491569059186, + "learning_rate": 9.489317052794481e-05, + "loss": 0.3433, + "step": 2002 + }, + { + "epoch": 0.1708752772564409, + "grad_norm": 1.4724980681686812, + "learning_rate": 9.488708630808275e-05, + "loss": 0.3604, + "step": 2003 + }, + { + "epoch": 0.17096058693055793, + "grad_norm": 1.6198405005973546, + "learning_rate": 9.488099866133017e-05, + "loss": 0.3746, + "step": 2004 + }, + { + "epoch": 0.17104589660467498, + "grad_norm": 1.5827595859129449, + "learning_rate": 9.487490758815186e-05, + "loss": 0.3444, + "step": 2005 + }, + { + "epoch": 0.17113120627879203, + "grad_norm": 1.623438011073894, + "learning_rate": 9.486881308901281e-05, + "loss": 0.3362, + "step": 2006 + }, + { + "epoch": 0.17121651595290907, + "grad_norm": 1.186474407423192, + "learning_rate": 9.486271516437832e-05, + "loss": 0.3414, + "step": 2007 + }, + { + "epoch": 0.17130182562702612, + "grad_norm": 1.5865338336548076, + "learning_rate": 9.485661381471393e-05, + "loss": 0.3705, + "step": 2008 + }, + { + "epoch": 0.17138713530114316, + "grad_norm": 1.553833309895497, + "learning_rate": 9.485050904048542e-05, + "loss": 0.3343, + "step": 2009 + }, + { + "epoch": 0.1714724449752602, + "grad_norm": 1.3298780652131086, + "learning_rate": 9.48444008421589e-05, + "loss": 0.2949, + "step": 2010 + }, + { + "epoch": 0.17155775464937725, + "grad_norm": 1.5805011649774627, + "learning_rate": 9.483828922020069e-05, + "loss": 0.3688, + "step": 2011 + }, + { + "epoch": 0.17164306432349427, + "grad_norm": 1.3255981192950144, + "learning_rate": 9.483217417507734e-05, + "loss": 0.3411, + "step": 2012 + }, + { + "epoch": 0.17172837399761132, + "grad_norm": 1.3468772958970472, + "learning_rate": 9.482605570725575e-05, + "loss": 0.3769, + "step": 2013 + }, + { + "epoch": 0.17181368367172836, + "grad_norm": 1.6095659457511904, + "learning_rate": 9.4819933817203e-05, + "loss": 0.3522, + "step": 2014 + }, + { + "epoch": 0.1718989933458454, + "grad_norm": 1.5444136795270016, + "learning_rate": 9.481380850538648e-05, + "loss": 0.3537, + "step": 2015 + }, + { + "epoch": 0.17198430301996245, + "grad_norm": 1.3514609767291486, + "learning_rate": 9.480767977227383e-05, + "loss": 0.3702, + "step": 2016 + }, + { + "epoch": 0.1720696126940795, + "grad_norm": 1.5701079733585683, + "learning_rate": 9.480154761833293e-05, + "loss": 0.376, + "step": 2017 + }, + { + "epoch": 0.17215492236819654, + "grad_norm": 1.3989171745208626, + "learning_rate": 9.479541204403193e-05, + "loss": 0.3647, + "step": 2018 + }, + { + "epoch": 0.1722402320423136, + "grad_norm": 1.5828768507142232, + "learning_rate": 9.47892730498393e-05, + "loss": 0.3681, + "step": 2019 + }, + { + "epoch": 0.17232554171643064, + "grad_norm": 1.5871240846598673, + "learning_rate": 9.478313063622364e-05, + "loss": 0.3513, + "step": 2020 + }, + { + "epoch": 0.17241085139054768, + "grad_norm": 1.5942884545368605, + "learning_rate": 9.477698480365395e-05, + "loss": 0.3183, + "step": 2021 + }, + { + "epoch": 0.17249616106466473, + "grad_norm": 1.6804868703350253, + "learning_rate": 9.477083555259943e-05, + "loss": 0.4235, + "step": 2022 + }, + { + "epoch": 0.17258147073878177, + "grad_norm": 1.4513051111762538, + "learning_rate": 9.476468288352951e-05, + "loss": 0.3711, + "step": 2023 + }, + { + "epoch": 0.17266678041289882, + "grad_norm": 1.8514665667311923, + "learning_rate": 9.475852679691393e-05, + "loss": 0.3978, + "step": 2024 + }, + { + "epoch": 0.17275209008701586, + "grad_norm": 1.381366771405277, + "learning_rate": 9.475236729322268e-05, + "loss": 0.3465, + "step": 2025 + }, + { + "epoch": 0.1728373997611329, + "grad_norm": 1.6029250141055684, + "learning_rate": 9.4746204372926e-05, + "loss": 0.3841, + "step": 2026 + }, + { + "epoch": 0.17292270943524995, + "grad_norm": 1.8075624495780929, + "learning_rate": 9.474003803649441e-05, + "loss": 0.3854, + "step": 2027 + }, + { + "epoch": 0.173008019109367, + "grad_norm": 1.2756690850272745, + "learning_rate": 9.473386828439865e-05, + "loss": 0.3345, + "step": 2028 + }, + { + "epoch": 0.17309332878348405, + "grad_norm": 1.1411279588440137, + "learning_rate": 9.472769511710976e-05, + "loss": 0.3224, + "step": 2029 + }, + { + "epoch": 0.1731786384576011, + "grad_norm": 1.5239623119470058, + "learning_rate": 9.472151853509903e-05, + "loss": 0.3448, + "step": 2030 + }, + { + "epoch": 0.17326394813171814, + "grad_norm": 1.4543497810095276, + "learning_rate": 9.471533853883803e-05, + "loss": 0.3962, + "step": 2031 + }, + { + "epoch": 0.17334925780583518, + "grad_norm": 1.0942399153895093, + "learning_rate": 9.470915512879852e-05, + "loss": 0.3495, + "step": 2032 + }, + { + "epoch": 0.17343456747995223, + "grad_norm": 1.5380868675592425, + "learning_rate": 9.470296830545263e-05, + "loss": 0.367, + "step": 2033 + }, + { + "epoch": 0.17351987715406927, + "grad_norm": 1.5500000492218995, + "learning_rate": 9.469677806927264e-05, + "loss": 0.4053, + "step": 2034 + }, + { + "epoch": 0.17360518682818632, + "grad_norm": 1.3643432677803804, + "learning_rate": 9.469058442073117e-05, + "loss": 0.3702, + "step": 2035 + }, + { + "epoch": 0.17369049650230337, + "grad_norm": 1.4953369138631982, + "learning_rate": 9.468438736030107e-05, + "loss": 0.3731, + "step": 2036 + }, + { + "epoch": 0.1737758061764204, + "grad_norm": 1.6222766650802707, + "learning_rate": 9.467818688845544e-05, + "loss": 0.4022, + "step": 2037 + }, + { + "epoch": 0.17386111585053746, + "grad_norm": 1.4310949516437115, + "learning_rate": 9.467198300566766e-05, + "loss": 0.3584, + "step": 2038 + }, + { + "epoch": 0.1739464255246545, + "grad_norm": 1.5128947570413411, + "learning_rate": 9.466577571241137e-05, + "loss": 0.3676, + "step": 2039 + }, + { + "epoch": 0.17403173519877155, + "grad_norm": 1.28269514134802, + "learning_rate": 9.465956500916045e-05, + "loss": 0.3512, + "step": 2040 + }, + { + "epoch": 0.1741170448728886, + "grad_norm": 1.4903717977573707, + "learning_rate": 9.465335089638907e-05, + "loss": 0.4189, + "step": 2041 + }, + { + "epoch": 0.17420235454700564, + "grad_norm": 1.4715974679722406, + "learning_rate": 9.464713337457163e-05, + "loss": 0.3491, + "step": 2042 + }, + { + "epoch": 0.17428766422112268, + "grad_norm": 1.6515324902943336, + "learning_rate": 9.464091244418282e-05, + "loss": 0.3712, + "step": 2043 + }, + { + "epoch": 0.17437297389523973, + "grad_norm": 1.5527998616546224, + "learning_rate": 9.463468810569756e-05, + "loss": 0.3701, + "step": 2044 + }, + { + "epoch": 0.17445828356935678, + "grad_norm": 1.5692567075373618, + "learning_rate": 9.462846035959105e-05, + "loss": 0.332, + "step": 2045 + }, + { + "epoch": 0.17454359324347382, + "grad_norm": 1.2344626866352923, + "learning_rate": 9.462222920633875e-05, + "loss": 0.293, + "step": 2046 + }, + { + "epoch": 0.17462890291759087, + "grad_norm": 1.4235884853312164, + "learning_rate": 9.461599464641638e-05, + "loss": 0.3729, + "step": 2047 + }, + { + "epoch": 0.1747142125917079, + "grad_norm": 1.539120126023015, + "learning_rate": 9.46097566802999e-05, + "loss": 0.3994, + "step": 2048 + }, + { + "epoch": 0.17479952226582496, + "grad_norm": 1.5519772777122989, + "learning_rate": 9.460351530846555e-05, + "loss": 0.3629, + "step": 2049 + }, + { + "epoch": 0.17488483193994198, + "grad_norm": 1.5066982757718506, + "learning_rate": 9.459727053138983e-05, + "loss": 0.4238, + "step": 2050 + }, + { + "epoch": 0.17497014161405902, + "grad_norm": 1.4981909014960142, + "learning_rate": 9.45910223495495e-05, + "loss": 0.3609, + "step": 2051 + }, + { + "epoch": 0.17505545128817607, + "grad_norm": 1.5607832060078062, + "learning_rate": 9.458477076342157e-05, + "loss": 0.3542, + "step": 2052 + }, + { + "epoch": 0.1751407609622931, + "grad_norm": 1.4399378463789074, + "learning_rate": 9.457851577348332e-05, + "loss": 0.3303, + "step": 2053 + }, + { + "epoch": 0.17522607063641016, + "grad_norm": 1.421849680245458, + "learning_rate": 9.457225738021226e-05, + "loss": 0.3257, + "step": 2054 + }, + { + "epoch": 0.1753113803105272, + "grad_norm": 1.6233642855336392, + "learning_rate": 9.456599558408623e-05, + "loss": 0.3951, + "step": 2055 + }, + { + "epoch": 0.17539668998464425, + "grad_norm": 1.6513918756932628, + "learning_rate": 9.455973038558325e-05, + "loss": 0.3689, + "step": 2056 + }, + { + "epoch": 0.1754819996587613, + "grad_norm": 1.220989321918956, + "learning_rate": 9.455346178518164e-05, + "loss": 0.3397, + "step": 2057 + }, + { + "epoch": 0.17556730933287834, + "grad_norm": 1.497401609547849, + "learning_rate": 9.454718978336e-05, + "loss": 0.3416, + "step": 2058 + }, + { + "epoch": 0.1756526190069954, + "grad_norm": 1.4111848498190755, + "learning_rate": 9.454091438059712e-05, + "loss": 0.3689, + "step": 2059 + }, + { + "epoch": 0.17573792868111243, + "grad_norm": 1.5393883994259243, + "learning_rate": 9.453463557737212e-05, + "loss": 0.383, + "step": 2060 + }, + { + "epoch": 0.17582323835522948, + "grad_norm": 1.5863053219454233, + "learning_rate": 9.452835337416436e-05, + "loss": 0.3421, + "step": 2061 + }, + { + "epoch": 0.17590854802934652, + "grad_norm": 1.4849595374557716, + "learning_rate": 9.452206777145342e-05, + "loss": 0.4045, + "step": 2062 + }, + { + "epoch": 0.17599385770346357, + "grad_norm": 1.3534830813328118, + "learning_rate": 9.451577876971923e-05, + "loss": 0.3499, + "step": 2063 + }, + { + "epoch": 0.17607916737758061, + "grad_norm": 1.320991386720471, + "learning_rate": 9.450948636944189e-05, + "loss": 0.3533, + "step": 2064 + }, + { + "epoch": 0.17616447705169766, + "grad_norm": 1.3747770822330179, + "learning_rate": 9.450319057110175e-05, + "loss": 0.3804, + "step": 2065 + }, + { + "epoch": 0.1762497867258147, + "grad_norm": 1.6433500637391831, + "learning_rate": 9.449689137517952e-05, + "loss": 0.4088, + "step": 2066 + }, + { + "epoch": 0.17633509639993175, + "grad_norm": 1.6585678580118834, + "learning_rate": 9.44905887821561e-05, + "loss": 0.3656, + "step": 2067 + }, + { + "epoch": 0.1764204060740488, + "grad_norm": 1.5215927300904266, + "learning_rate": 9.448428279251263e-05, + "loss": 0.3975, + "step": 2068 + }, + { + "epoch": 0.17650571574816584, + "grad_norm": 1.4964576856224163, + "learning_rate": 9.447797340673057e-05, + "loss": 0.3704, + "step": 2069 + }, + { + "epoch": 0.1765910254222829, + "grad_norm": 1.300121701486029, + "learning_rate": 9.44716606252916e-05, + "loss": 0.3597, + "step": 2070 + }, + { + "epoch": 0.17667633509639993, + "grad_norm": 1.642412429054416, + "learning_rate": 9.446534444867765e-05, + "loss": 0.3929, + "step": 2071 + }, + { + "epoch": 0.17676164477051698, + "grad_norm": 1.336617592261912, + "learning_rate": 9.445902487737095e-05, + "loss": 0.3536, + "step": 2072 + }, + { + "epoch": 0.17684695444463402, + "grad_norm": 1.6734108688557747, + "learning_rate": 9.445270191185395e-05, + "loss": 0.3917, + "step": 2073 + }, + { + "epoch": 0.17693226411875107, + "grad_norm": 1.4404381289477226, + "learning_rate": 9.444637555260939e-05, + "loss": 0.3902, + "step": 2074 + }, + { + "epoch": 0.17701757379286812, + "grad_norm": 1.818853620480993, + "learning_rate": 9.444004580012023e-05, + "loss": 0.3898, + "step": 2075 + }, + { + "epoch": 0.17710288346698516, + "grad_norm": 1.3661701886178708, + "learning_rate": 9.443371265486975e-05, + "loss": 0.3648, + "step": 2076 + }, + { + "epoch": 0.1771881931411022, + "grad_norm": 1.494814173654145, + "learning_rate": 9.442737611734141e-05, + "loss": 0.3739, + "step": 2077 + }, + { + "epoch": 0.17727350281521925, + "grad_norm": 1.5693050208188686, + "learning_rate": 9.442103618801898e-05, + "loss": 0.3588, + "step": 2078 + }, + { + "epoch": 0.1773588124893363, + "grad_norm": 1.3477705312731325, + "learning_rate": 9.44146928673865e-05, + "loss": 0.3891, + "step": 2079 + }, + { + "epoch": 0.17744412216345334, + "grad_norm": 1.2422893650345583, + "learning_rate": 9.440834615592826e-05, + "loss": 0.384, + "step": 2080 + }, + { + "epoch": 0.1775294318375704, + "grad_norm": 1.59423880467545, + "learning_rate": 9.440199605412876e-05, + "loss": 0.3599, + "step": 2081 + }, + { + "epoch": 0.17761474151168744, + "grad_norm": 1.4222475339372282, + "learning_rate": 9.439564256247281e-05, + "loss": 0.3146, + "step": 2082 + }, + { + "epoch": 0.17770005118580448, + "grad_norm": 1.4042354989466956, + "learning_rate": 9.438928568144547e-05, + "loss": 0.3896, + "step": 2083 + }, + { + "epoch": 0.17778536085992153, + "grad_norm": 1.3367744634846488, + "learning_rate": 9.438292541153206e-05, + "loss": 0.3764, + "step": 2084 + }, + { + "epoch": 0.17787067053403857, + "grad_norm": 1.4161205173973368, + "learning_rate": 9.437656175321814e-05, + "loss": 0.3458, + "step": 2085 + }, + { + "epoch": 0.17795598020815562, + "grad_norm": 1.5110771927268145, + "learning_rate": 9.437019470698955e-05, + "loss": 0.3885, + "step": 2086 + }, + { + "epoch": 0.17804128988227266, + "grad_norm": 1.4148864874448854, + "learning_rate": 9.436382427333237e-05, + "loss": 0.3155, + "step": 2087 + }, + { + "epoch": 0.17812659955638968, + "grad_norm": 1.4845637151317854, + "learning_rate": 9.435745045273297e-05, + "loss": 0.3409, + "step": 2088 + }, + { + "epoch": 0.17821190923050673, + "grad_norm": 1.3240257730222846, + "learning_rate": 9.435107324567793e-05, + "loss": 0.3522, + "step": 2089 + }, + { + "epoch": 0.17829721890462377, + "grad_norm": 1.722241548545989, + "learning_rate": 9.434469265265414e-05, + "loss": 0.45, + "step": 2090 + }, + { + "epoch": 0.17838252857874082, + "grad_norm": 1.3724695676893526, + "learning_rate": 9.43383086741487e-05, + "loss": 0.3549, + "step": 2091 + }, + { + "epoch": 0.17846783825285786, + "grad_norm": 1.394712206642833, + "learning_rate": 9.4331921310649e-05, + "loss": 0.3616, + "step": 2092 + }, + { + "epoch": 0.1785531479269749, + "grad_norm": 1.2411428413349714, + "learning_rate": 9.43255305626427e-05, + "loss": 0.3517, + "step": 2093 + }, + { + "epoch": 0.17863845760109195, + "grad_norm": 1.3951785511341113, + "learning_rate": 9.431913643061769e-05, + "loss": 0.3371, + "step": 2094 + }, + { + "epoch": 0.178723767275209, + "grad_norm": 1.4440732849946718, + "learning_rate": 9.431273891506213e-05, + "loss": 0.3585, + "step": 2095 + }, + { + "epoch": 0.17880907694932605, + "grad_norm": 1.8115916771387377, + "learning_rate": 9.430633801646443e-05, + "loss": 0.3651, + "step": 2096 + }, + { + "epoch": 0.1788943866234431, + "grad_norm": 1.6581907597632306, + "learning_rate": 9.429993373531326e-05, + "loss": 0.4356, + "step": 2097 + }, + { + "epoch": 0.17897969629756014, + "grad_norm": 1.5840583529994712, + "learning_rate": 9.429352607209755e-05, + "loss": 0.3336, + "step": 2098 + }, + { + "epoch": 0.17906500597167718, + "grad_norm": 1.2813919733054788, + "learning_rate": 9.42871150273065e-05, + "loss": 0.3458, + "step": 2099 + }, + { + "epoch": 0.17915031564579423, + "grad_norm": 1.3727324167743142, + "learning_rate": 9.428070060142957e-05, + "loss": 0.3531, + "step": 2100 + }, + { + "epoch": 0.17923562531991127, + "grad_norm": 1.447708206356078, + "learning_rate": 9.427428279495646e-05, + "loss": 0.3894, + "step": 2101 + }, + { + "epoch": 0.17932093499402832, + "grad_norm": 1.5490172993412459, + "learning_rate": 9.426786160837713e-05, + "loss": 0.3363, + "step": 2102 + }, + { + "epoch": 0.17940624466814536, + "grad_norm": 1.1104377706405748, + "learning_rate": 9.426143704218179e-05, + "loss": 0.3146, + "step": 2103 + }, + { + "epoch": 0.1794915543422624, + "grad_norm": 1.485189435288107, + "learning_rate": 9.425500909686096e-05, + "loss": 0.3475, + "step": 2104 + }, + { + "epoch": 0.17957686401637946, + "grad_norm": 1.6547604466425663, + "learning_rate": 9.424857777290535e-05, + "loss": 0.4423, + "step": 2105 + }, + { + "epoch": 0.1796621736904965, + "grad_norm": 1.471906800139159, + "learning_rate": 9.424214307080594e-05, + "loss": 0.3949, + "step": 2106 + }, + { + "epoch": 0.17974748336461355, + "grad_norm": 1.5480482487909466, + "learning_rate": 9.423570499105403e-05, + "loss": 0.3809, + "step": 2107 + }, + { + "epoch": 0.1798327930387306, + "grad_norm": 1.8391575920781056, + "learning_rate": 9.42292635341411e-05, + "loss": 0.3761, + "step": 2108 + }, + { + "epoch": 0.17991810271284764, + "grad_norm": 1.3045157131010492, + "learning_rate": 9.422281870055896e-05, + "loss": 0.3591, + "step": 2109 + }, + { + "epoch": 0.18000341238696468, + "grad_norm": 1.414797597097611, + "learning_rate": 9.421637049079959e-05, + "loss": 0.3394, + "step": 2110 + }, + { + "epoch": 0.18008872206108173, + "grad_norm": 1.4198698220556956, + "learning_rate": 9.42099189053553e-05, + "loss": 0.3927, + "step": 2111 + }, + { + "epoch": 0.18017403173519878, + "grad_norm": 1.3824440847888746, + "learning_rate": 9.420346394471864e-05, + "loss": 0.3225, + "step": 2112 + }, + { + "epoch": 0.18025934140931582, + "grad_norm": 1.3125425967615674, + "learning_rate": 9.41970056093824e-05, + "loss": 0.337, + "step": 2113 + }, + { + "epoch": 0.18034465108343287, + "grad_norm": 1.5574994751269586, + "learning_rate": 9.419054389983964e-05, + "loss": 0.3961, + "step": 2114 + }, + { + "epoch": 0.1804299607575499, + "grad_norm": 1.3875609083811307, + "learning_rate": 9.418407881658369e-05, + "loss": 0.3983, + "step": 2115 + }, + { + "epoch": 0.18051527043166696, + "grad_norm": 1.437856215417877, + "learning_rate": 9.417761036010812e-05, + "loss": 0.2842, + "step": 2116 + }, + { + "epoch": 0.180600580105784, + "grad_norm": 1.598373438650538, + "learning_rate": 9.417113853090675e-05, + "loss": 0.4028, + "step": 2117 + }, + { + "epoch": 0.18068588977990105, + "grad_norm": 1.4904622594898205, + "learning_rate": 9.416466332947367e-05, + "loss": 0.3647, + "step": 2118 + }, + { + "epoch": 0.1807711994540181, + "grad_norm": 1.3115782907338411, + "learning_rate": 9.415818475630325e-05, + "loss": 0.3768, + "step": 2119 + }, + { + "epoch": 0.18085650912813514, + "grad_norm": 1.445695651018829, + "learning_rate": 9.415170281189008e-05, + "loss": 0.3586, + "step": 2120 + }, + { + "epoch": 0.18094181880225219, + "grad_norm": 1.4944029334699114, + "learning_rate": 9.414521749672902e-05, + "loss": 0.3716, + "step": 2121 + }, + { + "epoch": 0.18102712847636923, + "grad_norm": 1.6235788439845715, + "learning_rate": 9.413872881131518e-05, + "loss": 0.4106, + "step": 2122 + }, + { + "epoch": 0.18111243815048628, + "grad_norm": 1.6984889776295007, + "learning_rate": 9.413223675614396e-05, + "loss": 0.3243, + "step": 2123 + }, + { + "epoch": 0.18119774782460332, + "grad_norm": 1.5669029000243948, + "learning_rate": 9.412574133171098e-05, + "loss": 0.4054, + "step": 2124 + }, + { + "epoch": 0.18128305749872037, + "grad_norm": 1.3010638505659557, + "learning_rate": 9.411924253851213e-05, + "loss": 0.3081, + "step": 2125 + }, + { + "epoch": 0.1813683671728374, + "grad_norm": 1.338344613895088, + "learning_rate": 9.411274037704356e-05, + "loss": 0.3428, + "step": 2126 + }, + { + "epoch": 0.18145367684695443, + "grad_norm": 1.2930793830123053, + "learning_rate": 9.410623484780168e-05, + "loss": 0.3505, + "step": 2127 + }, + { + "epoch": 0.18153898652107148, + "grad_norm": 1.2855123009287905, + "learning_rate": 9.409972595128316e-05, + "loss": 0.2899, + "step": 2128 + }, + { + "epoch": 0.18162429619518852, + "grad_norm": 1.541247138943457, + "learning_rate": 9.409321368798489e-05, + "loss": 0.3499, + "step": 2129 + }, + { + "epoch": 0.18170960586930557, + "grad_norm": 1.3737284676527821, + "learning_rate": 9.408669805840408e-05, + "loss": 0.3363, + "step": 2130 + }, + { + "epoch": 0.1817949155434226, + "grad_norm": 1.157052560593279, + "learning_rate": 9.408017906303815e-05, + "loss": 0.3405, + "step": 2131 + }, + { + "epoch": 0.18188022521753966, + "grad_norm": 1.3771218487330503, + "learning_rate": 9.407365670238479e-05, + "loss": 0.3623, + "step": 2132 + }, + { + "epoch": 0.1819655348916567, + "grad_norm": 1.4806411360976093, + "learning_rate": 9.406713097694194e-05, + "loss": 0.3803, + "step": 2133 + }, + { + "epoch": 0.18205084456577375, + "grad_norm": 1.4878778186185877, + "learning_rate": 9.406060188720782e-05, + "loss": 0.4114, + "step": 2134 + }, + { + "epoch": 0.1821361542398908, + "grad_norm": 1.6643031846187255, + "learning_rate": 9.405406943368088e-05, + "loss": 0.3381, + "step": 2135 + }, + { + "epoch": 0.18222146391400784, + "grad_norm": 1.4989149619749687, + "learning_rate": 9.404753361685985e-05, + "loss": 0.412, + "step": 2136 + }, + { + "epoch": 0.1823067735881249, + "grad_norm": 1.6363975919947766, + "learning_rate": 9.404099443724368e-05, + "loss": 0.3683, + "step": 2137 + }, + { + "epoch": 0.18239208326224193, + "grad_norm": 1.5354696250005457, + "learning_rate": 9.403445189533163e-05, + "loss": 0.3918, + "step": 2138 + }, + { + "epoch": 0.18247739293635898, + "grad_norm": 1.4930830260517047, + "learning_rate": 9.402790599162317e-05, + "loss": 0.3395, + "step": 2139 + }, + { + "epoch": 0.18256270261047602, + "grad_norm": 1.7124089571844419, + "learning_rate": 9.402135672661807e-05, + "loss": 0.3989, + "step": 2140 + }, + { + "epoch": 0.18264801228459307, + "grad_norm": 1.393694201665393, + "learning_rate": 9.401480410081628e-05, + "loss": 0.3742, + "step": 2141 + }, + { + "epoch": 0.18273332195871012, + "grad_norm": 1.5801664688442303, + "learning_rate": 9.400824811471811e-05, + "loss": 0.3799, + "step": 2142 + }, + { + "epoch": 0.18281863163282716, + "grad_norm": 1.445702165192577, + "learning_rate": 9.400168876882408e-05, + "loss": 0.3071, + "step": 2143 + }, + { + "epoch": 0.1829039413069442, + "grad_norm": 1.5127215546313584, + "learning_rate": 9.39951260636349e-05, + "loss": 0.3913, + "step": 2144 + }, + { + "epoch": 0.18298925098106125, + "grad_norm": 1.3993782416881584, + "learning_rate": 9.398855999965165e-05, + "loss": 0.2814, + "step": 2145 + }, + { + "epoch": 0.1830745606551783, + "grad_norm": 1.4639655570684424, + "learning_rate": 9.39819905773756e-05, + "loss": 0.3391, + "step": 2146 + }, + { + "epoch": 0.18315987032929534, + "grad_norm": 1.5478789029278979, + "learning_rate": 9.397541779730827e-05, + "loss": 0.3648, + "step": 2147 + }, + { + "epoch": 0.1832451800034124, + "grad_norm": 1.2935391461583383, + "learning_rate": 9.39688416599515e-05, + "loss": 0.346, + "step": 2148 + }, + { + "epoch": 0.18333048967752943, + "grad_norm": 1.7089425218089342, + "learning_rate": 9.396226216580733e-05, + "loss": 0.3508, + "step": 2149 + }, + { + "epoch": 0.18341579935164648, + "grad_norm": 1.7410165589458244, + "learning_rate": 9.395567931537803e-05, + "loss": 0.389, + "step": 2150 + }, + { + "epoch": 0.18350110902576353, + "grad_norm": 1.5358552772967553, + "learning_rate": 9.39490931091662e-05, + "loss": 0.3672, + "step": 2151 + }, + { + "epoch": 0.18358641869988057, + "grad_norm": 1.355734411584374, + "learning_rate": 9.394250354767467e-05, + "loss": 0.3654, + "step": 2152 + }, + { + "epoch": 0.18367172837399762, + "grad_norm": 1.529648702595086, + "learning_rate": 9.39359106314065e-05, + "loss": 0.3595, + "step": 2153 + }, + { + "epoch": 0.18375703804811466, + "grad_norm": 1.425578565430829, + "learning_rate": 9.392931436086502e-05, + "loss": 0.3452, + "step": 2154 + }, + { + "epoch": 0.1838423477222317, + "grad_norm": 1.4587777232695847, + "learning_rate": 9.392271473655384e-05, + "loss": 0.3912, + "step": 2155 + }, + { + "epoch": 0.18392765739634875, + "grad_norm": 1.567074982548948, + "learning_rate": 9.391611175897677e-05, + "loss": 0.3815, + "step": 2156 + }, + { + "epoch": 0.1840129670704658, + "grad_norm": 1.431664522603879, + "learning_rate": 9.390950542863797e-05, + "loss": 0.3984, + "step": 2157 + }, + { + "epoch": 0.18409827674458284, + "grad_norm": 1.6303692658017979, + "learning_rate": 9.390289574604174e-05, + "loss": 0.4146, + "step": 2158 + }, + { + "epoch": 0.1841835864186999, + "grad_norm": 1.2965786606817444, + "learning_rate": 9.389628271169273e-05, + "loss": 0.3617, + "step": 2159 + }, + { + "epoch": 0.18426889609281694, + "grad_norm": 1.3084062584317118, + "learning_rate": 9.38896663260958e-05, + "loss": 0.3291, + "step": 2160 + }, + { + "epoch": 0.18435420576693398, + "grad_norm": 1.5872211992442011, + "learning_rate": 9.388304658975608e-05, + "loss": 0.3977, + "step": 2161 + }, + { + "epoch": 0.18443951544105103, + "grad_norm": 1.5765956606747022, + "learning_rate": 9.387642350317894e-05, + "loss": 0.3735, + "step": 2162 + }, + { + "epoch": 0.18452482511516807, + "grad_norm": 1.4109437386849026, + "learning_rate": 9.386979706687002e-05, + "loss": 0.3755, + "step": 2163 + }, + { + "epoch": 0.18461013478928512, + "grad_norm": 1.4650148012110096, + "learning_rate": 9.386316728133525e-05, + "loss": 0.359, + "step": 2164 + }, + { + "epoch": 0.18469544446340214, + "grad_norm": 1.7618344558646326, + "learning_rate": 9.385653414708071e-05, + "loss": 0.3785, + "step": 2165 + }, + { + "epoch": 0.18478075413751918, + "grad_norm": 1.647283357954928, + "learning_rate": 9.384989766461285e-05, + "loss": 0.3604, + "step": 2166 + }, + { + "epoch": 0.18486606381163623, + "grad_norm": 1.3735257828843872, + "learning_rate": 9.384325783443832e-05, + "loss": 0.3809, + "step": 2167 + }, + { + "epoch": 0.18495137348575327, + "grad_norm": 1.4872361566255512, + "learning_rate": 9.383661465706404e-05, + "loss": 0.3398, + "step": 2168 + }, + { + "epoch": 0.18503668315987032, + "grad_norm": 1.7257276216725197, + "learning_rate": 9.382996813299718e-05, + "loss": 0.4549, + "step": 2169 + }, + { + "epoch": 0.18512199283398736, + "grad_norm": 1.3366115721068486, + "learning_rate": 9.382331826274518e-05, + "loss": 0.3456, + "step": 2170 + }, + { + "epoch": 0.1852073025081044, + "grad_norm": 1.2593525053907115, + "learning_rate": 9.381666504681568e-05, + "loss": 0.3641, + "step": 2171 + }, + { + "epoch": 0.18529261218222146, + "grad_norm": 1.4324761381708364, + "learning_rate": 9.381000848571666e-05, + "loss": 0.315, + "step": 2172 + }, + { + "epoch": 0.1853779218563385, + "grad_norm": 1.495182805829612, + "learning_rate": 9.380334857995629e-05, + "loss": 0.3648, + "step": 2173 + }, + { + "epoch": 0.18546323153045555, + "grad_norm": 1.41875366680999, + "learning_rate": 9.379668533004305e-05, + "loss": 0.3604, + "step": 2174 + }, + { + "epoch": 0.1855485412045726, + "grad_norm": 1.373173367356437, + "learning_rate": 9.379001873648558e-05, + "loss": 0.3878, + "step": 2175 + }, + { + "epoch": 0.18563385087868964, + "grad_norm": 2.003462654970958, + "learning_rate": 9.378334879979292e-05, + "loss": 0.3718, + "step": 2176 + }, + { + "epoch": 0.18571916055280668, + "grad_norm": 1.3993321119911664, + "learning_rate": 9.377667552047423e-05, + "loss": 0.3185, + "step": 2177 + }, + { + "epoch": 0.18580447022692373, + "grad_norm": 1.4873009203877523, + "learning_rate": 9.3769998899039e-05, + "loss": 0.3775, + "step": 2178 + }, + { + "epoch": 0.18588977990104077, + "grad_norm": 1.3520451824916808, + "learning_rate": 9.376331893599692e-05, + "loss": 0.3273, + "step": 2179 + }, + { + "epoch": 0.18597508957515782, + "grad_norm": 1.551514036837339, + "learning_rate": 9.375663563185801e-05, + "loss": 0.3918, + "step": 2180 + }, + { + "epoch": 0.18606039924927487, + "grad_norm": 1.2727131567983454, + "learning_rate": 9.37499489871325e-05, + "loss": 0.3438, + "step": 2181 + }, + { + "epoch": 0.1861457089233919, + "grad_norm": 1.303306808413153, + "learning_rate": 9.374325900233088e-05, + "loss": 0.3327, + "step": 2182 + }, + { + "epoch": 0.18623101859750896, + "grad_norm": 1.5895341951400515, + "learning_rate": 9.373656567796386e-05, + "loss": 0.3733, + "step": 2183 + }, + { + "epoch": 0.186316328271626, + "grad_norm": 1.5326550324075727, + "learning_rate": 9.372986901454248e-05, + "loss": 0.3716, + "step": 2184 + }, + { + "epoch": 0.18640163794574305, + "grad_norm": 1.63110393586401, + "learning_rate": 9.372316901257798e-05, + "loss": 0.3583, + "step": 2185 + }, + { + "epoch": 0.1864869476198601, + "grad_norm": 1.2894571336038494, + "learning_rate": 9.371646567258187e-05, + "loss": 0.3409, + "step": 2186 + }, + { + "epoch": 0.18657225729397714, + "grad_norm": 1.408346605332417, + "learning_rate": 9.370975899506593e-05, + "loss": 0.3762, + "step": 2187 + }, + { + "epoch": 0.18665756696809419, + "grad_norm": 1.199492780459368, + "learning_rate": 9.370304898054214e-05, + "loss": 0.3764, + "step": 2188 + }, + { + "epoch": 0.18674287664221123, + "grad_norm": 1.5724477342613465, + "learning_rate": 9.369633562952281e-05, + "loss": 0.348, + "step": 2189 + }, + { + "epoch": 0.18682818631632828, + "grad_norm": 1.6410030837602483, + "learning_rate": 9.368961894252046e-05, + "loss": 0.3808, + "step": 2190 + }, + { + "epoch": 0.18691349599044532, + "grad_norm": 1.7254593721269822, + "learning_rate": 9.368289892004787e-05, + "loss": 0.3634, + "step": 2191 + }, + { + "epoch": 0.18699880566456237, + "grad_norm": 1.7578556309812698, + "learning_rate": 9.367617556261808e-05, + "loss": 0.454, + "step": 2192 + }, + { + "epoch": 0.1870841153386794, + "grad_norm": 1.4552788307992095, + "learning_rate": 9.366944887074437e-05, + "loss": 0.4177, + "step": 2193 + }, + { + "epoch": 0.18716942501279646, + "grad_norm": 1.5195022227204193, + "learning_rate": 9.36627188449403e-05, + "loss": 0.3692, + "step": 2194 + }, + { + "epoch": 0.1872547346869135, + "grad_norm": 1.4150306380227138, + "learning_rate": 9.365598548571968e-05, + "loss": 0.3718, + "step": 2195 + }, + { + "epoch": 0.18734004436103055, + "grad_norm": 1.4432716601074742, + "learning_rate": 9.364924879359653e-05, + "loss": 0.4084, + "step": 2196 + }, + { + "epoch": 0.1874253540351476, + "grad_norm": 1.6044435963902883, + "learning_rate": 9.364250876908522e-05, + "loss": 0.3872, + "step": 2197 + }, + { + "epoch": 0.18751066370926464, + "grad_norm": 1.523986952072517, + "learning_rate": 9.363576541270027e-05, + "loss": 0.3796, + "step": 2198 + }, + { + "epoch": 0.1875959733833817, + "grad_norm": 1.3611281041405867, + "learning_rate": 9.36290187249565e-05, + "loss": 0.3394, + "step": 2199 + }, + { + "epoch": 0.18768128305749873, + "grad_norm": 1.373615261200611, + "learning_rate": 9.362226870636901e-05, + "loss": 0.3491, + "step": 2200 + }, + { + "epoch": 0.18776659273161578, + "grad_norm": 1.7977803478817251, + "learning_rate": 9.36155153574531e-05, + "loss": 0.3543, + "step": 2201 + }, + { + "epoch": 0.18785190240573282, + "grad_norm": 1.457056203838132, + "learning_rate": 9.360875867872437e-05, + "loss": 0.3634, + "step": 2202 + }, + { + "epoch": 0.18793721207984984, + "grad_norm": 1.5420834862785329, + "learning_rate": 9.360199867069866e-05, + "loss": 0.3115, + "step": 2203 + }, + { + "epoch": 0.1880225217539669, + "grad_norm": 1.6509241810423314, + "learning_rate": 9.359523533389202e-05, + "loss": 0.3631, + "step": 2204 + }, + { + "epoch": 0.18810783142808393, + "grad_norm": 1.1786912665191398, + "learning_rate": 9.358846866882087e-05, + "loss": 0.3675, + "step": 2205 + }, + { + "epoch": 0.18819314110220098, + "grad_norm": 1.5581130431351855, + "learning_rate": 9.358169867600175e-05, + "loss": 0.403, + "step": 2206 + }, + { + "epoch": 0.18827845077631802, + "grad_norm": 1.9408307517746883, + "learning_rate": 9.357492535595151e-05, + "loss": 0.4274, + "step": 2207 + }, + { + "epoch": 0.18836376045043507, + "grad_norm": 1.49519851234162, + "learning_rate": 9.356814870918731e-05, + "loss": 0.3581, + "step": 2208 + }, + { + "epoch": 0.18844907012455211, + "grad_norm": 1.3354561814260906, + "learning_rate": 9.356136873622646e-05, + "loss": 0.3631, + "step": 2209 + }, + { + "epoch": 0.18853437979866916, + "grad_norm": 1.4995080618035839, + "learning_rate": 9.355458543758658e-05, + "loss": 0.3268, + "step": 2210 + }, + { + "epoch": 0.1886196894727862, + "grad_norm": 1.4457247584285118, + "learning_rate": 9.354779881378558e-05, + "loss": 0.378, + "step": 2211 + }, + { + "epoch": 0.18870499914690325, + "grad_norm": 1.5184193443505163, + "learning_rate": 9.354100886534152e-05, + "loss": 0.3736, + "step": 2212 + }, + { + "epoch": 0.1887903088210203, + "grad_norm": 1.3358071246861138, + "learning_rate": 9.353421559277282e-05, + "loss": 0.3429, + "step": 2213 + }, + { + "epoch": 0.18887561849513734, + "grad_norm": 1.5624024932954723, + "learning_rate": 9.352741899659812e-05, + "loss": 0.4157, + "step": 2214 + }, + { + "epoch": 0.1889609281692544, + "grad_norm": 1.505296099610858, + "learning_rate": 9.352061907733626e-05, + "loss": 0.342, + "step": 2215 + }, + { + "epoch": 0.18904623784337143, + "grad_norm": 1.4332678317842633, + "learning_rate": 9.351381583550641e-05, + "loss": 0.3678, + "step": 2216 + }, + { + "epoch": 0.18913154751748848, + "grad_norm": 1.5014155384435015, + "learning_rate": 9.350700927162794e-05, + "loss": 0.3531, + "step": 2217 + }, + { + "epoch": 0.18921685719160553, + "grad_norm": 1.545740192962784, + "learning_rate": 9.350019938622053e-05, + "loss": 0.3215, + "step": 2218 + }, + { + "epoch": 0.18930216686572257, + "grad_norm": 1.6665167105289307, + "learning_rate": 9.349338617980406e-05, + "loss": 0.3983, + "step": 2219 + }, + { + "epoch": 0.18938747653983962, + "grad_norm": 1.399105828700447, + "learning_rate": 9.348656965289866e-05, + "loss": 0.3546, + "step": 2220 + }, + { + "epoch": 0.18947278621395666, + "grad_norm": 1.6752595956536378, + "learning_rate": 9.347974980602477e-05, + "loss": 0.4113, + "step": 2221 + }, + { + "epoch": 0.1895580958880737, + "grad_norm": 1.4692825812738146, + "learning_rate": 9.347292663970301e-05, + "loss": 0.3507, + "step": 2222 + }, + { + "epoch": 0.18964340556219075, + "grad_norm": 1.4731086825218431, + "learning_rate": 9.346610015445434e-05, + "loss": 0.3737, + "step": 2223 + }, + { + "epoch": 0.1897287152363078, + "grad_norm": 1.5145979085200825, + "learning_rate": 9.34592703507999e-05, + "loss": 0.3663, + "step": 2224 + }, + { + "epoch": 0.18981402491042484, + "grad_norm": 1.4705448141419493, + "learning_rate": 9.34524372292611e-05, + "loss": 0.3163, + "step": 2225 + }, + { + "epoch": 0.1898993345845419, + "grad_norm": 1.5000292457272657, + "learning_rate": 9.344560079035962e-05, + "loss": 0.3954, + "step": 2226 + }, + { + "epoch": 0.18998464425865894, + "grad_norm": 1.5724212000863373, + "learning_rate": 9.34387610346174e-05, + "loss": 0.3823, + "step": 2227 + }, + { + "epoch": 0.19006995393277598, + "grad_norm": 1.5927174533194512, + "learning_rate": 9.343191796255659e-05, + "loss": 0.3735, + "step": 2228 + }, + { + "epoch": 0.19015526360689303, + "grad_norm": 1.6183968412835168, + "learning_rate": 9.342507157469967e-05, + "loss": 0.4035, + "step": 2229 + }, + { + "epoch": 0.19024057328101007, + "grad_norm": 1.2012730997744432, + "learning_rate": 9.341822187156927e-05, + "loss": 0.3652, + "step": 2230 + }, + { + "epoch": 0.19032588295512712, + "grad_norm": 1.3638431298391498, + "learning_rate": 9.341136885368837e-05, + "loss": 0.3878, + "step": 2231 + }, + { + "epoch": 0.19041119262924416, + "grad_norm": 1.3892313799843345, + "learning_rate": 9.340451252158015e-05, + "loss": 0.3213, + "step": 2232 + }, + { + "epoch": 0.1904965023033612, + "grad_norm": 1.432982935695377, + "learning_rate": 9.339765287576803e-05, + "loss": 0.4045, + "step": 2233 + }, + { + "epoch": 0.19058181197747825, + "grad_norm": 1.2512502140193607, + "learning_rate": 9.339078991677575e-05, + "loss": 0.3411, + "step": 2234 + }, + { + "epoch": 0.1906671216515953, + "grad_norm": 1.318841238617068, + "learning_rate": 9.338392364512723e-05, + "loss": 0.3582, + "step": 2235 + }, + { + "epoch": 0.19075243132571235, + "grad_norm": 1.268469921448622, + "learning_rate": 9.337705406134666e-05, + "loss": 0.387, + "step": 2236 + }, + { + "epoch": 0.1908377409998294, + "grad_norm": 1.504760499571868, + "learning_rate": 9.337018116595855e-05, + "loss": 0.3561, + "step": 2237 + }, + { + "epoch": 0.19092305067394644, + "grad_norm": 1.5199935719705486, + "learning_rate": 9.336330495948756e-05, + "loss": 0.3202, + "step": 2238 + }, + { + "epoch": 0.19100836034806348, + "grad_norm": 1.4702694926118651, + "learning_rate": 9.335642544245868e-05, + "loss": 0.3859, + "step": 2239 + }, + { + "epoch": 0.19109367002218053, + "grad_norm": 1.6030246666041459, + "learning_rate": 9.33495426153971e-05, + "loss": 0.3624, + "step": 2240 + }, + { + "epoch": 0.19117897969629755, + "grad_norm": 1.4297960584105134, + "learning_rate": 9.334265647882832e-05, + "loss": 0.3075, + "step": 2241 + }, + { + "epoch": 0.1912642893704146, + "grad_norm": 1.9792376723017, + "learning_rate": 9.333576703327803e-05, + "loss": 0.458, + "step": 2242 + }, + { + "epoch": 0.19134959904453164, + "grad_norm": 1.5524872973132797, + "learning_rate": 9.33288742792722e-05, + "loss": 0.3867, + "step": 2243 + }, + { + "epoch": 0.19143490871864868, + "grad_norm": 1.554878874841924, + "learning_rate": 9.332197821733709e-05, + "loss": 0.3929, + "step": 2244 + }, + { + "epoch": 0.19152021839276573, + "grad_norm": 1.3791080143530041, + "learning_rate": 9.331507884799913e-05, + "loss": 0.3306, + "step": 2245 + }, + { + "epoch": 0.19160552806688277, + "grad_norm": 1.3852685356779497, + "learning_rate": 9.33081761717851e-05, + "loss": 0.3239, + "step": 2246 + }, + { + "epoch": 0.19169083774099982, + "grad_norm": 1.6893479154605262, + "learning_rate": 9.330127018922194e-05, + "loss": 0.4364, + "step": 2247 + }, + { + "epoch": 0.19177614741511687, + "grad_norm": 1.4136734738199892, + "learning_rate": 9.32943609008369e-05, + "loss": 0.5458, + "step": 2248 + }, + { + "epoch": 0.1918614570892339, + "grad_norm": 1.4221519787316514, + "learning_rate": 9.32874483071575e-05, + "loss": 0.3603, + "step": 2249 + }, + { + "epoch": 0.19194676676335096, + "grad_norm": 1.2785746858255433, + "learning_rate": 9.328053240871143e-05, + "loss": 0.3659, + "step": 2250 + }, + { + "epoch": 0.192032076437468, + "grad_norm": 1.6306485525223582, + "learning_rate": 9.32736132060267e-05, + "loss": 0.4091, + "step": 2251 + }, + { + "epoch": 0.19211738611158505, + "grad_norm": 1.618533693685873, + "learning_rate": 9.326669069963156e-05, + "loss": 0.3822, + "step": 2252 + }, + { + "epoch": 0.1922026957857021, + "grad_norm": 1.4002077629694747, + "learning_rate": 9.325976489005453e-05, + "loss": 0.3517, + "step": 2253 + }, + { + "epoch": 0.19228800545981914, + "grad_norm": 1.424235302718538, + "learning_rate": 9.32528357778243e-05, + "loss": 0.354, + "step": 2254 + }, + { + "epoch": 0.19237331513393618, + "grad_norm": 1.5200246289416206, + "learning_rate": 9.324590336346992e-05, + "loss": 0.4086, + "step": 2255 + }, + { + "epoch": 0.19245862480805323, + "grad_norm": 1.70091797502964, + "learning_rate": 9.323896764752063e-05, + "loss": 0.3881, + "step": 2256 + }, + { + "epoch": 0.19254393448217028, + "grad_norm": 1.2797673648789567, + "learning_rate": 9.32320286305059e-05, + "loss": 0.3644, + "step": 2257 + }, + { + "epoch": 0.19262924415628732, + "grad_norm": 1.2499744889517106, + "learning_rate": 9.322508631295555e-05, + "loss": 0.3437, + "step": 2258 + }, + { + "epoch": 0.19271455383040437, + "grad_norm": 1.4458878480306283, + "learning_rate": 9.321814069539956e-05, + "loss": 0.3741, + "step": 2259 + }, + { + "epoch": 0.1927998635045214, + "grad_norm": 1.563510034263625, + "learning_rate": 9.321119177836818e-05, + "loss": 0.4249, + "step": 2260 + }, + { + "epoch": 0.19288517317863846, + "grad_norm": 1.6406023296424763, + "learning_rate": 9.320423956239192e-05, + "loss": 0.4165, + "step": 2261 + }, + { + "epoch": 0.1929704828527555, + "grad_norm": 1.2329202119020621, + "learning_rate": 9.319728404800157e-05, + "loss": 0.3291, + "step": 2262 + }, + { + "epoch": 0.19305579252687255, + "grad_norm": 1.4196721714005904, + "learning_rate": 9.319032523572815e-05, + "loss": 0.3694, + "step": 2263 + }, + { + "epoch": 0.1931411022009896, + "grad_norm": 1.4222958120208644, + "learning_rate": 9.318336312610288e-05, + "loss": 0.3975, + "step": 2264 + }, + { + "epoch": 0.19322641187510664, + "grad_norm": 1.6032017204264015, + "learning_rate": 9.317639771965733e-05, + "loss": 0.3825, + "step": 2265 + }, + { + "epoch": 0.1933117215492237, + "grad_norm": 1.548322982479738, + "learning_rate": 9.316942901692325e-05, + "loss": 0.3468, + "step": 2266 + }, + { + "epoch": 0.19339703122334073, + "grad_norm": 1.409380289816392, + "learning_rate": 9.316245701843266e-05, + "loss": 0.3585, + "step": 2267 + }, + { + "epoch": 0.19348234089745778, + "grad_norm": 1.5559972105810271, + "learning_rate": 9.315548172471784e-05, + "loss": 0.3434, + "step": 2268 + }, + { + "epoch": 0.19356765057157482, + "grad_norm": 1.419490953460277, + "learning_rate": 9.314850313631132e-05, + "loss": 0.3544, + "step": 2269 + }, + { + "epoch": 0.19365296024569187, + "grad_norm": 1.4281344290323987, + "learning_rate": 9.314152125374589e-05, + "loss": 0.3498, + "step": 2270 + }, + { + "epoch": 0.19373826991980891, + "grad_norm": 1.2336118246231662, + "learning_rate": 9.313453607755456e-05, + "loss": 0.3157, + "step": 2271 + }, + { + "epoch": 0.19382357959392596, + "grad_norm": 1.6517035504382358, + "learning_rate": 9.312754760827061e-05, + "loss": 0.3758, + "step": 2272 + }, + { + "epoch": 0.193908889268043, + "grad_norm": 1.5075425612987505, + "learning_rate": 9.312055584642758e-05, + "loss": 0.3606, + "step": 2273 + }, + { + "epoch": 0.19399419894216005, + "grad_norm": 1.3461553646959397, + "learning_rate": 9.311356079255927e-05, + "loss": 0.3538, + "step": 2274 + }, + { + "epoch": 0.1940795086162771, + "grad_norm": 1.6348539303500276, + "learning_rate": 9.310656244719968e-05, + "loss": 0.3733, + "step": 2275 + }, + { + "epoch": 0.19416481829039414, + "grad_norm": 1.4791080391597355, + "learning_rate": 9.309956081088311e-05, + "loss": 0.2958, + "step": 2276 + }, + { + "epoch": 0.1942501279645112, + "grad_norm": 1.1817183035547427, + "learning_rate": 9.309255588414412e-05, + "loss": 0.3963, + "step": 2277 + }, + { + "epoch": 0.19433543763862823, + "grad_norm": 1.5003204798074032, + "learning_rate": 9.308554766751746e-05, + "loss": 0.3424, + "step": 2278 + }, + { + "epoch": 0.19442074731274528, + "grad_norm": 1.2198601092979509, + "learning_rate": 9.307853616153821e-05, + "loss": 0.3674, + "step": 2279 + }, + { + "epoch": 0.1945060569868623, + "grad_norm": 1.635872196325985, + "learning_rate": 9.307152136674164e-05, + "loss": 0.4394, + "step": 2280 + }, + { + "epoch": 0.19459136666097934, + "grad_norm": 1.3336566840922235, + "learning_rate": 9.30645032836633e-05, + "loss": 0.3376, + "step": 2281 + }, + { + "epoch": 0.1946766763350964, + "grad_norm": 1.6823028784536735, + "learning_rate": 9.305748191283898e-05, + "loss": 0.4106, + "step": 2282 + }, + { + "epoch": 0.19476198600921343, + "grad_norm": 1.4543458465683683, + "learning_rate": 9.305045725480472e-05, + "loss": 0.3804, + "step": 2283 + }, + { + "epoch": 0.19484729568333048, + "grad_norm": 1.3907681509339, + "learning_rate": 9.304342931009681e-05, + "loss": 0.3505, + "step": 2284 + }, + { + "epoch": 0.19493260535744752, + "grad_norm": 1.2925809065326863, + "learning_rate": 9.303639807925182e-05, + "loss": 0.359, + "step": 2285 + }, + { + "epoch": 0.19501791503156457, + "grad_norm": 1.2549704435751192, + "learning_rate": 9.302936356280652e-05, + "loss": 0.3676, + "step": 2286 + }, + { + "epoch": 0.19510322470568162, + "grad_norm": 1.4138115928987298, + "learning_rate": 9.302232576129797e-05, + "loss": 0.3531, + "step": 2287 + }, + { + "epoch": 0.19518853437979866, + "grad_norm": 1.4954502723866803, + "learning_rate": 9.301528467526347e-05, + "loss": 0.3227, + "step": 2288 + }, + { + "epoch": 0.1952738440539157, + "grad_norm": 1.4703456246580011, + "learning_rate": 9.300824030524056e-05, + "loss": 0.3671, + "step": 2289 + }, + { + "epoch": 0.19535915372803275, + "grad_norm": 1.3390425797430754, + "learning_rate": 9.300119265176707e-05, + "loss": 0.3272, + "step": 2290 + }, + { + "epoch": 0.1954444634021498, + "grad_norm": 1.3558591780556613, + "learning_rate": 9.2994141715381e-05, + "loss": 0.3534, + "step": 2291 + }, + { + "epoch": 0.19552977307626684, + "grad_norm": 1.467438457179841, + "learning_rate": 9.29870874966207e-05, + "loss": 0.3725, + "step": 2292 + }, + { + "epoch": 0.1956150827503839, + "grad_norm": 1.2422625441045494, + "learning_rate": 9.298002999602471e-05, + "loss": 0.3343, + "step": 2293 + }, + { + "epoch": 0.19570039242450094, + "grad_norm": 1.883607815120781, + "learning_rate": 9.297296921413181e-05, + "loss": 0.3307, + "step": 2294 + }, + { + "epoch": 0.19578570209861798, + "grad_norm": 1.3451647297408802, + "learning_rate": 9.296590515148109e-05, + "loss": 0.3468, + "step": 2295 + }, + { + "epoch": 0.19587101177273503, + "grad_norm": 1.498819999682742, + "learning_rate": 9.295883780861181e-05, + "loss": 0.3817, + "step": 2296 + }, + { + "epoch": 0.19595632144685207, + "grad_norm": 1.5300277191343887, + "learning_rate": 9.295176718606355e-05, + "loss": 0.4012, + "step": 2297 + }, + { + "epoch": 0.19604163112096912, + "grad_norm": 1.2790225135187772, + "learning_rate": 9.294469328437611e-05, + "loss": 0.3722, + "step": 2298 + }, + { + "epoch": 0.19612694079508616, + "grad_norm": 1.6214818283130898, + "learning_rate": 9.293761610408955e-05, + "loss": 0.3161, + "step": 2299 + }, + { + "epoch": 0.1962122504692032, + "grad_norm": 1.7537546750779216, + "learning_rate": 9.293053564574417e-05, + "loss": 0.3388, + "step": 2300 + }, + { + "epoch": 0.19629756014332025, + "grad_norm": 1.3950341864468, + "learning_rate": 9.292345190988052e-05, + "loss": 0.3232, + "step": 2301 + }, + { + "epoch": 0.1963828698174373, + "grad_norm": 1.7478235880000268, + "learning_rate": 9.291636489703943e-05, + "loss": 0.3552, + "step": 2302 + }, + { + "epoch": 0.19646817949155435, + "grad_norm": 1.3931081653945223, + "learning_rate": 9.290927460776192e-05, + "loss": 0.3342, + "step": 2303 + }, + { + "epoch": 0.1965534891656714, + "grad_norm": 1.4526469767232348, + "learning_rate": 9.290218104258933e-05, + "loss": 0.4003, + "step": 2304 + }, + { + "epoch": 0.19663879883978844, + "grad_norm": 1.4881921776577116, + "learning_rate": 9.289508420206318e-05, + "loss": 0.3797, + "step": 2305 + }, + { + "epoch": 0.19672410851390548, + "grad_norm": 1.5311839031045038, + "learning_rate": 9.28879840867253e-05, + "loss": 0.4065, + "step": 2306 + }, + { + "epoch": 0.19680941818802253, + "grad_norm": 1.9519679189766073, + "learning_rate": 9.288088069711774e-05, + "loss": 0.3799, + "step": 2307 + }, + { + "epoch": 0.19689472786213957, + "grad_norm": 1.2819079593790168, + "learning_rate": 9.287377403378282e-05, + "loss": 0.3385, + "step": 2308 + }, + { + "epoch": 0.19698003753625662, + "grad_norm": 1.5458444572658565, + "learning_rate": 9.286666409726306e-05, + "loss": 0.3976, + "step": 2309 + }, + { + "epoch": 0.19706534721037366, + "grad_norm": 1.7301012600497216, + "learning_rate": 9.285955088810132e-05, + "loss": 0.3706, + "step": 2310 + }, + { + "epoch": 0.1971506568844907, + "grad_norm": 2.008388451522935, + "learning_rate": 9.285243440684059e-05, + "loss": 0.3606, + "step": 2311 + }, + { + "epoch": 0.19723596655860776, + "grad_norm": 1.2754301617089234, + "learning_rate": 9.284531465402424e-05, + "loss": 0.3272, + "step": 2312 + }, + { + "epoch": 0.1973212762327248, + "grad_norm": 1.3992524995466287, + "learning_rate": 9.283819163019578e-05, + "loss": 0.3242, + "step": 2313 + }, + { + "epoch": 0.19740658590684185, + "grad_norm": 1.3961790377924257, + "learning_rate": 9.283106533589905e-05, + "loss": 0.3259, + "step": 2314 + }, + { + "epoch": 0.1974918955809589, + "grad_norm": 1.4638170229897018, + "learning_rate": 9.282393577167807e-05, + "loss": 0.3479, + "step": 2315 + }, + { + "epoch": 0.19757720525507594, + "grad_norm": 1.2925567431510063, + "learning_rate": 9.281680293807717e-05, + "loss": 0.331, + "step": 2316 + }, + { + "epoch": 0.19766251492919298, + "grad_norm": 1.5788418491976428, + "learning_rate": 9.280966683564088e-05, + "loss": 0.353, + "step": 2317 + }, + { + "epoch": 0.19774782460331, + "grad_norm": 1.5746486635258359, + "learning_rate": 9.280252746491403e-05, + "loss": 0.4146, + "step": 2318 + }, + { + "epoch": 0.19783313427742705, + "grad_norm": 1.7053267919197717, + "learning_rate": 9.279538482644165e-05, + "loss": 0.3608, + "step": 2319 + }, + { + "epoch": 0.1979184439515441, + "grad_norm": 1.26144817273652, + "learning_rate": 9.278823892076907e-05, + "loss": 0.347, + "step": 2320 + }, + { + "epoch": 0.19800375362566114, + "grad_norm": 1.4518786592145927, + "learning_rate": 9.27810897484418e-05, + "loss": 0.3166, + "step": 2321 + }, + { + "epoch": 0.19808906329977818, + "grad_norm": 1.4705256016800705, + "learning_rate": 9.277393731000568e-05, + "loss": 0.325, + "step": 2322 + }, + { + "epoch": 0.19817437297389523, + "grad_norm": 1.956840216474636, + "learning_rate": 9.276678160600674e-05, + "loss": 0.37, + "step": 2323 + }, + { + "epoch": 0.19825968264801228, + "grad_norm": 1.467770757311051, + "learning_rate": 9.275962263699129e-05, + "loss": 0.3457, + "step": 2324 + }, + { + "epoch": 0.19834499232212932, + "grad_norm": 1.6781995703469232, + "learning_rate": 9.27524604035059e-05, + "loss": 0.3789, + "step": 2325 + }, + { + "epoch": 0.19843030199624637, + "grad_norm": 1.3479296862972323, + "learning_rate": 9.274529490609731e-05, + "loss": 0.3201, + "step": 2326 + }, + { + "epoch": 0.1985156116703634, + "grad_norm": 1.2808523724112388, + "learning_rate": 9.273812614531261e-05, + "loss": 0.3309, + "step": 2327 + }, + { + "epoch": 0.19860092134448046, + "grad_norm": 1.3910286992483543, + "learning_rate": 9.27309541216991e-05, + "loss": 0.3577, + "step": 2328 + }, + { + "epoch": 0.1986862310185975, + "grad_norm": 1.5129815083928717, + "learning_rate": 9.27237788358043e-05, + "loss": 0.3769, + "step": 2329 + }, + { + "epoch": 0.19877154069271455, + "grad_norm": 1.5481838507433523, + "learning_rate": 9.271660028817603e-05, + "loss": 0.3169, + "step": 2330 + }, + { + "epoch": 0.1988568503668316, + "grad_norm": 1.7907598181762547, + "learning_rate": 9.270941847936232e-05, + "loss": 0.3885, + "step": 2331 + }, + { + "epoch": 0.19894216004094864, + "grad_norm": 1.697591489143861, + "learning_rate": 9.270223340991147e-05, + "loss": 0.3488, + "step": 2332 + }, + { + "epoch": 0.19902746971506569, + "grad_norm": 1.5657103555220282, + "learning_rate": 9.269504508037202e-05, + "loss": 0.3835, + "step": 2333 + }, + { + "epoch": 0.19911277938918273, + "grad_norm": 1.5097425688528965, + "learning_rate": 9.268785349129277e-05, + "loss": 0.3613, + "step": 2334 + }, + { + "epoch": 0.19919808906329978, + "grad_norm": 1.402069234942463, + "learning_rate": 9.268065864322273e-05, + "loss": 0.3764, + "step": 2335 + }, + { + "epoch": 0.19928339873741682, + "grad_norm": 1.317581240992137, + "learning_rate": 9.267346053671121e-05, + "loss": 0.3662, + "step": 2336 + }, + { + "epoch": 0.19936870841153387, + "grad_norm": 1.6189338726937779, + "learning_rate": 9.266625917230774e-05, + "loss": 0.3905, + "step": 2337 + }, + { + "epoch": 0.1994540180856509, + "grad_norm": 1.5180737877755566, + "learning_rate": 9.265905455056211e-05, + "loss": 0.4274, + "step": 2338 + }, + { + "epoch": 0.19953932775976796, + "grad_norm": 1.256342152873683, + "learning_rate": 9.265184667202438e-05, + "loss": 0.3256, + "step": 2339 + }, + { + "epoch": 0.199624637433885, + "grad_norm": 1.3220940482812145, + "learning_rate": 9.264463553724478e-05, + "loss": 0.3152, + "step": 2340 + }, + { + "epoch": 0.19970994710800205, + "grad_norm": 1.401317592899641, + "learning_rate": 9.26374211467739e-05, + "loss": 0.312, + "step": 2341 + }, + { + "epoch": 0.1997952567821191, + "grad_norm": 1.3401228722419425, + "learning_rate": 9.263020350116247e-05, + "loss": 0.3588, + "step": 2342 + }, + { + "epoch": 0.19988056645623614, + "grad_norm": 1.5067110930844312, + "learning_rate": 9.262298260096154e-05, + "loss": 0.3576, + "step": 2343 + }, + { + "epoch": 0.1999658761303532, + "grad_norm": 1.6814954387093624, + "learning_rate": 9.261575844672238e-05, + "loss": 0.3624, + "step": 2344 + }, + { + "epoch": 0.20005118580447023, + "grad_norm": 1.6340045866604216, + "learning_rate": 9.260853103899654e-05, + "loss": 0.367, + "step": 2345 + }, + { + "epoch": 0.20013649547858728, + "grad_norm": 1.794273948574453, + "learning_rate": 9.260130037833576e-05, + "loss": 0.4172, + "step": 2346 + }, + { + "epoch": 0.20022180515270432, + "grad_norm": 1.8105131637907983, + "learning_rate": 9.259406646529209e-05, + "loss": 0.3642, + "step": 2347 + }, + { + "epoch": 0.20030711482682137, + "grad_norm": 1.6027517973705752, + "learning_rate": 9.258682930041778e-05, + "loss": 0.3796, + "step": 2348 + }, + { + "epoch": 0.20039242450093842, + "grad_norm": 1.5323204658476444, + "learning_rate": 9.257958888426536e-05, + "loss": 0.3905, + "step": 2349 + }, + { + "epoch": 0.20047773417505546, + "grad_norm": 1.2793938752884548, + "learning_rate": 9.257234521738762e-05, + "loss": 0.3125, + "step": 2350 + }, + { + "epoch": 0.2005630438491725, + "grad_norm": 1.2746416261446618, + "learning_rate": 9.256509830033752e-05, + "loss": 0.3758, + "step": 2351 + }, + { + "epoch": 0.20064835352328955, + "grad_norm": 1.224199290263932, + "learning_rate": 9.255784813366837e-05, + "loss": 0.3486, + "step": 2352 + }, + { + "epoch": 0.2007336631974066, + "grad_norm": 1.4243959987242454, + "learning_rate": 9.255059471793369e-05, + "loss": 0.357, + "step": 2353 + }, + { + "epoch": 0.20081897287152364, + "grad_norm": 1.5610147665181908, + "learning_rate": 9.254333805368717e-05, + "loss": 0.3621, + "step": 2354 + }, + { + "epoch": 0.2009042825456407, + "grad_norm": 1.3462698176491765, + "learning_rate": 9.253607814148289e-05, + "loss": 0.2918, + "step": 2355 + }, + { + "epoch": 0.2009895922197577, + "grad_norm": 1.480816883231211, + "learning_rate": 9.25288149818751e-05, + "loss": 0.3488, + "step": 2356 + }, + { + "epoch": 0.20107490189387475, + "grad_norm": 1.3245001749864451, + "learning_rate": 9.252154857541825e-05, + "loss": 0.3274, + "step": 2357 + }, + { + "epoch": 0.2011602115679918, + "grad_norm": 1.1953710404565532, + "learning_rate": 9.251427892266712e-05, + "loss": 0.3374, + "step": 2358 + }, + { + "epoch": 0.20124552124210884, + "grad_norm": 1.2888937174955077, + "learning_rate": 9.250700602417675e-05, + "loss": 0.3398, + "step": 2359 + }, + { + "epoch": 0.2013308309162259, + "grad_norm": 1.3113029107109142, + "learning_rate": 9.249972988050233e-05, + "loss": 0.3122, + "step": 2360 + }, + { + "epoch": 0.20141614059034293, + "grad_norm": 1.558359840834, + "learning_rate": 9.249245049219939e-05, + "loss": 0.3987, + "step": 2361 + }, + { + "epoch": 0.20150145026445998, + "grad_norm": 1.3145612244275862, + "learning_rate": 9.248516785982364e-05, + "loss": 0.3118, + "step": 2362 + }, + { + "epoch": 0.20158675993857703, + "grad_norm": 1.447267931319856, + "learning_rate": 9.247788198393111e-05, + "loss": 0.4013, + "step": 2363 + }, + { + "epoch": 0.20167206961269407, + "grad_norm": 1.2464051051981606, + "learning_rate": 9.2470592865078e-05, + "loss": 0.3163, + "step": 2364 + }, + { + "epoch": 0.20175737928681112, + "grad_norm": 1.2205837832287891, + "learning_rate": 9.246330050382083e-05, + "loss": 0.3386, + "step": 2365 + }, + { + "epoch": 0.20184268896092816, + "grad_norm": 1.1774548757212486, + "learning_rate": 9.24560049007163e-05, + "loss": 0.3381, + "step": 2366 + }, + { + "epoch": 0.2019279986350452, + "grad_norm": 1.5123659468585662, + "learning_rate": 9.244870605632142e-05, + "loss": 0.3401, + "step": 2367 + }, + { + "epoch": 0.20201330830916225, + "grad_norm": 1.3951131851329, + "learning_rate": 9.24414039711934e-05, + "loss": 0.3295, + "step": 2368 + }, + { + "epoch": 0.2020986179832793, + "grad_norm": 1.569914582352549, + "learning_rate": 9.243409864588972e-05, + "loss": 0.3742, + "step": 2369 + }, + { + "epoch": 0.20218392765739635, + "grad_norm": 1.5652935233006344, + "learning_rate": 9.242679008096811e-05, + "loss": 0.3338, + "step": 2370 + }, + { + "epoch": 0.2022692373315134, + "grad_norm": 1.6499858653301198, + "learning_rate": 9.241947827698652e-05, + "loss": 0.3592, + "step": 2371 + }, + { + "epoch": 0.20235454700563044, + "grad_norm": 1.348653938188765, + "learning_rate": 9.24121632345032e-05, + "loss": 0.3276, + "step": 2372 + }, + { + "epoch": 0.20243985667974748, + "grad_norm": 1.3848536023600049, + "learning_rate": 9.240484495407657e-05, + "loss": 0.3424, + "step": 2373 + }, + { + "epoch": 0.20252516635386453, + "grad_norm": 1.4880061658813541, + "learning_rate": 9.239752343626538e-05, + "loss": 0.3835, + "step": 2374 + }, + { + "epoch": 0.20261047602798157, + "grad_norm": 1.3965206968479016, + "learning_rate": 9.239019868162856e-05, + "loss": 0.3291, + "step": 2375 + }, + { + "epoch": 0.20269578570209862, + "grad_norm": 1.7209146912980149, + "learning_rate": 9.238287069072535e-05, + "loss": 0.432, + "step": 2376 + }, + { + "epoch": 0.20278109537621566, + "grad_norm": 1.440644184241334, + "learning_rate": 9.237553946411519e-05, + "loss": 0.3957, + "step": 2377 + }, + { + "epoch": 0.2028664050503327, + "grad_norm": 1.3597275726157676, + "learning_rate": 9.236820500235776e-05, + "loss": 0.3441, + "step": 2378 + }, + { + "epoch": 0.20295171472444976, + "grad_norm": 1.484846421722767, + "learning_rate": 9.236086730601304e-05, + "loss": 0.3628, + "step": 2379 + }, + { + "epoch": 0.2030370243985668, + "grad_norm": 1.8558818156580406, + "learning_rate": 9.235352637564118e-05, + "loss": 0.395, + "step": 2380 + }, + { + "epoch": 0.20312233407268385, + "grad_norm": 1.3348555617472397, + "learning_rate": 9.234618221180267e-05, + "loss": 0.3782, + "step": 2381 + }, + { + "epoch": 0.2032076437468009, + "grad_norm": 1.3901689617462902, + "learning_rate": 9.233883481505817e-05, + "loss": 0.3259, + "step": 2382 + }, + { + "epoch": 0.20329295342091794, + "grad_norm": 1.2862269338869552, + "learning_rate": 9.233148418596862e-05, + "loss": 0.362, + "step": 2383 + }, + { + "epoch": 0.20337826309503498, + "grad_norm": 1.5459241931189127, + "learning_rate": 9.23241303250952e-05, + "loss": 0.4038, + "step": 2384 + }, + { + "epoch": 0.20346357276915203, + "grad_norm": 1.4247934760632408, + "learning_rate": 9.231677323299935e-05, + "loss": 0.3988, + "step": 2385 + }, + { + "epoch": 0.20354888244326907, + "grad_norm": 1.5112149288629535, + "learning_rate": 9.230941291024273e-05, + "loss": 0.3389, + "step": 2386 + }, + { + "epoch": 0.20363419211738612, + "grad_norm": 1.6065335420537519, + "learning_rate": 9.230204935738725e-05, + "loss": 0.3707, + "step": 2387 + }, + { + "epoch": 0.20371950179150317, + "grad_norm": 1.517922930533949, + "learning_rate": 9.229468257499511e-05, + "loss": 0.3365, + "step": 2388 + }, + { + "epoch": 0.2038048114656202, + "grad_norm": 1.3760548793361493, + "learning_rate": 9.22873125636287e-05, + "loss": 0.359, + "step": 2389 + }, + { + "epoch": 0.20389012113973726, + "grad_norm": 1.5267688030048883, + "learning_rate": 9.227993932385069e-05, + "loss": 0.4021, + "step": 2390 + }, + { + "epoch": 0.2039754308138543, + "grad_norm": 1.3188660050845482, + "learning_rate": 9.2272562856224e-05, + "loss": 0.358, + "step": 2391 + }, + { + "epoch": 0.20406074048797135, + "grad_norm": 1.4464959325696958, + "learning_rate": 9.226518316131176e-05, + "loss": 0.3016, + "step": 2392 + }, + { + "epoch": 0.2041460501620884, + "grad_norm": 1.4035986595743148, + "learning_rate": 9.22578002396774e-05, + "loss": 0.3583, + "step": 2393 + }, + { + "epoch": 0.20423135983620544, + "grad_norm": 1.347768408489516, + "learning_rate": 9.225041409188453e-05, + "loss": 0.3327, + "step": 2394 + }, + { + "epoch": 0.20431666951032246, + "grad_norm": 1.3565403113388232, + "learning_rate": 9.224302471849707e-05, + "loss": 0.3608, + "step": 2395 + }, + { + "epoch": 0.2044019791844395, + "grad_norm": 1.366081487973002, + "learning_rate": 9.223563212007915e-05, + "loss": 0.3553, + "step": 2396 + }, + { + "epoch": 0.20448728885855655, + "grad_norm": 1.4130474683775431, + "learning_rate": 9.222823629719516e-05, + "loss": 0.3784, + "step": 2397 + }, + { + "epoch": 0.2045725985326736, + "grad_norm": 1.5195680432412895, + "learning_rate": 9.222083725040973e-05, + "loss": 0.3737, + "step": 2398 + }, + { + "epoch": 0.20465790820679064, + "grad_norm": 1.2980811703169588, + "learning_rate": 9.221343498028774e-05, + "loss": 0.3557, + "step": 2399 + }, + { + "epoch": 0.20474321788090769, + "grad_norm": 1.618125606579481, + "learning_rate": 9.22060294873943e-05, + "loss": 0.3267, + "step": 2400 + }, + { + "epoch": 0.20482852755502473, + "grad_norm": 1.9377799293353495, + "learning_rate": 9.21986207722948e-05, + "loss": 0.4057, + "step": 2401 + }, + { + "epoch": 0.20491383722914178, + "grad_norm": 1.8917022662517073, + "learning_rate": 9.219120883555486e-05, + "loss": 0.3746, + "step": 2402 + }, + { + "epoch": 0.20499914690325882, + "grad_norm": 1.961337956728868, + "learning_rate": 9.21837936777403e-05, + "loss": 0.3825, + "step": 2403 + }, + { + "epoch": 0.20508445657737587, + "grad_norm": 1.9520093249035717, + "learning_rate": 9.217637529941727e-05, + "loss": 0.3644, + "step": 2404 + }, + { + "epoch": 0.2051697662514929, + "grad_norm": 1.3094727709554865, + "learning_rate": 9.216895370115211e-05, + "loss": 0.3113, + "step": 2405 + }, + { + "epoch": 0.20525507592560996, + "grad_norm": 1.2992544292016988, + "learning_rate": 9.216152888351144e-05, + "loss": 0.3544, + "step": 2406 + }, + { + "epoch": 0.205340385599727, + "grad_norm": 1.6388924305908095, + "learning_rate": 9.215410084706206e-05, + "loss": 0.3792, + "step": 2407 + }, + { + "epoch": 0.20542569527384405, + "grad_norm": 1.477758901797391, + "learning_rate": 9.214666959237109e-05, + "loss": 0.3704, + "step": 2408 + }, + { + "epoch": 0.2055110049479611, + "grad_norm": 1.403383045214586, + "learning_rate": 9.213923512000588e-05, + "loss": 0.3781, + "step": 2409 + }, + { + "epoch": 0.20559631462207814, + "grad_norm": 1.2863063130410417, + "learning_rate": 9.213179743053399e-05, + "loss": 0.3683, + "step": 2410 + }, + { + "epoch": 0.2056816242961952, + "grad_norm": 1.5397853025417485, + "learning_rate": 9.212435652452324e-05, + "loss": 0.3352, + "step": 2411 + }, + { + "epoch": 0.20576693397031223, + "grad_norm": 1.5513416882267979, + "learning_rate": 9.211691240254173e-05, + "loss": 0.3259, + "step": 2412 + }, + { + "epoch": 0.20585224364442928, + "grad_norm": 1.1969862047131878, + "learning_rate": 9.210946506515777e-05, + "loss": 0.3188, + "step": 2413 + }, + { + "epoch": 0.20593755331854632, + "grad_norm": 1.778951868972562, + "learning_rate": 9.210201451293992e-05, + "loss": 0.4078, + "step": 2414 + }, + { + "epoch": 0.20602286299266337, + "grad_norm": 1.4799737832608402, + "learning_rate": 9.209456074645699e-05, + "loss": 0.3627, + "step": 2415 + }, + { + "epoch": 0.20610817266678041, + "grad_norm": 1.3096378089982557, + "learning_rate": 9.208710376627803e-05, + "loss": 0.3501, + "step": 2416 + }, + { + "epoch": 0.20619348234089746, + "grad_norm": 1.2827926281076356, + "learning_rate": 9.207964357297235e-05, + "loss": 0.3354, + "step": 2417 + }, + { + "epoch": 0.2062787920150145, + "grad_norm": 1.3862663283216288, + "learning_rate": 9.20721801671095e-05, + "loss": 0.3067, + "step": 2418 + }, + { + "epoch": 0.20636410168913155, + "grad_norm": 1.2931146913894684, + "learning_rate": 9.206471354925928e-05, + "loss": 0.3876, + "step": 2419 + }, + { + "epoch": 0.2064494113632486, + "grad_norm": 1.3959073668627056, + "learning_rate": 9.205724371999169e-05, + "loss": 0.4102, + "step": 2420 + }, + { + "epoch": 0.20653472103736564, + "grad_norm": 1.736274709834672, + "learning_rate": 9.204977067987704e-05, + "loss": 0.4128, + "step": 2421 + }, + { + "epoch": 0.2066200307114827, + "grad_norm": 1.5333225471006633, + "learning_rate": 9.204229442948585e-05, + "loss": 0.39, + "step": 2422 + }, + { + "epoch": 0.20670534038559973, + "grad_norm": 1.4427269155175517, + "learning_rate": 9.203481496938888e-05, + "loss": 0.3523, + "step": 2423 + }, + { + "epoch": 0.20679065005971678, + "grad_norm": 1.4028196244665427, + "learning_rate": 9.202733230015719e-05, + "loss": 0.3261, + "step": 2424 + }, + { + "epoch": 0.20687595973383383, + "grad_norm": 1.8118557936146182, + "learning_rate": 9.201984642236198e-05, + "loss": 0.3771, + "step": 2425 + }, + { + "epoch": 0.20696126940795087, + "grad_norm": 1.2188137233409726, + "learning_rate": 9.201235733657481e-05, + "loss": 0.317, + "step": 2426 + }, + { + "epoch": 0.20704657908206792, + "grad_norm": 1.6089512535662909, + "learning_rate": 9.20048650433674e-05, + "loss": 0.3577, + "step": 2427 + }, + { + "epoch": 0.20713188875618496, + "grad_norm": 1.6002979031034716, + "learning_rate": 9.199736954331177e-05, + "loss": 0.3791, + "step": 2428 + }, + { + "epoch": 0.207217198430302, + "grad_norm": 1.5744622659822474, + "learning_rate": 9.198987083698014e-05, + "loss": 0.4074, + "step": 2429 + }, + { + "epoch": 0.20730250810441905, + "grad_norm": 1.7341199120239505, + "learning_rate": 9.198236892494501e-05, + "loss": 0.4313, + "step": 2430 + }, + { + "epoch": 0.2073878177785361, + "grad_norm": 1.432783599528768, + "learning_rate": 9.197486380777911e-05, + "loss": 0.3215, + "step": 2431 + }, + { + "epoch": 0.20747312745265314, + "grad_norm": 1.6953345793943118, + "learning_rate": 9.196735548605541e-05, + "loss": 0.3083, + "step": 2432 + }, + { + "epoch": 0.20755843712677016, + "grad_norm": 1.770853521194574, + "learning_rate": 9.195984396034713e-05, + "loss": 0.3769, + "step": 2433 + }, + { + "epoch": 0.2076437468008872, + "grad_norm": 1.4495754574112816, + "learning_rate": 9.195232923122773e-05, + "loss": 0.3703, + "step": 2434 + }, + { + "epoch": 0.20772905647500425, + "grad_norm": 1.4033545036636987, + "learning_rate": 9.194481129927093e-05, + "loss": 0.3034, + "step": 2435 + }, + { + "epoch": 0.2078143661491213, + "grad_norm": 1.5077834052406667, + "learning_rate": 9.193729016505069e-05, + "loss": 0.3283, + "step": 2436 + }, + { + "epoch": 0.20789967582323834, + "grad_norm": 1.7840246040247043, + "learning_rate": 9.19297658291412e-05, + "loss": 0.3585, + "step": 2437 + }, + { + "epoch": 0.2079849854973554, + "grad_norm": 1.3880865953432335, + "learning_rate": 9.19222382921169e-05, + "loss": 0.3418, + "step": 2438 + }, + { + "epoch": 0.20807029517147244, + "grad_norm": 1.6402695043608033, + "learning_rate": 9.191470755455248e-05, + "loss": 0.4165, + "step": 2439 + }, + { + "epoch": 0.20815560484558948, + "grad_norm": 1.635829347024894, + "learning_rate": 9.190717361702288e-05, + "loss": 0.3806, + "step": 2440 + }, + { + "epoch": 0.20824091451970653, + "grad_norm": 1.5357012764009006, + "learning_rate": 9.189963648010326e-05, + "loss": 0.392, + "step": 2441 + }, + { + "epoch": 0.20832622419382357, + "grad_norm": 1.346802902947275, + "learning_rate": 9.189209614436906e-05, + "loss": 0.3455, + "step": 2442 + }, + { + "epoch": 0.20841153386794062, + "grad_norm": 1.4938792280536783, + "learning_rate": 9.188455261039592e-05, + "loss": 0.3607, + "step": 2443 + }, + { + "epoch": 0.20849684354205766, + "grad_norm": 1.5108354535993598, + "learning_rate": 9.187700587875977e-05, + "loss": 0.3445, + "step": 2444 + }, + { + "epoch": 0.2085821532161747, + "grad_norm": 1.2687204442081628, + "learning_rate": 9.186945595003676e-05, + "loss": 0.3389, + "step": 2445 + }, + { + "epoch": 0.20866746289029175, + "grad_norm": 1.8302418768871445, + "learning_rate": 9.186190282480327e-05, + "loss": 0.4075, + "step": 2446 + }, + { + "epoch": 0.2087527725644088, + "grad_norm": 1.3374360363362778, + "learning_rate": 9.185434650363596e-05, + "loss": 0.3885, + "step": 2447 + }, + { + "epoch": 0.20883808223852585, + "grad_norm": 1.3834288349660078, + "learning_rate": 9.184678698711171e-05, + "loss": 0.3492, + "step": 2448 + }, + { + "epoch": 0.2089233919126429, + "grad_norm": 1.3524652013338798, + "learning_rate": 9.183922427580764e-05, + "loss": 0.3192, + "step": 2449 + }, + { + "epoch": 0.20900870158675994, + "grad_norm": 1.6392876396797145, + "learning_rate": 9.183165837030114e-05, + "loss": 0.3344, + "step": 2450 + }, + { + "epoch": 0.20909401126087698, + "grad_norm": 1.2094839416642693, + "learning_rate": 9.182408927116981e-05, + "loss": 0.2738, + "step": 2451 + }, + { + "epoch": 0.20917932093499403, + "grad_norm": 1.3544766095839507, + "learning_rate": 9.181651697899152e-05, + "loss": 0.3696, + "step": 2452 + }, + { + "epoch": 0.20926463060911107, + "grad_norm": 1.5317115088212574, + "learning_rate": 9.180894149434437e-05, + "loss": 0.3525, + "step": 2453 + }, + { + "epoch": 0.20934994028322812, + "grad_norm": 1.217603682346766, + "learning_rate": 9.18013628178067e-05, + "loss": 0.3335, + "step": 2454 + }, + { + "epoch": 0.20943524995734517, + "grad_norm": 1.402362408952185, + "learning_rate": 9.179378094995712e-05, + "loss": 0.2805, + "step": 2455 + }, + { + "epoch": 0.2095205596314622, + "grad_norm": 1.4270468751283107, + "learning_rate": 9.178619589137447e-05, + "loss": 0.3314, + "step": 2456 + }, + { + "epoch": 0.20960586930557926, + "grad_norm": 1.3681844945303623, + "learning_rate": 9.177860764263779e-05, + "loss": 0.3345, + "step": 2457 + }, + { + "epoch": 0.2096911789796963, + "grad_norm": 1.233459132884068, + "learning_rate": 9.177101620432644e-05, + "loss": 0.3139, + "step": 2458 + }, + { + "epoch": 0.20977648865381335, + "grad_norm": 1.1838493448884542, + "learning_rate": 9.176342157701998e-05, + "loss": 0.275, + "step": 2459 + }, + { + "epoch": 0.2098617983279304, + "grad_norm": 1.9551354398910248, + "learning_rate": 9.17558237612982e-05, + "loss": 0.3957, + "step": 2460 + }, + { + "epoch": 0.20994710800204744, + "grad_norm": 1.4424489289602913, + "learning_rate": 9.174822275774117e-05, + "loss": 0.3392, + "step": 2461 + }, + { + "epoch": 0.21003241767616448, + "grad_norm": 1.3569329787860698, + "learning_rate": 9.174061856692919e-05, + "loss": 0.3479, + "step": 2462 + }, + { + "epoch": 0.21011772735028153, + "grad_norm": 1.4175475785009273, + "learning_rate": 9.17330111894428e-05, + "loss": 0.2839, + "step": 2463 + }, + { + "epoch": 0.21020303702439858, + "grad_norm": 1.6319628973745028, + "learning_rate": 9.172540062586276e-05, + "loss": 0.351, + "step": 2464 + }, + { + "epoch": 0.21028834669851562, + "grad_norm": 1.6420704604816077, + "learning_rate": 9.171778687677011e-05, + "loss": 0.3737, + "step": 2465 + }, + { + "epoch": 0.21037365637263267, + "grad_norm": 1.3499283806912148, + "learning_rate": 9.171016994274612e-05, + "loss": 0.3417, + "step": 2466 + }, + { + "epoch": 0.2104589660467497, + "grad_norm": 1.3960132577807627, + "learning_rate": 9.17025498243723e-05, + "loss": 0.3548, + "step": 2467 + }, + { + "epoch": 0.21054427572086676, + "grad_norm": 1.2562467888178992, + "learning_rate": 9.169492652223042e-05, + "loss": 0.3924, + "step": 2468 + }, + { + "epoch": 0.2106295853949838, + "grad_norm": 1.66581554933221, + "learning_rate": 9.168730003690246e-05, + "loss": 0.3975, + "step": 2469 + }, + { + "epoch": 0.21071489506910085, + "grad_norm": 1.443450883537793, + "learning_rate": 9.167967036897066e-05, + "loss": 0.3249, + "step": 2470 + }, + { + "epoch": 0.21080020474321787, + "grad_norm": 1.4607237797227726, + "learning_rate": 9.167203751901751e-05, + "loss": 0.3133, + "step": 2471 + }, + { + "epoch": 0.2108855144173349, + "grad_norm": 1.5363266538239047, + "learning_rate": 9.166440148762576e-05, + "loss": 0.4093, + "step": 2472 + }, + { + "epoch": 0.21097082409145196, + "grad_norm": 1.120903131847718, + "learning_rate": 9.165676227537836e-05, + "loss": 0.3392, + "step": 2473 + }, + { + "epoch": 0.211056133765569, + "grad_norm": 1.5908269068310223, + "learning_rate": 9.164911988285851e-05, + "loss": 0.3959, + "step": 2474 + }, + { + "epoch": 0.21114144343968605, + "grad_norm": 1.301595759338922, + "learning_rate": 9.164147431064969e-05, + "loss": 0.3281, + "step": 2475 + }, + { + "epoch": 0.2112267531138031, + "grad_norm": 1.357941759846635, + "learning_rate": 9.163382555933558e-05, + "loss": 0.3694, + "step": 2476 + }, + { + "epoch": 0.21131206278792014, + "grad_norm": 1.6576855753813486, + "learning_rate": 9.162617362950015e-05, + "loss": 0.3498, + "step": 2477 + }, + { + "epoch": 0.2113973724620372, + "grad_norm": 1.8114653955913838, + "learning_rate": 9.161851852172754e-05, + "loss": 0.3919, + "step": 2478 + }, + { + "epoch": 0.21148268213615423, + "grad_norm": 1.6120664442486878, + "learning_rate": 9.161086023660222e-05, + "loss": 0.3803, + "step": 2479 + }, + { + "epoch": 0.21156799181027128, + "grad_norm": 1.8410586094102739, + "learning_rate": 9.160319877470882e-05, + "loss": 0.3454, + "step": 2480 + }, + { + "epoch": 0.21165330148438832, + "grad_norm": 1.806276356857873, + "learning_rate": 9.159553413663228e-05, + "loss": 0.3742, + "step": 2481 + }, + { + "epoch": 0.21173861115850537, + "grad_norm": 1.4551284269304494, + "learning_rate": 9.158786632295776e-05, + "loss": 0.3494, + "step": 2482 + }, + { + "epoch": 0.21182392083262241, + "grad_norm": 1.3178371723299727, + "learning_rate": 9.158019533427064e-05, + "loss": 0.4202, + "step": 2483 + }, + { + "epoch": 0.21190923050673946, + "grad_norm": 1.5383646975812602, + "learning_rate": 9.157252117115656e-05, + "loss": 0.3999, + "step": 2484 + }, + { + "epoch": 0.2119945401808565, + "grad_norm": 1.3869244060337473, + "learning_rate": 9.156484383420141e-05, + "loss": 0.3403, + "step": 2485 + }, + { + "epoch": 0.21207984985497355, + "grad_norm": 1.2462334151011, + "learning_rate": 9.155716332399129e-05, + "loss": 0.3171, + "step": 2486 + }, + { + "epoch": 0.2121651595290906, + "grad_norm": 1.5203456706459728, + "learning_rate": 9.15494796411126e-05, + "loss": 0.3649, + "step": 2487 + }, + { + "epoch": 0.21225046920320764, + "grad_norm": 1.366121061475942, + "learning_rate": 9.154179278615194e-05, + "loss": 0.3377, + "step": 2488 + }, + { + "epoch": 0.2123357788773247, + "grad_norm": 1.3742542845624697, + "learning_rate": 9.153410275969613e-05, + "loss": 0.3626, + "step": 2489 + }, + { + "epoch": 0.21242108855144173, + "grad_norm": 1.380569880893272, + "learning_rate": 9.152640956233231e-05, + "loss": 0.3308, + "step": 2490 + }, + { + "epoch": 0.21250639822555878, + "grad_norm": 1.2961342316066142, + "learning_rate": 9.151871319464778e-05, + "loss": 0.3153, + "step": 2491 + }, + { + "epoch": 0.21259170789967582, + "grad_norm": 1.4982626708570794, + "learning_rate": 9.151101365723013e-05, + "loss": 0.3609, + "step": 2492 + }, + { + "epoch": 0.21267701757379287, + "grad_norm": 1.5771080744157806, + "learning_rate": 9.150331095066717e-05, + "loss": 0.3932, + "step": 2493 + }, + { + "epoch": 0.21276232724790992, + "grad_norm": 1.4456983721329515, + "learning_rate": 9.149560507554698e-05, + "loss": 0.3726, + "step": 2494 + }, + { + "epoch": 0.21284763692202696, + "grad_norm": 1.3615368719974474, + "learning_rate": 9.148789603245784e-05, + "loss": 0.3273, + "step": 2495 + }, + { + "epoch": 0.212932946596144, + "grad_norm": 1.6606658512441705, + "learning_rate": 9.148018382198831e-05, + "loss": 0.3427, + "step": 2496 + }, + { + "epoch": 0.21301825627026105, + "grad_norm": 1.2023470456559668, + "learning_rate": 9.147246844472716e-05, + "loss": 0.3466, + "step": 2497 + }, + { + "epoch": 0.2131035659443781, + "grad_norm": 1.5191862602177038, + "learning_rate": 9.146474990126343e-05, + "loss": 0.3446, + "step": 2498 + }, + { + "epoch": 0.21318887561849514, + "grad_norm": 1.5001357335031738, + "learning_rate": 9.14570281921864e-05, + "loss": 0.3165, + "step": 2499 + }, + { + "epoch": 0.2132741852926122, + "grad_norm": 1.319026795034846, + "learning_rate": 9.144930331808557e-05, + "loss": 0.3172, + "step": 2500 + }, + { + "epoch": 0.21335949496672924, + "grad_norm": 1.4017255690343615, + "learning_rate": 9.144157527955069e-05, + "loss": 0.3085, + "step": 2501 + }, + { + "epoch": 0.21344480464084628, + "grad_norm": 1.6386155674641294, + "learning_rate": 9.143384407717175e-05, + "loss": 0.3967, + "step": 2502 + }, + { + "epoch": 0.21353011431496333, + "grad_norm": 1.7829661885648391, + "learning_rate": 9.1426109711539e-05, + "loss": 0.3607, + "step": 2503 + }, + { + "epoch": 0.21361542398908037, + "grad_norm": 1.5527215537255643, + "learning_rate": 9.141837218324292e-05, + "loss": 0.3833, + "step": 2504 + }, + { + "epoch": 0.21370073366319742, + "grad_norm": 1.6036107805201332, + "learning_rate": 9.141063149287421e-05, + "loss": 0.3673, + "step": 2505 + }, + { + "epoch": 0.21378604333731446, + "grad_norm": 1.4137258392347143, + "learning_rate": 9.140288764102384e-05, + "loss": 0.3845, + "step": 2506 + }, + { + "epoch": 0.2138713530114315, + "grad_norm": 1.6888205694649085, + "learning_rate": 9.1395140628283e-05, + "loss": 0.4281, + "step": 2507 + }, + { + "epoch": 0.21395666268554855, + "grad_norm": 1.474561273661873, + "learning_rate": 9.138739045524318e-05, + "loss": 0.3614, + "step": 2508 + }, + { + "epoch": 0.2140419723596656, + "grad_norm": 1.4979423078624645, + "learning_rate": 9.1379637122496e-05, + "loss": 0.3948, + "step": 2509 + }, + { + "epoch": 0.21412728203378262, + "grad_norm": 1.250603339500001, + "learning_rate": 9.137188063063344e-05, + "loss": 0.34, + "step": 2510 + }, + { + "epoch": 0.21421259170789966, + "grad_norm": 1.6484105980291512, + "learning_rate": 9.136412098024763e-05, + "loss": 0.4213, + "step": 2511 + }, + { + "epoch": 0.2142979013820167, + "grad_norm": 1.3052067437229489, + "learning_rate": 9.1356358171931e-05, + "loss": 0.3253, + "step": 2512 + }, + { + "epoch": 0.21438321105613375, + "grad_norm": 1.5268745189389608, + "learning_rate": 9.134859220627618e-05, + "loss": 0.3097, + "step": 2513 + }, + { + "epoch": 0.2144685207302508, + "grad_norm": 1.2927224653620677, + "learning_rate": 9.134082308387608e-05, + "loss": 0.3303, + "step": 2514 + }, + { + "epoch": 0.21455383040436785, + "grad_norm": 1.4240128092564734, + "learning_rate": 9.133305080532384e-05, + "loss": 0.3232, + "step": 2515 + }, + { + "epoch": 0.2146391400784849, + "grad_norm": 1.4428561395428527, + "learning_rate": 9.132527537121278e-05, + "loss": 0.319, + "step": 2516 + }, + { + "epoch": 0.21472444975260194, + "grad_norm": 1.19223678906945, + "learning_rate": 9.131749678213657e-05, + "loss": 0.3215, + "step": 2517 + }, + { + "epoch": 0.21480975942671898, + "grad_norm": 1.9645781138388863, + "learning_rate": 9.130971503868904e-05, + "loss": 0.3984, + "step": 2518 + }, + { + "epoch": 0.21489506910083603, + "grad_norm": 1.2212422637538238, + "learning_rate": 9.130193014146427e-05, + "loss": 0.4047, + "step": 2519 + }, + { + "epoch": 0.21498037877495307, + "grad_norm": 1.5302571561783445, + "learning_rate": 9.129414209105664e-05, + "loss": 0.3196, + "step": 2520 + }, + { + "epoch": 0.21506568844907012, + "grad_norm": 1.4244108956648431, + "learning_rate": 9.128635088806068e-05, + "loss": 0.352, + "step": 2521 + }, + { + "epoch": 0.21515099812318716, + "grad_norm": 1.5373189121693727, + "learning_rate": 9.127855653307123e-05, + "loss": 0.3442, + "step": 2522 + }, + { + "epoch": 0.2152363077973042, + "grad_norm": 1.5374218866617808, + "learning_rate": 9.127075902668333e-05, + "loss": 0.3952, + "step": 2523 + }, + { + "epoch": 0.21532161747142126, + "grad_norm": 1.6697366890648735, + "learning_rate": 9.126295836949231e-05, + "loss": 0.3763, + "step": 2524 + }, + { + "epoch": 0.2154069271455383, + "grad_norm": 1.4681040378209198, + "learning_rate": 9.125515456209367e-05, + "loss": 0.3195, + "step": 2525 + }, + { + "epoch": 0.21549223681965535, + "grad_norm": 1.3732274073810598, + "learning_rate": 9.124734760508323e-05, + "loss": 0.3744, + "step": 2526 + }, + { + "epoch": 0.2155775464937724, + "grad_norm": 1.3732056180045116, + "learning_rate": 9.123953749905697e-05, + "loss": 0.3688, + "step": 2527 + }, + { + "epoch": 0.21566285616788944, + "grad_norm": 1.3121776639129543, + "learning_rate": 9.123172424461118e-05, + "loss": 0.3737, + "step": 2528 + }, + { + "epoch": 0.21574816584200648, + "grad_norm": 1.3532219550304436, + "learning_rate": 9.122390784234233e-05, + "loss": 0.3282, + "step": 2529 + }, + { + "epoch": 0.21583347551612353, + "grad_norm": 1.4615922572393845, + "learning_rate": 9.12160882928472e-05, + "loss": 0.3452, + "step": 2530 + }, + { + "epoch": 0.21591878519024058, + "grad_norm": 1.6051082902544624, + "learning_rate": 9.120826559672275e-05, + "loss": 0.3306, + "step": 2531 + }, + { + "epoch": 0.21600409486435762, + "grad_norm": 1.498088254195459, + "learning_rate": 9.12004397545662e-05, + "loss": 0.3626, + "step": 2532 + }, + { + "epoch": 0.21608940453847467, + "grad_norm": 1.637473139651779, + "learning_rate": 9.1192610766975e-05, + "loss": 0.3228, + "step": 2533 + }, + { + "epoch": 0.2161747142125917, + "grad_norm": 1.416491366741507, + "learning_rate": 9.118477863454688e-05, + "loss": 0.3745, + "step": 2534 + }, + { + "epoch": 0.21626002388670876, + "grad_norm": 1.7976119810619302, + "learning_rate": 9.117694335787976e-05, + "loss": 0.393, + "step": 2535 + }, + { + "epoch": 0.2163453335608258, + "grad_norm": 1.2408565372443479, + "learning_rate": 9.116910493757185e-05, + "loss": 0.3264, + "step": 2536 + }, + { + "epoch": 0.21643064323494285, + "grad_norm": 1.6390415361225625, + "learning_rate": 9.116126337422152e-05, + "loss": 0.3976, + "step": 2537 + }, + { + "epoch": 0.2165159529090599, + "grad_norm": 1.6728152189298582, + "learning_rate": 9.115341866842748e-05, + "loss": 0.3962, + "step": 2538 + }, + { + "epoch": 0.21660126258317694, + "grad_norm": 1.6217594112889395, + "learning_rate": 9.114557082078861e-05, + "loss": 0.3685, + "step": 2539 + }, + { + "epoch": 0.21668657225729399, + "grad_norm": 1.6731055891345257, + "learning_rate": 9.113771983190405e-05, + "loss": 0.3546, + "step": 2540 + }, + { + "epoch": 0.21677188193141103, + "grad_norm": 1.401661146215816, + "learning_rate": 9.112986570237319e-05, + "loss": 0.3652, + "step": 2541 + }, + { + "epoch": 0.21685719160552808, + "grad_norm": 1.516290184425666, + "learning_rate": 9.112200843279565e-05, + "loss": 0.332, + "step": 2542 + }, + { + "epoch": 0.21694250127964512, + "grad_norm": 1.6343599862162184, + "learning_rate": 9.11141480237713e-05, + "loss": 0.34, + "step": 2543 + }, + { + "epoch": 0.21702781095376217, + "grad_norm": 1.6673515660965446, + "learning_rate": 9.110628447590021e-05, + "loss": 0.3597, + "step": 2544 + }, + { + "epoch": 0.2171131206278792, + "grad_norm": 1.6222544731493502, + "learning_rate": 9.109841778978274e-05, + "loss": 0.4057, + "step": 2545 + }, + { + "epoch": 0.21719843030199626, + "grad_norm": 1.4205904177791921, + "learning_rate": 9.109054796601947e-05, + "loss": 0.3241, + "step": 2546 + }, + { + "epoch": 0.2172837399761133, + "grad_norm": 1.379005536619473, + "learning_rate": 9.108267500521121e-05, + "loss": 0.3693, + "step": 2547 + }, + { + "epoch": 0.21736904965023032, + "grad_norm": 1.4418868424119837, + "learning_rate": 9.107479890795905e-05, + "loss": 0.3782, + "step": 2548 + }, + { + "epoch": 0.21745435932434737, + "grad_norm": 1.3533461163219331, + "learning_rate": 9.106691967486424e-05, + "loss": 0.3232, + "step": 2549 + }, + { + "epoch": 0.2175396689984644, + "grad_norm": 1.4196227964233161, + "learning_rate": 9.105903730652836e-05, + "loss": 0.3543, + "step": 2550 + }, + { + "epoch": 0.21762497867258146, + "grad_norm": 1.4030589030782714, + "learning_rate": 9.105115180355317e-05, + "loss": 0.3267, + "step": 2551 + }, + { + "epoch": 0.2177102883466985, + "grad_norm": 1.386911513134561, + "learning_rate": 9.104326316654067e-05, + "loss": 0.353, + "step": 2552 + }, + { + "epoch": 0.21779559802081555, + "grad_norm": 1.38546141454188, + "learning_rate": 9.103537139609314e-05, + "loss": 0.3688, + "step": 2553 + }, + { + "epoch": 0.2178809076949326, + "grad_norm": 1.4614512310367571, + "learning_rate": 9.102747649281307e-05, + "loss": 0.3672, + "step": 2554 + }, + { + "epoch": 0.21796621736904964, + "grad_norm": 1.8703736769280128, + "learning_rate": 9.101957845730318e-05, + "loss": 0.4214, + "step": 2555 + }, + { + "epoch": 0.2180515270431667, + "grad_norm": 1.4159336250503043, + "learning_rate": 9.101167729016646e-05, + "loss": 0.4039, + "step": 2556 + }, + { + "epoch": 0.21813683671728373, + "grad_norm": 1.7940643885698397, + "learning_rate": 9.10037729920061e-05, + "loss": 0.3958, + "step": 2557 + }, + { + "epoch": 0.21822214639140078, + "grad_norm": 1.4307290540912012, + "learning_rate": 9.099586556342559e-05, + "loss": 0.3498, + "step": 2558 + }, + { + "epoch": 0.21830745606551782, + "grad_norm": 1.4647261508784708, + "learning_rate": 9.098795500502858e-05, + "loss": 0.3477, + "step": 2559 + }, + { + "epoch": 0.21839276573963487, + "grad_norm": 1.4960457019709874, + "learning_rate": 9.098004131741902e-05, + "loss": 0.3921, + "step": 2560 + }, + { + "epoch": 0.21847807541375192, + "grad_norm": 1.4307566329639663, + "learning_rate": 9.097212450120108e-05, + "loss": 0.3653, + "step": 2561 + }, + { + "epoch": 0.21856338508786896, + "grad_norm": 1.399219145314189, + "learning_rate": 9.096420455697914e-05, + "loss": 0.3668, + "step": 2562 + }, + { + "epoch": 0.218648694761986, + "grad_norm": 1.579731340865897, + "learning_rate": 9.095628148535788e-05, + "loss": 0.2983, + "step": 2563 + }, + { + "epoch": 0.21873400443610305, + "grad_norm": 1.1677883353322214, + "learning_rate": 9.094835528694217e-05, + "loss": 0.3224, + "step": 2564 + }, + { + "epoch": 0.2188193141102201, + "grad_norm": 1.5796430481524666, + "learning_rate": 9.094042596233712e-05, + "loss": 0.4247, + "step": 2565 + }, + { + "epoch": 0.21890462378433714, + "grad_norm": 1.290689539738869, + "learning_rate": 9.093249351214812e-05, + "loss": 0.3567, + "step": 2566 + }, + { + "epoch": 0.2189899334584542, + "grad_norm": 1.5078774077522656, + "learning_rate": 9.092455793698075e-05, + "loss": 0.3206, + "step": 2567 + }, + { + "epoch": 0.21907524313257123, + "grad_norm": 1.59773250481797, + "learning_rate": 9.091661923744086e-05, + "loss": 0.3173, + "step": 2568 + }, + { + "epoch": 0.21916055280668828, + "grad_norm": 1.6225535243159221, + "learning_rate": 9.090867741413452e-05, + "loss": 0.403, + "step": 2569 + }, + { + "epoch": 0.21924586248080533, + "grad_norm": 1.3984306377903168, + "learning_rate": 9.090073246766803e-05, + "loss": 0.3583, + "step": 2570 + }, + { + "epoch": 0.21933117215492237, + "grad_norm": 1.3326279542620254, + "learning_rate": 9.089278439864797e-05, + "loss": 0.3714, + "step": 2571 + }, + { + "epoch": 0.21941648182903942, + "grad_norm": 1.5957929196066907, + "learning_rate": 9.088483320768115e-05, + "loss": 0.3993, + "step": 2572 + }, + { + "epoch": 0.21950179150315646, + "grad_norm": 1.4248966346863219, + "learning_rate": 9.087687889537454e-05, + "loss": 0.3042, + "step": 2573 + }, + { + "epoch": 0.2195871011772735, + "grad_norm": 1.470827783820874, + "learning_rate": 9.086892146233548e-05, + "loss": 0.3298, + "step": 2574 + }, + { + "epoch": 0.21967241085139055, + "grad_norm": 1.4663594046880324, + "learning_rate": 9.086096090917143e-05, + "loss": 0.3443, + "step": 2575 + }, + { + "epoch": 0.2197577205255076, + "grad_norm": 1.6038525096216387, + "learning_rate": 9.085299723649014e-05, + "loss": 0.3427, + "step": 2576 + }, + { + "epoch": 0.21984303019962464, + "grad_norm": 1.4610858372741338, + "learning_rate": 9.08450304448996e-05, + "loss": 0.341, + "step": 2577 + }, + { + "epoch": 0.2199283398737417, + "grad_norm": 1.4501142588354818, + "learning_rate": 9.083706053500806e-05, + "loss": 0.332, + "step": 2578 + }, + { + "epoch": 0.22001364954785874, + "grad_norm": 1.2776010236715707, + "learning_rate": 9.082908750742394e-05, + "loss": 0.3217, + "step": 2579 + }, + { + "epoch": 0.22009895922197578, + "grad_norm": 1.6841719977464191, + "learning_rate": 9.082111136275596e-05, + "loss": 0.3918, + "step": 2580 + }, + { + "epoch": 0.22018426889609283, + "grad_norm": 1.3136430032207718, + "learning_rate": 9.081313210161304e-05, + "loss": 0.3278, + "step": 2581 + }, + { + "epoch": 0.22026957857020987, + "grad_norm": 1.4144778827058635, + "learning_rate": 9.080514972460439e-05, + "loss": 0.3452, + "step": 2582 + }, + { + "epoch": 0.22035488824432692, + "grad_norm": 1.5469222398730937, + "learning_rate": 9.079716423233938e-05, + "loss": 0.2896, + "step": 2583 + }, + { + "epoch": 0.22044019791844396, + "grad_norm": 1.5061905591957319, + "learning_rate": 9.078917562542765e-05, + "loss": 0.4116, + "step": 2584 + }, + { + "epoch": 0.220525507592561, + "grad_norm": 1.8053351226968692, + "learning_rate": 9.078118390447917e-05, + "loss": 0.4404, + "step": 2585 + }, + { + "epoch": 0.22061081726667803, + "grad_norm": 1.6489615669467794, + "learning_rate": 9.077318907010397e-05, + "loss": 0.3979, + "step": 2586 + }, + { + "epoch": 0.22069612694079507, + "grad_norm": 1.281600206476664, + "learning_rate": 9.076519112291246e-05, + "loss": 0.3582, + "step": 2587 + }, + { + "epoch": 0.22078143661491212, + "grad_norm": 1.4848733165919927, + "learning_rate": 9.075719006351524e-05, + "loss": 0.3652, + "step": 2588 + }, + { + "epoch": 0.22086674628902916, + "grad_norm": 1.4361305347820406, + "learning_rate": 9.074918589252313e-05, + "loss": 0.3631, + "step": 2589 + }, + { + "epoch": 0.2209520559631462, + "grad_norm": 1.5541656998673092, + "learning_rate": 9.074117861054723e-05, + "loss": 0.315, + "step": 2590 + }, + { + "epoch": 0.22103736563726326, + "grad_norm": 1.4558897072078754, + "learning_rate": 9.073316821819884e-05, + "loss": 0.3466, + "step": 2591 + }, + { + "epoch": 0.2211226753113803, + "grad_norm": 1.5003269951757194, + "learning_rate": 9.07251547160895e-05, + "loss": 0.3752, + "step": 2592 + }, + { + "epoch": 0.22120798498549735, + "grad_norm": 1.3510033658181257, + "learning_rate": 9.071713810483103e-05, + "loss": 0.3686, + "step": 2593 + }, + { + "epoch": 0.2212932946596144, + "grad_norm": 1.4067570937652187, + "learning_rate": 9.070911838503545e-05, + "loss": 0.3401, + "step": 2594 + }, + { + "epoch": 0.22137860433373144, + "grad_norm": 1.4687310075546476, + "learning_rate": 9.070109555731497e-05, + "loss": 0.3417, + "step": 2595 + }, + { + "epoch": 0.22146391400784848, + "grad_norm": 1.493528073476427, + "learning_rate": 9.069306962228215e-05, + "loss": 0.3119, + "step": 2596 + }, + { + "epoch": 0.22154922368196553, + "grad_norm": 1.4047646201140582, + "learning_rate": 9.06850405805497e-05, + "loss": 0.366, + "step": 2597 + }, + { + "epoch": 0.22163453335608257, + "grad_norm": 1.487455203880448, + "learning_rate": 9.067700843273061e-05, + "loss": 0.3111, + "step": 2598 + }, + { + "epoch": 0.22171984303019962, + "grad_norm": 1.3761406415545379, + "learning_rate": 9.066897317943808e-05, + "loss": 0.3547, + "step": 2599 + }, + { + "epoch": 0.22180515270431667, + "grad_norm": 1.4691818089533368, + "learning_rate": 9.066093482128557e-05, + "loss": 0.3589, + "step": 2600 + }, + { + "epoch": 0.2218904623784337, + "grad_norm": 1.3050931197226736, + "learning_rate": 9.065289335888674e-05, + "loss": 0.3349, + "step": 2601 + }, + { + "epoch": 0.22197577205255076, + "grad_norm": 1.4988163410486075, + "learning_rate": 9.064484879285555e-05, + "loss": 0.3811, + "step": 2602 + }, + { + "epoch": 0.2220610817266678, + "grad_norm": 1.5517485938209492, + "learning_rate": 9.063680112380612e-05, + "loss": 0.3932, + "step": 2603 + }, + { + "epoch": 0.22214639140078485, + "grad_norm": 1.5619975998941853, + "learning_rate": 9.062875035235288e-05, + "loss": 0.3732, + "step": 2604 + }, + { + "epoch": 0.2222317010749019, + "grad_norm": 1.3723162556208697, + "learning_rate": 9.062069647911046e-05, + "loss": 0.3612, + "step": 2605 + }, + { + "epoch": 0.22231701074901894, + "grad_norm": 1.3567696520701982, + "learning_rate": 9.061263950469371e-05, + "loss": 0.3254, + "step": 2606 + }, + { + "epoch": 0.22240232042313599, + "grad_norm": 1.3321132243150429, + "learning_rate": 9.060457942971776e-05, + "loss": 0.3171, + "step": 2607 + }, + { + "epoch": 0.22248763009725303, + "grad_norm": 1.3192777472594854, + "learning_rate": 9.059651625479793e-05, + "loss": 0.323, + "step": 2608 + }, + { + "epoch": 0.22257293977137008, + "grad_norm": 1.626219951753739, + "learning_rate": 9.058844998054983e-05, + "loss": 0.3207, + "step": 2609 + }, + { + "epoch": 0.22265824944548712, + "grad_norm": 1.4863621310580928, + "learning_rate": 9.058038060758925e-05, + "loss": 0.3329, + "step": 2610 + }, + { + "epoch": 0.22274355911960417, + "grad_norm": 1.2999416998481703, + "learning_rate": 9.057230813653225e-05, + "loss": 0.3292, + "step": 2611 + }, + { + "epoch": 0.2228288687937212, + "grad_norm": 1.4219736756679437, + "learning_rate": 9.056423256799513e-05, + "loss": 0.3321, + "step": 2612 + }, + { + "epoch": 0.22291417846783826, + "grad_norm": 1.8620015034080524, + "learning_rate": 9.055615390259441e-05, + "loss": 0.3987, + "step": 2613 + }, + { + "epoch": 0.2229994881419553, + "grad_norm": 1.3601257618790756, + "learning_rate": 9.054807214094685e-05, + "loss": 0.4035, + "step": 2614 + }, + { + "epoch": 0.22308479781607235, + "grad_norm": 1.4566702305963304, + "learning_rate": 9.053998728366947e-05, + "loss": 0.3549, + "step": 2615 + }, + { + "epoch": 0.2231701074901894, + "grad_norm": 1.4915429765505306, + "learning_rate": 9.053189933137949e-05, + "loss": 0.3394, + "step": 2616 + }, + { + "epoch": 0.22325541716430644, + "grad_norm": 1.4237901135722582, + "learning_rate": 9.052380828469436e-05, + "loss": 0.3581, + "step": 2617 + }, + { + "epoch": 0.2233407268384235, + "grad_norm": 1.4600176848032884, + "learning_rate": 9.051571414423182e-05, + "loss": 0.3503, + "step": 2618 + }, + { + "epoch": 0.22342603651254053, + "grad_norm": 1.5985501127576316, + "learning_rate": 9.050761691060981e-05, + "loss": 0.338, + "step": 2619 + }, + { + "epoch": 0.22351134618665758, + "grad_norm": 1.523531084365152, + "learning_rate": 9.049951658444651e-05, + "loss": 0.394, + "step": 2620 + }, + { + "epoch": 0.22359665586077462, + "grad_norm": 1.485711229106358, + "learning_rate": 9.049141316636033e-05, + "loss": 0.3811, + "step": 2621 + }, + { + "epoch": 0.22368196553489167, + "grad_norm": 1.2742536304969254, + "learning_rate": 9.048330665696993e-05, + "loss": 0.3561, + "step": 2622 + }, + { + "epoch": 0.22376727520900871, + "grad_norm": 1.224991150259632, + "learning_rate": 9.047519705689418e-05, + "loss": 0.3287, + "step": 2623 + }, + { + "epoch": 0.22385258488312576, + "grad_norm": 1.5261191217167267, + "learning_rate": 9.046708436675223e-05, + "loss": 0.3359, + "step": 2624 + }, + { + "epoch": 0.22393789455724278, + "grad_norm": 1.2317656932935943, + "learning_rate": 9.045896858716343e-05, + "loss": 0.3157, + "step": 2625 + }, + { + "epoch": 0.22402320423135982, + "grad_norm": 1.480602731404406, + "learning_rate": 9.045084971874738e-05, + "loss": 0.3588, + "step": 2626 + }, + { + "epoch": 0.22410851390547687, + "grad_norm": 1.0485102848536334, + "learning_rate": 9.04427277621239e-05, + "loss": 0.2901, + "step": 2627 + }, + { + "epoch": 0.22419382357959391, + "grad_norm": 1.4834601544518111, + "learning_rate": 9.043460271791308e-05, + "loss": 0.3703, + "step": 2628 + }, + { + "epoch": 0.22427913325371096, + "grad_norm": 1.4013712706822048, + "learning_rate": 9.04264745867352e-05, + "loss": 0.3743, + "step": 2629 + }, + { + "epoch": 0.224364442927828, + "grad_norm": 1.579490750490103, + "learning_rate": 9.041834336921082e-05, + "loss": 0.3468, + "step": 2630 + }, + { + "epoch": 0.22444975260194505, + "grad_norm": 1.8702961728969631, + "learning_rate": 9.04102090659607e-05, + "loss": 0.3833, + "step": 2631 + }, + { + "epoch": 0.2245350622760621, + "grad_norm": 1.5330714078074665, + "learning_rate": 9.040207167760586e-05, + "loss": 0.3583, + "step": 2632 + }, + { + "epoch": 0.22462037195017914, + "grad_norm": 1.5534414540810104, + "learning_rate": 9.039393120476755e-05, + "loss": 0.3792, + "step": 2633 + }, + { + "epoch": 0.2247056816242962, + "grad_norm": 1.448815631581177, + "learning_rate": 9.038578764806723e-05, + "loss": 0.3805, + "step": 2634 + }, + { + "epoch": 0.22479099129841323, + "grad_norm": 1.656712881220153, + "learning_rate": 9.037764100812664e-05, + "loss": 0.2951, + "step": 2635 + }, + { + "epoch": 0.22487630097253028, + "grad_norm": 1.5066405173932909, + "learning_rate": 9.036949128556773e-05, + "loss": 0.3539, + "step": 2636 + }, + { + "epoch": 0.22496161064664733, + "grad_norm": 1.4027989745768032, + "learning_rate": 9.036133848101269e-05, + "loss": 0.3736, + "step": 2637 + }, + { + "epoch": 0.22504692032076437, + "grad_norm": 1.1997559179981596, + "learning_rate": 9.035318259508393e-05, + "loss": 0.2947, + "step": 2638 + }, + { + "epoch": 0.22513222999488142, + "grad_norm": 1.2847604344959902, + "learning_rate": 9.034502362840411e-05, + "loss": 0.2952, + "step": 2639 + }, + { + "epoch": 0.22521753966899846, + "grad_norm": 1.5368160370215846, + "learning_rate": 9.033686158159613e-05, + "loss": 0.3977, + "step": 2640 + }, + { + "epoch": 0.2253028493431155, + "grad_norm": 1.4925980727084007, + "learning_rate": 9.032869645528313e-05, + "loss": 0.3496, + "step": 2641 + }, + { + "epoch": 0.22538815901723255, + "grad_norm": 1.645535051703329, + "learning_rate": 9.032052825008845e-05, + "loss": 0.4262, + "step": 2642 + }, + { + "epoch": 0.2254734686913496, + "grad_norm": 1.5867464605003152, + "learning_rate": 9.031235696663572e-05, + "loss": 0.3648, + "step": 2643 + }, + { + "epoch": 0.22555877836546664, + "grad_norm": 1.4749810363067721, + "learning_rate": 9.030418260554873e-05, + "loss": 0.3746, + "step": 2644 + }, + { + "epoch": 0.2256440880395837, + "grad_norm": 1.4835602783155635, + "learning_rate": 9.029600516745158e-05, + "loss": 0.3536, + "step": 2645 + }, + { + "epoch": 0.22572939771370074, + "grad_norm": 1.6146868108589558, + "learning_rate": 9.028782465296856e-05, + "loss": 0.3259, + "step": 2646 + }, + { + "epoch": 0.22581470738781778, + "grad_norm": 1.3523490248940309, + "learning_rate": 9.027964106272423e-05, + "loss": 0.3349, + "step": 2647 + }, + { + "epoch": 0.22590001706193483, + "grad_norm": 1.5716495420481054, + "learning_rate": 9.027145439734336e-05, + "loss": 0.3514, + "step": 2648 + }, + { + "epoch": 0.22598532673605187, + "grad_norm": 1.2784024671903156, + "learning_rate": 9.026326465745094e-05, + "loss": 0.3527, + "step": 2649 + }, + { + "epoch": 0.22607063641016892, + "grad_norm": 1.440980016031087, + "learning_rate": 9.025507184367223e-05, + "loss": 0.2681, + "step": 2650 + }, + { + "epoch": 0.22615594608428596, + "grad_norm": 1.3867481497213598, + "learning_rate": 9.024687595663268e-05, + "loss": 0.3088, + "step": 2651 + }, + { + "epoch": 0.226241255758403, + "grad_norm": 1.1727748213666038, + "learning_rate": 9.023867699695804e-05, + "loss": 0.3279, + "step": 2652 + }, + { + "epoch": 0.22632656543252005, + "grad_norm": 1.4456469174713813, + "learning_rate": 9.023047496527423e-05, + "loss": 0.3719, + "step": 2653 + }, + { + "epoch": 0.2264118751066371, + "grad_norm": 1.4774973498480406, + "learning_rate": 9.022226986220745e-05, + "loss": 0.3485, + "step": 2654 + }, + { + "epoch": 0.22649718478075415, + "grad_norm": 1.673203769179113, + "learning_rate": 9.02140616883841e-05, + "loss": 0.3896, + "step": 2655 + }, + { + "epoch": 0.2265824944548712, + "grad_norm": 1.5874210428141582, + "learning_rate": 9.020585044443084e-05, + "loss": 0.4008, + "step": 2656 + }, + { + "epoch": 0.22666780412898824, + "grad_norm": 1.2660533509817204, + "learning_rate": 9.019763613097455e-05, + "loss": 0.3116, + "step": 2657 + }, + { + "epoch": 0.22675311380310528, + "grad_norm": 2.1602480767101, + "learning_rate": 9.018941874864236e-05, + "loss": 0.3553, + "step": 2658 + }, + { + "epoch": 0.22683842347722233, + "grad_norm": 1.5319386704721214, + "learning_rate": 9.01811982980616e-05, + "loss": 0.3354, + "step": 2659 + }, + { + "epoch": 0.22692373315133937, + "grad_norm": 1.4703732712091857, + "learning_rate": 9.017297477985989e-05, + "loss": 0.2938, + "step": 2660 + }, + { + "epoch": 0.22700904282545642, + "grad_norm": 1.383482086655729, + "learning_rate": 9.016474819466501e-05, + "loss": 0.3392, + "step": 2661 + }, + { + "epoch": 0.22709435249957347, + "grad_norm": 1.4619218090924933, + "learning_rate": 9.015651854310506e-05, + "loss": 0.353, + "step": 2662 + }, + { + "epoch": 0.22717966217369048, + "grad_norm": 1.3514626085709576, + "learning_rate": 9.01482858258083e-05, + "loss": 0.3244, + "step": 2663 + }, + { + "epoch": 0.22726497184780753, + "grad_norm": 1.3952592077041992, + "learning_rate": 9.014005004340327e-05, + "loss": 0.4092, + "step": 2664 + }, + { + "epoch": 0.22735028152192457, + "grad_norm": 1.315071629288537, + "learning_rate": 9.013181119651872e-05, + "loss": 0.3269, + "step": 2665 + }, + { + "epoch": 0.22743559119604162, + "grad_norm": 1.3673366792217243, + "learning_rate": 9.012356928578365e-05, + "loss": 0.34, + "step": 2666 + }, + { + "epoch": 0.22752090087015867, + "grad_norm": 1.489605252120227, + "learning_rate": 9.011532431182729e-05, + "loss": 0.353, + "step": 2667 + }, + { + "epoch": 0.2276062105442757, + "grad_norm": 1.4093912432426032, + "learning_rate": 9.010707627527909e-05, + "loss": 0.3931, + "step": 2668 + }, + { + "epoch": 0.22769152021839276, + "grad_norm": 1.581719864706679, + "learning_rate": 9.009882517676873e-05, + "loss": 0.3643, + "step": 2669 + }, + { + "epoch": 0.2277768298925098, + "grad_norm": 1.4843749196905818, + "learning_rate": 9.009057101692615e-05, + "loss": 0.3418, + "step": 2670 + }, + { + "epoch": 0.22786213956662685, + "grad_norm": 1.464970534877853, + "learning_rate": 9.008231379638152e-05, + "loss": 0.355, + "step": 2671 + }, + { + "epoch": 0.2279474492407439, + "grad_norm": 1.3721434658545917, + "learning_rate": 9.007405351576524e-05, + "loss": 0.3519, + "step": 2672 + }, + { + "epoch": 0.22803275891486094, + "grad_norm": 1.6771262580006516, + "learning_rate": 9.006579017570792e-05, + "loss": 0.3798, + "step": 2673 + }, + { + "epoch": 0.22811806858897798, + "grad_norm": 1.7863135395000977, + "learning_rate": 9.005752377684043e-05, + "loss": 0.3395, + "step": 2674 + }, + { + "epoch": 0.22820337826309503, + "grad_norm": 1.3882929074254688, + "learning_rate": 9.004925431979387e-05, + "loss": 0.3357, + "step": 2675 + }, + { + "epoch": 0.22828868793721208, + "grad_norm": 1.5060626534731294, + "learning_rate": 9.004098180519956e-05, + "loss": 0.375, + "step": 2676 + }, + { + "epoch": 0.22837399761132912, + "grad_norm": 1.6867829671476047, + "learning_rate": 9.003270623368905e-05, + "loss": 0.3753, + "step": 2677 + }, + { + "epoch": 0.22845930728544617, + "grad_norm": 1.7973146647060123, + "learning_rate": 9.002442760589418e-05, + "loss": 0.4216, + "step": 2678 + }, + { + "epoch": 0.2285446169595632, + "grad_norm": 1.6065100937973524, + "learning_rate": 9.001614592244694e-05, + "loss": 0.3107, + "step": 2679 + }, + { + "epoch": 0.22862992663368026, + "grad_norm": 1.7426270071671672, + "learning_rate": 9.00078611839796e-05, + "loss": 0.3528, + "step": 2680 + }, + { + "epoch": 0.2287152363077973, + "grad_norm": 1.7216616849774904, + "learning_rate": 8.999957339112466e-05, + "loss": 0.3848, + "step": 2681 + }, + { + "epoch": 0.22880054598191435, + "grad_norm": 1.5793928586601544, + "learning_rate": 8.999128254451486e-05, + "loss": 0.3739, + "step": 2682 + }, + { + "epoch": 0.2288858556560314, + "grad_norm": 1.4292872134307235, + "learning_rate": 8.998298864478314e-05, + "loss": 0.3032, + "step": 2683 + }, + { + "epoch": 0.22897116533014844, + "grad_norm": 1.5182901133566833, + "learning_rate": 8.997469169256271e-05, + "loss": 0.3415, + "step": 2684 + }, + { + "epoch": 0.2290564750042655, + "grad_norm": 1.389615152996505, + "learning_rate": 8.9966391688487e-05, + "loss": 0.3224, + "step": 2685 + }, + { + "epoch": 0.22914178467838253, + "grad_norm": 1.332539838871685, + "learning_rate": 8.995808863318966e-05, + "loss": 0.3987, + "step": 2686 + }, + { + "epoch": 0.22922709435249958, + "grad_norm": 1.5201153649921217, + "learning_rate": 8.99497825273046e-05, + "loss": 0.3878, + "step": 2687 + }, + { + "epoch": 0.22931240402661662, + "grad_norm": 1.6691087315764384, + "learning_rate": 8.994147337146593e-05, + "loss": 0.375, + "step": 2688 + }, + { + "epoch": 0.22939771370073367, + "grad_norm": 1.2344138706702417, + "learning_rate": 8.993316116630801e-05, + "loss": 0.3587, + "step": 2689 + }, + { + "epoch": 0.22948302337485071, + "grad_norm": 1.2521366931041409, + "learning_rate": 8.992484591246544e-05, + "loss": 0.322, + "step": 2690 + }, + { + "epoch": 0.22956833304896776, + "grad_norm": 1.4304424856411295, + "learning_rate": 8.991652761057305e-05, + "loss": 0.3192, + "step": 2691 + }, + { + "epoch": 0.2296536427230848, + "grad_norm": 1.6376150949812056, + "learning_rate": 8.990820626126589e-05, + "loss": 0.3273, + "step": 2692 + }, + { + "epoch": 0.22973895239720185, + "grad_norm": 1.6854843534718262, + "learning_rate": 8.989988186517926e-05, + "loss": 0.335, + "step": 2693 + }, + { + "epoch": 0.2298242620713189, + "grad_norm": 1.7343918395728457, + "learning_rate": 8.989155442294867e-05, + "loss": 0.3737, + "step": 2694 + }, + { + "epoch": 0.22990957174543594, + "grad_norm": 1.2900623489807819, + "learning_rate": 8.988322393520989e-05, + "loss": 0.3331, + "step": 2695 + }, + { + "epoch": 0.229994881419553, + "grad_norm": 1.420733570090468, + "learning_rate": 8.98748904025989e-05, + "loss": 0.3522, + "step": 2696 + }, + { + "epoch": 0.23008019109367003, + "grad_norm": 1.328211388863755, + "learning_rate": 8.986655382575192e-05, + "loss": 0.3367, + "step": 2697 + }, + { + "epoch": 0.23016550076778708, + "grad_norm": 1.4672407959184468, + "learning_rate": 8.98582142053054e-05, + "loss": 0.3468, + "step": 2698 + }, + { + "epoch": 0.23025081044190412, + "grad_norm": 1.5269669558254164, + "learning_rate": 8.984987154189604e-05, + "loss": 0.2941, + "step": 2699 + }, + { + "epoch": 0.23033612011602117, + "grad_norm": 1.5415876557319652, + "learning_rate": 8.984152583616076e-05, + "loss": 0.3007, + "step": 2700 + }, + { + "epoch": 0.2304214297901382, + "grad_norm": 1.4090274932160813, + "learning_rate": 8.983317708873669e-05, + "loss": 0.3477, + "step": 2701 + }, + { + "epoch": 0.23050673946425523, + "grad_norm": 1.4607573209412965, + "learning_rate": 8.982482530026122e-05, + "loss": 0.3475, + "step": 2702 + }, + { + "epoch": 0.23059204913837228, + "grad_norm": 1.5709134750092304, + "learning_rate": 8.981647047137197e-05, + "loss": 0.3772, + "step": 2703 + }, + { + "epoch": 0.23067735881248932, + "grad_norm": 1.4529788036021176, + "learning_rate": 8.980811260270677e-05, + "loss": 0.32, + "step": 2704 + }, + { + "epoch": 0.23076266848660637, + "grad_norm": 1.2478990064251259, + "learning_rate": 8.979975169490374e-05, + "loss": 0.3992, + "step": 2705 + }, + { + "epoch": 0.23084797816072342, + "grad_norm": 1.4305659029359055, + "learning_rate": 8.979138774860114e-05, + "loss": 0.4117, + "step": 2706 + }, + { + "epoch": 0.23093328783484046, + "grad_norm": 1.6676751264720635, + "learning_rate": 8.978302076443754e-05, + "loss": 0.4303, + "step": 2707 + }, + { + "epoch": 0.2310185975089575, + "grad_norm": 1.3917503090048342, + "learning_rate": 8.977465074305173e-05, + "loss": 0.3155, + "step": 2708 + }, + { + "epoch": 0.23110390718307455, + "grad_norm": 1.2981843887577587, + "learning_rate": 8.976627768508267e-05, + "loss": 0.3024, + "step": 2709 + }, + { + "epoch": 0.2311892168571916, + "grad_norm": 1.435594124567865, + "learning_rate": 8.975790159116966e-05, + "loss": 0.3221, + "step": 2710 + }, + { + "epoch": 0.23127452653130864, + "grad_norm": 1.353180242384292, + "learning_rate": 8.974952246195212e-05, + "loss": 0.2757, + "step": 2711 + }, + { + "epoch": 0.2313598362054257, + "grad_norm": 1.8127801612845869, + "learning_rate": 8.974114029806976e-05, + "loss": 0.3938, + "step": 2712 + }, + { + "epoch": 0.23144514587954274, + "grad_norm": 1.3817187364028884, + "learning_rate": 8.973275510016252e-05, + "loss": 0.3063, + "step": 2713 + }, + { + "epoch": 0.23153045555365978, + "grad_norm": 1.308175865748185, + "learning_rate": 8.972436686887059e-05, + "loss": 0.307, + "step": 2714 + }, + { + "epoch": 0.23161576522777683, + "grad_norm": 1.6637860278922578, + "learning_rate": 8.971597560483434e-05, + "loss": 0.3693, + "step": 2715 + }, + { + "epoch": 0.23170107490189387, + "grad_norm": 1.3477004776483583, + "learning_rate": 8.970758130869439e-05, + "loss": 0.3095, + "step": 2716 + }, + { + "epoch": 0.23178638457601092, + "grad_norm": 1.577670041213197, + "learning_rate": 8.969918398109162e-05, + "loss": 0.3624, + "step": 2717 + }, + { + "epoch": 0.23187169425012796, + "grad_norm": 1.6662988813231567, + "learning_rate": 8.969078362266711e-05, + "loss": 0.3417, + "step": 2718 + }, + { + "epoch": 0.231957003924245, + "grad_norm": 1.5864797332315557, + "learning_rate": 8.968238023406219e-05, + "loss": 0.3658, + "step": 2719 + }, + { + "epoch": 0.23204231359836205, + "grad_norm": 1.78492524970906, + "learning_rate": 8.96739738159184e-05, + "loss": 0.4017, + "step": 2720 + }, + { + "epoch": 0.2321276232724791, + "grad_norm": 1.3413477427765317, + "learning_rate": 8.966556436887755e-05, + "loss": 0.3197, + "step": 2721 + }, + { + "epoch": 0.23221293294659615, + "grad_norm": 1.2783923962976873, + "learning_rate": 8.965715189358164e-05, + "loss": 0.3208, + "step": 2722 + }, + { + "epoch": 0.2322982426207132, + "grad_norm": 1.614911823286636, + "learning_rate": 8.964873639067293e-05, + "loss": 0.3694, + "step": 2723 + }, + { + "epoch": 0.23238355229483024, + "grad_norm": 1.431193408097108, + "learning_rate": 8.964031786079388e-05, + "loss": 0.3782, + "step": 2724 + }, + { + "epoch": 0.23246886196894728, + "grad_norm": 1.2991596917305084, + "learning_rate": 8.963189630458721e-05, + "loss": 0.3498, + "step": 2725 + }, + { + "epoch": 0.23255417164306433, + "grad_norm": 1.389678418606113, + "learning_rate": 8.962347172269585e-05, + "loss": 0.3525, + "step": 2726 + }, + { + "epoch": 0.23263948131718137, + "grad_norm": 1.5038241277294075, + "learning_rate": 8.9615044115763e-05, + "loss": 0.3172, + "step": 2727 + }, + { + "epoch": 0.23272479099129842, + "grad_norm": 1.464795897655899, + "learning_rate": 8.960661348443205e-05, + "loss": 0.3223, + "step": 2728 + }, + { + "epoch": 0.23281010066541546, + "grad_norm": 1.4999693231624662, + "learning_rate": 8.959817982934662e-05, + "loss": 0.3277, + "step": 2729 + }, + { + "epoch": 0.2328954103395325, + "grad_norm": 1.4428203644846966, + "learning_rate": 8.958974315115059e-05, + "loss": 0.2994, + "step": 2730 + }, + { + "epoch": 0.23298072001364956, + "grad_norm": 1.5365747009003967, + "learning_rate": 8.958130345048803e-05, + "loss": 0.405, + "step": 2731 + }, + { + "epoch": 0.2330660296877666, + "grad_norm": 1.4377712325583099, + "learning_rate": 8.95728607280033e-05, + "loss": 0.3285, + "step": 2732 + }, + { + "epoch": 0.23315133936188365, + "grad_norm": 1.526840244042859, + "learning_rate": 8.956441498434096e-05, + "loss": 0.3288, + "step": 2733 + }, + { + "epoch": 0.2332366490360007, + "grad_norm": 1.6400101735799693, + "learning_rate": 8.955596622014576e-05, + "loss": 0.3335, + "step": 2734 + }, + { + "epoch": 0.23332195871011774, + "grad_norm": 1.447820847814723, + "learning_rate": 8.954751443606273e-05, + "loss": 0.4249, + "step": 2735 + }, + { + "epoch": 0.23340726838423478, + "grad_norm": 1.7168767951703208, + "learning_rate": 8.953905963273714e-05, + "loss": 0.3834, + "step": 2736 + }, + { + "epoch": 0.23349257805835183, + "grad_norm": 1.4360768280675902, + "learning_rate": 8.953060181081447e-05, + "loss": 0.3311, + "step": 2737 + }, + { + "epoch": 0.23357788773246888, + "grad_norm": 1.3973350228258166, + "learning_rate": 8.952214097094041e-05, + "loss": 0.3251, + "step": 2738 + }, + { + "epoch": 0.2336631974065859, + "grad_norm": 1.5371855313388714, + "learning_rate": 8.95136771137609e-05, + "loss": 0.3675, + "step": 2739 + }, + { + "epoch": 0.23374850708070294, + "grad_norm": 1.6965915759369317, + "learning_rate": 8.950521023992212e-05, + "loss": 0.3783, + "step": 2740 + }, + { + "epoch": 0.23383381675481998, + "grad_norm": 1.403803498632058, + "learning_rate": 8.949674035007047e-05, + "loss": 0.3493, + "step": 2741 + }, + { + "epoch": 0.23391912642893703, + "grad_norm": 1.6244818521591384, + "learning_rate": 8.948826744485258e-05, + "loss": 0.3933, + "step": 2742 + }, + { + "epoch": 0.23400443610305408, + "grad_norm": 1.4269005131623194, + "learning_rate": 8.947979152491533e-05, + "loss": 0.3563, + "step": 2743 + }, + { + "epoch": 0.23408974577717112, + "grad_norm": 1.2393678536644306, + "learning_rate": 8.947131259090575e-05, + "loss": 0.2926, + "step": 2744 + }, + { + "epoch": 0.23417505545128817, + "grad_norm": 1.7737521930060025, + "learning_rate": 8.946283064347125e-05, + "loss": 0.3682, + "step": 2745 + }, + { + "epoch": 0.2342603651254052, + "grad_norm": 1.3940420281027912, + "learning_rate": 8.94543456832593e-05, + "loss": 0.3256, + "step": 2746 + }, + { + "epoch": 0.23434567479952226, + "grad_norm": 1.5613484526134822, + "learning_rate": 8.944585771091773e-05, + "loss": 0.3506, + "step": 2747 + }, + { + "epoch": 0.2344309844736393, + "grad_norm": 1.9184690238672946, + "learning_rate": 8.943736672709454e-05, + "loss": 0.4104, + "step": 2748 + }, + { + "epoch": 0.23451629414775635, + "grad_norm": 1.782931153530607, + "learning_rate": 8.942887273243797e-05, + "loss": 0.419, + "step": 2749 + }, + { + "epoch": 0.2346016038218734, + "grad_norm": 1.3628578730961391, + "learning_rate": 8.942037572759649e-05, + "loss": 0.3346, + "step": 2750 + }, + { + "epoch": 0.23468691349599044, + "grad_norm": 1.7471191672354478, + "learning_rate": 8.94118757132188e-05, + "loss": 0.3553, + "step": 2751 + }, + { + "epoch": 0.23477222317010749, + "grad_norm": 1.6046972346915962, + "learning_rate": 8.940337268995385e-05, + "loss": 0.3636, + "step": 2752 + }, + { + "epoch": 0.23485753284422453, + "grad_norm": 1.2998353193587227, + "learning_rate": 8.939486665845077e-05, + "loss": 0.3034, + "step": 2753 + }, + { + "epoch": 0.23494284251834158, + "grad_norm": 1.4134639927609713, + "learning_rate": 8.938635761935896e-05, + "loss": 0.3081, + "step": 2754 + }, + { + "epoch": 0.23502815219245862, + "grad_norm": 1.3255382254408197, + "learning_rate": 8.937784557332807e-05, + "loss": 0.3284, + "step": 2755 + }, + { + "epoch": 0.23511346186657567, + "grad_norm": 1.4808846646873621, + "learning_rate": 8.936933052100789e-05, + "loss": 0.3038, + "step": 2756 + }, + { + "epoch": 0.2351987715406927, + "grad_norm": 1.5342842712304223, + "learning_rate": 8.936081246304855e-05, + "loss": 0.3405, + "step": 2757 + }, + { + "epoch": 0.23528408121480976, + "grad_norm": 1.545564963736208, + "learning_rate": 8.935229140010035e-05, + "loss": 0.3393, + "step": 2758 + }, + { + "epoch": 0.2353693908889268, + "grad_norm": 1.2914486157529175, + "learning_rate": 8.93437673328138e-05, + "loss": 0.3531, + "step": 2759 + }, + { + "epoch": 0.23545470056304385, + "grad_norm": 1.6298446330815153, + "learning_rate": 8.93352402618397e-05, + "loss": 0.3239, + "step": 2760 + }, + { + "epoch": 0.2355400102371609, + "grad_norm": 1.5281122718078124, + "learning_rate": 8.932671018782903e-05, + "loss": 0.3794, + "step": 2761 + }, + { + "epoch": 0.23562531991127794, + "grad_norm": 1.5005776564610271, + "learning_rate": 8.931817711143302e-05, + "loss": 0.3447, + "step": 2762 + }, + { + "epoch": 0.235710629585395, + "grad_norm": 1.565143789931264, + "learning_rate": 8.930964103330312e-05, + "loss": 0.3672, + "step": 2763 + }, + { + "epoch": 0.23579593925951203, + "grad_norm": 1.4256339219151548, + "learning_rate": 8.930110195409102e-05, + "loss": 0.3384, + "step": 2764 + }, + { + "epoch": 0.23588124893362908, + "grad_norm": 1.4020330143250717, + "learning_rate": 8.929255987444864e-05, + "loss": 0.3673, + "step": 2765 + }, + { + "epoch": 0.23596655860774612, + "grad_norm": 1.4266100003376376, + "learning_rate": 8.92840147950281e-05, + "loss": 0.3625, + "step": 2766 + }, + { + "epoch": 0.23605186828186317, + "grad_norm": 1.7228142283137062, + "learning_rate": 8.92754667164818e-05, + "loss": 0.3641, + "step": 2767 + }, + { + "epoch": 0.23613717795598022, + "grad_norm": 1.531645431841794, + "learning_rate": 8.926691563946232e-05, + "loss": 0.3568, + "step": 2768 + }, + { + "epoch": 0.23622248763009726, + "grad_norm": 1.3032622633254047, + "learning_rate": 8.92583615646225e-05, + "loss": 0.369, + "step": 2769 + }, + { + "epoch": 0.2363077973042143, + "grad_norm": 1.4240729143784416, + "learning_rate": 8.924980449261539e-05, + "loss": 0.3652, + "step": 2770 + }, + { + "epoch": 0.23639310697833135, + "grad_norm": 1.4318811649104877, + "learning_rate": 8.924124442409427e-05, + "loss": 0.3724, + "step": 2771 + }, + { + "epoch": 0.2364784166524484, + "grad_norm": 1.8294547697712211, + "learning_rate": 8.923268135971267e-05, + "loss": 0.396, + "step": 2772 + }, + { + "epoch": 0.23656372632656544, + "grad_norm": 1.4834884405299675, + "learning_rate": 8.922411530012433e-05, + "loss": 0.3298, + "step": 2773 + }, + { + "epoch": 0.2366490360006825, + "grad_norm": 1.430171196833511, + "learning_rate": 8.921554624598323e-05, + "loss": 0.3639, + "step": 2774 + }, + { + "epoch": 0.23673434567479953, + "grad_norm": 1.462640840318615, + "learning_rate": 8.920697419794357e-05, + "loss": 0.3681, + "step": 2775 + }, + { + "epoch": 0.23681965534891658, + "grad_norm": 1.2783030602767733, + "learning_rate": 8.919839915665976e-05, + "loss": 0.3564, + "step": 2776 + }, + { + "epoch": 0.23690496502303363, + "grad_norm": 1.4196207810816441, + "learning_rate": 8.91898211227865e-05, + "loss": 0.375, + "step": 2777 + }, + { + "epoch": 0.23699027469715064, + "grad_norm": 1.2173826910759706, + "learning_rate": 8.91812400969786e-05, + "loss": 0.2982, + "step": 2778 + }, + { + "epoch": 0.2370755843712677, + "grad_norm": 1.42133513880884, + "learning_rate": 8.917265607989128e-05, + "loss": 0.3634, + "step": 2779 + }, + { + "epoch": 0.23716089404538473, + "grad_norm": 1.5034061541686232, + "learning_rate": 8.916406907217982e-05, + "loss": 0.3869, + "step": 2780 + }, + { + "epoch": 0.23724620371950178, + "grad_norm": 1.4358944426713214, + "learning_rate": 8.915547907449979e-05, + "loss": 0.3358, + "step": 2781 + }, + { + "epoch": 0.23733151339361883, + "grad_norm": 1.6317326383466302, + "learning_rate": 8.914688608750701e-05, + "loss": 0.3784, + "step": 2782 + }, + { + "epoch": 0.23741682306773587, + "grad_norm": 1.7405590435249767, + "learning_rate": 8.91382901118575e-05, + "loss": 0.3373, + "step": 2783 + }, + { + "epoch": 0.23750213274185292, + "grad_norm": 1.5082282249261034, + "learning_rate": 8.912969114820753e-05, + "loss": 0.3458, + "step": 2784 + }, + { + "epoch": 0.23758744241596996, + "grad_norm": 1.5618826600276565, + "learning_rate": 8.912108919721358e-05, + "loss": 0.3485, + "step": 2785 + }, + { + "epoch": 0.237672752090087, + "grad_norm": 1.3575092973619043, + "learning_rate": 8.911248425953236e-05, + "loss": 0.3253, + "step": 2786 + }, + { + "epoch": 0.23775806176420405, + "grad_norm": 1.4528782132508775, + "learning_rate": 8.91038763358208e-05, + "loss": 0.2941, + "step": 2787 + }, + { + "epoch": 0.2378433714383211, + "grad_norm": 1.4867254821336264, + "learning_rate": 8.909526542673608e-05, + "loss": 0.3618, + "step": 2788 + }, + { + "epoch": 0.23792868111243815, + "grad_norm": 1.4794473068429899, + "learning_rate": 8.908665153293559e-05, + "loss": 0.3196, + "step": 2789 + }, + { + "epoch": 0.2380139907865552, + "grad_norm": 1.4234143821606113, + "learning_rate": 8.907803465507697e-05, + "loss": 0.3123, + "step": 2790 + }, + { + "epoch": 0.23809930046067224, + "grad_norm": 1.3474870216155113, + "learning_rate": 8.906941479381806e-05, + "loss": 0.3273, + "step": 2791 + }, + { + "epoch": 0.23818461013478928, + "grad_norm": 1.669837637221109, + "learning_rate": 8.906079194981695e-05, + "loss": 0.3665, + "step": 2792 + }, + { + "epoch": 0.23826991980890633, + "grad_norm": 1.3918521481485033, + "learning_rate": 8.905216612373195e-05, + "loss": 0.2899, + "step": 2793 + }, + { + "epoch": 0.23835522948302337, + "grad_norm": 1.3802208617979312, + "learning_rate": 8.904353731622157e-05, + "loss": 0.3509, + "step": 2794 + }, + { + "epoch": 0.23844053915714042, + "grad_norm": 1.3520016700914892, + "learning_rate": 8.90349055279446e-05, + "loss": 0.3706, + "step": 2795 + }, + { + "epoch": 0.23852584883125746, + "grad_norm": 1.3134803743555283, + "learning_rate": 8.902627075956003e-05, + "loss": 0.3076, + "step": 2796 + }, + { + "epoch": 0.2386111585053745, + "grad_norm": 1.5599101155624426, + "learning_rate": 8.901763301172708e-05, + "loss": 0.3046, + "step": 2797 + }, + { + "epoch": 0.23869646817949156, + "grad_norm": 1.5748404891363266, + "learning_rate": 8.900899228510517e-05, + "loss": 0.3512, + "step": 2798 + }, + { + "epoch": 0.2387817778536086, + "grad_norm": 1.5515899471379184, + "learning_rate": 8.900034858035402e-05, + "loss": 0.3447, + "step": 2799 + }, + { + "epoch": 0.23886708752772565, + "grad_norm": 1.4027574189390515, + "learning_rate": 8.899170189813349e-05, + "loss": 0.3609, + "step": 2800 + }, + { + "epoch": 0.2389523972018427, + "grad_norm": 1.421335390422852, + "learning_rate": 8.89830522391037e-05, + "loss": 0.3429, + "step": 2801 + }, + { + "epoch": 0.23903770687595974, + "grad_norm": 1.6691857215579633, + "learning_rate": 8.897439960392507e-05, + "loss": 0.3964, + "step": 2802 + }, + { + "epoch": 0.23912301655007678, + "grad_norm": 1.2493858736146108, + "learning_rate": 8.89657439932581e-05, + "loss": 0.3915, + "step": 2803 + }, + { + "epoch": 0.23920832622419383, + "grad_norm": 1.7652920434340178, + "learning_rate": 8.895708540776366e-05, + "loss": 0.379, + "step": 2804 + }, + { + "epoch": 0.23929363589831087, + "grad_norm": 1.3952123010077795, + "learning_rate": 8.894842384810276e-05, + "loss": 0.3388, + "step": 2805 + }, + { + "epoch": 0.23937894557242792, + "grad_norm": 1.3427977625637602, + "learning_rate": 8.893975931493668e-05, + "loss": 0.344, + "step": 2806 + }, + { + "epoch": 0.23946425524654497, + "grad_norm": 1.5489585792662737, + "learning_rate": 8.89310918089269e-05, + "loss": 0.3861, + "step": 2807 + }, + { + "epoch": 0.239549564920662, + "grad_norm": 1.2810010900857591, + "learning_rate": 8.892242133073513e-05, + "loss": 0.3089, + "step": 2808 + }, + { + "epoch": 0.23963487459477906, + "grad_norm": 1.5128039819091084, + "learning_rate": 8.891374788102333e-05, + "loss": 0.3451, + "step": 2809 + }, + { + "epoch": 0.2397201842688961, + "grad_norm": 1.3257895186733437, + "learning_rate": 8.890507146045366e-05, + "loss": 0.3257, + "step": 2810 + }, + { + "epoch": 0.23980549394301315, + "grad_norm": 1.4061581899554663, + "learning_rate": 8.889639206968853e-05, + "loss": 0.3038, + "step": 2811 + }, + { + "epoch": 0.2398908036171302, + "grad_norm": 1.7830746323417683, + "learning_rate": 8.888770970939057e-05, + "loss": 0.345, + "step": 2812 + }, + { + "epoch": 0.23997611329124724, + "grad_norm": 1.5713955764279048, + "learning_rate": 8.887902438022261e-05, + "loss": 0.3455, + "step": 2813 + }, + { + "epoch": 0.24006142296536429, + "grad_norm": 1.34143883427284, + "learning_rate": 8.887033608284776e-05, + "loss": 0.3876, + "step": 2814 + }, + { + "epoch": 0.24014673263948133, + "grad_norm": 1.4630685858460801, + "learning_rate": 8.886164481792929e-05, + "loss": 0.3302, + "step": 2815 + }, + { + "epoch": 0.24023204231359835, + "grad_norm": 1.6887661457772332, + "learning_rate": 8.885295058613075e-05, + "loss": 0.3447, + "step": 2816 + }, + { + "epoch": 0.2403173519877154, + "grad_norm": 1.3578714847373463, + "learning_rate": 8.88442533881159e-05, + "loss": 0.3356, + "step": 2817 + }, + { + "epoch": 0.24040266166183244, + "grad_norm": 1.3623389017600949, + "learning_rate": 8.883555322454873e-05, + "loss": 0.3308, + "step": 2818 + }, + { + "epoch": 0.24048797133594949, + "grad_norm": 1.33523657184956, + "learning_rate": 8.882685009609343e-05, + "loss": 0.3519, + "step": 2819 + }, + { + "epoch": 0.24057328101006653, + "grad_norm": 1.640628197076498, + "learning_rate": 8.881814400341446e-05, + "loss": 0.3781, + "step": 2820 + }, + { + "epoch": 0.24065859068418358, + "grad_norm": 1.7813641695610551, + "learning_rate": 8.880943494717646e-05, + "loss": 0.3627, + "step": 2821 + }, + { + "epoch": 0.24074390035830062, + "grad_norm": 1.547974167068566, + "learning_rate": 8.880072292804435e-05, + "loss": 0.3632, + "step": 2822 + }, + { + "epoch": 0.24082921003241767, + "grad_norm": 1.680255598651746, + "learning_rate": 8.879200794668323e-05, + "loss": 0.4027, + "step": 2823 + }, + { + "epoch": 0.2409145197065347, + "grad_norm": 1.5024628289015456, + "learning_rate": 8.878329000375844e-05, + "loss": 0.3722, + "step": 2824 + }, + { + "epoch": 0.24099982938065176, + "grad_norm": 1.2390644953785015, + "learning_rate": 8.877456909993555e-05, + "loss": 0.3191, + "step": 2825 + }, + { + "epoch": 0.2410851390547688, + "grad_norm": 1.5019351873827276, + "learning_rate": 8.876584523588036e-05, + "loss": 0.381, + "step": 2826 + }, + { + "epoch": 0.24117044872888585, + "grad_norm": 1.5015177994980522, + "learning_rate": 8.875711841225888e-05, + "loss": 0.3806, + "step": 2827 + }, + { + "epoch": 0.2412557584030029, + "grad_norm": 1.186816470534537, + "learning_rate": 8.874838862973738e-05, + "loss": 0.3008, + "step": 2828 + }, + { + "epoch": 0.24134106807711994, + "grad_norm": 1.6382666940547252, + "learning_rate": 8.87396558889823e-05, + "loss": 0.3518, + "step": 2829 + }, + { + "epoch": 0.241426377751237, + "grad_norm": 1.5275660204316355, + "learning_rate": 8.873092019066038e-05, + "loss": 0.3515, + "step": 2830 + }, + { + "epoch": 0.24151168742535403, + "grad_norm": 1.4338368721595758, + "learning_rate": 8.872218153543849e-05, + "loss": 0.3055, + "step": 2831 + }, + { + "epoch": 0.24159699709947108, + "grad_norm": 1.6269682921289952, + "learning_rate": 8.871343992398383e-05, + "loss": 0.423, + "step": 2832 + }, + { + "epoch": 0.24168230677358812, + "grad_norm": 1.5130913390268406, + "learning_rate": 8.870469535696375e-05, + "loss": 0.384, + "step": 2833 + }, + { + "epoch": 0.24176761644770517, + "grad_norm": 1.5214839832467002, + "learning_rate": 8.869594783504585e-05, + "loss": 0.4069, + "step": 2834 + }, + { + "epoch": 0.24185292612182221, + "grad_norm": 1.4694438675124333, + "learning_rate": 8.868719735889797e-05, + "loss": 0.4163, + "step": 2835 + }, + { + "epoch": 0.24193823579593926, + "grad_norm": 1.4148248124606182, + "learning_rate": 8.867844392918816e-05, + "loss": 0.3496, + "step": 2836 + }, + { + "epoch": 0.2420235454700563, + "grad_norm": 1.67995205947049, + "learning_rate": 8.866968754658466e-05, + "loss": 0.3906, + "step": 2837 + }, + { + "epoch": 0.24210885514417335, + "grad_norm": 1.6586998149578958, + "learning_rate": 8.866092821175604e-05, + "loss": 0.3716, + "step": 2838 + }, + { + "epoch": 0.2421941648182904, + "grad_norm": 1.7206767986245142, + "learning_rate": 8.8652165925371e-05, + "loss": 0.3689, + "step": 2839 + }, + { + "epoch": 0.24227947449240744, + "grad_norm": 1.4020758667851443, + "learning_rate": 8.864340068809846e-05, + "loss": 0.3645, + "step": 2840 + }, + { + "epoch": 0.2423647841665245, + "grad_norm": 1.265152501575014, + "learning_rate": 8.863463250060765e-05, + "loss": 0.3102, + "step": 2841 + }, + { + "epoch": 0.24245009384064153, + "grad_norm": 1.541583093319659, + "learning_rate": 8.862586136356794e-05, + "loss": 0.3838, + "step": 2842 + }, + { + "epoch": 0.24253540351475858, + "grad_norm": 1.556237526349554, + "learning_rate": 8.8617087277649e-05, + "loss": 0.3537, + "step": 2843 + }, + { + "epoch": 0.24262071318887563, + "grad_norm": 1.6601012456421056, + "learning_rate": 8.860831024352063e-05, + "loss": 0.3657, + "step": 2844 + }, + { + "epoch": 0.24270602286299267, + "grad_norm": 1.4709075339875242, + "learning_rate": 8.859953026185294e-05, + "loss": 0.2901, + "step": 2845 + }, + { + "epoch": 0.24279133253710972, + "grad_norm": 1.553180903881217, + "learning_rate": 8.859074733331622e-05, + "loss": 0.3243, + "step": 2846 + }, + { + "epoch": 0.24287664221122676, + "grad_norm": 1.3910553941234542, + "learning_rate": 8.858196145858104e-05, + "loss": 0.3598, + "step": 2847 + }, + { + "epoch": 0.2429619518853438, + "grad_norm": 1.3766097269606041, + "learning_rate": 8.857317263831812e-05, + "loss": 0.3341, + "step": 2848 + }, + { + "epoch": 0.24304726155946085, + "grad_norm": 1.466148913997397, + "learning_rate": 8.856438087319843e-05, + "loss": 0.3119, + "step": 2849 + }, + { + "epoch": 0.2431325712335779, + "grad_norm": 1.7051295811250635, + "learning_rate": 8.855558616389322e-05, + "loss": 0.377, + "step": 2850 + }, + { + "epoch": 0.24321788090769494, + "grad_norm": 1.3469818639291884, + "learning_rate": 8.854678851107385e-05, + "loss": 0.3432, + "step": 2851 + }, + { + "epoch": 0.243303190581812, + "grad_norm": 1.5981576594423355, + "learning_rate": 8.853798791541204e-05, + "loss": 0.3855, + "step": 2852 + }, + { + "epoch": 0.24338850025592904, + "grad_norm": 1.5439770458224868, + "learning_rate": 8.852918437757964e-05, + "loss": 0.3306, + "step": 2853 + }, + { + "epoch": 0.24347380993004605, + "grad_norm": 1.3315710799883727, + "learning_rate": 8.852037789824876e-05, + "loss": 0.3552, + "step": 2854 + }, + { + "epoch": 0.2435591196041631, + "grad_norm": 1.4237438119380397, + "learning_rate": 8.851156847809173e-05, + "loss": 0.3544, + "step": 2855 + }, + { + "epoch": 0.24364442927828014, + "grad_norm": 1.3711731281832291, + "learning_rate": 8.85027561177811e-05, + "loss": 0.3098, + "step": 2856 + }, + { + "epoch": 0.2437297389523972, + "grad_norm": 1.5360139193301525, + "learning_rate": 8.849394081798962e-05, + "loss": 0.3562, + "step": 2857 + }, + { + "epoch": 0.24381504862651424, + "grad_norm": 1.6946727758658136, + "learning_rate": 8.848512257939033e-05, + "loss": 0.3815, + "step": 2858 + }, + { + "epoch": 0.24390035830063128, + "grad_norm": 1.5023372243341764, + "learning_rate": 8.847630140265644e-05, + "loss": 0.3581, + "step": 2859 + }, + { + "epoch": 0.24398566797474833, + "grad_norm": 1.5503330303632832, + "learning_rate": 8.84674772884614e-05, + "loss": 0.378, + "step": 2860 + }, + { + "epoch": 0.24407097764886537, + "grad_norm": 1.480412384006967, + "learning_rate": 8.845865023747888e-05, + "loss": 0.2993, + "step": 2861 + }, + { + "epoch": 0.24415628732298242, + "grad_norm": 1.5027678701873688, + "learning_rate": 8.844982025038279e-05, + "loss": 0.3602, + "step": 2862 + }, + { + "epoch": 0.24424159699709946, + "grad_norm": 1.661229834394719, + "learning_rate": 8.844098732784723e-05, + "loss": 0.4196, + "step": 2863 + }, + { + "epoch": 0.2443269066712165, + "grad_norm": 1.2467395221276802, + "learning_rate": 8.843215147054659e-05, + "loss": 0.2829, + "step": 2864 + }, + { + "epoch": 0.24441221634533356, + "grad_norm": 1.5225792447969337, + "learning_rate": 8.84233126791554e-05, + "loss": 0.4055, + "step": 2865 + }, + { + "epoch": 0.2444975260194506, + "grad_norm": 1.3460909834006956, + "learning_rate": 8.841447095434847e-05, + "loss": 0.3186, + "step": 2866 + }, + { + "epoch": 0.24458283569356765, + "grad_norm": 1.4410729164984273, + "learning_rate": 8.84056262968008e-05, + "loss": 0.303, + "step": 2867 + }, + { + "epoch": 0.2446681453676847, + "grad_norm": 1.5378406147362598, + "learning_rate": 8.839677870718768e-05, + "loss": 0.3027, + "step": 2868 + }, + { + "epoch": 0.24475345504180174, + "grad_norm": 1.441022951185548, + "learning_rate": 8.838792818618452e-05, + "loss": 0.3587, + "step": 2869 + }, + { + "epoch": 0.24483876471591878, + "grad_norm": 1.4039118508220307, + "learning_rate": 8.837907473446704e-05, + "loss": 0.3447, + "step": 2870 + }, + { + "epoch": 0.24492407439003583, + "grad_norm": 1.664961451178715, + "learning_rate": 8.837021835271117e-05, + "loss": 0.3755, + "step": 2871 + }, + { + "epoch": 0.24500938406415287, + "grad_norm": 1.7242156774980057, + "learning_rate": 8.836135904159302e-05, + "loss": 0.3459, + "step": 2872 + }, + { + "epoch": 0.24509469373826992, + "grad_norm": 1.5743768579487971, + "learning_rate": 8.835249680178894e-05, + "loss": 0.342, + "step": 2873 + }, + { + "epoch": 0.24518000341238697, + "grad_norm": 1.3599921831439272, + "learning_rate": 8.834363163397556e-05, + "loss": 0.3047, + "step": 2874 + }, + { + "epoch": 0.245265313086504, + "grad_norm": 1.4347115713936445, + "learning_rate": 8.833476353882964e-05, + "loss": 0.3547, + "step": 2875 + }, + { + "epoch": 0.24535062276062106, + "grad_norm": 1.4009372809378227, + "learning_rate": 8.832589251702825e-05, + "loss": 0.4179, + "step": 2876 + }, + { + "epoch": 0.2454359324347381, + "grad_norm": 1.3433809438950022, + "learning_rate": 8.831701856924864e-05, + "loss": 0.3627, + "step": 2877 + }, + { + "epoch": 0.24552124210885515, + "grad_norm": 1.208709143152259, + "learning_rate": 8.830814169616826e-05, + "loss": 0.3021, + "step": 2878 + }, + { + "epoch": 0.2456065517829722, + "grad_norm": 1.194493231159, + "learning_rate": 8.829926189846482e-05, + "loss": 0.3075, + "step": 2879 + }, + { + "epoch": 0.24569186145708924, + "grad_norm": 1.5285512536427484, + "learning_rate": 8.829037917681627e-05, + "loss": 0.3514, + "step": 2880 + }, + { + "epoch": 0.24577717113120628, + "grad_norm": 1.3458207609142505, + "learning_rate": 8.828149353190075e-05, + "loss": 0.2864, + "step": 2881 + }, + { + "epoch": 0.24586248080532333, + "grad_norm": 1.637129629633785, + "learning_rate": 8.827260496439662e-05, + "loss": 0.3987, + "step": 2882 + }, + { + "epoch": 0.24594779047944038, + "grad_norm": 1.5965299110274822, + "learning_rate": 8.826371347498248e-05, + "loss": 0.4408, + "step": 2883 + }, + { + "epoch": 0.24603310015355742, + "grad_norm": 1.5896016154439903, + "learning_rate": 8.825481906433716e-05, + "loss": 0.3029, + "step": 2884 + }, + { + "epoch": 0.24611840982767447, + "grad_norm": 1.355704427232778, + "learning_rate": 8.824592173313968e-05, + "loss": 0.3319, + "step": 2885 + }, + { + "epoch": 0.2462037195017915, + "grad_norm": 1.653712217745607, + "learning_rate": 8.823702148206931e-05, + "loss": 0.3565, + "step": 2886 + }, + { + "epoch": 0.24628902917590856, + "grad_norm": 1.565996606396814, + "learning_rate": 8.822811831180555e-05, + "loss": 0.3575, + "step": 2887 + }, + { + "epoch": 0.2463743388500256, + "grad_norm": 1.5781864116541524, + "learning_rate": 8.821921222302811e-05, + "loss": 0.3539, + "step": 2888 + }, + { + "epoch": 0.24645964852414265, + "grad_norm": 1.6358071932045792, + "learning_rate": 8.82103032164169e-05, + "loss": 0.3493, + "step": 2889 + }, + { + "epoch": 0.2465449581982597, + "grad_norm": 1.3087102581938195, + "learning_rate": 8.820139129265208e-05, + "loss": 0.3133, + "step": 2890 + }, + { + "epoch": 0.24663026787237674, + "grad_norm": 1.2769267950322039, + "learning_rate": 8.819247645241406e-05, + "loss": 0.2969, + "step": 2891 + }, + { + "epoch": 0.2467155775464938, + "grad_norm": 1.7377477620164874, + "learning_rate": 8.818355869638339e-05, + "loss": 0.3923, + "step": 2892 + }, + { + "epoch": 0.2468008872206108, + "grad_norm": 1.3605453013680158, + "learning_rate": 8.817463802524096e-05, + "loss": 0.3333, + "step": 2893 + }, + { + "epoch": 0.24688619689472785, + "grad_norm": 1.2817598584493335, + "learning_rate": 8.816571443966775e-05, + "loss": 0.3299, + "step": 2894 + }, + { + "epoch": 0.2469715065688449, + "grad_norm": 1.3422797831916606, + "learning_rate": 8.815678794034505e-05, + "loss": 0.321, + "step": 2895 + }, + { + "epoch": 0.24705681624296194, + "grad_norm": 1.5767124766307308, + "learning_rate": 8.814785852795436e-05, + "loss": 0.3229, + "step": 2896 + }, + { + "epoch": 0.247142125917079, + "grad_norm": 1.8209838938788991, + "learning_rate": 8.81389262031774e-05, + "loss": 0.3665, + "step": 2897 + }, + { + "epoch": 0.24722743559119603, + "grad_norm": 1.5362844422641024, + "learning_rate": 8.812999096669608e-05, + "loss": 0.3867, + "step": 2898 + }, + { + "epoch": 0.24731274526531308, + "grad_norm": 1.5760024498470262, + "learning_rate": 8.812105281919259e-05, + "loss": 0.282, + "step": 2899 + }, + { + "epoch": 0.24739805493943012, + "grad_norm": 1.2003600812130093, + "learning_rate": 8.811211176134927e-05, + "loss": 0.2879, + "step": 2900 + }, + { + "epoch": 0.24748336461354717, + "grad_norm": 1.698331825036055, + "learning_rate": 8.810316779384877e-05, + "loss": 0.3719, + "step": 2901 + }, + { + "epoch": 0.24756867428766421, + "grad_norm": 1.3417796393429957, + "learning_rate": 8.809422091737387e-05, + "loss": 0.3314, + "step": 2902 + }, + { + "epoch": 0.24765398396178126, + "grad_norm": 1.5562665578428583, + "learning_rate": 8.808527113260764e-05, + "loss": 0.3575, + "step": 2903 + }, + { + "epoch": 0.2477392936358983, + "grad_norm": 1.6081946034792904, + "learning_rate": 8.807631844023334e-05, + "loss": 0.3323, + "step": 2904 + }, + { + "epoch": 0.24782460331001535, + "grad_norm": 1.6152883979591828, + "learning_rate": 8.806736284093446e-05, + "loss": 0.313, + "step": 2905 + }, + { + "epoch": 0.2479099129841324, + "grad_norm": 1.40753191693234, + "learning_rate": 8.805840433539475e-05, + "loss": 0.389, + "step": 2906 + }, + { + "epoch": 0.24799522265824944, + "grad_norm": 1.4728943811841293, + "learning_rate": 8.804944292429807e-05, + "loss": 0.3502, + "step": 2907 + }, + { + "epoch": 0.2480805323323665, + "grad_norm": 1.2477418052084752, + "learning_rate": 8.804047860832863e-05, + "loss": 0.322, + "step": 2908 + }, + { + "epoch": 0.24816584200648353, + "grad_norm": 1.5956943843233298, + "learning_rate": 8.803151138817081e-05, + "loss": 0.342, + "step": 2909 + }, + { + "epoch": 0.24825115168060058, + "grad_norm": 1.4164144441059345, + "learning_rate": 8.802254126450917e-05, + "loss": 0.3715, + "step": 2910 + }, + { + "epoch": 0.24833646135471762, + "grad_norm": 1.6867464997065982, + "learning_rate": 8.801356823802857e-05, + "loss": 0.3397, + "step": 2911 + }, + { + "epoch": 0.24842177102883467, + "grad_norm": 1.3499555350857595, + "learning_rate": 8.800459230941405e-05, + "loss": 0.375, + "step": 2912 + }, + { + "epoch": 0.24850708070295172, + "grad_norm": 1.544877583366757, + "learning_rate": 8.799561347935086e-05, + "loss": 0.3323, + "step": 2913 + }, + { + "epoch": 0.24859239037706876, + "grad_norm": 1.3430371722532766, + "learning_rate": 8.798663174852447e-05, + "loss": 0.3118, + "step": 2914 + }, + { + "epoch": 0.2486777000511858, + "grad_norm": 1.2543817016848295, + "learning_rate": 8.797764711762063e-05, + "loss": 0.3243, + "step": 2915 + }, + { + "epoch": 0.24876300972530285, + "grad_norm": 1.8350346069814543, + "learning_rate": 8.796865958732522e-05, + "loss": 0.3282, + "step": 2916 + }, + { + "epoch": 0.2488483193994199, + "grad_norm": 1.292481022147596, + "learning_rate": 8.795966915832443e-05, + "loss": 0.278, + "step": 2917 + }, + { + "epoch": 0.24893362907353694, + "grad_norm": 1.4161087321221584, + "learning_rate": 8.795067583130461e-05, + "loss": 0.3325, + "step": 2918 + }, + { + "epoch": 0.249018938747654, + "grad_norm": 1.5317503442553766, + "learning_rate": 8.794167960695238e-05, + "loss": 0.3022, + "step": 2919 + }, + { + "epoch": 0.24910424842177104, + "grad_norm": 1.476483539710446, + "learning_rate": 8.793268048595452e-05, + "loss": 0.3065, + "step": 2920 + }, + { + "epoch": 0.24918955809588808, + "grad_norm": 1.3111899059042107, + "learning_rate": 8.792367846899807e-05, + "loss": 0.3013, + "step": 2921 + }, + { + "epoch": 0.24927486777000513, + "grad_norm": 1.3704458591548796, + "learning_rate": 8.791467355677031e-05, + "loss": 0.275, + "step": 2922 + }, + { + "epoch": 0.24936017744412217, + "grad_norm": 1.2294690142834548, + "learning_rate": 8.79056657499587e-05, + "loss": 0.3091, + "step": 2923 + }, + { + "epoch": 0.24944548711823922, + "grad_norm": 1.4315869996564823, + "learning_rate": 8.789665504925093e-05, + "loss": 0.3336, + "step": 2924 + }, + { + "epoch": 0.24953079679235626, + "grad_norm": 1.454507067970368, + "learning_rate": 8.788764145533494e-05, + "loss": 0.3369, + "step": 2925 + }, + { + "epoch": 0.2496161064664733, + "grad_norm": 1.773087521608408, + "learning_rate": 8.787862496889886e-05, + "loss": 0.3832, + "step": 2926 + }, + { + "epoch": 0.24970141614059035, + "grad_norm": 1.23894653739764, + "learning_rate": 8.786960559063105e-05, + "loss": 0.3659, + "step": 2927 + }, + { + "epoch": 0.2497867258147074, + "grad_norm": 1.2565966113739249, + "learning_rate": 8.786058332122009e-05, + "loss": 0.299, + "step": 2928 + }, + { + "epoch": 0.24987203548882445, + "grad_norm": 1.3678595934528963, + "learning_rate": 8.785155816135478e-05, + "loss": 0.3251, + "step": 2929 + }, + { + "epoch": 0.2499573451629415, + "grad_norm": 1.274433892925217, + "learning_rate": 8.784253011172415e-05, + "loss": 0.3055, + "step": 2930 + }, + { + "epoch": 0.2500426548370585, + "grad_norm": 1.5679821734193564, + "learning_rate": 8.783349917301744e-05, + "loss": 0.3521, + "step": 2931 + }, + { + "epoch": 0.2501279645111756, + "grad_norm": 1.4418721260270346, + "learning_rate": 8.782446534592413e-05, + "loss": 0.3271, + "step": 2932 + }, + { + "epoch": 0.2502132741852926, + "grad_norm": 1.4965856953462486, + "learning_rate": 8.781542863113387e-05, + "loss": 0.37, + "step": 2933 + }, + { + "epoch": 0.2502985838594097, + "grad_norm": 1.3901603436782244, + "learning_rate": 8.780638902933658e-05, + "loss": 0.3344, + "step": 2934 + }, + { + "epoch": 0.2503838935335267, + "grad_norm": 1.2444361840174525, + "learning_rate": 8.779734654122241e-05, + "loss": 0.2856, + "step": 2935 + }, + { + "epoch": 0.25046920320764376, + "grad_norm": 1.4788224621235644, + "learning_rate": 8.778830116748167e-05, + "loss": 0.3696, + "step": 2936 + }, + { + "epoch": 0.2505545128817608, + "grad_norm": 1.3758337354206966, + "learning_rate": 8.777925290880496e-05, + "loss": 0.328, + "step": 2937 + }, + { + "epoch": 0.25063982255587786, + "grad_norm": 1.429081856275145, + "learning_rate": 8.777020176588305e-05, + "loss": 0.2976, + "step": 2938 + }, + { + "epoch": 0.2507251322299949, + "grad_norm": 1.407789913085209, + "learning_rate": 8.776114773940693e-05, + "loss": 0.3335, + "step": 2939 + }, + { + "epoch": 0.25081044190411195, + "grad_norm": 1.6631391945698732, + "learning_rate": 8.775209083006784e-05, + "loss": 0.3393, + "step": 2940 + }, + { + "epoch": 0.25089575157822896, + "grad_norm": 1.7333464450829328, + "learning_rate": 8.774303103855724e-05, + "loss": 0.3949, + "step": 2941 + }, + { + "epoch": 0.25098106125234604, + "grad_norm": 1.3625536181897637, + "learning_rate": 8.773396836556679e-05, + "loss": 0.3443, + "step": 2942 + }, + { + "epoch": 0.25106637092646306, + "grad_norm": 1.6192565066625166, + "learning_rate": 8.772490281178836e-05, + "loss": 0.3714, + "step": 2943 + }, + { + "epoch": 0.25115168060058013, + "grad_norm": 1.5957667736120338, + "learning_rate": 8.771583437791409e-05, + "loss": 0.3404, + "step": 2944 + }, + { + "epoch": 0.25123699027469715, + "grad_norm": 1.5225747820147923, + "learning_rate": 8.770676306463628e-05, + "loss": 0.3375, + "step": 2945 + }, + { + "epoch": 0.2513222999488142, + "grad_norm": 1.5682108465531113, + "learning_rate": 8.769768887264747e-05, + "loss": 0.3614, + "step": 2946 + }, + { + "epoch": 0.25140760962293124, + "grad_norm": 1.4408117376017797, + "learning_rate": 8.768861180264045e-05, + "loss": 0.3574, + "step": 2947 + }, + { + "epoch": 0.2514929192970483, + "grad_norm": 1.5730606249011059, + "learning_rate": 8.767953185530819e-05, + "loss": 0.3547, + "step": 2948 + }, + { + "epoch": 0.25157822897116533, + "grad_norm": 1.4341419637363861, + "learning_rate": 8.767044903134392e-05, + "loss": 0.3254, + "step": 2949 + }, + { + "epoch": 0.25166353864528235, + "grad_norm": 1.6345817799803488, + "learning_rate": 8.766136333144102e-05, + "loss": 0.36, + "step": 2950 + }, + { + "epoch": 0.2517488483193994, + "grad_norm": 1.5922694341450117, + "learning_rate": 8.765227475629319e-05, + "loss": 0.3985, + "step": 2951 + }, + { + "epoch": 0.25183415799351644, + "grad_norm": 1.2641527065913656, + "learning_rate": 8.764318330659424e-05, + "loss": 0.3085, + "step": 2952 + }, + { + "epoch": 0.2519194676676335, + "grad_norm": 1.183162094316587, + "learning_rate": 8.763408898303829e-05, + "loss": 0.3401, + "step": 2953 + }, + { + "epoch": 0.25200477734175053, + "grad_norm": 1.4299551291195, + "learning_rate": 8.762499178631964e-05, + "loss": 0.4044, + "step": 2954 + }, + { + "epoch": 0.2520900870158676, + "grad_norm": 1.287494052021398, + "learning_rate": 8.76158917171328e-05, + "loss": 0.3356, + "step": 2955 + }, + { + "epoch": 0.2521753966899846, + "grad_norm": 1.3035325289034032, + "learning_rate": 8.760678877617253e-05, + "loss": 0.3424, + "step": 2956 + }, + { + "epoch": 0.2522607063641017, + "grad_norm": 1.6679880467176376, + "learning_rate": 8.759768296413376e-05, + "loss": 0.4003, + "step": 2957 + }, + { + "epoch": 0.2523460160382187, + "grad_norm": 1.195886511511915, + "learning_rate": 8.75885742817117e-05, + "loss": 0.3427, + "step": 2958 + }, + { + "epoch": 0.2524313257123358, + "grad_norm": 1.4806192366881175, + "learning_rate": 8.757946272960173e-05, + "loss": 0.3461, + "step": 2959 + }, + { + "epoch": 0.2525166353864528, + "grad_norm": 1.4252182776603206, + "learning_rate": 8.757034830849948e-05, + "loss": 0.3465, + "step": 2960 + }, + { + "epoch": 0.2526019450605699, + "grad_norm": 1.361531750023376, + "learning_rate": 8.756123101910079e-05, + "loss": 0.326, + "step": 2961 + }, + { + "epoch": 0.2526872547346869, + "grad_norm": 1.5375429969295904, + "learning_rate": 8.755211086210172e-05, + "loss": 0.3337, + "step": 2962 + }, + { + "epoch": 0.25277256440880397, + "grad_norm": 1.4024079289533713, + "learning_rate": 8.754298783819853e-05, + "loss": 0.3715, + "step": 2963 + }, + { + "epoch": 0.252857874082921, + "grad_norm": 1.5666754817038286, + "learning_rate": 8.753386194808772e-05, + "loss": 0.3341, + "step": 2964 + }, + { + "epoch": 0.25294318375703806, + "grad_norm": 1.3773603121345408, + "learning_rate": 8.752473319246601e-05, + "loss": 0.3475, + "step": 2965 + }, + { + "epoch": 0.2530284934311551, + "grad_norm": 1.4492882560246265, + "learning_rate": 8.751560157203031e-05, + "loss": 0.3133, + "step": 2966 + }, + { + "epoch": 0.25311380310527215, + "grad_norm": 1.3224733265600825, + "learning_rate": 8.750646708747781e-05, + "loss": 0.3331, + "step": 2967 + }, + { + "epoch": 0.25319911277938917, + "grad_norm": 1.0387025367368699, + "learning_rate": 8.749732973950585e-05, + "loss": 0.2922, + "step": 2968 + }, + { + "epoch": 0.25328442245350624, + "grad_norm": 1.716559522797388, + "learning_rate": 8.748818952881201e-05, + "loss": 0.3137, + "step": 2969 + }, + { + "epoch": 0.25336973212762326, + "grad_norm": 1.5439899397112156, + "learning_rate": 8.747904645609413e-05, + "loss": 0.3284, + "step": 2970 + }, + { + "epoch": 0.25345504180174033, + "grad_norm": 1.3161806541499235, + "learning_rate": 8.746990052205022e-05, + "loss": 0.2944, + "step": 2971 + }, + { + "epoch": 0.25354035147585735, + "grad_norm": 1.4064768290368939, + "learning_rate": 8.746075172737851e-05, + "loss": 0.3407, + "step": 2972 + }, + { + "epoch": 0.2536256611499744, + "grad_norm": 1.8126388365089399, + "learning_rate": 8.745160007277747e-05, + "loss": 0.3222, + "step": 2973 + }, + { + "epoch": 0.25371097082409144, + "grad_norm": 1.7308583391514052, + "learning_rate": 8.744244555894578e-05, + "loss": 0.3812, + "step": 2974 + }, + { + "epoch": 0.2537962804982085, + "grad_norm": 1.742381542672724, + "learning_rate": 8.743328818658235e-05, + "loss": 0.4196, + "step": 2975 + }, + { + "epoch": 0.25388159017232553, + "grad_norm": 1.415126421372807, + "learning_rate": 8.742412795638629e-05, + "loss": 0.3248, + "step": 2976 + }, + { + "epoch": 0.2539668998464426, + "grad_norm": 1.3813056710187774, + "learning_rate": 8.741496486905691e-05, + "loss": 0.3031, + "step": 2977 + }, + { + "epoch": 0.2540522095205596, + "grad_norm": 1.7382750778945901, + "learning_rate": 8.740579892529381e-05, + "loss": 0.3431, + "step": 2978 + }, + { + "epoch": 0.2541375191946767, + "grad_norm": 1.3389964636028788, + "learning_rate": 8.739663012579673e-05, + "loss": 0.2791, + "step": 2979 + }, + { + "epoch": 0.2542228288687937, + "grad_norm": 1.4035392914243696, + "learning_rate": 8.738745847126565e-05, + "loss": 0.3322, + "step": 2980 + }, + { + "epoch": 0.2543081385429108, + "grad_norm": 1.3510950856860076, + "learning_rate": 8.737828396240081e-05, + "loss": 0.3055, + "step": 2981 + }, + { + "epoch": 0.2543934482170278, + "grad_norm": 1.480617465398191, + "learning_rate": 8.736910659990261e-05, + "loss": 0.3099, + "step": 2982 + }, + { + "epoch": 0.2544787578911449, + "grad_norm": 1.3912564676276693, + "learning_rate": 8.73599263844717e-05, + "loss": 0.346, + "step": 2983 + }, + { + "epoch": 0.2545640675652619, + "grad_norm": 1.375797777222852, + "learning_rate": 8.735074331680895e-05, + "loss": 0.2714, + "step": 2984 + }, + { + "epoch": 0.25464937723937897, + "grad_norm": 1.4436947403277889, + "learning_rate": 8.734155739761541e-05, + "loss": 0.3389, + "step": 2985 + }, + { + "epoch": 0.254734686913496, + "grad_norm": 1.4029711326734382, + "learning_rate": 8.733236862759242e-05, + "loss": 0.3218, + "step": 2986 + }, + { + "epoch": 0.25481999658761306, + "grad_norm": 1.4231059017918672, + "learning_rate": 8.732317700744146e-05, + "loss": 0.3439, + "step": 2987 + }, + { + "epoch": 0.2549053062617301, + "grad_norm": 1.587354205788295, + "learning_rate": 8.731398253786426e-05, + "loss": 0.3667, + "step": 2988 + }, + { + "epoch": 0.2549906159358471, + "grad_norm": 1.5229312936034314, + "learning_rate": 8.730478521956278e-05, + "loss": 0.3371, + "step": 2989 + }, + { + "epoch": 0.25507592560996417, + "grad_norm": 1.3621728976911653, + "learning_rate": 8.729558505323921e-05, + "loss": 0.3227, + "step": 2990 + }, + { + "epoch": 0.2551612352840812, + "grad_norm": 2.9418740679397226, + "learning_rate": 8.728638203959589e-05, + "loss": 0.3588, + "step": 2991 + }, + { + "epoch": 0.25524654495819826, + "grad_norm": 1.3370626964662562, + "learning_rate": 8.727717617933544e-05, + "loss": 0.3191, + "step": 2992 + }, + { + "epoch": 0.2553318546323153, + "grad_norm": 1.489543229565816, + "learning_rate": 8.726796747316068e-05, + "loss": 0.3931, + "step": 2993 + }, + { + "epoch": 0.25541716430643235, + "grad_norm": 1.440610671230726, + "learning_rate": 8.725875592177464e-05, + "loss": 0.386, + "step": 2994 + }, + { + "epoch": 0.25550247398054937, + "grad_norm": 1.7543391519708895, + "learning_rate": 8.724954152588058e-05, + "loss": 0.3572, + "step": 2995 + }, + { + "epoch": 0.25558778365466644, + "grad_norm": 1.4680441925818484, + "learning_rate": 8.724032428618198e-05, + "loss": 0.3483, + "step": 2996 + }, + { + "epoch": 0.25567309332878346, + "grad_norm": 1.5835223670683074, + "learning_rate": 8.723110420338251e-05, + "loss": 0.3423, + "step": 2997 + }, + { + "epoch": 0.25575840300290054, + "grad_norm": 1.375566452562154, + "learning_rate": 8.722188127818608e-05, + "loss": 0.3169, + "step": 2998 + }, + { + "epoch": 0.25584371267701755, + "grad_norm": 1.6748230513297162, + "learning_rate": 8.721265551129683e-05, + "loss": 0.3919, + "step": 2999 + }, + { + "epoch": 0.2559290223511346, + "grad_norm": 1.5081369317820192, + "learning_rate": 8.720342690341905e-05, + "loss": 0.3814, + "step": 3000 + }, + { + "epoch": 0.25601433202525165, + "grad_norm": 1.4854127970355735, + "learning_rate": 8.719419545525733e-05, + "loss": 0.3585, + "step": 3001 + }, + { + "epoch": 0.2560996416993687, + "grad_norm": 1.6072475853357586, + "learning_rate": 8.718496116751644e-05, + "loss": 0.3938, + "step": 3002 + }, + { + "epoch": 0.25618495137348574, + "grad_norm": 1.1246439052314376, + "learning_rate": 8.717572404090138e-05, + "loss": 0.3383, + "step": 3003 + }, + { + "epoch": 0.2562702610476028, + "grad_norm": 1.4529486108186798, + "learning_rate": 8.716648407611732e-05, + "loss": 0.3155, + "step": 3004 + }, + { + "epoch": 0.2563555707217198, + "grad_norm": 1.4509602491216809, + "learning_rate": 8.715724127386972e-05, + "loss": 0.3952, + "step": 3005 + }, + { + "epoch": 0.2564408803958369, + "grad_norm": 1.433089081704649, + "learning_rate": 8.714799563486418e-05, + "loss": 0.3737, + "step": 3006 + }, + { + "epoch": 0.2565261900699539, + "grad_norm": 1.21161701456593, + "learning_rate": 8.71387471598066e-05, + "loss": 0.3329, + "step": 3007 + }, + { + "epoch": 0.256611499744071, + "grad_norm": 1.4950534002150655, + "learning_rate": 8.712949584940303e-05, + "loss": 0.3396, + "step": 3008 + }, + { + "epoch": 0.256696809418188, + "grad_norm": 1.4359610239121479, + "learning_rate": 8.712024170435975e-05, + "loss": 0.3431, + "step": 3009 + }, + { + "epoch": 0.2567821190923051, + "grad_norm": 1.4548631342851304, + "learning_rate": 8.711098472538326e-05, + "loss": 0.3315, + "step": 3010 + }, + { + "epoch": 0.2568674287664221, + "grad_norm": 1.6304926842622194, + "learning_rate": 8.710172491318031e-05, + "loss": 0.3788, + "step": 3011 + }, + { + "epoch": 0.2569527384405392, + "grad_norm": 1.2717373035322808, + "learning_rate": 8.709246226845782e-05, + "loss": 0.4137, + "step": 3012 + }, + { + "epoch": 0.2570380481146562, + "grad_norm": 1.200555708482756, + "learning_rate": 8.708319679192293e-05, + "loss": 0.2878, + "step": 3013 + }, + { + "epoch": 0.25712335778877327, + "grad_norm": 1.4215377575357797, + "learning_rate": 8.707392848428304e-05, + "loss": 0.3098, + "step": 3014 + }, + { + "epoch": 0.2572086674628903, + "grad_norm": 1.5464917873851471, + "learning_rate": 8.706465734624572e-05, + "loss": 0.3442, + "step": 3015 + }, + { + "epoch": 0.25729397713700736, + "grad_norm": 1.4989399343089969, + "learning_rate": 8.705538337851878e-05, + "loss": 0.3236, + "step": 3016 + }, + { + "epoch": 0.2573792868111244, + "grad_norm": 1.496350298764888, + "learning_rate": 8.704610658181021e-05, + "loss": 0.3619, + "step": 3017 + }, + { + "epoch": 0.25746459648524145, + "grad_norm": 1.153222011217429, + "learning_rate": 8.703682695682829e-05, + "loss": 0.3525, + "step": 3018 + }, + { + "epoch": 0.25754990615935847, + "grad_norm": 1.3203792216580077, + "learning_rate": 8.702754450428143e-05, + "loss": 0.2493, + "step": 3019 + }, + { + "epoch": 0.25763521583347554, + "grad_norm": 1.4033412945054156, + "learning_rate": 8.701825922487831e-05, + "loss": 0.3644, + "step": 3020 + }, + { + "epoch": 0.25772052550759256, + "grad_norm": 1.3546718486510934, + "learning_rate": 8.700897111932782e-05, + "loss": 0.3118, + "step": 3021 + }, + { + "epoch": 0.25780583518170963, + "grad_norm": 1.2867497823154346, + "learning_rate": 8.699968018833904e-05, + "loss": 0.3547, + "step": 3022 + }, + { + "epoch": 0.25789114485582665, + "grad_norm": 1.5657874047874336, + "learning_rate": 8.699038643262131e-05, + "loss": 0.3229, + "step": 3023 + }, + { + "epoch": 0.2579764545299437, + "grad_norm": 1.4179022487042232, + "learning_rate": 8.698108985288414e-05, + "loss": 0.3435, + "step": 3024 + }, + { + "epoch": 0.25806176420406074, + "grad_norm": 1.2923763335516532, + "learning_rate": 8.697179044983725e-05, + "loss": 0.2959, + "step": 3025 + }, + { + "epoch": 0.2581470738781778, + "grad_norm": 1.7370396669785588, + "learning_rate": 8.696248822419065e-05, + "loss": 0.384, + "step": 3026 + }, + { + "epoch": 0.25823238355229483, + "grad_norm": 1.579140742281224, + "learning_rate": 8.69531831766545e-05, + "loss": 0.3807, + "step": 3027 + }, + { + "epoch": 0.25831769322641185, + "grad_norm": 1.4867039128984916, + "learning_rate": 8.694387530793916e-05, + "loss": 0.2839, + "step": 3028 + }, + { + "epoch": 0.2584030029005289, + "grad_norm": 1.5743612598820613, + "learning_rate": 8.693456461875529e-05, + "loss": 0.3552, + "step": 3029 + }, + { + "epoch": 0.25848831257464594, + "grad_norm": 1.4870145117661513, + "learning_rate": 8.692525110981366e-05, + "loss": 0.3646, + "step": 3030 + }, + { + "epoch": 0.258573622248763, + "grad_norm": 1.7611658958567424, + "learning_rate": 8.691593478182533e-05, + "loss": 0.357, + "step": 3031 + }, + { + "epoch": 0.25865893192288003, + "grad_norm": 1.3842878929960687, + "learning_rate": 8.690661563550156e-05, + "loss": 0.3401, + "step": 3032 + }, + { + "epoch": 0.2587442415969971, + "grad_norm": 1.612009059482987, + "learning_rate": 8.68972936715538e-05, + "loss": 0.34, + "step": 3033 + }, + { + "epoch": 0.2588295512711141, + "grad_norm": 1.358937072369264, + "learning_rate": 8.688796889069373e-05, + "loss": 0.3678, + "step": 3034 + }, + { + "epoch": 0.2589148609452312, + "grad_norm": 1.3948585275811818, + "learning_rate": 8.687864129363327e-05, + "loss": 0.33, + "step": 3035 + }, + { + "epoch": 0.2590001706193482, + "grad_norm": 1.484931921634675, + "learning_rate": 8.686931088108452e-05, + "loss": 0.3632, + "step": 3036 + }, + { + "epoch": 0.2590854802934653, + "grad_norm": 1.2939246755429792, + "learning_rate": 8.68599776537598e-05, + "loss": 0.3153, + "step": 3037 + }, + { + "epoch": 0.2591707899675823, + "grad_norm": 1.4603613058889888, + "learning_rate": 8.685064161237167e-05, + "loss": 0.3343, + "step": 3038 + }, + { + "epoch": 0.2592560996416994, + "grad_norm": 1.2469640102500499, + "learning_rate": 8.684130275763287e-05, + "loss": 0.3114, + "step": 3039 + }, + { + "epoch": 0.2593414093158164, + "grad_norm": 1.5469250911732655, + "learning_rate": 8.68319610902564e-05, + "loss": 0.369, + "step": 3040 + }, + { + "epoch": 0.25942671898993347, + "grad_norm": 1.24776535560363, + "learning_rate": 8.68226166109554e-05, + "loss": 0.3444, + "step": 3041 + }, + { + "epoch": 0.2595120286640505, + "grad_norm": 1.7105090157796172, + "learning_rate": 8.68132693204433e-05, + "loss": 0.3871, + "step": 3042 + }, + { + "epoch": 0.25959733833816756, + "grad_norm": 1.1491593667771978, + "learning_rate": 8.680391921943371e-05, + "loss": 0.3316, + "step": 3043 + }, + { + "epoch": 0.2596826480122846, + "grad_norm": 1.553277147543048, + "learning_rate": 8.679456630864048e-05, + "loss": 0.3564, + "step": 3044 + }, + { + "epoch": 0.25976795768640165, + "grad_norm": 1.337150068051128, + "learning_rate": 8.678521058877763e-05, + "loss": 0.3361, + "step": 3045 + }, + { + "epoch": 0.25985326736051867, + "grad_norm": 1.4604284858316048, + "learning_rate": 8.677585206055943e-05, + "loss": 0.3565, + "step": 3046 + }, + { + "epoch": 0.25993857703463574, + "grad_norm": 1.617565534699742, + "learning_rate": 8.676649072470034e-05, + "loss": 0.3062, + "step": 3047 + }, + { + "epoch": 0.26002388670875276, + "grad_norm": 1.3632805067350007, + "learning_rate": 8.675712658191508e-05, + "loss": 0.3213, + "step": 3048 + }, + { + "epoch": 0.26010919638286983, + "grad_norm": 1.4626916156670635, + "learning_rate": 8.674775963291853e-05, + "loss": 0.3472, + "step": 3049 + }, + { + "epoch": 0.26019450605698685, + "grad_norm": 1.5393306285613562, + "learning_rate": 8.673838987842579e-05, + "loss": 0.3481, + "step": 3050 + }, + { + "epoch": 0.2602798157311039, + "grad_norm": 1.3719875975957447, + "learning_rate": 8.672901731915222e-05, + "loss": 0.3849, + "step": 3051 + }, + { + "epoch": 0.26036512540522094, + "grad_norm": 1.31406676918815, + "learning_rate": 8.671964195581336e-05, + "loss": 0.3052, + "step": 3052 + }, + { + "epoch": 0.260450435079338, + "grad_norm": 1.3424826567255939, + "learning_rate": 8.671026378912497e-05, + "loss": 0.3656, + "step": 3053 + }, + { + "epoch": 0.26053574475345503, + "grad_norm": 1.4092939704557612, + "learning_rate": 8.6700882819803e-05, + "loss": 0.3429, + "step": 3054 + }, + { + "epoch": 0.2606210544275721, + "grad_norm": 1.4377381500673827, + "learning_rate": 8.669149904856366e-05, + "loss": 0.3608, + "step": 3055 + }, + { + "epoch": 0.2607063641016891, + "grad_norm": 1.839771309821584, + "learning_rate": 8.668211247612335e-05, + "loss": 0.3664, + "step": 3056 + }, + { + "epoch": 0.2607916737758062, + "grad_norm": 1.502238113887016, + "learning_rate": 8.66727231031987e-05, + "loss": 0.346, + "step": 3057 + }, + { + "epoch": 0.2608769834499232, + "grad_norm": 1.279663173271976, + "learning_rate": 8.666333093050649e-05, + "loss": 0.3516, + "step": 3058 + }, + { + "epoch": 0.2609622931240403, + "grad_norm": 1.5066432075587148, + "learning_rate": 8.665393595876379e-05, + "loss": 0.3221, + "step": 3059 + }, + { + "epoch": 0.2610476027981573, + "grad_norm": 1.4060456021430408, + "learning_rate": 8.664453818868789e-05, + "loss": 0.3009, + "step": 3060 + }, + { + "epoch": 0.2611329124722744, + "grad_norm": 1.4341868491177756, + "learning_rate": 8.66351376209962e-05, + "loss": 0.2814, + "step": 3061 + }, + { + "epoch": 0.2612182221463914, + "grad_norm": 1.3992343103000475, + "learning_rate": 8.662573425640645e-05, + "loss": 0.3256, + "step": 3062 + }, + { + "epoch": 0.2613035318205085, + "grad_norm": 2.1943582110087303, + "learning_rate": 8.661632809563651e-05, + "loss": 0.2902, + "step": 3063 + }, + { + "epoch": 0.2613888414946255, + "grad_norm": 1.3939872129283466, + "learning_rate": 8.66069191394045e-05, + "loss": 0.348, + "step": 3064 + }, + { + "epoch": 0.2614741511687425, + "grad_norm": 1.4331196928861816, + "learning_rate": 8.659750738842873e-05, + "loss": 0.3189, + "step": 3065 + }, + { + "epoch": 0.2615594608428596, + "grad_norm": 1.3862143875377115, + "learning_rate": 8.658809284342778e-05, + "loss": 0.2965, + "step": 3066 + }, + { + "epoch": 0.2616447705169766, + "grad_norm": 1.526256750093447, + "learning_rate": 8.657867550512033e-05, + "loss": 0.3405, + "step": 3067 + }, + { + "epoch": 0.2617300801910937, + "grad_norm": 1.5203796999155452, + "learning_rate": 8.656925537422542e-05, + "loss": 0.3879, + "step": 3068 + }, + { + "epoch": 0.2618153898652107, + "grad_norm": 1.3143922698173816, + "learning_rate": 8.655983245146217e-05, + "loss": 0.2931, + "step": 3069 + }, + { + "epoch": 0.26190069953932776, + "grad_norm": 1.2980081135153856, + "learning_rate": 8.655040673754999e-05, + "loss": 0.3403, + "step": 3070 + }, + { + "epoch": 0.2619860092134448, + "grad_norm": 1.5824148207605895, + "learning_rate": 8.65409782332085e-05, + "loss": 0.3928, + "step": 3071 + }, + { + "epoch": 0.26207131888756185, + "grad_norm": 1.4390277208066564, + "learning_rate": 8.65315469391575e-05, + "loss": 0.3682, + "step": 3072 + }, + { + "epoch": 0.2621566285616789, + "grad_norm": 1.392915403978097, + "learning_rate": 8.652211285611701e-05, + "loss": 0.3519, + "step": 3073 + }, + { + "epoch": 0.26224193823579595, + "grad_norm": 1.4755221137855792, + "learning_rate": 8.651267598480728e-05, + "loss": 0.3503, + "step": 3074 + }, + { + "epoch": 0.26232724790991296, + "grad_norm": 1.3528538526954528, + "learning_rate": 8.650323632594877e-05, + "loss": 0.3307, + "step": 3075 + }, + { + "epoch": 0.26241255758403004, + "grad_norm": 1.3000967613302523, + "learning_rate": 8.649379388026215e-05, + "loss": 0.3065, + "step": 3076 + }, + { + "epoch": 0.26249786725814706, + "grad_norm": 1.577878573978356, + "learning_rate": 8.64843486484683e-05, + "loss": 0.3523, + "step": 3077 + }, + { + "epoch": 0.26258317693226413, + "grad_norm": 1.588848580484164, + "learning_rate": 8.64749006312883e-05, + "loss": 0.3255, + "step": 3078 + }, + { + "epoch": 0.26266848660638115, + "grad_norm": 1.4344220587962353, + "learning_rate": 8.646544982944345e-05, + "loss": 0.3563, + "step": 3079 + }, + { + "epoch": 0.2627537962804982, + "grad_norm": 2.0039160060830983, + "learning_rate": 8.64559962436553e-05, + "loss": 0.3568, + "step": 3080 + }, + { + "epoch": 0.26283910595461524, + "grad_norm": 1.3400232845389999, + "learning_rate": 8.644653987464555e-05, + "loss": 0.3153, + "step": 3081 + }, + { + "epoch": 0.2629244156287323, + "grad_norm": 1.4568472329287525, + "learning_rate": 8.643708072313618e-05, + "loss": 0.3515, + "step": 3082 + }, + { + "epoch": 0.26300972530284933, + "grad_norm": 1.3308277230491854, + "learning_rate": 8.642761878984931e-05, + "loss": 0.3528, + "step": 3083 + }, + { + "epoch": 0.2630950349769664, + "grad_norm": 1.2670268548439259, + "learning_rate": 8.64181540755073e-05, + "loss": 0.2887, + "step": 3084 + }, + { + "epoch": 0.2631803446510834, + "grad_norm": 1.5778104260847121, + "learning_rate": 8.640868658083279e-05, + "loss": 0.3234, + "step": 3085 + }, + { + "epoch": 0.2632656543252005, + "grad_norm": 1.6607057626860051, + "learning_rate": 8.639921630654852e-05, + "loss": 0.3661, + "step": 3086 + }, + { + "epoch": 0.2633509639993175, + "grad_norm": 1.3483412618761554, + "learning_rate": 8.63897432533775e-05, + "loss": 0.2855, + "step": 3087 + }, + { + "epoch": 0.2634362736734346, + "grad_norm": 1.4059550505842304, + "learning_rate": 8.638026742204298e-05, + "loss": 0.3936, + "step": 3088 + }, + { + "epoch": 0.2635215833475516, + "grad_norm": 1.7124120898549997, + "learning_rate": 8.637078881326834e-05, + "loss": 0.3146, + "step": 3089 + }, + { + "epoch": 0.2636068930216687, + "grad_norm": 1.4982992703162394, + "learning_rate": 8.636130742777725e-05, + "loss": 0.345, + "step": 3090 + }, + { + "epoch": 0.2636922026957857, + "grad_norm": 1.366912901413495, + "learning_rate": 8.635182326629358e-05, + "loss": 0.3618, + "step": 3091 + }, + { + "epoch": 0.26377751236990277, + "grad_norm": 1.9961019915207185, + "learning_rate": 8.634233632954139e-05, + "loss": 0.4345, + "step": 3092 + }, + { + "epoch": 0.2638628220440198, + "grad_norm": 1.518570387780574, + "learning_rate": 8.633284661824492e-05, + "loss": 0.3332, + "step": 3093 + }, + { + "epoch": 0.26394813171813686, + "grad_norm": 1.723163832710155, + "learning_rate": 8.632335413312869e-05, + "loss": 0.34, + "step": 3094 + }, + { + "epoch": 0.2640334413922539, + "grad_norm": 1.3493522255527375, + "learning_rate": 8.631385887491739e-05, + "loss": 0.2948, + "step": 3095 + }, + { + "epoch": 0.26411875106637095, + "grad_norm": 1.5183218332825907, + "learning_rate": 8.630436084433593e-05, + "loss": 0.3949, + "step": 3096 + }, + { + "epoch": 0.26420406074048797, + "grad_norm": 1.2929480974559047, + "learning_rate": 8.629486004210945e-05, + "loss": 0.3784, + "step": 3097 + }, + { + "epoch": 0.26428937041460504, + "grad_norm": 1.38836335995013, + "learning_rate": 8.628535646896328e-05, + "loss": 0.339, + "step": 3098 + }, + { + "epoch": 0.26437468008872206, + "grad_norm": 1.3295738844066054, + "learning_rate": 8.627585012562296e-05, + "loss": 0.2839, + "step": 3099 + }, + { + "epoch": 0.26445998976283913, + "grad_norm": 1.372469393974321, + "learning_rate": 8.626634101281425e-05, + "loss": 0.3321, + "step": 3100 + }, + { + "epoch": 0.26454529943695615, + "grad_norm": 1.3036830027584212, + "learning_rate": 8.625682913126311e-05, + "loss": 0.3286, + "step": 3101 + }, + { + "epoch": 0.2646306091110732, + "grad_norm": 1.6531174023170605, + "learning_rate": 8.624731448169576e-05, + "loss": 0.3753, + "step": 3102 + }, + { + "epoch": 0.26471591878519024, + "grad_norm": 1.589073500693624, + "learning_rate": 8.623779706483855e-05, + "loss": 0.3192, + "step": 3103 + }, + { + "epoch": 0.26480122845930726, + "grad_norm": 1.4729251363133966, + "learning_rate": 8.622827688141812e-05, + "loss": 0.3649, + "step": 3104 + }, + { + "epoch": 0.26488653813342433, + "grad_norm": 1.3576136611368028, + "learning_rate": 8.621875393216126e-05, + "loss": 0.2983, + "step": 3105 + }, + { + "epoch": 0.26497184780754135, + "grad_norm": 1.4772027459274788, + "learning_rate": 8.620922821779498e-05, + "loss": 0.3316, + "step": 3106 + }, + { + "epoch": 0.2650571574816584, + "grad_norm": 1.4904877733447646, + "learning_rate": 8.619969973904655e-05, + "loss": 0.3471, + "step": 3107 + }, + { + "epoch": 0.26514246715577544, + "grad_norm": 1.4203615622864658, + "learning_rate": 8.619016849664344e-05, + "loss": 0.3057, + "step": 3108 + }, + { + "epoch": 0.2652277768298925, + "grad_norm": 1.1494325232826128, + "learning_rate": 8.618063449131327e-05, + "loss": 0.3272, + "step": 3109 + }, + { + "epoch": 0.26531308650400953, + "grad_norm": 1.349144111517021, + "learning_rate": 8.617109772378388e-05, + "loss": 0.2717, + "step": 3110 + }, + { + "epoch": 0.2653983961781266, + "grad_norm": 1.6016958785924373, + "learning_rate": 8.616155819478344e-05, + "loss": 0.3741, + "step": 3111 + }, + { + "epoch": 0.2654837058522436, + "grad_norm": 1.630586850358637, + "learning_rate": 8.615201590504017e-05, + "loss": 0.378, + "step": 3112 + }, + { + "epoch": 0.2655690155263607, + "grad_norm": 1.5379313848441112, + "learning_rate": 8.61424708552826e-05, + "loss": 0.3426, + "step": 3113 + }, + { + "epoch": 0.2656543252004777, + "grad_norm": 1.2761062086741253, + "learning_rate": 8.613292304623945e-05, + "loss": 0.3811, + "step": 3114 + }, + { + "epoch": 0.2657396348745948, + "grad_norm": 1.3839508371275988, + "learning_rate": 8.612337247863962e-05, + "loss": 0.353, + "step": 3115 + }, + { + "epoch": 0.2658249445487118, + "grad_norm": 1.4044281071874263, + "learning_rate": 8.611381915321227e-05, + "loss": 0.3375, + "step": 3116 + }, + { + "epoch": 0.2659102542228289, + "grad_norm": 1.3726694991156794, + "learning_rate": 8.610426307068674e-05, + "loss": 0.3633, + "step": 3117 + }, + { + "epoch": 0.2659955638969459, + "grad_norm": 1.5069201106593766, + "learning_rate": 8.609470423179258e-05, + "loss": 0.3398, + "step": 3118 + }, + { + "epoch": 0.26608087357106297, + "grad_norm": 1.5158396303459571, + "learning_rate": 8.608514263725955e-05, + "loss": 0.3302, + "step": 3119 + }, + { + "epoch": 0.26616618324518, + "grad_norm": 1.44565029836273, + "learning_rate": 8.607557828781765e-05, + "loss": 0.2858, + "step": 3120 + }, + { + "epoch": 0.26625149291929706, + "grad_norm": 1.352140446393673, + "learning_rate": 8.606601118419705e-05, + "loss": 0.3167, + "step": 3121 + }, + { + "epoch": 0.2663368025934141, + "grad_norm": 1.3671388017973236, + "learning_rate": 8.605644132712814e-05, + "loss": 0.3567, + "step": 3122 + }, + { + "epoch": 0.26642211226753115, + "grad_norm": 1.528795727773433, + "learning_rate": 8.604686871734156e-05, + "loss": 0.3973, + "step": 3123 + }, + { + "epoch": 0.26650742194164817, + "grad_norm": 1.3597987599706796, + "learning_rate": 8.603729335556808e-05, + "loss": 0.3197, + "step": 3124 + }, + { + "epoch": 0.26659273161576524, + "grad_norm": 1.6537329062618593, + "learning_rate": 8.602771524253876e-05, + "loss": 0.337, + "step": 3125 + }, + { + "epoch": 0.26667804128988226, + "grad_norm": 1.2231367343189616, + "learning_rate": 8.601813437898484e-05, + "loss": 0.3664, + "step": 3126 + }, + { + "epoch": 0.26676335096399933, + "grad_norm": 1.290549928638571, + "learning_rate": 8.600855076563776e-05, + "loss": 0.2801, + "step": 3127 + }, + { + "epoch": 0.26684866063811635, + "grad_norm": 1.4564960713881923, + "learning_rate": 8.599896440322918e-05, + "loss": 0.3151, + "step": 3128 + }, + { + "epoch": 0.2669339703122334, + "grad_norm": 1.6629149012443534, + "learning_rate": 8.598937529249096e-05, + "loss": 0.3706, + "step": 3129 + }, + { + "epoch": 0.26701927998635044, + "grad_norm": 1.7136640727220092, + "learning_rate": 8.59797834341552e-05, + "loss": 0.3638, + "step": 3130 + }, + { + "epoch": 0.2671045896604675, + "grad_norm": 1.3129248612629463, + "learning_rate": 8.597018882895416e-05, + "loss": 0.2982, + "step": 3131 + }, + { + "epoch": 0.26718989933458454, + "grad_norm": 1.4849943776136627, + "learning_rate": 8.596059147762034e-05, + "loss": 0.2722, + "step": 3132 + }, + { + "epoch": 0.2672752090087016, + "grad_norm": 1.4533848683963906, + "learning_rate": 8.595099138088644e-05, + "loss": 0.3353, + "step": 3133 + }, + { + "epoch": 0.2673605186828186, + "grad_norm": 1.540198047585879, + "learning_rate": 8.594138853948544e-05, + "loss": 0.3318, + "step": 3134 + }, + { + "epoch": 0.2674458283569357, + "grad_norm": 1.5108750623329157, + "learning_rate": 8.593178295415038e-05, + "loss": 0.3123, + "step": 3135 + }, + { + "epoch": 0.2675311380310527, + "grad_norm": 1.5197701156095704, + "learning_rate": 8.592217462561465e-05, + "loss": 0.3124, + "step": 3136 + }, + { + "epoch": 0.2676164477051698, + "grad_norm": 1.4537504757382638, + "learning_rate": 8.591256355461176e-05, + "loss": 0.2847, + "step": 3137 + }, + { + "epoch": 0.2677017573792868, + "grad_norm": 1.4979723895677204, + "learning_rate": 8.59029497418755e-05, + "loss": 0.3673, + "step": 3138 + }, + { + "epoch": 0.2677870670534039, + "grad_norm": 1.4521686319102718, + "learning_rate": 8.58933331881398e-05, + "loss": 0.3196, + "step": 3139 + }, + { + "epoch": 0.2678723767275209, + "grad_norm": 1.4784337050043108, + "learning_rate": 8.588371389413885e-05, + "loss": 0.358, + "step": 3140 + }, + { + "epoch": 0.267957686401638, + "grad_norm": 1.502275013256262, + "learning_rate": 8.587409186060704e-05, + "loss": 0.2872, + "step": 3141 + }, + { + "epoch": 0.268042996075755, + "grad_norm": 1.6381452439391564, + "learning_rate": 8.586446708827896e-05, + "loss": 0.3372, + "step": 3142 + }, + { + "epoch": 0.268128305749872, + "grad_norm": 1.4410624934139102, + "learning_rate": 8.585483957788938e-05, + "loss": 0.329, + "step": 3143 + }, + { + "epoch": 0.2682136154239891, + "grad_norm": 1.7071232192625228, + "learning_rate": 8.584520933017333e-05, + "loss": 0.3452, + "step": 3144 + }, + { + "epoch": 0.2682989250981061, + "grad_norm": 1.4895139380164961, + "learning_rate": 8.583557634586605e-05, + "loss": 0.2927, + "step": 3145 + }, + { + "epoch": 0.2683842347722232, + "grad_norm": 1.5561110530243278, + "learning_rate": 8.582594062570292e-05, + "loss": 0.3186, + "step": 3146 + }, + { + "epoch": 0.2684695444463402, + "grad_norm": 1.1029807625940662, + "learning_rate": 8.581630217041963e-05, + "loss": 0.3069, + "step": 3147 + }, + { + "epoch": 0.26855485412045726, + "grad_norm": 1.4363126412086953, + "learning_rate": 8.580666098075197e-05, + "loss": 0.3908, + "step": 3148 + }, + { + "epoch": 0.2686401637945743, + "grad_norm": 1.357976347384424, + "learning_rate": 8.579701705743604e-05, + "loss": 0.2974, + "step": 3149 + }, + { + "epoch": 0.26872547346869136, + "grad_norm": 1.5094642881280065, + "learning_rate": 8.578737040120807e-05, + "loss": 0.3313, + "step": 3150 + }, + { + "epoch": 0.2688107831428084, + "grad_norm": 1.3019105974697402, + "learning_rate": 8.577772101280456e-05, + "loss": 0.3023, + "step": 3151 + }, + { + "epoch": 0.26889609281692545, + "grad_norm": 1.5431179312524392, + "learning_rate": 8.576806889296216e-05, + "loss": 0.3756, + "step": 3152 + }, + { + "epoch": 0.26898140249104247, + "grad_norm": 1.1551190593175797, + "learning_rate": 8.575841404241777e-05, + "loss": 0.2902, + "step": 3153 + }, + { + "epoch": 0.26906671216515954, + "grad_norm": 1.3768153777787038, + "learning_rate": 8.57487564619085e-05, + "loss": 0.2905, + "step": 3154 + }, + { + "epoch": 0.26915202183927656, + "grad_norm": 1.5264100638021838, + "learning_rate": 8.573909615217163e-05, + "loss": 0.2959, + "step": 3155 + }, + { + "epoch": 0.26923733151339363, + "grad_norm": 1.7123821552131855, + "learning_rate": 8.572943311394468e-05, + "loss": 0.3287, + "step": 3156 + }, + { + "epoch": 0.26932264118751065, + "grad_norm": 1.1276937659113369, + "learning_rate": 8.57197673479654e-05, + "loss": 0.3028, + "step": 3157 + }, + { + "epoch": 0.2694079508616277, + "grad_norm": 1.3570570201546166, + "learning_rate": 8.571009885497168e-05, + "loss": 0.3072, + "step": 3158 + }, + { + "epoch": 0.26949326053574474, + "grad_norm": 1.3137524396617823, + "learning_rate": 8.570042763570168e-05, + "loss": 0.2637, + "step": 3159 + }, + { + "epoch": 0.2695785702098618, + "grad_norm": 1.637324692172651, + "learning_rate": 8.569075369089374e-05, + "loss": 0.4194, + "step": 3160 + }, + { + "epoch": 0.26966387988397883, + "grad_norm": 1.640879584042099, + "learning_rate": 8.568107702128642e-05, + "loss": 0.3345, + "step": 3161 + }, + { + "epoch": 0.2697491895580959, + "grad_norm": 1.4821518965033482, + "learning_rate": 8.56713976276185e-05, + "loss": 0.312, + "step": 3162 + }, + { + "epoch": 0.2698344992322129, + "grad_norm": 1.844939155505761, + "learning_rate": 8.566171551062889e-05, + "loss": 0.3798, + "step": 3163 + }, + { + "epoch": 0.26991980890633, + "grad_norm": 1.8194832284043798, + "learning_rate": 8.565203067105683e-05, + "loss": 0.3979, + "step": 3164 + }, + { + "epoch": 0.270005118580447, + "grad_norm": 1.5768029746934769, + "learning_rate": 8.564234310964168e-05, + "loss": 0.3326, + "step": 3165 + }, + { + "epoch": 0.2700904282545641, + "grad_norm": 1.2799261743892094, + "learning_rate": 8.563265282712303e-05, + "loss": 0.3643, + "step": 3166 + }, + { + "epoch": 0.2701757379286811, + "grad_norm": 2.05112964395591, + "learning_rate": 8.562295982424069e-05, + "loss": 0.3452, + "step": 3167 + }, + { + "epoch": 0.2702610476027982, + "grad_norm": 1.8945278521635123, + "learning_rate": 8.561326410173467e-05, + "loss": 0.3582, + "step": 3168 + }, + { + "epoch": 0.2703463572769152, + "grad_norm": 1.5343513222788427, + "learning_rate": 8.560356566034518e-05, + "loss": 0.3327, + "step": 3169 + }, + { + "epoch": 0.27043166695103227, + "grad_norm": 1.2093972531816906, + "learning_rate": 8.559386450081266e-05, + "loss": 0.2776, + "step": 3170 + }, + { + "epoch": 0.2705169766251493, + "grad_norm": 1.3200251249609831, + "learning_rate": 8.558416062387772e-05, + "loss": 0.3178, + "step": 3171 + }, + { + "epoch": 0.27060228629926636, + "grad_norm": 1.2846795679291543, + "learning_rate": 8.557445403028122e-05, + "loss": 0.33, + "step": 3172 + }, + { + "epoch": 0.2706875959733834, + "grad_norm": 1.629879813765778, + "learning_rate": 8.556474472076419e-05, + "loss": 0.3885, + "step": 3173 + }, + { + "epoch": 0.27077290564750045, + "grad_norm": 1.4548785386398495, + "learning_rate": 8.555503269606789e-05, + "loss": 0.2985, + "step": 3174 + }, + { + "epoch": 0.27085821532161747, + "grad_norm": 1.4123131484953475, + "learning_rate": 8.55453179569338e-05, + "loss": 0.3551, + "step": 3175 + }, + { + "epoch": 0.27094352499573454, + "grad_norm": 1.2164955317097301, + "learning_rate": 8.553560050410354e-05, + "loss": 0.353, + "step": 3176 + }, + { + "epoch": 0.27102883466985156, + "grad_norm": 1.118863109646051, + "learning_rate": 8.552588033831905e-05, + "loss": 0.2678, + "step": 3177 + }, + { + "epoch": 0.27111414434396863, + "grad_norm": 1.385781113532844, + "learning_rate": 8.551615746032235e-05, + "loss": 0.381, + "step": 3178 + }, + { + "epoch": 0.27119945401808565, + "grad_norm": 1.8962208557906821, + "learning_rate": 8.550643187085579e-05, + "loss": 0.3426, + "step": 3179 + }, + { + "epoch": 0.27128476369220267, + "grad_norm": 1.4719504529399194, + "learning_rate": 8.549670357066182e-05, + "loss": 0.2895, + "step": 3180 + }, + { + "epoch": 0.27137007336631974, + "grad_norm": 1.413848860814623, + "learning_rate": 8.548697256048317e-05, + "loss": 0.2934, + "step": 3181 + }, + { + "epoch": 0.27145538304043676, + "grad_norm": 1.683581571247279, + "learning_rate": 8.547723884106274e-05, + "loss": 0.3621, + "step": 3182 + }, + { + "epoch": 0.27154069271455383, + "grad_norm": 1.5979216338888234, + "learning_rate": 8.546750241314365e-05, + "loss": 0.3156, + "step": 3183 + }, + { + "epoch": 0.27162600238867085, + "grad_norm": 1.5017289687656301, + "learning_rate": 8.545776327746922e-05, + "loss": 0.4461, + "step": 3184 + }, + { + "epoch": 0.2717113120627879, + "grad_norm": 1.617375607077749, + "learning_rate": 8.544802143478298e-05, + "loss": 0.3693, + "step": 3185 + }, + { + "epoch": 0.27179662173690494, + "grad_norm": 1.2560446974734165, + "learning_rate": 8.543827688582868e-05, + "loss": 0.2902, + "step": 3186 + }, + { + "epoch": 0.271881931411022, + "grad_norm": 1.4167020830701467, + "learning_rate": 8.542852963135029e-05, + "loss": 0.4055, + "step": 3187 + }, + { + "epoch": 0.27196724108513903, + "grad_norm": 1.6542238041436705, + "learning_rate": 8.541877967209189e-05, + "loss": 0.3458, + "step": 3188 + }, + { + "epoch": 0.2720525507592561, + "grad_norm": 1.4811492402778688, + "learning_rate": 8.540902700879789e-05, + "loss": 0.3442, + "step": 3189 + }, + { + "epoch": 0.2721378604333731, + "grad_norm": 1.5041860344961626, + "learning_rate": 8.539927164221285e-05, + "loss": 0.3677, + "step": 3190 + }, + { + "epoch": 0.2722231701074902, + "grad_norm": 1.3161097342589196, + "learning_rate": 8.538951357308151e-05, + "loss": 0.2872, + "step": 3191 + }, + { + "epoch": 0.2723084797816072, + "grad_norm": 1.5178769086304127, + "learning_rate": 8.537975280214889e-05, + "loss": 0.3459, + "step": 3192 + }, + { + "epoch": 0.2723937894557243, + "grad_norm": 1.4573571707360222, + "learning_rate": 8.536998933016014e-05, + "loss": 0.3812, + "step": 3193 + }, + { + "epoch": 0.2724790991298413, + "grad_norm": 1.4872678174804583, + "learning_rate": 8.536022315786065e-05, + "loss": 0.3433, + "step": 3194 + }, + { + "epoch": 0.2725644088039584, + "grad_norm": 1.4010468758222698, + "learning_rate": 8.535045428599604e-05, + "loss": 0.34, + "step": 3195 + }, + { + "epoch": 0.2726497184780754, + "grad_norm": 1.5062027476643154, + "learning_rate": 8.53406827153121e-05, + "loss": 0.3594, + "step": 3196 + }, + { + "epoch": 0.27273502815219247, + "grad_norm": 1.4687529218928717, + "learning_rate": 8.533090844655482e-05, + "loss": 0.295, + "step": 3197 + }, + { + "epoch": 0.2728203378263095, + "grad_norm": 1.6992135584960206, + "learning_rate": 8.532113148047045e-05, + "loss": 0.3585, + "step": 3198 + }, + { + "epoch": 0.27290564750042656, + "grad_norm": 1.3532578083787694, + "learning_rate": 8.531135181780537e-05, + "loss": 0.3418, + "step": 3199 + }, + { + "epoch": 0.2729909571745436, + "grad_norm": 1.3757617747603412, + "learning_rate": 8.530156945930624e-05, + "loss": 0.3423, + "step": 3200 + }, + { + "epoch": 0.27307626684866065, + "grad_norm": 1.5732192281116824, + "learning_rate": 8.529178440571986e-05, + "loss": 0.3433, + "step": 3201 + }, + { + "epoch": 0.27316157652277767, + "grad_norm": 1.6741522850126527, + "learning_rate": 8.528199665779328e-05, + "loss": 0.38, + "step": 3202 + }, + { + "epoch": 0.27324688619689474, + "grad_norm": 1.3464859898543406, + "learning_rate": 8.527220621627375e-05, + "loss": 0.3021, + "step": 3203 + }, + { + "epoch": 0.27333219587101176, + "grad_norm": 1.4113209318542042, + "learning_rate": 8.526241308190871e-05, + "loss": 0.3137, + "step": 3204 + }, + { + "epoch": 0.27341750554512884, + "grad_norm": 1.4601586861791642, + "learning_rate": 8.525261725544582e-05, + "loss": 0.3742, + "step": 3205 + }, + { + "epoch": 0.27350281521924585, + "grad_norm": 1.7458513676236165, + "learning_rate": 8.524281873763293e-05, + "loss": 0.4056, + "step": 3206 + }, + { + "epoch": 0.2735881248933629, + "grad_norm": 1.7115130566860386, + "learning_rate": 8.523301752921811e-05, + "loss": 0.3479, + "step": 3207 + }, + { + "epoch": 0.27367343456747995, + "grad_norm": 1.4368004962841698, + "learning_rate": 8.522321363094962e-05, + "loss": 0.3289, + "step": 3208 + }, + { + "epoch": 0.273758744241597, + "grad_norm": 1.3455194754890352, + "learning_rate": 8.521340704357597e-05, + "loss": 0.3303, + "step": 3209 + }, + { + "epoch": 0.27384405391571404, + "grad_norm": 1.2842564549000774, + "learning_rate": 8.520359776784579e-05, + "loss": 0.3009, + "step": 3210 + }, + { + "epoch": 0.2739293635898311, + "grad_norm": 1.219545935885109, + "learning_rate": 8.5193785804508e-05, + "loss": 0.3325, + "step": 3211 + }, + { + "epoch": 0.2740146732639481, + "grad_norm": 1.1311997081052088, + "learning_rate": 8.518397115431169e-05, + "loss": 0.2972, + "step": 3212 + }, + { + "epoch": 0.2740999829380652, + "grad_norm": 1.4741020896371484, + "learning_rate": 8.517415381800615e-05, + "loss": 0.3357, + "step": 3213 + }, + { + "epoch": 0.2741852926121822, + "grad_norm": 1.758288238573877, + "learning_rate": 8.516433379634088e-05, + "loss": 0.4123, + "step": 3214 + }, + { + "epoch": 0.2742706022862993, + "grad_norm": 1.5411729623517385, + "learning_rate": 8.515451109006558e-05, + "loss": 0.3495, + "step": 3215 + }, + { + "epoch": 0.2743559119604163, + "grad_norm": 1.5394816336209887, + "learning_rate": 8.514468569993017e-05, + "loss": 0.3365, + "step": 3216 + }, + { + "epoch": 0.2744412216345334, + "grad_norm": 1.406423897057721, + "learning_rate": 8.513485762668476e-05, + "loss": 0.3098, + "step": 3217 + }, + { + "epoch": 0.2745265313086504, + "grad_norm": 1.257269463689414, + "learning_rate": 8.51250268710797e-05, + "loss": 0.3044, + "step": 3218 + }, + { + "epoch": 0.2746118409827674, + "grad_norm": 1.477918456407827, + "learning_rate": 8.511519343386547e-05, + "loss": 0.3363, + "step": 3219 + }, + { + "epoch": 0.2746971506568845, + "grad_norm": 1.4509210587998984, + "learning_rate": 8.510535731579283e-05, + "loss": 0.3551, + "step": 3220 + }, + { + "epoch": 0.2747824603310015, + "grad_norm": 1.7272292010741404, + "learning_rate": 8.50955185176127e-05, + "loss": 0.3524, + "step": 3221 + }, + { + "epoch": 0.2748677700051186, + "grad_norm": 1.4030196492293532, + "learning_rate": 8.508567704007627e-05, + "loss": 0.3777, + "step": 3222 + }, + { + "epoch": 0.2749530796792356, + "grad_norm": 1.2421313579036137, + "learning_rate": 8.507583288393479e-05, + "loss": 0.2874, + "step": 3223 + }, + { + "epoch": 0.2750383893533527, + "grad_norm": 1.5681339163619845, + "learning_rate": 8.50659860499399e-05, + "loss": 0.3275, + "step": 3224 + }, + { + "epoch": 0.2751236990274697, + "grad_norm": 1.416380189465466, + "learning_rate": 8.50561365388433e-05, + "loss": 0.3016, + "step": 3225 + }, + { + "epoch": 0.27520900870158677, + "grad_norm": 1.630040347998262, + "learning_rate": 8.504628435139696e-05, + "loss": 0.3452, + "step": 3226 + }, + { + "epoch": 0.2752943183757038, + "grad_norm": 1.2900702034529115, + "learning_rate": 8.503642948835305e-05, + "loss": 0.3213, + "step": 3227 + }, + { + "epoch": 0.27537962804982086, + "grad_norm": 1.3360982876217569, + "learning_rate": 8.502657195046393e-05, + "loss": 0.2945, + "step": 3228 + }, + { + "epoch": 0.2754649377239379, + "grad_norm": 1.3543529040088138, + "learning_rate": 8.501671173848217e-05, + "loss": 0.3006, + "step": 3229 + }, + { + "epoch": 0.27555024739805495, + "grad_norm": 1.7045797731817243, + "learning_rate": 8.500684885316055e-05, + "loss": 0.3483, + "step": 3230 + }, + { + "epoch": 0.27563555707217197, + "grad_norm": 1.41713410500876, + "learning_rate": 8.499698329525205e-05, + "loss": 0.3199, + "step": 3231 + }, + { + "epoch": 0.27572086674628904, + "grad_norm": 1.3588575935052503, + "learning_rate": 8.498711506550983e-05, + "loss": 0.3174, + "step": 3232 + }, + { + "epoch": 0.27580617642040606, + "grad_norm": 1.6015048877309197, + "learning_rate": 8.497724416468733e-05, + "loss": 0.3384, + "step": 3233 + }, + { + "epoch": 0.27589148609452313, + "grad_norm": 1.3242333785517817, + "learning_rate": 8.496737059353809e-05, + "loss": 0.3479, + "step": 3234 + }, + { + "epoch": 0.27597679576864015, + "grad_norm": 1.5941146732565068, + "learning_rate": 8.495749435281592e-05, + "loss": 0.3803, + "step": 3235 + }, + { + "epoch": 0.2760621054427572, + "grad_norm": 1.3594307723607364, + "learning_rate": 8.49476154432748e-05, + "loss": 0.2856, + "step": 3236 + }, + { + "epoch": 0.27614741511687424, + "grad_norm": 1.7510454597258083, + "learning_rate": 8.493773386566899e-05, + "loss": 0.4265, + "step": 3237 + }, + { + "epoch": 0.2762327247909913, + "grad_norm": 1.8905819974690472, + "learning_rate": 8.492784962075284e-05, + "loss": 0.3643, + "step": 3238 + }, + { + "epoch": 0.27631803446510833, + "grad_norm": 1.4494697785241955, + "learning_rate": 8.491796270928099e-05, + "loss": 0.3121, + "step": 3239 + }, + { + "epoch": 0.2764033441392254, + "grad_norm": 1.5866443580907725, + "learning_rate": 8.490807313200822e-05, + "loss": 0.3428, + "step": 3240 + }, + { + "epoch": 0.2764886538133424, + "grad_norm": 1.1746328328070283, + "learning_rate": 8.489818088968957e-05, + "loss": 0.2844, + "step": 3241 + }, + { + "epoch": 0.2765739634874595, + "grad_norm": 1.3423149963148238, + "learning_rate": 8.488828598308028e-05, + "loss": 0.323, + "step": 3242 + }, + { + "epoch": 0.2766592731615765, + "grad_norm": 1.6295716461228122, + "learning_rate": 8.487838841293572e-05, + "loss": 0.3705, + "step": 3243 + }, + { + "epoch": 0.2767445828356936, + "grad_norm": 1.4624881776511176, + "learning_rate": 8.486848818001158e-05, + "loss": 0.3229, + "step": 3244 + }, + { + "epoch": 0.2768298925098106, + "grad_norm": 1.3448026659117036, + "learning_rate": 8.485858528506363e-05, + "loss": 0.3177, + "step": 3245 + }, + { + "epoch": 0.2769152021839277, + "grad_norm": 1.2824414458859943, + "learning_rate": 8.484867972884795e-05, + "loss": 0.2893, + "step": 3246 + }, + { + "epoch": 0.2770005118580447, + "grad_norm": 1.2550780148213752, + "learning_rate": 8.483877151212077e-05, + "loss": 0.3047, + "step": 3247 + }, + { + "epoch": 0.27708582153216177, + "grad_norm": 1.3858524249090896, + "learning_rate": 8.482886063563849e-05, + "loss": 0.3544, + "step": 3248 + }, + { + "epoch": 0.2771711312062788, + "grad_norm": 1.450294855984901, + "learning_rate": 8.481894710015778e-05, + "loss": 0.331, + "step": 3249 + }, + { + "epoch": 0.27725644088039586, + "grad_norm": 1.372456799575954, + "learning_rate": 8.48090309064355e-05, + "loss": 0.3673, + "step": 3250 + }, + { + "epoch": 0.2773417505545129, + "grad_norm": 1.2907416302536872, + "learning_rate": 8.47991120552287e-05, + "loss": 0.2632, + "step": 3251 + }, + { + "epoch": 0.27742706022862995, + "grad_norm": 1.652896076353333, + "learning_rate": 8.47891905472946e-05, + "loss": 0.372, + "step": 3252 + }, + { + "epoch": 0.27751236990274697, + "grad_norm": 1.183749349205217, + "learning_rate": 8.477926638339067e-05, + "loss": 0.2789, + "step": 3253 + }, + { + "epoch": 0.27759767957686404, + "grad_norm": 1.234268425312323, + "learning_rate": 8.476933956427458e-05, + "loss": 0.2522, + "step": 3254 + }, + { + "epoch": 0.27768298925098106, + "grad_norm": 1.6162967901102516, + "learning_rate": 8.475941009070416e-05, + "loss": 0.3773, + "step": 3255 + }, + { + "epoch": 0.27776829892509813, + "grad_norm": 1.5316673021588478, + "learning_rate": 8.47494779634375e-05, + "loss": 0.3471, + "step": 3256 + }, + { + "epoch": 0.27785360859921515, + "grad_norm": 1.5197990593350212, + "learning_rate": 8.473954318323287e-05, + "loss": 0.3384, + "step": 3257 + }, + { + "epoch": 0.27793891827333217, + "grad_norm": 1.4461922931945335, + "learning_rate": 8.47296057508487e-05, + "loss": 0.319, + "step": 3258 + }, + { + "epoch": 0.27802422794744924, + "grad_norm": 1.4810689952370024, + "learning_rate": 8.471966566704369e-05, + "loss": 0.3055, + "step": 3259 + }, + { + "epoch": 0.27810953762156626, + "grad_norm": 1.5057654048429978, + "learning_rate": 8.470972293257671e-05, + "loss": 0.3238, + "step": 3260 + }, + { + "epoch": 0.27819484729568333, + "grad_norm": 1.5801303322068891, + "learning_rate": 8.469977754820683e-05, + "loss": 0.3649, + "step": 3261 + }, + { + "epoch": 0.27828015696980035, + "grad_norm": 1.276841557793486, + "learning_rate": 8.468982951469333e-05, + "loss": 0.3204, + "step": 3262 + }, + { + "epoch": 0.2783654666439174, + "grad_norm": 1.6153413859371455, + "learning_rate": 8.467987883279569e-05, + "loss": 0.3447, + "step": 3263 + }, + { + "epoch": 0.27845077631803444, + "grad_norm": 1.7843583408520036, + "learning_rate": 8.46699255032736e-05, + "loss": 0.3073, + "step": 3264 + }, + { + "epoch": 0.2785360859921515, + "grad_norm": 1.529667016614047, + "learning_rate": 8.46599695268869e-05, + "loss": 0.3074, + "step": 3265 + }, + { + "epoch": 0.27862139566626853, + "grad_norm": 1.2316795084072638, + "learning_rate": 8.465001090439575e-05, + "loss": 0.3052, + "step": 3266 + }, + { + "epoch": 0.2787067053403856, + "grad_norm": 1.860755087674725, + "learning_rate": 8.464004963656037e-05, + "loss": 0.3576, + "step": 3267 + }, + { + "epoch": 0.2787920150145026, + "grad_norm": 1.6383556838401239, + "learning_rate": 8.463008572414128e-05, + "loss": 0.3836, + "step": 3268 + }, + { + "epoch": 0.2788773246886197, + "grad_norm": 1.4133854715309067, + "learning_rate": 8.462011916789918e-05, + "loss": 0.2895, + "step": 3269 + }, + { + "epoch": 0.2789626343627367, + "grad_norm": 1.7390126264880421, + "learning_rate": 8.461014996859495e-05, + "loss": 0.3397, + "step": 3270 + }, + { + "epoch": 0.2790479440368538, + "grad_norm": 1.7094982486346575, + "learning_rate": 8.460017812698968e-05, + "loss": 0.3826, + "step": 3271 + }, + { + "epoch": 0.2791332537109708, + "grad_norm": 1.4436477558873608, + "learning_rate": 8.45902036438447e-05, + "loss": 0.3542, + "step": 3272 + }, + { + "epoch": 0.2792185633850879, + "grad_norm": 1.2943433386938028, + "learning_rate": 8.458022651992145e-05, + "loss": 0.3137, + "step": 3273 + }, + { + "epoch": 0.2793038730592049, + "grad_norm": 1.5120262602084855, + "learning_rate": 8.457024675598168e-05, + "loss": 0.3704, + "step": 3274 + }, + { + "epoch": 0.279389182733322, + "grad_norm": 1.3155534738411057, + "learning_rate": 8.456026435278728e-05, + "loss": 0.3619, + "step": 3275 + }, + { + "epoch": 0.279474492407439, + "grad_norm": 1.4975780962161398, + "learning_rate": 8.455027931110034e-05, + "loss": 0.4511, + "step": 3276 + }, + { + "epoch": 0.27955980208155606, + "grad_norm": 1.2498883674365329, + "learning_rate": 8.454029163168317e-05, + "loss": 0.292, + "step": 3277 + }, + { + "epoch": 0.2796451117556731, + "grad_norm": 1.281353457856169, + "learning_rate": 8.45303013152983e-05, + "loss": 0.2644, + "step": 3278 + }, + { + "epoch": 0.27973042142979015, + "grad_norm": 1.9394641272779254, + "learning_rate": 8.452030836270841e-05, + "loss": 0.3641, + "step": 3279 + }, + { + "epoch": 0.2798157311039072, + "grad_norm": 1.4612850652946614, + "learning_rate": 8.451031277467641e-05, + "loss": 0.2985, + "step": 3280 + }, + { + "epoch": 0.27990104077802425, + "grad_norm": 1.6980550354664306, + "learning_rate": 8.450031455196543e-05, + "loss": 0.3462, + "step": 3281 + }, + { + "epoch": 0.27998635045214126, + "grad_norm": 1.6438388408831757, + "learning_rate": 8.449031369533876e-05, + "loss": 0.3658, + "step": 3282 + }, + { + "epoch": 0.28007166012625834, + "grad_norm": 1.1406117791232666, + "learning_rate": 8.448031020555993e-05, + "loss": 0.3098, + "step": 3283 + }, + { + "epoch": 0.28015696980037536, + "grad_norm": 1.6280771610897802, + "learning_rate": 8.447030408339263e-05, + "loss": 0.3224, + "step": 3284 + }, + { + "epoch": 0.28024227947449243, + "grad_norm": 1.3737965866256385, + "learning_rate": 8.446029532960081e-05, + "loss": 0.3044, + "step": 3285 + }, + { + "epoch": 0.28032758914860945, + "grad_norm": 1.2651799679323588, + "learning_rate": 8.445028394494853e-05, + "loss": 0.3031, + "step": 3286 + }, + { + "epoch": 0.2804128988227265, + "grad_norm": 1.4279276538281835, + "learning_rate": 8.444026993020017e-05, + "loss": 0.3407, + "step": 3287 + }, + { + "epoch": 0.28049820849684354, + "grad_norm": 1.466022068398818, + "learning_rate": 8.44302532861202e-05, + "loss": 0.3466, + "step": 3288 + }, + { + "epoch": 0.2805835181709606, + "grad_norm": 1.547630664935542, + "learning_rate": 8.442023401347336e-05, + "loss": 0.3853, + "step": 3289 + }, + { + "epoch": 0.28066882784507763, + "grad_norm": 1.4845989660652854, + "learning_rate": 8.441021211302456e-05, + "loss": 0.3227, + "step": 3290 + }, + { + "epoch": 0.2807541375191947, + "grad_norm": 1.3731081258425433, + "learning_rate": 8.440018758553892e-05, + "loss": 0.297, + "step": 3291 + }, + { + "epoch": 0.2808394471933117, + "grad_norm": 1.3542310797214892, + "learning_rate": 8.439016043178176e-05, + "loss": 0.2916, + "step": 3292 + }, + { + "epoch": 0.2809247568674288, + "grad_norm": 1.579044187655882, + "learning_rate": 8.438013065251859e-05, + "loss": 0.3447, + "step": 3293 + }, + { + "epoch": 0.2810100665415458, + "grad_norm": 1.4147386146379841, + "learning_rate": 8.437009824851517e-05, + "loss": 0.283, + "step": 3294 + }, + { + "epoch": 0.28109537621566283, + "grad_norm": 1.5982873691371215, + "learning_rate": 8.436006322053737e-05, + "loss": 0.332, + "step": 3295 + }, + { + "epoch": 0.2811806858897799, + "grad_norm": 1.5977768980700873, + "learning_rate": 8.435002556935131e-05, + "loss": 0.366, + "step": 3296 + }, + { + "epoch": 0.2812659955638969, + "grad_norm": 1.2661792812953432, + "learning_rate": 8.433998529572338e-05, + "loss": 0.3315, + "step": 3297 + }, + { + "epoch": 0.281351305238014, + "grad_norm": 1.538122906921773, + "learning_rate": 8.432994240042003e-05, + "loss": 0.3527, + "step": 3298 + }, + { + "epoch": 0.281436614912131, + "grad_norm": 1.4380478022486332, + "learning_rate": 8.4319896884208e-05, + "loss": 0.3252, + "step": 3299 + }, + { + "epoch": 0.2815219245862481, + "grad_norm": 1.6481618063111143, + "learning_rate": 8.430984874785423e-05, + "loss": 0.3159, + "step": 3300 + }, + { + "epoch": 0.2816072342603651, + "grad_norm": 1.6161662392327385, + "learning_rate": 8.429979799212584e-05, + "loss": 0.3148, + "step": 3301 + }, + { + "epoch": 0.2816925439344822, + "grad_norm": 1.3649746234337483, + "learning_rate": 8.428974461779014e-05, + "loss": 0.3328, + "step": 3302 + }, + { + "epoch": 0.2817778536085992, + "grad_norm": 1.5663459747209152, + "learning_rate": 8.427968862561464e-05, + "loss": 0.3795, + "step": 3303 + }, + { + "epoch": 0.28186316328271627, + "grad_norm": 1.4929019356094069, + "learning_rate": 8.426963001636707e-05, + "loss": 0.3216, + "step": 3304 + }, + { + "epoch": 0.2819484729568333, + "grad_norm": 1.4736413873280179, + "learning_rate": 8.425956879081539e-05, + "loss": 0.3554, + "step": 3305 + }, + { + "epoch": 0.28203378263095036, + "grad_norm": 1.5627398497551328, + "learning_rate": 8.424950494972766e-05, + "loss": 0.3035, + "step": 3306 + }, + { + "epoch": 0.2821190923050674, + "grad_norm": 1.2831018602428044, + "learning_rate": 8.423943849387223e-05, + "loss": 0.2818, + "step": 3307 + }, + { + "epoch": 0.28220440197918445, + "grad_norm": 1.2368266229098361, + "learning_rate": 8.422936942401762e-05, + "loss": 0.2578, + "step": 3308 + }, + { + "epoch": 0.28228971165330147, + "grad_norm": 1.3644264461028859, + "learning_rate": 8.421929774093255e-05, + "loss": 0.2559, + "step": 3309 + }, + { + "epoch": 0.28237502132741854, + "grad_norm": 1.803418526649647, + "learning_rate": 8.420922344538594e-05, + "loss": 0.3125, + "step": 3310 + }, + { + "epoch": 0.28246033100153556, + "grad_norm": 1.678787558129074, + "learning_rate": 8.419914653814692e-05, + "loss": 0.3005, + "step": 3311 + }, + { + "epoch": 0.28254564067565263, + "grad_norm": 1.297847888444762, + "learning_rate": 8.418906701998477e-05, + "loss": 0.304, + "step": 3312 + }, + { + "epoch": 0.28263095034976965, + "grad_norm": 1.37536256518312, + "learning_rate": 8.417898489166905e-05, + "loss": 0.3339, + "step": 3313 + }, + { + "epoch": 0.2827162600238867, + "grad_norm": 1.3613046564736404, + "learning_rate": 8.416890015396947e-05, + "loss": 0.3115, + "step": 3314 + }, + { + "epoch": 0.28280156969800374, + "grad_norm": 1.7593075646169247, + "learning_rate": 8.415881280765591e-05, + "loss": 0.3745, + "step": 3315 + }, + { + "epoch": 0.2828868793721208, + "grad_norm": 1.4805887218972809, + "learning_rate": 8.414872285349854e-05, + "loss": 0.3282, + "step": 3316 + }, + { + "epoch": 0.28297218904623783, + "grad_norm": 1.3195118931490555, + "learning_rate": 8.413863029226762e-05, + "loss": 0.3408, + "step": 3317 + }, + { + "epoch": 0.2830574987203549, + "grad_norm": 1.5606287623227577, + "learning_rate": 8.412853512473371e-05, + "loss": 0.3294, + "step": 3318 + }, + { + "epoch": 0.2831428083944719, + "grad_norm": 1.6637385953058046, + "learning_rate": 8.41184373516675e-05, + "loss": 0.3563, + "step": 3319 + }, + { + "epoch": 0.283228118068589, + "grad_norm": 1.3956017752874732, + "learning_rate": 8.41083369738399e-05, + "loss": 0.3624, + "step": 3320 + }, + { + "epoch": 0.283313427742706, + "grad_norm": 1.4039791846077232, + "learning_rate": 8.409823399202203e-05, + "loss": 0.3498, + "step": 3321 + }, + { + "epoch": 0.2833987374168231, + "grad_norm": 1.7132339742376632, + "learning_rate": 8.408812840698517e-05, + "loss": 0.3058, + "step": 3322 + }, + { + "epoch": 0.2834840470909401, + "grad_norm": 1.2873193688207303, + "learning_rate": 8.407802021950087e-05, + "loss": 0.2934, + "step": 3323 + }, + { + "epoch": 0.2835693567650572, + "grad_norm": 1.5045713068860787, + "learning_rate": 8.406790943034081e-05, + "loss": 0.3401, + "step": 3324 + }, + { + "epoch": 0.2836546664391742, + "grad_norm": 1.2864389714110895, + "learning_rate": 8.405779604027691e-05, + "loss": 0.3057, + "step": 3325 + }, + { + "epoch": 0.28373997611329127, + "grad_norm": 1.3520312957075935, + "learning_rate": 8.404768005008126e-05, + "loss": 0.3271, + "step": 3326 + }, + { + "epoch": 0.2838252857874083, + "grad_norm": 1.3726190413250257, + "learning_rate": 8.403756146052617e-05, + "loss": 0.3193, + "step": 3327 + }, + { + "epoch": 0.28391059546152536, + "grad_norm": 1.6008759306297835, + "learning_rate": 8.402744027238413e-05, + "loss": 0.3256, + "step": 3328 + }, + { + "epoch": 0.2839959051356424, + "grad_norm": 1.2991978169381775, + "learning_rate": 8.401731648642785e-05, + "loss": 0.2983, + "step": 3329 + }, + { + "epoch": 0.28408121480975945, + "grad_norm": 1.6321092761330358, + "learning_rate": 8.400719010343023e-05, + "loss": 0.3341, + "step": 3330 + }, + { + "epoch": 0.28416652448387647, + "grad_norm": 1.5683054077440493, + "learning_rate": 8.399706112416434e-05, + "loss": 0.3179, + "step": 3331 + }, + { + "epoch": 0.28425183415799354, + "grad_norm": 1.5994376624869255, + "learning_rate": 8.398692954940352e-05, + "loss": 0.3695, + "step": 3332 + }, + { + "epoch": 0.28433714383211056, + "grad_norm": 1.7691438431898998, + "learning_rate": 8.397679537992122e-05, + "loss": 0.3609, + "step": 3333 + }, + { + "epoch": 0.2844224535062276, + "grad_norm": 1.616621720125096, + "learning_rate": 8.396665861649115e-05, + "loss": 0.3625, + "step": 3334 + }, + { + "epoch": 0.28450776318034465, + "grad_norm": 1.3824351167543172, + "learning_rate": 8.395651925988718e-05, + "loss": 0.3545, + "step": 3335 + }, + { + "epoch": 0.28459307285446167, + "grad_norm": 1.6542703565613621, + "learning_rate": 8.394637731088344e-05, + "loss": 0.3802, + "step": 3336 + }, + { + "epoch": 0.28467838252857874, + "grad_norm": 1.6244279147868474, + "learning_rate": 8.393623277025415e-05, + "loss": 0.3207, + "step": 3337 + }, + { + "epoch": 0.28476369220269576, + "grad_norm": 1.466107202521244, + "learning_rate": 8.392608563877385e-05, + "loss": 0.3175, + "step": 3338 + }, + { + "epoch": 0.28484900187681284, + "grad_norm": 1.2220166273062838, + "learning_rate": 8.391593591721718e-05, + "loss": 0.2735, + "step": 3339 + }, + { + "epoch": 0.28493431155092985, + "grad_norm": 1.3378855461525405, + "learning_rate": 8.390578360635903e-05, + "loss": 0.3411, + "step": 3340 + }, + { + "epoch": 0.2850196212250469, + "grad_norm": 1.7236007211955897, + "learning_rate": 8.38956287069745e-05, + "loss": 0.3626, + "step": 3341 + }, + { + "epoch": 0.28510493089916394, + "grad_norm": 1.4867577151285487, + "learning_rate": 8.388547121983881e-05, + "loss": 0.3431, + "step": 3342 + }, + { + "epoch": 0.285190240573281, + "grad_norm": 1.4336379873326266, + "learning_rate": 8.387531114572746e-05, + "loss": 0.3678, + "step": 3343 + }, + { + "epoch": 0.28527555024739804, + "grad_norm": 1.2235843925912289, + "learning_rate": 8.386514848541614e-05, + "loss": 0.3722, + "step": 3344 + }, + { + "epoch": 0.2853608599215151, + "grad_norm": 1.684567340350639, + "learning_rate": 8.385498323968069e-05, + "loss": 0.3312, + "step": 3345 + }, + { + "epoch": 0.2854461695956321, + "grad_norm": 1.52112485974302, + "learning_rate": 8.384481540929715e-05, + "loss": 0.3927, + "step": 3346 + }, + { + "epoch": 0.2855314792697492, + "grad_norm": 1.600803846848578, + "learning_rate": 8.383464499504183e-05, + "loss": 0.3412, + "step": 3347 + }, + { + "epoch": 0.2856167889438662, + "grad_norm": 1.5036568571117577, + "learning_rate": 8.382447199769115e-05, + "loss": 0.3219, + "step": 3348 + }, + { + "epoch": 0.2857020986179833, + "grad_norm": 1.3092643272223479, + "learning_rate": 8.381429641802177e-05, + "loss": 0.3216, + "step": 3349 + }, + { + "epoch": 0.2857874082921003, + "grad_norm": 1.3778779515557773, + "learning_rate": 8.380411825681057e-05, + "loss": 0.3375, + "step": 3350 + }, + { + "epoch": 0.2858727179662174, + "grad_norm": 1.2699615218874094, + "learning_rate": 8.379393751483455e-05, + "loss": 0.2839, + "step": 3351 + }, + { + "epoch": 0.2859580276403344, + "grad_norm": 1.4890123709119276, + "learning_rate": 8.378375419287099e-05, + "loss": 0.2948, + "step": 3352 + }, + { + "epoch": 0.2860433373144515, + "grad_norm": 1.1685690469447738, + "learning_rate": 8.377356829169734e-05, + "loss": 0.3137, + "step": 3353 + }, + { + "epoch": 0.2861286469885685, + "grad_norm": 1.5151454520826695, + "learning_rate": 8.376337981209119e-05, + "loss": 0.3137, + "step": 3354 + }, + { + "epoch": 0.28621395666268556, + "grad_norm": 1.4712546886022593, + "learning_rate": 8.375318875483045e-05, + "loss": 0.3403, + "step": 3355 + }, + { + "epoch": 0.2862992663368026, + "grad_norm": 1.4798004332872652, + "learning_rate": 8.374299512069308e-05, + "loss": 0.3558, + "step": 3356 + }, + { + "epoch": 0.28638457601091966, + "grad_norm": 1.586625424471493, + "learning_rate": 8.373279891045735e-05, + "loss": 0.3342, + "step": 3357 + }, + { + "epoch": 0.2864698856850367, + "grad_norm": 1.3431074691061493, + "learning_rate": 8.37226001249017e-05, + "loss": 0.3395, + "step": 3358 + }, + { + "epoch": 0.28655519535915375, + "grad_norm": 1.7326124015555378, + "learning_rate": 8.37123987648047e-05, + "loss": 0.3645, + "step": 3359 + }, + { + "epoch": 0.28664050503327076, + "grad_norm": 1.251188476147474, + "learning_rate": 8.370219483094523e-05, + "loss": 0.3375, + "step": 3360 + }, + { + "epoch": 0.28672581470738784, + "grad_norm": 1.6836985402785538, + "learning_rate": 8.369198832410227e-05, + "loss": 0.3668, + "step": 3361 + }, + { + "epoch": 0.28681112438150486, + "grad_norm": 1.2115508464324602, + "learning_rate": 8.368177924505504e-05, + "loss": 0.2959, + "step": 3362 + }, + { + "epoch": 0.28689643405562193, + "grad_norm": 1.4138244934423247, + "learning_rate": 8.367156759458294e-05, + "loss": 0.3825, + "step": 3363 + }, + { + "epoch": 0.28698174372973895, + "grad_norm": 1.413501101169218, + "learning_rate": 8.366135337346559e-05, + "loss": 0.3164, + "step": 3364 + }, + { + "epoch": 0.287067053403856, + "grad_norm": 1.3896751159980854, + "learning_rate": 8.365113658248278e-05, + "loss": 0.3126, + "step": 3365 + }, + { + "epoch": 0.28715236307797304, + "grad_norm": 1.3444054357966426, + "learning_rate": 8.364091722241454e-05, + "loss": 0.2941, + "step": 3366 + }, + { + "epoch": 0.2872376727520901, + "grad_norm": 1.161540223842728, + "learning_rate": 8.363069529404102e-05, + "loss": 0.2831, + "step": 3367 + }, + { + "epoch": 0.28732298242620713, + "grad_norm": 1.6164624344071115, + "learning_rate": 8.362047079814262e-05, + "loss": 0.3737, + "step": 3368 + }, + { + "epoch": 0.2874082921003242, + "grad_norm": 1.2418218110380703, + "learning_rate": 8.361024373549994e-05, + "loss": 0.3186, + "step": 3369 + }, + { + "epoch": 0.2874936017744412, + "grad_norm": 1.295068608826808, + "learning_rate": 8.360001410689375e-05, + "loss": 0.2692, + "step": 3370 + }, + { + "epoch": 0.2875789114485583, + "grad_norm": 1.5378989841736879, + "learning_rate": 8.358978191310505e-05, + "loss": 0.3439, + "step": 3371 + }, + { + "epoch": 0.2876642211226753, + "grad_norm": 1.5617989302447974, + "learning_rate": 8.357954715491498e-05, + "loss": 0.3198, + "step": 3372 + }, + { + "epoch": 0.28774953079679233, + "grad_norm": 1.442891913714002, + "learning_rate": 8.356930983310493e-05, + "loss": 0.272, + "step": 3373 + }, + { + "epoch": 0.2878348404709094, + "grad_norm": 1.4578643135438607, + "learning_rate": 8.355906994845646e-05, + "loss": 0.3367, + "step": 3374 + }, + { + "epoch": 0.2879201501450264, + "grad_norm": 1.7245004345192918, + "learning_rate": 8.354882750175133e-05, + "loss": 0.3136, + "step": 3375 + }, + { + "epoch": 0.2880054598191435, + "grad_norm": 1.4434569949121756, + "learning_rate": 8.35385824937715e-05, + "loss": 0.3484, + "step": 3376 + }, + { + "epoch": 0.2880907694932605, + "grad_norm": 1.5121054930897035, + "learning_rate": 8.352833492529914e-05, + "loss": 0.3068, + "step": 3377 + }, + { + "epoch": 0.2881760791673776, + "grad_norm": 1.3624304709936323, + "learning_rate": 8.351808479711656e-05, + "loss": 0.3296, + "step": 3378 + }, + { + "epoch": 0.2882613888414946, + "grad_norm": 1.4692005521545286, + "learning_rate": 8.350783211000632e-05, + "loss": 0.3375, + "step": 3379 + }, + { + "epoch": 0.2883466985156117, + "grad_norm": 1.4480846316753675, + "learning_rate": 8.349757686475116e-05, + "loss": 0.2853, + "step": 3380 + }, + { + "epoch": 0.2884320081897287, + "grad_norm": 1.2944894474258046, + "learning_rate": 8.348731906213402e-05, + "loss": 0.2837, + "step": 3381 + }, + { + "epoch": 0.28851731786384577, + "grad_norm": 1.3977743100007232, + "learning_rate": 8.3477058702938e-05, + "loss": 0.3569, + "step": 3382 + }, + { + "epoch": 0.2886026275379628, + "grad_norm": 1.7837500177502381, + "learning_rate": 8.346679578794647e-05, + "loss": 0.2945, + "step": 3383 + }, + { + "epoch": 0.28868793721207986, + "grad_norm": 1.5521486341150623, + "learning_rate": 8.345653031794292e-05, + "loss": 0.3451, + "step": 3384 + }, + { + "epoch": 0.2887732468861969, + "grad_norm": 1.3642925894256581, + "learning_rate": 8.344626229371107e-05, + "loss": 0.332, + "step": 3385 + }, + { + "epoch": 0.28885855656031395, + "grad_norm": 1.2056193071688754, + "learning_rate": 8.34359917160348e-05, + "loss": 0.2973, + "step": 3386 + }, + { + "epoch": 0.28894386623443097, + "grad_norm": 1.5309818578002727, + "learning_rate": 8.342571858569826e-05, + "loss": 0.3312, + "step": 3387 + }, + { + "epoch": 0.28902917590854804, + "grad_norm": 1.5877089355424894, + "learning_rate": 8.341544290348572e-05, + "loss": 0.3654, + "step": 3388 + }, + { + "epoch": 0.28911448558266506, + "grad_norm": 1.2784254061495903, + "learning_rate": 8.340516467018171e-05, + "loss": 0.3216, + "step": 3389 + }, + { + "epoch": 0.28919979525678213, + "grad_norm": 1.2509523578479016, + "learning_rate": 8.339488388657089e-05, + "loss": 0.3107, + "step": 3390 + }, + { + "epoch": 0.28928510493089915, + "grad_norm": 1.2841026370064597, + "learning_rate": 8.338460055343812e-05, + "loss": 0.3082, + "step": 3391 + }, + { + "epoch": 0.2893704146050162, + "grad_norm": 1.6480591687858084, + "learning_rate": 8.337431467156851e-05, + "loss": 0.3823, + "step": 3392 + }, + { + "epoch": 0.28945572427913324, + "grad_norm": 1.721390793526585, + "learning_rate": 8.336402624174734e-05, + "loss": 0.3525, + "step": 3393 + }, + { + "epoch": 0.2895410339532503, + "grad_norm": 1.3593824978325686, + "learning_rate": 8.335373526476005e-05, + "loss": 0.3284, + "step": 3394 + }, + { + "epoch": 0.28962634362736733, + "grad_norm": 1.4867856177538812, + "learning_rate": 8.334344174139233e-05, + "loss": 0.3189, + "step": 3395 + }, + { + "epoch": 0.2897116533014844, + "grad_norm": 1.4039438623687421, + "learning_rate": 8.333314567243e-05, + "loss": 0.3981, + "step": 3396 + }, + { + "epoch": 0.2897969629756014, + "grad_norm": 1.490273251407635, + "learning_rate": 8.332284705865914e-05, + "loss": 0.2939, + "step": 3397 + }, + { + "epoch": 0.2898822726497185, + "grad_norm": 1.677458095339819, + "learning_rate": 8.331254590086597e-05, + "loss": 0.3516, + "step": 3398 + }, + { + "epoch": 0.2899675823238355, + "grad_norm": 1.4311995718122155, + "learning_rate": 8.330224219983695e-05, + "loss": 0.2987, + "step": 3399 + }, + { + "epoch": 0.2900528919979526, + "grad_norm": 1.533300000671534, + "learning_rate": 8.329193595635872e-05, + "loss": 0.3707, + "step": 3400 + }, + { + "epoch": 0.2901382016720696, + "grad_norm": 1.5251504511176304, + "learning_rate": 8.32816271712181e-05, + "loss": 0.3521, + "step": 3401 + }, + { + "epoch": 0.2902235113461867, + "grad_norm": 1.4859472681600923, + "learning_rate": 8.327131584520207e-05, + "loss": 0.324, + "step": 3402 + }, + { + "epoch": 0.2903088210203037, + "grad_norm": 1.363403314847807, + "learning_rate": 8.32610019790979e-05, + "loss": 0.3194, + "step": 3403 + }, + { + "epoch": 0.29039413069442077, + "grad_norm": 1.4413873107823754, + "learning_rate": 8.325068557369298e-05, + "loss": 0.3487, + "step": 3404 + }, + { + "epoch": 0.2904794403685378, + "grad_norm": 1.6214548467136018, + "learning_rate": 8.32403666297749e-05, + "loss": 0.2945, + "step": 3405 + }, + { + "epoch": 0.29056475004265486, + "grad_norm": 1.3742030174825177, + "learning_rate": 8.323004514813148e-05, + "loss": 0.3227, + "step": 3406 + }, + { + "epoch": 0.2906500597167719, + "grad_norm": 1.3900560222286302, + "learning_rate": 8.321972112955068e-05, + "loss": 0.2836, + "step": 3407 + }, + { + "epoch": 0.29073536939088895, + "grad_norm": 1.4362718892234714, + "learning_rate": 8.320939457482072e-05, + "loss": 0.2883, + "step": 3408 + }, + { + "epoch": 0.29082067906500597, + "grad_norm": 1.2848021416535824, + "learning_rate": 8.319906548472993e-05, + "loss": 0.2889, + "step": 3409 + }, + { + "epoch": 0.290905988739123, + "grad_norm": 1.5149369398533319, + "learning_rate": 8.318873386006693e-05, + "loss": 0.352, + "step": 3410 + }, + { + "epoch": 0.29099129841324006, + "grad_norm": 1.7339133945691558, + "learning_rate": 8.317839970162047e-05, + "loss": 0.3418, + "step": 3411 + }, + { + "epoch": 0.2910766080873571, + "grad_norm": 0.9681992965175608, + "learning_rate": 8.31680630101795e-05, + "loss": 0.2801, + "step": 3412 + }, + { + "epoch": 0.29116191776147415, + "grad_norm": 1.4678988323848137, + "learning_rate": 8.315772378653317e-05, + "loss": 0.3332, + "step": 3413 + }, + { + "epoch": 0.29124722743559117, + "grad_norm": 1.5605346526430002, + "learning_rate": 8.314738203147084e-05, + "loss": 0.3591, + "step": 3414 + }, + { + "epoch": 0.29133253710970825, + "grad_norm": 1.726231193688644, + "learning_rate": 8.3137037745782e-05, + "loss": 0.345, + "step": 3415 + }, + { + "epoch": 0.29141784678382526, + "grad_norm": 1.2490377059968574, + "learning_rate": 8.312669093025645e-05, + "loss": 0.302, + "step": 3416 + }, + { + "epoch": 0.29150315645794234, + "grad_norm": 1.4277102446180319, + "learning_rate": 8.311634158568408e-05, + "loss": 0.316, + "step": 3417 + }, + { + "epoch": 0.29158846613205935, + "grad_norm": 1.351680992974035, + "learning_rate": 8.3105989712855e-05, + "loss": 0.2838, + "step": 3418 + }, + { + "epoch": 0.2916737758061764, + "grad_norm": 1.52779902626443, + "learning_rate": 8.309563531255955e-05, + "loss": 0.4026, + "step": 3419 + }, + { + "epoch": 0.29175908548029345, + "grad_norm": 1.2502719106571611, + "learning_rate": 8.308527838558819e-05, + "loss": 0.3148, + "step": 3420 + }, + { + "epoch": 0.2918443951544105, + "grad_norm": 1.2251514652578621, + "learning_rate": 8.307491893273165e-05, + "loss": 0.3011, + "step": 3421 + }, + { + "epoch": 0.29192970482852754, + "grad_norm": 1.8325679944422664, + "learning_rate": 8.306455695478081e-05, + "loss": 0.3796, + "step": 3422 + }, + { + "epoch": 0.2920150145026446, + "grad_norm": 1.6559799262041828, + "learning_rate": 8.305419245252676e-05, + "loss": 0.3598, + "step": 3423 + }, + { + "epoch": 0.2921003241767616, + "grad_norm": 1.4069419430058494, + "learning_rate": 8.304382542676075e-05, + "loss": 0.3313, + "step": 3424 + }, + { + "epoch": 0.2921856338508787, + "grad_norm": 1.4625280361259636, + "learning_rate": 8.303345587827427e-05, + "loss": 0.3359, + "step": 3425 + }, + { + "epoch": 0.2922709435249957, + "grad_norm": 1.373917326798478, + "learning_rate": 8.302308380785898e-05, + "loss": 0.2769, + "step": 3426 + }, + { + "epoch": 0.2923562531991128, + "grad_norm": 1.9757294840760555, + "learning_rate": 8.301270921630673e-05, + "loss": 0.3485, + "step": 3427 + }, + { + "epoch": 0.2924415628732298, + "grad_norm": 1.148250227210755, + "learning_rate": 8.300233210440954e-05, + "loss": 0.3037, + "step": 3428 + }, + { + "epoch": 0.2925268725473469, + "grad_norm": 1.440160611123179, + "learning_rate": 8.299195247295968e-05, + "loss": 0.2948, + "step": 3429 + }, + { + "epoch": 0.2926121822214639, + "grad_norm": 1.664592103453614, + "learning_rate": 8.298157032274957e-05, + "loss": 0.3025, + "step": 3430 + }, + { + "epoch": 0.292697491895581, + "grad_norm": 1.3610333379380388, + "learning_rate": 8.297118565457182e-05, + "loss": 0.3207, + "step": 3431 + }, + { + "epoch": 0.292782801569698, + "grad_norm": 1.4490486312508564, + "learning_rate": 8.296079846921927e-05, + "loss": 0.3186, + "step": 3432 + }, + { + "epoch": 0.29286811124381507, + "grad_norm": 1.5323132307487544, + "learning_rate": 8.295040876748489e-05, + "loss": 0.3311, + "step": 3433 + }, + { + "epoch": 0.2929534209179321, + "grad_norm": 1.3691165067586395, + "learning_rate": 8.294001655016192e-05, + "loss": 0.3053, + "step": 3434 + }, + { + "epoch": 0.29303873059204916, + "grad_norm": 1.444070478267461, + "learning_rate": 8.292962181804372e-05, + "loss": 0.3009, + "step": 3435 + }, + { + "epoch": 0.2931240402661662, + "grad_norm": 1.4773930227211731, + "learning_rate": 8.291922457192387e-05, + "loss": 0.319, + "step": 3436 + }, + { + "epoch": 0.29320934994028325, + "grad_norm": 1.2958170240710825, + "learning_rate": 8.290882481259618e-05, + "loss": 0.3656, + "step": 3437 + }, + { + "epoch": 0.29329465961440027, + "grad_norm": 1.5111941035860943, + "learning_rate": 8.289842254085458e-05, + "loss": 0.3429, + "step": 3438 + }, + { + "epoch": 0.29337996928851734, + "grad_norm": 1.360203063264417, + "learning_rate": 8.288801775749323e-05, + "loss": 0.3142, + "step": 3439 + }, + { + "epoch": 0.29346527896263436, + "grad_norm": 1.6028945965491568, + "learning_rate": 8.28776104633065e-05, + "loss": 0.356, + "step": 3440 + }, + { + "epoch": 0.29355058863675143, + "grad_norm": 1.450765765985163, + "learning_rate": 8.286720065908893e-05, + "loss": 0.3469, + "step": 3441 + }, + { + "epoch": 0.29363589831086845, + "grad_norm": 1.4420325926537712, + "learning_rate": 8.285678834563524e-05, + "loss": 0.317, + "step": 3442 + }, + { + "epoch": 0.2937212079849855, + "grad_norm": 1.6868457938934722, + "learning_rate": 8.284637352374037e-05, + "loss": 0.3305, + "step": 3443 + }, + { + "epoch": 0.29380651765910254, + "grad_norm": 1.8434097493332207, + "learning_rate": 8.283595619419941e-05, + "loss": 0.3622, + "step": 3444 + }, + { + "epoch": 0.2938918273332196, + "grad_norm": 1.5724568315897054, + "learning_rate": 8.28255363578077e-05, + "loss": 0.3252, + "step": 3445 + }, + { + "epoch": 0.29397713700733663, + "grad_norm": 1.5780334446098185, + "learning_rate": 8.281511401536071e-05, + "loss": 0.3603, + "step": 3446 + }, + { + "epoch": 0.2940624466814537, + "grad_norm": 1.2046962238000847, + "learning_rate": 8.280468916765415e-05, + "loss": 0.2795, + "step": 3447 + }, + { + "epoch": 0.2941477563555707, + "grad_norm": 1.6318954009494275, + "learning_rate": 8.27942618154839e-05, + "loss": 0.3597, + "step": 3448 + }, + { + "epoch": 0.29423306602968774, + "grad_norm": 1.1910937290224175, + "learning_rate": 8.278383195964601e-05, + "loss": 0.2819, + "step": 3449 + }, + { + "epoch": 0.2943183757038048, + "grad_norm": 1.4413839198918188, + "learning_rate": 8.27733996009368e-05, + "loss": 0.3331, + "step": 3450 + }, + { + "epoch": 0.29440368537792183, + "grad_norm": 1.571867670910473, + "learning_rate": 8.276296474015266e-05, + "loss": 0.3538, + "step": 3451 + }, + { + "epoch": 0.2944889950520389, + "grad_norm": 1.568930022104512, + "learning_rate": 8.275252737809028e-05, + "loss": 0.3814, + "step": 3452 + }, + { + "epoch": 0.2945743047261559, + "grad_norm": 1.3237730193185386, + "learning_rate": 8.274208751554646e-05, + "loss": 0.3349, + "step": 3453 + }, + { + "epoch": 0.294659614400273, + "grad_norm": 1.5378275917706472, + "learning_rate": 8.273164515331826e-05, + "loss": 0.344, + "step": 3454 + }, + { + "epoch": 0.29474492407439, + "grad_norm": 1.4334352494016933, + "learning_rate": 8.272120029220289e-05, + "loss": 0.3028, + "step": 3455 + }, + { + "epoch": 0.2948302337485071, + "grad_norm": 1.4441953723488363, + "learning_rate": 8.271075293299777e-05, + "loss": 0.3171, + "step": 3456 + }, + { + "epoch": 0.2949155434226241, + "grad_norm": 1.4836027847183682, + "learning_rate": 8.270030307650048e-05, + "loss": 0.2913, + "step": 3457 + }, + { + "epoch": 0.2950008530967412, + "grad_norm": 1.303984309763213, + "learning_rate": 8.268985072350882e-05, + "loss": 0.3008, + "step": 3458 + }, + { + "epoch": 0.2950861627708582, + "grad_norm": 1.4594149075591922, + "learning_rate": 8.267939587482077e-05, + "loss": 0.3156, + "step": 3459 + }, + { + "epoch": 0.29517147244497527, + "grad_norm": 1.6561400179064367, + "learning_rate": 8.266893853123447e-05, + "loss": 0.3401, + "step": 3460 + }, + { + "epoch": 0.2952567821190923, + "grad_norm": 1.381332726396209, + "learning_rate": 8.265847869354836e-05, + "loss": 0.3072, + "step": 3461 + }, + { + "epoch": 0.29534209179320936, + "grad_norm": 1.3960513850840974, + "learning_rate": 8.264801636256094e-05, + "loss": 0.2893, + "step": 3462 + }, + { + "epoch": 0.2954274014673264, + "grad_norm": 1.429848083482857, + "learning_rate": 8.263755153907095e-05, + "loss": 0.3318, + "step": 3463 + }, + { + "epoch": 0.29551271114144345, + "grad_norm": 1.4741545727400653, + "learning_rate": 8.262708422387735e-05, + "loss": 0.2935, + "step": 3464 + }, + { + "epoch": 0.29559802081556047, + "grad_norm": 1.317463617130108, + "learning_rate": 8.261661441777924e-05, + "loss": 0.302, + "step": 3465 + }, + { + "epoch": 0.29568333048967754, + "grad_norm": 1.463816941552411, + "learning_rate": 8.260614212157593e-05, + "loss": 0.3486, + "step": 3466 + }, + { + "epoch": 0.29576864016379456, + "grad_norm": 1.3410305627957078, + "learning_rate": 8.259566733606696e-05, + "loss": 0.3385, + "step": 3467 + }, + { + "epoch": 0.29585394983791163, + "grad_norm": 1.5092849262285268, + "learning_rate": 8.2585190062052e-05, + "loss": 0.2836, + "step": 3468 + }, + { + "epoch": 0.29593925951202865, + "grad_norm": 1.6132805849678942, + "learning_rate": 8.257471030033092e-05, + "loss": 0.3442, + "step": 3469 + }, + { + "epoch": 0.2960245691861457, + "grad_norm": 1.4804370242622267, + "learning_rate": 8.256422805170383e-05, + "loss": 0.3526, + "step": 3470 + }, + { + "epoch": 0.29610987886026274, + "grad_norm": 1.3551549190909238, + "learning_rate": 8.255374331697097e-05, + "loss": 0.3233, + "step": 3471 + }, + { + "epoch": 0.2961951885343798, + "grad_norm": 1.7387602071236237, + "learning_rate": 8.25432560969328e-05, + "loss": 0.3514, + "step": 3472 + }, + { + "epoch": 0.29628049820849683, + "grad_norm": 1.5495223693808586, + "learning_rate": 8.253276639238995e-05, + "loss": 0.3899, + "step": 3473 + }, + { + "epoch": 0.2963658078826139, + "grad_norm": 1.394170206813722, + "learning_rate": 8.252227420414327e-05, + "loss": 0.302, + "step": 3474 + }, + { + "epoch": 0.2964511175567309, + "grad_norm": 1.6571335505120848, + "learning_rate": 8.251177953299379e-05, + "loss": 0.3432, + "step": 3475 + }, + { + "epoch": 0.296536427230848, + "grad_norm": 1.5842558114329137, + "learning_rate": 8.250128237974268e-05, + "loss": 0.3228, + "step": 3476 + }, + { + "epoch": 0.296621736904965, + "grad_norm": 1.43220057342309, + "learning_rate": 8.24907827451914e-05, + "loss": 0.3179, + "step": 3477 + }, + { + "epoch": 0.2967070465790821, + "grad_norm": 1.468577435684217, + "learning_rate": 8.24802806301415e-05, + "loss": 0.3687, + "step": 3478 + }, + { + "epoch": 0.2967923562531991, + "grad_norm": 1.3651802368781192, + "learning_rate": 8.246977603539478e-05, + "loss": 0.3039, + "step": 3479 + }, + { + "epoch": 0.2968776659273162, + "grad_norm": 1.4460244564557727, + "learning_rate": 8.245926896175321e-05, + "loss": 0.3219, + "step": 3480 + }, + { + "epoch": 0.2969629756014332, + "grad_norm": 1.9085470649046878, + "learning_rate": 8.244875941001893e-05, + "loss": 0.3727, + "step": 3481 + }, + { + "epoch": 0.2970482852755503, + "grad_norm": 1.4724568603880168, + "learning_rate": 8.243824738099431e-05, + "loss": 0.3091, + "step": 3482 + }, + { + "epoch": 0.2971335949496673, + "grad_norm": 1.393149110389477, + "learning_rate": 8.242773287548187e-05, + "loss": 0.3274, + "step": 3483 + }, + { + "epoch": 0.29721890462378436, + "grad_norm": 1.3481992650345669, + "learning_rate": 8.241721589428435e-05, + "loss": 0.3543, + "step": 3484 + }, + { + "epoch": 0.2973042142979014, + "grad_norm": 1.52685585912511, + "learning_rate": 8.240669643820467e-05, + "loss": 0.3171, + "step": 3485 + }, + { + "epoch": 0.2973895239720184, + "grad_norm": 1.4154338615001154, + "learning_rate": 8.239617450804591e-05, + "loss": 0.3373, + "step": 3486 + }, + { + "epoch": 0.2974748336461355, + "grad_norm": 1.50051664356025, + "learning_rate": 8.238565010461138e-05, + "loss": 0.3636, + "step": 3487 + }, + { + "epoch": 0.2975601433202525, + "grad_norm": 1.3840391677402235, + "learning_rate": 8.237512322870458e-05, + "loss": 0.3733, + "step": 3488 + }, + { + "epoch": 0.29764545299436956, + "grad_norm": 1.4038474858742958, + "learning_rate": 8.236459388112916e-05, + "loss": 0.3462, + "step": 3489 + }, + { + "epoch": 0.2977307626684866, + "grad_norm": 1.4079130723193687, + "learning_rate": 8.235406206268898e-05, + "loss": 0.3442, + "step": 3490 + }, + { + "epoch": 0.29781607234260365, + "grad_norm": 1.495159763998199, + "learning_rate": 8.234352777418808e-05, + "loss": 0.3102, + "step": 3491 + }, + { + "epoch": 0.2979013820167207, + "grad_norm": 1.5226414875001886, + "learning_rate": 8.23329910164307e-05, + "loss": 0.3633, + "step": 3492 + }, + { + "epoch": 0.29798669169083775, + "grad_norm": 1.5617544302745479, + "learning_rate": 8.23224517902213e-05, + "loss": 0.3509, + "step": 3493 + }, + { + "epoch": 0.29807200136495476, + "grad_norm": 1.2572168871057035, + "learning_rate": 8.231191009636446e-05, + "loss": 0.3218, + "step": 3494 + }, + { + "epoch": 0.29815731103907184, + "grad_norm": 1.5668859341897605, + "learning_rate": 8.230136593566497e-05, + "loss": 0.2813, + "step": 3495 + }, + { + "epoch": 0.29824262071318886, + "grad_norm": 1.451052428472866, + "learning_rate": 8.229081930892786e-05, + "loss": 0.3299, + "step": 3496 + }, + { + "epoch": 0.29832793038730593, + "grad_norm": 1.6496811645409282, + "learning_rate": 8.228027021695827e-05, + "loss": 0.3637, + "step": 3497 + }, + { + "epoch": 0.29841324006142295, + "grad_norm": 1.1094940551488524, + "learning_rate": 8.226971866056161e-05, + "loss": 0.3031, + "step": 3498 + }, + { + "epoch": 0.29849854973554, + "grad_norm": 1.6073377733257608, + "learning_rate": 8.225916464054341e-05, + "loss": 0.3301, + "step": 3499 + }, + { + "epoch": 0.29858385940965704, + "grad_norm": 1.5302081553660531, + "learning_rate": 8.22486081577094e-05, + "loss": 0.3484, + "step": 3500 + }, + { + "epoch": 0.2986691690837741, + "grad_norm": 1.3404344863505548, + "learning_rate": 8.223804921286553e-05, + "loss": 0.2839, + "step": 3501 + }, + { + "epoch": 0.29875447875789113, + "grad_norm": 1.5702986835232504, + "learning_rate": 8.22274878068179e-05, + "loss": 0.3547, + "step": 3502 + }, + { + "epoch": 0.2988397884320082, + "grad_norm": 1.3474028860565483, + "learning_rate": 8.221692394037286e-05, + "loss": 0.2716, + "step": 3503 + }, + { + "epoch": 0.2989250981061252, + "grad_norm": 1.6138578035852078, + "learning_rate": 8.220635761433687e-05, + "loss": 0.2915, + "step": 3504 + }, + { + "epoch": 0.2990104077802423, + "grad_norm": 1.44875029938294, + "learning_rate": 8.219578882951662e-05, + "loss": 0.2868, + "step": 3505 + }, + { + "epoch": 0.2990957174543593, + "grad_norm": 1.364491709576698, + "learning_rate": 8.218521758671897e-05, + "loss": 0.2744, + "step": 3506 + }, + { + "epoch": 0.2991810271284764, + "grad_norm": 1.4733440705306573, + "learning_rate": 8.2174643886751e-05, + "loss": 0.3361, + "step": 3507 + }, + { + "epoch": 0.2992663368025934, + "grad_norm": 1.1923815628032712, + "learning_rate": 8.216406773041994e-05, + "loss": 0.2954, + "step": 3508 + }, + { + "epoch": 0.2993516464767105, + "grad_norm": 1.5835538342512008, + "learning_rate": 8.215348911853324e-05, + "loss": 0.2817, + "step": 3509 + }, + { + "epoch": 0.2994369561508275, + "grad_norm": 1.5747045330648788, + "learning_rate": 8.21429080518985e-05, + "loss": 0.3485, + "step": 3510 + }, + { + "epoch": 0.29952226582494457, + "grad_norm": 1.197780131423429, + "learning_rate": 8.213232453132353e-05, + "loss": 0.2455, + "step": 3511 + }, + { + "epoch": 0.2996075754990616, + "grad_norm": 1.2264207891738952, + "learning_rate": 8.212173855761636e-05, + "loss": 0.2959, + "step": 3512 + }, + { + "epoch": 0.29969288517317866, + "grad_norm": 1.4052128676031928, + "learning_rate": 8.211115013158512e-05, + "loss": 0.3734, + "step": 3513 + }, + { + "epoch": 0.2997781948472957, + "grad_norm": 1.5455395106556813, + "learning_rate": 8.210055925403821e-05, + "loss": 0.3531, + "step": 3514 + }, + { + "epoch": 0.29986350452141275, + "grad_norm": 1.5549262548788518, + "learning_rate": 8.208996592578417e-05, + "loss": 0.3399, + "step": 3515 + }, + { + "epoch": 0.29994881419552977, + "grad_norm": 1.2812181794471804, + "learning_rate": 8.207937014763178e-05, + "loss": 0.3094, + "step": 3516 + }, + { + "epoch": 0.30003412386964684, + "grad_norm": 1.4697886913259532, + "learning_rate": 8.206877192038995e-05, + "loss": 0.3201, + "step": 3517 + }, + { + "epoch": 0.30011943354376386, + "grad_norm": 1.6951736885585216, + "learning_rate": 8.205817124486779e-05, + "loss": 0.374, + "step": 3518 + }, + { + "epoch": 0.30020474321788093, + "grad_norm": 1.1699261545857662, + "learning_rate": 8.204756812187461e-05, + "loss": 0.3483, + "step": 3519 + }, + { + "epoch": 0.30029005289199795, + "grad_norm": 1.5324657051767014, + "learning_rate": 8.203696255221991e-05, + "loss": 0.2725, + "step": 3520 + }, + { + "epoch": 0.300375362566115, + "grad_norm": 1.4442883136111926, + "learning_rate": 8.202635453671335e-05, + "loss": 0.3516, + "step": 3521 + }, + { + "epoch": 0.30046067224023204, + "grad_norm": 1.3034174328002148, + "learning_rate": 8.201574407616483e-05, + "loss": 0.3164, + "step": 3522 + }, + { + "epoch": 0.3005459819143491, + "grad_norm": 1.535124023720473, + "learning_rate": 8.200513117138435e-05, + "loss": 0.3036, + "step": 3523 + }, + { + "epoch": 0.30063129158846613, + "grad_norm": 1.4373355439693412, + "learning_rate": 8.199451582318221e-05, + "loss": 0.2797, + "step": 3524 + }, + { + "epoch": 0.30071660126258315, + "grad_norm": 1.5159483889045384, + "learning_rate": 8.19838980323688e-05, + "loss": 0.3032, + "step": 3525 + }, + { + "epoch": 0.3008019109367002, + "grad_norm": 1.5149891099443988, + "learning_rate": 8.197327779975473e-05, + "loss": 0.3236, + "step": 3526 + }, + { + "epoch": 0.30088722061081724, + "grad_norm": 1.4368220679517012, + "learning_rate": 8.196265512615081e-05, + "loss": 0.3333, + "step": 3527 + }, + { + "epoch": 0.3009725302849343, + "grad_norm": 1.7625128400618977, + "learning_rate": 8.195203001236802e-05, + "loss": 0.3499, + "step": 3528 + }, + { + "epoch": 0.30105783995905133, + "grad_norm": 1.3324641533873456, + "learning_rate": 8.194140245921753e-05, + "loss": 0.3002, + "step": 3529 + }, + { + "epoch": 0.3011431496331684, + "grad_norm": 1.3410829202160706, + "learning_rate": 8.19307724675107e-05, + "loss": 0.36, + "step": 3530 + }, + { + "epoch": 0.3012284593072854, + "grad_norm": 1.7145381298881435, + "learning_rate": 8.192014003805907e-05, + "loss": 0.3387, + "step": 3531 + }, + { + "epoch": 0.3013137689814025, + "grad_norm": 1.632233001232514, + "learning_rate": 8.190950517167437e-05, + "loss": 0.2976, + "step": 3532 + }, + { + "epoch": 0.3013990786555195, + "grad_norm": 1.6223231054897655, + "learning_rate": 8.189886786916853e-05, + "loss": 0.3178, + "step": 3533 + }, + { + "epoch": 0.3014843883296366, + "grad_norm": 1.5235387523944401, + "learning_rate": 8.188822813135362e-05, + "loss": 0.3427, + "step": 3534 + }, + { + "epoch": 0.3015696980037536, + "grad_norm": 1.7774107532657117, + "learning_rate": 8.187758595904196e-05, + "loss": 0.4158, + "step": 3535 + }, + { + "epoch": 0.3016550076778707, + "grad_norm": 1.4625039907547714, + "learning_rate": 8.1866941353046e-05, + "loss": 0.2917, + "step": 3536 + }, + { + "epoch": 0.3017403173519877, + "grad_norm": 1.2824923377152075, + "learning_rate": 8.18562943141784e-05, + "loss": 0.2818, + "step": 3537 + }, + { + "epoch": 0.30182562702610477, + "grad_norm": 1.5409064695428558, + "learning_rate": 8.184564484325204e-05, + "loss": 0.2822, + "step": 3538 + }, + { + "epoch": 0.3019109367002218, + "grad_norm": 1.3386962247247451, + "learning_rate": 8.18349929410799e-05, + "loss": 0.3424, + "step": 3539 + }, + { + "epoch": 0.30199624637433886, + "grad_norm": 1.3812676545132523, + "learning_rate": 8.182433860847524e-05, + "loss": 0.2843, + "step": 3540 + }, + { + "epoch": 0.3020815560484559, + "grad_norm": 1.2875284951028867, + "learning_rate": 8.181368184625143e-05, + "loss": 0.3509, + "step": 3541 + }, + { + "epoch": 0.30216686572257295, + "grad_norm": 1.5214696450065213, + "learning_rate": 8.180302265522206e-05, + "loss": 0.2726, + "step": 3542 + }, + { + "epoch": 0.30225217539668997, + "grad_norm": 1.3303073679901414, + "learning_rate": 8.179236103620094e-05, + "loss": 0.2794, + "step": 3543 + }, + { + "epoch": 0.30233748507080704, + "grad_norm": 1.0721000168282195, + "learning_rate": 8.178169699000198e-05, + "loss": 0.3012, + "step": 3544 + }, + { + "epoch": 0.30242279474492406, + "grad_norm": 1.3327092657324004, + "learning_rate": 8.177103051743932e-05, + "loss": 0.2695, + "step": 3545 + }, + { + "epoch": 0.30250810441904114, + "grad_norm": 1.3417003880087615, + "learning_rate": 8.176036161932734e-05, + "loss": 0.3178, + "step": 3546 + }, + { + "epoch": 0.30259341409315815, + "grad_norm": 1.5288138180853326, + "learning_rate": 8.174969029648052e-05, + "loss": 0.3115, + "step": 3547 + }, + { + "epoch": 0.3026787237672752, + "grad_norm": 1.4146272153419892, + "learning_rate": 8.173901654971357e-05, + "loss": 0.3062, + "step": 3548 + }, + { + "epoch": 0.30276403344139224, + "grad_norm": 1.9640668239257608, + "learning_rate": 8.172834037984137e-05, + "loss": 0.387, + "step": 3549 + }, + { + "epoch": 0.3028493431155093, + "grad_norm": 1.5847944076967044, + "learning_rate": 8.171766178767897e-05, + "loss": 0.3292, + "step": 3550 + }, + { + "epoch": 0.30293465278962634, + "grad_norm": 1.2315346596936414, + "learning_rate": 8.170698077404165e-05, + "loss": 0.3066, + "step": 3551 + }, + { + "epoch": 0.3030199624637434, + "grad_norm": 1.6023618610610273, + "learning_rate": 8.169629733974482e-05, + "loss": 0.3601, + "step": 3552 + }, + { + "epoch": 0.3031052721378604, + "grad_norm": 1.2986138877000324, + "learning_rate": 8.168561148560414e-05, + "loss": 0.3029, + "step": 3553 + }, + { + "epoch": 0.3031905818119775, + "grad_norm": 1.3816782291850394, + "learning_rate": 8.167492321243539e-05, + "loss": 0.3233, + "step": 3554 + }, + { + "epoch": 0.3032758914860945, + "grad_norm": 1.3255857540437934, + "learning_rate": 8.166423252105458e-05, + "loss": 0.3437, + "step": 3555 + }, + { + "epoch": 0.3033612011602116, + "grad_norm": 1.2761829479660753, + "learning_rate": 8.165353941227789e-05, + "loss": 0.3667, + "step": 3556 + }, + { + "epoch": 0.3034465108343286, + "grad_norm": 1.529998968909421, + "learning_rate": 8.164284388692166e-05, + "loss": 0.3511, + "step": 3557 + }, + { + "epoch": 0.3035318205084457, + "grad_norm": 1.364590297807388, + "learning_rate": 8.163214594580245e-05, + "loss": 0.2867, + "step": 3558 + }, + { + "epoch": 0.3036171301825627, + "grad_norm": 1.6363331925565703, + "learning_rate": 8.1621445589737e-05, + "loss": 0.3281, + "step": 3559 + }, + { + "epoch": 0.3037024398566798, + "grad_norm": 1.5527255459900322, + "learning_rate": 8.161074281954219e-05, + "loss": 0.3211, + "step": 3560 + }, + { + "epoch": 0.3037877495307968, + "grad_norm": 1.4530528881269154, + "learning_rate": 8.160003763603516e-05, + "loss": 0.3243, + "step": 3561 + }, + { + "epoch": 0.30387305920491386, + "grad_norm": 1.4642926209316791, + "learning_rate": 8.158933004003319e-05, + "loss": 0.3228, + "step": 3562 + }, + { + "epoch": 0.3039583688790309, + "grad_norm": 1.3902791482565307, + "learning_rate": 8.157862003235373e-05, + "loss": 0.2909, + "step": 3563 + }, + { + "epoch": 0.3040436785531479, + "grad_norm": 1.2190863805337258, + "learning_rate": 8.156790761381444e-05, + "loss": 0.2788, + "step": 3564 + }, + { + "epoch": 0.304128988227265, + "grad_norm": 1.2962757702494117, + "learning_rate": 8.155719278523316e-05, + "loss": 0.3269, + "step": 3565 + }, + { + "epoch": 0.304214297901382, + "grad_norm": 1.583933197963333, + "learning_rate": 8.154647554742789e-05, + "loss": 0.2952, + "step": 3566 + }, + { + "epoch": 0.30429960757549906, + "grad_norm": 1.764371714087253, + "learning_rate": 8.153575590121686e-05, + "loss": 0.3625, + "step": 3567 + }, + { + "epoch": 0.3043849172496161, + "grad_norm": 1.583031910095734, + "learning_rate": 8.152503384741846e-05, + "loss": 0.369, + "step": 3568 + }, + { + "epoch": 0.30447022692373316, + "grad_norm": 1.4602315900622067, + "learning_rate": 8.151430938685123e-05, + "loss": 0.3834, + "step": 3569 + }, + { + "epoch": 0.3045555365978502, + "grad_norm": 1.439861762378643, + "learning_rate": 8.150358252033394e-05, + "loss": 0.306, + "step": 3570 + }, + { + "epoch": 0.30464084627196725, + "grad_norm": 1.9153408219354495, + "learning_rate": 8.149285324868554e-05, + "loss": 0.3586, + "step": 3571 + }, + { + "epoch": 0.30472615594608427, + "grad_norm": 1.477629906483165, + "learning_rate": 8.148212157272517e-05, + "loss": 0.2707, + "step": 3572 + }, + { + "epoch": 0.30481146562020134, + "grad_norm": 1.2465691212793606, + "learning_rate": 8.14713874932721e-05, + "loss": 0.3307, + "step": 3573 + }, + { + "epoch": 0.30489677529431836, + "grad_norm": 1.636057717935035, + "learning_rate": 8.146065101114581e-05, + "loss": 0.3345, + "step": 3574 + }, + { + "epoch": 0.30498208496843543, + "grad_norm": 1.4130805805687379, + "learning_rate": 8.144991212716603e-05, + "loss": 0.3473, + "step": 3575 + }, + { + "epoch": 0.30506739464255245, + "grad_norm": 1.4746202076899293, + "learning_rate": 8.143917084215256e-05, + "loss": 0.3235, + "step": 3576 + }, + { + "epoch": 0.3051527043166695, + "grad_norm": 1.4596257976540583, + "learning_rate": 8.142842715692548e-05, + "loss": 0.3555, + "step": 3577 + }, + { + "epoch": 0.30523801399078654, + "grad_norm": 1.385400623811846, + "learning_rate": 8.141768107230498e-05, + "loss": 0.3626, + "step": 3578 + }, + { + "epoch": 0.3053233236649036, + "grad_norm": 1.5070231847314883, + "learning_rate": 8.140693258911151e-05, + "loss": 0.325, + "step": 3579 + }, + { + "epoch": 0.30540863333902063, + "grad_norm": 1.2832228660516554, + "learning_rate": 8.139618170816562e-05, + "loss": 0.3216, + "step": 3580 + }, + { + "epoch": 0.3054939430131377, + "grad_norm": 1.4572536922557924, + "learning_rate": 8.13854284302881e-05, + "loss": 0.3257, + "step": 3581 + }, + { + "epoch": 0.3055792526872547, + "grad_norm": 1.4285053306002127, + "learning_rate": 8.137467275629988e-05, + "loss": 0.3353, + "step": 3582 + }, + { + "epoch": 0.3056645623613718, + "grad_norm": 1.3397401819711119, + "learning_rate": 8.136391468702214e-05, + "loss": 0.2797, + "step": 3583 + }, + { + "epoch": 0.3057498720354888, + "grad_norm": 1.5260889699155487, + "learning_rate": 8.135315422327618e-05, + "loss": 0.3344, + "step": 3584 + }, + { + "epoch": 0.3058351817096059, + "grad_norm": 1.3658843014703155, + "learning_rate": 8.13423913658835e-05, + "loss": 0.3312, + "step": 3585 + }, + { + "epoch": 0.3059204913837229, + "grad_norm": 1.3195056142626134, + "learning_rate": 8.133162611566581e-05, + "loss": 0.2964, + "step": 3586 + }, + { + "epoch": 0.30600580105784, + "grad_norm": 1.5070951662802252, + "learning_rate": 8.132085847344493e-05, + "loss": 0.3243, + "step": 3587 + }, + { + "epoch": 0.306091110731957, + "grad_norm": 1.664252758343018, + "learning_rate": 8.131008844004295e-05, + "loss": 0.3642, + "step": 3588 + }, + { + "epoch": 0.30617642040607407, + "grad_norm": 1.335969444919145, + "learning_rate": 8.129931601628212e-05, + "loss": 0.3443, + "step": 3589 + }, + { + "epoch": 0.3062617300801911, + "grad_norm": 1.4762130404504286, + "learning_rate": 8.128854120298484e-05, + "loss": 0.3358, + "step": 3590 + }, + { + "epoch": 0.30634703975430816, + "grad_norm": 1.1969408898605605, + "learning_rate": 8.127776400097369e-05, + "loss": 0.291, + "step": 3591 + }, + { + "epoch": 0.3064323494284252, + "grad_norm": 1.1840130154661104, + "learning_rate": 8.126698441107146e-05, + "loss": 0.3007, + "step": 3592 + }, + { + "epoch": 0.30651765910254225, + "grad_norm": 1.4549758772800314, + "learning_rate": 8.125620243410113e-05, + "loss": 0.3426, + "step": 3593 + }, + { + "epoch": 0.30660296877665927, + "grad_norm": 1.0477214067581129, + "learning_rate": 8.124541807088587e-05, + "loss": 0.2318, + "step": 3594 + }, + { + "epoch": 0.30668827845077634, + "grad_norm": 1.4422035389976475, + "learning_rate": 8.123463132224893e-05, + "loss": 0.3122, + "step": 3595 + }, + { + "epoch": 0.30677358812489336, + "grad_norm": 1.2631619351671666, + "learning_rate": 8.122384218901389e-05, + "loss": 0.304, + "step": 3596 + }, + { + "epoch": 0.30685889779901043, + "grad_norm": 2.185165249230106, + "learning_rate": 8.121305067200442e-05, + "loss": 0.2589, + "step": 3597 + }, + { + "epoch": 0.30694420747312745, + "grad_norm": 1.4079143423825364, + "learning_rate": 8.120225677204441e-05, + "loss": 0.2595, + "step": 3598 + }, + { + "epoch": 0.3070295171472445, + "grad_norm": 1.7501583027765895, + "learning_rate": 8.11914604899579e-05, + "loss": 0.3522, + "step": 3599 + }, + { + "epoch": 0.30711482682136154, + "grad_norm": 1.437414830214042, + "learning_rate": 8.118066182656911e-05, + "loss": 0.3001, + "step": 3600 + }, + { + "epoch": 0.30720013649547856, + "grad_norm": 1.6969193960321889, + "learning_rate": 8.116986078270252e-05, + "loss": 0.2912, + "step": 3601 + }, + { + "epoch": 0.30728544616959563, + "grad_norm": 1.5887636456927758, + "learning_rate": 8.115905735918268e-05, + "loss": 0.3516, + "step": 3602 + }, + { + "epoch": 0.30737075584371265, + "grad_norm": 1.6084359401460373, + "learning_rate": 8.114825155683437e-05, + "loss": 0.3109, + "step": 3603 + }, + { + "epoch": 0.3074560655178297, + "grad_norm": 1.279836665997689, + "learning_rate": 8.113744337648259e-05, + "loss": 0.3112, + "step": 3604 + }, + { + "epoch": 0.30754137519194674, + "grad_norm": 1.0891060468814595, + "learning_rate": 8.112663281895248e-05, + "loss": 0.2937, + "step": 3605 + }, + { + "epoch": 0.3076266848660638, + "grad_norm": 1.490159819051607, + "learning_rate": 8.111581988506935e-05, + "loss": 0.3102, + "step": 3606 + }, + { + "epoch": 0.30771199454018083, + "grad_norm": 1.4211479256376478, + "learning_rate": 8.110500457565873e-05, + "loss": 0.3605, + "step": 3607 + }, + { + "epoch": 0.3077973042142979, + "grad_norm": 1.4294538463334896, + "learning_rate": 8.109418689154629e-05, + "loss": 0.2959, + "step": 3608 + }, + { + "epoch": 0.3078826138884149, + "grad_norm": 1.5354620165535078, + "learning_rate": 8.108336683355792e-05, + "loss": 0.3745, + "step": 3609 + }, + { + "epoch": 0.307967923562532, + "grad_norm": 1.437142120262007, + "learning_rate": 8.107254440251967e-05, + "loss": 0.3126, + "step": 3610 + }, + { + "epoch": 0.308053233236649, + "grad_norm": 1.5439487097725115, + "learning_rate": 8.106171959925779e-05, + "loss": 0.3361, + "step": 3611 + }, + { + "epoch": 0.3081385429107661, + "grad_norm": 1.6168644531755365, + "learning_rate": 8.105089242459866e-05, + "loss": 0.2885, + "step": 3612 + }, + { + "epoch": 0.3082238525848831, + "grad_norm": 1.376680301124599, + "learning_rate": 8.104006287936892e-05, + "loss": 0.3387, + "step": 3613 + }, + { + "epoch": 0.3083091622590002, + "grad_norm": 1.3830855364024144, + "learning_rate": 8.10292309643953e-05, + "loss": 0.3121, + "step": 3614 + }, + { + "epoch": 0.3083944719331172, + "grad_norm": 1.2391086064692225, + "learning_rate": 8.10183966805048e-05, + "loss": 0.2938, + "step": 3615 + }, + { + "epoch": 0.30847978160723427, + "grad_norm": 1.3481845870486784, + "learning_rate": 8.100756002852454e-05, + "loss": 0.3263, + "step": 3616 + }, + { + "epoch": 0.3085650912813513, + "grad_norm": 1.4338434402096012, + "learning_rate": 8.099672100928184e-05, + "loss": 0.2862, + "step": 3617 + }, + { + "epoch": 0.30865040095546836, + "grad_norm": 1.4004728966119238, + "learning_rate": 8.098587962360422e-05, + "loss": 0.2738, + "step": 3618 + }, + { + "epoch": 0.3087357106295854, + "grad_norm": 1.436710223638843, + "learning_rate": 8.097503587231933e-05, + "loss": 0.3221, + "step": 3619 + }, + { + "epoch": 0.30882102030370245, + "grad_norm": 1.4794903342861743, + "learning_rate": 8.096418975625508e-05, + "loss": 0.2773, + "step": 3620 + }, + { + "epoch": 0.30890632997781947, + "grad_norm": 1.1304414335903616, + "learning_rate": 8.095334127623947e-05, + "loss": 0.2963, + "step": 3621 + }, + { + "epoch": 0.30899163965193654, + "grad_norm": 1.6499924399462813, + "learning_rate": 8.094249043310073e-05, + "loss": 0.3482, + "step": 3622 + }, + { + "epoch": 0.30907694932605356, + "grad_norm": 1.7380816741558427, + "learning_rate": 8.09316372276673e-05, + "loss": 0.3273, + "step": 3623 + }, + { + "epoch": 0.30916225900017064, + "grad_norm": 1.6273357037708396, + "learning_rate": 8.09207816607677e-05, + "loss": 0.3566, + "step": 3624 + }, + { + "epoch": 0.30924756867428765, + "grad_norm": 1.8102239590338958, + "learning_rate": 8.090992373323077e-05, + "loss": 0.3145, + "step": 3625 + }, + { + "epoch": 0.3093328783484047, + "grad_norm": 1.3716726358051288, + "learning_rate": 8.08990634458854e-05, + "loss": 0.2828, + "step": 3626 + }, + { + "epoch": 0.30941818802252175, + "grad_norm": 1.4389022124879303, + "learning_rate": 8.088820079956074e-05, + "loss": 0.3365, + "step": 3627 + }, + { + "epoch": 0.3095034976966388, + "grad_norm": 1.5979822847437128, + "learning_rate": 8.087733579508609e-05, + "loss": 0.3726, + "step": 3628 + }, + { + "epoch": 0.30958880737075584, + "grad_norm": 1.52398890762097, + "learning_rate": 8.086646843329093e-05, + "loss": 0.3466, + "step": 3629 + }, + { + "epoch": 0.3096741170448729, + "grad_norm": 1.6026156800602114, + "learning_rate": 8.085559871500493e-05, + "loss": 0.3564, + "step": 3630 + }, + { + "epoch": 0.3097594267189899, + "grad_norm": 1.5661817280350046, + "learning_rate": 8.084472664105794e-05, + "loss": 0.3214, + "step": 3631 + }, + { + "epoch": 0.309844736393107, + "grad_norm": 1.5492469250246994, + "learning_rate": 8.083385221227997e-05, + "loss": 0.3635, + "step": 3632 + }, + { + "epoch": 0.309930046067224, + "grad_norm": 1.5025508172554627, + "learning_rate": 8.082297542950123e-05, + "loss": 0.3586, + "step": 3633 + }, + { + "epoch": 0.3100153557413411, + "grad_norm": 1.5343509338107555, + "learning_rate": 8.081209629355214e-05, + "loss": 0.2943, + "step": 3634 + }, + { + "epoch": 0.3101006654154581, + "grad_norm": 1.587395660095319, + "learning_rate": 8.080121480526319e-05, + "loss": 0.3294, + "step": 3635 + }, + { + "epoch": 0.3101859750895752, + "grad_norm": 1.4899691724468391, + "learning_rate": 8.07903309654652e-05, + "loss": 0.331, + "step": 3636 + }, + { + "epoch": 0.3102712847636922, + "grad_norm": 1.503876604060479, + "learning_rate": 8.077944477498905e-05, + "loss": 0.3194, + "step": 3637 + }, + { + "epoch": 0.3103565944378093, + "grad_norm": 1.1481458851633592, + "learning_rate": 8.076855623466584e-05, + "loss": 0.3142, + "step": 3638 + }, + { + "epoch": 0.3104419041119263, + "grad_norm": 1.2510355951107346, + "learning_rate": 8.075766534532689e-05, + "loss": 0.3097, + "step": 3639 + }, + { + "epoch": 0.3105272137860433, + "grad_norm": 1.2975969717873541, + "learning_rate": 8.074677210780361e-05, + "loss": 0.3599, + "step": 3640 + }, + { + "epoch": 0.3106125234601604, + "grad_norm": 1.328794243435732, + "learning_rate": 8.073587652292769e-05, + "loss": 0.2917, + "step": 3641 + }, + { + "epoch": 0.3106978331342774, + "grad_norm": 1.4600153986236377, + "learning_rate": 8.072497859153091e-05, + "loss": 0.3104, + "step": 3642 + }, + { + "epoch": 0.3107831428083945, + "grad_norm": 1.421829893633357, + "learning_rate": 8.07140783144453e-05, + "loss": 0.3339, + "step": 3643 + }, + { + "epoch": 0.3108684524825115, + "grad_norm": 1.9012824197847922, + "learning_rate": 8.070317569250302e-05, + "loss": 0.3635, + "step": 3644 + }, + { + "epoch": 0.31095376215662857, + "grad_norm": 1.5866569803781057, + "learning_rate": 8.069227072653642e-05, + "loss": 0.2937, + "step": 3645 + }, + { + "epoch": 0.3110390718307456, + "grad_norm": 1.400903413703806, + "learning_rate": 8.068136341737807e-05, + "loss": 0.2742, + "step": 3646 + }, + { + "epoch": 0.31112438150486266, + "grad_norm": 1.696432842163217, + "learning_rate": 8.067045376586066e-05, + "loss": 0.2881, + "step": 3647 + }, + { + "epoch": 0.3112096911789797, + "grad_norm": 1.7414653975110013, + "learning_rate": 8.065954177281708e-05, + "loss": 0.3997, + "step": 3648 + }, + { + "epoch": 0.31129500085309675, + "grad_norm": 1.4700571290417541, + "learning_rate": 8.064862743908042e-05, + "loss": 0.2867, + "step": 3649 + }, + { + "epoch": 0.31138031052721377, + "grad_norm": 1.7032528750508593, + "learning_rate": 8.063771076548391e-05, + "loss": 0.299, + "step": 3650 + }, + { + "epoch": 0.31146562020133084, + "grad_norm": 1.6136717981704622, + "learning_rate": 8.0626791752861e-05, + "loss": 0.2848, + "step": 3651 + }, + { + "epoch": 0.31155092987544786, + "grad_norm": 1.7027294154487662, + "learning_rate": 8.061587040204528e-05, + "loss": 0.3387, + "step": 3652 + }, + { + "epoch": 0.31163623954956493, + "grad_norm": 1.4041955988287023, + "learning_rate": 8.060494671387055e-05, + "loss": 0.3441, + "step": 3653 + }, + { + "epoch": 0.31172154922368195, + "grad_norm": 1.3209589893406137, + "learning_rate": 8.059402068917079e-05, + "loss": 0.3296, + "step": 3654 + }, + { + "epoch": 0.311806858897799, + "grad_norm": 1.6779348056072638, + "learning_rate": 8.058309232878012e-05, + "loss": 0.3287, + "step": 3655 + }, + { + "epoch": 0.31189216857191604, + "grad_norm": 1.208837349229705, + "learning_rate": 8.057216163353285e-05, + "loss": 0.3232, + "step": 3656 + }, + { + "epoch": 0.3119774782460331, + "grad_norm": 1.4624366616121516, + "learning_rate": 8.056122860426352e-05, + "loss": 0.3109, + "step": 3657 + }, + { + "epoch": 0.31206278792015013, + "grad_norm": 1.4403485808733343, + "learning_rate": 8.055029324180678e-05, + "loss": 0.3215, + "step": 3658 + }, + { + "epoch": 0.3121480975942672, + "grad_norm": 1.6590863538943188, + "learning_rate": 8.053935554699749e-05, + "loss": 0.2525, + "step": 3659 + }, + { + "epoch": 0.3122334072683842, + "grad_norm": 1.648168315875182, + "learning_rate": 8.05284155206707e-05, + "loss": 0.3158, + "step": 3660 + }, + { + "epoch": 0.3123187169425013, + "grad_norm": 1.5621157364406917, + "learning_rate": 8.05174731636616e-05, + "loss": 0.2664, + "step": 3661 + }, + { + "epoch": 0.3124040266166183, + "grad_norm": 1.3914107342433049, + "learning_rate": 8.050652847680562e-05, + "loss": 0.2944, + "step": 3662 + }, + { + "epoch": 0.3124893362907354, + "grad_norm": 1.4796217455649705, + "learning_rate": 8.049558146093827e-05, + "loss": 0.3127, + "step": 3663 + }, + { + "epoch": 0.3125746459648524, + "grad_norm": 1.3318526520234757, + "learning_rate": 8.048463211689535e-05, + "loss": 0.2805, + "step": 3664 + }, + { + "epoch": 0.3126599556389695, + "grad_norm": 1.4244491415910117, + "learning_rate": 8.047368044551276e-05, + "loss": 0.3035, + "step": 3665 + }, + { + "epoch": 0.3127452653130865, + "grad_norm": 1.405863560455443, + "learning_rate": 8.04627264476266e-05, + "loss": 0.2507, + "step": 3666 + }, + { + "epoch": 0.31283057498720357, + "grad_norm": 1.5191288196971153, + "learning_rate": 8.045177012407316e-05, + "loss": 0.341, + "step": 3667 + }, + { + "epoch": 0.3129158846613206, + "grad_norm": 1.400010231525636, + "learning_rate": 8.044081147568889e-05, + "loss": 0.3306, + "step": 3668 + }, + { + "epoch": 0.31300119433543766, + "grad_norm": 1.4407630043376214, + "learning_rate": 8.042985050331042e-05, + "loss": 0.4905, + "step": 3669 + }, + { + "epoch": 0.3130865040095547, + "grad_norm": 1.324594899995326, + "learning_rate": 8.041888720777457e-05, + "loss": 0.274, + "step": 3670 + }, + { + "epoch": 0.31317181368367175, + "grad_norm": 1.4454634304111627, + "learning_rate": 8.040792158991833e-05, + "loss": 0.2748, + "step": 3671 + }, + { + "epoch": 0.31325712335778877, + "grad_norm": 1.2926237908115972, + "learning_rate": 8.039695365057887e-05, + "loss": 0.3224, + "step": 3672 + }, + { + "epoch": 0.31334243303190584, + "grad_norm": 1.682648926898849, + "learning_rate": 8.038598339059351e-05, + "loss": 0.3048, + "step": 3673 + }, + { + "epoch": 0.31342774270602286, + "grad_norm": 1.5212536931992198, + "learning_rate": 8.03750108107998e-05, + "loss": 0.342, + "step": 3674 + }, + { + "epoch": 0.31351305238013993, + "grad_norm": 1.435235603476932, + "learning_rate": 8.036403591203544e-05, + "loss": 0.3614, + "step": 3675 + }, + { + "epoch": 0.31359836205425695, + "grad_norm": 1.5057730050062628, + "learning_rate": 8.035305869513828e-05, + "loss": 0.3178, + "step": 3676 + }, + { + "epoch": 0.313683671728374, + "grad_norm": 1.404223019669934, + "learning_rate": 8.034207916094638e-05, + "loss": 0.3093, + "step": 3677 + }, + { + "epoch": 0.31376898140249104, + "grad_norm": 1.59684736672099, + "learning_rate": 8.033109731029798e-05, + "loss": 0.3532, + "step": 3678 + }, + { + "epoch": 0.31385429107660806, + "grad_norm": 1.5252377559243568, + "learning_rate": 8.032011314403147e-05, + "loss": 0.3324, + "step": 3679 + }, + { + "epoch": 0.31393960075072513, + "grad_norm": 1.4349047411891953, + "learning_rate": 8.030912666298546e-05, + "loss": 0.2669, + "step": 3680 + }, + { + "epoch": 0.31402491042484215, + "grad_norm": 1.3972249661556237, + "learning_rate": 8.029813786799868e-05, + "loss": 0.3388, + "step": 3681 + }, + { + "epoch": 0.3141102200989592, + "grad_norm": 1.2998634743526631, + "learning_rate": 8.028714675991006e-05, + "loss": 0.3223, + "step": 3682 + }, + { + "epoch": 0.31419552977307624, + "grad_norm": 1.6053898924235062, + "learning_rate": 8.027615333955877e-05, + "loss": 0.3524, + "step": 3683 + }, + { + "epoch": 0.3142808394471933, + "grad_norm": 1.5210035394330197, + "learning_rate": 8.026515760778403e-05, + "loss": 0.3018, + "step": 3684 + }, + { + "epoch": 0.31436614912131033, + "grad_norm": 1.465829827610065, + "learning_rate": 8.025415956542535e-05, + "loss": 0.2978, + "step": 3685 + }, + { + "epoch": 0.3144514587954274, + "grad_norm": 1.6196325863878902, + "learning_rate": 8.024315921332236e-05, + "loss": 0.3786, + "step": 3686 + }, + { + "epoch": 0.3145367684695444, + "grad_norm": 1.202404400917446, + "learning_rate": 8.023215655231488e-05, + "loss": 0.2675, + "step": 3687 + }, + { + "epoch": 0.3146220781436615, + "grad_norm": 1.685727990454968, + "learning_rate": 8.022115158324288e-05, + "loss": 0.3388, + "step": 3688 + }, + { + "epoch": 0.3147073878177785, + "grad_norm": 1.4134731856165186, + "learning_rate": 8.021014430694655e-05, + "loss": 0.3765, + "step": 3689 + }, + { + "epoch": 0.3147926974918956, + "grad_norm": 1.6145243808279397, + "learning_rate": 8.019913472426626e-05, + "loss": 0.3363, + "step": 3690 + }, + { + "epoch": 0.3148780071660126, + "grad_norm": 1.585788888968316, + "learning_rate": 8.018812283604251e-05, + "loss": 0.3415, + "step": 3691 + }, + { + "epoch": 0.3149633168401297, + "grad_norm": 1.4935242422439652, + "learning_rate": 8.017710864311599e-05, + "loss": 0.3075, + "step": 3692 + }, + { + "epoch": 0.3150486265142467, + "grad_norm": 1.375005201850068, + "learning_rate": 8.016609214632759e-05, + "loss": 0.2806, + "step": 3693 + }, + { + "epoch": 0.3151339361883638, + "grad_norm": 1.730468887776871, + "learning_rate": 8.015507334651835e-05, + "loss": 0.321, + "step": 3694 + }, + { + "epoch": 0.3152192458624808, + "grad_norm": 1.454747104755306, + "learning_rate": 8.014405224452953e-05, + "loss": 0.3027, + "step": 3695 + }, + { + "epoch": 0.31530455553659786, + "grad_norm": 1.517602397474206, + "learning_rate": 8.013302884120247e-05, + "loss": 0.3441, + "step": 3696 + }, + { + "epoch": 0.3153898652107149, + "grad_norm": 1.3960691034652943, + "learning_rate": 8.012200313737881e-05, + "loss": 0.2935, + "step": 3697 + }, + { + "epoch": 0.31547517488483195, + "grad_norm": 1.2506278368657016, + "learning_rate": 8.011097513390027e-05, + "loss": 0.3704, + "step": 3698 + }, + { + "epoch": 0.315560484558949, + "grad_norm": 1.3285807556859188, + "learning_rate": 8.009994483160879e-05, + "loss": 0.3463, + "step": 3699 + }, + { + "epoch": 0.31564579423306605, + "grad_norm": 1.7792854183091984, + "learning_rate": 8.008891223134647e-05, + "loss": 0.3238, + "step": 3700 + }, + { + "epoch": 0.31573110390718306, + "grad_norm": 1.366916171806953, + "learning_rate": 8.007787733395559e-05, + "loss": 0.2807, + "step": 3701 + }, + { + "epoch": 0.31581641358130014, + "grad_norm": 1.2892642239031378, + "learning_rate": 8.006684014027862e-05, + "loss": 0.2748, + "step": 3702 + }, + { + "epoch": 0.31590172325541716, + "grad_norm": 1.4383925279361707, + "learning_rate": 8.005580065115816e-05, + "loss": 0.3277, + "step": 3703 + }, + { + "epoch": 0.31598703292953423, + "grad_norm": 1.3984786725509817, + "learning_rate": 8.004475886743705e-05, + "loss": 0.3171, + "step": 3704 + }, + { + "epoch": 0.31607234260365125, + "grad_norm": 1.6608500388543512, + "learning_rate": 8.003371478995827e-05, + "loss": 0.3562, + "step": 3705 + }, + { + "epoch": 0.3161576522777683, + "grad_norm": 1.511993777469163, + "learning_rate": 8.002266841956496e-05, + "loss": 0.3351, + "step": 3706 + }, + { + "epoch": 0.31624296195188534, + "grad_norm": 1.5121853524792144, + "learning_rate": 8.001161975710045e-05, + "loss": 0.319, + "step": 3707 + }, + { + "epoch": 0.3163282716260024, + "grad_norm": 1.8368222965598509, + "learning_rate": 8.000056880340824e-05, + "loss": 0.3952, + "step": 3708 + }, + { + "epoch": 0.31641358130011943, + "grad_norm": 1.4163404631446825, + "learning_rate": 7.998951555933205e-05, + "loss": 0.2821, + "step": 3709 + }, + { + "epoch": 0.3164988909742365, + "grad_norm": 1.4023362693480839, + "learning_rate": 7.99784600257157e-05, + "loss": 0.295, + "step": 3710 + }, + { + "epoch": 0.3165842006483535, + "grad_norm": 1.6437622592019234, + "learning_rate": 7.996740220340323e-05, + "loss": 0.3004, + "step": 3711 + }, + { + "epoch": 0.3166695103224706, + "grad_norm": 1.3748687334692138, + "learning_rate": 7.995634209323886e-05, + "loss": 0.3057, + "step": 3712 + }, + { + "epoch": 0.3167548199965876, + "grad_norm": 1.4855815601537656, + "learning_rate": 7.994527969606695e-05, + "loss": 0.2606, + "step": 3713 + }, + { + "epoch": 0.3168401296707047, + "grad_norm": 1.4980598459092545, + "learning_rate": 7.993421501273205e-05, + "loss": 0.3182, + "step": 3714 + }, + { + "epoch": 0.3169254393448217, + "grad_norm": 1.396268729252478, + "learning_rate": 7.992314804407892e-05, + "loss": 0.3177, + "step": 3715 + }, + { + "epoch": 0.3170107490189387, + "grad_norm": 1.8721645532487037, + "learning_rate": 7.991207879095244e-05, + "loss": 0.3702, + "step": 3716 + }, + { + "epoch": 0.3170960586930558, + "grad_norm": 1.6425411115869135, + "learning_rate": 7.990100725419771e-05, + "loss": 0.3374, + "step": 3717 + }, + { + "epoch": 0.3171813683671728, + "grad_norm": 1.266463861208134, + "learning_rate": 7.988993343465996e-05, + "loss": 0.283, + "step": 3718 + }, + { + "epoch": 0.3172666780412899, + "grad_norm": 1.451322688895287, + "learning_rate": 7.987885733318463e-05, + "loss": 0.375, + "step": 3719 + }, + { + "epoch": 0.3173519877154069, + "grad_norm": 1.7448586825885326, + "learning_rate": 7.986777895061732e-05, + "loss": 0.359, + "step": 3720 + }, + { + "epoch": 0.317437297389524, + "grad_norm": 1.6603259908629566, + "learning_rate": 7.98566982878038e-05, + "loss": 0.3952, + "step": 3721 + }, + { + "epoch": 0.317522607063641, + "grad_norm": 1.250088879286478, + "learning_rate": 7.984561534559003e-05, + "loss": 0.324, + "step": 3722 + }, + { + "epoch": 0.31760791673775807, + "grad_norm": 1.3534237608898174, + "learning_rate": 7.983453012482214e-05, + "loss": 0.2906, + "step": 3723 + }, + { + "epoch": 0.3176932264118751, + "grad_norm": 1.4734954466605406, + "learning_rate": 7.982344262634641e-05, + "loss": 0.3456, + "step": 3724 + }, + { + "epoch": 0.31777853608599216, + "grad_norm": 1.7367726841427111, + "learning_rate": 7.981235285100929e-05, + "loss": 0.3788, + "step": 3725 + }, + { + "epoch": 0.3178638457601092, + "grad_norm": 1.2591413030251557, + "learning_rate": 7.980126079965747e-05, + "loss": 0.287, + "step": 3726 + }, + { + "epoch": 0.31794915543422625, + "grad_norm": 1.3592019683507057, + "learning_rate": 7.979016647313774e-05, + "loss": 0.2926, + "step": 3727 + }, + { + "epoch": 0.31803446510834327, + "grad_norm": 1.4899066849744786, + "learning_rate": 7.977906987229713e-05, + "loss": 0.346, + "step": 3728 + }, + { + "epoch": 0.31811977478246034, + "grad_norm": 1.4836069629757775, + "learning_rate": 7.976797099798277e-05, + "loss": 0.3413, + "step": 3729 + }, + { + "epoch": 0.31820508445657736, + "grad_norm": 1.8704785826965291, + "learning_rate": 7.9756869851042e-05, + "loss": 0.33, + "step": 3730 + }, + { + "epoch": 0.31829039413069443, + "grad_norm": 1.726954074834335, + "learning_rate": 7.974576643232236e-05, + "loss": 0.3487, + "step": 3731 + }, + { + "epoch": 0.31837570380481145, + "grad_norm": 1.9179034251103277, + "learning_rate": 7.97346607426715e-05, + "loss": 0.3067, + "step": 3732 + }, + { + "epoch": 0.3184610134789285, + "grad_norm": 1.3166005248747719, + "learning_rate": 7.972355278293733e-05, + "loss": 0.3144, + "step": 3733 + }, + { + "epoch": 0.31854632315304554, + "grad_norm": 2.5752093850345488, + "learning_rate": 7.971244255396784e-05, + "loss": 0.2722, + "step": 3734 + }, + { + "epoch": 0.3186316328271626, + "grad_norm": 1.312104074615648, + "learning_rate": 7.970133005661125e-05, + "loss": 0.309, + "step": 3735 + }, + { + "epoch": 0.31871694250127963, + "grad_norm": 1.2785692315033275, + "learning_rate": 7.969021529171595e-05, + "loss": 0.2749, + "step": 3736 + }, + { + "epoch": 0.3188022521753967, + "grad_norm": 1.4141594447841506, + "learning_rate": 7.96790982601305e-05, + "loss": 0.3229, + "step": 3737 + }, + { + "epoch": 0.3188875618495137, + "grad_norm": 1.3648118660078932, + "learning_rate": 7.966797896270358e-05, + "loss": 0.3346, + "step": 3738 + }, + { + "epoch": 0.3189728715236308, + "grad_norm": 1.7154640692047571, + "learning_rate": 7.965685740028415e-05, + "loss": 0.3169, + "step": 3739 + }, + { + "epoch": 0.3190581811977478, + "grad_norm": 1.4178473470540967, + "learning_rate": 7.964573357372123e-05, + "loss": 0.3275, + "step": 3740 + }, + { + "epoch": 0.3191434908718649, + "grad_norm": 1.3119913659683933, + "learning_rate": 7.96346074838641e-05, + "loss": 0.2892, + "step": 3741 + }, + { + "epoch": 0.3192288005459819, + "grad_norm": 1.420412506175007, + "learning_rate": 7.962347913156218e-05, + "loss": 0.3042, + "step": 3742 + }, + { + "epoch": 0.319314110220099, + "grad_norm": 1.5046010975971755, + "learning_rate": 7.961234851766503e-05, + "loss": 0.3017, + "step": 3743 + }, + { + "epoch": 0.319399419894216, + "grad_norm": 1.533383498546036, + "learning_rate": 7.960121564302243e-05, + "loss": 0.3686, + "step": 3744 + }, + { + "epoch": 0.31948472956833307, + "grad_norm": 1.8396506562519799, + "learning_rate": 7.959008050848433e-05, + "loss": 0.3827, + "step": 3745 + }, + { + "epoch": 0.3195700392424501, + "grad_norm": 1.3220520297789906, + "learning_rate": 7.957894311490082e-05, + "loss": 0.3057, + "step": 3746 + }, + { + "epoch": 0.31965534891656716, + "grad_norm": 1.4268328407398738, + "learning_rate": 7.956780346312218e-05, + "loss": 0.2464, + "step": 3747 + }, + { + "epoch": 0.3197406585906842, + "grad_norm": 1.4316325480192946, + "learning_rate": 7.955666155399886e-05, + "loss": 0.3147, + "step": 3748 + }, + { + "epoch": 0.31982596826480125, + "grad_norm": 1.5499245994440478, + "learning_rate": 7.954551738838151e-05, + "loss": 0.3407, + "step": 3749 + }, + { + "epoch": 0.31991127793891827, + "grad_norm": 1.617373101092765, + "learning_rate": 7.953437096712091e-05, + "loss": 0.3853, + "step": 3750 + }, + { + "epoch": 0.31999658761303534, + "grad_norm": 1.2834483569893234, + "learning_rate": 7.952322229106803e-05, + "loss": 0.2933, + "step": 3751 + }, + { + "epoch": 0.32008189728715236, + "grad_norm": 1.332581079388846, + "learning_rate": 7.951207136107401e-05, + "loss": 0.3042, + "step": 3752 + }, + { + "epoch": 0.32016720696126943, + "grad_norm": 1.3606490380227523, + "learning_rate": 7.950091817799018e-05, + "loss": 0.3202, + "step": 3753 + }, + { + "epoch": 0.32025251663538645, + "grad_norm": 1.6621387806318466, + "learning_rate": 7.9489762742668e-05, + "loss": 0.3581, + "step": 3754 + }, + { + "epoch": 0.32033782630950347, + "grad_norm": 1.299428517100528, + "learning_rate": 7.947860505595915e-05, + "loss": 0.3449, + "step": 3755 + }, + { + "epoch": 0.32042313598362054, + "grad_norm": 1.690886032062863, + "learning_rate": 7.946744511871545e-05, + "loss": 0.3548, + "step": 3756 + }, + { + "epoch": 0.32050844565773756, + "grad_norm": 1.5002305330508288, + "learning_rate": 7.945628293178891e-05, + "loss": 0.306, + "step": 3757 + }, + { + "epoch": 0.32059375533185464, + "grad_norm": 1.4168870230939932, + "learning_rate": 7.944511849603171e-05, + "loss": 0.3126, + "step": 3758 + }, + { + "epoch": 0.32067906500597165, + "grad_norm": 1.8251324801501059, + "learning_rate": 7.943395181229616e-05, + "loss": 0.3035, + "step": 3759 + }, + { + "epoch": 0.3207643746800887, + "grad_norm": 1.5456031425710681, + "learning_rate": 7.942278288143482e-05, + "loss": 0.3074, + "step": 3760 + }, + { + "epoch": 0.32084968435420574, + "grad_norm": 1.2349125621128598, + "learning_rate": 7.941161170430036e-05, + "loss": 0.2725, + "step": 3761 + }, + { + "epoch": 0.3209349940283228, + "grad_norm": 1.3508290853243348, + "learning_rate": 7.940043828174562e-05, + "loss": 0.3079, + "step": 3762 + }, + { + "epoch": 0.32102030370243984, + "grad_norm": 1.5464957186482093, + "learning_rate": 7.938926261462366e-05, + "loss": 0.2939, + "step": 3763 + }, + { + "epoch": 0.3211056133765569, + "grad_norm": 1.7123002151564868, + "learning_rate": 7.937808470378767e-05, + "loss": 0.3452, + "step": 3764 + }, + { + "epoch": 0.3211909230506739, + "grad_norm": 1.6447713137577962, + "learning_rate": 7.936690455009104e-05, + "loss": 0.3442, + "step": 3765 + }, + { + "epoch": 0.321276232724791, + "grad_norm": 1.6833803258448907, + "learning_rate": 7.93557221543873e-05, + "loss": 0.3165, + "step": 3766 + }, + { + "epoch": 0.321361542398908, + "grad_norm": 1.4557089852541805, + "learning_rate": 7.934453751753017e-05, + "loss": 0.3585, + "step": 3767 + }, + { + "epoch": 0.3214468520730251, + "grad_norm": 1.8283407621948506, + "learning_rate": 7.933335064037353e-05, + "loss": 0.3291, + "step": 3768 + }, + { + "epoch": 0.3215321617471421, + "grad_norm": 1.2470640034467464, + "learning_rate": 7.932216152377146e-05, + "loss": 0.3389, + "step": 3769 + }, + { + "epoch": 0.3216174714212592, + "grad_norm": 1.5557000778061887, + "learning_rate": 7.931097016857816e-05, + "loss": 0.3521, + "step": 3770 + }, + { + "epoch": 0.3217027810953762, + "grad_norm": 1.346340432702658, + "learning_rate": 7.929977657564804e-05, + "loss": 0.3159, + "step": 3771 + }, + { + "epoch": 0.3217880907694933, + "grad_norm": 1.37986000511937, + "learning_rate": 7.928858074583569e-05, + "loss": 0.2655, + "step": 3772 + }, + { + "epoch": 0.3218734004436103, + "grad_norm": 1.5617603840318615, + "learning_rate": 7.927738267999584e-05, + "loss": 0.3314, + "step": 3773 + }, + { + "epoch": 0.32195871011772736, + "grad_norm": 1.2420328871754232, + "learning_rate": 7.92661823789834e-05, + "loss": 0.271, + "step": 3774 + }, + { + "epoch": 0.3220440197918444, + "grad_norm": 1.2373939006289472, + "learning_rate": 7.925497984365345e-05, + "loss": 0.3304, + "step": 3775 + }, + { + "epoch": 0.32212932946596146, + "grad_norm": 1.3775815138609921, + "learning_rate": 7.924377507486127e-05, + "loss": 0.2818, + "step": 3776 + }, + { + "epoch": 0.3222146391400785, + "grad_norm": 1.3104243441520063, + "learning_rate": 7.923256807346224e-05, + "loss": 0.3456, + "step": 3777 + }, + { + "epoch": 0.32229994881419555, + "grad_norm": 1.3712184229842794, + "learning_rate": 7.922135884031199e-05, + "loss": 0.3139, + "step": 3778 + }, + { + "epoch": 0.32238525848831256, + "grad_norm": 1.4927046114477418, + "learning_rate": 7.921014737626627e-05, + "loss": 0.289, + "step": 3779 + }, + { + "epoch": 0.32247056816242964, + "grad_norm": 1.2321289488202913, + "learning_rate": 7.919893368218103e-05, + "loss": 0.294, + "step": 3780 + }, + { + "epoch": 0.32255587783654666, + "grad_norm": 1.4059372448163163, + "learning_rate": 7.918771775891236e-05, + "loss": 0.2636, + "step": 3781 + }, + { + "epoch": 0.32264118751066373, + "grad_norm": 1.2829837698679227, + "learning_rate": 7.917649960731655e-05, + "loss": 0.2785, + "step": 3782 + }, + { + "epoch": 0.32272649718478075, + "grad_norm": 2.1073685887682303, + "learning_rate": 7.916527922825002e-05, + "loss": 0.36, + "step": 3783 + }, + { + "epoch": 0.3228118068588978, + "grad_norm": 1.7938305481161296, + "learning_rate": 7.915405662256942e-05, + "loss": 0.3744, + "step": 3784 + }, + { + "epoch": 0.32289711653301484, + "grad_norm": 1.2793136945011936, + "learning_rate": 7.91428317911315e-05, + "loss": 0.2672, + "step": 3785 + }, + { + "epoch": 0.3229824262071319, + "grad_norm": 1.7357292730255116, + "learning_rate": 7.913160473479327e-05, + "loss": 0.3313, + "step": 3786 + }, + { + "epoch": 0.32306773588124893, + "grad_norm": 1.8492607908059127, + "learning_rate": 7.912037545441182e-05, + "loss": 0.2921, + "step": 3787 + }, + { + "epoch": 0.323153045555366, + "grad_norm": 1.5170113426895915, + "learning_rate": 7.910914395084443e-05, + "loss": 0.2977, + "step": 3788 + }, + { + "epoch": 0.323238355229483, + "grad_norm": 1.5420005368084597, + "learning_rate": 7.909791022494859e-05, + "loss": 0.3183, + "step": 3789 + }, + { + "epoch": 0.3233236649036001, + "grad_norm": 1.6596543215274444, + "learning_rate": 7.908667427758194e-05, + "loss": 0.2752, + "step": 3790 + }, + { + "epoch": 0.3234089745777171, + "grad_norm": 1.372108366631985, + "learning_rate": 7.907543610960226e-05, + "loss": 0.3232, + "step": 3791 + }, + { + "epoch": 0.3234942842518342, + "grad_norm": 1.535201288004261, + "learning_rate": 7.906419572186756e-05, + "loss": 0.3196, + "step": 3792 + }, + { + "epoch": 0.3235795939259512, + "grad_norm": 1.2998091814406254, + "learning_rate": 7.905295311523595e-05, + "loss": 0.2927, + "step": 3793 + }, + { + "epoch": 0.3236649036000682, + "grad_norm": 1.436117336563859, + "learning_rate": 7.904170829056577e-05, + "loss": 0.298, + "step": 3794 + }, + { + "epoch": 0.3237502132741853, + "grad_norm": 1.5818350211057353, + "learning_rate": 7.903046124871547e-05, + "loss": 0.3212, + "step": 3795 + }, + { + "epoch": 0.3238355229483023, + "grad_norm": 1.8546375862481768, + "learning_rate": 7.901921199054373e-05, + "loss": 0.3316, + "step": 3796 + }, + { + "epoch": 0.3239208326224194, + "grad_norm": 1.4302718839073052, + "learning_rate": 7.900796051690937e-05, + "loss": 0.3077, + "step": 3797 + }, + { + "epoch": 0.3240061422965364, + "grad_norm": 1.5075588507101447, + "learning_rate": 7.899670682867136e-05, + "loss": 0.3201, + "step": 3798 + }, + { + "epoch": 0.3240914519706535, + "grad_norm": 1.529612775308138, + "learning_rate": 7.898545092668887e-05, + "loss": 0.3324, + "step": 3799 + }, + { + "epoch": 0.3241767616447705, + "grad_norm": 1.1540387634761629, + "learning_rate": 7.897419281182124e-05, + "loss": 0.2957, + "step": 3800 + }, + { + "epoch": 0.32426207131888757, + "grad_norm": 1.5245002365437414, + "learning_rate": 7.896293248492797e-05, + "loss": 0.3153, + "step": 3801 + }, + { + "epoch": 0.3243473809930046, + "grad_norm": 1.587161038473519, + "learning_rate": 7.895166994686869e-05, + "loss": 0.2764, + "step": 3802 + }, + { + "epoch": 0.32443269066712166, + "grad_norm": 1.367895019566022, + "learning_rate": 7.894040519850328e-05, + "loss": 0.2769, + "step": 3803 + }, + { + "epoch": 0.3245180003412387, + "grad_norm": 1.5674627456010066, + "learning_rate": 7.89291382406917e-05, + "loss": 0.3157, + "step": 3804 + }, + { + "epoch": 0.32460331001535575, + "grad_norm": 1.3924149555418988, + "learning_rate": 7.891786907429415e-05, + "loss": 0.3023, + "step": 3805 + }, + { + "epoch": 0.32468861968947277, + "grad_norm": 1.5839031683146827, + "learning_rate": 7.890659770017097e-05, + "loss": 0.31, + "step": 3806 + }, + { + "epoch": 0.32477392936358984, + "grad_norm": 1.8207061734815446, + "learning_rate": 7.889532411918267e-05, + "loss": 0.3393, + "step": 3807 + }, + { + "epoch": 0.32485923903770686, + "grad_norm": 1.3647031173354027, + "learning_rate": 7.888404833218992e-05, + "loss": 0.3289, + "step": 3808 + }, + { + "epoch": 0.32494454871182393, + "grad_norm": 1.5820254478819142, + "learning_rate": 7.887277034005356e-05, + "loss": 0.2813, + "step": 3809 + }, + { + "epoch": 0.32502985838594095, + "grad_norm": 1.4139985128014498, + "learning_rate": 7.886149014363463e-05, + "loss": 0.3076, + "step": 3810 + }, + { + "epoch": 0.325115168060058, + "grad_norm": 1.6299088500579013, + "learning_rate": 7.885020774379429e-05, + "loss": 0.3186, + "step": 3811 + }, + { + "epoch": 0.32520047773417504, + "grad_norm": 1.6871878017277597, + "learning_rate": 7.88389231413939e-05, + "loss": 0.3755, + "step": 3812 + }, + { + "epoch": 0.3252857874082921, + "grad_norm": 1.225114733324955, + "learning_rate": 7.882763633729497e-05, + "loss": 0.2921, + "step": 3813 + }, + { + "epoch": 0.32537109708240913, + "grad_norm": 1.144441256060886, + "learning_rate": 7.88163473323592e-05, + "loss": 0.2987, + "step": 3814 + }, + { + "epoch": 0.3254564067565262, + "grad_norm": 1.4204506919114988, + "learning_rate": 7.880505612744843e-05, + "loss": 0.3217, + "step": 3815 + }, + { + "epoch": 0.3255417164306432, + "grad_norm": 1.5318441989748375, + "learning_rate": 7.879376272342472e-05, + "loss": 0.3605, + "step": 3816 + }, + { + "epoch": 0.3256270261047603, + "grad_norm": 1.4908316479000474, + "learning_rate": 7.878246712115022e-05, + "loss": 0.2789, + "step": 3817 + }, + { + "epoch": 0.3257123357788773, + "grad_norm": 1.535297416428123, + "learning_rate": 7.877116932148731e-05, + "loss": 0.3185, + "step": 3818 + }, + { + "epoch": 0.3257976454529944, + "grad_norm": 1.4490028076513244, + "learning_rate": 7.875986932529852e-05, + "loss": 0.3051, + "step": 3819 + }, + { + "epoch": 0.3258829551271114, + "grad_norm": 1.7818088658643094, + "learning_rate": 7.874856713344651e-05, + "loss": 0.3301, + "step": 3820 + }, + { + "epoch": 0.3259682648012285, + "grad_norm": 1.3929015823233672, + "learning_rate": 7.87372627467942e-05, + "loss": 0.3502, + "step": 3821 + }, + { + "epoch": 0.3260535744753455, + "grad_norm": 1.3688998588845624, + "learning_rate": 7.872595616620458e-05, + "loss": 0.3437, + "step": 3822 + }, + { + "epoch": 0.32613888414946257, + "grad_norm": 1.2385951946852136, + "learning_rate": 7.871464739254084e-05, + "loss": 0.2948, + "step": 3823 + }, + { + "epoch": 0.3262241938235796, + "grad_norm": 1.6524111325928978, + "learning_rate": 7.870333642666639e-05, + "loss": 0.3372, + "step": 3824 + }, + { + "epoch": 0.32630950349769666, + "grad_norm": 1.26611262217371, + "learning_rate": 7.86920232694447e-05, + "loss": 0.2872, + "step": 3825 + }, + { + "epoch": 0.3263948131718137, + "grad_norm": 1.3513903646057024, + "learning_rate": 7.868070792173952e-05, + "loss": 0.245, + "step": 3826 + }, + { + "epoch": 0.32648012284593075, + "grad_norm": 1.7149790230405135, + "learning_rate": 7.86693903844147e-05, + "loss": 0.3336, + "step": 3827 + }, + { + "epoch": 0.32656543252004777, + "grad_norm": 1.077285882337616, + "learning_rate": 7.865807065833428e-05, + "loss": 0.2396, + "step": 3828 + }, + { + "epoch": 0.32665074219416484, + "grad_norm": 1.5015155765057044, + "learning_rate": 7.864674874436244e-05, + "loss": 0.3272, + "step": 3829 + }, + { + "epoch": 0.32673605186828186, + "grad_norm": 1.4458644328588268, + "learning_rate": 7.863542464336356e-05, + "loss": 0.2954, + "step": 3830 + }, + { + "epoch": 0.3268213615423989, + "grad_norm": 1.6378617040080945, + "learning_rate": 7.86240983562022e-05, + "loss": 0.3106, + "step": 3831 + }, + { + "epoch": 0.32690667121651595, + "grad_norm": 1.61736860505115, + "learning_rate": 7.861276988374302e-05, + "loss": 0.3075, + "step": 3832 + }, + { + "epoch": 0.32699198089063297, + "grad_norm": 1.6871673821318351, + "learning_rate": 7.860143922685091e-05, + "loss": 0.3401, + "step": 3833 + }, + { + "epoch": 0.32707729056475005, + "grad_norm": 1.7040561615325664, + "learning_rate": 7.85901063863909e-05, + "loss": 0.3038, + "step": 3834 + }, + { + "epoch": 0.32716260023886706, + "grad_norm": 1.3256987906299769, + "learning_rate": 7.85787713632282e-05, + "loss": 0.2649, + "step": 3835 + }, + { + "epoch": 0.32724790991298414, + "grad_norm": 1.5276569328294873, + "learning_rate": 7.856743415822816e-05, + "loss": 0.3338, + "step": 3836 + }, + { + "epoch": 0.32733321958710115, + "grad_norm": 1.6490282202550415, + "learning_rate": 7.855609477225635e-05, + "loss": 0.3409, + "step": 3837 + }, + { + "epoch": 0.3274185292612182, + "grad_norm": 1.422271002654795, + "learning_rate": 7.854475320617844e-05, + "loss": 0.3541, + "step": 3838 + }, + { + "epoch": 0.32750383893533525, + "grad_norm": 1.4235306207909502, + "learning_rate": 7.853340946086032e-05, + "loss": 0.2911, + "step": 3839 + }, + { + "epoch": 0.3275891486094523, + "grad_norm": 1.3515122244419997, + "learning_rate": 7.8522063537168e-05, + "loss": 0.3569, + "step": 3840 + }, + { + "epoch": 0.32767445828356934, + "grad_norm": 1.4840514784158867, + "learning_rate": 7.851071543596769e-05, + "loss": 0.312, + "step": 3841 + }, + { + "epoch": 0.3277597679576864, + "grad_norm": 1.4020652388169839, + "learning_rate": 7.849936515812578e-05, + "loss": 0.3064, + "step": 3842 + }, + { + "epoch": 0.3278450776318034, + "grad_norm": 1.67803050082361, + "learning_rate": 7.848801270450879e-05, + "loss": 0.3372, + "step": 3843 + }, + { + "epoch": 0.3279303873059205, + "grad_norm": 1.4462945849141275, + "learning_rate": 7.84766580759834e-05, + "loss": 0.3206, + "step": 3844 + }, + { + "epoch": 0.3280156969800375, + "grad_norm": 1.8351356865716122, + "learning_rate": 7.84653012734165e-05, + "loss": 0.3117, + "step": 3845 + }, + { + "epoch": 0.3281010066541546, + "grad_norm": 1.200155631543102, + "learning_rate": 7.845394229767509e-05, + "loss": 0.3081, + "step": 3846 + }, + { + "epoch": 0.3281863163282716, + "grad_norm": 1.296890350618629, + "learning_rate": 7.844258114962642e-05, + "loss": 0.3234, + "step": 3847 + }, + { + "epoch": 0.3282716260023887, + "grad_norm": 1.3451189121164937, + "learning_rate": 7.84312178301378e-05, + "loss": 0.3205, + "step": 3848 + }, + { + "epoch": 0.3283569356765057, + "grad_norm": 1.2196082492251616, + "learning_rate": 7.84198523400768e-05, + "loss": 0.3331, + "step": 3849 + }, + { + "epoch": 0.3284422453506228, + "grad_norm": 1.6531756676334852, + "learning_rate": 7.840848468031108e-05, + "loss": 0.2929, + "step": 3850 + }, + { + "epoch": 0.3285275550247398, + "grad_norm": 1.3540334000179826, + "learning_rate": 7.839711485170854e-05, + "loss": 0.2845, + "step": 3851 + }, + { + "epoch": 0.32861286469885687, + "grad_norm": 1.2528961962106053, + "learning_rate": 7.838574285513716e-05, + "loss": 0.3352, + "step": 3852 + }, + { + "epoch": 0.3286981743729739, + "grad_norm": 1.155394366425496, + "learning_rate": 7.837436869146517e-05, + "loss": 0.2776, + "step": 3853 + }, + { + "epoch": 0.32878348404709096, + "grad_norm": 1.7033774512537787, + "learning_rate": 7.836299236156089e-05, + "loss": 0.3202, + "step": 3854 + }, + { + "epoch": 0.328868793721208, + "grad_norm": 1.5285649015544482, + "learning_rate": 7.835161386629288e-05, + "loss": 0.3652, + "step": 3855 + }, + { + "epoch": 0.32895410339532505, + "grad_norm": 1.456315424681467, + "learning_rate": 7.834023320652981e-05, + "loss": 0.324, + "step": 3856 + }, + { + "epoch": 0.32903941306944207, + "grad_norm": 1.3320299970783147, + "learning_rate": 7.832885038314053e-05, + "loss": 0.2865, + "step": 3857 + }, + { + "epoch": 0.32912472274355914, + "grad_norm": 1.591642744407166, + "learning_rate": 7.831746539699407e-05, + "loss": 0.3446, + "step": 3858 + }, + { + "epoch": 0.32921003241767616, + "grad_norm": 1.4790171247655421, + "learning_rate": 7.830607824895962e-05, + "loss": 0.2956, + "step": 3859 + }, + { + "epoch": 0.32929534209179323, + "grad_norm": 1.3273717651643897, + "learning_rate": 7.829468893990649e-05, + "loss": 0.3294, + "step": 3860 + }, + { + "epoch": 0.32938065176591025, + "grad_norm": 1.4874832537053018, + "learning_rate": 7.828329747070422e-05, + "loss": 0.3096, + "step": 3861 + }, + { + "epoch": 0.3294659614400273, + "grad_norm": 1.5186233750291533, + "learning_rate": 7.82719038422225e-05, + "loss": 0.3679, + "step": 3862 + }, + { + "epoch": 0.32955127111414434, + "grad_norm": 1.5663936167371635, + "learning_rate": 7.826050805533114e-05, + "loss": 0.3291, + "step": 3863 + }, + { + "epoch": 0.3296365807882614, + "grad_norm": 1.341058075141567, + "learning_rate": 7.824911011090016e-05, + "loss": 0.292, + "step": 3864 + }, + { + "epoch": 0.32972189046237843, + "grad_norm": 1.4098199247798977, + "learning_rate": 7.823771000979976e-05, + "loss": 0.3069, + "step": 3865 + }, + { + "epoch": 0.3298072001364955, + "grad_norm": 1.6752254391309909, + "learning_rate": 7.822630775290025e-05, + "loss": 0.3782, + "step": 3866 + }, + { + "epoch": 0.3298925098106125, + "grad_norm": 1.371197905754974, + "learning_rate": 7.821490334107216e-05, + "loss": 0.2745, + "step": 3867 + }, + { + "epoch": 0.3299778194847296, + "grad_norm": 1.5806230470668663, + "learning_rate": 7.820349677518609e-05, + "loss": 0.3524, + "step": 3868 + }, + { + "epoch": 0.3300631291588466, + "grad_norm": 1.9117623346408965, + "learning_rate": 7.819208805611294e-05, + "loss": 0.3519, + "step": 3869 + }, + { + "epoch": 0.33014843883296363, + "grad_norm": 1.3528724011760425, + "learning_rate": 7.818067718472366e-05, + "loss": 0.327, + "step": 3870 + }, + { + "epoch": 0.3302337485070807, + "grad_norm": 1.377037705874116, + "learning_rate": 7.816926416188946e-05, + "loss": 0.2895, + "step": 3871 + }, + { + "epoch": 0.3303190581811977, + "grad_norm": 1.657863335005311, + "learning_rate": 7.815784898848163e-05, + "loss": 0.3045, + "step": 3872 + }, + { + "epoch": 0.3304043678553148, + "grad_norm": 1.307800781489184, + "learning_rate": 7.814643166537163e-05, + "loss": 0.2944, + "step": 3873 + }, + { + "epoch": 0.3304896775294318, + "grad_norm": 1.6460861140700027, + "learning_rate": 7.813501219343116e-05, + "loss": 0.3342, + "step": 3874 + }, + { + "epoch": 0.3305749872035489, + "grad_norm": 1.6215415979414065, + "learning_rate": 7.812359057353201e-05, + "loss": 0.3644, + "step": 3875 + }, + { + "epoch": 0.3306602968776659, + "grad_norm": 1.4768473032623652, + "learning_rate": 7.811216680654618e-05, + "loss": 0.316, + "step": 3876 + }, + { + "epoch": 0.330745606551783, + "grad_norm": 1.4916529471904851, + "learning_rate": 7.810074089334581e-05, + "loss": 0.3022, + "step": 3877 + }, + { + "epoch": 0.3308309162259, + "grad_norm": 1.8200598283512759, + "learning_rate": 7.808931283480316e-05, + "loss": 0.3415, + "step": 3878 + }, + { + "epoch": 0.33091622590001707, + "grad_norm": 1.419325838568809, + "learning_rate": 7.807788263179078e-05, + "loss": 0.3385, + "step": 3879 + }, + { + "epoch": 0.3310015355741341, + "grad_norm": 1.4392432547671787, + "learning_rate": 7.806645028518125e-05, + "loss": 0.2819, + "step": 3880 + }, + { + "epoch": 0.33108684524825116, + "grad_norm": 1.3756981724399662, + "learning_rate": 7.80550157958474e-05, + "loss": 0.264, + "step": 3881 + }, + { + "epoch": 0.3311721549223682, + "grad_norm": 1.5679884836640077, + "learning_rate": 7.804357916466216e-05, + "loss": 0.2922, + "step": 3882 + }, + { + "epoch": 0.33125746459648525, + "grad_norm": 1.3173560273407474, + "learning_rate": 7.80321403924987e-05, + "loss": 0.3424, + "step": 3883 + }, + { + "epoch": 0.33134277427060227, + "grad_norm": 1.6359010533860923, + "learning_rate": 7.802069948023025e-05, + "loss": 0.3162, + "step": 3884 + }, + { + "epoch": 0.33142808394471934, + "grad_norm": 1.5128935751084716, + "learning_rate": 7.800925642873032e-05, + "loss": 0.3031, + "step": 3885 + }, + { + "epoch": 0.33151339361883636, + "grad_norm": 1.4274254920073917, + "learning_rate": 7.799781123887248e-05, + "loss": 0.3228, + "step": 3886 + }, + { + "epoch": 0.33159870329295343, + "grad_norm": 1.221517209013403, + "learning_rate": 7.798636391153056e-05, + "loss": 0.2817, + "step": 3887 + }, + { + "epoch": 0.33168401296707045, + "grad_norm": 1.374694356760319, + "learning_rate": 7.797491444757847e-05, + "loss": 0.3186, + "step": 3888 + }, + { + "epoch": 0.3317693226411875, + "grad_norm": 1.7141356516053483, + "learning_rate": 7.796346284789032e-05, + "loss": 0.3307, + "step": 3889 + }, + { + "epoch": 0.33185463231530454, + "grad_norm": 1.717576128897983, + "learning_rate": 7.795200911334036e-05, + "loss": 0.3578, + "step": 3890 + }, + { + "epoch": 0.3319399419894216, + "grad_norm": 1.357723767856874, + "learning_rate": 7.794055324480305e-05, + "loss": 0.3463, + "step": 3891 + }, + { + "epoch": 0.33202525166353863, + "grad_norm": 1.4858903879424816, + "learning_rate": 7.792909524315298e-05, + "loss": 0.3108, + "step": 3892 + }, + { + "epoch": 0.3321105613376557, + "grad_norm": 1.4910848647948867, + "learning_rate": 7.791763510926491e-05, + "loss": 0.3626, + "step": 3893 + }, + { + "epoch": 0.3321958710117727, + "grad_norm": 1.4624639685993273, + "learning_rate": 7.790617284401374e-05, + "loss": 0.329, + "step": 3894 + }, + { + "epoch": 0.3322811806858898, + "grad_norm": 1.4378565470484657, + "learning_rate": 7.789470844827458e-05, + "loss": 0.3912, + "step": 3895 + }, + { + "epoch": 0.3323664903600068, + "grad_norm": 1.2656727534574663, + "learning_rate": 7.788324192292265e-05, + "loss": 0.2707, + "step": 3896 + }, + { + "epoch": 0.3324518000341239, + "grad_norm": 1.274390864252462, + "learning_rate": 7.787177326883336e-05, + "loss": 0.2841, + "step": 3897 + }, + { + "epoch": 0.3325371097082409, + "grad_norm": 1.6385703163366503, + "learning_rate": 7.786030248688232e-05, + "loss": 0.3666, + "step": 3898 + }, + { + "epoch": 0.332622419382358, + "grad_norm": 1.5431064978811737, + "learning_rate": 7.784882957794522e-05, + "loss": 0.3153, + "step": 3899 + }, + { + "epoch": 0.332707729056475, + "grad_norm": 1.3081294360497975, + "learning_rate": 7.783735454289798e-05, + "loss": 0.3288, + "step": 3900 + }, + { + "epoch": 0.3327930387305921, + "grad_norm": 1.3004968464016398, + "learning_rate": 7.782587738261664e-05, + "loss": 0.3005, + "step": 3901 + }, + { + "epoch": 0.3328783484047091, + "grad_norm": 1.289547545758673, + "learning_rate": 7.781439809797743e-05, + "loss": 0.3066, + "step": 3902 + }, + { + "epoch": 0.33296365807882616, + "grad_norm": 1.2751034563763408, + "learning_rate": 7.780291668985672e-05, + "loss": 0.2816, + "step": 3903 + }, + { + "epoch": 0.3330489677529432, + "grad_norm": 1.5050924007907995, + "learning_rate": 7.779143315913108e-05, + "loss": 0.3331, + "step": 3904 + }, + { + "epoch": 0.33313427742706025, + "grad_norm": 1.5260799086360861, + "learning_rate": 7.777994750667719e-05, + "loss": 0.3318, + "step": 3905 + }, + { + "epoch": 0.3332195871011773, + "grad_norm": 1.5667362008321781, + "learning_rate": 7.776845973337194e-05, + "loss": 0.2851, + "step": 3906 + }, + { + "epoch": 0.33330489677529435, + "grad_norm": 1.3730614608489142, + "learning_rate": 7.775696984009236e-05, + "loss": 0.2817, + "step": 3907 + }, + { + "epoch": 0.33339020644941136, + "grad_norm": 1.6359909733847366, + "learning_rate": 7.774547782771562e-05, + "loss": 0.3526, + "step": 3908 + }, + { + "epoch": 0.3334755161235284, + "grad_norm": 1.5374305709421578, + "learning_rate": 7.773398369711908e-05, + "loss": 0.2934, + "step": 3909 + }, + { + "epoch": 0.33356082579764545, + "grad_norm": 1.7952009280921455, + "learning_rate": 7.772248744918028e-05, + "loss": 0.3123, + "step": 3910 + }, + { + "epoch": 0.3336461354717625, + "grad_norm": 1.677343605016623, + "learning_rate": 7.771098908477686e-05, + "loss": 0.276, + "step": 3911 + }, + { + "epoch": 0.33373144514587955, + "grad_norm": 1.6946688366227523, + "learning_rate": 7.769948860478669e-05, + "loss": 0.3086, + "step": 3912 + }, + { + "epoch": 0.33381675481999656, + "grad_norm": 1.3611021798461471, + "learning_rate": 7.768798601008776e-05, + "loss": 0.289, + "step": 3913 + }, + { + "epoch": 0.33390206449411364, + "grad_norm": 1.557445284646221, + "learning_rate": 7.767648130155824e-05, + "loss": 0.3428, + "step": 3914 + }, + { + "epoch": 0.33398737416823066, + "grad_norm": 1.365411356749855, + "learning_rate": 7.766497448007643e-05, + "loss": 0.2924, + "step": 3915 + }, + { + "epoch": 0.33407268384234773, + "grad_norm": 1.53315903910045, + "learning_rate": 7.765346554652085e-05, + "loss": 0.3562, + "step": 3916 + }, + { + "epoch": 0.33415799351646475, + "grad_norm": 1.411693802039434, + "learning_rate": 7.764195450177011e-05, + "loss": 0.3196, + "step": 3917 + }, + { + "epoch": 0.3342433031905818, + "grad_norm": 1.3387046843338153, + "learning_rate": 7.763044134670303e-05, + "loss": 0.3203, + "step": 3918 + }, + { + "epoch": 0.33432861286469884, + "grad_norm": 1.6847990107440127, + "learning_rate": 7.761892608219859e-05, + "loss": 0.378, + "step": 3919 + }, + { + "epoch": 0.3344139225388159, + "grad_norm": 1.6518413961444232, + "learning_rate": 7.760740870913592e-05, + "loss": 0.3585, + "step": 3920 + }, + { + "epoch": 0.33449923221293293, + "grad_norm": 1.5476226541106852, + "learning_rate": 7.759588922839427e-05, + "loss": 0.3366, + "step": 3921 + }, + { + "epoch": 0.33458454188705, + "grad_norm": 1.5953782683592101, + "learning_rate": 7.758436764085315e-05, + "loss": 0.3531, + "step": 3922 + }, + { + "epoch": 0.334669851561167, + "grad_norm": 1.4659691315090346, + "learning_rate": 7.757284394739212e-05, + "loss": 0.3, + "step": 3923 + }, + { + "epoch": 0.3347551612352841, + "grad_norm": 1.471764575355378, + "learning_rate": 7.7561318148891e-05, + "loss": 0.3562, + "step": 3924 + }, + { + "epoch": 0.3348404709094011, + "grad_norm": 1.6986919419540636, + "learning_rate": 7.75497902462297e-05, + "loss": 0.2874, + "step": 3925 + }, + { + "epoch": 0.3349257805835182, + "grad_norm": 1.6562345971884758, + "learning_rate": 7.753826024028829e-05, + "loss": 0.2983, + "step": 3926 + }, + { + "epoch": 0.3350110902576352, + "grad_norm": 1.607939144444319, + "learning_rate": 7.752672813194707e-05, + "loss": 0.3025, + "step": 3927 + }, + { + "epoch": 0.3350963999317523, + "grad_norm": 1.5438897197059671, + "learning_rate": 7.751519392208644e-05, + "loss": 0.3675, + "step": 3928 + }, + { + "epoch": 0.3351817096058693, + "grad_norm": 1.352227989808941, + "learning_rate": 7.750365761158695e-05, + "loss": 0.2955, + "step": 3929 + }, + { + "epoch": 0.33526701927998637, + "grad_norm": 1.2724110966111923, + "learning_rate": 7.749211920132937e-05, + "loss": 0.3622, + "step": 3930 + }, + { + "epoch": 0.3353523289541034, + "grad_norm": 1.6049131002698322, + "learning_rate": 7.748057869219456e-05, + "loss": 0.2959, + "step": 3931 + }, + { + "epoch": 0.33543763862822046, + "grad_norm": 1.9325837551191765, + "learning_rate": 7.746903608506362e-05, + "loss": 0.3679, + "step": 3932 + }, + { + "epoch": 0.3355229483023375, + "grad_norm": 1.432186756337768, + "learning_rate": 7.745749138081775e-05, + "loss": 0.3093, + "step": 3933 + }, + { + "epoch": 0.33560825797645455, + "grad_norm": 1.3338408050801318, + "learning_rate": 7.74459445803383e-05, + "loss": 0.2704, + "step": 3934 + }, + { + "epoch": 0.33569356765057157, + "grad_norm": 1.3454729168426134, + "learning_rate": 7.743439568450684e-05, + "loss": 0.2911, + "step": 3935 + }, + { + "epoch": 0.33577887732468864, + "grad_norm": 1.4344695947242927, + "learning_rate": 7.742284469420505e-05, + "loss": 0.3275, + "step": 3936 + }, + { + "epoch": 0.33586418699880566, + "grad_norm": 1.5104039509555898, + "learning_rate": 7.74112916103148e-05, + "loss": 0.3271, + "step": 3937 + }, + { + "epoch": 0.33594949667292273, + "grad_norm": 1.4183296185783634, + "learning_rate": 7.739973643371809e-05, + "loss": 0.3427, + "step": 3938 + }, + { + "epoch": 0.33603480634703975, + "grad_norm": 1.492967172219617, + "learning_rate": 7.738817916529713e-05, + "loss": 0.3251, + "step": 3939 + }, + { + "epoch": 0.3361201160211568, + "grad_norm": 1.6590168711958155, + "learning_rate": 7.73766198059342e-05, + "loss": 0.3637, + "step": 3940 + }, + { + "epoch": 0.33620542569527384, + "grad_norm": 1.400285405631338, + "learning_rate": 7.736505835651186e-05, + "loss": 0.3194, + "step": 3941 + }, + { + "epoch": 0.3362907353693909, + "grad_norm": 1.60398554914819, + "learning_rate": 7.73534948179127e-05, + "loss": 0.3217, + "step": 3942 + }, + { + "epoch": 0.33637604504350793, + "grad_norm": 1.4053814218902507, + "learning_rate": 7.734192919101958e-05, + "loss": 0.271, + "step": 3943 + }, + { + "epoch": 0.336461354717625, + "grad_norm": 1.4927330417899625, + "learning_rate": 7.733036147671546e-05, + "loss": 0.3725, + "step": 3944 + }, + { + "epoch": 0.336546664391742, + "grad_norm": 1.1128744888073114, + "learning_rate": 7.731879167588347e-05, + "loss": 0.258, + "step": 3945 + }, + { + "epoch": 0.33663197406585904, + "grad_norm": 1.6112674586117453, + "learning_rate": 7.730721978940693e-05, + "loss": 0.3299, + "step": 3946 + }, + { + "epoch": 0.3367172837399761, + "grad_norm": 1.6447619641041809, + "learning_rate": 7.729564581816923e-05, + "loss": 0.3155, + "step": 3947 + }, + { + "epoch": 0.33680259341409313, + "grad_norm": 1.4494298076918168, + "learning_rate": 7.728406976305406e-05, + "loss": 0.2965, + "step": 3948 + }, + { + "epoch": 0.3368879030882102, + "grad_norm": 1.4890882651950985, + "learning_rate": 7.727249162494513e-05, + "loss": 0.3036, + "step": 3949 + }, + { + "epoch": 0.3369732127623272, + "grad_norm": 1.5045267781809095, + "learning_rate": 7.72609114047264e-05, + "loss": 0.3369, + "step": 3950 + }, + { + "epoch": 0.3370585224364443, + "grad_norm": 1.3385818366567765, + "learning_rate": 7.724932910328194e-05, + "loss": 0.2996, + "step": 3951 + }, + { + "epoch": 0.3371438321105613, + "grad_norm": 1.457377783753285, + "learning_rate": 7.723774472149601e-05, + "loss": 0.3122, + "step": 3952 + }, + { + "epoch": 0.3372291417846784, + "grad_norm": 1.3611762291821599, + "learning_rate": 7.722615826025302e-05, + "loss": 0.3144, + "step": 3953 + }, + { + "epoch": 0.3373144514587954, + "grad_norm": 1.3323294565877535, + "learning_rate": 7.72145697204375e-05, + "loss": 0.3074, + "step": 3954 + }, + { + "epoch": 0.3373997611329125, + "grad_norm": 1.5642561389667895, + "learning_rate": 7.720297910293421e-05, + "loss": 0.3361, + "step": 3955 + }, + { + "epoch": 0.3374850708070295, + "grad_norm": 1.3707394481769732, + "learning_rate": 7.719138640862804e-05, + "loss": 0.3345, + "step": 3956 + }, + { + "epoch": 0.33757038048114657, + "grad_norm": 1.5612302580074684, + "learning_rate": 7.717979163840401e-05, + "loss": 0.3191, + "step": 3957 + }, + { + "epoch": 0.3376556901552636, + "grad_norm": 1.4997167319811038, + "learning_rate": 7.71681947931473e-05, + "loss": 0.2478, + "step": 3958 + }, + { + "epoch": 0.33774099982938066, + "grad_norm": 1.9504935887008161, + "learning_rate": 7.715659587374331e-05, + "loss": 0.3124, + "step": 3959 + }, + { + "epoch": 0.3378263095034977, + "grad_norm": 1.7537529077594167, + "learning_rate": 7.714499488107751e-05, + "loss": 0.3739, + "step": 3960 + }, + { + "epoch": 0.33791161917761475, + "grad_norm": 1.7208946026231247, + "learning_rate": 7.713339181603563e-05, + "loss": 0.3229, + "step": 3961 + }, + { + "epoch": 0.33799692885173177, + "grad_norm": 1.7871791471608633, + "learning_rate": 7.712178667950346e-05, + "loss": 0.308, + "step": 3962 + }, + { + "epoch": 0.33808223852584884, + "grad_norm": 1.4734490079622509, + "learning_rate": 7.711017947236699e-05, + "loss": 0.2898, + "step": 3963 + }, + { + "epoch": 0.33816754819996586, + "grad_norm": 1.4938563257147763, + "learning_rate": 7.70985701955124e-05, + "loss": 0.3183, + "step": 3964 + }, + { + "epoch": 0.33825285787408294, + "grad_norm": 1.4020523576115806, + "learning_rate": 7.708695884982597e-05, + "loss": 0.3094, + "step": 3965 + }, + { + "epoch": 0.33833816754819995, + "grad_norm": 1.546255507833282, + "learning_rate": 7.707534543619417e-05, + "loss": 0.3206, + "step": 3966 + }, + { + "epoch": 0.338423477222317, + "grad_norm": 1.3295798019383003, + "learning_rate": 7.706372995550361e-05, + "loss": 0.3174, + "step": 3967 + }, + { + "epoch": 0.33850878689643404, + "grad_norm": 1.1849542736132055, + "learning_rate": 7.70521124086411e-05, + "loss": 0.4057, + "step": 3968 + }, + { + "epoch": 0.3385940965705511, + "grad_norm": 1.474100067908597, + "learning_rate": 7.704049279649357e-05, + "loss": 0.3148, + "step": 3969 + }, + { + "epoch": 0.33867940624466814, + "grad_norm": 1.2666399147738971, + "learning_rate": 7.70288711199481e-05, + "loss": 0.2407, + "step": 3970 + }, + { + "epoch": 0.3387647159187852, + "grad_norm": 1.4943570647206421, + "learning_rate": 7.701724737989196e-05, + "loss": 0.3593, + "step": 3971 + }, + { + "epoch": 0.3388500255929022, + "grad_norm": 1.5874392159907516, + "learning_rate": 7.700562157721254e-05, + "loss": 0.323, + "step": 3972 + }, + { + "epoch": 0.3389353352670193, + "grad_norm": 1.6138863894905935, + "learning_rate": 7.69939937127974e-05, + "loss": 0.2704, + "step": 3973 + }, + { + "epoch": 0.3390206449411363, + "grad_norm": 1.6441049634927343, + "learning_rate": 7.698236378753432e-05, + "loss": 0.3605, + "step": 3974 + }, + { + "epoch": 0.3391059546152534, + "grad_norm": 1.3948229743723883, + "learning_rate": 7.697073180231114e-05, + "loss": 0.306, + "step": 3975 + }, + { + "epoch": 0.3391912642893704, + "grad_norm": 1.62369426205817, + "learning_rate": 7.69590977580159e-05, + "loss": 0.2803, + "step": 3976 + }, + { + "epoch": 0.3392765739634875, + "grad_norm": 1.4747822034998963, + "learning_rate": 7.694746165553682e-05, + "loss": 0.3716, + "step": 3977 + }, + { + "epoch": 0.3393618836376045, + "grad_norm": 1.66666735013312, + "learning_rate": 7.693582349576223e-05, + "loss": 0.3849, + "step": 3978 + }, + { + "epoch": 0.3394471933117216, + "grad_norm": 1.9768652401398887, + "learning_rate": 7.692418327958066e-05, + "loss": 0.3125, + "step": 3979 + }, + { + "epoch": 0.3395325029858386, + "grad_norm": 1.6253554615635086, + "learning_rate": 7.691254100788077e-05, + "loss": 0.278, + "step": 3980 + }, + { + "epoch": 0.33961781265995566, + "grad_norm": 1.702115030666688, + "learning_rate": 7.690089668155138e-05, + "loss": 0.3687, + "step": 3981 + }, + { + "epoch": 0.3397031223340727, + "grad_norm": 1.3544420646959163, + "learning_rate": 7.688925030148149e-05, + "loss": 0.313, + "step": 3982 + }, + { + "epoch": 0.33978843200818976, + "grad_norm": 1.455446747036618, + "learning_rate": 7.687760186856023e-05, + "loss": 0.3375, + "step": 3983 + }, + { + "epoch": 0.3398737416823068, + "grad_norm": 1.591502980441875, + "learning_rate": 7.686595138367688e-05, + "loss": 0.2941, + "step": 3984 + }, + { + "epoch": 0.3399590513564238, + "grad_norm": 1.1763321102561513, + "learning_rate": 7.685429884772092e-05, + "loss": 0.3086, + "step": 3985 + }, + { + "epoch": 0.34004436103054086, + "grad_norm": 1.5322828896394542, + "learning_rate": 7.684264426158194e-05, + "loss": 0.355, + "step": 3986 + }, + { + "epoch": 0.3401296707046579, + "grad_norm": 1.3084801012478688, + "learning_rate": 7.683098762614971e-05, + "loss": 0.269, + "step": 3987 + }, + { + "epoch": 0.34021498037877496, + "grad_norm": 1.6098025596582188, + "learning_rate": 7.681932894231417e-05, + "loss": 0.2988, + "step": 3988 + }, + { + "epoch": 0.340300290052892, + "grad_norm": 1.3135492581242154, + "learning_rate": 7.680766821096537e-05, + "loss": 0.2996, + "step": 3989 + }, + { + "epoch": 0.34038559972700905, + "grad_norm": 1.4024991345190414, + "learning_rate": 7.679600543299356e-05, + "loss": 0.3235, + "step": 3990 + }, + { + "epoch": 0.34047090940112607, + "grad_norm": 1.6492285977082677, + "learning_rate": 7.678434060928913e-05, + "loss": 0.3595, + "step": 3991 + }, + { + "epoch": 0.34055621907524314, + "grad_norm": 1.7840003480440494, + "learning_rate": 7.677267374074262e-05, + "loss": 0.3966, + "step": 3992 + }, + { + "epoch": 0.34064152874936016, + "grad_norm": 1.8038688222935177, + "learning_rate": 7.676100482824476e-05, + "loss": 0.3421, + "step": 3993 + }, + { + "epoch": 0.34072683842347723, + "grad_norm": 1.4072793372249686, + "learning_rate": 7.674933387268637e-05, + "loss": 0.2656, + "step": 3994 + }, + { + "epoch": 0.34081214809759425, + "grad_norm": 1.474861254874631, + "learning_rate": 7.673766087495848e-05, + "loss": 0.3499, + "step": 3995 + }, + { + "epoch": 0.3408974577717113, + "grad_norm": 1.4108494457594876, + "learning_rate": 7.672598583595227e-05, + "loss": 0.3364, + "step": 3996 + }, + { + "epoch": 0.34098276744582834, + "grad_norm": 1.3219315573134145, + "learning_rate": 7.671430875655907e-05, + "loss": 0.2923, + "step": 3997 + }, + { + "epoch": 0.3410680771199454, + "grad_norm": 1.3971494144853618, + "learning_rate": 7.670262963767037e-05, + "loss": 0.3407, + "step": 3998 + }, + { + "epoch": 0.34115338679406243, + "grad_norm": 1.579002967073094, + "learning_rate": 7.669094848017777e-05, + "loss": 0.329, + "step": 3999 + }, + { + "epoch": 0.3412386964681795, + "grad_norm": 1.660576693083846, + "learning_rate": 7.667926528497311e-05, + "loss": 0.3666, + "step": 4000 + }, + { + "epoch": 0.3413240061422965, + "grad_norm": 1.352765027942841, + "learning_rate": 7.66675800529483e-05, + "loss": 0.2804, + "step": 4001 + }, + { + "epoch": 0.3414093158164136, + "grad_norm": 1.5221032192804937, + "learning_rate": 7.665589278499547e-05, + "loss": 0.3208, + "step": 4002 + }, + { + "epoch": 0.3414946254905306, + "grad_norm": 1.5937215765120147, + "learning_rate": 7.664420348200689e-05, + "loss": 0.3247, + "step": 4003 + }, + { + "epoch": 0.3415799351646477, + "grad_norm": 1.2340778041343683, + "learning_rate": 7.663251214487495e-05, + "loss": 0.3269, + "step": 4004 + }, + { + "epoch": 0.3416652448387647, + "grad_norm": 1.4674362637977554, + "learning_rate": 7.662081877449221e-05, + "loss": 0.3312, + "step": 4005 + }, + { + "epoch": 0.3417505545128818, + "grad_norm": 1.3932202585476263, + "learning_rate": 7.660912337175145e-05, + "loss": 0.3091, + "step": 4006 + }, + { + "epoch": 0.3418358641869988, + "grad_norm": 1.6014140734861553, + "learning_rate": 7.659742593754551e-05, + "loss": 0.2796, + "step": 4007 + }, + { + "epoch": 0.34192117386111587, + "grad_norm": 1.4357149402044846, + "learning_rate": 7.658572647276744e-05, + "loss": 0.3127, + "step": 4008 + }, + { + "epoch": 0.3420064835352329, + "grad_norm": 1.3885132260377775, + "learning_rate": 7.657402497831044e-05, + "loss": 0.3471, + "step": 4009 + }, + { + "epoch": 0.34209179320934996, + "grad_norm": 1.4325739172061165, + "learning_rate": 7.656232145506784e-05, + "loss": 0.2874, + "step": 4010 + }, + { + "epoch": 0.342177102883467, + "grad_norm": 1.4979959451903202, + "learning_rate": 7.655061590393314e-05, + "loss": 0.3023, + "step": 4011 + }, + { + "epoch": 0.34226241255758405, + "grad_norm": 1.4185688020722746, + "learning_rate": 7.653890832580002e-05, + "loss": 0.3358, + "step": 4012 + }, + { + "epoch": 0.34234772223170107, + "grad_norm": 1.2603940830361797, + "learning_rate": 7.652719872156226e-05, + "loss": 0.3126, + "step": 4013 + }, + { + "epoch": 0.34243303190581814, + "grad_norm": 1.4568494422533163, + "learning_rate": 7.651548709211384e-05, + "loss": 0.3401, + "step": 4014 + }, + { + "epoch": 0.34251834157993516, + "grad_norm": 1.5095048805099793, + "learning_rate": 7.650377343834891e-05, + "loss": 0.3185, + "step": 4015 + }, + { + "epoch": 0.34260365125405223, + "grad_norm": 1.2824448387374394, + "learning_rate": 7.64920577611617e-05, + "loss": 0.3207, + "step": 4016 + }, + { + "epoch": 0.34268896092816925, + "grad_norm": 1.4098760691078205, + "learning_rate": 7.648034006144667e-05, + "loss": 0.2471, + "step": 4017 + }, + { + "epoch": 0.3427742706022863, + "grad_norm": 1.9111008132181517, + "learning_rate": 7.646862034009837e-05, + "loss": 0.3188, + "step": 4018 + }, + { + "epoch": 0.34285958027640334, + "grad_norm": 1.322348114550975, + "learning_rate": 7.645689859801157e-05, + "loss": 0.2975, + "step": 4019 + }, + { + "epoch": 0.3429448899505204, + "grad_norm": 1.3778233152515755, + "learning_rate": 7.644517483608116e-05, + "loss": 0.3052, + "step": 4020 + }, + { + "epoch": 0.34303019962463743, + "grad_norm": 1.7301114576741412, + "learning_rate": 7.64334490552022e-05, + "loss": 0.2964, + "step": 4021 + }, + { + "epoch": 0.3431155092987545, + "grad_norm": 1.58577002030938, + "learning_rate": 7.642172125626986e-05, + "loss": 0.304, + "step": 4022 + }, + { + "epoch": 0.3432008189728715, + "grad_norm": 1.5041485479783492, + "learning_rate": 7.64099914401795e-05, + "loss": 0.3027, + "step": 4023 + }, + { + "epoch": 0.34328612864698854, + "grad_norm": 1.5131414456204502, + "learning_rate": 7.639825960782663e-05, + "loss": 0.317, + "step": 4024 + }, + { + "epoch": 0.3433714383211056, + "grad_norm": 1.3815066534980434, + "learning_rate": 7.638652576010692e-05, + "loss": 0.3254, + "step": 4025 + }, + { + "epoch": 0.34345674799522263, + "grad_norm": 1.6041499380585325, + "learning_rate": 7.637478989791618e-05, + "loss": 0.3953, + "step": 4026 + }, + { + "epoch": 0.3435420576693397, + "grad_norm": 1.8407583372980814, + "learning_rate": 7.636305202215041e-05, + "loss": 0.3873, + "step": 4027 + }, + { + "epoch": 0.3436273673434567, + "grad_norm": 1.3562832454702665, + "learning_rate": 7.63513121337057e-05, + "loss": 0.2829, + "step": 4028 + }, + { + "epoch": 0.3437126770175738, + "grad_norm": 1.535618836107564, + "learning_rate": 7.633957023347833e-05, + "loss": 0.3287, + "step": 4029 + }, + { + "epoch": 0.3437979866916908, + "grad_norm": 1.8178830941762445, + "learning_rate": 7.632782632236474e-05, + "loss": 0.2884, + "step": 4030 + }, + { + "epoch": 0.3438832963658079, + "grad_norm": 1.4759917594245437, + "learning_rate": 7.631608040126154e-05, + "loss": 0.274, + "step": 4031 + }, + { + "epoch": 0.3439686060399249, + "grad_norm": 1.695344704880243, + "learning_rate": 7.630433247106543e-05, + "loss": 0.3329, + "step": 4032 + }, + { + "epoch": 0.344053915714042, + "grad_norm": 1.3150633349233105, + "learning_rate": 7.629258253267332e-05, + "loss": 0.3568, + "step": 4033 + }, + { + "epoch": 0.344139225388159, + "grad_norm": 1.5845326181595236, + "learning_rate": 7.628083058698226e-05, + "loss": 0.3103, + "step": 4034 + }, + { + "epoch": 0.34422453506227607, + "grad_norm": 1.2611267779735986, + "learning_rate": 7.626907663488943e-05, + "loss": 0.2963, + "step": 4035 + }, + { + "epoch": 0.3443098447363931, + "grad_norm": 1.8803749609436669, + "learning_rate": 7.625732067729219e-05, + "loss": 0.3988, + "step": 4036 + }, + { + "epoch": 0.34439515441051016, + "grad_norm": 1.390685990956992, + "learning_rate": 7.624556271508805e-05, + "loss": 0.2852, + "step": 4037 + }, + { + "epoch": 0.3444804640846272, + "grad_norm": 1.3016508477715498, + "learning_rate": 7.623380274917467e-05, + "loss": 0.3273, + "step": 4038 + }, + { + "epoch": 0.34456577375874425, + "grad_norm": 1.633775490968363, + "learning_rate": 7.622204078044985e-05, + "loss": 0.3624, + "step": 4039 + }, + { + "epoch": 0.34465108343286127, + "grad_norm": 1.3898443967186216, + "learning_rate": 7.621027680981155e-05, + "loss": 0.3227, + "step": 4040 + }, + { + "epoch": 0.34473639310697834, + "grad_norm": 1.3124935967425415, + "learning_rate": 7.619851083815793e-05, + "loss": 0.335, + "step": 4041 + }, + { + "epoch": 0.34482170278109536, + "grad_norm": 1.4801624152273494, + "learning_rate": 7.61867428663872e-05, + "loss": 0.2769, + "step": 4042 + }, + { + "epoch": 0.34490701245521244, + "grad_norm": 1.2315905104972493, + "learning_rate": 7.617497289539781e-05, + "loss": 0.2637, + "step": 4043 + }, + { + "epoch": 0.34499232212932945, + "grad_norm": 1.5178894744763447, + "learning_rate": 7.616320092608834e-05, + "loss": 0.284, + "step": 4044 + }, + { + "epoch": 0.3450776318034465, + "grad_norm": 1.4841626467142526, + "learning_rate": 7.615142695935751e-05, + "loss": 0.2994, + "step": 4045 + }, + { + "epoch": 0.34516294147756355, + "grad_norm": 1.361737883319037, + "learning_rate": 7.613965099610419e-05, + "loss": 0.3082, + "step": 4046 + }, + { + "epoch": 0.3452482511516806, + "grad_norm": 1.4330508168082134, + "learning_rate": 7.612787303722744e-05, + "loss": 0.2881, + "step": 4047 + }, + { + "epoch": 0.34533356082579764, + "grad_norm": 1.3656385093736563, + "learning_rate": 7.61160930836264e-05, + "loss": 0.3008, + "step": 4048 + }, + { + "epoch": 0.3454188704999147, + "grad_norm": 1.4015089009900876, + "learning_rate": 7.610431113620046e-05, + "loss": 0.319, + "step": 4049 + }, + { + "epoch": 0.3455041801740317, + "grad_norm": 1.170258589802889, + "learning_rate": 7.609252719584907e-05, + "loss": 0.2734, + "step": 4050 + }, + { + "epoch": 0.3455894898481488, + "grad_norm": 1.8928108505284662, + "learning_rate": 7.608074126347189e-05, + "loss": 0.323, + "step": 4051 + }, + { + "epoch": 0.3456747995222658, + "grad_norm": 1.5732550688582676, + "learning_rate": 7.606895333996871e-05, + "loss": 0.3416, + "step": 4052 + }, + { + "epoch": 0.3457601091963829, + "grad_norm": 1.5893456430907877, + "learning_rate": 7.605716342623948e-05, + "loss": 0.3477, + "step": 4053 + }, + { + "epoch": 0.3458454188704999, + "grad_norm": 1.4243331453100203, + "learning_rate": 7.604537152318427e-05, + "loss": 0.2566, + "step": 4054 + }, + { + "epoch": 0.345930728544617, + "grad_norm": 1.5518401636030632, + "learning_rate": 7.603357763170336e-05, + "loss": 0.2926, + "step": 4055 + }, + { + "epoch": 0.346016038218734, + "grad_norm": 1.78766707593202, + "learning_rate": 7.602178175269713e-05, + "loss": 0.3484, + "step": 4056 + }, + { + "epoch": 0.3461013478928511, + "grad_norm": 1.377320455699284, + "learning_rate": 7.600998388706615e-05, + "loss": 0.32, + "step": 4057 + }, + { + "epoch": 0.3461866575669681, + "grad_norm": 1.1929486406051621, + "learning_rate": 7.599818403571112e-05, + "loss": 0.2711, + "step": 4058 + }, + { + "epoch": 0.34627196724108517, + "grad_norm": 1.4885353011717501, + "learning_rate": 7.598638219953289e-05, + "loss": 0.3159, + "step": 4059 + }, + { + "epoch": 0.3463572769152022, + "grad_norm": 1.4836365318457616, + "learning_rate": 7.597457837943247e-05, + "loss": 0.3071, + "step": 4060 + }, + { + "epoch": 0.3464425865893192, + "grad_norm": 1.1514592283569929, + "learning_rate": 7.596277257631102e-05, + "loss": 0.3206, + "step": 4061 + }, + { + "epoch": 0.3465278962634363, + "grad_norm": 1.5907813454404929, + "learning_rate": 7.595096479106985e-05, + "loss": 0.3468, + "step": 4062 + }, + { + "epoch": 0.3466132059375533, + "grad_norm": 1.6286431335615523, + "learning_rate": 7.593915502461042e-05, + "loss": 0.3476, + "step": 4063 + }, + { + "epoch": 0.34669851561167037, + "grad_norm": 1.5639501327408734, + "learning_rate": 7.592734327783435e-05, + "loss": 0.3175, + "step": 4064 + }, + { + "epoch": 0.3467838252857874, + "grad_norm": 1.3113974981562497, + "learning_rate": 7.59155295516434e-05, + "loss": 0.3072, + "step": 4065 + }, + { + "epoch": 0.34686913495990446, + "grad_norm": 1.414231432523959, + "learning_rate": 7.590371384693947e-05, + "loss": 0.3068, + "step": 4066 + }, + { + "epoch": 0.3469544446340215, + "grad_norm": 1.4930094906504658, + "learning_rate": 7.589189616462465e-05, + "loss": 0.3041, + "step": 4067 + }, + { + "epoch": 0.34703975430813855, + "grad_norm": 1.5718571292172265, + "learning_rate": 7.588007650560116e-05, + "loss": 0.304, + "step": 4068 + }, + { + "epoch": 0.34712506398225557, + "grad_norm": 1.6946339457844806, + "learning_rate": 7.586825487077132e-05, + "loss": 0.308, + "step": 4069 + }, + { + "epoch": 0.34721037365637264, + "grad_norm": 1.3680662110629636, + "learning_rate": 7.585643126103772e-05, + "loss": 0.3071, + "step": 4070 + }, + { + "epoch": 0.34729568333048966, + "grad_norm": 1.725300254121922, + "learning_rate": 7.584460567730298e-05, + "loss": 0.284, + "step": 4071 + }, + { + "epoch": 0.34738099300460673, + "grad_norm": 1.7706657629575684, + "learning_rate": 7.583277812046993e-05, + "loss": 0.2896, + "step": 4072 + }, + { + "epoch": 0.34746630267872375, + "grad_norm": 1.6346073780467865, + "learning_rate": 7.582094859144155e-05, + "loss": 0.3496, + "step": 4073 + }, + { + "epoch": 0.3475516123528408, + "grad_norm": 1.3797566760510427, + "learning_rate": 7.580911709112096e-05, + "loss": 0.2955, + "step": 4074 + }, + { + "epoch": 0.34763692202695784, + "grad_norm": 1.5427704116244934, + "learning_rate": 7.579728362041142e-05, + "loss": 0.3372, + "step": 4075 + }, + { + "epoch": 0.3477222317010749, + "grad_norm": 1.4062231697066132, + "learning_rate": 7.578544818021638e-05, + "loss": 0.3148, + "step": 4076 + }, + { + "epoch": 0.34780754137519193, + "grad_norm": 1.196500100964336, + "learning_rate": 7.577361077143939e-05, + "loss": 0.3191, + "step": 4077 + }, + { + "epoch": 0.347892851049309, + "grad_norm": 1.4102298791319308, + "learning_rate": 7.576177139498417e-05, + "loss": 0.2405, + "step": 4078 + }, + { + "epoch": 0.347978160723426, + "grad_norm": 1.5289611058050414, + "learning_rate": 7.574993005175459e-05, + "loss": 0.2994, + "step": 4079 + }, + { + "epoch": 0.3480634703975431, + "grad_norm": 1.5648426042572565, + "learning_rate": 7.57380867426547e-05, + "loss": 0.333, + "step": 4080 + }, + { + "epoch": 0.3481487800716601, + "grad_norm": 1.103932529792683, + "learning_rate": 7.572624146858867e-05, + "loss": 0.3406, + "step": 4081 + }, + { + "epoch": 0.3482340897457772, + "grad_norm": 1.1219149781651019, + "learning_rate": 7.571439423046079e-05, + "loss": 0.2933, + "step": 4082 + }, + { + "epoch": 0.3483193994198942, + "grad_norm": 1.434153434578593, + "learning_rate": 7.570254502917557e-05, + "loss": 0.3036, + "step": 4083 + }, + { + "epoch": 0.3484047090940113, + "grad_norm": 1.4100616507883788, + "learning_rate": 7.569069386563762e-05, + "loss": 0.3406, + "step": 4084 + }, + { + "epoch": 0.3484900187681283, + "grad_norm": 1.3114056339032942, + "learning_rate": 7.567884074075173e-05, + "loss": 0.2779, + "step": 4085 + }, + { + "epoch": 0.34857532844224537, + "grad_norm": 1.3448661781586864, + "learning_rate": 7.566698565542279e-05, + "loss": 0.2579, + "step": 4086 + }, + { + "epoch": 0.3486606381163624, + "grad_norm": 1.2339912977081804, + "learning_rate": 7.565512861055589e-05, + "loss": 0.296, + "step": 4087 + }, + { + "epoch": 0.34874594779047946, + "grad_norm": 1.2580899234739025, + "learning_rate": 7.564326960705624e-05, + "loss": 0.2506, + "step": 4088 + }, + { + "epoch": 0.3488312574645965, + "grad_norm": 1.2688520756893134, + "learning_rate": 7.563140864582925e-05, + "loss": 0.2509, + "step": 4089 + }, + { + "epoch": 0.34891656713871355, + "grad_norm": 1.3385696358733423, + "learning_rate": 7.561954572778038e-05, + "loss": 0.274, + "step": 4090 + }, + { + "epoch": 0.34900187681283057, + "grad_norm": 1.3440575802111436, + "learning_rate": 7.560768085381535e-05, + "loss": 0.2935, + "step": 4091 + }, + { + "epoch": 0.34908718648694764, + "grad_norm": 1.5781488700043467, + "learning_rate": 7.559581402483999e-05, + "loss": 0.2973, + "step": 4092 + }, + { + "epoch": 0.34917249616106466, + "grad_norm": 1.6693068414653833, + "learning_rate": 7.558394524176023e-05, + "loss": 0.4025, + "step": 4093 + }, + { + "epoch": 0.34925780583518173, + "grad_norm": 1.4364815296774165, + "learning_rate": 7.557207450548219e-05, + "loss": 0.2991, + "step": 4094 + }, + { + "epoch": 0.34934311550929875, + "grad_norm": 1.20372549156837, + "learning_rate": 7.556020181691217e-05, + "loss": 0.3148, + "step": 4095 + }, + { + "epoch": 0.3494284251834158, + "grad_norm": 1.6545374671703224, + "learning_rate": 7.554832717695656e-05, + "loss": 0.3257, + "step": 4096 + }, + { + "epoch": 0.34951373485753284, + "grad_norm": 1.641490598941576, + "learning_rate": 7.553645058652193e-05, + "loss": 0.3451, + "step": 4097 + }, + { + "epoch": 0.3495990445316499, + "grad_norm": 1.4703905399055552, + "learning_rate": 7.552457204651499e-05, + "loss": 0.2587, + "step": 4098 + }, + { + "epoch": 0.34968435420576693, + "grad_norm": 1.4368817202231285, + "learning_rate": 7.551269155784264e-05, + "loss": 0.3177, + "step": 4099 + }, + { + "epoch": 0.34976966387988395, + "grad_norm": 1.310234157586576, + "learning_rate": 7.550080912141184e-05, + "loss": 0.3147, + "step": 4100 + }, + { + "epoch": 0.349854973554001, + "grad_norm": 1.6547680108472942, + "learning_rate": 7.548892473812979e-05, + "loss": 0.327, + "step": 4101 + }, + { + "epoch": 0.34994028322811804, + "grad_norm": 1.252587882059664, + "learning_rate": 7.547703840890377e-05, + "loss": 0.335, + "step": 4102 + }, + { + "epoch": 0.3500255929022351, + "grad_norm": 1.4489425025731488, + "learning_rate": 7.546515013464125e-05, + "loss": 0.3166, + "step": 4103 + }, + { + "epoch": 0.35011090257635213, + "grad_norm": 1.5939719849617098, + "learning_rate": 7.545325991624986e-05, + "loss": 0.3155, + "step": 4104 + }, + { + "epoch": 0.3501962122504692, + "grad_norm": 1.495717133212102, + "learning_rate": 7.54413677546373e-05, + "loss": 0.3214, + "step": 4105 + }, + { + "epoch": 0.3502815219245862, + "grad_norm": 1.2977284759870593, + "learning_rate": 7.542947365071153e-05, + "loss": 0.3029, + "step": 4106 + }, + { + "epoch": 0.3503668315987033, + "grad_norm": 1.5921248583689127, + "learning_rate": 7.54175776053806e-05, + "loss": 0.3321, + "step": 4107 + }, + { + "epoch": 0.3504521412728203, + "grad_norm": 1.163097416767449, + "learning_rate": 7.540567961955267e-05, + "loss": 0.2762, + "step": 4108 + }, + { + "epoch": 0.3505374509469374, + "grad_norm": 1.5649523754003245, + "learning_rate": 7.539377969413608e-05, + "loss": 0.3308, + "step": 4109 + }, + { + "epoch": 0.3506227606210544, + "grad_norm": 1.6431287280389015, + "learning_rate": 7.538187783003939e-05, + "loss": 0.2689, + "step": 4110 + }, + { + "epoch": 0.3507080702951715, + "grad_norm": 1.4743762912073501, + "learning_rate": 7.536997402817119e-05, + "loss": 0.2915, + "step": 4111 + }, + { + "epoch": 0.3507933799692885, + "grad_norm": 1.5916707556440681, + "learning_rate": 7.535806828944028e-05, + "loss": 0.2698, + "step": 4112 + }, + { + "epoch": 0.3508786896434056, + "grad_norm": 1.9907851126339724, + "learning_rate": 7.534616061475563e-05, + "loss": 0.3693, + "step": 4113 + }, + { + "epoch": 0.3509639993175226, + "grad_norm": 1.5237396723058503, + "learning_rate": 7.533425100502629e-05, + "loss": 0.3653, + "step": 4114 + }, + { + "epoch": 0.35104930899163966, + "grad_norm": 1.3731086033366067, + "learning_rate": 7.532233946116151e-05, + "loss": 0.2875, + "step": 4115 + }, + { + "epoch": 0.3511346186657567, + "grad_norm": 1.8031621654785872, + "learning_rate": 7.53104259840707e-05, + "loss": 0.4071, + "step": 4116 + }, + { + "epoch": 0.35121992833987375, + "grad_norm": 1.3112265448931664, + "learning_rate": 7.529851057466336e-05, + "loss": 0.3067, + "step": 4117 + }, + { + "epoch": 0.3513052380139908, + "grad_norm": 1.31155125069401, + "learning_rate": 7.528659323384914e-05, + "loss": 0.3249, + "step": 4118 + }, + { + "epoch": 0.35139054768810785, + "grad_norm": 1.453463935291773, + "learning_rate": 7.527467396253792e-05, + "loss": 0.2815, + "step": 4119 + }, + { + "epoch": 0.35147585736222486, + "grad_norm": 1.9938749459557044, + "learning_rate": 7.526275276163966e-05, + "loss": 0.3056, + "step": 4120 + }, + { + "epoch": 0.35156116703634194, + "grad_norm": 1.639657961719472, + "learning_rate": 7.525082963206446e-05, + "loss": 0.3612, + "step": 4121 + }, + { + "epoch": 0.35164647671045896, + "grad_norm": 1.547053163074845, + "learning_rate": 7.523890457472261e-05, + "loss": 0.3241, + "step": 4122 + }, + { + "epoch": 0.35173178638457603, + "grad_norm": 1.3073050914951512, + "learning_rate": 7.522697759052451e-05, + "loss": 0.2788, + "step": 4123 + }, + { + "epoch": 0.35181709605869305, + "grad_norm": 1.3616056008966724, + "learning_rate": 7.521504868038073e-05, + "loss": 0.3158, + "step": 4124 + }, + { + "epoch": 0.3519024057328101, + "grad_norm": 1.4010233919258845, + "learning_rate": 7.520311784520197e-05, + "loss": 0.3454, + "step": 4125 + }, + { + "epoch": 0.35198771540692714, + "grad_norm": 1.5169208139008945, + "learning_rate": 7.519118508589911e-05, + "loss": 0.2744, + "step": 4126 + }, + { + "epoch": 0.3520730250810442, + "grad_norm": 1.6023258529995308, + "learning_rate": 7.517925040338312e-05, + "loss": 0.2746, + "step": 4127 + }, + { + "epoch": 0.35215833475516123, + "grad_norm": 1.3444247658570578, + "learning_rate": 7.516731379856517e-05, + "loss": 0.2539, + "step": 4128 + }, + { + "epoch": 0.3522436444292783, + "grad_norm": 1.3789300241122882, + "learning_rate": 7.515537527235655e-05, + "loss": 0.3573, + "step": 4129 + }, + { + "epoch": 0.3523289541033953, + "grad_norm": 1.45730195586049, + "learning_rate": 7.514343482566869e-05, + "loss": 0.2889, + "step": 4130 + }, + { + "epoch": 0.3524142637775124, + "grad_norm": 1.9558430793865038, + "learning_rate": 7.51314924594132e-05, + "loss": 0.2881, + "step": 4131 + }, + { + "epoch": 0.3524995734516294, + "grad_norm": 2.055531607594331, + "learning_rate": 7.511954817450181e-05, + "loss": 0.3734, + "step": 4132 + }, + { + "epoch": 0.3525848831257465, + "grad_norm": 1.4027894568298092, + "learning_rate": 7.51076019718464e-05, + "loss": 0.305, + "step": 4133 + }, + { + "epoch": 0.3526701927998635, + "grad_norm": 1.6639732372717686, + "learning_rate": 7.509565385235901e-05, + "loss": 0.3072, + "step": 4134 + }, + { + "epoch": 0.3527555024739806, + "grad_norm": 1.1673456271297786, + "learning_rate": 7.50837038169518e-05, + "loss": 0.2997, + "step": 4135 + }, + { + "epoch": 0.3528408121480976, + "grad_norm": 1.5373506271870694, + "learning_rate": 7.50717518665371e-05, + "loss": 0.3025, + "step": 4136 + }, + { + "epoch": 0.35292612182221467, + "grad_norm": 1.7573794361288875, + "learning_rate": 7.505979800202739e-05, + "loss": 0.3232, + "step": 4137 + }, + { + "epoch": 0.3530114314963317, + "grad_norm": 1.622206340600284, + "learning_rate": 7.504784222433525e-05, + "loss": 0.2917, + "step": 4138 + }, + { + "epoch": 0.3530967411704487, + "grad_norm": 1.5484647962087017, + "learning_rate": 7.503588453437347e-05, + "loss": 0.3148, + "step": 4139 + }, + { + "epoch": 0.3531820508445658, + "grad_norm": 1.626406574379336, + "learning_rate": 7.502392493305494e-05, + "loss": 0.2785, + "step": 4140 + }, + { + "epoch": 0.3532673605186828, + "grad_norm": 1.4869337013718502, + "learning_rate": 7.501196342129273e-05, + "loss": 0.3461, + "step": 4141 + }, + { + "epoch": 0.35335267019279987, + "grad_norm": 1.4256208773782901, + "learning_rate": 7.500000000000001e-05, + "loss": 0.3281, + "step": 4142 + }, + { + "epoch": 0.3534379798669169, + "grad_norm": 1.5872858638968617, + "learning_rate": 7.498803467009013e-05, + "loss": 0.3651, + "step": 4143 + }, + { + "epoch": 0.35352328954103396, + "grad_norm": 1.5283387984355252, + "learning_rate": 7.497606743247662e-05, + "loss": 0.3007, + "step": 4144 + }, + { + "epoch": 0.353608599215151, + "grad_norm": 1.5003152357099143, + "learning_rate": 7.496409828807307e-05, + "loss": 0.3449, + "step": 4145 + }, + { + "epoch": 0.35369390888926805, + "grad_norm": 1.4233263594183307, + "learning_rate": 7.495212723779327e-05, + "loss": 0.2734, + "step": 4146 + }, + { + "epoch": 0.35377921856338507, + "grad_norm": 1.4300046475208228, + "learning_rate": 7.494015428255116e-05, + "loss": 0.3114, + "step": 4147 + }, + { + "epoch": 0.35386452823750214, + "grad_norm": 1.266357398341105, + "learning_rate": 7.49281794232608e-05, + "loss": 0.2705, + "step": 4148 + }, + { + "epoch": 0.35394983791161916, + "grad_norm": 1.7008213247012014, + "learning_rate": 7.491620266083641e-05, + "loss": 0.3043, + "step": 4149 + }, + { + "epoch": 0.35403514758573623, + "grad_norm": 1.389950792415877, + "learning_rate": 7.490422399619235e-05, + "loss": 0.3158, + "step": 4150 + }, + { + "epoch": 0.35412045725985325, + "grad_norm": 1.3093855554706035, + "learning_rate": 7.489224343024313e-05, + "loss": 0.2677, + "step": 4151 + }, + { + "epoch": 0.3542057669339703, + "grad_norm": 1.2056975171718771, + "learning_rate": 7.488026096390339e-05, + "loss": 0.2544, + "step": 4152 + }, + { + "epoch": 0.35429107660808734, + "grad_norm": 1.3901506965237294, + "learning_rate": 7.486827659808796e-05, + "loss": 0.2786, + "step": 4153 + }, + { + "epoch": 0.3543763862822044, + "grad_norm": 1.4110196500500811, + "learning_rate": 7.485629033371175e-05, + "loss": 0.3179, + "step": 4154 + }, + { + "epoch": 0.35446169595632143, + "grad_norm": 1.1256501650287427, + "learning_rate": 7.484430217168985e-05, + "loss": 0.2729, + "step": 4155 + }, + { + "epoch": 0.3545470056304385, + "grad_norm": 1.4109723379369126, + "learning_rate": 7.483231211293751e-05, + "loss": 0.2497, + "step": 4156 + }, + { + "epoch": 0.3546323153045555, + "grad_norm": 1.3880243307311733, + "learning_rate": 7.482032015837009e-05, + "loss": 0.2828, + "step": 4157 + }, + { + "epoch": 0.3547176249786726, + "grad_norm": 2.203051085618749, + "learning_rate": 7.480832630890315e-05, + "loss": 0.3539, + "step": 4158 + }, + { + "epoch": 0.3548029346527896, + "grad_norm": 1.5888355254316853, + "learning_rate": 7.47963305654523e-05, + "loss": 0.2767, + "step": 4159 + }, + { + "epoch": 0.3548882443269067, + "grad_norm": 1.4808956124700212, + "learning_rate": 7.478433292893339e-05, + "loss": 0.2981, + "step": 4160 + }, + { + "epoch": 0.3549735540010237, + "grad_norm": 1.2384139510328742, + "learning_rate": 7.477233340026235e-05, + "loss": 0.338, + "step": 4161 + }, + { + "epoch": 0.3550588636751408, + "grad_norm": 1.6233144968703728, + "learning_rate": 7.476033198035531e-05, + "loss": 0.263, + "step": 4162 + }, + { + "epoch": 0.3551441733492578, + "grad_norm": 1.6678393847575337, + "learning_rate": 7.47483286701285e-05, + "loss": 0.3213, + "step": 4163 + }, + { + "epoch": 0.35522948302337487, + "grad_norm": 1.444358882244643, + "learning_rate": 7.473632347049831e-05, + "loss": 0.2465, + "step": 4164 + }, + { + "epoch": 0.3553147926974919, + "grad_norm": 1.4073903440510949, + "learning_rate": 7.472431638238127e-05, + "loss": 0.2983, + "step": 4165 + }, + { + "epoch": 0.35540010237160896, + "grad_norm": 1.6340053162132264, + "learning_rate": 7.471230740669405e-05, + "loss": 0.2679, + "step": 4166 + }, + { + "epoch": 0.355485412045726, + "grad_norm": 1.7243881688543232, + "learning_rate": 7.470029654435349e-05, + "loss": 0.3384, + "step": 4167 + }, + { + "epoch": 0.35557072171984305, + "grad_norm": 1.563595502190453, + "learning_rate": 7.468828379627653e-05, + "loss": 0.2607, + "step": 4168 + }, + { + "epoch": 0.35565603139396007, + "grad_norm": 1.5989946574538765, + "learning_rate": 7.467626916338032e-05, + "loss": 0.2704, + "step": 4169 + }, + { + "epoch": 0.35574134106807714, + "grad_norm": 1.2950643745829644, + "learning_rate": 7.466425264658208e-05, + "loss": 0.2776, + "step": 4170 + }, + { + "epoch": 0.35582665074219416, + "grad_norm": 1.5390310526312114, + "learning_rate": 7.46522342467992e-05, + "loss": 0.3212, + "step": 4171 + }, + { + "epoch": 0.35591196041631123, + "grad_norm": 1.4328080604675804, + "learning_rate": 7.464021396494925e-05, + "loss": 0.3127, + "step": 4172 + }, + { + "epoch": 0.35599727009042825, + "grad_norm": 1.3722706495758248, + "learning_rate": 7.462819180194991e-05, + "loss": 0.3046, + "step": 4173 + }, + { + "epoch": 0.3560825797645453, + "grad_norm": 1.382616147686098, + "learning_rate": 7.461616775871899e-05, + "loss": 0.3287, + "step": 4174 + }, + { + "epoch": 0.35616788943866234, + "grad_norm": 1.6908364690780078, + "learning_rate": 7.460414183617447e-05, + "loss": 0.3041, + "step": 4175 + }, + { + "epoch": 0.35625319911277936, + "grad_norm": 1.3343316850622668, + "learning_rate": 7.459211403523447e-05, + "loss": 0.2607, + "step": 4176 + }, + { + "epoch": 0.35633850878689644, + "grad_norm": 1.3024817912463575, + "learning_rate": 7.458008435681725e-05, + "loss": 0.3041, + "step": 4177 + }, + { + "epoch": 0.35642381846101345, + "grad_norm": 1.5170511044951327, + "learning_rate": 7.456805280184121e-05, + "loss": 0.2952, + "step": 4178 + }, + { + "epoch": 0.3565091281351305, + "grad_norm": 1.340921218975924, + "learning_rate": 7.455601937122489e-05, + "loss": 0.3189, + "step": 4179 + }, + { + "epoch": 0.35659443780924754, + "grad_norm": 1.314331819965079, + "learning_rate": 7.454398406588697e-05, + "loss": 0.3223, + "step": 4180 + }, + { + "epoch": 0.3566797474833646, + "grad_norm": 1.9313878895936387, + "learning_rate": 7.45319468867463e-05, + "loss": 0.336, + "step": 4181 + }, + { + "epoch": 0.35676505715748164, + "grad_norm": 1.5041286551761526, + "learning_rate": 7.451990783472186e-05, + "loss": 0.3258, + "step": 4182 + }, + { + "epoch": 0.3568503668315987, + "grad_norm": 1.6054708290550712, + "learning_rate": 7.450786691073274e-05, + "loss": 0.2921, + "step": 4183 + }, + { + "epoch": 0.3569356765057157, + "grad_norm": 1.4826322263061211, + "learning_rate": 7.449582411569822e-05, + "loss": 0.2592, + "step": 4184 + }, + { + "epoch": 0.3570209861798328, + "grad_norm": 1.2759791560010187, + "learning_rate": 7.44837794505377e-05, + "loss": 0.2797, + "step": 4185 + }, + { + "epoch": 0.3571062958539498, + "grad_norm": 1.1812247056624468, + "learning_rate": 7.447173291617072e-05, + "loss": 0.2738, + "step": 4186 + }, + { + "epoch": 0.3571916055280669, + "grad_norm": 1.435103367395486, + "learning_rate": 7.4459684513517e-05, + "loss": 0.2934, + "step": 4187 + }, + { + "epoch": 0.3572769152021839, + "grad_norm": 1.5267378051430691, + "learning_rate": 7.44476342434963e-05, + "loss": 0.2553, + "step": 4188 + }, + { + "epoch": 0.357362224876301, + "grad_norm": 1.4426187517285172, + "learning_rate": 7.443558210702868e-05, + "loss": 0.3066, + "step": 4189 + }, + { + "epoch": 0.357447534550418, + "grad_norm": 1.4840466587926515, + "learning_rate": 7.44235281050342e-05, + "loss": 0.3073, + "step": 4190 + }, + { + "epoch": 0.3575328442245351, + "grad_norm": 1.5241952429933074, + "learning_rate": 7.441147223843315e-05, + "loss": 0.3234, + "step": 4191 + }, + { + "epoch": 0.3576181538986521, + "grad_norm": 1.575554973978516, + "learning_rate": 7.439941450814591e-05, + "loss": 0.322, + "step": 4192 + }, + { + "epoch": 0.35770346357276916, + "grad_norm": 1.5873783125007348, + "learning_rate": 7.4387354915093e-05, + "loss": 0.3312, + "step": 4193 + }, + { + "epoch": 0.3577887732468862, + "grad_norm": 1.8190744143464306, + "learning_rate": 7.437529346019518e-05, + "loss": 0.3544, + "step": 4194 + }, + { + "epoch": 0.35787408292100326, + "grad_norm": 1.193727726753516, + "learning_rate": 7.436323014437322e-05, + "loss": 0.2359, + "step": 4195 + }, + { + "epoch": 0.3579593925951203, + "grad_norm": 1.409258273888283, + "learning_rate": 7.435116496854809e-05, + "loss": 0.2997, + "step": 4196 + }, + { + "epoch": 0.35804470226923735, + "grad_norm": 1.44090084322994, + "learning_rate": 7.433909793364093e-05, + "loss": 0.2593, + "step": 4197 + }, + { + "epoch": 0.35813001194335436, + "grad_norm": 1.2452632803385872, + "learning_rate": 7.432702904057299e-05, + "loss": 0.308, + "step": 4198 + }, + { + "epoch": 0.35821532161747144, + "grad_norm": 1.731010954841491, + "learning_rate": 7.431495829026563e-05, + "loss": 0.329, + "step": 4199 + }, + { + "epoch": 0.35830063129158846, + "grad_norm": 1.7393054473710938, + "learning_rate": 7.430288568364045e-05, + "loss": 0.2728, + "step": 4200 + }, + { + "epoch": 0.35838594096570553, + "grad_norm": 1.7898064278516481, + "learning_rate": 7.429081122161908e-05, + "loss": 0.3591, + "step": 4201 + }, + { + "epoch": 0.35847125063982255, + "grad_norm": 1.4840130013207395, + "learning_rate": 7.427873490512334e-05, + "loss": 0.298, + "step": 4202 + }, + { + "epoch": 0.3585565603139396, + "grad_norm": 1.563909581470781, + "learning_rate": 7.426665673507521e-05, + "loss": 0.327, + "step": 4203 + }, + { + "epoch": 0.35864186998805664, + "grad_norm": 1.4059001487421992, + "learning_rate": 7.425457671239679e-05, + "loss": 0.3011, + "step": 4204 + }, + { + "epoch": 0.3587271796621737, + "grad_norm": 1.3402321477753099, + "learning_rate": 7.424249483801033e-05, + "loss": 0.2902, + "step": 4205 + }, + { + "epoch": 0.35881248933629073, + "grad_norm": 1.5545089849222813, + "learning_rate": 7.423041111283822e-05, + "loss": 0.2936, + "step": 4206 + }, + { + "epoch": 0.3588977990104078, + "grad_norm": 1.3497369598278781, + "learning_rate": 7.421832553780299e-05, + "loss": 0.312, + "step": 4207 + }, + { + "epoch": 0.3589831086845248, + "grad_norm": 1.1563493840575407, + "learning_rate": 7.420623811382728e-05, + "loss": 0.3064, + "step": 4208 + }, + { + "epoch": 0.3590684183586419, + "grad_norm": 1.3791005805433056, + "learning_rate": 7.419414884183393e-05, + "loss": 0.2723, + "step": 4209 + }, + { + "epoch": 0.3591537280327589, + "grad_norm": 1.8005694018555456, + "learning_rate": 7.41820577227459e-05, + "loss": 0.3714, + "step": 4210 + }, + { + "epoch": 0.359239037706876, + "grad_norm": 1.663598224185657, + "learning_rate": 7.416996475748625e-05, + "loss": 0.3524, + "step": 4211 + }, + { + "epoch": 0.359324347380993, + "grad_norm": 1.4613591365995722, + "learning_rate": 7.415786994697823e-05, + "loss": 0.3246, + "step": 4212 + }, + { + "epoch": 0.3594096570551101, + "grad_norm": 1.1662786497009061, + "learning_rate": 7.414577329214522e-05, + "loss": 0.2725, + "step": 4213 + }, + { + "epoch": 0.3594949667292271, + "grad_norm": 1.29366203230182, + "learning_rate": 7.413367479391071e-05, + "loss": 0.3246, + "step": 4214 + }, + { + "epoch": 0.3595802764033441, + "grad_norm": 1.7716052299079519, + "learning_rate": 7.412157445319839e-05, + "loss": 0.3631, + "step": 4215 + }, + { + "epoch": 0.3596655860774612, + "grad_norm": 1.3172090612345997, + "learning_rate": 7.410947227093204e-05, + "loss": 0.2626, + "step": 4216 + }, + { + "epoch": 0.3597508957515782, + "grad_norm": 1.5265496963073362, + "learning_rate": 7.40973682480356e-05, + "loss": 0.3056, + "step": 4217 + }, + { + "epoch": 0.3598362054256953, + "grad_norm": 1.4675115680401103, + "learning_rate": 7.408526238543316e-05, + "loss": 0.3173, + "step": 4218 + }, + { + "epoch": 0.3599215150998123, + "grad_norm": 1.6291098341659236, + "learning_rate": 7.407315468404892e-05, + "loss": 0.3515, + "step": 4219 + }, + { + "epoch": 0.36000682477392937, + "grad_norm": 1.3142001176365272, + "learning_rate": 7.406104514480723e-05, + "loss": 0.307, + "step": 4220 + }, + { + "epoch": 0.3600921344480464, + "grad_norm": 1.3293194504624795, + "learning_rate": 7.404893376863263e-05, + "loss": 0.3062, + "step": 4221 + }, + { + "epoch": 0.36017744412216346, + "grad_norm": 1.6527274477621239, + "learning_rate": 7.40368205564497e-05, + "loss": 0.3486, + "step": 4222 + }, + { + "epoch": 0.3602627537962805, + "grad_norm": 1.4694359983230725, + "learning_rate": 7.402470550918328e-05, + "loss": 0.292, + "step": 4223 + }, + { + "epoch": 0.36034806347039755, + "grad_norm": 1.4141440183631373, + "learning_rate": 7.401258862775825e-05, + "loss": 0.3063, + "step": 4224 + }, + { + "epoch": 0.36043337314451457, + "grad_norm": 1.474940786951194, + "learning_rate": 7.400046991309968e-05, + "loss": 0.293, + "step": 4225 + }, + { + "epoch": 0.36051868281863164, + "grad_norm": 1.555093836758891, + "learning_rate": 7.398834936613277e-05, + "loss": 0.2422, + "step": 4226 + }, + { + "epoch": 0.36060399249274866, + "grad_norm": 1.8596147655121584, + "learning_rate": 7.397622698778286e-05, + "loss": 0.3099, + "step": 4227 + }, + { + "epoch": 0.36068930216686573, + "grad_norm": 1.2104587716129196, + "learning_rate": 7.396410277897543e-05, + "loss": 0.3026, + "step": 4228 + }, + { + "epoch": 0.36077461184098275, + "grad_norm": 1.8139240327046058, + "learning_rate": 7.39519767406361e-05, + "loss": 0.309, + "step": 4229 + }, + { + "epoch": 0.3608599215150998, + "grad_norm": 2.003670542873419, + "learning_rate": 7.39398488736906e-05, + "loss": 0.3189, + "step": 4230 + }, + { + "epoch": 0.36094523118921684, + "grad_norm": 1.7902239232150443, + "learning_rate": 7.392771917906489e-05, + "loss": 0.3611, + "step": 4231 + }, + { + "epoch": 0.3610305408633339, + "grad_norm": 1.4257203807640462, + "learning_rate": 7.391558765768496e-05, + "loss": 0.2955, + "step": 4232 + }, + { + "epoch": 0.36111585053745093, + "grad_norm": 1.5176082887928923, + "learning_rate": 7.390345431047702e-05, + "loss": 0.3002, + "step": 4233 + }, + { + "epoch": 0.361201160211568, + "grad_norm": 1.1945877371206675, + "learning_rate": 7.389131913836736e-05, + "loss": 0.3048, + "step": 4234 + }, + { + "epoch": 0.361286469885685, + "grad_norm": 1.3211375711438296, + "learning_rate": 7.387918214228242e-05, + "loss": 0.2615, + "step": 4235 + }, + { + "epoch": 0.3613717795598021, + "grad_norm": 1.513981035715959, + "learning_rate": 7.386704332314882e-05, + "loss": 0.2664, + "step": 4236 + }, + { + "epoch": 0.3614570892339191, + "grad_norm": 1.5980016059962905, + "learning_rate": 7.38549026818933e-05, + "loss": 0.2844, + "step": 4237 + }, + { + "epoch": 0.3615423989080362, + "grad_norm": 1.4503613515006382, + "learning_rate": 7.384276021944272e-05, + "loss": 0.2928, + "step": 4238 + }, + { + "epoch": 0.3616277085821532, + "grad_norm": 1.471855613759008, + "learning_rate": 7.38306159367241e-05, + "loss": 0.3485, + "step": 4239 + }, + { + "epoch": 0.3617130182562703, + "grad_norm": 1.205229961406717, + "learning_rate": 7.38184698346646e-05, + "loss": 0.2093, + "step": 4240 + }, + { + "epoch": 0.3617983279303873, + "grad_norm": 1.4829420702391927, + "learning_rate": 7.380632191419147e-05, + "loss": 0.3179, + "step": 4241 + }, + { + "epoch": 0.36188363760450437, + "grad_norm": 2.0362400676187655, + "learning_rate": 7.379417217623221e-05, + "loss": 0.3203, + "step": 4242 + }, + { + "epoch": 0.3619689472786214, + "grad_norm": 1.4047567280480766, + "learning_rate": 7.378202062171432e-05, + "loss": 0.2695, + "step": 4243 + }, + { + "epoch": 0.36205425695273846, + "grad_norm": 1.6149096087501598, + "learning_rate": 7.376986725156554e-05, + "loss": 0.3107, + "step": 4244 + }, + { + "epoch": 0.3621395666268555, + "grad_norm": 1.4874729955449901, + "learning_rate": 7.375771206671372e-05, + "loss": 0.2812, + "step": 4245 + }, + { + "epoch": 0.36222487630097255, + "grad_norm": 1.8303704450702114, + "learning_rate": 7.37455550680868e-05, + "loss": 0.3165, + "step": 4246 + }, + { + "epoch": 0.36231018597508957, + "grad_norm": 1.624906097046457, + "learning_rate": 7.373339625661295e-05, + "loss": 0.2678, + "step": 4247 + }, + { + "epoch": 0.36239549564920664, + "grad_norm": 1.308576350309929, + "learning_rate": 7.372123563322042e-05, + "loss": 0.303, + "step": 4248 + }, + { + "epoch": 0.36248080532332366, + "grad_norm": 1.2866710327416957, + "learning_rate": 7.37090731988376e-05, + "loss": 0.2701, + "step": 4249 + }, + { + "epoch": 0.36256611499744074, + "grad_norm": 2.1745091114533084, + "learning_rate": 7.369690895439303e-05, + "loss": 0.338, + "step": 4250 + }, + { + "epoch": 0.36265142467155775, + "grad_norm": 1.4140610668533398, + "learning_rate": 7.368474290081539e-05, + "loss": 0.3045, + "step": 4251 + }, + { + "epoch": 0.3627367343456748, + "grad_norm": 1.5209786942223085, + "learning_rate": 7.367257503903349e-05, + "loss": 0.2641, + "step": 4252 + }, + { + "epoch": 0.36282204401979185, + "grad_norm": 1.7939477047309853, + "learning_rate": 7.366040536997628e-05, + "loss": 0.3313, + "step": 4253 + }, + { + "epoch": 0.36290735369390886, + "grad_norm": 1.4808094769933504, + "learning_rate": 7.364823389457286e-05, + "loss": 0.3143, + "step": 4254 + }, + { + "epoch": 0.36299266336802594, + "grad_norm": 1.4487295636108986, + "learning_rate": 7.363606061375245e-05, + "loss": 0.2875, + "step": 4255 + }, + { + "epoch": 0.36307797304214295, + "grad_norm": 1.4336142057783612, + "learning_rate": 7.36238855284444e-05, + "loss": 0.3615, + "step": 4256 + }, + { + "epoch": 0.36316328271626, + "grad_norm": 1.3625060230087005, + "learning_rate": 7.361170863957822e-05, + "loss": 0.2954, + "step": 4257 + }, + { + "epoch": 0.36324859239037705, + "grad_norm": 1.6519839930001325, + "learning_rate": 7.359952994808358e-05, + "loss": 0.271, + "step": 4258 + }, + { + "epoch": 0.3633339020644941, + "grad_norm": 1.3618945745257973, + "learning_rate": 7.358734945489022e-05, + "loss": 0.2981, + "step": 4259 + }, + { + "epoch": 0.36341921173861114, + "grad_norm": 1.3943882289175156, + "learning_rate": 7.357516716092807e-05, + "loss": 0.316, + "step": 4260 + }, + { + "epoch": 0.3635045214127282, + "grad_norm": 1.3759598416245837, + "learning_rate": 7.35629830671272e-05, + "loss": 0.2531, + "step": 4261 + }, + { + "epoch": 0.3635898310868452, + "grad_norm": 1.4937804342704875, + "learning_rate": 7.355079717441777e-05, + "loss": 0.2667, + "step": 4262 + }, + { + "epoch": 0.3636751407609623, + "grad_norm": 1.6484140692758664, + "learning_rate": 7.353860948373015e-05, + "loss": 0.2491, + "step": 4263 + }, + { + "epoch": 0.3637604504350793, + "grad_norm": 1.3654732556568299, + "learning_rate": 7.352641999599477e-05, + "loss": 0.3196, + "step": 4264 + }, + { + "epoch": 0.3638457601091964, + "grad_norm": 1.3087475131210702, + "learning_rate": 7.351422871214223e-05, + "loss": 0.3158, + "step": 4265 + }, + { + "epoch": 0.3639310697833134, + "grad_norm": 1.42468955690604, + "learning_rate": 7.35020356331033e-05, + "loss": 0.3073, + "step": 4266 + }, + { + "epoch": 0.3640163794574305, + "grad_norm": 1.530576441015205, + "learning_rate": 7.348984075980882e-05, + "loss": 0.3116, + "step": 4267 + }, + { + "epoch": 0.3641016891315475, + "grad_norm": 1.540077920156181, + "learning_rate": 7.347764409318984e-05, + "loss": 0.2555, + "step": 4268 + }, + { + "epoch": 0.3641869988056646, + "grad_norm": 1.2610619314596752, + "learning_rate": 7.346544563417747e-05, + "loss": 0.3294, + "step": 4269 + }, + { + "epoch": 0.3642723084797816, + "grad_norm": 1.9141447088531902, + "learning_rate": 7.345324538370304e-05, + "loss": 0.3334, + "step": 4270 + }, + { + "epoch": 0.36435761815389867, + "grad_norm": 1.3899551235474363, + "learning_rate": 7.344104334269795e-05, + "loss": 0.2825, + "step": 4271 + }, + { + "epoch": 0.3644429278280157, + "grad_norm": 1.7019880507035143, + "learning_rate": 7.342883951209377e-05, + "loss": 0.2592, + "step": 4272 + }, + { + "epoch": 0.36452823750213276, + "grad_norm": 1.5462336125658036, + "learning_rate": 7.341663389282219e-05, + "loss": 0.3519, + "step": 4273 + }, + { + "epoch": 0.3646135471762498, + "grad_norm": 1.594169262360319, + "learning_rate": 7.340442648581505e-05, + "loss": 0.327, + "step": 4274 + }, + { + "epoch": 0.36469885685036685, + "grad_norm": 1.5767942048495174, + "learning_rate": 7.339221729200431e-05, + "loss": 0.3135, + "step": 4275 + }, + { + "epoch": 0.36478416652448387, + "grad_norm": 1.3762780232222667, + "learning_rate": 7.338000631232212e-05, + "loss": 0.3148, + "step": 4276 + }, + { + "epoch": 0.36486947619860094, + "grad_norm": 1.464915281456608, + "learning_rate": 7.336779354770066e-05, + "loss": 0.2507, + "step": 4277 + }, + { + "epoch": 0.36495478587271796, + "grad_norm": 1.3172839942783907, + "learning_rate": 7.335557899907232e-05, + "loss": 0.3203, + "step": 4278 + }, + { + "epoch": 0.36504009554683503, + "grad_norm": 1.4653195028204837, + "learning_rate": 7.334336266736968e-05, + "loss": 0.2962, + "step": 4279 + }, + { + "epoch": 0.36512540522095205, + "grad_norm": 1.367423075407528, + "learning_rate": 7.333114455352532e-05, + "loss": 0.3354, + "step": 4280 + }, + { + "epoch": 0.3652107148950691, + "grad_norm": 1.5490485439783876, + "learning_rate": 7.331892465847206e-05, + "loss": 0.3368, + "step": 4281 + }, + { + "epoch": 0.36529602456918614, + "grad_norm": 1.3289569717227228, + "learning_rate": 7.330670298314281e-05, + "loss": 0.2604, + "step": 4282 + }, + { + "epoch": 0.3653813342433032, + "grad_norm": 1.5952362909150917, + "learning_rate": 7.329447952847064e-05, + "loss": 0.2984, + "step": 4283 + }, + { + "epoch": 0.36546664391742023, + "grad_norm": 1.3088995277366045, + "learning_rate": 7.328225429538875e-05, + "loss": 0.3066, + "step": 4284 + }, + { + "epoch": 0.3655519535915373, + "grad_norm": 1.3614219512787074, + "learning_rate": 7.327002728483048e-05, + "loss": 0.3295, + "step": 4285 + }, + { + "epoch": 0.3656372632656543, + "grad_norm": 1.4281239115170394, + "learning_rate": 7.325779849772928e-05, + "loss": 0.2683, + "step": 4286 + }, + { + "epoch": 0.3657225729397714, + "grad_norm": 1.6948753531656995, + "learning_rate": 7.324556793501875e-05, + "loss": 0.3558, + "step": 4287 + }, + { + "epoch": 0.3658078826138884, + "grad_norm": 1.5210414726627473, + "learning_rate": 7.323333559763263e-05, + "loss": 0.3502, + "step": 4288 + }, + { + "epoch": 0.3658931922880055, + "grad_norm": 1.4985080292683581, + "learning_rate": 7.32211014865048e-05, + "loss": 0.2852, + "step": 4289 + }, + { + "epoch": 0.3659785019621225, + "grad_norm": 1.3202008533110372, + "learning_rate": 7.320886560256927e-05, + "loss": 0.2652, + "step": 4290 + }, + { + "epoch": 0.3660638116362395, + "grad_norm": 1.7107409359966237, + "learning_rate": 7.319662794676018e-05, + "loss": 0.3539, + "step": 4291 + }, + { + "epoch": 0.3661491213103566, + "grad_norm": 1.2947343371162618, + "learning_rate": 7.318438852001181e-05, + "loss": 0.2999, + "step": 4292 + }, + { + "epoch": 0.3662344309844736, + "grad_norm": 1.6847891049039134, + "learning_rate": 7.317214732325858e-05, + "loss": 0.2737, + "step": 4293 + }, + { + "epoch": 0.3663197406585907, + "grad_norm": 1.5465773479113516, + "learning_rate": 7.315990435743504e-05, + "loss": 0.2986, + "step": 4294 + }, + { + "epoch": 0.3664050503327077, + "grad_norm": 1.5373766035478993, + "learning_rate": 7.31476596234759e-05, + "loss": 0.3203, + "step": 4295 + }, + { + "epoch": 0.3664903600068248, + "grad_norm": 1.1775635046548942, + "learning_rate": 7.313541312231592e-05, + "loss": 0.2476, + "step": 4296 + }, + { + "epoch": 0.3665756696809418, + "grad_norm": 1.4924221952723868, + "learning_rate": 7.31231648548901e-05, + "loss": 0.2988, + "step": 4297 + }, + { + "epoch": 0.36666097935505887, + "grad_norm": 1.460340571722467, + "learning_rate": 7.311091482213353e-05, + "loss": 0.3042, + "step": 4298 + }, + { + "epoch": 0.3667462890291759, + "grad_norm": 1.5370674176045749, + "learning_rate": 7.30986630249814e-05, + "loss": 0.2681, + "step": 4299 + }, + { + "epoch": 0.36683159870329296, + "grad_norm": 1.501679592747295, + "learning_rate": 7.308640946436912e-05, + "loss": 0.3395, + "step": 4300 + }, + { + "epoch": 0.36691690837741, + "grad_norm": 1.347698222073416, + "learning_rate": 7.307415414123215e-05, + "loss": 0.247, + "step": 4301 + }, + { + "epoch": 0.36700221805152705, + "grad_norm": 1.1703509784072004, + "learning_rate": 7.306189705650613e-05, + "loss": 0.2841, + "step": 4302 + }, + { + "epoch": 0.36708752772564407, + "grad_norm": 1.3174073349216433, + "learning_rate": 7.304963821112681e-05, + "loss": 0.264, + "step": 4303 + }, + { + "epoch": 0.36717283739976114, + "grad_norm": 1.493185139239735, + "learning_rate": 7.30373776060301e-05, + "loss": 0.2958, + "step": 4304 + }, + { + "epoch": 0.36725814707387816, + "grad_norm": 1.3482652697852324, + "learning_rate": 7.302511524215203e-05, + "loss": 0.3237, + "step": 4305 + }, + { + "epoch": 0.36734345674799523, + "grad_norm": 1.220414712022384, + "learning_rate": 7.301285112042879e-05, + "loss": 0.265, + "step": 4306 + }, + { + "epoch": 0.36742876642211225, + "grad_norm": 1.6587330181572024, + "learning_rate": 7.300058524179662e-05, + "loss": 0.2716, + "step": 4307 + }, + { + "epoch": 0.3675140760962293, + "grad_norm": 1.6527367523544116, + "learning_rate": 7.298831760719202e-05, + "loss": 0.3201, + "step": 4308 + }, + { + "epoch": 0.36759938577034634, + "grad_norm": 1.5080392301399568, + "learning_rate": 7.297604821755153e-05, + "loss": 0.298, + "step": 4309 + }, + { + "epoch": 0.3676846954444634, + "grad_norm": 1.3067331324105926, + "learning_rate": 7.296377707381183e-05, + "loss": 0.2569, + "step": 4310 + }, + { + "epoch": 0.36777000511858043, + "grad_norm": 1.5563241596247046, + "learning_rate": 7.29515041769098e-05, + "loss": 0.3418, + "step": 4311 + }, + { + "epoch": 0.3678553147926975, + "grad_norm": 1.714249840429153, + "learning_rate": 7.293922952778239e-05, + "loss": 0.3104, + "step": 4312 + }, + { + "epoch": 0.3679406244668145, + "grad_norm": 1.5928775418873533, + "learning_rate": 7.292695312736668e-05, + "loss": 0.318, + "step": 4313 + }, + { + "epoch": 0.3680259341409316, + "grad_norm": 1.441016664036712, + "learning_rate": 7.291467497659996e-05, + "loss": 0.3301, + "step": 4314 + }, + { + "epoch": 0.3681112438150486, + "grad_norm": 1.6840717672608718, + "learning_rate": 7.290239507641956e-05, + "loss": 0.3281, + "step": 4315 + }, + { + "epoch": 0.3681965534891657, + "grad_norm": 1.380684200760041, + "learning_rate": 7.2890113427763e-05, + "loss": 0.2874, + "step": 4316 + }, + { + "epoch": 0.3682818631632827, + "grad_norm": 1.4871067809635161, + "learning_rate": 7.287783003156793e-05, + "loss": 0.2729, + "step": 4317 + }, + { + "epoch": 0.3683671728373998, + "grad_norm": 1.274898042529129, + "learning_rate": 7.286554488877211e-05, + "loss": 0.2891, + "step": 4318 + }, + { + "epoch": 0.3684524825115168, + "grad_norm": 1.6390937562186016, + "learning_rate": 7.285325800031343e-05, + "loss": 0.3554, + "step": 4319 + }, + { + "epoch": 0.3685377921856339, + "grad_norm": 1.5170547191530506, + "learning_rate": 7.284096936712994e-05, + "loss": 0.2619, + "step": 4320 + }, + { + "epoch": 0.3686231018597509, + "grad_norm": 1.5827702056265052, + "learning_rate": 7.282867899015983e-05, + "loss": 0.3088, + "step": 4321 + }, + { + "epoch": 0.36870841153386796, + "grad_norm": 1.4567260422524593, + "learning_rate": 7.281638687034139e-05, + "loss": 0.2823, + "step": 4322 + }, + { + "epoch": 0.368793721207985, + "grad_norm": 1.618799336981349, + "learning_rate": 7.280409300861305e-05, + "loss": 0.3355, + "step": 4323 + }, + { + "epoch": 0.36887903088210205, + "grad_norm": 1.548574958497118, + "learning_rate": 7.27917974059134e-05, + "loss": 0.3537, + "step": 4324 + }, + { + "epoch": 0.3689643405562191, + "grad_norm": 1.6706123932128372, + "learning_rate": 7.277950006318113e-05, + "loss": 0.353, + "step": 4325 + }, + { + "epoch": 0.36904965023033615, + "grad_norm": 1.575077551491637, + "learning_rate": 7.276720098135507e-05, + "loss": 0.3554, + "step": 4326 + }, + { + "epoch": 0.36913495990445316, + "grad_norm": 1.8081195778209982, + "learning_rate": 7.275490016137421e-05, + "loss": 0.3319, + "step": 4327 + }, + { + "epoch": 0.36922026957857024, + "grad_norm": 1.3679158477749767, + "learning_rate": 7.274259760417764e-05, + "loss": 0.3083, + "step": 4328 + }, + { + "epoch": 0.36930557925268725, + "grad_norm": 1.2424853465638062, + "learning_rate": 7.27302933107046e-05, + "loss": 0.2729, + "step": 4329 + }, + { + "epoch": 0.3693908889268043, + "grad_norm": 1.291742486676809, + "learning_rate": 7.271798728189445e-05, + "loss": 0.3157, + "step": 4330 + }, + { + "epoch": 0.36947619860092135, + "grad_norm": 1.3837875118529597, + "learning_rate": 7.27056795186867e-05, + "loss": 0.3027, + "step": 4331 + }, + { + "epoch": 0.36956150827503836, + "grad_norm": 1.2864427243828913, + "learning_rate": 7.269337002202096e-05, + "loss": 0.2415, + "step": 4332 + }, + { + "epoch": 0.36964681794915544, + "grad_norm": 1.390045302359493, + "learning_rate": 7.268105879283703e-05, + "loss": 0.3093, + "step": 4333 + }, + { + "epoch": 0.36973212762327246, + "grad_norm": 1.431386386419961, + "learning_rate": 7.266874583207479e-05, + "loss": 0.2891, + "step": 4334 + }, + { + "epoch": 0.36981743729738953, + "grad_norm": 1.3996492389236859, + "learning_rate": 7.265643114067426e-05, + "loss": 0.2714, + "step": 4335 + }, + { + "epoch": 0.36990274697150655, + "grad_norm": 1.4052561744970984, + "learning_rate": 7.26441147195756e-05, + "loss": 0.2713, + "step": 4336 + }, + { + "epoch": 0.3699880566456236, + "grad_norm": 1.849314552342862, + "learning_rate": 7.263179656971911e-05, + "loss": 0.3292, + "step": 4337 + }, + { + "epoch": 0.37007336631974064, + "grad_norm": 1.3831832302572808, + "learning_rate": 7.261947669204523e-05, + "loss": 0.2672, + "step": 4338 + }, + { + "epoch": 0.3701586759938577, + "grad_norm": 1.7834823406511973, + "learning_rate": 7.26071550874945e-05, + "loss": 0.3195, + "step": 4339 + }, + { + "epoch": 0.37024398566797473, + "grad_norm": 1.3294427953595531, + "learning_rate": 7.259483175700762e-05, + "loss": 0.3001, + "step": 4340 + }, + { + "epoch": 0.3703292953420918, + "grad_norm": 1.6326778666645747, + "learning_rate": 7.25825067015254e-05, + "loss": 0.3172, + "step": 4341 + }, + { + "epoch": 0.3704146050162088, + "grad_norm": 1.7688233431110416, + "learning_rate": 7.257017992198878e-05, + "loss": 0.3069, + "step": 4342 + }, + { + "epoch": 0.3704999146903259, + "grad_norm": 1.4269881483573605, + "learning_rate": 7.255785141933888e-05, + "loss": 0.2724, + "step": 4343 + }, + { + "epoch": 0.3705852243644429, + "grad_norm": 1.3166053236582393, + "learning_rate": 7.25455211945169e-05, + "loss": 0.3606, + "step": 4344 + }, + { + "epoch": 0.37067053403856, + "grad_norm": 1.5364867989009934, + "learning_rate": 7.253318924846417e-05, + "loss": 0.3846, + "step": 4345 + }, + { + "epoch": 0.370755843712677, + "grad_norm": 1.4492346256342792, + "learning_rate": 7.252085558212219e-05, + "loss": 0.2961, + "step": 4346 + }, + { + "epoch": 0.3708411533867941, + "grad_norm": 1.5382817800665272, + "learning_rate": 7.250852019643256e-05, + "loss": 0.2768, + "step": 4347 + }, + { + "epoch": 0.3709264630609111, + "grad_norm": 1.6097036970445997, + "learning_rate": 7.249618309233702e-05, + "loss": 0.2351, + "step": 4348 + }, + { + "epoch": 0.37101177273502817, + "grad_norm": 1.583355836541061, + "learning_rate": 7.248384427077745e-05, + "loss": 0.2932, + "step": 4349 + }, + { + "epoch": 0.3710970824091452, + "grad_norm": 1.5387907611273086, + "learning_rate": 7.247150373269586e-05, + "loss": 0.2691, + "step": 4350 + }, + { + "epoch": 0.37118239208326226, + "grad_norm": 1.4104370513900426, + "learning_rate": 7.245916147903436e-05, + "loss": 0.3084, + "step": 4351 + }, + { + "epoch": 0.3712677017573793, + "grad_norm": 1.280952698244463, + "learning_rate": 7.244681751073522e-05, + "loss": 0.2908, + "step": 4352 + }, + { + "epoch": 0.37135301143149635, + "grad_norm": 1.5568179767813544, + "learning_rate": 7.243447182874086e-05, + "loss": 0.3449, + "step": 4353 + }, + { + "epoch": 0.37143832110561337, + "grad_norm": 1.506800969448036, + "learning_rate": 7.242212443399378e-05, + "loss": 0.3706, + "step": 4354 + }, + { + "epoch": 0.37152363077973044, + "grad_norm": 1.6331521005057155, + "learning_rate": 7.240977532743667e-05, + "loss": 0.3032, + "step": 4355 + }, + { + "epoch": 0.37160894045384746, + "grad_norm": 1.5960562232447135, + "learning_rate": 7.239742451001228e-05, + "loss": 0.3265, + "step": 4356 + }, + { + "epoch": 0.37169425012796453, + "grad_norm": 1.4463413185297167, + "learning_rate": 7.238507198266356e-05, + "loss": 0.3393, + "step": 4357 + }, + { + "epoch": 0.37177955980208155, + "grad_norm": 1.6653974309629103, + "learning_rate": 7.237271774633354e-05, + "loss": 0.3093, + "step": 4358 + }, + { + "epoch": 0.3718648694761986, + "grad_norm": 1.1845964270069467, + "learning_rate": 7.236036180196541e-05, + "loss": 0.2531, + "step": 4359 + }, + { + "epoch": 0.37195017915031564, + "grad_norm": 1.39197097961276, + "learning_rate": 7.234800415050248e-05, + "loss": 0.3113, + "step": 4360 + }, + { + "epoch": 0.3720354888244327, + "grad_norm": 1.83333107196784, + "learning_rate": 7.233564479288821e-05, + "loss": 0.3093, + "step": 4361 + }, + { + "epoch": 0.37212079849854973, + "grad_norm": 1.3568999460282598, + "learning_rate": 7.232328373006613e-05, + "loss": 0.2241, + "step": 4362 + }, + { + "epoch": 0.3722061081726668, + "grad_norm": 1.4443224841168991, + "learning_rate": 7.231092096297995e-05, + "loss": 0.3187, + "step": 4363 + }, + { + "epoch": 0.3722914178467838, + "grad_norm": 1.217793113470004, + "learning_rate": 7.229855649257354e-05, + "loss": 0.2313, + "step": 4364 + }, + { + "epoch": 0.3723767275209009, + "grad_norm": 1.5464434551578699, + "learning_rate": 7.228619031979083e-05, + "loss": 0.3306, + "step": 4365 + }, + { + "epoch": 0.3724620371950179, + "grad_norm": 1.6947881353521652, + "learning_rate": 7.227382244557591e-05, + "loss": 0.3548, + "step": 4366 + }, + { + "epoch": 0.372547346869135, + "grad_norm": 1.3389232799040407, + "learning_rate": 7.226145287087302e-05, + "loss": 0.3083, + "step": 4367 + }, + { + "epoch": 0.372632656543252, + "grad_norm": 1.514016310358315, + "learning_rate": 7.224908159662649e-05, + "loss": 0.2558, + "step": 4368 + }, + { + "epoch": 0.372717966217369, + "grad_norm": 1.2655962717068876, + "learning_rate": 7.223670862378082e-05, + "loss": 0.2683, + "step": 4369 + }, + { + "epoch": 0.3728032758914861, + "grad_norm": 1.5703997991028324, + "learning_rate": 7.222433395328062e-05, + "loss": 0.2754, + "step": 4370 + }, + { + "epoch": 0.3728885855656031, + "grad_norm": 1.4558218265130378, + "learning_rate": 7.22119575860706e-05, + "loss": 0.2653, + "step": 4371 + }, + { + "epoch": 0.3729738952397202, + "grad_norm": 1.875604277829665, + "learning_rate": 7.219957952309567e-05, + "loss": 0.2789, + "step": 4372 + }, + { + "epoch": 0.3730592049138372, + "grad_norm": 1.786855477396208, + "learning_rate": 7.21871997653008e-05, + "loss": 0.2931, + "step": 4373 + }, + { + "epoch": 0.3731445145879543, + "grad_norm": 1.5101406327499347, + "learning_rate": 7.217481831363115e-05, + "loss": 0.316, + "step": 4374 + }, + { + "epoch": 0.3732298242620713, + "grad_norm": 1.6494376235643289, + "learning_rate": 7.216243516903194e-05, + "loss": 0.2911, + "step": 4375 + }, + { + "epoch": 0.37331513393618837, + "grad_norm": 1.7378892778085713, + "learning_rate": 7.215005033244857e-05, + "loss": 0.3172, + "step": 4376 + }, + { + "epoch": 0.3734004436103054, + "grad_norm": 1.988786254782528, + "learning_rate": 7.213766380482658e-05, + "loss": 0.259, + "step": 4377 + }, + { + "epoch": 0.37348575328442246, + "grad_norm": 1.8848063568299482, + "learning_rate": 7.21252755871116e-05, + "loss": 0.3313, + "step": 4378 + }, + { + "epoch": 0.3735710629585395, + "grad_norm": 1.770226954441182, + "learning_rate": 7.211288568024937e-05, + "loss": 0.3156, + "step": 4379 + }, + { + "epoch": 0.37365637263265655, + "grad_norm": 1.6639303951927757, + "learning_rate": 7.210049408518587e-05, + "loss": 0.3007, + "step": 4380 + }, + { + "epoch": 0.37374168230677357, + "grad_norm": 1.6907247178994895, + "learning_rate": 7.208810080286706e-05, + "loss": 0.3137, + "step": 4381 + }, + { + "epoch": 0.37382699198089064, + "grad_norm": 1.7459211498865272, + "learning_rate": 7.207570583423915e-05, + "loss": 0.3002, + "step": 4382 + }, + { + "epoch": 0.37391230165500766, + "grad_norm": 1.3160494991663427, + "learning_rate": 7.20633091802484e-05, + "loss": 0.2817, + "step": 4383 + }, + { + "epoch": 0.37399761132912474, + "grad_norm": 1.7150094684306092, + "learning_rate": 7.205091084184122e-05, + "loss": 0.3077, + "step": 4384 + }, + { + "epoch": 0.37408292100324175, + "grad_norm": 1.7399993292489349, + "learning_rate": 7.20385108199642e-05, + "loss": 0.3453, + "step": 4385 + }, + { + "epoch": 0.3741682306773588, + "grad_norm": 1.5081557441546642, + "learning_rate": 7.202610911556399e-05, + "loss": 0.3255, + "step": 4386 + }, + { + "epoch": 0.37425354035147584, + "grad_norm": 1.5607486064512994, + "learning_rate": 7.201370572958738e-05, + "loss": 0.3124, + "step": 4387 + }, + { + "epoch": 0.3743388500255929, + "grad_norm": 1.5149100278822254, + "learning_rate": 7.200130066298133e-05, + "loss": 0.3131, + "step": 4388 + }, + { + "epoch": 0.37442415969970994, + "grad_norm": 1.5569536571625238, + "learning_rate": 7.198889391669288e-05, + "loss": 0.3046, + "step": 4389 + }, + { + "epoch": 0.374509469373827, + "grad_norm": 1.2370297337954357, + "learning_rate": 7.197648549166924e-05, + "loss": 0.2277, + "step": 4390 + }, + { + "epoch": 0.374594779047944, + "grad_norm": 1.537432276777182, + "learning_rate": 7.19640753888577e-05, + "loss": 0.313, + "step": 4391 + }, + { + "epoch": 0.3746800887220611, + "grad_norm": 1.5450121519234288, + "learning_rate": 7.195166360920574e-05, + "loss": 0.2979, + "step": 4392 + }, + { + "epoch": 0.3747653983961781, + "grad_norm": 1.3605759675877236, + "learning_rate": 7.19392501536609e-05, + "loss": 0.2822, + "step": 4393 + }, + { + "epoch": 0.3748507080702952, + "grad_norm": 1.5706050467447592, + "learning_rate": 7.19268350231709e-05, + "loss": 0.3367, + "step": 4394 + }, + { + "epoch": 0.3749360177444122, + "grad_norm": 1.5403130577242727, + "learning_rate": 7.191441821868358e-05, + "loss": 0.2842, + "step": 4395 + }, + { + "epoch": 0.3750213274185293, + "grad_norm": 1.540489967727062, + "learning_rate": 7.190199974114686e-05, + "loss": 0.3651, + "step": 4396 + }, + { + "epoch": 0.3751066370926463, + "grad_norm": 1.4855013137960515, + "learning_rate": 7.188957959150886e-05, + "loss": 0.4192, + "step": 4397 + }, + { + "epoch": 0.3751919467667634, + "grad_norm": 1.3459384308318671, + "learning_rate": 7.187715777071777e-05, + "loss": 0.2313, + "step": 4398 + }, + { + "epoch": 0.3752772564408804, + "grad_norm": 1.7325026569648836, + "learning_rate": 7.186473427972195e-05, + "loss": 0.384, + "step": 4399 + }, + { + "epoch": 0.37536256611499746, + "grad_norm": 1.4726878195700241, + "learning_rate": 7.185230911946984e-05, + "loss": 0.3774, + "step": 4400 + }, + { + "epoch": 0.3754478757891145, + "grad_norm": 1.5450810520429672, + "learning_rate": 7.183988229091006e-05, + "loss": 0.2972, + "step": 4401 + }, + { + "epoch": 0.37553318546323156, + "grad_norm": 1.7396280279147585, + "learning_rate": 7.182745379499135e-05, + "loss": 0.3225, + "step": 4402 + }, + { + "epoch": 0.3756184951373486, + "grad_norm": 1.157491790379851, + "learning_rate": 7.181502363266251e-05, + "loss": 0.306, + "step": 4403 + }, + { + "epoch": 0.37570380481146565, + "grad_norm": 1.5327775303762587, + "learning_rate": 7.180259180487254e-05, + "loss": 0.3475, + "step": 4404 + }, + { + "epoch": 0.37578911448558266, + "grad_norm": 1.5953728883916565, + "learning_rate": 7.179015831257053e-05, + "loss": 0.3538, + "step": 4405 + }, + { + "epoch": 0.3758744241596997, + "grad_norm": 1.3971353787448542, + "learning_rate": 7.177772315670574e-05, + "loss": 0.3102, + "step": 4406 + }, + { + "epoch": 0.37595973383381676, + "grad_norm": 1.1452064995938662, + "learning_rate": 7.176528633822752e-05, + "loss": 0.253, + "step": 4407 + }, + { + "epoch": 0.3760450435079338, + "grad_norm": 1.2610253002130467, + "learning_rate": 7.175284785808534e-05, + "loss": 0.2646, + "step": 4408 + }, + { + "epoch": 0.37613035318205085, + "grad_norm": 1.703438826178883, + "learning_rate": 7.174040771722884e-05, + "loss": 0.3087, + "step": 4409 + }, + { + "epoch": 0.37621566285616787, + "grad_norm": 1.2967114632688255, + "learning_rate": 7.172796591660771e-05, + "loss": 0.2569, + "step": 4410 + }, + { + "epoch": 0.37630097253028494, + "grad_norm": 1.3635562739550307, + "learning_rate": 7.171552245717186e-05, + "loss": 0.324, + "step": 4411 + }, + { + "epoch": 0.37638628220440196, + "grad_norm": 1.4931130459779185, + "learning_rate": 7.170307733987127e-05, + "loss": 0.3178, + "step": 4412 + }, + { + "epoch": 0.37647159187851903, + "grad_norm": 1.27530631331439, + "learning_rate": 7.169063056565605e-05, + "loss": 0.289, + "step": 4413 + }, + { + "epoch": 0.37655690155263605, + "grad_norm": 1.5193347951009653, + "learning_rate": 7.167818213547646e-05, + "loss": 0.3137, + "step": 4414 + }, + { + "epoch": 0.3766422112267531, + "grad_norm": 1.5425241336758437, + "learning_rate": 7.166573205028285e-05, + "loss": 0.3323, + "step": 4415 + }, + { + "epoch": 0.37672752090087014, + "grad_norm": 1.5152090229333743, + "learning_rate": 7.165328031102572e-05, + "loss": 0.2983, + "step": 4416 + }, + { + "epoch": 0.3768128305749872, + "grad_norm": 1.228421594268837, + "learning_rate": 7.164082691865573e-05, + "loss": 0.3226, + "step": 4417 + }, + { + "epoch": 0.37689814024910423, + "grad_norm": 1.69459166781825, + "learning_rate": 7.162837187412356e-05, + "loss": 0.3414, + "step": 4418 + }, + { + "epoch": 0.3769834499232213, + "grad_norm": 1.6350273916300655, + "learning_rate": 7.161591517838018e-05, + "loss": 0.2798, + "step": 4419 + }, + { + "epoch": 0.3770687595973383, + "grad_norm": 1.5543809137971947, + "learning_rate": 7.160345683237652e-05, + "loss": 0.3092, + "step": 4420 + }, + { + "epoch": 0.3771540692714554, + "grad_norm": 1.39827586683754, + "learning_rate": 7.159099683706372e-05, + "loss": 0.3071, + "step": 4421 + }, + { + "epoch": 0.3772393789455724, + "grad_norm": 1.34530425562088, + "learning_rate": 7.157853519339306e-05, + "loss": 0.2711, + "step": 4422 + }, + { + "epoch": 0.3773246886196895, + "grad_norm": 1.864786176108398, + "learning_rate": 7.156607190231591e-05, + "loss": 0.2774, + "step": 4423 + }, + { + "epoch": 0.3774099982938065, + "grad_norm": 1.5696979287993604, + "learning_rate": 7.155360696478377e-05, + "loss": 0.3174, + "step": 4424 + }, + { + "epoch": 0.3774953079679236, + "grad_norm": 1.468408504312652, + "learning_rate": 7.154114038174828e-05, + "loss": 0.284, + "step": 4425 + }, + { + "epoch": 0.3775806176420406, + "grad_norm": 1.3508476174831083, + "learning_rate": 7.152867215416117e-05, + "loss": 0.2679, + "step": 4426 + }, + { + "epoch": 0.37766592731615767, + "grad_norm": 1.6541776828094772, + "learning_rate": 7.151620228297436e-05, + "loss": 0.3288, + "step": 4427 + }, + { + "epoch": 0.3777512369902747, + "grad_norm": 1.3435179931491983, + "learning_rate": 7.150373076913986e-05, + "loss": 0.2736, + "step": 4428 + }, + { + "epoch": 0.37783654666439176, + "grad_norm": 1.251651769304745, + "learning_rate": 7.149125761360975e-05, + "loss": 0.2562, + "step": 4429 + }, + { + "epoch": 0.3779218563385088, + "grad_norm": 1.6051948852107758, + "learning_rate": 7.147878281733634e-05, + "loss": 0.3129, + "step": 4430 + }, + { + "epoch": 0.37800716601262585, + "grad_norm": 1.461095954339367, + "learning_rate": 7.1466306381272e-05, + "loss": 0.3036, + "step": 4431 + }, + { + "epoch": 0.37809247568674287, + "grad_norm": 1.481242196750609, + "learning_rate": 7.145382830636924e-05, + "loss": 0.277, + "step": 4432 + }, + { + "epoch": 0.37817778536085994, + "grad_norm": 1.44148109469155, + "learning_rate": 7.14413485935807e-05, + "loss": 0.3121, + "step": 4433 + }, + { + "epoch": 0.37826309503497696, + "grad_norm": 1.5877467767138853, + "learning_rate": 7.142886724385913e-05, + "loss": 0.3039, + "step": 4434 + }, + { + "epoch": 0.37834840470909403, + "grad_norm": 1.3224369991706648, + "learning_rate": 7.141638425815743e-05, + "loss": 0.2214, + "step": 4435 + }, + { + "epoch": 0.37843371438321105, + "grad_norm": 1.5261194341675064, + "learning_rate": 7.140389963742859e-05, + "loss": 0.3006, + "step": 4436 + }, + { + "epoch": 0.3785190240573281, + "grad_norm": 1.4375679995168449, + "learning_rate": 7.139141338262573e-05, + "loss": 0.2783, + "step": 4437 + }, + { + "epoch": 0.37860433373144514, + "grad_norm": 1.1509361851664233, + "learning_rate": 7.137892549470218e-05, + "loss": 0.277, + "step": 4438 + }, + { + "epoch": 0.3786896434055622, + "grad_norm": 1.646106681175478, + "learning_rate": 7.136643597461124e-05, + "loss": 0.2612, + "step": 4439 + }, + { + "epoch": 0.37877495307967923, + "grad_norm": 1.6403031533037298, + "learning_rate": 7.135394482330646e-05, + "loss": 0.3087, + "step": 4440 + }, + { + "epoch": 0.3788602627537963, + "grad_norm": 1.5705839463381501, + "learning_rate": 7.134145204174148e-05, + "loss": 0.264, + "step": 4441 + }, + { + "epoch": 0.3789455724279133, + "grad_norm": 1.5962531683762666, + "learning_rate": 7.132895763087002e-05, + "loss": 0.2907, + "step": 4442 + }, + { + "epoch": 0.3790308821020304, + "grad_norm": 1.687729360746664, + "learning_rate": 7.1316461591646e-05, + "loss": 0.3036, + "step": 4443 + }, + { + "epoch": 0.3791161917761474, + "grad_norm": 1.4084835540439686, + "learning_rate": 7.130396392502342e-05, + "loss": 0.3232, + "step": 4444 + }, + { + "epoch": 0.37920150145026443, + "grad_norm": 1.3415673738467055, + "learning_rate": 7.129146463195641e-05, + "loss": 0.2935, + "step": 4445 + }, + { + "epoch": 0.3792868111243815, + "grad_norm": 1.4353790392693846, + "learning_rate": 7.127896371339921e-05, + "loss": 0.2689, + "step": 4446 + }, + { + "epoch": 0.3793721207984985, + "grad_norm": 1.350416193491896, + "learning_rate": 7.126646117030619e-05, + "loss": 0.2914, + "step": 4447 + }, + { + "epoch": 0.3794574304726156, + "grad_norm": 1.4725265648199157, + "learning_rate": 7.125395700363189e-05, + "loss": 0.3062, + "step": 4448 + }, + { + "epoch": 0.3795427401467326, + "grad_norm": 1.329915892444083, + "learning_rate": 7.124145121433092e-05, + "loss": 0.2889, + "step": 4449 + }, + { + "epoch": 0.3796280498208497, + "grad_norm": 1.6152672170416078, + "learning_rate": 7.122894380335799e-05, + "loss": 0.2977, + "step": 4450 + }, + { + "epoch": 0.3797133594949667, + "grad_norm": 1.5836934717412687, + "learning_rate": 7.121643477166805e-05, + "loss": 0.2538, + "step": 4451 + }, + { + "epoch": 0.3797986691690838, + "grad_norm": 1.7819698953665164, + "learning_rate": 7.120392412021605e-05, + "loss": 0.3111, + "step": 4452 + }, + { + "epoch": 0.3798839788432008, + "grad_norm": 1.949951767936282, + "learning_rate": 7.11914118499571e-05, + "loss": 0.3208, + "step": 4453 + }, + { + "epoch": 0.37996928851731787, + "grad_norm": 1.32922452404095, + "learning_rate": 7.11788979618465e-05, + "loss": 0.34, + "step": 4454 + }, + { + "epoch": 0.3800545981914349, + "grad_norm": 1.3900845795565544, + "learning_rate": 7.116638245683957e-05, + "loss": 0.2877, + "step": 4455 + }, + { + "epoch": 0.38013990786555196, + "grad_norm": 1.8832624222338015, + "learning_rate": 7.115386533589181e-05, + "loss": 0.3596, + "step": 4456 + }, + { + "epoch": 0.380225217539669, + "grad_norm": 1.335914031597822, + "learning_rate": 7.114134659995886e-05, + "loss": 0.2688, + "step": 4457 + }, + { + "epoch": 0.38031052721378605, + "grad_norm": 1.3490306977214326, + "learning_rate": 7.112882624999641e-05, + "loss": 0.2631, + "step": 4458 + }, + { + "epoch": 0.38039583688790307, + "grad_norm": 1.4811324189594515, + "learning_rate": 7.111630428696039e-05, + "loss": 0.3214, + "step": 4459 + }, + { + "epoch": 0.38048114656202014, + "grad_norm": 1.4295243967542237, + "learning_rate": 7.110378071180674e-05, + "loss": 0.2306, + "step": 4460 + }, + { + "epoch": 0.38056645623613716, + "grad_norm": 1.5315168498547969, + "learning_rate": 7.109125552549157e-05, + "loss": 0.3155, + "step": 4461 + }, + { + "epoch": 0.38065176591025424, + "grad_norm": 1.4643071933991942, + "learning_rate": 7.107872872897113e-05, + "loss": 0.3178, + "step": 4462 + }, + { + "epoch": 0.38073707558437125, + "grad_norm": 1.28925446901551, + "learning_rate": 7.106620032320174e-05, + "loss": 0.2684, + "step": 4463 + }, + { + "epoch": 0.3808223852584883, + "grad_norm": 1.2889682619731742, + "learning_rate": 7.105367030913993e-05, + "loss": 0.2966, + "step": 4464 + }, + { + "epoch": 0.38090769493260535, + "grad_norm": 1.3996739808315657, + "learning_rate": 7.104113868774225e-05, + "loss": 0.2486, + "step": 4465 + }, + { + "epoch": 0.3809930046067224, + "grad_norm": 1.5988222078744647, + "learning_rate": 7.102860545996546e-05, + "loss": 0.1741, + "step": 4466 + }, + { + "epoch": 0.38107831428083944, + "grad_norm": 1.4579466534183845, + "learning_rate": 7.101607062676638e-05, + "loss": 0.3599, + "step": 4467 + }, + { + "epoch": 0.3811636239549565, + "grad_norm": 1.386083064987905, + "learning_rate": 7.1003534189102e-05, + "loss": 0.2516, + "step": 4468 + }, + { + "epoch": 0.3812489336290735, + "grad_norm": 1.5083421162442658, + "learning_rate": 7.099099614792937e-05, + "loss": 0.2818, + "step": 4469 + }, + { + "epoch": 0.3813342433031906, + "grad_norm": 2.0031560791270735, + "learning_rate": 7.097845650420574e-05, + "loss": 0.3832, + "step": 4470 + }, + { + "epoch": 0.3814195529773076, + "grad_norm": 1.273001145568356, + "learning_rate": 7.096591525888845e-05, + "loss": 0.3189, + "step": 4471 + }, + { + "epoch": 0.3815048626514247, + "grad_norm": 1.3187343939992253, + "learning_rate": 7.095337241293493e-05, + "loss": 0.2536, + "step": 4472 + }, + { + "epoch": 0.3815901723255417, + "grad_norm": 1.5737800838876461, + "learning_rate": 7.094082796730279e-05, + "loss": 0.3236, + "step": 4473 + }, + { + "epoch": 0.3816754819996588, + "grad_norm": 1.579303716719737, + "learning_rate": 7.092828192294968e-05, + "loss": 0.2776, + "step": 4474 + }, + { + "epoch": 0.3817607916737758, + "grad_norm": 1.251548189802904, + "learning_rate": 7.091573428083348e-05, + "loss": 0.3397, + "step": 4475 + }, + { + "epoch": 0.3818461013478929, + "grad_norm": 1.2982236904194173, + "learning_rate": 7.090318504191211e-05, + "loss": 0.2733, + "step": 4476 + }, + { + "epoch": 0.3819314110220099, + "grad_norm": 1.3205868588313832, + "learning_rate": 7.089063420714366e-05, + "loss": 0.2733, + "step": 4477 + }, + { + "epoch": 0.38201672069612697, + "grad_norm": 1.5384484629808992, + "learning_rate": 7.087808177748628e-05, + "loss": 0.3131, + "step": 4478 + }, + { + "epoch": 0.382102030370244, + "grad_norm": 1.1459045503621361, + "learning_rate": 7.08655277538983e-05, + "loss": 0.2406, + "step": 4479 + }, + { + "epoch": 0.38218734004436106, + "grad_norm": 1.695712978331773, + "learning_rate": 7.085297213733816e-05, + "loss": 0.3436, + "step": 4480 + }, + { + "epoch": 0.3822726497184781, + "grad_norm": 1.415216638810336, + "learning_rate": 7.084041492876442e-05, + "loss": 0.3058, + "step": 4481 + }, + { + "epoch": 0.3823579593925951, + "grad_norm": 1.4036794692652694, + "learning_rate": 7.082785612913573e-05, + "loss": 0.2747, + "step": 4482 + }, + { + "epoch": 0.38244326906671217, + "grad_norm": 1.5035891508230086, + "learning_rate": 7.081529573941091e-05, + "loss": 0.2802, + "step": 4483 + }, + { + "epoch": 0.3825285787408292, + "grad_norm": 1.5613064594306942, + "learning_rate": 7.08027337605489e-05, + "loss": 0.2845, + "step": 4484 + }, + { + "epoch": 0.38261388841494626, + "grad_norm": 1.4793115285223695, + "learning_rate": 7.079017019350866e-05, + "loss": 0.2235, + "step": 4485 + }, + { + "epoch": 0.3826991980890633, + "grad_norm": 1.4909646181172165, + "learning_rate": 7.077760503924945e-05, + "loss": 0.3162, + "step": 4486 + }, + { + "epoch": 0.38278450776318035, + "grad_norm": 1.4548060220360375, + "learning_rate": 7.076503829873048e-05, + "loss": 0.2653, + "step": 4487 + }, + { + "epoch": 0.38286981743729737, + "grad_norm": 1.6898984465474707, + "learning_rate": 7.075246997291119e-05, + "loss": 0.2657, + "step": 4488 + }, + { + "epoch": 0.38295512711141444, + "grad_norm": 1.502906367844182, + "learning_rate": 7.073990006275111e-05, + "loss": 0.2632, + "step": 4489 + }, + { + "epoch": 0.38304043678553146, + "grad_norm": 1.5942791826922043, + "learning_rate": 7.072732856920983e-05, + "loss": 0.4223, + "step": 4490 + }, + { + "epoch": 0.38312574645964853, + "grad_norm": 1.9349089494126306, + "learning_rate": 7.07147554932472e-05, + "loss": 0.3131, + "step": 4491 + }, + { + "epoch": 0.38321105613376555, + "grad_norm": 1.769459165069068, + "learning_rate": 7.070218083582306e-05, + "loss": 0.3588, + "step": 4492 + }, + { + "epoch": 0.3832963658078826, + "grad_norm": 1.5582036268683386, + "learning_rate": 7.068960459789741e-05, + "loss": 0.3227, + "step": 4493 + }, + { + "epoch": 0.38338167548199964, + "grad_norm": 1.3828058188756547, + "learning_rate": 7.06770267804304e-05, + "loss": 0.3273, + "step": 4494 + }, + { + "epoch": 0.3834669851561167, + "grad_norm": 1.563419681731942, + "learning_rate": 7.066444738438227e-05, + "loss": 0.2464, + "step": 4495 + }, + { + "epoch": 0.38355229483023373, + "grad_norm": 1.446627127257955, + "learning_rate": 7.06518664107134e-05, + "loss": 0.3167, + "step": 4496 + }, + { + "epoch": 0.3836376045043508, + "grad_norm": 1.5659177403677336, + "learning_rate": 7.063928386038428e-05, + "loss": 0.344, + "step": 4497 + }, + { + "epoch": 0.3837229141784678, + "grad_norm": 1.3272815494434178, + "learning_rate": 7.06266997343555e-05, + "loss": 0.3102, + "step": 4498 + }, + { + "epoch": 0.3838082238525849, + "grad_norm": 1.485232617433028, + "learning_rate": 7.061411403358781e-05, + "loss": 0.2972, + "step": 4499 + }, + { + "epoch": 0.3838935335267019, + "grad_norm": 1.3395280382640038, + "learning_rate": 7.060152675904206e-05, + "loss": 0.2937, + "step": 4500 + }, + { + "epoch": 0.383978843200819, + "grad_norm": 1.7377499572112904, + "learning_rate": 7.058893791167921e-05, + "loss": 0.2748, + "step": 4501 + }, + { + "epoch": 0.384064152874936, + "grad_norm": 1.423485901864935, + "learning_rate": 7.057634749246037e-05, + "loss": 0.2665, + "step": 4502 + }, + { + "epoch": 0.3841494625490531, + "grad_norm": 1.62689186887609, + "learning_rate": 7.056375550234673e-05, + "loss": 0.319, + "step": 4503 + }, + { + "epoch": 0.3842347722231701, + "grad_norm": 1.4314249456504933, + "learning_rate": 7.055116194229964e-05, + "loss": 0.2749, + "step": 4504 + }, + { + "epoch": 0.38432008189728717, + "grad_norm": 1.5113308825434426, + "learning_rate": 7.053856681328055e-05, + "loss": 0.3101, + "step": 4505 + }, + { + "epoch": 0.3844053915714042, + "grad_norm": 1.4752007719565419, + "learning_rate": 7.052597011625101e-05, + "loss": 0.2447, + "step": 4506 + }, + { + "epoch": 0.38449070124552126, + "grad_norm": 1.3760972413423425, + "learning_rate": 7.051337185217273e-05, + "loss": 0.372, + "step": 4507 + }, + { + "epoch": 0.3845760109196383, + "grad_norm": 2.0004767803283325, + "learning_rate": 7.050077202200753e-05, + "loss": 0.3133, + "step": 4508 + }, + { + "epoch": 0.38466132059375535, + "grad_norm": 1.6176786552240165, + "learning_rate": 7.048817062671732e-05, + "loss": 0.3085, + "step": 4509 + }, + { + "epoch": 0.38474663026787237, + "grad_norm": 1.5237066569121365, + "learning_rate": 7.047556766726418e-05, + "loss": 0.3332, + "step": 4510 + }, + { + "epoch": 0.38483193994198944, + "grad_norm": 1.3421003284501412, + "learning_rate": 7.046296314461021e-05, + "loss": 0.264, + "step": 4511 + }, + { + "epoch": 0.38491724961610646, + "grad_norm": 1.67283054033511, + "learning_rate": 7.045035705971778e-05, + "loss": 0.3565, + "step": 4512 + }, + { + "epoch": 0.38500255929022353, + "grad_norm": 1.4272345673927316, + "learning_rate": 7.043774941354925e-05, + "loss": 0.3058, + "step": 4513 + }, + { + "epoch": 0.38508786896434055, + "grad_norm": 1.3820827304185932, + "learning_rate": 7.042514020706714e-05, + "loss": 0.3155, + "step": 4514 + }, + { + "epoch": 0.3851731786384576, + "grad_norm": 1.5793080946801872, + "learning_rate": 7.041252944123413e-05, + "loss": 0.3546, + "step": 4515 + }, + { + "epoch": 0.38525848831257464, + "grad_norm": 1.6127377534400469, + "learning_rate": 7.039991711701298e-05, + "loss": 0.2929, + "step": 4516 + }, + { + "epoch": 0.3853437979866917, + "grad_norm": 1.3065250271466937, + "learning_rate": 7.038730323536655e-05, + "loss": 0.3197, + "step": 4517 + }, + { + "epoch": 0.38542910766080873, + "grad_norm": 1.5736861546710388, + "learning_rate": 7.037468779725787e-05, + "loss": 0.2698, + "step": 4518 + }, + { + "epoch": 0.3855144173349258, + "grad_norm": 1.2892041215262011, + "learning_rate": 7.036207080365003e-05, + "loss": 0.2524, + "step": 4519 + }, + { + "epoch": 0.3855997270090428, + "grad_norm": 1.5455092749365158, + "learning_rate": 7.034945225550629e-05, + "loss": 0.3421, + "step": 4520 + }, + { + "epoch": 0.38568503668315984, + "grad_norm": 1.6513798925812262, + "learning_rate": 7.033683215379002e-05, + "loss": 0.3266, + "step": 4521 + }, + { + "epoch": 0.3857703463572769, + "grad_norm": 1.3138183149148563, + "learning_rate": 7.032421049946467e-05, + "loss": 0.2431, + "step": 4522 + }, + { + "epoch": 0.38585565603139393, + "grad_norm": 1.3271590815574126, + "learning_rate": 7.031158729349386e-05, + "loss": 0.2885, + "step": 4523 + }, + { + "epoch": 0.385940965705511, + "grad_norm": 1.3315747952828012, + "learning_rate": 7.02989625368413e-05, + "loss": 0.3573, + "step": 4524 + }, + { + "epoch": 0.386026275379628, + "grad_norm": 1.5156007745124165, + "learning_rate": 7.02863362304708e-05, + "loss": 0.3802, + "step": 4525 + }, + { + "epoch": 0.3861115850537451, + "grad_norm": 1.5279067774940591, + "learning_rate": 7.027370837534635e-05, + "loss": 0.2431, + "step": 4526 + }, + { + "epoch": 0.3861968947278621, + "grad_norm": 1.5311221633798902, + "learning_rate": 7.0261078972432e-05, + "loss": 0.287, + "step": 4527 + }, + { + "epoch": 0.3862822044019792, + "grad_norm": 1.7197512397966825, + "learning_rate": 7.024844802269193e-05, + "loss": 0.3525, + "step": 4528 + }, + { + "epoch": 0.3863675140760962, + "grad_norm": 1.377543091697106, + "learning_rate": 7.023581552709045e-05, + "loss": 0.3056, + "step": 4529 + }, + { + "epoch": 0.3864528237502133, + "grad_norm": 1.5646905421247825, + "learning_rate": 7.0223181486592e-05, + "loss": 0.2837, + "step": 4530 + }, + { + "epoch": 0.3865381334243303, + "grad_norm": 1.2348928210434789, + "learning_rate": 7.02105459021611e-05, + "loss": 0.2498, + "step": 4531 + }, + { + "epoch": 0.3866234430984474, + "grad_norm": 1.7044107326095346, + "learning_rate": 7.019790877476241e-05, + "loss": 0.3191, + "step": 4532 + }, + { + "epoch": 0.3867087527725644, + "grad_norm": 1.533493500443582, + "learning_rate": 7.018527010536073e-05, + "loss": 0.2377, + "step": 4533 + }, + { + "epoch": 0.38679406244668146, + "grad_norm": 1.4421556793703096, + "learning_rate": 7.017262989492095e-05, + "loss": 0.2793, + "step": 4534 + }, + { + "epoch": 0.3868793721207985, + "grad_norm": 1.316512785542691, + "learning_rate": 7.015998814440804e-05, + "loss": 0.25, + "step": 4535 + }, + { + "epoch": 0.38696468179491555, + "grad_norm": 1.2895395956597897, + "learning_rate": 7.01473448547872e-05, + "loss": 0.2323, + "step": 4536 + }, + { + "epoch": 0.3870499914690326, + "grad_norm": 1.6074258797318735, + "learning_rate": 7.013470002702363e-05, + "loss": 0.2545, + "step": 4537 + }, + { + "epoch": 0.38713530114314965, + "grad_norm": 1.6384351375198285, + "learning_rate": 7.012205366208272e-05, + "loss": 0.2891, + "step": 4538 + }, + { + "epoch": 0.38722061081726666, + "grad_norm": 1.887845984068671, + "learning_rate": 7.010940576092993e-05, + "loss": 0.3353, + "step": 4539 + }, + { + "epoch": 0.38730592049138374, + "grad_norm": 1.3958496785511645, + "learning_rate": 7.009675632453088e-05, + "loss": 0.2721, + "step": 4540 + }, + { + "epoch": 0.38739123016550076, + "grad_norm": 1.6703004642120742, + "learning_rate": 7.008410535385126e-05, + "loss": 0.283, + "step": 4541 + }, + { + "epoch": 0.38747653983961783, + "grad_norm": 1.3193126255689345, + "learning_rate": 7.007145284985694e-05, + "loss": 0.2369, + "step": 4542 + }, + { + "epoch": 0.38756184951373485, + "grad_norm": 1.4310684622029664, + "learning_rate": 7.005879881351384e-05, + "loss": 0.2585, + "step": 4543 + }, + { + "epoch": 0.3876471591878519, + "grad_norm": 1.3851591984802982, + "learning_rate": 7.004614324578806e-05, + "loss": 0.3339, + "step": 4544 + }, + { + "epoch": 0.38773246886196894, + "grad_norm": 1.512167063238164, + "learning_rate": 7.003348614764575e-05, + "loss": 0.2948, + "step": 4545 + }, + { + "epoch": 0.387817778536086, + "grad_norm": 1.5777215725255826, + "learning_rate": 7.002082752005324e-05, + "loss": 0.2805, + "step": 4546 + }, + { + "epoch": 0.38790308821020303, + "grad_norm": 1.3115208015784663, + "learning_rate": 7.000816736397695e-05, + "loss": 0.3047, + "step": 4547 + }, + { + "epoch": 0.3879883978843201, + "grad_norm": 1.4947810776321049, + "learning_rate": 6.999550568038339e-05, + "loss": 0.2952, + "step": 4548 + }, + { + "epoch": 0.3880737075584371, + "grad_norm": 1.2299786272750706, + "learning_rate": 6.998284247023924e-05, + "loss": 0.2762, + "step": 4549 + }, + { + "epoch": 0.3881590172325542, + "grad_norm": 1.4945677302644735, + "learning_rate": 6.997017773451124e-05, + "loss": 0.281, + "step": 4550 + }, + { + "epoch": 0.3882443269066712, + "grad_norm": 1.395592123038142, + "learning_rate": 6.995751147416632e-05, + "loss": 0.291, + "step": 4551 + }, + { + "epoch": 0.3883296365807883, + "grad_norm": 1.702299845577068, + "learning_rate": 6.994484369017143e-05, + "loss": 0.2787, + "step": 4552 + }, + { + "epoch": 0.3884149462549053, + "grad_norm": 1.5073641570014673, + "learning_rate": 6.993217438349371e-05, + "loss": 0.289, + "step": 4553 + }, + { + "epoch": 0.3885002559290224, + "grad_norm": 1.8571922248524761, + "learning_rate": 6.99195035551004e-05, + "loss": 0.354, + "step": 4554 + }, + { + "epoch": 0.3885855656031394, + "grad_norm": 1.8980786647674177, + "learning_rate": 6.990683120595884e-05, + "loss": 0.3151, + "step": 4555 + }, + { + "epoch": 0.38867087527725647, + "grad_norm": 1.3290734887223816, + "learning_rate": 6.98941573370365e-05, + "loss": 0.2471, + "step": 4556 + }, + { + "epoch": 0.3887561849513735, + "grad_norm": 2.0068219186706218, + "learning_rate": 6.988148194930099e-05, + "loss": 0.3674, + "step": 4557 + }, + { + "epoch": 0.38884149462549056, + "grad_norm": 1.3917257688539573, + "learning_rate": 6.986880504371996e-05, + "loss": 0.3222, + "step": 4558 + }, + { + "epoch": 0.3889268042996076, + "grad_norm": 1.5883562407120946, + "learning_rate": 6.985612662126125e-05, + "loss": 0.2701, + "step": 4559 + }, + { + "epoch": 0.3890121139737246, + "grad_norm": 1.6173405229346307, + "learning_rate": 6.98434466828928e-05, + "loss": 0.3168, + "step": 4560 + }, + { + "epoch": 0.38909742364784167, + "grad_norm": 1.4096693217974976, + "learning_rate": 6.983076522958262e-05, + "loss": 0.3096, + "step": 4561 + }, + { + "epoch": 0.3891827333219587, + "grad_norm": 1.7074700321902099, + "learning_rate": 6.981808226229892e-05, + "loss": 0.316, + "step": 4562 + }, + { + "epoch": 0.38926804299607576, + "grad_norm": 1.7294541391918639, + "learning_rate": 6.980539778200995e-05, + "loss": 0.3447, + "step": 4563 + }, + { + "epoch": 0.3893533526701928, + "grad_norm": 2.016712930111886, + "learning_rate": 6.979271178968409e-05, + "loss": 0.3429, + "step": 4564 + }, + { + "epoch": 0.38943866234430985, + "grad_norm": 1.665511597285209, + "learning_rate": 6.978002428628987e-05, + "loss": 0.2904, + "step": 4565 + }, + { + "epoch": 0.38952397201842687, + "grad_norm": 1.4408264648173768, + "learning_rate": 6.97673352727959e-05, + "loss": 0.2827, + "step": 4566 + }, + { + "epoch": 0.38960928169254394, + "grad_norm": 1.4380485483179404, + "learning_rate": 6.975464475017093e-05, + "loss": 0.2302, + "step": 4567 + }, + { + "epoch": 0.38969459136666096, + "grad_norm": 1.301428556476657, + "learning_rate": 6.974195271938383e-05, + "loss": 0.3175, + "step": 4568 + }, + { + "epoch": 0.38977990104077803, + "grad_norm": 1.3887090449985435, + "learning_rate": 6.972925918140352e-05, + "loss": 0.2562, + "step": 4569 + }, + { + "epoch": 0.38986521071489505, + "grad_norm": 1.2998455909581152, + "learning_rate": 6.971656413719912e-05, + "loss": 0.293, + "step": 4570 + }, + { + "epoch": 0.3899505203890121, + "grad_norm": 1.3520040507453412, + "learning_rate": 6.970386758773983e-05, + "loss": 0.2616, + "step": 4571 + }, + { + "epoch": 0.39003583006312914, + "grad_norm": 1.2969086424359084, + "learning_rate": 6.969116953399496e-05, + "loss": 0.2985, + "step": 4572 + }, + { + "epoch": 0.3901211397372462, + "grad_norm": 1.5802365519502544, + "learning_rate": 6.967846997693392e-05, + "loss": 0.336, + "step": 4573 + }, + { + "epoch": 0.39020644941136323, + "grad_norm": 1.377324134137384, + "learning_rate": 6.966576891752628e-05, + "loss": 0.3164, + "step": 4574 + }, + { + "epoch": 0.3902917590854803, + "grad_norm": 1.5819099285560505, + "learning_rate": 6.965306635674168e-05, + "loss": 0.2954, + "step": 4575 + }, + { + "epoch": 0.3903770687595973, + "grad_norm": 1.153300156768468, + "learning_rate": 6.964036229554991e-05, + "loss": 0.2948, + "step": 4576 + }, + { + "epoch": 0.3904623784337144, + "grad_norm": 1.5604367175509557, + "learning_rate": 6.962765673492083e-05, + "loss": 0.2654, + "step": 4577 + }, + { + "epoch": 0.3905476881078314, + "grad_norm": 1.490088459435667, + "learning_rate": 6.961494967582447e-05, + "loss": 0.2871, + "step": 4578 + }, + { + "epoch": 0.3906329977819485, + "grad_norm": 1.715694555321379, + "learning_rate": 6.960224111923093e-05, + "loss": 0.3044, + "step": 4579 + }, + { + "epoch": 0.3907183074560655, + "grad_norm": 1.2394286895372637, + "learning_rate": 6.958953106611045e-05, + "loss": 0.2372, + "step": 4580 + }, + { + "epoch": 0.3908036171301826, + "grad_norm": 1.585613649874231, + "learning_rate": 6.957681951743338e-05, + "loss": 0.3297, + "step": 4581 + }, + { + "epoch": 0.3908889268042996, + "grad_norm": 1.506513757682751, + "learning_rate": 6.956410647417017e-05, + "loss": 0.2415, + "step": 4582 + }, + { + "epoch": 0.39097423647841667, + "grad_norm": 1.2725742901473736, + "learning_rate": 6.955139193729139e-05, + "loss": 0.2718, + "step": 4583 + }, + { + "epoch": 0.3910595461525337, + "grad_norm": 1.8934275764977166, + "learning_rate": 6.953867590776773e-05, + "loss": 0.3761, + "step": 4584 + }, + { + "epoch": 0.39114485582665076, + "grad_norm": 1.4531075261203663, + "learning_rate": 6.952595838656998e-05, + "loss": 0.3444, + "step": 4585 + }, + { + "epoch": 0.3912301655007678, + "grad_norm": 1.6509228090971697, + "learning_rate": 6.951323937466909e-05, + "loss": 0.3049, + "step": 4586 + }, + { + "epoch": 0.39131547517488485, + "grad_norm": 1.272620565102642, + "learning_rate": 6.950051887303606e-05, + "loss": 0.257, + "step": 4587 + }, + { + "epoch": 0.39140078484900187, + "grad_norm": 1.7904365959712494, + "learning_rate": 6.948779688264203e-05, + "loss": 0.2748, + "step": 4588 + }, + { + "epoch": 0.39148609452311894, + "grad_norm": 1.4970478571273336, + "learning_rate": 6.947507340445827e-05, + "loss": 0.2717, + "step": 4589 + }, + { + "epoch": 0.39157140419723596, + "grad_norm": 1.373259917182607, + "learning_rate": 6.946234843945616e-05, + "loss": 0.2315, + "step": 4590 + }, + { + "epoch": 0.39165671387135303, + "grad_norm": 1.457366168517509, + "learning_rate": 6.944962198860715e-05, + "loss": 0.2091, + "step": 4591 + }, + { + "epoch": 0.39174202354547005, + "grad_norm": 1.3971432285380783, + "learning_rate": 6.943689405288288e-05, + "loss": 0.1899, + "step": 4592 + }, + { + "epoch": 0.3918273332195871, + "grad_norm": 1.5854819258878023, + "learning_rate": 6.942416463325502e-05, + "loss": 0.3365, + "step": 4593 + }, + { + "epoch": 0.39191264289370414, + "grad_norm": 1.69847851995856, + "learning_rate": 6.94114337306954e-05, + "loss": 0.2851, + "step": 4594 + }, + { + "epoch": 0.3919979525678212, + "grad_norm": 1.6070201645294004, + "learning_rate": 6.939870134617599e-05, + "loss": 0.2399, + "step": 4595 + }, + { + "epoch": 0.39208326224193824, + "grad_norm": 1.56035711211305, + "learning_rate": 6.93859674806688e-05, + "loss": 0.3152, + "step": 4596 + }, + { + "epoch": 0.39216857191605525, + "grad_norm": 1.5736167646491517, + "learning_rate": 6.937323213514601e-05, + "loss": 0.2808, + "step": 4597 + }, + { + "epoch": 0.3922538815901723, + "grad_norm": 1.4950705433153098, + "learning_rate": 6.93604953105799e-05, + "loss": 0.3767, + "step": 4598 + }, + { + "epoch": 0.39233919126428934, + "grad_norm": 1.8536279803069522, + "learning_rate": 6.934775700794286e-05, + "loss": 0.3353, + "step": 4599 + }, + { + "epoch": 0.3924245009384064, + "grad_norm": 1.3575979434205159, + "learning_rate": 6.933501722820739e-05, + "loss": 0.2499, + "step": 4600 + }, + { + "epoch": 0.39250981061252344, + "grad_norm": 1.3805691037620866, + "learning_rate": 6.932227597234609e-05, + "loss": 0.2846, + "step": 4601 + }, + { + "epoch": 0.3925951202866405, + "grad_norm": 1.531683101517452, + "learning_rate": 6.930953324133169e-05, + "loss": 0.2805, + "step": 4602 + }, + { + "epoch": 0.3926804299607575, + "grad_norm": 1.3539707580096503, + "learning_rate": 6.929678903613705e-05, + "loss": 0.2873, + "step": 4603 + }, + { + "epoch": 0.3927657396348746, + "grad_norm": 1.5910443545715849, + "learning_rate": 6.928404335773512e-05, + "loss": 0.3399, + "step": 4604 + }, + { + "epoch": 0.3928510493089916, + "grad_norm": 1.5362417639565131, + "learning_rate": 6.927129620709895e-05, + "loss": 0.2703, + "step": 4605 + }, + { + "epoch": 0.3929363589831087, + "grad_norm": 1.4814984205831707, + "learning_rate": 6.92585475852017e-05, + "loss": 0.3081, + "step": 4606 + }, + { + "epoch": 0.3930216686572257, + "grad_norm": 1.4779464452536302, + "learning_rate": 6.924579749301671e-05, + "loss": 0.3213, + "step": 4607 + }, + { + "epoch": 0.3931069783313428, + "grad_norm": 1.4357899155180385, + "learning_rate": 6.923304593151734e-05, + "loss": 0.232, + "step": 4608 + }, + { + "epoch": 0.3931922880054598, + "grad_norm": 1.0475815623355937, + "learning_rate": 6.92202929016771e-05, + "loss": 0.2211, + "step": 4609 + }, + { + "epoch": 0.3932775976795769, + "grad_norm": 1.344976885288535, + "learning_rate": 6.920753840446968e-05, + "loss": 0.2761, + "step": 4610 + }, + { + "epoch": 0.3933629073536939, + "grad_norm": 1.4498307819939664, + "learning_rate": 6.919478244086873e-05, + "loss": 0.3459, + "step": 4611 + }, + { + "epoch": 0.39344821702781096, + "grad_norm": 1.4729667356507057, + "learning_rate": 6.918202501184815e-05, + "loss": 0.3048, + "step": 4612 + }, + { + "epoch": 0.393533526701928, + "grad_norm": 1.2964531545623978, + "learning_rate": 6.91692661183819e-05, + "loss": 0.3007, + "step": 4613 + }, + { + "epoch": 0.39361883637604506, + "grad_norm": 1.3283965338091153, + "learning_rate": 6.915650576144405e-05, + "loss": 0.3408, + "step": 4614 + }, + { + "epoch": 0.3937041460501621, + "grad_norm": 1.463822153529901, + "learning_rate": 6.914374394200878e-05, + "loss": 0.2488, + "step": 4615 + }, + { + "epoch": 0.39378945572427915, + "grad_norm": 1.4963961860801878, + "learning_rate": 6.91309806610504e-05, + "loss": 0.3365, + "step": 4616 + }, + { + "epoch": 0.39387476539839616, + "grad_norm": 1.4169530672526711, + "learning_rate": 6.911821591954328e-05, + "loss": 0.2731, + "step": 4617 + }, + { + "epoch": 0.39396007507251324, + "grad_norm": 1.565964025159736, + "learning_rate": 6.910544971846198e-05, + "loss": 0.3119, + "step": 4618 + }, + { + "epoch": 0.39404538474663026, + "grad_norm": 2.190601003735815, + "learning_rate": 6.909268205878114e-05, + "loss": 0.3299, + "step": 4619 + }, + { + "epoch": 0.39413069442074733, + "grad_norm": 1.4464501929307045, + "learning_rate": 6.907991294147546e-05, + "loss": 0.3051, + "step": 4620 + }, + { + "epoch": 0.39421600409486435, + "grad_norm": 1.4319848951629957, + "learning_rate": 6.906714236751983e-05, + "loss": 0.219, + "step": 4621 + }, + { + "epoch": 0.3943013137689814, + "grad_norm": 1.4275950143328886, + "learning_rate": 6.90543703378892e-05, + "loss": 0.3097, + "step": 4622 + }, + { + "epoch": 0.39438662344309844, + "grad_norm": 1.742195710988218, + "learning_rate": 6.904159685355865e-05, + "loss": 0.2867, + "step": 4623 + }, + { + "epoch": 0.3944719331172155, + "grad_norm": 1.393571924371653, + "learning_rate": 6.902882191550337e-05, + "loss": 0.2816, + "step": 4624 + }, + { + "epoch": 0.39455724279133253, + "grad_norm": 1.742296634557364, + "learning_rate": 6.901604552469865e-05, + "loss": 0.2686, + "step": 4625 + }, + { + "epoch": 0.3946425524654496, + "grad_norm": 1.476607549070974, + "learning_rate": 6.900326768211991e-05, + "loss": 0.2896, + "step": 4626 + }, + { + "epoch": 0.3947278621395666, + "grad_norm": 1.7152135359059182, + "learning_rate": 6.899048838874267e-05, + "loss": 0.3569, + "step": 4627 + }, + { + "epoch": 0.3948131718136837, + "grad_norm": 1.8387632011390922, + "learning_rate": 6.897770764554255e-05, + "loss": 0.317, + "step": 4628 + }, + { + "epoch": 0.3948984814878007, + "grad_norm": 1.311800179785466, + "learning_rate": 6.896492545349529e-05, + "loss": 0.2702, + "step": 4629 + }, + { + "epoch": 0.3949837911619178, + "grad_norm": 1.460611153938112, + "learning_rate": 6.895214181357675e-05, + "loss": 0.2842, + "step": 4630 + }, + { + "epoch": 0.3950691008360348, + "grad_norm": 1.6120617115617952, + "learning_rate": 6.89393567267629e-05, + "loss": 0.2917, + "step": 4631 + }, + { + "epoch": 0.3951544105101519, + "grad_norm": 1.1927707550283513, + "learning_rate": 6.892657019402983e-05, + "loss": 0.2976, + "step": 4632 + }, + { + "epoch": 0.3952397201842689, + "grad_norm": 1.2825498732597815, + "learning_rate": 6.891378221635367e-05, + "loss": 0.2734, + "step": 4633 + }, + { + "epoch": 0.39532502985838597, + "grad_norm": 1.5736170676689993, + "learning_rate": 6.890099279471076e-05, + "loss": 0.2702, + "step": 4634 + }, + { + "epoch": 0.395410339532503, + "grad_norm": 1.5566537968976903, + "learning_rate": 6.888820193007749e-05, + "loss": 0.258, + "step": 4635 + }, + { + "epoch": 0.39549564920662, + "grad_norm": 1.902375771254981, + "learning_rate": 6.887540962343037e-05, + "loss": 0.3284, + "step": 4636 + }, + { + "epoch": 0.3955809588807371, + "grad_norm": 1.8539463583831013, + "learning_rate": 6.886261587574604e-05, + "loss": 0.3052, + "step": 4637 + }, + { + "epoch": 0.3956662685548541, + "grad_norm": 1.5961864024422563, + "learning_rate": 6.88498206880012e-05, + "loss": 0.2782, + "step": 4638 + }, + { + "epoch": 0.39575157822897117, + "grad_norm": 1.132478125320946, + "learning_rate": 6.883702406117275e-05, + "loss": 0.2434, + "step": 4639 + }, + { + "epoch": 0.3958368879030882, + "grad_norm": 1.4613461662347391, + "learning_rate": 6.88242259962376e-05, + "loss": 0.3153, + "step": 4640 + }, + { + "epoch": 0.39592219757720526, + "grad_norm": 1.5547258381932811, + "learning_rate": 6.881142649417281e-05, + "loss": 0.3, + "step": 4641 + }, + { + "epoch": 0.3960075072513223, + "grad_norm": 1.686727276242056, + "learning_rate": 6.879862555595559e-05, + "loss": 0.2861, + "step": 4642 + }, + { + "epoch": 0.39609281692543935, + "grad_norm": 1.479333447290605, + "learning_rate": 6.87858231825632e-05, + "loss": 0.2871, + "step": 4643 + }, + { + "epoch": 0.39617812659955637, + "grad_norm": 1.6713693157051168, + "learning_rate": 6.877301937497302e-05, + "loss": 0.2323, + "step": 4644 + }, + { + "epoch": 0.39626343627367344, + "grad_norm": 2.6435680353837383, + "learning_rate": 6.87602141341626e-05, + "loss": 0.2764, + "step": 4645 + }, + { + "epoch": 0.39634874594779046, + "grad_norm": 1.5422445021958167, + "learning_rate": 6.874740746110951e-05, + "loss": 0.3511, + "step": 4646 + }, + { + "epoch": 0.39643405562190753, + "grad_norm": 1.403750847997095, + "learning_rate": 6.87345993567915e-05, + "loss": 0.3002, + "step": 4647 + }, + { + "epoch": 0.39651936529602455, + "grad_norm": 1.5491902912890552, + "learning_rate": 6.872178982218635e-05, + "loss": 0.2992, + "step": 4648 + }, + { + "epoch": 0.3966046749701416, + "grad_norm": 1.4985863063779725, + "learning_rate": 6.870897885827206e-05, + "loss": 0.267, + "step": 4649 + }, + { + "epoch": 0.39668998464425864, + "grad_norm": 1.6645671813461298, + "learning_rate": 6.869616646602664e-05, + "loss": 0.2988, + "step": 4650 + }, + { + "epoch": 0.3967752943183757, + "grad_norm": 1.502275330666259, + "learning_rate": 6.868335264642827e-05, + "loss": 0.3, + "step": 4651 + }, + { + "epoch": 0.39686060399249273, + "grad_norm": 1.2325238217417498, + "learning_rate": 6.867053740045521e-05, + "loss": 0.2508, + "step": 4652 + }, + { + "epoch": 0.3969459136666098, + "grad_norm": 1.3103779486357645, + "learning_rate": 6.865772072908583e-05, + "loss": 0.3048, + "step": 4653 + }, + { + "epoch": 0.3970312233407268, + "grad_norm": 1.6371840951224823, + "learning_rate": 6.864490263329862e-05, + "loss": 0.3025, + "step": 4654 + }, + { + "epoch": 0.3971165330148439, + "grad_norm": 1.287458774684394, + "learning_rate": 6.863208311407216e-05, + "loss": 0.287, + "step": 4655 + }, + { + "epoch": 0.3972018426889609, + "grad_norm": 1.7996126632378862, + "learning_rate": 6.861926217238519e-05, + "loss": 0.3064, + "step": 4656 + }, + { + "epoch": 0.397287152363078, + "grad_norm": 1.3241550126392145, + "learning_rate": 6.860643980921648e-05, + "loss": 0.2847, + "step": 4657 + }, + { + "epoch": 0.397372462037195, + "grad_norm": 1.554806191979148, + "learning_rate": 6.859361602554499e-05, + "loss": 0.2791, + "step": 4658 + }, + { + "epoch": 0.3974577717113121, + "grad_norm": 1.275912261421778, + "learning_rate": 6.858079082234969e-05, + "loss": 0.2596, + "step": 4659 + }, + { + "epoch": 0.3975430813854291, + "grad_norm": 1.6908249770262418, + "learning_rate": 6.856796420060976e-05, + "loss": 0.3321, + "step": 4660 + }, + { + "epoch": 0.39762839105954617, + "grad_norm": 1.4943277079822164, + "learning_rate": 6.855513616130445e-05, + "loss": 0.2991, + "step": 4661 + }, + { + "epoch": 0.3977137007336632, + "grad_norm": 1.5328483802881172, + "learning_rate": 6.854230670541306e-05, + "loss": 0.2956, + "step": 4662 + }, + { + "epoch": 0.39779901040778026, + "grad_norm": 1.7092325433651747, + "learning_rate": 6.852947583391511e-05, + "loss": 0.3001, + "step": 4663 + }, + { + "epoch": 0.3978843200818973, + "grad_norm": 1.3917726217507325, + "learning_rate": 6.851664354779015e-05, + "loss": 0.2748, + "step": 4664 + }, + { + "epoch": 0.39796962975601435, + "grad_norm": 1.628888246724926, + "learning_rate": 6.850380984801783e-05, + "loss": 0.2603, + "step": 4665 + }, + { + "epoch": 0.39805493943013137, + "grad_norm": 1.8033425079272147, + "learning_rate": 6.849097473557798e-05, + "loss": 0.2407, + "step": 4666 + }, + { + "epoch": 0.39814024910424844, + "grad_norm": 1.8081164791092323, + "learning_rate": 6.847813821145045e-05, + "loss": 0.3652, + "step": 4667 + }, + { + "epoch": 0.39822555877836546, + "grad_norm": 1.7124288669486774, + "learning_rate": 6.84653002766153e-05, + "loss": 0.2479, + "step": 4668 + }, + { + "epoch": 0.39831086845248254, + "grad_norm": 1.5764689301472854, + "learning_rate": 6.845246093205256e-05, + "loss": 0.3081, + "step": 4669 + }, + { + "epoch": 0.39839617812659955, + "grad_norm": 1.3440145631324723, + "learning_rate": 6.84396201787425e-05, + "loss": 0.2867, + "step": 4670 + }, + { + "epoch": 0.3984814878007166, + "grad_norm": 1.3758896637138014, + "learning_rate": 6.842677801766541e-05, + "loss": 0.2784, + "step": 4671 + }, + { + "epoch": 0.39856679747483365, + "grad_norm": 1.5340133178560547, + "learning_rate": 6.841393444980177e-05, + "loss": 0.334, + "step": 4672 + }, + { + "epoch": 0.3986521071489507, + "grad_norm": 1.5273972914108909, + "learning_rate": 6.840108947613205e-05, + "loss": 0.2835, + "step": 4673 + }, + { + "epoch": 0.39873741682306774, + "grad_norm": 1.2305077440652907, + "learning_rate": 6.838824309763696e-05, + "loss": 0.2722, + "step": 4674 + }, + { + "epoch": 0.39882272649718475, + "grad_norm": 1.6475275791621005, + "learning_rate": 6.83753953152972e-05, + "loss": 0.2862, + "step": 4675 + }, + { + "epoch": 0.3989080361713018, + "grad_norm": 1.746284218454933, + "learning_rate": 6.836254613009367e-05, + "loss": 0.2862, + "step": 4676 + }, + { + "epoch": 0.39899334584541885, + "grad_norm": 1.5733628892681732, + "learning_rate": 6.834969554300732e-05, + "loss": 0.3365, + "step": 4677 + }, + { + "epoch": 0.3990786555195359, + "grad_norm": 1.5689946048514956, + "learning_rate": 6.833684355501923e-05, + "loss": 0.3176, + "step": 4678 + }, + { + "epoch": 0.39916396519365294, + "grad_norm": 1.3491703981218737, + "learning_rate": 6.832399016711058e-05, + "loss": 0.2719, + "step": 4679 + }, + { + "epoch": 0.39924927486777, + "grad_norm": 1.7031189104723774, + "learning_rate": 6.831113538026264e-05, + "loss": 0.3225, + "step": 4680 + }, + { + "epoch": 0.399334584541887, + "grad_norm": 1.7788293016868693, + "learning_rate": 6.829827919545682e-05, + "loss": 0.2748, + "step": 4681 + }, + { + "epoch": 0.3994198942160041, + "grad_norm": 1.442093682673117, + "learning_rate": 6.828542161367462e-05, + "loss": 0.2874, + "step": 4682 + }, + { + "epoch": 0.3995052038901211, + "grad_norm": 1.414656372132328, + "learning_rate": 6.827256263589766e-05, + "loss": 0.2037, + "step": 4683 + }, + { + "epoch": 0.3995905135642382, + "grad_norm": 1.4901555791699153, + "learning_rate": 6.825970226310762e-05, + "loss": 0.2484, + "step": 4684 + }, + { + "epoch": 0.3996758232383552, + "grad_norm": 1.6049473419345228, + "learning_rate": 6.824684049628638e-05, + "loss": 0.3102, + "step": 4685 + }, + { + "epoch": 0.3997611329124723, + "grad_norm": 1.5065416109179481, + "learning_rate": 6.82339773364158e-05, + "loss": 0.2904, + "step": 4686 + }, + { + "epoch": 0.3998464425865893, + "grad_norm": 1.72071497173756, + "learning_rate": 6.822111278447796e-05, + "loss": 0.2971, + "step": 4687 + }, + { + "epoch": 0.3999317522607064, + "grad_norm": 1.4908493992704064, + "learning_rate": 6.820824684145499e-05, + "loss": 0.2582, + "step": 4688 + }, + { + "epoch": 0.4000170619348234, + "grad_norm": 1.540378221165017, + "learning_rate": 6.819537950832912e-05, + "loss": 0.3105, + "step": 4689 + }, + { + "epoch": 0.40010237160894047, + "grad_norm": 1.475722624067136, + "learning_rate": 6.818251078608273e-05, + "loss": 0.2859, + "step": 4690 + }, + { + "epoch": 0.4001876812830575, + "grad_norm": 1.6429182464775187, + "learning_rate": 6.816964067569825e-05, + "loss": 0.3022, + "step": 4691 + }, + { + "epoch": 0.40027299095717456, + "grad_norm": 1.8708256348222843, + "learning_rate": 6.815676917815826e-05, + "loss": 0.3018, + "step": 4692 + }, + { + "epoch": 0.4003583006312916, + "grad_norm": 1.681261611387863, + "learning_rate": 6.814389629444543e-05, + "loss": 0.2971, + "step": 4693 + }, + { + "epoch": 0.40044361030540865, + "grad_norm": 1.6500267575724208, + "learning_rate": 6.813102202554254e-05, + "loss": 0.2745, + "step": 4694 + }, + { + "epoch": 0.40052891997952567, + "grad_norm": 1.38256385421221, + "learning_rate": 6.811814637243246e-05, + "loss": 0.2487, + "step": 4695 + }, + { + "epoch": 0.40061422965364274, + "grad_norm": 1.6625298260759604, + "learning_rate": 6.810526933609818e-05, + "loss": 0.2893, + "step": 4696 + }, + { + "epoch": 0.40069953932775976, + "grad_norm": 1.527707888228031, + "learning_rate": 6.80923909175228e-05, + "loss": 0.2719, + "step": 4697 + }, + { + "epoch": 0.40078484900187683, + "grad_norm": 1.576015308644645, + "learning_rate": 6.807951111768952e-05, + "loss": 0.241, + "step": 4698 + }, + { + "epoch": 0.40087015867599385, + "grad_norm": 1.6825287674460996, + "learning_rate": 6.806662993758164e-05, + "loss": 0.3669, + "step": 4699 + }, + { + "epoch": 0.4009554683501109, + "grad_norm": 1.8161988498865154, + "learning_rate": 6.805374737818257e-05, + "loss": 0.3379, + "step": 4700 + }, + { + "epoch": 0.40104077802422794, + "grad_norm": 1.7608824646006853, + "learning_rate": 6.804086344047583e-05, + "loss": 0.3706, + "step": 4701 + }, + { + "epoch": 0.401126087698345, + "grad_norm": 1.6890095211052047, + "learning_rate": 6.802797812544502e-05, + "loss": 0.3486, + "step": 4702 + }, + { + "epoch": 0.40121139737246203, + "grad_norm": 1.525512768572268, + "learning_rate": 6.80150914340739e-05, + "loss": 0.343, + "step": 4703 + }, + { + "epoch": 0.4012967070465791, + "grad_norm": 1.6695286337975233, + "learning_rate": 6.800220336734627e-05, + "loss": 0.3034, + "step": 4704 + }, + { + "epoch": 0.4013820167206961, + "grad_norm": 1.4851168736149685, + "learning_rate": 6.798931392624608e-05, + "loss": 0.3217, + "step": 4705 + }, + { + "epoch": 0.4014673263948132, + "grad_norm": 1.5656454850187258, + "learning_rate": 6.797642311175736e-05, + "loss": 0.2683, + "step": 4706 + }, + { + "epoch": 0.4015526360689302, + "grad_norm": 1.474514140912765, + "learning_rate": 6.796353092486427e-05, + "loss": 0.2983, + "step": 4707 + }, + { + "epoch": 0.4016379457430473, + "grad_norm": 1.5058859737436858, + "learning_rate": 6.795063736655104e-05, + "loss": 0.3351, + "step": 4708 + }, + { + "epoch": 0.4017232554171643, + "grad_norm": 1.4680075696651282, + "learning_rate": 6.793774243780206e-05, + "loss": 0.2918, + "step": 4709 + }, + { + "epoch": 0.4018085650912814, + "grad_norm": 1.317443710514474, + "learning_rate": 6.792484613960175e-05, + "loss": 0.2931, + "step": 4710 + }, + { + "epoch": 0.4018938747653984, + "grad_norm": 1.4555940878154447, + "learning_rate": 6.79119484729347e-05, + "loss": 0.2497, + "step": 4711 + }, + { + "epoch": 0.4019791844395154, + "grad_norm": 1.6225142907687375, + "learning_rate": 6.789904943878554e-05, + "loss": 0.3167, + "step": 4712 + }, + { + "epoch": 0.4020644941136325, + "grad_norm": 1.3311394085863255, + "learning_rate": 6.78861490381391e-05, + "loss": 0.2691, + "step": 4713 + }, + { + "epoch": 0.4021498037877495, + "grad_norm": 1.5026474477615848, + "learning_rate": 6.787324727198021e-05, + "loss": 0.2736, + "step": 4714 + }, + { + "epoch": 0.4022351134618666, + "grad_norm": 1.457659465775207, + "learning_rate": 6.786034414129388e-05, + "loss": 0.2258, + "step": 4715 + }, + { + "epoch": 0.4023204231359836, + "grad_norm": 1.3390062567384504, + "learning_rate": 6.784743964706518e-05, + "loss": 0.2586, + "step": 4716 + }, + { + "epoch": 0.40240573281010067, + "grad_norm": 1.5468754623874783, + "learning_rate": 6.783453379027931e-05, + "loss": 0.2841, + "step": 4717 + }, + { + "epoch": 0.4024910424842177, + "grad_norm": 1.9170068632429835, + "learning_rate": 6.782162657192154e-05, + "loss": 0.3211, + "step": 4718 + }, + { + "epoch": 0.40257635215833476, + "grad_norm": 1.6549958573027332, + "learning_rate": 6.780871799297731e-05, + "loss": 0.3336, + "step": 4719 + }, + { + "epoch": 0.4026616618324518, + "grad_norm": 1.3920109732184394, + "learning_rate": 6.779580805443208e-05, + "loss": 0.2924, + "step": 4720 + }, + { + "epoch": 0.40274697150656885, + "grad_norm": 1.2671315210209797, + "learning_rate": 6.778289675727149e-05, + "loss": 0.3115, + "step": 4721 + }, + { + "epoch": 0.40283228118068587, + "grad_norm": 1.371751763233006, + "learning_rate": 6.776998410248122e-05, + "loss": 0.2671, + "step": 4722 + }, + { + "epoch": 0.40291759085480294, + "grad_norm": 1.5866332383734976, + "learning_rate": 6.775707009104708e-05, + "loss": 0.2544, + "step": 4723 + }, + { + "epoch": 0.40300290052891996, + "grad_norm": 1.5288381460983147, + "learning_rate": 6.774415472395501e-05, + "loss": 0.2396, + "step": 4724 + }, + { + "epoch": 0.40308821020303703, + "grad_norm": 1.846050056266541, + "learning_rate": 6.773123800219103e-05, + "loss": 0.3256, + "step": 4725 + }, + { + "epoch": 0.40317351987715405, + "grad_norm": 1.395396458405523, + "learning_rate": 6.771831992674123e-05, + "loss": 0.2704, + "step": 4726 + }, + { + "epoch": 0.4032588295512711, + "grad_norm": 1.3338806648209176, + "learning_rate": 6.770540049859188e-05, + "loss": 0.2836, + "step": 4727 + }, + { + "epoch": 0.40334413922538814, + "grad_norm": 1.3227004177337123, + "learning_rate": 6.769247971872927e-05, + "loss": 0.2709, + "step": 4728 + }, + { + "epoch": 0.4034294488995052, + "grad_norm": 1.2198057248255307, + "learning_rate": 6.767955758813986e-05, + "loss": 0.2184, + "step": 4729 + }, + { + "epoch": 0.40351475857362223, + "grad_norm": 1.604279311877894, + "learning_rate": 6.766663410781019e-05, + "loss": 0.2834, + "step": 4730 + }, + { + "epoch": 0.4036000682477393, + "grad_norm": 1.4499869970856736, + "learning_rate": 6.765370927872687e-05, + "loss": 0.275, + "step": 4731 + }, + { + "epoch": 0.4036853779218563, + "grad_norm": 1.4083872342751873, + "learning_rate": 6.764078310187668e-05, + "loss": 0.261, + "step": 4732 + }, + { + "epoch": 0.4037706875959734, + "grad_norm": 1.468926926365788, + "learning_rate": 6.76278555782464e-05, + "loss": 0.3218, + "step": 4733 + }, + { + "epoch": 0.4038559972700904, + "grad_norm": 1.5468175087465845, + "learning_rate": 6.761492670882306e-05, + "loss": 0.2597, + "step": 4734 + }, + { + "epoch": 0.4039413069442075, + "grad_norm": 1.098861304364317, + "learning_rate": 6.760199649459366e-05, + "loss": 0.2649, + "step": 4735 + }, + { + "epoch": 0.4040266166183245, + "grad_norm": 1.432545125094113, + "learning_rate": 6.758906493654535e-05, + "loss": 0.2889, + "step": 4736 + }, + { + "epoch": 0.4041119262924416, + "grad_norm": 1.4892349991848974, + "learning_rate": 6.757613203566542e-05, + "loss": 0.2535, + "step": 4737 + }, + { + "epoch": 0.4041972359665586, + "grad_norm": 1.6863720267605191, + "learning_rate": 6.75631977929412e-05, + "loss": 0.33, + "step": 4738 + }, + { + "epoch": 0.4042825456406757, + "grad_norm": 1.4677454171009277, + "learning_rate": 6.755026220936016e-05, + "loss": 0.259, + "step": 4739 + }, + { + "epoch": 0.4043678553147927, + "grad_norm": 1.7032873793721703, + "learning_rate": 6.753732528590986e-05, + "loss": 0.2225, + "step": 4740 + }, + { + "epoch": 0.40445316498890976, + "grad_norm": 1.4061490340544598, + "learning_rate": 6.752438702357797e-05, + "loss": 0.2872, + "step": 4741 + }, + { + "epoch": 0.4045384746630268, + "grad_norm": 1.6354133298155482, + "learning_rate": 6.751144742335227e-05, + "loss": 0.3345, + "step": 4742 + }, + { + "epoch": 0.40462378433714385, + "grad_norm": 1.6193790053241155, + "learning_rate": 6.749850648622061e-05, + "loss": 0.2773, + "step": 4743 + }, + { + "epoch": 0.4047090940112609, + "grad_norm": 1.6087996556251798, + "learning_rate": 6.748556421317094e-05, + "loss": 0.2972, + "step": 4744 + }, + { + "epoch": 0.40479440368537795, + "grad_norm": 1.525580517699773, + "learning_rate": 6.747262060519139e-05, + "loss": 0.2971, + "step": 4745 + }, + { + "epoch": 0.40487971335949496, + "grad_norm": 1.4003914336943053, + "learning_rate": 6.745967566327009e-05, + "loss": 0.2378, + "step": 4746 + }, + { + "epoch": 0.40496502303361204, + "grad_norm": 1.6362124732065841, + "learning_rate": 6.744672938839534e-05, + "loss": 0.3152, + "step": 4747 + }, + { + "epoch": 0.40505033270772905, + "grad_norm": 1.4131028939516943, + "learning_rate": 6.743378178155551e-05, + "loss": 0.2627, + "step": 4748 + }, + { + "epoch": 0.40513564238184613, + "grad_norm": 1.6917404302791732, + "learning_rate": 6.742083284373907e-05, + "loss": 0.3055, + "step": 4749 + }, + { + "epoch": 0.40522095205596315, + "grad_norm": 1.64134621889421, + "learning_rate": 6.740788257593463e-05, + "loss": 0.2539, + "step": 4750 + }, + { + "epoch": 0.40530626173008016, + "grad_norm": 1.3712801466960194, + "learning_rate": 6.739493097913088e-05, + "loss": 0.2686, + "step": 4751 + }, + { + "epoch": 0.40539157140419724, + "grad_norm": 1.81781679574932, + "learning_rate": 6.738197805431657e-05, + "loss": 0.3294, + "step": 4752 + }, + { + "epoch": 0.40547688107831426, + "grad_norm": 1.3130989524585548, + "learning_rate": 6.73690238024806e-05, + "loss": 0.2972, + "step": 4753 + }, + { + "epoch": 0.40556219075243133, + "grad_norm": 1.3554860754307458, + "learning_rate": 6.735606822461195e-05, + "loss": 0.2533, + "step": 4754 + }, + { + "epoch": 0.40564750042654835, + "grad_norm": 1.3367435633609805, + "learning_rate": 6.734311132169974e-05, + "loss": 0.3049, + "step": 4755 + }, + { + "epoch": 0.4057328101006654, + "grad_norm": 1.5774879302758598, + "learning_rate": 6.733015309473313e-05, + "loss": 0.2941, + "step": 4756 + }, + { + "epoch": 0.40581811977478244, + "grad_norm": 1.3348080952903854, + "learning_rate": 6.731719354470143e-05, + "loss": 0.3004, + "step": 4757 + }, + { + "epoch": 0.4059034294488995, + "grad_norm": 1.6472083838261977, + "learning_rate": 6.730423267259402e-05, + "loss": 0.2782, + "step": 4758 + }, + { + "epoch": 0.40598873912301653, + "grad_norm": 1.3219518472324805, + "learning_rate": 6.729127047940042e-05, + "loss": 0.2912, + "step": 4759 + }, + { + "epoch": 0.4060740487971336, + "grad_norm": 1.2290556528854621, + "learning_rate": 6.727830696611018e-05, + "loss": 0.2489, + "step": 4760 + }, + { + "epoch": 0.4061593584712506, + "grad_norm": 1.4525354481506312, + "learning_rate": 6.726534213371304e-05, + "loss": 0.2752, + "step": 4761 + }, + { + "epoch": 0.4062446681453677, + "grad_norm": 1.438487584420817, + "learning_rate": 6.725237598319877e-05, + "loss": 0.2765, + "step": 4762 + }, + { + "epoch": 0.4063299778194847, + "grad_norm": 1.5139714295325781, + "learning_rate": 6.723940851555726e-05, + "loss": 0.2844, + "step": 4763 + }, + { + "epoch": 0.4064152874936018, + "grad_norm": 1.7191551424578038, + "learning_rate": 6.722643973177855e-05, + "loss": 0.284, + "step": 4764 + }, + { + "epoch": 0.4065005971677188, + "grad_norm": 1.586272706935678, + "learning_rate": 6.721346963285266e-05, + "loss": 0.2751, + "step": 4765 + }, + { + "epoch": 0.4065859068418359, + "grad_norm": 1.4115030298041737, + "learning_rate": 6.720049821976988e-05, + "loss": 0.2557, + "step": 4766 + }, + { + "epoch": 0.4066712165159529, + "grad_norm": 1.3294349941674626, + "learning_rate": 6.718752549352045e-05, + "loss": 0.2977, + "step": 4767 + }, + { + "epoch": 0.40675652619006997, + "grad_norm": 1.2977928682234485, + "learning_rate": 6.717455145509477e-05, + "loss": 0.2393, + "step": 4768 + }, + { + "epoch": 0.406841835864187, + "grad_norm": 1.5159516916454125, + "learning_rate": 6.716157610548338e-05, + "loss": 0.3351, + "step": 4769 + }, + { + "epoch": 0.40692714553830406, + "grad_norm": 1.4790604871151443, + "learning_rate": 6.714859944567681e-05, + "loss": 0.2487, + "step": 4770 + }, + { + "epoch": 0.4070124552124211, + "grad_norm": 1.4105449362746516, + "learning_rate": 6.713562147666584e-05, + "loss": 0.31, + "step": 4771 + }, + { + "epoch": 0.40709776488653815, + "grad_norm": 2.0355809916212393, + "learning_rate": 6.71226421994412e-05, + "loss": 0.3558, + "step": 4772 + }, + { + "epoch": 0.40718307456065517, + "grad_norm": 1.4466630553861157, + "learning_rate": 6.710966161499384e-05, + "loss": 0.2745, + "step": 4773 + }, + { + "epoch": 0.40726838423477224, + "grad_norm": 1.6573021623657533, + "learning_rate": 6.709667972431473e-05, + "loss": 0.2535, + "step": 4774 + }, + { + "epoch": 0.40735369390888926, + "grad_norm": 1.6361980474863502, + "learning_rate": 6.708369652839497e-05, + "loss": 0.2279, + "step": 4775 + }, + { + "epoch": 0.40743900358300633, + "grad_norm": 1.479941563612484, + "learning_rate": 6.707071202822575e-05, + "loss": 0.2904, + "step": 4776 + }, + { + "epoch": 0.40752431325712335, + "grad_norm": 1.390658603219355, + "learning_rate": 6.70577262247984e-05, + "loss": 0.3065, + "step": 4777 + }, + { + "epoch": 0.4076096229312404, + "grad_norm": 1.517193484073549, + "learning_rate": 6.704473911910428e-05, + "loss": 0.2586, + "step": 4778 + }, + { + "epoch": 0.40769493260535744, + "grad_norm": 1.559003656545759, + "learning_rate": 6.703175071213493e-05, + "loss": 0.2663, + "step": 4779 + }, + { + "epoch": 0.4077802422794745, + "grad_norm": 1.339472683162747, + "learning_rate": 6.701876100488189e-05, + "loss": 0.2897, + "step": 4780 + }, + { + "epoch": 0.40786555195359153, + "grad_norm": 1.6597126446620065, + "learning_rate": 6.70057699983369e-05, + "loss": 0.312, + "step": 4781 + }, + { + "epoch": 0.4079508616277086, + "grad_norm": 1.6333853706841626, + "learning_rate": 6.699277769349174e-05, + "loss": 0.335, + "step": 4782 + }, + { + "epoch": 0.4080361713018256, + "grad_norm": 1.3595979880360427, + "learning_rate": 6.697978409133831e-05, + "loss": 0.2774, + "step": 4783 + }, + { + "epoch": 0.4081214809759427, + "grad_norm": 1.650871468645439, + "learning_rate": 6.696678919286859e-05, + "loss": 0.2893, + "step": 4784 + }, + { + "epoch": 0.4082067906500597, + "grad_norm": 1.720100565787041, + "learning_rate": 6.695379299907467e-05, + "loss": 0.2992, + "step": 4785 + }, + { + "epoch": 0.4082921003241768, + "grad_norm": 1.3034881287104796, + "learning_rate": 6.694079551094873e-05, + "loss": 0.3235, + "step": 4786 + }, + { + "epoch": 0.4083774099982938, + "grad_norm": 1.4332083618035119, + "learning_rate": 6.69277967294831e-05, + "loss": 0.3069, + "step": 4787 + }, + { + "epoch": 0.4084627196724109, + "grad_norm": 1.6453404553711097, + "learning_rate": 6.691479665567015e-05, + "loss": 0.2907, + "step": 4788 + }, + { + "epoch": 0.4085480293465279, + "grad_norm": 1.4563808267791303, + "learning_rate": 6.690179529050235e-05, + "loss": 0.2546, + "step": 4789 + }, + { + "epoch": 0.4086333390206449, + "grad_norm": 1.7629975881316617, + "learning_rate": 6.688879263497229e-05, + "loss": 0.2713, + "step": 4790 + }, + { + "epoch": 0.408718648694762, + "grad_norm": 1.5129243050629935, + "learning_rate": 6.687578869007267e-05, + "loss": 0.3001, + "step": 4791 + }, + { + "epoch": 0.408803958368879, + "grad_norm": 1.5852886808302782, + "learning_rate": 6.686278345679625e-05, + "loss": 0.2835, + "step": 4792 + }, + { + "epoch": 0.4088892680429961, + "grad_norm": 1.4588586224514366, + "learning_rate": 6.684977693613593e-05, + "loss": 0.299, + "step": 4793 + }, + { + "epoch": 0.4089745777171131, + "grad_norm": 1.231377160336562, + "learning_rate": 6.683676912908469e-05, + "loss": 0.2735, + "step": 4794 + }, + { + "epoch": 0.40905988739123017, + "grad_norm": 1.669601249030669, + "learning_rate": 6.682376003663559e-05, + "loss": 0.2566, + "step": 4795 + }, + { + "epoch": 0.4091451970653472, + "grad_norm": 1.4045588178748063, + "learning_rate": 6.681074965978181e-05, + "loss": 0.2497, + "step": 4796 + }, + { + "epoch": 0.40923050673946426, + "grad_norm": 1.5610400436367815, + "learning_rate": 6.679773799951662e-05, + "loss": 0.2712, + "step": 4797 + }, + { + "epoch": 0.4093158164135813, + "grad_norm": 1.3994277772411083, + "learning_rate": 6.67847250568334e-05, + "loss": 0.3067, + "step": 4798 + }, + { + "epoch": 0.40940112608769835, + "grad_norm": 1.6934813705199043, + "learning_rate": 6.677171083272562e-05, + "loss": 0.2866, + "step": 4799 + }, + { + "epoch": 0.40948643576181537, + "grad_norm": 1.5171451613768165, + "learning_rate": 6.675869532818683e-05, + "loss": 0.2775, + "step": 4800 + }, + { + "epoch": 0.40957174543593244, + "grad_norm": 1.575198036943552, + "learning_rate": 6.674567854421073e-05, + "loss": 0.3057, + "step": 4801 + }, + { + "epoch": 0.40965705511004946, + "grad_norm": 1.6551723933254163, + "learning_rate": 6.673266048179103e-05, + "loss": 0.3136, + "step": 4802 + }, + { + "epoch": 0.40974236478416654, + "grad_norm": 1.7257018554992303, + "learning_rate": 6.671964114192164e-05, + "loss": 0.2594, + "step": 4803 + }, + { + "epoch": 0.40982767445828355, + "grad_norm": 1.2906795647196054, + "learning_rate": 6.670662052559649e-05, + "loss": 0.331, + "step": 4804 + }, + { + "epoch": 0.4099129841324006, + "grad_norm": 1.4877199731590542, + "learning_rate": 6.669359863380964e-05, + "loss": 0.2822, + "step": 4805 + }, + { + "epoch": 0.40999829380651764, + "grad_norm": 1.6376371515509076, + "learning_rate": 6.668057546755526e-05, + "loss": 0.2949, + "step": 4806 + }, + { + "epoch": 0.4100836034806347, + "grad_norm": 1.6680255674739966, + "learning_rate": 6.666755102782758e-05, + "loss": 0.2584, + "step": 4807 + }, + { + "epoch": 0.41016891315475174, + "grad_norm": 1.2645501647235977, + "learning_rate": 6.665452531562093e-05, + "loss": 0.2581, + "step": 4808 + }, + { + "epoch": 0.4102542228288688, + "grad_norm": 1.4702913840193408, + "learning_rate": 6.66414983319298e-05, + "loss": 0.2963, + "step": 4809 + }, + { + "epoch": 0.4103395325029858, + "grad_norm": 1.575541279137308, + "learning_rate": 6.662847007774869e-05, + "loss": 0.3006, + "step": 4810 + }, + { + "epoch": 0.4104248421771029, + "grad_norm": 1.4026066950640559, + "learning_rate": 6.661544055407225e-05, + "loss": 0.3007, + "step": 4811 + }, + { + "epoch": 0.4105101518512199, + "grad_norm": 1.2258523233595118, + "learning_rate": 6.660240976189523e-05, + "loss": 0.2369, + "step": 4812 + }, + { + "epoch": 0.410595461525337, + "grad_norm": 1.6342489686297144, + "learning_rate": 6.658937770221242e-05, + "loss": 0.3084, + "step": 4813 + }, + { + "epoch": 0.410680771199454, + "grad_norm": 1.4188835618929705, + "learning_rate": 6.657634437601881e-05, + "loss": 0.2622, + "step": 4814 + }, + { + "epoch": 0.4107660808735711, + "grad_norm": 1.348456634350364, + "learning_rate": 6.656330978430939e-05, + "loss": 0.2277, + "step": 4815 + }, + { + "epoch": 0.4108513905476881, + "grad_norm": 1.2839683910056208, + "learning_rate": 6.65502739280793e-05, + "loss": 0.2366, + "step": 4816 + }, + { + "epoch": 0.4109367002218052, + "grad_norm": 1.514936467717608, + "learning_rate": 6.653723680832371e-05, + "loss": 0.2776, + "step": 4817 + }, + { + "epoch": 0.4110220098959222, + "grad_norm": 1.6281571163590254, + "learning_rate": 6.652419842603797e-05, + "loss": 0.2781, + "step": 4818 + }, + { + "epoch": 0.41110731957003926, + "grad_norm": 1.88539794434864, + "learning_rate": 6.651115878221752e-05, + "loss": 0.3939, + "step": 4819 + }, + { + "epoch": 0.4111926292441563, + "grad_norm": 1.6162540856447674, + "learning_rate": 6.649811787785781e-05, + "loss": 0.2993, + "step": 4820 + }, + { + "epoch": 0.41127793891827336, + "grad_norm": 1.5074825419969742, + "learning_rate": 6.648507571395449e-05, + "loss": 0.3057, + "step": 4821 + }, + { + "epoch": 0.4113632485923904, + "grad_norm": 2.197790301940499, + "learning_rate": 6.647203229150322e-05, + "loss": 0.2711, + "step": 4822 + }, + { + "epoch": 0.41144855826650745, + "grad_norm": 1.6507823592045492, + "learning_rate": 6.645898761149982e-05, + "loss": 0.3134, + "step": 4823 + }, + { + "epoch": 0.41153386794062446, + "grad_norm": 1.4879250087555818, + "learning_rate": 6.644594167494019e-05, + "loss": 0.2409, + "step": 4824 + }, + { + "epoch": 0.41161917761474154, + "grad_norm": 2.0705843764953813, + "learning_rate": 6.643289448282031e-05, + "loss": 0.2797, + "step": 4825 + }, + { + "epoch": 0.41170448728885856, + "grad_norm": 1.626970563522614, + "learning_rate": 6.641984603613625e-05, + "loss": 0.2932, + "step": 4826 + }, + { + "epoch": 0.4117897969629756, + "grad_norm": 1.6510548716437219, + "learning_rate": 6.640679633588421e-05, + "loss": 0.3052, + "step": 4827 + }, + { + "epoch": 0.41187510663709265, + "grad_norm": 1.436554473396467, + "learning_rate": 6.639374538306046e-05, + "loss": 0.2885, + "step": 4828 + }, + { + "epoch": 0.41196041631120967, + "grad_norm": 1.604262221161558, + "learning_rate": 6.638069317866135e-05, + "loss": 0.2546, + "step": 4829 + }, + { + "epoch": 0.41204572598532674, + "grad_norm": 1.6879276157696694, + "learning_rate": 6.636763972368337e-05, + "loss": 0.2601, + "step": 4830 + }, + { + "epoch": 0.41213103565944376, + "grad_norm": 1.2485497645007244, + "learning_rate": 6.635458501912307e-05, + "loss": 0.2158, + "step": 4831 + }, + { + "epoch": 0.41221634533356083, + "grad_norm": 1.522920804555188, + "learning_rate": 6.63415290659771e-05, + "loss": 0.2949, + "step": 4832 + }, + { + "epoch": 0.41230165500767785, + "grad_norm": 1.7777541808045803, + "learning_rate": 6.632847186524225e-05, + "loss": 0.3476, + "step": 4833 + }, + { + "epoch": 0.4123869646817949, + "grad_norm": 1.3738818390524794, + "learning_rate": 6.631541341791533e-05, + "loss": 0.2571, + "step": 4834 + }, + { + "epoch": 0.41247227435591194, + "grad_norm": 1.4469817548549313, + "learning_rate": 6.63023537249933e-05, + "loss": 0.3236, + "step": 4835 + }, + { + "epoch": 0.412557584030029, + "grad_norm": 1.354891959383992, + "learning_rate": 6.62892927874732e-05, + "loss": 0.2897, + "step": 4836 + }, + { + "epoch": 0.41264289370414603, + "grad_norm": 1.6598351022099516, + "learning_rate": 6.627623060635214e-05, + "loss": 0.2653, + "step": 4837 + }, + { + "epoch": 0.4127282033782631, + "grad_norm": 1.5061981572136998, + "learning_rate": 6.626316718262737e-05, + "loss": 0.2547, + "step": 4838 + }, + { + "epoch": 0.4128135130523801, + "grad_norm": 1.705779710171263, + "learning_rate": 6.62501025172962e-05, + "loss": 0.3946, + "step": 4839 + }, + { + "epoch": 0.4128988227264972, + "grad_norm": 1.5386128809655613, + "learning_rate": 6.623703661135609e-05, + "loss": 0.3108, + "step": 4840 + }, + { + "epoch": 0.4129841324006142, + "grad_norm": 1.437040836144225, + "learning_rate": 6.622396946580449e-05, + "loss": 0.282, + "step": 4841 + }, + { + "epoch": 0.4130694420747313, + "grad_norm": 1.69877004728074, + "learning_rate": 6.621090108163904e-05, + "loss": 0.369, + "step": 4842 + }, + { + "epoch": 0.4131547517488483, + "grad_norm": 1.9205416884819932, + "learning_rate": 6.619783145985743e-05, + "loss": 0.292, + "step": 4843 + }, + { + "epoch": 0.4132400614229654, + "grad_norm": 1.5724037630989578, + "learning_rate": 6.618476060145747e-05, + "loss": 0.2963, + "step": 4844 + }, + { + "epoch": 0.4133253710970824, + "grad_norm": 1.8058168914966501, + "learning_rate": 6.617168850743704e-05, + "loss": 0.3426, + "step": 4845 + }, + { + "epoch": 0.41341068077119947, + "grad_norm": 1.2368497546579917, + "learning_rate": 6.615861517879414e-05, + "loss": 0.304, + "step": 4846 + }, + { + "epoch": 0.4134959904453165, + "grad_norm": 1.578098183583461, + "learning_rate": 6.614554061652683e-05, + "loss": 0.2585, + "step": 4847 + }, + { + "epoch": 0.41358130011943356, + "grad_norm": 1.34293562585836, + "learning_rate": 6.61324648216333e-05, + "loss": 0.3155, + "step": 4848 + }, + { + "epoch": 0.4136666097935506, + "grad_norm": 1.4355705414981563, + "learning_rate": 6.61193877951118e-05, + "loss": 0.3067, + "step": 4849 + }, + { + "epoch": 0.41375191946766765, + "grad_norm": 1.2571726053916839, + "learning_rate": 6.61063095379607e-05, + "loss": 0.2296, + "step": 4850 + }, + { + "epoch": 0.41383722914178467, + "grad_norm": 1.242346459540386, + "learning_rate": 6.609323005117846e-05, + "loss": 0.2694, + "step": 4851 + }, + { + "epoch": 0.41392253881590174, + "grad_norm": 1.4193528831166418, + "learning_rate": 6.608014933576362e-05, + "loss": 0.2843, + "step": 4852 + }, + { + "epoch": 0.41400784849001876, + "grad_norm": 1.235074147071974, + "learning_rate": 6.606706739271482e-05, + "loss": 0.245, + "step": 4853 + }, + { + "epoch": 0.41409315816413583, + "grad_norm": 1.4270393568999271, + "learning_rate": 6.605398422303082e-05, + "loss": 0.2657, + "step": 4854 + }, + { + "epoch": 0.41417846783825285, + "grad_norm": 1.6997278612762343, + "learning_rate": 6.604089982771043e-05, + "loss": 0.2971, + "step": 4855 + }, + { + "epoch": 0.4142637775123699, + "grad_norm": 1.380924984076075, + "learning_rate": 6.602781420775258e-05, + "loss": 0.249, + "step": 4856 + }, + { + "epoch": 0.41434908718648694, + "grad_norm": 1.3078091674908072, + "learning_rate": 6.601472736415629e-05, + "loss": 0.2564, + "step": 4857 + }, + { + "epoch": 0.414434396860604, + "grad_norm": 1.5479485995805413, + "learning_rate": 6.600163929792067e-05, + "loss": 0.2998, + "step": 4858 + }, + { + "epoch": 0.41451970653472103, + "grad_norm": 1.50136583451505, + "learning_rate": 6.598855001004492e-05, + "loss": 0.2487, + "step": 4859 + }, + { + "epoch": 0.4146050162088381, + "grad_norm": 1.5002971990528542, + "learning_rate": 6.597545950152833e-05, + "loss": 0.302, + "step": 4860 + }, + { + "epoch": 0.4146903258829551, + "grad_norm": 1.1430056041269763, + "learning_rate": 6.59623677733703e-05, + "loss": 0.2417, + "step": 4861 + }, + { + "epoch": 0.4147756355570722, + "grad_norm": 1.8097241949209082, + "learning_rate": 6.594927482657033e-05, + "loss": 0.3001, + "step": 4862 + }, + { + "epoch": 0.4148609452311892, + "grad_norm": 1.677204372547872, + "learning_rate": 6.593618066212797e-05, + "loss": 0.3559, + "step": 4863 + }, + { + "epoch": 0.4149462549053063, + "grad_norm": 1.52064218582746, + "learning_rate": 6.59230852810429e-05, + "loss": 0.2676, + "step": 4864 + }, + { + "epoch": 0.4150315645794233, + "grad_norm": 1.3643279770957948, + "learning_rate": 6.59099886843149e-05, + "loss": 0.2371, + "step": 4865 + }, + { + "epoch": 0.4151168742535403, + "grad_norm": 1.8213034621582234, + "learning_rate": 6.589689087294378e-05, + "loss": 0.3238, + "step": 4866 + }, + { + "epoch": 0.4152021839276574, + "grad_norm": 1.551464631680959, + "learning_rate": 6.588379184792954e-05, + "loss": 0.3016, + "step": 4867 + }, + { + "epoch": 0.4152874936017744, + "grad_norm": 1.7703638539016255, + "learning_rate": 6.587069161027219e-05, + "loss": 0.274, + "step": 4868 + }, + { + "epoch": 0.4153728032758915, + "grad_norm": 1.4515022322808058, + "learning_rate": 6.585759016097188e-05, + "loss": 0.3483, + "step": 4869 + }, + { + "epoch": 0.4154581129500085, + "grad_norm": 1.242068446950036, + "learning_rate": 6.584448750102883e-05, + "loss": 0.2549, + "step": 4870 + }, + { + "epoch": 0.4155434226241256, + "grad_norm": 1.2866226225234039, + "learning_rate": 6.583138363144334e-05, + "loss": 0.2698, + "step": 4871 + }, + { + "epoch": 0.4156287322982426, + "grad_norm": 1.4957710097283277, + "learning_rate": 6.581827855321587e-05, + "loss": 0.2565, + "step": 4872 + }, + { + "epoch": 0.41571404197235967, + "grad_norm": 1.4714851883697895, + "learning_rate": 6.580517226734686e-05, + "loss": 0.2576, + "step": 4873 + }, + { + "epoch": 0.4157993516464767, + "grad_norm": 1.6357711926066456, + "learning_rate": 6.579206477483695e-05, + "loss": 0.2418, + "step": 4874 + }, + { + "epoch": 0.41588466132059376, + "grad_norm": 1.2188766120287349, + "learning_rate": 6.57789560766868e-05, + "loss": 0.285, + "step": 4875 + }, + { + "epoch": 0.4159699709947108, + "grad_norm": 1.964607725130225, + "learning_rate": 6.57658461738972e-05, + "loss": 0.346, + "step": 4876 + }, + { + "epoch": 0.41605528066882785, + "grad_norm": 1.2505544386541019, + "learning_rate": 6.575273506746905e-05, + "loss": 0.3054, + "step": 4877 + }, + { + "epoch": 0.41614059034294487, + "grad_norm": 1.7286863770710394, + "learning_rate": 6.573962275840328e-05, + "loss": 0.2767, + "step": 4878 + }, + { + "epoch": 0.41622590001706194, + "grad_norm": 1.4914710436934489, + "learning_rate": 6.572650924770093e-05, + "loss": 0.295, + "step": 4879 + }, + { + "epoch": 0.41631120969117896, + "grad_norm": 1.4868991471743462, + "learning_rate": 6.57133945363632e-05, + "loss": 0.2293, + "step": 4880 + }, + { + "epoch": 0.41639651936529604, + "grad_norm": 1.5796469723778568, + "learning_rate": 6.570027862539128e-05, + "loss": 0.3075, + "step": 4881 + }, + { + "epoch": 0.41648182903941305, + "grad_norm": 1.4883498929431387, + "learning_rate": 6.568716151578653e-05, + "loss": 0.2743, + "step": 4882 + }, + { + "epoch": 0.4165671387135301, + "grad_norm": 1.5247221396534563, + "learning_rate": 6.567404320855035e-05, + "loss": 0.3543, + "step": 4883 + }, + { + "epoch": 0.41665244838764715, + "grad_norm": 1.6760623580512068, + "learning_rate": 6.566092370468427e-05, + "loss": 0.3127, + "step": 4884 + }, + { + "epoch": 0.4167377580617642, + "grad_norm": 1.3410004274622203, + "learning_rate": 6.564780300518987e-05, + "loss": 0.3151, + "step": 4885 + }, + { + "epoch": 0.41682306773588124, + "grad_norm": 1.5382114904113149, + "learning_rate": 6.563468111106889e-05, + "loss": 0.2797, + "step": 4886 + }, + { + "epoch": 0.4169083774099983, + "grad_norm": 1.4165280592186344, + "learning_rate": 6.562155802332307e-05, + "loss": 0.2531, + "step": 4887 + }, + { + "epoch": 0.4169936870841153, + "grad_norm": 1.3878260530094775, + "learning_rate": 6.56084337429543e-05, + "loss": 0.2573, + "step": 4888 + }, + { + "epoch": 0.4170789967582324, + "grad_norm": 1.6602662532489318, + "learning_rate": 6.559530827096457e-05, + "loss": 0.3202, + "step": 4889 + }, + { + "epoch": 0.4171643064323494, + "grad_norm": 1.7286308638095118, + "learning_rate": 6.558218160835594e-05, + "loss": 0.2659, + "step": 4890 + }, + { + "epoch": 0.4172496161064665, + "grad_norm": 1.6611686940385408, + "learning_rate": 6.556905375613054e-05, + "loss": 0.2434, + "step": 4891 + }, + { + "epoch": 0.4173349257805835, + "grad_norm": 1.450972901536218, + "learning_rate": 6.555592471529059e-05, + "loss": 0.2797, + "step": 4892 + }, + { + "epoch": 0.4174202354547006, + "grad_norm": 1.334855025917071, + "learning_rate": 6.554279448683849e-05, + "loss": 0.2391, + "step": 4893 + }, + { + "epoch": 0.4175055451288176, + "grad_norm": 1.7012804874824794, + "learning_rate": 6.552966307177662e-05, + "loss": 0.2751, + "step": 4894 + }, + { + "epoch": 0.4175908548029347, + "grad_norm": 1.4660280043671141, + "learning_rate": 6.551653047110747e-05, + "loss": 0.2839, + "step": 4895 + }, + { + "epoch": 0.4176761644770517, + "grad_norm": 1.5146605579080659, + "learning_rate": 6.550339668583369e-05, + "loss": 0.3005, + "step": 4896 + }, + { + "epoch": 0.41776147415116877, + "grad_norm": 1.9936533004605224, + "learning_rate": 6.549026171695799e-05, + "loss": 0.294, + "step": 4897 + }, + { + "epoch": 0.4178467838252858, + "grad_norm": 1.6270256621641148, + "learning_rate": 6.547712556548307e-05, + "loss": 0.2347, + "step": 4898 + }, + { + "epoch": 0.41793209349940286, + "grad_norm": 1.4227355197564429, + "learning_rate": 6.546398823241188e-05, + "loss": 0.3289, + "step": 4899 + }, + { + "epoch": 0.4180174031735199, + "grad_norm": 1.3577796958411017, + "learning_rate": 6.545084971874738e-05, + "loss": 0.2786, + "step": 4900 + }, + { + "epoch": 0.41810271284763695, + "grad_norm": 1.5878272612857258, + "learning_rate": 6.543771002549259e-05, + "loss": 0.2828, + "step": 4901 + }, + { + "epoch": 0.41818802252175397, + "grad_norm": 1.7927204699847095, + "learning_rate": 6.54245691536507e-05, + "loss": 0.2872, + "step": 4902 + }, + { + "epoch": 0.41827333219587104, + "grad_norm": 1.66460750054619, + "learning_rate": 6.541142710422489e-05, + "loss": 0.2994, + "step": 4903 + }, + { + "epoch": 0.41835864186998806, + "grad_norm": 1.479481632207603, + "learning_rate": 6.539828387821854e-05, + "loss": 0.229, + "step": 4904 + }, + { + "epoch": 0.4184439515441051, + "grad_norm": 1.4846065139923608, + "learning_rate": 6.538513947663503e-05, + "loss": 0.2517, + "step": 4905 + }, + { + "epoch": 0.41852926121822215, + "grad_norm": 1.402261290609683, + "learning_rate": 6.537199390047786e-05, + "loss": 0.2773, + "step": 4906 + }, + { + "epoch": 0.41861457089233917, + "grad_norm": 1.3888321366901035, + "learning_rate": 6.535884715075067e-05, + "loss": 0.3197, + "step": 4907 + }, + { + "epoch": 0.41869988056645624, + "grad_norm": 1.3495215733808272, + "learning_rate": 6.53456992284571e-05, + "loss": 0.2077, + "step": 4908 + }, + { + "epoch": 0.41878519024057326, + "grad_norm": 1.5438421553931745, + "learning_rate": 6.533255013460095e-05, + "loss": 0.3054, + "step": 4909 + }, + { + "epoch": 0.41887049991469033, + "grad_norm": 1.4245710697782408, + "learning_rate": 6.531939987018608e-05, + "loss": 0.216, + "step": 4910 + }, + { + "epoch": 0.41895580958880735, + "grad_norm": 1.4661093165817223, + "learning_rate": 6.530624843621644e-05, + "loss": 0.2576, + "step": 4911 + }, + { + "epoch": 0.4190411192629244, + "grad_norm": 1.3902580548611994, + "learning_rate": 6.529309583369605e-05, + "loss": 0.276, + "step": 4912 + }, + { + "epoch": 0.41912642893704144, + "grad_norm": 1.4022539795431253, + "learning_rate": 6.527994206362907e-05, + "loss": 0.2757, + "step": 4913 + }, + { + "epoch": 0.4192117386111585, + "grad_norm": 1.5297317763438636, + "learning_rate": 6.526678712701973e-05, + "loss": 0.3004, + "step": 4914 + }, + { + "epoch": 0.41929704828527553, + "grad_norm": 1.5414544896012865, + "learning_rate": 6.52536310248723e-05, + "loss": 0.2944, + "step": 4915 + }, + { + "epoch": 0.4193823579593926, + "grad_norm": 1.6844490162399595, + "learning_rate": 6.524047375819118e-05, + "loss": 0.2778, + "step": 4916 + }, + { + "epoch": 0.4194676676335096, + "grad_norm": 1.561239726122121, + "learning_rate": 6.522731532798091e-05, + "loss": 0.3231, + "step": 4917 + }, + { + "epoch": 0.4195529773076267, + "grad_norm": 1.249434009206129, + "learning_rate": 6.521415573524603e-05, + "loss": 0.258, + "step": 4918 + }, + { + "epoch": 0.4196382869817437, + "grad_norm": 1.4933090389069346, + "learning_rate": 6.520099498099118e-05, + "loss": 0.3221, + "step": 4919 + }, + { + "epoch": 0.4197235966558608, + "grad_norm": 1.562441786635312, + "learning_rate": 6.518783306622116e-05, + "loss": 0.3249, + "step": 4920 + }, + { + "epoch": 0.4198089063299778, + "grad_norm": 1.5202551835832265, + "learning_rate": 6.517466999194079e-05, + "loss": 0.2882, + "step": 4921 + }, + { + "epoch": 0.4198942160040949, + "grad_norm": 1.5399782970682685, + "learning_rate": 6.516150575915502e-05, + "loss": 0.2242, + "step": 4922 + }, + { + "epoch": 0.4199795256782119, + "grad_norm": 1.8064962463424374, + "learning_rate": 6.514834036886884e-05, + "loss": 0.3279, + "step": 4923 + }, + { + "epoch": 0.42006483535232897, + "grad_norm": 1.5646207531527534, + "learning_rate": 6.513517382208737e-05, + "loss": 0.2641, + "step": 4924 + }, + { + "epoch": 0.420150145026446, + "grad_norm": 1.5347325174591449, + "learning_rate": 6.51220061198158e-05, + "loss": 0.2491, + "step": 4925 + }, + { + "epoch": 0.42023545470056306, + "grad_norm": 1.5170877221531271, + "learning_rate": 6.510883726305943e-05, + "loss": 0.2768, + "step": 4926 + }, + { + "epoch": 0.4203207643746801, + "grad_norm": 1.5500417334568006, + "learning_rate": 6.509566725282362e-05, + "loss": 0.3113, + "step": 4927 + }, + { + "epoch": 0.42040607404879715, + "grad_norm": 1.3096807719162282, + "learning_rate": 6.508249609011384e-05, + "loss": 0.2495, + "step": 4928 + }, + { + "epoch": 0.42049138372291417, + "grad_norm": 1.642535595791203, + "learning_rate": 6.506932377593562e-05, + "loss": 0.2764, + "step": 4929 + }, + { + "epoch": 0.42057669339703124, + "grad_norm": 1.659510372408742, + "learning_rate": 6.505615031129462e-05, + "loss": 0.243, + "step": 4930 + }, + { + "epoch": 0.42066200307114826, + "grad_norm": 1.5249779371322834, + "learning_rate": 6.504297569719654e-05, + "loss": 0.2424, + "step": 4931 + }, + { + "epoch": 0.42074731274526533, + "grad_norm": 1.486710808673122, + "learning_rate": 6.502979993464723e-05, + "loss": 0.3179, + "step": 4932 + }, + { + "epoch": 0.42083262241938235, + "grad_norm": 1.903080165190847, + "learning_rate": 6.501662302465254e-05, + "loss": 0.2834, + "step": 4933 + }, + { + "epoch": 0.4209179320934994, + "grad_norm": 1.5632935606906888, + "learning_rate": 6.50034449682185e-05, + "loss": 0.2689, + "step": 4934 + }, + { + "epoch": 0.42100324176761644, + "grad_norm": 1.6050322373144421, + "learning_rate": 6.499026576635115e-05, + "loss": 0.3088, + "step": 4935 + }, + { + "epoch": 0.4210885514417335, + "grad_norm": 1.740569111390786, + "learning_rate": 6.497708542005666e-05, + "loss": 0.2892, + "step": 4936 + }, + { + "epoch": 0.42117386111585053, + "grad_norm": 1.3154297781238369, + "learning_rate": 6.496390393034129e-05, + "loss": 0.2344, + "step": 4937 + }, + { + "epoch": 0.4212591707899676, + "grad_norm": 1.532265229281269, + "learning_rate": 6.495072129821136e-05, + "loss": 0.3421, + "step": 4938 + }, + { + "epoch": 0.4213444804640846, + "grad_norm": 1.8399028404125592, + "learning_rate": 6.493753752467334e-05, + "loss": 0.3379, + "step": 4939 + }, + { + "epoch": 0.4214297901382017, + "grad_norm": 1.7988961172766964, + "learning_rate": 6.492435261073368e-05, + "loss": 0.3361, + "step": 4940 + }, + { + "epoch": 0.4215150998123187, + "grad_norm": 1.8284413642692512, + "learning_rate": 6.491116655739902e-05, + "loss": 0.2747, + "step": 4941 + }, + { + "epoch": 0.42160040948643573, + "grad_norm": 1.5882839640056128, + "learning_rate": 6.489797936567603e-05, + "loss": 0.2795, + "step": 4942 + }, + { + "epoch": 0.4216857191605528, + "grad_norm": 1.8040326404198443, + "learning_rate": 6.488479103657149e-05, + "loss": 0.3335, + "step": 4943 + }, + { + "epoch": 0.4217710288346698, + "grad_norm": 1.3903690059648923, + "learning_rate": 6.487160157109224e-05, + "loss": 0.309, + "step": 4944 + }, + { + "epoch": 0.4218563385087869, + "grad_norm": 1.9079833358020941, + "learning_rate": 6.485841097024524e-05, + "loss": 0.3351, + "step": 4945 + }, + { + "epoch": 0.4219416481829039, + "grad_norm": 1.5300247584315667, + "learning_rate": 6.484521923503752e-05, + "loss": 0.3226, + "step": 4946 + }, + { + "epoch": 0.422026957857021, + "grad_norm": 1.3514725318908878, + "learning_rate": 6.48320263664762e-05, + "loss": 0.2983, + "step": 4947 + }, + { + "epoch": 0.422112267531138, + "grad_norm": 1.5308135052793193, + "learning_rate": 6.481883236556848e-05, + "loss": 0.3232, + "step": 4948 + }, + { + "epoch": 0.4221975772052551, + "grad_norm": 1.4037938178570202, + "learning_rate": 6.480563723332167e-05, + "loss": 0.324, + "step": 4949 + }, + { + "epoch": 0.4222828868793721, + "grad_norm": 1.5944347032758206, + "learning_rate": 6.479244097074313e-05, + "loss": 0.3028, + "step": 4950 + }, + { + "epoch": 0.4223681965534892, + "grad_norm": 1.4885448312514427, + "learning_rate": 6.477924357884031e-05, + "loss": 0.3116, + "step": 4951 + }, + { + "epoch": 0.4224535062276062, + "grad_norm": 1.5937077385778133, + "learning_rate": 6.47660450586208e-05, + "loss": 0.3033, + "step": 4952 + }, + { + "epoch": 0.42253881590172326, + "grad_norm": 1.649411243822525, + "learning_rate": 6.475284541109221e-05, + "loss": 0.2855, + "step": 4953 + }, + { + "epoch": 0.4226241255758403, + "grad_norm": 1.52705915292259, + "learning_rate": 6.473964463726228e-05, + "loss": 0.3377, + "step": 4954 + }, + { + "epoch": 0.42270943524995735, + "grad_norm": 1.3779801238499283, + "learning_rate": 6.47264427381388e-05, + "loss": 0.2899, + "step": 4955 + }, + { + "epoch": 0.4227947449240744, + "grad_norm": 1.5592318115543584, + "learning_rate": 6.471323971472966e-05, + "loss": 0.2282, + "step": 4956 + }, + { + "epoch": 0.42288005459819145, + "grad_norm": 1.5602426626298498, + "learning_rate": 6.470003556804286e-05, + "loss": 0.2855, + "step": 4957 + }, + { + "epoch": 0.42296536427230846, + "grad_norm": 1.1980278737514418, + "learning_rate": 6.468683029908647e-05, + "loss": 0.2804, + "step": 4958 + }, + { + "epoch": 0.42305067394642554, + "grad_norm": 1.5849367940195556, + "learning_rate": 6.467362390886862e-05, + "loss": 0.3368, + "step": 4959 + }, + { + "epoch": 0.42313598362054256, + "grad_norm": 1.4678125839444536, + "learning_rate": 6.466041639839757e-05, + "loss": 0.2736, + "step": 4960 + }, + { + "epoch": 0.42322129329465963, + "grad_norm": 1.4901521392570372, + "learning_rate": 6.464720776868163e-05, + "loss": 0.2968, + "step": 4961 + }, + { + "epoch": 0.42330660296877665, + "grad_norm": 1.3432612306461535, + "learning_rate": 6.46339980207292e-05, + "loss": 0.2845, + "step": 4962 + }, + { + "epoch": 0.4233919126428937, + "grad_norm": 1.3043811918220463, + "learning_rate": 6.46207871555488e-05, + "loss": 0.2664, + "step": 4963 + }, + { + "epoch": 0.42347722231701074, + "grad_norm": 1.4137524006601419, + "learning_rate": 6.4607575174149e-05, + "loss": 0.2509, + "step": 4964 + }, + { + "epoch": 0.4235625319911278, + "grad_norm": 1.6137478871704738, + "learning_rate": 6.459436207753846e-05, + "loss": 0.3325, + "step": 4965 + }, + { + "epoch": 0.42364784166524483, + "grad_norm": 1.351761114267749, + "learning_rate": 6.458114786672593e-05, + "loss": 0.2688, + "step": 4966 + }, + { + "epoch": 0.4237331513393619, + "grad_norm": 1.6542707889306185, + "learning_rate": 6.456793254272023e-05, + "loss": 0.3113, + "step": 4967 + }, + { + "epoch": 0.4238184610134789, + "grad_norm": 1.5603414502963964, + "learning_rate": 6.455471610653031e-05, + "loss": 0.2442, + "step": 4968 + }, + { + "epoch": 0.423903770687596, + "grad_norm": 1.839023804237256, + "learning_rate": 6.454149855916513e-05, + "loss": 0.27, + "step": 4969 + }, + { + "epoch": 0.423989080361713, + "grad_norm": 1.5564121667408446, + "learning_rate": 6.452827990163384e-05, + "loss": 0.2392, + "step": 4970 + }, + { + "epoch": 0.4240743900358301, + "grad_norm": 1.4868400584376922, + "learning_rate": 6.451506013494558e-05, + "loss": 0.2621, + "step": 4971 + }, + { + "epoch": 0.4241596997099471, + "grad_norm": 1.500700469178763, + "learning_rate": 6.45018392601096e-05, + "loss": 0.2775, + "step": 4972 + }, + { + "epoch": 0.4242450093840642, + "grad_norm": 1.6701020439174272, + "learning_rate": 6.448861727813526e-05, + "loss": 0.2647, + "step": 4973 + }, + { + "epoch": 0.4243303190581812, + "grad_norm": 1.4226182944080512, + "learning_rate": 6.447539419003198e-05, + "loss": 0.2315, + "step": 4974 + }, + { + "epoch": 0.42441562873229827, + "grad_norm": 1.4236864561079365, + "learning_rate": 6.446216999680928e-05, + "loss": 0.2487, + "step": 4975 + }, + { + "epoch": 0.4245009384064153, + "grad_norm": 1.4202436373795913, + "learning_rate": 6.444894469947677e-05, + "loss": 0.2817, + "step": 4976 + }, + { + "epoch": 0.42458624808053236, + "grad_norm": 1.4675060442356496, + "learning_rate": 6.443571829904408e-05, + "loss": 0.2724, + "step": 4977 + }, + { + "epoch": 0.4246715577546494, + "grad_norm": 1.592299156279491, + "learning_rate": 6.442249079652103e-05, + "loss": 0.3438, + "step": 4978 + }, + { + "epoch": 0.42475686742876645, + "grad_norm": 1.196091491577488, + "learning_rate": 6.440926219291744e-05, + "loss": 0.2619, + "step": 4979 + }, + { + "epoch": 0.42484217710288347, + "grad_norm": 1.2954769642120532, + "learning_rate": 6.439603248924325e-05, + "loss": 0.2652, + "step": 4980 + }, + { + "epoch": 0.4249274867770005, + "grad_norm": 1.403539631163416, + "learning_rate": 6.438280168650849e-05, + "loss": 0.2667, + "step": 4981 + }, + { + "epoch": 0.42501279645111756, + "grad_norm": 1.8838834763632937, + "learning_rate": 6.436956978572324e-05, + "loss": 0.2729, + "step": 4982 + }, + { + "epoch": 0.4250981061252346, + "grad_norm": 1.5993999488497601, + "learning_rate": 6.435633678789769e-05, + "loss": 0.2804, + "step": 4983 + }, + { + "epoch": 0.42518341579935165, + "grad_norm": 1.363487162519452, + "learning_rate": 6.434310269404214e-05, + "loss": 0.2936, + "step": 4984 + }, + { + "epoch": 0.42526872547346867, + "grad_norm": 1.62078486797503, + "learning_rate": 6.432986750516692e-05, + "loss": 0.2939, + "step": 4985 + }, + { + "epoch": 0.42535403514758574, + "grad_norm": 1.476602705150794, + "learning_rate": 6.431663122228245e-05, + "loss": 0.2471, + "step": 4986 + }, + { + "epoch": 0.42543934482170276, + "grad_norm": 1.8929551952599397, + "learning_rate": 6.430339384639927e-05, + "loss": 0.2936, + "step": 4987 + }, + { + "epoch": 0.42552465449581983, + "grad_norm": 1.8429736426378662, + "learning_rate": 6.429015537852797e-05, + "loss": 0.2385, + "step": 4988 + }, + { + "epoch": 0.42560996416993685, + "grad_norm": 1.550562230553712, + "learning_rate": 6.427691581967925e-05, + "loss": 0.3116, + "step": 4989 + }, + { + "epoch": 0.4256952738440539, + "grad_norm": 1.7543326966037327, + "learning_rate": 6.426367517086387e-05, + "loss": 0.3164, + "step": 4990 + }, + { + "epoch": 0.42578058351817094, + "grad_norm": 1.4051427084508785, + "learning_rate": 6.42504334330927e-05, + "loss": 0.2847, + "step": 4991 + }, + { + "epoch": 0.425865893192288, + "grad_norm": 1.750581848963728, + "learning_rate": 6.423719060737665e-05, + "loss": 0.2097, + "step": 4992 + }, + { + "epoch": 0.42595120286640503, + "grad_norm": 1.665060397819686, + "learning_rate": 6.422394669472676e-05, + "loss": 0.3133, + "step": 4993 + }, + { + "epoch": 0.4260365125405221, + "grad_norm": 1.388764154343188, + "learning_rate": 6.421070169615411e-05, + "loss": 0.2923, + "step": 4994 + }, + { + "epoch": 0.4261218222146391, + "grad_norm": 1.7784467355786293, + "learning_rate": 6.419745561266993e-05, + "loss": 0.3088, + "step": 4995 + }, + { + "epoch": 0.4262071318887562, + "grad_norm": 1.701641871875404, + "learning_rate": 6.418420844528545e-05, + "loss": 0.3192, + "step": 4996 + }, + { + "epoch": 0.4262924415628732, + "grad_norm": 1.455558625833575, + "learning_rate": 6.417096019501203e-05, + "loss": 0.2874, + "step": 4997 + }, + { + "epoch": 0.4263777512369903, + "grad_norm": 1.442698739177142, + "learning_rate": 6.415771086286109e-05, + "loss": 0.2427, + "step": 4998 + }, + { + "epoch": 0.4264630609111073, + "grad_norm": 1.6258838890632172, + "learning_rate": 6.414446044984417e-05, + "loss": 0.2999, + "step": 4999 + }, + { + "epoch": 0.4265483705852244, + "grad_norm": 1.510672747070879, + "learning_rate": 6.413120895697287e-05, + "loss": 0.2629, + "step": 5000 + }, + { + "epoch": 0.4266336802593414, + "grad_norm": 1.6736754233146347, + "learning_rate": 6.411795638525883e-05, + "loss": 0.3318, + "step": 5001 + }, + { + "epoch": 0.42671898993345847, + "grad_norm": 1.3971222814373248, + "learning_rate": 6.410470273571387e-05, + "loss": 0.2411, + "step": 5002 + }, + { + "epoch": 0.4268042996075755, + "grad_norm": 1.5315214422545076, + "learning_rate": 6.409144800934979e-05, + "loss": 0.3034, + "step": 5003 + }, + { + "epoch": 0.42688960928169256, + "grad_norm": 1.5365076692925421, + "learning_rate": 6.407819220717855e-05, + "loss": 0.2953, + "step": 5004 + }, + { + "epoch": 0.4269749189558096, + "grad_norm": 1.5956503813861982, + "learning_rate": 6.406493533021213e-05, + "loss": 0.2752, + "step": 5005 + }, + { + "epoch": 0.42706022862992665, + "grad_norm": 1.5986825495994885, + "learning_rate": 6.405167737946265e-05, + "loss": 0.2617, + "step": 5006 + }, + { + "epoch": 0.42714553830404367, + "grad_norm": 1.4907225762825478, + "learning_rate": 6.403841835594228e-05, + "loss": 0.282, + "step": 5007 + }, + { + "epoch": 0.42723084797816074, + "grad_norm": 1.3242027709356947, + "learning_rate": 6.402515826066327e-05, + "loss": 0.2783, + "step": 5008 + }, + { + "epoch": 0.42731615765227776, + "grad_norm": 1.4581983049641574, + "learning_rate": 6.401189709463794e-05, + "loss": 0.3307, + "step": 5009 + }, + { + "epoch": 0.42740146732639483, + "grad_norm": 1.4673970260754279, + "learning_rate": 6.399863485887873e-05, + "loss": 0.3069, + "step": 5010 + }, + { + "epoch": 0.42748677700051185, + "grad_norm": 1.2355954860467726, + "learning_rate": 6.398537155439812e-05, + "loss": 0.3739, + "step": 5011 + }, + { + "epoch": 0.4275720866746289, + "grad_norm": 1.3512115904989064, + "learning_rate": 6.397210718220874e-05, + "loss": 0.2415, + "step": 5012 + }, + { + "epoch": 0.42765739634874594, + "grad_norm": 1.4644537241179756, + "learning_rate": 6.395884174332322e-05, + "loss": 0.1983, + "step": 5013 + }, + { + "epoch": 0.427742706022863, + "grad_norm": 1.5087787911766077, + "learning_rate": 6.394557523875428e-05, + "loss": 0.2475, + "step": 5014 + }, + { + "epoch": 0.42782801569698004, + "grad_norm": 1.3224143729225564, + "learning_rate": 6.393230766951481e-05, + "loss": 0.2179, + "step": 5015 + }, + { + "epoch": 0.4279133253710971, + "grad_norm": 1.2498365772230822, + "learning_rate": 6.391903903661768e-05, + "loss": 0.2203, + "step": 5016 + }, + { + "epoch": 0.4279986350452141, + "grad_norm": 1.4537432596143887, + "learning_rate": 6.390576934107589e-05, + "loss": 0.2929, + "step": 5017 + }, + { + "epoch": 0.4280839447193312, + "grad_norm": 1.4774945259315564, + "learning_rate": 6.389249858390251e-05, + "loss": 0.2558, + "step": 5018 + }, + { + "epoch": 0.4281692543934482, + "grad_norm": 1.406540734648776, + "learning_rate": 6.387922676611065e-05, + "loss": 0.2555, + "step": 5019 + }, + { + "epoch": 0.42825456406756524, + "grad_norm": 1.9842722107705277, + "learning_rate": 6.386595388871361e-05, + "loss": 0.3068, + "step": 5020 + }, + { + "epoch": 0.4283398737416823, + "grad_norm": 1.4244747498866293, + "learning_rate": 6.385267995272468e-05, + "loss": 0.2441, + "step": 5021 + }, + { + "epoch": 0.4284251834157993, + "grad_norm": 1.707643796191413, + "learning_rate": 6.383940495915723e-05, + "loss": 0.2738, + "step": 5022 + }, + { + "epoch": 0.4285104930899164, + "grad_norm": 1.5359239672727285, + "learning_rate": 6.382612890902478e-05, + "loss": 0.2605, + "step": 5023 + }, + { + "epoch": 0.4285958027640334, + "grad_norm": 1.567758409412175, + "learning_rate": 6.381285180334084e-05, + "loss": 0.2985, + "step": 5024 + }, + { + "epoch": 0.4286811124381505, + "grad_norm": 1.9627545209926593, + "learning_rate": 6.379957364311905e-05, + "loss": 0.2608, + "step": 5025 + }, + { + "epoch": 0.4287664221122675, + "grad_norm": 2.0602531910822868, + "learning_rate": 6.378629442937318e-05, + "loss": 0.279, + "step": 5026 + }, + { + "epoch": 0.4288517317863846, + "grad_norm": 1.2433014679132959, + "learning_rate": 6.377301416311696e-05, + "loss": 0.2331, + "step": 5027 + }, + { + "epoch": 0.4289370414605016, + "grad_norm": 1.5937912131571954, + "learning_rate": 6.375973284536432e-05, + "loss": 0.2771, + "step": 5028 + }, + { + "epoch": 0.4290223511346187, + "grad_norm": 1.649112219828119, + "learning_rate": 6.374645047712919e-05, + "loss": 0.2727, + "step": 5029 + }, + { + "epoch": 0.4291076608087357, + "grad_norm": 1.4039206816664418, + "learning_rate": 6.37331670594256e-05, + "loss": 0.2353, + "step": 5030 + }, + { + "epoch": 0.42919297048285276, + "grad_norm": 1.46114572275325, + "learning_rate": 6.371988259326771e-05, + "loss": 0.3018, + "step": 5031 + }, + { + "epoch": 0.4292782801569698, + "grad_norm": 1.5280234929171135, + "learning_rate": 6.370659707966967e-05, + "loss": 0.3462, + "step": 5032 + }, + { + "epoch": 0.42936358983108686, + "grad_norm": 1.4756458000193908, + "learning_rate": 6.369331051964579e-05, + "loss": 0.2652, + "step": 5033 + }, + { + "epoch": 0.4294488995052039, + "grad_norm": 1.4364643512661552, + "learning_rate": 6.368002291421042e-05, + "loss": 0.2726, + "step": 5034 + }, + { + "epoch": 0.42953420917932095, + "grad_norm": 1.4852242700530909, + "learning_rate": 6.366673426437797e-05, + "loss": 0.2802, + "step": 5035 + }, + { + "epoch": 0.42961951885343796, + "grad_norm": 1.5808239509868491, + "learning_rate": 6.365344457116301e-05, + "loss": 0.2715, + "step": 5036 + }, + { + "epoch": 0.42970482852755504, + "grad_norm": 1.4000692810218613, + "learning_rate": 6.36401538355801e-05, + "loss": 0.2791, + "step": 5037 + }, + { + "epoch": 0.42979013820167206, + "grad_norm": 1.4924541454843063, + "learning_rate": 6.362686205864394e-05, + "loss": 0.343, + "step": 5038 + }, + { + "epoch": 0.42987544787578913, + "grad_norm": 1.6734033889214617, + "learning_rate": 6.361356924136928e-05, + "loss": 0.2703, + "step": 5039 + }, + { + "epoch": 0.42996075754990615, + "grad_norm": 1.3009234284834132, + "learning_rate": 6.360027538477094e-05, + "loss": 0.247, + "step": 5040 + }, + { + "epoch": 0.4300460672240232, + "grad_norm": 1.4939258296382887, + "learning_rate": 6.358698048986384e-05, + "loss": 0.287, + "step": 5041 + }, + { + "epoch": 0.43013137689814024, + "grad_norm": 1.497857630189252, + "learning_rate": 6.357368455766299e-05, + "loss": 0.3244, + "step": 5042 + }, + { + "epoch": 0.4302166865722573, + "grad_norm": 1.4444708770795052, + "learning_rate": 6.356038758918344e-05, + "loss": 0.2579, + "step": 5043 + }, + { + "epoch": 0.43030199624637433, + "grad_norm": 1.519471939616116, + "learning_rate": 6.354708958544038e-05, + "loss": 0.2819, + "step": 5044 + }, + { + "epoch": 0.4303873059204914, + "grad_norm": 1.634957980161359, + "learning_rate": 6.353379054744901e-05, + "loss": 0.2955, + "step": 5045 + }, + { + "epoch": 0.4304726155946084, + "grad_norm": 1.5117636977026703, + "learning_rate": 6.352049047622463e-05, + "loss": 0.3025, + "step": 5046 + }, + { + "epoch": 0.4305579252687255, + "grad_norm": 1.636549619981285, + "learning_rate": 6.350718937278269e-05, + "loss": 0.3128, + "step": 5047 + }, + { + "epoch": 0.4306432349428425, + "grad_norm": 1.2979218268049522, + "learning_rate": 6.349388723813859e-05, + "loss": 0.2777, + "step": 5048 + }, + { + "epoch": 0.4307285446169596, + "grad_norm": 1.2476425829421292, + "learning_rate": 6.348058407330792e-05, + "loss": 0.246, + "step": 5049 + }, + { + "epoch": 0.4308138542910766, + "grad_norm": 1.4239605709896446, + "learning_rate": 6.346727987930628e-05, + "loss": 0.3025, + "step": 5050 + }, + { + "epoch": 0.4308991639651937, + "grad_norm": 1.70583820334247, + "learning_rate": 6.345397465714939e-05, + "loss": 0.3105, + "step": 5051 + }, + { + "epoch": 0.4309844736393107, + "grad_norm": 1.3631100260153917, + "learning_rate": 6.344066840785302e-05, + "loss": 0.247, + "step": 5052 + }, + { + "epoch": 0.43106978331342777, + "grad_norm": 1.5509798705850315, + "learning_rate": 6.342736113243305e-05, + "loss": 0.3193, + "step": 5053 + }, + { + "epoch": 0.4311550929875448, + "grad_norm": 1.5376578955481142, + "learning_rate": 6.341405283190541e-05, + "loss": 0.3019, + "step": 5054 + }, + { + "epoch": 0.43124040266166186, + "grad_norm": 1.3880101168316028, + "learning_rate": 6.340074350728612e-05, + "loss": 0.2905, + "step": 5055 + }, + { + "epoch": 0.4313257123357789, + "grad_norm": 1.477145286878499, + "learning_rate": 6.338743315959127e-05, + "loss": 0.2689, + "step": 5056 + }, + { + "epoch": 0.4314110220098959, + "grad_norm": 1.3504475310950526, + "learning_rate": 6.337412178983704e-05, + "loss": 0.24, + "step": 5057 + }, + { + "epoch": 0.43149633168401297, + "grad_norm": 1.241861936503385, + "learning_rate": 6.336080939903968e-05, + "loss": 0.2697, + "step": 5058 + }, + { + "epoch": 0.43158164135813, + "grad_norm": 1.548566721615955, + "learning_rate": 6.334749598821555e-05, + "loss": 0.261, + "step": 5059 + }, + { + "epoch": 0.43166695103224706, + "grad_norm": 1.3921929852755426, + "learning_rate": 6.3334181558381e-05, + "loss": 0.2484, + "step": 5060 + }, + { + "epoch": 0.4317522607063641, + "grad_norm": 1.2917589134173089, + "learning_rate": 6.332086611055255e-05, + "loss": 0.2546, + "step": 5061 + }, + { + "epoch": 0.43183757038048115, + "grad_norm": 1.6515259939903106, + "learning_rate": 6.330754964574676e-05, + "loss": 0.3036, + "step": 5062 + }, + { + "epoch": 0.43192288005459817, + "grad_norm": 1.6574933405493533, + "learning_rate": 6.329423216498027e-05, + "loss": 0.2998, + "step": 5063 + }, + { + "epoch": 0.43200818972871524, + "grad_norm": 1.4420528460592685, + "learning_rate": 6.328091366926979e-05, + "loss": 0.2514, + "step": 5064 + }, + { + "epoch": 0.43209349940283226, + "grad_norm": 1.7987987006449317, + "learning_rate": 6.326759415963216e-05, + "loss": 0.277, + "step": 5065 + }, + { + "epoch": 0.43217880907694933, + "grad_norm": 1.9954484168879056, + "learning_rate": 6.325427363708418e-05, + "loss": 0.3327, + "step": 5066 + }, + { + "epoch": 0.43226411875106635, + "grad_norm": 1.4983881396297263, + "learning_rate": 6.324095210264286e-05, + "loss": 0.2754, + "step": 5067 + }, + { + "epoch": 0.4323494284251834, + "grad_norm": 1.3265107161588363, + "learning_rate": 6.322762955732521e-05, + "loss": 0.2447, + "step": 5068 + }, + { + "epoch": 0.43243473809930044, + "grad_norm": 1.3503106995958802, + "learning_rate": 6.321430600214832e-05, + "loss": 0.3108, + "step": 5069 + }, + { + "epoch": 0.4325200477734175, + "grad_norm": 1.7680134779817702, + "learning_rate": 6.320098143812942e-05, + "loss": 0.3397, + "step": 5070 + }, + { + "epoch": 0.43260535744753453, + "grad_norm": 1.7289890826240784, + "learning_rate": 6.318765586628572e-05, + "loss": 0.2652, + "step": 5071 + }, + { + "epoch": 0.4326906671216516, + "grad_norm": 1.2743224830900588, + "learning_rate": 6.317432928763456e-05, + "loss": 0.2929, + "step": 5072 + }, + { + "epoch": 0.4327759767957686, + "grad_norm": 1.4830763104818316, + "learning_rate": 6.316100170319337e-05, + "loss": 0.2783, + "step": 5073 + }, + { + "epoch": 0.4328612864698857, + "grad_norm": 1.4062855610059903, + "learning_rate": 6.314767311397966e-05, + "loss": 0.2222, + "step": 5074 + }, + { + "epoch": 0.4329465961440027, + "grad_norm": 1.970494662885739, + "learning_rate": 6.313434352101095e-05, + "loss": 0.2926, + "step": 5075 + }, + { + "epoch": 0.4330319058181198, + "grad_norm": 1.276004474123821, + "learning_rate": 6.312101292530492e-05, + "loss": 0.2947, + "step": 5076 + }, + { + "epoch": 0.4331172154922368, + "grad_norm": 1.644434257782209, + "learning_rate": 6.310768132787928e-05, + "loss": 0.2534, + "step": 5077 + }, + { + "epoch": 0.4332025251663539, + "grad_norm": 1.8482645658468353, + "learning_rate": 6.309434872975181e-05, + "loss": 0.2576, + "step": 5078 + }, + { + "epoch": 0.4332878348404709, + "grad_norm": 1.6113810952467613, + "learning_rate": 6.308101513194041e-05, + "loss": 0.2466, + "step": 5079 + }, + { + "epoch": 0.43337314451458797, + "grad_norm": 1.6173753122561878, + "learning_rate": 6.306768053546302e-05, + "loss": 0.2702, + "step": 5080 + }, + { + "epoch": 0.433458454188705, + "grad_norm": 1.2983841904623148, + "learning_rate": 6.305434494133766e-05, + "loss": 0.2853, + "step": 5081 + }, + { + "epoch": 0.43354376386282206, + "grad_norm": 1.4637733719521049, + "learning_rate": 6.304100835058244e-05, + "loss": 0.2471, + "step": 5082 + }, + { + "epoch": 0.4336290735369391, + "grad_norm": 1.6074859495127352, + "learning_rate": 6.302767076421552e-05, + "loss": 0.3184, + "step": 5083 + }, + { + "epoch": 0.43371438321105615, + "grad_norm": 1.5560416453655141, + "learning_rate": 6.301433218325518e-05, + "loss": 0.2535, + "step": 5084 + }, + { + "epoch": 0.43379969288517317, + "grad_norm": 1.6192441384941332, + "learning_rate": 6.300099260871972e-05, + "loss": 0.3169, + "step": 5085 + }, + { + "epoch": 0.43388500255929024, + "grad_norm": 1.521379068082254, + "learning_rate": 6.298765204162757e-05, + "loss": 0.3261, + "step": 5086 + }, + { + "epoch": 0.43397031223340726, + "grad_norm": 1.9326284137417122, + "learning_rate": 6.29743104829972e-05, + "loss": 0.266, + "step": 5087 + }, + { + "epoch": 0.43405562190752434, + "grad_norm": 1.454326993890119, + "learning_rate": 6.296096793384716e-05, + "loss": 0.2344, + "step": 5088 + }, + { + "epoch": 0.43414093158164135, + "grad_norm": 1.6085421434881086, + "learning_rate": 6.29476243951961e-05, + "loss": 0.3151, + "step": 5089 + }, + { + "epoch": 0.4342262412557584, + "grad_norm": 1.4742993970037481, + "learning_rate": 6.293427986806274e-05, + "loss": 0.2391, + "step": 5090 + }, + { + "epoch": 0.43431155092987545, + "grad_norm": 1.4140353964215566, + "learning_rate": 6.292093435346583e-05, + "loss": 0.2673, + "step": 5091 + }, + { + "epoch": 0.4343968606039925, + "grad_norm": 1.461170687909295, + "learning_rate": 6.290758785242425e-05, + "loss": 0.2949, + "step": 5092 + }, + { + "epoch": 0.43448217027810954, + "grad_norm": 1.435889627449407, + "learning_rate": 6.289424036595693e-05, + "loss": 0.2601, + "step": 5093 + }, + { + "epoch": 0.4345674799522266, + "grad_norm": 1.3765646961554026, + "learning_rate": 6.288089189508286e-05, + "loss": 0.2792, + "step": 5094 + }, + { + "epoch": 0.4346527896263436, + "grad_norm": 1.4944249499694886, + "learning_rate": 6.286754244082115e-05, + "loss": 0.2523, + "step": 5095 + }, + { + "epoch": 0.43473809930046065, + "grad_norm": 1.639685225400661, + "learning_rate": 6.285419200419095e-05, + "loss": 0.2736, + "step": 5096 + }, + { + "epoch": 0.4348234089745777, + "grad_norm": 1.2794456337204412, + "learning_rate": 6.28408405862115e-05, + "loss": 0.215, + "step": 5097 + }, + { + "epoch": 0.43490871864869474, + "grad_norm": 1.3889299842264815, + "learning_rate": 6.282748818790212e-05, + "loss": 0.2298, + "step": 5098 + }, + { + "epoch": 0.4349940283228118, + "grad_norm": 1.500298152537951, + "learning_rate": 6.281413481028217e-05, + "loss": 0.2499, + "step": 5099 + }, + { + "epoch": 0.4350793379969288, + "grad_norm": 1.6133842527745306, + "learning_rate": 6.280078045437113e-05, + "loss": 0.3081, + "step": 5100 + }, + { + "epoch": 0.4351646476710459, + "grad_norm": 1.6147952607233418, + "learning_rate": 6.278742512118852e-05, + "loss": 0.3168, + "step": 5101 + }, + { + "epoch": 0.4352499573451629, + "grad_norm": 1.7215792864141926, + "learning_rate": 6.277406881175395e-05, + "loss": 0.2917, + "step": 5102 + }, + { + "epoch": 0.43533526701928, + "grad_norm": 1.6930448070529511, + "learning_rate": 6.276071152708712e-05, + "loss": 0.3192, + "step": 5103 + }, + { + "epoch": 0.435420576693397, + "grad_norm": 1.5554088663388788, + "learning_rate": 6.274735326820775e-05, + "loss": 0.2883, + "step": 5104 + }, + { + "epoch": 0.4355058863675141, + "grad_norm": 1.820750826341912, + "learning_rate": 6.273399403613572e-05, + "loss": 0.3228, + "step": 5105 + }, + { + "epoch": 0.4355911960416311, + "grad_norm": 1.3727531715874688, + "learning_rate": 6.272063383189091e-05, + "loss": 0.2265, + "step": 5106 + }, + { + "epoch": 0.4356765057157482, + "grad_norm": 1.7821076319478921, + "learning_rate": 6.27072726564933e-05, + "loss": 0.3114, + "step": 5107 + }, + { + "epoch": 0.4357618153898652, + "grad_norm": 1.5560485403045548, + "learning_rate": 6.269391051096295e-05, + "loss": 0.2492, + "step": 5108 + }, + { + "epoch": 0.43584712506398227, + "grad_norm": 2.3669489875714738, + "learning_rate": 6.268054739631999e-05, + "loss": 0.3562, + "step": 5109 + }, + { + "epoch": 0.4359324347380993, + "grad_norm": 1.6685066874424086, + "learning_rate": 6.266718331358461e-05, + "loss": 0.2433, + "step": 5110 + }, + { + "epoch": 0.43601774441221636, + "grad_norm": 1.5237833266333765, + "learning_rate": 6.265381826377711e-05, + "loss": 0.2095, + "step": 5111 + }, + { + "epoch": 0.4361030540863334, + "grad_norm": 1.3610898305998322, + "learning_rate": 6.264045224791784e-05, + "loss": 0.2883, + "step": 5112 + }, + { + "epoch": 0.43618836376045045, + "grad_norm": 1.4754608727402108, + "learning_rate": 6.26270852670272e-05, + "loss": 0.2756, + "step": 5113 + }, + { + "epoch": 0.43627367343456747, + "grad_norm": 1.6672345942215778, + "learning_rate": 6.26137173221257e-05, + "loss": 0.2749, + "step": 5114 + }, + { + "epoch": 0.43635898310868454, + "grad_norm": 1.8929589737699923, + "learning_rate": 6.26003484142339e-05, + "loss": 0.2851, + "step": 5115 + }, + { + "epoch": 0.43644429278280156, + "grad_norm": 1.422836230319117, + "learning_rate": 6.258697854437247e-05, + "loss": 0.2843, + "step": 5116 + }, + { + "epoch": 0.43652960245691863, + "grad_norm": 1.5511477251173464, + "learning_rate": 6.25736077135621e-05, + "loss": 0.2994, + "step": 5117 + }, + { + "epoch": 0.43661491213103565, + "grad_norm": 1.5758007799185605, + "learning_rate": 6.25602359228236e-05, + "loss": 0.2504, + "step": 5118 + }, + { + "epoch": 0.4367002218051527, + "grad_norm": 1.3434544615330335, + "learning_rate": 6.254686317317785e-05, + "loss": 0.2683, + "step": 5119 + }, + { + "epoch": 0.43678553147926974, + "grad_norm": 1.3835542058220913, + "learning_rate": 6.253348946564575e-05, + "loss": 0.269, + "step": 5120 + }, + { + "epoch": 0.4368708411533868, + "grad_norm": 1.400441444145243, + "learning_rate": 6.252011480124835e-05, + "loss": 0.3492, + "step": 5121 + }, + { + "epoch": 0.43695615082750383, + "grad_norm": 1.4901112596844583, + "learning_rate": 6.250673918100671e-05, + "loss": 0.2602, + "step": 5122 + }, + { + "epoch": 0.4370414605016209, + "grad_norm": 1.6346843156909356, + "learning_rate": 6.2493362605942e-05, + "loss": 0.3288, + "step": 5123 + }, + { + "epoch": 0.4371267701757379, + "grad_norm": 1.4179501703059176, + "learning_rate": 6.247998507707545e-05, + "loss": 0.2642, + "step": 5124 + }, + { + "epoch": 0.437212079849855, + "grad_norm": 1.7672789262483783, + "learning_rate": 6.246660659542833e-05, + "loss": 0.2853, + "step": 5125 + }, + { + "epoch": 0.437297389523972, + "grad_norm": 1.5395747073099206, + "learning_rate": 6.245322716202207e-05, + "loss": 0.2801, + "step": 5126 + }, + { + "epoch": 0.4373826991980891, + "grad_norm": 1.4990915090407348, + "learning_rate": 6.243984677787808e-05, + "loss": 0.2852, + "step": 5127 + }, + { + "epoch": 0.4374680088722061, + "grad_norm": 1.6057709272631342, + "learning_rate": 6.242646544401788e-05, + "loss": 0.3041, + "step": 5128 + }, + { + "epoch": 0.4375533185463232, + "grad_norm": 1.706364900961085, + "learning_rate": 6.241308316146311e-05, + "loss": 0.2983, + "step": 5129 + }, + { + "epoch": 0.4376386282204402, + "grad_norm": 1.6541272361277293, + "learning_rate": 6.239969993123539e-05, + "loss": 0.3353, + "step": 5130 + }, + { + "epoch": 0.43772393789455727, + "grad_norm": 1.949821669420438, + "learning_rate": 6.238631575435647e-05, + "loss": 0.319, + "step": 5131 + }, + { + "epoch": 0.4378092475686743, + "grad_norm": 1.9016036644322158, + "learning_rate": 6.237293063184816e-05, + "loss": 0.3027, + "step": 5132 + }, + { + "epoch": 0.43789455724279136, + "grad_norm": 1.7997454728203253, + "learning_rate": 6.235954456473235e-05, + "loss": 0.2878, + "step": 5133 + }, + { + "epoch": 0.4379798669169084, + "grad_norm": 1.4868105532776832, + "learning_rate": 6.234615755403101e-05, + "loss": 0.3155, + "step": 5134 + }, + { + "epoch": 0.4380651765910254, + "grad_norm": 1.4993764852983147, + "learning_rate": 6.233276960076612e-05, + "loss": 0.3125, + "step": 5135 + }, + { + "epoch": 0.43815048626514247, + "grad_norm": 1.3794388647466354, + "learning_rate": 6.231938070595981e-05, + "loss": 0.2524, + "step": 5136 + }, + { + "epoch": 0.4382357959392595, + "grad_norm": 2.153658650431996, + "learning_rate": 6.230599087063426e-05, + "loss": 0.3569, + "step": 5137 + }, + { + "epoch": 0.43832110561337656, + "grad_norm": 1.2546178396989243, + "learning_rate": 6.22926000958117e-05, + "loss": 0.2758, + "step": 5138 + }, + { + "epoch": 0.4384064152874936, + "grad_norm": 1.335148762547462, + "learning_rate": 6.227920838251443e-05, + "loss": 0.2539, + "step": 5139 + }, + { + "epoch": 0.43849172496161065, + "grad_norm": 1.5401026894651864, + "learning_rate": 6.226581573176487e-05, + "loss": 0.3029, + "step": 5140 + }, + { + "epoch": 0.43857703463572767, + "grad_norm": 1.3128806879086894, + "learning_rate": 6.225242214458544e-05, + "loss": 0.2847, + "step": 5141 + }, + { + "epoch": 0.43866234430984474, + "grad_norm": 1.4588859964112695, + "learning_rate": 6.223902762199871e-05, + "loss": 0.2632, + "step": 5142 + }, + { + "epoch": 0.43874765398396176, + "grad_norm": 1.5169085543787444, + "learning_rate": 6.222563216502724e-05, + "loss": 0.3082, + "step": 5143 + }, + { + "epoch": 0.43883296365807883, + "grad_norm": 1.3512414098814172, + "learning_rate": 6.221223577469373e-05, + "loss": 0.2355, + "step": 5144 + }, + { + "epoch": 0.43891827333219585, + "grad_norm": 1.4488942073377913, + "learning_rate": 6.219883845202092e-05, + "loss": 0.2334, + "step": 5145 + }, + { + "epoch": 0.4390035830063129, + "grad_norm": 1.4788356822609379, + "learning_rate": 6.21854401980316e-05, + "loss": 0.298, + "step": 5146 + }, + { + "epoch": 0.43908889268042994, + "grad_norm": 1.400425440995251, + "learning_rate": 6.21720410137487e-05, + "loss": 0.2718, + "step": 5147 + }, + { + "epoch": 0.439174202354547, + "grad_norm": 1.639958418807465, + "learning_rate": 6.215864090019515e-05, + "loss": 0.2793, + "step": 5148 + }, + { + "epoch": 0.43925951202866403, + "grad_norm": 1.7826025662867973, + "learning_rate": 6.214523985839395e-05, + "loss": 0.3227, + "step": 5149 + }, + { + "epoch": 0.4393448217027811, + "grad_norm": 1.5873346047335588, + "learning_rate": 6.213183788936825e-05, + "loss": 0.3121, + "step": 5150 + }, + { + "epoch": 0.4394301313768981, + "grad_norm": 1.4809368269259358, + "learning_rate": 6.211843499414119e-05, + "loss": 0.2202, + "step": 5151 + }, + { + "epoch": 0.4395154410510152, + "grad_norm": 1.6675245779076318, + "learning_rate": 6.2105031173736e-05, + "loss": 0.288, + "step": 5152 + }, + { + "epoch": 0.4396007507251322, + "grad_norm": 1.3764440582968591, + "learning_rate": 6.209162642917603e-05, + "loss": 0.2348, + "step": 5153 + }, + { + "epoch": 0.4396860603992493, + "grad_norm": 1.7225934144754642, + "learning_rate": 6.207822076148462e-05, + "loss": 0.2331, + "step": 5154 + }, + { + "epoch": 0.4397713700733663, + "grad_norm": 1.5540717357588882, + "learning_rate": 6.206481417168526e-05, + "loss": 0.3335, + "step": 5155 + }, + { + "epoch": 0.4398566797474834, + "grad_norm": 1.5373201528669584, + "learning_rate": 6.205140666080143e-05, + "loss": 0.2839, + "step": 5156 + }, + { + "epoch": 0.4399419894216004, + "grad_norm": 1.5838442362965663, + "learning_rate": 6.203799822985676e-05, + "loss": 0.2013, + "step": 5157 + }, + { + "epoch": 0.4400272990957175, + "grad_norm": 1.6029133379969167, + "learning_rate": 6.202458887987488e-05, + "loss": 0.2862, + "step": 5158 + }, + { + "epoch": 0.4401126087698345, + "grad_norm": 1.5283134484670142, + "learning_rate": 6.201117861187955e-05, + "loss": 0.2927, + "step": 5159 + }, + { + "epoch": 0.44019791844395156, + "grad_norm": 1.5045056226225304, + "learning_rate": 6.199776742689454e-05, + "loss": 0.3805, + "step": 5160 + }, + { + "epoch": 0.4402832281180686, + "grad_norm": 1.7948939141720086, + "learning_rate": 6.198435532594375e-05, + "loss": 0.2245, + "step": 5161 + }, + { + "epoch": 0.44036853779218565, + "grad_norm": 1.3969265751203481, + "learning_rate": 6.197094231005112e-05, + "loss": 0.2361, + "step": 5162 + }, + { + "epoch": 0.4404538474663027, + "grad_norm": 1.5890756762173044, + "learning_rate": 6.195752838024066e-05, + "loss": 0.2551, + "step": 5163 + }, + { + "epoch": 0.44053915714041975, + "grad_norm": 1.6392128816317217, + "learning_rate": 6.194411353753646e-05, + "loss": 0.2553, + "step": 5164 + }, + { + "epoch": 0.44062446681453676, + "grad_norm": 1.4795481856925143, + "learning_rate": 6.193069778296265e-05, + "loss": 0.2202, + "step": 5165 + }, + { + "epoch": 0.44070977648865384, + "grad_norm": 2.1887993359628863, + "learning_rate": 6.191728111754347e-05, + "loss": 0.2988, + "step": 5166 + }, + { + "epoch": 0.44079508616277085, + "grad_norm": 1.9438953063229973, + "learning_rate": 6.19038635423032e-05, + "loss": 0.3341, + "step": 5167 + }, + { + "epoch": 0.44088039583688793, + "grad_norm": 1.5584100218209778, + "learning_rate": 6.18904450582662e-05, + "loss": 0.3063, + "step": 5168 + }, + { + "epoch": 0.44096570551100495, + "grad_norm": 1.4458368123623695, + "learning_rate": 6.18770256664569e-05, + "loss": 0.2736, + "step": 5169 + }, + { + "epoch": 0.441051015185122, + "grad_norm": 1.240801968191985, + "learning_rate": 6.186360536789981e-05, + "loss": 0.2749, + "step": 5170 + }, + { + "epoch": 0.44113632485923904, + "grad_norm": 1.2328721567473298, + "learning_rate": 6.185018416361951e-05, + "loss": 0.2096, + "step": 5171 + }, + { + "epoch": 0.44122163453335606, + "grad_norm": 1.588641787849778, + "learning_rate": 6.183676205464062e-05, + "loss": 0.3315, + "step": 5172 + }, + { + "epoch": 0.44130694420747313, + "grad_norm": 1.8023450437226136, + "learning_rate": 6.182333904198782e-05, + "loss": 0.2943, + "step": 5173 + }, + { + "epoch": 0.44139225388159015, + "grad_norm": 1.306969480086682, + "learning_rate": 6.180991512668594e-05, + "loss": 0.2512, + "step": 5174 + }, + { + "epoch": 0.4414775635557072, + "grad_norm": 1.3164907366075025, + "learning_rate": 6.17964903097598e-05, + "loss": 0.2462, + "step": 5175 + }, + { + "epoch": 0.44156287322982424, + "grad_norm": 1.5136432866503924, + "learning_rate": 6.17830645922343e-05, + "loss": 0.2548, + "step": 5176 + }, + { + "epoch": 0.4416481829039413, + "grad_norm": 2.2202351931832167, + "learning_rate": 6.176963797513443e-05, + "loss": 0.2466, + "step": 5177 + }, + { + "epoch": 0.44173349257805833, + "grad_norm": 1.4381161903267352, + "learning_rate": 6.175621045948524e-05, + "loss": 0.2978, + "step": 5178 + }, + { + "epoch": 0.4418188022521754, + "grad_norm": 1.8296350616576644, + "learning_rate": 6.174278204631187e-05, + "loss": 0.3037, + "step": 5179 + }, + { + "epoch": 0.4419041119262924, + "grad_norm": 1.4221353816468, + "learning_rate": 6.172935273663949e-05, + "loss": 0.2781, + "step": 5180 + }, + { + "epoch": 0.4419894216004095, + "grad_norm": 1.4896032514329738, + "learning_rate": 6.171592253149334e-05, + "loss": 0.2795, + "step": 5181 + }, + { + "epoch": 0.4420747312745265, + "grad_norm": 1.7798755764068521, + "learning_rate": 6.170249143189878e-05, + "loss": 0.3044, + "step": 5182 + }, + { + "epoch": 0.4421600409486436, + "grad_norm": 1.8107979115888055, + "learning_rate": 6.168905943888118e-05, + "loss": 0.2773, + "step": 5183 + }, + { + "epoch": 0.4422453506227606, + "grad_norm": 1.4896003704386085, + "learning_rate": 6.1675626553466e-05, + "loss": 0.317, + "step": 5184 + }, + { + "epoch": 0.4423306602968777, + "grad_norm": 1.3969656588430133, + "learning_rate": 6.16621927766788e-05, + "loss": 0.2334, + "step": 5185 + }, + { + "epoch": 0.4424159699709947, + "grad_norm": 1.7278999223427496, + "learning_rate": 6.164875810954514e-05, + "loss": 0.2922, + "step": 5186 + }, + { + "epoch": 0.44250127964511177, + "grad_norm": 1.4718029676819824, + "learning_rate": 6.16353225530907e-05, + "loss": 0.2702, + "step": 5187 + }, + { + "epoch": 0.4425865893192288, + "grad_norm": 1.5142939120068197, + "learning_rate": 6.162188610834121e-05, + "loss": 0.2681, + "step": 5188 + }, + { + "epoch": 0.44267189899334586, + "grad_norm": 1.6394124227565277, + "learning_rate": 6.160844877632248e-05, + "loss": 0.3187, + "step": 5189 + }, + { + "epoch": 0.4427572086674629, + "grad_norm": 1.2255543252583894, + "learning_rate": 6.159501055806038e-05, + "loss": 0.2619, + "step": 5190 + }, + { + "epoch": 0.44284251834157995, + "grad_norm": 1.673456745054852, + "learning_rate": 6.158157145458082e-05, + "loss": 0.2188, + "step": 5191 + }, + { + "epoch": 0.44292782801569697, + "grad_norm": 1.4724191328035214, + "learning_rate": 6.156813146690983e-05, + "loss": 0.2091, + "step": 5192 + }, + { + "epoch": 0.44301313768981404, + "grad_norm": 1.7781437341544537, + "learning_rate": 6.155469059607348e-05, + "loss": 0.3387, + "step": 5193 + }, + { + "epoch": 0.44309844736393106, + "grad_norm": 1.8527349309178482, + "learning_rate": 6.15412488430979e-05, + "loss": 0.2854, + "step": 5194 + }, + { + "epoch": 0.44318375703804813, + "grad_norm": 1.7062141330847755, + "learning_rate": 6.152780620900931e-05, + "loss": 0.2648, + "step": 5195 + }, + { + "epoch": 0.44326906671216515, + "grad_norm": 1.5507985457601707, + "learning_rate": 6.151436269483397e-05, + "loss": 0.2621, + "step": 5196 + }, + { + "epoch": 0.4433543763862822, + "grad_norm": 1.2461117829838635, + "learning_rate": 6.150091830159823e-05, + "loss": 0.2276, + "step": 5197 + }, + { + "epoch": 0.44343968606039924, + "grad_norm": 1.4133148324328326, + "learning_rate": 6.148747303032849e-05, + "loss": 0.289, + "step": 5198 + }, + { + "epoch": 0.4435249957345163, + "grad_norm": 1.2218338513092024, + "learning_rate": 6.147402688205122e-05, + "loss": 0.2112, + "step": 5199 + }, + { + "epoch": 0.44361030540863333, + "grad_norm": 1.2490177109671305, + "learning_rate": 6.146057985779299e-05, + "loss": 0.2842, + "step": 5200 + }, + { + "epoch": 0.4436956150827504, + "grad_norm": 1.4386268842769159, + "learning_rate": 6.144713195858037e-05, + "loss": 0.3061, + "step": 5201 + }, + { + "epoch": 0.4437809247568674, + "grad_norm": 1.4714355266210903, + "learning_rate": 6.143368318544006e-05, + "loss": 0.2758, + "step": 5202 + }, + { + "epoch": 0.4438662344309845, + "grad_norm": 1.3903686201379104, + "learning_rate": 6.14202335393988e-05, + "loss": 0.2147, + "step": 5203 + }, + { + "epoch": 0.4439515441051015, + "grad_norm": 1.6431117512084286, + "learning_rate": 6.140678302148339e-05, + "loss": 0.2642, + "step": 5204 + }, + { + "epoch": 0.4440368537792186, + "grad_norm": 1.2858744646384437, + "learning_rate": 6.139333163272072e-05, + "loss": 0.2456, + "step": 5205 + }, + { + "epoch": 0.4441221634533356, + "grad_norm": 1.5328748217870887, + "learning_rate": 6.137987937413771e-05, + "loss": 0.2651, + "step": 5206 + }, + { + "epoch": 0.4442074731274527, + "grad_norm": 1.5326018302403768, + "learning_rate": 6.13664262467614e-05, + "loss": 0.2872, + "step": 5207 + }, + { + "epoch": 0.4442927828015697, + "grad_norm": 1.5295784058989852, + "learning_rate": 6.135297225161886e-05, + "loss": 0.2464, + "step": 5208 + }, + { + "epoch": 0.44437809247568677, + "grad_norm": 1.9544282761141707, + "learning_rate": 6.13395173897372e-05, + "loss": 0.3742, + "step": 5209 + }, + { + "epoch": 0.4444634021498038, + "grad_norm": 1.4048587274628073, + "learning_rate": 6.132606166214363e-05, + "loss": 0.3307, + "step": 5210 + }, + { + "epoch": 0.4445487118239208, + "grad_norm": 1.6571629725019927, + "learning_rate": 6.131260506986545e-05, + "loss": 0.264, + "step": 5211 + }, + { + "epoch": 0.4446340214980379, + "grad_norm": 1.557589941474087, + "learning_rate": 6.129914761393001e-05, + "loss": 0.2593, + "step": 5212 + }, + { + "epoch": 0.4447193311721549, + "grad_norm": 1.4912932747303944, + "learning_rate": 6.128568929536466e-05, + "loss": 0.2584, + "step": 5213 + }, + { + "epoch": 0.44480464084627197, + "grad_norm": 2.059533963124588, + "learning_rate": 6.127223011519692e-05, + "loss": 0.2803, + "step": 5214 + }, + { + "epoch": 0.444889950520389, + "grad_norm": 1.6256529889892681, + "learning_rate": 6.12587700744543e-05, + "loss": 0.3207, + "step": 5215 + }, + { + "epoch": 0.44497526019450606, + "grad_norm": 1.906697517770092, + "learning_rate": 6.124530917416443e-05, + "loss": 0.2884, + "step": 5216 + }, + { + "epoch": 0.4450605698686231, + "grad_norm": 1.4580011080200004, + "learning_rate": 6.123184741535495e-05, + "loss": 0.2764, + "step": 5217 + }, + { + "epoch": 0.44514587954274015, + "grad_norm": 2.0293637247245213, + "learning_rate": 6.121838479905363e-05, + "loss": 0.3249, + "step": 5218 + }, + { + "epoch": 0.44523118921685717, + "grad_norm": 2.0002829828335726, + "learning_rate": 6.120492132628823e-05, + "loss": 0.2556, + "step": 5219 + }, + { + "epoch": 0.44531649889097424, + "grad_norm": 1.1030083224483207, + "learning_rate": 6.119145699808662e-05, + "loss": 0.2096, + "step": 5220 + }, + { + "epoch": 0.44540180856509126, + "grad_norm": 1.6598835081567125, + "learning_rate": 6.117799181547674e-05, + "loss": 0.2611, + "step": 5221 + }, + { + "epoch": 0.44548711823920834, + "grad_norm": 1.7112401407207194, + "learning_rate": 6.11645257794866e-05, + "loss": 0.2944, + "step": 5222 + }, + { + "epoch": 0.44557242791332535, + "grad_norm": 1.3723354097078366, + "learning_rate": 6.115105889114422e-05, + "loss": 0.2543, + "step": 5223 + }, + { + "epoch": 0.4456577375874424, + "grad_norm": 1.3198701918960858, + "learning_rate": 6.113759115147778e-05, + "loss": 0.2742, + "step": 5224 + }, + { + "epoch": 0.44574304726155944, + "grad_norm": 1.761535296647109, + "learning_rate": 6.112412256151543e-05, + "loss": 0.235, + "step": 5225 + }, + { + "epoch": 0.4458283569356765, + "grad_norm": 1.9402292778757666, + "learning_rate": 6.111065312228542e-05, + "loss": 0.3108, + "step": 5226 + }, + { + "epoch": 0.44591366660979354, + "grad_norm": 1.3374754324109495, + "learning_rate": 6.109718283481611e-05, + "loss": 0.2744, + "step": 5227 + }, + { + "epoch": 0.4459989762839106, + "grad_norm": 1.7133194181171574, + "learning_rate": 6.108371170013585e-05, + "loss": 0.2312, + "step": 5228 + }, + { + "epoch": 0.4460842859580276, + "grad_norm": 1.432666031434522, + "learning_rate": 6.10702397192731e-05, + "loss": 0.3297, + "step": 5229 + }, + { + "epoch": 0.4461695956321447, + "grad_norm": 1.7231450155186696, + "learning_rate": 6.105676689325638e-05, + "loss": 0.323, + "step": 5230 + }, + { + "epoch": 0.4462549053062617, + "grad_norm": 1.6949707249276655, + "learning_rate": 6.104329322311425e-05, + "loss": 0.2609, + "step": 5231 + }, + { + "epoch": 0.4463402149803788, + "grad_norm": 1.5347356244284058, + "learning_rate": 6.1029818709875374e-05, + "loss": 0.2297, + "step": 5232 + }, + { + "epoch": 0.4464255246544958, + "grad_norm": 1.4104101739656034, + "learning_rate": 6.1016343354568464e-05, + "loss": 0.2461, + "step": 5233 + }, + { + "epoch": 0.4465108343286129, + "grad_norm": 1.6508820112594653, + "learning_rate": 6.100286715822225e-05, + "loss": 0.2898, + "step": 5234 + }, + { + "epoch": 0.4465961440027299, + "grad_norm": 1.495578448044967, + "learning_rate": 6.0989390121865634e-05, + "loss": 0.278, + "step": 5235 + }, + { + "epoch": 0.446681453676847, + "grad_norm": 1.970429143394914, + "learning_rate": 6.0975912246527455e-05, + "loss": 0.2939, + "step": 5236 + }, + { + "epoch": 0.446766763350964, + "grad_norm": 1.4498760861033335, + "learning_rate": 6.0962433533236705e-05, + "loss": 0.2782, + "step": 5237 + }, + { + "epoch": 0.44685207302508106, + "grad_norm": 1.5991839354303115, + "learning_rate": 6.094895398302241e-05, + "loss": 0.2813, + "step": 5238 + }, + { + "epoch": 0.4469373826991981, + "grad_norm": 1.4452465145410207, + "learning_rate": 6.093547359691367e-05, + "loss": 0.3788, + "step": 5239 + }, + { + "epoch": 0.44702269237331516, + "grad_norm": 1.5115167053995482, + "learning_rate": 6.092199237593963e-05, + "loss": 0.2613, + "step": 5240 + }, + { + "epoch": 0.4471080020474322, + "grad_norm": 1.4694979771888623, + "learning_rate": 6.090851032112951e-05, + "loss": 0.2755, + "step": 5241 + }, + { + "epoch": 0.44719331172154925, + "grad_norm": 1.6402394886305067, + "learning_rate": 6.089502743351259e-05, + "loss": 0.3729, + "step": 5242 + }, + { + "epoch": 0.44727862139566626, + "grad_norm": 1.3387551292086095, + "learning_rate": 6.088154371411822e-05, + "loss": 0.2339, + "step": 5243 + }, + { + "epoch": 0.44736393106978334, + "grad_norm": 1.3734613826615132, + "learning_rate": 6.086805916397581e-05, + "loss": 0.289, + "step": 5244 + }, + { + "epoch": 0.44744924074390036, + "grad_norm": 1.821416822764228, + "learning_rate": 6.085457378411484e-05, + "loss": 0.2874, + "step": 5245 + }, + { + "epoch": 0.44753455041801743, + "grad_norm": 1.9600965664878032, + "learning_rate": 6.084108757556485e-05, + "loss": 0.2595, + "step": 5246 + }, + { + "epoch": 0.44761986009213445, + "grad_norm": 1.5326645214889356, + "learning_rate": 6.082760053935541e-05, + "loss": 0.2789, + "step": 5247 + }, + { + "epoch": 0.4477051697662515, + "grad_norm": 1.3018390375482094, + "learning_rate": 6.0814112676516234e-05, + "loss": 0.3012, + "step": 5248 + }, + { + "epoch": 0.44779047944036854, + "grad_norm": 1.5963610036515838, + "learning_rate": 6.080062398807701e-05, + "loss": 0.2554, + "step": 5249 + }, + { + "epoch": 0.44787578911448556, + "grad_norm": 1.6259060314563756, + "learning_rate": 6.078713447506754e-05, + "loss": 0.2914, + "step": 5250 + }, + { + "epoch": 0.44796109878860263, + "grad_norm": 1.343912514019017, + "learning_rate": 6.077364413851768e-05, + "loss": 0.2828, + "step": 5251 + }, + { + "epoch": 0.44804640846271965, + "grad_norm": 1.4950345027093461, + "learning_rate": 6.076015297945733e-05, + "loss": 0.2783, + "step": 5252 + }, + { + "epoch": 0.4481317181368367, + "grad_norm": 1.7488813912342565, + "learning_rate": 6.0746660998916495e-05, + "loss": 0.2882, + "step": 5253 + }, + { + "epoch": 0.44821702781095374, + "grad_norm": 1.6529743984106795, + "learning_rate": 6.0733168197925195e-05, + "loss": 0.2958, + "step": 5254 + }, + { + "epoch": 0.4483023374850708, + "grad_norm": 1.6597367777733705, + "learning_rate": 6.071967457751352e-05, + "loss": 0.3164, + "step": 5255 + }, + { + "epoch": 0.44838764715918783, + "grad_norm": 1.4902570930239203, + "learning_rate": 6.070618013871168e-05, + "loss": 0.2731, + "step": 5256 + }, + { + "epoch": 0.4484729568333049, + "grad_norm": 1.5891414656472078, + "learning_rate": 6.0692684882549864e-05, + "loss": 0.2534, + "step": 5257 + }, + { + "epoch": 0.4485582665074219, + "grad_norm": 1.8003599124936547, + "learning_rate": 6.067918881005839e-05, + "loss": 0.2366, + "step": 5258 + }, + { + "epoch": 0.448643576181539, + "grad_norm": 1.68585131604341, + "learning_rate": 6.0665691922267586e-05, + "loss": 0.2657, + "step": 5259 + }, + { + "epoch": 0.448728885855656, + "grad_norm": 1.5923699780717449, + "learning_rate": 6.065219422020789e-05, + "loss": 0.3073, + "step": 5260 + }, + { + "epoch": 0.4488141955297731, + "grad_norm": 1.4205680961305651, + "learning_rate": 6.0638695704909776e-05, + "loss": 0.2009, + "step": 5261 + }, + { + "epoch": 0.4488995052038901, + "grad_norm": 1.3684904592736875, + "learning_rate": 6.062519637740378e-05, + "loss": 0.3257, + "step": 5262 + }, + { + "epoch": 0.4489848148780072, + "grad_norm": 1.3325976289121466, + "learning_rate": 6.0611696238720485e-05, + "loss": 0.3054, + "step": 5263 + }, + { + "epoch": 0.4490701245521242, + "grad_norm": 1.5370350762796896, + "learning_rate": 6.059819528989058e-05, + "loss": 0.2203, + "step": 5264 + }, + { + "epoch": 0.44915543422624127, + "grad_norm": 1.5802576743435792, + "learning_rate": 6.058469353194479e-05, + "loss": 0.2687, + "step": 5265 + }, + { + "epoch": 0.4492407439003583, + "grad_norm": 1.476797741109375, + "learning_rate": 6.057119096591388e-05, + "loss": 0.2844, + "step": 5266 + }, + { + "epoch": 0.44932605357447536, + "grad_norm": 1.5180398638721757, + "learning_rate": 6.055768759282874e-05, + "loss": 0.3001, + "step": 5267 + }, + { + "epoch": 0.4494113632485924, + "grad_norm": 1.61176004805287, + "learning_rate": 6.0544183413720235e-05, + "loss": 0.2851, + "step": 5268 + }, + { + "epoch": 0.44949667292270945, + "grad_norm": 1.367968004053284, + "learning_rate": 6.053067842961937e-05, + "loss": 0.1934, + "step": 5269 + }, + { + "epoch": 0.44958198259682647, + "grad_norm": 1.8513630304854296, + "learning_rate": 6.051717264155716e-05, + "loss": 0.3231, + "step": 5270 + }, + { + "epoch": 0.44966729227094354, + "grad_norm": 1.5373362818444576, + "learning_rate": 6.050366605056471e-05, + "loss": 0.2526, + "step": 5271 + }, + { + "epoch": 0.44975260194506056, + "grad_norm": 1.4209011222053676, + "learning_rate": 6.049015865767318e-05, + "loss": 0.2726, + "step": 5272 + }, + { + "epoch": 0.44983791161917763, + "grad_norm": 1.7456990933152985, + "learning_rate": 6.047665046391378e-05, + "loss": 0.2953, + "step": 5273 + }, + { + "epoch": 0.44992322129329465, + "grad_norm": 1.6697849507552758, + "learning_rate": 6.0463141470317774e-05, + "loss": 0.2693, + "step": 5274 + }, + { + "epoch": 0.4500085309674117, + "grad_norm": 1.4548469103734905, + "learning_rate": 6.044963167791653e-05, + "loss": 0.3193, + "step": 5275 + }, + { + "epoch": 0.45009384064152874, + "grad_norm": 1.837033597959754, + "learning_rate": 6.0436121087741425e-05, + "loss": 0.2306, + "step": 5276 + }, + { + "epoch": 0.4501791503156458, + "grad_norm": 1.4547656242133378, + "learning_rate": 6.042260970082395e-05, + "loss": 0.2619, + "step": 5277 + }, + { + "epoch": 0.45026445998976283, + "grad_norm": 1.8460849266148116, + "learning_rate": 6.04090975181956e-05, + "loss": 0.2537, + "step": 5278 + }, + { + "epoch": 0.4503497696638799, + "grad_norm": 1.277109013724992, + "learning_rate": 6.0395584540887963e-05, + "loss": 0.2636, + "step": 5279 + }, + { + "epoch": 0.4504350793379969, + "grad_norm": 1.5010143664928624, + "learning_rate": 6.03820707699327e-05, + "loss": 0.3206, + "step": 5280 + }, + { + "epoch": 0.450520389012114, + "grad_norm": 1.71959138563172, + "learning_rate": 6.03685562063615e-05, + "loss": 0.3005, + "step": 5281 + }, + { + "epoch": 0.450605698686231, + "grad_norm": 1.811825659699528, + "learning_rate": 6.035504085120613e-05, + "loss": 0.3028, + "step": 5282 + }, + { + "epoch": 0.4506910083603481, + "grad_norm": 1.2813216398726388, + "learning_rate": 6.034152470549843e-05, + "loss": 0.2441, + "step": 5283 + }, + { + "epoch": 0.4507763180344651, + "grad_norm": 1.492211067053547, + "learning_rate": 6.0328007770270256e-05, + "loss": 0.2256, + "step": 5284 + }, + { + "epoch": 0.4508616277085822, + "grad_norm": 1.6621591491060015, + "learning_rate": 6.031449004655359e-05, + "loss": 0.3062, + "step": 5285 + }, + { + "epoch": 0.4509469373826992, + "grad_norm": 1.5886805072415773, + "learning_rate": 6.03009715353804e-05, + "loss": 0.3067, + "step": 5286 + }, + { + "epoch": 0.4510322470568162, + "grad_norm": 1.221163292171883, + "learning_rate": 6.028745223778278e-05, + "loss": 0.2106, + "step": 5287 + }, + { + "epoch": 0.4511175567309333, + "grad_norm": 1.6048521913817675, + "learning_rate": 6.027393215479286e-05, + "loss": 0.2118, + "step": 5288 + }, + { + "epoch": 0.4512028664050503, + "grad_norm": 1.6170851817911316, + "learning_rate": 6.0260411287442786e-05, + "loss": 0.2375, + "step": 5289 + }, + { + "epoch": 0.4512881760791674, + "grad_norm": 1.5302787347347688, + "learning_rate": 6.0246889636764856e-05, + "loss": 0.2316, + "step": 5290 + }, + { + "epoch": 0.4513734857532844, + "grad_norm": 1.3690412760127562, + "learning_rate": 6.023336720379136e-05, + "loss": 0.2289, + "step": 5291 + }, + { + "epoch": 0.45145879542740147, + "grad_norm": 1.496696330897282, + "learning_rate": 6.021984398955466e-05, + "loss": 0.2546, + "step": 5292 + }, + { + "epoch": 0.4515441051015185, + "grad_norm": 1.484466549911171, + "learning_rate": 6.020631999508717e-05, + "loss": 0.3118, + "step": 5293 + }, + { + "epoch": 0.45162941477563556, + "grad_norm": 1.4509230306652785, + "learning_rate": 6.019279522142138e-05, + "loss": 0.2319, + "step": 5294 + }, + { + "epoch": 0.4517147244497526, + "grad_norm": 1.1941430348580064, + "learning_rate": 6.017926966958984e-05, + "loss": 0.2623, + "step": 5295 + }, + { + "epoch": 0.45180003412386965, + "grad_norm": 1.6544845816849807, + "learning_rate": 6.0165743340625155e-05, + "loss": 0.2112, + "step": 5296 + }, + { + "epoch": 0.45188534379798667, + "grad_norm": 1.6726692666015448, + "learning_rate": 6.015221623555999e-05, + "loss": 0.2848, + "step": 5297 + }, + { + "epoch": 0.45197065347210374, + "grad_norm": 2.0145557731734107, + "learning_rate": 6.013868835542707e-05, + "loss": 0.2803, + "step": 5298 + }, + { + "epoch": 0.45205596314622076, + "grad_norm": 1.3349221369721378, + "learning_rate": 6.012515970125916e-05, + "loss": 0.2705, + "step": 5299 + }, + { + "epoch": 0.45214127282033784, + "grad_norm": 1.703491451491664, + "learning_rate": 6.0111630274089105e-05, + "loss": 0.3095, + "step": 5300 + }, + { + "epoch": 0.45222658249445485, + "grad_norm": 1.2740621146565712, + "learning_rate": 6.0098100074949825e-05, + "loss": 0.2505, + "step": 5301 + }, + { + "epoch": 0.4523118921685719, + "grad_norm": 1.51120214974981, + "learning_rate": 6.008456910487428e-05, + "loss": 0.2783, + "step": 5302 + }, + { + "epoch": 0.45239720184268895, + "grad_norm": 1.3804369419235114, + "learning_rate": 6.0071037364895454e-05, + "loss": 0.2556, + "step": 5303 + }, + { + "epoch": 0.452482511516806, + "grad_norm": 1.3680550139028613, + "learning_rate": 6.0057504856046445e-05, + "loss": 0.2649, + "step": 5304 + }, + { + "epoch": 0.45256782119092304, + "grad_norm": 1.4577224360179035, + "learning_rate": 6.004397157936038e-05, + "loss": 0.2389, + "step": 5305 + }, + { + "epoch": 0.4526531308650401, + "grad_norm": 1.2958580533988728, + "learning_rate": 6.003043753587046e-05, + "loss": 0.2432, + "step": 5306 + }, + { + "epoch": 0.4527384405391571, + "grad_norm": 1.962129754779501, + "learning_rate": 6.0016902726609945e-05, + "loss": 0.2696, + "step": 5307 + }, + { + "epoch": 0.4528237502132742, + "grad_norm": 1.4628625925635257, + "learning_rate": 6.000336715261212e-05, + "loss": 0.2325, + "step": 5308 + }, + { + "epoch": 0.4529090598873912, + "grad_norm": 1.657408111464771, + "learning_rate": 5.9989830814910397e-05, + "loss": 0.2758, + "step": 5309 + }, + { + "epoch": 0.4529943695615083, + "grad_norm": 1.4086160887577575, + "learning_rate": 5.997629371453817e-05, + "loss": 0.2591, + "step": 5310 + }, + { + "epoch": 0.4530796792356253, + "grad_norm": 1.6527619971083838, + "learning_rate": 5.996275585252891e-05, + "loss": 0.2363, + "step": 5311 + }, + { + "epoch": 0.4531649889097424, + "grad_norm": 1.5573770080793505, + "learning_rate": 5.99492172299162e-05, + "loss": 0.2787, + "step": 5312 + }, + { + "epoch": 0.4532502985838594, + "grad_norm": 1.642126722145731, + "learning_rate": 5.993567784773362e-05, + "loss": 0.2442, + "step": 5313 + }, + { + "epoch": 0.4533356082579765, + "grad_norm": 1.5240442877114042, + "learning_rate": 5.9922137707014845e-05, + "loss": 0.2925, + "step": 5314 + }, + { + "epoch": 0.4534209179320935, + "grad_norm": 1.5874153354926295, + "learning_rate": 5.990859680879357e-05, + "loss": 0.1988, + "step": 5315 + }, + { + "epoch": 0.45350622760621057, + "grad_norm": 1.4534818149684887, + "learning_rate": 5.989505515410358e-05, + "loss": 0.2931, + "step": 5316 + }, + { + "epoch": 0.4535915372803276, + "grad_norm": 1.5701146546794318, + "learning_rate": 5.988151274397873e-05, + "loss": 0.192, + "step": 5317 + }, + { + "epoch": 0.45367684695444466, + "grad_norm": 1.4538247669309718, + "learning_rate": 5.986796957945287e-05, + "loss": 0.2569, + "step": 5318 + }, + { + "epoch": 0.4537621566285617, + "grad_norm": 1.6527535581906754, + "learning_rate": 5.9854425661559975e-05, + "loss": 0.2959, + "step": 5319 + }, + { + "epoch": 0.45384746630267875, + "grad_norm": 1.7895593078326884, + "learning_rate": 5.984088099133406e-05, + "loss": 0.3015, + "step": 5320 + }, + { + "epoch": 0.45393277597679577, + "grad_norm": 1.6689435982290786, + "learning_rate": 5.9827335569809165e-05, + "loss": 0.2411, + "step": 5321 + }, + { + "epoch": 0.45401808565091284, + "grad_norm": 1.5399256575586677, + "learning_rate": 5.981378939801942e-05, + "loss": 0.2756, + "step": 5322 + }, + { + "epoch": 0.45410339532502986, + "grad_norm": 1.6328674745203033, + "learning_rate": 5.980024247699903e-05, + "loss": 0.2545, + "step": 5323 + }, + { + "epoch": 0.45418870499914693, + "grad_norm": 1.5046819098538098, + "learning_rate": 5.978669480778217e-05, + "loss": 0.2481, + "step": 5324 + }, + { + "epoch": 0.45427401467326395, + "grad_norm": 1.6185432685015617, + "learning_rate": 5.977314639140319e-05, + "loss": 0.2854, + "step": 5325 + }, + { + "epoch": 0.45435932434738097, + "grad_norm": 1.6421023302286935, + "learning_rate": 5.975959722889641e-05, + "loss": 0.2589, + "step": 5326 + }, + { + "epoch": 0.45444463402149804, + "grad_norm": 1.5051066574374226, + "learning_rate": 5.974604732129625e-05, + "loss": 0.2897, + "step": 5327 + }, + { + "epoch": 0.45452994369561506, + "grad_norm": 1.1847907330638054, + "learning_rate": 5.9732496669637164e-05, + "loss": 0.251, + "step": 5328 + }, + { + "epoch": 0.45461525336973213, + "grad_norm": 1.464841796873698, + "learning_rate": 5.971894527495366e-05, + "loss": 0.2877, + "step": 5329 + }, + { + "epoch": 0.45470056304384915, + "grad_norm": 1.3439864904201548, + "learning_rate": 5.970539313828035e-05, + "loss": 0.244, + "step": 5330 + }, + { + "epoch": 0.4547858727179662, + "grad_norm": 1.5264138905902824, + "learning_rate": 5.9691840260651844e-05, + "loss": 0.2699, + "step": 5331 + }, + { + "epoch": 0.45487118239208324, + "grad_norm": 1.2784330057546784, + "learning_rate": 5.967828664310283e-05, + "loss": 0.2309, + "step": 5332 + }, + { + "epoch": 0.4549564920662003, + "grad_norm": 1.7694254795151372, + "learning_rate": 5.966473228666807e-05, + "loss": 0.3093, + "step": 5333 + }, + { + "epoch": 0.45504180174031733, + "grad_norm": 1.6231067338869718, + "learning_rate": 5.965117719238236e-05, + "loss": 0.2976, + "step": 5334 + }, + { + "epoch": 0.4551271114144344, + "grad_norm": 1.6549281477529028, + "learning_rate": 5.963762136128055e-05, + "loss": 0.2469, + "step": 5335 + }, + { + "epoch": 0.4552124210885514, + "grad_norm": 2.0208797121788797, + "learning_rate": 5.962406479439757e-05, + "loss": 0.3341, + "step": 5336 + }, + { + "epoch": 0.4552977307626685, + "grad_norm": 1.5389326786340187, + "learning_rate": 5.961050749276838e-05, + "loss": 0.2703, + "step": 5337 + }, + { + "epoch": 0.4553830404367855, + "grad_norm": 1.4307672144447299, + "learning_rate": 5.9596949457428006e-05, + "loss": 0.3058, + "step": 5338 + }, + { + "epoch": 0.4554683501109026, + "grad_norm": 1.9040161551904107, + "learning_rate": 5.9583390689411556e-05, + "loss": 0.2982, + "step": 5339 + }, + { + "epoch": 0.4555536597850196, + "grad_norm": 1.3588020608117684, + "learning_rate": 5.9569831189754135e-05, + "loss": 0.2363, + "step": 5340 + }, + { + "epoch": 0.4556389694591367, + "grad_norm": 1.5614687759155415, + "learning_rate": 5.9556270959490966e-05, + "loss": 0.2472, + "step": 5341 + }, + { + "epoch": 0.4557242791332537, + "grad_norm": 1.5975651423585142, + "learning_rate": 5.9542709999657286e-05, + "loss": 0.2735, + "step": 5342 + }, + { + "epoch": 0.45580958880737077, + "grad_norm": 1.5709675044360072, + "learning_rate": 5.952914831128842e-05, + "loss": 0.3091, + "step": 5343 + }, + { + "epoch": 0.4558948984814878, + "grad_norm": 1.413178014490511, + "learning_rate": 5.951558589541971e-05, + "loss": 0.2845, + "step": 5344 + }, + { + "epoch": 0.45598020815560486, + "grad_norm": 1.4760499094766792, + "learning_rate": 5.9502022753086586e-05, + "loss": 0.2666, + "step": 5345 + }, + { + "epoch": 0.4560655178297219, + "grad_norm": 1.8204238268163606, + "learning_rate": 5.948845888532452e-05, + "loss": 0.3144, + "step": 5346 + }, + { + "epoch": 0.45615082750383895, + "grad_norm": 1.3334860366559373, + "learning_rate": 5.947489429316904e-05, + "loss": 0.2639, + "step": 5347 + }, + { + "epoch": 0.45623613717795597, + "grad_norm": 1.2121439676514931, + "learning_rate": 5.946132897765572e-05, + "loss": 0.2443, + "step": 5348 + }, + { + "epoch": 0.45632144685207304, + "grad_norm": 1.4506355208344517, + "learning_rate": 5.9447762939820216e-05, + "loss": 0.2137, + "step": 5349 + }, + { + "epoch": 0.45640675652619006, + "grad_norm": 1.6910189918701324, + "learning_rate": 5.943419618069821e-05, + "loss": 0.2728, + "step": 5350 + }, + { + "epoch": 0.45649206620030713, + "grad_norm": 1.541959176365496, + "learning_rate": 5.942062870132547e-05, + "loss": 0.2322, + "step": 5351 + }, + { + "epoch": 0.45657737587442415, + "grad_norm": 1.3619064788191444, + "learning_rate": 5.940706050273779e-05, + "loss": 0.2304, + "step": 5352 + }, + { + "epoch": 0.4566626855485412, + "grad_norm": 1.2254400280426216, + "learning_rate": 5.939349158597102e-05, + "loss": 0.2119, + "step": 5353 + }, + { + "epoch": 0.45674799522265824, + "grad_norm": 1.3419395493083146, + "learning_rate": 5.937992195206109e-05, + "loss": 0.2467, + "step": 5354 + }, + { + "epoch": 0.4568333048967753, + "grad_norm": 1.3690712730267685, + "learning_rate": 5.9366351602043955e-05, + "loss": 0.2027, + "step": 5355 + }, + { + "epoch": 0.45691861457089233, + "grad_norm": 1.4268360155672495, + "learning_rate": 5.935278053695566e-05, + "loss": 0.2696, + "step": 5356 + }, + { + "epoch": 0.4570039242450094, + "grad_norm": 1.6428461933363443, + "learning_rate": 5.933920875783228e-05, + "loss": 0.2558, + "step": 5357 + }, + { + "epoch": 0.4570892339191264, + "grad_norm": 1.2970147115068642, + "learning_rate": 5.932563626570992e-05, + "loss": 0.2891, + "step": 5358 + }, + { + "epoch": 0.4571745435932435, + "grad_norm": 1.326176684094166, + "learning_rate": 5.93120630616248e-05, + "loss": 0.2736, + "step": 5359 + }, + { + "epoch": 0.4572598532673605, + "grad_norm": 1.5262522199584911, + "learning_rate": 5.929848914661315e-05, + "loss": 0.2858, + "step": 5360 + }, + { + "epoch": 0.4573451629414776, + "grad_norm": 1.5431782640586555, + "learning_rate": 5.9284914521711245e-05, + "loss": 0.3077, + "step": 5361 + }, + { + "epoch": 0.4574304726155946, + "grad_norm": 1.6703026053097216, + "learning_rate": 5.9271339187955475e-05, + "loss": 0.253, + "step": 5362 + }, + { + "epoch": 0.4575157822897117, + "grad_norm": 1.5347071954251394, + "learning_rate": 5.925776314638223e-05, + "loss": 0.271, + "step": 5363 + }, + { + "epoch": 0.4576010919638287, + "grad_norm": 1.6203654296234535, + "learning_rate": 5.9244186398027944e-05, + "loss": 0.2734, + "step": 5364 + }, + { + "epoch": 0.4576864016379457, + "grad_norm": 1.737961025959943, + "learning_rate": 5.923060894392917e-05, + "loss": 0.3219, + "step": 5365 + }, + { + "epoch": 0.4577717113120628, + "grad_norm": 2.55685305022537, + "learning_rate": 5.921703078512245e-05, + "loss": 0.3116, + "step": 5366 + }, + { + "epoch": 0.4578570209861798, + "grad_norm": 1.7406012322876565, + "learning_rate": 5.92034519226444e-05, + "loss": 0.2766, + "step": 5367 + }, + { + "epoch": 0.4579423306602969, + "grad_norm": 1.2317264970968647, + "learning_rate": 5.918987235753172e-05, + "loss": 0.2977, + "step": 5368 + }, + { + "epoch": 0.4580276403344139, + "grad_norm": 1.1535338899307688, + "learning_rate": 5.9176292090821105e-05, + "loss": 0.2291, + "step": 5369 + }, + { + "epoch": 0.458112950008531, + "grad_norm": 1.5408208262195715, + "learning_rate": 5.916271112354935e-05, + "loss": 0.2954, + "step": 5370 + }, + { + "epoch": 0.458198259682648, + "grad_norm": 1.5150781805179774, + "learning_rate": 5.9149129456753306e-05, + "loss": 0.2831, + "step": 5371 + }, + { + "epoch": 0.45828356935676506, + "grad_norm": 1.1005328621713184, + "learning_rate": 5.913554709146983e-05, + "loss": 0.231, + "step": 5372 + }, + { + "epoch": 0.4583688790308821, + "grad_norm": 1.5670080384545502, + "learning_rate": 5.9121964028735886e-05, + "loss": 0.227, + "step": 5373 + }, + { + "epoch": 0.45845418870499915, + "grad_norm": 1.3253200756199186, + "learning_rate": 5.910838026958846e-05, + "loss": 0.2764, + "step": 5374 + }, + { + "epoch": 0.4585394983791162, + "grad_norm": 1.6315171063269012, + "learning_rate": 5.9094795815064604e-05, + "loss": 0.289, + "step": 5375 + }, + { + "epoch": 0.45862480805323325, + "grad_norm": 1.5335947884989958, + "learning_rate": 5.9081210666201435e-05, + "loss": 0.2581, + "step": 5376 + }, + { + "epoch": 0.45871011772735026, + "grad_norm": 1.9397958258717167, + "learning_rate": 5.906762482403607e-05, + "loss": 0.3092, + "step": 5377 + }, + { + "epoch": 0.45879542740146734, + "grad_norm": 1.1796646621211355, + "learning_rate": 5.905403828960575e-05, + "loss": 0.2353, + "step": 5378 + }, + { + "epoch": 0.45888073707558436, + "grad_norm": 1.6840933569585097, + "learning_rate": 5.904045106394771e-05, + "loss": 0.2969, + "step": 5379 + }, + { + "epoch": 0.45896604674970143, + "grad_norm": 1.4764278289900623, + "learning_rate": 5.902686314809927e-05, + "loss": 0.2482, + "step": 5380 + }, + { + "epoch": 0.45905135642381845, + "grad_norm": 1.8364882272186687, + "learning_rate": 5.9013274543097795e-05, + "loss": 0.2727, + "step": 5381 + }, + { + "epoch": 0.4591366660979355, + "grad_norm": 1.4810572438429903, + "learning_rate": 5.8999685249980696e-05, + "loss": 0.2916, + "step": 5382 + }, + { + "epoch": 0.45922197577205254, + "grad_norm": 1.6133885382568014, + "learning_rate": 5.898609526978547e-05, + "loss": 0.2735, + "step": 5383 + }, + { + "epoch": 0.4593072854461696, + "grad_norm": 1.5695431471013446, + "learning_rate": 5.8972504603549616e-05, + "loss": 0.2617, + "step": 5384 + }, + { + "epoch": 0.45939259512028663, + "grad_norm": 1.6759603622501758, + "learning_rate": 5.895891325231071e-05, + "loss": 0.3303, + "step": 5385 + }, + { + "epoch": 0.4594779047944037, + "grad_norm": 1.2543558996064483, + "learning_rate": 5.89453212171064e-05, + "loss": 0.2157, + "step": 5386 + }, + { + "epoch": 0.4595632144685207, + "grad_norm": 1.3786342449827689, + "learning_rate": 5.8931728498974336e-05, + "loss": 0.231, + "step": 5387 + }, + { + "epoch": 0.4596485241426378, + "grad_norm": 1.4288145641780716, + "learning_rate": 5.8918135098952276e-05, + "loss": 0.2905, + "step": 5388 + }, + { + "epoch": 0.4597338338167548, + "grad_norm": 1.3393375341290348, + "learning_rate": 5.8904541018077984e-05, + "loss": 0.2694, + "step": 5389 + }, + { + "epoch": 0.4598191434908719, + "grad_norm": 1.9154074167267918, + "learning_rate": 5.88909462573893e-05, + "loss": 0.2916, + "step": 5390 + }, + { + "epoch": 0.4599044531649889, + "grad_norm": 1.6536045179437302, + "learning_rate": 5.887735081792413e-05, + "loss": 0.3444, + "step": 5391 + }, + { + "epoch": 0.459989762839106, + "grad_norm": 1.6799751922501498, + "learning_rate": 5.88637547007204e-05, + "loss": 0.3078, + "step": 5392 + }, + { + "epoch": 0.460075072513223, + "grad_norm": 1.7124748116750033, + "learning_rate": 5.8850157906816075e-05, + "loss": 0.3361, + "step": 5393 + }, + { + "epoch": 0.46016038218734007, + "grad_norm": 1.829205576744229, + "learning_rate": 5.8836560437249245e-05, + "loss": 0.3305, + "step": 5394 + }, + { + "epoch": 0.4602456918614571, + "grad_norm": 1.5631487453263233, + "learning_rate": 5.882296229305797e-05, + "loss": 0.278, + "step": 5395 + }, + { + "epoch": 0.46033100153557416, + "grad_norm": 1.3210646163101234, + "learning_rate": 5.8809363475280424e-05, + "loss": 0.2602, + "step": 5396 + }, + { + "epoch": 0.4604163112096912, + "grad_norm": 1.648904309459142, + "learning_rate": 5.8795763984954776e-05, + "loss": 0.208, + "step": 5397 + }, + { + "epoch": 0.46050162088380825, + "grad_norm": 1.453787375811822, + "learning_rate": 5.878216382311931e-05, + "loss": 0.2521, + "step": 5398 + }, + { + "epoch": 0.46058693055792527, + "grad_norm": 1.3665923648888358, + "learning_rate": 5.876856299081228e-05, + "loss": 0.2976, + "step": 5399 + }, + { + "epoch": 0.46067224023204234, + "grad_norm": 1.5151670098234948, + "learning_rate": 5.875496148907208e-05, + "loss": 0.302, + "step": 5400 + }, + { + "epoch": 0.46075754990615936, + "grad_norm": 1.5835401333058419, + "learning_rate": 5.874135931893707e-05, + "loss": 0.2798, + "step": 5401 + }, + { + "epoch": 0.4608428595802764, + "grad_norm": 1.466791023415425, + "learning_rate": 5.872775648144575e-05, + "loss": 0.2688, + "step": 5402 + }, + { + "epoch": 0.46092816925439345, + "grad_norm": 1.40377738586235, + "learning_rate": 5.8714152977636595e-05, + "loss": 0.2637, + "step": 5403 + }, + { + "epoch": 0.46101347892851047, + "grad_norm": 1.4095740133382033, + "learning_rate": 5.8700548808548164e-05, + "loss": 0.1857, + "step": 5404 + }, + { + "epoch": 0.46109878860262754, + "grad_norm": 1.4000045265397054, + "learning_rate": 5.868694397521908e-05, + "loss": 0.2046, + "step": 5405 + }, + { + "epoch": 0.46118409827674456, + "grad_norm": 1.8682833851317695, + "learning_rate": 5.8673338478687955e-05, + "loss": 0.3033, + "step": 5406 + }, + { + "epoch": 0.46126940795086163, + "grad_norm": 1.4118345212094177, + "learning_rate": 5.8659732319993555e-05, + "loss": 0.27, + "step": 5407 + }, + { + "epoch": 0.46135471762497865, + "grad_norm": 1.4711158041736723, + "learning_rate": 5.864612550017461e-05, + "loss": 0.2418, + "step": 5408 + }, + { + "epoch": 0.4614400272990957, + "grad_norm": 1.6069681632752253, + "learning_rate": 5.863251802026992e-05, + "loss": 0.2692, + "step": 5409 + }, + { + "epoch": 0.46152533697321274, + "grad_norm": 1.5200493328973006, + "learning_rate": 5.8618909881318354e-05, + "loss": 0.2936, + "step": 5410 + }, + { + "epoch": 0.4616106466473298, + "grad_norm": 1.4816853298017372, + "learning_rate": 5.860530108435881e-05, + "loss": 0.1941, + "step": 5411 + }, + { + "epoch": 0.46169595632144683, + "grad_norm": 1.5875324065737988, + "learning_rate": 5.859169163043027e-05, + "loss": 0.2353, + "step": 5412 + }, + { + "epoch": 0.4617812659955639, + "grad_norm": 1.5655874647563597, + "learning_rate": 5.857808152057173e-05, + "loss": 0.2544, + "step": 5413 + }, + { + "epoch": 0.4618665756696809, + "grad_norm": 1.981307414573751, + "learning_rate": 5.856447075582223e-05, + "loss": 0.2772, + "step": 5414 + }, + { + "epoch": 0.461951885343798, + "grad_norm": 1.7142459461731119, + "learning_rate": 5.855085933722092e-05, + "loss": 0.2651, + "step": 5415 + }, + { + "epoch": 0.462037195017915, + "grad_norm": 1.4516726383271807, + "learning_rate": 5.8537247265806936e-05, + "loss": 0.2832, + "step": 5416 + }, + { + "epoch": 0.4621225046920321, + "grad_norm": 1.7096340143605215, + "learning_rate": 5.852363454261949e-05, + "loss": 0.269, + "step": 5417 + }, + { + "epoch": 0.4622078143661491, + "grad_norm": 1.8842543150558957, + "learning_rate": 5.851002116869784e-05, + "loss": 0.2635, + "step": 5418 + }, + { + "epoch": 0.4622931240402662, + "grad_norm": 1.4902640523492765, + "learning_rate": 5.849640714508129e-05, + "loss": 0.2612, + "step": 5419 + }, + { + "epoch": 0.4623784337143832, + "grad_norm": 1.838838857634103, + "learning_rate": 5.848279247280921e-05, + "loss": 0.2608, + "step": 5420 + }, + { + "epoch": 0.46246374338850027, + "grad_norm": 1.4526937521629404, + "learning_rate": 5.846917715292101e-05, + "loss": 0.219, + "step": 5421 + }, + { + "epoch": 0.4625490530626173, + "grad_norm": 1.7165822316981387, + "learning_rate": 5.845556118645612e-05, + "loss": 0.3131, + "step": 5422 + }, + { + "epoch": 0.46263436273673436, + "grad_norm": 1.2505016274058396, + "learning_rate": 5.844194457445408e-05, + "loss": 0.2719, + "step": 5423 + }, + { + "epoch": 0.4627196724108514, + "grad_norm": 1.6360051094665577, + "learning_rate": 5.8428327317954435e-05, + "loss": 0.2784, + "step": 5424 + }, + { + "epoch": 0.46280498208496845, + "grad_norm": 2.0019088219183248, + "learning_rate": 5.841470941799677e-05, + "loss": 0.2911, + "step": 5425 + }, + { + "epoch": 0.46289029175908547, + "grad_norm": 1.4692396605749176, + "learning_rate": 5.840109087562078e-05, + "loss": 0.2771, + "step": 5426 + }, + { + "epoch": 0.46297560143320254, + "grad_norm": 1.4156246092671578, + "learning_rate": 5.838747169186611e-05, + "loss": 0.2223, + "step": 5427 + }, + { + "epoch": 0.46306091110731956, + "grad_norm": 1.4555886006830627, + "learning_rate": 5.8373851867772576e-05, + "loss": 0.2517, + "step": 5428 + }, + { + "epoch": 0.46314622078143663, + "grad_norm": 1.827782362430057, + "learning_rate": 5.836023140437995e-05, + "loss": 0.2793, + "step": 5429 + }, + { + "epoch": 0.46323153045555365, + "grad_norm": 1.4818313487947108, + "learning_rate": 5.834661030272809e-05, + "loss": 0.2863, + "step": 5430 + }, + { + "epoch": 0.4633168401296707, + "grad_norm": 1.3726803112580832, + "learning_rate": 5.833298856385687e-05, + "loss": 0.2786, + "step": 5431 + }, + { + "epoch": 0.46340214980378774, + "grad_norm": 1.7835861413085765, + "learning_rate": 5.8319366188806256e-05, + "loss": 0.3351, + "step": 5432 + }, + { + "epoch": 0.4634874594779048, + "grad_norm": 1.5317716683034515, + "learning_rate": 5.830574317861625e-05, + "loss": 0.2793, + "step": 5433 + }, + { + "epoch": 0.46357276915202184, + "grad_norm": 1.4979813821295194, + "learning_rate": 5.8292119534326885e-05, + "loss": 0.2564, + "step": 5434 + }, + { + "epoch": 0.4636580788261389, + "grad_norm": 1.8039977501899578, + "learning_rate": 5.827849525697825e-05, + "loss": 0.2659, + "step": 5435 + }, + { + "epoch": 0.4637433885002559, + "grad_norm": 1.8142500354125015, + "learning_rate": 5.82648703476105e-05, + "loss": 0.2749, + "step": 5436 + }, + { + "epoch": 0.463828698174373, + "grad_norm": 1.439964089873043, + "learning_rate": 5.8251244807263825e-05, + "loss": 0.2139, + "step": 5437 + }, + { + "epoch": 0.46391400784849, + "grad_norm": 1.259219502203736, + "learning_rate": 5.823761863697844e-05, + "loss": 0.2707, + "step": 5438 + }, + { + "epoch": 0.4639993175226071, + "grad_norm": 1.5531792920944811, + "learning_rate": 5.822399183779467e-05, + "loss": 0.2297, + "step": 5439 + }, + { + "epoch": 0.4640846271967241, + "grad_norm": 1.6363761015137896, + "learning_rate": 5.8210364410752814e-05, + "loss": 0.283, + "step": 5440 + }, + { + "epoch": 0.4641699368708411, + "grad_norm": 2.0549573829854064, + "learning_rate": 5.819673635689327e-05, + "loss": 0.3438, + "step": 5441 + }, + { + "epoch": 0.4642552465449582, + "grad_norm": 1.3829070236744894, + "learning_rate": 5.8183107677256456e-05, + "loss": 0.2634, + "step": 5442 + }, + { + "epoch": 0.4643405562190752, + "grad_norm": 1.3039322911628248, + "learning_rate": 5.816947837288285e-05, + "loss": 0.2375, + "step": 5443 + }, + { + "epoch": 0.4644258658931923, + "grad_norm": 1.5729408851763238, + "learning_rate": 5.815584844481299e-05, + "loss": 0.2438, + "step": 5444 + }, + { + "epoch": 0.4645111755673093, + "grad_norm": 1.4004359537234332, + "learning_rate": 5.814221789408745e-05, + "loss": 0.2819, + "step": 5445 + }, + { + "epoch": 0.4645964852414264, + "grad_norm": 1.7188456422030982, + "learning_rate": 5.812858672174681e-05, + "loss": 0.3186, + "step": 5446 + }, + { + "epoch": 0.4646817949155434, + "grad_norm": 1.4846374279740842, + "learning_rate": 5.81149549288318e-05, + "loss": 0.2642, + "step": 5447 + }, + { + "epoch": 0.4647671045896605, + "grad_norm": 1.6762048146005644, + "learning_rate": 5.810132251638309e-05, + "loss": 0.2965, + "step": 5448 + }, + { + "epoch": 0.4648524142637775, + "grad_norm": 1.5193803020904184, + "learning_rate": 5.8087689485441466e-05, + "loss": 0.286, + "step": 5449 + }, + { + "epoch": 0.46493772393789456, + "grad_norm": 1.2733667272080142, + "learning_rate": 5.807405583704773e-05, + "loss": 0.3105, + "step": 5450 + }, + { + "epoch": 0.4650230336120116, + "grad_norm": 1.5356869156511759, + "learning_rate": 5.806042157224273e-05, + "loss": 0.3142, + "step": 5451 + }, + { + "epoch": 0.46510834328612866, + "grad_norm": 1.4014740693806294, + "learning_rate": 5.804678669206738e-05, + "loss": 0.2711, + "step": 5452 + }, + { + "epoch": 0.4651936529602457, + "grad_norm": 1.537089908772267, + "learning_rate": 5.803315119756262e-05, + "loss": 0.2632, + "step": 5453 + }, + { + "epoch": 0.46527896263436275, + "grad_norm": 1.3686215100879302, + "learning_rate": 5.801951508976945e-05, + "loss": 0.2363, + "step": 5454 + }, + { + "epoch": 0.46536427230847976, + "grad_norm": 1.5502805886562772, + "learning_rate": 5.8005878369728926e-05, + "loss": 0.3233, + "step": 5455 + }, + { + "epoch": 0.46544958198259684, + "grad_norm": 1.5136210771466203, + "learning_rate": 5.799224103848213e-05, + "loss": 0.2838, + "step": 5456 + }, + { + "epoch": 0.46553489165671386, + "grad_norm": 1.6200497862147487, + "learning_rate": 5.797860309707021e-05, + "loss": 0.2719, + "step": 5457 + }, + { + "epoch": 0.46562020133083093, + "grad_norm": 1.4447612639154932, + "learning_rate": 5.796496454653433e-05, + "loss": 0.3126, + "step": 5458 + }, + { + "epoch": 0.46570551100494795, + "grad_norm": 1.5392195723108084, + "learning_rate": 5.795132538791572e-05, + "loss": 0.302, + "step": 5459 + }, + { + "epoch": 0.465790820679065, + "grad_norm": 1.6199310177374169, + "learning_rate": 5.79376856222557e-05, + "loss": 0.2991, + "step": 5460 + }, + { + "epoch": 0.46587613035318204, + "grad_norm": 1.4000212156868084, + "learning_rate": 5.792404525059555e-05, + "loss": 0.2976, + "step": 5461 + }, + { + "epoch": 0.4659614400272991, + "grad_norm": 1.7667718724048935, + "learning_rate": 5.791040427397666e-05, + "loss": 0.3372, + "step": 5462 + }, + { + "epoch": 0.46604674970141613, + "grad_norm": 1.5939362361029261, + "learning_rate": 5.789676269344043e-05, + "loss": 0.2939, + "step": 5463 + }, + { + "epoch": 0.4661320593755332, + "grad_norm": 1.514851323502777, + "learning_rate": 5.7883120510028336e-05, + "loss": 0.2789, + "step": 5464 + }, + { + "epoch": 0.4662173690496502, + "grad_norm": 1.6565233400901502, + "learning_rate": 5.786947772478187e-05, + "loss": 0.2439, + "step": 5465 + }, + { + "epoch": 0.4663026787237673, + "grad_norm": 1.7896670190549464, + "learning_rate": 5.785583433874262e-05, + "loss": 0.2775, + "step": 5466 + }, + { + "epoch": 0.4663879883978843, + "grad_norm": 1.4308080398969287, + "learning_rate": 5.7842190352952143e-05, + "loss": 0.2953, + "step": 5467 + }, + { + "epoch": 0.4664732980720014, + "grad_norm": 1.5802379098268884, + "learning_rate": 5.7828545768452115e-05, + "loss": 0.2375, + "step": 5468 + }, + { + "epoch": 0.4665586077461184, + "grad_norm": 1.5243635444163752, + "learning_rate": 5.781490058628422e-05, + "loss": 0.2759, + "step": 5469 + }, + { + "epoch": 0.4666439174202355, + "grad_norm": 1.6226228786778187, + "learning_rate": 5.780125480749019e-05, + "loss": 0.2722, + "step": 5470 + }, + { + "epoch": 0.4667292270943525, + "grad_norm": 1.4631047620518394, + "learning_rate": 5.7787608433111816e-05, + "loss": 0.254, + "step": 5471 + }, + { + "epoch": 0.46681453676846957, + "grad_norm": 1.4636440399675126, + "learning_rate": 5.777396146419093e-05, + "loss": 0.1882, + "step": 5472 + }, + { + "epoch": 0.4668998464425866, + "grad_norm": 1.4378625992932743, + "learning_rate": 5.776031390176938e-05, + "loss": 0.1874, + "step": 5473 + }, + { + "epoch": 0.46698515611670366, + "grad_norm": 1.619372674490563, + "learning_rate": 5.7746665746889114e-05, + "loss": 0.2714, + "step": 5474 + }, + { + "epoch": 0.4670704657908207, + "grad_norm": 1.4560836697002424, + "learning_rate": 5.7733017000592074e-05, + "loss": 0.2579, + "step": 5475 + }, + { + "epoch": 0.46715577546493775, + "grad_norm": 1.6027304507753033, + "learning_rate": 5.7719367663920285e-05, + "loss": 0.2829, + "step": 5476 + }, + { + "epoch": 0.46724108513905477, + "grad_norm": 1.4885250502616163, + "learning_rate": 5.770571773791579e-05, + "loss": 0.2457, + "step": 5477 + }, + { + "epoch": 0.4673263948131718, + "grad_norm": 1.51157916703577, + "learning_rate": 5.7692067223620695e-05, + "loss": 0.2828, + "step": 5478 + }, + { + "epoch": 0.46741170448728886, + "grad_norm": 1.710810617419037, + "learning_rate": 5.767841612207715e-05, + "loss": 0.2639, + "step": 5479 + }, + { + "epoch": 0.4674970141614059, + "grad_norm": 1.2896776061144168, + "learning_rate": 5.766476443432732e-05, + "loss": 0.2864, + "step": 5480 + }, + { + "epoch": 0.46758232383552295, + "grad_norm": 1.2478248267325602, + "learning_rate": 5.765111216141348e-05, + "loss": 0.2607, + "step": 5481 + }, + { + "epoch": 0.46766763350963997, + "grad_norm": 1.6971529625834443, + "learning_rate": 5.7637459304377874e-05, + "loss": 0.2755, + "step": 5482 + }, + { + "epoch": 0.46775294318375704, + "grad_norm": 1.4161509061186794, + "learning_rate": 5.762380586426283e-05, + "loss": 0.2534, + "step": 5483 + }, + { + "epoch": 0.46783825285787406, + "grad_norm": 1.4344118367038112, + "learning_rate": 5.7610151842110736e-05, + "loss": 0.2461, + "step": 5484 + }, + { + "epoch": 0.46792356253199113, + "grad_norm": 1.4101807652928762, + "learning_rate": 5.7596497238963975e-05, + "loss": 0.3041, + "step": 5485 + }, + { + "epoch": 0.46800887220610815, + "grad_norm": 1.7017013479142826, + "learning_rate": 5.758284205586503e-05, + "loss": 0.2726, + "step": 5486 + }, + { + "epoch": 0.4680941818802252, + "grad_norm": 1.4376705731902604, + "learning_rate": 5.756918629385638e-05, + "loss": 0.1983, + "step": 5487 + }, + { + "epoch": 0.46817949155434224, + "grad_norm": 1.5671559962931352, + "learning_rate": 5.755552995398057e-05, + "loss": 0.2842, + "step": 5488 + }, + { + "epoch": 0.4682648012284593, + "grad_norm": 1.6664266492951554, + "learning_rate": 5.7541873037280215e-05, + "loss": 0.2886, + "step": 5489 + }, + { + "epoch": 0.46835011090257633, + "grad_norm": 1.2940372074532387, + "learning_rate": 5.752821554479793e-05, + "loss": 0.2454, + "step": 5490 + }, + { + "epoch": 0.4684354205766934, + "grad_norm": 1.8694980004650115, + "learning_rate": 5.751455747757637e-05, + "loss": 0.2412, + "step": 5491 + }, + { + "epoch": 0.4685207302508104, + "grad_norm": 1.8447335416258928, + "learning_rate": 5.75008988366583e-05, + "loss": 0.2751, + "step": 5492 + }, + { + "epoch": 0.4686060399249275, + "grad_norm": 1.8096520641653777, + "learning_rate": 5.748723962308646e-05, + "loss": 0.2034, + "step": 5493 + }, + { + "epoch": 0.4686913495990445, + "grad_norm": 1.5391281811016677, + "learning_rate": 5.747357983790367e-05, + "loss": 0.2231, + "step": 5494 + }, + { + "epoch": 0.4687766592731616, + "grad_norm": 2.0386069070171158, + "learning_rate": 5.745991948215277e-05, + "loss": 0.28, + "step": 5495 + }, + { + "epoch": 0.4688619689472786, + "grad_norm": 1.7441397364700635, + "learning_rate": 5.7446258556876645e-05, + "loss": 0.2599, + "step": 5496 + }, + { + "epoch": 0.4689472786213957, + "grad_norm": 1.8778434809012778, + "learning_rate": 5.743259706311827e-05, + "loss": 0.2878, + "step": 5497 + }, + { + "epoch": 0.4690325882955127, + "grad_norm": 1.6260501695989378, + "learning_rate": 5.741893500192059e-05, + "loss": 0.2979, + "step": 5498 + }, + { + "epoch": 0.46911789796962977, + "grad_norm": 1.625873330994743, + "learning_rate": 5.740527237432665e-05, + "loss": 0.3123, + "step": 5499 + }, + { + "epoch": 0.4692032076437468, + "grad_norm": 1.7174765290679141, + "learning_rate": 5.7391609181379514e-05, + "loss": 0.2889, + "step": 5500 + }, + { + "epoch": 0.46928851731786386, + "grad_norm": 1.7373163524148156, + "learning_rate": 5.737794542412229e-05, + "loss": 0.2389, + "step": 5501 + }, + { + "epoch": 0.4693738269919809, + "grad_norm": 1.5642556055086503, + "learning_rate": 5.736428110359815e-05, + "loss": 0.2569, + "step": 5502 + }, + { + "epoch": 0.46945913666609795, + "grad_norm": 1.6825369861726271, + "learning_rate": 5.7350616220850285e-05, + "loss": 0.264, + "step": 5503 + }, + { + "epoch": 0.46954444634021497, + "grad_norm": 1.8970185930932526, + "learning_rate": 5.733695077692193e-05, + "loss": 0.2996, + "step": 5504 + }, + { + "epoch": 0.46962975601433204, + "grad_norm": 1.35567110073833, + "learning_rate": 5.732328477285638e-05, + "loss": 0.2803, + "step": 5505 + }, + { + "epoch": 0.46971506568844906, + "grad_norm": 1.6020239188591037, + "learning_rate": 5.730961820969694e-05, + "loss": 0.3005, + "step": 5506 + }, + { + "epoch": 0.46980037536256614, + "grad_norm": 1.242826189632858, + "learning_rate": 5.7295951088486985e-05, + "loss": 0.2917, + "step": 5507 + }, + { + "epoch": 0.46988568503668315, + "grad_norm": 1.9112328613790208, + "learning_rate": 5.7282283410269955e-05, + "loss": 0.3362, + "step": 5508 + }, + { + "epoch": 0.4699709947108002, + "grad_norm": 1.4411203984850425, + "learning_rate": 5.726861517608927e-05, + "loss": 0.2726, + "step": 5509 + }, + { + "epoch": 0.47005630438491725, + "grad_norm": 1.3794836721058898, + "learning_rate": 5.725494638698845e-05, + "loss": 0.224, + "step": 5510 + }, + { + "epoch": 0.4701416140590343, + "grad_norm": 1.5968967861772172, + "learning_rate": 5.724127704401102e-05, + "loss": 0.2364, + "step": 5511 + }, + { + "epoch": 0.47022692373315134, + "grad_norm": 1.8172455848076587, + "learning_rate": 5.722760714820057e-05, + "loss": 0.2605, + "step": 5512 + }, + { + "epoch": 0.4703122334072684, + "grad_norm": 1.394992527685354, + "learning_rate": 5.721393670060072e-05, + "loss": 0.2819, + "step": 5513 + }, + { + "epoch": 0.4703975430813854, + "grad_norm": 1.7132559618320824, + "learning_rate": 5.720026570225514e-05, + "loss": 0.3039, + "step": 5514 + }, + { + "epoch": 0.4704828527555025, + "grad_norm": 1.6963064909870345, + "learning_rate": 5.718659415420754e-05, + "loss": 0.2493, + "step": 5515 + }, + { + "epoch": 0.4705681624296195, + "grad_norm": 1.7396528340540323, + "learning_rate": 5.717292205750167e-05, + "loss": 0.2663, + "step": 5516 + }, + { + "epoch": 0.47065347210373654, + "grad_norm": 1.682790755022504, + "learning_rate": 5.7159249413181303e-05, + "loss": 0.2976, + "step": 5517 + }, + { + "epoch": 0.4707387817778536, + "grad_norm": 1.176481323684068, + "learning_rate": 5.71455762222903e-05, + "loss": 0.1753, + "step": 5518 + }, + { + "epoch": 0.4708240914519706, + "grad_norm": 1.5359755797531094, + "learning_rate": 5.713190248587251e-05, + "loss": 0.2894, + "step": 5519 + }, + { + "epoch": 0.4709094011260877, + "grad_norm": 1.6019053022515382, + "learning_rate": 5.711822820497187e-05, + "loss": 0.2652, + "step": 5520 + }, + { + "epoch": 0.4709947108002047, + "grad_norm": 1.6396418941110502, + "learning_rate": 5.710455338063234e-05, + "loss": 0.3444, + "step": 5521 + }, + { + "epoch": 0.4710800204743218, + "grad_norm": 1.4436629496171078, + "learning_rate": 5.70908780138979e-05, + "loss": 0.2404, + "step": 5522 + }, + { + "epoch": 0.4711653301484388, + "grad_norm": 1.6983696580969743, + "learning_rate": 5.707720210581261e-05, + "loss": 0.3222, + "step": 5523 + }, + { + "epoch": 0.4712506398225559, + "grad_norm": 1.7352601962705727, + "learning_rate": 5.706352565742056e-05, + "loss": 0.2728, + "step": 5524 + }, + { + "epoch": 0.4713359494966729, + "grad_norm": 1.4806724549108385, + "learning_rate": 5.7049848669765846e-05, + "loss": 0.28, + "step": 5525 + }, + { + "epoch": 0.47142125917079, + "grad_norm": 1.5231717391389656, + "learning_rate": 5.703617114389266e-05, + "loss": 0.2253, + "step": 5526 + }, + { + "epoch": 0.471506568844907, + "grad_norm": 1.6187677448728772, + "learning_rate": 5.7022493080845194e-05, + "loss": 0.2269, + "step": 5527 + }, + { + "epoch": 0.47159187851902407, + "grad_norm": 1.6212065508100455, + "learning_rate": 5.700881448166769e-05, + "loss": 0.302, + "step": 5528 + }, + { + "epoch": 0.4716771881931411, + "grad_norm": 1.4671186762095996, + "learning_rate": 5.699513534740446e-05, + "loss": 0.218, + "step": 5529 + }, + { + "epoch": 0.47176249786725816, + "grad_norm": 1.36499999385177, + "learning_rate": 5.69814556790998e-05, + "loss": 0.2823, + "step": 5530 + }, + { + "epoch": 0.4718478075413752, + "grad_norm": 1.5973394028437404, + "learning_rate": 5.696777547779811e-05, + "loss": 0.3133, + "step": 5531 + }, + { + "epoch": 0.47193311721549225, + "grad_norm": 1.4546100862188767, + "learning_rate": 5.69540947445438e-05, + "loss": 0.2549, + "step": 5532 + }, + { + "epoch": 0.47201842688960927, + "grad_norm": 1.529292509771867, + "learning_rate": 5.694041348038128e-05, + "loss": 0.2456, + "step": 5533 + }, + { + "epoch": 0.47210373656372634, + "grad_norm": 1.6732938218612936, + "learning_rate": 5.6926731686355096e-05, + "loss": 0.3219, + "step": 5534 + }, + { + "epoch": 0.47218904623784336, + "grad_norm": 1.5122198018713262, + "learning_rate": 5.691304936350975e-05, + "loss": 0.2175, + "step": 5535 + }, + { + "epoch": 0.47227435591196043, + "grad_norm": 1.6295584783932038, + "learning_rate": 5.689936651288983e-05, + "loss": 0.309, + "step": 5536 + }, + { + "epoch": 0.47235966558607745, + "grad_norm": 1.7800429420434773, + "learning_rate": 5.688568313553994e-05, + "loss": 0.2896, + "step": 5537 + }, + { + "epoch": 0.4724449752601945, + "grad_norm": 1.4346615507171723, + "learning_rate": 5.68719992325047e-05, + "loss": 0.2442, + "step": 5538 + }, + { + "epoch": 0.47253028493431154, + "grad_norm": 1.510186732537592, + "learning_rate": 5.685831480482887e-05, + "loss": 0.2695, + "step": 5539 + }, + { + "epoch": 0.4726155946084286, + "grad_norm": 1.305405196646716, + "learning_rate": 5.684462985355714e-05, + "loss": 0.2733, + "step": 5540 + }, + { + "epoch": 0.47270090428254563, + "grad_norm": 1.2668324115296299, + "learning_rate": 5.683094437973429e-05, + "loss": 0.2396, + "step": 5541 + }, + { + "epoch": 0.4727862139566627, + "grad_norm": 1.511800916543242, + "learning_rate": 5.681725838440515e-05, + "loss": 0.275, + "step": 5542 + }, + { + "epoch": 0.4728715236307797, + "grad_norm": 1.706030999965243, + "learning_rate": 5.680357186861455e-05, + "loss": 0.2882, + "step": 5543 + }, + { + "epoch": 0.4729568333048968, + "grad_norm": 1.5852187458802143, + "learning_rate": 5.678988483340738e-05, + "loss": 0.2169, + "step": 5544 + }, + { + "epoch": 0.4730421429790138, + "grad_norm": 1.5858218897882548, + "learning_rate": 5.677619727982859e-05, + "loss": 0.2763, + "step": 5545 + }, + { + "epoch": 0.4731274526531309, + "grad_norm": 1.368055972418208, + "learning_rate": 5.6762509208923165e-05, + "loss": 0.2585, + "step": 5546 + }, + { + "epoch": 0.4732127623272479, + "grad_norm": 1.6164504135979463, + "learning_rate": 5.6748820621736084e-05, + "loss": 0.2801, + "step": 5547 + }, + { + "epoch": 0.473298072001365, + "grad_norm": 1.5914901718761396, + "learning_rate": 5.673513151931241e-05, + "loss": 0.2588, + "step": 5548 + }, + { + "epoch": 0.473383381675482, + "grad_norm": 1.3426949550927942, + "learning_rate": 5.6721441902697236e-05, + "loss": 0.2651, + "step": 5549 + }, + { + "epoch": 0.47346869134959907, + "grad_norm": 1.3252798684565943, + "learning_rate": 5.670775177293569e-05, + "loss": 0.2901, + "step": 5550 + }, + { + "epoch": 0.4735540010237161, + "grad_norm": 1.6285862596931213, + "learning_rate": 5.669406113107295e-05, + "loss": 0.25, + "step": 5551 + }, + { + "epoch": 0.47363931069783316, + "grad_norm": 1.4888622521552974, + "learning_rate": 5.66803699781542e-05, + "loss": 0.2377, + "step": 5552 + }, + { + "epoch": 0.4737246203719502, + "grad_norm": 1.6937511655233393, + "learning_rate": 5.666667831522471e-05, + "loss": 0.243, + "step": 5553 + }, + { + "epoch": 0.47380993004606725, + "grad_norm": 1.5452733851651954, + "learning_rate": 5.665298614332975e-05, + "loss": 0.2474, + "step": 5554 + }, + { + "epoch": 0.47389523972018427, + "grad_norm": 1.7047675375326532, + "learning_rate": 5.663929346351466e-05, + "loss": 0.2946, + "step": 5555 + }, + { + "epoch": 0.4739805493943013, + "grad_norm": 1.5628392423474862, + "learning_rate": 5.6625600276824796e-05, + "loss": 0.2503, + "step": 5556 + }, + { + "epoch": 0.47406585906841836, + "grad_norm": 1.840042395103327, + "learning_rate": 5.661190658430556e-05, + "loss": 0.2536, + "step": 5557 + }, + { + "epoch": 0.4741511687425354, + "grad_norm": 1.8643318589194051, + "learning_rate": 5.659821238700239e-05, + "loss": 0.3101, + "step": 5558 + }, + { + "epoch": 0.47423647841665245, + "grad_norm": 1.5277153792306157, + "learning_rate": 5.658451768596077e-05, + "loss": 0.2187, + "step": 5559 + }, + { + "epoch": 0.47432178809076947, + "grad_norm": 1.569001974713631, + "learning_rate": 5.65708224822262e-05, + "loss": 0.2482, + "step": 5560 + }, + { + "epoch": 0.47440709776488654, + "grad_norm": 1.5192380489891832, + "learning_rate": 5.655712677684426e-05, + "loss": 0.2747, + "step": 5561 + }, + { + "epoch": 0.47449240743900356, + "grad_norm": 1.5500718038601284, + "learning_rate": 5.654343057086053e-05, + "loss": 0.3039, + "step": 5562 + }, + { + "epoch": 0.47457771711312063, + "grad_norm": 1.261599980955783, + "learning_rate": 5.652973386532066e-05, + "loss": 0.2765, + "step": 5563 + }, + { + "epoch": 0.47466302678723765, + "grad_norm": 1.605151216983099, + "learning_rate": 5.651603666127031e-05, + "loss": 0.3013, + "step": 5564 + }, + { + "epoch": 0.4747483364613547, + "grad_norm": 1.5922455512218403, + "learning_rate": 5.6502338959755164e-05, + "loss": 0.251, + "step": 5565 + }, + { + "epoch": 0.47483364613547174, + "grad_norm": 1.5450241884480012, + "learning_rate": 5.648864076182101e-05, + "loss": 0.2289, + "step": 5566 + }, + { + "epoch": 0.4749189558095888, + "grad_norm": 1.5300672205949595, + "learning_rate": 5.647494206851363e-05, + "loss": 0.2693, + "step": 5567 + }, + { + "epoch": 0.47500426548370583, + "grad_norm": 1.5654443084548026, + "learning_rate": 5.646124288087881e-05, + "loss": 0.2481, + "step": 5568 + }, + { + "epoch": 0.4750895751578229, + "grad_norm": 1.5116847623410012, + "learning_rate": 5.644754319996244e-05, + "loss": 0.2555, + "step": 5569 + }, + { + "epoch": 0.4751748848319399, + "grad_norm": 1.8098382808033244, + "learning_rate": 5.643384302681039e-05, + "loss": 0.2488, + "step": 5570 + }, + { + "epoch": 0.475260194506057, + "grad_norm": 1.405884250125079, + "learning_rate": 5.6420142362468634e-05, + "loss": 0.2641, + "step": 5571 + }, + { + "epoch": 0.475345504180174, + "grad_norm": 1.8053587618792666, + "learning_rate": 5.640644120798312e-05, + "loss": 0.2174, + "step": 5572 + }, + { + "epoch": 0.4754308138542911, + "grad_norm": 1.3934592178886849, + "learning_rate": 5.6392739564399845e-05, + "loss": 0.2471, + "step": 5573 + }, + { + "epoch": 0.4755161235284081, + "grad_norm": 1.8380237170361016, + "learning_rate": 5.637903743276489e-05, + "loss": 0.2468, + "step": 5574 + }, + { + "epoch": 0.4756014332025252, + "grad_norm": 1.3564453125, + "learning_rate": 5.636533481412433e-05, + "loss": 0.2264, + "step": 5575 + }, + { + "epoch": 0.4756867428766422, + "grad_norm": 1.253481118454482, + "learning_rate": 5.635163170952428e-05, + "loss": 0.1879, + "step": 5576 + }, + { + "epoch": 0.4757720525507593, + "grad_norm": 1.5768013114509567, + "learning_rate": 5.6337928120010906e-05, + "loss": 0.2881, + "step": 5577 + }, + { + "epoch": 0.4758573622248763, + "grad_norm": 1.3396850580932202, + "learning_rate": 5.6324224046630395e-05, + "loss": 0.2252, + "step": 5578 + }, + { + "epoch": 0.47594267189899336, + "grad_norm": 1.6384632946299809, + "learning_rate": 5.631051949042898e-05, + "loss": 0.2397, + "step": 5579 + }, + { + "epoch": 0.4760279815731104, + "grad_norm": 1.6591269499877148, + "learning_rate": 5.629681445245295e-05, + "loss": 0.2899, + "step": 5580 + }, + { + "epoch": 0.47611329124722745, + "grad_norm": 1.4603688158294363, + "learning_rate": 5.628310893374859e-05, + "loss": 0.2552, + "step": 5581 + }, + { + "epoch": 0.4761986009213445, + "grad_norm": 1.6287639015992188, + "learning_rate": 5.626940293536225e-05, + "loss": 0.2949, + "step": 5582 + }, + { + "epoch": 0.47628391059546155, + "grad_norm": 1.8493938380884087, + "learning_rate": 5.62556964583403e-05, + "loss": 0.2585, + "step": 5583 + }, + { + "epoch": 0.47636922026957856, + "grad_norm": 1.5574783502485419, + "learning_rate": 5.624198950372918e-05, + "loss": 0.2455, + "step": 5584 + }, + { + "epoch": 0.47645452994369564, + "grad_norm": 1.2522607386747948, + "learning_rate": 5.622828207257533e-05, + "loss": 0.3167, + "step": 5585 + }, + { + "epoch": 0.47653983961781265, + "grad_norm": 1.7537191924191784, + "learning_rate": 5.621457416592524e-05, + "loss": 0.218, + "step": 5586 + }, + { + "epoch": 0.47662514929192973, + "grad_norm": 1.5087280656157234, + "learning_rate": 5.620086578482544e-05, + "loss": 0.2952, + "step": 5587 + }, + { + "epoch": 0.47671045896604675, + "grad_norm": 1.1528673954232784, + "learning_rate": 5.618715693032248e-05, + "loss": 0.1821, + "step": 5588 + }, + { + "epoch": 0.4767957686401638, + "grad_norm": 1.512739127936292, + "learning_rate": 5.617344760346298e-05, + "loss": 0.3048, + "step": 5589 + }, + { + "epoch": 0.47688107831428084, + "grad_norm": 1.411693802039434, + "learning_rate": 5.615973780529357e-05, + "loss": 0.2603, + "step": 5590 + }, + { + "epoch": 0.4769663879883979, + "grad_norm": 1.758329120588755, + "learning_rate": 5.614602753686088e-05, + "loss": 0.264, + "step": 5591 + }, + { + "epoch": 0.47705169766251493, + "grad_norm": 1.3446274044181687, + "learning_rate": 5.613231679921167e-05, + "loss": 0.2235, + "step": 5592 + }, + { + "epoch": 0.47713700733663195, + "grad_norm": 1.654104750825316, + "learning_rate": 5.611860559339265e-05, + "loss": 0.2546, + "step": 5593 + }, + { + "epoch": 0.477222317010749, + "grad_norm": 1.7154719216681986, + "learning_rate": 5.6104893920450605e-05, + "loss": 0.2734, + "step": 5594 + }, + { + "epoch": 0.47730762668486604, + "grad_norm": 1.2801473571475914, + "learning_rate": 5.609118178143236e-05, + "loss": 0.2455, + "step": 5595 + }, + { + "epoch": 0.4773929363589831, + "grad_norm": 1.568349949866605, + "learning_rate": 5.6077469177384754e-05, + "loss": 0.263, + "step": 5596 + }, + { + "epoch": 0.47747824603310013, + "grad_norm": 1.7075457114137187, + "learning_rate": 5.606375610935466e-05, + "loss": 0.239, + "step": 5597 + }, + { + "epoch": 0.4775635557072172, + "grad_norm": 1.821269295511029, + "learning_rate": 5.6050042578389016e-05, + "loss": 0.2129, + "step": 5598 + }, + { + "epoch": 0.4776488653813342, + "grad_norm": 1.243244992578802, + "learning_rate": 5.603632858553478e-05, + "loss": 0.2553, + "step": 5599 + }, + { + "epoch": 0.4777341750554513, + "grad_norm": 1.4662316016614474, + "learning_rate": 5.602261413183892e-05, + "loss": 0.2604, + "step": 5600 + }, + { + "epoch": 0.4778194847295683, + "grad_norm": 1.73531013915899, + "learning_rate": 5.60088992183485e-05, + "loss": 0.3607, + "step": 5601 + }, + { + "epoch": 0.4779047944036854, + "grad_norm": 1.7108369562575525, + "learning_rate": 5.5995183846110524e-05, + "loss": 0.2043, + "step": 5602 + }, + { + "epoch": 0.4779901040778024, + "grad_norm": 1.8648437730128362, + "learning_rate": 5.5981468016172134e-05, + "loss": 0.2397, + "step": 5603 + }, + { + "epoch": 0.4780754137519195, + "grad_norm": 1.9770667597642466, + "learning_rate": 5.5967751729580454e-05, + "loss": 0.3257, + "step": 5604 + }, + { + "epoch": 0.4781607234260365, + "grad_norm": 1.5253150520701944, + "learning_rate": 5.595403498738262e-05, + "loss": 0.2791, + "step": 5605 + }, + { + "epoch": 0.47824603310015357, + "grad_norm": 1.5547120365526095, + "learning_rate": 5.5940317790625876e-05, + "loss": 0.2613, + "step": 5606 + }, + { + "epoch": 0.4783313427742706, + "grad_norm": 1.5081721059854485, + "learning_rate": 5.592660014035742e-05, + "loss": 0.2976, + "step": 5607 + }, + { + "epoch": 0.47841665244838766, + "grad_norm": 1.59685990835923, + "learning_rate": 5.591288203762455e-05, + "loss": 0.2434, + "step": 5608 + }, + { + "epoch": 0.4785019621225047, + "grad_norm": 1.499400336881134, + "learning_rate": 5.589916348347455e-05, + "loss": 0.261, + "step": 5609 + }, + { + "epoch": 0.47858727179662175, + "grad_norm": 1.3891813521284981, + "learning_rate": 5.588544447895477e-05, + "loss": 0.2042, + "step": 5610 + }, + { + "epoch": 0.47867258147073877, + "grad_norm": 1.4097810283019168, + "learning_rate": 5.5871725025112586e-05, + "loss": 0.2178, + "step": 5611 + }, + { + "epoch": 0.47875789114485584, + "grad_norm": 1.454632705000654, + "learning_rate": 5.585800512299539e-05, + "loss": 0.2745, + "step": 5612 + }, + { + "epoch": 0.47884320081897286, + "grad_norm": 1.796073535807098, + "learning_rate": 5.584428477365063e-05, + "loss": 0.2708, + "step": 5613 + }, + { + "epoch": 0.47892851049308993, + "grad_norm": 1.485017657447927, + "learning_rate": 5.583056397812578e-05, + "loss": 0.2349, + "step": 5614 + }, + { + "epoch": 0.47901382016720695, + "grad_norm": 1.512636206933163, + "learning_rate": 5.5816842737468353e-05, + "loss": 0.2314, + "step": 5615 + }, + { + "epoch": 0.479099129841324, + "grad_norm": 1.783726961004454, + "learning_rate": 5.5803121052725916e-05, + "loss": 0.2887, + "step": 5616 + }, + { + "epoch": 0.47918443951544104, + "grad_norm": 1.6593847299638964, + "learning_rate": 5.578939892494601e-05, + "loss": 0.2363, + "step": 5617 + }, + { + "epoch": 0.4792697491895581, + "grad_norm": 1.5661765522339217, + "learning_rate": 5.577567635517625e-05, + "loss": 0.2417, + "step": 5618 + }, + { + "epoch": 0.47935505886367513, + "grad_norm": 1.5610627239657981, + "learning_rate": 5.57619533444643e-05, + "loss": 0.2737, + "step": 5619 + }, + { + "epoch": 0.4794403685377922, + "grad_norm": 1.4843165335936315, + "learning_rate": 5.574822989385784e-05, + "loss": 0.2566, + "step": 5620 + }, + { + "epoch": 0.4795256782119092, + "grad_norm": 1.6623557924525916, + "learning_rate": 5.5734506004404574e-05, + "loss": 0.2629, + "step": 5621 + }, + { + "epoch": 0.4796109878860263, + "grad_norm": 1.4712305427762022, + "learning_rate": 5.572078167715225e-05, + "loss": 0.2708, + "step": 5622 + }, + { + "epoch": 0.4796962975601433, + "grad_norm": 1.479334253121483, + "learning_rate": 5.5707056913148626e-05, + "loss": 0.262, + "step": 5623 + }, + { + "epoch": 0.4797816072342604, + "grad_norm": 1.4967822689249888, + "learning_rate": 5.569333171344154e-05, + "loss": 0.2227, + "step": 5624 + }, + { + "epoch": 0.4798669169083774, + "grad_norm": 2.0454733905502107, + "learning_rate": 5.567960607907885e-05, + "loss": 0.2535, + "step": 5625 + }, + { + "epoch": 0.4799522265824945, + "grad_norm": 1.433190395561741, + "learning_rate": 5.5665880011108394e-05, + "loss": 0.1972, + "step": 5626 + }, + { + "epoch": 0.4800375362566115, + "grad_norm": 1.1983741992212837, + "learning_rate": 5.565215351057812e-05, + "loss": 0.2702, + "step": 5627 + }, + { + "epoch": 0.48012284593072857, + "grad_norm": 1.305565772568966, + "learning_rate": 5.5638426578535955e-05, + "loss": 0.208, + "step": 5628 + }, + { + "epoch": 0.4802081556048456, + "grad_norm": 1.3223022275560743, + "learning_rate": 5.5624699216029885e-05, + "loss": 0.2351, + "step": 5629 + }, + { + "epoch": 0.48029346527896266, + "grad_norm": 1.549720074387617, + "learning_rate": 5.561097142410791e-05, + "loss": 0.2313, + "step": 5630 + }, + { + "epoch": 0.4803787749530797, + "grad_norm": 1.6540403200929426, + "learning_rate": 5.5597243203818104e-05, + "loss": 0.3239, + "step": 5631 + }, + { + "epoch": 0.4804640846271967, + "grad_norm": 1.5565671053842982, + "learning_rate": 5.5583514556208514e-05, + "loss": 0.2515, + "step": 5632 + }, + { + "epoch": 0.48054939430131377, + "grad_norm": 1.9285609860616875, + "learning_rate": 5.556978548232726e-05, + "loss": 0.2552, + "step": 5633 + }, + { + "epoch": 0.4806347039754308, + "grad_norm": 1.4753256163592798, + "learning_rate": 5.5556055983222474e-05, + "loss": 0.2351, + "step": 5634 + }, + { + "epoch": 0.48072001364954786, + "grad_norm": 1.5586192958334895, + "learning_rate": 5.554232605994235e-05, + "loss": 0.2666, + "step": 5635 + }, + { + "epoch": 0.4808053233236649, + "grad_norm": 1.748277565848846, + "learning_rate": 5.552859571353507e-05, + "loss": 0.2438, + "step": 5636 + }, + { + "epoch": 0.48089063299778195, + "grad_norm": 1.3815650269994986, + "learning_rate": 5.5514864945048904e-05, + "loss": 0.2311, + "step": 5637 + }, + { + "epoch": 0.48097594267189897, + "grad_norm": 1.8124323865183727, + "learning_rate": 5.55011337555321e-05, + "loss": 0.3055, + "step": 5638 + }, + { + "epoch": 0.48106125234601604, + "grad_norm": 1.380511422137322, + "learning_rate": 5.548740214603295e-05, + "loss": 0.23, + "step": 5639 + }, + { + "epoch": 0.48114656202013306, + "grad_norm": 1.6308150782251356, + "learning_rate": 5.547367011759982e-05, + "loss": 0.2714, + "step": 5640 + }, + { + "epoch": 0.48123187169425014, + "grad_norm": 1.7181407889164177, + "learning_rate": 5.545993767128107e-05, + "loss": 0.2519, + "step": 5641 + }, + { + "epoch": 0.48131718136836715, + "grad_norm": 1.4727885948230635, + "learning_rate": 5.544620480812508e-05, + "loss": 0.2735, + "step": 5642 + }, + { + "epoch": 0.4814024910424842, + "grad_norm": 1.5702847150935078, + "learning_rate": 5.5432471529180306e-05, + "loss": 0.2382, + "step": 5643 + }, + { + "epoch": 0.48148780071660124, + "grad_norm": 1.5081236523188652, + "learning_rate": 5.541873783549518e-05, + "loss": 0.2758, + "step": 5644 + }, + { + "epoch": 0.4815731103907183, + "grad_norm": 1.7802512062346159, + "learning_rate": 5.540500372811823e-05, + "loss": 0.2726, + "step": 5645 + }, + { + "epoch": 0.48165842006483534, + "grad_norm": 1.4389718854542244, + "learning_rate": 5.539126920809797e-05, + "loss": 0.2778, + "step": 5646 + }, + { + "epoch": 0.4817437297389524, + "grad_norm": 1.6506322920788763, + "learning_rate": 5.537753427648295e-05, + "loss": 0.2461, + "step": 5647 + }, + { + "epoch": 0.4818290394130694, + "grad_norm": 1.8618351665024866, + "learning_rate": 5.536379893432177e-05, + "loss": 0.2878, + "step": 5648 + }, + { + "epoch": 0.4819143490871865, + "grad_norm": 1.5500855699114973, + "learning_rate": 5.535006318266304e-05, + "loss": 0.2383, + "step": 5649 + }, + { + "epoch": 0.4819996587613035, + "grad_norm": 1.2447589194274535, + "learning_rate": 5.53363270225554e-05, + "loss": 0.1982, + "step": 5650 + }, + { + "epoch": 0.4820849684354206, + "grad_norm": 1.8089464803510396, + "learning_rate": 5.5322590455047564e-05, + "loss": 0.2187, + "step": 5651 + }, + { + "epoch": 0.4821702781095376, + "grad_norm": 2.0706778725719817, + "learning_rate": 5.5308853481188236e-05, + "loss": 0.3068, + "step": 5652 + }, + { + "epoch": 0.4822555877836547, + "grad_norm": 1.7008415803735837, + "learning_rate": 5.529511610202616e-05, + "loss": 0.2567, + "step": 5653 + }, + { + "epoch": 0.4823408974577717, + "grad_norm": 1.5031201971219232, + "learning_rate": 5.52813783186101e-05, + "loss": 0.2274, + "step": 5654 + }, + { + "epoch": 0.4824262071318888, + "grad_norm": 1.4006530430721453, + "learning_rate": 5.5267640131988864e-05, + "loss": 0.2383, + "step": 5655 + }, + { + "epoch": 0.4825115168060058, + "grad_norm": 1.735655372142338, + "learning_rate": 5.5253901543211295e-05, + "loss": 0.2509, + "step": 5656 + }, + { + "epoch": 0.48259682648012286, + "grad_norm": 1.6988853390572514, + "learning_rate": 5.524016255332627e-05, + "loss": 0.2275, + "step": 5657 + }, + { + "epoch": 0.4826821361542399, + "grad_norm": 1.9045664122147006, + "learning_rate": 5.522642316338268e-05, + "loss": 0.2844, + "step": 5658 + }, + { + "epoch": 0.48276744582835696, + "grad_norm": 1.4972134774887889, + "learning_rate": 5.521268337442945e-05, + "loss": 0.2206, + "step": 5659 + }, + { + "epoch": 0.482852755502474, + "grad_norm": 1.58512068135612, + "learning_rate": 5.519894318751554e-05, + "loss": 0.2574, + "step": 5660 + }, + { + "epoch": 0.48293806517659105, + "grad_norm": 1.347165446816573, + "learning_rate": 5.518520260368996e-05, + "loss": 0.2104, + "step": 5661 + }, + { + "epoch": 0.48302337485070806, + "grad_norm": 1.5991833390795898, + "learning_rate": 5.517146162400171e-05, + "loss": 0.2723, + "step": 5662 + }, + { + "epoch": 0.48310868452482514, + "grad_norm": 1.9771909656979638, + "learning_rate": 5.5157720249499847e-05, + "loss": 0.3039, + "step": 5663 + }, + { + "epoch": 0.48319399419894216, + "grad_norm": 1.7719119732479183, + "learning_rate": 5.514397848123345e-05, + "loss": 0.2525, + "step": 5664 + }, + { + "epoch": 0.48327930387305923, + "grad_norm": 1.6739399359625198, + "learning_rate": 5.5130236320251625e-05, + "loss": 0.27, + "step": 5665 + }, + { + "epoch": 0.48336461354717625, + "grad_norm": 1.8147219488527573, + "learning_rate": 5.5116493767603524e-05, + "loss": 0.238, + "step": 5666 + }, + { + "epoch": 0.4834499232212933, + "grad_norm": 1.528729758701773, + "learning_rate": 5.510275082433831e-05, + "loss": 0.286, + "step": 5667 + }, + { + "epoch": 0.48353523289541034, + "grad_norm": 1.580056170298068, + "learning_rate": 5.5089007491505186e-05, + "loss": 0.2811, + "step": 5668 + }, + { + "epoch": 0.4836205425695274, + "grad_norm": 1.7040249607828006, + "learning_rate": 5.507526377015339e-05, + "loss": 0.2175, + "step": 5669 + }, + { + "epoch": 0.48370585224364443, + "grad_norm": 1.5282022936410995, + "learning_rate": 5.5061519661332175e-05, + "loss": 0.2227, + "step": 5670 + }, + { + "epoch": 0.48379116191776145, + "grad_norm": 1.6785671312340917, + "learning_rate": 5.504777516609082e-05, + "loss": 0.2594, + "step": 5671 + }, + { + "epoch": 0.4838764715918785, + "grad_norm": 1.325143316903214, + "learning_rate": 5.503403028547867e-05, + "loss": 0.2314, + "step": 5672 + }, + { + "epoch": 0.48396178126599554, + "grad_norm": 1.5712248775443345, + "learning_rate": 5.5020285020545046e-05, + "loss": 0.3149, + "step": 5673 + }, + { + "epoch": 0.4840470909401126, + "grad_norm": 1.7620532612712478, + "learning_rate": 5.500653937233935e-05, + "loss": 0.2539, + "step": 5674 + }, + { + "epoch": 0.48413240061422963, + "grad_norm": 1.5699642421513653, + "learning_rate": 5.499279334191096e-05, + "loss": 0.2768, + "step": 5675 + }, + { + "epoch": 0.4842177102883467, + "grad_norm": 1.5814277910842771, + "learning_rate": 5.497904693030934e-05, + "loss": 0.2763, + "step": 5676 + }, + { + "epoch": 0.4843030199624637, + "grad_norm": 1.6796111333475783, + "learning_rate": 5.4965300138583955e-05, + "loss": 0.2967, + "step": 5677 + }, + { + "epoch": 0.4843883296365808, + "grad_norm": 1.4685107097085839, + "learning_rate": 5.495155296778428e-05, + "loss": 0.2271, + "step": 5678 + }, + { + "epoch": 0.4844736393106978, + "grad_norm": 1.7699973058949459, + "learning_rate": 5.4937805418959844e-05, + "loss": 0.2771, + "step": 5679 + }, + { + "epoch": 0.4845589489848149, + "grad_norm": 1.6935787212860438, + "learning_rate": 5.492405749316021e-05, + "loss": 0.3059, + "step": 5680 + }, + { + "epoch": 0.4846442586589319, + "grad_norm": 1.3703309026371964, + "learning_rate": 5.491030919143494e-05, + "loss": 0.2693, + "step": 5681 + }, + { + "epoch": 0.484729568333049, + "grad_norm": 1.5712383823981926, + "learning_rate": 5.4896560514833675e-05, + "loss": 0.2063, + "step": 5682 + }, + { + "epoch": 0.484814878007166, + "grad_norm": 1.599912658930116, + "learning_rate": 5.4882811464406026e-05, + "loss": 0.2361, + "step": 5683 + }, + { + "epoch": 0.48490018768128307, + "grad_norm": 1.7740704587428957, + "learning_rate": 5.486906204120168e-05, + "loss": 0.3197, + "step": 5684 + }, + { + "epoch": 0.4849854973554001, + "grad_norm": 1.4158921181320978, + "learning_rate": 5.4855312246270306e-05, + "loss": 0.2786, + "step": 5685 + }, + { + "epoch": 0.48507080702951716, + "grad_norm": 1.5856724409525247, + "learning_rate": 5.4841562080661656e-05, + "loss": 0.2273, + "step": 5686 + }, + { + "epoch": 0.4851561167036342, + "grad_norm": 1.4328558163350817, + "learning_rate": 5.4827811545425454e-05, + "loss": 0.2699, + "step": 5687 + }, + { + "epoch": 0.48524142637775125, + "grad_norm": 1.5479991950740248, + "learning_rate": 5.481406064161151e-05, + "loss": 0.287, + "step": 5688 + }, + { + "epoch": 0.48532673605186827, + "grad_norm": 1.8675872502505964, + "learning_rate": 5.4800309370269607e-05, + "loss": 0.3337, + "step": 5689 + }, + { + "epoch": 0.48541204572598534, + "grad_norm": 1.1728641150419816, + "learning_rate": 5.4786557732449594e-05, + "loss": 0.2716, + "step": 5690 + }, + { + "epoch": 0.48549735540010236, + "grad_norm": 1.6281048383581012, + "learning_rate": 5.4772805729201346e-05, + "loss": 0.2218, + "step": 5691 + }, + { + "epoch": 0.48558266507421943, + "grad_norm": 1.9505223137137309, + "learning_rate": 5.475905336157473e-05, + "loss": 0.2504, + "step": 5692 + }, + { + "epoch": 0.48566797474833645, + "grad_norm": 1.4840665798006671, + "learning_rate": 5.474530063061968e-05, + "loss": 0.2431, + "step": 5693 + }, + { + "epoch": 0.4857532844224535, + "grad_norm": 1.5397172493651403, + "learning_rate": 5.473154753738616e-05, + "loss": 0.2317, + "step": 5694 + }, + { + "epoch": 0.48583859409657054, + "grad_norm": 1.3791150159278205, + "learning_rate": 5.471779408292411e-05, + "loss": 0.2463, + "step": 5695 + }, + { + "epoch": 0.4859239037706876, + "grad_norm": 1.746608649678349, + "learning_rate": 5.4704040268283564e-05, + "loss": 0.2391, + "step": 5696 + }, + { + "epoch": 0.48600921344480463, + "grad_norm": 1.367807433040752, + "learning_rate": 5.4690286094514534e-05, + "loss": 0.1893, + "step": 5697 + }, + { + "epoch": 0.4860945231189217, + "grad_norm": 1.7683232038509216, + "learning_rate": 5.4676531562667076e-05, + "loss": 0.3157, + "step": 5698 + }, + { + "epoch": 0.4861798327930387, + "grad_norm": 2.157574233173745, + "learning_rate": 5.4662776673791307e-05, + "loss": 0.2823, + "step": 5699 + }, + { + "epoch": 0.4862651424671558, + "grad_norm": 1.7209687217427982, + "learning_rate": 5.46490214289373e-05, + "loss": 0.2762, + "step": 5700 + }, + { + "epoch": 0.4863504521412728, + "grad_norm": 1.2170990741621217, + "learning_rate": 5.463526582915521e-05, + "loss": 0.241, + "step": 5701 + }, + { + "epoch": 0.4864357618153899, + "grad_norm": 1.6927566599671913, + "learning_rate": 5.4621509875495216e-05, + "loss": 0.2761, + "step": 5702 + }, + { + "epoch": 0.4865210714895069, + "grad_norm": 1.611271749724031, + "learning_rate": 5.460775356900749e-05, + "loss": 0.2925, + "step": 5703 + }, + { + "epoch": 0.486606381163624, + "grad_norm": 1.9202289510357227, + "learning_rate": 5.4593996910742275e-05, + "loss": 0.2688, + "step": 5704 + }, + { + "epoch": 0.486691690837741, + "grad_norm": 1.4995023378695498, + "learning_rate": 5.458023990174981e-05, + "loss": 0.2565, + "step": 5705 + }, + { + "epoch": 0.48677700051185807, + "grad_norm": 1.7232781150624026, + "learning_rate": 5.456648254308037e-05, + "loss": 0.3275, + "step": 5706 + }, + { + "epoch": 0.4868623101859751, + "grad_norm": 1.5139997754720422, + "learning_rate": 5.4552724835784244e-05, + "loss": 0.2853, + "step": 5707 + }, + { + "epoch": 0.4869476198600921, + "grad_norm": 1.6474067394158105, + "learning_rate": 5.4538966780911774e-05, + "loss": 0.2274, + "step": 5708 + }, + { + "epoch": 0.4870329295342092, + "grad_norm": 1.5242384150188224, + "learning_rate": 5.45252083795133e-05, + "loss": 0.2756, + "step": 5709 + }, + { + "epoch": 0.4871182392083262, + "grad_norm": 1.4986092159780533, + "learning_rate": 5.451144963263922e-05, + "loss": 0.3001, + "step": 5710 + }, + { + "epoch": 0.48720354888244327, + "grad_norm": 1.6196593039382512, + "learning_rate": 5.449769054133994e-05, + "loss": 0.2335, + "step": 5711 + }, + { + "epoch": 0.4872888585565603, + "grad_norm": 1.5543432573452567, + "learning_rate": 5.448393110666588e-05, + "loss": 0.3022, + "step": 5712 + }, + { + "epoch": 0.48737416823067736, + "grad_norm": 1.5800794830127818, + "learning_rate": 5.4470171329667506e-05, + "loss": 0.2472, + "step": 5713 + }, + { + "epoch": 0.4874594779047944, + "grad_norm": 1.4116277229419965, + "learning_rate": 5.445641121139532e-05, + "loss": 0.2168, + "step": 5714 + }, + { + "epoch": 0.48754478757891145, + "grad_norm": 1.6505376806710417, + "learning_rate": 5.4442650752899814e-05, + "loss": 0.2412, + "step": 5715 + }, + { + "epoch": 0.48763009725302847, + "grad_norm": 1.444314230445612, + "learning_rate": 5.442888995523153e-05, + "loss": 0.2832, + "step": 5716 + }, + { + "epoch": 0.48771540692714554, + "grad_norm": 1.6026988394412374, + "learning_rate": 5.441512881944104e-05, + "loss": 0.2675, + "step": 5717 + }, + { + "epoch": 0.48780071660126256, + "grad_norm": 1.8572372842194989, + "learning_rate": 5.440136734657891e-05, + "loss": 0.2853, + "step": 5718 + }, + { + "epoch": 0.48788602627537964, + "grad_norm": 1.9609094746931293, + "learning_rate": 5.4387605537695784e-05, + "loss": 0.3093, + "step": 5719 + }, + { + "epoch": 0.48797133594949665, + "grad_norm": 1.598032713468097, + "learning_rate": 5.437384339384228e-05, + "loss": 0.2204, + "step": 5720 + }, + { + "epoch": 0.4880566456236137, + "grad_norm": 1.763386548049981, + "learning_rate": 5.436008091606908e-05, + "loss": 0.2617, + "step": 5721 + }, + { + "epoch": 0.48814195529773075, + "grad_norm": 1.52629017894949, + "learning_rate": 5.434631810542687e-05, + "loss": 0.3007, + "step": 5722 + }, + { + "epoch": 0.4882272649718478, + "grad_norm": 1.3378955256240082, + "learning_rate": 5.4332554962966384e-05, + "loss": 0.2499, + "step": 5723 + }, + { + "epoch": 0.48831257464596484, + "grad_norm": 1.5072328708502332, + "learning_rate": 5.431879148973833e-05, + "loss": 0.2897, + "step": 5724 + }, + { + "epoch": 0.4883978843200819, + "grad_norm": 1.823687915228946, + "learning_rate": 5.430502768679351e-05, + "loss": 0.2847, + "step": 5725 + }, + { + "epoch": 0.4884831939941989, + "grad_norm": 1.3723158212847322, + "learning_rate": 5.42912635551827e-05, + "loss": 0.2816, + "step": 5726 + }, + { + "epoch": 0.488568503668316, + "grad_norm": 1.584948903412417, + "learning_rate": 5.427749909595672e-05, + "loss": 0.2484, + "step": 5727 + }, + { + "epoch": 0.488653813342433, + "grad_norm": 1.4621695153242593, + "learning_rate": 5.4263734310166416e-05, + "loss": 0.2682, + "step": 5728 + }, + { + "epoch": 0.4887391230165501, + "grad_norm": 1.2726704446224637, + "learning_rate": 5.424996919886265e-05, + "loss": 0.2675, + "step": 5729 + }, + { + "epoch": 0.4888244326906671, + "grad_norm": 1.669141013522363, + "learning_rate": 5.423620376309633e-05, + "loss": 0.2313, + "step": 5730 + }, + { + "epoch": 0.4889097423647842, + "grad_norm": 1.570228308702918, + "learning_rate": 5.422243800391835e-05, + "loss": 0.2547, + "step": 5731 + }, + { + "epoch": 0.4889950520389012, + "grad_norm": 1.6650577488183478, + "learning_rate": 5.420867192237966e-05, + "loss": 0.343, + "step": 5732 + }, + { + "epoch": 0.4890803617130183, + "grad_norm": 1.389359058695369, + "learning_rate": 5.4194905519531255e-05, + "loss": 0.2226, + "step": 5733 + }, + { + "epoch": 0.4891656713871353, + "grad_norm": 1.4736782746984212, + "learning_rate": 5.418113879642409e-05, + "loss": 0.2482, + "step": 5734 + }, + { + "epoch": 0.48925098106125237, + "grad_norm": 1.5026991718830414, + "learning_rate": 5.416737175410921e-05, + "loss": 0.2383, + "step": 5735 + }, + { + "epoch": 0.4893362907353694, + "grad_norm": 1.5280319965667402, + "learning_rate": 5.415360439363764e-05, + "loss": 0.2619, + "step": 5736 + }, + { + "epoch": 0.48942160040948646, + "grad_norm": 1.371764190300094, + "learning_rate": 5.4139836716060435e-05, + "loss": 0.2124, + "step": 5737 + }, + { + "epoch": 0.4895069100836035, + "grad_norm": 1.9054448663107009, + "learning_rate": 5.4126068722428704e-05, + "loss": 0.2554, + "step": 5738 + }, + { + "epoch": 0.48959221975772055, + "grad_norm": 1.6391061927928376, + "learning_rate": 5.411230041379356e-05, + "loss": 0.2114, + "step": 5739 + }, + { + "epoch": 0.48967752943183757, + "grad_norm": 1.7722357488665403, + "learning_rate": 5.4098531791206105e-05, + "loss": 0.2897, + "step": 5740 + }, + { + "epoch": 0.48976283910595464, + "grad_norm": 1.366874833458017, + "learning_rate": 5.408476285571755e-05, + "loss": 0.2788, + "step": 5741 + }, + { + "epoch": 0.48984814878007166, + "grad_norm": 2.07059531529384, + "learning_rate": 5.407099360837905e-05, + "loss": 0.2902, + "step": 5742 + }, + { + "epoch": 0.48993345845418873, + "grad_norm": 2.012440139164567, + "learning_rate": 5.405722405024183e-05, + "loss": 0.3213, + "step": 5743 + }, + { + "epoch": 0.49001876812830575, + "grad_norm": 1.9891171599951407, + "learning_rate": 5.40434541823571e-05, + "loss": 0.2998, + "step": 5744 + }, + { + "epoch": 0.4901040778024228, + "grad_norm": 1.7482673378129276, + "learning_rate": 5.402968400577614e-05, + "loss": 0.2137, + "step": 5745 + }, + { + "epoch": 0.49018938747653984, + "grad_norm": 1.2642693971515382, + "learning_rate": 5.401591352155021e-05, + "loss": 0.2852, + "step": 5746 + }, + { + "epoch": 0.49027469715065686, + "grad_norm": 1.4408727139146806, + "learning_rate": 5.400214273073065e-05, + "loss": 0.2563, + "step": 5747 + }, + { + "epoch": 0.49036000682477393, + "grad_norm": 1.4284707817318052, + "learning_rate": 5.398837163436873e-05, + "loss": 0.2212, + "step": 5748 + }, + { + "epoch": 0.49044531649889095, + "grad_norm": 1.6549439949171258, + "learning_rate": 5.397460023351585e-05, + "loss": 0.2889, + "step": 5749 + }, + { + "epoch": 0.490530626173008, + "grad_norm": 1.7485694487744832, + "learning_rate": 5.396082852922334e-05, + "loss": 0.2751, + "step": 5750 + }, + { + "epoch": 0.49061593584712504, + "grad_norm": 1.4793485968376676, + "learning_rate": 5.394705652254264e-05, + "loss": 0.2066, + "step": 5751 + }, + { + "epoch": 0.4907012455212421, + "grad_norm": 1.432783266724, + "learning_rate": 5.393328421452514e-05, + "loss": 0.2389, + "step": 5752 + }, + { + "epoch": 0.49078655519535913, + "grad_norm": 1.7292849795145842, + "learning_rate": 5.391951160622228e-05, + "loss": 0.3044, + "step": 5753 + }, + { + "epoch": 0.4908718648694762, + "grad_norm": 1.0897995587758813, + "learning_rate": 5.390573869868556e-05, + "loss": 0.1819, + "step": 5754 + }, + { + "epoch": 0.4909571745435932, + "grad_norm": 1.4903129266957051, + "learning_rate": 5.389196549296644e-05, + "loss": 0.2811, + "step": 5755 + }, + { + "epoch": 0.4910424842177103, + "grad_norm": 1.4548832910142353, + "learning_rate": 5.387819199011642e-05, + "loss": 0.2864, + "step": 5756 + }, + { + "epoch": 0.4911277938918273, + "grad_norm": 1.5708548145528698, + "learning_rate": 5.3864418191187074e-05, + "loss": 0.2944, + "step": 5757 + }, + { + "epoch": 0.4912131035659444, + "grad_norm": 1.6550879810607895, + "learning_rate": 5.385064409722992e-05, + "loss": 0.2642, + "step": 5758 + }, + { + "epoch": 0.4912984132400614, + "grad_norm": 1.6626854871945544, + "learning_rate": 5.383686970929657e-05, + "loss": 0.2833, + "step": 5759 + }, + { + "epoch": 0.4913837229141785, + "grad_norm": 1.595901010072619, + "learning_rate": 5.3823095028438585e-05, + "loss": 0.2842, + "step": 5760 + }, + { + "epoch": 0.4914690325882955, + "grad_norm": 1.4249305172761728, + "learning_rate": 5.380932005570761e-05, + "loss": 0.2405, + "step": 5761 + }, + { + "epoch": 0.49155434226241257, + "grad_norm": 1.6056317991452056, + "learning_rate": 5.3795544792155306e-05, + "loss": 0.2679, + "step": 5762 + }, + { + "epoch": 0.4916396519365296, + "grad_norm": 1.680322571394912, + "learning_rate": 5.3781769238833315e-05, + "loss": 0.2418, + "step": 5763 + }, + { + "epoch": 0.49172496161064666, + "grad_norm": 1.5647632229635284, + "learning_rate": 5.3767993396793335e-05, + "loss": 0.2336, + "step": 5764 + }, + { + "epoch": 0.4918102712847637, + "grad_norm": 1.3745623672395788, + "learning_rate": 5.37542172670871e-05, + "loss": 0.2986, + "step": 5765 + }, + { + "epoch": 0.49189558095888075, + "grad_norm": 1.565186594116451, + "learning_rate": 5.37404408507663e-05, + "loss": 0.2599, + "step": 5766 + }, + { + "epoch": 0.49198089063299777, + "grad_norm": 1.5078433335071981, + "learning_rate": 5.372666414888274e-05, + "loss": 0.306, + "step": 5767 + }, + { + "epoch": 0.49206620030711484, + "grad_norm": 1.7437483291891427, + "learning_rate": 5.371288716248819e-05, + "loss": 0.3229, + "step": 5768 + }, + { + "epoch": 0.49215150998123186, + "grad_norm": 1.4101122904765173, + "learning_rate": 5.36991098926344e-05, + "loss": 0.2186, + "step": 5769 + }, + { + "epoch": 0.49223681965534893, + "grad_norm": 1.885535143366659, + "learning_rate": 5.368533234037325e-05, + "loss": 0.2467, + "step": 5770 + }, + { + "epoch": 0.49232212932946595, + "grad_norm": 1.5550256102682323, + "learning_rate": 5.3671554506756546e-05, + "loss": 0.2411, + "step": 5771 + }, + { + "epoch": 0.492407439003583, + "grad_norm": 1.9742235057608986, + "learning_rate": 5.3657776392836175e-05, + "loss": 0.2796, + "step": 5772 + }, + { + "epoch": 0.49249274867770004, + "grad_norm": 1.4805098153007648, + "learning_rate": 5.364399799966402e-05, + "loss": 0.3091, + "step": 5773 + }, + { + "epoch": 0.4925780583518171, + "grad_norm": 1.3829703375613611, + "learning_rate": 5.3630219328291965e-05, + "loss": 0.2466, + "step": 5774 + }, + { + "epoch": 0.49266336802593413, + "grad_norm": 1.4342473590381677, + "learning_rate": 5.361644037977196e-05, + "loss": 0.2615, + "step": 5775 + }, + { + "epoch": 0.4927486777000512, + "grad_norm": 1.599843661299624, + "learning_rate": 5.3602661155155966e-05, + "loss": 0.2754, + "step": 5776 + }, + { + "epoch": 0.4928339873741682, + "grad_norm": 1.6288949064962115, + "learning_rate": 5.3588881655495914e-05, + "loss": 0.1851, + "step": 5777 + }, + { + "epoch": 0.4929192970482853, + "grad_norm": 1.504111694571282, + "learning_rate": 5.3575101881843824e-05, + "loss": 0.2367, + "step": 5778 + }, + { + "epoch": 0.4930046067224023, + "grad_norm": 1.853648173956069, + "learning_rate": 5.3561321835251724e-05, + "loss": 0.2741, + "step": 5779 + }, + { + "epoch": 0.4930899163965194, + "grad_norm": 1.6264504048677995, + "learning_rate": 5.3547541516771603e-05, + "loss": 0.2947, + "step": 5780 + }, + { + "epoch": 0.4931752260706364, + "grad_norm": 1.2634134160948929, + "learning_rate": 5.353376092745556e-05, + "loss": 0.2316, + "step": 5781 + }, + { + "epoch": 0.4932605357447535, + "grad_norm": 1.7786081365013373, + "learning_rate": 5.351998006835562e-05, + "loss": 0.3295, + "step": 5782 + }, + { + "epoch": 0.4933458454188705, + "grad_norm": 1.5509761044096022, + "learning_rate": 5.350619894052393e-05, + "loss": 0.2267, + "step": 5783 + }, + { + "epoch": 0.4934311550929876, + "grad_norm": 1.284527935195022, + "learning_rate": 5.349241754501257e-05, + "loss": 0.1828, + "step": 5784 + }, + { + "epoch": 0.4935164647671046, + "grad_norm": 1.5697577716059679, + "learning_rate": 5.34786358828737e-05, + "loss": 0.2593, + "step": 5785 + }, + { + "epoch": 0.4936017744412216, + "grad_norm": 1.4441979312025572, + "learning_rate": 5.3464853955159456e-05, + "loss": 0.2556, + "step": 5786 + }, + { + "epoch": 0.4936870841153387, + "grad_norm": 1.4285845228003802, + "learning_rate": 5.345107176292202e-05, + "loss": 0.2454, + "step": 5787 + }, + { + "epoch": 0.4937723937894557, + "grad_norm": 1.632375558313376, + "learning_rate": 5.343728930721361e-05, + "loss": 0.2439, + "step": 5788 + }, + { + "epoch": 0.4938577034635728, + "grad_norm": 1.273291924703737, + "learning_rate": 5.3423506589086435e-05, + "loss": 0.2199, + "step": 5789 + }, + { + "epoch": 0.4939430131376898, + "grad_norm": 1.4093390974052038, + "learning_rate": 5.34097236095927e-05, + "loss": 0.2595, + "step": 5790 + }, + { + "epoch": 0.49402832281180686, + "grad_norm": 1.3539540735285254, + "learning_rate": 5.3395940369784706e-05, + "loss": 0.2256, + "step": 5791 + }, + { + "epoch": 0.4941136324859239, + "grad_norm": 1.440170461314077, + "learning_rate": 5.338215687071469e-05, + "loss": 0.2939, + "step": 5792 + }, + { + "epoch": 0.49419894216004095, + "grad_norm": 1.4814564170748117, + "learning_rate": 5.336837311343498e-05, + "loss": 0.2195, + "step": 5793 + }, + { + "epoch": 0.494284251834158, + "grad_norm": 2.1083314928290533, + "learning_rate": 5.3354589098997886e-05, + "loss": 0.2644, + "step": 5794 + }, + { + "epoch": 0.49436956150827505, + "grad_norm": 1.7025054364670766, + "learning_rate": 5.3340804828455726e-05, + "loss": 0.28, + "step": 5795 + }, + { + "epoch": 0.49445487118239206, + "grad_norm": 1.4579582640295639, + "learning_rate": 5.332702030286089e-05, + "loss": 0.278, + "step": 5796 + }, + { + "epoch": 0.49454018085650914, + "grad_norm": 2.240157108587428, + "learning_rate": 5.331323552326573e-05, + "loss": 0.2523, + "step": 5797 + }, + { + "epoch": 0.49462549053062616, + "grad_norm": 1.393940220668922, + "learning_rate": 5.329945049072263e-05, + "loss": 0.2861, + "step": 5798 + }, + { + "epoch": 0.49471080020474323, + "grad_norm": 2.0239849749146903, + "learning_rate": 5.328566520628403e-05, + "loss": 0.305, + "step": 5799 + }, + { + "epoch": 0.49479610987886025, + "grad_norm": 1.4276718355563696, + "learning_rate": 5.327187967100237e-05, + "loss": 0.2454, + "step": 5800 + }, + { + "epoch": 0.4948814195529773, + "grad_norm": 2.0104950673788258, + "learning_rate": 5.325809388593005e-05, + "loss": 0.2343, + "step": 5801 + }, + { + "epoch": 0.49496672922709434, + "grad_norm": 1.6559796382555234, + "learning_rate": 5.324430785211959e-05, + "loss": 0.3067, + "step": 5802 + }, + { + "epoch": 0.4950520389012114, + "grad_norm": 1.4715160539628211, + "learning_rate": 5.323052157062346e-05, + "loss": 0.2458, + "step": 5803 + }, + { + "epoch": 0.49513734857532843, + "grad_norm": 1.7226136217443933, + "learning_rate": 5.321673504249418e-05, + "loss": 0.1927, + "step": 5804 + }, + { + "epoch": 0.4952226582494455, + "grad_norm": 1.700790765514048, + "learning_rate": 5.320294826878428e-05, + "loss": 0.2284, + "step": 5805 + }, + { + "epoch": 0.4953079679235625, + "grad_norm": 1.148330060643228, + "learning_rate": 5.318916125054628e-05, + "loss": 0.2253, + "step": 5806 + }, + { + "epoch": 0.4953932775976796, + "grad_norm": 1.3760498980329623, + "learning_rate": 5.317537398883279e-05, + "loss": 0.2271, + "step": 5807 + }, + { + "epoch": 0.4954785872717966, + "grad_norm": 1.5268583575234504, + "learning_rate": 5.3161586484696347e-05, + "loss": 0.2703, + "step": 5808 + }, + { + "epoch": 0.4955638969459137, + "grad_norm": 1.5459467096557562, + "learning_rate": 5.3147798739189594e-05, + "loss": 0.2962, + "step": 5809 + }, + { + "epoch": 0.4956492066200307, + "grad_norm": 1.3504202541800756, + "learning_rate": 5.313401075336513e-05, + "loss": 0.2832, + "step": 5810 + }, + { + "epoch": 0.4957345162941478, + "grad_norm": 1.4477893123832444, + "learning_rate": 5.31202225282756e-05, + "loss": 0.2384, + "step": 5811 + }, + { + "epoch": 0.4958198259682648, + "grad_norm": 1.3453299526806017, + "learning_rate": 5.3106434064973665e-05, + "loss": 0.2529, + "step": 5812 + }, + { + "epoch": 0.49590513564238187, + "grad_norm": 1.4590746584818914, + "learning_rate": 5.309264536451199e-05, + "loss": 0.2667, + "step": 5813 + }, + { + "epoch": 0.4959904453164989, + "grad_norm": 1.3362922420551417, + "learning_rate": 5.307885642794327e-05, + "loss": 0.2438, + "step": 5814 + }, + { + "epoch": 0.49607575499061596, + "grad_norm": 1.2951646119882239, + "learning_rate": 5.306506725632023e-05, + "loss": 0.2511, + "step": 5815 + }, + { + "epoch": 0.496161064664733, + "grad_norm": 1.7635808944470182, + "learning_rate": 5.305127785069558e-05, + "loss": 0.3, + "step": 5816 + }, + { + "epoch": 0.49624637433885005, + "grad_norm": 1.5342427804698409, + "learning_rate": 5.30374882121221e-05, + "loss": 0.3045, + "step": 5817 + }, + { + "epoch": 0.49633168401296707, + "grad_norm": 1.7214396845532902, + "learning_rate": 5.302369834165253e-05, + "loss": 0.2954, + "step": 5818 + }, + { + "epoch": 0.49641699368708414, + "grad_norm": 1.383938605628136, + "learning_rate": 5.3009908240339647e-05, + "loss": 0.2127, + "step": 5819 + }, + { + "epoch": 0.49650230336120116, + "grad_norm": 1.6940828425655505, + "learning_rate": 5.299611790923629e-05, + "loss": 0.2663, + "step": 5820 + }, + { + "epoch": 0.49658761303531823, + "grad_norm": 1.5419863893425267, + "learning_rate": 5.2982327349395246e-05, + "loss": 0.2218, + "step": 5821 + }, + { + "epoch": 0.49667292270943525, + "grad_norm": 1.6894450810411825, + "learning_rate": 5.296853656186934e-05, + "loss": 0.2657, + "step": 5822 + }, + { + "epoch": 0.49675823238355227, + "grad_norm": 1.6107474095609238, + "learning_rate": 5.2954745547711446e-05, + "loss": 0.2858, + "step": 5823 + }, + { + "epoch": 0.49684354205766934, + "grad_norm": 1.9403288094610311, + "learning_rate": 5.294095430797443e-05, + "loss": 0.2969, + "step": 5824 + }, + { + "epoch": 0.49692885173178636, + "grad_norm": 1.7595877266402264, + "learning_rate": 5.2927162843711196e-05, + "loss": 0.2922, + "step": 5825 + }, + { + "epoch": 0.49701416140590343, + "grad_norm": 1.6874260356376158, + "learning_rate": 5.291337115597462e-05, + "loss": 0.2453, + "step": 5826 + }, + { + "epoch": 0.49709947108002045, + "grad_norm": 1.210704461100857, + "learning_rate": 5.289957924581764e-05, + "loss": 0.2535, + "step": 5827 + }, + { + "epoch": 0.4971847807541375, + "grad_norm": 1.991453327207648, + "learning_rate": 5.28857871142932e-05, + "loss": 0.2755, + "step": 5828 + }, + { + "epoch": 0.49727009042825454, + "grad_norm": 1.5219985817740973, + "learning_rate": 5.287199476245425e-05, + "loss": 0.2813, + "step": 5829 + }, + { + "epoch": 0.4973554001023716, + "grad_norm": 1.5116917807369032, + "learning_rate": 5.285820219135374e-05, + "loss": 0.2827, + "step": 5830 + }, + { + "epoch": 0.49744070977648863, + "grad_norm": 1.79012923109813, + "learning_rate": 5.2844409402044707e-05, + "loss": 0.2854, + "step": 5831 + }, + { + "epoch": 0.4975260194506057, + "grad_norm": 1.7359221147873507, + "learning_rate": 5.283061639558011e-05, + "loss": 0.3226, + "step": 5832 + }, + { + "epoch": 0.4976113291247227, + "grad_norm": 1.70803677690832, + "learning_rate": 5.281682317301302e-05, + "loss": 0.2867, + "step": 5833 + }, + { + "epoch": 0.4976966387988398, + "grad_norm": 2.02566547055586, + "learning_rate": 5.280302973539644e-05, + "loss": 0.2253, + "step": 5834 + }, + { + "epoch": 0.4977819484729568, + "grad_norm": 2.116644645436194, + "learning_rate": 5.278923608378341e-05, + "loss": 0.3218, + "step": 5835 + }, + { + "epoch": 0.4978672581470739, + "grad_norm": 1.594813422169497, + "learning_rate": 5.277544221922705e-05, + "loss": 0.2765, + "step": 5836 + }, + { + "epoch": 0.4979525678211909, + "grad_norm": 1.4478661324809863, + "learning_rate": 5.276164814278043e-05, + "loss": 0.2041, + "step": 5837 + }, + { + "epoch": 0.498037877495308, + "grad_norm": 1.772720057354984, + "learning_rate": 5.274785385549663e-05, + "loss": 0.1893, + "step": 5838 + }, + { + "epoch": 0.498123187169425, + "grad_norm": 1.4368848728472936, + "learning_rate": 5.27340593584288e-05, + "loss": 0.2824, + "step": 5839 + }, + { + "epoch": 0.49820849684354207, + "grad_norm": 1.4924151661339713, + "learning_rate": 5.2720264652630055e-05, + "loss": 0.267, + "step": 5840 + }, + { + "epoch": 0.4982938065176591, + "grad_norm": 2.0172682576815872, + "learning_rate": 5.2706469739153574e-05, + "loss": 0.2994, + "step": 5841 + }, + { + "epoch": 0.49837911619177616, + "grad_norm": 1.4828115159766064, + "learning_rate": 5.269267461905253e-05, + "loss": 0.2381, + "step": 5842 + }, + { + "epoch": 0.4984644258658932, + "grad_norm": 1.4644034982953646, + "learning_rate": 5.267887929338006e-05, + "loss": 0.2176, + "step": 5843 + }, + { + "epoch": 0.49854973554001025, + "grad_norm": 1.728912825125884, + "learning_rate": 5.2665083763189396e-05, + "loss": 0.2423, + "step": 5844 + }, + { + "epoch": 0.49863504521412727, + "grad_norm": 1.4746109918244448, + "learning_rate": 5.265128802953375e-05, + "loss": 0.2248, + "step": 5845 + }, + { + "epoch": 0.49872035488824434, + "grad_norm": 1.4306673954739333, + "learning_rate": 5.263749209346634e-05, + "loss": 0.2119, + "step": 5846 + }, + { + "epoch": 0.49880566456236136, + "grad_norm": 1.2311304644346124, + "learning_rate": 5.2623695956040445e-05, + "loss": 0.2139, + "step": 5847 + }, + { + "epoch": 0.49889097423647843, + "grad_norm": 1.565707157743297, + "learning_rate": 5.260989961830929e-05, + "loss": 0.2014, + "step": 5848 + }, + { + "epoch": 0.49897628391059545, + "grad_norm": 1.820710560248006, + "learning_rate": 5.259610308132618e-05, + "loss": 0.3001, + "step": 5849 + }, + { + "epoch": 0.4990615935847125, + "grad_norm": 1.3703884473435073, + "learning_rate": 5.2582306346144394e-05, + "loss": 0.2779, + "step": 5850 + }, + { + "epoch": 0.49914690325882954, + "grad_norm": 2.267545635769351, + "learning_rate": 5.2568509413817236e-05, + "loss": 0.2923, + "step": 5851 + }, + { + "epoch": 0.4992322129329466, + "grad_norm": 1.6061376955425855, + "learning_rate": 5.2554712285398034e-05, + "loss": 0.224, + "step": 5852 + }, + { + "epoch": 0.49931752260706364, + "grad_norm": 1.8379992656644033, + "learning_rate": 5.2540914961940124e-05, + "loss": 0.2576, + "step": 5853 + }, + { + "epoch": 0.4994028322811807, + "grad_norm": 1.2964594071694537, + "learning_rate": 5.2527117444496864e-05, + "loss": 0.3014, + "step": 5854 + }, + { + "epoch": 0.4994881419552977, + "grad_norm": 1.5114715926125302, + "learning_rate": 5.251331973412162e-05, + "loss": 0.2937, + "step": 5855 + }, + { + "epoch": 0.4995734516294148, + "grad_norm": 1.8027539484649742, + "learning_rate": 5.249952183186776e-05, + "loss": 0.2566, + "step": 5856 + }, + { + "epoch": 0.4996587613035318, + "grad_norm": 1.4205643198771252, + "learning_rate": 5.24857237387887e-05, + "loss": 0.262, + "step": 5857 + }, + { + "epoch": 0.4997440709776489, + "grad_norm": 2.055350889149309, + "learning_rate": 5.247192545593783e-05, + "loss": 0.28, + "step": 5858 + }, + { + "epoch": 0.4998293806517659, + "grad_norm": 1.4797012634603999, + "learning_rate": 5.245812698436858e-05, + "loss": 0.2711, + "step": 5859 + }, + { + "epoch": 0.499914690325883, + "grad_norm": 1.7482080822139743, + "learning_rate": 5.2444328325134415e-05, + "loss": 0.295, + "step": 5860 + }, + { + "epoch": 0.5, + "grad_norm": 1.6863375297886776, + "learning_rate": 5.243052947928876e-05, + "loss": 0.2783, + "step": 5861 + }, + { + "epoch": 0.500085309674117, + "grad_norm": 1.5120959538283225, + "learning_rate": 5.241673044788511e-05, + "loss": 0.2329, + "step": 5862 + }, + { + "epoch": 0.500170619348234, + "grad_norm": 1.891059951562145, + "learning_rate": 5.240293123197694e-05, + "loss": 0.2778, + "step": 5863 + }, + { + "epoch": 0.5002559290223512, + "grad_norm": 1.3542357891732193, + "learning_rate": 5.238913183261771e-05, + "loss": 0.3032, + "step": 5864 + }, + { + "epoch": 0.5003412386964682, + "grad_norm": 1.5950554381873732, + "learning_rate": 5.237533225086098e-05, + "loss": 0.3291, + "step": 5865 + }, + { + "epoch": 0.5004265483705852, + "grad_norm": 1.8420390579688488, + "learning_rate": 5.236153248776025e-05, + "loss": 0.2602, + "step": 5866 + }, + { + "epoch": 0.5005118580447022, + "grad_norm": 1.4708248660498557, + "learning_rate": 5.2347732544369055e-05, + "loss": 0.2653, + "step": 5867 + }, + { + "epoch": 0.5005971677188193, + "grad_norm": 1.442753108278878, + "learning_rate": 5.2333932421740975e-05, + "loss": 0.2493, + "step": 5868 + }, + { + "epoch": 0.5006824773929364, + "grad_norm": 1.3726463113423903, + "learning_rate": 5.2320132120929533e-05, + "loss": 0.2928, + "step": 5869 + }, + { + "epoch": 0.5007677870670534, + "grad_norm": 1.9623386802702012, + "learning_rate": 5.230633164298835e-05, + "loss": 0.2535, + "step": 5870 + }, + { + "epoch": 0.5008530967411704, + "grad_norm": 1.6361072645302275, + "learning_rate": 5.2292530988971e-05, + "loss": 0.2711, + "step": 5871 + }, + { + "epoch": 0.5009384064152875, + "grad_norm": 1.5438128130408117, + "learning_rate": 5.2278730159931076e-05, + "loss": 0.2128, + "step": 5872 + }, + { + "epoch": 0.5010237160894045, + "grad_norm": 1.8804235854705895, + "learning_rate": 5.226492915692224e-05, + "loss": 0.2337, + "step": 5873 + }, + { + "epoch": 0.5011090257635216, + "grad_norm": 1.462496817821725, + "learning_rate": 5.225112798099809e-05, + "loss": 0.2667, + "step": 5874 + }, + { + "epoch": 0.5011943354376386, + "grad_norm": 1.5650795052902708, + "learning_rate": 5.2237326633212266e-05, + "loss": 0.2266, + "step": 5875 + }, + { + "epoch": 0.5012796451117557, + "grad_norm": 1.1434658156082054, + "learning_rate": 5.2223525114618466e-05, + "loss": 0.2393, + "step": 5876 + }, + { + "epoch": 0.5013649547858727, + "grad_norm": 1.993498187770168, + "learning_rate": 5.220972342627032e-05, + "loss": 0.2419, + "step": 5877 + }, + { + "epoch": 0.5014502644599897, + "grad_norm": 1.712421348603371, + "learning_rate": 5.219592156922154e-05, + "loss": 0.2603, + "step": 5878 + }, + { + "epoch": 0.5015355741341068, + "grad_norm": 1.3861098121088262, + "learning_rate": 5.218211954452582e-05, + "loss": 0.2577, + "step": 5879 + }, + { + "epoch": 0.5016208838082239, + "grad_norm": 1.7382686314499862, + "learning_rate": 5.216831735323685e-05, + "loss": 0.2854, + "step": 5880 + }, + { + "epoch": 0.5017061934823409, + "grad_norm": 1.5093608278987387, + "learning_rate": 5.21545149964084e-05, + "loss": 0.2492, + "step": 5881 + }, + { + "epoch": 0.5017915031564579, + "grad_norm": 1.757952604008616, + "learning_rate": 5.2140712475094166e-05, + "loss": 0.2855, + "step": 5882 + }, + { + "epoch": 0.501876812830575, + "grad_norm": 1.4441749013558491, + "learning_rate": 5.21269097903479e-05, + "loss": 0.2831, + "step": 5883 + }, + { + "epoch": 0.5019621225046921, + "grad_norm": 1.673840659594257, + "learning_rate": 5.21131069432234e-05, + "loss": 0.2869, + "step": 5884 + }, + { + "epoch": 0.5020474321788091, + "grad_norm": 1.622677757623885, + "learning_rate": 5.209930393477439e-05, + "loss": 0.2872, + "step": 5885 + }, + { + "epoch": 0.5021327418529261, + "grad_norm": 1.9520603787902224, + "learning_rate": 5.2085500766054695e-05, + "loss": 0.2975, + "step": 5886 + }, + { + "epoch": 0.5022180515270431, + "grad_norm": 1.4128255755843133, + "learning_rate": 5.207169743811809e-05, + "loss": 0.2131, + "step": 5887 + }, + { + "epoch": 0.5023033612011603, + "grad_norm": 1.5006246855456218, + "learning_rate": 5.20578939520184e-05, + "loss": 0.3082, + "step": 5888 + }, + { + "epoch": 0.5023886708752773, + "grad_norm": 1.5862545203947884, + "learning_rate": 5.204409030880945e-05, + "loss": 0.2845, + "step": 5889 + }, + { + "epoch": 0.5024739805493943, + "grad_norm": 1.8511905216035383, + "learning_rate": 5.2030286509545054e-05, + "loss": 0.2454, + "step": 5890 + }, + { + "epoch": 0.5025592902235113, + "grad_norm": 1.587594505560823, + "learning_rate": 5.2016482555279065e-05, + "loss": 0.2655, + "step": 5891 + }, + { + "epoch": 0.5026445998976284, + "grad_norm": 1.5883336499038065, + "learning_rate": 5.200267844706537e-05, + "loss": 0.3391, + "step": 5892 + }, + { + "epoch": 0.5027299095717455, + "grad_norm": 1.7045823607639747, + "learning_rate": 5.198887418595779e-05, + "loss": 0.219, + "step": 5893 + }, + { + "epoch": 0.5028152192458625, + "grad_norm": 1.376404781594101, + "learning_rate": 5.1975069773010255e-05, + "loss": 0.2327, + "step": 5894 + }, + { + "epoch": 0.5029005289199795, + "grad_norm": 1.3447535116671432, + "learning_rate": 5.196126520927666e-05, + "loss": 0.2102, + "step": 5895 + }, + { + "epoch": 0.5029858385940966, + "grad_norm": 1.2838632404040276, + "learning_rate": 5.194746049581084e-05, + "loss": 0.2541, + "step": 5896 + }, + { + "epoch": 0.5030711482682136, + "grad_norm": 1.4389664177809935, + "learning_rate": 5.193365563366679e-05, + "loss": 0.2756, + "step": 5897 + }, + { + "epoch": 0.5031564579423307, + "grad_norm": 1.4791343936175387, + "learning_rate": 5.1919850623898395e-05, + "loss": 0.266, + "step": 5898 + }, + { + "epoch": 0.5032417676164477, + "grad_norm": 1.4810329358690528, + "learning_rate": 5.1906045467559616e-05, + "loss": 0.2263, + "step": 5899 + }, + { + "epoch": 0.5033270772905647, + "grad_norm": 1.7743239015085717, + "learning_rate": 5.18922401657044e-05, + "loss": 0.2794, + "step": 5900 + }, + { + "epoch": 0.5034123869646818, + "grad_norm": 1.5283326364818608, + "learning_rate": 5.187843471938668e-05, + "loss": 0.2431, + "step": 5901 + }, + { + "epoch": 0.5034976966387988, + "grad_norm": 1.5806899424613952, + "learning_rate": 5.186462912966047e-05, + "loss": 0.2224, + "step": 5902 + }, + { + "epoch": 0.5035830063129159, + "grad_norm": 1.7972482044923637, + "learning_rate": 5.1850823397579726e-05, + "loss": 0.2937, + "step": 5903 + }, + { + "epoch": 0.5036683159870329, + "grad_norm": 1.5520907237583155, + "learning_rate": 5.183701752419845e-05, + "loss": 0.2858, + "step": 5904 + }, + { + "epoch": 0.50375362566115, + "grad_norm": 1.4247549883702764, + "learning_rate": 5.1823211510570656e-05, + "loss": 0.2476, + "step": 5905 + }, + { + "epoch": 0.503838935335267, + "grad_norm": 1.5395827600102094, + "learning_rate": 5.180940535775035e-05, + "loss": 0.1959, + "step": 5906 + }, + { + "epoch": 0.503924245009384, + "grad_norm": 1.5977613046145027, + "learning_rate": 5.179559906679157e-05, + "loss": 0.1785, + "step": 5907 + }, + { + "epoch": 0.5040095546835011, + "grad_norm": 1.695439276811284, + "learning_rate": 5.178179263874833e-05, + "loss": 0.2612, + "step": 5908 + }, + { + "epoch": 0.5040948643576182, + "grad_norm": 1.4875978517804556, + "learning_rate": 5.176798607467468e-05, + "loss": 0.2477, + "step": 5909 + }, + { + "epoch": 0.5041801740317352, + "grad_norm": 1.7229606376796687, + "learning_rate": 5.17541793756247e-05, + "loss": 0.2545, + "step": 5910 + }, + { + "epoch": 0.5042654837058522, + "grad_norm": 1.6379138161572897, + "learning_rate": 5.174037254265245e-05, + "loss": 0.1659, + "step": 5911 + }, + { + "epoch": 0.5043507933799692, + "grad_norm": 1.718331442066315, + "learning_rate": 5.172656557681199e-05, + "loss": 0.2619, + "step": 5912 + }, + { + "epoch": 0.5044361030540864, + "grad_norm": 1.6036472057482438, + "learning_rate": 5.171275847915744e-05, + "loss": 0.2545, + "step": 5913 + }, + { + "epoch": 0.5045214127282034, + "grad_norm": 1.2877060660479243, + "learning_rate": 5.169895125074287e-05, + "loss": 0.2508, + "step": 5914 + }, + { + "epoch": 0.5046067224023204, + "grad_norm": 1.4333168198868853, + "learning_rate": 5.168514389262241e-05, + "loss": 0.249, + "step": 5915 + }, + { + "epoch": 0.5046920320764374, + "grad_norm": 2.101871964029643, + "learning_rate": 5.167133640585018e-05, + "loss": 0.2691, + "step": 5916 + }, + { + "epoch": 0.5047773417505546, + "grad_norm": 1.5586309213436342, + "learning_rate": 5.165752879148027e-05, + "loss": 0.2566, + "step": 5917 + }, + { + "epoch": 0.5048626514246716, + "grad_norm": 1.8421706854883606, + "learning_rate": 5.164372105056686e-05, + "loss": 0.3007, + "step": 5918 + }, + { + "epoch": 0.5049479610987886, + "grad_norm": 1.38764759172269, + "learning_rate": 5.162991318416408e-05, + "loss": 0.2075, + "step": 5919 + }, + { + "epoch": 0.5050332707729056, + "grad_norm": 1.9937325264574592, + "learning_rate": 5.1616105193326084e-05, + "loss": 0.2596, + "step": 5920 + }, + { + "epoch": 0.5051185804470227, + "grad_norm": 1.292742245475485, + "learning_rate": 5.1602297079107054e-05, + "loss": 0.2384, + "step": 5921 + }, + { + "epoch": 0.5052038901211398, + "grad_norm": 1.5361952044505145, + "learning_rate": 5.1588488842561145e-05, + "loss": 0.2387, + "step": 5922 + }, + { + "epoch": 0.5052891997952568, + "grad_norm": 1.530123315591308, + "learning_rate": 5.157468048474257e-05, + "loss": 0.2765, + "step": 5923 + }, + { + "epoch": 0.5053745094693738, + "grad_norm": 1.5117988663767181, + "learning_rate": 5.15608720067055e-05, + "loss": 0.273, + "step": 5924 + }, + { + "epoch": 0.5054598191434909, + "grad_norm": 1.6562986546693421, + "learning_rate": 5.1547063409504135e-05, + "loss": 0.2415, + "step": 5925 + }, + { + "epoch": 0.5055451288176079, + "grad_norm": 1.5464752913880582, + "learning_rate": 5.1533254694192714e-05, + "loss": 0.2234, + "step": 5926 + }, + { + "epoch": 0.505630438491725, + "grad_norm": 1.7044107326095346, + "learning_rate": 5.151944586182545e-05, + "loss": 0.264, + "step": 5927 + }, + { + "epoch": 0.505715748165842, + "grad_norm": 1.6801880556393776, + "learning_rate": 5.1505636913456555e-05, + "loss": 0.2711, + "step": 5928 + }, + { + "epoch": 0.5058010578399591, + "grad_norm": 1.7662622432729855, + "learning_rate": 5.149182785014029e-05, + "loss": 0.2745, + "step": 5929 + }, + { + "epoch": 0.5058863675140761, + "grad_norm": 1.447829657851678, + "learning_rate": 5.147801867293088e-05, + "loss": 0.2586, + "step": 5930 + }, + { + "epoch": 0.5059716771881931, + "grad_norm": 1.4287403074988725, + "learning_rate": 5.146420938288262e-05, + "loss": 0.2784, + "step": 5931 + }, + { + "epoch": 0.5060569868623102, + "grad_norm": 1.5649817784378997, + "learning_rate": 5.145039998104974e-05, + "loss": 0.2424, + "step": 5932 + }, + { + "epoch": 0.5061422965364273, + "grad_norm": 1.4042859242825378, + "learning_rate": 5.143659046848653e-05, + "loss": 0.208, + "step": 5933 + }, + { + "epoch": 0.5062276062105443, + "grad_norm": 1.5361281562826763, + "learning_rate": 5.1422780846247284e-05, + "loss": 0.3082, + "step": 5934 + }, + { + "epoch": 0.5063129158846613, + "grad_norm": 1.5918382884998115, + "learning_rate": 5.1408971115386287e-05, + "loss": 0.2759, + "step": 5935 + }, + { + "epoch": 0.5063982255587783, + "grad_norm": 1.704442275999321, + "learning_rate": 5.1395161276957804e-05, + "loss": 0.3432, + "step": 5936 + }, + { + "epoch": 0.5064835352328955, + "grad_norm": 1.4653173062666098, + "learning_rate": 5.13813513320162e-05, + "loss": 0.2609, + "step": 5937 + }, + { + "epoch": 0.5065688449070125, + "grad_norm": 1.4745662859753812, + "learning_rate": 5.136754128161575e-05, + "loss": 0.244, + "step": 5938 + }, + { + "epoch": 0.5066541545811295, + "grad_norm": 1.3539920205200915, + "learning_rate": 5.135373112681079e-05, + "loss": 0.2859, + "step": 5939 + }, + { + "epoch": 0.5067394642552465, + "grad_norm": 1.6660498272335529, + "learning_rate": 5.133992086865565e-05, + "loss": 0.246, + "step": 5940 + }, + { + "epoch": 0.5068247739293635, + "grad_norm": 1.786360586919571, + "learning_rate": 5.1326110508204675e-05, + "loss": 0.2163, + "step": 5941 + }, + { + "epoch": 0.5069100836034807, + "grad_norm": 1.5340262954925026, + "learning_rate": 5.1312300046512205e-05, + "loss": 0.2207, + "step": 5942 + }, + { + "epoch": 0.5069953932775977, + "grad_norm": 1.4919979114824664, + "learning_rate": 5.1298489484632605e-05, + "loss": 0.2705, + "step": 5943 + }, + { + "epoch": 0.5070807029517147, + "grad_norm": 1.3960971535511508, + "learning_rate": 5.1284678823620225e-05, + "loss": 0.2755, + "step": 5944 + }, + { + "epoch": 0.5071660126258317, + "grad_norm": 1.4268576543341005, + "learning_rate": 5.127086806452945e-05, + "loss": 0.2358, + "step": 5945 + }, + { + "epoch": 0.5072513222999488, + "grad_norm": 1.9368955838444324, + "learning_rate": 5.125705720841465e-05, + "loss": 0.3159, + "step": 5946 + }, + { + "epoch": 0.5073366319740659, + "grad_norm": 1.3739654810760265, + "learning_rate": 5.124324625633021e-05, + "loss": 0.2272, + "step": 5947 + }, + { + "epoch": 0.5074219416481829, + "grad_norm": 1.3135692237587173, + "learning_rate": 5.122943520933054e-05, + "loss": 0.2537, + "step": 5948 + }, + { + "epoch": 0.5075072513222999, + "grad_norm": 1.843229058588964, + "learning_rate": 5.1215624068470014e-05, + "loss": 0.2673, + "step": 5949 + }, + { + "epoch": 0.507592560996417, + "grad_norm": 1.71033428828564, + "learning_rate": 5.120181283480305e-05, + "loss": 0.2342, + "step": 5950 + }, + { + "epoch": 0.507677870670534, + "grad_norm": 1.270643058457138, + "learning_rate": 5.118800150938407e-05, + "loss": 0.2001, + "step": 5951 + }, + { + "epoch": 0.5077631803446511, + "grad_norm": 1.9015421030146435, + "learning_rate": 5.117419009326747e-05, + "loss": 0.184, + "step": 5952 + }, + { + "epoch": 0.5078484900187681, + "grad_norm": 1.7380986835746681, + "learning_rate": 5.1160378587507716e-05, + "loss": 0.233, + "step": 5953 + }, + { + "epoch": 0.5079337996928852, + "grad_norm": 1.4798515059749502, + "learning_rate": 5.1146566993159205e-05, + "loss": 0.2688, + "step": 5954 + }, + { + "epoch": 0.5080191093670022, + "grad_norm": 1.7938274247210617, + "learning_rate": 5.1132755311276405e-05, + "loss": 0.2966, + "step": 5955 + }, + { + "epoch": 0.5081044190411192, + "grad_norm": 1.473903947058223, + "learning_rate": 5.111894354291376e-05, + "loss": 0.2553, + "step": 5956 + }, + { + "epoch": 0.5081897287152363, + "grad_norm": 1.696379505968058, + "learning_rate": 5.110513168912571e-05, + "loss": 0.2746, + "step": 5957 + }, + { + "epoch": 0.5082750383893534, + "grad_norm": 1.9283049414458777, + "learning_rate": 5.109131975096675e-05, + "loss": 0.2792, + "step": 5958 + }, + { + "epoch": 0.5083603480634704, + "grad_norm": 1.9694061396929539, + "learning_rate": 5.10775077294913e-05, + "loss": 0.2454, + "step": 5959 + }, + { + "epoch": 0.5084456577375874, + "grad_norm": 1.562864032300786, + "learning_rate": 5.106369562575388e-05, + "loss": 0.1917, + "step": 5960 + }, + { + "epoch": 0.5085309674117044, + "grad_norm": 1.5029148549716833, + "learning_rate": 5.104988344080896e-05, + "loss": 0.2809, + "step": 5961 + }, + { + "epoch": 0.5086162770858216, + "grad_norm": 2.008763425790468, + "learning_rate": 5.1036071175710986e-05, + "loss": 0.3273, + "step": 5962 + }, + { + "epoch": 0.5087015867599386, + "grad_norm": 2.5782060090255756, + "learning_rate": 5.1022258831514504e-05, + "loss": 0.3289, + "step": 5963 + }, + { + "epoch": 0.5087868964340556, + "grad_norm": 1.4236062378011682, + "learning_rate": 5.100844640927399e-05, + "loss": 0.2444, + "step": 5964 + }, + { + "epoch": 0.5088722061081726, + "grad_norm": 1.266446259211943, + "learning_rate": 5.099463391004394e-05, + "loss": 0.3101, + "step": 5965 + }, + { + "epoch": 0.5089575157822898, + "grad_norm": 1.6851502058958021, + "learning_rate": 5.098082133487889e-05, + "loss": 0.2374, + "step": 5966 + }, + { + "epoch": 0.5090428254564068, + "grad_norm": 1.365865318749289, + "learning_rate": 5.096700868483334e-05, + "loss": 0.2127, + "step": 5967 + }, + { + "epoch": 0.5091281351305238, + "grad_norm": 1.5783890607127062, + "learning_rate": 5.095319596096182e-05, + "loss": 0.2597, + "step": 5968 + }, + { + "epoch": 0.5092134448046408, + "grad_norm": 1.3119395739840773, + "learning_rate": 5.0939383164318865e-05, + "loss": 0.2747, + "step": 5969 + }, + { + "epoch": 0.5092987544787579, + "grad_norm": 1.633303627730143, + "learning_rate": 5.092557029595897e-05, + "loss": 0.2504, + "step": 5970 + }, + { + "epoch": 0.509384064152875, + "grad_norm": 1.6510405756067308, + "learning_rate": 5.091175735693672e-05, + "loss": 0.2689, + "step": 5971 + }, + { + "epoch": 0.509469373826992, + "grad_norm": 1.4744206794108212, + "learning_rate": 5.0897944348306636e-05, + "loss": 0.2119, + "step": 5972 + }, + { + "epoch": 0.509554683501109, + "grad_norm": 1.1878273161476167, + "learning_rate": 5.088413127112326e-05, + "loss": 0.237, + "step": 5973 + }, + { + "epoch": 0.5096399931752261, + "grad_norm": 1.2452400176853045, + "learning_rate": 5.087031812644118e-05, + "loss": 0.1993, + "step": 5974 + }, + { + "epoch": 0.5097253028493431, + "grad_norm": 1.6236871404425337, + "learning_rate": 5.085650491531492e-05, + "loss": 0.3029, + "step": 5975 + }, + { + "epoch": 0.5098106125234602, + "grad_norm": 1.465211867808174, + "learning_rate": 5.084269163879907e-05, + "loss": 0.296, + "step": 5976 + }, + { + "epoch": 0.5098959221975772, + "grad_norm": 1.6675938490919109, + "learning_rate": 5.082887829794819e-05, + "loss": 0.2844, + "step": 5977 + }, + { + "epoch": 0.5099812318716942, + "grad_norm": 1.5568430923261134, + "learning_rate": 5.081506489381684e-05, + "loss": 0.2659, + "step": 5978 + }, + { + "epoch": 0.5100665415458113, + "grad_norm": 1.4907446471365606, + "learning_rate": 5.080125142745965e-05, + "loss": 0.2201, + "step": 5979 + }, + { + "epoch": 0.5101518512199283, + "grad_norm": 1.3569559079669398, + "learning_rate": 5.078743789993115e-05, + "loss": 0.257, + "step": 5980 + }, + { + "epoch": 0.5102371608940454, + "grad_norm": 1.3181168396698528, + "learning_rate": 5.077362431228596e-05, + "loss": 0.2129, + "step": 5981 + }, + { + "epoch": 0.5103224705681624, + "grad_norm": 1.3821497476602438, + "learning_rate": 5.075981066557866e-05, + "loss": 0.2863, + "step": 5982 + }, + { + "epoch": 0.5104077802422795, + "grad_norm": 1.683326788533899, + "learning_rate": 5.074599696086384e-05, + "loss": 0.2579, + "step": 5983 + }, + { + "epoch": 0.5104930899163965, + "grad_norm": 2.1101607378079015, + "learning_rate": 5.073218319919614e-05, + "loss": 0.2585, + "step": 5984 + }, + { + "epoch": 0.5105783995905135, + "grad_norm": 2.2139636785599963, + "learning_rate": 5.0718369381630126e-05, + "loss": 0.2129, + "step": 5985 + }, + { + "epoch": 0.5106637092646306, + "grad_norm": 1.7002194319081827, + "learning_rate": 5.070455550922043e-05, + "loss": 0.2805, + "step": 5986 + }, + { + "epoch": 0.5107490189387477, + "grad_norm": 1.7149218843425413, + "learning_rate": 5.069074158302167e-05, + "loss": 0.2792, + "step": 5987 + }, + { + "epoch": 0.5108343286128647, + "grad_norm": 1.9170668087552287, + "learning_rate": 5.0676927604088465e-05, + "loss": 0.308, + "step": 5988 + }, + { + "epoch": 0.5109196382869817, + "grad_norm": 1.3842754922413765, + "learning_rate": 5.066311357347542e-05, + "loss": 0.2242, + "step": 5989 + }, + { + "epoch": 0.5110049479610987, + "grad_norm": 1.423947092536067, + "learning_rate": 5.06492994922372e-05, + "loss": 0.2597, + "step": 5990 + }, + { + "epoch": 0.5110902576352159, + "grad_norm": 1.7831266455738004, + "learning_rate": 5.0635485361428395e-05, + "loss": 0.2708, + "step": 5991 + }, + { + "epoch": 0.5111755673093329, + "grad_norm": 1.5620136266937004, + "learning_rate": 5.062167118210367e-05, + "loss": 0.2579, + "step": 5992 + }, + { + "epoch": 0.5112608769834499, + "grad_norm": 1.7714116292903612, + "learning_rate": 5.0607856955317646e-05, + "loss": 0.2822, + "step": 5993 + }, + { + "epoch": 0.5113461866575669, + "grad_norm": 1.5552364127947291, + "learning_rate": 5.0594042682124976e-05, + "loss": 0.2151, + "step": 5994 + }, + { + "epoch": 0.511431496331684, + "grad_norm": 1.6104401702284303, + "learning_rate": 5.0580228363580304e-05, + "loss": 0.2601, + "step": 5995 + }, + { + "epoch": 0.5115168060058011, + "grad_norm": 1.8607368931235033, + "learning_rate": 5.056641400073827e-05, + "loss": 0.2188, + "step": 5996 + }, + { + "epoch": 0.5116021156799181, + "grad_norm": 1.802755138736395, + "learning_rate": 5.055259959465355e-05, + "loss": 0.268, + "step": 5997 + }, + { + "epoch": 0.5116874253540351, + "grad_norm": 1.4884024342938114, + "learning_rate": 5.053878514638078e-05, + "loss": 0.2232, + "step": 5998 + }, + { + "epoch": 0.5117727350281522, + "grad_norm": 1.4516060386345004, + "learning_rate": 5.052497065697464e-05, + "loss": 0.2614, + "step": 5999 + }, + { + "epoch": 0.5118580447022693, + "grad_norm": 1.6460958182996765, + "learning_rate": 5.0511156127489766e-05, + "loss": 0.2786, + "step": 6000 + }, + { + "epoch": 0.5119433543763863, + "grad_norm": 1.8768283195165538, + "learning_rate": 5.049734155898086e-05, + "loss": 0.2346, + "step": 6001 + }, + { + "epoch": 0.5120286640505033, + "grad_norm": 1.456279570971012, + "learning_rate": 5.0483526952502545e-05, + "loss": 0.2309, + "step": 6002 + }, + { + "epoch": 0.5121139737246204, + "grad_norm": 1.6673399836730316, + "learning_rate": 5.046971230910953e-05, + "loss": 0.2704, + "step": 6003 + }, + { + "epoch": 0.5121992833987374, + "grad_norm": 1.7884110509878686, + "learning_rate": 5.045589762985646e-05, + "loss": 0.2706, + "step": 6004 + }, + { + "epoch": 0.5122845930728545, + "grad_norm": 2.0026248158620947, + "learning_rate": 5.0442082915798037e-05, + "loss": 0.2263, + "step": 6005 + }, + { + "epoch": 0.5123699027469715, + "grad_norm": 1.2927887667403968, + "learning_rate": 5.0428268167988946e-05, + "loss": 0.2263, + "step": 6006 + }, + { + "epoch": 0.5124552124210886, + "grad_norm": 1.7573451798511983, + "learning_rate": 5.041445338748383e-05, + "loss": 0.2933, + "step": 6007 + }, + { + "epoch": 0.5125405220952056, + "grad_norm": 1.6121724823712378, + "learning_rate": 5.040063857533742e-05, + "loss": 0.2813, + "step": 6008 + }, + { + "epoch": 0.5126258317693226, + "grad_norm": 1.4268003401270088, + "learning_rate": 5.038682373260438e-05, + "loss": 0.2399, + "step": 6009 + }, + { + "epoch": 0.5127111414434397, + "grad_norm": 1.6980126320646507, + "learning_rate": 5.03730088603394e-05, + "loss": 0.2638, + "step": 6010 + }, + { + "epoch": 0.5127964511175568, + "grad_norm": 1.428524774402654, + "learning_rate": 5.035919395959719e-05, + "loss": 0.273, + "step": 6011 + }, + { + "epoch": 0.5128817607916738, + "grad_norm": 1.4359116279006374, + "learning_rate": 5.0345379031432414e-05, + "loss": 0.1872, + "step": 6012 + }, + { + "epoch": 0.5129670704657908, + "grad_norm": 1.5418454487554518, + "learning_rate": 5.033156407689978e-05, + "loss": 0.2958, + "step": 6013 + }, + { + "epoch": 0.5130523801399078, + "grad_norm": 1.3424523763893954, + "learning_rate": 5.031774909705401e-05, + "loss": 0.2151, + "step": 6014 + }, + { + "epoch": 0.5131376898140249, + "grad_norm": 1.806661871707836, + "learning_rate": 5.030393409294977e-05, + "loss": 0.2766, + "step": 6015 + }, + { + "epoch": 0.513222999488142, + "grad_norm": 1.2867437604593919, + "learning_rate": 5.029011906564178e-05, + "loss": 0.243, + "step": 6016 + }, + { + "epoch": 0.513308309162259, + "grad_norm": 1.8251861686334625, + "learning_rate": 5.027630401618475e-05, + "loss": 0.2373, + "step": 6017 + }, + { + "epoch": 0.513393618836376, + "grad_norm": 1.4270801219300755, + "learning_rate": 5.026248894563336e-05, + "loss": 0.1937, + "step": 6018 + }, + { + "epoch": 0.513478928510493, + "grad_norm": 1.242225262529758, + "learning_rate": 5.024867385504234e-05, + "loss": 0.189, + "step": 6019 + }, + { + "epoch": 0.5135642381846102, + "grad_norm": 1.3838365286881662, + "learning_rate": 5.023485874546639e-05, + "loss": 0.2162, + "step": 6020 + }, + { + "epoch": 0.5136495478587272, + "grad_norm": 1.8096984389720019, + "learning_rate": 5.022104361796023e-05, + "loss": 0.2741, + "step": 6021 + }, + { + "epoch": 0.5137348575328442, + "grad_norm": 1.7275098040201797, + "learning_rate": 5.020722847357858e-05, + "loss": 0.2349, + "step": 6022 + }, + { + "epoch": 0.5138201672069612, + "grad_norm": 1.5447280318498828, + "learning_rate": 5.019341331337612e-05, + "loss": 0.2582, + "step": 6023 + }, + { + "epoch": 0.5139054768810783, + "grad_norm": 1.613788294142059, + "learning_rate": 5.0179598138407566e-05, + "loss": 0.2469, + "step": 6024 + }, + { + "epoch": 0.5139907865551954, + "grad_norm": 1.4098013222504064, + "learning_rate": 5.016578294972768e-05, + "loss": 0.2365, + "step": 6025 + }, + { + "epoch": 0.5140760962293124, + "grad_norm": 1.6382471200192055, + "learning_rate": 5.0151967748391116e-05, + "loss": 0.2289, + "step": 6026 + }, + { + "epoch": 0.5141614059034294, + "grad_norm": 2.018735865493281, + "learning_rate": 5.0138152535452646e-05, + "loss": 0.2338, + "step": 6027 + }, + { + "epoch": 0.5142467155775465, + "grad_norm": 1.46197921414087, + "learning_rate": 5.012433731196694e-05, + "loss": 0.2473, + "step": 6028 + }, + { + "epoch": 0.5143320252516635, + "grad_norm": 1.6489933758030633, + "learning_rate": 5.0110522078988764e-05, + "loss": 0.2537, + "step": 6029 + }, + { + "epoch": 0.5144173349257806, + "grad_norm": 1.6875360273118147, + "learning_rate": 5.00967068375728e-05, + "loss": 0.2209, + "step": 6030 + }, + { + "epoch": 0.5145026445998976, + "grad_norm": 1.2821486508245756, + "learning_rate": 5.0082891588773784e-05, + "loss": 0.2369, + "step": 6031 + }, + { + "epoch": 0.5145879542740147, + "grad_norm": 1.565641221135638, + "learning_rate": 5.006907633364646e-05, + "loss": 0.2631, + "step": 6032 + }, + { + "epoch": 0.5146732639481317, + "grad_norm": 1.7750141008918403, + "learning_rate": 5.005526107324551e-05, + "loss": 0.2201, + "step": 6033 + }, + { + "epoch": 0.5147585736222487, + "grad_norm": 2.004698955357469, + "learning_rate": 5.0041445808625656e-05, + "loss": 0.255, + "step": 6034 + }, + { + "epoch": 0.5148438832963658, + "grad_norm": 1.4982769129267954, + "learning_rate": 5.002763054084164e-05, + "loss": 0.2433, + "step": 6035 + }, + { + "epoch": 0.5149291929704829, + "grad_norm": 1.4779057926924757, + "learning_rate": 5.001381527094818e-05, + "loss": 0.2237, + "step": 6036 + }, + { + "epoch": 0.5150145026445999, + "grad_norm": 1.4681732993465255, + "learning_rate": 5e-05, + "loss": 0.2543, + "step": 6037 + }, + { + "epoch": 0.5150998123187169, + "grad_norm": 1.96045450896313, + "learning_rate": 4.9986184729051824e-05, + "loss": 0.2304, + "step": 6038 + }, + { + "epoch": 0.515185121992834, + "grad_norm": 1.27931681610174, + "learning_rate": 4.997236945915838e-05, + "loss": 0.2173, + "step": 6039 + }, + { + "epoch": 0.5152704316669511, + "grad_norm": 1.4791125524947004, + "learning_rate": 4.9958554191374356e-05, + "loss": 0.2329, + "step": 6040 + }, + { + "epoch": 0.5153557413410681, + "grad_norm": 1.7981394423831958, + "learning_rate": 4.994473892675451e-05, + "loss": 0.2309, + "step": 6041 + }, + { + "epoch": 0.5154410510151851, + "grad_norm": 1.6179389866145597, + "learning_rate": 4.9930923666353565e-05, + "loss": 0.2138, + "step": 6042 + }, + { + "epoch": 0.5155263606893021, + "grad_norm": 1.9957482921770318, + "learning_rate": 4.991710841122623e-05, + "loss": 0.2723, + "step": 6043 + }, + { + "epoch": 0.5156116703634193, + "grad_norm": 1.4708183010439015, + "learning_rate": 4.990329316242721e-05, + "loss": 0.2407, + "step": 6044 + }, + { + "epoch": 0.5156969800375363, + "grad_norm": 1.6360246374611487, + "learning_rate": 4.988947792101124e-05, + "loss": 0.2709, + "step": 6045 + }, + { + "epoch": 0.5157822897116533, + "grad_norm": 1.807597600747306, + "learning_rate": 4.987566268803307e-05, + "loss": 0.2561, + "step": 6046 + }, + { + "epoch": 0.5158675993857703, + "grad_norm": 1.855492778421718, + "learning_rate": 4.9861847464547366e-05, + "loss": 0.2188, + "step": 6047 + }, + { + "epoch": 0.5159529090598874, + "grad_norm": 1.4477711153857042, + "learning_rate": 4.984803225160888e-05, + "loss": 0.2975, + "step": 6048 + }, + { + "epoch": 0.5160382187340045, + "grad_norm": 1.4043607100407822, + "learning_rate": 4.9834217050272345e-05, + "loss": 0.2503, + "step": 6049 + }, + { + "epoch": 0.5161235284081215, + "grad_norm": 1.6351987203522096, + "learning_rate": 4.982040186159244e-05, + "loss": 0.2299, + "step": 6050 + }, + { + "epoch": 0.5162088380822385, + "grad_norm": 1.4997087036846832, + "learning_rate": 4.980658668662389e-05, + "loss": 0.2832, + "step": 6051 + }, + { + "epoch": 0.5162941477563556, + "grad_norm": 1.5390939467262437, + "learning_rate": 4.9792771526421445e-05, + "loss": 0.2647, + "step": 6052 + }, + { + "epoch": 0.5163794574304726, + "grad_norm": 1.6529288912950049, + "learning_rate": 4.977895638203978e-05, + "loss": 0.2534, + "step": 6053 + }, + { + "epoch": 0.5164647671045897, + "grad_norm": 1.3303350572956423, + "learning_rate": 4.976514125453361e-05, + "loss": 0.2706, + "step": 6054 + }, + { + "epoch": 0.5165500767787067, + "grad_norm": 1.4952309613134314, + "learning_rate": 4.975132614495768e-05, + "loss": 0.1865, + "step": 6055 + }, + { + "epoch": 0.5166353864528237, + "grad_norm": 2.0743932273457015, + "learning_rate": 4.973751105436665e-05, + "loss": 0.1982, + "step": 6056 + }, + { + "epoch": 0.5167206961269408, + "grad_norm": 1.5626402219795792, + "learning_rate": 4.972369598381527e-05, + "loss": 0.2272, + "step": 6057 + }, + { + "epoch": 0.5168060058010578, + "grad_norm": 1.657537571638001, + "learning_rate": 4.9709880934358244e-05, + "loss": 0.2566, + "step": 6058 + }, + { + "epoch": 0.5168913154751749, + "grad_norm": 1.6371085856642171, + "learning_rate": 4.9696065907050244e-05, + "loss": 0.2871, + "step": 6059 + }, + { + "epoch": 0.5169766251492919, + "grad_norm": 1.5293123870491623, + "learning_rate": 4.9682250902946e-05, + "loss": 0.2215, + "step": 6060 + }, + { + "epoch": 0.517061934823409, + "grad_norm": 1.407426299745421, + "learning_rate": 4.966843592310021e-05, + "loss": 0.256, + "step": 6061 + }, + { + "epoch": 0.517147244497526, + "grad_norm": 1.3980684272872053, + "learning_rate": 4.96546209685676e-05, + "loss": 0.235, + "step": 6062 + }, + { + "epoch": 0.517232554171643, + "grad_norm": 1.5414158213222064, + "learning_rate": 4.964080604040282e-05, + "loss": 0.2119, + "step": 6063 + }, + { + "epoch": 0.5173178638457601, + "grad_norm": 1.3302514499655687, + "learning_rate": 4.96269911396606e-05, + "loss": 0.2404, + "step": 6064 + }, + { + "epoch": 0.5174031735198772, + "grad_norm": 1.781857035749392, + "learning_rate": 4.961317626739564e-05, + "loss": 0.2565, + "step": 6065 + }, + { + "epoch": 0.5174884831939942, + "grad_norm": 1.5615947389814995, + "learning_rate": 4.959936142466259e-05, + "loss": 0.2289, + "step": 6066 + }, + { + "epoch": 0.5175737928681112, + "grad_norm": 1.779641077308621, + "learning_rate": 4.958554661251616e-05, + "loss": 0.281, + "step": 6067 + }, + { + "epoch": 0.5176591025422282, + "grad_norm": 1.709941696154593, + "learning_rate": 4.957173183201108e-05, + "loss": 0.287, + "step": 6068 + }, + { + "epoch": 0.5177444122163454, + "grad_norm": 1.6856679508223698, + "learning_rate": 4.955791708420197e-05, + "loss": 0.2451, + "step": 6069 + }, + { + "epoch": 0.5178297218904624, + "grad_norm": 1.407234864273309, + "learning_rate": 4.954410237014354e-05, + "loss": 0.1872, + "step": 6070 + }, + { + "epoch": 0.5179150315645794, + "grad_norm": 1.667311742241396, + "learning_rate": 4.95302876908905e-05, + "loss": 0.2788, + "step": 6071 + }, + { + "epoch": 0.5180003412386964, + "grad_norm": 1.4560974237629203, + "learning_rate": 4.9516473047497466e-05, + "loss": 0.2166, + "step": 6072 + }, + { + "epoch": 0.5180856509128136, + "grad_norm": 1.5240722898724588, + "learning_rate": 4.950265844101915e-05, + "loss": 0.2946, + "step": 6073 + }, + { + "epoch": 0.5181709605869306, + "grad_norm": 1.4415332769230693, + "learning_rate": 4.9488843872510245e-05, + "loss": 0.255, + "step": 6074 + }, + { + "epoch": 0.5182562702610476, + "grad_norm": 1.3874215078060725, + "learning_rate": 4.9475029343025375e-05, + "loss": 0.2145, + "step": 6075 + }, + { + "epoch": 0.5183415799351646, + "grad_norm": 1.5309790546721163, + "learning_rate": 4.946121485361922e-05, + "loss": 0.2363, + "step": 6076 + }, + { + "epoch": 0.5184268896092817, + "grad_norm": 1.7659092227073898, + "learning_rate": 4.944740040534645e-05, + "loss": 0.2454, + "step": 6077 + }, + { + "epoch": 0.5185121992833988, + "grad_norm": 1.5537326506722495, + "learning_rate": 4.943358599926174e-05, + "loss": 0.2111, + "step": 6078 + }, + { + "epoch": 0.5185975089575158, + "grad_norm": 1.9903379344108647, + "learning_rate": 4.941977163641971e-05, + "loss": 0.2433, + "step": 6079 + }, + { + "epoch": 0.5186828186316328, + "grad_norm": 2.047578181263473, + "learning_rate": 4.940595731787503e-05, + "loss": 0.3465, + "step": 6080 + }, + { + "epoch": 0.5187681283057499, + "grad_norm": 1.7207797463165506, + "learning_rate": 4.939214304468237e-05, + "loss": 0.1874, + "step": 6081 + }, + { + "epoch": 0.5188534379798669, + "grad_norm": 1.8663536987315457, + "learning_rate": 4.9378328817896347e-05, + "loss": 0.2617, + "step": 6082 + }, + { + "epoch": 0.518938747653984, + "grad_norm": 1.5096833476832872, + "learning_rate": 4.936451463857161e-05, + "loss": 0.2837, + "step": 6083 + }, + { + "epoch": 0.519024057328101, + "grad_norm": 2.0721623056094263, + "learning_rate": 4.935070050776282e-05, + "loss": 0.2781, + "step": 6084 + }, + { + "epoch": 0.5191093670022181, + "grad_norm": 1.6137317831959943, + "learning_rate": 4.933688642652459e-05, + "loss": 0.2299, + "step": 6085 + }, + { + "epoch": 0.5191946766763351, + "grad_norm": 1.3147303840057354, + "learning_rate": 4.932307239591155e-05, + "loss": 0.1715, + "step": 6086 + }, + { + "epoch": 0.5192799863504521, + "grad_norm": 1.7054377260763331, + "learning_rate": 4.9309258416978346e-05, + "loss": 0.2944, + "step": 6087 + }, + { + "epoch": 0.5193652960245692, + "grad_norm": 1.9656520871963, + "learning_rate": 4.929544449077958e-05, + "loss": 0.2616, + "step": 6088 + }, + { + "epoch": 0.5194506056986863, + "grad_norm": 1.2889666434974225, + "learning_rate": 4.928163061836988e-05, + "loss": 0.2557, + "step": 6089 + }, + { + "epoch": 0.5195359153728033, + "grad_norm": 1.371358601347691, + "learning_rate": 4.926781680080386e-05, + "loss": 0.2045, + "step": 6090 + }, + { + "epoch": 0.5196212250469203, + "grad_norm": 1.7430685236204855, + "learning_rate": 4.925400303913616e-05, + "loss": 0.2495, + "step": 6091 + }, + { + "epoch": 0.5197065347210373, + "grad_norm": 1.3636359597696806, + "learning_rate": 4.924018933442135e-05, + "loss": 0.286, + "step": 6092 + }, + { + "epoch": 0.5197918443951544, + "grad_norm": 1.614711026300313, + "learning_rate": 4.9226375687714044e-05, + "loss": 0.1984, + "step": 6093 + }, + { + "epoch": 0.5198771540692715, + "grad_norm": 1.694520545598538, + "learning_rate": 4.921256210006886e-05, + "loss": 0.2353, + "step": 6094 + }, + { + "epoch": 0.5199624637433885, + "grad_norm": 1.297880449396946, + "learning_rate": 4.919874857254036e-05, + "loss": 0.1816, + "step": 6095 + }, + { + "epoch": 0.5200477734175055, + "grad_norm": 1.4235377388424557, + "learning_rate": 4.9184935106183156e-05, + "loss": 0.2687, + "step": 6096 + }, + { + "epoch": 0.5201330830916225, + "grad_norm": 1.4177748702573179, + "learning_rate": 4.917112170205183e-05, + "loss": 0.2374, + "step": 6097 + }, + { + "epoch": 0.5202183927657397, + "grad_norm": 1.4638264696847127, + "learning_rate": 4.9157308361200944e-05, + "loss": 0.2804, + "step": 6098 + }, + { + "epoch": 0.5203037024398567, + "grad_norm": 1.818011749506585, + "learning_rate": 4.914349508468508e-05, + "loss": 0.2532, + "step": 6099 + }, + { + "epoch": 0.5203890121139737, + "grad_norm": 1.6093887032231007, + "learning_rate": 4.912968187355885e-05, + "loss": 0.2231, + "step": 6100 + }, + { + "epoch": 0.5204743217880907, + "grad_norm": 2.00212353981233, + "learning_rate": 4.9115868728876745e-05, + "loss": 0.3009, + "step": 6101 + }, + { + "epoch": 0.5205596314622079, + "grad_norm": 1.6235488868075405, + "learning_rate": 4.9102055651693376e-05, + "loss": 0.2745, + "step": 6102 + }, + { + "epoch": 0.5206449411363249, + "grad_norm": 1.5797363967949731, + "learning_rate": 4.9088242643063304e-05, + "loss": 0.2927, + "step": 6103 + }, + { + "epoch": 0.5207302508104419, + "grad_norm": 1.6353844641493493, + "learning_rate": 4.907442970404104e-05, + "loss": 0.235, + "step": 6104 + }, + { + "epoch": 0.5208155604845589, + "grad_norm": 1.4678150204106328, + "learning_rate": 4.906061683568115e-05, + "loss": 0.2524, + "step": 6105 + }, + { + "epoch": 0.520900870158676, + "grad_norm": 1.4527966528290996, + "learning_rate": 4.904680403903818e-05, + "loss": 0.2632, + "step": 6106 + }, + { + "epoch": 0.520986179832793, + "grad_norm": 1.5271002143596424, + "learning_rate": 4.9032991315166674e-05, + "loss": 0.2146, + "step": 6107 + }, + { + "epoch": 0.5210714895069101, + "grad_norm": 1.4095778613200898, + "learning_rate": 4.9019178665121115e-05, + "loss": 0.2099, + "step": 6108 + }, + { + "epoch": 0.5211567991810271, + "grad_norm": 1.5981689973398394, + "learning_rate": 4.900536608995605e-05, + "loss": 0.2745, + "step": 6109 + }, + { + "epoch": 0.5212421088551442, + "grad_norm": 1.2812221803203567, + "learning_rate": 4.899155359072603e-05, + "loss": 0.219, + "step": 6110 + }, + { + "epoch": 0.5213274185292612, + "grad_norm": 1.5020161430720618, + "learning_rate": 4.897774116848551e-05, + "loss": 0.2128, + "step": 6111 + }, + { + "epoch": 0.5214127282033783, + "grad_norm": 1.4325700893864282, + "learning_rate": 4.896392882428901e-05, + "loss": 0.2502, + "step": 6112 + }, + { + "epoch": 0.5214980378774953, + "grad_norm": 1.4808836987026537, + "learning_rate": 4.8950116559191075e-05, + "loss": 0.2234, + "step": 6113 + }, + { + "epoch": 0.5215833475516124, + "grad_norm": 1.8308116375148855, + "learning_rate": 4.893630437424613e-05, + "loss": 0.2669, + "step": 6114 + }, + { + "epoch": 0.5216686572257294, + "grad_norm": 1.9278997255659223, + "learning_rate": 4.892249227050869e-05, + "loss": 0.2558, + "step": 6115 + }, + { + "epoch": 0.5217539668998464, + "grad_norm": 1.3183263703912695, + "learning_rate": 4.890868024903327e-05, + "loss": 0.2361, + "step": 6116 + }, + { + "epoch": 0.5218392765739635, + "grad_norm": 1.7238350984565003, + "learning_rate": 4.8894868310874296e-05, + "loss": 0.2342, + "step": 6117 + }, + { + "epoch": 0.5219245862480806, + "grad_norm": 1.6200517729739654, + "learning_rate": 4.888105645708625e-05, + "loss": 0.2242, + "step": 6118 + }, + { + "epoch": 0.5220098959221976, + "grad_norm": 1.6265154887651305, + "learning_rate": 4.886724468872362e-05, + "loss": 0.2842, + "step": 6119 + }, + { + "epoch": 0.5220952055963146, + "grad_norm": 1.4605549194081755, + "learning_rate": 4.8853433006840807e-05, + "loss": 0.3254, + "step": 6120 + }, + { + "epoch": 0.5221805152704316, + "grad_norm": 1.5671535621383752, + "learning_rate": 4.8839621412492296e-05, + "loss": 0.2741, + "step": 6121 + }, + { + "epoch": 0.5222658249445488, + "grad_norm": 1.716676536395888, + "learning_rate": 4.882580990673253e-05, + "loss": 0.2982, + "step": 6122 + }, + { + "epoch": 0.5223511346186658, + "grad_norm": 1.704314769971804, + "learning_rate": 4.881199849061595e-05, + "loss": 0.2652, + "step": 6123 + }, + { + "epoch": 0.5224364442927828, + "grad_norm": 1.9538519155092737, + "learning_rate": 4.879818716519696e-05, + "loss": 0.2323, + "step": 6124 + }, + { + "epoch": 0.5225217539668998, + "grad_norm": 1.235008270452197, + "learning_rate": 4.8784375931529984e-05, + "loss": 0.2406, + "step": 6125 + }, + { + "epoch": 0.522607063641017, + "grad_norm": 1.4620013112220154, + "learning_rate": 4.877056479066947e-05, + "loss": 0.2396, + "step": 6126 + }, + { + "epoch": 0.522692373315134, + "grad_norm": 1.7483489558731362, + "learning_rate": 4.875675374366979e-05, + "loss": 0.3093, + "step": 6127 + }, + { + "epoch": 0.522777682989251, + "grad_norm": 1.731339556309138, + "learning_rate": 4.874294279158535e-05, + "loss": 0.3176, + "step": 6128 + }, + { + "epoch": 0.522862992663368, + "grad_norm": 2.0483923958730914, + "learning_rate": 4.8729131935470565e-05, + "loss": 0.2167, + "step": 6129 + }, + { + "epoch": 0.522948302337485, + "grad_norm": 1.5851907710362856, + "learning_rate": 4.8715321176379787e-05, + "loss": 0.2411, + "step": 6130 + }, + { + "epoch": 0.5230336120116021, + "grad_norm": 1.6390909925226234, + "learning_rate": 4.870151051536741e-05, + "loss": 0.2166, + "step": 6131 + }, + { + "epoch": 0.5231189216857192, + "grad_norm": 1.5628301653123897, + "learning_rate": 4.868769995348781e-05, + "loss": 0.2269, + "step": 6132 + }, + { + "epoch": 0.5232042313598362, + "grad_norm": 1.1469904288014818, + "learning_rate": 4.8673889491795344e-05, + "loss": 0.1479, + "step": 6133 + }, + { + "epoch": 0.5232895410339532, + "grad_norm": 1.788229736012791, + "learning_rate": 4.866007913134435e-05, + "loss": 0.2768, + "step": 6134 + }, + { + "epoch": 0.5233748507080703, + "grad_norm": 1.465920340055998, + "learning_rate": 4.864626887318921e-05, + "loss": 0.2374, + "step": 6135 + }, + { + "epoch": 0.5234601603821873, + "grad_norm": 1.3362558442889911, + "learning_rate": 4.8632458718384266e-05, + "loss": 0.1952, + "step": 6136 + }, + { + "epoch": 0.5235454700563044, + "grad_norm": 1.5185019333682346, + "learning_rate": 4.861864866798381e-05, + "loss": 0.2708, + "step": 6137 + }, + { + "epoch": 0.5236307797304214, + "grad_norm": 1.7427016384849698, + "learning_rate": 4.8604838723042194e-05, + "loss": 0.2483, + "step": 6138 + }, + { + "epoch": 0.5237160894045385, + "grad_norm": 1.6988626041101862, + "learning_rate": 4.8591028884613745e-05, + "loss": 0.293, + "step": 6139 + }, + { + "epoch": 0.5238013990786555, + "grad_norm": 1.4086416886942472, + "learning_rate": 4.857721915375272e-05, + "loss": 0.2782, + "step": 6140 + }, + { + "epoch": 0.5238867087527725, + "grad_norm": 1.5293428650392416, + "learning_rate": 4.856340953151346e-05, + "loss": 0.343, + "step": 6141 + }, + { + "epoch": 0.5239720184268896, + "grad_norm": 2.050589067445999, + "learning_rate": 4.854960001895027e-05, + "loss": 0.2627, + "step": 6142 + }, + { + "epoch": 0.5240573281010067, + "grad_norm": 1.7553564248778608, + "learning_rate": 4.853579061711739e-05, + "loss": 0.2906, + "step": 6143 + }, + { + "epoch": 0.5241426377751237, + "grad_norm": 1.6601010302167556, + "learning_rate": 4.8521981327069117e-05, + "loss": 0.2795, + "step": 6144 + }, + { + "epoch": 0.5242279474492407, + "grad_norm": 1.538787817285007, + "learning_rate": 4.850817214985973e-05, + "loss": 0.2595, + "step": 6145 + }, + { + "epoch": 0.5243132571233577, + "grad_norm": 1.254760403730026, + "learning_rate": 4.849436308654346e-05, + "loss": 0.1885, + "step": 6146 + }, + { + "epoch": 0.5243985667974749, + "grad_norm": 1.559667690535751, + "learning_rate": 4.848055413817456e-05, + "loss": 0.2116, + "step": 6147 + }, + { + "epoch": 0.5244838764715919, + "grad_norm": 1.4644844937062012, + "learning_rate": 4.84667453058073e-05, + "loss": 0.1948, + "step": 6148 + }, + { + "epoch": 0.5245691861457089, + "grad_norm": 1.4498087460896478, + "learning_rate": 4.845293659049588e-05, + "loss": 0.2789, + "step": 6149 + }, + { + "epoch": 0.5246544958198259, + "grad_norm": 1.7339567762749295, + "learning_rate": 4.843912799329451e-05, + "loss": 0.3137, + "step": 6150 + }, + { + "epoch": 0.5247398054939431, + "grad_norm": 1.409375214841027, + "learning_rate": 4.842531951525744e-05, + "loss": 0.2047, + "step": 6151 + }, + { + "epoch": 0.5248251151680601, + "grad_norm": 1.7817036235153505, + "learning_rate": 4.841151115743887e-05, + "loss": 0.2818, + "step": 6152 + }, + { + "epoch": 0.5249104248421771, + "grad_norm": 1.881557632082661, + "learning_rate": 4.839770292089296e-05, + "loss": 0.2267, + "step": 6153 + }, + { + "epoch": 0.5249957345162941, + "grad_norm": 1.511935354087057, + "learning_rate": 4.8383894806673914e-05, + "loss": 0.2681, + "step": 6154 + }, + { + "epoch": 0.5250810441904112, + "grad_norm": 1.4361478001925159, + "learning_rate": 4.837008681583593e-05, + "loss": 0.2836, + "step": 6155 + }, + { + "epoch": 0.5251663538645283, + "grad_norm": 1.5417778731094067, + "learning_rate": 4.835627894943315e-05, + "loss": 0.2953, + "step": 6156 + }, + { + "epoch": 0.5252516635386453, + "grad_norm": 1.7179816175727467, + "learning_rate": 4.8342471208519726e-05, + "loss": 0.2848, + "step": 6157 + }, + { + "epoch": 0.5253369732127623, + "grad_norm": 1.7012950620492249, + "learning_rate": 4.832866359414984e-05, + "loss": 0.2535, + "step": 6158 + }, + { + "epoch": 0.5254222828868794, + "grad_norm": 1.7514815190130018, + "learning_rate": 4.83148561073776e-05, + "loss": 0.215, + "step": 6159 + }, + { + "epoch": 0.5255075925609964, + "grad_norm": 1.5333510017759198, + "learning_rate": 4.830104874925713e-05, + "loss": 0.2679, + "step": 6160 + }, + { + "epoch": 0.5255929022351135, + "grad_norm": 1.3580063693553115, + "learning_rate": 4.828724152084258e-05, + "loss": 0.2382, + "step": 6161 + }, + { + "epoch": 0.5256782119092305, + "grad_norm": 1.3755511999720356, + "learning_rate": 4.827343442318801e-05, + "loss": 0.2668, + "step": 6162 + }, + { + "epoch": 0.5257635215833476, + "grad_norm": 1.273802722442492, + "learning_rate": 4.8259627457347554e-05, + "loss": 0.2292, + "step": 6163 + }, + { + "epoch": 0.5258488312574646, + "grad_norm": 1.3262034259352071, + "learning_rate": 4.8245820624375314e-05, + "loss": 0.2325, + "step": 6164 + }, + { + "epoch": 0.5259341409315816, + "grad_norm": 1.7470967187906226, + "learning_rate": 4.823201392532533e-05, + "loss": 0.232, + "step": 6165 + }, + { + "epoch": 0.5260194506056987, + "grad_norm": 1.464671051017671, + "learning_rate": 4.821820736125168e-05, + "loss": 0.2149, + "step": 6166 + }, + { + "epoch": 0.5261047602798158, + "grad_norm": 1.7502093189940393, + "learning_rate": 4.8204400933208435e-05, + "loss": 0.2707, + "step": 6167 + }, + { + "epoch": 0.5261900699539328, + "grad_norm": 1.3110923939292023, + "learning_rate": 4.819059464224966e-05, + "loss": 0.2165, + "step": 6168 + }, + { + "epoch": 0.5262753796280498, + "grad_norm": 1.528862863586825, + "learning_rate": 4.8176788489429355e-05, + "loss": 0.2906, + "step": 6169 + }, + { + "epoch": 0.5263606893021668, + "grad_norm": 1.4260049722806853, + "learning_rate": 4.816298247580155e-05, + "loss": 0.2314, + "step": 6170 + }, + { + "epoch": 0.5264459989762839, + "grad_norm": 1.7609295822006166, + "learning_rate": 4.814917660242029e-05, + "loss": 0.2454, + "step": 6171 + }, + { + "epoch": 0.526531308650401, + "grad_norm": 1.2983806097280934, + "learning_rate": 4.8135370870339545e-05, + "loss": 0.1975, + "step": 6172 + }, + { + "epoch": 0.526616618324518, + "grad_norm": 1.3147316080777425, + "learning_rate": 4.8121565280613316e-05, + "loss": 0.201, + "step": 6173 + }, + { + "epoch": 0.526701927998635, + "grad_norm": 1.7784907067168425, + "learning_rate": 4.8107759834295627e-05, + "loss": 0.2465, + "step": 6174 + }, + { + "epoch": 0.526787237672752, + "grad_norm": 1.4576823643600176, + "learning_rate": 4.8093954532440396e-05, + "loss": 0.2623, + "step": 6175 + }, + { + "epoch": 0.5268725473468692, + "grad_norm": 1.3817990139722378, + "learning_rate": 4.808014937610161e-05, + "loss": 0.2435, + "step": 6176 + }, + { + "epoch": 0.5269578570209862, + "grad_norm": 1.6396155748828558, + "learning_rate": 4.806634436633323e-05, + "loss": 0.2507, + "step": 6177 + }, + { + "epoch": 0.5270431666951032, + "grad_norm": 1.345288659924601, + "learning_rate": 4.805253950418917e-05, + "loss": 0.2003, + "step": 6178 + }, + { + "epoch": 0.5271284763692202, + "grad_norm": 1.5540889948986003, + "learning_rate": 4.803873479072336e-05, + "loss": 0.2129, + "step": 6179 + }, + { + "epoch": 0.5272137860433374, + "grad_norm": 1.5090409250745338, + "learning_rate": 4.802493022698974e-05, + "loss": 0.2262, + "step": 6180 + }, + { + "epoch": 0.5272990957174544, + "grad_norm": 1.5804084649423393, + "learning_rate": 4.801112581404222e-05, + "loss": 0.2878, + "step": 6181 + }, + { + "epoch": 0.5273844053915714, + "grad_norm": 1.2522109505628325, + "learning_rate": 4.799732155293465e-05, + "loss": 0.2029, + "step": 6182 + }, + { + "epoch": 0.5274697150656884, + "grad_norm": 1.8471575138482876, + "learning_rate": 4.798351744472093e-05, + "loss": 0.2574, + "step": 6183 + }, + { + "epoch": 0.5275550247398055, + "grad_norm": 1.681705770438035, + "learning_rate": 4.796971349045497e-05, + "loss": 0.2827, + "step": 6184 + }, + { + "epoch": 0.5276403344139226, + "grad_norm": 1.5065144698457478, + "learning_rate": 4.795590969119057e-05, + "loss": 0.2782, + "step": 6185 + }, + { + "epoch": 0.5277256440880396, + "grad_norm": 1.7099450424910865, + "learning_rate": 4.79421060479816e-05, + "loss": 0.3372, + "step": 6186 + }, + { + "epoch": 0.5278109537621566, + "grad_norm": 1.7156755867259197, + "learning_rate": 4.792830256188192e-05, + "loss": 0.2366, + "step": 6187 + }, + { + "epoch": 0.5278962634362737, + "grad_norm": 1.8703456330770551, + "learning_rate": 4.791449923394532e-05, + "loss": 0.2927, + "step": 6188 + }, + { + "epoch": 0.5279815731103907, + "grad_norm": 1.6000552644722101, + "learning_rate": 4.7900696065225605e-05, + "loss": 0.2647, + "step": 6189 + }, + { + "epoch": 0.5280668827845078, + "grad_norm": 1.4176186373680277, + "learning_rate": 4.7886893056776616e-05, + "loss": 0.2473, + "step": 6190 + }, + { + "epoch": 0.5281521924586248, + "grad_norm": 1.521026424883168, + "learning_rate": 4.7873090209652106e-05, + "loss": 0.2469, + "step": 6191 + }, + { + "epoch": 0.5282375021327419, + "grad_norm": 1.4574883692491623, + "learning_rate": 4.7859287524905845e-05, + "loss": 0.2853, + "step": 6192 + }, + { + "epoch": 0.5283228118068589, + "grad_norm": 1.7159452268044053, + "learning_rate": 4.784548500359162e-05, + "loss": 0.2637, + "step": 6193 + }, + { + "epoch": 0.5284081214809759, + "grad_norm": 2.1023906224815336, + "learning_rate": 4.7831682646763156e-05, + "loss": 0.2468, + "step": 6194 + }, + { + "epoch": 0.528493431155093, + "grad_norm": 2.118998748670913, + "learning_rate": 4.781788045547419e-05, + "loss": 0.313, + "step": 6195 + }, + { + "epoch": 0.5285787408292101, + "grad_norm": 1.657717792131951, + "learning_rate": 4.780407843077846e-05, + "loss": 0.2301, + "step": 6196 + }, + { + "epoch": 0.5286640505033271, + "grad_norm": 1.2867503381776486, + "learning_rate": 4.77902765737297e-05, + "loss": 0.2585, + "step": 6197 + }, + { + "epoch": 0.5287493601774441, + "grad_norm": 1.5041204919182418, + "learning_rate": 4.777647488538155e-05, + "loss": 0.2604, + "step": 6198 + }, + { + "epoch": 0.5288346698515611, + "grad_norm": 1.742302176631489, + "learning_rate": 4.7762673366787725e-05, + "loss": 0.2333, + "step": 6199 + }, + { + "epoch": 0.5289199795256783, + "grad_norm": 1.9217684336324456, + "learning_rate": 4.774887201900193e-05, + "loss": 0.3188, + "step": 6200 + }, + { + "epoch": 0.5290052891997953, + "grad_norm": 1.5539420943156028, + "learning_rate": 4.773507084307778e-05, + "loss": 0.2298, + "step": 6201 + }, + { + "epoch": 0.5290905988739123, + "grad_norm": 1.4758922529099836, + "learning_rate": 4.772126984006892e-05, + "loss": 0.1886, + "step": 6202 + }, + { + "epoch": 0.5291759085480293, + "grad_norm": 1.1294457225684957, + "learning_rate": 4.770746901102902e-05, + "loss": 0.2775, + "step": 6203 + }, + { + "epoch": 0.5292612182221464, + "grad_norm": 1.5850396080964726, + "learning_rate": 4.769366835701166e-05, + "loss": 0.1845, + "step": 6204 + }, + { + "epoch": 0.5293465278962635, + "grad_norm": 1.6588566259568538, + "learning_rate": 4.7679867879070465e-05, + "loss": 0.2534, + "step": 6205 + }, + { + "epoch": 0.5294318375703805, + "grad_norm": 1.5553106084853878, + "learning_rate": 4.7666067578259057e-05, + "loss": 0.2239, + "step": 6206 + }, + { + "epoch": 0.5295171472444975, + "grad_norm": 1.355576656564696, + "learning_rate": 4.7652267455630956e-05, + "loss": 0.297, + "step": 6207 + }, + { + "epoch": 0.5296024569186145, + "grad_norm": 1.832410565701018, + "learning_rate": 4.763846751223976e-05, + "loss": 0.2916, + "step": 6208 + }, + { + "epoch": 0.5296877665927316, + "grad_norm": 1.5135882348363474, + "learning_rate": 4.762466774913905e-05, + "loss": 0.2159, + "step": 6209 + }, + { + "epoch": 0.5297730762668487, + "grad_norm": 1.7183779920786575, + "learning_rate": 4.76108681673823e-05, + "loss": 0.2179, + "step": 6210 + }, + { + "epoch": 0.5298583859409657, + "grad_norm": 2.0042514912610563, + "learning_rate": 4.759706876802308e-05, + "loss": 0.2934, + "step": 6211 + }, + { + "epoch": 0.5299436956150827, + "grad_norm": 1.7089221528851402, + "learning_rate": 4.75832695521149e-05, + "loss": 0.1611, + "step": 6212 + }, + { + "epoch": 0.5300290052891998, + "grad_norm": 2.0337000229596867, + "learning_rate": 4.756947052071125e-05, + "loss": 0.3227, + "step": 6213 + }, + { + "epoch": 0.5301143149633168, + "grad_norm": 1.8898227975920991, + "learning_rate": 4.755567167486559e-05, + "loss": 0.2398, + "step": 6214 + }, + { + "epoch": 0.5301996246374339, + "grad_norm": 1.6778240424143012, + "learning_rate": 4.7541873015631416e-05, + "loss": 0.2282, + "step": 6215 + }, + { + "epoch": 0.5302849343115509, + "grad_norm": 1.2823988718089747, + "learning_rate": 4.752807454406219e-05, + "loss": 0.1691, + "step": 6216 + }, + { + "epoch": 0.530370243985668, + "grad_norm": 1.6766422078577665, + "learning_rate": 4.751427626121132e-05, + "loss": 0.2339, + "step": 6217 + }, + { + "epoch": 0.530455553659785, + "grad_norm": 1.520322618127382, + "learning_rate": 4.750047816813224e-05, + "loss": 0.2223, + "step": 6218 + }, + { + "epoch": 0.530540863333902, + "grad_norm": 1.5628704394863053, + "learning_rate": 4.74866802658784e-05, + "loss": 0.2294, + "step": 6219 + }, + { + "epoch": 0.5306261730080191, + "grad_norm": 1.4412295845749565, + "learning_rate": 4.747288255550315e-05, + "loss": 0.293, + "step": 6220 + }, + { + "epoch": 0.5307114826821362, + "grad_norm": 1.5927933458385641, + "learning_rate": 4.7459085038059874e-05, + "loss": 0.2512, + "step": 6221 + }, + { + "epoch": 0.5307967923562532, + "grad_norm": 1.8644891498441267, + "learning_rate": 4.744528771460198e-05, + "loss": 0.2946, + "step": 6222 + }, + { + "epoch": 0.5308821020303702, + "grad_norm": 1.2814571864388054, + "learning_rate": 4.743149058618278e-05, + "loss": 0.2113, + "step": 6223 + }, + { + "epoch": 0.5309674117044872, + "grad_norm": 1.8095336844478378, + "learning_rate": 4.741769365385562e-05, + "loss": 0.2737, + "step": 6224 + }, + { + "epoch": 0.5310527213786044, + "grad_norm": 1.8309208284404572, + "learning_rate": 4.740389691867382e-05, + "loss": 0.2749, + "step": 6225 + }, + { + "epoch": 0.5311380310527214, + "grad_norm": 1.7989752395562928, + "learning_rate": 4.7390100381690724e-05, + "loss": 0.2696, + "step": 6226 + }, + { + "epoch": 0.5312233407268384, + "grad_norm": 1.2526231421965155, + "learning_rate": 4.737630404395956e-05, + "loss": 0.2179, + "step": 6227 + }, + { + "epoch": 0.5313086504009554, + "grad_norm": 1.2459008716320867, + "learning_rate": 4.7362507906533656e-05, + "loss": 0.2064, + "step": 6228 + }, + { + "epoch": 0.5313939600750726, + "grad_norm": 1.3112692284787892, + "learning_rate": 4.734871197046627e-05, + "loss": 0.2477, + "step": 6229 + }, + { + "epoch": 0.5314792697491896, + "grad_norm": 1.5380694288548569, + "learning_rate": 4.7334916236810615e-05, + "loss": 0.1747, + "step": 6230 + }, + { + "epoch": 0.5315645794233066, + "grad_norm": 1.4903711578670276, + "learning_rate": 4.732112070661994e-05, + "loss": 0.2048, + "step": 6231 + }, + { + "epoch": 0.5316498890974236, + "grad_norm": 1.5335750444605531, + "learning_rate": 4.730732538094749e-05, + "loss": 0.2614, + "step": 6232 + }, + { + "epoch": 0.5317351987715407, + "grad_norm": 1.768784658139627, + "learning_rate": 4.729353026084643e-05, + "loss": 0.2791, + "step": 6233 + }, + { + "epoch": 0.5318205084456578, + "grad_norm": 1.5811122909711872, + "learning_rate": 4.727973534736994e-05, + "loss": 0.2223, + "step": 6234 + }, + { + "epoch": 0.5319058181197748, + "grad_norm": 1.9097113530013448, + "learning_rate": 4.726594064157122e-05, + "loss": 0.1983, + "step": 6235 + }, + { + "epoch": 0.5319911277938918, + "grad_norm": 1.386047415691551, + "learning_rate": 4.725214614450339e-05, + "loss": 0.2765, + "step": 6236 + }, + { + "epoch": 0.5320764374680089, + "grad_norm": 1.4857691592258995, + "learning_rate": 4.7238351857219587e-05, + "loss": 0.1964, + "step": 6237 + }, + { + "epoch": 0.5321617471421259, + "grad_norm": 1.4774770175288852, + "learning_rate": 4.722455778077297e-05, + "loss": 0.2354, + "step": 6238 + }, + { + "epoch": 0.532247056816243, + "grad_norm": 1.6916631983578962, + "learning_rate": 4.72107639162166e-05, + "loss": 0.2093, + "step": 6239 + }, + { + "epoch": 0.53233236649036, + "grad_norm": 1.4168518543750768, + "learning_rate": 4.719697026460358e-05, + "loss": 0.1993, + "step": 6240 + }, + { + "epoch": 0.5324176761644771, + "grad_norm": 1.5536269207769968, + "learning_rate": 4.7183176826986984e-05, + "loss": 0.2929, + "step": 6241 + }, + { + "epoch": 0.5325029858385941, + "grad_norm": 1.5655713222537888, + "learning_rate": 4.716938360441989e-05, + "loss": 0.2603, + "step": 6242 + }, + { + "epoch": 0.5325882955127111, + "grad_norm": 1.7088162584720763, + "learning_rate": 4.715559059795531e-05, + "loss": 0.2414, + "step": 6243 + }, + { + "epoch": 0.5326736051868282, + "grad_norm": 2.178908998659301, + "learning_rate": 4.714179780864626e-05, + "loss": 0.2228, + "step": 6244 + }, + { + "epoch": 0.5327589148609452, + "grad_norm": 1.824283051429629, + "learning_rate": 4.712800523754578e-05, + "loss": 0.2427, + "step": 6245 + }, + { + "epoch": 0.5328442245350623, + "grad_norm": 1.4670554592204301, + "learning_rate": 4.7114212885706815e-05, + "loss": 0.2167, + "step": 6246 + }, + { + "epoch": 0.5329295342091793, + "grad_norm": 1.792725058223975, + "learning_rate": 4.710042075418236e-05, + "loss": 0.1961, + "step": 6247 + }, + { + "epoch": 0.5330148438832963, + "grad_norm": 1.3964312349892671, + "learning_rate": 4.70866288440254e-05, + "loss": 0.2137, + "step": 6248 + }, + { + "epoch": 0.5331001535574134, + "grad_norm": 1.4480511262150555, + "learning_rate": 4.7072837156288816e-05, + "loss": 0.2536, + "step": 6249 + }, + { + "epoch": 0.5331854632315305, + "grad_norm": 1.9367513132993224, + "learning_rate": 4.7059045692025567e-05, + "loss": 0.2101, + "step": 6250 + }, + { + "epoch": 0.5332707729056475, + "grad_norm": 1.865933879285937, + "learning_rate": 4.7045254452288566e-05, + "loss": 0.194, + "step": 6251 + }, + { + "epoch": 0.5333560825797645, + "grad_norm": 1.5941369578013902, + "learning_rate": 4.7031463438130676e-05, + "loss": 0.211, + "step": 6252 + }, + { + "epoch": 0.5334413922538815, + "grad_norm": 1.8773865452403864, + "learning_rate": 4.7017672650604766e-05, + "loss": 0.262, + "step": 6253 + }, + { + "epoch": 0.5335267019279987, + "grad_norm": 1.6904972444166027, + "learning_rate": 4.700388209076373e-05, + "loss": 0.2152, + "step": 6254 + }, + { + "epoch": 0.5336120116021157, + "grad_norm": 1.9724146931118196, + "learning_rate": 4.699009175966036e-05, + "loss": 0.273, + "step": 6255 + }, + { + "epoch": 0.5336973212762327, + "grad_norm": 1.7826604779996043, + "learning_rate": 4.6976301658347484e-05, + "loss": 0.2203, + "step": 6256 + }, + { + "epoch": 0.5337826309503497, + "grad_norm": 1.4576760672850535, + "learning_rate": 4.69625117878779e-05, + "loss": 0.3006, + "step": 6257 + }, + { + "epoch": 0.5338679406244669, + "grad_norm": 1.6257986893605296, + "learning_rate": 4.6948722149304424e-05, + "loss": 0.2073, + "step": 6258 + }, + { + "epoch": 0.5339532502985839, + "grad_norm": 1.6046737596022, + "learning_rate": 4.693493274367978e-05, + "loss": 0.1792, + "step": 6259 + }, + { + "epoch": 0.5340385599727009, + "grad_norm": 1.8200784295338872, + "learning_rate": 4.692114357205673e-05, + "loss": 0.2029, + "step": 6260 + }, + { + "epoch": 0.5341238696468179, + "grad_norm": 1.737913628612647, + "learning_rate": 4.690735463548803e-05, + "loss": 0.1846, + "step": 6261 + }, + { + "epoch": 0.534209179320935, + "grad_norm": 1.8298027621220965, + "learning_rate": 4.6893565935026354e-05, + "loss": 0.2577, + "step": 6262 + }, + { + "epoch": 0.534294488995052, + "grad_norm": 2.2139526942987677, + "learning_rate": 4.68797774717244e-05, + "loss": 0.3097, + "step": 6263 + }, + { + "epoch": 0.5343797986691691, + "grad_norm": 1.7098212236790604, + "learning_rate": 4.6865989246634884e-05, + "loss": 0.2876, + "step": 6264 + }, + { + "epoch": 0.5344651083432861, + "grad_norm": 1.8093374877625243, + "learning_rate": 4.685220126081042e-05, + "loss": 0.2715, + "step": 6265 + }, + { + "epoch": 0.5345504180174032, + "grad_norm": 1.67488333737066, + "learning_rate": 4.683841351530365e-05, + "loss": 0.2547, + "step": 6266 + }, + { + "epoch": 0.5346357276915202, + "grad_norm": 1.4824205883558317, + "learning_rate": 4.682462601116724e-05, + "loss": 0.3152, + "step": 6267 + }, + { + "epoch": 0.5347210373656373, + "grad_norm": 1.6720191679499201, + "learning_rate": 4.681083874945373e-05, + "loss": 0.3191, + "step": 6268 + }, + { + "epoch": 0.5348063470397543, + "grad_norm": 1.270265290719166, + "learning_rate": 4.679705173121573e-05, + "loss": 0.2237, + "step": 6269 + }, + { + "epoch": 0.5348916567138714, + "grad_norm": 1.7475462468348757, + "learning_rate": 4.678326495750584e-05, + "loss": 0.296, + "step": 6270 + }, + { + "epoch": 0.5349769663879884, + "grad_norm": 1.461782935000973, + "learning_rate": 4.676947842937655e-05, + "loss": 0.2018, + "step": 6271 + }, + { + "epoch": 0.5350622760621054, + "grad_norm": 1.7935471609448481, + "learning_rate": 4.6755692147880414e-05, + "loss": 0.2631, + "step": 6272 + }, + { + "epoch": 0.5351475857362225, + "grad_norm": 1.6665629195665754, + "learning_rate": 4.674190611406995e-05, + "loss": 0.2385, + "step": 6273 + }, + { + "epoch": 0.5352328954103396, + "grad_norm": 1.7529831392007231, + "learning_rate": 4.672812032899765e-05, + "loss": 0.2157, + "step": 6274 + }, + { + "epoch": 0.5353182050844566, + "grad_norm": 1.7111141165931256, + "learning_rate": 4.671433479371598e-05, + "loss": 0.2555, + "step": 6275 + }, + { + "epoch": 0.5354035147585736, + "grad_norm": 1.2568841197300917, + "learning_rate": 4.670054950927737e-05, + "loss": 0.2597, + "step": 6276 + }, + { + "epoch": 0.5354888244326906, + "grad_norm": 1.3885449916211225, + "learning_rate": 4.668676447673429e-05, + "loss": 0.2156, + "step": 6277 + }, + { + "epoch": 0.5355741341068078, + "grad_norm": 1.864357243759304, + "learning_rate": 4.667297969713912e-05, + "loss": 0.2103, + "step": 6278 + }, + { + "epoch": 0.5356594437809248, + "grad_norm": 1.6149768554871475, + "learning_rate": 4.6659195171544265e-05, + "loss": 0.2486, + "step": 6279 + }, + { + "epoch": 0.5357447534550418, + "grad_norm": 1.5201743365997915, + "learning_rate": 4.664541090100213e-05, + "loss": 0.2181, + "step": 6280 + }, + { + "epoch": 0.5358300631291588, + "grad_norm": 1.5358822879588854, + "learning_rate": 4.6631626886565026e-05, + "loss": 0.2447, + "step": 6281 + }, + { + "epoch": 0.535915372803276, + "grad_norm": 1.591446052693981, + "learning_rate": 4.661784312928531e-05, + "loss": 0.2137, + "step": 6282 + }, + { + "epoch": 0.536000682477393, + "grad_norm": 2.1280831802001035, + "learning_rate": 4.6604059630215326e-05, + "loss": 0.271, + "step": 6283 + }, + { + "epoch": 0.53608599215151, + "grad_norm": 1.5153869756615432, + "learning_rate": 4.659027639040731e-05, + "loss": 0.2874, + "step": 6284 + }, + { + "epoch": 0.536171301825627, + "grad_norm": 1.360139653664992, + "learning_rate": 4.657649341091358e-05, + "loss": 0.2464, + "step": 6285 + }, + { + "epoch": 0.536256611499744, + "grad_norm": 1.4019159711207063, + "learning_rate": 4.656271069278639e-05, + "loss": 0.2242, + "step": 6286 + }, + { + "epoch": 0.5363419211738611, + "grad_norm": 1.5844137454138973, + "learning_rate": 4.654892823707799e-05, + "loss": 0.2191, + "step": 6287 + }, + { + "epoch": 0.5364272308479782, + "grad_norm": 1.5989555168334144, + "learning_rate": 4.6535146044840556e-05, + "loss": 0.2131, + "step": 6288 + }, + { + "epoch": 0.5365125405220952, + "grad_norm": 1.841499960015592, + "learning_rate": 4.6521364117126306e-05, + "loss": 0.2326, + "step": 6289 + }, + { + "epoch": 0.5365978501962122, + "grad_norm": 1.692845531674437, + "learning_rate": 4.650758245498744e-05, + "loss": 0.2653, + "step": 6290 + }, + { + "epoch": 0.5366831598703293, + "grad_norm": 1.742069804916036, + "learning_rate": 4.649380105947608e-05, + "loss": 0.253, + "step": 6291 + }, + { + "epoch": 0.5367684695444463, + "grad_norm": 1.7310783053205916, + "learning_rate": 4.648001993164438e-05, + "loss": 0.2779, + "step": 6292 + }, + { + "epoch": 0.5368537792185634, + "grad_norm": 1.6049582604896049, + "learning_rate": 4.646623907254447e-05, + "loss": 0.1668, + "step": 6293 + }, + { + "epoch": 0.5369390888926804, + "grad_norm": 1.6332005673899193, + "learning_rate": 4.64524584832284e-05, + "loss": 0.2817, + "step": 6294 + }, + { + "epoch": 0.5370243985667975, + "grad_norm": 1.4845704602498775, + "learning_rate": 4.643867816474828e-05, + "loss": 0.1675, + "step": 6295 + }, + { + "epoch": 0.5371097082409145, + "grad_norm": 1.6598368258880807, + "learning_rate": 4.642489811815618e-05, + "loss": 0.2148, + "step": 6296 + }, + { + "epoch": 0.5371950179150315, + "grad_norm": 1.5011657317663951, + "learning_rate": 4.64111183445041e-05, + "loss": 0.1912, + "step": 6297 + }, + { + "epoch": 0.5372803275891486, + "grad_norm": 1.7215355235569227, + "learning_rate": 4.639733884484405e-05, + "loss": 0.1995, + "step": 6298 + }, + { + "epoch": 0.5373656372632657, + "grad_norm": 1.4846571804669126, + "learning_rate": 4.638355962022805e-05, + "loss": 0.2907, + "step": 6299 + }, + { + "epoch": 0.5374509469373827, + "grad_norm": 1.7477090690315067, + "learning_rate": 4.636978067170805e-05, + "loss": 0.2691, + "step": 6300 + }, + { + "epoch": 0.5375362566114997, + "grad_norm": 1.512871827378863, + "learning_rate": 4.6356002000335994e-05, + "loss": 0.2653, + "step": 6301 + }, + { + "epoch": 0.5376215662856167, + "grad_norm": 2.3946552088951187, + "learning_rate": 4.634222360716382e-05, + "loss": 0.2477, + "step": 6302 + }, + { + "epoch": 0.5377068759597339, + "grad_norm": 1.6605983010968037, + "learning_rate": 4.632844549324346e-05, + "loss": 0.2689, + "step": 6303 + }, + { + "epoch": 0.5377921856338509, + "grad_norm": 1.9297743557676548, + "learning_rate": 4.6314667659626754e-05, + "loss": 0.2862, + "step": 6304 + }, + { + "epoch": 0.5378774953079679, + "grad_norm": 1.4427813661821451, + "learning_rate": 4.630089010736559e-05, + "loss": 0.236, + "step": 6305 + }, + { + "epoch": 0.5379628049820849, + "grad_norm": 1.5330594329560696, + "learning_rate": 4.628711283751183e-05, + "loss": 0.2726, + "step": 6306 + }, + { + "epoch": 0.5380481146562021, + "grad_norm": 1.54397534721834, + "learning_rate": 4.6273335851117264e-05, + "loss": 0.2237, + "step": 6307 + }, + { + "epoch": 0.5381334243303191, + "grad_norm": 2.1682408067615455, + "learning_rate": 4.6259559149233695e-05, + "loss": 0.2704, + "step": 6308 + }, + { + "epoch": 0.5382187340044361, + "grad_norm": 1.484688414563552, + "learning_rate": 4.6245782732912924e-05, + "loss": 0.2534, + "step": 6309 + }, + { + "epoch": 0.5383040436785531, + "grad_norm": 1.520986139990504, + "learning_rate": 4.623200660320667e-05, + "loss": 0.2755, + "step": 6310 + }, + { + "epoch": 0.5383893533526702, + "grad_norm": 1.9342681017104204, + "learning_rate": 4.621823076116669e-05, + "loss": 0.2547, + "step": 6311 + }, + { + "epoch": 0.5384746630267873, + "grad_norm": 1.5629241367704976, + "learning_rate": 4.620445520784472e-05, + "loss": 0.2529, + "step": 6312 + }, + { + "epoch": 0.5385599727009043, + "grad_norm": 1.2065422277320923, + "learning_rate": 4.6190679944292395e-05, + "loss": 0.1982, + "step": 6313 + }, + { + "epoch": 0.5386452823750213, + "grad_norm": 1.6226534406759805, + "learning_rate": 4.6176904971561426e-05, + "loss": 0.2558, + "step": 6314 + }, + { + "epoch": 0.5387305920491384, + "grad_norm": 1.9002863668167027, + "learning_rate": 4.616313029070346e-05, + "loss": 0.2732, + "step": 6315 + }, + { + "epoch": 0.5388159017232554, + "grad_norm": 1.4971052689524118, + "learning_rate": 4.6149355902770086e-05, + "loss": 0.2719, + "step": 6316 + }, + { + "epoch": 0.5389012113973725, + "grad_norm": 1.6939282369264317, + "learning_rate": 4.613558180881294e-05, + "loss": 0.2126, + "step": 6317 + }, + { + "epoch": 0.5389865210714895, + "grad_norm": 1.4897957052436508, + "learning_rate": 4.612180800988358e-05, + "loss": 0.2453, + "step": 6318 + }, + { + "epoch": 0.5390718307456066, + "grad_norm": 1.28020267014561, + "learning_rate": 4.610803450703358e-05, + "loss": 0.2544, + "step": 6319 + }, + { + "epoch": 0.5391571404197236, + "grad_norm": 1.38703033815083, + "learning_rate": 4.609426130131445e-05, + "loss": 0.241, + "step": 6320 + }, + { + "epoch": 0.5392424500938406, + "grad_norm": 1.5110385359994598, + "learning_rate": 4.608048839377772e-05, + "loss": 0.2619, + "step": 6321 + }, + { + "epoch": 0.5393277597679577, + "grad_norm": 1.6165206935040626, + "learning_rate": 4.606671578547488e-05, + "loss": 0.2704, + "step": 6322 + }, + { + "epoch": 0.5394130694420747, + "grad_norm": 1.4679895420845326, + "learning_rate": 4.605294347745738e-05, + "loss": 0.2894, + "step": 6323 + }, + { + "epoch": 0.5394983791161918, + "grad_norm": 1.6049670249902808, + "learning_rate": 4.603917147077666e-05, + "loss": 0.218, + "step": 6324 + }, + { + "epoch": 0.5395836887903088, + "grad_norm": 1.481889269698551, + "learning_rate": 4.602539976648418e-05, + "loss": 0.2236, + "step": 6325 + }, + { + "epoch": 0.5396689984644258, + "grad_norm": 1.4290856934380993, + "learning_rate": 4.601162836563128e-05, + "loss": 0.2493, + "step": 6326 + }, + { + "epoch": 0.5397543081385429, + "grad_norm": 1.4661275298995797, + "learning_rate": 4.599785726926936e-05, + "loss": 0.1299, + "step": 6327 + }, + { + "epoch": 0.53983961781266, + "grad_norm": 1.3267952938280432, + "learning_rate": 4.598408647844979e-05, + "loss": 0.2335, + "step": 6328 + }, + { + "epoch": 0.539924927486777, + "grad_norm": 1.9704784495857808, + "learning_rate": 4.597031599422388e-05, + "loss": 0.239, + "step": 6329 + }, + { + "epoch": 0.540010237160894, + "grad_norm": 1.4636307640603994, + "learning_rate": 4.5956545817642906e-05, + "loss": 0.2218, + "step": 6330 + }, + { + "epoch": 0.540095546835011, + "grad_norm": 1.6621775092068347, + "learning_rate": 4.594277594975818e-05, + "loss": 0.2233, + "step": 6331 + }, + { + "epoch": 0.5401808565091282, + "grad_norm": 1.4085520234065982, + "learning_rate": 4.5929006391620963e-05, + "loss": 0.2342, + "step": 6332 + }, + { + "epoch": 0.5402661661832452, + "grad_norm": 1.5713107603672012, + "learning_rate": 4.591523714428246e-05, + "loss": 0.2669, + "step": 6333 + }, + { + "epoch": 0.5403514758573622, + "grad_norm": 1.8720805009177635, + "learning_rate": 4.590146820879389e-05, + "loss": 0.1931, + "step": 6334 + }, + { + "epoch": 0.5404367855314792, + "grad_norm": 1.6595336464657267, + "learning_rate": 4.5887699586206466e-05, + "loss": 0.3055, + "step": 6335 + }, + { + "epoch": 0.5405220952055964, + "grad_norm": 2.2043981458218327, + "learning_rate": 4.58739312775713e-05, + "loss": 0.3112, + "step": 6336 + }, + { + "epoch": 0.5406074048797134, + "grad_norm": 1.697155140045162, + "learning_rate": 4.586016328393956e-05, + "loss": 0.2377, + "step": 6337 + }, + { + "epoch": 0.5406927145538304, + "grad_norm": 1.4914078997707285, + "learning_rate": 4.584639560636238e-05, + "loss": 0.189, + "step": 6338 + }, + { + "epoch": 0.5407780242279474, + "grad_norm": 1.8696059521720332, + "learning_rate": 4.583262824589081e-05, + "loss": 0.2283, + "step": 6339 + }, + { + "epoch": 0.5408633339020645, + "grad_norm": 1.631575412374012, + "learning_rate": 4.5818861203575915e-05, + "loss": 0.2611, + "step": 6340 + }, + { + "epoch": 0.5409486435761816, + "grad_norm": 1.8819384306951943, + "learning_rate": 4.580509448046877e-05, + "loss": 0.2961, + "step": 6341 + }, + { + "epoch": 0.5410339532502986, + "grad_norm": 1.663694385771265, + "learning_rate": 4.5791328077620344e-05, + "loss": 0.1938, + "step": 6342 + }, + { + "epoch": 0.5411192629244156, + "grad_norm": 1.3941684111960275, + "learning_rate": 4.5777561996081656e-05, + "loss": 0.2107, + "step": 6343 + }, + { + "epoch": 0.5412045725985327, + "grad_norm": 1.7595766158641537, + "learning_rate": 4.57637962369037e-05, + "loss": 0.2122, + "step": 6344 + }, + { + "epoch": 0.5412898822726497, + "grad_norm": 1.4014023620231695, + "learning_rate": 4.5750030801137364e-05, + "loss": 0.1789, + "step": 6345 + }, + { + "epoch": 0.5413751919467668, + "grad_norm": 1.703718029682232, + "learning_rate": 4.573626568983359e-05, + "loss": 0.2113, + "step": 6346 + }, + { + "epoch": 0.5414605016208838, + "grad_norm": 1.5573812945886498, + "learning_rate": 4.572250090404328e-05, + "loss": 0.2221, + "step": 6347 + }, + { + "epoch": 0.5415458112950009, + "grad_norm": 1.5731857355977883, + "learning_rate": 4.5708736444817316e-05, + "loss": 0.203, + "step": 6348 + }, + { + "epoch": 0.5416311209691179, + "grad_norm": 1.7151441723694656, + "learning_rate": 4.5694972313206504e-05, + "loss": 0.3025, + "step": 6349 + }, + { + "epoch": 0.5417164306432349, + "grad_norm": 1.7873205508342163, + "learning_rate": 4.568120851026167e-05, + "loss": 0.2401, + "step": 6350 + }, + { + "epoch": 0.541801740317352, + "grad_norm": 1.5605995064311877, + "learning_rate": 4.5667445037033635e-05, + "loss": 0.2862, + "step": 6351 + }, + { + "epoch": 0.5418870499914691, + "grad_norm": 1.72401136180382, + "learning_rate": 4.565368189457313e-05, + "loss": 0.232, + "step": 6352 + }, + { + "epoch": 0.5419723596655861, + "grad_norm": 1.953300529221885, + "learning_rate": 4.563991908393092e-05, + "loss": 0.3165, + "step": 6353 + }, + { + "epoch": 0.5420576693397031, + "grad_norm": 1.7326669616395631, + "learning_rate": 4.5626156606157736e-05, + "loss": 0.2707, + "step": 6354 + }, + { + "epoch": 0.5421429790138201, + "grad_norm": 1.2962120383873883, + "learning_rate": 4.5612394462304234e-05, + "loss": 0.2499, + "step": 6355 + }, + { + "epoch": 0.5422282886879373, + "grad_norm": 1.4851168736149685, + "learning_rate": 4.559863265342109e-05, + "loss": 0.2267, + "step": 6356 + }, + { + "epoch": 0.5423135983620543, + "grad_norm": 1.3558058086024927, + "learning_rate": 4.558487118055898e-05, + "loss": 0.2991, + "step": 6357 + }, + { + "epoch": 0.5423989080361713, + "grad_norm": 2.0012888331937413, + "learning_rate": 4.557111004476848e-05, + "loss": 0.3495, + "step": 6358 + }, + { + "epoch": 0.5424842177102883, + "grad_norm": 1.2752671934164177, + "learning_rate": 4.55573492471002e-05, + "loss": 0.1808, + "step": 6359 + }, + { + "epoch": 0.5425695273844053, + "grad_norm": 2.07768348076415, + "learning_rate": 4.554358878860469e-05, + "loss": 0.259, + "step": 6360 + }, + { + "epoch": 0.5426548370585225, + "grad_norm": 1.4722661885508175, + "learning_rate": 4.55298286703325e-05, + "loss": 0.2036, + "step": 6361 + }, + { + "epoch": 0.5427401467326395, + "grad_norm": 1.716712993009085, + "learning_rate": 4.551606889333412e-05, + "loss": 0.2514, + "step": 6362 + }, + { + "epoch": 0.5428254564067565, + "grad_norm": 2.0167256979801578, + "learning_rate": 4.550230945866006e-05, + "loss": 0.2848, + "step": 6363 + }, + { + "epoch": 0.5429107660808735, + "grad_norm": 1.7453001126060652, + "learning_rate": 4.548855036736079e-05, + "loss": 0.2329, + "step": 6364 + }, + { + "epoch": 0.5429960757549906, + "grad_norm": 1.5781272661551644, + "learning_rate": 4.5474791620486703e-05, + "loss": 0.1726, + "step": 6365 + }, + { + "epoch": 0.5430813854291077, + "grad_norm": 1.2287020640370407, + "learning_rate": 4.546103321908823e-05, + "loss": 0.2235, + "step": 6366 + }, + { + "epoch": 0.5431666951032247, + "grad_norm": 1.7332037330129442, + "learning_rate": 4.5447275164215774e-05, + "loss": 0.2713, + "step": 6367 + }, + { + "epoch": 0.5432520047773417, + "grad_norm": 1.4969928320774892, + "learning_rate": 4.543351745691964e-05, + "loss": 0.1988, + "step": 6368 + }, + { + "epoch": 0.5433373144514588, + "grad_norm": 1.366330690295891, + "learning_rate": 4.541976009825019e-05, + "loss": 0.2641, + "step": 6369 + }, + { + "epoch": 0.5434226241255758, + "grad_norm": 1.7539509722948676, + "learning_rate": 4.540600308925774e-05, + "loss": 0.2, + "step": 6370 + }, + { + "epoch": 0.5435079337996929, + "grad_norm": 1.6561739922123124, + "learning_rate": 4.5392246430992517e-05, + "loss": 0.1823, + "step": 6371 + }, + { + "epoch": 0.5435932434738099, + "grad_norm": 1.5402689431413628, + "learning_rate": 4.5378490124504796e-05, + "loss": 0.172, + "step": 6372 + }, + { + "epoch": 0.543678553147927, + "grad_norm": 1.7943191263447948, + "learning_rate": 4.5364734170844807e-05, + "loss": 0.268, + "step": 6373 + }, + { + "epoch": 0.543763862822044, + "grad_norm": 1.867140765383357, + "learning_rate": 4.535097857106272e-05, + "loss": 0.2766, + "step": 6374 + }, + { + "epoch": 0.543849172496161, + "grad_norm": 1.8762811734165883, + "learning_rate": 4.5337223326208705e-05, + "loss": 0.2217, + "step": 6375 + }, + { + "epoch": 0.5439344821702781, + "grad_norm": 1.490031817307723, + "learning_rate": 4.5323468437332916e-05, + "loss": 0.2432, + "step": 6376 + }, + { + "epoch": 0.5440197918443952, + "grad_norm": 1.3045804099715521, + "learning_rate": 4.5309713905485485e-05, + "loss": 0.2125, + "step": 6377 + }, + { + "epoch": 0.5441051015185122, + "grad_norm": 1.392369194461806, + "learning_rate": 4.529595973171645e-05, + "loss": 0.2358, + "step": 6378 + }, + { + "epoch": 0.5441904111926292, + "grad_norm": 1.7743185266400934, + "learning_rate": 4.528220591707589e-05, + "loss": 0.2395, + "step": 6379 + }, + { + "epoch": 0.5442757208667462, + "grad_norm": 1.2185934895499144, + "learning_rate": 4.526845246261386e-05, + "loss": 0.1921, + "step": 6380 + }, + { + "epoch": 0.5443610305408634, + "grad_norm": 1.1537592583116425, + "learning_rate": 4.5254699369380324e-05, + "loss": 0.209, + "step": 6381 + }, + { + "epoch": 0.5444463402149804, + "grad_norm": 1.749133235992973, + "learning_rate": 4.5240946638425275e-05, + "loss": 0.248, + "step": 6382 + }, + { + "epoch": 0.5445316498890974, + "grad_norm": 1.4408096691605667, + "learning_rate": 4.522719427079868e-05, + "loss": 0.2952, + "step": 6383 + }, + { + "epoch": 0.5446169595632144, + "grad_norm": 1.6229374340264644, + "learning_rate": 4.521344226755041e-05, + "loss": 0.2808, + "step": 6384 + }, + { + "epoch": 0.5447022692373316, + "grad_norm": 1.5649661629038296, + "learning_rate": 4.51996906297304e-05, + "loss": 0.2535, + "step": 6385 + }, + { + "epoch": 0.5447875789114486, + "grad_norm": 1.7335534426733825, + "learning_rate": 4.5185939358388514e-05, + "loss": 0.279, + "step": 6386 + }, + { + "epoch": 0.5448728885855656, + "grad_norm": 1.4145945185732691, + "learning_rate": 4.517218845457456e-05, + "loss": 0.2126, + "step": 6387 + }, + { + "epoch": 0.5449581982596826, + "grad_norm": 1.6005938322048285, + "learning_rate": 4.5158437919338355e-05, + "loss": 0.2262, + "step": 6388 + }, + { + "epoch": 0.5450435079337997, + "grad_norm": 1.4086301793588771, + "learning_rate": 4.5144687753729706e-05, + "loss": 0.2341, + "step": 6389 + }, + { + "epoch": 0.5451288176079168, + "grad_norm": 1.5484246862595366, + "learning_rate": 4.5130937958798334e-05, + "loss": 0.26, + "step": 6390 + }, + { + "epoch": 0.5452141272820338, + "grad_norm": 1.4076180479370723, + "learning_rate": 4.5117188535593985e-05, + "loss": 0.236, + "step": 6391 + }, + { + "epoch": 0.5452994369561508, + "grad_norm": 1.6651566181467026, + "learning_rate": 4.510343948516633e-05, + "loss": 0.2366, + "step": 6392 + }, + { + "epoch": 0.5453847466302679, + "grad_norm": 1.7032838799799523, + "learning_rate": 4.508969080856507e-05, + "loss": 0.2057, + "step": 6393 + }, + { + "epoch": 0.5454700563043849, + "grad_norm": 1.465766555034195, + "learning_rate": 4.5075942506839804e-05, + "loss": 0.3168, + "step": 6394 + }, + { + "epoch": 0.545555365978502, + "grad_norm": 1.362688345533869, + "learning_rate": 4.506219458104016e-05, + "loss": 0.208, + "step": 6395 + }, + { + "epoch": 0.545640675652619, + "grad_norm": 1.7172260811162203, + "learning_rate": 4.504844703221575e-05, + "loss": 0.2247, + "step": 6396 + }, + { + "epoch": 0.5457259853267361, + "grad_norm": 1.4513080681906119, + "learning_rate": 4.503469986141606e-05, + "loss": 0.2007, + "step": 6397 + }, + { + "epoch": 0.5458112950008531, + "grad_norm": 1.509710194901363, + "learning_rate": 4.502095306969066e-05, + "loss": 0.265, + "step": 6398 + }, + { + "epoch": 0.5458966046749701, + "grad_norm": 1.8048313628484887, + "learning_rate": 4.500720665808905e-05, + "loss": 0.2342, + "step": 6399 + }, + { + "epoch": 0.5459819143490872, + "grad_norm": 1.7591765811893165, + "learning_rate": 4.499346062766067e-05, + "loss": 0.264, + "step": 6400 + }, + { + "epoch": 0.5460672240232042, + "grad_norm": 1.3515285421526606, + "learning_rate": 4.497971497945496e-05, + "loss": 0.1841, + "step": 6401 + }, + { + "epoch": 0.5461525336973213, + "grad_norm": 1.4827661730138804, + "learning_rate": 4.496596971452135e-05, + "loss": 0.1859, + "step": 6402 + }, + { + "epoch": 0.5462378433714383, + "grad_norm": 1.4708754399281863, + "learning_rate": 4.4952224833909194e-05, + "loss": 0.2755, + "step": 6403 + }, + { + "epoch": 0.5463231530455553, + "grad_norm": 1.4027721207675659, + "learning_rate": 4.493848033866784e-05, + "loss": 0.2293, + "step": 6404 + }, + { + "epoch": 0.5464084627196724, + "grad_norm": 1.6834781899145497, + "learning_rate": 4.492473622984663e-05, + "loss": 0.279, + "step": 6405 + }, + { + "epoch": 0.5464937723937895, + "grad_norm": 1.7655287859872975, + "learning_rate": 4.4910992508494826e-05, + "loss": 0.2061, + "step": 6406 + }, + { + "epoch": 0.5465790820679065, + "grad_norm": 1.705470298944627, + "learning_rate": 4.489724917566169e-05, + "loss": 0.2462, + "step": 6407 + }, + { + "epoch": 0.5466643917420235, + "grad_norm": 1.5215447821518853, + "learning_rate": 4.488350623239648e-05, + "loss": 0.2016, + "step": 6408 + }, + { + "epoch": 0.5467497014161405, + "grad_norm": 1.9055124325533104, + "learning_rate": 4.4869763679748386e-05, + "loss": 0.2043, + "step": 6409 + }, + { + "epoch": 0.5468350110902577, + "grad_norm": 1.8687956839498558, + "learning_rate": 4.485602151876656e-05, + "loss": 0.2408, + "step": 6410 + }, + { + "epoch": 0.5469203207643747, + "grad_norm": 1.4749427267043644, + "learning_rate": 4.484227975050015e-05, + "loss": 0.2663, + "step": 6411 + }, + { + "epoch": 0.5470056304384917, + "grad_norm": 1.9807952671889613, + "learning_rate": 4.48285383759983e-05, + "loss": 0.2135, + "step": 6412 + }, + { + "epoch": 0.5470909401126087, + "grad_norm": 1.7787000909426207, + "learning_rate": 4.4814797396310055e-05, + "loss": 0.2516, + "step": 6413 + }, + { + "epoch": 0.5471762497867259, + "grad_norm": 1.6882666506438335, + "learning_rate": 4.480105681248446e-05, + "loss": 0.2693, + "step": 6414 + }, + { + "epoch": 0.5472615594608429, + "grad_norm": 1.709645798344891, + "learning_rate": 4.478731662557057e-05, + "loss": 0.2933, + "step": 6415 + }, + { + "epoch": 0.5473468691349599, + "grad_norm": 1.3683289041265496, + "learning_rate": 4.477357683661734e-05, + "loss": 0.2672, + "step": 6416 + }, + { + "epoch": 0.5474321788090769, + "grad_norm": 1.55006526686555, + "learning_rate": 4.475983744667374e-05, + "loss": 0.2006, + "step": 6417 + }, + { + "epoch": 0.547517488483194, + "grad_norm": 1.6823220107547405, + "learning_rate": 4.4746098456788724e-05, + "loss": 0.2672, + "step": 6418 + }, + { + "epoch": 0.547602798157311, + "grad_norm": 1.3493194489487208, + "learning_rate": 4.4732359868011155e-05, + "loss": 0.2138, + "step": 6419 + }, + { + "epoch": 0.5476881078314281, + "grad_norm": 1.3104862932509382, + "learning_rate": 4.4718621681389915e-05, + "loss": 0.2144, + "step": 6420 + }, + { + "epoch": 0.5477734175055451, + "grad_norm": 1.8474736516619676, + "learning_rate": 4.470488389797385e-05, + "loss": 0.1759, + "step": 6421 + }, + { + "epoch": 0.5478587271796622, + "grad_norm": 1.421427645891796, + "learning_rate": 4.4691146518811775e-05, + "loss": 0.246, + "step": 6422 + }, + { + "epoch": 0.5479440368537792, + "grad_norm": 1.6404305479072634, + "learning_rate": 4.467740954495244e-05, + "loss": 0.1931, + "step": 6423 + }, + { + "epoch": 0.5480293465278963, + "grad_norm": 1.184127586510917, + "learning_rate": 4.46636729774446e-05, + "loss": 0.1767, + "step": 6424 + }, + { + "epoch": 0.5481146562020133, + "grad_norm": 1.3842893139087835, + "learning_rate": 4.464993681733699e-05, + "loss": 0.2617, + "step": 6425 + }, + { + "epoch": 0.5481999658761304, + "grad_norm": 1.7072555429556047, + "learning_rate": 4.463620106567825e-05, + "loss": 0.2277, + "step": 6426 + }, + { + "epoch": 0.5482852755502474, + "grad_norm": 1.5382781377928458, + "learning_rate": 4.462246572351706e-05, + "loss": 0.2768, + "step": 6427 + }, + { + "epoch": 0.5483705852243644, + "grad_norm": 1.5917463984999154, + "learning_rate": 4.460873079190205e-05, + "loss": 0.2724, + "step": 6428 + }, + { + "epoch": 0.5484558948984815, + "grad_norm": 1.498931186086459, + "learning_rate": 4.459499627188178e-05, + "loss": 0.2128, + "step": 6429 + }, + { + "epoch": 0.5485412045725986, + "grad_norm": 1.244322806872699, + "learning_rate": 4.458126216450482e-05, + "loss": 0.2222, + "step": 6430 + }, + { + "epoch": 0.5486265142467156, + "grad_norm": 1.8713850778103618, + "learning_rate": 4.456752847081971e-05, + "loss": 0.2379, + "step": 6431 + }, + { + "epoch": 0.5487118239208326, + "grad_norm": 1.6481302707256567, + "learning_rate": 4.4553795191874924e-05, + "loss": 0.2547, + "step": 6432 + }, + { + "epoch": 0.5487971335949496, + "grad_norm": 1.6118771259342652, + "learning_rate": 4.4540062328718945e-05, + "loss": 0.2149, + "step": 6433 + }, + { + "epoch": 0.5488824432690668, + "grad_norm": 1.6529750474729978, + "learning_rate": 4.452632988240019e-05, + "loss": 0.2446, + "step": 6434 + }, + { + "epoch": 0.5489677529431838, + "grad_norm": 1.5983601630660296, + "learning_rate": 4.451259785396707e-05, + "loss": 0.2347, + "step": 6435 + }, + { + "epoch": 0.5490530626173008, + "grad_norm": 1.4383476080394246, + "learning_rate": 4.449886624446792e-05, + "loss": 0.2342, + "step": 6436 + }, + { + "epoch": 0.5491383722914178, + "grad_norm": 1.7961445526199304, + "learning_rate": 4.44851350549511e-05, + "loss": 0.2861, + "step": 6437 + }, + { + "epoch": 0.5492236819655348, + "grad_norm": 1.831408038822339, + "learning_rate": 4.447140428646494e-05, + "loss": 0.1988, + "step": 6438 + }, + { + "epoch": 0.549308991639652, + "grad_norm": 1.9592958546887607, + "learning_rate": 4.445767394005766e-05, + "loss": 0.2891, + "step": 6439 + }, + { + "epoch": 0.549394301313769, + "grad_norm": 1.6071292059182363, + "learning_rate": 4.4443944016777524e-05, + "loss": 0.1929, + "step": 6440 + }, + { + "epoch": 0.549479610987886, + "grad_norm": 1.832221893607139, + "learning_rate": 4.443021451767275e-05, + "loss": 0.2362, + "step": 6441 + }, + { + "epoch": 0.549564920662003, + "grad_norm": 1.516056038824771, + "learning_rate": 4.441648544379149e-05, + "loss": 0.1741, + "step": 6442 + }, + { + "epoch": 0.5496502303361201, + "grad_norm": 1.5319326008146443, + "learning_rate": 4.4402756796181894e-05, + "loss": 0.2515, + "step": 6443 + }, + { + "epoch": 0.5497355400102372, + "grad_norm": 1.592421108549411, + "learning_rate": 4.438902857589209e-05, + "loss": 0.238, + "step": 6444 + }, + { + "epoch": 0.5498208496843542, + "grad_norm": 1.5311494132499939, + "learning_rate": 4.437530078397013e-05, + "loss": 0.2426, + "step": 6445 + }, + { + "epoch": 0.5499061593584712, + "grad_norm": 1.6467144534993083, + "learning_rate": 4.436157342146405e-05, + "loss": 0.2329, + "step": 6446 + }, + { + "epoch": 0.5499914690325883, + "grad_norm": 1.458740213944822, + "learning_rate": 4.434784648942191e-05, + "loss": 0.2109, + "step": 6447 + }, + { + "epoch": 0.5500767787067053, + "grad_norm": 1.767570301691941, + "learning_rate": 4.433411998889162e-05, + "loss": 0.2951, + "step": 6448 + }, + { + "epoch": 0.5501620883808224, + "grad_norm": 1.930806812186006, + "learning_rate": 4.432039392092117e-05, + "loss": 0.244, + "step": 6449 + }, + { + "epoch": 0.5502473980549394, + "grad_norm": 1.378122166221975, + "learning_rate": 4.4306668286558476e-05, + "loss": 0.2652, + "step": 6450 + }, + { + "epoch": 0.5503327077290565, + "grad_norm": 1.6440294093543526, + "learning_rate": 4.429294308685139e-05, + "loss": 0.163, + "step": 6451 + }, + { + "epoch": 0.5504180174031735, + "grad_norm": 1.8561077744170635, + "learning_rate": 4.4279218322847764e-05, + "loss": 0.2909, + "step": 6452 + }, + { + "epoch": 0.5505033270772905, + "grad_norm": 1.9906828939262355, + "learning_rate": 4.4265493995595424e-05, + "loss": 0.2275, + "step": 6453 + }, + { + "epoch": 0.5505886367514076, + "grad_norm": 1.6591203397180867, + "learning_rate": 4.4251770106142166e-05, + "loss": 0.2396, + "step": 6454 + }, + { + "epoch": 0.5506739464255247, + "grad_norm": 1.2664764742528047, + "learning_rate": 4.42380466555357e-05, + "loss": 0.2538, + "step": 6455 + }, + { + "epoch": 0.5507592560996417, + "grad_norm": 1.4851841377637944, + "learning_rate": 4.422432364482375e-05, + "loss": 0.195, + "step": 6456 + }, + { + "epoch": 0.5508445657737587, + "grad_norm": 1.3442972532639708, + "learning_rate": 4.421060107505401e-05, + "loss": 0.2162, + "step": 6457 + }, + { + "epoch": 0.5509298754478757, + "grad_norm": 1.598705068736823, + "learning_rate": 4.41968789472741e-05, + "loss": 0.2703, + "step": 6458 + }, + { + "epoch": 0.5510151851219929, + "grad_norm": 1.6287973491025087, + "learning_rate": 4.418315726253164e-05, + "loss": 0.2591, + "step": 6459 + }, + { + "epoch": 0.5511004947961099, + "grad_norm": 1.7217854831363182, + "learning_rate": 4.4169436021874236e-05, + "loss": 0.2589, + "step": 6460 + }, + { + "epoch": 0.5511858044702269, + "grad_norm": 1.4930641834492742, + "learning_rate": 4.415571522634938e-05, + "loss": 0.1741, + "step": 6461 + }, + { + "epoch": 0.5512711141443439, + "grad_norm": 1.7241228223007123, + "learning_rate": 4.4141994877004614e-05, + "loss": 0.2301, + "step": 6462 + }, + { + "epoch": 0.5513564238184611, + "grad_norm": 1.824035504631355, + "learning_rate": 4.412827497488744e-05, + "loss": 0.2644, + "step": 6463 + }, + { + "epoch": 0.5514417334925781, + "grad_norm": 1.4820509535747073, + "learning_rate": 4.411455552104524e-05, + "loss": 0.1826, + "step": 6464 + }, + { + "epoch": 0.5515270431666951, + "grad_norm": 1.7170641173586756, + "learning_rate": 4.4100836516525456e-05, + "loss": 0.2891, + "step": 6465 + }, + { + "epoch": 0.5516123528408121, + "grad_norm": 1.6732859851906587, + "learning_rate": 4.408711796237545e-05, + "loss": 0.2246, + "step": 6466 + }, + { + "epoch": 0.5516976625149292, + "grad_norm": 1.5828604326587778, + "learning_rate": 4.407339985964259e-05, + "loss": 0.2292, + "step": 6467 + }, + { + "epoch": 0.5517829721890463, + "grad_norm": 1.5350256323149831, + "learning_rate": 4.4059682209374136e-05, + "loss": 0.1987, + "step": 6468 + }, + { + "epoch": 0.5518682818631633, + "grad_norm": 1.5990046474778112, + "learning_rate": 4.404596501261737e-05, + "loss": 0.2047, + "step": 6469 + }, + { + "epoch": 0.5519535915372803, + "grad_norm": 1.5062644008090522, + "learning_rate": 4.403224827041957e-05, + "loss": 0.2387, + "step": 6470 + }, + { + "epoch": 0.5520389012113974, + "grad_norm": 1.5780083641536464, + "learning_rate": 4.401853198382788e-05, + "loss": 0.2687, + "step": 6471 + }, + { + "epoch": 0.5521242108855144, + "grad_norm": 1.8730867797292154, + "learning_rate": 4.400481615388948e-05, + "loss": 0.2964, + "step": 6472 + }, + { + "epoch": 0.5522095205596315, + "grad_norm": 1.7258313732544388, + "learning_rate": 4.399110078165153e-05, + "loss": 0.273, + "step": 6473 + }, + { + "epoch": 0.5522948302337485, + "grad_norm": 1.794211827282804, + "learning_rate": 4.397738586816108e-05, + "loss": 0.2476, + "step": 6474 + }, + { + "epoch": 0.5523801399078655, + "grad_norm": 1.4167683882751, + "learning_rate": 4.3963671414465216e-05, + "loss": 0.2432, + "step": 6475 + }, + { + "epoch": 0.5524654495819826, + "grad_norm": 1.6309317384301223, + "learning_rate": 4.3949957421610995e-05, + "loss": 0.229, + "step": 6476 + }, + { + "epoch": 0.5525507592560996, + "grad_norm": 1.5164910427876666, + "learning_rate": 4.393624389064535e-05, + "loss": 0.2118, + "step": 6477 + }, + { + "epoch": 0.5526360689302167, + "grad_norm": 1.754709174683864, + "learning_rate": 4.392253082261526e-05, + "loss": 0.2746, + "step": 6478 + }, + { + "epoch": 0.5527213786043337, + "grad_norm": 1.4863040636398346, + "learning_rate": 4.390881821856767e-05, + "loss": 0.2486, + "step": 6479 + }, + { + "epoch": 0.5528066882784508, + "grad_norm": 1.5870077346151763, + "learning_rate": 4.3895106079549407e-05, + "loss": 0.2427, + "step": 6480 + }, + { + "epoch": 0.5528919979525678, + "grad_norm": 1.5663520632382562, + "learning_rate": 4.388139440660736e-05, + "loss": 0.2406, + "step": 6481 + }, + { + "epoch": 0.5529773076266848, + "grad_norm": 1.566096401248512, + "learning_rate": 4.3867683200788334e-05, + "loss": 0.2265, + "step": 6482 + }, + { + "epoch": 0.5530626173008019, + "grad_norm": 1.7113344611647283, + "learning_rate": 4.385397246313913e-05, + "loss": 0.2191, + "step": 6483 + }, + { + "epoch": 0.553147926974919, + "grad_norm": 1.6166200978496723, + "learning_rate": 4.384026219470645e-05, + "loss": 0.1826, + "step": 6484 + }, + { + "epoch": 0.553233236649036, + "grad_norm": 1.9530638418159147, + "learning_rate": 4.382655239653702e-05, + "loss": 0.2343, + "step": 6485 + }, + { + "epoch": 0.553318546323153, + "grad_norm": 1.4000462916077863, + "learning_rate": 4.3812843069677526e-05, + "loss": 0.2225, + "step": 6486 + }, + { + "epoch": 0.55340385599727, + "grad_norm": 1.6036269117946529, + "learning_rate": 4.379913421517458e-05, + "loss": 0.2678, + "step": 6487 + }, + { + "epoch": 0.5534891656713872, + "grad_norm": 1.6772490077985966, + "learning_rate": 4.3785425834074764e-05, + "loss": 0.2424, + "step": 6488 + }, + { + "epoch": 0.5535744753455042, + "grad_norm": 1.6677999299354598, + "learning_rate": 4.377171792742469e-05, + "loss": 0.2615, + "step": 6489 + }, + { + "epoch": 0.5536597850196212, + "grad_norm": 1.3811355163632337, + "learning_rate": 4.375801049627083e-05, + "loss": 0.2529, + "step": 6490 + }, + { + "epoch": 0.5537450946937382, + "grad_norm": 2.0560388798729745, + "learning_rate": 4.37443035416597e-05, + "loss": 0.273, + "step": 6491 + }, + { + "epoch": 0.5538304043678554, + "grad_norm": 1.454810037174165, + "learning_rate": 4.373059706463778e-05, + "loss": 0.2094, + "step": 6492 + }, + { + "epoch": 0.5539157140419724, + "grad_norm": 1.5253947669448007, + "learning_rate": 4.371689106625143e-05, + "loss": 0.2046, + "step": 6493 + }, + { + "epoch": 0.5540010237160894, + "grad_norm": 1.5771003644956725, + "learning_rate": 4.370318554754706e-05, + "loss": 0.243, + "step": 6494 + }, + { + "epoch": 0.5540863333902064, + "grad_norm": 1.942237967512923, + "learning_rate": 4.368948050957104e-05, + "loss": 0.2966, + "step": 6495 + }, + { + "epoch": 0.5541716430643235, + "grad_norm": 1.4591314402535926, + "learning_rate": 4.367577595336961e-05, + "loss": 0.2062, + "step": 6496 + }, + { + "epoch": 0.5542569527384406, + "grad_norm": 1.8242062684638911, + "learning_rate": 4.3662071879989106e-05, + "loss": 0.2886, + "step": 6497 + }, + { + "epoch": 0.5543422624125576, + "grad_norm": 1.615214152826481, + "learning_rate": 4.364836829047572e-05, + "loss": 0.2348, + "step": 6498 + }, + { + "epoch": 0.5544275720866746, + "grad_norm": 1.7034862730090672, + "learning_rate": 4.363466518587568e-05, + "loss": 0.2181, + "step": 6499 + }, + { + "epoch": 0.5545128817607917, + "grad_norm": 1.4745980571792492, + "learning_rate": 4.362096256723511e-05, + "loss": 0.3018, + "step": 6500 + }, + { + "epoch": 0.5545981914349087, + "grad_norm": 1.8497291211979827, + "learning_rate": 4.360726043560015e-05, + "loss": 0.245, + "step": 6501 + }, + { + "epoch": 0.5546835011090258, + "grad_norm": 1.5128600078294736, + "learning_rate": 4.359355879201691e-05, + "loss": 0.1686, + "step": 6502 + }, + { + "epoch": 0.5547688107831428, + "grad_norm": 1.4899489303131856, + "learning_rate": 4.3579857637531384e-05, + "loss": 0.237, + "step": 6503 + }, + { + "epoch": 0.5548541204572599, + "grad_norm": 1.4996754772088288, + "learning_rate": 4.356615697318962e-05, + "loss": 0.2533, + "step": 6504 + }, + { + "epoch": 0.5549394301313769, + "grad_norm": 1.3643888330042313, + "learning_rate": 4.355245680003759e-05, + "loss": 0.1919, + "step": 6505 + }, + { + "epoch": 0.5550247398054939, + "grad_norm": 1.9141063451610878, + "learning_rate": 4.3538757119121204e-05, + "loss": 0.2312, + "step": 6506 + }, + { + "epoch": 0.555110049479611, + "grad_norm": 1.6664194241560526, + "learning_rate": 4.352505793148639e-05, + "loss": 0.2505, + "step": 6507 + }, + { + "epoch": 0.5551953591537281, + "grad_norm": 1.5022384313048098, + "learning_rate": 4.3511359238178996e-05, + "loss": 0.2721, + "step": 6508 + }, + { + "epoch": 0.5552806688278451, + "grad_norm": 1.894022639323581, + "learning_rate": 4.349766104024484e-05, + "loss": 0.3145, + "step": 6509 + }, + { + "epoch": 0.5553659785019621, + "grad_norm": 1.4146600798891866, + "learning_rate": 4.348396333872971e-05, + "loss": 0.2204, + "step": 6510 + }, + { + "epoch": 0.5554512881760791, + "grad_norm": 1.367456638548344, + "learning_rate": 4.347026613467934e-05, + "loss": 0.2196, + "step": 6511 + }, + { + "epoch": 0.5555365978501963, + "grad_norm": 1.5560212668115354, + "learning_rate": 4.345656942913947e-05, + "loss": 0.276, + "step": 6512 + }, + { + "epoch": 0.5556219075243133, + "grad_norm": 1.4547705408440084, + "learning_rate": 4.3442873223155746e-05, + "loss": 0.181, + "step": 6513 + }, + { + "epoch": 0.5557072171984303, + "grad_norm": 1.5840459357739112, + "learning_rate": 4.34291775177738e-05, + "loss": 0.216, + "step": 6514 + }, + { + "epoch": 0.5557925268725473, + "grad_norm": 1.9306672732491006, + "learning_rate": 4.341548231403925e-05, + "loss": 0.3353, + "step": 6515 + }, + { + "epoch": 0.5558778365466643, + "grad_norm": 1.4981915380464783, + "learning_rate": 4.340178761299762e-05, + "loss": 0.2667, + "step": 6516 + }, + { + "epoch": 0.5559631462207815, + "grad_norm": 1.4963628063844305, + "learning_rate": 4.338809341569444e-05, + "loss": 0.2572, + "step": 6517 + }, + { + "epoch": 0.5560484558948985, + "grad_norm": 1.5240165979177391, + "learning_rate": 4.3374399723175216e-05, + "loss": 0.2273, + "step": 6518 + }, + { + "epoch": 0.5561337655690155, + "grad_norm": 1.328586273872187, + "learning_rate": 4.336070653648535e-05, + "loss": 0.2418, + "step": 6519 + }, + { + "epoch": 0.5562190752431325, + "grad_norm": 2.0690797986608245, + "learning_rate": 4.334701385667026e-05, + "loss": 0.2756, + "step": 6520 + }, + { + "epoch": 0.5563043849172497, + "grad_norm": 1.8275223741322846, + "learning_rate": 4.3333321684775314e-05, + "loss": 0.2481, + "step": 6521 + }, + { + "epoch": 0.5563896945913667, + "grad_norm": 1.5992186724761064, + "learning_rate": 4.331963002184581e-05, + "loss": 0.2484, + "step": 6522 + }, + { + "epoch": 0.5564750042654837, + "grad_norm": 1.3997764783624673, + "learning_rate": 4.330593886892707e-05, + "loss": 0.1928, + "step": 6523 + }, + { + "epoch": 0.5565603139396007, + "grad_norm": 1.4967801981871556, + "learning_rate": 4.329224822706433e-05, + "loss": 0.2174, + "step": 6524 + }, + { + "epoch": 0.5566456236137178, + "grad_norm": 1.5475646514205919, + "learning_rate": 4.327855809730278e-05, + "loss": 0.2425, + "step": 6525 + }, + { + "epoch": 0.5567309332878349, + "grad_norm": 1.613089266015076, + "learning_rate": 4.32648684806876e-05, + "loss": 0.2417, + "step": 6526 + }, + { + "epoch": 0.5568162429619519, + "grad_norm": 1.9223822567802356, + "learning_rate": 4.325117937826392e-05, + "loss": 0.2586, + "step": 6527 + }, + { + "epoch": 0.5569015526360689, + "grad_norm": 1.924041767166648, + "learning_rate": 4.323749079107685e-05, + "loss": 0.2406, + "step": 6528 + }, + { + "epoch": 0.556986862310186, + "grad_norm": 1.405388377395284, + "learning_rate": 4.3223802720171417e-05, + "loss": 0.2182, + "step": 6529 + }, + { + "epoch": 0.557072171984303, + "grad_norm": 1.4223348691324245, + "learning_rate": 4.321011516659263e-05, + "loss": 0.2547, + "step": 6530 + }, + { + "epoch": 0.55715748165842, + "grad_norm": 1.9687902052876338, + "learning_rate": 4.319642813138548e-05, + "loss": 0.275, + "step": 6531 + }, + { + "epoch": 0.5572427913325371, + "grad_norm": 1.2855532882191996, + "learning_rate": 4.318274161559487e-05, + "loss": 0.2383, + "step": 6532 + }, + { + "epoch": 0.5573281010066542, + "grad_norm": 1.4385478305707613, + "learning_rate": 4.316905562026571e-05, + "loss": 0.1975, + "step": 6533 + }, + { + "epoch": 0.5574134106807712, + "grad_norm": 1.657946239833076, + "learning_rate": 4.315537014644288e-05, + "loss": 0.2165, + "step": 6534 + }, + { + "epoch": 0.5574987203548882, + "grad_norm": 1.6466316347346888, + "learning_rate": 4.3141685195171136e-05, + "loss": 0.1979, + "step": 6535 + }, + { + "epoch": 0.5575840300290053, + "grad_norm": 1.729095396235334, + "learning_rate": 4.312800076749529e-05, + "loss": 0.2523, + "step": 6536 + }, + { + "epoch": 0.5576693397031224, + "grad_norm": 1.7152120068802406, + "learning_rate": 4.311431686446009e-05, + "loss": 0.2761, + "step": 6537 + }, + { + "epoch": 0.5577546493772394, + "grad_norm": 1.5649599928205924, + "learning_rate": 4.310063348711018e-05, + "loss": 0.2385, + "step": 6538 + }, + { + "epoch": 0.5578399590513564, + "grad_norm": 1.8193536287636225, + "learning_rate": 4.3086950636490256e-05, + "loss": 0.2126, + "step": 6539 + }, + { + "epoch": 0.5579252687254734, + "grad_norm": 1.793043479941151, + "learning_rate": 4.3073268313644915e-05, + "loss": 0.2948, + "step": 6540 + }, + { + "epoch": 0.5580105783995906, + "grad_norm": 1.336434835027676, + "learning_rate": 4.305958651961873e-05, + "loss": 0.2522, + "step": 6541 + }, + { + "epoch": 0.5580958880737076, + "grad_norm": 1.523388671092473, + "learning_rate": 4.304590525545622e-05, + "loss": 0.2081, + "step": 6542 + }, + { + "epoch": 0.5581811977478246, + "grad_norm": 1.883036873138863, + "learning_rate": 4.303222452220189e-05, + "loss": 0.288, + "step": 6543 + }, + { + "epoch": 0.5582665074219416, + "grad_norm": 1.3017664205805974, + "learning_rate": 4.301854432090021e-05, + "loss": 0.2235, + "step": 6544 + }, + { + "epoch": 0.5583518170960587, + "grad_norm": 1.6743095711529339, + "learning_rate": 4.300486465259555e-05, + "loss": 0.3225, + "step": 6545 + }, + { + "epoch": 0.5584371267701758, + "grad_norm": 1.7524651467072274, + "learning_rate": 4.299118551833231e-05, + "loss": 0.2282, + "step": 6546 + }, + { + "epoch": 0.5585224364442928, + "grad_norm": 1.6314690278433899, + "learning_rate": 4.297750691915482e-05, + "loss": 0.204, + "step": 6547 + }, + { + "epoch": 0.5586077461184098, + "grad_norm": 2.240907948609908, + "learning_rate": 4.296382885610735e-05, + "loss": 0.2059, + "step": 6548 + }, + { + "epoch": 0.5586930557925269, + "grad_norm": 1.3983578792474296, + "learning_rate": 4.2950151330234145e-05, + "loss": 0.2147, + "step": 6549 + }, + { + "epoch": 0.558778365466644, + "grad_norm": 1.285259532546595, + "learning_rate": 4.2936474342579453e-05, + "loss": 0.2209, + "step": 6550 + }, + { + "epoch": 0.558863675140761, + "grad_norm": 1.3162093199179248, + "learning_rate": 4.2922797894187394e-05, + "loss": 0.2654, + "step": 6551 + }, + { + "epoch": 0.558948984814878, + "grad_norm": 1.5144614246250203, + "learning_rate": 4.29091219861021e-05, + "loss": 0.1777, + "step": 6552 + }, + { + "epoch": 0.559034294488995, + "grad_norm": 1.7758117109271512, + "learning_rate": 4.2895446619367684e-05, + "loss": 0.2575, + "step": 6553 + }, + { + "epoch": 0.5591196041631121, + "grad_norm": 2.025618155049144, + "learning_rate": 4.288177179502814e-05, + "loss": 0.202, + "step": 6554 + }, + { + "epoch": 0.5592049138372291, + "grad_norm": 1.5904337470110448, + "learning_rate": 4.286809751412749e-05, + "loss": 0.2253, + "step": 6555 + }, + { + "epoch": 0.5592902235113462, + "grad_norm": 1.7688734166521904, + "learning_rate": 4.285442377770971e-05, + "loss": 0.2689, + "step": 6556 + }, + { + "epoch": 0.5593755331854632, + "grad_norm": 1.63915935626563, + "learning_rate": 4.2840750586818715e-05, + "loss": 0.1693, + "step": 6557 + }, + { + "epoch": 0.5594608428595803, + "grad_norm": 1.3075719687383085, + "learning_rate": 4.2827077942498343e-05, + "loss": 0.2462, + "step": 6558 + }, + { + "epoch": 0.5595461525336973, + "grad_norm": 1.6719558553663136, + "learning_rate": 4.281340584579246e-05, + "loss": 0.2445, + "step": 6559 + }, + { + "epoch": 0.5596314622078143, + "grad_norm": 1.4364799529235928, + "learning_rate": 4.2799734297744864e-05, + "loss": 0.2783, + "step": 6560 + }, + { + "epoch": 0.5597167718819314, + "grad_norm": 1.8270007858274677, + "learning_rate": 4.278606329939929e-05, + "loss": 0.2346, + "step": 6561 + }, + { + "epoch": 0.5598020815560485, + "grad_norm": 2.196804313553467, + "learning_rate": 4.2772392851799434e-05, + "loss": 0.277, + "step": 6562 + }, + { + "epoch": 0.5598873912301655, + "grad_norm": 1.7058755903999814, + "learning_rate": 4.2758722955989e-05, + "loss": 0.3142, + "step": 6563 + }, + { + "epoch": 0.5599727009042825, + "grad_norm": 1.721214677369279, + "learning_rate": 4.2745053613011564e-05, + "loss": 0.2184, + "step": 6564 + }, + { + "epoch": 0.5600580105783995, + "grad_norm": 1.9020727696983697, + "learning_rate": 4.2731384823910735e-05, + "loss": 0.2429, + "step": 6565 + }, + { + "epoch": 0.5601433202525167, + "grad_norm": 1.710021030452861, + "learning_rate": 4.271771658973007e-05, + "loss": 0.2691, + "step": 6566 + }, + { + "epoch": 0.5602286299266337, + "grad_norm": 1.8730767877157208, + "learning_rate": 4.270404891151302e-05, + "loss": 0.2575, + "step": 6567 + }, + { + "epoch": 0.5603139396007507, + "grad_norm": 1.5748684207510268, + "learning_rate": 4.2690381790303066e-05, + "loss": 0.2652, + "step": 6568 + }, + { + "epoch": 0.5603992492748677, + "grad_norm": 1.958786594070947, + "learning_rate": 4.267671522714365e-05, + "loss": 0.2633, + "step": 6569 + }, + { + "epoch": 0.5604845589489849, + "grad_norm": 1.631426501316936, + "learning_rate": 4.2663049223078075e-05, + "loss": 0.2155, + "step": 6570 + }, + { + "epoch": 0.5605698686231019, + "grad_norm": 1.4280764147885154, + "learning_rate": 4.264938377914973e-05, + "loss": 0.2772, + "step": 6571 + }, + { + "epoch": 0.5606551782972189, + "grad_norm": 1.466964773024284, + "learning_rate": 4.263571889640184e-05, + "loss": 0.2238, + "step": 6572 + }, + { + "epoch": 0.5607404879713359, + "grad_norm": 1.309594708129407, + "learning_rate": 4.262205457587772e-05, + "loss": 0.2033, + "step": 6573 + }, + { + "epoch": 0.560825797645453, + "grad_norm": 1.3939448387227504, + "learning_rate": 4.26083908186205e-05, + "loss": 0.2555, + "step": 6574 + }, + { + "epoch": 0.5609111073195701, + "grad_norm": 1.4768640926699292, + "learning_rate": 4.2594727625673356e-05, + "loss": 0.1966, + "step": 6575 + }, + { + "epoch": 0.5609964169936871, + "grad_norm": 1.7647437538028867, + "learning_rate": 4.258106499807943e-05, + "loss": 0.2906, + "step": 6576 + }, + { + "epoch": 0.5610817266678041, + "grad_norm": 1.6598995952804958, + "learning_rate": 4.256740293688175e-05, + "loss": 0.1967, + "step": 6577 + }, + { + "epoch": 0.5611670363419212, + "grad_norm": 1.7780819879300818, + "learning_rate": 4.255374144312335e-05, + "loss": 0.2098, + "step": 6578 + }, + { + "epoch": 0.5612523460160382, + "grad_norm": 1.8010049213509514, + "learning_rate": 4.2540080517847255e-05, + "loss": 0.2471, + "step": 6579 + }, + { + "epoch": 0.5613376556901553, + "grad_norm": 1.8643070492204854, + "learning_rate": 4.2526420162096344e-05, + "loss": 0.2519, + "step": 6580 + }, + { + "epoch": 0.5614229653642723, + "grad_norm": 1.591191425908551, + "learning_rate": 4.251276037691355e-05, + "loss": 0.2422, + "step": 6581 + }, + { + "epoch": 0.5615082750383894, + "grad_norm": 1.620600834558567, + "learning_rate": 4.249910116334171e-05, + "loss": 0.1633, + "step": 6582 + }, + { + "epoch": 0.5615935847125064, + "grad_norm": 2.136684605745816, + "learning_rate": 4.2485442522423636e-05, + "loss": 0.2485, + "step": 6583 + }, + { + "epoch": 0.5616788943866234, + "grad_norm": 1.9860327096802, + "learning_rate": 4.247178445520209e-05, + "loss": 0.2624, + "step": 6584 + }, + { + "epoch": 0.5617642040607405, + "grad_norm": 1.911135868933587, + "learning_rate": 4.245812696271981e-05, + "loss": 0.2113, + "step": 6585 + }, + { + "epoch": 0.5618495137348576, + "grad_norm": 1.4476807033822452, + "learning_rate": 4.2444470046019444e-05, + "loss": 0.2345, + "step": 6586 + }, + { + "epoch": 0.5619348234089746, + "grad_norm": 1.9050424221651925, + "learning_rate": 4.2430813706143636e-05, + "loss": 0.2596, + "step": 6587 + }, + { + "epoch": 0.5620201330830916, + "grad_norm": 1.678368835919744, + "learning_rate": 4.2417157944134975e-05, + "loss": 0.2141, + "step": 6588 + }, + { + "epoch": 0.5621054427572086, + "grad_norm": 1.5524024464645707, + "learning_rate": 4.240350276103604e-05, + "loss": 0.1908, + "step": 6589 + }, + { + "epoch": 0.5621907524313257, + "grad_norm": 1.4243598437036438, + "learning_rate": 4.2389848157889276e-05, + "loss": 0.1956, + "step": 6590 + }, + { + "epoch": 0.5622760621054428, + "grad_norm": 1.314815930294903, + "learning_rate": 4.2376194135737165e-05, + "loss": 0.2629, + "step": 6591 + }, + { + "epoch": 0.5623613717795598, + "grad_norm": 1.7280500399178853, + "learning_rate": 4.236254069562213e-05, + "loss": 0.1994, + "step": 6592 + }, + { + "epoch": 0.5624466814536768, + "grad_norm": 1.8638801172122395, + "learning_rate": 4.234888783858653e-05, + "loss": 0.2279, + "step": 6593 + }, + { + "epoch": 0.5625319911277938, + "grad_norm": 1.7340163126335484, + "learning_rate": 4.233523556567267e-05, + "loss": 0.182, + "step": 6594 + }, + { + "epoch": 0.562617300801911, + "grad_norm": 2.0344356498525857, + "learning_rate": 4.232158387792287e-05, + "loss": 0.298, + "step": 6595 + }, + { + "epoch": 0.562702610476028, + "grad_norm": 1.5330984675167785, + "learning_rate": 4.230793277637931e-05, + "loss": 0.2722, + "step": 6596 + }, + { + "epoch": 0.562787920150145, + "grad_norm": 1.4347775428416165, + "learning_rate": 4.2294282262084215e-05, + "loss": 0.1801, + "step": 6597 + }, + { + "epoch": 0.562873229824262, + "grad_norm": 2.0618060707487658, + "learning_rate": 4.228063233607974e-05, + "loss": 0.3041, + "step": 6598 + }, + { + "epoch": 0.5629585394983792, + "grad_norm": 1.6079286168229283, + "learning_rate": 4.226698299940794e-05, + "loss": 0.2045, + "step": 6599 + }, + { + "epoch": 0.5630438491724962, + "grad_norm": 1.8413961870697808, + "learning_rate": 4.225333425311089e-05, + "loss": 0.2433, + "step": 6600 + }, + { + "epoch": 0.5631291588466132, + "grad_norm": 2.2436739682887445, + "learning_rate": 4.223968609823061e-05, + "loss": 0.2562, + "step": 6601 + }, + { + "epoch": 0.5632144685207302, + "grad_norm": 1.801913155989632, + "learning_rate": 4.2226038535809084e-05, + "loss": 0.1995, + "step": 6602 + }, + { + "epoch": 0.5632997781948473, + "grad_norm": 1.659699285110179, + "learning_rate": 4.2212391566888196e-05, + "loss": 0.2187, + "step": 6603 + }, + { + "epoch": 0.5633850878689644, + "grad_norm": 2.1472135629461504, + "learning_rate": 4.219874519250981e-05, + "loss": 0.276, + "step": 6604 + }, + { + "epoch": 0.5634703975430814, + "grad_norm": 1.8449105958030454, + "learning_rate": 4.2185099413715795e-05, + "loss": 0.2685, + "step": 6605 + }, + { + "epoch": 0.5635557072171984, + "grad_norm": 1.4184978748422596, + "learning_rate": 4.217145423154789e-05, + "loss": 0.2064, + "step": 6606 + }, + { + "epoch": 0.5636410168913155, + "grad_norm": 1.591308968313664, + "learning_rate": 4.2157809647047855e-05, + "loss": 0.2301, + "step": 6607 + }, + { + "epoch": 0.5637263265654325, + "grad_norm": 1.608177406118355, + "learning_rate": 4.2144165661257405e-05, + "loss": 0.2514, + "step": 6608 + }, + { + "epoch": 0.5638116362395496, + "grad_norm": 1.5261417742324508, + "learning_rate": 4.2130522275218134e-05, + "loss": 0.2218, + "step": 6609 + }, + { + "epoch": 0.5638969459136666, + "grad_norm": 1.4789949595047573, + "learning_rate": 4.211687948997167e-05, + "loss": 0.2032, + "step": 6610 + }, + { + "epoch": 0.5639822555877837, + "grad_norm": 1.6309913079353515, + "learning_rate": 4.210323730655959e-05, + "loss": 0.2465, + "step": 6611 + }, + { + "epoch": 0.5640675652619007, + "grad_norm": 1.60266373153477, + "learning_rate": 4.208959572602336e-05, + "loss": 0.25, + "step": 6612 + }, + { + "epoch": 0.5641528749360177, + "grad_norm": 2.0399558496372507, + "learning_rate": 4.207595474940446e-05, + "loss": 0.2463, + "step": 6613 + }, + { + "epoch": 0.5642381846101348, + "grad_norm": 1.6960664118773578, + "learning_rate": 4.2062314377744315e-05, + "loss": 0.2195, + "step": 6614 + }, + { + "epoch": 0.5643234942842519, + "grad_norm": 1.4328794440851396, + "learning_rate": 4.204867461208428e-05, + "loss": 0.2522, + "step": 6615 + }, + { + "epoch": 0.5644088039583689, + "grad_norm": 1.7323438418252348, + "learning_rate": 4.2035035453465684e-05, + "loss": 0.1914, + "step": 6616 + }, + { + "epoch": 0.5644941136324859, + "grad_norm": 1.410159969589104, + "learning_rate": 4.2021396902929796e-05, + "loss": 0.2889, + "step": 6617 + }, + { + "epoch": 0.5645794233066029, + "grad_norm": 1.5155306127987247, + "learning_rate": 4.2007758961517886e-05, + "loss": 0.2466, + "step": 6618 + }, + { + "epoch": 0.5646647329807201, + "grad_norm": 1.6453469760974218, + "learning_rate": 4.1994121630271086e-05, + "loss": 0.2164, + "step": 6619 + }, + { + "epoch": 0.5647500426548371, + "grad_norm": 1.7273792389148885, + "learning_rate": 4.198048491023055e-05, + "loss": 0.2412, + "step": 6620 + }, + { + "epoch": 0.5648353523289541, + "grad_norm": 1.3577817151757767, + "learning_rate": 4.19668488024374e-05, + "loss": 0.1811, + "step": 6621 + }, + { + "epoch": 0.5649206620030711, + "grad_norm": 1.460599727648663, + "learning_rate": 4.195321330793264e-05, + "loss": 0.2262, + "step": 6622 + }, + { + "epoch": 0.5650059716771882, + "grad_norm": 2.217470794278974, + "learning_rate": 4.1939578427757284e-05, + "loss": 0.209, + "step": 6623 + }, + { + "epoch": 0.5650912813513053, + "grad_norm": 1.3388394075345558, + "learning_rate": 4.1925944162952285e-05, + "loss": 0.1812, + "step": 6624 + }, + { + "epoch": 0.5651765910254223, + "grad_norm": 1.7849358687647652, + "learning_rate": 4.1912310514558545e-05, + "loss": 0.2588, + "step": 6625 + }, + { + "epoch": 0.5652619006995393, + "grad_norm": 1.4854313354338593, + "learning_rate": 4.189867748361691e-05, + "loss": 0.2273, + "step": 6626 + }, + { + "epoch": 0.5653472103736564, + "grad_norm": 1.9509119542294875, + "learning_rate": 4.1885045071168216e-05, + "loss": 0.2011, + "step": 6627 + }, + { + "epoch": 0.5654325200477734, + "grad_norm": 2.063139036189201, + "learning_rate": 4.187141327825319e-05, + "loss": 0.2572, + "step": 6628 + }, + { + "epoch": 0.5655178297218905, + "grad_norm": 1.5805611267095945, + "learning_rate": 4.185778210591257e-05, + "loss": 0.1948, + "step": 6629 + }, + { + "epoch": 0.5656031393960075, + "grad_norm": 1.4876114747407245, + "learning_rate": 4.1844151555187035e-05, + "loss": 0.2906, + "step": 6630 + }, + { + "epoch": 0.5656884490701245, + "grad_norm": 1.441323792288082, + "learning_rate": 4.183052162711716e-05, + "loss": 0.2263, + "step": 6631 + }, + { + "epoch": 0.5657737587442416, + "grad_norm": 1.2925217884521936, + "learning_rate": 4.1816892322743555e-05, + "loss": 0.1392, + "step": 6632 + }, + { + "epoch": 0.5658590684183586, + "grad_norm": 1.2502438307413328, + "learning_rate": 4.1803263643106735e-05, + "loss": 0.2217, + "step": 6633 + }, + { + "epoch": 0.5659443780924757, + "grad_norm": 2.0751768151749146, + "learning_rate": 4.17896355892472e-05, + "loss": 0.4398, + "step": 6634 + }, + { + "epoch": 0.5660296877665927, + "grad_norm": 1.7830412040138561, + "learning_rate": 4.177600816220535e-05, + "loss": 0.2324, + "step": 6635 + }, + { + "epoch": 0.5661149974407098, + "grad_norm": 1.7778514095131408, + "learning_rate": 4.1762381363021557e-05, + "loss": 0.2433, + "step": 6636 + }, + { + "epoch": 0.5662003071148268, + "grad_norm": 1.4990765590139672, + "learning_rate": 4.1748755192736194e-05, + "loss": 0.1814, + "step": 6637 + }, + { + "epoch": 0.5662856167889438, + "grad_norm": 1.307696088671073, + "learning_rate": 4.173512965238951e-05, + "loss": 0.2035, + "step": 6638 + }, + { + "epoch": 0.5663709264630609, + "grad_norm": 1.557858248146981, + "learning_rate": 4.172150474302175e-05, + "loss": 0.2287, + "step": 6639 + }, + { + "epoch": 0.566456236137178, + "grad_norm": 2.457588751151746, + "learning_rate": 4.170788046567314e-05, + "loss": 0.3438, + "step": 6640 + }, + { + "epoch": 0.566541545811295, + "grad_norm": 1.878373607868276, + "learning_rate": 4.1694256821383764e-05, + "loss": 0.3037, + "step": 6641 + }, + { + "epoch": 0.566626855485412, + "grad_norm": 1.675778048852384, + "learning_rate": 4.168063381119375e-05, + "loss": 0.2579, + "step": 6642 + }, + { + "epoch": 0.566712165159529, + "grad_norm": 1.4861803821014832, + "learning_rate": 4.166701143614315e-05, + "loss": 0.2415, + "step": 6643 + }, + { + "epoch": 0.5667974748336462, + "grad_norm": 1.4904076311781198, + "learning_rate": 4.1653389697271925e-05, + "loss": 0.2386, + "step": 6644 + }, + { + "epoch": 0.5668827845077632, + "grad_norm": 1.5475410029536387, + "learning_rate": 4.1639768595620056e-05, + "loss": 0.2127, + "step": 6645 + }, + { + "epoch": 0.5669680941818802, + "grad_norm": 1.3562211029208824, + "learning_rate": 4.162614813222743e-05, + "loss": 0.2648, + "step": 6646 + }, + { + "epoch": 0.5670534038559972, + "grad_norm": 1.308790778514908, + "learning_rate": 4.1612528308133895e-05, + "loss": 0.2298, + "step": 6647 + }, + { + "epoch": 0.5671387135301144, + "grad_norm": 1.2879699701101883, + "learning_rate": 4.1598909124379237e-05, + "loss": 0.2574, + "step": 6648 + }, + { + "epoch": 0.5672240232042314, + "grad_norm": 1.6166481187418438, + "learning_rate": 4.1585290582003225e-05, + "loss": 0.215, + "step": 6649 + }, + { + "epoch": 0.5673093328783484, + "grad_norm": 1.4679166986597372, + "learning_rate": 4.157167268204559e-05, + "loss": 0.2321, + "step": 6650 + }, + { + "epoch": 0.5673946425524654, + "grad_norm": 1.6072346797325099, + "learning_rate": 4.1558055425545925e-05, + "loss": 0.2495, + "step": 6651 + }, + { + "epoch": 0.5674799522265825, + "grad_norm": 1.8409710647820476, + "learning_rate": 4.154443881354388e-05, + "loss": 0.2533, + "step": 6652 + }, + { + "epoch": 0.5675652619006996, + "grad_norm": 1.480759161896823, + "learning_rate": 4.153082284707902e-05, + "loss": 0.215, + "step": 6653 + }, + { + "epoch": 0.5676505715748166, + "grad_norm": 1.572369495064697, + "learning_rate": 4.15172075271908e-05, + "loss": 0.2586, + "step": 6654 + }, + { + "epoch": 0.5677358812489336, + "grad_norm": 1.5291623267061076, + "learning_rate": 4.1503592854918714e-05, + "loss": 0.175, + "step": 6655 + }, + { + "epoch": 0.5678211909230507, + "grad_norm": 1.9971741740260123, + "learning_rate": 4.148997883130218e-05, + "loss": 0.2793, + "step": 6656 + }, + { + "epoch": 0.5679065005971677, + "grad_norm": 1.6046388434858736, + "learning_rate": 4.147636545738053e-05, + "loss": 0.2008, + "step": 6657 + }, + { + "epoch": 0.5679918102712848, + "grad_norm": 1.4509703546304726, + "learning_rate": 4.146275273419307e-05, + "loss": 0.1956, + "step": 6658 + }, + { + "epoch": 0.5680771199454018, + "grad_norm": 1.38621185064494, + "learning_rate": 4.14491406627791e-05, + "loss": 0.2296, + "step": 6659 + }, + { + "epoch": 0.5681624296195189, + "grad_norm": 1.8214516411183188, + "learning_rate": 4.143552924417777e-05, + "loss": 0.161, + "step": 6660 + }, + { + "epoch": 0.5682477392936359, + "grad_norm": 1.936875150273709, + "learning_rate": 4.1421918479428285e-05, + "loss": 0.2956, + "step": 6661 + }, + { + "epoch": 0.5683330489677529, + "grad_norm": 1.720985346153774, + "learning_rate": 4.1408308369569734e-05, + "loss": 0.2206, + "step": 6662 + }, + { + "epoch": 0.56841835864187, + "grad_norm": 1.9192549671637023, + "learning_rate": 4.13946989156412e-05, + "loss": 0.2699, + "step": 6663 + }, + { + "epoch": 0.5685036683159871, + "grad_norm": 1.6278080520085776, + "learning_rate": 4.138109011868165e-05, + "loss": 0.2729, + "step": 6664 + }, + { + "epoch": 0.5685889779901041, + "grad_norm": 1.4604244045137833, + "learning_rate": 4.1367481979730086e-05, + "loss": 0.1972, + "step": 6665 + }, + { + "epoch": 0.5686742876642211, + "grad_norm": 1.8348529615400826, + "learning_rate": 4.135387449982541e-05, + "loss": 0.2117, + "step": 6666 + }, + { + "epoch": 0.5687595973383381, + "grad_norm": 1.7252313652231706, + "learning_rate": 4.134026768000646e-05, + "loss": 0.2401, + "step": 6667 + }, + { + "epoch": 0.5688449070124552, + "grad_norm": 1.582166877866853, + "learning_rate": 4.132666152131204e-05, + "loss": 0.3159, + "step": 6668 + }, + { + "epoch": 0.5689302166865723, + "grad_norm": 1.3378241083915297, + "learning_rate": 4.131305602478095e-05, + "loss": 0.1981, + "step": 6669 + }, + { + "epoch": 0.5690155263606893, + "grad_norm": 1.5601287396201773, + "learning_rate": 4.129945119145184e-05, + "loss": 0.1995, + "step": 6670 + }, + { + "epoch": 0.5691008360348063, + "grad_norm": 1.4973297193001815, + "learning_rate": 4.128584702236341e-05, + "loss": 0.1857, + "step": 6671 + }, + { + "epoch": 0.5691861457089233, + "grad_norm": 1.580166695167121, + "learning_rate": 4.1272243518554274e-05, + "loss": 0.2231, + "step": 6672 + }, + { + "epoch": 0.5692714553830405, + "grad_norm": 2.0404791460006817, + "learning_rate": 4.1258640681062934e-05, + "loss": 0.2604, + "step": 6673 + }, + { + "epoch": 0.5693567650571575, + "grad_norm": 1.8052554207141838, + "learning_rate": 4.124503851092793e-05, + "loss": 0.2648, + "step": 6674 + }, + { + "epoch": 0.5694420747312745, + "grad_norm": 1.646124061627656, + "learning_rate": 4.123143700918773e-05, + "loss": 0.232, + "step": 6675 + }, + { + "epoch": 0.5695273844053915, + "grad_norm": 1.6388947581960176, + "learning_rate": 4.121783617688071e-05, + "loss": 0.2433, + "step": 6676 + }, + { + "epoch": 0.5696126940795087, + "grad_norm": 1.5959586005518103, + "learning_rate": 4.120423601504523e-05, + "loss": 0.2318, + "step": 6677 + }, + { + "epoch": 0.5696980037536257, + "grad_norm": 1.5151361679976036, + "learning_rate": 4.119063652471958e-05, + "loss": 0.2213, + "step": 6678 + }, + { + "epoch": 0.5697833134277427, + "grad_norm": 1.862125766202671, + "learning_rate": 4.117703770694204e-05, + "loss": 0.2299, + "step": 6679 + }, + { + "epoch": 0.5698686231018597, + "grad_norm": 1.4390027028602521, + "learning_rate": 4.1163439562750767e-05, + "loss": 0.2234, + "step": 6680 + }, + { + "epoch": 0.5699539327759768, + "grad_norm": 1.2648624901799923, + "learning_rate": 4.114984209318392e-05, + "loss": 0.1948, + "step": 6681 + }, + { + "epoch": 0.5700392424500939, + "grad_norm": 1.6212959623034875, + "learning_rate": 4.113624529927963e-05, + "loss": 0.2629, + "step": 6682 + }, + { + "epoch": 0.5701245521242109, + "grad_norm": 2.1427445881665848, + "learning_rate": 4.112264918207588e-05, + "loss": 0.2563, + "step": 6683 + }, + { + "epoch": 0.5702098617983279, + "grad_norm": 2.1934666594629464, + "learning_rate": 4.110905374261069e-05, + "loss": 0.2642, + "step": 6684 + }, + { + "epoch": 0.570295171472445, + "grad_norm": 1.648089186764265, + "learning_rate": 4.109545898192203e-05, + "loss": 0.2397, + "step": 6685 + }, + { + "epoch": 0.570380481146562, + "grad_norm": 1.4700786992499841, + "learning_rate": 4.1081864901047736e-05, + "loss": 0.1947, + "step": 6686 + }, + { + "epoch": 0.570465790820679, + "grad_norm": 1.6132479980588745, + "learning_rate": 4.106827150102567e-05, + "loss": 0.1979, + "step": 6687 + }, + { + "epoch": 0.5705511004947961, + "grad_norm": 1.505178651440039, + "learning_rate": 4.105467878289361e-05, + "loss": 0.2215, + "step": 6688 + }, + { + "epoch": 0.5706364101689132, + "grad_norm": 2.359894145919229, + "learning_rate": 4.10410867476893e-05, + "loss": 0.2743, + "step": 6689 + }, + { + "epoch": 0.5707217198430302, + "grad_norm": 1.8526680136821325, + "learning_rate": 4.102749539645039e-05, + "loss": 0.2338, + "step": 6690 + }, + { + "epoch": 0.5708070295171472, + "grad_norm": 1.4088009902404737, + "learning_rate": 4.1013904730214556e-05, + "loss": 0.1986, + "step": 6691 + }, + { + "epoch": 0.5708923391912643, + "grad_norm": 1.4795497165485902, + "learning_rate": 4.1000314750019316e-05, + "loss": 0.2502, + "step": 6692 + }, + { + "epoch": 0.5709776488653814, + "grad_norm": 1.9634545574209266, + "learning_rate": 4.0986725456902216e-05, + "loss": 0.1892, + "step": 6693 + }, + { + "epoch": 0.5710629585394984, + "grad_norm": 1.5921004491486845, + "learning_rate": 4.097313685190074e-05, + "loss": 0.2475, + "step": 6694 + }, + { + "epoch": 0.5711482682136154, + "grad_norm": 1.9347870814556383, + "learning_rate": 4.095954893605232e-05, + "loss": 0.2455, + "step": 6695 + }, + { + "epoch": 0.5712335778877324, + "grad_norm": 1.7554955025889372, + "learning_rate": 4.0945961710394265e-05, + "loss": 0.2494, + "step": 6696 + }, + { + "epoch": 0.5713188875618496, + "grad_norm": 1.5569129236626162, + "learning_rate": 4.093237517596394e-05, + "loss": 0.2032, + "step": 6697 + }, + { + "epoch": 0.5714041972359666, + "grad_norm": 1.9568379015407724, + "learning_rate": 4.0918789333798576e-05, + "loss": 0.1965, + "step": 6698 + }, + { + "epoch": 0.5714895069100836, + "grad_norm": 1.7524504535108827, + "learning_rate": 4.09052041849354e-05, + "loss": 0.3027, + "step": 6699 + }, + { + "epoch": 0.5715748165842006, + "grad_norm": 1.638088772754119, + "learning_rate": 4.089161973041153e-05, + "loss": 0.1774, + "step": 6700 + }, + { + "epoch": 0.5716601262583177, + "grad_norm": 1.6970039051755033, + "learning_rate": 4.0878035971264125e-05, + "loss": 0.2291, + "step": 6701 + }, + { + "epoch": 0.5717454359324348, + "grad_norm": 1.7172736329404397, + "learning_rate": 4.086445290853018e-05, + "loss": 0.2439, + "step": 6702 + }, + { + "epoch": 0.5718307456065518, + "grad_norm": 1.7397452029756701, + "learning_rate": 4.08508705432467e-05, + "loss": 0.2682, + "step": 6703 + }, + { + "epoch": 0.5719160552806688, + "grad_norm": 1.4457055459547246, + "learning_rate": 4.083728887645066e-05, + "loss": 0.2134, + "step": 6704 + }, + { + "epoch": 0.5720013649547858, + "grad_norm": 1.4757551781616665, + "learning_rate": 4.082370790917891e-05, + "loss": 0.2265, + "step": 6705 + }, + { + "epoch": 0.572086674628903, + "grad_norm": 1.9900851058013875, + "learning_rate": 4.081012764246829e-05, + "loss": 0.1931, + "step": 6706 + }, + { + "epoch": 0.57217198430302, + "grad_norm": 1.400217979355169, + "learning_rate": 4.079654807735559e-05, + "loss": 0.2031, + "step": 6707 + }, + { + "epoch": 0.572257293977137, + "grad_norm": 1.7720749784240208, + "learning_rate": 4.078296921487756e-05, + "loss": 0.3002, + "step": 6708 + }, + { + "epoch": 0.572342603651254, + "grad_norm": 1.5960616755985506, + "learning_rate": 4.076939105607084e-05, + "loss": 0.1998, + "step": 6709 + }, + { + "epoch": 0.5724279133253711, + "grad_norm": 1.9565672790829671, + "learning_rate": 4.0755813601972054e-05, + "loss": 0.3191, + "step": 6710 + }, + { + "epoch": 0.5725132229994881, + "grad_norm": 2.140668214236476, + "learning_rate": 4.074223685361779e-05, + "loss": 0.2245, + "step": 6711 + }, + { + "epoch": 0.5725985326736052, + "grad_norm": 1.3220862938941647, + "learning_rate": 4.0728660812044536e-05, + "loss": 0.2039, + "step": 6712 + }, + { + "epoch": 0.5726838423477222, + "grad_norm": 2.065743064103295, + "learning_rate": 4.071508547828875e-05, + "loss": 0.1736, + "step": 6713 + }, + { + "epoch": 0.5727691520218393, + "grad_norm": 1.38359207334153, + "learning_rate": 4.070151085338688e-05, + "loss": 0.2477, + "step": 6714 + }, + { + "epoch": 0.5728544616959563, + "grad_norm": 1.2478580242424944, + "learning_rate": 4.068793693837522e-05, + "loss": 0.221, + "step": 6715 + }, + { + "epoch": 0.5729397713700733, + "grad_norm": 2.1287651365215767, + "learning_rate": 4.067436373429008e-05, + "loss": 0.2873, + "step": 6716 + }, + { + "epoch": 0.5730250810441904, + "grad_norm": 1.4166722952039017, + "learning_rate": 4.066079124216775e-05, + "loss": 0.2172, + "step": 6717 + }, + { + "epoch": 0.5731103907183075, + "grad_norm": 1.9116250224456757, + "learning_rate": 4.064721946304434e-05, + "loss": 0.2527, + "step": 6718 + }, + { + "epoch": 0.5731957003924245, + "grad_norm": 1.5102257268073023, + "learning_rate": 4.063364839795605e-05, + "loss": 0.2356, + "step": 6719 + }, + { + "epoch": 0.5732810100665415, + "grad_norm": 1.8623731787598616, + "learning_rate": 4.062007804793893e-05, + "loss": 0.2218, + "step": 6720 + }, + { + "epoch": 0.5733663197406585, + "grad_norm": 1.8154333647084868, + "learning_rate": 4.0606508414029e-05, + "loss": 0.247, + "step": 6721 + }, + { + "epoch": 0.5734516294147757, + "grad_norm": 1.5479530662203385, + "learning_rate": 4.059293949726222e-05, + "loss": 0.2413, + "step": 6722 + }, + { + "epoch": 0.5735369390888927, + "grad_norm": 1.5991250447231726, + "learning_rate": 4.057937129867453e-05, + "loss": 0.203, + "step": 6723 + }, + { + "epoch": 0.5736222487630097, + "grad_norm": 1.5881137292335143, + "learning_rate": 4.05658038193018e-05, + "loss": 0.2884, + "step": 6724 + }, + { + "epoch": 0.5737075584371267, + "grad_norm": 1.607017345674307, + "learning_rate": 4.0552237060179796e-05, + "loss": 0.1972, + "step": 6725 + }, + { + "epoch": 0.5737928681112439, + "grad_norm": 1.521494090449565, + "learning_rate": 4.053867102234428e-05, + "loss": 0.2102, + "step": 6726 + }, + { + "epoch": 0.5738781777853609, + "grad_norm": 1.5134234612188122, + "learning_rate": 4.052510570683098e-05, + "loss": 0.2383, + "step": 6727 + }, + { + "epoch": 0.5739634874594779, + "grad_norm": 1.4997400217463555, + "learning_rate": 4.0511541114675487e-05, + "loss": 0.2264, + "step": 6728 + }, + { + "epoch": 0.5740487971335949, + "grad_norm": 1.6026179859688352, + "learning_rate": 4.049797724691342e-05, + "loss": 0.2394, + "step": 6729 + }, + { + "epoch": 0.574134106807712, + "grad_norm": 1.6428361796587634, + "learning_rate": 4.04844141045803e-05, + "loss": 0.2097, + "step": 6730 + }, + { + "epoch": 0.5742194164818291, + "grad_norm": 1.4995552039292677, + "learning_rate": 4.047085168871159e-05, + "loss": 0.2037, + "step": 6731 + }, + { + "epoch": 0.5743047261559461, + "grad_norm": 1.6508455450821713, + "learning_rate": 4.045729000034271e-05, + "loss": 0.2694, + "step": 6732 + }, + { + "epoch": 0.5743900358300631, + "grad_norm": 1.6221736390460992, + "learning_rate": 4.0443729040509045e-05, + "loss": 0.1993, + "step": 6733 + }, + { + "epoch": 0.5744753455041802, + "grad_norm": 1.5568704279920902, + "learning_rate": 4.043016881024587e-05, + "loss": 0.2386, + "step": 6734 + }, + { + "epoch": 0.5745606551782972, + "grad_norm": 1.7381258434305953, + "learning_rate": 4.0416609310588455e-05, + "loss": 0.2249, + "step": 6735 + }, + { + "epoch": 0.5746459648524143, + "grad_norm": 1.8992604799130877, + "learning_rate": 4.0403050542572005e-05, + "loss": 0.2348, + "step": 6736 + }, + { + "epoch": 0.5747312745265313, + "grad_norm": 1.545476107523433, + "learning_rate": 4.0389492507231635e-05, + "loss": 0.2355, + "step": 6737 + }, + { + "epoch": 0.5748165842006484, + "grad_norm": 1.9715923802023265, + "learning_rate": 4.037593520560244e-05, + "loss": 0.3487, + "step": 6738 + }, + { + "epoch": 0.5749018938747654, + "grad_norm": 1.6894666726335423, + "learning_rate": 4.036237863871944e-05, + "loss": 0.2952, + "step": 6739 + }, + { + "epoch": 0.5749872035488824, + "grad_norm": 1.4593197438400374, + "learning_rate": 4.0348822807617654e-05, + "loss": 0.1783, + "step": 6740 + }, + { + "epoch": 0.5750725132229995, + "grad_norm": 1.5596900850953692, + "learning_rate": 4.0335267713331944e-05, + "loss": 0.2588, + "step": 6741 + }, + { + "epoch": 0.5751578228971166, + "grad_norm": 1.404554998579199, + "learning_rate": 4.0321713356897176e-05, + "loss": 0.2411, + "step": 6742 + }, + { + "epoch": 0.5752431325712336, + "grad_norm": 1.372821338954217, + "learning_rate": 4.0308159739348174e-05, + "loss": 0.239, + "step": 6743 + }, + { + "epoch": 0.5753284422453506, + "grad_norm": 2.1047970296542275, + "learning_rate": 4.029460686171967e-05, + "loss": 0.2622, + "step": 6744 + }, + { + "epoch": 0.5754137519194676, + "grad_norm": 2.1247116341996666, + "learning_rate": 4.028105472504634e-05, + "loss": 0.2709, + "step": 6745 + }, + { + "epoch": 0.5754990615935847, + "grad_norm": 1.7729467981858305, + "learning_rate": 4.026750333036286e-05, + "loss": 0.2662, + "step": 6746 + }, + { + "epoch": 0.5755843712677018, + "grad_norm": 1.5440849033591832, + "learning_rate": 4.025395267870376e-05, + "loss": 0.1853, + "step": 6747 + }, + { + "epoch": 0.5756696809418188, + "grad_norm": 1.3762169134613313, + "learning_rate": 4.024040277110359e-05, + "loss": 0.2291, + "step": 6748 + }, + { + "epoch": 0.5757549906159358, + "grad_norm": 1.8539336911825837, + "learning_rate": 4.022685360859683e-05, + "loss": 0.1728, + "step": 6749 + }, + { + "epoch": 0.5758403002900528, + "grad_norm": 1.6049584090408626, + "learning_rate": 4.0213305192217834e-05, + "loss": 0.2073, + "step": 6750 + }, + { + "epoch": 0.57592560996417, + "grad_norm": 1.8401960614693254, + "learning_rate": 4.019975752300099e-05, + "loss": 0.2392, + "step": 6751 + }, + { + "epoch": 0.576010919638287, + "grad_norm": 1.941942167860835, + "learning_rate": 4.0186210601980575e-05, + "loss": 0.2249, + "step": 6752 + }, + { + "epoch": 0.576096229312404, + "grad_norm": 1.6010302892151207, + "learning_rate": 4.0172664430190847e-05, + "loss": 0.2517, + "step": 6753 + }, + { + "epoch": 0.576181538986521, + "grad_norm": 1.4075128606882885, + "learning_rate": 4.0159119008665945e-05, + "loss": 0.2006, + "step": 6754 + }, + { + "epoch": 0.5762668486606382, + "grad_norm": 2.3130848119337273, + "learning_rate": 4.0145574338440016e-05, + "loss": 0.3146, + "step": 6755 + }, + { + "epoch": 0.5763521583347552, + "grad_norm": 1.5406901471521977, + "learning_rate": 4.0132030420547145e-05, + "loss": 0.1855, + "step": 6756 + }, + { + "epoch": 0.5764374680088722, + "grad_norm": 1.9924820027800973, + "learning_rate": 4.011848725602129e-05, + "loss": 0.1914, + "step": 6757 + }, + { + "epoch": 0.5765227776829892, + "grad_norm": 1.7331109466698307, + "learning_rate": 4.010494484589642e-05, + "loss": 0.2164, + "step": 6758 + }, + { + "epoch": 0.5766080873571063, + "grad_norm": 1.3969024245879094, + "learning_rate": 4.009140319120645e-05, + "loss": 0.22, + "step": 6759 + }, + { + "epoch": 0.5766933970312234, + "grad_norm": 1.8881136395675502, + "learning_rate": 4.007786229298517e-05, + "loss": 0.2937, + "step": 6760 + }, + { + "epoch": 0.5767787067053404, + "grad_norm": 1.5199752198257304, + "learning_rate": 4.0064322152266385e-05, + "loss": 0.2627, + "step": 6761 + }, + { + "epoch": 0.5768640163794574, + "grad_norm": 1.6501391698926604, + "learning_rate": 4.005078277008381e-05, + "loss": 0.2377, + "step": 6762 + }, + { + "epoch": 0.5769493260535745, + "grad_norm": 1.4624464432781075, + "learning_rate": 4.00372441474711e-05, + "loss": 0.2381, + "step": 6763 + }, + { + "epoch": 0.5770346357276915, + "grad_norm": 2.174264923932256, + "learning_rate": 4.002370628546185e-05, + "loss": 0.2189, + "step": 6764 + }, + { + "epoch": 0.5771199454018086, + "grad_norm": 2.014644413850199, + "learning_rate": 4.001016918508963e-05, + "loss": 0.1533, + "step": 6765 + }, + { + "epoch": 0.5772052550759256, + "grad_norm": 1.4012264380400965, + "learning_rate": 3.9996632847387885e-05, + "loss": 0.2578, + "step": 6766 + }, + { + "epoch": 0.5772905647500427, + "grad_norm": 1.4388151372487104, + "learning_rate": 3.998309727339006e-05, + "loss": 0.2521, + "step": 6767 + }, + { + "epoch": 0.5773758744241597, + "grad_norm": 1.7240648803362613, + "learning_rate": 3.9969562464129535e-05, + "loss": 0.2478, + "step": 6768 + }, + { + "epoch": 0.5774611840982767, + "grad_norm": 1.554253905901481, + "learning_rate": 3.9956028420639636e-05, + "loss": 0.2434, + "step": 6769 + }, + { + "epoch": 0.5775464937723938, + "grad_norm": 1.668143913130381, + "learning_rate": 3.994249514395356e-05, + "loss": 0.2221, + "step": 6770 + }, + { + "epoch": 0.5776318034465109, + "grad_norm": 1.4402014186183076, + "learning_rate": 3.992896263510456e-05, + "loss": 0.2106, + "step": 6771 + }, + { + "epoch": 0.5777171131206279, + "grad_norm": 1.6117703287633398, + "learning_rate": 3.991543089512574e-05, + "loss": 0.2321, + "step": 6772 + }, + { + "epoch": 0.5778024227947449, + "grad_norm": 1.529617529292597, + "learning_rate": 3.990189992505018e-05, + "loss": 0.2461, + "step": 6773 + }, + { + "epoch": 0.5778877324688619, + "grad_norm": 1.9477614342617433, + "learning_rate": 3.988836972591089e-05, + "loss": 0.2051, + "step": 6774 + }, + { + "epoch": 0.5779730421429791, + "grad_norm": 1.4997448704233276, + "learning_rate": 3.987484029874086e-05, + "loss": 0.1822, + "step": 6775 + }, + { + "epoch": 0.5780583518170961, + "grad_norm": 1.9168036315189407, + "learning_rate": 3.9861311644572944e-05, + "loss": 0.2342, + "step": 6776 + }, + { + "epoch": 0.5781436614912131, + "grad_norm": 1.688157695183893, + "learning_rate": 3.984778376444001e-05, + "loss": 0.2013, + "step": 6777 + }, + { + "epoch": 0.5782289711653301, + "grad_norm": 1.756393334455185, + "learning_rate": 3.9834256659374856e-05, + "loss": 0.2617, + "step": 6778 + }, + { + "epoch": 0.5783142808394472, + "grad_norm": 1.5035922428628186, + "learning_rate": 3.982073033041017e-05, + "loss": 0.2742, + "step": 6779 + }, + { + "epoch": 0.5783995905135643, + "grad_norm": 1.6851197162288118, + "learning_rate": 3.980720477857863e-05, + "loss": 0.2969, + "step": 6780 + }, + { + "epoch": 0.5784849001876813, + "grad_norm": 1.8430372250503089, + "learning_rate": 3.9793680004912856e-05, + "loss": 0.2409, + "step": 6781 + }, + { + "epoch": 0.5785702098617983, + "grad_norm": 1.2594589924786561, + "learning_rate": 3.9780156010445355e-05, + "loss": 0.1962, + "step": 6782 + }, + { + "epoch": 0.5786555195359153, + "grad_norm": 1.2585123141159542, + "learning_rate": 3.976663279620865e-05, + "loss": 0.1763, + "step": 6783 + }, + { + "epoch": 0.5787408292100324, + "grad_norm": 1.5356103744710856, + "learning_rate": 3.9753110363235136e-05, + "loss": 0.2064, + "step": 6784 + }, + { + "epoch": 0.5788261388841495, + "grad_norm": 1.4158304869541043, + "learning_rate": 3.973958871255722e-05, + "loss": 0.242, + "step": 6785 + }, + { + "epoch": 0.5789114485582665, + "grad_norm": 1.6754872880098837, + "learning_rate": 3.972606784520716e-05, + "loss": 0.2027, + "step": 6786 + }, + { + "epoch": 0.5789967582323835, + "grad_norm": 1.2912282148297882, + "learning_rate": 3.9712547762217226e-05, + "loss": 0.175, + "step": 6787 + }, + { + "epoch": 0.5790820679065006, + "grad_norm": 1.6931138086072313, + "learning_rate": 3.969902846461961e-05, + "loss": 0.1827, + "step": 6788 + }, + { + "epoch": 0.5791673775806176, + "grad_norm": 1.809250912010234, + "learning_rate": 3.9685509953446424e-05, + "loss": 0.2253, + "step": 6789 + }, + { + "epoch": 0.5792526872547347, + "grad_norm": 1.7551175630758429, + "learning_rate": 3.967199222972974e-05, + "loss": 0.2111, + "step": 6790 + }, + { + "epoch": 0.5793379969288517, + "grad_norm": 1.4464674999912317, + "learning_rate": 3.965847529450159e-05, + "loss": 0.192, + "step": 6791 + }, + { + "epoch": 0.5794233066029688, + "grad_norm": 1.5825507170639013, + "learning_rate": 3.964495914879387e-05, + "loss": 0.2656, + "step": 6792 + }, + { + "epoch": 0.5795086162770858, + "grad_norm": 1.4891369379737518, + "learning_rate": 3.96314437936385e-05, + "loss": 0.1906, + "step": 6793 + }, + { + "epoch": 0.5795939259512028, + "grad_norm": 1.9043609138871724, + "learning_rate": 3.9617929230067306e-05, + "loss": 0.2226, + "step": 6794 + }, + { + "epoch": 0.5796792356253199, + "grad_norm": 1.7684473082005339, + "learning_rate": 3.960441545911204e-05, + "loss": 0.2493, + "step": 6795 + }, + { + "epoch": 0.579764545299437, + "grad_norm": 1.8005927063715106, + "learning_rate": 3.959090248180441e-05, + "loss": 0.2846, + "step": 6796 + }, + { + "epoch": 0.579849854973554, + "grad_norm": 1.5439967340523764, + "learning_rate": 3.957739029917605e-05, + "loss": 0.2128, + "step": 6797 + }, + { + "epoch": 0.579935164647671, + "grad_norm": 1.916114983427395, + "learning_rate": 3.9563878912258586e-05, + "loss": 0.2838, + "step": 6798 + }, + { + "epoch": 0.580020474321788, + "grad_norm": 1.763959723858345, + "learning_rate": 3.955036832208348e-05, + "loss": 0.2514, + "step": 6799 + }, + { + "epoch": 0.5801057839959052, + "grad_norm": 1.287097334624545, + "learning_rate": 3.9536858529682224e-05, + "loss": 0.2088, + "step": 6800 + }, + { + "epoch": 0.5801910936700222, + "grad_norm": 1.4774201339591198, + "learning_rate": 3.9523349536086247e-05, + "loss": 0.2673, + "step": 6801 + }, + { + "epoch": 0.5802764033441392, + "grad_norm": 1.7668963555906543, + "learning_rate": 3.950984134232683e-05, + "loss": 0.2278, + "step": 6802 + }, + { + "epoch": 0.5803617130182562, + "grad_norm": 1.6450727934804616, + "learning_rate": 3.94963339494353e-05, + "loss": 0.2419, + "step": 6803 + }, + { + "epoch": 0.5804470226923734, + "grad_norm": 1.6381992390506632, + "learning_rate": 3.948282735844285e-05, + "loss": 0.1786, + "step": 6804 + }, + { + "epoch": 0.5805323323664904, + "grad_norm": 1.6060196054480456, + "learning_rate": 3.9469321570380646e-05, + "loss": 0.2068, + "step": 6805 + }, + { + "epoch": 0.5806176420406074, + "grad_norm": 1.8207343926452868, + "learning_rate": 3.945581658627977e-05, + "loss": 0.2309, + "step": 6806 + }, + { + "epoch": 0.5807029517147244, + "grad_norm": 1.748945736077635, + "learning_rate": 3.9442312407171285e-05, + "loss": 0.2013, + "step": 6807 + }, + { + "epoch": 0.5807882613888415, + "grad_norm": 1.9641229661178279, + "learning_rate": 3.9428809034086126e-05, + "loss": 0.2151, + "step": 6808 + }, + { + "epoch": 0.5808735710629586, + "grad_norm": 2.3071226541747296, + "learning_rate": 3.941530646805522e-05, + "loss": 0.2541, + "step": 6809 + }, + { + "epoch": 0.5809588807370756, + "grad_norm": 1.6634398535726675, + "learning_rate": 3.940180471010944e-05, + "loss": 0.2219, + "step": 6810 + }, + { + "epoch": 0.5810441904111926, + "grad_norm": 1.6434017842385582, + "learning_rate": 3.938830376127953e-05, + "loss": 0.2279, + "step": 6811 + }, + { + "epoch": 0.5811295000853097, + "grad_norm": 1.5295427107744497, + "learning_rate": 3.9374803622596236e-05, + "loss": 0.1291, + "step": 6812 + }, + { + "epoch": 0.5812148097594267, + "grad_norm": 1.4377196600227877, + "learning_rate": 3.9361304295090236e-05, + "loss": 0.1689, + "step": 6813 + }, + { + "epoch": 0.5813001194335438, + "grad_norm": 1.599304990022491, + "learning_rate": 3.934780577979212e-05, + "loss": 0.1874, + "step": 6814 + }, + { + "epoch": 0.5813854291076608, + "grad_norm": 1.8808830156751897, + "learning_rate": 3.9334308077732426e-05, + "loss": 0.3407, + "step": 6815 + }, + { + "epoch": 0.5814707387817779, + "grad_norm": 1.9049862283924759, + "learning_rate": 3.932081118994162e-05, + "loss": 0.2488, + "step": 6816 + }, + { + "epoch": 0.5815560484558949, + "grad_norm": 1.9567393315527513, + "learning_rate": 3.930731511745015e-05, + "loss": 0.2445, + "step": 6817 + }, + { + "epoch": 0.5816413581300119, + "grad_norm": 1.4626432038972246, + "learning_rate": 3.9293819861288334e-05, + "loss": 0.229, + "step": 6818 + }, + { + "epoch": 0.581726667804129, + "grad_norm": 2.018270958908694, + "learning_rate": 3.9280325422486474e-05, + "loss": 0.2907, + "step": 6819 + }, + { + "epoch": 0.581811977478246, + "grad_norm": 1.4957888618737003, + "learning_rate": 3.926683180207483e-05, + "loss": 0.2257, + "step": 6820 + }, + { + "epoch": 0.5818972871523631, + "grad_norm": 1.5780888921553784, + "learning_rate": 3.9253339001083524e-05, + "loss": 0.2761, + "step": 6821 + }, + { + "epoch": 0.5819825968264801, + "grad_norm": 1.6171763691542138, + "learning_rate": 3.923984702054266e-05, + "loss": 0.2221, + "step": 6822 + }, + { + "epoch": 0.5820679065005971, + "grad_norm": 1.8562498355955315, + "learning_rate": 3.922635586148234e-05, + "loss": 0.2411, + "step": 6823 + }, + { + "epoch": 0.5821532161747142, + "grad_norm": 1.4497814474153596, + "learning_rate": 3.921286552493246e-05, + "loss": 0.2027, + "step": 6824 + }, + { + "epoch": 0.5822385258488313, + "grad_norm": 1.3132383903485512, + "learning_rate": 3.9199376011922995e-05, + "loss": 0.1673, + "step": 6825 + }, + { + "epoch": 0.5823238355229483, + "grad_norm": 1.7271511403551947, + "learning_rate": 3.918588732348378e-05, + "loss": 0.2184, + "step": 6826 + }, + { + "epoch": 0.5824091451970653, + "grad_norm": 1.753849224021608, + "learning_rate": 3.917239946064459e-05, + "loss": 0.2802, + "step": 6827 + }, + { + "epoch": 0.5824944548711823, + "grad_norm": 1.8859722801105043, + "learning_rate": 3.915891242443516e-05, + "loss": 0.1754, + "step": 6828 + }, + { + "epoch": 0.5825797645452995, + "grad_norm": 1.278450675827457, + "learning_rate": 3.914542621588516e-05, + "loss": 0.1978, + "step": 6829 + }, + { + "epoch": 0.5826650742194165, + "grad_norm": 1.76836499987888, + "learning_rate": 3.913194083602421e-05, + "loss": 0.2127, + "step": 6830 + }, + { + "epoch": 0.5827503838935335, + "grad_norm": 1.6174905833894415, + "learning_rate": 3.9118456285881786e-05, + "loss": 0.2463, + "step": 6831 + }, + { + "epoch": 0.5828356935676505, + "grad_norm": 1.7549285967611972, + "learning_rate": 3.910497256648742e-05, + "loss": 0.1865, + "step": 6832 + }, + { + "epoch": 0.5829210032417677, + "grad_norm": 1.7535776624882302, + "learning_rate": 3.9091489678870514e-05, + "loss": 0.3496, + "step": 6833 + }, + { + "epoch": 0.5830063129158847, + "grad_norm": 2.103143373950431, + "learning_rate": 3.907800762406038e-05, + "loss": 0.2604, + "step": 6834 + }, + { + "epoch": 0.5830916225900017, + "grad_norm": 1.875993465444738, + "learning_rate": 3.906452640308634e-05, + "loss": 0.2116, + "step": 6835 + }, + { + "epoch": 0.5831769322641187, + "grad_norm": 1.2715850649575495, + "learning_rate": 3.90510460169776e-05, + "loss": 0.252, + "step": 6836 + }, + { + "epoch": 0.5832622419382358, + "grad_norm": 1.4552057607086895, + "learning_rate": 3.903756646676331e-05, + "loss": 0.225, + "step": 6837 + }, + { + "epoch": 0.5833475516123529, + "grad_norm": 1.7436705977980727, + "learning_rate": 3.902408775347256e-05, + "loss": 0.25, + "step": 6838 + }, + { + "epoch": 0.5834328612864699, + "grad_norm": 1.9616526481280927, + "learning_rate": 3.90106098781344e-05, + "loss": 0.2734, + "step": 6839 + }, + { + "epoch": 0.5835181709605869, + "grad_norm": 1.3156676441811086, + "learning_rate": 3.899713284177775e-05, + "loss": 0.1822, + "step": 6840 + }, + { + "epoch": 0.583603480634704, + "grad_norm": 1.4631063915896039, + "learning_rate": 3.898365664543155e-05, + "loss": 0.2406, + "step": 6841 + }, + { + "epoch": 0.583688790308821, + "grad_norm": 1.5631642264927945, + "learning_rate": 3.897018129012462e-05, + "loss": 0.2094, + "step": 6842 + }, + { + "epoch": 0.583774099982938, + "grad_norm": 1.4374498275625636, + "learning_rate": 3.895670677688576e-05, + "loss": 0.2218, + "step": 6843 + }, + { + "epoch": 0.5838594096570551, + "grad_norm": 1.7863526456690237, + "learning_rate": 3.8943233106743634e-05, + "loss": 0.1769, + "step": 6844 + }, + { + "epoch": 0.5839447193311722, + "grad_norm": 1.570579240453411, + "learning_rate": 3.8929760280726904e-05, + "loss": 0.2526, + "step": 6845 + }, + { + "epoch": 0.5840300290052892, + "grad_norm": 1.5368891052864686, + "learning_rate": 3.8916288299864164e-05, + "loss": 0.2131, + "step": 6846 + }, + { + "epoch": 0.5841153386794062, + "grad_norm": 1.6538167389589424, + "learning_rate": 3.89028171651839e-05, + "loss": 0.2102, + "step": 6847 + }, + { + "epoch": 0.5842006483535233, + "grad_norm": 1.5661242604606733, + "learning_rate": 3.888934687771458e-05, + "loss": 0.2629, + "step": 6848 + }, + { + "epoch": 0.5842859580276404, + "grad_norm": 1.7180218628101769, + "learning_rate": 3.887587743848459e-05, + "loss": 0.2458, + "step": 6849 + }, + { + "epoch": 0.5843712677017574, + "grad_norm": 1.5843525751396639, + "learning_rate": 3.886240884852223e-05, + "loss": 0.2537, + "step": 6850 + }, + { + "epoch": 0.5844565773758744, + "grad_norm": 1.6623334901645286, + "learning_rate": 3.8848941108855774e-05, + "loss": 0.2353, + "step": 6851 + }, + { + "epoch": 0.5845418870499914, + "grad_norm": 1.5086527642471872, + "learning_rate": 3.883547422051343e-05, + "loss": 0.2302, + "step": 6852 + }, + { + "epoch": 0.5846271967241086, + "grad_norm": 1.6409812903677221, + "learning_rate": 3.8822008184523265e-05, + "loss": 0.1962, + "step": 6853 + }, + { + "epoch": 0.5847125063982256, + "grad_norm": 1.5552049858668746, + "learning_rate": 3.8808543001913384e-05, + "loss": 0.2739, + "step": 6854 + }, + { + "epoch": 0.5847978160723426, + "grad_norm": 1.44139458876444, + "learning_rate": 3.8795078673711795e-05, + "loss": 0.2383, + "step": 6855 + }, + { + "epoch": 0.5848831257464596, + "grad_norm": 1.6460153583676853, + "learning_rate": 3.8781615200946386e-05, + "loss": 0.2483, + "step": 6856 + }, + { + "epoch": 0.5849684354205766, + "grad_norm": 1.684968957586758, + "learning_rate": 3.8768152584645045e-05, + "loss": 0.24, + "step": 6857 + }, + { + "epoch": 0.5850537450946938, + "grad_norm": 1.784240086895008, + "learning_rate": 3.8754690825835564e-05, + "loss": 0.1809, + "step": 6858 + }, + { + "epoch": 0.5851390547688108, + "grad_norm": 1.9421845071636044, + "learning_rate": 3.8741229925545705e-05, + "loss": 0.2169, + "step": 6859 + }, + { + "epoch": 0.5852243644429278, + "grad_norm": 1.7489124051990026, + "learning_rate": 3.872776988480309e-05, + "loss": 0.2547, + "step": 6860 + }, + { + "epoch": 0.5853096741170448, + "grad_norm": 1.4305236539195072, + "learning_rate": 3.8714310704635335e-05, + "loss": 0.2883, + "step": 6861 + }, + { + "epoch": 0.585394983791162, + "grad_norm": 1.6043517596319186, + "learning_rate": 3.870085238607002e-05, + "loss": 0.1789, + "step": 6862 + }, + { + "epoch": 0.585480293465279, + "grad_norm": 1.4734550758195426, + "learning_rate": 3.868739493013455e-05, + "loss": 0.2005, + "step": 6863 + }, + { + "epoch": 0.585565603139396, + "grad_norm": 1.7234789901262906, + "learning_rate": 3.867393833785637e-05, + "loss": 0.2342, + "step": 6864 + }, + { + "epoch": 0.585650912813513, + "grad_norm": 1.360057002023015, + "learning_rate": 3.866048261026283e-05, + "loss": 0.2298, + "step": 6865 + }, + { + "epoch": 0.5857362224876301, + "grad_norm": 1.1615437132696744, + "learning_rate": 3.864702774838116e-05, + "loss": 0.1957, + "step": 6866 + }, + { + "epoch": 0.5858215321617471, + "grad_norm": 1.6104118192467354, + "learning_rate": 3.86335737532386e-05, + "loss": 0.2339, + "step": 6867 + }, + { + "epoch": 0.5859068418358642, + "grad_norm": 2.204837647427509, + "learning_rate": 3.8620120625862285e-05, + "loss": 0.2362, + "step": 6868 + }, + { + "epoch": 0.5859921515099812, + "grad_norm": 1.4271412672920158, + "learning_rate": 3.860666836727929e-05, + "loss": 0.2931, + "step": 6869 + }, + { + "epoch": 0.5860774611840983, + "grad_norm": 1.3164028088442596, + "learning_rate": 3.859321697851661e-05, + "loss": 0.2415, + "step": 6870 + }, + { + "epoch": 0.5861627708582153, + "grad_norm": 1.675022620432686, + "learning_rate": 3.857976646060122e-05, + "loss": 0.237, + "step": 6871 + }, + { + "epoch": 0.5862480805323323, + "grad_norm": 1.5715570025784988, + "learning_rate": 3.8566316814559955e-05, + "loss": 0.257, + "step": 6872 + }, + { + "epoch": 0.5863333902064494, + "grad_norm": 1.426038410562941, + "learning_rate": 3.855286804141964e-05, + "loss": 0.2203, + "step": 6873 + }, + { + "epoch": 0.5864186998805665, + "grad_norm": 1.622267332748958, + "learning_rate": 3.8539420142207013e-05, + "loss": 0.264, + "step": 6874 + }, + { + "epoch": 0.5865040095546835, + "grad_norm": 1.441007564167445, + "learning_rate": 3.8525973117948785e-05, + "loss": 0.2455, + "step": 6875 + }, + { + "epoch": 0.5865893192288005, + "grad_norm": 1.2731090660390263, + "learning_rate": 3.851252696967152e-05, + "loss": 0.2381, + "step": 6876 + }, + { + "epoch": 0.5866746289029175, + "grad_norm": 1.6056576360145387, + "learning_rate": 3.8499081698401786e-05, + "loss": 0.2134, + "step": 6877 + }, + { + "epoch": 0.5867599385770347, + "grad_norm": 1.5477066116573328, + "learning_rate": 3.848563730516604e-05, + "loss": 0.2248, + "step": 6878 + }, + { + "epoch": 0.5868452482511517, + "grad_norm": 2.050905409187982, + "learning_rate": 3.84721937909907e-05, + "loss": 0.2515, + "step": 6879 + }, + { + "epoch": 0.5869305579252687, + "grad_norm": 2.0658315858247525, + "learning_rate": 3.8458751156902104e-05, + "loss": 0.2305, + "step": 6880 + }, + { + "epoch": 0.5870158675993857, + "grad_norm": 1.4688009699641662, + "learning_rate": 3.844530940392654e-05, + "loss": 0.1478, + "step": 6881 + }, + { + "epoch": 0.5871011772735029, + "grad_norm": 1.346838484620931, + "learning_rate": 3.843186853309018e-05, + "loss": 0.2003, + "step": 6882 + }, + { + "epoch": 0.5871864869476199, + "grad_norm": 2.1089171866468055, + "learning_rate": 3.841842854541919e-05, + "loss": 0.3018, + "step": 6883 + }, + { + "epoch": 0.5872717966217369, + "grad_norm": 1.442689732547575, + "learning_rate": 3.8404989441939656e-05, + "loss": 0.2644, + "step": 6884 + }, + { + "epoch": 0.5873571062958539, + "grad_norm": 1.5882378793209204, + "learning_rate": 3.8391551223677535e-05, + "loss": 0.2299, + "step": 6885 + }, + { + "epoch": 0.587442415969971, + "grad_norm": 1.9275251002477143, + "learning_rate": 3.83781138916588e-05, + "loss": 0.1967, + "step": 6886 + }, + { + "epoch": 0.5875277256440881, + "grad_norm": 1.8473377558144632, + "learning_rate": 3.836467744690931e-05, + "loss": 0.2774, + "step": 6887 + }, + { + "epoch": 0.5876130353182051, + "grad_norm": 2.187518092489354, + "learning_rate": 3.835124189045487e-05, + "loss": 0.2402, + "step": 6888 + }, + { + "epoch": 0.5876983449923221, + "grad_norm": 1.5814042721002801, + "learning_rate": 3.8337807223321216e-05, + "loss": 0.2357, + "step": 6889 + }, + { + "epoch": 0.5877836546664392, + "grad_norm": 2.024703877779191, + "learning_rate": 3.832437344653399e-05, + "loss": 0.187, + "step": 6890 + }, + { + "epoch": 0.5878689643405562, + "grad_norm": 1.6442350360919675, + "learning_rate": 3.8310940561118835e-05, + "loss": 0.2099, + "step": 6891 + }, + { + "epoch": 0.5879542740146733, + "grad_norm": 2.0039406340328005, + "learning_rate": 3.829750856810123e-05, + "loss": 0.2597, + "step": 6892 + }, + { + "epoch": 0.5880395836887903, + "grad_norm": 1.380170291400362, + "learning_rate": 3.8284077468506654e-05, + "loss": 0.2435, + "step": 6893 + }, + { + "epoch": 0.5881248933629074, + "grad_norm": 1.3394436697683192, + "learning_rate": 3.8270647263360535e-05, + "loss": 0.2362, + "step": 6894 + }, + { + "epoch": 0.5882102030370244, + "grad_norm": 1.8609602125240765, + "learning_rate": 3.825721795368814e-05, + "loss": 0.251, + "step": 6895 + }, + { + "epoch": 0.5882955127111414, + "grad_norm": 1.859349034232267, + "learning_rate": 3.8243789540514754e-05, + "loss": 0.2231, + "step": 6896 + }, + { + "epoch": 0.5883808223852585, + "grad_norm": 1.821829102659433, + "learning_rate": 3.823036202486559e-05, + "loss": 0.2342, + "step": 6897 + }, + { + "epoch": 0.5884661320593755, + "grad_norm": 1.7128233945347309, + "learning_rate": 3.821693540776571e-05, + "loss": 0.2329, + "step": 6898 + }, + { + "epoch": 0.5885514417334926, + "grad_norm": 1.3942518194719236, + "learning_rate": 3.8203509690240216e-05, + "loss": 0.2289, + "step": 6899 + }, + { + "epoch": 0.5886367514076096, + "grad_norm": 1.954204535642691, + "learning_rate": 3.819008487331407e-05, + "loss": 0.2267, + "step": 6900 + }, + { + "epoch": 0.5887220610817266, + "grad_norm": 1.495930316653595, + "learning_rate": 3.817666095801219e-05, + "loss": 0.2268, + "step": 6901 + }, + { + "epoch": 0.5888073707558437, + "grad_norm": 1.6899231361651452, + "learning_rate": 3.81632379453594e-05, + "loss": 0.1817, + "step": 6902 + }, + { + "epoch": 0.5888926804299608, + "grad_norm": 1.5573572593662577, + "learning_rate": 3.8149815836380484e-05, + "loss": 0.2217, + "step": 6903 + }, + { + "epoch": 0.5889779901040778, + "grad_norm": 1.4057958929340817, + "learning_rate": 3.8136394632100193e-05, + "loss": 0.2277, + "step": 6904 + }, + { + "epoch": 0.5890632997781948, + "grad_norm": 1.7234616289153013, + "learning_rate": 3.8122974333543106e-05, + "loss": 0.2791, + "step": 6905 + }, + { + "epoch": 0.5891486094523118, + "grad_norm": 1.9275649903617988, + "learning_rate": 3.8109554941733805e-05, + "loss": 0.2227, + "step": 6906 + }, + { + "epoch": 0.589233919126429, + "grad_norm": 1.7769273207210812, + "learning_rate": 3.809613645769682e-05, + "loss": 0.2244, + "step": 6907 + }, + { + "epoch": 0.589319228800546, + "grad_norm": 1.3406090510915747, + "learning_rate": 3.8082718882456546e-05, + "loss": 0.2627, + "step": 6908 + }, + { + "epoch": 0.589404538474663, + "grad_norm": 1.2753411790221258, + "learning_rate": 3.806930221703737e-05, + "loss": 0.2278, + "step": 6909 + }, + { + "epoch": 0.58948984814878, + "grad_norm": 1.5196992050746156, + "learning_rate": 3.805588646246356e-05, + "loss": 0.2395, + "step": 6910 + }, + { + "epoch": 0.5895751578228972, + "grad_norm": 2.090709243240319, + "learning_rate": 3.804247161975935e-05, + "loss": 0.2183, + "step": 6911 + }, + { + "epoch": 0.5896604674970142, + "grad_norm": 1.5769916657280199, + "learning_rate": 3.8029057689948885e-05, + "loss": 0.2335, + "step": 6912 + }, + { + "epoch": 0.5897457771711312, + "grad_norm": 1.9649894766489373, + "learning_rate": 3.8015644674056266e-05, + "loss": 0.2132, + "step": 6913 + }, + { + "epoch": 0.5898310868452482, + "grad_norm": 1.5967762955769524, + "learning_rate": 3.8002232573105475e-05, + "loss": 0.2167, + "step": 6914 + }, + { + "epoch": 0.5899163965193653, + "grad_norm": 1.7216494985477182, + "learning_rate": 3.7988821388120466e-05, + "loss": 0.2355, + "step": 6915 + }, + { + "epoch": 0.5900017061934824, + "grad_norm": 1.7933388454796881, + "learning_rate": 3.797541112012514e-05, + "loss": 0.2415, + "step": 6916 + }, + { + "epoch": 0.5900870158675994, + "grad_norm": 1.520637795754691, + "learning_rate": 3.796200177014326e-05, + "loss": 0.2828, + "step": 6917 + }, + { + "epoch": 0.5901723255417164, + "grad_norm": 1.4733795898966675, + "learning_rate": 3.794859333919857e-05, + "loss": 0.2372, + "step": 6918 + }, + { + "epoch": 0.5902576352158335, + "grad_norm": 1.6763312586490042, + "learning_rate": 3.7935185828314754e-05, + "loss": 0.1954, + "step": 6919 + }, + { + "epoch": 0.5903429448899505, + "grad_norm": 1.7589104508316504, + "learning_rate": 3.792177923851537e-05, + "loss": 0.2184, + "step": 6920 + }, + { + "epoch": 0.5904282545640676, + "grad_norm": 1.5660582652939226, + "learning_rate": 3.790837357082398e-05, + "loss": 0.2942, + "step": 6921 + }, + { + "epoch": 0.5905135642381846, + "grad_norm": 1.6589404153085756, + "learning_rate": 3.789496882626399e-05, + "loss": 0.2393, + "step": 6922 + }, + { + "epoch": 0.5905988739123017, + "grad_norm": 1.2189395463706552, + "learning_rate": 3.788156500585883e-05, + "loss": 0.1419, + "step": 6923 + }, + { + "epoch": 0.5906841835864187, + "grad_norm": 1.6070416766817346, + "learning_rate": 3.786816211063176e-05, + "loss": 0.2502, + "step": 6924 + }, + { + "epoch": 0.5907694932605357, + "grad_norm": 1.5107516564021293, + "learning_rate": 3.7854760141606046e-05, + "loss": 0.2804, + "step": 6925 + }, + { + "epoch": 0.5908548029346528, + "grad_norm": 1.789451723362595, + "learning_rate": 3.784135909980488e-05, + "loss": 0.1783, + "step": 6926 + }, + { + "epoch": 0.5909401126087699, + "grad_norm": 1.4574930313231658, + "learning_rate": 3.782795898625131e-05, + "loss": 0.23, + "step": 6927 + }, + { + "epoch": 0.5910254222828869, + "grad_norm": 1.5111202662009864, + "learning_rate": 3.781455980196839e-05, + "loss": 0.2365, + "step": 6928 + }, + { + "epoch": 0.5911107319570039, + "grad_norm": 1.9085384453011112, + "learning_rate": 3.7801161547979095e-05, + "loss": 0.2606, + "step": 6929 + }, + { + "epoch": 0.5911960416311209, + "grad_norm": 1.6300753781962614, + "learning_rate": 3.778776422530628e-05, + "loss": 0.2196, + "step": 6930 + }, + { + "epoch": 0.5912813513052381, + "grad_norm": 1.5008752176871276, + "learning_rate": 3.777436783497277e-05, + "loss": 0.2224, + "step": 6931 + }, + { + "epoch": 0.5913666609793551, + "grad_norm": 1.4715395470249013, + "learning_rate": 3.7760972378001304e-05, + "loss": 0.2214, + "step": 6932 + }, + { + "epoch": 0.5914519706534721, + "grad_norm": 1.506030833654386, + "learning_rate": 3.774757785541457e-05, + "loss": 0.2606, + "step": 6933 + }, + { + "epoch": 0.5915372803275891, + "grad_norm": 1.414386522462766, + "learning_rate": 3.7734184268235146e-05, + "loss": 0.1729, + "step": 6934 + }, + { + "epoch": 0.5916225900017061, + "grad_norm": 1.8142462900982474, + "learning_rate": 3.772079161748557e-05, + "loss": 0.2598, + "step": 6935 + }, + { + "epoch": 0.5917078996758233, + "grad_norm": 1.5565464273667187, + "learning_rate": 3.770739990418832e-05, + "loss": 0.2701, + "step": 6936 + }, + { + "epoch": 0.5917932093499403, + "grad_norm": 1.5744870243597815, + "learning_rate": 3.7694009129365756e-05, + "loss": 0.2482, + "step": 6937 + }, + { + "epoch": 0.5918785190240573, + "grad_norm": 1.5139657602809988, + "learning_rate": 3.768061929404019e-05, + "loss": 0.2497, + "step": 6938 + }, + { + "epoch": 0.5919638286981743, + "grad_norm": 1.2590832658704099, + "learning_rate": 3.76672303992339e-05, + "loss": 0.2116, + "step": 6939 + }, + { + "epoch": 0.5920491383722915, + "grad_norm": 1.448806416134711, + "learning_rate": 3.765384244596901e-05, + "loss": 0.2395, + "step": 6940 + }, + { + "epoch": 0.5921344480464085, + "grad_norm": 1.6416096230421902, + "learning_rate": 3.764045543526765e-05, + "loss": 0.2997, + "step": 6941 + }, + { + "epoch": 0.5922197577205255, + "grad_norm": 1.7883808553211475, + "learning_rate": 3.7627069368151845e-05, + "loss": 0.2492, + "step": 6942 + }, + { + "epoch": 0.5923050673946425, + "grad_norm": 1.8744765504686465, + "learning_rate": 3.7613684245643544e-05, + "loss": 0.3074, + "step": 6943 + }, + { + "epoch": 0.5923903770687596, + "grad_norm": 1.9771734808685117, + "learning_rate": 3.7600300068764615e-05, + "loss": 0.2443, + "step": 6944 + }, + { + "epoch": 0.5924756867428767, + "grad_norm": 2.065753566878747, + "learning_rate": 3.758691683853691e-05, + "loss": 0.2389, + "step": 6945 + }, + { + "epoch": 0.5925609964169937, + "grad_norm": 1.5999628777965909, + "learning_rate": 3.7573534555982115e-05, + "loss": 0.2588, + "step": 6946 + }, + { + "epoch": 0.5926463060911107, + "grad_norm": 1.444237303960806, + "learning_rate": 3.756015322212193e-05, + "loss": 0.2465, + "step": 6947 + }, + { + "epoch": 0.5927316157652278, + "grad_norm": 1.6647606759577582, + "learning_rate": 3.754677283797793e-05, + "loss": 0.2443, + "step": 6948 + }, + { + "epoch": 0.5928169254393448, + "grad_norm": 1.753003268171921, + "learning_rate": 3.753339340457168e-05, + "loss": 0.2565, + "step": 6949 + }, + { + "epoch": 0.5929022351134619, + "grad_norm": 1.5855780133001929, + "learning_rate": 3.752001492292456e-05, + "loss": 0.1793, + "step": 6950 + }, + { + "epoch": 0.5929875447875789, + "grad_norm": 2.189921210800832, + "learning_rate": 3.750663739405801e-05, + "loss": 0.2517, + "step": 6951 + }, + { + "epoch": 0.593072854461696, + "grad_norm": 1.5526000150693071, + "learning_rate": 3.749326081899329e-05, + "loss": 0.2326, + "step": 6952 + }, + { + "epoch": 0.593158164135813, + "grad_norm": 1.525653811522229, + "learning_rate": 3.747988519875166e-05, + "loss": 0.2487, + "step": 6953 + }, + { + "epoch": 0.59324347380993, + "grad_norm": 1.3536613793218575, + "learning_rate": 3.746651053435424e-05, + "loss": 0.2328, + "step": 6954 + }, + { + "epoch": 0.593328783484047, + "grad_norm": 1.4280729922918773, + "learning_rate": 3.7453136826822166e-05, + "loss": 0.1996, + "step": 6955 + }, + { + "epoch": 0.5934140931581642, + "grad_norm": 1.576897323033087, + "learning_rate": 3.74397640771764e-05, + "loss": 0.2116, + "step": 6956 + }, + { + "epoch": 0.5934994028322812, + "grad_norm": 1.6659317621587726, + "learning_rate": 3.742639228643791e-05, + "loss": 0.2059, + "step": 6957 + }, + { + "epoch": 0.5935847125063982, + "grad_norm": 1.1791989660164066, + "learning_rate": 3.7413021455627554e-05, + "loss": 0.196, + "step": 6958 + }, + { + "epoch": 0.5936700221805152, + "grad_norm": 1.7966236768025081, + "learning_rate": 3.7399651585766115e-05, + "loss": 0.1803, + "step": 6959 + }, + { + "epoch": 0.5937553318546324, + "grad_norm": 1.5577633588839783, + "learning_rate": 3.738628267787432e-05, + "loss": 0.231, + "step": 6960 + }, + { + "epoch": 0.5938406415287494, + "grad_norm": 1.5849426606934893, + "learning_rate": 3.737291473297283e-05, + "loss": 0.2507, + "step": 6961 + }, + { + "epoch": 0.5939259512028664, + "grad_norm": 1.6552846722516965, + "learning_rate": 3.735954775208218e-05, + "loss": 0.2341, + "step": 6962 + }, + { + "epoch": 0.5940112608769834, + "grad_norm": 1.5105632140876737, + "learning_rate": 3.73461817362229e-05, + "loss": 0.2604, + "step": 6963 + }, + { + "epoch": 0.5940965705511005, + "grad_norm": 1.7831742450426973, + "learning_rate": 3.733281668641538e-05, + "loss": 0.1897, + "step": 6964 + }, + { + "epoch": 0.5941818802252176, + "grad_norm": 2.239564857110984, + "learning_rate": 3.7319452603680025e-05, + "loss": 0.2937, + "step": 6965 + }, + { + "epoch": 0.5942671898993346, + "grad_norm": 1.6999741552295689, + "learning_rate": 3.7306089489037056e-05, + "loss": 0.2366, + "step": 6966 + }, + { + "epoch": 0.5943524995734516, + "grad_norm": 2.014831505065605, + "learning_rate": 3.729272734350671e-05, + "loss": 0.1926, + "step": 6967 + }, + { + "epoch": 0.5944378092475687, + "grad_norm": 1.655623839433032, + "learning_rate": 3.7279366168109106e-05, + "loss": 0.2458, + "step": 6968 + }, + { + "epoch": 0.5945231189216857, + "grad_norm": 1.5765829578353978, + "learning_rate": 3.7266005963864296e-05, + "loss": 0.1868, + "step": 6969 + }, + { + "epoch": 0.5946084285958028, + "grad_norm": 1.446804616711596, + "learning_rate": 3.725264673179225e-05, + "loss": 0.2522, + "step": 6970 + }, + { + "epoch": 0.5946937382699198, + "grad_norm": 1.6679088572842569, + "learning_rate": 3.72392884729129e-05, + "loss": 0.2284, + "step": 6971 + }, + { + "epoch": 0.5947790479440368, + "grad_norm": 1.5310794968948436, + "learning_rate": 3.722593118824606e-05, + "loss": 0.2068, + "step": 6972 + }, + { + "epoch": 0.5948643576181539, + "grad_norm": 1.4259645945098118, + "learning_rate": 3.7212574878811495e-05, + "loss": 0.1936, + "step": 6973 + }, + { + "epoch": 0.594949667292271, + "grad_norm": 1.6356446012168622, + "learning_rate": 3.71992195456289e-05, + "loss": 0.212, + "step": 6974 + }, + { + "epoch": 0.595034976966388, + "grad_norm": 1.5352300184546317, + "learning_rate": 3.718586518971785e-05, + "loss": 0.1861, + "step": 6975 + }, + { + "epoch": 0.595120286640505, + "grad_norm": 1.6000230042472774, + "learning_rate": 3.7172511812097894e-05, + "loss": 0.2373, + "step": 6976 + }, + { + "epoch": 0.5952055963146221, + "grad_norm": 1.664394148610085, + "learning_rate": 3.715915941378849e-05, + "loss": 0.166, + "step": 6977 + }, + { + "epoch": 0.5952909059887391, + "grad_norm": 1.7085256778393902, + "learning_rate": 3.7145807995809065e-05, + "loss": 0.2122, + "step": 6978 + }, + { + "epoch": 0.5953762156628561, + "grad_norm": 2.0118948553835145, + "learning_rate": 3.7132457559178856e-05, + "loss": 0.222, + "step": 6979 + }, + { + "epoch": 0.5954615253369732, + "grad_norm": 1.410406285993878, + "learning_rate": 3.711910810491714e-05, + "loss": 0.2036, + "step": 6980 + }, + { + "epoch": 0.5955468350110903, + "grad_norm": 1.2213750592080355, + "learning_rate": 3.7105759634043105e-05, + "loss": 0.2229, + "step": 6981 + }, + { + "epoch": 0.5956321446852073, + "grad_norm": 1.5645238357927538, + "learning_rate": 3.709241214757576e-05, + "loss": 0.2417, + "step": 6982 + }, + { + "epoch": 0.5957174543593243, + "grad_norm": 1.8981716142494425, + "learning_rate": 3.7079065646534184e-05, + "loss": 0.1731, + "step": 6983 + }, + { + "epoch": 0.5958027640334413, + "grad_norm": 1.6817883505272708, + "learning_rate": 3.7065720131937275e-05, + "loss": 0.2461, + "step": 6984 + }, + { + "epoch": 0.5958880737075585, + "grad_norm": 1.5112594182657308, + "learning_rate": 3.70523756048039e-05, + "loss": 0.2044, + "step": 6985 + }, + { + "epoch": 0.5959733833816755, + "grad_norm": 1.6502353933791996, + "learning_rate": 3.703903206615284e-05, + "loss": 0.2404, + "step": 6986 + }, + { + "epoch": 0.5960586930557925, + "grad_norm": 1.3066716896093582, + "learning_rate": 3.7025689517002826e-05, + "loss": 0.155, + "step": 6987 + }, + { + "epoch": 0.5961440027299095, + "grad_norm": 1.8159825640988303, + "learning_rate": 3.701234795837245e-05, + "loss": 0.2419, + "step": 6988 + }, + { + "epoch": 0.5962293124040267, + "grad_norm": 2.1458500488025254, + "learning_rate": 3.6999007391280285e-05, + "loss": 0.2239, + "step": 6989 + }, + { + "epoch": 0.5963146220781437, + "grad_norm": 1.549102104747333, + "learning_rate": 3.698566781674485e-05, + "loss": 0.1992, + "step": 6990 + }, + { + "epoch": 0.5963999317522607, + "grad_norm": 1.3138619123534143, + "learning_rate": 3.6972329235784493e-05, + "loss": 0.2048, + "step": 6991 + }, + { + "epoch": 0.5964852414263777, + "grad_norm": 1.6191166230598102, + "learning_rate": 3.695899164941757e-05, + "loss": 0.2664, + "step": 6992 + }, + { + "epoch": 0.5965705511004948, + "grad_norm": 2.197352211835619, + "learning_rate": 3.694565505866234e-05, + "loss": 0.2568, + "step": 6993 + }, + { + "epoch": 0.5966558607746119, + "grad_norm": 1.7857934661749124, + "learning_rate": 3.693231946453699e-05, + "loss": 0.2779, + "step": 6994 + }, + { + "epoch": 0.5967411704487289, + "grad_norm": 1.7701621036330055, + "learning_rate": 3.69189848680596e-05, + "loss": 0.2574, + "step": 6995 + }, + { + "epoch": 0.5968264801228459, + "grad_norm": 1.6319693985061492, + "learning_rate": 3.690565127024819e-05, + "loss": 0.2093, + "step": 6996 + }, + { + "epoch": 0.596911789796963, + "grad_norm": 1.9540428751416499, + "learning_rate": 3.689231867212074e-05, + "loss": 0.2183, + "step": 6997 + }, + { + "epoch": 0.59699709947108, + "grad_norm": 1.2833386586231534, + "learning_rate": 3.6878987074695094e-05, + "loss": 0.1867, + "step": 6998 + }, + { + "epoch": 0.5970824091451971, + "grad_norm": 2.2997843682754184, + "learning_rate": 3.686565647898905e-05, + "loss": 0.2329, + "step": 6999 + }, + { + "epoch": 0.5971677188193141, + "grad_norm": 1.376654886183008, + "learning_rate": 3.685232688602036e-05, + "loss": 0.172, + "step": 7000 + }, + { + "epoch": 0.5972530284934312, + "grad_norm": 1.710419389147185, + "learning_rate": 3.683899829680663e-05, + "loss": 0.2371, + "step": 7001 + }, + { + "epoch": 0.5973383381675482, + "grad_norm": 2.233588447044138, + "learning_rate": 3.682567071236544e-05, + "loss": 0.2706, + "step": 7002 + }, + { + "epoch": 0.5974236478416652, + "grad_norm": 1.3089344549037556, + "learning_rate": 3.68123441337143e-05, + "loss": 0.1776, + "step": 7003 + }, + { + "epoch": 0.5975089575157823, + "grad_norm": 1.568909734975293, + "learning_rate": 3.679901856187059e-05, + "loss": 0.1783, + "step": 7004 + }, + { + "epoch": 0.5975942671898994, + "grad_norm": 1.633406024729591, + "learning_rate": 3.678569399785168e-05, + "loss": 0.2761, + "step": 7005 + }, + { + "epoch": 0.5976795768640164, + "grad_norm": 1.440026012689031, + "learning_rate": 3.6772370442674806e-05, + "loss": 0.227, + "step": 7006 + }, + { + "epoch": 0.5977648865381334, + "grad_norm": 1.677634399419565, + "learning_rate": 3.675904789735716e-05, + "loss": 0.2341, + "step": 7007 + }, + { + "epoch": 0.5978501962122504, + "grad_norm": 1.7026779568018309, + "learning_rate": 3.674572636291582e-05, + "loss": 0.1721, + "step": 7008 + }, + { + "epoch": 0.5979355058863676, + "grad_norm": 2.0828539741891197, + "learning_rate": 3.6732405840367856e-05, + "loss": 0.2471, + "step": 7009 + }, + { + "epoch": 0.5980208155604846, + "grad_norm": 2.2870326497591704, + "learning_rate": 3.6719086330730215e-05, + "loss": 0.2901, + "step": 7010 + }, + { + "epoch": 0.5981061252346016, + "grad_norm": 1.8748418105470113, + "learning_rate": 3.6705767835019736e-05, + "loss": 0.2698, + "step": 7011 + }, + { + "epoch": 0.5981914349087186, + "grad_norm": 1.8644374243336732, + "learning_rate": 3.6692450354253246e-05, + "loss": 0.2637, + "step": 7012 + }, + { + "epoch": 0.5982767445828356, + "grad_norm": 1.9188136708384707, + "learning_rate": 3.667913388944747e-05, + "loss": 0.2205, + "step": 7013 + }, + { + "epoch": 0.5983620542569528, + "grad_norm": 1.7738385524245854, + "learning_rate": 3.666581844161902e-05, + "loss": 0.2387, + "step": 7014 + }, + { + "epoch": 0.5984473639310698, + "grad_norm": 1.5847647704774932, + "learning_rate": 3.665250401178447e-05, + "loss": 0.2627, + "step": 7015 + }, + { + "epoch": 0.5985326736051868, + "grad_norm": 1.6579841316393298, + "learning_rate": 3.6639190600960314e-05, + "loss": 0.2253, + "step": 7016 + }, + { + "epoch": 0.5986179832793038, + "grad_norm": 1.5453969656575695, + "learning_rate": 3.6625878210162966e-05, + "loss": 0.16, + "step": 7017 + }, + { + "epoch": 0.598703292953421, + "grad_norm": 1.7878871205091145, + "learning_rate": 3.661256684040873e-05, + "loss": 0.1822, + "step": 7018 + }, + { + "epoch": 0.598788602627538, + "grad_norm": 1.6587402049093802, + "learning_rate": 3.6599256492713895e-05, + "loss": 0.1819, + "step": 7019 + }, + { + "epoch": 0.598873912301655, + "grad_norm": 1.9627415235076666, + "learning_rate": 3.65859471680946e-05, + "loss": 0.2366, + "step": 7020 + }, + { + "epoch": 0.598959221975772, + "grad_norm": 1.414289424665684, + "learning_rate": 3.657263886756696e-05, + "loss": 0.2516, + "step": 7021 + }, + { + "epoch": 0.5990445316498891, + "grad_norm": 1.8986371096313308, + "learning_rate": 3.6559331592147e-05, + "loss": 0.2457, + "step": 7022 + }, + { + "epoch": 0.5991298413240062, + "grad_norm": 1.3296007820657296, + "learning_rate": 3.654602534285063e-05, + "loss": 0.2813, + "step": 7023 + }, + { + "epoch": 0.5992151509981232, + "grad_norm": 1.9120775790975504, + "learning_rate": 3.653272012069373e-05, + "loss": 0.2179, + "step": 7024 + }, + { + "epoch": 0.5993004606722402, + "grad_norm": 1.630417376715589, + "learning_rate": 3.65194159266921e-05, + "loss": 0.2577, + "step": 7025 + }, + { + "epoch": 0.5993857703463573, + "grad_norm": 1.616117409019632, + "learning_rate": 3.6506112761861425e-05, + "loss": 0.2144, + "step": 7026 + }, + { + "epoch": 0.5994710800204743, + "grad_norm": 1.750413300891532, + "learning_rate": 3.649281062721733e-05, + "loss": 0.248, + "step": 7027 + }, + { + "epoch": 0.5995563896945914, + "grad_norm": 1.761250232726299, + "learning_rate": 3.6479509523775366e-05, + "loss": 0.2323, + "step": 7028 + }, + { + "epoch": 0.5996416993687084, + "grad_norm": 1.6335969345667414, + "learning_rate": 3.646620945255101e-05, + "loss": 0.1805, + "step": 7029 + }, + { + "epoch": 0.5997270090428255, + "grad_norm": 1.8956086860555936, + "learning_rate": 3.645291041455964e-05, + "loss": 0.285, + "step": 7030 + }, + { + "epoch": 0.5998123187169425, + "grad_norm": 1.6221249161818951, + "learning_rate": 3.643961241081656e-05, + "loss": 0.1766, + "step": 7031 + }, + { + "epoch": 0.5998976283910595, + "grad_norm": 1.660747683023734, + "learning_rate": 3.642631544233703e-05, + "loss": 0.1908, + "step": 7032 + }, + { + "epoch": 0.5999829380651766, + "grad_norm": 1.7700075430628508, + "learning_rate": 3.641301951013617e-05, + "loss": 0.2214, + "step": 7033 + }, + { + "epoch": 0.6000682477392937, + "grad_norm": 1.487498801094662, + "learning_rate": 3.639972461522907e-05, + "loss": 0.2182, + "step": 7034 + }, + { + "epoch": 0.6001535574134107, + "grad_norm": 1.4854148836200958, + "learning_rate": 3.638643075863074e-05, + "loss": 0.2189, + "step": 7035 + }, + { + "epoch": 0.6002388670875277, + "grad_norm": 1.5677970362648272, + "learning_rate": 3.637313794135606e-05, + "loss": 0.2444, + "step": 7036 + }, + { + "epoch": 0.6003241767616447, + "grad_norm": 1.7894250760547983, + "learning_rate": 3.63598461644199e-05, + "loss": 0.2032, + "step": 7037 + }, + { + "epoch": 0.6004094864357619, + "grad_norm": 1.4092417364279162, + "learning_rate": 3.6346555428836985e-05, + "loss": 0.2404, + "step": 7038 + }, + { + "epoch": 0.6004947961098789, + "grad_norm": 1.7412870671803629, + "learning_rate": 3.633326573562204e-05, + "loss": 0.2526, + "step": 7039 + }, + { + "epoch": 0.6005801057839959, + "grad_norm": 1.6369826802786358, + "learning_rate": 3.6319977085789606e-05, + "loss": 0.2261, + "step": 7040 + }, + { + "epoch": 0.6006654154581129, + "grad_norm": 1.6349632298712427, + "learning_rate": 3.630668948035422e-05, + "loss": 0.22, + "step": 7041 + }, + { + "epoch": 0.60075072513223, + "grad_norm": 1.457582098609966, + "learning_rate": 3.6293402920330346e-05, + "loss": 0.2345, + "step": 7042 + }, + { + "epoch": 0.6008360348063471, + "grad_norm": 1.6675376603021597, + "learning_rate": 3.6280117406732304e-05, + "loss": 0.1911, + "step": 7043 + }, + { + "epoch": 0.6009213444804641, + "grad_norm": 1.3171924993707487, + "learning_rate": 3.626683294057439e-05, + "loss": 0.2001, + "step": 7044 + }, + { + "epoch": 0.6010066541545811, + "grad_norm": 1.6106551921058914, + "learning_rate": 3.6253549522870825e-05, + "loss": 0.2981, + "step": 7045 + }, + { + "epoch": 0.6010919638286982, + "grad_norm": 1.41811645411089, + "learning_rate": 3.6240267154635686e-05, + "loss": 0.2353, + "step": 7046 + }, + { + "epoch": 0.6011772735028152, + "grad_norm": 2.3314757332419407, + "learning_rate": 3.622698583688304e-05, + "loss": 0.2428, + "step": 7047 + }, + { + "epoch": 0.6012625831769323, + "grad_norm": 2.105012805383292, + "learning_rate": 3.621370557062684e-05, + "loss": 0.2866, + "step": 7048 + }, + { + "epoch": 0.6013478928510493, + "grad_norm": 2.1381411656022546, + "learning_rate": 3.620042635688096e-05, + "loss": 0.2406, + "step": 7049 + }, + { + "epoch": 0.6014332025251663, + "grad_norm": 1.59519451725074, + "learning_rate": 3.618714819665917e-05, + "loss": 0.2646, + "step": 7050 + }, + { + "epoch": 0.6015185121992834, + "grad_norm": 1.4188519714277246, + "learning_rate": 3.617387109097525e-05, + "loss": 0.2446, + "step": 7051 + }, + { + "epoch": 0.6016038218734004, + "grad_norm": 1.6877996390604393, + "learning_rate": 3.616059504084278e-05, + "loss": 0.2076, + "step": 7052 + }, + { + "epoch": 0.6016891315475175, + "grad_norm": 1.5244727896229107, + "learning_rate": 3.614732004727533e-05, + "loss": 0.3077, + "step": 7053 + }, + { + "epoch": 0.6017744412216345, + "grad_norm": 1.6994652580263374, + "learning_rate": 3.6134046111286376e-05, + "loss": 0.2511, + "step": 7054 + }, + { + "epoch": 0.6018597508957516, + "grad_norm": 1.7427225018472747, + "learning_rate": 3.612077323388935e-05, + "loss": 0.189, + "step": 7055 + }, + { + "epoch": 0.6019450605698686, + "grad_norm": 1.8786770686792735, + "learning_rate": 3.610750141609751e-05, + "loss": 0.2442, + "step": 7056 + }, + { + "epoch": 0.6020303702439856, + "grad_norm": 1.6092330721891845, + "learning_rate": 3.609423065892412e-05, + "loss": 0.2983, + "step": 7057 + }, + { + "epoch": 0.6021156799181027, + "grad_norm": 1.3795087756893682, + "learning_rate": 3.608096096338233e-05, + "loss": 0.1819, + "step": 7058 + }, + { + "epoch": 0.6022009895922198, + "grad_norm": 1.6727915595539953, + "learning_rate": 3.606769233048519e-05, + "loss": 0.2183, + "step": 7059 + }, + { + "epoch": 0.6022862992663368, + "grad_norm": 1.348925537339124, + "learning_rate": 3.605442476124571e-05, + "loss": 0.1851, + "step": 7060 + }, + { + "epoch": 0.6023716089404538, + "grad_norm": 1.6434569848271579, + "learning_rate": 3.604115825667681e-05, + "loss": 0.2201, + "step": 7061 + }, + { + "epoch": 0.6024569186145708, + "grad_norm": 2.301467145613612, + "learning_rate": 3.6027892817791275e-05, + "loss": 0.2638, + "step": 7062 + }, + { + "epoch": 0.602542228288688, + "grad_norm": 1.9812967649891373, + "learning_rate": 3.601462844560187e-05, + "loss": 0.1669, + "step": 7063 + }, + { + "epoch": 0.602627537962805, + "grad_norm": 2.0455087076587843, + "learning_rate": 3.6001365141121295e-05, + "loss": 0.2566, + "step": 7064 + }, + { + "epoch": 0.602712847636922, + "grad_norm": 1.5699230109611932, + "learning_rate": 3.598810290536208e-05, + "loss": 0.2544, + "step": 7065 + }, + { + "epoch": 0.602798157311039, + "grad_norm": 1.4398469425027345, + "learning_rate": 3.597484173933675e-05, + "loss": 0.204, + "step": 7066 + }, + { + "epoch": 0.6028834669851562, + "grad_norm": 1.7419479958165784, + "learning_rate": 3.5961581644057744e-05, + "loss": 0.2165, + "step": 7067 + }, + { + "epoch": 0.6029687766592732, + "grad_norm": 1.5921926930710668, + "learning_rate": 3.5948322620537357e-05, + "loss": 0.2034, + "step": 7068 + }, + { + "epoch": 0.6030540863333902, + "grad_norm": 1.6229011479404185, + "learning_rate": 3.593506466978788e-05, + "loss": 0.1961, + "step": 7069 + }, + { + "epoch": 0.6031393960075072, + "grad_norm": 1.6585755486012963, + "learning_rate": 3.592180779282146e-05, + "loss": 0.219, + "step": 7070 + }, + { + "epoch": 0.6032247056816243, + "grad_norm": 1.6071379585796968, + "learning_rate": 3.590855199065023e-05, + "loss": 0.1814, + "step": 7071 + }, + { + "epoch": 0.6033100153557414, + "grad_norm": 1.5247290198538148, + "learning_rate": 3.589529726428615e-05, + "loss": 0.2566, + "step": 7072 + }, + { + "epoch": 0.6033953250298584, + "grad_norm": 1.336587847972553, + "learning_rate": 3.5882043614741165e-05, + "loss": 0.1638, + "step": 7073 + }, + { + "epoch": 0.6034806347039754, + "grad_norm": 1.938179881263922, + "learning_rate": 3.586879104302716e-05, + "loss": 0.2408, + "step": 7074 + }, + { + "epoch": 0.6035659443780925, + "grad_norm": 1.4704557213263334, + "learning_rate": 3.585553955015584e-05, + "loss": 0.2207, + "step": 7075 + }, + { + "epoch": 0.6036512540522095, + "grad_norm": 1.5512648436199634, + "learning_rate": 3.584228913713891e-05, + "loss": 0.1804, + "step": 7076 + }, + { + "epoch": 0.6037365637263266, + "grad_norm": 1.5516296679507142, + "learning_rate": 3.5829039804988e-05, + "loss": 0.2402, + "step": 7077 + }, + { + "epoch": 0.6038218734004436, + "grad_norm": 1.3954844062395901, + "learning_rate": 3.5815791554714564e-05, + "loss": 0.1645, + "step": 7078 + }, + { + "epoch": 0.6039071830745607, + "grad_norm": 1.7409154242185996, + "learning_rate": 3.580254438733008e-05, + "loss": 0.2944, + "step": 7079 + }, + { + "epoch": 0.6039924927486777, + "grad_norm": 1.6267845918299484, + "learning_rate": 3.5789298303845884e-05, + "loss": 0.2781, + "step": 7080 + }, + { + "epoch": 0.6040778024227947, + "grad_norm": 1.5661864471328493, + "learning_rate": 3.577605330527326e-05, + "loss": 0.1924, + "step": 7081 + }, + { + "epoch": 0.6041631120969118, + "grad_norm": 1.565787861589954, + "learning_rate": 3.576280939262336e-05, + "loss": 0.2374, + "step": 7082 + }, + { + "epoch": 0.6042484217710289, + "grad_norm": 1.5741022791028874, + "learning_rate": 3.574956656690731e-05, + "loss": 0.2417, + "step": 7083 + }, + { + "epoch": 0.6043337314451459, + "grad_norm": 1.5934679118319917, + "learning_rate": 3.573632482913615e-05, + "loss": 0.1543, + "step": 7084 + }, + { + "epoch": 0.6044190411192629, + "grad_norm": 1.4257809155607484, + "learning_rate": 3.572308418032076e-05, + "loss": 0.2078, + "step": 7085 + }, + { + "epoch": 0.6045043507933799, + "grad_norm": 1.7695971344057166, + "learning_rate": 3.570984462147203e-05, + "loss": 0.1952, + "step": 7086 + }, + { + "epoch": 0.604589660467497, + "grad_norm": 1.746262032338374, + "learning_rate": 3.5696606153600754e-05, + "loss": 0.2312, + "step": 7087 + }, + { + "epoch": 0.6046749701416141, + "grad_norm": 1.4095738441959014, + "learning_rate": 3.568336877771756e-05, + "loss": 0.1543, + "step": 7088 + }, + { + "epoch": 0.6047602798157311, + "grad_norm": 1.9393923962200412, + "learning_rate": 3.56701324948331e-05, + "loss": 0.2324, + "step": 7089 + }, + { + "epoch": 0.6048455894898481, + "grad_norm": 2.017967459825553, + "learning_rate": 3.565689730595787e-05, + "loss": 0.2491, + "step": 7090 + }, + { + "epoch": 0.6049308991639651, + "grad_norm": 1.9734729851245387, + "learning_rate": 3.5643663212102306e-05, + "loss": 0.2337, + "step": 7091 + }, + { + "epoch": 0.6050162088380823, + "grad_norm": 1.4191283641551193, + "learning_rate": 3.563043021427677e-05, + "loss": 0.1558, + "step": 7092 + }, + { + "epoch": 0.6051015185121993, + "grad_norm": 1.762203851804246, + "learning_rate": 3.561719831349153e-05, + "loss": 0.2161, + "step": 7093 + }, + { + "epoch": 0.6051868281863163, + "grad_norm": 1.8089091806551933, + "learning_rate": 3.5603967510756764e-05, + "loss": 0.2279, + "step": 7094 + }, + { + "epoch": 0.6052721378604333, + "grad_norm": 1.5634734363989282, + "learning_rate": 3.559073780708257e-05, + "loss": 0.2328, + "step": 7095 + }, + { + "epoch": 0.6053574475345505, + "grad_norm": 1.2811571413211014, + "learning_rate": 3.5577509203479e-05, + "loss": 0.2, + "step": 7096 + }, + { + "epoch": 0.6054427572086675, + "grad_norm": 2.0435055087775753, + "learning_rate": 3.556428170095593e-05, + "loss": 0.2183, + "step": 7097 + }, + { + "epoch": 0.6055280668827845, + "grad_norm": 1.8061970924099753, + "learning_rate": 3.5551055300523254e-05, + "loss": 0.2197, + "step": 7098 + }, + { + "epoch": 0.6056133765569015, + "grad_norm": 2.023496413385838, + "learning_rate": 3.553783000319072e-05, + "loss": 0.276, + "step": 7099 + }, + { + "epoch": 0.6056986862310186, + "grad_norm": 1.6269567885706802, + "learning_rate": 3.552460580996803e-05, + "loss": 0.2205, + "step": 7100 + }, + { + "epoch": 0.6057839959051357, + "grad_norm": 1.5224947628939651, + "learning_rate": 3.551138272186475e-05, + "loss": 0.2149, + "step": 7101 + }, + { + "epoch": 0.6058693055792527, + "grad_norm": 1.71676875276935, + "learning_rate": 3.5498160739890404e-05, + "loss": 0.2195, + "step": 7102 + }, + { + "epoch": 0.6059546152533697, + "grad_norm": 1.7924720225684623, + "learning_rate": 3.548493986505444e-05, + "loss": 0.2396, + "step": 7103 + }, + { + "epoch": 0.6060399249274868, + "grad_norm": 1.407848297578525, + "learning_rate": 3.547172009836617e-05, + "loss": 0.2133, + "step": 7104 + }, + { + "epoch": 0.6061252346016038, + "grad_norm": 1.664859276515212, + "learning_rate": 3.545850144083486e-05, + "loss": 0.2555, + "step": 7105 + }, + { + "epoch": 0.6062105442757209, + "grad_norm": 2.084874028487248, + "learning_rate": 3.544528389346972e-05, + "loss": 0.3101, + "step": 7106 + }, + { + "epoch": 0.6062958539498379, + "grad_norm": 1.6386228424484084, + "learning_rate": 3.5432067457279775e-05, + "loss": 0.2414, + "step": 7107 + }, + { + "epoch": 0.606381163623955, + "grad_norm": 1.7896026728706027, + "learning_rate": 3.5418852133274084e-05, + "loss": 0.288, + "step": 7108 + }, + { + "epoch": 0.606466473298072, + "grad_norm": 1.834194703477827, + "learning_rate": 3.5405637922461556e-05, + "loss": 0.2039, + "step": 7109 + }, + { + "epoch": 0.606551782972189, + "grad_norm": 1.63814058659549, + "learning_rate": 3.5392424825851e-05, + "loss": 0.1994, + "step": 7110 + }, + { + "epoch": 0.606637092646306, + "grad_norm": 1.7467805676535282, + "learning_rate": 3.5379212844451206e-05, + "loss": 0.2101, + "step": 7111 + }, + { + "epoch": 0.6067224023204232, + "grad_norm": 1.5889274337216217, + "learning_rate": 3.5366001979270805e-05, + "loss": 0.1476, + "step": 7112 + }, + { + "epoch": 0.6068077119945402, + "grad_norm": 1.6555849575501282, + "learning_rate": 3.5352792231318385e-05, + "loss": 0.2265, + "step": 7113 + }, + { + "epoch": 0.6068930216686572, + "grad_norm": 1.3866957111862586, + "learning_rate": 3.5339583601602443e-05, + "loss": 0.1816, + "step": 7114 + }, + { + "epoch": 0.6069783313427742, + "grad_norm": 1.7439436336735257, + "learning_rate": 3.532637609113138e-05, + "loss": 0.2292, + "step": 7115 + }, + { + "epoch": 0.6070636410168914, + "grad_norm": 1.3485232011031825, + "learning_rate": 3.531316970091355e-05, + "loss": 0.227, + "step": 7116 + }, + { + "epoch": 0.6071489506910084, + "grad_norm": 1.885627636425589, + "learning_rate": 3.529996443195714e-05, + "loss": 0.2473, + "step": 7117 + }, + { + "epoch": 0.6072342603651254, + "grad_norm": 1.643357753004793, + "learning_rate": 3.528676028527035e-05, + "loss": 0.2142, + "step": 7118 + }, + { + "epoch": 0.6073195700392424, + "grad_norm": 1.6807220821351485, + "learning_rate": 3.527355726186123e-05, + "loss": 0.1983, + "step": 7119 + }, + { + "epoch": 0.6074048797133595, + "grad_norm": 1.313973598670782, + "learning_rate": 3.526035536273774e-05, + "loss": 0.1865, + "step": 7120 + }, + { + "epoch": 0.6074901893874766, + "grad_norm": 2.0417167371980947, + "learning_rate": 3.52471545889078e-05, + "loss": 0.1977, + "step": 7121 + }, + { + "epoch": 0.6075754990615936, + "grad_norm": 1.74823460769587, + "learning_rate": 3.523395494137921e-05, + "loss": 0.2065, + "step": 7122 + }, + { + "epoch": 0.6076608087357106, + "grad_norm": 1.747512684613689, + "learning_rate": 3.5220756421159696e-05, + "loss": 0.1981, + "step": 7123 + }, + { + "epoch": 0.6077461184098277, + "grad_norm": 1.7192107536714487, + "learning_rate": 3.520755902925689e-05, + "loss": 0.1823, + "step": 7124 + }, + { + "epoch": 0.6078314280839447, + "grad_norm": 2.0910822255748314, + "learning_rate": 3.519436276667836e-05, + "loss": 0.2179, + "step": 7125 + }, + { + "epoch": 0.6079167377580618, + "grad_norm": 1.458671730380245, + "learning_rate": 3.518116763443153e-05, + "loss": 0.2156, + "step": 7126 + }, + { + "epoch": 0.6080020474321788, + "grad_norm": 1.635168028313819, + "learning_rate": 3.5167973633523804e-05, + "loss": 0.2495, + "step": 7127 + }, + { + "epoch": 0.6080873571062958, + "grad_norm": 2.098905450676935, + "learning_rate": 3.515478076496248e-05, + "loss": 0.2773, + "step": 7128 + }, + { + "epoch": 0.6081726667804129, + "grad_norm": 1.6193837166260454, + "learning_rate": 3.5141589029754776e-05, + "loss": 0.2201, + "step": 7129 + }, + { + "epoch": 0.60825797645453, + "grad_norm": 1.692737927300396, + "learning_rate": 3.5128398428907766e-05, + "loss": 0.2096, + "step": 7130 + }, + { + "epoch": 0.608343286128647, + "grad_norm": 1.913850235360299, + "learning_rate": 3.511520896342852e-05, + "loss": 0.1975, + "step": 7131 + }, + { + "epoch": 0.608428595802764, + "grad_norm": 1.3840669448515885, + "learning_rate": 3.510202063432397e-05, + "loss": 0.2497, + "step": 7132 + }, + { + "epoch": 0.6085139054768811, + "grad_norm": 1.7355641592789104, + "learning_rate": 3.5088833442600985e-05, + "loss": 0.2138, + "step": 7133 + }, + { + "epoch": 0.6085992151509981, + "grad_norm": 1.753430614250558, + "learning_rate": 3.507564738926632e-05, + "loss": 0.2318, + "step": 7134 + }, + { + "epoch": 0.6086845248251151, + "grad_norm": 2.00515880440979, + "learning_rate": 3.506246247532669e-05, + "loss": 0.1665, + "step": 7135 + }, + { + "epoch": 0.6087698344992322, + "grad_norm": 1.6161772295153707, + "learning_rate": 3.504927870178863e-05, + "loss": 0.1968, + "step": 7136 + }, + { + "epoch": 0.6088551441733493, + "grad_norm": 1.331815551118445, + "learning_rate": 3.503609606965872e-05, + "loss": 0.1479, + "step": 7137 + }, + { + "epoch": 0.6089404538474663, + "grad_norm": 2.051978003675863, + "learning_rate": 3.5022914579943365e-05, + "loss": 0.1681, + "step": 7138 + }, + { + "epoch": 0.6090257635215833, + "grad_norm": 1.9596234664468128, + "learning_rate": 3.500973423364887e-05, + "loss": 0.2019, + "step": 7139 + }, + { + "epoch": 0.6091110731957003, + "grad_norm": 1.3282696364688995, + "learning_rate": 3.4996555031781516e-05, + "loss": 0.2212, + "step": 7140 + }, + { + "epoch": 0.6091963828698175, + "grad_norm": 1.7005447300336487, + "learning_rate": 3.498337697534747e-05, + "loss": 0.2209, + "step": 7141 + }, + { + "epoch": 0.6092816925439345, + "grad_norm": 1.8502125385988177, + "learning_rate": 3.497020006535278e-05, + "loss": 0.2228, + "step": 7142 + }, + { + "epoch": 0.6093670022180515, + "grad_norm": 1.9209601853438159, + "learning_rate": 3.4957024302803453e-05, + "loss": 0.2204, + "step": 7143 + }, + { + "epoch": 0.6094523118921685, + "grad_norm": 1.7912093029736436, + "learning_rate": 3.4943849688705376e-05, + "loss": 0.2074, + "step": 7144 + }, + { + "epoch": 0.6095376215662857, + "grad_norm": 1.818889470995856, + "learning_rate": 3.493067622406439e-05, + "loss": 0.2391, + "step": 7145 + }, + { + "epoch": 0.6096229312404027, + "grad_norm": 1.5096396014875189, + "learning_rate": 3.491750390988616e-05, + "loss": 0.2398, + "step": 7146 + }, + { + "epoch": 0.6097082409145197, + "grad_norm": 1.905752586129796, + "learning_rate": 3.4904332747176373e-05, + "loss": 0.2186, + "step": 7147 + }, + { + "epoch": 0.6097935505886367, + "grad_norm": 2.135693515106114, + "learning_rate": 3.4891162736940584e-05, + "loss": 0.23, + "step": 7148 + }, + { + "epoch": 0.6098788602627538, + "grad_norm": 1.686663526273274, + "learning_rate": 3.48779938801842e-05, + "loss": 0.2791, + "step": 7149 + }, + { + "epoch": 0.6099641699368709, + "grad_norm": 1.300933187506458, + "learning_rate": 3.486482617791263e-05, + "loss": 0.1628, + "step": 7150 + }, + { + "epoch": 0.6100494796109879, + "grad_norm": 1.9635977159455367, + "learning_rate": 3.485165963113118e-05, + "loss": 0.2974, + "step": 7151 + }, + { + "epoch": 0.6101347892851049, + "grad_norm": 1.7048553635896795, + "learning_rate": 3.483849424084499e-05, + "loss": 0.2186, + "step": 7152 + }, + { + "epoch": 0.610220098959222, + "grad_norm": 1.8182974865273611, + "learning_rate": 3.482533000805921e-05, + "loss": 0.2495, + "step": 7153 + }, + { + "epoch": 0.610305408633339, + "grad_norm": 1.6146565410465576, + "learning_rate": 3.4812166933778844e-05, + "loss": 0.2592, + "step": 7154 + }, + { + "epoch": 0.6103907183074561, + "grad_norm": 1.9093367802217651, + "learning_rate": 3.4799005019008826e-05, + "loss": 0.2547, + "step": 7155 + }, + { + "epoch": 0.6104760279815731, + "grad_norm": 1.571203178491116, + "learning_rate": 3.478584426475399e-05, + "loss": 0.2474, + "step": 7156 + }, + { + "epoch": 0.6105613376556902, + "grad_norm": 1.373108429702421, + "learning_rate": 3.477268467201911e-05, + "loss": 0.2715, + "step": 7157 + }, + { + "epoch": 0.6106466473298072, + "grad_norm": 1.4849608219000678, + "learning_rate": 3.475952624180882e-05, + "loss": 0.1946, + "step": 7158 + }, + { + "epoch": 0.6107319570039242, + "grad_norm": 1.4727268353856526, + "learning_rate": 3.4746368975127716e-05, + "loss": 0.1999, + "step": 7159 + }, + { + "epoch": 0.6108172666780413, + "grad_norm": 1.6498326187777081, + "learning_rate": 3.473321287298028e-05, + "loss": 0.2455, + "step": 7160 + }, + { + "epoch": 0.6109025763521584, + "grad_norm": 1.4369972842674636, + "learning_rate": 3.472005793637094e-05, + "loss": 0.2541, + "step": 7161 + }, + { + "epoch": 0.6109878860262754, + "grad_norm": 2.182480202286844, + "learning_rate": 3.470690416630395e-05, + "loss": 0.2699, + "step": 7162 + }, + { + "epoch": 0.6110731957003924, + "grad_norm": 1.6192267640027878, + "learning_rate": 3.4693751563783574e-05, + "loss": 0.2887, + "step": 7163 + }, + { + "epoch": 0.6111585053745094, + "grad_norm": 2.00179150929201, + "learning_rate": 3.4680600129813926e-05, + "loss": 0.2724, + "step": 7164 + }, + { + "epoch": 0.6112438150486265, + "grad_norm": 1.743451399856299, + "learning_rate": 3.4667449865399054e-05, + "loss": 0.1754, + "step": 7165 + }, + { + "epoch": 0.6113291247227436, + "grad_norm": 1.5576897391998183, + "learning_rate": 3.4654300771542896e-05, + "loss": 0.2411, + "step": 7166 + }, + { + "epoch": 0.6114144343968606, + "grad_norm": 1.181593208307416, + "learning_rate": 3.4641152849249346e-05, + "loss": 0.1891, + "step": 7167 + }, + { + "epoch": 0.6114997440709776, + "grad_norm": 1.6116660393783049, + "learning_rate": 3.462800609952214e-05, + "loss": 0.2188, + "step": 7168 + }, + { + "epoch": 0.6115850537450946, + "grad_norm": 1.6132753385938912, + "learning_rate": 3.461486052336499e-05, + "loss": 0.1824, + "step": 7169 + }, + { + "epoch": 0.6116703634192118, + "grad_norm": 1.5642187202475712, + "learning_rate": 3.460171612178149e-05, + "loss": 0.2173, + "step": 7170 + }, + { + "epoch": 0.6117556730933288, + "grad_norm": 1.7516839236284438, + "learning_rate": 3.4588572895775126e-05, + "loss": 0.1875, + "step": 7171 + }, + { + "epoch": 0.6118409827674458, + "grad_norm": 1.3006140799112091, + "learning_rate": 3.457543084634932e-05, + "loss": 0.255, + "step": 7172 + }, + { + "epoch": 0.6119262924415628, + "grad_norm": 1.4092831431564201, + "learning_rate": 3.456228997450741e-05, + "loss": 0.2207, + "step": 7173 + }, + { + "epoch": 0.61201160211568, + "grad_norm": 2.1094040480132437, + "learning_rate": 3.4549150281252636e-05, + "loss": 0.1745, + "step": 7174 + }, + { + "epoch": 0.612096911789797, + "grad_norm": 1.5917731347683282, + "learning_rate": 3.453601176758813e-05, + "loss": 0.2537, + "step": 7175 + }, + { + "epoch": 0.612182221463914, + "grad_norm": 1.3495549634020705, + "learning_rate": 3.452287443451693e-05, + "loss": 0.2204, + "step": 7176 + }, + { + "epoch": 0.612267531138031, + "grad_norm": 1.3748388629474873, + "learning_rate": 3.4509738283042046e-05, + "loss": 0.1917, + "step": 7177 + }, + { + "epoch": 0.6123528408121481, + "grad_norm": 1.6697302635866689, + "learning_rate": 3.449660331416631e-05, + "loss": 0.2076, + "step": 7178 + }, + { + "epoch": 0.6124381504862652, + "grad_norm": 1.3134305470854604, + "learning_rate": 3.448346952889253e-05, + "loss": 0.1822, + "step": 7179 + }, + { + "epoch": 0.6125234601603822, + "grad_norm": 1.3962074271928706, + "learning_rate": 3.447033692822341e-05, + "loss": 0.2196, + "step": 7180 + }, + { + "epoch": 0.6126087698344992, + "grad_norm": 1.4460300623194497, + "learning_rate": 3.445720551316152e-05, + "loss": 0.21, + "step": 7181 + }, + { + "epoch": 0.6126940795086163, + "grad_norm": 1.5883741780353466, + "learning_rate": 3.44440752847094e-05, + "loss": 0.2454, + "step": 7182 + }, + { + "epoch": 0.6127793891827333, + "grad_norm": 1.898834940433729, + "learning_rate": 3.443094624386949e-05, + "loss": 0.1908, + "step": 7183 + }, + { + "epoch": 0.6128646988568504, + "grad_norm": 1.5895064462421231, + "learning_rate": 3.441781839164408e-05, + "loss": 0.174, + "step": 7184 + }, + { + "epoch": 0.6129500085309674, + "grad_norm": 1.652462353068325, + "learning_rate": 3.440469172903543e-05, + "loss": 0.1851, + "step": 7185 + }, + { + "epoch": 0.6130353182050845, + "grad_norm": 1.3666780230972981, + "learning_rate": 3.4391566257045705e-05, + "loss": 0.196, + "step": 7186 + }, + { + "epoch": 0.6131206278792015, + "grad_norm": 1.2725189266033896, + "learning_rate": 3.437844197667696e-05, + "loss": 0.1838, + "step": 7187 + }, + { + "epoch": 0.6132059375533185, + "grad_norm": 2.490964583504634, + "learning_rate": 3.436531888893113e-05, + "loss": 0.2621, + "step": 7188 + }, + { + "epoch": 0.6132912472274356, + "grad_norm": 1.9686380990333954, + "learning_rate": 3.4352196994810125e-05, + "loss": 0.2545, + "step": 7189 + }, + { + "epoch": 0.6133765569015527, + "grad_norm": 1.6232093334946784, + "learning_rate": 3.433907629531575e-05, + "loss": 0.2466, + "step": 7190 + }, + { + "epoch": 0.6134618665756697, + "grad_norm": 1.2580797373574206, + "learning_rate": 3.432595679144966e-05, + "loss": 0.2248, + "step": 7191 + }, + { + "epoch": 0.6135471762497867, + "grad_norm": 1.6790548863212593, + "learning_rate": 3.431283848421347e-05, + "loss": 0.3304, + "step": 7192 + }, + { + "epoch": 0.6136324859239037, + "grad_norm": 1.5179645532382156, + "learning_rate": 3.429972137460873e-05, + "loss": 0.2357, + "step": 7193 + }, + { + "epoch": 0.6137177955980209, + "grad_norm": 1.8916238715530747, + "learning_rate": 3.4286605463636804e-05, + "loss": 0.2377, + "step": 7194 + }, + { + "epoch": 0.6138031052721379, + "grad_norm": 1.7200682699715837, + "learning_rate": 3.4273490752299064e-05, + "loss": 0.2347, + "step": 7195 + }, + { + "epoch": 0.6138884149462549, + "grad_norm": 1.3801869612664535, + "learning_rate": 3.426037724159673e-05, + "loss": 0.3335, + "step": 7196 + }, + { + "epoch": 0.6139737246203719, + "grad_norm": 1.4794032306177403, + "learning_rate": 3.4247264932530964e-05, + "loss": 0.1827, + "step": 7197 + }, + { + "epoch": 0.614059034294489, + "grad_norm": 1.5606480877153488, + "learning_rate": 3.4234153826102787e-05, + "loss": 0.2431, + "step": 7198 + }, + { + "epoch": 0.6141443439686061, + "grad_norm": 1.4596830479940488, + "learning_rate": 3.422104392331322e-05, + "loss": 0.2144, + "step": 7199 + }, + { + "epoch": 0.6142296536427231, + "grad_norm": 1.3923787406297856, + "learning_rate": 3.4207935225163066e-05, + "loss": 0.1947, + "step": 7200 + }, + { + "epoch": 0.6143149633168401, + "grad_norm": 1.6280573913243357, + "learning_rate": 3.419482773265314e-05, + "loss": 0.1929, + "step": 7201 + }, + { + "epoch": 0.6144002729909571, + "grad_norm": 2.2776006485005866, + "learning_rate": 3.418172144678416e-05, + "loss": 0.1785, + "step": 7202 + }, + { + "epoch": 0.6144855826650742, + "grad_norm": 1.895356869473053, + "learning_rate": 3.416861636855666e-05, + "loss": 0.2472, + "step": 7203 + }, + { + "epoch": 0.6145708923391913, + "grad_norm": 1.7025029157481943, + "learning_rate": 3.415551249897117e-05, + "loss": 0.2363, + "step": 7204 + }, + { + "epoch": 0.6146562020133083, + "grad_norm": 1.7534188525782721, + "learning_rate": 3.4142409839028125e-05, + "loss": 0.2463, + "step": 7205 + }, + { + "epoch": 0.6147415116874253, + "grad_norm": 1.3906244856587004, + "learning_rate": 3.412930838972781e-05, + "loss": 0.2101, + "step": 7206 + }, + { + "epoch": 0.6148268213615424, + "grad_norm": 1.9100157641214652, + "learning_rate": 3.4116208152070473e-05, + "loss": 0.2768, + "step": 7207 + }, + { + "epoch": 0.6149121310356594, + "grad_norm": 1.6158735309156527, + "learning_rate": 3.410310912705622e-05, + "loss": 0.1868, + "step": 7208 + }, + { + "epoch": 0.6149974407097765, + "grad_norm": 1.5060776924569836, + "learning_rate": 3.409001131568513e-05, + "loss": 0.1964, + "step": 7209 + }, + { + "epoch": 0.6150827503838935, + "grad_norm": 2.0841077001202897, + "learning_rate": 3.407691471895711e-05, + "loss": 0.3119, + "step": 7210 + }, + { + "epoch": 0.6151680600580106, + "grad_norm": 1.7964384129088622, + "learning_rate": 3.406381933787204e-05, + "loss": 0.1815, + "step": 7211 + }, + { + "epoch": 0.6152533697321276, + "grad_norm": 1.6138824746549207, + "learning_rate": 3.4050725173429695e-05, + "loss": 0.265, + "step": 7212 + }, + { + "epoch": 0.6153386794062446, + "grad_norm": 1.3716476495350756, + "learning_rate": 3.4037632226629704e-05, + "loss": 0.1882, + "step": 7213 + }, + { + "epoch": 0.6154239890803617, + "grad_norm": 1.3882455077370852, + "learning_rate": 3.402454049847168e-05, + "loss": 0.1949, + "step": 7214 + }, + { + "epoch": 0.6155092987544788, + "grad_norm": 1.9908821167470472, + "learning_rate": 3.4011449989955105e-05, + "loss": 0.2893, + "step": 7215 + }, + { + "epoch": 0.6155946084285958, + "grad_norm": 1.7277790461403026, + "learning_rate": 3.3998360702079345e-05, + "loss": 0.2117, + "step": 7216 + }, + { + "epoch": 0.6156799181027128, + "grad_norm": 1.6426459807578673, + "learning_rate": 3.3985272635843725e-05, + "loss": 0.2418, + "step": 7217 + }, + { + "epoch": 0.6157652277768298, + "grad_norm": 1.8886473850370598, + "learning_rate": 3.3972185792247424e-05, + "loss": 0.2534, + "step": 7218 + }, + { + "epoch": 0.615850537450947, + "grad_norm": 1.7902697358872732, + "learning_rate": 3.395910017228958e-05, + "loss": 0.2375, + "step": 7219 + }, + { + "epoch": 0.615935847125064, + "grad_norm": 1.3416264188349003, + "learning_rate": 3.3946015776969185e-05, + "loss": 0.2053, + "step": 7220 + }, + { + "epoch": 0.616021156799181, + "grad_norm": 1.4775357545763945, + "learning_rate": 3.393293260728517e-05, + "loss": 0.2091, + "step": 7221 + }, + { + "epoch": 0.616106466473298, + "grad_norm": 1.579350967858409, + "learning_rate": 3.39198506642364e-05, + "loss": 0.2704, + "step": 7222 + }, + { + "epoch": 0.6161917761474152, + "grad_norm": 1.7573310023052595, + "learning_rate": 3.390676994882155e-05, + "loss": 0.2486, + "step": 7223 + }, + { + "epoch": 0.6162770858215322, + "grad_norm": 1.9663871027636037, + "learning_rate": 3.3893690462039305e-05, + "loss": 0.265, + "step": 7224 + }, + { + "epoch": 0.6163623954956492, + "grad_norm": 1.3168674645844227, + "learning_rate": 3.388061220488822e-05, + "loss": 0.1711, + "step": 7225 + }, + { + "epoch": 0.6164477051697662, + "grad_norm": 1.5410069609230457, + "learning_rate": 3.386753517836671e-05, + "loss": 0.1736, + "step": 7226 + }, + { + "epoch": 0.6165330148438833, + "grad_norm": 2.125058341627887, + "learning_rate": 3.3854459383473174e-05, + "loss": 0.2477, + "step": 7227 + }, + { + "epoch": 0.6166183245180004, + "grad_norm": 1.6714551166666876, + "learning_rate": 3.384138482120587e-05, + "loss": 0.2317, + "step": 7228 + }, + { + "epoch": 0.6167036341921174, + "grad_norm": 1.5900592329880043, + "learning_rate": 3.382831149256297e-05, + "loss": 0.213, + "step": 7229 + }, + { + "epoch": 0.6167889438662344, + "grad_norm": 1.8174730675130386, + "learning_rate": 3.381523939854253e-05, + "loss": 0.2113, + "step": 7230 + }, + { + "epoch": 0.6168742535403515, + "grad_norm": 1.6989470165943086, + "learning_rate": 3.380216854014259e-05, + "loss": 0.2781, + "step": 7231 + }, + { + "epoch": 0.6169595632144685, + "grad_norm": 1.9035046929997237, + "learning_rate": 3.378909891836098e-05, + "loss": 0.2426, + "step": 7232 + }, + { + "epoch": 0.6170448728885856, + "grad_norm": 1.5955516131596457, + "learning_rate": 3.377603053419552e-05, + "loss": 0.2282, + "step": 7233 + }, + { + "epoch": 0.6171301825627026, + "grad_norm": 2.236439851193815, + "learning_rate": 3.376296338864392e-05, + "loss": 0.2979, + "step": 7234 + }, + { + "epoch": 0.6172154922368197, + "grad_norm": 1.485475313046355, + "learning_rate": 3.3749897482703794e-05, + "loss": 0.1908, + "step": 7235 + }, + { + "epoch": 0.6173008019109367, + "grad_norm": 2.0619048213507747, + "learning_rate": 3.373683281737263e-05, + "loss": 0.2248, + "step": 7236 + }, + { + "epoch": 0.6173861115850537, + "grad_norm": 1.5371940618546733, + "learning_rate": 3.372376939364787e-05, + "loss": 0.2292, + "step": 7237 + }, + { + "epoch": 0.6174714212591708, + "grad_norm": 1.3527782462813558, + "learning_rate": 3.371070721252682e-05, + "loss": 0.2191, + "step": 7238 + }, + { + "epoch": 0.6175567309332879, + "grad_norm": 1.4521734752433393, + "learning_rate": 3.369764627500671e-05, + "loss": 0.2569, + "step": 7239 + }, + { + "epoch": 0.6176420406074049, + "grad_norm": 1.8582884875760979, + "learning_rate": 3.3684586582084674e-05, + "loss": 0.2877, + "step": 7240 + }, + { + "epoch": 0.6177273502815219, + "grad_norm": 1.8145007567626181, + "learning_rate": 3.3671528134757766e-05, + "loss": 0.2675, + "step": 7241 + }, + { + "epoch": 0.6178126599556389, + "grad_norm": 2.0562251034420793, + "learning_rate": 3.36584709340229e-05, + "loss": 0.2649, + "step": 7242 + }, + { + "epoch": 0.617897969629756, + "grad_norm": 1.5985825518500274, + "learning_rate": 3.3645414980876946e-05, + "loss": 0.2313, + "step": 7243 + }, + { + "epoch": 0.6179832793038731, + "grad_norm": 1.6103500079349304, + "learning_rate": 3.363236027631665e-05, + "loss": 0.2582, + "step": 7244 + }, + { + "epoch": 0.6180685889779901, + "grad_norm": 1.3249067021757475, + "learning_rate": 3.361930682133867e-05, + "loss": 0.2316, + "step": 7245 + }, + { + "epoch": 0.6181538986521071, + "grad_norm": 1.3371309894371635, + "learning_rate": 3.3606254616939555e-05, + "loss": 0.2132, + "step": 7246 + }, + { + "epoch": 0.6182392083262241, + "grad_norm": 1.7516128736971985, + "learning_rate": 3.3593203664115807e-05, + "loss": 0.2312, + "step": 7247 + }, + { + "epoch": 0.6183245180003413, + "grad_norm": 1.5077771592816331, + "learning_rate": 3.358015396386376e-05, + "loss": 0.2624, + "step": 7248 + }, + { + "epoch": 0.6184098276744583, + "grad_norm": 1.6638659867140064, + "learning_rate": 3.3567105517179696e-05, + "loss": 0.2759, + "step": 7249 + }, + { + "epoch": 0.6184951373485753, + "grad_norm": 1.7326347624534884, + "learning_rate": 3.3554058325059806e-05, + "loss": 0.2613, + "step": 7250 + }, + { + "epoch": 0.6185804470226923, + "grad_norm": 1.6443742328266484, + "learning_rate": 3.354101238850019e-05, + "loss": 0.1903, + "step": 7251 + }, + { + "epoch": 0.6186657566968095, + "grad_norm": 1.5237373252635662, + "learning_rate": 3.352796770849679e-05, + "loss": 0.2168, + "step": 7252 + }, + { + "epoch": 0.6187510663709265, + "grad_norm": 1.3758920463558915, + "learning_rate": 3.3514924286045526e-05, + "loss": 0.2074, + "step": 7253 + }, + { + "epoch": 0.6188363760450435, + "grad_norm": 1.6842604730910955, + "learning_rate": 3.35018821221422e-05, + "loss": 0.2213, + "step": 7254 + }, + { + "epoch": 0.6189216857191605, + "grad_norm": 2.090187115848783, + "learning_rate": 3.3488841217782493e-05, + "loss": 0.2899, + "step": 7255 + }, + { + "epoch": 0.6190069953932776, + "grad_norm": 1.768612385171475, + "learning_rate": 3.347580157396202e-05, + "loss": 0.1754, + "step": 7256 + }, + { + "epoch": 0.6190923050673947, + "grad_norm": 1.335537900891743, + "learning_rate": 3.3462763191676305e-05, + "loss": 0.2343, + "step": 7257 + }, + { + "epoch": 0.6191776147415117, + "grad_norm": 1.5574406921459327, + "learning_rate": 3.3449726071920724e-05, + "loss": 0.2654, + "step": 7258 + }, + { + "epoch": 0.6192629244156287, + "grad_norm": 1.7317669478829376, + "learning_rate": 3.3436690215690615e-05, + "loss": 0.2174, + "step": 7259 + }, + { + "epoch": 0.6193482340897458, + "grad_norm": 2.2773332808239206, + "learning_rate": 3.34236556239812e-05, + "loss": 0.2003, + "step": 7260 + }, + { + "epoch": 0.6194335437638628, + "grad_norm": 1.4638111594931698, + "learning_rate": 3.3410622297787574e-05, + "loss": 0.1942, + "step": 7261 + }, + { + "epoch": 0.6195188534379799, + "grad_norm": 1.364514642787154, + "learning_rate": 3.339759023810478e-05, + "loss": 0.2241, + "step": 7262 + }, + { + "epoch": 0.6196041631120969, + "grad_norm": 1.9476489394777938, + "learning_rate": 3.3384559445927746e-05, + "loss": 0.2186, + "step": 7263 + }, + { + "epoch": 0.619689472786214, + "grad_norm": 1.816640531136101, + "learning_rate": 3.337152992225133e-05, + "loss": 0.2251, + "step": 7264 + }, + { + "epoch": 0.619774782460331, + "grad_norm": 1.2833333638839388, + "learning_rate": 3.335850166807021e-05, + "loss": 0.1431, + "step": 7265 + }, + { + "epoch": 0.619860092134448, + "grad_norm": 1.656017503074621, + "learning_rate": 3.3345474684379064e-05, + "loss": 0.2711, + "step": 7266 + }, + { + "epoch": 0.619945401808565, + "grad_norm": 1.968574576661078, + "learning_rate": 3.3332448972172434e-05, + "loss": 0.2512, + "step": 7267 + }, + { + "epoch": 0.6200307114826822, + "grad_norm": 1.9476366980864666, + "learning_rate": 3.3319424532444745e-05, + "loss": 0.3656, + "step": 7268 + }, + { + "epoch": 0.6201160211567992, + "grad_norm": 1.6894908746459938, + "learning_rate": 3.3306401366190354e-05, + "loss": 0.2212, + "step": 7269 + }, + { + "epoch": 0.6202013308309162, + "grad_norm": 2.005977044001753, + "learning_rate": 3.3293379474403514e-05, + "loss": 0.2255, + "step": 7270 + }, + { + "epoch": 0.6202866405050332, + "grad_norm": 1.7534986672332458, + "learning_rate": 3.328035885807837e-05, + "loss": 0.1669, + "step": 7271 + }, + { + "epoch": 0.6203719501791504, + "grad_norm": 1.892159713931708, + "learning_rate": 3.3267339518208976e-05, + "loss": 0.2468, + "step": 7272 + }, + { + "epoch": 0.6204572598532674, + "grad_norm": 1.832111348757885, + "learning_rate": 3.32543214557893e-05, + "loss": 0.199, + "step": 7273 + }, + { + "epoch": 0.6205425695273844, + "grad_norm": 1.6545297578110894, + "learning_rate": 3.324130467181318e-05, + "loss": 0.2321, + "step": 7274 + }, + { + "epoch": 0.6206278792015014, + "grad_norm": 1.456349885977598, + "learning_rate": 3.322828916727439e-05, + "loss": 0.238, + "step": 7275 + }, + { + "epoch": 0.6207131888756185, + "grad_norm": 1.9093338457782598, + "learning_rate": 3.321527494316662e-05, + "loss": 0.2675, + "step": 7276 + }, + { + "epoch": 0.6207984985497356, + "grad_norm": 1.6494149297844078, + "learning_rate": 3.32022620004834e-05, + "loss": 0.2574, + "step": 7277 + }, + { + "epoch": 0.6208838082238526, + "grad_norm": 1.5723092967537282, + "learning_rate": 3.3189250340218204e-05, + "loss": 0.2476, + "step": 7278 + }, + { + "epoch": 0.6209691178979696, + "grad_norm": 1.7892261413629422, + "learning_rate": 3.317623996336443e-05, + "loss": 0.2521, + "step": 7279 + }, + { + "epoch": 0.6210544275720866, + "grad_norm": 2.072755919187038, + "learning_rate": 3.316323087091532e-05, + "loss": 0.1646, + "step": 7280 + }, + { + "epoch": 0.6211397372462037, + "grad_norm": 1.6106101916235263, + "learning_rate": 3.315022306386407e-05, + "loss": 0.2247, + "step": 7281 + }, + { + "epoch": 0.6212250469203208, + "grad_norm": 1.6412722809347615, + "learning_rate": 3.313721654320375e-05, + "loss": 0.1841, + "step": 7282 + }, + { + "epoch": 0.6213103565944378, + "grad_norm": 2.1386749964303706, + "learning_rate": 3.312421130992734e-05, + "loss": 0.259, + "step": 7283 + }, + { + "epoch": 0.6213956662685548, + "grad_norm": 1.6468775450598145, + "learning_rate": 3.311120736502771e-05, + "loss": 0.2095, + "step": 7284 + }, + { + "epoch": 0.6214809759426719, + "grad_norm": 1.717814520365745, + "learning_rate": 3.309820470949765e-05, + "loss": 0.19, + "step": 7285 + }, + { + "epoch": 0.621566285616789, + "grad_norm": 1.7537786696490778, + "learning_rate": 3.3085203344329865e-05, + "loss": 0.222, + "step": 7286 + }, + { + "epoch": 0.621651595290906, + "grad_norm": 1.7186069775809396, + "learning_rate": 3.30722032705169e-05, + "loss": 0.1904, + "step": 7287 + }, + { + "epoch": 0.621736904965023, + "grad_norm": 1.5496796123029282, + "learning_rate": 3.305920448905125e-05, + "loss": 0.2307, + "step": 7288 + }, + { + "epoch": 0.6218222146391401, + "grad_norm": 1.5217940636013563, + "learning_rate": 3.304620700092535e-05, + "loss": 0.1994, + "step": 7289 + }, + { + "epoch": 0.6219075243132571, + "grad_norm": 1.8265856921679091, + "learning_rate": 3.303321080713143e-05, + "loss": 0.2177, + "step": 7290 + }, + { + "epoch": 0.6219928339873741, + "grad_norm": 1.6684953273791214, + "learning_rate": 3.30202159086617e-05, + "loss": 0.2564, + "step": 7291 + }, + { + "epoch": 0.6220781436614912, + "grad_norm": 1.6800647400688922, + "learning_rate": 3.300722230650827e-05, + "loss": 0.2252, + "step": 7292 + }, + { + "epoch": 0.6221634533356083, + "grad_norm": 1.9010925564856633, + "learning_rate": 3.2994230001663104e-05, + "loss": 0.2379, + "step": 7293 + }, + { + "epoch": 0.6222487630097253, + "grad_norm": 1.7113851022715423, + "learning_rate": 3.298123899511811e-05, + "loss": 0.2771, + "step": 7294 + }, + { + "epoch": 0.6223340726838423, + "grad_norm": 1.465620074233377, + "learning_rate": 3.2968249287865084e-05, + "loss": 0.2149, + "step": 7295 + }, + { + "epoch": 0.6224193823579593, + "grad_norm": 1.6729942211956539, + "learning_rate": 3.295526088089573e-05, + "loss": 0.2176, + "step": 7296 + }, + { + "epoch": 0.6225046920320765, + "grad_norm": 1.4190795582895015, + "learning_rate": 3.294227377520161e-05, + "loss": 0.2637, + "step": 7297 + }, + { + "epoch": 0.6225900017061935, + "grad_norm": 1.55900411533598, + "learning_rate": 3.292928797177425e-05, + "loss": 0.1942, + "step": 7298 + }, + { + "epoch": 0.6226753113803105, + "grad_norm": 1.6988300448907006, + "learning_rate": 3.291630347160505e-05, + "loss": 0.2314, + "step": 7299 + }, + { + "epoch": 0.6227606210544275, + "grad_norm": 1.8949676325708602, + "learning_rate": 3.290332027568529e-05, + "loss": 0.2772, + "step": 7300 + }, + { + "epoch": 0.6228459307285447, + "grad_norm": 2.073372821043409, + "learning_rate": 3.289033838500618e-05, + "loss": 0.2753, + "step": 7301 + }, + { + "epoch": 0.6229312404026617, + "grad_norm": 1.6932741207498267, + "learning_rate": 3.2877357800558804e-05, + "loss": 0.1959, + "step": 7302 + }, + { + "epoch": 0.6230165500767787, + "grad_norm": 1.4627247857694963, + "learning_rate": 3.286437852333418e-05, + "loss": 0.244, + "step": 7303 + }, + { + "epoch": 0.6231018597508957, + "grad_norm": 1.8670714912927202, + "learning_rate": 3.2851400554323184e-05, + "loss": 0.2094, + "step": 7304 + }, + { + "epoch": 0.6231871694250128, + "grad_norm": 1.8008607528248994, + "learning_rate": 3.2838423894516656e-05, + "loss": 0.2488, + "step": 7305 + }, + { + "epoch": 0.6232724790991299, + "grad_norm": 1.6819154379699257, + "learning_rate": 3.282544854490524e-05, + "loss": 0.229, + "step": 7306 + }, + { + "epoch": 0.6233577887732469, + "grad_norm": 1.198909508412235, + "learning_rate": 3.281247450647956e-05, + "loss": 0.193, + "step": 7307 + }, + { + "epoch": 0.6234430984473639, + "grad_norm": 1.359783418183372, + "learning_rate": 3.279950178023012e-05, + "loss": 0.2173, + "step": 7308 + }, + { + "epoch": 0.623528408121481, + "grad_norm": 1.7953876476313781, + "learning_rate": 3.2786530367147336e-05, + "loss": 0.1852, + "step": 7309 + }, + { + "epoch": 0.623613717795598, + "grad_norm": 1.7192822412549384, + "learning_rate": 3.277356026822147e-05, + "loss": 0.1996, + "step": 7310 + }, + { + "epoch": 0.6236990274697151, + "grad_norm": 1.7494187070804705, + "learning_rate": 3.2760591484442735e-05, + "loss": 0.2583, + "step": 7311 + }, + { + "epoch": 0.6237843371438321, + "grad_norm": 1.427284848355554, + "learning_rate": 3.274762401680124e-05, + "loss": 0.208, + "step": 7312 + }, + { + "epoch": 0.6238696468179492, + "grad_norm": 1.8344957900041883, + "learning_rate": 3.2734657866286974e-05, + "loss": 0.1969, + "step": 7313 + }, + { + "epoch": 0.6239549564920662, + "grad_norm": 1.8254309472027068, + "learning_rate": 3.272169303388982e-05, + "loss": 0.2795, + "step": 7314 + }, + { + "epoch": 0.6240402661661832, + "grad_norm": 1.5508810246654114, + "learning_rate": 3.27087295205996e-05, + "loss": 0.193, + "step": 7315 + }, + { + "epoch": 0.6241255758403003, + "grad_norm": 1.629937373871438, + "learning_rate": 3.269576732740598e-05, + "loss": 0.2295, + "step": 7316 + }, + { + "epoch": 0.6242108855144173, + "grad_norm": 1.8606582188612102, + "learning_rate": 3.268280645529857e-05, + "loss": 0.1989, + "step": 7317 + }, + { + "epoch": 0.6242961951885344, + "grad_norm": 1.9763832463338582, + "learning_rate": 3.2669846905266885e-05, + "loss": 0.2028, + "step": 7318 + }, + { + "epoch": 0.6243815048626514, + "grad_norm": 1.4663382675516843, + "learning_rate": 3.265688867830027e-05, + "loss": 0.2114, + "step": 7319 + }, + { + "epoch": 0.6244668145367684, + "grad_norm": 1.541877147980776, + "learning_rate": 3.264393177538805e-05, + "loss": 0.1623, + "step": 7320 + }, + { + "epoch": 0.6245521242108855, + "grad_norm": 1.8274228956705592, + "learning_rate": 3.2630976197519424e-05, + "loss": 0.2256, + "step": 7321 + }, + { + "epoch": 0.6246374338850026, + "grad_norm": 1.538087332588653, + "learning_rate": 3.2618021945683455e-05, + "loss": 0.2504, + "step": 7322 + }, + { + "epoch": 0.6247227435591196, + "grad_norm": 1.7485308611150472, + "learning_rate": 3.2605069020869136e-05, + "loss": 0.1968, + "step": 7323 + }, + { + "epoch": 0.6248080532332366, + "grad_norm": 1.7227487690619085, + "learning_rate": 3.259211742406537e-05, + "loss": 0.2144, + "step": 7324 + }, + { + "epoch": 0.6248933629073536, + "grad_norm": 1.7776134244459987, + "learning_rate": 3.2579167156260934e-05, + "loss": 0.2224, + "step": 7325 + }, + { + "epoch": 0.6249786725814708, + "grad_norm": 1.3701860950241609, + "learning_rate": 3.256621821844451e-05, + "loss": 0.2806, + "step": 7326 + }, + { + "epoch": 0.6250639822555878, + "grad_norm": 1.8186274247960568, + "learning_rate": 3.255327061160467e-05, + "loss": 0.3253, + "step": 7327 + }, + { + "epoch": 0.6251492919297048, + "grad_norm": 1.2729164414892087, + "learning_rate": 3.2540324336729935e-05, + "loss": 0.2412, + "step": 7328 + }, + { + "epoch": 0.6252346016038218, + "grad_norm": 1.5450233397217772, + "learning_rate": 3.2527379394808635e-05, + "loss": 0.2318, + "step": 7329 + }, + { + "epoch": 0.625319911277939, + "grad_norm": 1.659041158136859, + "learning_rate": 3.251443578682906e-05, + "loss": 0.2945, + "step": 7330 + }, + { + "epoch": 0.625405220952056, + "grad_norm": 1.3726965944443694, + "learning_rate": 3.250149351377942e-05, + "loss": 0.1533, + "step": 7331 + }, + { + "epoch": 0.625490530626173, + "grad_norm": 1.4538888871818065, + "learning_rate": 3.2488552576647746e-05, + "loss": 0.1711, + "step": 7332 + }, + { + "epoch": 0.62557584030029, + "grad_norm": 1.7472629941958453, + "learning_rate": 3.247561297642203e-05, + "loss": 0.2398, + "step": 7333 + }, + { + "epoch": 0.6256611499744071, + "grad_norm": 2.1344354623745345, + "learning_rate": 3.246267471409015e-05, + "loss": 0.2225, + "step": 7334 + }, + { + "epoch": 0.6257464596485242, + "grad_norm": 1.9948564191887874, + "learning_rate": 3.2449737790639855e-05, + "loss": 0.2535, + "step": 7335 + }, + { + "epoch": 0.6258317693226412, + "grad_norm": 1.5907585642558766, + "learning_rate": 3.24368022070588e-05, + "loss": 0.2672, + "step": 7336 + }, + { + "epoch": 0.6259170789967582, + "grad_norm": 1.5165649331036175, + "learning_rate": 3.24238679643346e-05, + "loss": 0.2595, + "step": 7337 + }, + { + "epoch": 0.6260023886708753, + "grad_norm": 1.4571082373521527, + "learning_rate": 3.2410935063454654e-05, + "loss": 0.2339, + "step": 7338 + }, + { + "epoch": 0.6260876983449923, + "grad_norm": 1.287962565612569, + "learning_rate": 3.239800350540635e-05, + "loss": 0.2464, + "step": 7339 + }, + { + "epoch": 0.6261730080191094, + "grad_norm": 1.4129414619663274, + "learning_rate": 3.238507329117694e-05, + "loss": 0.2183, + "step": 7340 + }, + { + "epoch": 0.6262583176932264, + "grad_norm": 1.4540257277226123, + "learning_rate": 3.23721444217536e-05, + "loss": 0.2002, + "step": 7341 + }, + { + "epoch": 0.6263436273673435, + "grad_norm": 1.429953628535115, + "learning_rate": 3.235921689812334e-05, + "loss": 0.1639, + "step": 7342 + }, + { + "epoch": 0.6264289370414605, + "grad_norm": 1.623704540623601, + "learning_rate": 3.234629072127314e-05, + "loss": 0.2165, + "step": 7343 + }, + { + "epoch": 0.6265142467155775, + "grad_norm": 1.6791209130249856, + "learning_rate": 3.233336589218983e-05, + "loss": 0.1505, + "step": 7344 + }, + { + "epoch": 0.6265995563896946, + "grad_norm": 1.613420900669154, + "learning_rate": 3.232044241186014e-05, + "loss": 0.252, + "step": 7345 + }, + { + "epoch": 0.6266848660638117, + "grad_norm": 1.7827939486395123, + "learning_rate": 3.230752028127073e-05, + "loss": 0.2235, + "step": 7346 + }, + { + "epoch": 0.6267701757379287, + "grad_norm": 1.4370677132152552, + "learning_rate": 3.229459950140814e-05, + "loss": 0.1816, + "step": 7347 + }, + { + "epoch": 0.6268554854120457, + "grad_norm": 1.9363044003158112, + "learning_rate": 3.228168007325877e-05, + "loss": 0.1997, + "step": 7348 + }, + { + "epoch": 0.6269407950861627, + "grad_norm": 1.403156990417243, + "learning_rate": 3.2268761997808984e-05, + "loss": 0.2009, + "step": 7349 + }, + { + "epoch": 0.6270261047602799, + "grad_norm": 1.25415683036994, + "learning_rate": 3.2255845276045e-05, + "loss": 0.1745, + "step": 7350 + }, + { + "epoch": 0.6271114144343969, + "grad_norm": 1.729939468074506, + "learning_rate": 3.224292990895292e-05, + "loss": 0.1893, + "step": 7351 + }, + { + "epoch": 0.6271967241085139, + "grad_norm": 1.7382117096781835, + "learning_rate": 3.22300158975188e-05, + "loss": 0.1337, + "step": 7352 + }, + { + "epoch": 0.6272820337826309, + "grad_norm": 1.461499111325349, + "learning_rate": 3.221710324272852e-05, + "loss": 0.1987, + "step": 7353 + }, + { + "epoch": 0.627367343456748, + "grad_norm": 1.7333157715763627, + "learning_rate": 3.2204191945567925e-05, + "loss": 0.2107, + "step": 7354 + }, + { + "epoch": 0.6274526531308651, + "grad_norm": 1.6157339446710148, + "learning_rate": 3.2191282007022705e-05, + "loss": 0.1685, + "step": 7355 + }, + { + "epoch": 0.6275379628049821, + "grad_norm": 1.9037738414331025, + "learning_rate": 3.2178373428078454e-05, + "loss": 0.1999, + "step": 7356 + }, + { + "epoch": 0.6276232724790991, + "grad_norm": 1.920682956107879, + "learning_rate": 3.2165466209720705e-05, + "loss": 0.2187, + "step": 7357 + }, + { + "epoch": 0.6277085821532161, + "grad_norm": 2.8967793700926103, + "learning_rate": 3.215256035293483e-05, + "loss": 0.2352, + "step": 7358 + }, + { + "epoch": 0.6277938918273333, + "grad_norm": 2.0340371597031415, + "learning_rate": 3.213965585870612e-05, + "loss": 0.2244, + "step": 7359 + }, + { + "epoch": 0.6278792015014503, + "grad_norm": 1.8451168360325694, + "learning_rate": 3.2126752728019805e-05, + "loss": 0.2156, + "step": 7360 + }, + { + "epoch": 0.6279645111755673, + "grad_norm": 1.8174971391322146, + "learning_rate": 3.2113850961860915e-05, + "loss": 0.2265, + "step": 7361 + }, + { + "epoch": 0.6280498208496843, + "grad_norm": 1.7789303583212808, + "learning_rate": 3.210095056121446e-05, + "loss": 0.1895, + "step": 7362 + }, + { + "epoch": 0.6281351305238014, + "grad_norm": 2.1397557407983605, + "learning_rate": 3.208805152706533e-05, + "loss": 0.19, + "step": 7363 + }, + { + "epoch": 0.6282204401979185, + "grad_norm": 2.5494495321284782, + "learning_rate": 3.2075153860398265e-05, + "loss": 0.2435, + "step": 7364 + }, + { + "epoch": 0.6283057498720355, + "grad_norm": 1.5521489413260476, + "learning_rate": 3.206225756219796e-05, + "loss": 0.2443, + "step": 7365 + }, + { + "epoch": 0.6283910595461525, + "grad_norm": 2.1543446844196286, + "learning_rate": 3.204936263344896e-05, + "loss": 0.2271, + "step": 7366 + }, + { + "epoch": 0.6284763692202696, + "grad_norm": 2.0614038502293126, + "learning_rate": 3.203646907513575e-05, + "loss": 0.2245, + "step": 7367 + }, + { + "epoch": 0.6285616788943866, + "grad_norm": 1.8436914208368314, + "learning_rate": 3.202357688824265e-05, + "loss": 0.2749, + "step": 7368 + }, + { + "epoch": 0.6286469885685037, + "grad_norm": 2.040995299315658, + "learning_rate": 3.201068607375393e-05, + "loss": 0.2628, + "step": 7369 + }, + { + "epoch": 0.6287322982426207, + "grad_norm": 1.4001131727251903, + "learning_rate": 3.199779663265375e-05, + "loss": 0.2335, + "step": 7370 + }, + { + "epoch": 0.6288176079167378, + "grad_norm": 1.9183882431315873, + "learning_rate": 3.198490856592611e-05, + "loss": 0.229, + "step": 7371 + }, + { + "epoch": 0.6289029175908548, + "grad_norm": 1.8551857180308444, + "learning_rate": 3.197202187455498e-05, + "loss": 0.1853, + "step": 7372 + }, + { + "epoch": 0.6289882272649718, + "grad_norm": 1.764223876275377, + "learning_rate": 3.195913655952419e-05, + "loss": 0.2611, + "step": 7373 + }, + { + "epoch": 0.6290735369390889, + "grad_norm": 1.676706054457429, + "learning_rate": 3.194625262181744e-05, + "loss": 0.2316, + "step": 7374 + }, + { + "epoch": 0.629158846613206, + "grad_norm": 1.6495497869613716, + "learning_rate": 3.1933370062418366e-05, + "loss": 0.1833, + "step": 7375 + }, + { + "epoch": 0.629244156287323, + "grad_norm": 1.5099447718998276, + "learning_rate": 3.1920488882310495e-05, + "loss": 0.2082, + "step": 7376 + }, + { + "epoch": 0.62932946596144, + "grad_norm": 1.8422519611084711, + "learning_rate": 3.1907609082477215e-05, + "loss": 0.2487, + "step": 7377 + }, + { + "epoch": 0.629414775635557, + "grad_norm": 1.4446501992881262, + "learning_rate": 3.189473066390183e-05, + "loss": 0.2101, + "step": 7378 + }, + { + "epoch": 0.6295000853096742, + "grad_norm": 1.4739823985297575, + "learning_rate": 3.1881853627567566e-05, + "loss": 0.18, + "step": 7379 + }, + { + "epoch": 0.6295853949837912, + "grad_norm": 1.6976881126928218, + "learning_rate": 3.186897797445748e-05, + "loss": 0.1974, + "step": 7380 + }, + { + "epoch": 0.6296707046579082, + "grad_norm": 1.715173502784287, + "learning_rate": 3.1856103705554575e-05, + "loss": 0.1959, + "step": 7381 + }, + { + "epoch": 0.6297560143320252, + "grad_norm": 1.7126698541984033, + "learning_rate": 3.184323082184176e-05, + "loss": 0.1577, + "step": 7382 + }, + { + "epoch": 0.6298413240061423, + "grad_norm": 1.5270730483613868, + "learning_rate": 3.1830359324301764e-05, + "loss": 0.1984, + "step": 7383 + }, + { + "epoch": 0.6299266336802594, + "grad_norm": 1.8768821806108154, + "learning_rate": 3.181748921391728e-05, + "loss": 0.2555, + "step": 7384 + }, + { + "epoch": 0.6300119433543764, + "grad_norm": 2.003794170613847, + "learning_rate": 3.1804620491670884e-05, + "loss": 0.209, + "step": 7385 + }, + { + "epoch": 0.6300972530284934, + "grad_norm": 2.07071644423065, + "learning_rate": 3.1791753158545026e-05, + "loss": 0.2082, + "step": 7386 + }, + { + "epoch": 0.6301825627026105, + "grad_norm": 1.3825935147270625, + "learning_rate": 3.1778887215522044e-05, + "loss": 0.2489, + "step": 7387 + }, + { + "epoch": 0.6302678723767275, + "grad_norm": 1.7545375579875502, + "learning_rate": 3.17660226635842e-05, + "loss": 0.2533, + "step": 7388 + }, + { + "epoch": 0.6303531820508446, + "grad_norm": 1.8808457482023568, + "learning_rate": 3.175315950371365e-05, + "loss": 0.3098, + "step": 7389 + }, + { + "epoch": 0.6304384917249616, + "grad_norm": 1.5033993349720116, + "learning_rate": 3.1740297736892376e-05, + "loss": 0.222, + "step": 7390 + }, + { + "epoch": 0.6305238013990787, + "grad_norm": 1.7086680789891564, + "learning_rate": 3.172743736410235e-05, + "loss": 0.1818, + "step": 7391 + }, + { + "epoch": 0.6306091110731957, + "grad_norm": 1.6174868983780242, + "learning_rate": 3.171457838632539e-05, + "loss": 0.2222, + "step": 7392 + }, + { + "epoch": 0.6306944207473127, + "grad_norm": 1.7250716733554872, + "learning_rate": 3.170172080454319e-05, + "loss": 0.2171, + "step": 7393 + }, + { + "epoch": 0.6307797304214298, + "grad_norm": 1.417790173091403, + "learning_rate": 3.168886461973737e-05, + "loss": 0.1984, + "step": 7394 + }, + { + "epoch": 0.6308650400955468, + "grad_norm": 1.8671468945800547, + "learning_rate": 3.167600983288944e-05, + "loss": 0.1693, + "step": 7395 + }, + { + "epoch": 0.6309503497696639, + "grad_norm": 1.7411153599153628, + "learning_rate": 3.166315644498078e-05, + "loss": 0.1773, + "step": 7396 + }, + { + "epoch": 0.6310356594437809, + "grad_norm": 1.5785872282927864, + "learning_rate": 3.165030445699269e-05, + "loss": 0.2011, + "step": 7397 + }, + { + "epoch": 0.631120969117898, + "grad_norm": 1.6362642736081932, + "learning_rate": 3.163745386990634e-05, + "loss": 0.2925, + "step": 7398 + }, + { + "epoch": 0.631206278792015, + "grad_norm": 2.0517795421787812, + "learning_rate": 3.162460468470281e-05, + "loss": 0.1654, + "step": 7399 + }, + { + "epoch": 0.6312915884661321, + "grad_norm": 1.4525852636632626, + "learning_rate": 3.161175690236305e-05, + "loss": 0.225, + "step": 7400 + }, + { + "epoch": 0.6313768981402491, + "grad_norm": 1.6062568159738047, + "learning_rate": 3.159891052386795e-05, + "loss": 0.1709, + "step": 7401 + }, + { + "epoch": 0.6314622078143661, + "grad_norm": 1.5372830865946838, + "learning_rate": 3.158606555019826e-05, + "loss": 0.2332, + "step": 7402 + }, + { + "epoch": 0.6315475174884831, + "grad_norm": 1.7662024439884105, + "learning_rate": 3.157322198233459e-05, + "loss": 0.2118, + "step": 7403 + }, + { + "epoch": 0.6316328271626003, + "grad_norm": 1.4495660000815862, + "learning_rate": 3.156037982125751e-05, + "loss": 0.2089, + "step": 7404 + }, + { + "epoch": 0.6317181368367173, + "grad_norm": 2.1890392882200844, + "learning_rate": 3.1547539067947454e-05, + "loss": 0.1866, + "step": 7405 + }, + { + "epoch": 0.6318034465108343, + "grad_norm": 1.910734311496724, + "learning_rate": 3.153469972338472e-05, + "loss": 0.2102, + "step": 7406 + }, + { + "epoch": 0.6318887561849513, + "grad_norm": 1.7654129171012776, + "learning_rate": 3.1521861788549544e-05, + "loss": 0.2667, + "step": 7407 + }, + { + "epoch": 0.6319740658590685, + "grad_norm": 1.8283228318855491, + "learning_rate": 3.150902526442203e-05, + "loss": 0.2506, + "step": 7408 + }, + { + "epoch": 0.6320593755331855, + "grad_norm": 1.7937067378871463, + "learning_rate": 3.149619015198218e-05, + "loss": 0.1987, + "step": 7409 + }, + { + "epoch": 0.6321446852073025, + "grad_norm": 1.6316382461322285, + "learning_rate": 3.148335645220987e-05, + "loss": 0.1828, + "step": 7410 + }, + { + "epoch": 0.6322299948814195, + "grad_norm": 1.6051993410855805, + "learning_rate": 3.147052416608491e-05, + "loss": 0.2311, + "step": 7411 + }, + { + "epoch": 0.6323153045555366, + "grad_norm": 1.6445155924507, + "learning_rate": 3.145769329458695e-05, + "loss": 0.178, + "step": 7412 + }, + { + "epoch": 0.6324006142296537, + "grad_norm": 1.8499686547794214, + "learning_rate": 3.144486383869557e-05, + "loss": 0.2365, + "step": 7413 + }, + { + "epoch": 0.6324859239037707, + "grad_norm": 1.9597875255131256, + "learning_rate": 3.143203579939023e-05, + "loss": 0.1847, + "step": 7414 + }, + { + "epoch": 0.6325712335778877, + "grad_norm": 1.740250335859922, + "learning_rate": 3.1419209177650324e-05, + "loss": 0.2315, + "step": 7415 + }, + { + "epoch": 0.6326565432520048, + "grad_norm": 1.5553690121075652, + "learning_rate": 3.1406383974455025e-05, + "loss": 0.1825, + "step": 7416 + }, + { + "epoch": 0.6327418529261218, + "grad_norm": 1.288361052198072, + "learning_rate": 3.139356019078352e-05, + "loss": 0.1649, + "step": 7417 + }, + { + "epoch": 0.6328271626002389, + "grad_norm": 1.7821414791334467, + "learning_rate": 3.1380737827614816e-05, + "loss": 0.2663, + "step": 7418 + }, + { + "epoch": 0.6329124722743559, + "grad_norm": 2.059436256528021, + "learning_rate": 3.136791688592784e-05, + "loss": 0.234, + "step": 7419 + }, + { + "epoch": 0.632997781948473, + "grad_norm": 1.4816839620627058, + "learning_rate": 3.1355097366701385e-05, + "loss": 0.2278, + "step": 7420 + }, + { + "epoch": 0.63308309162259, + "grad_norm": 1.5703027070152895, + "learning_rate": 3.134227927091419e-05, + "loss": 0.2537, + "step": 7421 + }, + { + "epoch": 0.633168401296707, + "grad_norm": 1.6227277862105056, + "learning_rate": 3.13294625995448e-05, + "loss": 0.1751, + "step": 7422 + }, + { + "epoch": 0.6332537109708241, + "grad_norm": 1.8103025203609533, + "learning_rate": 3.131664735357174e-05, + "loss": 0.2232, + "step": 7423 + }, + { + "epoch": 0.6333390206449412, + "grad_norm": 1.9018056370838468, + "learning_rate": 3.130383353397338e-05, + "loss": 0.1933, + "step": 7424 + }, + { + "epoch": 0.6334243303190582, + "grad_norm": 1.5240543779348774, + "learning_rate": 3.129102114172796e-05, + "loss": 0.1951, + "step": 7425 + }, + { + "epoch": 0.6335096399931752, + "grad_norm": 1.5879400229189828, + "learning_rate": 3.127821017781365e-05, + "loss": 0.3022, + "step": 7426 + }, + { + "epoch": 0.6335949496672922, + "grad_norm": 1.9567124036591441, + "learning_rate": 3.126540064320853e-05, + "loss": 0.2464, + "step": 7427 + }, + { + "epoch": 0.6336802593414094, + "grad_norm": 1.896986418567356, + "learning_rate": 3.12525925388905e-05, + "loss": 0.213, + "step": 7428 + }, + { + "epoch": 0.6337655690155264, + "grad_norm": 1.4230031997633965, + "learning_rate": 3.1239785865837415e-05, + "loss": 0.1551, + "step": 7429 + }, + { + "epoch": 0.6338508786896434, + "grad_norm": 1.4823786109772898, + "learning_rate": 3.122698062502697e-05, + "loss": 0.2392, + "step": 7430 + }, + { + "epoch": 0.6339361883637604, + "grad_norm": 1.2971757287337056, + "learning_rate": 3.121417681743682e-05, + "loss": 0.219, + "step": 7431 + }, + { + "epoch": 0.6340214980378774, + "grad_norm": 1.5832953030218477, + "learning_rate": 3.120137444404442e-05, + "loss": 0.264, + "step": 7432 + }, + { + "epoch": 0.6341068077119946, + "grad_norm": 1.5450864528183332, + "learning_rate": 3.118857350582719e-05, + "loss": 0.2229, + "step": 7433 + }, + { + "epoch": 0.6341921173861116, + "grad_norm": 1.7335949767539458, + "learning_rate": 3.117577400376243e-05, + "loss": 0.2123, + "step": 7434 + }, + { + "epoch": 0.6342774270602286, + "grad_norm": 1.8352155848785061, + "learning_rate": 3.116297593882727e-05, + "loss": 0.1766, + "step": 7435 + }, + { + "epoch": 0.6343627367343456, + "grad_norm": 2.1294013051478764, + "learning_rate": 3.115017931199879e-05, + "loss": 0.1953, + "step": 7436 + }, + { + "epoch": 0.6344480464084628, + "grad_norm": 1.2083759903229427, + "learning_rate": 3.1137384124253974e-05, + "loss": 0.1302, + "step": 7437 + }, + { + "epoch": 0.6345333560825798, + "grad_norm": 1.368385052038495, + "learning_rate": 3.112459037656963e-05, + "loss": 0.1869, + "step": 7438 + }, + { + "epoch": 0.6346186657566968, + "grad_norm": 1.8000575533248881, + "learning_rate": 3.111179806992251e-05, + "loss": 0.3073, + "step": 7439 + }, + { + "epoch": 0.6347039754308138, + "grad_norm": 1.624405091767643, + "learning_rate": 3.109900720528925e-05, + "loss": 0.229, + "step": 7440 + }, + { + "epoch": 0.6347892851049309, + "grad_norm": 2.021503835390931, + "learning_rate": 3.108621778364634e-05, + "loss": 0.2519, + "step": 7441 + }, + { + "epoch": 0.634874594779048, + "grad_norm": 1.712067252843555, + "learning_rate": 3.107342980597019e-05, + "loss": 0.1995, + "step": 7442 + }, + { + "epoch": 0.634959904453165, + "grad_norm": 1.927098378990481, + "learning_rate": 3.106064327323711e-05, + "loss": 0.1793, + "step": 7443 + }, + { + "epoch": 0.635045214127282, + "grad_norm": 1.465881143079058, + "learning_rate": 3.1047858186423254e-05, + "loss": 0.1647, + "step": 7444 + }, + { + "epoch": 0.6351305238013991, + "grad_norm": 1.7056924209325257, + "learning_rate": 3.1035074546504715e-05, + "loss": 0.2697, + "step": 7445 + }, + { + "epoch": 0.6352158334755161, + "grad_norm": 1.5066823726583742, + "learning_rate": 3.102229235445746e-05, + "loss": 0.2148, + "step": 7446 + }, + { + "epoch": 0.6353011431496332, + "grad_norm": 1.557749890262096, + "learning_rate": 3.1009511611257354e-05, + "loss": 0.2246, + "step": 7447 + }, + { + "epoch": 0.6353864528237502, + "grad_norm": 1.6648047139654187, + "learning_rate": 3.0996732317880096e-05, + "loss": 0.2163, + "step": 7448 + }, + { + "epoch": 0.6354717624978673, + "grad_norm": 1.6199651627268057, + "learning_rate": 3.098395447530136e-05, + "loss": 0.2271, + "step": 7449 + }, + { + "epoch": 0.6355570721719843, + "grad_norm": 1.5146687430638421, + "learning_rate": 3.0971178084496645e-05, + "loss": 0.2365, + "step": 7450 + }, + { + "epoch": 0.6356423818461013, + "grad_norm": 1.8868891479123513, + "learning_rate": 3.0958403146441364e-05, + "loss": 0.2393, + "step": 7451 + }, + { + "epoch": 0.6357276915202184, + "grad_norm": 1.8075619219755994, + "learning_rate": 3.0945629662110805e-05, + "loss": 0.2627, + "step": 7452 + }, + { + "epoch": 0.6358130011943355, + "grad_norm": 1.5268747531611164, + "learning_rate": 3.0932857632480185e-05, + "loss": 0.2107, + "step": 7453 + }, + { + "epoch": 0.6358983108684525, + "grad_norm": 1.5702953432578768, + "learning_rate": 3.092008705852455e-05, + "loss": 0.2523, + "step": 7454 + }, + { + "epoch": 0.6359836205425695, + "grad_norm": 1.8204565687020038, + "learning_rate": 3.090731794121887e-05, + "loss": 0.1943, + "step": 7455 + }, + { + "epoch": 0.6360689302166865, + "grad_norm": 1.7750648728456853, + "learning_rate": 3.089455028153803e-05, + "loss": 0.3252, + "step": 7456 + }, + { + "epoch": 0.6361542398908037, + "grad_norm": 1.3674516695119165, + "learning_rate": 3.088178408045672e-05, + "loss": 0.2329, + "step": 7457 + }, + { + "epoch": 0.6362395495649207, + "grad_norm": 1.668632571771394, + "learning_rate": 3.086901933894961e-05, + "loss": 0.1809, + "step": 7458 + }, + { + "epoch": 0.6363248592390377, + "grad_norm": 2.023807212016177, + "learning_rate": 3.085625605799123e-05, + "loss": 0.2587, + "step": 7459 + }, + { + "epoch": 0.6364101689131547, + "grad_norm": 1.8055630496268964, + "learning_rate": 3.084349423855596e-05, + "loss": 0.2417, + "step": 7460 + }, + { + "epoch": 0.6364954785872718, + "grad_norm": 2.2872428040711794, + "learning_rate": 3.083073388161811e-05, + "loss": 0.2238, + "step": 7461 + }, + { + "epoch": 0.6365807882613889, + "grad_norm": 1.6232510471572683, + "learning_rate": 3.0817974988151846e-05, + "loss": 0.2498, + "step": 7462 + }, + { + "epoch": 0.6366660979355059, + "grad_norm": 1.8858640642256783, + "learning_rate": 3.080521755913128e-05, + "loss": 0.1894, + "step": 7463 + }, + { + "epoch": 0.6367514076096229, + "grad_norm": 2.187759384035759, + "learning_rate": 3.079246159553034e-05, + "loss": 0.2255, + "step": 7464 + }, + { + "epoch": 0.63683671728374, + "grad_norm": 1.5084118220244092, + "learning_rate": 3.0779707098322885e-05, + "loss": 0.1992, + "step": 7465 + }, + { + "epoch": 0.636922026957857, + "grad_norm": 1.7514218957089691, + "learning_rate": 3.076695406848268e-05, + "loss": 0.2008, + "step": 7466 + }, + { + "epoch": 0.6370073366319741, + "grad_norm": 1.9266843090547585, + "learning_rate": 3.075420250698331e-05, + "loss": 0.2172, + "step": 7467 + }, + { + "epoch": 0.6370926463060911, + "grad_norm": 1.7651981073998237, + "learning_rate": 3.0741452414798295e-05, + "loss": 0.2662, + "step": 7468 + }, + { + "epoch": 0.6371779559802082, + "grad_norm": 1.56144037556602, + "learning_rate": 3.072870379290107e-05, + "loss": 0.2018, + "step": 7469 + }, + { + "epoch": 0.6372632656543252, + "grad_norm": 1.804039512658307, + "learning_rate": 3.071595664226489e-05, + "loss": 0.2533, + "step": 7470 + }, + { + "epoch": 0.6373485753284422, + "grad_norm": 2.0281328198204287, + "learning_rate": 3.070321096386295e-05, + "loss": 0.2744, + "step": 7471 + }, + { + "epoch": 0.6374338850025593, + "grad_norm": 2.1098419802825745, + "learning_rate": 3.069046675866831e-05, + "loss": 0.1795, + "step": 7472 + }, + { + "epoch": 0.6375191946766763, + "grad_norm": 1.3628779035976577, + "learning_rate": 3.067772402765393e-05, + "loss": 0.2334, + "step": 7473 + }, + { + "epoch": 0.6376045043507934, + "grad_norm": 1.6674452949712628, + "learning_rate": 3.0664982771792626e-05, + "loss": 0.2008, + "step": 7474 + }, + { + "epoch": 0.6376898140249104, + "grad_norm": 1.8113758942189417, + "learning_rate": 3.065224299205714e-05, + "loss": 0.2128, + "step": 7475 + }, + { + "epoch": 0.6377751236990274, + "grad_norm": 1.8836279405159173, + "learning_rate": 3.063950468942011e-05, + "loss": 0.1845, + "step": 7476 + }, + { + "epoch": 0.6378604333731445, + "grad_norm": 1.802657335480592, + "learning_rate": 3.062676786485399e-05, + "loss": 0.266, + "step": 7477 + }, + { + "epoch": 0.6379457430472616, + "grad_norm": 1.3431778510316321, + "learning_rate": 3.0614032519331204e-05, + "loss": 0.1237, + "step": 7478 + }, + { + "epoch": 0.6380310527213786, + "grad_norm": 1.8744550548831784, + "learning_rate": 3.060129865382403e-05, + "loss": 0.2302, + "step": 7479 + }, + { + "epoch": 0.6381163623954956, + "grad_norm": 1.3550814644229985, + "learning_rate": 3.05885662693046e-05, + "loss": 0.2366, + "step": 7480 + }, + { + "epoch": 0.6382016720696126, + "grad_norm": 1.5294685900712464, + "learning_rate": 3.0575835366745e-05, + "loss": 0.2757, + "step": 7481 + }, + { + "epoch": 0.6382869817437298, + "grad_norm": 1.667596637035995, + "learning_rate": 3.0563105947117147e-05, + "loss": 0.2128, + "step": 7482 + }, + { + "epoch": 0.6383722914178468, + "grad_norm": 1.7153471813841346, + "learning_rate": 3.055037801139286e-05, + "loss": 0.1929, + "step": 7483 + }, + { + "epoch": 0.6384576010919638, + "grad_norm": 1.7585430746348152, + "learning_rate": 3.053765156054385e-05, + "loss": 0.289, + "step": 7484 + }, + { + "epoch": 0.6385429107660808, + "grad_norm": 1.4100313844637393, + "learning_rate": 3.0524926595541744e-05, + "loss": 0.2331, + "step": 7485 + }, + { + "epoch": 0.638628220440198, + "grad_norm": 1.6849054240956591, + "learning_rate": 3.051220311735798e-05, + "loss": 0.1936, + "step": 7486 + }, + { + "epoch": 0.638713530114315, + "grad_norm": 1.8315807187108741, + "learning_rate": 3.0499481126963947e-05, + "loss": 0.2103, + "step": 7487 + }, + { + "epoch": 0.638798839788432, + "grad_norm": 1.5187182764578393, + "learning_rate": 3.0486760625330935e-05, + "loss": 0.2467, + "step": 7488 + }, + { + "epoch": 0.638884149462549, + "grad_norm": 1.4364720691285133, + "learning_rate": 3.0474041613430028e-05, + "loss": 0.2233, + "step": 7489 + }, + { + "epoch": 0.6389694591366661, + "grad_norm": 1.8854279772452593, + "learning_rate": 3.046132409223228e-05, + "loss": 0.2806, + "step": 7490 + }, + { + "epoch": 0.6390547688107832, + "grad_norm": 2.3132048898633673, + "learning_rate": 3.0448608062708628e-05, + "loss": 0.2391, + "step": 7491 + }, + { + "epoch": 0.6391400784849002, + "grad_norm": 1.6685950646643983, + "learning_rate": 3.0435893525829846e-05, + "loss": 0.2472, + "step": 7492 + }, + { + "epoch": 0.6392253881590172, + "grad_norm": 1.9597136793424266, + "learning_rate": 3.0423180482566633e-05, + "loss": 0.1723, + "step": 7493 + }, + { + "epoch": 0.6393106978331343, + "grad_norm": 1.8212349318572856, + "learning_rate": 3.0410468933889548e-05, + "loss": 0.1826, + "step": 7494 + }, + { + "epoch": 0.6393960075072513, + "grad_norm": 1.4935499432402255, + "learning_rate": 3.0397758880769084e-05, + "loss": 0.1515, + "step": 7495 + }, + { + "epoch": 0.6394813171813684, + "grad_norm": 1.7721077391074496, + "learning_rate": 3.038505032417554e-05, + "loss": 0.1988, + "step": 7496 + }, + { + "epoch": 0.6395666268554854, + "grad_norm": 1.8369522212444083, + "learning_rate": 3.0372343265079172e-05, + "loss": 0.1952, + "step": 7497 + }, + { + "epoch": 0.6396519365296025, + "grad_norm": 1.5446480798987396, + "learning_rate": 3.0359637704450117e-05, + "loss": 0.1999, + "step": 7498 + }, + { + "epoch": 0.6397372462037195, + "grad_norm": 1.536029130719037, + "learning_rate": 3.034693364325833e-05, + "loss": 0.1932, + "step": 7499 + }, + { + "epoch": 0.6398225558778365, + "grad_norm": 1.539195098586814, + "learning_rate": 3.0334231082473724e-05, + "loss": 0.2254, + "step": 7500 + }, + { + "epoch": 0.6399078655519536, + "grad_norm": 1.7638592963685806, + "learning_rate": 3.0321530023066093e-05, + "loss": 0.242, + "step": 7501 + }, + { + "epoch": 0.6399931752260707, + "grad_norm": 2.4160049672612542, + "learning_rate": 3.030883046600505e-05, + "loss": 0.2425, + "step": 7502 + }, + { + "epoch": 0.6400784849001877, + "grad_norm": 1.5439377458207215, + "learning_rate": 3.0296132412260175e-05, + "loss": 0.1932, + "step": 7503 + }, + { + "epoch": 0.6401637945743047, + "grad_norm": 1.8001256316105547, + "learning_rate": 3.0283435862800873e-05, + "loss": 0.2181, + "step": 7504 + }, + { + "epoch": 0.6402491042484217, + "grad_norm": 1.9301027167860707, + "learning_rate": 3.0270740818596487e-05, + "loss": 0.2003, + "step": 7505 + }, + { + "epoch": 0.6403344139225389, + "grad_norm": 1.591567171525494, + "learning_rate": 3.0258047280616187e-05, + "loss": 0.2368, + "step": 7506 + }, + { + "epoch": 0.6404197235966559, + "grad_norm": 2.086718466646953, + "learning_rate": 3.0245355249829055e-05, + "loss": 0.2412, + "step": 7507 + }, + { + "epoch": 0.6405050332707729, + "grad_norm": 2.1311243104089486, + "learning_rate": 3.023266472720411e-05, + "loss": 0.2007, + "step": 7508 + }, + { + "epoch": 0.6405903429448899, + "grad_norm": 1.679946311705893, + "learning_rate": 3.0219975713710134e-05, + "loss": 0.1489, + "step": 7509 + }, + { + "epoch": 0.6406756526190069, + "grad_norm": 1.2436168771935352, + "learning_rate": 3.020728821031591e-05, + "loss": 0.2155, + "step": 7510 + }, + { + "epoch": 0.6407609622931241, + "grad_norm": 1.702545907498142, + "learning_rate": 3.019460221799007e-05, + "loss": 0.1938, + "step": 7511 + }, + { + "epoch": 0.6408462719672411, + "grad_norm": 1.6672887197583894, + "learning_rate": 3.018191773770108e-05, + "loss": 0.1521, + "step": 7512 + }, + { + "epoch": 0.6409315816413581, + "grad_norm": 1.5300445482836982, + "learning_rate": 3.0169234770417376e-05, + "loss": 0.1798, + "step": 7513 + }, + { + "epoch": 0.6410168913154751, + "grad_norm": 1.6576255987805033, + "learning_rate": 3.0156553317107218e-05, + "loss": 0.2087, + "step": 7514 + }, + { + "epoch": 0.6411022009895923, + "grad_norm": 1.3771493758362194, + "learning_rate": 3.0143873378738762e-05, + "loss": 0.1564, + "step": 7515 + }, + { + "epoch": 0.6411875106637093, + "grad_norm": 1.3452175022659756, + "learning_rate": 3.0131194956280052e-05, + "loss": 0.2112, + "step": 7516 + }, + { + "epoch": 0.6412728203378263, + "grad_norm": 1.5729548300182787, + "learning_rate": 3.011851805069904e-05, + "loss": 0.1392, + "step": 7517 + }, + { + "epoch": 0.6413581300119433, + "grad_norm": 1.881795774861702, + "learning_rate": 3.0105842662963503e-05, + "loss": 0.1762, + "step": 7518 + }, + { + "epoch": 0.6414434396860604, + "grad_norm": 1.3783793969319902, + "learning_rate": 3.009316879404116e-05, + "loss": 0.2375, + "step": 7519 + }, + { + "epoch": 0.6415287493601775, + "grad_norm": 1.485131964166407, + "learning_rate": 3.0080496444899597e-05, + "loss": 0.2047, + "step": 7520 + }, + { + "epoch": 0.6416140590342945, + "grad_norm": 1.4213846220409219, + "learning_rate": 3.0067825616506306e-05, + "loss": 0.2593, + "step": 7521 + }, + { + "epoch": 0.6416993687084115, + "grad_norm": 1.7560228519907897, + "learning_rate": 3.005515630982858e-05, + "loss": 0.2343, + "step": 7522 + }, + { + "epoch": 0.6417846783825286, + "grad_norm": 1.2590847807404246, + "learning_rate": 3.00424885258337e-05, + "loss": 0.1842, + "step": 7523 + }, + { + "epoch": 0.6418699880566456, + "grad_norm": 1.7169103313708662, + "learning_rate": 3.002982226548876e-05, + "loss": 0.2007, + "step": 7524 + }, + { + "epoch": 0.6419552977307627, + "grad_norm": 1.4544987081742935, + "learning_rate": 3.0017157529760775e-05, + "loss": 0.1838, + "step": 7525 + }, + { + "epoch": 0.6420406074048797, + "grad_norm": 1.6420274099322678, + "learning_rate": 3.0004494319616604e-05, + "loss": 0.1678, + "step": 7526 + }, + { + "epoch": 0.6421259170789968, + "grad_norm": 1.8150384162221087, + "learning_rate": 2.9991832636023065e-05, + "loss": 0.2547, + "step": 7527 + }, + { + "epoch": 0.6422112267531138, + "grad_norm": 1.6807591058432496, + "learning_rate": 2.997917247994676e-05, + "loss": 0.1544, + "step": 7528 + }, + { + "epoch": 0.6422965364272308, + "grad_norm": 1.8952640335414845, + "learning_rate": 2.9966513852354243e-05, + "loss": 0.1922, + "step": 7529 + }, + { + "epoch": 0.6423818461013479, + "grad_norm": 2.0630917711653063, + "learning_rate": 2.995385675421196e-05, + "loss": 0.2631, + "step": 7530 + }, + { + "epoch": 0.642467155775465, + "grad_norm": 1.6390545549372872, + "learning_rate": 2.994120118648617e-05, + "loss": 0.187, + "step": 7531 + }, + { + "epoch": 0.642552465449582, + "grad_norm": 1.8281698628977072, + "learning_rate": 2.9928547150143066e-05, + "loss": 0.2414, + "step": 7532 + }, + { + "epoch": 0.642637775123699, + "grad_norm": 1.7693841801509314, + "learning_rate": 2.9915894646148756e-05, + "loss": 0.1793, + "step": 7533 + }, + { + "epoch": 0.642723084797816, + "grad_norm": 1.5245047718847953, + "learning_rate": 2.990324367546914e-05, + "loss": 0.2128, + "step": 7534 + }, + { + "epoch": 0.6428083944719332, + "grad_norm": 1.4024379772948952, + "learning_rate": 2.9890594239070084e-05, + "loss": 0.1615, + "step": 7535 + }, + { + "epoch": 0.6428937041460502, + "grad_norm": 1.72873782006729, + "learning_rate": 2.9877946337917296e-05, + "loss": 0.1603, + "step": 7536 + }, + { + "epoch": 0.6429790138201672, + "grad_norm": 1.6192058554694257, + "learning_rate": 2.9865299972976386e-05, + "loss": 0.2249, + "step": 7537 + }, + { + "epoch": 0.6430643234942842, + "grad_norm": 1.4199111287420036, + "learning_rate": 2.985265514521281e-05, + "loss": 0.1739, + "step": 7538 + }, + { + "epoch": 0.6431496331684013, + "grad_norm": 1.8327804367499934, + "learning_rate": 2.9840011855591953e-05, + "loss": 0.1659, + "step": 7539 + }, + { + "epoch": 0.6432349428425184, + "grad_norm": 1.7140645753053327, + "learning_rate": 2.982737010507908e-05, + "loss": 0.1967, + "step": 7540 + }, + { + "epoch": 0.6433202525166354, + "grad_norm": 1.4870008833368855, + "learning_rate": 2.9814729894639282e-05, + "loss": 0.1801, + "step": 7541 + }, + { + "epoch": 0.6434055621907524, + "grad_norm": 2.0591963694874638, + "learning_rate": 2.980209122523759e-05, + "loss": 0.2475, + "step": 7542 + }, + { + "epoch": 0.6434908718648695, + "grad_norm": 2.092762728551112, + "learning_rate": 2.978945409783892e-05, + "loss": 0.2058, + "step": 7543 + }, + { + "epoch": 0.6435761815389865, + "grad_norm": 1.7208461810256634, + "learning_rate": 2.9776818513408016e-05, + "loss": 0.2066, + "step": 7544 + }, + { + "epoch": 0.6436614912131036, + "grad_norm": 1.3407506959405844, + "learning_rate": 2.9764184472909562e-05, + "loss": 0.2458, + "step": 7545 + }, + { + "epoch": 0.6437468008872206, + "grad_norm": 1.4651433612866402, + "learning_rate": 2.975155197730809e-05, + "loss": 0.1879, + "step": 7546 + }, + { + "epoch": 0.6438321105613376, + "grad_norm": 1.5368459783655166, + "learning_rate": 2.9738921027568024e-05, + "loss": 0.1567, + "step": 7547 + }, + { + "epoch": 0.6439174202354547, + "grad_norm": 1.5641426607029827, + "learning_rate": 2.9726291624653658e-05, + "loss": 0.2503, + "step": 7548 + }, + { + "epoch": 0.6440027299095717, + "grad_norm": 1.5580215359767602, + "learning_rate": 2.9713663769529194e-05, + "loss": 0.2151, + "step": 7549 + }, + { + "epoch": 0.6440880395836888, + "grad_norm": 1.5045538758773247, + "learning_rate": 2.970103746315872e-05, + "loss": 0.2442, + "step": 7550 + }, + { + "epoch": 0.6441733492578058, + "grad_norm": 1.7828508511696877, + "learning_rate": 2.9688412706506147e-05, + "loss": 0.2513, + "step": 7551 + }, + { + "epoch": 0.6442586589319229, + "grad_norm": 1.7571962081527872, + "learning_rate": 2.9675789500535328e-05, + "loss": 0.2147, + "step": 7552 + }, + { + "epoch": 0.6443439686060399, + "grad_norm": 2.5015205527511024, + "learning_rate": 2.9663167846209998e-05, + "loss": 0.2576, + "step": 7553 + }, + { + "epoch": 0.644429278280157, + "grad_norm": 1.5006685356615432, + "learning_rate": 2.9650547744493712e-05, + "loss": 0.1971, + "step": 7554 + }, + { + "epoch": 0.644514587954274, + "grad_norm": 1.5566861135233323, + "learning_rate": 2.9637929196349978e-05, + "loss": 0.1856, + "step": 7555 + }, + { + "epoch": 0.6445998976283911, + "grad_norm": 1.5083779969461046, + "learning_rate": 2.962531220274215e-05, + "loss": 0.1733, + "step": 7556 + }, + { + "epoch": 0.6446852073025081, + "grad_norm": 1.4123941348078055, + "learning_rate": 2.9612696764633465e-05, + "loss": 0.1264, + "step": 7557 + }, + { + "epoch": 0.6447705169766251, + "grad_norm": 1.7413611398076443, + "learning_rate": 2.9600082882987028e-05, + "loss": 0.2804, + "step": 7558 + }, + { + "epoch": 0.6448558266507421, + "grad_norm": 1.3106227346827295, + "learning_rate": 2.9587470558765886e-05, + "loss": 0.2146, + "step": 7559 + }, + { + "epoch": 0.6449411363248593, + "grad_norm": 2.104491281177299, + "learning_rate": 2.9574859792932863e-05, + "loss": 0.246, + "step": 7560 + }, + { + "epoch": 0.6450264459989763, + "grad_norm": 1.7646032432084477, + "learning_rate": 2.956225058645077e-05, + "loss": 0.2338, + "step": 7561 + }, + { + "epoch": 0.6451117556730933, + "grad_norm": 1.5126513381782114, + "learning_rate": 2.9549642940282246e-05, + "loss": 0.1753, + "step": 7562 + }, + { + "epoch": 0.6451970653472103, + "grad_norm": 1.598955815051288, + "learning_rate": 2.95370368553898e-05, + "loss": 0.2527, + "step": 7563 + }, + { + "epoch": 0.6452823750213275, + "grad_norm": 1.7256645525827932, + "learning_rate": 2.9524432332735842e-05, + "loss": 0.2428, + "step": 7564 + }, + { + "epoch": 0.6453676846954445, + "grad_norm": 2.0358463808574054, + "learning_rate": 2.9511829373282683e-05, + "loss": 0.2032, + "step": 7565 + }, + { + "epoch": 0.6454529943695615, + "grad_norm": 1.5909009413076953, + "learning_rate": 2.949922797799247e-05, + "loss": 0.2491, + "step": 7566 + }, + { + "epoch": 0.6455383040436785, + "grad_norm": 1.9233941997547321, + "learning_rate": 2.9486628147827273e-05, + "loss": 0.2407, + "step": 7567 + }, + { + "epoch": 0.6456236137177956, + "grad_norm": 1.7766476116745058, + "learning_rate": 2.947402988374899e-05, + "loss": 0.1916, + "step": 7568 + }, + { + "epoch": 0.6457089233919127, + "grad_norm": 1.862579597462002, + "learning_rate": 2.946143318671947e-05, + "loss": 0.1943, + "step": 7569 + }, + { + "epoch": 0.6457942330660297, + "grad_norm": 1.6303043357523317, + "learning_rate": 2.9448838057700368e-05, + "loss": 0.1674, + "step": 7570 + }, + { + "epoch": 0.6458795427401467, + "grad_norm": 1.6272072107078404, + "learning_rate": 2.9436244497653274e-05, + "loss": 0.2431, + "step": 7571 + }, + { + "epoch": 0.6459648524142638, + "grad_norm": 1.6750930760272673, + "learning_rate": 2.9423652507539655e-05, + "loss": 0.2305, + "step": 7572 + }, + { + "epoch": 0.6460501620883808, + "grad_norm": 1.5723786686603456, + "learning_rate": 2.94110620883208e-05, + "loss": 0.1625, + "step": 7573 + }, + { + "epoch": 0.6461354717624979, + "grad_norm": 1.8479891390460677, + "learning_rate": 2.9398473240957945e-05, + "loss": 0.2023, + "step": 7574 + }, + { + "epoch": 0.6462207814366149, + "grad_norm": 1.3971324350610246, + "learning_rate": 2.9385885966412207e-05, + "loss": 0.2796, + "step": 7575 + }, + { + "epoch": 0.646306091110732, + "grad_norm": 1.5677812966976301, + "learning_rate": 2.937330026564451e-05, + "loss": 0.1712, + "step": 7576 + }, + { + "epoch": 0.646391400784849, + "grad_norm": 1.5649328746354805, + "learning_rate": 2.936071613961574e-05, + "loss": 0.2386, + "step": 7577 + }, + { + "epoch": 0.646476710458966, + "grad_norm": 1.6056655057829392, + "learning_rate": 2.934813358928661e-05, + "loss": 0.2276, + "step": 7578 + }, + { + "epoch": 0.6465620201330831, + "grad_norm": 1.5927256115645032, + "learning_rate": 2.9335552615617745e-05, + "loss": 0.2077, + "step": 7579 + }, + { + "epoch": 0.6466473298072002, + "grad_norm": 1.53046568421879, + "learning_rate": 2.9322973219569605e-05, + "loss": 0.1813, + "step": 7580 + }, + { + "epoch": 0.6467326394813172, + "grad_norm": 1.4725310173718558, + "learning_rate": 2.9310395402102592e-05, + "loss": 0.1843, + "step": 7581 + }, + { + "epoch": 0.6468179491554342, + "grad_norm": 1.871191989812386, + "learning_rate": 2.9297819164176965e-05, + "loss": 0.1746, + "step": 7582 + }, + { + "epoch": 0.6469032588295512, + "grad_norm": 1.8777732049947784, + "learning_rate": 2.9285244506752808e-05, + "loss": 0.2491, + "step": 7583 + }, + { + "epoch": 0.6469885685036684, + "grad_norm": 2.19582352850195, + "learning_rate": 2.9272671430790155e-05, + "loss": 0.2587, + "step": 7584 + }, + { + "epoch": 0.6470738781777854, + "grad_norm": 1.739084671029986, + "learning_rate": 2.926009993724892e-05, + "loss": 0.2218, + "step": 7585 + }, + { + "epoch": 0.6471591878519024, + "grad_norm": 1.3369985906212625, + "learning_rate": 2.924753002708882e-05, + "loss": 0.2458, + "step": 7586 + }, + { + "epoch": 0.6472444975260194, + "grad_norm": 1.8691016564846223, + "learning_rate": 2.923496170126953e-05, + "loss": 0.2505, + "step": 7587 + }, + { + "epoch": 0.6473298072001364, + "grad_norm": 1.486115409076374, + "learning_rate": 2.9222394960750577e-05, + "loss": 0.1513, + "step": 7588 + }, + { + "epoch": 0.6474151168742536, + "grad_norm": 1.7997307682019261, + "learning_rate": 2.920982980649135e-05, + "loss": 0.2027, + "step": 7589 + }, + { + "epoch": 0.6475004265483706, + "grad_norm": 2.304242003751565, + "learning_rate": 2.9197266239451128e-05, + "loss": 0.2063, + "step": 7590 + }, + { + "epoch": 0.6475857362224876, + "grad_norm": 1.5365577105264612, + "learning_rate": 2.9184704260589096e-05, + "loss": 0.1977, + "step": 7591 + }, + { + "epoch": 0.6476710458966046, + "grad_norm": 1.4127172319928643, + "learning_rate": 2.9172143870864276e-05, + "loss": 0.2325, + "step": 7592 + }, + { + "epoch": 0.6477563555707218, + "grad_norm": 1.7507223273409038, + "learning_rate": 2.91595850712356e-05, + "loss": 0.2406, + "step": 7593 + }, + { + "epoch": 0.6478416652448388, + "grad_norm": 1.2391920620886292, + "learning_rate": 2.9147027862661835e-05, + "loss": 0.2094, + "step": 7594 + }, + { + "epoch": 0.6479269749189558, + "grad_norm": 2.1810270179079025, + "learning_rate": 2.913447224610172e-05, + "loss": 0.2191, + "step": 7595 + }, + { + "epoch": 0.6480122845930728, + "grad_norm": 1.6505107407057176, + "learning_rate": 2.9121918222513735e-05, + "loss": 0.179, + "step": 7596 + }, + { + "epoch": 0.6480975942671899, + "grad_norm": 2.334170077928717, + "learning_rate": 2.9109365792856358e-05, + "loss": 0.2512, + "step": 7597 + }, + { + "epoch": 0.648182903941307, + "grad_norm": 1.8676600157627083, + "learning_rate": 2.909681495808789e-05, + "loss": 0.2249, + "step": 7598 + }, + { + "epoch": 0.648268213615424, + "grad_norm": 1.954363498928386, + "learning_rate": 2.9084265719166527e-05, + "loss": 0.1928, + "step": 7599 + }, + { + "epoch": 0.648353523289541, + "grad_norm": 1.6824304230178133, + "learning_rate": 2.907171807705031e-05, + "loss": 0.1711, + "step": 7600 + }, + { + "epoch": 0.6484388329636581, + "grad_norm": 1.700042875534616, + "learning_rate": 2.905917203269724e-05, + "loss": 0.2824, + "step": 7601 + }, + { + "epoch": 0.6485241426377751, + "grad_norm": 1.4284398205625002, + "learning_rate": 2.9046627587065066e-05, + "loss": 0.1564, + "step": 7602 + }, + { + "epoch": 0.6486094523118922, + "grad_norm": 1.6694918608671283, + "learning_rate": 2.9034084741111555e-05, + "loss": 0.2092, + "step": 7603 + }, + { + "epoch": 0.6486947619860092, + "grad_norm": 1.577888244394417, + "learning_rate": 2.9021543495794263e-05, + "loss": 0.2044, + "step": 7604 + }, + { + "epoch": 0.6487800716601263, + "grad_norm": 1.7294333225530791, + "learning_rate": 2.9009003852070636e-05, + "loss": 0.2238, + "step": 7605 + }, + { + "epoch": 0.6488653813342433, + "grad_norm": 1.424599018901298, + "learning_rate": 2.8996465810898027e-05, + "loss": 0.2098, + "step": 7606 + }, + { + "epoch": 0.6489506910083603, + "grad_norm": 2.1116765781923688, + "learning_rate": 2.898392937323364e-05, + "loss": 0.2418, + "step": 7607 + }, + { + "epoch": 0.6490360006824774, + "grad_norm": 1.6803723735475236, + "learning_rate": 2.8971394540034562e-05, + "loss": 0.2115, + "step": 7608 + }, + { + "epoch": 0.6491213103565945, + "grad_norm": 1.5838462684716246, + "learning_rate": 2.8958861312257745e-05, + "loss": 0.1879, + "step": 7609 + }, + { + "epoch": 0.6492066200307115, + "grad_norm": 1.223470106214108, + "learning_rate": 2.894632969086008e-05, + "loss": 0.1971, + "step": 7610 + }, + { + "epoch": 0.6492919297048285, + "grad_norm": 1.7480165958446003, + "learning_rate": 2.8933799676798256e-05, + "loss": 0.2097, + "step": 7611 + }, + { + "epoch": 0.6493772393789455, + "grad_norm": 1.8753838464113461, + "learning_rate": 2.8921271271028894e-05, + "loss": 0.2435, + "step": 7612 + }, + { + "epoch": 0.6494625490530627, + "grad_norm": 1.5554712514890734, + "learning_rate": 2.8908744474508443e-05, + "loss": 0.1936, + "step": 7613 + }, + { + "epoch": 0.6495478587271797, + "grad_norm": 1.8410864518736498, + "learning_rate": 2.8896219288193282e-05, + "loss": 0.2028, + "step": 7614 + }, + { + "epoch": 0.6496331684012967, + "grad_norm": 1.797067648470338, + "learning_rate": 2.8883695713039625e-05, + "loss": 0.197, + "step": 7615 + }, + { + "epoch": 0.6497184780754137, + "grad_norm": 1.9077191554172865, + "learning_rate": 2.8871173750003577e-05, + "loss": 0.2033, + "step": 7616 + }, + { + "epoch": 0.6498037877495308, + "grad_norm": 1.5685506017643838, + "learning_rate": 2.8858653400041176e-05, + "loss": 0.1908, + "step": 7617 + }, + { + "epoch": 0.6498890974236479, + "grad_norm": 1.5963383020937152, + "learning_rate": 2.8846134664108194e-05, + "loss": 0.2858, + "step": 7618 + }, + { + "epoch": 0.6499744070977649, + "grad_norm": 1.4876850365705399, + "learning_rate": 2.8833617543160448e-05, + "loss": 0.215, + "step": 7619 + }, + { + "epoch": 0.6500597167718819, + "grad_norm": 2.3474253002952636, + "learning_rate": 2.8821102038153515e-05, + "loss": 0.2035, + "step": 7620 + }, + { + "epoch": 0.650145026445999, + "grad_norm": 1.5568581767725724, + "learning_rate": 2.8808588150042902e-05, + "loss": 0.2038, + "step": 7621 + }, + { + "epoch": 0.650230336120116, + "grad_norm": 1.8331983545494477, + "learning_rate": 2.8796075879783956e-05, + "loss": 0.212, + "step": 7622 + }, + { + "epoch": 0.6503156457942331, + "grad_norm": 1.9737362162112955, + "learning_rate": 2.8783565228331976e-05, + "loss": 0.3025, + "step": 7623 + }, + { + "epoch": 0.6504009554683501, + "grad_norm": 1.5400022164861409, + "learning_rate": 2.8771056196641998e-05, + "loss": 0.1625, + "step": 7624 + }, + { + "epoch": 0.6504862651424671, + "grad_norm": 1.6229618201709695, + "learning_rate": 2.8758548785669104e-05, + "loss": 0.1777, + "step": 7625 + }, + { + "epoch": 0.6505715748165842, + "grad_norm": 1.6702660635335747, + "learning_rate": 2.874604299636813e-05, + "loss": 0.2067, + "step": 7626 + }, + { + "epoch": 0.6506568844907012, + "grad_norm": 1.9915823222966142, + "learning_rate": 2.873353882969382e-05, + "loss": 0.2519, + "step": 7627 + }, + { + "epoch": 0.6507421941648183, + "grad_norm": 1.1129125151752566, + "learning_rate": 2.8721036286600818e-05, + "loss": 0.1847, + "step": 7628 + }, + { + "epoch": 0.6508275038389353, + "grad_norm": 1.6286991271288276, + "learning_rate": 2.87085353680436e-05, + "loss": 0.1513, + "step": 7629 + }, + { + "epoch": 0.6509128135130524, + "grad_norm": 1.4372641328444244, + "learning_rate": 2.8696036074976595e-05, + "loss": 0.2337, + "step": 7630 + }, + { + "epoch": 0.6509981231871694, + "grad_norm": 1.8451519824442297, + "learning_rate": 2.8683538408353992e-05, + "loss": 0.1954, + "step": 7631 + }, + { + "epoch": 0.6510834328612864, + "grad_norm": 2.3594947430874837, + "learning_rate": 2.8671042369129984e-05, + "loss": 0.2572, + "step": 7632 + }, + { + "epoch": 0.6511687425354035, + "grad_norm": 1.5455069609540495, + "learning_rate": 2.8658547958258543e-05, + "loss": 0.2353, + "step": 7633 + }, + { + "epoch": 0.6512540522095206, + "grad_norm": 1.5575893291983665, + "learning_rate": 2.8646055176693553e-05, + "loss": 0.2089, + "step": 7634 + }, + { + "epoch": 0.6513393618836376, + "grad_norm": 2.166011442244045, + "learning_rate": 2.863356402538878e-05, + "loss": 0.1964, + "step": 7635 + }, + { + "epoch": 0.6514246715577546, + "grad_norm": 1.3243892413638751, + "learning_rate": 2.8621074505297852e-05, + "loss": 0.1473, + "step": 7636 + }, + { + "epoch": 0.6515099812318716, + "grad_norm": 2.2775132392640867, + "learning_rate": 2.860858661737428e-05, + "loss": 0.2332, + "step": 7637 + }, + { + "epoch": 0.6515952909059888, + "grad_norm": 1.642878773647567, + "learning_rate": 2.8596100362571422e-05, + "loss": 0.191, + "step": 7638 + }, + { + "epoch": 0.6516806005801058, + "grad_norm": 1.6827598683973661, + "learning_rate": 2.8583615741842585e-05, + "loss": 0.2087, + "step": 7639 + }, + { + "epoch": 0.6517659102542228, + "grad_norm": 1.6077685440030898, + "learning_rate": 2.8571132756140873e-05, + "loss": 0.2249, + "step": 7640 + }, + { + "epoch": 0.6518512199283398, + "grad_norm": 1.5186218050631812, + "learning_rate": 2.8558651406419308e-05, + "loss": 0.2022, + "step": 7641 + }, + { + "epoch": 0.651936529602457, + "grad_norm": 1.3829893009933183, + "learning_rate": 2.8546171693630746e-05, + "loss": 0.1845, + "step": 7642 + }, + { + "epoch": 0.652021839276574, + "grad_norm": 2.0866138061961768, + "learning_rate": 2.853369361872801e-05, + "loss": 0.2787, + "step": 7643 + }, + { + "epoch": 0.652107148950691, + "grad_norm": 1.5156183930872427, + "learning_rate": 2.8521217182663655e-05, + "loss": 0.2358, + "step": 7644 + }, + { + "epoch": 0.652192458624808, + "grad_norm": 2.0805964867244366, + "learning_rate": 2.8508742386390252e-05, + "loss": 0.2475, + "step": 7645 + }, + { + "epoch": 0.6522777682989251, + "grad_norm": 1.8662528408103012, + "learning_rate": 2.8496269230860163e-05, + "loss": 0.1953, + "step": 7646 + }, + { + "epoch": 0.6523630779730422, + "grad_norm": 1.644450206073485, + "learning_rate": 2.8483797717025646e-05, + "loss": 0.238, + "step": 7647 + }, + { + "epoch": 0.6524483876471592, + "grad_norm": 1.5678070730077784, + "learning_rate": 2.8471327845838834e-05, + "loss": 0.2204, + "step": 7648 + }, + { + "epoch": 0.6525336973212762, + "grad_norm": 1.6837856953326948, + "learning_rate": 2.8458859618251744e-05, + "loss": 0.2653, + "step": 7649 + }, + { + "epoch": 0.6526190069953933, + "grad_norm": 1.9193255875194761, + "learning_rate": 2.8446393035216245e-05, + "loss": 0.1676, + "step": 7650 + }, + { + "epoch": 0.6527043166695103, + "grad_norm": 1.6784471767319178, + "learning_rate": 2.8433928097684087e-05, + "loss": 0.2447, + "step": 7651 + }, + { + "epoch": 0.6527896263436274, + "grad_norm": 1.8718022735172668, + "learning_rate": 2.8421464806606955e-05, + "loss": 0.178, + "step": 7652 + }, + { + "epoch": 0.6528749360177444, + "grad_norm": 1.5811003783835669, + "learning_rate": 2.8409003162936278e-05, + "loss": 0.2268, + "step": 7653 + }, + { + "epoch": 0.6529602456918615, + "grad_norm": 1.7462339750304188, + "learning_rate": 2.839654316762349e-05, + "loss": 0.2072, + "step": 7654 + }, + { + "epoch": 0.6530455553659785, + "grad_norm": 1.8060732058688196, + "learning_rate": 2.838408482161984e-05, + "loss": 0.2282, + "step": 7655 + }, + { + "epoch": 0.6531308650400955, + "grad_norm": 1.8758115919297356, + "learning_rate": 2.8371628125876443e-05, + "loss": 0.2665, + "step": 7656 + }, + { + "epoch": 0.6532161747142126, + "grad_norm": 1.7817432322999174, + "learning_rate": 2.8359173081344305e-05, + "loss": 0.2775, + "step": 7657 + }, + { + "epoch": 0.6533014843883297, + "grad_norm": 1.7812235646210317, + "learning_rate": 2.8346719688974278e-05, + "loss": 0.247, + "step": 7658 + }, + { + "epoch": 0.6533867940624467, + "grad_norm": 1.6238835240673972, + "learning_rate": 2.8334267949717187e-05, + "loss": 0.2461, + "step": 7659 + }, + { + "epoch": 0.6534721037365637, + "grad_norm": 1.5356891668046029, + "learning_rate": 2.8321817864523558e-05, + "loss": 0.2306, + "step": 7660 + }, + { + "epoch": 0.6535574134106807, + "grad_norm": 1.8725435059895752, + "learning_rate": 2.8309369434343963e-05, + "loss": 0.2566, + "step": 7661 + }, + { + "epoch": 0.6536427230847978, + "grad_norm": 1.7629047470152635, + "learning_rate": 2.8296922660128744e-05, + "loss": 0.2265, + "step": 7662 + }, + { + "epoch": 0.6537280327589149, + "grad_norm": 1.8353644592734866, + "learning_rate": 2.8284477542828153e-05, + "loss": 0.2319, + "step": 7663 + }, + { + "epoch": 0.6538133424330319, + "grad_norm": 1.5023435722585048, + "learning_rate": 2.8272034083392284e-05, + "loss": 0.1996, + "step": 7664 + }, + { + "epoch": 0.6538986521071489, + "grad_norm": 1.495360192123293, + "learning_rate": 2.8259592282771186e-05, + "loss": 0.1579, + "step": 7665 + }, + { + "epoch": 0.6539839617812659, + "grad_norm": 1.617936407822347, + "learning_rate": 2.8247152141914656e-05, + "loss": 0.1926, + "step": 7666 + }, + { + "epoch": 0.6540692714553831, + "grad_norm": 1.7945533681327674, + "learning_rate": 2.8234713661772484e-05, + "loss": 0.2346, + "step": 7667 + }, + { + "epoch": 0.6541545811295001, + "grad_norm": 1.5442124388420182, + "learning_rate": 2.822227684329426e-05, + "loss": 0.1917, + "step": 7668 + }, + { + "epoch": 0.6542398908036171, + "grad_norm": 1.9872271848370215, + "learning_rate": 2.820984168742947e-05, + "loss": 0.1878, + "step": 7669 + }, + { + "epoch": 0.6543252004777341, + "grad_norm": 1.4857531925624776, + "learning_rate": 2.8197408195127484e-05, + "loss": 0.1901, + "step": 7670 + }, + { + "epoch": 0.6544105101518513, + "grad_norm": 2.1774755486605697, + "learning_rate": 2.81849763673375e-05, + "loss": 0.3262, + "step": 7671 + }, + { + "epoch": 0.6544958198259683, + "grad_norm": 2.098366956171266, + "learning_rate": 2.8172546205008683e-05, + "loss": 0.2229, + "step": 7672 + }, + { + "epoch": 0.6545811295000853, + "grad_norm": 1.7555027685531996, + "learning_rate": 2.8160117709089927e-05, + "loss": 0.1882, + "step": 7673 + }, + { + "epoch": 0.6546664391742023, + "grad_norm": 1.4020919786312356, + "learning_rate": 2.8147690880530154e-05, + "loss": 0.1675, + "step": 7674 + }, + { + "epoch": 0.6547517488483194, + "grad_norm": 1.6988437282593902, + "learning_rate": 2.813526572027806e-05, + "loss": 0.2205, + "step": 7675 + }, + { + "epoch": 0.6548370585224365, + "grad_norm": 1.535812121345027, + "learning_rate": 2.8122842229282237e-05, + "loss": 0.2065, + "step": 7676 + }, + { + "epoch": 0.6549223681965535, + "grad_norm": 2.210401058831032, + "learning_rate": 2.8110420408491155e-05, + "loss": 0.2192, + "step": 7677 + }, + { + "epoch": 0.6550076778706705, + "grad_norm": 1.85429219738933, + "learning_rate": 2.809800025885315e-05, + "loss": 0.1873, + "step": 7678 + }, + { + "epoch": 0.6550929875447876, + "grad_norm": 1.8435873186141687, + "learning_rate": 2.8085581781316444e-05, + "loss": 0.2175, + "step": 7679 + }, + { + "epoch": 0.6551782972189046, + "grad_norm": 1.1869072187489105, + "learning_rate": 2.807316497682909e-05, + "loss": 0.1893, + "step": 7680 + }, + { + "epoch": 0.6552636068930217, + "grad_norm": 2.191140225016933, + "learning_rate": 2.8060749846339117e-05, + "loss": 0.206, + "step": 7681 + }, + { + "epoch": 0.6553489165671387, + "grad_norm": 1.6546755810993656, + "learning_rate": 2.8048336390794272e-05, + "loss": 0.2288, + "step": 7682 + }, + { + "epoch": 0.6554342262412558, + "grad_norm": 2.439555450810025, + "learning_rate": 2.8035924611142304e-05, + "loss": 0.2208, + "step": 7683 + }, + { + "epoch": 0.6555195359153728, + "grad_norm": 1.4958352445769076, + "learning_rate": 2.8023514508330755e-05, + "loss": 0.1717, + "step": 7684 + }, + { + "epoch": 0.6556048455894898, + "grad_norm": 1.8011081092747225, + "learning_rate": 2.8011106083307137e-05, + "loss": 0.1896, + "step": 7685 + }, + { + "epoch": 0.6556901552636069, + "grad_norm": 1.7572804641894182, + "learning_rate": 2.7998699337018676e-05, + "loss": 0.2335, + "step": 7686 + }, + { + "epoch": 0.655775464937724, + "grad_norm": 1.570052851376578, + "learning_rate": 2.7986294270412623e-05, + "loss": 0.2456, + "step": 7687 + }, + { + "epoch": 0.655860774611841, + "grad_norm": 1.4260593926846923, + "learning_rate": 2.7973890884436027e-05, + "loss": 0.1707, + "step": 7688 + }, + { + "epoch": 0.655946084285958, + "grad_norm": 1.2141080634209198, + "learning_rate": 2.796148918003581e-05, + "loss": 0.1849, + "step": 7689 + }, + { + "epoch": 0.656031393960075, + "grad_norm": 1.8882623206733764, + "learning_rate": 2.7949089158158788e-05, + "loss": 0.1789, + "step": 7690 + }, + { + "epoch": 0.6561167036341922, + "grad_norm": 1.7067940763261202, + "learning_rate": 2.7936690819751628e-05, + "loss": 0.2086, + "step": 7691 + }, + { + "epoch": 0.6562020133083092, + "grad_norm": 1.818951798088204, + "learning_rate": 2.7924294165760878e-05, + "loss": 0.2685, + "step": 7692 + }, + { + "epoch": 0.6562873229824262, + "grad_norm": 1.3469261513752735, + "learning_rate": 2.791189919713294e-05, + "loss": 0.2245, + "step": 7693 + }, + { + "epoch": 0.6563726326565432, + "grad_norm": 1.756955221726941, + "learning_rate": 2.789950591481416e-05, + "loss": 0.2487, + "step": 7694 + }, + { + "epoch": 0.6564579423306603, + "grad_norm": 1.680786766702085, + "learning_rate": 2.788711431975062e-05, + "loss": 0.1812, + "step": 7695 + }, + { + "epoch": 0.6565432520047774, + "grad_norm": 1.9716259371349516, + "learning_rate": 2.787472441288842e-05, + "loss": 0.2232, + "step": 7696 + }, + { + "epoch": 0.6566285616788944, + "grad_norm": 1.6622505172887034, + "learning_rate": 2.7862336195173434e-05, + "loss": 0.2221, + "step": 7697 + }, + { + "epoch": 0.6567138713530114, + "grad_norm": 1.5655838098646937, + "learning_rate": 2.784994966755144e-05, + "loss": 0.1914, + "step": 7698 + }, + { + "epoch": 0.6567991810271285, + "grad_norm": 1.5564197493665588, + "learning_rate": 2.7837564830968084e-05, + "loss": 0.2839, + "step": 7699 + }, + { + "epoch": 0.6568844907012455, + "grad_norm": 1.5926686526691056, + "learning_rate": 2.7825181686368863e-05, + "loss": 0.154, + "step": 7700 + }, + { + "epoch": 0.6569698003753626, + "grad_norm": 1.6998285010728853, + "learning_rate": 2.7812800234699222e-05, + "loss": 0.2425, + "step": 7701 + }, + { + "epoch": 0.6570551100494796, + "grad_norm": 1.6888941904174404, + "learning_rate": 2.7800420476904337e-05, + "loss": 0.3112, + "step": 7702 + }, + { + "epoch": 0.6571404197235966, + "grad_norm": 1.686555244751091, + "learning_rate": 2.7788042413929406e-05, + "loss": 0.2113, + "step": 7703 + }, + { + "epoch": 0.6572257293977137, + "grad_norm": 1.9056264138771686, + "learning_rate": 2.77756660467194e-05, + "loss": 0.1194, + "step": 7704 + }, + { + "epoch": 0.6573110390718307, + "grad_norm": 1.2269012080700339, + "learning_rate": 2.776329137621919e-05, + "loss": 0.1363, + "step": 7705 + }, + { + "epoch": 0.6573963487459478, + "grad_norm": 1.9860480156978453, + "learning_rate": 2.7750918403373506e-05, + "loss": 0.2114, + "step": 7706 + }, + { + "epoch": 0.6574816584200648, + "grad_norm": 1.8162270078102947, + "learning_rate": 2.7738547129127002e-05, + "loss": 0.1547, + "step": 7707 + }, + { + "epoch": 0.6575669680941819, + "grad_norm": 1.7752739224827847, + "learning_rate": 2.7726177554424087e-05, + "loss": 0.2742, + "step": 7708 + }, + { + "epoch": 0.6576522777682989, + "grad_norm": 1.3692426221787022, + "learning_rate": 2.7713809680209175e-05, + "loss": 0.1711, + "step": 7709 + }, + { + "epoch": 0.657737587442416, + "grad_norm": 1.7492047273960605, + "learning_rate": 2.7701443507426468e-05, + "loss": 0.1648, + "step": 7710 + }, + { + "epoch": 0.657822897116533, + "grad_norm": 1.996233791507662, + "learning_rate": 2.768907903702005e-05, + "loss": 0.2595, + "step": 7711 + }, + { + "epoch": 0.6579082067906501, + "grad_norm": 1.6317628836131193, + "learning_rate": 2.767671626993389e-05, + "loss": 0.2146, + "step": 7712 + }, + { + "epoch": 0.6579935164647671, + "grad_norm": 2.206250510067111, + "learning_rate": 2.7664355207111813e-05, + "loss": 0.2669, + "step": 7713 + }, + { + "epoch": 0.6580788261388841, + "grad_norm": 2.152835741254033, + "learning_rate": 2.765199584949753e-05, + "loss": 0.1813, + "step": 7714 + }, + { + "epoch": 0.6581641358130011, + "grad_norm": 1.7526079227885278, + "learning_rate": 2.763963819803459e-05, + "loss": 0.2183, + "step": 7715 + }, + { + "epoch": 0.6582494454871183, + "grad_norm": 2.022602159256463, + "learning_rate": 2.7627282253666465e-05, + "loss": 0.1899, + "step": 7716 + }, + { + "epoch": 0.6583347551612353, + "grad_norm": 1.4713272857461068, + "learning_rate": 2.761492801733645e-05, + "loss": 0.2276, + "step": 7717 + }, + { + "epoch": 0.6584200648353523, + "grad_norm": 1.9996480632123705, + "learning_rate": 2.7602575489987727e-05, + "loss": 0.1849, + "step": 7718 + }, + { + "epoch": 0.6585053745094693, + "grad_norm": 1.9821616016026957, + "learning_rate": 2.759022467256335e-05, + "loss": 0.1853, + "step": 7719 + }, + { + "epoch": 0.6585906841835865, + "grad_norm": 1.8480897681161554, + "learning_rate": 2.7577875566006227e-05, + "loss": 0.2397, + "step": 7720 + }, + { + "epoch": 0.6586759938577035, + "grad_norm": 1.466419156098515, + "learning_rate": 2.7565528171259158e-05, + "loss": 0.2421, + "step": 7721 + }, + { + "epoch": 0.6587613035318205, + "grad_norm": 1.371909830611417, + "learning_rate": 2.7553182489264777e-05, + "loss": 0.1794, + "step": 7722 + }, + { + "epoch": 0.6588466132059375, + "grad_norm": 1.5744047220717536, + "learning_rate": 2.7540838520965672e-05, + "loss": 0.219, + "step": 7723 + }, + { + "epoch": 0.6589319228800546, + "grad_norm": 1.648103436071104, + "learning_rate": 2.7528496267304155e-05, + "loss": 0.2051, + "step": 7724 + }, + { + "epoch": 0.6590172325541717, + "grad_norm": 1.7361498849565575, + "learning_rate": 2.7516155729222553e-05, + "loss": 0.2039, + "step": 7725 + }, + { + "epoch": 0.6591025422282887, + "grad_norm": 2.026156803837697, + "learning_rate": 2.7503816907662982e-05, + "loss": 0.2838, + "step": 7726 + }, + { + "epoch": 0.6591878519024057, + "grad_norm": 1.9698889934421115, + "learning_rate": 2.7491479803567453e-05, + "loss": 0.2156, + "step": 7727 + }, + { + "epoch": 0.6592731615765228, + "grad_norm": 1.8779265135422345, + "learning_rate": 2.74791444178778e-05, + "loss": 0.2648, + "step": 7728 + }, + { + "epoch": 0.6593584712506398, + "grad_norm": 1.8021327840675705, + "learning_rate": 2.746681075153582e-05, + "loss": 0.2456, + "step": 7729 + }, + { + "epoch": 0.6594437809247569, + "grad_norm": 1.585249878593841, + "learning_rate": 2.7454478805483104e-05, + "loss": 0.2274, + "step": 7730 + }, + { + "epoch": 0.6595290905988739, + "grad_norm": 1.641171173736209, + "learning_rate": 2.744214858066112e-05, + "loss": 0.2528, + "step": 7731 + }, + { + "epoch": 0.659614400272991, + "grad_norm": 1.9828643455370627, + "learning_rate": 2.7429820078011214e-05, + "loss": 0.1651, + "step": 7732 + }, + { + "epoch": 0.659699709947108, + "grad_norm": 1.8156236500825311, + "learning_rate": 2.7417493298474618e-05, + "loss": 0.2119, + "step": 7733 + }, + { + "epoch": 0.659785019621225, + "grad_norm": 1.4940413697628951, + "learning_rate": 2.7405168242992396e-05, + "loss": 0.1807, + "step": 7734 + }, + { + "epoch": 0.6598703292953421, + "grad_norm": 1.3152850438988888, + "learning_rate": 2.7392844912505494e-05, + "loss": 0.168, + "step": 7735 + }, + { + "epoch": 0.6599556389694592, + "grad_norm": 1.9336686553317948, + "learning_rate": 2.7380523307954785e-05, + "loss": 0.2246, + "step": 7736 + }, + { + "epoch": 0.6600409486435762, + "grad_norm": 1.6171061177439536, + "learning_rate": 2.7368203430280887e-05, + "loss": 0.2099, + "step": 7737 + }, + { + "epoch": 0.6601262583176932, + "grad_norm": 1.4903378831905865, + "learning_rate": 2.735588528042441e-05, + "loss": 0.2006, + "step": 7738 + }, + { + "epoch": 0.6602115679918102, + "grad_norm": 1.451553643633841, + "learning_rate": 2.7343568859325763e-05, + "loss": 0.2175, + "step": 7739 + }, + { + "epoch": 0.6602968776659273, + "grad_norm": 1.832577036426203, + "learning_rate": 2.7331254167925235e-05, + "loss": 0.2077, + "step": 7740 + }, + { + "epoch": 0.6603821873400444, + "grad_norm": 1.8377777616282263, + "learning_rate": 2.7318941207162984e-05, + "loss": 0.2626, + "step": 7741 + }, + { + "epoch": 0.6604674970141614, + "grad_norm": 1.6265423131270893, + "learning_rate": 2.7306629977979047e-05, + "loss": 0.228, + "step": 7742 + }, + { + "epoch": 0.6605528066882784, + "grad_norm": 1.862424001487287, + "learning_rate": 2.7294320481313328e-05, + "loss": 0.189, + "step": 7743 + }, + { + "epoch": 0.6606381163623954, + "grad_norm": 1.5469129153142132, + "learning_rate": 2.7282012718105554e-05, + "loss": 0.1735, + "step": 7744 + }, + { + "epoch": 0.6607234260365126, + "grad_norm": 1.8272988824771113, + "learning_rate": 2.726970668929541e-05, + "loss": 0.1795, + "step": 7745 + }, + { + "epoch": 0.6608087357106296, + "grad_norm": 2.063639355009185, + "learning_rate": 2.7257402395822372e-05, + "loss": 0.2128, + "step": 7746 + }, + { + "epoch": 0.6608940453847466, + "grad_norm": 2.104357594303922, + "learning_rate": 2.7245099838625805e-05, + "loss": 0.282, + "step": 7747 + }, + { + "epoch": 0.6609793550588636, + "grad_norm": 1.4727150983713955, + "learning_rate": 2.723279901864493e-05, + "loss": 0.2183, + "step": 7748 + }, + { + "epoch": 0.6610646647329808, + "grad_norm": 1.3413468096121213, + "learning_rate": 2.7220499936818896e-05, + "loss": 0.1784, + "step": 7749 + }, + { + "epoch": 0.6611499744070978, + "grad_norm": 1.5493714196278028, + "learning_rate": 2.7208202594086605e-05, + "loss": 0.1641, + "step": 7750 + }, + { + "epoch": 0.6612352840812148, + "grad_norm": 1.9132399679569192, + "learning_rate": 2.7195906991386953e-05, + "loss": 0.1611, + "step": 7751 + }, + { + "epoch": 0.6613205937553318, + "grad_norm": 1.7674922015977157, + "learning_rate": 2.718361312965862e-05, + "loss": 0.2374, + "step": 7752 + }, + { + "epoch": 0.6614059034294489, + "grad_norm": 2.5542990479589127, + "learning_rate": 2.7171321009840178e-05, + "loss": 0.273, + "step": 7753 + }, + { + "epoch": 0.661491213103566, + "grad_norm": 1.801595574081161, + "learning_rate": 2.7159030632870063e-05, + "loss": 0.2499, + "step": 7754 + }, + { + "epoch": 0.661576522777683, + "grad_norm": 1.6379621421435415, + "learning_rate": 2.7146741999686588e-05, + "loss": 0.2202, + "step": 7755 + }, + { + "epoch": 0.6616618324518, + "grad_norm": 1.3295934300929415, + "learning_rate": 2.7134455111227917e-05, + "loss": 0.1619, + "step": 7756 + }, + { + "epoch": 0.6617471421259171, + "grad_norm": 1.7956864780055204, + "learning_rate": 2.7122169968432075e-05, + "loss": 0.1987, + "step": 7757 + }, + { + "epoch": 0.6618324518000341, + "grad_norm": 1.6438338370754462, + "learning_rate": 2.710988657223702e-05, + "loss": 0.1835, + "step": 7758 + }, + { + "epoch": 0.6619177614741512, + "grad_norm": 1.4315816703200648, + "learning_rate": 2.7097604923580443e-05, + "loss": 0.1799, + "step": 7759 + }, + { + "epoch": 0.6620030711482682, + "grad_norm": 1.66975068224296, + "learning_rate": 2.7085325023400056e-05, + "loss": 0.2805, + "step": 7760 + }, + { + "epoch": 0.6620883808223853, + "grad_norm": 1.9071114266562563, + "learning_rate": 2.7073046872633324e-05, + "loss": 0.1815, + "step": 7761 + }, + { + "epoch": 0.6621736904965023, + "grad_norm": 1.4154193754098192, + "learning_rate": 2.7060770472217634e-05, + "loss": 0.2122, + "step": 7762 + }, + { + "epoch": 0.6622590001706193, + "grad_norm": 1.6822860135041513, + "learning_rate": 2.7048495823090226e-05, + "loss": 0.2249, + "step": 7763 + }, + { + "epoch": 0.6623443098447364, + "grad_norm": 1.9270813057003866, + "learning_rate": 2.703622292618817e-05, + "loss": 0.2226, + "step": 7764 + }, + { + "epoch": 0.6624296195188535, + "grad_norm": 1.334882799497456, + "learning_rate": 2.7023951782448505e-05, + "loss": 0.2049, + "step": 7765 + }, + { + "epoch": 0.6625149291929705, + "grad_norm": 1.8440890242959553, + "learning_rate": 2.701168239280799e-05, + "loss": 0.2383, + "step": 7766 + }, + { + "epoch": 0.6626002388670875, + "grad_norm": 1.921828788884059, + "learning_rate": 2.6999414758203378e-05, + "loss": 0.2528, + "step": 7767 + }, + { + "epoch": 0.6626855485412045, + "grad_norm": 1.9771286225298685, + "learning_rate": 2.6987148879571233e-05, + "loss": 0.2213, + "step": 7768 + }, + { + "epoch": 0.6627708582153217, + "grad_norm": 1.9368713343420358, + "learning_rate": 2.6974884757847975e-05, + "loss": 0.1866, + "step": 7769 + }, + { + "epoch": 0.6628561678894387, + "grad_norm": 1.234625296023871, + "learning_rate": 2.6962622393969893e-05, + "loss": 0.1405, + "step": 7770 + }, + { + "epoch": 0.6629414775635557, + "grad_norm": 1.3033506660008112, + "learning_rate": 2.6950361788873207e-05, + "loss": 0.2008, + "step": 7771 + }, + { + "epoch": 0.6630267872376727, + "grad_norm": 1.756957392923373, + "learning_rate": 2.693810294349388e-05, + "loss": 0.1485, + "step": 7772 + }, + { + "epoch": 0.6631120969117899, + "grad_norm": 1.704147172115283, + "learning_rate": 2.6925845858767856e-05, + "loss": 0.2587, + "step": 7773 + }, + { + "epoch": 0.6631974065859069, + "grad_norm": 1.1593446300878254, + "learning_rate": 2.6913590535630885e-05, + "loss": 0.1789, + "step": 7774 + }, + { + "epoch": 0.6632827162600239, + "grad_norm": 1.6631662166839631, + "learning_rate": 2.6901336975018597e-05, + "loss": 0.2, + "step": 7775 + }, + { + "epoch": 0.6633680259341409, + "grad_norm": 1.576896869448805, + "learning_rate": 2.6889085177866492e-05, + "loss": 0.1952, + "step": 7776 + }, + { + "epoch": 0.6634533356082579, + "grad_norm": 1.8390423432495764, + "learning_rate": 2.6876835145109892e-05, + "loss": 0.2372, + "step": 7777 + }, + { + "epoch": 0.663538645282375, + "grad_norm": 1.571044751147281, + "learning_rate": 2.6864586877684093e-05, + "loss": 0.1931, + "step": 7778 + }, + { + "epoch": 0.6636239549564921, + "grad_norm": 1.5904036901935847, + "learning_rate": 2.685234037652411e-05, + "loss": 0.1572, + "step": 7779 + }, + { + "epoch": 0.6637092646306091, + "grad_norm": 1.6159523195182148, + "learning_rate": 2.684009564256495e-05, + "loss": 0.2089, + "step": 7780 + }, + { + "epoch": 0.6637945743047261, + "grad_norm": 1.5787942807956323, + "learning_rate": 2.6827852676741415e-05, + "loss": 0.2846, + "step": 7781 + }, + { + "epoch": 0.6638798839788432, + "grad_norm": 2.255746919545511, + "learning_rate": 2.681561147998819e-05, + "loss": 0.2199, + "step": 7782 + }, + { + "epoch": 0.6639651936529603, + "grad_norm": 1.7266030975678115, + "learning_rate": 2.6803372053239834e-05, + "loss": 0.1797, + "step": 7783 + }, + { + "epoch": 0.6640505033270773, + "grad_norm": 2.190091478187797, + "learning_rate": 2.679113439743075e-05, + "loss": 0.1901, + "step": 7784 + }, + { + "epoch": 0.6641358130011943, + "grad_norm": 1.9191329126651466, + "learning_rate": 2.677889851349522e-05, + "loss": 0.214, + "step": 7785 + }, + { + "epoch": 0.6642211226753114, + "grad_norm": 1.771907196559745, + "learning_rate": 2.676666440236738e-05, + "loss": 0.1757, + "step": 7786 + }, + { + "epoch": 0.6643064323494284, + "grad_norm": 1.9592598961904755, + "learning_rate": 2.6754432064981285e-05, + "loss": 0.2287, + "step": 7787 + }, + { + "epoch": 0.6643917420235455, + "grad_norm": 2.104866465412768, + "learning_rate": 2.6742201502270736e-05, + "loss": 0.1854, + "step": 7788 + }, + { + "epoch": 0.6644770516976625, + "grad_norm": 1.9150264259088754, + "learning_rate": 2.6729972715169528e-05, + "loss": 0.222, + "step": 7789 + }, + { + "epoch": 0.6645623613717796, + "grad_norm": 1.8390112287169222, + "learning_rate": 2.671774570461123e-05, + "loss": 0.2277, + "step": 7790 + }, + { + "epoch": 0.6646476710458966, + "grad_norm": 1.527721309581612, + "learning_rate": 2.6705520471529366e-05, + "loss": 0.175, + "step": 7791 + }, + { + "epoch": 0.6647329807200136, + "grad_norm": 1.918222569842617, + "learning_rate": 2.6693297016857188e-05, + "loss": 0.1691, + "step": 7792 + }, + { + "epoch": 0.6648182903941307, + "grad_norm": 2.3262106173962014, + "learning_rate": 2.668107534152795e-05, + "loss": 0.2487, + "step": 7793 + }, + { + "epoch": 0.6649036000682478, + "grad_norm": 1.672598067460568, + "learning_rate": 2.6668855446474693e-05, + "loss": 0.1854, + "step": 7794 + }, + { + "epoch": 0.6649889097423648, + "grad_norm": 1.2826743233869222, + "learning_rate": 2.665663733263034e-05, + "loss": 0.2332, + "step": 7795 + }, + { + "epoch": 0.6650742194164818, + "grad_norm": 2.1549357956529076, + "learning_rate": 2.6644421000927677e-05, + "loss": 0.2067, + "step": 7796 + }, + { + "epoch": 0.6651595290905988, + "grad_norm": 1.4948513360248188, + "learning_rate": 2.6632206452299363e-05, + "loss": 0.231, + "step": 7797 + }, + { + "epoch": 0.665244838764716, + "grad_norm": 1.4660303624848352, + "learning_rate": 2.661999368767791e-05, + "loss": 0.1672, + "step": 7798 + }, + { + "epoch": 0.665330148438833, + "grad_norm": 1.3835458481184586, + "learning_rate": 2.6607782707995678e-05, + "loss": 0.1749, + "step": 7799 + }, + { + "epoch": 0.66541545811295, + "grad_norm": 1.4837033006860074, + "learning_rate": 2.6595573514184967e-05, + "loss": 0.2056, + "step": 7800 + }, + { + "epoch": 0.665500767787067, + "grad_norm": 1.6918490140883014, + "learning_rate": 2.658336610717781e-05, + "loss": 0.2121, + "step": 7801 + }, + { + "epoch": 0.6655860774611841, + "grad_norm": 1.2432373217072987, + "learning_rate": 2.657116048790624e-05, + "loss": 0.21, + "step": 7802 + }, + { + "epoch": 0.6656713871353012, + "grad_norm": 1.20849609375, + "learning_rate": 2.655895665730206e-05, + "loss": 0.1577, + "step": 7803 + }, + { + "epoch": 0.6657566968094182, + "grad_norm": 1.7007065146361324, + "learning_rate": 2.6546754616296977e-05, + "loss": 0.1656, + "step": 7804 + }, + { + "epoch": 0.6658420064835352, + "grad_norm": 1.5099428771121521, + "learning_rate": 2.6534554365822538e-05, + "loss": 0.2051, + "step": 7805 + }, + { + "epoch": 0.6659273161576523, + "grad_norm": 1.8198534341631272, + "learning_rate": 2.652235590681017e-05, + "loss": 0.2152, + "step": 7806 + }, + { + "epoch": 0.6660126258317693, + "grad_norm": 1.584713317745035, + "learning_rate": 2.6510159240191202e-05, + "loss": 0.2197, + "step": 7807 + }, + { + "epoch": 0.6660979355058864, + "grad_norm": 1.8517140515591102, + "learning_rate": 2.6497964366896716e-05, + "loss": 0.2233, + "step": 7808 + }, + { + "epoch": 0.6661832451800034, + "grad_norm": 1.5532360873800377, + "learning_rate": 2.6485771287857774e-05, + "loss": 0.1646, + "step": 7809 + }, + { + "epoch": 0.6662685548541205, + "grad_norm": 1.4279075340125134, + "learning_rate": 2.6473580004005248e-05, + "loss": 0.1779, + "step": 7810 + }, + { + "epoch": 0.6663538645282375, + "grad_norm": 1.6736017027291923, + "learning_rate": 2.6461390516269868e-05, + "loss": 0.1853, + "step": 7811 + }, + { + "epoch": 0.6664391742023545, + "grad_norm": 2.1259678151376087, + "learning_rate": 2.6449202825582214e-05, + "loss": 0.255, + "step": 7812 + }, + { + "epoch": 0.6665244838764716, + "grad_norm": 1.9569891584366028, + "learning_rate": 2.6437016932872816e-05, + "loss": 0.2215, + "step": 7813 + }, + { + "epoch": 0.6666097935505887, + "grad_norm": 1.681744969904423, + "learning_rate": 2.642483283907192e-05, + "loss": 0.2012, + "step": 7814 + }, + { + "epoch": 0.6666951032247057, + "grad_norm": 1.7398470908973522, + "learning_rate": 2.6412650545109787e-05, + "loss": 0.188, + "step": 7815 + }, + { + "epoch": 0.6667804128988227, + "grad_norm": 1.505811876074807, + "learning_rate": 2.6400470051916432e-05, + "loss": 0.1779, + "step": 7816 + }, + { + "epoch": 0.6668657225729397, + "grad_norm": 1.5570311397508427, + "learning_rate": 2.6388291360421784e-05, + "loss": 0.1788, + "step": 7817 + }, + { + "epoch": 0.6669510322470568, + "grad_norm": 2.1612233842365867, + "learning_rate": 2.6376114471555623e-05, + "loss": 0.1925, + "step": 7818 + }, + { + "epoch": 0.6670363419211739, + "grad_norm": 1.9968042471563887, + "learning_rate": 2.6363939386247576e-05, + "loss": 0.1951, + "step": 7819 + }, + { + "epoch": 0.6671216515952909, + "grad_norm": 1.7889484219224512, + "learning_rate": 2.6351766105427163e-05, + "loss": 0.201, + "step": 7820 + }, + { + "epoch": 0.6672069612694079, + "grad_norm": 1.7626221364225756, + "learning_rate": 2.6339594630023717e-05, + "loss": 0.2004, + "step": 7821 + }, + { + "epoch": 0.667292270943525, + "grad_norm": 1.6432771590086621, + "learning_rate": 2.6327424960966506e-05, + "loss": 0.1961, + "step": 7822 + }, + { + "epoch": 0.6673775806176421, + "grad_norm": 1.5191296044183136, + "learning_rate": 2.6315257099184608e-05, + "loss": 0.196, + "step": 7823 + }, + { + "epoch": 0.6674628902917591, + "grad_norm": 1.5933786595386985, + "learning_rate": 2.6303091045606976e-05, + "loss": 0.2162, + "step": 7824 + }, + { + "epoch": 0.6675481999658761, + "grad_norm": 1.5631390599865749, + "learning_rate": 2.6290926801162407e-05, + "loss": 0.1938, + "step": 7825 + }, + { + "epoch": 0.6676335096399931, + "grad_norm": 1.6771887357141113, + "learning_rate": 2.6278764366779596e-05, + "loss": 0.1779, + "step": 7826 + }, + { + "epoch": 0.6677188193141103, + "grad_norm": 1.3557261901346018, + "learning_rate": 2.6266603743387063e-05, + "loss": 0.1842, + "step": 7827 + }, + { + "epoch": 0.6678041289882273, + "grad_norm": 1.9467581773612888, + "learning_rate": 2.62544449319132e-05, + "loss": 0.2851, + "step": 7828 + }, + { + "epoch": 0.6678894386623443, + "grad_norm": 1.878773706404611, + "learning_rate": 2.6242287933286318e-05, + "loss": 0.2073, + "step": 7829 + }, + { + "epoch": 0.6679747483364613, + "grad_norm": 1.7558137507976848, + "learning_rate": 2.6230132748434466e-05, + "loss": 0.1458, + "step": 7830 + }, + { + "epoch": 0.6680600580105784, + "grad_norm": 1.6680874570107078, + "learning_rate": 2.621797937828569e-05, + "loss": 0.2148, + "step": 7831 + }, + { + "epoch": 0.6681453676846955, + "grad_norm": 1.94359994290688, + "learning_rate": 2.6205827823767808e-05, + "loss": 0.1499, + "step": 7832 + }, + { + "epoch": 0.6682306773588125, + "grad_norm": 1.5254000029721329, + "learning_rate": 2.6193678085808526e-05, + "loss": 0.212, + "step": 7833 + }, + { + "epoch": 0.6683159870329295, + "grad_norm": 1.4242394877402889, + "learning_rate": 2.61815301653354e-05, + "loss": 0.2174, + "step": 7834 + }, + { + "epoch": 0.6684012967070466, + "grad_norm": 1.6824604654414432, + "learning_rate": 2.6169384063275892e-05, + "loss": 0.203, + "step": 7835 + }, + { + "epoch": 0.6684866063811636, + "grad_norm": 1.8473958321861639, + "learning_rate": 2.615723978055728e-05, + "loss": 0.2617, + "step": 7836 + }, + { + "epoch": 0.6685719160552807, + "grad_norm": 1.904018346517791, + "learning_rate": 2.6145097318106703e-05, + "loss": 0.2332, + "step": 7837 + }, + { + "epoch": 0.6686572257293977, + "grad_norm": 1.7758113081503173, + "learning_rate": 2.613295667685119e-05, + "loss": 0.2595, + "step": 7838 + }, + { + "epoch": 0.6687425354035148, + "grad_norm": 2.038206541973939, + "learning_rate": 2.61208178577176e-05, + "loss": 0.2012, + "step": 7839 + }, + { + "epoch": 0.6688278450776318, + "grad_norm": 1.944903472427186, + "learning_rate": 2.6108680861632673e-05, + "loss": 0.2207, + "step": 7840 + }, + { + "epoch": 0.6689131547517488, + "grad_norm": 1.7907343220251455, + "learning_rate": 2.6096545689522983e-05, + "loss": 0.2285, + "step": 7841 + }, + { + "epoch": 0.6689984644258659, + "grad_norm": 1.6904758070235966, + "learning_rate": 2.6084412342315047e-05, + "loss": 0.1804, + "step": 7842 + }, + { + "epoch": 0.669083774099983, + "grad_norm": 1.7263340173118304, + "learning_rate": 2.6072280820935103e-05, + "loss": 0.2026, + "step": 7843 + }, + { + "epoch": 0.6691690837741, + "grad_norm": 1.411439517788445, + "learning_rate": 2.6060151126309385e-05, + "loss": 0.2096, + "step": 7844 + }, + { + "epoch": 0.669254393448217, + "grad_norm": 1.497203604480788, + "learning_rate": 2.6048023259363913e-05, + "loss": 0.2271, + "step": 7845 + }, + { + "epoch": 0.669339703122334, + "grad_norm": 1.7908508158392997, + "learning_rate": 2.6035897221024585e-05, + "loss": 0.2486, + "step": 7846 + }, + { + "epoch": 0.6694250127964512, + "grad_norm": 1.7572565852720352, + "learning_rate": 2.6023773012217155e-05, + "loss": 0.2201, + "step": 7847 + }, + { + "epoch": 0.6695103224705682, + "grad_norm": 1.792436840801463, + "learning_rate": 2.601165063386725e-05, + "loss": 0.2181, + "step": 7848 + }, + { + "epoch": 0.6695956321446852, + "grad_norm": 1.6015687523696553, + "learning_rate": 2.599953008690035e-05, + "loss": 0.2638, + "step": 7849 + }, + { + "epoch": 0.6696809418188022, + "grad_norm": 1.6283454203994883, + "learning_rate": 2.598741137224176e-05, + "loss": 0.1772, + "step": 7850 + }, + { + "epoch": 0.6697662514929194, + "grad_norm": 1.5859777135401585, + "learning_rate": 2.5975294490816737e-05, + "loss": 0.21, + "step": 7851 + }, + { + "epoch": 0.6698515611670364, + "grad_norm": 2.2230676897033366, + "learning_rate": 2.5963179443550302e-05, + "loss": 0.3174, + "step": 7852 + }, + { + "epoch": 0.6699368708411534, + "grad_norm": 1.841729042527393, + "learning_rate": 2.5951066231367395e-05, + "loss": 0.2173, + "step": 7853 + }, + { + "epoch": 0.6700221805152704, + "grad_norm": 1.4538810157943318, + "learning_rate": 2.5938954855192766e-05, + "loss": 0.2189, + "step": 7854 + }, + { + "epoch": 0.6701074901893874, + "grad_norm": 1.6349206484049341, + "learning_rate": 2.5926845315951103e-05, + "loss": 0.2356, + "step": 7855 + }, + { + "epoch": 0.6701927998635046, + "grad_norm": 1.7395973966414566, + "learning_rate": 2.591473761456684e-05, + "loss": 0.2777, + "step": 7856 + }, + { + "epoch": 0.6702781095376216, + "grad_norm": 1.7146972037503638, + "learning_rate": 2.5902631751964395e-05, + "loss": 0.1338, + "step": 7857 + }, + { + "epoch": 0.6703634192117386, + "grad_norm": 1.4837322248825862, + "learning_rate": 2.589052772906796e-05, + "loss": 0.2195, + "step": 7858 + }, + { + "epoch": 0.6704487288858556, + "grad_norm": 1.7482203562659084, + "learning_rate": 2.5878425546801622e-05, + "loss": 0.2901, + "step": 7859 + }, + { + "epoch": 0.6705340385599727, + "grad_norm": 1.999462413063487, + "learning_rate": 2.5866325206089305e-05, + "loss": 0.2296, + "step": 7860 + }, + { + "epoch": 0.6706193482340898, + "grad_norm": 1.371397240243944, + "learning_rate": 2.585422670785481e-05, + "loss": 0.1374, + "step": 7861 + }, + { + "epoch": 0.6707046579082068, + "grad_norm": 1.525450252290695, + "learning_rate": 2.5842130053021796e-05, + "loss": 0.2381, + "step": 7862 + }, + { + "epoch": 0.6707899675823238, + "grad_norm": 1.8796062474117945, + "learning_rate": 2.583003524251376e-05, + "loss": 0.2842, + "step": 7863 + }, + { + "epoch": 0.6708752772564409, + "grad_norm": 1.5431342313852667, + "learning_rate": 2.581794227725414e-05, + "loss": 0.1807, + "step": 7864 + }, + { + "epoch": 0.6709605869305579, + "grad_norm": 1.282976429509427, + "learning_rate": 2.580585115816607e-05, + "loss": 0.2219, + "step": 7865 + }, + { + "epoch": 0.671045896604675, + "grad_norm": 1.6075755308925346, + "learning_rate": 2.579376188617273e-05, + "loss": 0.2336, + "step": 7866 + }, + { + "epoch": 0.671131206278792, + "grad_norm": 1.501400849754141, + "learning_rate": 2.5781674462197026e-05, + "loss": 0.2262, + "step": 7867 + }, + { + "epoch": 0.6712165159529091, + "grad_norm": 1.5335046945406137, + "learning_rate": 2.576958888716179e-05, + "loss": 0.2931, + "step": 7868 + }, + { + "epoch": 0.6713018256270261, + "grad_norm": 1.9661412590596796, + "learning_rate": 2.575750516198968e-05, + "loss": 0.2988, + "step": 7869 + }, + { + "epoch": 0.6713871353011431, + "grad_norm": 1.5522341900249372, + "learning_rate": 2.5745423287603206e-05, + "loss": 0.2215, + "step": 7870 + }, + { + "epoch": 0.6714724449752602, + "grad_norm": 2.1489763173768304, + "learning_rate": 2.5733343264924815e-05, + "loss": 0.2044, + "step": 7871 + }, + { + "epoch": 0.6715577546493773, + "grad_norm": 1.5659975198796732, + "learning_rate": 2.5721265094876667e-05, + "loss": 0.2329, + "step": 7872 + }, + { + "epoch": 0.6716430643234943, + "grad_norm": 1.783208339386886, + "learning_rate": 2.5709188778380942e-05, + "loss": 0.1535, + "step": 7873 + }, + { + "epoch": 0.6717283739976113, + "grad_norm": 1.8765359944957904, + "learning_rate": 2.5697114316359572e-05, + "loss": 0.2287, + "step": 7874 + }, + { + "epoch": 0.6718136836717283, + "grad_norm": 1.8364803729048391, + "learning_rate": 2.568504170973437e-05, + "loss": 0.2216, + "step": 7875 + }, + { + "epoch": 0.6718989933458455, + "grad_norm": 1.8531816058977777, + "learning_rate": 2.567297095942701e-05, + "loss": 0.2117, + "step": 7876 + }, + { + "epoch": 0.6719843030199625, + "grad_norm": 1.4861991515576976, + "learning_rate": 2.5660902066359084e-05, + "loss": 0.2193, + "step": 7877 + }, + { + "epoch": 0.6720696126940795, + "grad_norm": 2.1260227659196165, + "learning_rate": 2.5648835031451902e-05, + "loss": 0.1811, + "step": 7878 + }, + { + "epoch": 0.6721549223681965, + "grad_norm": 1.391190446057125, + "learning_rate": 2.563676985562679e-05, + "loss": 0.2446, + "step": 7879 + }, + { + "epoch": 0.6722402320423136, + "grad_norm": 1.5071227714785966, + "learning_rate": 2.5624706539804833e-05, + "loss": 0.2533, + "step": 7880 + }, + { + "epoch": 0.6723255417164307, + "grad_norm": 1.6889080954527587, + "learning_rate": 2.5612645084906995e-05, + "loss": 0.2538, + "step": 7881 + }, + { + "epoch": 0.6724108513905477, + "grad_norm": 1.3206627736644825, + "learning_rate": 2.560058549185412e-05, + "loss": 0.1946, + "step": 7882 + }, + { + "epoch": 0.6724961610646647, + "grad_norm": 1.7538704985307367, + "learning_rate": 2.5588527761566857e-05, + "loss": 0.1781, + "step": 7883 + }, + { + "epoch": 0.6725814707387818, + "grad_norm": 1.7959976044449835, + "learning_rate": 2.5576471894965815e-05, + "loss": 0.2309, + "step": 7884 + }, + { + "epoch": 0.6726667804128988, + "grad_norm": 1.7320669126646089, + "learning_rate": 2.5564417892971327e-05, + "loss": 0.1925, + "step": 7885 + }, + { + "epoch": 0.6727520900870159, + "grad_norm": 1.6096698527923057, + "learning_rate": 2.5552365756503693e-05, + "loss": 0.198, + "step": 7886 + }, + { + "epoch": 0.6728373997611329, + "grad_norm": 2.1586988613342486, + "learning_rate": 2.5540315486483024e-05, + "loss": 0.2132, + "step": 7887 + }, + { + "epoch": 0.67292270943525, + "grad_norm": 1.7442339181244768, + "learning_rate": 2.552826708382929e-05, + "loss": 0.2044, + "step": 7888 + }, + { + "epoch": 0.673008019109367, + "grad_norm": 1.7331564806924278, + "learning_rate": 2.5516220549462317e-05, + "loss": 0.2072, + "step": 7889 + }, + { + "epoch": 0.673093328783484, + "grad_norm": 1.7880418023038187, + "learning_rate": 2.55041758843018e-05, + "loss": 0.2558, + "step": 7890 + }, + { + "epoch": 0.6731786384576011, + "grad_norm": 1.422245354679519, + "learning_rate": 2.5492133089267284e-05, + "loss": 0.2326, + "step": 7891 + }, + { + "epoch": 0.6732639481317181, + "grad_norm": 2.101250267233029, + "learning_rate": 2.5480092165278153e-05, + "loss": 0.2061, + "step": 7892 + }, + { + "epoch": 0.6733492578058352, + "grad_norm": 2.008421334307997, + "learning_rate": 2.5468053113253722e-05, + "loss": 0.2287, + "step": 7893 + }, + { + "epoch": 0.6734345674799522, + "grad_norm": 1.6956302327498203, + "learning_rate": 2.5456015934113043e-05, + "loss": 0.2266, + "step": 7894 + }, + { + "epoch": 0.6735198771540692, + "grad_norm": 1.681650832939401, + "learning_rate": 2.5443980628775133e-05, + "loss": 0.2222, + "step": 7895 + }, + { + "epoch": 0.6736051868281863, + "grad_norm": 1.364988727872369, + "learning_rate": 2.543194719815879e-05, + "loss": 0.2609, + "step": 7896 + }, + { + "epoch": 0.6736904965023034, + "grad_norm": 1.8557097268475942, + "learning_rate": 2.5419915643182767e-05, + "loss": 0.1836, + "step": 7897 + }, + { + "epoch": 0.6737758061764204, + "grad_norm": 1.6871264715403793, + "learning_rate": 2.5407885964765526e-05, + "loss": 0.1874, + "step": 7898 + }, + { + "epoch": 0.6738611158505374, + "grad_norm": 2.2445177470198385, + "learning_rate": 2.539585816382553e-05, + "loss": 0.1742, + "step": 7899 + }, + { + "epoch": 0.6739464255246544, + "grad_norm": 1.628748604809741, + "learning_rate": 2.5383832241281015e-05, + "loss": 0.1704, + "step": 7900 + }, + { + "epoch": 0.6740317351987716, + "grad_norm": 1.8069845666062694, + "learning_rate": 2.53718081980501e-05, + "loss": 0.22, + "step": 7901 + }, + { + "epoch": 0.6741170448728886, + "grad_norm": 1.742838990063349, + "learning_rate": 2.5359786035050758e-05, + "loss": 0.2098, + "step": 7902 + }, + { + "epoch": 0.6742023545470056, + "grad_norm": 1.7647523326931316, + "learning_rate": 2.5347765753200808e-05, + "loss": 0.1864, + "step": 7903 + }, + { + "epoch": 0.6742876642211226, + "grad_norm": 1.5048016941553828, + "learning_rate": 2.5335747353417942e-05, + "loss": 0.2174, + "step": 7904 + }, + { + "epoch": 0.6743729738952398, + "grad_norm": 2.096965882197966, + "learning_rate": 2.5323730836619685e-05, + "loss": 0.2313, + "step": 7905 + }, + { + "epoch": 0.6744582835693568, + "grad_norm": 1.769657223156361, + "learning_rate": 2.5311716203723483e-05, + "loss": 0.189, + "step": 7906 + }, + { + "epoch": 0.6745435932434738, + "grad_norm": 1.771487335909185, + "learning_rate": 2.5299703455646516e-05, + "loss": 0.1904, + "step": 7907 + }, + { + "epoch": 0.6746289029175908, + "grad_norm": 1.456466606304711, + "learning_rate": 2.5287692593305956e-05, + "loss": 0.187, + "step": 7908 + }, + { + "epoch": 0.6747142125917079, + "grad_norm": 1.5445789289278773, + "learning_rate": 2.5275683617618752e-05, + "loss": 0.2503, + "step": 7909 + }, + { + "epoch": 0.674799522265825, + "grad_norm": 2.268302860807429, + "learning_rate": 2.5263676529501706e-05, + "loss": 0.2104, + "step": 7910 + }, + { + "epoch": 0.674884831939942, + "grad_norm": 1.5690065333739116, + "learning_rate": 2.5251671329871518e-05, + "loss": 0.164, + "step": 7911 + }, + { + "epoch": 0.674970141614059, + "grad_norm": 1.6213083148195104, + "learning_rate": 2.523966801964468e-05, + "loss": 0.2487, + "step": 7912 + }, + { + "epoch": 0.6750554512881761, + "grad_norm": 1.7383409811322863, + "learning_rate": 2.5227666599737666e-05, + "loss": 0.2406, + "step": 7913 + }, + { + "epoch": 0.6751407609622931, + "grad_norm": 1.4520441773369819, + "learning_rate": 2.5215667071066616e-05, + "loss": 0.1969, + "step": 7914 + }, + { + "epoch": 0.6752260706364102, + "grad_norm": 1.9314401674930208, + "learning_rate": 2.5203669434547705e-05, + "loss": 0.2339, + "step": 7915 + }, + { + "epoch": 0.6753113803105272, + "grad_norm": 1.550341334752429, + "learning_rate": 2.5191673691096866e-05, + "loss": 0.1623, + "step": 7916 + }, + { + "epoch": 0.6753966899846443, + "grad_norm": 1.9079313523920807, + "learning_rate": 2.5179679841629912e-05, + "loss": 0.205, + "step": 7917 + }, + { + "epoch": 0.6754819996587613, + "grad_norm": 1.7726430583948891, + "learning_rate": 2.5167687887062485e-05, + "loss": 0.1833, + "step": 7918 + }, + { + "epoch": 0.6755673093328783, + "grad_norm": 1.4600893710491853, + "learning_rate": 2.5155697828310164e-05, + "loss": 0.1735, + "step": 7919 + }, + { + "epoch": 0.6756526190069954, + "grad_norm": 1.421757536008049, + "learning_rate": 2.5143709666288262e-05, + "loss": 0.1877, + "step": 7920 + }, + { + "epoch": 0.6757379286811125, + "grad_norm": 1.613218883574328, + "learning_rate": 2.5131723401912056e-05, + "loss": 0.197, + "step": 7921 + }, + { + "epoch": 0.6758232383552295, + "grad_norm": 1.8308240740192776, + "learning_rate": 2.5119739036096613e-05, + "loss": 0.248, + "step": 7922 + }, + { + "epoch": 0.6759085480293465, + "grad_norm": 1.4004793232130615, + "learning_rate": 2.510775656975689e-05, + "loss": 0.1474, + "step": 7923 + }, + { + "epoch": 0.6759938577034635, + "grad_norm": 1.7931030489088873, + "learning_rate": 2.509577600380767e-05, + "loss": 0.1485, + "step": 7924 + }, + { + "epoch": 0.6760791673775807, + "grad_norm": 1.697248978966667, + "learning_rate": 2.508379733916359e-05, + "loss": 0.1836, + "step": 7925 + }, + { + "epoch": 0.6761644770516977, + "grad_norm": 1.6598407041573267, + "learning_rate": 2.507182057673922e-05, + "loss": 0.2326, + "step": 7926 + }, + { + "epoch": 0.6762497867258147, + "grad_norm": 1.655518712239003, + "learning_rate": 2.5059845717448843e-05, + "loss": 0.2142, + "step": 7927 + }, + { + "epoch": 0.6763350963999317, + "grad_norm": 2.358667532849337, + "learning_rate": 2.5047872762206732e-05, + "loss": 0.2384, + "step": 7928 + }, + { + "epoch": 0.6764204060740489, + "grad_norm": 1.5639990672720936, + "learning_rate": 2.503590171192694e-05, + "loss": 0.2013, + "step": 7929 + }, + { + "epoch": 0.6765057157481659, + "grad_norm": 2.2087289947564486, + "learning_rate": 2.5023932567523388e-05, + "loss": 0.2844, + "step": 7930 + }, + { + "epoch": 0.6765910254222829, + "grad_norm": 1.9076486678077853, + "learning_rate": 2.501196532990987e-05, + "loss": 0.1573, + "step": 7931 + }, + { + "epoch": 0.6766763350963999, + "grad_norm": 1.6459981940388186, + "learning_rate": 2.500000000000001e-05, + "loss": 0.19, + "step": 7932 + }, + { + "epoch": 0.6767616447705169, + "grad_norm": 1.3207015869348298, + "learning_rate": 2.4988036578707303e-05, + "loss": 0.1515, + "step": 7933 + }, + { + "epoch": 0.676846954444634, + "grad_norm": 1.5193868141860507, + "learning_rate": 2.4976075066945064e-05, + "loss": 0.1973, + "step": 7934 + }, + { + "epoch": 0.6769322641187511, + "grad_norm": 1.7024786186276042, + "learning_rate": 2.496411546562656e-05, + "loss": 0.1811, + "step": 7935 + }, + { + "epoch": 0.6770175737928681, + "grad_norm": 1.5145476926924228, + "learning_rate": 2.4952157775664757e-05, + "loss": 0.2033, + "step": 7936 + }, + { + "epoch": 0.6771028834669851, + "grad_norm": 1.6388518424435787, + "learning_rate": 2.4940201997972628e-05, + "loss": 0.2689, + "step": 7937 + }, + { + "epoch": 0.6771881931411022, + "grad_norm": 1.6249715362403456, + "learning_rate": 2.4928248133462907e-05, + "loss": 0.204, + "step": 7938 + }, + { + "epoch": 0.6772735028152193, + "grad_norm": 1.780526934648433, + "learning_rate": 2.491629618304821e-05, + "loss": 0.1744, + "step": 7939 + }, + { + "epoch": 0.6773588124893363, + "grad_norm": 1.8222834840739615, + "learning_rate": 2.4904346147640984e-05, + "loss": 0.1938, + "step": 7940 + }, + { + "epoch": 0.6774441221634533, + "grad_norm": 1.8660663127503114, + "learning_rate": 2.489239802815359e-05, + "loss": 0.1707, + "step": 7941 + }, + { + "epoch": 0.6775294318375704, + "grad_norm": 1.706423583650431, + "learning_rate": 2.488045182549819e-05, + "loss": 0.2, + "step": 7942 + }, + { + "epoch": 0.6776147415116874, + "grad_norm": 2.0446828503900547, + "learning_rate": 2.4868507540586805e-05, + "loss": 0.2177, + "step": 7943 + }, + { + "epoch": 0.6777000511858045, + "grad_norm": 1.7311497848645907, + "learning_rate": 2.485656517433132e-05, + "loss": 0.1804, + "step": 7944 + }, + { + "epoch": 0.6777853608599215, + "grad_norm": 1.6750589872762995, + "learning_rate": 2.4844624727643472e-05, + "loss": 0.1647, + "step": 7945 + }, + { + "epoch": 0.6778706705340386, + "grad_norm": 2.0407824515431248, + "learning_rate": 2.4832686201434852e-05, + "loss": 0.1679, + "step": 7946 + }, + { + "epoch": 0.6779559802081556, + "grad_norm": 1.5472952531935913, + "learning_rate": 2.482074959661688e-05, + "loss": 0.2069, + "step": 7947 + }, + { + "epoch": 0.6780412898822726, + "grad_norm": 1.5025433594704092, + "learning_rate": 2.4808814914100915e-05, + "loss": 0.168, + "step": 7948 + }, + { + "epoch": 0.6781265995563897, + "grad_norm": 1.9199799910138529, + "learning_rate": 2.4796882154798023e-05, + "loss": 0.1902, + "step": 7949 + }, + { + "epoch": 0.6782119092305068, + "grad_norm": 1.5145996400706518, + "learning_rate": 2.478495131961927e-05, + "loss": 0.1873, + "step": 7950 + }, + { + "epoch": 0.6782972189046238, + "grad_norm": 1.895969875015891, + "learning_rate": 2.4773022409475495e-05, + "loss": 0.174, + "step": 7951 + }, + { + "epoch": 0.6783825285787408, + "grad_norm": 1.6462746121153309, + "learning_rate": 2.4761095425277396e-05, + "loss": 0.1606, + "step": 7952 + }, + { + "epoch": 0.6784678382528578, + "grad_norm": 1.7998355525449634, + "learning_rate": 2.474917036793555e-05, + "loss": 0.2127, + "step": 7953 + }, + { + "epoch": 0.678553147926975, + "grad_norm": 1.894048192685017, + "learning_rate": 2.4737247238360356e-05, + "loss": 0.1949, + "step": 7954 + }, + { + "epoch": 0.678638457601092, + "grad_norm": 1.517732079747132, + "learning_rate": 2.4725326037462092e-05, + "loss": 0.199, + "step": 7955 + }, + { + "epoch": 0.678723767275209, + "grad_norm": 1.6094029988936582, + "learning_rate": 2.4713406766150858e-05, + "loss": 0.1728, + "step": 7956 + }, + { + "epoch": 0.678809076949326, + "grad_norm": 1.7380610980234976, + "learning_rate": 2.4701489425336667e-05, + "loss": 0.1739, + "step": 7957 + }, + { + "epoch": 0.6788943866234431, + "grad_norm": 1.6494471635314611, + "learning_rate": 2.468957401592932e-05, + "loss": 0.2291, + "step": 7958 + }, + { + "epoch": 0.6789796962975602, + "grad_norm": 1.6529352378459041, + "learning_rate": 2.467766053883849e-05, + "loss": 0.1608, + "step": 7959 + }, + { + "epoch": 0.6790650059716772, + "grad_norm": 1.581694164392169, + "learning_rate": 2.4665748994973704e-05, + "loss": 0.1577, + "step": 7960 + }, + { + "epoch": 0.6791503156457942, + "grad_norm": 1.3861613698132467, + "learning_rate": 2.4653839385244392e-05, + "loss": 0.1751, + "step": 7961 + }, + { + "epoch": 0.6792356253199113, + "grad_norm": 1.9870778097229385, + "learning_rate": 2.4641931710559717e-05, + "loss": 0.2489, + "step": 7962 + }, + { + "epoch": 0.6793209349940283, + "grad_norm": 1.8112296552548788, + "learning_rate": 2.463002597182882e-05, + "loss": 0.2135, + "step": 7963 + }, + { + "epoch": 0.6794062446681454, + "grad_norm": 1.7992161633584316, + "learning_rate": 2.461812216996062e-05, + "loss": 0.2286, + "step": 7964 + }, + { + "epoch": 0.6794915543422624, + "grad_norm": 2.2493961371604954, + "learning_rate": 2.460622030586392e-05, + "loss": 0.2121, + "step": 7965 + }, + { + "epoch": 0.6795768640163795, + "grad_norm": 1.9020124769714135, + "learning_rate": 2.4594320380447356e-05, + "loss": 0.1923, + "step": 7966 + }, + { + "epoch": 0.6796621736904965, + "grad_norm": 1.9147406174812989, + "learning_rate": 2.4582422394619427e-05, + "loss": 0.2047, + "step": 7967 + }, + { + "epoch": 0.6797474833646135, + "grad_norm": 1.6782063896005734, + "learning_rate": 2.4570526349288475e-05, + "loss": 0.1887, + "step": 7968 + }, + { + "epoch": 0.6798327930387306, + "grad_norm": 1.999058859641282, + "learning_rate": 2.455863224536269e-05, + "loss": 0.28, + "step": 7969 + }, + { + "epoch": 0.6799181027128476, + "grad_norm": 1.4101267465479677, + "learning_rate": 2.454674008375015e-05, + "loss": 0.1758, + "step": 7970 + }, + { + "epoch": 0.6800034123869647, + "grad_norm": 1.673564022064799, + "learning_rate": 2.453484986535875e-05, + "loss": 0.1702, + "step": 7971 + }, + { + "epoch": 0.6800887220610817, + "grad_norm": 1.5995758388483163, + "learning_rate": 2.4522961591096246e-05, + "loss": 0.1603, + "step": 7972 + }, + { + "epoch": 0.6801740317351987, + "grad_norm": 1.8071031133114381, + "learning_rate": 2.4511075261870232e-05, + "loss": 0.2075, + "step": 7973 + }, + { + "epoch": 0.6802593414093158, + "grad_norm": 1.820296324301956, + "learning_rate": 2.449919087858818e-05, + "loss": 0.2286, + "step": 7974 + }, + { + "epoch": 0.6803446510834329, + "grad_norm": 1.7394173670497928, + "learning_rate": 2.4487308442157386e-05, + "loss": 0.1911, + "step": 7975 + }, + { + "epoch": 0.6804299607575499, + "grad_norm": 1.5655836575773556, + "learning_rate": 2.4475427953485002e-05, + "loss": 0.1605, + "step": 7976 + }, + { + "epoch": 0.6805152704316669, + "grad_norm": 1.9317307253537899, + "learning_rate": 2.4463549413478098e-05, + "loss": 0.1692, + "step": 7977 + }, + { + "epoch": 0.680600580105784, + "grad_norm": 1.5708295435906119, + "learning_rate": 2.4451672823043455e-05, + "loss": 0.1481, + "step": 7978 + }, + { + "epoch": 0.6806858897799011, + "grad_norm": 2.225874516791536, + "learning_rate": 2.4439798183087846e-05, + "loss": 0.1728, + "step": 7979 + }, + { + "epoch": 0.6807711994540181, + "grad_norm": 1.640463684870651, + "learning_rate": 2.4427925494517823e-05, + "loss": 0.2328, + "step": 7980 + }, + { + "epoch": 0.6808565091281351, + "grad_norm": 2.45572306098597, + "learning_rate": 2.4416054758239794e-05, + "loss": 0.1949, + "step": 7981 + }, + { + "epoch": 0.6809418188022521, + "grad_norm": 1.8355607315777658, + "learning_rate": 2.4404185975160014e-05, + "loss": 0.2512, + "step": 7982 + }, + { + "epoch": 0.6810271284763693, + "grad_norm": 1.4227803460536188, + "learning_rate": 2.4392319146184655e-05, + "loss": 0.1538, + "step": 7983 + }, + { + "epoch": 0.6811124381504863, + "grad_norm": 2.0299225210803873, + "learning_rate": 2.4380454272219617e-05, + "loss": 0.2695, + "step": 7984 + }, + { + "epoch": 0.6811977478246033, + "grad_norm": 1.4406464184143224, + "learning_rate": 2.436859135417077e-05, + "loss": 0.2292, + "step": 7985 + }, + { + "epoch": 0.6812830574987203, + "grad_norm": 2.189835092266822, + "learning_rate": 2.4356730392943767e-05, + "loss": 0.2029, + "step": 7986 + }, + { + "epoch": 0.6813683671728374, + "grad_norm": 1.8735821131234385, + "learning_rate": 2.4344871389444128e-05, + "loss": 0.3047, + "step": 7987 + }, + { + "epoch": 0.6814536768469545, + "grad_norm": 1.286435496427437, + "learning_rate": 2.4333014344577232e-05, + "loss": 0.2189, + "step": 7988 + }, + { + "epoch": 0.6815389865210715, + "grad_norm": 1.8831325274854154, + "learning_rate": 2.4321159259248272e-05, + "loss": 0.2172, + "step": 7989 + }, + { + "epoch": 0.6816242961951885, + "grad_norm": 1.6375770230414912, + "learning_rate": 2.430930613436239e-05, + "loss": 0.2577, + "step": 7990 + }, + { + "epoch": 0.6817096058693056, + "grad_norm": 1.497521259749216, + "learning_rate": 2.4297454970824424e-05, + "loss": 0.2587, + "step": 7991 + }, + { + "epoch": 0.6817949155434226, + "grad_norm": 1.5139127674999928, + "learning_rate": 2.4285605769539204e-05, + "loss": 0.2078, + "step": 7992 + }, + { + "epoch": 0.6818802252175397, + "grad_norm": 1.9287618664162014, + "learning_rate": 2.427375853141134e-05, + "loss": 0.2771, + "step": 7993 + }, + { + "epoch": 0.6819655348916567, + "grad_norm": 1.962179330273258, + "learning_rate": 2.4261913257345304e-05, + "loss": 0.2337, + "step": 7994 + }, + { + "epoch": 0.6820508445657738, + "grad_norm": 1.4041849020086103, + "learning_rate": 2.4250069948245414e-05, + "loss": 0.1352, + "step": 7995 + }, + { + "epoch": 0.6821361542398908, + "grad_norm": 1.6525915514679592, + "learning_rate": 2.423822860501585e-05, + "loss": 0.2249, + "step": 7996 + }, + { + "epoch": 0.6822214639140078, + "grad_norm": 1.7642372551612153, + "learning_rate": 2.4226389228560635e-05, + "loss": 0.2023, + "step": 7997 + }, + { + "epoch": 0.6823067735881249, + "grad_norm": 1.227865219790259, + "learning_rate": 2.4214551819783626e-05, + "loss": 0.1745, + "step": 7998 + }, + { + "epoch": 0.682392083262242, + "grad_norm": 1.9117658889128188, + "learning_rate": 2.4202716379588598e-05, + "loss": 0.163, + "step": 7999 + }, + { + "epoch": 0.682477392936359, + "grad_norm": 1.7674607042965056, + "learning_rate": 2.4190882908879048e-05, + "loss": 0.1557, + "step": 8000 + }, + { + "epoch": 0.682562702610476, + "grad_norm": 1.6295027338258437, + "learning_rate": 2.4179051408558458e-05, + "loss": 0.2191, + "step": 8001 + }, + { + "epoch": 0.682648012284593, + "grad_norm": 1.7957677995919377, + "learning_rate": 2.416722187953006e-05, + "loss": 0.2164, + "step": 8002 + }, + { + "epoch": 0.6827333219587102, + "grad_norm": 1.8449125342583916, + "learning_rate": 2.4155394322697038e-05, + "loss": 0.2105, + "step": 8003 + }, + { + "epoch": 0.6828186316328272, + "grad_norm": 1.9347485725426155, + "learning_rate": 2.414356873896228e-05, + "loss": 0.2327, + "step": 8004 + }, + { + "epoch": 0.6829039413069442, + "grad_norm": 1.5783875501935376, + "learning_rate": 2.4131745129228674e-05, + "loss": 0.1429, + "step": 8005 + }, + { + "epoch": 0.6829892509810612, + "grad_norm": 1.5328455028033574, + "learning_rate": 2.4119923494398856e-05, + "loss": 0.1786, + "step": 8006 + }, + { + "epoch": 0.6830745606551782, + "grad_norm": 1.6188460980615125, + "learning_rate": 2.4108103835375357e-05, + "loss": 0.1741, + "step": 8007 + }, + { + "epoch": 0.6831598703292954, + "grad_norm": 1.8081846495414007, + "learning_rate": 2.4096286153060538e-05, + "loss": 0.2294, + "step": 8008 + }, + { + "epoch": 0.6832451800034124, + "grad_norm": 1.3684628015493308, + "learning_rate": 2.4084470448356617e-05, + "loss": 0.1692, + "step": 8009 + }, + { + "epoch": 0.6833304896775294, + "grad_norm": 1.2497908417235382, + "learning_rate": 2.4072656722165665e-05, + "loss": 0.1559, + "step": 8010 + }, + { + "epoch": 0.6834157993516464, + "grad_norm": 2.139864709980578, + "learning_rate": 2.4060844975389574e-05, + "loss": 0.1758, + "step": 8011 + }, + { + "epoch": 0.6835011090257636, + "grad_norm": 1.858586119948412, + "learning_rate": 2.4049035208930172e-05, + "loss": 0.1993, + "step": 8012 + }, + { + "epoch": 0.6835864186998806, + "grad_norm": 1.6194855951773885, + "learning_rate": 2.4037227423688985e-05, + "loss": 0.1181, + "step": 8013 + }, + { + "epoch": 0.6836717283739976, + "grad_norm": 1.7468124378445509, + "learning_rate": 2.4025421620567535e-05, + "loss": 0.1669, + "step": 8014 + }, + { + "epoch": 0.6837570380481146, + "grad_norm": 1.973273454554534, + "learning_rate": 2.401361780046712e-05, + "loss": 0.2368, + "step": 8015 + }, + { + "epoch": 0.6838423477222317, + "grad_norm": 2.193255563862613, + "learning_rate": 2.4001815964288893e-05, + "loss": 0.1672, + "step": 8016 + }, + { + "epoch": 0.6839276573963488, + "grad_norm": 1.7465808709503066, + "learning_rate": 2.3990016112933866e-05, + "loss": 0.2465, + "step": 8017 + }, + { + "epoch": 0.6840129670704658, + "grad_norm": 1.6062772993447105, + "learning_rate": 2.397821824730287e-05, + "loss": 0.2362, + "step": 8018 + }, + { + "epoch": 0.6840982767445828, + "grad_norm": 1.792268903192291, + "learning_rate": 2.396642236829667e-05, + "loss": 0.2383, + "step": 8019 + }, + { + "epoch": 0.6841835864186999, + "grad_norm": 1.4881577327866116, + "learning_rate": 2.3954628476815736e-05, + "loss": 0.2426, + "step": 8020 + }, + { + "epoch": 0.6842688960928169, + "grad_norm": 1.7321746202458352, + "learning_rate": 2.394283657376054e-05, + "loss": 0.1975, + "step": 8021 + }, + { + "epoch": 0.684354205766934, + "grad_norm": 1.5110995185086376, + "learning_rate": 2.3931046660031302e-05, + "loss": 0.2144, + "step": 8022 + }, + { + "epoch": 0.684439515441051, + "grad_norm": 1.7730490641423888, + "learning_rate": 2.3919258736528123e-05, + "loss": 0.2283, + "step": 8023 + }, + { + "epoch": 0.6845248251151681, + "grad_norm": 1.7123073163191296, + "learning_rate": 2.390747280415092e-05, + "loss": 0.2051, + "step": 8024 + }, + { + "epoch": 0.6846101347892851, + "grad_norm": 1.3989452101072286, + "learning_rate": 2.389568886379956e-05, + "loss": 0.1429, + "step": 8025 + }, + { + "epoch": 0.6846954444634021, + "grad_norm": 1.7189520977152044, + "learning_rate": 2.3883906916373595e-05, + "loss": 0.2029, + "step": 8026 + }, + { + "epoch": 0.6847807541375192, + "grad_norm": 1.9333335624343913, + "learning_rate": 2.3872126962772572e-05, + "loss": 0.1976, + "step": 8027 + }, + { + "epoch": 0.6848660638116363, + "grad_norm": 2.1279963517480427, + "learning_rate": 2.3860349003895816e-05, + "loss": 0.1948, + "step": 8028 + }, + { + "epoch": 0.6849513734857533, + "grad_norm": 1.6478187882411752, + "learning_rate": 2.3848573040642508e-05, + "loss": 0.2159, + "step": 8029 + }, + { + "epoch": 0.6850366831598703, + "grad_norm": 1.7632955527750183, + "learning_rate": 2.383679907391168e-05, + "loss": 0.2629, + "step": 8030 + }, + { + "epoch": 0.6851219928339873, + "grad_norm": 1.5586036930388512, + "learning_rate": 2.382502710460219e-05, + "loss": 0.1653, + "step": 8031 + }, + { + "epoch": 0.6852073025081045, + "grad_norm": 1.6767116711354841, + "learning_rate": 2.3813257133612827e-05, + "loss": 0.1916, + "step": 8032 + }, + { + "epoch": 0.6852926121822215, + "grad_norm": 1.7529768828517802, + "learning_rate": 2.3801489161842083e-05, + "loss": 0.207, + "step": 8033 + }, + { + "epoch": 0.6853779218563385, + "grad_norm": 1.893708292183944, + "learning_rate": 2.3789723190188444e-05, + "loss": 0.1893, + "step": 8034 + }, + { + "epoch": 0.6854632315304555, + "grad_norm": 1.7911330322920394, + "learning_rate": 2.377795921955016e-05, + "loss": 0.2344, + "step": 8035 + }, + { + "epoch": 0.6855485412045726, + "grad_norm": 1.4608946605256046, + "learning_rate": 2.376619725082535e-05, + "loss": 0.2189, + "step": 8036 + }, + { + "epoch": 0.6856338508786897, + "grad_norm": 1.7661750409309718, + "learning_rate": 2.3754437284911968e-05, + "loss": 0.2582, + "step": 8037 + }, + { + "epoch": 0.6857191605528067, + "grad_norm": 1.696271633833895, + "learning_rate": 2.3742679322707835e-05, + "loss": 0.1871, + "step": 8038 + }, + { + "epoch": 0.6858044702269237, + "grad_norm": 1.5756351734212746, + "learning_rate": 2.3730923365110597e-05, + "loss": 0.2345, + "step": 8039 + }, + { + "epoch": 0.6858897799010408, + "grad_norm": 2.0855011787274034, + "learning_rate": 2.371916941301775e-05, + "loss": 0.2372, + "step": 8040 + }, + { + "epoch": 0.6859750895751578, + "grad_norm": 1.7101973931672891, + "learning_rate": 2.3707417467326704e-05, + "loss": 0.2058, + "step": 8041 + }, + { + "epoch": 0.6860603992492749, + "grad_norm": 2.091712073736264, + "learning_rate": 2.3695667528934573e-05, + "loss": 0.2137, + "step": 8042 + }, + { + "epoch": 0.6861457089233919, + "grad_norm": 1.883416549847122, + "learning_rate": 2.368391959873847e-05, + "loss": 0.2279, + "step": 8043 + }, + { + "epoch": 0.686231018597509, + "grad_norm": 1.6932637716758634, + "learning_rate": 2.3672173677635258e-05, + "loss": 0.2164, + "step": 8044 + }, + { + "epoch": 0.686316328271626, + "grad_norm": 1.4493515077227581, + "learning_rate": 2.366042976652168e-05, + "loss": 0.1769, + "step": 8045 + }, + { + "epoch": 0.686401637945743, + "grad_norm": 1.8995432204239378, + "learning_rate": 2.36486878662943e-05, + "loss": 0.1874, + "step": 8046 + }, + { + "epoch": 0.6864869476198601, + "grad_norm": 1.453689301552852, + "learning_rate": 2.3636947977849592e-05, + "loss": 0.1574, + "step": 8047 + }, + { + "epoch": 0.6865722572939771, + "grad_norm": 2.3516381337115346, + "learning_rate": 2.3625210102083817e-05, + "loss": 0.1715, + "step": 8048 + }, + { + "epoch": 0.6866575669680942, + "grad_norm": 1.714439257489413, + "learning_rate": 2.3613474239893092e-05, + "loss": 0.1835, + "step": 8049 + }, + { + "epoch": 0.6867428766422112, + "grad_norm": 1.7246534745830413, + "learning_rate": 2.360174039217339e-05, + "loss": 0.2236, + "step": 8050 + }, + { + "epoch": 0.6868281863163282, + "grad_norm": 1.6857931193698406, + "learning_rate": 2.3590008559820526e-05, + "loss": 0.1412, + "step": 8051 + }, + { + "epoch": 0.6869134959904453, + "grad_norm": 1.4529714195554302, + "learning_rate": 2.357827874373017e-05, + "loss": 0.19, + "step": 8052 + }, + { + "epoch": 0.6869988056645624, + "grad_norm": 1.5684086280015321, + "learning_rate": 2.3566550944797804e-05, + "loss": 0.1881, + "step": 8053 + }, + { + "epoch": 0.6870841153386794, + "grad_norm": 1.6829417796227693, + "learning_rate": 2.3554825163918848e-05, + "loss": 0.2132, + "step": 8054 + }, + { + "epoch": 0.6871694250127964, + "grad_norm": 1.6471004035052919, + "learning_rate": 2.354310140198842e-05, + "loss": 0.202, + "step": 8055 + }, + { + "epoch": 0.6872547346869134, + "grad_norm": 1.590444240529251, + "learning_rate": 2.353137965990163e-05, + "loss": 0.1666, + "step": 8056 + }, + { + "epoch": 0.6873400443610306, + "grad_norm": 1.7507546024090428, + "learning_rate": 2.3519659938553352e-05, + "loss": 0.1588, + "step": 8057 + }, + { + "epoch": 0.6874253540351476, + "grad_norm": 1.9958610622998993, + "learning_rate": 2.3507942238838314e-05, + "loss": 0.2246, + "step": 8058 + }, + { + "epoch": 0.6875106637092646, + "grad_norm": 2.1211031918816095, + "learning_rate": 2.3496226561651113e-05, + "loss": 0.1753, + "step": 8059 + }, + { + "epoch": 0.6875959733833816, + "grad_norm": 1.5938841071459144, + "learning_rate": 2.3484512907886154e-05, + "loss": 0.1719, + "step": 8060 + }, + { + "epoch": 0.6876812830574988, + "grad_norm": 2.134332806685406, + "learning_rate": 2.3472801278437768e-05, + "loss": 0.2304, + "step": 8061 + }, + { + "epoch": 0.6877665927316158, + "grad_norm": 1.3304819622716562, + "learning_rate": 2.3461091674199998e-05, + "loss": 0.1747, + "step": 8062 + }, + { + "epoch": 0.6878519024057328, + "grad_norm": 1.4677825338623653, + "learning_rate": 2.3449384096066874e-05, + "loss": 0.1469, + "step": 8063 + }, + { + "epoch": 0.6879372120798498, + "grad_norm": 1.966370613099556, + "learning_rate": 2.343767854493218e-05, + "loss": 0.1316, + "step": 8064 + }, + { + "epoch": 0.6880225217539669, + "grad_norm": 1.8086199830626513, + "learning_rate": 2.3425975021689584e-05, + "loss": 0.213, + "step": 8065 + }, + { + "epoch": 0.688107831428084, + "grad_norm": 1.56936981601383, + "learning_rate": 2.3414273527232554e-05, + "loss": 0.2092, + "step": 8066 + }, + { + "epoch": 0.688193141102201, + "grad_norm": 1.8522596374747036, + "learning_rate": 2.3402574062454508e-05, + "loss": 0.2194, + "step": 8067 + }, + { + "epoch": 0.688278450776318, + "grad_norm": 1.7683973574156346, + "learning_rate": 2.3390876628248553e-05, + "loss": 0.2325, + "step": 8068 + }, + { + "epoch": 0.6883637604504351, + "grad_norm": 1.6086957664803487, + "learning_rate": 2.3379181225507783e-05, + "loss": 0.2058, + "step": 8069 + }, + { + "epoch": 0.6884490701245521, + "grad_norm": 1.8736987366894342, + "learning_rate": 2.336748785512507e-05, + "loss": 0.21, + "step": 8070 + }, + { + "epoch": 0.6885343797986692, + "grad_norm": 1.7392717261296013, + "learning_rate": 2.335579651799313e-05, + "loss": 0.1489, + "step": 8071 + }, + { + "epoch": 0.6886196894727862, + "grad_norm": 1.4492852948891932, + "learning_rate": 2.334410721500454e-05, + "loss": 0.1498, + "step": 8072 + }, + { + "epoch": 0.6887049991469033, + "grad_norm": 2.2056476408309034, + "learning_rate": 2.3332419947051715e-05, + "loss": 0.1843, + "step": 8073 + }, + { + "epoch": 0.6887903088210203, + "grad_norm": 1.3918223423411753, + "learning_rate": 2.3320734715026916e-05, + "loss": 0.1901, + "step": 8074 + }, + { + "epoch": 0.6888756184951373, + "grad_norm": 1.6434895530301339, + "learning_rate": 2.330905151982223e-05, + "loss": 0.1019, + "step": 8075 + }, + { + "epoch": 0.6889609281692544, + "grad_norm": 1.7151436163378, + "learning_rate": 2.329737036232964e-05, + "loss": 0.1492, + "step": 8076 + }, + { + "epoch": 0.6890462378433715, + "grad_norm": 1.6828598936698242, + "learning_rate": 2.3285691243440927e-05, + "loss": 0.2131, + "step": 8077 + }, + { + "epoch": 0.6891315475174885, + "grad_norm": 1.5325668278073739, + "learning_rate": 2.327401416404773e-05, + "loss": 0.1164, + "step": 8078 + }, + { + "epoch": 0.6892168571916055, + "grad_norm": 1.7163707564533948, + "learning_rate": 2.3262339125041527e-05, + "loss": 0.2194, + "step": 8079 + }, + { + "epoch": 0.6893021668657225, + "grad_norm": 1.674064842053004, + "learning_rate": 2.3250666127313647e-05, + "loss": 0.1849, + "step": 8080 + }, + { + "epoch": 0.6893874765398397, + "grad_norm": 2.1492668631800513, + "learning_rate": 2.3238995171755268e-05, + "loss": 0.1837, + "step": 8081 + }, + { + "epoch": 0.6894727862139567, + "grad_norm": 1.7343316974904674, + "learning_rate": 2.3227326259257376e-05, + "loss": 0.1881, + "step": 8082 + }, + { + "epoch": 0.6895580958880737, + "grad_norm": 2.0464766020025604, + "learning_rate": 2.321565939071089e-05, + "loss": 0.1821, + "step": 8083 + }, + { + "epoch": 0.6896434055621907, + "grad_norm": 1.3625645543475744, + "learning_rate": 2.3203994567006447e-05, + "loss": 0.1791, + "step": 8084 + }, + { + "epoch": 0.6897287152363077, + "grad_norm": 1.9085091508501562, + "learning_rate": 2.319233178903464e-05, + "loss": 0.2508, + "step": 8085 + }, + { + "epoch": 0.6898140249104249, + "grad_norm": 1.667718658637687, + "learning_rate": 2.3180671057685844e-05, + "loss": 0.2228, + "step": 8086 + }, + { + "epoch": 0.6898993345845419, + "grad_norm": 1.4034266209270947, + "learning_rate": 2.3169012373850298e-05, + "loss": 0.1881, + "step": 8087 + }, + { + "epoch": 0.6899846442586589, + "grad_norm": 1.4821066942251548, + "learning_rate": 2.3157355738418058e-05, + "loss": 0.1834, + "step": 8088 + }, + { + "epoch": 0.6900699539327759, + "grad_norm": 1.775391296453737, + "learning_rate": 2.3145701152279103e-05, + "loss": 0.1808, + "step": 8089 + }, + { + "epoch": 0.690155263606893, + "grad_norm": 1.570099773516832, + "learning_rate": 2.3134048616323125e-05, + "loss": 0.1935, + "step": 8090 + }, + { + "epoch": 0.6902405732810101, + "grad_norm": 1.8055400733610016, + "learning_rate": 2.3122398131439783e-05, + "loss": 0.2128, + "step": 8091 + }, + { + "epoch": 0.6903258829551271, + "grad_norm": 1.2697624113464339, + "learning_rate": 2.311074969851852e-05, + "loss": 0.201, + "step": 8092 + }, + { + "epoch": 0.6904111926292441, + "grad_norm": 1.666881841438823, + "learning_rate": 2.309910331844863e-05, + "loss": 0.2379, + "step": 8093 + }, + { + "epoch": 0.6904965023033612, + "grad_norm": 1.8947041540783969, + "learning_rate": 2.308745899211925e-05, + "loss": 0.21, + "step": 8094 + }, + { + "epoch": 0.6905818119774783, + "grad_norm": 1.3642027180933052, + "learning_rate": 2.307581672041934e-05, + "loss": 0.1696, + "step": 8095 + }, + { + "epoch": 0.6906671216515953, + "grad_norm": 1.9053206132437945, + "learning_rate": 2.3064176504237788e-05, + "loss": 0.2358, + "step": 8096 + }, + { + "epoch": 0.6907524313257123, + "grad_norm": 1.693696970654952, + "learning_rate": 2.3052538344463187e-05, + "loss": 0.1812, + "step": 8097 + }, + { + "epoch": 0.6908377409998294, + "grad_norm": 1.527750804932747, + "learning_rate": 2.3040902241984103e-05, + "loss": 0.2264, + "step": 8098 + }, + { + "epoch": 0.6909230506739464, + "grad_norm": 1.5441604068723236, + "learning_rate": 2.302926819768887e-05, + "loss": 0.1721, + "step": 8099 + }, + { + "epoch": 0.6910083603480635, + "grad_norm": 1.5223528791230432, + "learning_rate": 2.3017636212465692e-05, + "loss": 0.219, + "step": 8100 + }, + { + "epoch": 0.6910936700221805, + "grad_norm": 1.828712564581942, + "learning_rate": 2.3006006287202604e-05, + "loss": 0.2176, + "step": 8101 + }, + { + "epoch": 0.6911789796962976, + "grad_norm": 1.6746428208495971, + "learning_rate": 2.2994378422787488e-05, + "loss": 0.2144, + "step": 8102 + }, + { + "epoch": 0.6912642893704146, + "grad_norm": 1.7248250236776737, + "learning_rate": 2.2982752620108072e-05, + "loss": 0.1674, + "step": 8103 + }, + { + "epoch": 0.6913495990445316, + "grad_norm": 2.138558162644787, + "learning_rate": 2.2971128880051905e-05, + "loss": 0.1998, + "step": 8104 + }, + { + "epoch": 0.6914349087186487, + "grad_norm": 2.198855544566669, + "learning_rate": 2.2959507203506437e-05, + "loss": 0.1613, + "step": 8105 + }, + { + "epoch": 0.6915202183927658, + "grad_norm": 1.6519633547433912, + "learning_rate": 2.2947887591358897e-05, + "loss": 0.0983, + "step": 8106 + }, + { + "epoch": 0.6916055280668828, + "grad_norm": 2.2159253259877696, + "learning_rate": 2.293627004449639e-05, + "loss": 0.1773, + "step": 8107 + }, + { + "epoch": 0.6916908377409998, + "grad_norm": 2.5062625646870793, + "learning_rate": 2.2924654563805826e-05, + "loss": 0.2597, + "step": 8108 + }, + { + "epoch": 0.6917761474151168, + "grad_norm": 1.8052867868687839, + "learning_rate": 2.2913041150174047e-05, + "loss": 0.2098, + "step": 8109 + }, + { + "epoch": 0.691861457089234, + "grad_norm": 1.6551210406431571, + "learning_rate": 2.29014298044876e-05, + "loss": 0.1843, + "step": 8110 + }, + { + "epoch": 0.691946766763351, + "grad_norm": 1.5019596332860454, + "learning_rate": 2.2889820527633005e-05, + "loss": 0.2166, + "step": 8111 + }, + { + "epoch": 0.692032076437468, + "grad_norm": 2.0363966077993236, + "learning_rate": 2.2878213320496545e-05, + "loss": 0.2242, + "step": 8112 + }, + { + "epoch": 0.692117386111585, + "grad_norm": 1.838105305731888, + "learning_rate": 2.2866608183964376e-05, + "loss": 0.1659, + "step": 8113 + }, + { + "epoch": 0.6922026957857021, + "grad_norm": 1.840211738345172, + "learning_rate": 2.2855005118922485e-05, + "loss": 0.2317, + "step": 8114 + }, + { + "epoch": 0.6922880054598192, + "grad_norm": 1.41577466295691, + "learning_rate": 2.2843404126256708e-05, + "loss": 0.1799, + "step": 8115 + }, + { + "epoch": 0.6923733151339362, + "grad_norm": 1.9331165071554317, + "learning_rate": 2.2831805206852714e-05, + "loss": 0.2399, + "step": 8116 + }, + { + "epoch": 0.6924586248080532, + "grad_norm": 1.7511260633158379, + "learning_rate": 2.2820208361596e-05, + "loss": 0.2808, + "step": 8117 + }, + { + "epoch": 0.6925439344821703, + "grad_norm": 1.8576493149065707, + "learning_rate": 2.280861359137198e-05, + "loss": 0.1662, + "step": 8118 + }, + { + "epoch": 0.6926292441562873, + "grad_norm": 1.7308765215100828, + "learning_rate": 2.2797020897065784e-05, + "loss": 0.1926, + "step": 8119 + }, + { + "epoch": 0.6927145538304044, + "grad_norm": 1.3843919602295947, + "learning_rate": 2.27854302795625e-05, + "loss": 0.1993, + "step": 8120 + }, + { + "epoch": 0.6927998635045214, + "grad_norm": 1.6530874035315253, + "learning_rate": 2.2773841739747003e-05, + "loss": 0.2418, + "step": 8121 + }, + { + "epoch": 0.6928851731786384, + "grad_norm": 2.0714470975263506, + "learning_rate": 2.2762255278504007e-05, + "loss": 0.2681, + "step": 8122 + }, + { + "epoch": 0.6929704828527555, + "grad_norm": 1.6142999051624123, + "learning_rate": 2.275067089671808e-05, + "loss": 0.2136, + "step": 8123 + }, + { + "epoch": 0.6930557925268725, + "grad_norm": 1.7514941104329256, + "learning_rate": 2.2739088595273604e-05, + "loss": 0.2019, + "step": 8124 + }, + { + "epoch": 0.6931411022009896, + "grad_norm": 1.2581320883811327, + "learning_rate": 2.272750837505489e-05, + "loss": 0.1892, + "step": 8125 + }, + { + "epoch": 0.6932264118751066, + "grad_norm": 1.7928995355947075, + "learning_rate": 2.2715930236945947e-05, + "loss": 0.243, + "step": 8126 + }, + { + "epoch": 0.6933117215492237, + "grad_norm": 1.3703282058494313, + "learning_rate": 2.2704354181830767e-05, + "loss": 0.1097, + "step": 8127 + }, + { + "epoch": 0.6933970312233407, + "grad_norm": 1.6339561436252723, + "learning_rate": 2.269278021059309e-05, + "loss": 0.1921, + "step": 8128 + }, + { + "epoch": 0.6934823408974577, + "grad_norm": 1.6449988780843974, + "learning_rate": 2.2681208324116538e-05, + "loss": 0.2106, + "step": 8129 + }, + { + "epoch": 0.6935676505715748, + "grad_norm": 1.8447465951052973, + "learning_rate": 2.2669638523284535e-05, + "loss": 0.2677, + "step": 8130 + }, + { + "epoch": 0.6936529602456919, + "grad_norm": 1.635147104907537, + "learning_rate": 2.2658070808980436e-05, + "loss": 0.2723, + "step": 8131 + }, + { + "epoch": 0.6937382699198089, + "grad_norm": 1.8870832197824237, + "learning_rate": 2.26465051820873e-05, + "loss": 0.2672, + "step": 8132 + }, + { + "epoch": 0.6938235795939259, + "grad_norm": 1.3952455374272634, + "learning_rate": 2.2634941643488156e-05, + "loss": 0.22, + "step": 8133 + }, + { + "epoch": 0.693908889268043, + "grad_norm": 1.6972317708702938, + "learning_rate": 2.2623380194065802e-05, + "loss": 0.2083, + "step": 8134 + }, + { + "epoch": 0.6939941989421601, + "grad_norm": 1.6605630533295783, + "learning_rate": 2.2611820834702886e-05, + "loss": 0.1917, + "step": 8135 + }, + { + "epoch": 0.6940795086162771, + "grad_norm": 1.9115890402804385, + "learning_rate": 2.2600263566281908e-05, + "loss": 0.2758, + "step": 8136 + }, + { + "epoch": 0.6941648182903941, + "grad_norm": 1.9631742813351734, + "learning_rate": 2.2588708389685193e-05, + "loss": 0.2665, + "step": 8137 + }, + { + "epoch": 0.6942501279645111, + "grad_norm": 1.467931966031455, + "learning_rate": 2.2577155305794962e-05, + "loss": 0.1839, + "step": 8138 + }, + { + "epoch": 0.6943354376386283, + "grad_norm": 1.5212016595888076, + "learning_rate": 2.256560431549316e-05, + "loss": 0.2073, + "step": 8139 + }, + { + "epoch": 0.6944207473127453, + "grad_norm": 1.7286693439990073, + "learning_rate": 2.2554055419661703e-05, + "loss": 0.1473, + "step": 8140 + }, + { + "epoch": 0.6945060569868623, + "grad_norm": 1.6799622776701062, + "learning_rate": 2.254250861918227e-05, + "loss": 0.1854, + "step": 8141 + }, + { + "epoch": 0.6945913666609793, + "grad_norm": 2.4682251577565455, + "learning_rate": 2.2530963914936387e-05, + "loss": 0.214, + "step": 8142 + }, + { + "epoch": 0.6946766763350964, + "grad_norm": 1.8159143581698107, + "learning_rate": 2.2519421307805445e-05, + "loss": 0.1638, + "step": 8143 + }, + { + "epoch": 0.6947619860092135, + "grad_norm": 1.4018068693819152, + "learning_rate": 2.2507880798670656e-05, + "loss": 0.1663, + "step": 8144 + }, + { + "epoch": 0.6948472956833305, + "grad_norm": 1.701674237186182, + "learning_rate": 2.2496342388413072e-05, + "loss": 0.1935, + "step": 8145 + }, + { + "epoch": 0.6949326053574475, + "grad_norm": 1.9668209979823401, + "learning_rate": 2.2484806077913572e-05, + "loss": 0.2265, + "step": 8146 + }, + { + "epoch": 0.6950179150315646, + "grad_norm": 1.7058524594386242, + "learning_rate": 2.247327186805295e-05, + "loss": 0.2272, + "step": 8147 + }, + { + "epoch": 0.6951032247056816, + "grad_norm": 1.6039070645946696, + "learning_rate": 2.246173975971171e-05, + "loss": 0.1993, + "step": 8148 + }, + { + "epoch": 0.6951885343797987, + "grad_norm": 1.412217596399825, + "learning_rate": 2.245020975377032e-05, + "loss": 0.1711, + "step": 8149 + }, + { + "epoch": 0.6952738440539157, + "grad_norm": 1.6039462330289866, + "learning_rate": 2.2438681851109013e-05, + "loss": 0.2427, + "step": 8150 + }, + { + "epoch": 0.6953591537280328, + "grad_norm": 1.591998465474985, + "learning_rate": 2.2427156052607885e-05, + "loss": 0.1847, + "step": 8151 + }, + { + "epoch": 0.6954444634021498, + "grad_norm": 1.7293526729442532, + "learning_rate": 2.2415632359146856e-05, + "loss": 0.2007, + "step": 8152 + }, + { + "epoch": 0.6955297730762668, + "grad_norm": 1.866183469880605, + "learning_rate": 2.2404110771605726e-05, + "loss": 0.1814, + "step": 8153 + }, + { + "epoch": 0.6956150827503839, + "grad_norm": 1.6591164597649521, + "learning_rate": 2.2392591290864096e-05, + "loss": 0.2337, + "step": 8154 + }, + { + "epoch": 0.695700392424501, + "grad_norm": 2.1054166599829567, + "learning_rate": 2.2381073917801416e-05, + "loss": 0.1918, + "step": 8155 + }, + { + "epoch": 0.695785702098618, + "grad_norm": 2.040585822157233, + "learning_rate": 2.2369558653296978e-05, + "loss": 0.2405, + "step": 8156 + }, + { + "epoch": 0.695871011772735, + "grad_norm": 1.7157702191704067, + "learning_rate": 2.2358045498229907e-05, + "loss": 0.2136, + "step": 8157 + }, + { + "epoch": 0.695956321446852, + "grad_norm": 1.4294073945691788, + "learning_rate": 2.2346534453479173e-05, + "loss": 0.1992, + "step": 8158 + }, + { + "epoch": 0.6960416311209692, + "grad_norm": 1.6006513283145725, + "learning_rate": 2.2335025519923565e-05, + "loss": 0.1993, + "step": 8159 + }, + { + "epoch": 0.6961269407950862, + "grad_norm": 1.4247947310792035, + "learning_rate": 2.2323518698441786e-05, + "loss": 0.197, + "step": 8160 + }, + { + "epoch": 0.6962122504692032, + "grad_norm": 1.391837545095643, + "learning_rate": 2.2312013989912238e-05, + "loss": 0.214, + "step": 8161 + }, + { + "epoch": 0.6962975601433202, + "grad_norm": 1.7357217869268533, + "learning_rate": 2.2300511395213313e-05, + "loss": 0.1574, + "step": 8162 + }, + { + "epoch": 0.6963828698174372, + "grad_norm": 2.1060819564478943, + "learning_rate": 2.2289010915223145e-05, + "loss": 0.2218, + "step": 8163 + }, + { + "epoch": 0.6964681794915544, + "grad_norm": 1.79032300496228, + "learning_rate": 2.227751255081974e-05, + "loss": 0.2162, + "step": 8164 + }, + { + "epoch": 0.6965534891656714, + "grad_norm": 1.7957777570841373, + "learning_rate": 2.2266016302880934e-05, + "loss": 0.2796, + "step": 8165 + }, + { + "epoch": 0.6966387988397884, + "grad_norm": 1.9750489482367033, + "learning_rate": 2.2254522172284386e-05, + "loss": 0.2557, + "step": 8166 + }, + { + "epoch": 0.6967241085139054, + "grad_norm": 1.7348004068565464, + "learning_rate": 2.224303015990767e-05, + "loss": 0.1907, + "step": 8167 + }, + { + "epoch": 0.6968094181880226, + "grad_norm": 1.9119047497682198, + "learning_rate": 2.223154026662806e-05, + "loss": 0.2355, + "step": 8168 + }, + { + "epoch": 0.6968947278621396, + "grad_norm": 1.5377592969131808, + "learning_rate": 2.2220052493322806e-05, + "loss": 0.2145, + "step": 8169 + }, + { + "epoch": 0.6969800375362566, + "grad_norm": 1.6271916795020862, + "learning_rate": 2.220856684086893e-05, + "loss": 0.2075, + "step": 8170 + }, + { + "epoch": 0.6970653472103736, + "grad_norm": 1.418599390446634, + "learning_rate": 2.2197083310143284e-05, + "loss": 0.2357, + "step": 8171 + }, + { + "epoch": 0.6971506568844907, + "grad_norm": 1.771104260831184, + "learning_rate": 2.218560190202257e-05, + "loss": 0.1735, + "step": 8172 + }, + { + "epoch": 0.6972359665586078, + "grad_norm": 1.7889015092003082, + "learning_rate": 2.217412261738338e-05, + "loss": 0.2123, + "step": 8173 + }, + { + "epoch": 0.6973212762327248, + "grad_norm": 1.499832461855408, + "learning_rate": 2.216264545710202e-05, + "loss": 0.1251, + "step": 8174 + }, + { + "epoch": 0.6974065859068418, + "grad_norm": 1.861001465384323, + "learning_rate": 2.215117042205478e-05, + "loss": 0.2511, + "step": 8175 + }, + { + "epoch": 0.6974918955809589, + "grad_norm": 1.7244983607124287, + "learning_rate": 2.213969751311768e-05, + "loss": 0.196, + "step": 8176 + }, + { + "epoch": 0.6975772052550759, + "grad_norm": 2.121686708525965, + "learning_rate": 2.2128226731166633e-05, + "loss": 0.178, + "step": 8177 + }, + { + "epoch": 0.697662514929193, + "grad_norm": 1.8117926960437554, + "learning_rate": 2.211675807707736e-05, + "loss": 0.1347, + "step": 8178 + }, + { + "epoch": 0.69774782460331, + "grad_norm": 2.0391033066214335, + "learning_rate": 2.210529155172544e-05, + "loss": 0.2135, + "step": 8179 + }, + { + "epoch": 0.6978331342774271, + "grad_norm": 1.8828818280764108, + "learning_rate": 2.2093827155986273e-05, + "loss": 0.2649, + "step": 8180 + }, + { + "epoch": 0.6979184439515441, + "grad_norm": 1.5272060634670515, + "learning_rate": 2.2082364890735096e-05, + "loss": 0.2043, + "step": 8181 + }, + { + "epoch": 0.6980037536256611, + "grad_norm": 2.0316570167635697, + "learning_rate": 2.2070904756847022e-05, + "loss": 0.2125, + "step": 8182 + }, + { + "epoch": 0.6980890632997782, + "grad_norm": 1.4365385406135864, + "learning_rate": 2.205944675519695e-05, + "loss": 0.2263, + "step": 8183 + }, + { + "epoch": 0.6981743729738953, + "grad_norm": 1.5857829502512686, + "learning_rate": 2.2047990886659648e-05, + "loss": 0.1925, + "step": 8184 + }, + { + "epoch": 0.6982596826480123, + "grad_norm": 1.4878235761538245, + "learning_rate": 2.2036537152109705e-05, + "loss": 0.1934, + "step": 8185 + }, + { + "epoch": 0.6983449923221293, + "grad_norm": 1.6997212686356096, + "learning_rate": 2.202508555242155e-05, + "loss": 0.2006, + "step": 8186 + }, + { + "epoch": 0.6984303019962463, + "grad_norm": 1.4301613611267308, + "learning_rate": 2.2013636088469458e-05, + "loss": 0.2472, + "step": 8187 + }, + { + "epoch": 0.6985156116703635, + "grad_norm": 1.549211836975956, + "learning_rate": 2.2002188761127507e-05, + "loss": 0.183, + "step": 8188 + }, + { + "epoch": 0.6986009213444805, + "grad_norm": 1.422151727262114, + "learning_rate": 2.1990743571269706e-05, + "loss": 0.1452, + "step": 8189 + }, + { + "epoch": 0.6986862310185975, + "grad_norm": 2.530161117127818, + "learning_rate": 2.1979300519769752e-05, + "loss": 0.2526, + "step": 8190 + }, + { + "epoch": 0.6987715406927145, + "grad_norm": 1.7805764112608313, + "learning_rate": 2.1967859607501325e-05, + "loss": 0.2019, + "step": 8191 + }, + { + "epoch": 0.6988568503668317, + "grad_norm": 1.4645965774537817, + "learning_rate": 2.1956420835337848e-05, + "loss": 0.1862, + "step": 8192 + }, + { + "epoch": 0.6989421600409487, + "grad_norm": 2.133663357102439, + "learning_rate": 2.194498420415262e-05, + "loss": 0.2865, + "step": 8193 + }, + { + "epoch": 0.6990274697150657, + "grad_norm": 1.553561315519243, + "learning_rate": 2.1933549714818748e-05, + "loss": 0.1626, + "step": 8194 + }, + { + "epoch": 0.6991127793891827, + "grad_norm": 2.552815442596656, + "learning_rate": 2.1922117368209245e-05, + "loss": 0.2185, + "step": 8195 + }, + { + "epoch": 0.6991980890632998, + "grad_norm": 1.4022860289359131, + "learning_rate": 2.1910687165196837e-05, + "loss": 0.226, + "step": 8196 + }, + { + "epoch": 0.6992833987374169, + "grad_norm": 1.4510082291225799, + "learning_rate": 2.1899259106654215e-05, + "loss": 0.2376, + "step": 8197 + }, + { + "epoch": 0.6993687084115339, + "grad_norm": 1.4125725499653852, + "learning_rate": 2.1887833193453832e-05, + "loss": 0.1919, + "step": 8198 + }, + { + "epoch": 0.6994540180856509, + "grad_norm": 1.3063695880229182, + "learning_rate": 2.1876409426468005e-05, + "loss": 0.1453, + "step": 8199 + }, + { + "epoch": 0.6995393277597679, + "grad_norm": 1.7329365031501285, + "learning_rate": 2.1864987806568858e-05, + "loss": 0.2234, + "step": 8200 + }, + { + "epoch": 0.699624637433885, + "grad_norm": 2.5020365049716387, + "learning_rate": 2.185356833462837e-05, + "loss": 0.2275, + "step": 8201 + }, + { + "epoch": 0.699709947108002, + "grad_norm": 1.58778882140215, + "learning_rate": 2.1842151011518413e-05, + "loss": 0.2163, + "step": 8202 + }, + { + "epoch": 0.6997952567821191, + "grad_norm": 1.7964315779513742, + "learning_rate": 2.183073583811055e-05, + "loss": 0.2578, + "step": 8203 + }, + { + "epoch": 0.6998805664562361, + "grad_norm": 1.415491130638026, + "learning_rate": 2.181932281527634e-05, + "loss": 0.1623, + "step": 8204 + }, + { + "epoch": 0.6999658761303532, + "grad_norm": 1.51061387802531, + "learning_rate": 2.180791194388707e-05, + "loss": 0.1563, + "step": 8205 + }, + { + "epoch": 0.7000511858044702, + "grad_norm": 1.273257658259237, + "learning_rate": 2.179650322481392e-05, + "loss": 0.1715, + "step": 8206 + }, + { + "epoch": 0.7001364954785873, + "grad_norm": 1.4414524804948585, + "learning_rate": 2.1785096658927873e-05, + "loss": 0.2003, + "step": 8207 + }, + { + "epoch": 0.7002218051527043, + "grad_norm": 2.237966998077689, + "learning_rate": 2.1773692247099764e-05, + "loss": 0.2238, + "step": 8208 + }, + { + "epoch": 0.7003071148268214, + "grad_norm": 1.659356712341112, + "learning_rate": 2.176228999020025e-05, + "loss": 0.1111, + "step": 8209 + }, + { + "epoch": 0.7003924245009384, + "grad_norm": 1.5964693543371788, + "learning_rate": 2.1750889889099828e-05, + "loss": 0.1935, + "step": 8210 + }, + { + "epoch": 0.7004777341750554, + "grad_norm": 1.468194897231326, + "learning_rate": 2.1739491944668866e-05, + "loss": 0.1879, + "step": 8211 + }, + { + "epoch": 0.7005630438491725, + "grad_norm": 1.619549045198152, + "learning_rate": 2.1728096157777517e-05, + "loss": 0.2644, + "step": 8212 + }, + { + "epoch": 0.7006483535232896, + "grad_norm": 2.440097500776036, + "learning_rate": 2.171670252929579e-05, + "loss": 0.226, + "step": 8213 + }, + { + "epoch": 0.7007336631974066, + "grad_norm": 1.8538052781776768, + "learning_rate": 2.170531106009351e-05, + "loss": 0.2976, + "step": 8214 + }, + { + "epoch": 0.7008189728715236, + "grad_norm": 1.370566633575996, + "learning_rate": 2.1693921751040407e-05, + "loss": 0.147, + "step": 8215 + }, + { + "epoch": 0.7009042825456406, + "grad_norm": 1.6887809695235503, + "learning_rate": 2.1682534603005927e-05, + "loss": 0.1728, + "step": 8216 + }, + { + "epoch": 0.7009895922197578, + "grad_norm": 1.757303053889288, + "learning_rate": 2.1671149616859466e-05, + "loss": 0.2024, + "step": 8217 + }, + { + "epoch": 0.7010749018938748, + "grad_norm": 1.5952485463085404, + "learning_rate": 2.1659766793470195e-05, + "loss": 0.2376, + "step": 8218 + }, + { + "epoch": 0.7011602115679918, + "grad_norm": 1.573521310492869, + "learning_rate": 2.1648386133707128e-05, + "loss": 0.1869, + "step": 8219 + }, + { + "epoch": 0.7012455212421088, + "grad_norm": 2.0818580999178016, + "learning_rate": 2.1637007638439116e-05, + "loss": 0.264, + "step": 8220 + }, + { + "epoch": 0.701330830916226, + "grad_norm": 1.7932870619712007, + "learning_rate": 2.1625631308534854e-05, + "loss": 0.1714, + "step": 8221 + }, + { + "epoch": 0.701416140590343, + "grad_norm": 1.3391180714634794, + "learning_rate": 2.161425714486286e-05, + "loss": 0.1431, + "step": 8222 + }, + { + "epoch": 0.70150145026446, + "grad_norm": 1.5115641039228023, + "learning_rate": 2.1602885148291473e-05, + "loss": 0.192, + "step": 8223 + }, + { + "epoch": 0.701586759938577, + "grad_norm": 1.4965999533745395, + "learning_rate": 2.1591515319688936e-05, + "loss": 0.1859, + "step": 8224 + }, + { + "epoch": 0.7016720696126941, + "grad_norm": 1.9081089148794625, + "learning_rate": 2.1580147659923212e-05, + "loss": 0.1903, + "step": 8225 + }, + { + "epoch": 0.7017573792868111, + "grad_norm": 1.6199005515574205, + "learning_rate": 2.1568782169862205e-05, + "loss": 0.1843, + "step": 8226 + }, + { + "epoch": 0.7018426889609282, + "grad_norm": 1.7430339860869186, + "learning_rate": 2.1557418850373602e-05, + "loss": 0.15, + "step": 8227 + }, + { + "epoch": 0.7019279986350452, + "grad_norm": 1.5717579276623852, + "learning_rate": 2.1546057702324916e-05, + "loss": 0.1879, + "step": 8228 + }, + { + "epoch": 0.7020133083091623, + "grad_norm": 1.6115024911575402, + "learning_rate": 2.1534698726583524e-05, + "loss": 0.1729, + "step": 8229 + }, + { + "epoch": 0.7020986179832793, + "grad_norm": 1.3101450365163294, + "learning_rate": 2.1523341924016604e-05, + "loss": 0.2024, + "step": 8230 + }, + { + "epoch": 0.7021839276573963, + "grad_norm": 1.445929483226996, + "learning_rate": 2.1511987295491243e-05, + "loss": 0.1664, + "step": 8231 + }, + { + "epoch": 0.7022692373315134, + "grad_norm": 1.8720496807841982, + "learning_rate": 2.1500634841874224e-05, + "loss": 0.2093, + "step": 8232 + }, + { + "epoch": 0.7023545470056305, + "grad_norm": 1.3472164597448135, + "learning_rate": 2.1489284564032308e-05, + "loss": 0.1506, + "step": 8233 + }, + { + "epoch": 0.7024398566797475, + "grad_norm": 1.9912608665653528, + "learning_rate": 2.147793646283201e-05, + "loss": 0.2532, + "step": 8234 + }, + { + "epoch": 0.7025251663538645, + "grad_norm": 1.4968479735417342, + "learning_rate": 2.14665905391397e-05, + "loss": 0.1729, + "step": 8235 + }, + { + "epoch": 0.7026104760279815, + "grad_norm": 1.5452753137756325, + "learning_rate": 2.1455246793821555e-05, + "loss": 0.1614, + "step": 8236 + }, + { + "epoch": 0.7026957857020986, + "grad_norm": 1.8156352057637943, + "learning_rate": 2.144390522774367e-05, + "loss": 0.1784, + "step": 8237 + }, + { + "epoch": 0.7027810953762157, + "grad_norm": 2.082278811281668, + "learning_rate": 2.1432565841771836e-05, + "loss": 0.2356, + "step": 8238 + }, + { + "epoch": 0.7028664050503327, + "grad_norm": 2.2343235676854745, + "learning_rate": 2.142122863677181e-05, + "loss": 0.2217, + "step": 8239 + }, + { + "epoch": 0.7029517147244497, + "grad_norm": 1.5223495902695598, + "learning_rate": 2.1409893613609113e-05, + "loss": 0.1429, + "step": 8240 + }, + { + "epoch": 0.7030370243985667, + "grad_norm": 1.5376192093202021, + "learning_rate": 2.1398560773149105e-05, + "loss": 0.1546, + "step": 8241 + }, + { + "epoch": 0.7031223340726839, + "grad_norm": 1.4713228295514058, + "learning_rate": 2.1387230116257e-05, + "loss": 0.1903, + "step": 8242 + }, + { + "epoch": 0.7032076437468009, + "grad_norm": 1.988251991281528, + "learning_rate": 2.137590164379781e-05, + "loss": 0.2152, + "step": 8243 + }, + { + "epoch": 0.7032929534209179, + "grad_norm": 2.1559893685540557, + "learning_rate": 2.136457535663645e-05, + "loss": 0.2213, + "step": 8244 + }, + { + "epoch": 0.7033782630950349, + "grad_norm": 1.5144908633682246, + "learning_rate": 2.1353251255637563e-05, + "loss": 0.1848, + "step": 8245 + }, + { + "epoch": 0.7034635727691521, + "grad_norm": 1.7712173344757174, + "learning_rate": 2.1341929341665727e-05, + "loss": 0.1717, + "step": 8246 + }, + { + "epoch": 0.7035488824432691, + "grad_norm": 1.4882876578766253, + "learning_rate": 2.1330609615585308e-05, + "loss": 0.1894, + "step": 8247 + }, + { + "epoch": 0.7036341921173861, + "grad_norm": 1.6440143271144472, + "learning_rate": 2.1319292078260483e-05, + "loss": 0.244, + "step": 8248 + }, + { + "epoch": 0.7037195017915031, + "grad_norm": 1.3181337516714804, + "learning_rate": 2.1307976730555306e-05, + "loss": 0.1958, + "step": 8249 + }, + { + "epoch": 0.7038048114656202, + "grad_norm": 1.5806857945767583, + "learning_rate": 2.1296663573333635e-05, + "loss": 0.2274, + "step": 8250 + }, + { + "epoch": 0.7038901211397373, + "grad_norm": 1.6873356774468469, + "learning_rate": 2.1285352607459168e-05, + "loss": 0.2272, + "step": 8251 + }, + { + "epoch": 0.7039754308138543, + "grad_norm": 1.5428211769322064, + "learning_rate": 2.1274043833795426e-05, + "loss": 0.2217, + "step": 8252 + }, + { + "epoch": 0.7040607404879713, + "grad_norm": 1.9532643382914379, + "learning_rate": 2.1262737253205822e-05, + "loss": 0.217, + "step": 8253 + }, + { + "epoch": 0.7041460501620884, + "grad_norm": 1.5434959573251013, + "learning_rate": 2.1251432866553484e-05, + "loss": 0.195, + "step": 8254 + }, + { + "epoch": 0.7042313598362054, + "grad_norm": 1.71072358488109, + "learning_rate": 2.124013067470149e-05, + "loss": 0.263, + "step": 8255 + }, + { + "epoch": 0.7043166695103225, + "grad_norm": 1.8401453374393504, + "learning_rate": 2.1228830678512677e-05, + "loss": 0.2291, + "step": 8256 + }, + { + "epoch": 0.7044019791844395, + "grad_norm": 1.6764813719049305, + "learning_rate": 2.1217532878849787e-05, + "loss": 0.2235, + "step": 8257 + }, + { + "epoch": 0.7044872888585566, + "grad_norm": 1.5754774187480007, + "learning_rate": 2.1206237276575276e-05, + "loss": 0.196, + "step": 8258 + }, + { + "epoch": 0.7045725985326736, + "grad_norm": 1.5155674245064445, + "learning_rate": 2.1194943872551553e-05, + "loss": 0.1824, + "step": 8259 + }, + { + "epoch": 0.7046579082067906, + "grad_norm": 2.046783328733526, + "learning_rate": 2.1183652667640802e-05, + "loss": 0.1862, + "step": 8260 + }, + { + "epoch": 0.7047432178809077, + "grad_norm": 1.831265873238053, + "learning_rate": 2.1172363662705035e-05, + "loss": 0.1583, + "step": 8261 + }, + { + "epoch": 0.7048285275550248, + "grad_norm": 1.4849685285425138, + "learning_rate": 2.1161076858606115e-05, + "loss": 0.2208, + "step": 8262 + }, + { + "epoch": 0.7049138372291418, + "grad_norm": 1.744015952991001, + "learning_rate": 2.1149792256205725e-05, + "loss": 0.1856, + "step": 8263 + }, + { + "epoch": 0.7049991469032588, + "grad_norm": 1.255829996148029, + "learning_rate": 2.1138509856365386e-05, + "loss": 0.1732, + "step": 8264 + }, + { + "epoch": 0.7050844565773758, + "grad_norm": 1.585282815436865, + "learning_rate": 2.1127229659946435e-05, + "loss": 0.1886, + "step": 8265 + }, + { + "epoch": 0.705169766251493, + "grad_norm": 1.4389220957236168, + "learning_rate": 2.11159516678101e-05, + "loss": 0.2149, + "step": 8266 + }, + { + "epoch": 0.70525507592561, + "grad_norm": 1.7964844652454994, + "learning_rate": 2.1104675880817337e-05, + "loss": 0.1712, + "step": 8267 + }, + { + "epoch": 0.705340385599727, + "grad_norm": 1.836841831277727, + "learning_rate": 2.1093402299829036e-05, + "loss": 0.1861, + "step": 8268 + }, + { + "epoch": 0.705425695273844, + "grad_norm": 1.8677275447279373, + "learning_rate": 2.1082130925705857e-05, + "loss": 0.236, + "step": 8269 + }, + { + "epoch": 0.7055110049479612, + "grad_norm": 1.9984786088307835, + "learning_rate": 2.1070861759308315e-05, + "loss": 0.2036, + "step": 8270 + }, + { + "epoch": 0.7055963146220782, + "grad_norm": 1.638085134080076, + "learning_rate": 2.1059594801496745e-05, + "loss": 0.179, + "step": 8271 + }, + { + "epoch": 0.7056816242961952, + "grad_norm": 1.6917735487272934, + "learning_rate": 2.104833005313131e-05, + "loss": 0.2162, + "step": 8272 + }, + { + "epoch": 0.7057669339703122, + "grad_norm": 2.038879386309181, + "learning_rate": 2.1037067515072057e-05, + "loss": 0.2533, + "step": 8273 + }, + { + "epoch": 0.7058522436444293, + "grad_norm": 1.65335657989146, + "learning_rate": 2.1025807188178758e-05, + "loss": 0.171, + "step": 8274 + }, + { + "epoch": 0.7059375533185464, + "grad_norm": 1.392125809284176, + "learning_rate": 2.101454907331113e-05, + "loss": 0.1582, + "step": 8275 + }, + { + "epoch": 0.7060228629926634, + "grad_norm": 1.6494662433001972, + "learning_rate": 2.1003293171328646e-05, + "loss": 0.2322, + "step": 8276 + }, + { + "epoch": 0.7061081726667804, + "grad_norm": 1.2766550523131692, + "learning_rate": 2.0992039483090647e-05, + "loss": 0.1631, + "step": 8277 + }, + { + "epoch": 0.7061934823408974, + "grad_norm": 1.6941513092851068, + "learning_rate": 2.0980788009456264e-05, + "loss": 0.1393, + "step": 8278 + }, + { + "epoch": 0.7062787920150145, + "grad_norm": 1.5834169700350167, + "learning_rate": 2.0969538751284547e-05, + "loss": 0.1879, + "step": 8279 + }, + { + "epoch": 0.7063641016891316, + "grad_norm": 1.6809770468892724, + "learning_rate": 2.095829170943424e-05, + "loss": 0.2527, + "step": 8280 + }, + { + "epoch": 0.7064494113632486, + "grad_norm": 1.8062986637938607, + "learning_rate": 2.0947046884764054e-05, + "loss": 0.1917, + "step": 8281 + }, + { + "epoch": 0.7065347210373656, + "grad_norm": 1.7238691908117707, + "learning_rate": 2.0935804278132448e-05, + "loss": 0.2028, + "step": 8282 + }, + { + "epoch": 0.7066200307114827, + "grad_norm": 1.791653019461295, + "learning_rate": 2.0924563890397743e-05, + "loss": 0.176, + "step": 8283 + }, + { + "epoch": 0.7067053403855997, + "grad_norm": 1.541392619889132, + "learning_rate": 2.0913325722418077e-05, + "loss": 0.1665, + "step": 8284 + }, + { + "epoch": 0.7067906500597168, + "grad_norm": 1.7358563944138892, + "learning_rate": 2.090208977505142e-05, + "loss": 0.2009, + "step": 8285 + }, + { + "epoch": 0.7068759597338338, + "grad_norm": 1.5993125928991831, + "learning_rate": 2.089085604915559e-05, + "loss": 0.2088, + "step": 8286 + }, + { + "epoch": 0.7069612694079509, + "grad_norm": 1.876585290216165, + "learning_rate": 2.087962454558819e-05, + "loss": 0.2364, + "step": 8287 + }, + { + "epoch": 0.7070465790820679, + "grad_norm": 1.4111643645775125, + "learning_rate": 2.0868395265206732e-05, + "loss": 0.1672, + "step": 8288 + }, + { + "epoch": 0.7071318887561849, + "grad_norm": 2.741069163538519, + "learning_rate": 2.085716820886849e-05, + "loss": 0.2268, + "step": 8289 + }, + { + "epoch": 0.707217198430302, + "grad_norm": 2.5292711411275337, + "learning_rate": 2.0845943377430587e-05, + "loss": 0.2551, + "step": 8290 + }, + { + "epoch": 0.7073025081044191, + "grad_norm": 1.4059061689716186, + "learning_rate": 2.0834720771749987e-05, + "loss": 0.2458, + "step": 8291 + }, + { + "epoch": 0.7073878177785361, + "grad_norm": 1.803464532946029, + "learning_rate": 2.0823500392683476e-05, + "loss": 0.2171, + "step": 8292 + }, + { + "epoch": 0.7074731274526531, + "grad_norm": 1.6959342695310524, + "learning_rate": 2.0812282241087662e-05, + "loss": 0.1823, + "step": 8293 + }, + { + "epoch": 0.7075584371267701, + "grad_norm": 1.804463426294902, + "learning_rate": 2.0801066317818973e-05, + "loss": 0.1919, + "step": 8294 + }, + { + "epoch": 0.7076437468008873, + "grad_norm": 1.9416330615670572, + "learning_rate": 2.0789852623733745e-05, + "loss": 0.1561, + "step": 8295 + }, + { + "epoch": 0.7077290564750043, + "grad_norm": 1.7881178048344757, + "learning_rate": 2.0778641159688018e-05, + "loss": 0.1987, + "step": 8296 + }, + { + "epoch": 0.7078143661491213, + "grad_norm": 1.7310853294527915, + "learning_rate": 2.0767431926537766e-05, + "loss": 0.1725, + "step": 8297 + }, + { + "epoch": 0.7078996758232383, + "grad_norm": 2.116485817259507, + "learning_rate": 2.0756224925138752e-05, + "loss": 0.2512, + "step": 8298 + }, + { + "epoch": 0.7079849854973554, + "grad_norm": 1.8124701398823342, + "learning_rate": 2.0745020156346558e-05, + "loss": 0.1862, + "step": 8299 + }, + { + "epoch": 0.7080702951714725, + "grad_norm": 2.11824364026836, + "learning_rate": 2.07338176210166e-05, + "loss": 0.2124, + "step": 8300 + }, + { + "epoch": 0.7081556048455895, + "grad_norm": 1.394800795890137, + "learning_rate": 2.0722617320004162e-05, + "loss": 0.2213, + "step": 8301 + }, + { + "epoch": 0.7082409145197065, + "grad_norm": 1.32575570993994, + "learning_rate": 2.071141925416431e-05, + "loss": 0.2297, + "step": 8302 + }, + { + "epoch": 0.7083262241938236, + "grad_norm": 1.7235510614908287, + "learning_rate": 2.070022342435196e-05, + "loss": 0.1654, + "step": 8303 + }, + { + "epoch": 0.7084115338679406, + "grad_norm": 1.7614468451391692, + "learning_rate": 2.0689029831421856e-05, + "loss": 0.1918, + "step": 8304 + }, + { + "epoch": 0.7084968435420577, + "grad_norm": 2.121787728718631, + "learning_rate": 2.067783847622856e-05, + "loss": 0.2354, + "step": 8305 + }, + { + "epoch": 0.7085821532161747, + "grad_norm": 1.3662615446979791, + "learning_rate": 2.066664935962649e-05, + "loss": 0.225, + "step": 8306 + }, + { + "epoch": 0.7086674628902918, + "grad_norm": 1.8396483234519287, + "learning_rate": 2.0655462482469833e-05, + "loss": 0.1953, + "step": 8307 + }, + { + "epoch": 0.7087527725644088, + "grad_norm": 1.5089108908642033, + "learning_rate": 2.064427784561272e-05, + "loss": 0.2183, + "step": 8308 + }, + { + "epoch": 0.7088380822385258, + "grad_norm": 1.6417005374080125, + "learning_rate": 2.0633095449908964e-05, + "loss": 0.1954, + "step": 8309 + }, + { + "epoch": 0.7089233919126429, + "grad_norm": 1.811694130477329, + "learning_rate": 2.062191529621233e-05, + "loss": 0.2061, + "step": 8310 + }, + { + "epoch": 0.70900870158676, + "grad_norm": 1.7481674408491645, + "learning_rate": 2.061073738537635e-05, + "loss": 0.2078, + "step": 8311 + }, + { + "epoch": 0.709094011260877, + "grad_norm": 1.6528166687109167, + "learning_rate": 2.0599561718254397e-05, + "loss": 0.1788, + "step": 8312 + }, + { + "epoch": 0.709179320934994, + "grad_norm": 1.3746910181540328, + "learning_rate": 2.0588388295699668e-05, + "loss": 0.1761, + "step": 8313 + }, + { + "epoch": 0.709264630609111, + "grad_norm": 1.861633852080652, + "learning_rate": 2.0577217118565202e-05, + "loss": 0.1629, + "step": 8314 + }, + { + "epoch": 0.7093499402832281, + "grad_norm": 1.8463617996323476, + "learning_rate": 2.0566048187703857e-05, + "loss": 0.1816, + "step": 8315 + }, + { + "epoch": 0.7094352499573452, + "grad_norm": 1.560366891118849, + "learning_rate": 2.05548815039683e-05, + "loss": 0.2009, + "step": 8316 + }, + { + "epoch": 0.7095205596314622, + "grad_norm": 2.417439096151101, + "learning_rate": 2.0543717068211093e-05, + "loss": 0.2598, + "step": 8317 + }, + { + "epoch": 0.7096058693055792, + "grad_norm": 1.5659752916454939, + "learning_rate": 2.0532554881284555e-05, + "loss": 0.1897, + "step": 8318 + }, + { + "epoch": 0.7096911789796962, + "grad_norm": 1.6956065401955882, + "learning_rate": 2.0521394944040856e-05, + "loss": 0.2199, + "step": 8319 + }, + { + "epoch": 0.7097764886538134, + "grad_norm": 1.43210077111811, + "learning_rate": 2.0510237257331994e-05, + "loss": 0.2203, + "step": 8320 + }, + { + "epoch": 0.7098617983279304, + "grad_norm": 1.7432353200158972, + "learning_rate": 2.0499081822009842e-05, + "loss": 0.2083, + "step": 8321 + }, + { + "epoch": 0.7099471080020474, + "grad_norm": 1.8961748367530997, + "learning_rate": 2.048792863892599e-05, + "loss": 0.1942, + "step": 8322 + }, + { + "epoch": 0.7100324176761644, + "grad_norm": 1.7660495618948437, + "learning_rate": 2.0476777708931978e-05, + "loss": 0.2129, + "step": 8323 + }, + { + "epoch": 0.7101177273502816, + "grad_norm": 1.3049147173905564, + "learning_rate": 2.0465629032879097e-05, + "loss": 0.1832, + "step": 8324 + }, + { + "epoch": 0.7102030370243986, + "grad_norm": 1.4515212859019717, + "learning_rate": 2.0454482611618496e-05, + "loss": 0.2125, + "step": 8325 + }, + { + "epoch": 0.7102883466985156, + "grad_norm": 1.9194498034762946, + "learning_rate": 2.044333844600114e-05, + "loss": 0.2439, + "step": 8326 + }, + { + "epoch": 0.7103736563726326, + "grad_norm": 1.4619402376633408, + "learning_rate": 2.043219653687784e-05, + "loss": 0.177, + "step": 8327 + }, + { + "epoch": 0.7104589660467497, + "grad_norm": 1.720575230464435, + "learning_rate": 2.0421056885099204e-05, + "loss": 0.2056, + "step": 8328 + }, + { + "epoch": 0.7105442757208668, + "grad_norm": 1.6777704699078548, + "learning_rate": 2.0409919491515677e-05, + "loss": 0.1603, + "step": 8329 + }, + { + "epoch": 0.7106295853949838, + "grad_norm": 2.530101845391402, + "learning_rate": 2.0398784356977585e-05, + "loss": 0.2615, + "step": 8330 + }, + { + "epoch": 0.7107148950691008, + "grad_norm": 1.7279839511585244, + "learning_rate": 2.038765148233498e-05, + "loss": 0.1957, + "step": 8331 + }, + { + "epoch": 0.7108002047432179, + "grad_norm": 1.570614989643856, + "learning_rate": 2.0376520868437838e-05, + "loss": 0.1529, + "step": 8332 + }, + { + "epoch": 0.7108855144173349, + "grad_norm": 1.8800261207551558, + "learning_rate": 2.0365392516135906e-05, + "loss": 0.192, + "step": 8333 + }, + { + "epoch": 0.710970824091452, + "grad_norm": 1.809660034919842, + "learning_rate": 2.035426642627878e-05, + "loss": 0.2323, + "step": 8334 + }, + { + "epoch": 0.711056133765569, + "grad_norm": 2.1551400244021903, + "learning_rate": 2.0343142599715874e-05, + "loss": 0.2594, + "step": 8335 + }, + { + "epoch": 0.7111414434396861, + "grad_norm": 1.560284531638903, + "learning_rate": 2.0332021037296418e-05, + "loss": 0.1817, + "step": 8336 + }, + { + "epoch": 0.7112267531138031, + "grad_norm": 1.541943095744232, + "learning_rate": 2.0320901739869537e-05, + "loss": 0.2149, + "step": 8337 + }, + { + "epoch": 0.7113120627879201, + "grad_norm": 1.6906476511116533, + "learning_rate": 2.0309784708284058e-05, + "loss": 0.2216, + "step": 8338 + }, + { + "epoch": 0.7113973724620372, + "grad_norm": 1.7409841718222612, + "learning_rate": 2.0298669943388753e-05, + "loss": 0.2068, + "step": 8339 + }, + { + "epoch": 0.7114826821361543, + "grad_norm": 1.6481056783365962, + "learning_rate": 2.0287557446032172e-05, + "loss": 0.166, + "step": 8340 + }, + { + "epoch": 0.7115679918102713, + "grad_norm": 1.3427414768923651, + "learning_rate": 2.027644721706269e-05, + "loss": 0.2084, + "step": 8341 + }, + { + "epoch": 0.7116533014843883, + "grad_norm": 1.3996354411809666, + "learning_rate": 2.026533925732849e-05, + "loss": 0.1613, + "step": 8342 + }, + { + "epoch": 0.7117386111585053, + "grad_norm": 1.9574768411915553, + "learning_rate": 2.0254233567677666e-05, + "loss": 0.1773, + "step": 8343 + }, + { + "epoch": 0.7118239208326225, + "grad_norm": 1.6014718378969093, + "learning_rate": 2.0243130148958006e-05, + "loss": 0.1524, + "step": 8344 + }, + { + "epoch": 0.7119092305067395, + "grad_norm": 2.1867037822705857, + "learning_rate": 2.0232029002017244e-05, + "loss": 0.2576, + "step": 8345 + }, + { + "epoch": 0.7119945401808565, + "grad_norm": 1.8133470594830947, + "learning_rate": 2.0220930127702885e-05, + "loss": 0.1971, + "step": 8346 + }, + { + "epoch": 0.7120798498549735, + "grad_norm": 1.9147701278845877, + "learning_rate": 2.0209833526862267e-05, + "loss": 0.2503, + "step": 8347 + }, + { + "epoch": 0.7121651595290907, + "grad_norm": 1.4716542525900096, + "learning_rate": 2.0198739200342547e-05, + "loss": 0.1751, + "step": 8348 + }, + { + "epoch": 0.7122504692032077, + "grad_norm": 1.586610322494136, + "learning_rate": 2.0187647148990712e-05, + "loss": 0.2376, + "step": 8349 + }, + { + "epoch": 0.7123357788773247, + "grad_norm": 1.865630773794163, + "learning_rate": 2.017655737365363e-05, + "loss": 0.2384, + "step": 8350 + }, + { + "epoch": 0.7124210885514417, + "grad_norm": 1.5033000566531127, + "learning_rate": 2.0165469875177874e-05, + "loss": 0.2632, + "step": 8351 + }, + { + "epoch": 0.7125063982255587, + "grad_norm": 2.0779750454370784, + "learning_rate": 2.0154384654409975e-05, + "loss": 0.156, + "step": 8352 + }, + { + "epoch": 0.7125917078996759, + "grad_norm": 1.3401338580158748, + "learning_rate": 2.01433017121962e-05, + "loss": 0.165, + "step": 8353 + }, + { + "epoch": 0.7126770175737929, + "grad_norm": 1.91503109460989, + "learning_rate": 2.013222104938269e-05, + "loss": 0.2016, + "step": 8354 + }, + { + "epoch": 0.7127623272479099, + "grad_norm": 2.0211556429946778, + "learning_rate": 2.012114266681538e-05, + "loss": 0.17, + "step": 8355 + }, + { + "epoch": 0.7128476369220269, + "grad_norm": 2.2393265924483825, + "learning_rate": 2.011006656534005e-05, + "loss": 0.2324, + "step": 8356 + }, + { + "epoch": 0.712932946596144, + "grad_norm": 1.2766409991068695, + "learning_rate": 2.009899274580231e-05, + "loss": 0.1539, + "step": 8357 + }, + { + "epoch": 0.713018256270261, + "grad_norm": 1.7374002002623292, + "learning_rate": 2.0087921209047554e-05, + "loss": 0.185, + "step": 8358 + }, + { + "epoch": 0.7131035659443781, + "grad_norm": 2.081341999758448, + "learning_rate": 2.00768519559211e-05, + "loss": 0.174, + "step": 8359 + }, + { + "epoch": 0.7131888756184951, + "grad_norm": 1.341159808138533, + "learning_rate": 2.0065784987267956e-05, + "loss": 0.2153, + "step": 8360 + }, + { + "epoch": 0.7132741852926122, + "grad_norm": 1.5191860248101665, + "learning_rate": 2.005472030393307e-05, + "loss": 0.1896, + "step": 8361 + }, + { + "epoch": 0.7133594949667292, + "grad_norm": 1.7327728432055116, + "learning_rate": 2.0043657906761142e-05, + "loss": 0.1675, + "step": 8362 + }, + { + "epoch": 0.7134448046408463, + "grad_norm": 2.1103543868921983, + "learning_rate": 2.0032597796596788e-05, + "loss": 0.2249, + "step": 8363 + }, + { + "epoch": 0.7135301143149633, + "grad_norm": 1.9029441058499363, + "learning_rate": 2.0021539974284304e-05, + "loss": 0.2409, + "step": 8364 + }, + { + "epoch": 0.7136154239890804, + "grad_norm": 2.3163778422320047, + "learning_rate": 2.0010484440667953e-05, + "loss": 0.2252, + "step": 8365 + }, + { + "epoch": 0.7137007336631974, + "grad_norm": 2.554075861981892, + "learning_rate": 1.9999431196591755e-05, + "loss": 0.2251, + "step": 8366 + }, + { + "epoch": 0.7137860433373144, + "grad_norm": 1.4968140464270432, + "learning_rate": 1.998838024289956e-05, + "loss": 0.2253, + "step": 8367 + }, + { + "epoch": 0.7138713530114315, + "grad_norm": 1.7270572005420353, + "learning_rate": 1.9977331580435054e-05, + "loss": 0.2889, + "step": 8368 + }, + { + "epoch": 0.7139566626855486, + "grad_norm": 2.0342272726821427, + "learning_rate": 1.996628521004174e-05, + "loss": 0.1825, + "step": 8369 + }, + { + "epoch": 0.7140419723596656, + "grad_norm": 1.6445610424251735, + "learning_rate": 1.995524113256295e-05, + "loss": 0.1387, + "step": 8370 + }, + { + "epoch": 0.7141272820337826, + "grad_norm": 1.29045907863693, + "learning_rate": 1.994419934884183e-05, + "loss": 0.1687, + "step": 8371 + }, + { + "epoch": 0.7142125917078996, + "grad_norm": 1.829739631933053, + "learning_rate": 1.9933159859721408e-05, + "loss": 0.1784, + "step": 8372 + }, + { + "epoch": 0.7142979013820168, + "grad_norm": 1.5224465301595405, + "learning_rate": 1.9922122666044413e-05, + "loss": 0.1936, + "step": 8373 + }, + { + "epoch": 0.7143832110561338, + "grad_norm": 1.3846981317296652, + "learning_rate": 1.991108776865354e-05, + "loss": 0.1957, + "step": 8374 + }, + { + "epoch": 0.7144685207302508, + "grad_norm": 1.5225804192111128, + "learning_rate": 1.9900055168391224e-05, + "loss": 0.1313, + "step": 8375 + }, + { + "epoch": 0.7145538304043678, + "grad_norm": 1.8941522907472919, + "learning_rate": 1.9889024866099748e-05, + "loss": 0.2172, + "step": 8376 + }, + { + "epoch": 0.714639140078485, + "grad_norm": 1.6621615875685722, + "learning_rate": 1.9877996862621207e-05, + "loss": 0.223, + "step": 8377 + }, + { + "epoch": 0.714724449752602, + "grad_norm": 1.41172428605008, + "learning_rate": 1.9866971158797528e-05, + "loss": 0.2033, + "step": 8378 + }, + { + "epoch": 0.714809759426719, + "grad_norm": 1.4109304315975053, + "learning_rate": 1.9855947755470504e-05, + "loss": 0.1752, + "step": 8379 + }, + { + "epoch": 0.714895069100836, + "grad_norm": 1.6798470354833115, + "learning_rate": 1.9844926653481648e-05, + "loss": 0.2212, + "step": 8380 + }, + { + "epoch": 0.7149803787749531, + "grad_norm": 1.703456321476564, + "learning_rate": 1.9833907853672417e-05, + "loss": 0.2188, + "step": 8381 + }, + { + "epoch": 0.7150656884490701, + "grad_norm": 1.566052099513456, + "learning_rate": 1.9822891356884022e-05, + "loss": 0.2034, + "step": 8382 + }, + { + "epoch": 0.7151509981231872, + "grad_norm": 1.8168123181018294, + "learning_rate": 1.981187716395751e-05, + "loss": 0.1411, + "step": 8383 + }, + { + "epoch": 0.7152363077973042, + "grad_norm": 1.797142473299192, + "learning_rate": 1.9800865275733737e-05, + "loss": 0.2459, + "step": 8384 + }, + { + "epoch": 0.7153216174714213, + "grad_norm": 1.8008729989930432, + "learning_rate": 1.9789855693053456e-05, + "loss": 0.2125, + "step": 8385 + }, + { + "epoch": 0.7154069271455383, + "grad_norm": 1.603159931241022, + "learning_rate": 1.9778848416757123e-05, + "loss": 0.1676, + "step": 8386 + }, + { + "epoch": 0.7154922368196553, + "grad_norm": 1.7498446123073526, + "learning_rate": 1.9767843447685137e-05, + "loss": 0.3673, + "step": 8387 + }, + { + "epoch": 0.7155775464937724, + "grad_norm": 2.1290247332998367, + "learning_rate": 1.9756840786677648e-05, + "loss": 0.1928, + "step": 8388 + }, + { + "epoch": 0.7156628561678895, + "grad_norm": 1.817167979666121, + "learning_rate": 1.9745840434574654e-05, + "loss": 0.1664, + "step": 8389 + }, + { + "epoch": 0.7157481658420065, + "grad_norm": 1.503851872241345, + "learning_rate": 1.9734842392215975e-05, + "loss": 0.1753, + "step": 8390 + }, + { + "epoch": 0.7158334755161235, + "grad_norm": 1.692863558963438, + "learning_rate": 1.972384666044123e-05, + "loss": 0.1768, + "step": 8391 + }, + { + "epoch": 0.7159187851902405, + "grad_norm": 2.392028197323257, + "learning_rate": 1.971285324008994e-05, + "loss": 0.2321, + "step": 8392 + }, + { + "epoch": 0.7160040948643576, + "grad_norm": 1.6815075616771449, + "learning_rate": 1.9701862132001324e-05, + "loss": 0.1981, + "step": 8393 + }, + { + "epoch": 0.7160894045384747, + "grad_norm": 1.4093772025418887, + "learning_rate": 1.969087333701455e-05, + "loss": 0.2027, + "step": 8394 + }, + { + "epoch": 0.7161747142125917, + "grad_norm": 1.640647088764993, + "learning_rate": 1.967988685596853e-05, + "loss": 0.2553, + "step": 8395 + }, + { + "epoch": 0.7162600238867087, + "grad_norm": 1.6035123539286065, + "learning_rate": 1.9668902689702035e-05, + "loss": 0.2067, + "step": 8396 + }, + { + "epoch": 0.7163453335608257, + "grad_norm": 1.9481579908184914, + "learning_rate": 1.9657920839053634e-05, + "loss": 0.1613, + "step": 8397 + }, + { + "epoch": 0.7164306432349429, + "grad_norm": 1.784122426450973, + "learning_rate": 1.9646941304861742e-05, + "loss": 0.1903, + "step": 8398 + }, + { + "epoch": 0.7165159529090599, + "grad_norm": 2.0217921810492916, + "learning_rate": 1.9635964087964586e-05, + "loss": 0.1486, + "step": 8399 + }, + { + "epoch": 0.7166012625831769, + "grad_norm": 1.8542799182837368, + "learning_rate": 1.96249891892002e-05, + "loss": 0.1356, + "step": 8400 + }, + { + "epoch": 0.7166865722572939, + "grad_norm": 1.5210420212768982, + "learning_rate": 1.961401660940651e-05, + "loss": 0.2213, + "step": 8401 + }, + { + "epoch": 0.7167718819314111, + "grad_norm": 1.8119817847963697, + "learning_rate": 1.9603046349421146e-05, + "loss": 0.1775, + "step": 8402 + }, + { + "epoch": 0.7168571916055281, + "grad_norm": 1.9213850435785265, + "learning_rate": 1.9592078410081682e-05, + "loss": 0.2218, + "step": 8403 + }, + { + "epoch": 0.7169425012796451, + "grad_norm": 2.0395217319910253, + "learning_rate": 1.958111279222544e-05, + "loss": 0.2152, + "step": 8404 + }, + { + "epoch": 0.7170278109537621, + "grad_norm": 2.033422394141366, + "learning_rate": 1.95701494966896e-05, + "loss": 0.2437, + "step": 8405 + }, + { + "epoch": 0.7171131206278792, + "grad_norm": 1.7914591232861758, + "learning_rate": 1.955918852431112e-05, + "loss": 0.1746, + "step": 8406 + }, + { + "epoch": 0.7171984303019963, + "grad_norm": 1.4704773667561046, + "learning_rate": 1.954822987592685e-05, + "loss": 0.221, + "step": 8407 + }, + { + "epoch": 0.7172837399761133, + "grad_norm": 1.8455402282199842, + "learning_rate": 1.953727355237341e-05, + "loss": 0.2285, + "step": 8408 + }, + { + "epoch": 0.7173690496502303, + "grad_norm": 2.2045376624673065, + "learning_rate": 1.9526319554487247e-05, + "loss": 0.1122, + "step": 8409 + }, + { + "epoch": 0.7174543593243474, + "grad_norm": 1.7066803665405998, + "learning_rate": 1.9515367883104658e-05, + "loss": 0.1308, + "step": 8410 + }, + { + "epoch": 0.7175396689984644, + "grad_norm": 1.7874068549369737, + "learning_rate": 1.9504418539061737e-05, + "loss": 0.1671, + "step": 8411 + }, + { + "epoch": 0.7176249786725815, + "grad_norm": 1.537112322056816, + "learning_rate": 1.9493471523194402e-05, + "loss": 0.2272, + "step": 8412 + }, + { + "epoch": 0.7177102883466985, + "grad_norm": 1.9381756988655212, + "learning_rate": 1.9482526836338387e-05, + "loss": 0.1638, + "step": 8413 + }, + { + "epoch": 0.7177955980208156, + "grad_norm": 1.4954429386212365, + "learning_rate": 1.947158447932932e-05, + "loss": 0.1761, + "step": 8414 + }, + { + "epoch": 0.7178809076949326, + "grad_norm": 1.7213200168040352, + "learning_rate": 1.946064445300251e-05, + "loss": 0.1793, + "step": 8415 + }, + { + "epoch": 0.7179662173690496, + "grad_norm": 1.2991404681269028, + "learning_rate": 1.9449706758193232e-05, + "loss": 0.1704, + "step": 8416 + }, + { + "epoch": 0.7180515270431667, + "grad_norm": 1.7960230922531297, + "learning_rate": 1.9438771395736495e-05, + "loss": 0.1862, + "step": 8417 + }, + { + "epoch": 0.7181368367172838, + "grad_norm": 2.0366491308045274, + "learning_rate": 1.942783836646716e-05, + "loss": 0.1926, + "step": 8418 + }, + { + "epoch": 0.7182221463914008, + "grad_norm": 1.4905108874005566, + "learning_rate": 1.94169076712199e-05, + "loss": 0.2271, + "step": 8419 + }, + { + "epoch": 0.7183074560655178, + "grad_norm": 1.5265327505377524, + "learning_rate": 1.9405979310829232e-05, + "loss": 0.2033, + "step": 8420 + }, + { + "epoch": 0.7183927657396348, + "grad_norm": 1.640669031914705, + "learning_rate": 1.9395053286129462e-05, + "loss": 0.1708, + "step": 8421 + }, + { + "epoch": 0.718478075413752, + "grad_norm": 3.7451100574410616, + "learning_rate": 1.938412959795472e-05, + "loss": 0.2014, + "step": 8422 + }, + { + "epoch": 0.718563385087869, + "grad_norm": 1.5917200362493422, + "learning_rate": 1.9373208247139008e-05, + "loss": 0.2106, + "step": 8423 + }, + { + "epoch": 0.718648694761986, + "grad_norm": 1.9259735073355115, + "learning_rate": 1.93622892345161e-05, + "loss": 0.1976, + "step": 8424 + }, + { + "epoch": 0.718734004436103, + "grad_norm": 2.067907008339979, + "learning_rate": 1.9351372560919596e-05, + "loss": 0.1881, + "step": 8425 + }, + { + "epoch": 0.7188193141102202, + "grad_norm": 1.9142961009197246, + "learning_rate": 1.9340458227182918e-05, + "loss": 0.1895, + "step": 8426 + }, + { + "epoch": 0.7189046237843372, + "grad_norm": 1.510765780737516, + "learning_rate": 1.9329546234139356e-05, + "loss": 0.2326, + "step": 8427 + }, + { + "epoch": 0.7189899334584542, + "grad_norm": 1.5387224315463768, + "learning_rate": 1.931863658262193e-05, + "loss": 0.1623, + "step": 8428 + }, + { + "epoch": 0.7190752431325712, + "grad_norm": 1.5526711121153316, + "learning_rate": 1.930772927346357e-05, + "loss": 0.2175, + "step": 8429 + }, + { + "epoch": 0.7191605528066882, + "grad_norm": 1.5870890828965967, + "learning_rate": 1.9296824307496992e-05, + "loss": 0.2146, + "step": 8430 + }, + { + "epoch": 0.7192458624808054, + "grad_norm": 1.5687296467102843, + "learning_rate": 1.9285921685554713e-05, + "loss": 0.1154, + "step": 8431 + }, + { + "epoch": 0.7193311721549224, + "grad_norm": 1.8558105153995068, + "learning_rate": 1.92750214084691e-05, + "loss": 0.1843, + "step": 8432 + }, + { + "epoch": 0.7194164818290394, + "grad_norm": 1.976137500048491, + "learning_rate": 1.926412347707233e-05, + "loss": 0.2379, + "step": 8433 + }, + { + "epoch": 0.7195017915031564, + "grad_norm": 1.5675247270741899, + "learning_rate": 1.9253227892196406e-05, + "loss": 0.2305, + "step": 8434 + }, + { + "epoch": 0.7195871011772735, + "grad_norm": 1.8597727117822076, + "learning_rate": 1.9242334654673124e-05, + "loss": 0.1512, + "step": 8435 + }, + { + "epoch": 0.7196724108513906, + "grad_norm": 2.2351779162021166, + "learning_rate": 1.923144376533416e-05, + "loss": 0.2374, + "step": 8436 + }, + { + "epoch": 0.7197577205255076, + "grad_norm": 1.7995061435324309, + "learning_rate": 1.9220555225010966e-05, + "loss": 0.1966, + "step": 8437 + }, + { + "epoch": 0.7198430301996246, + "grad_norm": 1.9603800798245852, + "learning_rate": 1.9209669034534816e-05, + "loss": 0.1908, + "step": 8438 + }, + { + "epoch": 0.7199283398737417, + "grad_norm": 2.001359715787019, + "learning_rate": 1.9198785194736817e-05, + "loss": 0.1996, + "step": 8439 + }, + { + "epoch": 0.7200136495478587, + "grad_norm": 1.882583857539184, + "learning_rate": 1.9187903706447892e-05, + "loss": 0.214, + "step": 8440 + }, + { + "epoch": 0.7200989592219758, + "grad_norm": 1.7331047561514215, + "learning_rate": 1.917702457049878e-05, + "loss": 0.2262, + "step": 8441 + }, + { + "epoch": 0.7201842688960928, + "grad_norm": 2.438801149176999, + "learning_rate": 1.9166147787720036e-05, + "loss": 0.2331, + "step": 8442 + }, + { + "epoch": 0.7202695785702099, + "grad_norm": 1.64352088752938, + "learning_rate": 1.915527335894209e-05, + "loss": 0.1968, + "step": 8443 + }, + { + "epoch": 0.7203548882443269, + "grad_norm": 1.5873642690710248, + "learning_rate": 1.9144401284995072e-05, + "loss": 0.1822, + "step": 8444 + }, + { + "epoch": 0.7204401979184439, + "grad_norm": 2.125134407729433, + "learning_rate": 1.9133531566709078e-05, + "loss": 0.1564, + "step": 8445 + }, + { + "epoch": 0.720525507592561, + "grad_norm": 1.896279383854787, + "learning_rate": 1.912266420491392e-05, + "loss": 0.2006, + "step": 8446 + }, + { + "epoch": 0.7206108172666781, + "grad_norm": 1.3179445419394746, + "learning_rate": 1.9111799200439267e-05, + "loss": 0.2056, + "step": 8447 + }, + { + "epoch": 0.7206961269407951, + "grad_norm": 1.4403504016844542, + "learning_rate": 1.910093655411459e-05, + "loss": 0.1601, + "step": 8448 + }, + { + "epoch": 0.7207814366149121, + "grad_norm": 1.8626176784129047, + "learning_rate": 1.9090076266769245e-05, + "loss": 0.2439, + "step": 8449 + }, + { + "epoch": 0.7208667462890291, + "grad_norm": 1.2670333937931306, + "learning_rate": 1.9079218339232285e-05, + "loss": 0.1866, + "step": 8450 + }, + { + "epoch": 0.7209520559631463, + "grad_norm": 1.630595330889205, + "learning_rate": 1.906836277233271e-05, + "loss": 0.1377, + "step": 8451 + }, + { + "epoch": 0.7210373656372633, + "grad_norm": 1.736525911581053, + "learning_rate": 1.9057509566899266e-05, + "loss": 0.1324, + "step": 8452 + }, + { + "epoch": 0.7211226753113803, + "grad_norm": 1.9593296830626903, + "learning_rate": 1.904665872376054e-05, + "loss": 0.2029, + "step": 8453 + }, + { + "epoch": 0.7212079849854973, + "grad_norm": 1.6081731067493863, + "learning_rate": 1.9035810243744934e-05, + "loss": 0.1763, + "step": 8454 + }, + { + "epoch": 0.7212932946596144, + "grad_norm": 2.6349950058544107, + "learning_rate": 1.9024964127680655e-05, + "loss": 0.2199, + "step": 8455 + }, + { + "epoch": 0.7213786043337315, + "grad_norm": 1.491045529847921, + "learning_rate": 1.90141203763958e-05, + "loss": 0.1844, + "step": 8456 + }, + { + "epoch": 0.7214639140078485, + "grad_norm": 1.4644066730742138, + "learning_rate": 1.900327899071816e-05, + "loss": 0.2029, + "step": 8457 + }, + { + "epoch": 0.7215492236819655, + "grad_norm": 2.3743438818273868, + "learning_rate": 1.8992439971475468e-05, + "loss": 0.2262, + "step": 8458 + }, + { + "epoch": 0.7216345333560826, + "grad_norm": 1.5692909675806381, + "learning_rate": 1.8981603319495213e-05, + "loss": 0.2282, + "step": 8459 + }, + { + "epoch": 0.7217198430301996, + "grad_norm": 2.0270142039331587, + "learning_rate": 1.8970769035604708e-05, + "loss": 0.2754, + "step": 8460 + }, + { + "epoch": 0.7218051527043167, + "grad_norm": 1.2512142006309694, + "learning_rate": 1.89599371206311e-05, + "loss": 0.2249, + "step": 8461 + }, + { + "epoch": 0.7218904623784337, + "grad_norm": 1.416492040006528, + "learning_rate": 1.8949107575401347e-05, + "loss": 0.2319, + "step": 8462 + }, + { + "epoch": 0.7219757720525508, + "grad_norm": 1.0246557308855242, + "learning_rate": 1.893828040074223e-05, + "loss": 0.1298, + "step": 8463 + }, + { + "epoch": 0.7220610817266678, + "grad_norm": 1.4830677902048233, + "learning_rate": 1.892745559748032e-05, + "loss": 0.1618, + "step": 8464 + }, + { + "epoch": 0.7221463914007848, + "grad_norm": 1.7371153618426531, + "learning_rate": 1.8916633166442094e-05, + "loss": 0.2054, + "step": 8465 + }, + { + "epoch": 0.7222317010749019, + "grad_norm": 2.0275535401449445, + "learning_rate": 1.8905813108453712e-05, + "loss": 0.217, + "step": 8466 + }, + { + "epoch": 0.7223170107490189, + "grad_norm": 2.3478376220106476, + "learning_rate": 1.8894995424341285e-05, + "loss": 0.2272, + "step": 8467 + }, + { + "epoch": 0.722402320423136, + "grad_norm": 1.3857418864104016, + "learning_rate": 1.8884180114930644e-05, + "loss": 0.2157, + "step": 8468 + }, + { + "epoch": 0.722487630097253, + "grad_norm": 1.324047156274955, + "learning_rate": 1.8873367181047537e-05, + "loss": 0.1888, + "step": 8469 + }, + { + "epoch": 0.72257293977137, + "grad_norm": 1.599850218437822, + "learning_rate": 1.8862556623517406e-05, + "loss": 0.2087, + "step": 8470 + }, + { + "epoch": 0.7226582494454871, + "grad_norm": 1.6589757694086757, + "learning_rate": 1.8851748443165628e-05, + "loss": 0.1557, + "step": 8471 + }, + { + "epoch": 0.7227435591196042, + "grad_norm": 2.0806190611186457, + "learning_rate": 1.8840942640817338e-05, + "loss": 0.1627, + "step": 8472 + }, + { + "epoch": 0.7228288687937212, + "grad_norm": 1.8470197875817096, + "learning_rate": 1.8830139217297498e-05, + "loss": 0.2358, + "step": 8473 + }, + { + "epoch": 0.7229141784678382, + "grad_norm": 2.045271966472821, + "learning_rate": 1.881933817343089e-05, + "loss": 0.1621, + "step": 8474 + }, + { + "epoch": 0.7229994881419552, + "grad_norm": 1.5505613848588298, + "learning_rate": 1.8808539510042124e-05, + "loss": 0.1588, + "step": 8475 + }, + { + "epoch": 0.7230847978160724, + "grad_norm": 1.6996728750652774, + "learning_rate": 1.879774322795561e-05, + "loss": 0.1655, + "step": 8476 + }, + { + "epoch": 0.7231701074901894, + "grad_norm": 1.6213040502710443, + "learning_rate": 1.8786949327995574e-05, + "loss": 0.1723, + "step": 8477 + }, + { + "epoch": 0.7232554171643064, + "grad_norm": 1.8709574671671092, + "learning_rate": 1.877615781098613e-05, + "loss": 0.2313, + "step": 8478 + }, + { + "epoch": 0.7233407268384234, + "grad_norm": 2.0868603668659524, + "learning_rate": 1.8765368677751072e-05, + "loss": 0.1776, + "step": 8479 + }, + { + "epoch": 0.7234260365125406, + "grad_norm": 1.7037660284820713, + "learning_rate": 1.8754581929114156e-05, + "loss": 0.2476, + "step": 8480 + }, + { + "epoch": 0.7235113461866576, + "grad_norm": 2.5547765313829656, + "learning_rate": 1.8743797565898873e-05, + "loss": 0.2266, + "step": 8481 + }, + { + "epoch": 0.7235966558607746, + "grad_norm": 1.961481999034873, + "learning_rate": 1.873301558892855e-05, + "loss": 0.2393, + "step": 8482 + }, + { + "epoch": 0.7236819655348916, + "grad_norm": 2.042068428940456, + "learning_rate": 1.8722235999026332e-05, + "loss": 0.1767, + "step": 8483 + }, + { + "epoch": 0.7237672752090087, + "grad_norm": 2.1162999393634188, + "learning_rate": 1.8711458797015174e-05, + "loss": 0.21, + "step": 8484 + }, + { + "epoch": 0.7238525848831258, + "grad_norm": 1.8659003382048671, + "learning_rate": 1.8700683983717897e-05, + "loss": 0.276, + "step": 8485 + }, + { + "epoch": 0.7239378945572428, + "grad_norm": 2.3584876002167103, + "learning_rate": 1.8689911559957048e-05, + "loss": 0.1884, + "step": 8486 + }, + { + "epoch": 0.7240232042313598, + "grad_norm": 2.7345332508614955, + "learning_rate": 1.8679141526555078e-05, + "loss": 0.2646, + "step": 8487 + }, + { + "epoch": 0.7241085139054769, + "grad_norm": 1.439059034047214, + "learning_rate": 1.8668373884334217e-05, + "loss": 0.1579, + "step": 8488 + }, + { + "epoch": 0.7241938235795939, + "grad_norm": 1.4297727132114713, + "learning_rate": 1.8657608634116512e-05, + "loss": 0.21, + "step": 8489 + }, + { + "epoch": 0.724279133253711, + "grad_norm": 1.8241213786830912, + "learning_rate": 1.864684577672382e-05, + "loss": 0.1924, + "step": 8490 + }, + { + "epoch": 0.724364442927828, + "grad_norm": 1.7980626700812046, + "learning_rate": 1.863608531297788e-05, + "loss": 0.2103, + "step": 8491 + }, + { + "epoch": 0.7244497526019451, + "grad_norm": 1.6098212206838642, + "learning_rate": 1.862532724370012e-05, + "loss": 0.1787, + "step": 8492 + }, + { + "epoch": 0.7245350622760621, + "grad_norm": 1.5295236158789351, + "learning_rate": 1.8614571569711914e-05, + "loss": 0.1579, + "step": 8493 + }, + { + "epoch": 0.7246203719501791, + "grad_norm": 1.6515952865825274, + "learning_rate": 1.860381829183439e-05, + "loss": 0.1993, + "step": 8494 + }, + { + "epoch": 0.7247056816242962, + "grad_norm": 1.7685370273224414, + "learning_rate": 1.8593067410888503e-05, + "loss": 0.1735, + "step": 8495 + }, + { + "epoch": 0.7247909912984133, + "grad_norm": 1.8779478424353193, + "learning_rate": 1.858231892769502e-05, + "loss": 0.2168, + "step": 8496 + }, + { + "epoch": 0.7248763009725303, + "grad_norm": 1.6908125683427684, + "learning_rate": 1.857157284307452e-05, + "loss": 0.2541, + "step": 8497 + }, + { + "epoch": 0.7249616106466473, + "grad_norm": 1.7353830243437383, + "learning_rate": 1.8560829157847452e-05, + "loss": 0.1569, + "step": 8498 + }, + { + "epoch": 0.7250469203207643, + "grad_norm": 1.6908840580315014, + "learning_rate": 1.8550087872833976e-05, + "loss": 0.2338, + "step": 8499 + }, + { + "epoch": 0.7251322299948815, + "grad_norm": 1.2205743584428685, + "learning_rate": 1.853934898885419e-05, + "loss": 0.1696, + "step": 8500 + }, + { + "epoch": 0.7252175396689985, + "grad_norm": 1.7890165231452364, + "learning_rate": 1.852861250672792e-05, + "loss": 0.191, + "step": 8501 + }, + { + "epoch": 0.7253028493431155, + "grad_norm": 1.8452366153576332, + "learning_rate": 1.8517878427274848e-05, + "loss": 0.1911, + "step": 8502 + }, + { + "epoch": 0.7253881590172325, + "grad_norm": 1.8924544129619503, + "learning_rate": 1.8507146751314464e-05, + "loss": 0.1814, + "step": 8503 + }, + { + "epoch": 0.7254734686913497, + "grad_norm": 1.737351621151452, + "learning_rate": 1.8496417479666072e-05, + "loss": 0.207, + "step": 8504 + }, + { + "epoch": 0.7255587783654667, + "grad_norm": 1.5273588916388998, + "learning_rate": 1.848569061314879e-05, + "loss": 0.1756, + "step": 8505 + }, + { + "epoch": 0.7256440880395837, + "grad_norm": 1.4029448344695377, + "learning_rate": 1.8474966152581556e-05, + "loss": 0.1904, + "step": 8506 + }, + { + "epoch": 0.7257293977137007, + "grad_norm": 1.7829656536839953, + "learning_rate": 1.8464244098783163e-05, + "loss": 0.2173, + "step": 8507 + }, + { + "epoch": 0.7258147073878177, + "grad_norm": 1.5812404587046933, + "learning_rate": 1.8453524452572114e-05, + "loss": 0.2123, + "step": 8508 + }, + { + "epoch": 0.7259000170619349, + "grad_norm": 1.7768117926026343, + "learning_rate": 1.8442807214766855e-05, + "loss": 0.2675, + "step": 8509 + }, + { + "epoch": 0.7259853267360519, + "grad_norm": 2.2137521681083, + "learning_rate": 1.8432092386185574e-05, + "loss": 0.1647, + "step": 8510 + }, + { + "epoch": 0.7260706364101689, + "grad_norm": 2.1066896653227745, + "learning_rate": 1.842137996764628e-05, + "loss": 0.1498, + "step": 8511 + }, + { + "epoch": 0.7261559460842859, + "grad_norm": 1.6961690259593893, + "learning_rate": 1.84106699599668e-05, + "loss": 0.2666, + "step": 8512 + }, + { + "epoch": 0.726241255758403, + "grad_norm": 1.5170952655951957, + "learning_rate": 1.839996236396483e-05, + "loss": 0.169, + "step": 8513 + }, + { + "epoch": 0.72632656543252, + "grad_norm": 1.4924832994558643, + "learning_rate": 1.8389257180457804e-05, + "loss": 0.1495, + "step": 8514 + }, + { + "epoch": 0.7264118751066371, + "grad_norm": 1.3971235186475217, + "learning_rate": 1.8378554410263015e-05, + "loss": 0.2293, + "step": 8515 + }, + { + "epoch": 0.7264971847807541, + "grad_norm": 1.9863989413230554, + "learning_rate": 1.8367854054197557e-05, + "loss": 0.2181, + "step": 8516 + }, + { + "epoch": 0.7265824944548712, + "grad_norm": 1.6128311822552397, + "learning_rate": 1.8357156113078357e-05, + "loss": 0.1602, + "step": 8517 + }, + { + "epoch": 0.7266678041289882, + "grad_norm": 1.7387570533636802, + "learning_rate": 1.834646058772213e-05, + "loss": 0.1862, + "step": 8518 + }, + { + "epoch": 0.7267531138031053, + "grad_norm": 1.9652166600453498, + "learning_rate": 1.8335767478945413e-05, + "loss": 0.2283, + "step": 8519 + }, + { + "epoch": 0.7268384234772223, + "grad_norm": 1.6874756987905897, + "learning_rate": 1.8325076787564627e-05, + "loss": 0.2354, + "step": 8520 + }, + { + "epoch": 0.7269237331513394, + "grad_norm": 1.586387157116766, + "learning_rate": 1.8314388514395864e-05, + "loss": 0.2282, + "step": 8521 + }, + { + "epoch": 0.7270090428254564, + "grad_norm": 1.5857268696240243, + "learning_rate": 1.8303702660255184e-05, + "loss": 0.1948, + "step": 8522 + }, + { + "epoch": 0.7270943524995734, + "grad_norm": 2.560554137754288, + "learning_rate": 1.8293019225958376e-05, + "loss": 0.1725, + "step": 8523 + }, + { + "epoch": 0.7271796621736905, + "grad_norm": 1.758535482287807, + "learning_rate": 1.828233821232105e-05, + "loss": 0.1902, + "step": 8524 + }, + { + "epoch": 0.7272649718478076, + "grad_norm": 1.7696541244590265, + "learning_rate": 1.827165962015866e-05, + "loss": 0.1721, + "step": 8525 + }, + { + "epoch": 0.7273502815219246, + "grad_norm": 1.8224462356276907, + "learning_rate": 1.8260983450286452e-05, + "loss": 0.1747, + "step": 8526 + }, + { + "epoch": 0.7274355911960416, + "grad_norm": 1.6387021376824167, + "learning_rate": 1.8250309703519496e-05, + "loss": 0.174, + "step": 8527 + }, + { + "epoch": 0.7275209008701586, + "grad_norm": 1.9366472274912099, + "learning_rate": 1.8239638380672657e-05, + "loss": 0.1654, + "step": 8528 + }, + { + "epoch": 0.7276062105442758, + "grad_norm": 1.8619638580190905, + "learning_rate": 1.8228969482560677e-05, + "loss": 0.2846, + "step": 8529 + }, + { + "epoch": 0.7276915202183928, + "grad_norm": 2.0084939831735458, + "learning_rate": 1.8218303009998038e-05, + "loss": 0.2234, + "step": 8530 + }, + { + "epoch": 0.7277768298925098, + "grad_norm": 1.834792149177223, + "learning_rate": 1.8207638963799084e-05, + "loss": 0.1457, + "step": 8531 + }, + { + "epoch": 0.7278621395666268, + "grad_norm": 1.580134707877098, + "learning_rate": 1.8196977344777933e-05, + "loss": 0.2189, + "step": 8532 + }, + { + "epoch": 0.727947449240744, + "grad_norm": 2.0820811768462675, + "learning_rate": 1.8186318153748587e-05, + "loss": 0.145, + "step": 8533 + }, + { + "epoch": 0.728032758914861, + "grad_norm": 2.071722853009397, + "learning_rate": 1.8175661391524767e-05, + "loss": 0.1432, + "step": 8534 + }, + { + "epoch": 0.728118068588978, + "grad_norm": 1.8569969549611176, + "learning_rate": 1.81650070589201e-05, + "loss": 0.2213, + "step": 8535 + }, + { + "epoch": 0.728203378263095, + "grad_norm": 1.6347597912829006, + "learning_rate": 1.815435515674797e-05, + "loss": 0.1604, + "step": 8536 + }, + { + "epoch": 0.7282886879372121, + "grad_norm": 1.4695989711799404, + "learning_rate": 1.81437056858216e-05, + "loss": 0.1698, + "step": 8537 + }, + { + "epoch": 0.7283739976113291, + "grad_norm": 1.6449380765582209, + "learning_rate": 1.813305864695401e-05, + "loss": 0.1645, + "step": 8538 + }, + { + "epoch": 0.7284593072854462, + "grad_norm": 1.3738350701827664, + "learning_rate": 1.8122414040958057e-05, + "loss": 0.212, + "step": 8539 + }, + { + "epoch": 0.7285446169595632, + "grad_norm": 1.6371581732709461, + "learning_rate": 1.8111771868646395e-05, + "loss": 0.2252, + "step": 8540 + }, + { + "epoch": 0.7286299266336803, + "grad_norm": 1.6557370237164242, + "learning_rate": 1.810113213083148e-05, + "loss": 0.1853, + "step": 8541 + }, + { + "epoch": 0.7287152363077973, + "grad_norm": 1.1223065664575653, + "learning_rate": 1.809049482832563e-05, + "loss": 0.1663, + "step": 8542 + }, + { + "epoch": 0.7288005459819143, + "grad_norm": 1.9167867153000313, + "learning_rate": 1.8079859961940936e-05, + "loss": 0.1796, + "step": 8543 + }, + { + "epoch": 0.7288858556560314, + "grad_norm": 1.5310120689668862, + "learning_rate": 1.8069227532489312e-05, + "loss": 0.1575, + "step": 8544 + }, + { + "epoch": 0.7289711653301484, + "grad_norm": 1.7441066554948919, + "learning_rate": 1.8058597540782485e-05, + "loss": 0.2149, + "step": 8545 + }, + { + "epoch": 0.7290564750042655, + "grad_norm": 1.8772538943461778, + "learning_rate": 1.8047969987631996e-05, + "loss": 0.1977, + "step": 8546 + }, + { + "epoch": 0.7291417846783825, + "grad_norm": 1.7037621102633844, + "learning_rate": 1.803734487384921e-05, + "loss": 0.2085, + "step": 8547 + }, + { + "epoch": 0.7292270943524995, + "grad_norm": 1.490747206055017, + "learning_rate": 1.8026722200245272e-05, + "loss": 0.1994, + "step": 8548 + }, + { + "epoch": 0.7293124040266166, + "grad_norm": 1.7751261867747286, + "learning_rate": 1.8016101967631223e-05, + "loss": 0.2339, + "step": 8549 + }, + { + "epoch": 0.7293977137007337, + "grad_norm": 1.9527475221164246, + "learning_rate": 1.8005484176817794e-05, + "loss": 0.1844, + "step": 8550 + }, + { + "epoch": 0.7294830233748507, + "grad_norm": 1.5656260690285693, + "learning_rate": 1.7994868828615648e-05, + "loss": 0.1722, + "step": 8551 + }, + { + "epoch": 0.7295683330489677, + "grad_norm": 1.875937418253762, + "learning_rate": 1.798425592383519e-05, + "loss": 0.1933, + "step": 8552 + }, + { + "epoch": 0.7296536427230847, + "grad_norm": 2.051663454220093, + "learning_rate": 1.797364546328666e-05, + "loss": 0.1572, + "step": 8553 + }, + { + "epoch": 0.7297389523972019, + "grad_norm": 1.566832524847244, + "learning_rate": 1.7963037447780097e-05, + "loss": 0.2111, + "step": 8554 + }, + { + "epoch": 0.7298242620713189, + "grad_norm": 2.147326150769445, + "learning_rate": 1.795243187812541e-05, + "loss": 0.1674, + "step": 8555 + }, + { + "epoch": 0.7299095717454359, + "grad_norm": 2.359186025971492, + "learning_rate": 1.794182875513222e-05, + "loss": 0.2519, + "step": 8556 + }, + { + "epoch": 0.7299948814195529, + "grad_norm": 1.5386506899682417, + "learning_rate": 1.7931228079610057e-05, + "loss": 0.1843, + "step": 8557 + }, + { + "epoch": 0.7300801910936701, + "grad_norm": 1.6184697619263004, + "learning_rate": 1.7920629852368227e-05, + "loss": 0.1471, + "step": 8558 + }, + { + "epoch": 0.7301655007677871, + "grad_norm": 1.8300382595544569, + "learning_rate": 1.791003407421583e-05, + "loss": 0.216, + "step": 8559 + }, + { + "epoch": 0.7302508104419041, + "grad_norm": 1.9472225876990878, + "learning_rate": 1.789944074596181e-05, + "loss": 0.214, + "step": 8560 + }, + { + "epoch": 0.7303361201160211, + "grad_norm": 1.6393754833015663, + "learning_rate": 1.7888849868414886e-05, + "loss": 0.2655, + "step": 8561 + }, + { + "epoch": 0.7304214297901382, + "grad_norm": 1.7148593215409191, + "learning_rate": 1.787826144238367e-05, + "loss": 0.2203, + "step": 8562 + }, + { + "epoch": 0.7305067394642553, + "grad_norm": 1.3320899570090803, + "learning_rate": 1.786767546867647e-05, + "loss": 0.1368, + "step": 8563 + }, + { + "epoch": 0.7305920491383723, + "grad_norm": 2.0462784225363473, + "learning_rate": 1.7857091948101506e-05, + "loss": 0.1746, + "step": 8564 + }, + { + "epoch": 0.7306773588124893, + "grad_norm": 1.8861974149923901, + "learning_rate": 1.784651088146677e-05, + "loss": 0.1922, + "step": 8565 + }, + { + "epoch": 0.7307626684866064, + "grad_norm": 1.9143467283387483, + "learning_rate": 1.7835932269580064e-05, + "loss": 0.1497, + "step": 8566 + }, + { + "epoch": 0.7308479781607234, + "grad_norm": 1.3928482628284247, + "learning_rate": 1.782535611324901e-05, + "loss": 0.1574, + "step": 8567 + }, + { + "epoch": 0.7309332878348405, + "grad_norm": 1.6794426318151954, + "learning_rate": 1.7814782413281038e-05, + "loss": 0.1948, + "step": 8568 + }, + { + "epoch": 0.7310185975089575, + "grad_norm": 1.9343961649832246, + "learning_rate": 1.7804211170483397e-05, + "loss": 0.2058, + "step": 8569 + }, + { + "epoch": 0.7311039071830746, + "grad_norm": 2.0548636357341095, + "learning_rate": 1.7793642385663134e-05, + "loss": 0.1816, + "step": 8570 + }, + { + "epoch": 0.7311892168571916, + "grad_norm": 2.079111653774242, + "learning_rate": 1.7783076059627156e-05, + "loss": 0.2369, + "step": 8571 + }, + { + "epoch": 0.7312745265313086, + "grad_norm": 1.9633997318630962, + "learning_rate": 1.7772512193182095e-05, + "loss": 0.2072, + "step": 8572 + }, + { + "epoch": 0.7313598362054257, + "grad_norm": 2.0738008872072298, + "learning_rate": 1.7761950787134484e-05, + "loss": 0.1929, + "step": 8573 + }, + { + "epoch": 0.7314451458795428, + "grad_norm": 1.6867637440986696, + "learning_rate": 1.77513918422906e-05, + "loss": 0.1755, + "step": 8574 + }, + { + "epoch": 0.7315304555536598, + "grad_norm": 1.4755616201824457, + "learning_rate": 1.7740835359456616e-05, + "loss": 0.2033, + "step": 8575 + }, + { + "epoch": 0.7316157652277768, + "grad_norm": 1.641014270922659, + "learning_rate": 1.7730281339438387e-05, + "loss": 0.1472, + "step": 8576 + }, + { + "epoch": 0.7317010749018938, + "grad_norm": 1.70616305901778, + "learning_rate": 1.7719729783041717e-05, + "loss": 0.1904, + "step": 8577 + }, + { + "epoch": 0.731786384576011, + "grad_norm": 1.4353120157584436, + "learning_rate": 1.770918069107214e-05, + "loss": 0.2281, + "step": 8578 + }, + { + "epoch": 0.731871694250128, + "grad_norm": 1.6471573618355497, + "learning_rate": 1.769863406433503e-05, + "loss": 0.2008, + "step": 8579 + }, + { + "epoch": 0.731957003924245, + "grad_norm": 1.7671146016891293, + "learning_rate": 1.768808990363556e-05, + "loss": 0.1586, + "step": 8580 + }, + { + "epoch": 0.732042313598362, + "grad_norm": 1.874650350394313, + "learning_rate": 1.767754820977871e-05, + "loss": 0.2521, + "step": 8581 + }, + { + "epoch": 0.732127623272479, + "grad_norm": 2.126852013853001, + "learning_rate": 1.76670089835693e-05, + "loss": 0.2013, + "step": 8582 + }, + { + "epoch": 0.7322129329465962, + "grad_norm": 1.665625254790267, + "learning_rate": 1.7656472225811922e-05, + "loss": 0.2077, + "step": 8583 + }, + { + "epoch": 0.7322982426207132, + "grad_norm": 1.5023983219925485, + "learning_rate": 1.7645937937311048e-05, + "loss": 0.2117, + "step": 8584 + }, + { + "epoch": 0.7323835522948302, + "grad_norm": 2.0615479555031833, + "learning_rate": 1.7635406118870846e-05, + "loss": 0.1999, + "step": 8585 + }, + { + "epoch": 0.7324688619689472, + "grad_norm": 1.3612159015038345, + "learning_rate": 1.7624876771295424e-05, + "loss": 0.1576, + "step": 8586 + }, + { + "epoch": 0.7325541716430644, + "grad_norm": 1.3008100260166358, + "learning_rate": 1.7614349895388614e-05, + "loss": 0.163, + "step": 8587 + }, + { + "epoch": 0.7326394813171814, + "grad_norm": 1.4231194719786031, + "learning_rate": 1.7603825491954097e-05, + "loss": 0.2005, + "step": 8588 + }, + { + "epoch": 0.7327247909912984, + "grad_norm": 1.4645643450978885, + "learning_rate": 1.759330356179535e-05, + "loss": 0.212, + "step": 8589 + }, + { + "epoch": 0.7328101006654154, + "grad_norm": 1.8626300945546395, + "learning_rate": 1.7582784105715644e-05, + "loss": 0.2171, + "step": 8590 + }, + { + "epoch": 0.7328954103395325, + "grad_norm": 1.8326383776284776, + "learning_rate": 1.7572267124518144e-05, + "loss": 0.1385, + "step": 8591 + }, + { + "epoch": 0.7329807200136496, + "grad_norm": 1.7202245455924965, + "learning_rate": 1.7561752619005695e-05, + "loss": 0.2184, + "step": 8592 + }, + { + "epoch": 0.7330660296877666, + "grad_norm": 2.014459080564325, + "learning_rate": 1.755124058998108e-05, + "loss": 0.25, + "step": 8593 + }, + { + "epoch": 0.7331513393618836, + "grad_norm": 1.9845764778761505, + "learning_rate": 1.7540731038246805e-05, + "loss": 0.2061, + "step": 8594 + }, + { + "epoch": 0.7332366490360007, + "grad_norm": 1.5067865706025338, + "learning_rate": 1.753022396460523e-05, + "loss": 0.2401, + "step": 8595 + }, + { + "epoch": 0.7333219587101177, + "grad_norm": 1.6850425345828133, + "learning_rate": 1.7519719369858488e-05, + "loss": 0.1751, + "step": 8596 + }, + { + "epoch": 0.7334072683842348, + "grad_norm": 1.4422103995740025, + "learning_rate": 1.7509217254808613e-05, + "loss": 0.1915, + "step": 8597 + }, + { + "epoch": 0.7334925780583518, + "grad_norm": 1.5045232126582055, + "learning_rate": 1.749871762025731e-05, + "loss": 0.2117, + "step": 8598 + }, + { + "epoch": 0.7335778877324689, + "grad_norm": 1.6682946201727276, + "learning_rate": 1.7488220467006223e-05, + "loss": 0.2041, + "step": 8599 + }, + { + "epoch": 0.7336631974065859, + "grad_norm": 2.08601544981051, + "learning_rate": 1.7477725795856737e-05, + "loss": 0.2099, + "step": 8600 + }, + { + "epoch": 0.7337485070807029, + "grad_norm": 1.5458103716328233, + "learning_rate": 1.7467233607610057e-05, + "loss": 0.1541, + "step": 8601 + }, + { + "epoch": 0.73383381675482, + "grad_norm": 1.564633019780525, + "learning_rate": 1.745674390306722e-05, + "loss": 0.194, + "step": 8602 + }, + { + "epoch": 0.7339191264289371, + "grad_norm": 1.8709957598868054, + "learning_rate": 1.7446256683029028e-05, + "loss": 0.1656, + "step": 8603 + }, + { + "epoch": 0.7340044361030541, + "grad_norm": 2.106948247794491, + "learning_rate": 1.743577194829618e-05, + "loss": 0.2613, + "step": 8604 + }, + { + "epoch": 0.7340897457771711, + "grad_norm": 1.8654615014284717, + "learning_rate": 1.7425289699669073e-05, + "loss": 0.1364, + "step": 8605 + }, + { + "epoch": 0.7341750554512881, + "grad_norm": 1.6254121551151177, + "learning_rate": 1.7414809937948008e-05, + "loss": 0.2015, + "step": 8606 + }, + { + "epoch": 0.7342603651254053, + "grad_norm": 1.5201742581816196, + "learning_rate": 1.7404332663933043e-05, + "loss": 0.2978, + "step": 8607 + }, + { + "epoch": 0.7343456747995223, + "grad_norm": 1.5821643161148768, + "learning_rate": 1.7393857878424068e-05, + "loss": 0.2047, + "step": 8608 + }, + { + "epoch": 0.7344309844736393, + "grad_norm": 1.8541441355275698, + "learning_rate": 1.738338558222078e-05, + "loss": 0.1831, + "step": 8609 + }, + { + "epoch": 0.7345162941477563, + "grad_norm": 1.3061915881954806, + "learning_rate": 1.737291577612267e-05, + "loss": 0.1873, + "step": 8610 + }, + { + "epoch": 0.7346016038218735, + "grad_norm": 1.7891375263590885, + "learning_rate": 1.7362448460929065e-05, + "loss": 0.165, + "step": 8611 + }, + { + "epoch": 0.7346869134959905, + "grad_norm": 1.7379641811643807, + "learning_rate": 1.735198363743907e-05, + "loss": 0.1784, + "step": 8612 + }, + { + "epoch": 0.7347722231701075, + "grad_norm": 1.7725503862157441, + "learning_rate": 1.7341521306451662e-05, + "loss": 0.2647, + "step": 8613 + }, + { + "epoch": 0.7348575328442245, + "grad_norm": 2.3054852672826756, + "learning_rate": 1.7331061468765523e-05, + "loss": 0.2305, + "step": 8614 + }, + { + "epoch": 0.7349428425183416, + "grad_norm": 1.6313944960865756, + "learning_rate": 1.7320604125179258e-05, + "loss": 0.1519, + "step": 8615 + }, + { + "epoch": 0.7350281521924587, + "grad_norm": 1.4604122421190369, + "learning_rate": 1.7310149276491205e-05, + "loss": 0.2106, + "step": 8616 + }, + { + "epoch": 0.7351134618665757, + "grad_norm": 1.7167341027702159, + "learning_rate": 1.7299696923499543e-05, + "loss": 0.2489, + "step": 8617 + }, + { + "epoch": 0.7351987715406927, + "grad_norm": 2.6729864296217247, + "learning_rate": 1.7289247067002233e-05, + "loss": 0.277, + "step": 8618 + }, + { + "epoch": 0.7352840812148098, + "grad_norm": 1.6280480189072053, + "learning_rate": 1.7278799707797104e-05, + "loss": 0.2217, + "step": 8619 + }, + { + "epoch": 0.7353693908889268, + "grad_norm": 1.408284601613662, + "learning_rate": 1.726835484668174e-05, + "loss": 0.1168, + "step": 8620 + }, + { + "epoch": 0.7354547005630439, + "grad_norm": 2.0492908693421708, + "learning_rate": 1.725791248445354e-05, + "loss": 0.1629, + "step": 8621 + }, + { + "epoch": 0.7355400102371609, + "grad_norm": 1.8031569426829377, + "learning_rate": 1.7247472621909737e-05, + "loss": 0.1503, + "step": 8622 + }, + { + "epoch": 0.7356253199112779, + "grad_norm": 1.9735493366268657, + "learning_rate": 1.723703525984735e-05, + "loss": 0.3072, + "step": 8623 + }, + { + "epoch": 0.735710629585395, + "grad_norm": 1.6968975480128872, + "learning_rate": 1.722660039906322e-05, + "loss": 0.1884, + "step": 8624 + }, + { + "epoch": 0.735795939259512, + "grad_norm": 1.474063433706907, + "learning_rate": 1.7216168040353976e-05, + "loss": 0.16, + "step": 8625 + }, + { + "epoch": 0.735881248933629, + "grad_norm": 2.1379941937873586, + "learning_rate": 1.7205738184516123e-05, + "loss": 0.2131, + "step": 8626 + }, + { + "epoch": 0.7359665586077461, + "grad_norm": 1.8406924742165518, + "learning_rate": 1.7195310832345852e-05, + "loss": 0.2077, + "step": 8627 + }, + { + "epoch": 0.7360518682818632, + "grad_norm": 2.4214388546924246, + "learning_rate": 1.71848859846393e-05, + "loss": 0.2138, + "step": 8628 + }, + { + "epoch": 0.7361371779559802, + "grad_norm": 1.452257779487187, + "learning_rate": 1.717446364219232e-05, + "loss": 0.1324, + "step": 8629 + }, + { + "epoch": 0.7362224876300972, + "grad_norm": 1.3067650614560695, + "learning_rate": 1.71640438058006e-05, + "loss": 0.1892, + "step": 8630 + }, + { + "epoch": 0.7363077973042143, + "grad_norm": 1.5575068993821262, + "learning_rate": 1.7153626476259656e-05, + "loss": 0.2478, + "step": 8631 + }, + { + "epoch": 0.7363931069783314, + "grad_norm": 1.9540663624976506, + "learning_rate": 1.7143211654364762e-05, + "loss": 0.2073, + "step": 8632 + }, + { + "epoch": 0.7364784166524484, + "grad_norm": 1.8382543351926284, + "learning_rate": 1.7132799340911087e-05, + "loss": 0.2238, + "step": 8633 + }, + { + "epoch": 0.7365637263265654, + "grad_norm": 1.2376274043303432, + "learning_rate": 1.71223895366935e-05, + "loss": 0.1815, + "step": 8634 + }, + { + "epoch": 0.7366490360006824, + "grad_norm": 2.093372026174976, + "learning_rate": 1.7111982242506775e-05, + "loss": 0.209, + "step": 8635 + }, + { + "epoch": 0.7367343456747996, + "grad_norm": 1.7815888735015635, + "learning_rate": 1.710157745914544e-05, + "loss": 0.2006, + "step": 8636 + }, + { + "epoch": 0.7368196553489166, + "grad_norm": 1.8697302833901854, + "learning_rate": 1.7091175187403842e-05, + "loss": 0.2602, + "step": 8637 + }, + { + "epoch": 0.7369049650230336, + "grad_norm": 1.6502030305504725, + "learning_rate": 1.7080775428076122e-05, + "loss": 0.1842, + "step": 8638 + }, + { + "epoch": 0.7369902746971506, + "grad_norm": 1.3034994232387167, + "learning_rate": 1.7070378181956302e-05, + "loss": 0.1455, + "step": 8639 + }, + { + "epoch": 0.7370755843712677, + "grad_norm": 1.223497874985733, + "learning_rate": 1.705998344983809e-05, + "loss": 0.1429, + "step": 8640 + }, + { + "epoch": 0.7371608940453848, + "grad_norm": 1.6785610946522693, + "learning_rate": 1.704959123251511e-05, + "loss": 0.242, + "step": 8641 + }, + { + "epoch": 0.7372462037195018, + "grad_norm": 1.9248907727055382, + "learning_rate": 1.7039201530780742e-05, + "loss": 0.214, + "step": 8642 + }, + { + "epoch": 0.7373315133936188, + "grad_norm": 1.921545295036543, + "learning_rate": 1.7028814345428185e-05, + "loss": 0.184, + "step": 8643 + }, + { + "epoch": 0.7374168230677359, + "grad_norm": 1.5225196616621097, + "learning_rate": 1.7018429677250447e-05, + "loss": 0.2175, + "step": 8644 + }, + { + "epoch": 0.737502132741853, + "grad_norm": 2.2092877490824616, + "learning_rate": 1.700804752704033e-05, + "loss": 0.2069, + "step": 8645 + }, + { + "epoch": 0.73758744241597, + "grad_norm": 1.6708359386141562, + "learning_rate": 1.6997667895590474e-05, + "loss": 0.1857, + "step": 8646 + }, + { + "epoch": 0.737672752090087, + "grad_norm": 2.0219155261829145, + "learning_rate": 1.6987290783693282e-05, + "loss": 0.2012, + "step": 8647 + }, + { + "epoch": 0.7377580617642041, + "grad_norm": 1.9458190993687625, + "learning_rate": 1.6976916192141022e-05, + "loss": 0.1958, + "step": 8648 + }, + { + "epoch": 0.7378433714383211, + "grad_norm": 1.7789161517672245, + "learning_rate": 1.696654412172573e-05, + "loss": 0.1519, + "step": 8649 + }, + { + "epoch": 0.7379286811124381, + "grad_norm": 1.4207798020168685, + "learning_rate": 1.695617457323925e-05, + "loss": 0.1598, + "step": 8650 + }, + { + "epoch": 0.7380139907865552, + "grad_norm": 2.1916993297081744, + "learning_rate": 1.6945807547473253e-05, + "loss": 0.2366, + "step": 8651 + }, + { + "epoch": 0.7380993004606723, + "grad_norm": 2.032759120513006, + "learning_rate": 1.6935443045219198e-05, + "loss": 0.2398, + "step": 8652 + }, + { + "epoch": 0.7381846101347893, + "grad_norm": 1.9857182199354884, + "learning_rate": 1.692508106726836e-05, + "loss": 0.1612, + "step": 8653 + }, + { + "epoch": 0.7382699198089063, + "grad_norm": 1.860230962093235, + "learning_rate": 1.691472161441181e-05, + "loss": 0.1906, + "step": 8654 + }, + { + "epoch": 0.7383552294830233, + "grad_norm": 1.8825145818137976, + "learning_rate": 1.6904364687440476e-05, + "loss": 0.186, + "step": 8655 + }, + { + "epoch": 0.7384405391571405, + "grad_norm": 2.261732137637137, + "learning_rate": 1.6894010287145e-05, + "loss": 0.1602, + "step": 8656 + }, + { + "epoch": 0.7385258488312575, + "grad_norm": 1.89091792083726, + "learning_rate": 1.6883658414315928e-05, + "loss": 0.1964, + "step": 8657 + }, + { + "epoch": 0.7386111585053745, + "grad_norm": 2.069388244457484, + "learning_rate": 1.687330906974356e-05, + "loss": 0.2175, + "step": 8658 + }, + { + "epoch": 0.7386964681794915, + "grad_norm": 2.3596205425702315, + "learning_rate": 1.6862962254218e-05, + "loss": 0.1973, + "step": 8659 + }, + { + "epoch": 0.7387817778536085, + "grad_norm": 1.4629485624322653, + "learning_rate": 1.6852617968529176e-05, + "loss": 0.2055, + "step": 8660 + }, + { + "epoch": 0.7388670875277257, + "grad_norm": 1.2587352708638537, + "learning_rate": 1.6842276213466852e-05, + "loss": 0.1611, + "step": 8661 + }, + { + "epoch": 0.7389523972018427, + "grad_norm": 1.8097992868996011, + "learning_rate": 1.6831936989820506e-05, + "loss": 0.2008, + "step": 8662 + }, + { + "epoch": 0.7390377068759597, + "grad_norm": 1.4240099629900151, + "learning_rate": 1.6821600298379538e-05, + "loss": 0.2027, + "step": 8663 + }, + { + "epoch": 0.7391230165500767, + "grad_norm": 2.0317361103276372, + "learning_rate": 1.6811266139933075e-05, + "loss": 0.2331, + "step": 8664 + }, + { + "epoch": 0.7392083262241939, + "grad_norm": 1.8178771923371715, + "learning_rate": 1.6800934515270074e-05, + "loss": 0.2211, + "step": 8665 + }, + { + "epoch": 0.7392936358983109, + "grad_norm": 1.8886013708199936, + "learning_rate": 1.6790605425179306e-05, + "loss": 0.2188, + "step": 8666 + }, + { + "epoch": 0.7393789455724279, + "grad_norm": 1.6877783086654659, + "learning_rate": 1.6780278870449325e-05, + "loss": 0.2148, + "step": 8667 + }, + { + "epoch": 0.7394642552465449, + "grad_norm": 1.8316580386156731, + "learning_rate": 1.6769954851868548e-05, + "loss": 0.2458, + "step": 8668 + }, + { + "epoch": 0.739549564920662, + "grad_norm": 1.6704977191542965, + "learning_rate": 1.675963337022511e-05, + "loss": 0.2193, + "step": 8669 + }, + { + "epoch": 0.7396348745947791, + "grad_norm": 1.4675578697011, + "learning_rate": 1.6749314426307035e-05, + "loss": 0.1589, + "step": 8670 + }, + { + "epoch": 0.7397201842688961, + "grad_norm": 1.951971583253356, + "learning_rate": 1.6738998020902108e-05, + "loss": 0.2206, + "step": 8671 + }, + { + "epoch": 0.7398054939430131, + "grad_norm": 1.272356007041577, + "learning_rate": 1.6728684154797942e-05, + "loss": 0.2137, + "step": 8672 + }, + { + "epoch": 0.7398908036171302, + "grad_norm": 1.708056598051665, + "learning_rate": 1.671837282878193e-05, + "loss": 0.1612, + "step": 8673 + }, + { + "epoch": 0.7399761132912472, + "grad_norm": 1.6076340378902547, + "learning_rate": 1.6708064043641297e-05, + "loss": 0.1504, + "step": 8674 + }, + { + "epoch": 0.7400614229653643, + "grad_norm": 2.168184726713864, + "learning_rate": 1.669775780016306e-05, + "loss": 0.2085, + "step": 8675 + }, + { + "epoch": 0.7401467326394813, + "grad_norm": 1.3590154501181997, + "learning_rate": 1.6687454099134032e-05, + "loss": 0.0972, + "step": 8676 + }, + { + "epoch": 0.7402320423135984, + "grad_norm": 1.205408282935934, + "learning_rate": 1.6677152941340873e-05, + "loss": 0.19, + "step": 8677 + }, + { + "epoch": 0.7403173519877154, + "grad_norm": 1.7323348271843848, + "learning_rate": 1.6666854327570015e-05, + "loss": 0.1256, + "step": 8678 + }, + { + "epoch": 0.7404026616618324, + "grad_norm": 1.7202502552104906, + "learning_rate": 1.6656558258607698e-05, + "loss": 0.2294, + "step": 8679 + }, + { + "epoch": 0.7404879713359495, + "grad_norm": 1.736614396790741, + "learning_rate": 1.6646264735239948e-05, + "loss": 0.1779, + "step": 8680 + }, + { + "epoch": 0.7405732810100666, + "grad_norm": 1.292677693870993, + "learning_rate": 1.663597375825268e-05, + "loss": 0.2262, + "step": 8681 + }, + { + "epoch": 0.7406585906841836, + "grad_norm": 1.5625233457728576, + "learning_rate": 1.662568532843149e-05, + "loss": 0.1776, + "step": 8682 + }, + { + "epoch": 0.7407439003583006, + "grad_norm": 1.8102132907617647, + "learning_rate": 1.6615399446561886e-05, + "loss": 0.2052, + "step": 8683 + }, + { + "epoch": 0.7408292100324176, + "grad_norm": 1.4339686431063376, + "learning_rate": 1.660511611342913e-05, + "loss": 0.1402, + "step": 8684 + }, + { + "epoch": 0.7409145197065348, + "grad_norm": 1.7295970228010769, + "learning_rate": 1.6594835329818297e-05, + "loss": 0.1824, + "step": 8685 + }, + { + "epoch": 0.7409998293806518, + "grad_norm": 1.7475482932909274, + "learning_rate": 1.6584557096514274e-05, + "loss": 0.2133, + "step": 8686 + }, + { + "epoch": 0.7410851390547688, + "grad_norm": 1.8660617131927464, + "learning_rate": 1.6574281414301744e-05, + "loss": 0.2131, + "step": 8687 + }, + { + "epoch": 0.7411704487288858, + "grad_norm": 1.586338612611732, + "learning_rate": 1.656400828396521e-05, + "loss": 0.1615, + "step": 8688 + }, + { + "epoch": 0.741255758403003, + "grad_norm": 1.8528390982281029, + "learning_rate": 1.655373770628894e-05, + "loss": 0.1587, + "step": 8689 + }, + { + "epoch": 0.74134106807712, + "grad_norm": 2.314350315820485, + "learning_rate": 1.6543469682057106e-05, + "loss": 0.2387, + "step": 8690 + }, + { + "epoch": 0.741426377751237, + "grad_norm": 1.9184850553318726, + "learning_rate": 1.6533204212053533e-05, + "loss": 0.2345, + "step": 8691 + }, + { + "epoch": 0.741511687425354, + "grad_norm": 1.4393727915486443, + "learning_rate": 1.6522941297061996e-05, + "loss": 0.2013, + "step": 8692 + }, + { + "epoch": 0.7415969970994711, + "grad_norm": 1.6141572287458705, + "learning_rate": 1.6512680937865993e-05, + "loss": 0.1695, + "step": 8693 + }, + { + "epoch": 0.7416823067735882, + "grad_norm": 1.7715195018309653, + "learning_rate": 1.650242313524885e-05, + "loss": 0.1909, + "step": 8694 + }, + { + "epoch": 0.7417676164477052, + "grad_norm": 1.840352565117436, + "learning_rate": 1.6492167889993693e-05, + "loss": 0.1867, + "step": 8695 + }, + { + "epoch": 0.7418529261218222, + "grad_norm": 2.224226519266594, + "learning_rate": 1.6481915202883442e-05, + "loss": 0.1796, + "step": 8696 + }, + { + "epoch": 0.7419382357959392, + "grad_norm": 2.081075309318286, + "learning_rate": 1.647166507470088e-05, + "loss": 0.1997, + "step": 8697 + }, + { + "epoch": 0.7420235454700563, + "grad_norm": 1.6480094751715255, + "learning_rate": 1.6461417506228493e-05, + "loss": 0.2063, + "step": 8698 + }, + { + "epoch": 0.7421088551441734, + "grad_norm": 1.4046890920729733, + "learning_rate": 1.645117249824867e-05, + "loss": 0.1813, + "step": 8699 + }, + { + "epoch": 0.7421941648182904, + "grad_norm": 1.765015893707902, + "learning_rate": 1.6440930051543546e-05, + "loss": 0.1762, + "step": 8700 + }, + { + "epoch": 0.7422794744924074, + "grad_norm": 1.443806538981669, + "learning_rate": 1.6430690166895084e-05, + "loss": 0.1916, + "step": 8701 + }, + { + "epoch": 0.7423647841665245, + "grad_norm": 1.8598335405947795, + "learning_rate": 1.642045284508502e-05, + "loss": 0.1694, + "step": 8702 + }, + { + "epoch": 0.7424500938406415, + "grad_norm": 1.855444207280118, + "learning_rate": 1.6410218086894976e-05, + "loss": 0.2305, + "step": 8703 + }, + { + "epoch": 0.7425354035147586, + "grad_norm": 2.0211204901578936, + "learning_rate": 1.6399985893106252e-05, + "loss": 0.2629, + "step": 8704 + }, + { + "epoch": 0.7426207131888756, + "grad_norm": 1.495475860569233, + "learning_rate": 1.6389756264500068e-05, + "loss": 0.1749, + "step": 8705 + }, + { + "epoch": 0.7427060228629927, + "grad_norm": 1.7411259038318811, + "learning_rate": 1.637952920185739e-05, + "loss": 0.2478, + "step": 8706 + }, + { + "epoch": 0.7427913325371097, + "grad_norm": 1.4568928097978666, + "learning_rate": 1.6369304705959e-05, + "loss": 0.2102, + "step": 8707 + }, + { + "epoch": 0.7428766422112267, + "grad_norm": 1.8760964049028777, + "learning_rate": 1.6359082777585483e-05, + "loss": 0.1685, + "step": 8708 + }, + { + "epoch": 0.7429619518853438, + "grad_norm": 2.3053161798121766, + "learning_rate": 1.6348863417517208e-05, + "loss": 0.2687, + "step": 8709 + }, + { + "epoch": 0.7430472615594609, + "grad_norm": 2.0592135052028397, + "learning_rate": 1.6338646626534427e-05, + "loss": 0.1558, + "step": 8710 + }, + { + "epoch": 0.7431325712335779, + "grad_norm": 1.7559016034879609, + "learning_rate": 1.632843240541706e-05, + "loss": 0.1868, + "step": 8711 + }, + { + "epoch": 0.7432178809076949, + "grad_norm": 1.616583669963732, + "learning_rate": 1.631822075494497e-05, + "loss": 0.2156, + "step": 8712 + }, + { + "epoch": 0.7433031905818119, + "grad_norm": 2.012324506796629, + "learning_rate": 1.630801167589774e-05, + "loss": 0.1878, + "step": 8713 + }, + { + "epoch": 0.7433885002559291, + "grad_norm": 1.3864324569675777, + "learning_rate": 1.629780516905478e-05, + "loss": 0.1592, + "step": 8714 + }, + { + "epoch": 0.7434738099300461, + "grad_norm": 1.80499819768673, + "learning_rate": 1.62876012351953e-05, + "loss": 0.1981, + "step": 8715 + }, + { + "epoch": 0.7435591196041631, + "grad_norm": 1.8808934098726509, + "learning_rate": 1.6277399875098322e-05, + "loss": 0.1368, + "step": 8716 + }, + { + "epoch": 0.7436444292782801, + "grad_norm": 2.267940942355306, + "learning_rate": 1.6267201089542657e-05, + "loss": 0.1876, + "step": 8717 + }, + { + "epoch": 0.7437297389523972, + "grad_norm": 1.646732551450994, + "learning_rate": 1.625700487930692e-05, + "loss": 0.2068, + "step": 8718 + }, + { + "epoch": 0.7438150486265143, + "grad_norm": 2.1507333037231504, + "learning_rate": 1.624681124516958e-05, + "loss": 0.299, + "step": 8719 + }, + { + "epoch": 0.7439003583006313, + "grad_norm": 1.4431097617486848, + "learning_rate": 1.623662018790881e-05, + "loss": 0.202, + "step": 8720 + }, + { + "epoch": 0.7439856679747483, + "grad_norm": 2.711518063341033, + "learning_rate": 1.6226431708302682e-05, + "loss": 0.2196, + "step": 8721 + }, + { + "epoch": 0.7440709776488654, + "grad_norm": 1.4222484559298694, + "learning_rate": 1.6216245807129004e-05, + "loss": 0.1728, + "step": 8722 + }, + { + "epoch": 0.7441562873229824, + "grad_norm": 1.470572051632411, + "learning_rate": 1.6206062485165463e-05, + "loss": 0.153, + "step": 8723 + }, + { + "epoch": 0.7442415969970995, + "grad_norm": 2.036030352825255, + "learning_rate": 1.619588174318944e-05, + "loss": 0.2455, + "step": 8724 + }, + { + "epoch": 0.7443269066712165, + "grad_norm": 1.9814232326309522, + "learning_rate": 1.618570358197823e-05, + "loss": 0.1249, + "step": 8725 + }, + { + "epoch": 0.7444122163453336, + "grad_norm": 2.138926925431064, + "learning_rate": 1.617552800230886e-05, + "loss": 0.1598, + "step": 8726 + }, + { + "epoch": 0.7444975260194506, + "grad_norm": 1.9025946418390822, + "learning_rate": 1.616535500495818e-05, + "loss": 0.1737, + "step": 8727 + }, + { + "epoch": 0.7445828356935676, + "grad_norm": 2.057145537647136, + "learning_rate": 1.6155184590702855e-05, + "loss": 0.1759, + "step": 8728 + }, + { + "epoch": 0.7446681453676847, + "grad_norm": 1.6802744704787305, + "learning_rate": 1.6145016760319338e-05, + "loss": 0.2468, + "step": 8729 + }, + { + "epoch": 0.7447534550418018, + "grad_norm": 1.4860978418076702, + "learning_rate": 1.6134851514583875e-05, + "loss": 0.134, + "step": 8730 + }, + { + "epoch": 0.7448387647159188, + "grad_norm": 1.851199086253502, + "learning_rate": 1.612468885427253e-05, + "loss": 0.1379, + "step": 8731 + }, + { + "epoch": 0.7449240743900358, + "grad_norm": 1.9999852179934687, + "learning_rate": 1.6114528780161213e-05, + "loss": 0.2159, + "step": 8732 + }, + { + "epoch": 0.7450093840641528, + "grad_norm": 1.6779286246595546, + "learning_rate": 1.610437129302552e-05, + "loss": 0.1844, + "step": 8733 + }, + { + "epoch": 0.74509469373827, + "grad_norm": 3.0812123882481943, + "learning_rate": 1.6094216393640977e-05, + "loss": 0.2646, + "step": 8734 + }, + { + "epoch": 0.745180003412387, + "grad_norm": 1.5062200013018607, + "learning_rate": 1.608406408278283e-05, + "loss": 0.2334, + "step": 8735 + }, + { + "epoch": 0.745265313086504, + "grad_norm": 1.2376448382764258, + "learning_rate": 1.6073914361226166e-05, + "loss": 0.1831, + "step": 8736 + }, + { + "epoch": 0.745350622760621, + "grad_norm": 2.2775908085864374, + "learning_rate": 1.606376722974586e-05, + "loss": 0.2066, + "step": 8737 + }, + { + "epoch": 0.745435932434738, + "grad_norm": 1.7681817641573596, + "learning_rate": 1.605362268911657e-05, + "loss": 0.1447, + "step": 8738 + }, + { + "epoch": 0.7455212421088552, + "grad_norm": 1.4711043784415494, + "learning_rate": 1.6043480740112827e-05, + "loss": 0.1733, + "step": 8739 + }, + { + "epoch": 0.7456065517829722, + "grad_norm": 1.729468614190348, + "learning_rate": 1.6033341383508854e-05, + "loss": 0.2542, + "step": 8740 + }, + { + "epoch": 0.7456918614570892, + "grad_norm": 1.419236806292454, + "learning_rate": 1.6023204620078787e-05, + "loss": 0.1768, + "step": 8741 + }, + { + "epoch": 0.7457771711312062, + "grad_norm": 1.7593333129280515, + "learning_rate": 1.6013070450596492e-05, + "loss": 0.1815, + "step": 8742 + }, + { + "epoch": 0.7458624808053234, + "grad_norm": 1.5254967490192533, + "learning_rate": 1.6002938875835665e-05, + "loss": 0.1611, + "step": 8743 + }, + { + "epoch": 0.7459477904794404, + "grad_norm": 2.374675527794205, + "learning_rate": 1.599280989656977e-05, + "loss": 0.2588, + "step": 8744 + }, + { + "epoch": 0.7460331001535574, + "grad_norm": 1.5522569222206055, + "learning_rate": 1.5982683513572165e-05, + "loss": 0.1766, + "step": 8745 + }, + { + "epoch": 0.7461184098276744, + "grad_norm": 1.6317399440179077, + "learning_rate": 1.5972559727615875e-05, + "loss": 0.17, + "step": 8746 + }, + { + "epoch": 0.7462037195017915, + "grad_norm": 1.647267654196973, + "learning_rate": 1.596243853947384e-05, + "loss": 0.1646, + "step": 8747 + }, + { + "epoch": 0.7462890291759086, + "grad_norm": 1.710860995319724, + "learning_rate": 1.5952319949918748e-05, + "loss": 0.1674, + "step": 8748 + }, + { + "epoch": 0.7463743388500256, + "grad_norm": 1.7238418063362733, + "learning_rate": 1.59422039597231e-05, + "loss": 0.177, + "step": 8749 + }, + { + "epoch": 0.7464596485241426, + "grad_norm": 1.9793050081858752, + "learning_rate": 1.5932090569659197e-05, + "loss": 0.1951, + "step": 8750 + }, + { + "epoch": 0.7465449581982597, + "grad_norm": 1.6851944184700478, + "learning_rate": 1.592197978049914e-05, + "loss": 0.2742, + "step": 8751 + }, + { + "epoch": 0.7466302678723767, + "grad_norm": 1.4483127281002108, + "learning_rate": 1.5911871593014837e-05, + "loss": 0.1687, + "step": 8752 + }, + { + "epoch": 0.7467155775464938, + "grad_norm": 1.918492325370496, + "learning_rate": 1.590176600797798e-05, + "loss": 0.183, + "step": 8753 + }, + { + "epoch": 0.7468008872206108, + "grad_norm": 1.664490120975035, + "learning_rate": 1.5891663026160102e-05, + "loss": 0.2954, + "step": 8754 + }, + { + "epoch": 0.7468861968947279, + "grad_norm": 1.7576756402885099, + "learning_rate": 1.5881562648332503e-05, + "loss": 0.1861, + "step": 8755 + }, + { + "epoch": 0.7469715065688449, + "grad_norm": 1.4589138601234741, + "learning_rate": 1.5871464875266294e-05, + "loss": 0.1676, + "step": 8756 + }, + { + "epoch": 0.7470568162429619, + "grad_norm": 1.8408574836681415, + "learning_rate": 1.586136970773238e-05, + "loss": 0.1738, + "step": 8757 + }, + { + "epoch": 0.747142125917079, + "grad_norm": 1.5362658191484115, + "learning_rate": 1.5851277146501476e-05, + "loss": 0.1895, + "step": 8758 + }, + { + "epoch": 0.7472274355911961, + "grad_norm": 1.5424243594376976, + "learning_rate": 1.5841187192344097e-05, + "loss": 0.1943, + "step": 8759 + }, + { + "epoch": 0.7473127452653131, + "grad_norm": 2.0193876883209394, + "learning_rate": 1.583109984603054e-05, + "loss": 0.1374, + "step": 8760 + }, + { + "epoch": 0.7473980549394301, + "grad_norm": 1.9971287501955872, + "learning_rate": 1.5821015108330968e-05, + "loss": 0.2232, + "step": 8761 + }, + { + "epoch": 0.7474833646135471, + "grad_norm": 1.8752408190890142, + "learning_rate": 1.581093298001523e-05, + "loss": 0.1752, + "step": 8762 + }, + { + "epoch": 0.7475686742876643, + "grad_norm": 1.896673568261612, + "learning_rate": 1.5800853461853098e-05, + "loss": 0.2593, + "step": 8763 + }, + { + "epoch": 0.7476539839617813, + "grad_norm": 2.896584630608578, + "learning_rate": 1.5790776554614066e-05, + "loss": 0.2462, + "step": 8764 + }, + { + "epoch": 0.7477392936358983, + "grad_norm": 1.691323575648899, + "learning_rate": 1.5780702259067465e-05, + "loss": 0.1647, + "step": 8765 + }, + { + "epoch": 0.7478246033100153, + "grad_norm": 1.6113390003184134, + "learning_rate": 1.577063057598238e-05, + "loss": 0.1942, + "step": 8766 + }, + { + "epoch": 0.7479099129841325, + "grad_norm": 2.253844050403674, + "learning_rate": 1.576056150612778e-05, + "loss": 0.1784, + "step": 8767 + }, + { + "epoch": 0.7479952226582495, + "grad_norm": 1.8665935896579164, + "learning_rate": 1.5750495050272358e-05, + "loss": 0.1813, + "step": 8768 + }, + { + "epoch": 0.7480805323323665, + "grad_norm": 2.31302090525797, + "learning_rate": 1.5740431209184636e-05, + "loss": 0.1861, + "step": 8769 + }, + { + "epoch": 0.7481658420064835, + "grad_norm": 1.7306256707473617, + "learning_rate": 1.5730369983632937e-05, + "loss": 0.1778, + "step": 8770 + }, + { + "epoch": 0.7482511516806006, + "grad_norm": 1.588852782087432, + "learning_rate": 1.5720311374385377e-05, + "loss": 0.2127, + "step": 8771 + }, + { + "epoch": 0.7483364613547177, + "grad_norm": 1.9399021393387816, + "learning_rate": 1.5710255382209887e-05, + "loss": 0.2234, + "step": 8772 + }, + { + "epoch": 0.7484217710288347, + "grad_norm": 2.2265137784211593, + "learning_rate": 1.5700202007874165e-05, + "loss": 0.2516, + "step": 8773 + }, + { + "epoch": 0.7485070807029517, + "grad_norm": 1.845728313940222, + "learning_rate": 1.569015125214578e-05, + "loss": 0.1578, + "step": 8774 + }, + { + "epoch": 0.7485923903770687, + "grad_norm": 1.5752516923855069, + "learning_rate": 1.5680103115791993e-05, + "loss": 0.2111, + "step": 8775 + }, + { + "epoch": 0.7486777000511858, + "grad_norm": 1.620697340698659, + "learning_rate": 1.567005759957998e-05, + "loss": 0.173, + "step": 8776 + }, + { + "epoch": 0.7487630097253029, + "grad_norm": 1.9495572883620191, + "learning_rate": 1.5660014704276638e-05, + "loss": 0.1971, + "step": 8777 + }, + { + "epoch": 0.7488483193994199, + "grad_norm": 2.172470450249727, + "learning_rate": 1.5649974430648684e-05, + "loss": 0.2498, + "step": 8778 + }, + { + "epoch": 0.7489336290735369, + "grad_norm": 1.8493072037357834, + "learning_rate": 1.5639936779462657e-05, + "loss": 0.144, + "step": 8779 + }, + { + "epoch": 0.749018938747654, + "grad_norm": 1.8234072070641372, + "learning_rate": 1.562990175148486e-05, + "loss": 0.2171, + "step": 8780 + }, + { + "epoch": 0.749104248421771, + "grad_norm": 2.080060931927184, + "learning_rate": 1.561986934748142e-05, + "loss": 0.2155, + "step": 8781 + }, + { + "epoch": 0.749189558095888, + "grad_norm": 1.736635951077458, + "learning_rate": 1.5609839568218247e-05, + "loss": 0.1576, + "step": 8782 + }, + { + "epoch": 0.7492748677700051, + "grad_norm": 1.8207304642496251, + "learning_rate": 1.559981241446109e-05, + "loss": 0.1745, + "step": 8783 + }, + { + "epoch": 0.7493601774441222, + "grad_norm": 1.8462744418975412, + "learning_rate": 1.5589787886975456e-05, + "loss": 0.1585, + "step": 8784 + }, + { + "epoch": 0.7494454871182392, + "grad_norm": 2.228387131860651, + "learning_rate": 1.5579765986526657e-05, + "loss": 0.2552, + "step": 8785 + }, + { + "epoch": 0.7495307967923562, + "grad_norm": 1.6624133037247757, + "learning_rate": 1.5569746713879802e-05, + "loss": 0.1522, + "step": 8786 + }, + { + "epoch": 0.7496161064664733, + "grad_norm": 1.7464680451867067, + "learning_rate": 1.5559730069799857e-05, + "loss": 0.2189, + "step": 8787 + }, + { + "epoch": 0.7497014161405904, + "grad_norm": 1.6813786709128347, + "learning_rate": 1.5549716055051468e-05, + "loss": 0.1891, + "step": 8788 + }, + { + "epoch": 0.7497867258147074, + "grad_norm": 1.5027472451603143, + "learning_rate": 1.553970467039921e-05, + "loss": 0.1687, + "step": 8789 + }, + { + "epoch": 0.7498720354888244, + "grad_norm": 1.584491013712994, + "learning_rate": 1.5529695916607374e-05, + "loss": 0.1747, + "step": 8790 + }, + { + "epoch": 0.7499573451629414, + "grad_norm": 1.6251112826463598, + "learning_rate": 1.5519689794440085e-05, + "loss": 0.2052, + "step": 8791 + }, + { + "epoch": 0.7500426548370586, + "grad_norm": 1.7156396639393523, + "learning_rate": 1.5509686304661247e-05, + "loss": 0.1725, + "step": 8792 + }, + { + "epoch": 0.7501279645111756, + "grad_norm": 1.9631037809870209, + "learning_rate": 1.549968544803458e-05, + "loss": 0.2014, + "step": 8793 + }, + { + "epoch": 0.7502132741852926, + "grad_norm": 1.2775656597784577, + "learning_rate": 1.5489687225323594e-05, + "loss": 0.122, + "step": 8794 + }, + { + "epoch": 0.7502985838594096, + "grad_norm": 1.8244809078724058, + "learning_rate": 1.5479691637291586e-05, + "loss": 0.2088, + "step": 8795 + }, + { + "epoch": 0.7503838935335267, + "grad_norm": 1.5505746852794664, + "learning_rate": 1.5469698684701715e-05, + "loss": 0.1315, + "step": 8796 + }, + { + "epoch": 0.7504692032076438, + "grad_norm": 1.5693759687596183, + "learning_rate": 1.5459708368316823e-05, + "loss": 0.1538, + "step": 8797 + }, + { + "epoch": 0.7505545128817608, + "grad_norm": 1.6553170797620624, + "learning_rate": 1.5449720688899665e-05, + "loss": 0.2432, + "step": 8798 + }, + { + "epoch": 0.7506398225558778, + "grad_norm": 1.524244515315132, + "learning_rate": 1.543973564721273e-05, + "loss": 0.1688, + "step": 8799 + }, + { + "epoch": 0.7507251322299949, + "grad_norm": 2.124121091505132, + "learning_rate": 1.542975324401833e-05, + "loss": 0.2512, + "step": 8800 + }, + { + "epoch": 0.750810441904112, + "grad_norm": 1.5841542257549208, + "learning_rate": 1.5419773480078563e-05, + "loss": 0.1811, + "step": 8801 + }, + { + "epoch": 0.750895751578229, + "grad_norm": 1.4835945085199154, + "learning_rate": 1.5409796356155316e-05, + "loss": 0.1817, + "step": 8802 + }, + { + "epoch": 0.750981061252346, + "grad_norm": 2.063477371295097, + "learning_rate": 1.5399821873010335e-05, + "loss": 0.1894, + "step": 8803 + }, + { + "epoch": 0.7510663709264631, + "grad_norm": 1.6079913367042795, + "learning_rate": 1.5389850031405057e-05, + "loss": 0.2219, + "step": 8804 + }, + { + "epoch": 0.7511516806005801, + "grad_norm": 1.295978833991156, + "learning_rate": 1.5379880832100824e-05, + "loss": 0.1411, + "step": 8805 + }, + { + "epoch": 0.7512369902746971, + "grad_norm": 1.582536781450066, + "learning_rate": 1.536991427585872e-05, + "loss": 0.1692, + "step": 8806 + }, + { + "epoch": 0.7513222999488142, + "grad_norm": 1.3715410508212393, + "learning_rate": 1.535995036343964e-05, + "loss": 0.1974, + "step": 8807 + }, + { + "epoch": 0.7514076096229313, + "grad_norm": 1.9034053652350351, + "learning_rate": 1.5349989095604257e-05, + "loss": 0.1575, + "step": 8808 + }, + { + "epoch": 0.7514929192970483, + "grad_norm": 1.7861575067063453, + "learning_rate": 1.5340030473113103e-05, + "loss": 0.2695, + "step": 8809 + }, + { + "epoch": 0.7515782289711653, + "grad_norm": 1.5002699450142991, + "learning_rate": 1.5330074496726415e-05, + "loss": 0.1743, + "step": 8810 + }, + { + "epoch": 0.7516635386452823, + "grad_norm": 2.30354099528169, + "learning_rate": 1.5320121167204314e-05, + "loss": 0.2357, + "step": 8811 + }, + { + "epoch": 0.7517488483193994, + "grad_norm": 3.270816746242443, + "learning_rate": 1.5310170485306672e-05, + "loss": 0.1941, + "step": 8812 + }, + { + "epoch": 0.7518341579935165, + "grad_norm": 1.9682540874479943, + "learning_rate": 1.5300222451793178e-05, + "loss": 0.1609, + "step": 8813 + }, + { + "epoch": 0.7519194676676335, + "grad_norm": 1.6259761959224437, + "learning_rate": 1.5290277067423303e-05, + "loss": 0.2457, + "step": 8814 + }, + { + "epoch": 0.7520047773417505, + "grad_norm": 2.181962770670658, + "learning_rate": 1.528033433295631e-05, + "loss": 0.1682, + "step": 8815 + }, + { + "epoch": 0.7520900870158675, + "grad_norm": 1.8525529618614052, + "learning_rate": 1.5270394249151322e-05, + "loss": 0.2221, + "step": 8816 + }, + { + "epoch": 0.7521753966899847, + "grad_norm": 2.798200410461941, + "learning_rate": 1.526045681676715e-05, + "loss": 0.1834, + "step": 8817 + }, + { + "epoch": 0.7522607063641017, + "grad_norm": 2.40025871789726, + "learning_rate": 1.5250522036562503e-05, + "loss": 0.2156, + "step": 8818 + }, + { + "epoch": 0.7523460160382187, + "grad_norm": 1.5156731352093729, + "learning_rate": 1.524058990929585e-05, + "loss": 0.1689, + "step": 8819 + }, + { + "epoch": 0.7524313257123357, + "grad_norm": 1.9079327269716335, + "learning_rate": 1.523066043572544e-05, + "loss": 0.2264, + "step": 8820 + }, + { + "epoch": 0.7525166353864529, + "grad_norm": 1.4099587595434417, + "learning_rate": 1.5220733616609345e-05, + "loss": 0.1192, + "step": 8821 + }, + { + "epoch": 0.7526019450605699, + "grad_norm": 1.7608788088721246, + "learning_rate": 1.5210809452705415e-05, + "loss": 0.1414, + "step": 8822 + }, + { + "epoch": 0.7526872547346869, + "grad_norm": 1.807875752311001, + "learning_rate": 1.520088794477132e-05, + "loss": 0.1682, + "step": 8823 + }, + { + "epoch": 0.7527725644088039, + "grad_norm": 1.8084078666456846, + "learning_rate": 1.5190969093564494e-05, + "loss": 0.184, + "step": 8824 + }, + { + "epoch": 0.752857874082921, + "grad_norm": 1.5766653732445044, + "learning_rate": 1.5181052899842229e-05, + "loss": 0.1766, + "step": 8825 + }, + { + "epoch": 0.7529431837570381, + "grad_norm": 1.7052347956774991, + "learning_rate": 1.517113936436152e-05, + "loss": 0.1369, + "step": 8826 + }, + { + "epoch": 0.7530284934311551, + "grad_norm": 1.8261444579325572, + "learning_rate": 1.5161228487879253e-05, + "loss": 0.2364, + "step": 8827 + }, + { + "epoch": 0.7531138031052721, + "grad_norm": 2.241880814582254, + "learning_rate": 1.5151320271152041e-05, + "loss": 0.2178, + "step": 8828 + }, + { + "epoch": 0.7531991127793892, + "grad_norm": 1.7458248059315635, + "learning_rate": 1.5141414714936376e-05, + "loss": 0.1519, + "step": 8829 + }, + { + "epoch": 0.7532844224535062, + "grad_norm": 2.217119289260291, + "learning_rate": 1.5131511819988426e-05, + "loss": 0.2617, + "step": 8830 + }, + { + "epoch": 0.7533697321276233, + "grad_norm": 1.891148014079603, + "learning_rate": 1.5121611587064278e-05, + "loss": 0.2398, + "step": 8831 + }, + { + "epoch": 0.7534550418017403, + "grad_norm": 1.5721121576886743, + "learning_rate": 1.5111714016919732e-05, + "loss": 0.2165, + "step": 8832 + }, + { + "epoch": 0.7535403514758574, + "grad_norm": 1.872947586817065, + "learning_rate": 1.5101819110310433e-05, + "loss": 0.2163, + "step": 8833 + }, + { + "epoch": 0.7536256611499744, + "grad_norm": 1.9383919416353785, + "learning_rate": 1.509192686799179e-05, + "loss": 0.1663, + "step": 8834 + }, + { + "epoch": 0.7537109708240914, + "grad_norm": 1.9455462702119224, + "learning_rate": 1.5082037290719036e-05, + "loss": 0.2336, + "step": 8835 + }, + { + "epoch": 0.7537962804982085, + "grad_norm": 1.6719872267659541, + "learning_rate": 1.5072150379247174e-05, + "loss": 0.2172, + "step": 8836 + }, + { + "epoch": 0.7538815901723256, + "grad_norm": 1.6695888968098063, + "learning_rate": 1.5062266134331016e-05, + "loss": 0.1876, + "step": 8837 + }, + { + "epoch": 0.7539668998464426, + "grad_norm": 1.836428962021142, + "learning_rate": 1.5052384556725201e-05, + "loss": 0.2155, + "step": 8838 + }, + { + "epoch": 0.7540522095205596, + "grad_norm": 1.7643096885795237, + "learning_rate": 1.5042505647184091e-05, + "loss": 0.1943, + "step": 8839 + }, + { + "epoch": 0.7541375191946766, + "grad_norm": 1.8469820304620208, + "learning_rate": 1.5032629406461923e-05, + "loss": 0.2476, + "step": 8840 + }, + { + "epoch": 0.7542228288687938, + "grad_norm": 1.658603076306651, + "learning_rate": 1.5022755835312686e-05, + "loss": 0.171, + "step": 8841 + }, + { + "epoch": 0.7543081385429108, + "grad_norm": 1.5480517910713956, + "learning_rate": 1.5012884934490167e-05, + "loss": 0.179, + "step": 8842 + }, + { + "epoch": 0.7543934482170278, + "grad_norm": 1.955989110935848, + "learning_rate": 1.5003016704747969e-05, + "loss": 0.2425, + "step": 8843 + }, + { + "epoch": 0.7544787578911448, + "grad_norm": 1.8458131787625622, + "learning_rate": 1.4993151146839451e-05, + "loss": 0.1485, + "step": 8844 + }, + { + "epoch": 0.754564067565262, + "grad_norm": 1.8182624766491988, + "learning_rate": 1.4983288261517853e-05, + "loss": 0.1634, + "step": 8845 + }, + { + "epoch": 0.754649377239379, + "grad_norm": 1.7332732679279041, + "learning_rate": 1.497342804953608e-05, + "loss": 0.1578, + "step": 8846 + }, + { + "epoch": 0.754734686913496, + "grad_norm": 1.840182587023028, + "learning_rate": 1.4963570511646963e-05, + "loss": 0.1793, + "step": 8847 + }, + { + "epoch": 0.754819996587613, + "grad_norm": 1.3517795895554527, + "learning_rate": 1.4953715648603057e-05, + "loss": 0.1889, + "step": 8848 + }, + { + "epoch": 0.75490530626173, + "grad_norm": 1.7629321332863288, + "learning_rate": 1.494386346115672e-05, + "loss": 0.1879, + "step": 8849 + }, + { + "epoch": 0.7549906159358472, + "grad_norm": 2.1008563929433044, + "learning_rate": 1.4934013950060104e-05, + "loss": 0.1061, + "step": 8850 + }, + { + "epoch": 0.7550759256099642, + "grad_norm": 1.621918250299221, + "learning_rate": 1.4924167116065218e-05, + "loss": 0.1958, + "step": 8851 + }, + { + "epoch": 0.7551612352840812, + "grad_norm": 1.4280494519708238, + "learning_rate": 1.491432295992375e-05, + "loss": 0.1936, + "step": 8852 + }, + { + "epoch": 0.7552465449581982, + "grad_norm": 1.5697937673222646, + "learning_rate": 1.4904481482387289e-05, + "loss": 0.13, + "step": 8853 + }, + { + "epoch": 0.7553318546323153, + "grad_norm": 1.5378960386243457, + "learning_rate": 1.4894642684207171e-05, + "loss": 0.1556, + "step": 8854 + }, + { + "epoch": 0.7554171643064324, + "grad_norm": 1.627512236987166, + "learning_rate": 1.4884806566134535e-05, + "loss": 0.2226, + "step": 8855 + }, + { + "epoch": 0.7555024739805494, + "grad_norm": 1.8384886851975362, + "learning_rate": 1.4874973128920317e-05, + "loss": 0.2307, + "step": 8856 + }, + { + "epoch": 0.7555877836546664, + "grad_norm": 1.6386506325910102, + "learning_rate": 1.4865142373315227e-05, + "loss": 0.2478, + "step": 8857 + }, + { + "epoch": 0.7556730933287835, + "grad_norm": 1.619509444432693, + "learning_rate": 1.4855314300069844e-05, + "loss": 0.2102, + "step": 8858 + }, + { + "epoch": 0.7557584030029005, + "grad_norm": 1.6506312087728372, + "learning_rate": 1.484548890993443e-05, + "loss": 0.2093, + "step": 8859 + }, + { + "epoch": 0.7558437126770176, + "grad_norm": 1.4454373692363878, + "learning_rate": 1.4835666203659133e-05, + "loss": 0.2337, + "step": 8860 + }, + { + "epoch": 0.7559290223511346, + "grad_norm": 1.7344779594034898, + "learning_rate": 1.4825846181993864e-05, + "loss": 0.2089, + "step": 8861 + }, + { + "epoch": 0.7560143320252517, + "grad_norm": 1.2581232291302487, + "learning_rate": 1.4816028845688323e-05, + "loss": 0.1915, + "step": 8862 + }, + { + "epoch": 0.7560996416993687, + "grad_norm": 1.7093096091195683, + "learning_rate": 1.4806214195492008e-05, + "loss": 0.2045, + "step": 8863 + }, + { + "epoch": 0.7561849513734857, + "grad_norm": 1.6791109026917215, + "learning_rate": 1.4796402232154228e-05, + "loss": 0.2255, + "step": 8864 + }, + { + "epoch": 0.7562702610476028, + "grad_norm": 1.7521086658849365, + "learning_rate": 1.4786592956424055e-05, + "loss": 0.2239, + "step": 8865 + }, + { + "epoch": 0.7563555707217199, + "grad_norm": 1.561126563122813, + "learning_rate": 1.4776786369050377e-05, + "loss": 0.2091, + "step": 8866 + }, + { + "epoch": 0.7564408803958369, + "grad_norm": 1.6336118940662039, + "learning_rate": 1.4766982470781915e-05, + "loss": 0.2354, + "step": 8867 + }, + { + "epoch": 0.7565261900699539, + "grad_norm": 1.444119512658007, + "learning_rate": 1.4757181262367081e-05, + "loss": 0.1792, + "step": 8868 + }, + { + "epoch": 0.7566114997440709, + "grad_norm": 1.8775984243304253, + "learning_rate": 1.4747382744554195e-05, + "loss": 0.1985, + "step": 8869 + }, + { + "epoch": 0.7566968094181881, + "grad_norm": 1.8378570261690423, + "learning_rate": 1.4737586918091301e-05, + "loss": 0.2028, + "step": 8870 + }, + { + "epoch": 0.7567821190923051, + "grad_norm": 1.7095682596311792, + "learning_rate": 1.4727793783726263e-05, + "loss": 0.1779, + "step": 8871 + }, + { + "epoch": 0.7568674287664221, + "grad_norm": 2.0760766740505674, + "learning_rate": 1.4718003342206722e-05, + "loss": 0.2168, + "step": 8872 + }, + { + "epoch": 0.7569527384405391, + "grad_norm": 1.6399525308717242, + "learning_rate": 1.4708215594280144e-05, + "loss": 0.2372, + "step": 8873 + }, + { + "epoch": 0.7570380481146562, + "grad_norm": 1.9851003207360152, + "learning_rate": 1.469843054069377e-05, + "loss": 0.2, + "step": 8874 + }, + { + "epoch": 0.7571233577887733, + "grad_norm": 2.4954837059569788, + "learning_rate": 1.4688648182194637e-05, + "loss": 0.224, + "step": 8875 + }, + { + "epoch": 0.7572086674628903, + "grad_norm": 1.4716059736536526, + "learning_rate": 1.4678868519529564e-05, + "loss": 0.2142, + "step": 8876 + }, + { + "epoch": 0.7572939771370073, + "grad_norm": 1.6548795968597048, + "learning_rate": 1.4669091553445185e-05, + "loss": 0.179, + "step": 8877 + }, + { + "epoch": 0.7573792868111244, + "grad_norm": 2.0055252287178242, + "learning_rate": 1.4659317284687918e-05, + "loss": 0.2388, + "step": 8878 + }, + { + "epoch": 0.7574645964852414, + "grad_norm": 1.6549939845226485, + "learning_rate": 1.4649545714003959e-05, + "loss": 0.156, + "step": 8879 + }, + { + "epoch": 0.7575499061593585, + "grad_norm": 1.7186205034890099, + "learning_rate": 1.4639776842139363e-05, + "loss": 0.1481, + "step": 8880 + }, + { + "epoch": 0.7576352158334755, + "grad_norm": 2.25203866787875, + "learning_rate": 1.4630010669839871e-05, + "loss": 0.1805, + "step": 8881 + }, + { + "epoch": 0.7577205255075926, + "grad_norm": 1.9116664291598975, + "learning_rate": 1.4620247197851128e-05, + "loss": 0.2199, + "step": 8882 + }, + { + "epoch": 0.7578058351817096, + "grad_norm": 1.7988086413592637, + "learning_rate": 1.4610486426918502e-05, + "loss": 0.1147, + "step": 8883 + }, + { + "epoch": 0.7578911448558266, + "grad_norm": 1.9661244641757478, + "learning_rate": 1.4600728357787175e-05, + "loss": 0.19, + "step": 8884 + }, + { + "epoch": 0.7579764545299437, + "grad_norm": 1.785712424686007, + "learning_rate": 1.4590972991202129e-05, + "loss": 0.1832, + "step": 8885 + }, + { + "epoch": 0.7580617642040608, + "grad_norm": 1.5332345364019566, + "learning_rate": 1.458122032790813e-05, + "loss": 0.1663, + "step": 8886 + }, + { + "epoch": 0.7581470738781778, + "grad_norm": 1.564434456207861, + "learning_rate": 1.4571470368649742e-05, + "loss": 0.1924, + "step": 8887 + }, + { + "epoch": 0.7582323835522948, + "grad_norm": 1.5258267959851917, + "learning_rate": 1.4561723114171311e-05, + "loss": 0.1868, + "step": 8888 + }, + { + "epoch": 0.7583176932264118, + "grad_norm": 1.9714843895120244, + "learning_rate": 1.4551978565217017e-05, + "loss": 0.2227, + "step": 8889 + }, + { + "epoch": 0.7584030029005289, + "grad_norm": 1.9382986453104887, + "learning_rate": 1.454223672253079e-05, + "loss": 0.2346, + "step": 8890 + }, + { + "epoch": 0.758488312574646, + "grad_norm": 1.731082712622558, + "learning_rate": 1.4532497586856364e-05, + "loss": 0.1732, + "step": 8891 + }, + { + "epoch": 0.758573622248763, + "grad_norm": 1.801468393294702, + "learning_rate": 1.452276115893726e-05, + "loss": 0.2279, + "step": 8892 + }, + { + "epoch": 0.75865893192288, + "grad_norm": 1.63711288185991, + "learning_rate": 1.4513027439516847e-05, + "loss": 0.1864, + "step": 8893 + }, + { + "epoch": 0.758744241596997, + "grad_norm": 1.2793351728225002, + "learning_rate": 1.4503296429338181e-05, + "loss": 0.2297, + "step": 8894 + }, + { + "epoch": 0.7588295512711142, + "grad_norm": 1.8432259542267087, + "learning_rate": 1.4493568129144214e-05, + "loss": 0.1459, + "step": 8895 + }, + { + "epoch": 0.7589148609452312, + "grad_norm": 2.0779779138355114, + "learning_rate": 1.4483842539677644e-05, + "loss": 0.239, + "step": 8896 + }, + { + "epoch": 0.7590001706193482, + "grad_norm": 2.034055679504428, + "learning_rate": 1.4474119661680962e-05, + "loss": 0.2081, + "step": 8897 + }, + { + "epoch": 0.7590854802934652, + "grad_norm": 1.8844404977831108, + "learning_rate": 1.4464399495896458e-05, + "loss": 0.1485, + "step": 8898 + }, + { + "epoch": 0.7591707899675824, + "grad_norm": 2.0827575905294893, + "learning_rate": 1.445468204306622e-05, + "loss": 0.1482, + "step": 8899 + }, + { + "epoch": 0.7592560996416994, + "grad_norm": 1.8831769028685432, + "learning_rate": 1.4444967303932116e-05, + "loss": 0.1995, + "step": 8900 + }, + { + "epoch": 0.7593414093158164, + "grad_norm": 1.7134892495282548, + "learning_rate": 1.443525527923581e-05, + "loss": 0.0857, + "step": 8901 + }, + { + "epoch": 0.7594267189899334, + "grad_norm": 1.5661231186994222, + "learning_rate": 1.4425545969718801e-05, + "loss": 0.1883, + "step": 8902 + }, + { + "epoch": 0.7595120286640505, + "grad_norm": 1.7140552558850168, + "learning_rate": 1.4415839376122282e-05, + "loss": 0.1967, + "step": 8903 + }, + { + "epoch": 0.7595973383381676, + "grad_norm": 1.6121182071045468, + "learning_rate": 1.4406135499187346e-05, + "loss": 0.1909, + "step": 8904 + }, + { + "epoch": 0.7596826480122846, + "grad_norm": 1.649615332529239, + "learning_rate": 1.4396434339654824e-05, + "loss": 0.1924, + "step": 8905 + }, + { + "epoch": 0.7597679576864016, + "grad_norm": 1.805430469907701, + "learning_rate": 1.4386735898265342e-05, + "loss": 0.1784, + "step": 8906 + }, + { + "epoch": 0.7598532673605187, + "grad_norm": 1.9044914263599648, + "learning_rate": 1.437704017575932e-05, + "loss": 0.2476, + "step": 8907 + }, + { + "epoch": 0.7599385770346357, + "grad_norm": 1.626160720614669, + "learning_rate": 1.4367347172876972e-05, + "loss": 0.1862, + "step": 8908 + }, + { + "epoch": 0.7600238867087528, + "grad_norm": 1.8487973066961527, + "learning_rate": 1.435765689035834e-05, + "loss": 0.1788, + "step": 8909 + }, + { + "epoch": 0.7601091963828698, + "grad_norm": 1.457929482626734, + "learning_rate": 1.4347969328943179e-05, + "loss": 0.1608, + "step": 8910 + }, + { + "epoch": 0.7601945060569869, + "grad_norm": 1.7702427795633546, + "learning_rate": 1.4338284489371113e-05, + "loss": 0.1645, + "step": 8911 + }, + { + "epoch": 0.7602798157311039, + "grad_norm": 1.6183303996150216, + "learning_rate": 1.4328602372381522e-05, + "loss": 0.2018, + "step": 8912 + }, + { + "epoch": 0.7603651254052209, + "grad_norm": 1.452833987393235, + "learning_rate": 1.4318922978713583e-05, + "loss": 0.1617, + "step": 8913 + }, + { + "epoch": 0.760450435079338, + "grad_norm": 1.435335603075821, + "learning_rate": 1.430924630910625e-05, + "loss": 0.1974, + "step": 8914 + }, + { + "epoch": 0.7605357447534551, + "grad_norm": 2.412565884654061, + "learning_rate": 1.429957236429833e-05, + "loss": 0.1948, + "step": 8915 + }, + { + "epoch": 0.7606210544275721, + "grad_norm": 2.2631963933193884, + "learning_rate": 1.4289901145028317e-05, + "loss": 0.2065, + "step": 8916 + }, + { + "epoch": 0.7607063641016891, + "grad_norm": 2.0676883981610112, + "learning_rate": 1.428023265203461e-05, + "loss": 0.2426, + "step": 8917 + }, + { + "epoch": 0.7607916737758061, + "grad_norm": 2.0079257799543226, + "learning_rate": 1.427056688605532e-05, + "loss": 0.274, + "step": 8918 + }, + { + "epoch": 0.7608769834499233, + "grad_norm": 1.8213977116553752, + "learning_rate": 1.4260903847828383e-05, + "loss": 0.2001, + "step": 8919 + }, + { + "epoch": 0.7609622931240403, + "grad_norm": 2.1422529504127765, + "learning_rate": 1.425124353809152e-05, + "loss": 0.192, + "step": 8920 + }, + { + "epoch": 0.7610476027981573, + "grad_norm": 1.8263904787482952, + "learning_rate": 1.4241585957582226e-05, + "loss": 0.2381, + "step": 8921 + }, + { + "epoch": 0.7611329124722743, + "grad_norm": 1.5282604071725956, + "learning_rate": 1.4231931107037861e-05, + "loss": 0.1951, + "step": 8922 + }, + { + "epoch": 0.7612182221463915, + "grad_norm": 1.960987895910713, + "learning_rate": 1.4222278987195447e-05, + "loss": 0.1745, + "step": 8923 + }, + { + "epoch": 0.7613035318205085, + "grad_norm": 1.4998955690270774, + "learning_rate": 1.4212629598791932e-05, + "loss": 0.1068, + "step": 8924 + }, + { + "epoch": 0.7613888414946255, + "grad_norm": 1.5927486639943804, + "learning_rate": 1.420298294256397e-05, + "loss": 0.2243, + "step": 8925 + }, + { + "epoch": 0.7614741511687425, + "grad_norm": 1.5333510017759198, + "learning_rate": 1.4193339019248036e-05, + "loss": 0.1615, + "step": 8926 + }, + { + "epoch": 0.7615594608428595, + "grad_norm": 2.3755452383491584, + "learning_rate": 1.4183697829580389e-05, + "loss": 0.2001, + "step": 8927 + }, + { + "epoch": 0.7616447705169767, + "grad_norm": 1.6271298462507713, + "learning_rate": 1.4174059374297088e-05, + "loss": 0.2064, + "step": 8928 + }, + { + "epoch": 0.7617300801910937, + "grad_norm": 1.6804782159557314, + "learning_rate": 1.4164423654133974e-05, + "loss": 0.1921, + "step": 8929 + }, + { + "epoch": 0.7618153898652107, + "grad_norm": 1.652136823534661, + "learning_rate": 1.4154790669826668e-05, + "loss": 0.1823, + "step": 8930 + }, + { + "epoch": 0.7619006995393277, + "grad_norm": 1.6652239117974854, + "learning_rate": 1.4145160422110643e-05, + "loss": 0.2076, + "step": 8931 + }, + { + "epoch": 0.7619860092134448, + "grad_norm": 1.5011620788492819, + "learning_rate": 1.4135532911721061e-05, + "loss": 0.2002, + "step": 8932 + }, + { + "epoch": 0.7620713188875619, + "grad_norm": 1.358649400006739, + "learning_rate": 1.4125908139392968e-05, + "loss": 0.165, + "step": 8933 + }, + { + "epoch": 0.7621566285616789, + "grad_norm": 1.7430653092650548, + "learning_rate": 1.4116286105861137e-05, + "loss": 0.2356, + "step": 8934 + }, + { + "epoch": 0.7622419382357959, + "grad_norm": 1.8531011312919243, + "learning_rate": 1.4106666811860214e-05, + "loss": 0.2131, + "step": 8935 + }, + { + "epoch": 0.762327247909913, + "grad_norm": 1.515030577196813, + "learning_rate": 1.4097050258124506e-05, + "loss": 0.1966, + "step": 8936 + }, + { + "epoch": 0.76241255758403, + "grad_norm": 1.6778167953052558, + "learning_rate": 1.4087436445388242e-05, + "loss": 0.2025, + "step": 8937 + }, + { + "epoch": 0.762497867258147, + "grad_norm": 2.0301781981103932, + "learning_rate": 1.4077825374385362e-05, + "loss": 0.2365, + "step": 8938 + }, + { + "epoch": 0.7625831769322641, + "grad_norm": 1.5811373975866225, + "learning_rate": 1.406821704584963e-05, + "loss": 0.1964, + "step": 8939 + }, + { + "epoch": 0.7626684866063812, + "grad_norm": 2.232006605077015, + "learning_rate": 1.4058611460514581e-05, + "loss": 0.2049, + "step": 8940 + }, + { + "epoch": 0.7627537962804982, + "grad_norm": 2.17968771876399, + "learning_rate": 1.4049008619113558e-05, + "loss": 0.2128, + "step": 8941 + }, + { + "epoch": 0.7628391059546152, + "grad_norm": 1.5431054163416682, + "learning_rate": 1.4039408522379683e-05, + "loss": 0.1736, + "step": 8942 + }, + { + "epoch": 0.7629244156287323, + "grad_norm": 2.195842963914786, + "learning_rate": 1.4029811171045853e-05, + "loss": 0.1951, + "step": 8943 + }, + { + "epoch": 0.7630097253028494, + "grad_norm": 1.5027764374230863, + "learning_rate": 1.402021656584483e-05, + "loss": 0.1688, + "step": 8944 + }, + { + "epoch": 0.7630950349769664, + "grad_norm": 1.7678183376572592, + "learning_rate": 1.401062470750904e-05, + "loss": 0.2178, + "step": 8945 + }, + { + "epoch": 0.7631803446510834, + "grad_norm": 2.0779857158592168, + "learning_rate": 1.4001035596770828e-05, + "loss": 0.1701, + "step": 8946 + }, + { + "epoch": 0.7632656543252004, + "grad_norm": 1.6303236395506673, + "learning_rate": 1.3991449234362246e-05, + "loss": 0.1448, + "step": 8947 + }, + { + "epoch": 0.7633509639993176, + "grad_norm": 1.5554117787234356, + "learning_rate": 1.3981865621015167e-05, + "loss": 0.1746, + "step": 8948 + }, + { + "epoch": 0.7634362736734346, + "grad_norm": 1.7935089427851598, + "learning_rate": 1.3972284757461246e-05, + "loss": 0.245, + "step": 8949 + }, + { + "epoch": 0.7635215833475516, + "grad_norm": 1.6497922997807244, + "learning_rate": 1.3962706644431922e-05, + "loss": 0.1799, + "step": 8950 + }, + { + "epoch": 0.7636068930216686, + "grad_norm": 1.6500826034676572, + "learning_rate": 1.395313128265847e-05, + "loss": 0.1864, + "step": 8951 + }, + { + "epoch": 0.7636922026957857, + "grad_norm": 2.25884774837296, + "learning_rate": 1.3943558672871858e-05, + "loss": 0.1024, + "step": 8952 + }, + { + "epoch": 0.7637775123699028, + "grad_norm": 1.6391424837770519, + "learning_rate": 1.3933988815802962e-05, + "loss": 0.1216, + "step": 8953 + }, + { + "epoch": 0.7638628220440198, + "grad_norm": 1.918370595195056, + "learning_rate": 1.3924421712182362e-05, + "loss": 0.1831, + "step": 8954 + }, + { + "epoch": 0.7639481317181368, + "grad_norm": 2.6378777111476808, + "learning_rate": 1.3914857362740457e-05, + "loss": 0.2424, + "step": 8955 + }, + { + "epoch": 0.7640334413922539, + "grad_norm": 1.9740390274611335, + "learning_rate": 1.3905295768207421e-05, + "loss": 0.2248, + "step": 8956 + }, + { + "epoch": 0.764118751066371, + "grad_norm": 2.378720431281859, + "learning_rate": 1.3895736929313274e-05, + "loss": 0.1481, + "step": 8957 + }, + { + "epoch": 0.764204060740488, + "grad_norm": 1.8613017381556989, + "learning_rate": 1.3886180846787727e-05, + "loss": 0.2323, + "step": 8958 + }, + { + "epoch": 0.764289370414605, + "grad_norm": 1.8229237438246177, + "learning_rate": 1.387662752136038e-05, + "loss": 0.1861, + "step": 8959 + }, + { + "epoch": 0.7643746800887221, + "grad_norm": 2.101197278361552, + "learning_rate": 1.386707695376056e-05, + "loss": 0.2062, + "step": 8960 + }, + { + "epoch": 0.7644599897628391, + "grad_norm": 1.609936218183016, + "learning_rate": 1.3857529144717407e-05, + "loss": 0.239, + "step": 8961 + }, + { + "epoch": 0.7645452994369561, + "grad_norm": 1.5107690948082566, + "learning_rate": 1.3847984094959842e-05, + "loss": 0.1742, + "step": 8962 + }, + { + "epoch": 0.7646306091110732, + "grad_norm": 2.1965576116247267, + "learning_rate": 1.3838441805216562e-05, + "loss": 0.1985, + "step": 8963 + }, + { + "epoch": 0.7647159187851902, + "grad_norm": 2.0440712865801873, + "learning_rate": 1.382890227621612e-05, + "loss": 0.2232, + "step": 8964 + }, + { + "epoch": 0.7648012284593073, + "grad_norm": 1.6744187157745472, + "learning_rate": 1.3819365508686749e-05, + "loss": 0.2235, + "step": 8965 + }, + { + "epoch": 0.7648865381334243, + "grad_norm": 1.768012668874928, + "learning_rate": 1.3809831503356569e-05, + "loss": 0.1655, + "step": 8966 + }, + { + "epoch": 0.7649718478075413, + "grad_norm": 2.2924190557701927, + "learning_rate": 1.3800300260953442e-05, + "loss": 0.2254, + "step": 8967 + }, + { + "epoch": 0.7650571574816584, + "grad_norm": 1.8335069516364586, + "learning_rate": 1.3790771782205026e-05, + "loss": 0.2027, + "step": 8968 + }, + { + "epoch": 0.7651424671557755, + "grad_norm": 2.3294921547486793, + "learning_rate": 1.3781246067838766e-05, + "loss": 0.2122, + "step": 8969 + }, + { + "epoch": 0.7652277768298925, + "grad_norm": 1.6420466485128655, + "learning_rate": 1.3771723118581897e-05, + "loss": 0.2175, + "step": 8970 + }, + { + "epoch": 0.7653130865040095, + "grad_norm": 1.9837947207279378, + "learning_rate": 1.3762202935161461e-05, + "loss": 0.2525, + "step": 8971 + }, + { + "epoch": 0.7653983961781265, + "grad_norm": 2.7365518678600416, + "learning_rate": 1.3752685518304243e-05, + "loss": 0.2776, + "step": 8972 + }, + { + "epoch": 0.7654837058522437, + "grad_norm": 1.4255246284201124, + "learning_rate": 1.3743170868736898e-05, + "loss": 0.1778, + "step": 8973 + }, + { + "epoch": 0.7655690155263607, + "grad_norm": 1.7500643718324338, + "learning_rate": 1.373365898718576e-05, + "loss": 0.212, + "step": 8974 + }, + { + "epoch": 0.7656543252004777, + "grad_norm": 1.541577448282354, + "learning_rate": 1.3724149874377057e-05, + "loss": 0.1494, + "step": 8975 + }, + { + "epoch": 0.7657396348745947, + "grad_norm": 1.564410910352342, + "learning_rate": 1.3714643531036735e-05, + "loss": 0.1379, + "step": 8976 + }, + { + "epoch": 0.7658249445487119, + "grad_norm": 1.1097433526376492, + "learning_rate": 1.3705139957890561e-05, + "loss": 0.1047, + "step": 8977 + }, + { + "epoch": 0.7659102542228289, + "grad_norm": 1.2988892505000944, + "learning_rate": 1.369563915566407e-05, + "loss": 0.186, + "step": 8978 + }, + { + "epoch": 0.7659955638969459, + "grad_norm": 1.9548140889624277, + "learning_rate": 1.3686141125082619e-05, + "loss": 0.1967, + "step": 8979 + }, + { + "epoch": 0.7660808735710629, + "grad_norm": 1.2740823248190283, + "learning_rate": 1.3676645866871323e-05, + "loss": 0.2133, + "step": 8980 + }, + { + "epoch": 0.76616618324518, + "grad_norm": 1.649937241256603, + "learning_rate": 1.3667153381755093e-05, + "loss": 0.1758, + "step": 8981 + }, + { + "epoch": 0.7662514929192971, + "grad_norm": 1.5093050670021353, + "learning_rate": 1.3657663670458631e-05, + "loss": 0.1872, + "step": 8982 + }, + { + "epoch": 0.7663368025934141, + "grad_norm": 1.6155202628741863, + "learning_rate": 1.3648176733706419e-05, + "loss": 0.2538, + "step": 8983 + }, + { + "epoch": 0.7664221122675311, + "grad_norm": 1.670401521028725, + "learning_rate": 1.3638692572222744e-05, + "loss": 0.1948, + "step": 8984 + }, + { + "epoch": 0.7665074219416482, + "grad_norm": 1.827192150023278, + "learning_rate": 1.3629211186731656e-05, + "loss": 0.2365, + "step": 8985 + }, + { + "epoch": 0.7665927316157652, + "grad_norm": 2.64723723003576, + "learning_rate": 1.3619732577957045e-05, + "loss": 0.1757, + "step": 8986 + }, + { + "epoch": 0.7666780412898823, + "grad_norm": 2.177838816511811, + "learning_rate": 1.3610256746622501e-05, + "loss": 0.2498, + "step": 8987 + }, + { + "epoch": 0.7667633509639993, + "grad_norm": 2.389076279774477, + "learning_rate": 1.3600783693451492e-05, + "loss": 0.2203, + "step": 8988 + }, + { + "epoch": 0.7668486606381164, + "grad_norm": 2.043917900444767, + "learning_rate": 1.3591313419167222e-05, + "loss": 0.144, + "step": 8989 + }, + { + "epoch": 0.7669339703122334, + "grad_norm": 1.7189121516639352, + "learning_rate": 1.3581845924492698e-05, + "loss": 0.24, + "step": 8990 + }, + { + "epoch": 0.7670192799863504, + "grad_norm": 2.248900038679391, + "learning_rate": 1.357238121015071e-05, + "loss": 0.1279, + "step": 8991 + }, + { + "epoch": 0.7671045896604675, + "grad_norm": 1.4894340635971324, + "learning_rate": 1.3562919276863844e-05, + "loss": 0.1683, + "step": 8992 + }, + { + "epoch": 0.7671898993345846, + "grad_norm": 1.5438856273795274, + "learning_rate": 1.3553460125354461e-05, + "loss": 0.1979, + "step": 8993 + }, + { + "epoch": 0.7672752090087016, + "grad_norm": 1.598334208233978, + "learning_rate": 1.3544003756344708e-05, + "loss": 0.129, + "step": 8994 + }, + { + "epoch": 0.7673605186828186, + "grad_norm": 1.4751280421915873, + "learning_rate": 1.3534550170556554e-05, + "loss": 0.205, + "step": 8995 + }, + { + "epoch": 0.7674458283569356, + "grad_norm": 1.4004801318561488, + "learning_rate": 1.3525099368711718e-05, + "loss": 0.1898, + "step": 8996 + }, + { + "epoch": 0.7675311380310528, + "grad_norm": 1.5392105883307496, + "learning_rate": 1.3515651351531722e-05, + "loss": 0.1854, + "step": 8997 + }, + { + "epoch": 0.7676164477051698, + "grad_norm": 1.6283889790804136, + "learning_rate": 1.3506206119737847e-05, + "loss": 0.1781, + "step": 8998 + }, + { + "epoch": 0.7677017573792868, + "grad_norm": 1.4965706406326313, + "learning_rate": 1.3496763674051238e-05, + "loss": 0.2106, + "step": 8999 + }, + { + "epoch": 0.7677870670534038, + "grad_norm": 2.256875447579833, + "learning_rate": 1.3487324015192721e-05, + "loss": 0.1785, + "step": 9000 + }, + { + "epoch": 0.767872376727521, + "grad_norm": 2.620045391155483, + "learning_rate": 1.3477887143882994e-05, + "loss": 0.2103, + "step": 9001 + }, + { + "epoch": 0.767957686401638, + "grad_norm": 1.6329662487889447, + "learning_rate": 1.3468453060842512e-05, + "loss": 0.1942, + "step": 9002 + }, + { + "epoch": 0.768042996075755, + "grad_norm": 1.847425450432741, + "learning_rate": 1.3459021766791502e-05, + "loss": 0.1351, + "step": 9003 + }, + { + "epoch": 0.768128305749872, + "grad_norm": 1.6723926491490737, + "learning_rate": 1.3449593262450011e-05, + "loss": 0.2175, + "step": 9004 + }, + { + "epoch": 0.768213615423989, + "grad_norm": 2.200308656714883, + "learning_rate": 1.344016754853784e-05, + "loss": 0.2131, + "step": 9005 + }, + { + "epoch": 0.7682989250981062, + "grad_norm": 1.6098386226358765, + "learning_rate": 1.34307446257746e-05, + "loss": 0.1474, + "step": 9006 + }, + { + "epoch": 0.7683842347722232, + "grad_norm": 1.8818176933793886, + "learning_rate": 1.342132449487966e-05, + "loss": 0.1443, + "step": 9007 + }, + { + "epoch": 0.7684695444463402, + "grad_norm": 2.215104345098459, + "learning_rate": 1.3411907156572235e-05, + "loss": 0.2065, + "step": 9008 + }, + { + "epoch": 0.7685548541204572, + "grad_norm": 1.3985455007778516, + "learning_rate": 1.3402492611571272e-05, + "loss": 0.1788, + "step": 9009 + }, + { + "epoch": 0.7686401637945743, + "grad_norm": 2.0386501786933495, + "learning_rate": 1.3393080860595514e-05, + "loss": 0.2196, + "step": 9010 + }, + { + "epoch": 0.7687254734686914, + "grad_norm": 1.3766250544178869, + "learning_rate": 1.3383671904363504e-05, + "loss": 0.1695, + "step": 9011 + }, + { + "epoch": 0.7688107831428084, + "grad_norm": 2.01489717812789, + "learning_rate": 1.3374265743593568e-05, + "loss": 0.1564, + "step": 9012 + }, + { + "epoch": 0.7688960928169254, + "grad_norm": 1.448338408305792, + "learning_rate": 1.3364862379003812e-05, + "loss": 0.1917, + "step": 9013 + }, + { + "epoch": 0.7689814024910425, + "grad_norm": 1.947051592268192, + "learning_rate": 1.335546181131212e-05, + "loss": 0.2067, + "step": 9014 + }, + { + "epoch": 0.7690667121651595, + "grad_norm": 1.62965957456749, + "learning_rate": 1.3346064041236217e-05, + "loss": 0.2096, + "step": 9015 + }, + { + "epoch": 0.7691520218392766, + "grad_norm": 1.7590450458538278, + "learning_rate": 1.3336669069493518e-05, + "loss": 0.2656, + "step": 9016 + }, + { + "epoch": 0.7692373315133936, + "grad_norm": 1.9411603745049866, + "learning_rate": 1.3327276896801321e-05, + "loss": 0.1168, + "step": 9017 + }, + { + "epoch": 0.7693226411875107, + "grad_norm": 1.7072671338829104, + "learning_rate": 1.3317887523876655e-05, + "loss": 0.1218, + "step": 9018 + }, + { + "epoch": 0.7694079508616277, + "grad_norm": 1.5318448993615748, + "learning_rate": 1.3308500951436348e-05, + "loss": 0.1859, + "step": 9019 + }, + { + "epoch": 0.7694932605357447, + "grad_norm": 1.6648637159091009, + "learning_rate": 1.3299117180197002e-05, + "loss": 0.2265, + "step": 9020 + }, + { + "epoch": 0.7695785702098618, + "grad_norm": 1.7994705030690015, + "learning_rate": 1.3289736210875059e-05, + "loss": 0.1937, + "step": 9021 + }, + { + "epoch": 0.7696638798839789, + "grad_norm": 1.6758381582703157, + "learning_rate": 1.3280358044186647e-05, + "loss": 0.1867, + "step": 9022 + }, + { + "epoch": 0.7697491895580959, + "grad_norm": 1.7998011108296663, + "learning_rate": 1.3270982680847788e-05, + "loss": 0.1899, + "step": 9023 + }, + { + "epoch": 0.7698344992322129, + "grad_norm": 1.4959186023150448, + "learning_rate": 1.326161012157422e-05, + "loss": 0.1487, + "step": 9024 + }, + { + "epoch": 0.7699198089063299, + "grad_norm": 2.1806734651191153, + "learning_rate": 1.3252240367081492e-05, + "loss": 0.2139, + "step": 9025 + }, + { + "epoch": 0.7700051185804471, + "grad_norm": 1.7658218679135391, + "learning_rate": 1.3242873418084939e-05, + "loss": 0.2508, + "step": 9026 + }, + { + "epoch": 0.7700904282545641, + "grad_norm": 1.9759720836474395, + "learning_rate": 1.3233509275299654e-05, + "loss": 0.1618, + "step": 9027 + }, + { + "epoch": 0.7701757379286811, + "grad_norm": 1.867731884880383, + "learning_rate": 1.322414793944059e-05, + "loss": 0.1869, + "step": 9028 + }, + { + "epoch": 0.7702610476027981, + "grad_norm": 1.6661552916251674, + "learning_rate": 1.3214789411222372e-05, + "loss": 0.1736, + "step": 9029 + }, + { + "epoch": 0.7703463572769153, + "grad_norm": 1.8767931311272719, + "learning_rate": 1.3205433691359526e-05, + "loss": 0.1531, + "step": 9030 + }, + { + "epoch": 0.7704316669510323, + "grad_norm": 1.7244773459955358, + "learning_rate": 1.319608078056629e-05, + "loss": 0.2273, + "step": 9031 + }, + { + "epoch": 0.7705169766251493, + "grad_norm": 1.9311025273086313, + "learning_rate": 1.3186730679556708e-05, + "loss": 0.188, + "step": 9032 + }, + { + "epoch": 0.7706022862992663, + "grad_norm": 2.008514162947633, + "learning_rate": 1.3177383389044617e-05, + "loss": 0.186, + "step": 9033 + }, + { + "epoch": 0.7706875959733834, + "grad_norm": 1.5626555556112767, + "learning_rate": 1.3168038909743623e-05, + "loss": 0.1767, + "step": 9034 + }, + { + "epoch": 0.7707729056475005, + "grad_norm": 1.892416113502278, + "learning_rate": 1.3158697242367141e-05, + "loss": 0.1923, + "step": 9035 + }, + { + "epoch": 0.7708582153216175, + "grad_norm": 1.6778423731974572, + "learning_rate": 1.314935838762833e-05, + "loss": 0.1463, + "step": 9036 + }, + { + "epoch": 0.7709435249957345, + "grad_norm": 1.9067316541178045, + "learning_rate": 1.3140022346240216e-05, + "loss": 0.2257, + "step": 9037 + }, + { + "epoch": 0.7710288346698516, + "grad_norm": 1.182331028533731, + "learning_rate": 1.313068911891549e-05, + "loss": 0.176, + "step": 9038 + }, + { + "epoch": 0.7711141443439686, + "grad_norm": 1.4743998195472148, + "learning_rate": 1.3121358706366738e-05, + "loss": 0.156, + "step": 9039 + }, + { + "epoch": 0.7711994540180857, + "grad_norm": 1.6877137507943583, + "learning_rate": 1.3112031109306267e-05, + "loss": 0.2297, + "step": 9040 + }, + { + "epoch": 0.7712847636922027, + "grad_norm": 1.4121184078365576, + "learning_rate": 1.3102706328446223e-05, + "loss": 0.1751, + "step": 9041 + }, + { + "epoch": 0.7713700733663197, + "grad_norm": 2.3433117266002963, + "learning_rate": 1.309338436449845e-05, + "loss": 0.1391, + "step": 9042 + }, + { + "epoch": 0.7714553830404368, + "grad_norm": 1.6493855141336755, + "learning_rate": 1.3084065218174679e-05, + "loss": 0.1825, + "step": 9043 + }, + { + "epoch": 0.7715406927145538, + "grad_norm": 2.087561957073749, + "learning_rate": 1.307474889018635e-05, + "loss": 0.1812, + "step": 9044 + }, + { + "epoch": 0.7716260023886709, + "grad_norm": 1.8231324059117247, + "learning_rate": 1.3065435381244728e-05, + "loss": 0.1556, + "step": 9045 + }, + { + "epoch": 0.7717113120627879, + "grad_norm": 1.454679252691276, + "learning_rate": 1.3056124692060845e-05, + "loss": 0.1654, + "step": 9046 + }, + { + "epoch": 0.771796621736905, + "grad_norm": 1.9010636489503001, + "learning_rate": 1.304681682334552e-05, + "loss": 0.2277, + "step": 9047 + }, + { + "epoch": 0.771881931411022, + "grad_norm": 1.6527035729484647, + "learning_rate": 1.3037511775809364e-05, + "loss": 0.22, + "step": 9048 + }, + { + "epoch": 0.771967241085139, + "grad_norm": 2.3604341870494085, + "learning_rate": 1.3028209550162746e-05, + "loss": 0.1806, + "step": 9049 + }, + { + "epoch": 0.772052550759256, + "grad_norm": 1.7174111440181283, + "learning_rate": 1.3018910147115893e-05, + "loss": 0.1836, + "step": 9050 + }, + { + "epoch": 0.7721378604333732, + "grad_norm": 1.7274372077272808, + "learning_rate": 1.3009613567378703e-05, + "loss": 0.1329, + "step": 9051 + }, + { + "epoch": 0.7722231701074902, + "grad_norm": 1.5136508472581764, + "learning_rate": 1.3000319811660967e-05, + "loss": 0.1319, + "step": 9052 + }, + { + "epoch": 0.7723084797816072, + "grad_norm": 2.5202128589891486, + "learning_rate": 1.2991028880672196e-05, + "loss": 0.1859, + "step": 9053 + }, + { + "epoch": 0.7723937894557242, + "grad_norm": 2.1869863724565244, + "learning_rate": 1.2981740775121704e-05, + "loss": 0.2267, + "step": 9054 + }, + { + "epoch": 0.7724790991298414, + "grad_norm": 2.112823718847611, + "learning_rate": 1.2972455495718589e-05, + "loss": 0.1994, + "step": 9055 + }, + { + "epoch": 0.7725644088039584, + "grad_norm": 1.8799114750249981, + "learning_rate": 1.2963173043171717e-05, + "loss": 0.2074, + "step": 9056 + }, + { + "epoch": 0.7726497184780754, + "grad_norm": 1.579194112789153, + "learning_rate": 1.2953893418189806e-05, + "loss": 0.2039, + "step": 9057 + }, + { + "epoch": 0.7727350281521924, + "grad_norm": 1.3839489851821603, + "learning_rate": 1.2944616621481231e-05, + "loss": 0.173, + "step": 9058 + }, + { + "epoch": 0.7728203378263095, + "grad_norm": 1.491314138102784, + "learning_rate": 1.2935342653754284e-05, + "loss": 0.1574, + "step": 9059 + }, + { + "epoch": 0.7729056475004266, + "grad_norm": 1.4425754508875077, + "learning_rate": 1.2926071515716964e-05, + "loss": 0.1764, + "step": 9060 + }, + { + "epoch": 0.7729909571745436, + "grad_norm": 1.6389619663706272, + "learning_rate": 1.2916803208077072e-05, + "loss": 0.1741, + "step": 9061 + }, + { + "epoch": 0.7730762668486606, + "grad_norm": 1.8831094847912628, + "learning_rate": 1.290753773154218e-05, + "loss": 0.2272, + "step": 9062 + }, + { + "epoch": 0.7731615765227777, + "grad_norm": 1.255247118127763, + "learning_rate": 1.2898275086819706e-05, + "loss": 0.0881, + "step": 9063 + }, + { + "epoch": 0.7732468861968947, + "grad_norm": 1.4395341164566497, + "learning_rate": 1.2889015274616739e-05, + "loss": 0.176, + "step": 9064 + }, + { + "epoch": 0.7733321958710118, + "grad_norm": 1.5188823971907672, + "learning_rate": 1.287975829564026e-05, + "loss": 0.2279, + "step": 9065 + }, + { + "epoch": 0.7734175055451288, + "grad_norm": 1.7638502400420124, + "learning_rate": 1.2870504150596985e-05, + "loss": 0.2276, + "step": 9066 + }, + { + "epoch": 0.7735028152192459, + "grad_norm": 1.6483732816652932, + "learning_rate": 1.2861252840193406e-05, + "loss": 0.1517, + "step": 9067 + }, + { + "epoch": 0.7735881248933629, + "grad_norm": 2.2394962974018644, + "learning_rate": 1.2852004365135823e-05, + "loss": 0.1844, + "step": 9068 + }, + { + "epoch": 0.77367343456748, + "grad_norm": 1.6651470250005445, + "learning_rate": 1.2842758726130283e-05, + "loss": 0.2081, + "step": 9069 + }, + { + "epoch": 0.773758744241597, + "grad_norm": 1.3484481917186164, + "learning_rate": 1.2833515923882694e-05, + "loss": 0.171, + "step": 9070 + }, + { + "epoch": 0.7738440539157141, + "grad_norm": 2.0813584949521013, + "learning_rate": 1.2824275959098625e-05, + "loss": 0.2387, + "step": 9071 + }, + { + "epoch": 0.7739293635898311, + "grad_norm": 2.024582351189209, + "learning_rate": 1.2815038832483556e-05, + "loss": 0.1678, + "step": 9072 + }, + { + "epoch": 0.7740146732639481, + "grad_norm": 1.6792557982207934, + "learning_rate": 1.2805804544742672e-05, + "loss": 0.1975, + "step": 9073 + }, + { + "epoch": 0.7740999829380651, + "grad_norm": 1.7823721045025058, + "learning_rate": 1.279657309658096e-05, + "loss": 0.1883, + "step": 9074 + }, + { + "epoch": 0.7741852926121823, + "grad_norm": 2.32188839812283, + "learning_rate": 1.2787344488703195e-05, + "loss": 0.193, + "step": 9075 + }, + { + "epoch": 0.7742706022862993, + "grad_norm": 1.8367202713350383, + "learning_rate": 1.2778118721813925e-05, + "loss": 0.1995, + "step": 9076 + }, + { + "epoch": 0.7743559119604163, + "grad_norm": 1.9077578350111706, + "learning_rate": 1.2768895796617497e-05, + "loss": 0.1569, + "step": 9077 + }, + { + "epoch": 0.7744412216345333, + "grad_norm": 1.7646919418233122, + "learning_rate": 1.2759675713818015e-05, + "loss": 0.2266, + "step": 9078 + }, + { + "epoch": 0.7745265313086503, + "grad_norm": 1.707340587809586, + "learning_rate": 1.2750458474119426e-05, + "loss": 0.1621, + "step": 9079 + }, + { + "epoch": 0.7746118409827675, + "grad_norm": 1.902261407154948, + "learning_rate": 1.2741244078225363e-05, + "loss": 0.1718, + "step": 9080 + }, + { + "epoch": 0.7746971506568845, + "grad_norm": 1.6701337356470256, + "learning_rate": 1.2732032526839333e-05, + "loss": 0.2175, + "step": 9081 + }, + { + "epoch": 0.7747824603310015, + "grad_norm": 1.8453025100528038, + "learning_rate": 1.2722823820664575e-05, + "loss": 0.141, + "step": 9082 + }, + { + "epoch": 0.7748677700051185, + "grad_norm": 1.885788966842743, + "learning_rate": 1.2713617960404134e-05, + "loss": 0.2469, + "step": 9083 + }, + { + "epoch": 0.7749530796792357, + "grad_norm": 1.899313139992339, + "learning_rate": 1.27044149467608e-05, + "loss": 0.1929, + "step": 9084 + }, + { + "epoch": 0.7750383893533527, + "grad_norm": 1.9223480882884834, + "learning_rate": 1.2695214780437215e-05, + "loss": 0.2131, + "step": 9085 + }, + { + "epoch": 0.7751236990274697, + "grad_norm": 1.2457365761802437, + "learning_rate": 1.2686017462135747e-05, + "loss": 0.2002, + "step": 9086 + }, + { + "epoch": 0.7752090087015867, + "grad_norm": 1.6169734204997122, + "learning_rate": 1.2676822992558556e-05, + "loss": 0.1181, + "step": 9087 + }, + { + "epoch": 0.7752943183757038, + "grad_norm": 1.9881516210122268, + "learning_rate": 1.2667631372407595e-05, + "loss": 0.2197, + "step": 9088 + }, + { + "epoch": 0.7753796280498209, + "grad_norm": 1.7700571790177253, + "learning_rate": 1.2658442602384596e-05, + "loss": 0.2109, + "step": 9089 + }, + { + "epoch": 0.7754649377239379, + "grad_norm": 1.64031180162464, + "learning_rate": 1.2649256683191068e-05, + "loss": 0.1867, + "step": 9090 + }, + { + "epoch": 0.7755502473980549, + "grad_norm": 1.2475915594653835, + "learning_rate": 1.2640073615528298e-05, + "loss": 0.1817, + "step": 9091 + }, + { + "epoch": 0.775635557072172, + "grad_norm": 1.8739152949679565, + "learning_rate": 1.2630893400097404e-05, + "loss": 0.2145, + "step": 9092 + }, + { + "epoch": 0.775720866746289, + "grad_norm": 2.2409042248258912, + "learning_rate": 1.2621716037599196e-05, + "loss": 0.1925, + "step": 9093 + }, + { + "epoch": 0.7758061764204061, + "grad_norm": 1.8125130225404347, + "learning_rate": 1.2612541528734351e-05, + "loss": 0.2252, + "step": 9094 + }, + { + "epoch": 0.7758914860945231, + "grad_norm": 1.3430835934412078, + "learning_rate": 1.2603369874203286e-05, + "loss": 0.1615, + "step": 9095 + }, + { + "epoch": 0.7759767957686402, + "grad_norm": 1.873438502829238, + "learning_rate": 1.2594201074706202e-05, + "loss": 0.2365, + "step": 9096 + }, + { + "epoch": 0.7760621054427572, + "grad_norm": 2.874981424022869, + "learning_rate": 1.2585035130943096e-05, + "loss": 0.2271, + "step": 9097 + }, + { + "epoch": 0.7761474151168742, + "grad_norm": 1.989271475698659, + "learning_rate": 1.2575872043613717e-05, + "loss": 0.2332, + "step": 9098 + }, + { + "epoch": 0.7762327247909913, + "grad_norm": 1.757339617382694, + "learning_rate": 1.2566711813417665e-05, + "loss": 0.1671, + "step": 9099 + }, + { + "epoch": 0.7763180344651084, + "grad_norm": 1.3967492339250065, + "learning_rate": 1.255755444105422e-05, + "loss": 0.1621, + "step": 9100 + }, + { + "epoch": 0.7764033441392254, + "grad_norm": 1.5090859525354718, + "learning_rate": 1.2548399927222538e-05, + "loss": 0.2127, + "step": 9101 + }, + { + "epoch": 0.7764886538133424, + "grad_norm": 2.0125761408648173, + "learning_rate": 1.2539248272621501e-05, + "loss": 0.2318, + "step": 9102 + }, + { + "epoch": 0.7765739634874594, + "grad_norm": 2.263786043146637, + "learning_rate": 1.2530099477949792e-05, + "loss": 0.1925, + "step": 9103 + }, + { + "epoch": 0.7766592731615766, + "grad_norm": 1.6020626124648851, + "learning_rate": 1.2520953543905862e-05, + "loss": 0.1869, + "step": 9104 + }, + { + "epoch": 0.7767445828356936, + "grad_norm": 1.3517326732955102, + "learning_rate": 1.251181047118799e-05, + "loss": 0.1883, + "step": 9105 + }, + { + "epoch": 0.7768298925098106, + "grad_norm": 1.3148460764419545, + "learning_rate": 1.2502670260494154e-05, + "loss": 0.1731, + "step": 9106 + }, + { + "epoch": 0.7769152021839276, + "grad_norm": 2.2405930010054895, + "learning_rate": 1.2493532912522193e-05, + "loss": 0.1845, + "step": 9107 + }, + { + "epoch": 0.7770005118580448, + "grad_norm": 2.031655139134129, + "learning_rate": 1.2484398427969685e-05, + "loss": 0.1864, + "step": 9108 + }, + { + "epoch": 0.7770858215321618, + "grad_norm": 2.077602578946152, + "learning_rate": 1.2475266807534003e-05, + "loss": 0.2359, + "step": 9109 + }, + { + "epoch": 0.7771711312062788, + "grad_norm": 1.9963948420252173, + "learning_rate": 1.2466138051912291e-05, + "loss": 0.1679, + "step": 9110 + }, + { + "epoch": 0.7772564408803958, + "grad_norm": 2.1087075767026042, + "learning_rate": 1.2457012161801484e-05, + "loss": 0.1993, + "step": 9111 + }, + { + "epoch": 0.7773417505545129, + "grad_norm": 1.3717763565504209, + "learning_rate": 1.2447889137898293e-05, + "loss": 0.1435, + "step": 9112 + }, + { + "epoch": 0.77742706022863, + "grad_norm": 1.7090672412945354, + "learning_rate": 1.2438768980899208e-05, + "loss": 0.1722, + "step": 9113 + }, + { + "epoch": 0.777512369902747, + "grad_norm": 1.4323320787681912, + "learning_rate": 1.2429651691500515e-05, + "loss": 0.1215, + "step": 9114 + }, + { + "epoch": 0.777597679576864, + "grad_norm": 1.5456622986201523, + "learning_rate": 1.2420537270398276e-05, + "loss": 0.2709, + "step": 9115 + }, + { + "epoch": 0.7776829892509811, + "grad_norm": 1.6851409387931924, + "learning_rate": 1.2411425718288311e-05, + "loss": 0.1329, + "step": 9116 + }, + { + "epoch": 0.7777682989250981, + "grad_norm": 1.8247847077350632, + "learning_rate": 1.2402317035866251e-05, + "loss": 0.1953, + "step": 9117 + }, + { + "epoch": 0.7778536085992152, + "grad_norm": 1.558412393138611, + "learning_rate": 1.2393211223827494e-05, + "loss": 0.1205, + "step": 9118 + }, + { + "epoch": 0.7779389182733322, + "grad_norm": 1.7793042447621112, + "learning_rate": 1.2384108282867218e-05, + "loss": 0.183, + "step": 9119 + }, + { + "epoch": 0.7780242279474492, + "grad_norm": 1.6103246905211315, + "learning_rate": 1.2375008213680367e-05, + "loss": 0.1713, + "step": 9120 + }, + { + "epoch": 0.7781095376215663, + "grad_norm": 1.8749092715882607, + "learning_rate": 1.236591101696173e-05, + "loss": 0.1657, + "step": 9121 + }, + { + "epoch": 0.7781948472956833, + "grad_norm": 2.0985975941524897, + "learning_rate": 1.2356816693405766e-05, + "loss": 0.159, + "step": 9122 + }, + { + "epoch": 0.7782801569698004, + "grad_norm": 2.1329702242146924, + "learning_rate": 1.2347725243706831e-05, + "loss": 0.2069, + "step": 9123 + }, + { + "epoch": 0.7783654666439174, + "grad_norm": 1.4384081086796885, + "learning_rate": 1.233863666855899e-05, + "loss": 0.1579, + "step": 9124 + }, + { + "epoch": 0.7784507763180345, + "grad_norm": 1.9208669112022723, + "learning_rate": 1.23295509686561e-05, + "loss": 0.2027, + "step": 9125 + }, + { + "epoch": 0.7785360859921515, + "grad_norm": 1.6970742911249255, + "learning_rate": 1.2320468144691805e-05, + "loss": 0.1955, + "step": 9126 + }, + { + "epoch": 0.7786213956662685, + "grad_norm": 2.2828386601614308, + "learning_rate": 1.2311388197359569e-05, + "loss": 0.1959, + "step": 9127 + }, + { + "epoch": 0.7787067053403856, + "grad_norm": 1.9509224641566674, + "learning_rate": 1.2302311127352534e-05, + "loss": 0.1651, + "step": 9128 + }, + { + "epoch": 0.7787920150145027, + "grad_norm": 2.1846707167081028, + "learning_rate": 1.2293236935363733e-05, + "loss": 0.1935, + "step": 9129 + }, + { + "epoch": 0.7788773246886197, + "grad_norm": 1.5758136403618261, + "learning_rate": 1.2284165622085924e-05, + "loss": 0.1702, + "step": 9130 + }, + { + "epoch": 0.7789626343627367, + "grad_norm": 1.652275931767968, + "learning_rate": 1.2275097188211643e-05, + "loss": 0.1771, + "step": 9131 + }, + { + "epoch": 0.7790479440368537, + "grad_norm": 1.7402276618048826, + "learning_rate": 1.2266031634433223e-05, + "loss": 0.2439, + "step": 9132 + }, + { + "epoch": 0.7791332537109709, + "grad_norm": 1.6198831841066093, + "learning_rate": 1.2256968961442755e-05, + "loss": 0.2053, + "step": 9133 + }, + { + "epoch": 0.7792185633850879, + "grad_norm": 2.0642898191825116, + "learning_rate": 1.224790916993217e-05, + "loss": 0.2298, + "step": 9134 + }, + { + "epoch": 0.7793038730592049, + "grad_norm": 1.9049725864470455, + "learning_rate": 1.223885226059308e-05, + "loss": 0.2051, + "step": 9135 + }, + { + "epoch": 0.7793891827333219, + "grad_norm": 2.67538453662707, + "learning_rate": 1.2229798234116968e-05, + "loss": 0.2044, + "step": 9136 + }, + { + "epoch": 0.779474492407439, + "grad_norm": 1.4788892870946346, + "learning_rate": 1.222074709119505e-05, + "loss": 0.1541, + "step": 9137 + }, + { + "epoch": 0.7795598020815561, + "grad_norm": 1.3134929897032697, + "learning_rate": 1.2211698832518331e-05, + "loss": 0.1554, + "step": 9138 + }, + { + "epoch": 0.7796451117556731, + "grad_norm": 1.947159713573798, + "learning_rate": 1.2202653458777602e-05, + "loss": 0.2795, + "step": 9139 + }, + { + "epoch": 0.7797304214297901, + "grad_norm": 2.1639889040114255, + "learning_rate": 1.2193610970663427e-05, + "loss": 0.2363, + "step": 9140 + }, + { + "epoch": 0.7798157311039072, + "grad_norm": 2.417774889050578, + "learning_rate": 1.218457136886615e-05, + "loss": 0.2241, + "step": 9141 + }, + { + "epoch": 0.7799010407780242, + "grad_norm": 1.8593281972428257, + "learning_rate": 1.2175534654075888e-05, + "loss": 0.2403, + "step": 9142 + }, + { + "epoch": 0.7799863504521413, + "grad_norm": 1.5946505377191957, + "learning_rate": 1.2166500826982564e-05, + "loss": 0.1711, + "step": 9143 + }, + { + "epoch": 0.7800716601262583, + "grad_norm": 1.1894933133155836, + "learning_rate": 1.2157469888275858e-05, + "loss": 0.1521, + "step": 9144 + }, + { + "epoch": 0.7801569698003754, + "grad_norm": 1.8492471245545685, + "learning_rate": 1.214844183864523e-05, + "loss": 0.1985, + "step": 9145 + }, + { + "epoch": 0.7802422794744924, + "grad_norm": 2.0806744074548633, + "learning_rate": 1.2139416678779913e-05, + "loss": 0.2207, + "step": 9146 + }, + { + "epoch": 0.7803275891486094, + "grad_norm": 1.8167249830609362, + "learning_rate": 1.2130394409368967e-05, + "loss": 0.2015, + "step": 9147 + }, + { + "epoch": 0.7804128988227265, + "grad_norm": 1.3014857128705926, + "learning_rate": 1.212137503110114e-05, + "loss": 0.14, + "step": 9148 + }, + { + "epoch": 0.7804982084968436, + "grad_norm": 1.5045825577026724, + "learning_rate": 1.211235854466506e-05, + "loss": 0.2446, + "step": 9149 + }, + { + "epoch": 0.7805835181709606, + "grad_norm": 1.8562890739053195, + "learning_rate": 1.2103344950749069e-05, + "loss": 0.1318, + "step": 9150 + }, + { + "epoch": 0.7806688278450776, + "grad_norm": 1.422694714029898, + "learning_rate": 1.2094334250041312e-05, + "loss": 0.1334, + "step": 9151 + }, + { + "epoch": 0.7807541375191946, + "grad_norm": 1.7777865015584102, + "learning_rate": 1.20853264432297e-05, + "loss": 0.1897, + "step": 9152 + }, + { + "epoch": 0.7808394471933118, + "grad_norm": 1.5893012393542143, + "learning_rate": 1.2076321531001933e-05, + "loss": 0.1353, + "step": 9153 + }, + { + "epoch": 0.7809247568674288, + "grad_norm": 1.8624072954288953, + "learning_rate": 1.2067319514045494e-05, + "loss": 0.1965, + "step": 9154 + }, + { + "epoch": 0.7810100665415458, + "grad_norm": 1.4985764424145223, + "learning_rate": 1.2058320393047624e-05, + "loss": 0.154, + "step": 9155 + }, + { + "epoch": 0.7810953762156628, + "grad_norm": 1.7423614962466616, + "learning_rate": 1.2049324168695398e-05, + "loss": 0.2263, + "step": 9156 + }, + { + "epoch": 0.7811806858897798, + "grad_norm": 2.100285138345152, + "learning_rate": 1.2040330841675573e-05, + "loss": 0.2094, + "step": 9157 + }, + { + "epoch": 0.781265995563897, + "grad_norm": 1.893994190348141, + "learning_rate": 1.2031340412674785e-05, + "loss": 0.1682, + "step": 9158 + }, + { + "epoch": 0.781351305238014, + "grad_norm": 1.7209180162974467, + "learning_rate": 1.2022352882379389e-05, + "loss": 0.1856, + "step": 9159 + }, + { + "epoch": 0.781436614912131, + "grad_norm": 2.1832669308686374, + "learning_rate": 1.2013368251475542e-05, + "loss": 0.2822, + "step": 9160 + }, + { + "epoch": 0.781521924586248, + "grad_norm": 1.4577224360179035, + "learning_rate": 1.2004386520649164e-05, + "loss": 0.1265, + "step": 9161 + }, + { + "epoch": 0.7816072342603652, + "grad_norm": 1.7871889523992024, + "learning_rate": 1.1995407690585953e-05, + "loss": 0.1584, + "step": 9162 + }, + { + "epoch": 0.7816925439344822, + "grad_norm": 2.085108903195129, + "learning_rate": 1.198643176197144e-05, + "loss": 0.1913, + "step": 9163 + }, + { + "epoch": 0.7817778536085992, + "grad_norm": 1.5101027414598003, + "learning_rate": 1.1977458735490826e-05, + "loss": 0.2148, + "step": 9164 + }, + { + "epoch": 0.7818631632827162, + "grad_norm": 2.0660484307032774, + "learning_rate": 1.1968488611829204e-05, + "loss": 0.2042, + "step": 9165 + }, + { + "epoch": 0.7819484729568333, + "grad_norm": 1.5873504508157088, + "learning_rate": 1.1959521391671374e-05, + "loss": 0.1895, + "step": 9166 + }, + { + "epoch": 0.7820337826309504, + "grad_norm": 2.1121471132858196, + "learning_rate": 1.1950557075701941e-05, + "loss": 0.2218, + "step": 9167 + }, + { + "epoch": 0.7821190923050674, + "grad_norm": 1.9662655488676855, + "learning_rate": 1.1941595664605265e-05, + "loss": 0.2493, + "step": 9168 + }, + { + "epoch": 0.7822044019791844, + "grad_norm": 1.627162374936558, + "learning_rate": 1.1932637159065546e-05, + "loss": 0.1567, + "step": 9169 + }, + { + "epoch": 0.7822897116533015, + "grad_norm": 1.795398802380302, + "learning_rate": 1.1923681559766663e-05, + "loss": 0.185, + "step": 9170 + }, + { + "epoch": 0.7823750213274185, + "grad_norm": 1.8003200511380029, + "learning_rate": 1.1914728867392371e-05, + "loss": 0.2341, + "step": 9171 + }, + { + "epoch": 0.7824603310015356, + "grad_norm": 1.7714542272793004, + "learning_rate": 1.190577908262614e-05, + "loss": 0.2158, + "step": 9172 + }, + { + "epoch": 0.7825456406756526, + "grad_norm": 1.208565684154802, + "learning_rate": 1.1896832206151248e-05, + "loss": 0.2242, + "step": 9173 + }, + { + "epoch": 0.7826309503497697, + "grad_norm": 1.8205001144974904, + "learning_rate": 1.1887888238650736e-05, + "loss": 0.212, + "step": 9174 + }, + { + "epoch": 0.7827162600238867, + "grad_norm": 2.4283970782261606, + "learning_rate": 1.1878947180807415e-05, + "loss": 0.1837, + "step": 9175 + }, + { + "epoch": 0.7828015696980037, + "grad_norm": 1.5703862870374081, + "learning_rate": 1.187000903330393e-05, + "loss": 0.1844, + "step": 9176 + }, + { + "epoch": 0.7828868793721208, + "grad_norm": 1.9617717533706611, + "learning_rate": 1.1861073796822608e-05, + "loss": 0.217, + "step": 9177 + }, + { + "epoch": 0.7829721890462379, + "grad_norm": 1.8566407046166549, + "learning_rate": 1.185214147204564e-05, + "loss": 0.2195, + "step": 9178 + }, + { + "epoch": 0.7830574987203549, + "grad_norm": 2.062627615015046, + "learning_rate": 1.1843212059654957e-05, + "loss": 0.1938, + "step": 9179 + }, + { + "epoch": 0.7831428083944719, + "grad_norm": 1.7514843095517114, + "learning_rate": 1.1834285560332264e-05, + "loss": 0.1532, + "step": 9180 + }, + { + "epoch": 0.7832281180685889, + "grad_norm": 1.9596609390641408, + "learning_rate": 1.1825361974759063e-05, + "loss": 0.2137, + "step": 9181 + }, + { + "epoch": 0.7833134277427061, + "grad_norm": 1.2007018619619816, + "learning_rate": 1.181644130361661e-05, + "loss": 0.1476, + "step": 9182 + }, + { + "epoch": 0.7833987374168231, + "grad_norm": 1.3921365559480146, + "learning_rate": 1.1807523547585958e-05, + "loss": 0.1726, + "step": 9183 + }, + { + "epoch": 0.7834840470909401, + "grad_norm": 1.7919094305652372, + "learning_rate": 1.1798608707347913e-05, + "loss": 0.1618, + "step": 9184 + }, + { + "epoch": 0.7835693567650571, + "grad_norm": 1.4911159642521625, + "learning_rate": 1.1789696783583121e-05, + "loss": 0.1885, + "step": 9185 + }, + { + "epoch": 0.7836546664391743, + "grad_norm": 1.6940247879376549, + "learning_rate": 1.1780787776971901e-05, + "loss": 0.2407, + "step": 9186 + }, + { + "epoch": 0.7837399761132913, + "grad_norm": 1.7217405484802661, + "learning_rate": 1.1771881688194452e-05, + "loss": 0.2106, + "step": 9187 + }, + { + "epoch": 0.7838252857874083, + "grad_norm": 1.6937027421390236, + "learning_rate": 1.1762978517930678e-05, + "loss": 0.139, + "step": 9188 + }, + { + "epoch": 0.7839105954615253, + "grad_norm": 1.7438009001710189, + "learning_rate": 1.175407826686033e-05, + "loss": 0.1824, + "step": 9189 + }, + { + "epoch": 0.7839959051356424, + "grad_norm": 1.8799608724661756, + "learning_rate": 1.1745180935662842e-05, + "loss": 0.2377, + "step": 9190 + }, + { + "epoch": 0.7840812148097595, + "grad_norm": 1.6469982064937423, + "learning_rate": 1.173628652501752e-05, + "loss": 0.1278, + "step": 9191 + }, + { + "epoch": 0.7841665244838765, + "grad_norm": 2.532290021319674, + "learning_rate": 1.1727395035603384e-05, + "loss": 0.2031, + "step": 9192 + }, + { + "epoch": 0.7842518341579935, + "grad_norm": 2.2779406228302213, + "learning_rate": 1.1718506468099254e-05, + "loss": 0.2133, + "step": 9193 + }, + { + "epoch": 0.7843371438321105, + "grad_norm": 1.9681206560132376, + "learning_rate": 1.1709620823183737e-05, + "loss": 0.1715, + "step": 9194 + }, + { + "epoch": 0.7844224535062276, + "grad_norm": 1.5828971847670128, + "learning_rate": 1.1700738101535186e-05, + "loss": 0.2234, + "step": 9195 + }, + { + "epoch": 0.7845077631803447, + "grad_norm": 1.369821420015231, + "learning_rate": 1.1691858303831766e-05, + "loss": 0.1301, + "step": 9196 + }, + { + "epoch": 0.7845930728544617, + "grad_norm": 1.5412496913590938, + "learning_rate": 1.1682981430751378e-05, + "loss": 0.1628, + "step": 9197 + }, + { + "epoch": 0.7846783825285787, + "grad_norm": 1.544570439197995, + "learning_rate": 1.1674107482971769e-05, + "loss": 0.1949, + "step": 9198 + }, + { + "epoch": 0.7847636922026958, + "grad_norm": 1.804734068430725, + "learning_rate": 1.166523646117036e-05, + "loss": 0.2184, + "step": 9199 + }, + { + "epoch": 0.7848490018768128, + "grad_norm": 1.4111101300485627, + "learning_rate": 1.1656368366024451e-05, + "loss": 0.1872, + "step": 9200 + }, + { + "epoch": 0.7849343115509299, + "grad_norm": 1.7455330104061282, + "learning_rate": 1.1647503198211063e-05, + "loss": 0.214, + "step": 9201 + }, + { + "epoch": 0.7850196212250469, + "grad_norm": 1.6834825802098081, + "learning_rate": 1.1638640958406999e-05, + "loss": 0.2073, + "step": 9202 + }, + { + "epoch": 0.785104930899164, + "grad_norm": 1.727443832610548, + "learning_rate": 1.1629781647288846e-05, + "loss": 0.2362, + "step": 9203 + }, + { + "epoch": 0.785190240573281, + "grad_norm": 2.1731331014934727, + "learning_rate": 1.1620925265532951e-05, + "loss": 0.1939, + "step": 9204 + }, + { + "epoch": 0.785275550247398, + "grad_norm": 1.3695661506623327, + "learning_rate": 1.1612071813815496e-05, + "loss": 0.2155, + "step": 9205 + }, + { + "epoch": 0.785360859921515, + "grad_norm": 2.3678459032412946, + "learning_rate": 1.1603221292812332e-05, + "loss": 0.1862, + "step": 9206 + }, + { + "epoch": 0.7854461695956322, + "grad_norm": 1.5727886964727467, + "learning_rate": 1.1594373703199195e-05, + "loss": 0.168, + "step": 9207 + }, + { + "epoch": 0.7855314792697492, + "grad_norm": 1.807938722838003, + "learning_rate": 1.1585529045651544e-05, + "loss": 0.2472, + "step": 9208 + }, + { + "epoch": 0.7856167889438662, + "grad_norm": 1.5098857955186655, + "learning_rate": 1.1576687320844615e-05, + "loss": 0.1413, + "step": 9209 + }, + { + "epoch": 0.7857020986179832, + "grad_norm": 1.9930884624013856, + "learning_rate": 1.1567848529453411e-05, + "loss": 0.2053, + "step": 9210 + }, + { + "epoch": 0.7857874082921004, + "grad_norm": 1.8506267774599783, + "learning_rate": 1.1559012672152775e-05, + "loss": 0.2028, + "step": 9211 + }, + { + "epoch": 0.7858727179662174, + "grad_norm": 2.2407502675386675, + "learning_rate": 1.1550179749617219e-05, + "loss": 0.1572, + "step": 9212 + }, + { + "epoch": 0.7859580276403344, + "grad_norm": 1.7873814443473015, + "learning_rate": 1.1541349762521126e-05, + "loss": 0.2369, + "step": 9213 + }, + { + "epoch": 0.7860433373144514, + "grad_norm": 1.5270930325910037, + "learning_rate": 1.1532522711538613e-05, + "loss": 0.194, + "step": 9214 + }, + { + "epoch": 0.7861286469885685, + "grad_norm": 1.9748818712277338, + "learning_rate": 1.1523698597343575e-05, + "loss": 0.1745, + "step": 9215 + }, + { + "epoch": 0.7862139566626856, + "grad_norm": 1.9778248973595915, + "learning_rate": 1.1514877420609688e-05, + "loss": 0.2283, + "step": 9216 + }, + { + "epoch": 0.7862992663368026, + "grad_norm": 1.4530411562778112, + "learning_rate": 1.1506059182010393e-05, + "loss": 0.1854, + "step": 9217 + }, + { + "epoch": 0.7863845760109196, + "grad_norm": 1.3414005765978232, + "learning_rate": 1.1497243882218928e-05, + "loss": 0.1899, + "step": 9218 + }, + { + "epoch": 0.7864698856850367, + "grad_norm": 1.3751415699991303, + "learning_rate": 1.1488431521908278e-05, + "loss": 0.2242, + "step": 9219 + }, + { + "epoch": 0.7865551953591537, + "grad_norm": 2.124097969238322, + "learning_rate": 1.1479622101751242e-05, + "loss": 0.2217, + "step": 9220 + }, + { + "epoch": 0.7866405050332708, + "grad_norm": 1.615070154774442, + "learning_rate": 1.1470815622420362e-05, + "loss": 0.1936, + "step": 9221 + }, + { + "epoch": 0.7867258147073878, + "grad_norm": 1.8422918210893557, + "learning_rate": 1.1462012084587964e-05, + "loss": 0.1885, + "step": 9222 + }, + { + "epoch": 0.7868111243815049, + "grad_norm": 1.6507713826683919, + "learning_rate": 1.1453211488926153e-05, + "loss": 0.1291, + "step": 9223 + }, + { + "epoch": 0.7868964340556219, + "grad_norm": 1.5620492666536632, + "learning_rate": 1.1444413836106804e-05, + "loss": 0.1353, + "step": 9224 + }, + { + "epoch": 0.786981743729739, + "grad_norm": 1.1494172776014402, + "learning_rate": 1.1435619126801584e-05, + "loss": 0.1081, + "step": 9225 + }, + { + "epoch": 0.787067053403856, + "grad_norm": 1.7135730806232685, + "learning_rate": 1.142682736168189e-05, + "loss": 0.2041, + "step": 9226 + }, + { + "epoch": 0.7871523630779731, + "grad_norm": 1.8221444665141522, + "learning_rate": 1.141803854141898e-05, + "loss": 0.2489, + "step": 9227 + }, + { + "epoch": 0.7872376727520901, + "grad_norm": 1.9517491491899068, + "learning_rate": 1.1409252666683778e-05, + "loss": 0.2307, + "step": 9228 + }, + { + "epoch": 0.7873229824262071, + "grad_norm": 1.6961572186276643, + "learning_rate": 1.1400469738147074e-05, + "loss": 0.1391, + "step": 9229 + }, + { + "epoch": 0.7874082921003241, + "grad_norm": 2.510545233973324, + "learning_rate": 1.139168975647939e-05, + "loss": 0.1706, + "step": 9230 + }, + { + "epoch": 0.7874936017744413, + "grad_norm": 1.8471342159840825, + "learning_rate": 1.1382912722351024e-05, + "loss": 0.2216, + "step": 9231 + }, + { + "epoch": 0.7875789114485583, + "grad_norm": 1.634177554514302, + "learning_rate": 1.1374138636432053e-05, + "loss": 0.2731, + "step": 9232 + }, + { + "epoch": 0.7876642211226753, + "grad_norm": 1.3745346582371982, + "learning_rate": 1.136536749939235e-05, + "loss": 0.1255, + "step": 9233 + }, + { + "epoch": 0.7877495307967923, + "grad_norm": 1.6025183825695695, + "learning_rate": 1.1356599311901534e-05, + "loss": 0.1461, + "step": 9234 + }, + { + "epoch": 0.7878348404709093, + "grad_norm": 1.6626668459125618, + "learning_rate": 1.134783407462901e-05, + "loss": 0.234, + "step": 9235 + }, + { + "epoch": 0.7879201501450265, + "grad_norm": 2.1300140018629747, + "learning_rate": 1.133907178824396e-05, + "loss": 0.1967, + "step": 9236 + }, + { + "epoch": 0.7880054598191435, + "grad_norm": 1.7838711108959937, + "learning_rate": 1.1330312453415332e-05, + "loss": 0.1651, + "step": 9237 + }, + { + "epoch": 0.7880907694932605, + "grad_norm": 1.56677348334142, + "learning_rate": 1.1321556070811861e-05, + "loss": 0.2143, + "step": 9238 + }, + { + "epoch": 0.7881760791673775, + "grad_norm": 2.081526646828601, + "learning_rate": 1.1312802641102033e-05, + "loss": 0.2223, + "step": 9239 + }, + { + "epoch": 0.7882613888414947, + "grad_norm": 1.3159659806872097, + "learning_rate": 1.1304052164954165e-05, + "loss": 0.1598, + "step": 9240 + }, + { + "epoch": 0.7883466985156117, + "grad_norm": 1.5609802484625126, + "learning_rate": 1.1295304643036252e-05, + "loss": 0.1437, + "step": 9241 + }, + { + "epoch": 0.7884320081897287, + "grad_norm": 1.8440788105157844, + "learning_rate": 1.1286560076016172e-05, + "loss": 0.182, + "step": 9242 + }, + { + "epoch": 0.7885173178638457, + "grad_norm": 1.7431657721674039, + "learning_rate": 1.1277818464561507e-05, + "loss": 0.1908, + "step": 9243 + }, + { + "epoch": 0.7886026275379628, + "grad_norm": 1.8333195628024723, + "learning_rate": 1.1269079809339633e-05, + "loss": 0.1541, + "step": 9244 + }, + { + "epoch": 0.7886879372120799, + "grad_norm": 1.7832998560796571, + "learning_rate": 1.1260344111017701e-05, + "loss": 0.1447, + "step": 9245 + }, + { + "epoch": 0.7887732468861969, + "grad_norm": 1.3568073445958055, + "learning_rate": 1.1251611370262632e-05, + "loss": 0.1527, + "step": 9246 + }, + { + "epoch": 0.7888585565603139, + "grad_norm": 1.5669760107797996, + "learning_rate": 1.1242881587741127e-05, + "loss": 0.126, + "step": 9247 + }, + { + "epoch": 0.788943866234431, + "grad_norm": 1.6718342143280098, + "learning_rate": 1.1234154764119642e-05, + "loss": 0.1841, + "step": 9248 + }, + { + "epoch": 0.789029175908548, + "grad_norm": 2.041819261829934, + "learning_rate": 1.1225430900064455e-05, + "loss": 0.1735, + "step": 9249 + }, + { + "epoch": 0.7891144855826651, + "grad_norm": 2.0491644016960238, + "learning_rate": 1.121670999624157e-05, + "loss": 0.1742, + "step": 9250 + }, + { + "epoch": 0.7891997952567821, + "grad_norm": 1.577655155755391, + "learning_rate": 1.1207992053316778e-05, + "loss": 0.2014, + "step": 9251 + }, + { + "epoch": 0.7892851049308992, + "grad_norm": 1.8619575196899647, + "learning_rate": 1.1199277071955649e-05, + "loss": 0.1678, + "step": 9252 + }, + { + "epoch": 0.7893704146050162, + "grad_norm": 1.8968934110096787, + "learning_rate": 1.1190565052823548e-05, + "loss": 0.2548, + "step": 9253 + }, + { + "epoch": 0.7894557242791332, + "grad_norm": 1.2845796259652207, + "learning_rate": 1.118185599658555e-05, + "loss": 0.1506, + "step": 9254 + }, + { + "epoch": 0.7895410339532503, + "grad_norm": 1.784842699423448, + "learning_rate": 1.1173149903906577e-05, + "loss": 0.1496, + "step": 9255 + }, + { + "epoch": 0.7896263436273674, + "grad_norm": 2.7086657589081464, + "learning_rate": 1.1164446775451282e-05, + "loss": 0.2248, + "step": 9256 + }, + { + "epoch": 0.7897116533014844, + "grad_norm": 1.5700543699144003, + "learning_rate": 1.1155746611884105e-05, + "loss": 0.1843, + "step": 9257 + }, + { + "epoch": 0.7897969629756014, + "grad_norm": 2.2340822895095087, + "learning_rate": 1.1147049413869259e-05, + "loss": 0.1929, + "step": 9258 + }, + { + "epoch": 0.7898822726497184, + "grad_norm": 2.0322174116054184, + "learning_rate": 1.1138355182070725e-05, + "loss": 0.2238, + "step": 9259 + }, + { + "epoch": 0.7899675823238356, + "grad_norm": 1.8393741340388892, + "learning_rate": 1.112966391715226e-05, + "loss": 0.1658, + "step": 9260 + }, + { + "epoch": 0.7900528919979526, + "grad_norm": 2.0841097592904103, + "learning_rate": 1.1120975619777384e-05, + "loss": 0.1385, + "step": 9261 + }, + { + "epoch": 0.7901382016720696, + "grad_norm": 1.423557334235682, + "learning_rate": 1.1112290290609445e-05, + "loss": 0.1674, + "step": 9262 + }, + { + "epoch": 0.7902235113461866, + "grad_norm": 1.4107245148258578, + "learning_rate": 1.1103607930311466e-05, + "loss": 0.1904, + "step": 9263 + }, + { + "epoch": 0.7903088210203038, + "grad_norm": 1.6484503722931212, + "learning_rate": 1.1094928539546346e-05, + "loss": 0.1527, + "step": 9264 + }, + { + "epoch": 0.7903941306944208, + "grad_norm": 1.7215011772442022, + "learning_rate": 1.1086252118976682e-05, + "loss": 0.2092, + "step": 9265 + }, + { + "epoch": 0.7904794403685378, + "grad_norm": 2.16488422533549, + "learning_rate": 1.1077578669264888e-05, + "loss": 0.2073, + "step": 9266 + }, + { + "epoch": 0.7905647500426548, + "grad_norm": 1.6514941617312184, + "learning_rate": 1.1068908191073123e-05, + "loss": 0.1672, + "step": 9267 + }, + { + "epoch": 0.7906500597167719, + "grad_norm": 2.3119217433158927, + "learning_rate": 1.1060240685063328e-05, + "loss": 0.2206, + "step": 9268 + }, + { + "epoch": 0.790735369390889, + "grad_norm": 1.704215164551531, + "learning_rate": 1.1051576151897258e-05, + "loss": 0.2451, + "step": 9269 + }, + { + "epoch": 0.790820679065006, + "grad_norm": 1.7919002498980563, + "learning_rate": 1.1042914592236347e-05, + "loss": 0.1988, + "step": 9270 + }, + { + "epoch": 0.790905988739123, + "grad_norm": 1.9708264012715515, + "learning_rate": 1.1034256006741906e-05, + "loss": 0.1453, + "step": 9271 + }, + { + "epoch": 0.79099129841324, + "grad_norm": 2.1156629866059213, + "learning_rate": 1.1025600396074954e-05, + "loss": 0.1243, + "step": 9272 + }, + { + "epoch": 0.7910766080873571, + "grad_norm": 2.0937930430073077, + "learning_rate": 1.1016947760896301e-05, + "loss": 0.1708, + "step": 9273 + }, + { + "epoch": 0.7911619177614742, + "grad_norm": 1.5983183964630352, + "learning_rate": 1.1008298101866515e-05, + "loss": 0.1208, + "step": 9274 + }, + { + "epoch": 0.7912472274355912, + "grad_norm": 1.7453088553722718, + "learning_rate": 1.0999651419646e-05, + "loss": 0.2056, + "step": 9275 + }, + { + "epoch": 0.7913325371097082, + "grad_norm": 2.0011621913195334, + "learning_rate": 1.0991007714894824e-05, + "loss": 0.148, + "step": 9276 + }, + { + "epoch": 0.7914178467838253, + "grad_norm": 1.7573061065291606, + "learning_rate": 1.0982366988272924e-05, + "loss": 0.2444, + "step": 9277 + }, + { + "epoch": 0.7915031564579423, + "grad_norm": 1.7382786440026112, + "learning_rate": 1.0973729240439967e-05, + "loss": 0.1623, + "step": 9278 + }, + { + "epoch": 0.7915884661320594, + "grad_norm": 1.641527926525538, + "learning_rate": 1.0965094472055398e-05, + "loss": 0.1906, + "step": 9279 + }, + { + "epoch": 0.7916737758061764, + "grad_norm": 1.5293284445825566, + "learning_rate": 1.0956462683778435e-05, + "loss": 0.1854, + "step": 9280 + }, + { + "epoch": 0.7917590854802935, + "grad_norm": 1.7999707961362483, + "learning_rate": 1.0947833876268055e-05, + "loss": 0.1701, + "step": 9281 + }, + { + "epoch": 0.7918443951544105, + "grad_norm": 1.886098692673164, + "learning_rate": 1.0939208050183064e-05, + "loss": 0.1703, + "step": 9282 + }, + { + "epoch": 0.7919297048285275, + "grad_norm": 1.5705368109527058, + "learning_rate": 1.0930585206181942e-05, + "loss": 0.1902, + "step": 9283 + }, + { + "epoch": 0.7920150145026446, + "grad_norm": 1.6523900668098455, + "learning_rate": 1.0921965344923035e-05, + "loss": 0.1602, + "step": 9284 + }, + { + "epoch": 0.7921003241767617, + "grad_norm": 1.4207288712986645, + "learning_rate": 1.0913348467064417e-05, + "loss": 0.129, + "step": 9285 + }, + { + "epoch": 0.7921856338508787, + "grad_norm": 1.7793566361911048, + "learning_rate": 1.0904734573263935e-05, + "loss": 0.1989, + "step": 9286 + }, + { + "epoch": 0.7922709435249957, + "grad_norm": 1.6187275358430866, + "learning_rate": 1.089612366417922e-05, + "loss": 0.2132, + "step": 9287 + }, + { + "epoch": 0.7923562531991127, + "grad_norm": 1.7151375694817967, + "learning_rate": 1.0887515740467662e-05, + "loss": 0.216, + "step": 9288 + }, + { + "epoch": 0.7924415628732299, + "grad_norm": 1.6836604483507285, + "learning_rate": 1.0878910802786436e-05, + "loss": 0.1351, + "step": 9289 + }, + { + "epoch": 0.7925268725473469, + "grad_norm": 2.4216739694161866, + "learning_rate": 1.0870308851792466e-05, + "loss": 0.1516, + "step": 9290 + }, + { + "epoch": 0.7926121822214639, + "grad_norm": 1.887581258881486, + "learning_rate": 1.0861709888142507e-05, + "loss": 0.201, + "step": 9291 + }, + { + "epoch": 0.7926974918955809, + "grad_norm": 1.7442254433507314, + "learning_rate": 1.085311391249299e-05, + "loss": 0.1119, + "step": 9292 + }, + { + "epoch": 0.792782801569698, + "grad_norm": 1.6139292304679478, + "learning_rate": 1.0844520925500218e-05, + "loss": 0.1975, + "step": 9293 + }, + { + "epoch": 0.7928681112438151, + "grad_norm": 2.318429716050629, + "learning_rate": 1.0835930927820181e-05, + "loss": 0.2184, + "step": 9294 + }, + { + "epoch": 0.7929534209179321, + "grad_norm": 2.034341191305333, + "learning_rate": 1.0827343920108729e-05, + "loss": 0.2019, + "step": 9295 + }, + { + "epoch": 0.7930387305920491, + "grad_norm": 1.9472735834293864, + "learning_rate": 1.0818759903021381e-05, + "loss": 0.208, + "step": 9296 + }, + { + "epoch": 0.7931240402661662, + "grad_norm": 1.5290394611443396, + "learning_rate": 1.0810178877213517e-05, + "loss": 0.157, + "step": 9297 + }, + { + "epoch": 0.7932093499402832, + "grad_norm": 2.0605908429846966, + "learning_rate": 1.0801600843340243e-05, + "loss": 0.1896, + "step": 9298 + }, + { + "epoch": 0.7932946596144003, + "grad_norm": 1.8259023212450072, + "learning_rate": 1.0793025802056445e-05, + "loss": 0.2022, + "step": 9299 + }, + { + "epoch": 0.7933799692885173, + "grad_norm": 1.7978352510208582, + "learning_rate": 1.0784453754016776e-05, + "loss": 0.1856, + "step": 9300 + }, + { + "epoch": 0.7934652789626344, + "grad_norm": 1.568549005771018, + "learning_rate": 1.0775884699875676e-05, + "loss": 0.1988, + "step": 9301 + }, + { + "epoch": 0.7935505886367514, + "grad_norm": 1.5136884922222955, + "learning_rate": 1.0767318640287343e-05, + "loss": 0.1026, + "step": 9302 + }, + { + "epoch": 0.7936358983108684, + "grad_norm": 1.6655020777592888, + "learning_rate": 1.0758755575905732e-05, + "loss": 0.1502, + "step": 9303 + }, + { + "epoch": 0.7937212079849855, + "grad_norm": 1.4982677630092864, + "learning_rate": 1.0750195507384637e-05, + "loss": 0.1204, + "step": 9304 + }, + { + "epoch": 0.7938065176591026, + "grad_norm": 2.069666462992501, + "learning_rate": 1.074163843537751e-05, + "loss": 0.2383, + "step": 9305 + }, + { + "epoch": 0.7938918273332196, + "grad_norm": 1.91199397205456, + "learning_rate": 1.0733084360537687e-05, + "loss": 0.1273, + "step": 9306 + }, + { + "epoch": 0.7939771370073366, + "grad_norm": 1.9702976743547223, + "learning_rate": 1.0724533283518206e-05, + "loss": 0.1889, + "step": 9307 + }, + { + "epoch": 0.7940624466814536, + "grad_norm": 1.9253262181498403, + "learning_rate": 1.0715985204971901e-05, + "loss": 0.208, + "step": 9308 + }, + { + "epoch": 0.7941477563555707, + "grad_norm": 1.7962464933420885, + "learning_rate": 1.0707440125551372e-05, + "loss": 0.1308, + "step": 9309 + }, + { + "epoch": 0.7942330660296878, + "grad_norm": 1.6313741819388285, + "learning_rate": 1.0698898045908972e-05, + "loss": 0.1905, + "step": 9310 + }, + { + "epoch": 0.7943183757038048, + "grad_norm": 1.4226751067552645, + "learning_rate": 1.069035896669689e-05, + "loss": 0.1663, + "step": 9311 + }, + { + "epoch": 0.7944036853779218, + "grad_norm": 1.5916857347128481, + "learning_rate": 1.0681822888566984e-05, + "loss": 0.1904, + "step": 9312 + }, + { + "epoch": 0.7944889950520388, + "grad_norm": 1.4581430401853137, + "learning_rate": 1.0673289812170972e-05, + "loss": 0.1686, + "step": 9313 + }, + { + "epoch": 0.794574304726156, + "grad_norm": 2.196314789929627, + "learning_rate": 1.0664759738160307e-05, + "loss": 0.2055, + "step": 9314 + }, + { + "epoch": 0.794659614400273, + "grad_norm": 1.4073841184135536, + "learning_rate": 1.0656232667186206e-05, + "loss": 0.1985, + "step": 9315 + }, + { + "epoch": 0.79474492407439, + "grad_norm": 1.8874089654859718, + "learning_rate": 1.0647708599899653e-05, + "loss": 0.1521, + "step": 9316 + }, + { + "epoch": 0.794830233748507, + "grad_norm": 2.608617432776124, + "learning_rate": 1.0639187536951462e-05, + "loss": 0.175, + "step": 9317 + }, + { + "epoch": 0.7949155434226242, + "grad_norm": 2.0877562177198, + "learning_rate": 1.0630669478992105e-05, + "loss": 0.2127, + "step": 9318 + }, + { + "epoch": 0.7950008530967412, + "grad_norm": 2.019685189672108, + "learning_rate": 1.0622154426671948e-05, + "loss": 0.1464, + "step": 9319 + }, + { + "epoch": 0.7950861627708582, + "grad_norm": 1.863175683059986, + "learning_rate": 1.0613642380641042e-05, + "loss": 0.1406, + "step": 9320 + }, + { + "epoch": 0.7951714724449752, + "grad_norm": 2.0171330451263554, + "learning_rate": 1.0605133341549239e-05, + "loss": 0.2295, + "step": 9321 + }, + { + "epoch": 0.7952567821190923, + "grad_norm": 1.641148874111944, + "learning_rate": 1.0596627310046165e-05, + "loss": 0.1857, + "step": 9322 + }, + { + "epoch": 0.7953420917932094, + "grad_norm": 1.7097791120848278, + "learning_rate": 1.0588124286781204e-05, + "loss": 0.1939, + "step": 9323 + }, + { + "epoch": 0.7954274014673264, + "grad_norm": 1.8970925547292379, + "learning_rate": 1.057962427240352e-05, + "loss": 0.1501, + "step": 9324 + }, + { + "epoch": 0.7955127111414434, + "grad_norm": 1.6293533406868423, + "learning_rate": 1.0571127267562031e-05, + "loss": 0.1188, + "step": 9325 + }, + { + "epoch": 0.7955980208155605, + "grad_norm": 2.2539117505619624, + "learning_rate": 1.0562633272905464e-05, + "loss": 0.2544, + "step": 9326 + }, + { + "epoch": 0.7956833304896775, + "grad_norm": 1.5693644988066813, + "learning_rate": 1.0554142289082275e-05, + "loss": 0.1893, + "step": 9327 + }, + { + "epoch": 0.7957686401637946, + "grad_norm": 1.6891344420350451, + "learning_rate": 1.0545654316740705e-05, + "loss": 0.1934, + "step": 9328 + }, + { + "epoch": 0.7958539498379116, + "grad_norm": 1.4358852273187104, + "learning_rate": 1.0537169356528775e-05, + "loss": 0.2321, + "step": 9329 + }, + { + "epoch": 0.7959392595120287, + "grad_norm": 2.2824325631782734, + "learning_rate": 1.0528687409094251e-05, + "loss": 0.2267, + "step": 9330 + }, + { + "epoch": 0.7960245691861457, + "grad_norm": 2.0071684401781815, + "learning_rate": 1.0520208475084698e-05, + "loss": 0.1956, + "step": 9331 + }, + { + "epoch": 0.7961098788602627, + "grad_norm": 1.3542524261706392, + "learning_rate": 1.0511732555147419e-05, + "loss": 0.1408, + "step": 9332 + }, + { + "epoch": 0.7961951885343798, + "grad_norm": 1.6611956764449676, + "learning_rate": 1.0503259649929542e-05, + "loss": 0.1694, + "step": 9333 + }, + { + "epoch": 0.7962804982084969, + "grad_norm": 1.950580740307047, + "learning_rate": 1.0494789760077883e-05, + "loss": 0.1976, + "step": 9334 + }, + { + "epoch": 0.7963658078826139, + "grad_norm": 1.3477814989352206, + "learning_rate": 1.0486322886239109e-05, + "loss": 0.2098, + "step": 9335 + }, + { + "epoch": 0.7964511175567309, + "grad_norm": 2.30533934604588, + "learning_rate": 1.0477859029059606e-05, + "loss": 0.2072, + "step": 9336 + }, + { + "epoch": 0.7965364272308479, + "grad_norm": 2.7122483835552194, + "learning_rate": 1.0469398189185542e-05, + "loss": 0.1714, + "step": 9337 + }, + { + "epoch": 0.7966217369049651, + "grad_norm": 1.6184475914522118, + "learning_rate": 1.046094036726285e-05, + "loss": 0.2177, + "step": 9338 + }, + { + "epoch": 0.7967070465790821, + "grad_norm": 1.9515491689771864, + "learning_rate": 1.0452485563937265e-05, + "loss": 0.2091, + "step": 9339 + }, + { + "epoch": 0.7967923562531991, + "grad_norm": 1.4586984540630268, + "learning_rate": 1.0444033779854251e-05, + "loss": 0.1202, + "step": 9340 + }, + { + "epoch": 0.7968776659273161, + "grad_norm": 1.8245904123198424, + "learning_rate": 1.043558501565906e-05, + "loss": 0.1853, + "step": 9341 + }, + { + "epoch": 0.7969629756014333, + "grad_norm": 2.1429149324708145, + "learning_rate": 1.0427139271996705e-05, + "loss": 0.1677, + "step": 9342 + }, + { + "epoch": 0.7970482852755503, + "grad_norm": 1.5499149852941627, + "learning_rate": 1.041869654951198e-05, + "loss": 0.1918, + "step": 9343 + }, + { + "epoch": 0.7971335949496673, + "grad_norm": 1.5545170369458507, + "learning_rate": 1.0410256848849437e-05, + "loss": 0.144, + "step": 9344 + }, + { + "epoch": 0.7972189046237843, + "grad_norm": 1.6664466076849411, + "learning_rate": 1.0401820170653387e-05, + "loss": 0.1828, + "step": 9345 + }, + { + "epoch": 0.7973042142979014, + "grad_norm": 2.0600736969662936, + "learning_rate": 1.0393386515567972e-05, + "loss": 0.1885, + "step": 9346 + }, + { + "epoch": 0.7973895239720185, + "grad_norm": 2.427843185036449, + "learning_rate": 1.0384955884237003e-05, + "loss": 0.1596, + "step": 9347 + }, + { + "epoch": 0.7974748336461355, + "grad_norm": 1.7075550663615455, + "learning_rate": 1.0376528277304148e-05, + "loss": 0.1881, + "step": 9348 + }, + { + "epoch": 0.7975601433202525, + "grad_norm": 1.2315077497367781, + "learning_rate": 1.0368103695412801e-05, + "loss": 0.1891, + "step": 9349 + }, + { + "epoch": 0.7976454529943695, + "grad_norm": 1.4260909070843115, + "learning_rate": 1.0359682139206134e-05, + "loss": 0.2058, + "step": 9350 + }, + { + "epoch": 0.7977307626684866, + "grad_norm": 1.665447822580326, + "learning_rate": 1.0351263609327083e-05, + "loss": 0.1847, + "step": 9351 + }, + { + "epoch": 0.7978160723426037, + "grad_norm": 1.5490180689211597, + "learning_rate": 1.0342848106418368e-05, + "loss": 0.1449, + "step": 9352 + }, + { + "epoch": 0.7979013820167207, + "grad_norm": 2.254112511887452, + "learning_rate": 1.0334435631122458e-05, + "loss": 0.121, + "step": 9353 + }, + { + "epoch": 0.7979866916908377, + "grad_norm": 1.5695117787833337, + "learning_rate": 1.0326026184081595e-05, + "loss": 0.188, + "step": 9354 + }, + { + "epoch": 0.7980720013649548, + "grad_norm": 1.9730273209356035, + "learning_rate": 1.031761976593782e-05, + "loss": 0.1868, + "step": 9355 + }, + { + "epoch": 0.7981573110390718, + "grad_norm": 1.5524713257431322, + "learning_rate": 1.0309216377332898e-05, + "loss": 0.1493, + "step": 9356 + }, + { + "epoch": 0.7982426207131889, + "grad_norm": 1.8607830198085593, + "learning_rate": 1.0300816018908393e-05, + "loss": 0.1823, + "step": 9357 + }, + { + "epoch": 0.7983279303873059, + "grad_norm": 2.11007588363077, + "learning_rate": 1.029241869130561e-05, + "loss": 0.1752, + "step": 9358 + }, + { + "epoch": 0.798413240061423, + "grad_norm": 1.625794289943125, + "learning_rate": 1.0284024395165682e-05, + "loss": 0.1665, + "step": 9359 + }, + { + "epoch": 0.79849854973554, + "grad_norm": 1.9645140354182316, + "learning_rate": 1.0275633131129413e-05, + "loss": 0.2091, + "step": 9360 + }, + { + "epoch": 0.798583859409657, + "grad_norm": 1.476936333375975, + "learning_rate": 1.0267244899837475e-05, + "loss": 0.1818, + "step": 9361 + }, + { + "epoch": 0.798669169083774, + "grad_norm": 1.757693545237197, + "learning_rate": 1.0258859701930246e-05, + "loss": 0.2215, + "step": 9362 + }, + { + "epoch": 0.7987544787578912, + "grad_norm": 1.9424749921173299, + "learning_rate": 1.0250477538047893e-05, + "loss": 0.1996, + "step": 9363 + }, + { + "epoch": 0.7988397884320082, + "grad_norm": 2.1710447192618276, + "learning_rate": 1.0242098408830353e-05, + "loss": 0.183, + "step": 9364 + }, + { + "epoch": 0.7989250981061252, + "grad_norm": 1.5186714936989913, + "learning_rate": 1.0233722314917326e-05, + "loss": 0.1482, + "step": 9365 + }, + { + "epoch": 0.7990104077802422, + "grad_norm": 1.76611145880038, + "learning_rate": 1.0225349256948286e-05, + "loss": 0.2002, + "step": 9366 + }, + { + "epoch": 0.7990957174543594, + "grad_norm": 1.4898713195842377, + "learning_rate": 1.0216979235562451e-05, + "loss": 0.1375, + "step": 9367 + }, + { + "epoch": 0.7991810271284764, + "grad_norm": 1.7581876905365987, + "learning_rate": 1.0208612251398874e-05, + "loss": 0.1672, + "step": 9368 + }, + { + "epoch": 0.7992663368025934, + "grad_norm": 1.9621168139113725, + "learning_rate": 1.020024830509627e-05, + "loss": 0.184, + "step": 9369 + }, + { + "epoch": 0.7993516464767104, + "grad_norm": 1.349388490921294, + "learning_rate": 1.0191887397293232e-05, + "loss": 0.1155, + "step": 9370 + }, + { + "epoch": 0.7994369561508275, + "grad_norm": 1.6986534844630086, + "learning_rate": 1.0183529528628044e-05, + "loss": 0.2208, + "step": 9371 + }, + { + "epoch": 0.7995222658249446, + "grad_norm": 2.032481597779547, + "learning_rate": 1.0175174699738793e-05, + "loss": 0.2015, + "step": 9372 + }, + { + "epoch": 0.7996075754990616, + "grad_norm": 1.773299963907237, + "learning_rate": 1.016682291126333e-05, + "loss": 0.2331, + "step": 9373 + }, + { + "epoch": 0.7996928851731786, + "grad_norm": 1.9652888436636815, + "learning_rate": 1.0158474163839249e-05, + "loss": 0.2063, + "step": 9374 + }, + { + "epoch": 0.7997781948472957, + "grad_norm": 1.5381713455949646, + "learning_rate": 1.015012845810397e-05, + "loss": 0.1615, + "step": 9375 + }, + { + "epoch": 0.7998635045214127, + "grad_norm": 1.608016468452463, + "learning_rate": 1.0141785794694597e-05, + "loss": 0.1505, + "step": 9376 + }, + { + "epoch": 0.7999488141955298, + "grad_norm": 1.7670254175142353, + "learning_rate": 1.013344617424809e-05, + "loss": 0.2118, + "step": 9377 + }, + { + "epoch": 0.8000341238696468, + "grad_norm": 1.8736757052238115, + "learning_rate": 1.0125109597401111e-05, + "loss": 0.1538, + "step": 9378 + }, + { + "epoch": 0.8001194335437639, + "grad_norm": 1.7436340895108262, + "learning_rate": 1.0116776064790123e-05, + "loss": 0.1407, + "step": 9379 + }, + { + "epoch": 0.8002047432178809, + "grad_norm": 1.4341926674902887, + "learning_rate": 1.0108445577051329e-05, + "loss": 0.1695, + "step": 9380 + }, + { + "epoch": 0.800290052891998, + "grad_norm": 1.3083532767183044, + "learning_rate": 1.0100118134820758e-05, + "loss": 0.2191, + "step": 9381 + }, + { + "epoch": 0.800375362566115, + "grad_norm": 1.8600863210069822, + "learning_rate": 1.0091793738734113e-05, + "loss": 0.1454, + "step": 9382 + }, + { + "epoch": 0.8004606722402321, + "grad_norm": 1.7868516079471295, + "learning_rate": 1.0083472389426956e-05, + "loss": 0.1553, + "step": 9383 + }, + { + "epoch": 0.8005459819143491, + "grad_norm": 1.372017747408353, + "learning_rate": 1.0075154087534566e-05, + "loss": 0.1628, + "step": 9384 + }, + { + "epoch": 0.8006312915884661, + "grad_norm": 1.197018621209035, + "learning_rate": 1.0066838833692004e-05, + "loss": 0.1499, + "step": 9385 + }, + { + "epoch": 0.8007166012625831, + "grad_norm": 1.6948179587224133, + "learning_rate": 1.0058526628534093e-05, + "loss": 0.1854, + "step": 9386 + }, + { + "epoch": 0.8008019109367002, + "grad_norm": 1.4673014863639942, + "learning_rate": 1.0050217472695405e-05, + "loss": 0.2084, + "step": 9387 + }, + { + "epoch": 0.8008872206108173, + "grad_norm": 1.2070330277216885, + "learning_rate": 1.0041911366810353e-05, + "loss": 0.1088, + "step": 9388 + }, + { + "epoch": 0.8009725302849343, + "grad_norm": 1.8645493772458772, + "learning_rate": 1.0033608311513004e-05, + "loss": 0.1826, + "step": 9389 + }, + { + "epoch": 0.8010578399590513, + "grad_norm": 1.5512809044574427, + "learning_rate": 1.0025308307437292e-05, + "loss": 0.2182, + "step": 9390 + }, + { + "epoch": 0.8011431496331683, + "grad_norm": 1.5415891249805114, + "learning_rate": 1.0017011355216866e-05, + "loss": 0.1721, + "step": 9391 + }, + { + "epoch": 0.8012284593072855, + "grad_norm": 1.3277621951756124, + "learning_rate": 1.000871745548515e-05, + "loss": 0.1702, + "step": 9392 + }, + { + "epoch": 0.8013137689814025, + "grad_norm": 2.4048660063344176, + "learning_rate": 1.0000426608875346e-05, + "loss": 0.2132, + "step": 9393 + }, + { + "epoch": 0.8013990786555195, + "grad_norm": 1.420696902320456, + "learning_rate": 9.992138816020412e-06, + "loss": 0.1739, + "step": 9394 + }, + { + "epoch": 0.8014843883296365, + "grad_norm": 1.6995126053605543, + "learning_rate": 9.983854077553078e-06, + "loss": 0.1554, + "step": 9395 + }, + { + "epoch": 0.8015696980037537, + "grad_norm": 2.095202838665065, + "learning_rate": 9.975572394105826e-06, + "loss": 0.1807, + "step": 9396 + }, + { + "epoch": 0.8016550076778707, + "grad_norm": 1.658000956186495, + "learning_rate": 9.96729376631096e-06, + "loss": 0.1666, + "step": 9397 + }, + { + "epoch": 0.8017403173519877, + "grad_norm": 1.9148668739917885, + "learning_rate": 9.95901819480045e-06, + "loss": 0.1601, + "step": 9398 + }, + { + "epoch": 0.8018256270261047, + "grad_norm": 1.6374345547942155, + "learning_rate": 9.950745680206142e-06, + "loss": 0.1742, + "step": 9399 + }, + { + "epoch": 0.8019109367002218, + "grad_norm": 1.7878165758336853, + "learning_rate": 9.942476223159569e-06, + "loss": 0.1244, + "step": 9400 + }, + { + "epoch": 0.8019962463743389, + "grad_norm": 2.083789648309272, + "learning_rate": 9.93420982429209e-06, + "loss": 0.231, + "step": 9401 + }, + { + "epoch": 0.8020815560484559, + "grad_norm": 1.7218786720749042, + "learning_rate": 9.92594648423476e-06, + "loss": 0.2257, + "step": 9402 + }, + { + "epoch": 0.8021668657225729, + "grad_norm": 1.7124778050007654, + "learning_rate": 9.917686203618475e-06, + "loss": 0.1896, + "step": 9403 + }, + { + "epoch": 0.80225217539669, + "grad_norm": 2.3512866089651383, + "learning_rate": 9.909428983073849e-06, + "loss": 0.2241, + "step": 9404 + }, + { + "epoch": 0.802337485070807, + "grad_norm": 1.6868585851039548, + "learning_rate": 9.901174823231279e-06, + "loss": 0.1847, + "step": 9405 + }, + { + "epoch": 0.8024227947449241, + "grad_norm": 1.9192226063963587, + "learning_rate": 9.892923724720932e-06, + "loss": 0.1168, + "step": 9406 + }, + { + "epoch": 0.8025081044190411, + "grad_norm": 1.6295235102392243, + "learning_rate": 9.884675688172723e-06, + "loss": 0.2176, + "step": 9407 + }, + { + "epoch": 0.8025934140931582, + "grad_norm": 1.6528876381201743, + "learning_rate": 9.876430714216356e-06, + "loss": 0.1752, + "step": 9408 + }, + { + "epoch": 0.8026787237672752, + "grad_norm": 1.8974381316380686, + "learning_rate": 9.868188803481276e-06, + "loss": 0.2139, + "step": 9409 + }, + { + "epoch": 0.8027640334413922, + "grad_norm": 1.906871506729352, + "learning_rate": 9.859949956596743e-06, + "loss": 0.172, + "step": 9410 + }, + { + "epoch": 0.8028493431155093, + "grad_norm": 1.8851200383261872, + "learning_rate": 9.851714174191701e-06, + "loss": 0.2626, + "step": 9411 + }, + { + "epoch": 0.8029346527896264, + "grad_norm": 1.697846658897713, + "learning_rate": 9.843481456894948e-06, + "loss": 0.1811, + "step": 9412 + }, + { + "epoch": 0.8030199624637434, + "grad_norm": 2.330658628508079, + "learning_rate": 9.835251805334994e-06, + "loss": 0.1898, + "step": 9413 + }, + { + "epoch": 0.8031052721378604, + "grad_norm": 1.8980357054958577, + "learning_rate": 9.827025220140129e-06, + "loss": 0.23, + "step": 9414 + }, + { + "epoch": 0.8031905818119774, + "grad_norm": 2.1988960964467488, + "learning_rate": 9.818801701938413e-06, + "loss": 0.1944, + "step": 9415 + }, + { + "epoch": 0.8032758914860946, + "grad_norm": 2.440155343598254, + "learning_rate": 9.810581251357647e-06, + "loss": 0.1185, + "step": 9416 + }, + { + "epoch": 0.8033612011602116, + "grad_norm": 1.684882005244667, + "learning_rate": 9.802363869025467e-06, + "loss": 0.2364, + "step": 9417 + }, + { + "epoch": 0.8034465108343286, + "grad_norm": 1.497514572964228, + "learning_rate": 9.794149555569165e-06, + "loss": 0.1543, + "step": 9418 + }, + { + "epoch": 0.8035318205084456, + "grad_norm": 1.5693594854234403, + "learning_rate": 9.785938311615906e-06, + "loss": 0.1501, + "step": 9419 + }, + { + "epoch": 0.8036171301825628, + "grad_norm": 1.7200210033287646, + "learning_rate": 9.777730137792557e-06, + "loss": 0.2067, + "step": 9420 + }, + { + "epoch": 0.8037024398566798, + "grad_norm": 1.7341947719649589, + "learning_rate": 9.769525034725774e-06, + "loss": 0.1533, + "step": 9421 + }, + { + "epoch": 0.8037877495307968, + "grad_norm": 1.5830051768032114, + "learning_rate": 9.761323003041955e-06, + "loss": 0.14, + "step": 9422 + }, + { + "epoch": 0.8038730592049138, + "grad_norm": 1.675033865059653, + "learning_rate": 9.753124043367328e-06, + "loss": 0.1513, + "step": 9423 + }, + { + "epoch": 0.8039583688790308, + "grad_norm": 1.6239368188528691, + "learning_rate": 9.744928156327776e-06, + "loss": 0.1688, + "step": 9424 + }, + { + "epoch": 0.804043678553148, + "grad_norm": 2.177824365783587, + "learning_rate": 9.736735342549059e-06, + "loss": 0.1931, + "step": 9425 + }, + { + "epoch": 0.804128988227265, + "grad_norm": 1.5659463640189375, + "learning_rate": 9.728545602656642e-06, + "loss": 0.1644, + "step": 9426 + }, + { + "epoch": 0.804214297901382, + "grad_norm": 1.7113805745857162, + "learning_rate": 9.720358937275764e-06, + "loss": 0.1944, + "step": 9427 + }, + { + "epoch": 0.804299607575499, + "grad_norm": 1.822573064482853, + "learning_rate": 9.712175347031433e-06, + "loss": 0.2253, + "step": 9428 + }, + { + "epoch": 0.8043849172496161, + "grad_norm": 1.4729552433442312, + "learning_rate": 9.703994832548419e-06, + "loss": 0.1296, + "step": 9429 + }, + { + "epoch": 0.8044702269237332, + "grad_norm": 1.9256945855562924, + "learning_rate": 9.695817394451285e-06, + "loss": 0.1306, + "step": 9430 + }, + { + "epoch": 0.8045555365978502, + "grad_norm": 2.362226468352868, + "learning_rate": 9.687643033364297e-06, + "loss": 0.22, + "step": 9431 + }, + { + "epoch": 0.8046408462719672, + "grad_norm": 1.7496341595402807, + "learning_rate": 9.67947174991155e-06, + "loss": 0.2289, + "step": 9432 + }, + { + "epoch": 0.8047261559460843, + "grad_norm": 1.6167851928967227, + "learning_rate": 9.671303544716875e-06, + "loss": 0.1629, + "step": 9433 + }, + { + "epoch": 0.8048114656202013, + "grad_norm": 1.4018752396643301, + "learning_rate": 9.663138418403872e-06, + "loss": 0.2349, + "step": 9434 + }, + { + "epoch": 0.8048967752943184, + "grad_norm": 1.7146123152083779, + "learning_rate": 9.654976371595898e-06, + "loss": 0.1702, + "step": 9435 + }, + { + "epoch": 0.8049820849684354, + "grad_norm": 1.4991403341534093, + "learning_rate": 9.646817404916081e-06, + "loss": 0.2442, + "step": 9436 + }, + { + "epoch": 0.8050673946425525, + "grad_norm": 1.439066075290542, + "learning_rate": 9.638661518987324e-06, + "loss": 0.159, + "step": 9437 + }, + { + "epoch": 0.8051527043166695, + "grad_norm": 2.3781502560691776, + "learning_rate": 9.630508714432268e-06, + "loss": 0.1936, + "step": 9438 + }, + { + "epoch": 0.8052380139907865, + "grad_norm": 1.2720007719421444, + "learning_rate": 9.62235899187337e-06, + "loss": 0.1484, + "step": 9439 + }, + { + "epoch": 0.8053233236649036, + "grad_norm": 1.5809012439293135, + "learning_rate": 9.614212351932772e-06, + "loss": 0.1486, + "step": 9440 + }, + { + "epoch": 0.8054086333390207, + "grad_norm": 1.4793670500657334, + "learning_rate": 9.606068795232465e-06, + "loss": 0.2311, + "step": 9441 + }, + { + "epoch": 0.8054939430131377, + "grad_norm": 1.0152121071393405, + "learning_rate": 9.59792832239415e-06, + "loss": 0.1021, + "step": 9442 + }, + { + "epoch": 0.8055792526872547, + "grad_norm": 1.3166702866001505, + "learning_rate": 9.589790934039311e-06, + "loss": 0.1242, + "step": 9443 + }, + { + "epoch": 0.8056645623613717, + "grad_norm": 1.8264480463591606, + "learning_rate": 9.581656630789181e-06, + "loss": 0.1291, + "step": 9444 + }, + { + "epoch": 0.8057498720354889, + "grad_norm": 2.2049541049947177, + "learning_rate": 9.5735254132648e-06, + "loss": 0.2454, + "step": 9445 + }, + { + "epoch": 0.8058351817096059, + "grad_norm": 2.449142333181633, + "learning_rate": 9.56539728208693e-06, + "loss": 0.2225, + "step": 9446 + }, + { + "epoch": 0.8059204913837229, + "grad_norm": 2.3233784671305444, + "learning_rate": 9.557272237876102e-06, + "loss": 0.2166, + "step": 9447 + }, + { + "epoch": 0.8060058010578399, + "grad_norm": 1.897515720709411, + "learning_rate": 9.549150281252633e-06, + "loss": 0.2456, + "step": 9448 + }, + { + "epoch": 0.806091110731957, + "grad_norm": 1.6609898527548534, + "learning_rate": 9.541031412836581e-06, + "loss": 0.1761, + "step": 9449 + }, + { + "epoch": 0.8061764204060741, + "grad_norm": 1.3992721368518601, + "learning_rate": 9.53291563324778e-06, + "loss": 0.1186, + "step": 9450 + }, + { + "epoch": 0.8062617300801911, + "grad_norm": 1.3409532229680328, + "learning_rate": 9.52480294310582e-06, + "loss": 0.1626, + "step": 9451 + }, + { + "epoch": 0.8063470397543081, + "grad_norm": 1.6141674941885813, + "learning_rate": 9.516693343030093e-06, + "loss": 0.1981, + "step": 9452 + }, + { + "epoch": 0.8064323494284252, + "grad_norm": 1.5347259928032235, + "learning_rate": 9.508586833639677e-06, + "loss": 0.1703, + "step": 9453 + }, + { + "epoch": 0.8065176591025423, + "grad_norm": 1.6656111553826296, + "learning_rate": 9.500483415553497e-06, + "loss": 0.17, + "step": 9454 + }, + { + "epoch": 0.8066029687766593, + "grad_norm": 1.8142719157780907, + "learning_rate": 9.492383089390195e-06, + "loss": 0.1391, + "step": 9455 + }, + { + "epoch": 0.8066882784507763, + "grad_norm": 1.3612311833590025, + "learning_rate": 9.484285855768182e-06, + "loss": 0.165, + "step": 9456 + }, + { + "epoch": 0.8067735881248934, + "grad_norm": 1.8595930340298987, + "learning_rate": 9.476191715305649e-06, + "loss": 0.1868, + "step": 9457 + }, + { + "epoch": 0.8068588977990104, + "grad_norm": 1.8874027126073267, + "learning_rate": 9.468100668620532e-06, + "loss": 0.1813, + "step": 9458 + }, + { + "epoch": 0.8069442074731275, + "grad_norm": 1.6540434912391646, + "learning_rate": 9.460012716330546e-06, + "loss": 0.1807, + "step": 9459 + }, + { + "epoch": 0.8070295171472445, + "grad_norm": 1.861746229704513, + "learning_rate": 9.451927859053145e-06, + "loss": 0.1747, + "step": 9460 + }, + { + "epoch": 0.8071148268213616, + "grad_norm": 2.0689628376340505, + "learning_rate": 9.443846097405596e-06, + "loss": 0.2412, + "step": 9461 + }, + { + "epoch": 0.8072001364954786, + "grad_norm": 1.8794264360803525, + "learning_rate": 9.435767432004877e-06, + "loss": 0.1799, + "step": 9462 + }, + { + "epoch": 0.8072854461695956, + "grad_norm": 1.5273334473557572, + "learning_rate": 9.427691863467758e-06, + "loss": 0.1437, + "step": 9463 + }, + { + "epoch": 0.8073707558437127, + "grad_norm": 1.6727244278163615, + "learning_rate": 9.419619392410756e-06, + "loss": 0.2313, + "step": 9464 + }, + { + "epoch": 0.8074560655178297, + "grad_norm": 1.3010218858805886, + "learning_rate": 9.411550019450189e-06, + "loss": 0.1618, + "step": 9465 + }, + { + "epoch": 0.8075413751919468, + "grad_norm": 1.8246728630021105, + "learning_rate": 9.403483745202068e-06, + "loss": 0.1815, + "step": 9466 + }, + { + "epoch": 0.8076266848660638, + "grad_norm": 1.7751611072335156, + "learning_rate": 9.395420570282248e-06, + "loss": 0.181, + "step": 9467 + }, + { + "epoch": 0.8077119945401808, + "grad_norm": 2.3175855854119454, + "learning_rate": 9.387360495306292e-06, + "loss": 0.1967, + "step": 9468 + }, + { + "epoch": 0.8077973042142979, + "grad_norm": 1.6295119515630638, + "learning_rate": 9.379303520889548e-06, + "loss": 0.1905, + "step": 9469 + }, + { + "epoch": 0.807882613888415, + "grad_norm": 1.2825475960538437, + "learning_rate": 9.371249647647124e-06, + "loss": 0.1554, + "step": 9470 + }, + { + "epoch": 0.807967923562532, + "grad_norm": 1.6588317614069812, + "learning_rate": 9.363198876193884e-06, + "loss": 0.2221, + "step": 9471 + }, + { + "epoch": 0.808053233236649, + "grad_norm": 1.8009709653404375, + "learning_rate": 9.35515120714447e-06, + "loss": 0.2088, + "step": 9472 + }, + { + "epoch": 0.808138542910766, + "grad_norm": 1.7699133184727514, + "learning_rate": 9.347106641113263e-06, + "loss": 0.1844, + "step": 9473 + }, + { + "epoch": 0.8082238525848832, + "grad_norm": 1.5497249974589116, + "learning_rate": 9.33906517871444e-06, + "loss": 0.1605, + "step": 9474 + }, + { + "epoch": 0.8083091622590002, + "grad_norm": 2.372524577617464, + "learning_rate": 9.331026820561928e-06, + "loss": 0.1727, + "step": 9475 + }, + { + "epoch": 0.8083944719331172, + "grad_norm": 1.7979961049140605, + "learning_rate": 9.322991567269395e-06, + "loss": 0.1391, + "step": 9476 + }, + { + "epoch": 0.8084797816072342, + "grad_norm": 1.727548171170871, + "learning_rate": 9.314959419450303e-06, + "loss": 0.1948, + "step": 9477 + }, + { + "epoch": 0.8085650912813513, + "grad_norm": 2.1898398827672776, + "learning_rate": 9.306930377717859e-06, + "loss": 0.1301, + "step": 9478 + }, + { + "epoch": 0.8086504009554684, + "grad_norm": 2.249471178428514, + "learning_rate": 9.298904442685042e-06, + "loss": 0.2404, + "step": 9479 + }, + { + "epoch": 0.8087357106295854, + "grad_norm": 2.196277772731099, + "learning_rate": 9.290881614964569e-06, + "loss": 0.2351, + "step": 9480 + }, + { + "epoch": 0.8088210203037024, + "grad_norm": 1.5864716180089466, + "learning_rate": 9.282861895168981e-06, + "loss": 0.1897, + "step": 9481 + }, + { + "epoch": 0.8089063299778195, + "grad_norm": 1.5687187039722523, + "learning_rate": 9.274845283910493e-06, + "loss": 0.173, + "step": 9482 + }, + { + "epoch": 0.8089916396519365, + "grad_norm": 1.6334012079084257, + "learning_rate": 9.266831781801167e-06, + "loss": 0.2913, + "step": 9483 + }, + { + "epoch": 0.8090769493260536, + "grad_norm": 1.6637734175480672, + "learning_rate": 9.258821389452777e-06, + "loss": 0.2285, + "step": 9484 + }, + { + "epoch": 0.8091622590001706, + "grad_norm": 1.536179063492373, + "learning_rate": 9.250814107476875e-06, + "loss": 0.1917, + "step": 9485 + }, + { + "epoch": 0.8092475686742877, + "grad_norm": 1.9559066495520907, + "learning_rate": 9.242809936484765e-06, + "loss": 0.2574, + "step": 9486 + }, + { + "epoch": 0.8093328783484047, + "grad_norm": 1.5995379048870746, + "learning_rate": 9.234808877087554e-06, + "loss": 0.1623, + "step": 9487 + }, + { + "epoch": 0.8094181880225217, + "grad_norm": 1.2018665816894505, + "learning_rate": 9.226810929896034e-06, + "loss": 0.1529, + "step": 9488 + }, + { + "epoch": 0.8095034976966388, + "grad_norm": 1.6790039800900058, + "learning_rate": 9.218816095520848e-06, + "loss": 0.177, + "step": 9489 + }, + { + "epoch": 0.8095888073707559, + "grad_norm": 1.5665123349461152, + "learning_rate": 9.21082437457234e-06, + "loss": 0.2001, + "step": 9490 + }, + { + "epoch": 0.8096741170448729, + "grad_norm": 1.809208347374001, + "learning_rate": 9.20283576766064e-06, + "loss": 0.1647, + "step": 9491 + }, + { + "epoch": 0.8097594267189899, + "grad_norm": 2.424482553864164, + "learning_rate": 9.194850275395633e-06, + "loss": 0.1955, + "step": 9492 + }, + { + "epoch": 0.809844736393107, + "grad_norm": 1.8980286711365293, + "learning_rate": 9.186867898386952e-06, + "loss": 0.1829, + "step": 9493 + }, + { + "epoch": 0.8099300460672241, + "grad_norm": 1.5055523471555663, + "learning_rate": 9.178888637244054e-06, + "loss": 0.169, + "step": 9494 + }, + { + "epoch": 0.8100153557413411, + "grad_norm": 1.369255986162814, + "learning_rate": 9.170912492576061e-06, + "loss": 0.1822, + "step": 9495 + }, + { + "epoch": 0.8101006654154581, + "grad_norm": 1.7037995428773187, + "learning_rate": 9.162939464991948e-06, + "loss": 0.1804, + "step": 9496 + }, + { + "epoch": 0.8101859750895751, + "grad_norm": 1.872062289081985, + "learning_rate": 9.154969555100396e-06, + "loss": 0.1565, + "step": 9497 + }, + { + "epoch": 0.8102712847636923, + "grad_norm": 2.0533938668769993, + "learning_rate": 9.147002763509865e-06, + "loss": 0.1852, + "step": 9498 + }, + { + "epoch": 0.8103565944378093, + "grad_norm": 1.7283805147804348, + "learning_rate": 9.139039090828588e-06, + "loss": 0.2533, + "step": 9499 + }, + { + "epoch": 0.8104419041119263, + "grad_norm": 1.3754336800265017, + "learning_rate": 9.131078537664539e-06, + "loss": 0.1473, + "step": 9500 + }, + { + "epoch": 0.8105272137860433, + "grad_norm": 1.8152526473653992, + "learning_rate": 9.123121104625465e-06, + "loss": 0.1806, + "step": 9501 + }, + { + "epoch": 0.8106125234601603, + "grad_norm": 2.5699614775030404, + "learning_rate": 9.115166792318858e-06, + "loss": 0.2727, + "step": 9502 + }, + { + "epoch": 0.8106978331342775, + "grad_norm": 1.5173764677844328, + "learning_rate": 9.107215601352038e-06, + "loss": 0.1589, + "step": 9503 + }, + { + "epoch": 0.8107831428083945, + "grad_norm": 1.5361335885348484, + "learning_rate": 9.099267532331973e-06, + "loss": 0.1634, + "step": 9504 + }, + { + "epoch": 0.8108684524825115, + "grad_norm": 2.410652987392121, + "learning_rate": 9.091322585865497e-06, + "loss": 0.1834, + "step": 9505 + }, + { + "epoch": 0.8109537621566285, + "grad_norm": 1.6491779996390348, + "learning_rate": 9.083380762559146e-06, + "loss": 0.1653, + "step": 9506 + }, + { + "epoch": 0.8110390718307456, + "grad_norm": 1.594821644440432, + "learning_rate": 9.075442063019263e-06, + "loss": 0.2156, + "step": 9507 + }, + { + "epoch": 0.8111243815048627, + "grad_norm": 1.7464488647645968, + "learning_rate": 9.067506487851884e-06, + "loss": 0.1276, + "step": 9508 + }, + { + "epoch": 0.8112096911789797, + "grad_norm": 1.5684050556907903, + "learning_rate": 9.059574037662882e-06, + "loss": 0.191, + "step": 9509 + }, + { + "epoch": 0.8112950008530967, + "grad_norm": 2.2393236113179475, + "learning_rate": 9.051644713057844e-06, + "loss": 0.1575, + "step": 9510 + }, + { + "epoch": 0.8113803105272138, + "grad_norm": 1.8956817594307147, + "learning_rate": 9.04371851464213e-06, + "loss": 0.1602, + "step": 9511 + }, + { + "epoch": 0.8114656202013308, + "grad_norm": 1.5632618381986, + "learning_rate": 9.035795443020873e-06, + "loss": 0.152, + "step": 9512 + }, + { + "epoch": 0.8115509298754479, + "grad_norm": 2.0695974590673725, + "learning_rate": 9.027875498798943e-06, + "loss": 0.1974, + "step": 9513 + }, + { + "epoch": 0.8116362395495649, + "grad_norm": 1.9272779486734317, + "learning_rate": 9.019958682580998e-06, + "loss": 0.1975, + "step": 9514 + }, + { + "epoch": 0.811721549223682, + "grad_norm": 2.229938718013612, + "learning_rate": 9.012044994971425e-06, + "loss": 0.2543, + "step": 9515 + }, + { + "epoch": 0.811806858897799, + "grad_norm": 1.614717375410822, + "learning_rate": 9.004134436574429e-06, + "loss": 0.1726, + "step": 9516 + }, + { + "epoch": 0.811892168571916, + "grad_norm": 1.8403100072889504, + "learning_rate": 8.996227007993896e-06, + "loss": 0.2188, + "step": 9517 + }, + { + "epoch": 0.8119774782460331, + "grad_norm": 1.458809838330267, + "learning_rate": 8.988322709833553e-06, + "loss": 0.1807, + "step": 9518 + }, + { + "epoch": 0.8120627879201502, + "grad_norm": 1.8271591373784781, + "learning_rate": 8.980421542696832e-06, + "loss": 0.1875, + "step": 9519 + }, + { + "epoch": 0.8121480975942672, + "grad_norm": 1.4265502528054512, + "learning_rate": 8.972523507186948e-06, + "loss": 0.092, + "step": 9520 + }, + { + "epoch": 0.8122334072683842, + "grad_norm": 2.3082803942764567, + "learning_rate": 8.964628603906872e-06, + "loss": 0.2279, + "step": 9521 + }, + { + "epoch": 0.8123187169425012, + "grad_norm": 1.5982686480313337, + "learning_rate": 8.956736833459328e-06, + "loss": 0.139, + "step": 9522 + }, + { + "epoch": 0.8124040266166184, + "grad_norm": 1.225066420642306, + "learning_rate": 8.948848196446852e-06, + "loss": 0.1474, + "step": 9523 + }, + { + "epoch": 0.8124893362907354, + "grad_norm": 2.223692318970739, + "learning_rate": 8.940962693471645e-06, + "loss": 0.1919, + "step": 9524 + }, + { + "epoch": 0.8125746459648524, + "grad_norm": 1.5631749030699957, + "learning_rate": 8.933080325135756e-06, + "loss": 0.2019, + "step": 9525 + }, + { + "epoch": 0.8126599556389694, + "grad_norm": 3.169408882454987, + "learning_rate": 8.925201092040958e-06, + "loss": 0.1926, + "step": 9526 + }, + { + "epoch": 0.8127452653130866, + "grad_norm": 2.357859547969136, + "learning_rate": 8.91732499478879e-06, + "loss": 0.1927, + "step": 9527 + }, + { + "epoch": 0.8128305749872036, + "grad_norm": 1.6423772264926113, + "learning_rate": 8.909452033980526e-06, + "loss": 0.2174, + "step": 9528 + }, + { + "epoch": 0.8129158846613206, + "grad_norm": 2.0572326909235357, + "learning_rate": 8.901582210217274e-06, + "loss": 0.2253, + "step": 9529 + }, + { + "epoch": 0.8130011943354376, + "grad_norm": 2.0742223132546487, + "learning_rate": 8.8937155240998e-06, + "loss": 0.1048, + "step": 9530 + }, + { + "epoch": 0.8130865040095547, + "grad_norm": 2.225097816974566, + "learning_rate": 8.885851976228714e-06, + "loss": 0.1901, + "step": 9531 + }, + { + "epoch": 0.8131718136836718, + "grad_norm": 2.2362106359012057, + "learning_rate": 8.877991567204352e-06, + "loss": 0.1667, + "step": 9532 + }, + { + "epoch": 0.8132571233577888, + "grad_norm": 1.8421351586332446, + "learning_rate": 8.870134297626815e-06, + "loss": 0.1858, + "step": 9533 + }, + { + "epoch": 0.8133424330319058, + "grad_norm": 1.835893216512143, + "learning_rate": 8.862280168095955e-06, + "loss": 0.1281, + "step": 9534 + }, + { + "epoch": 0.8134277427060229, + "grad_norm": 1.4614369563711975, + "learning_rate": 8.854429179211388e-06, + "loss": 0.1935, + "step": 9535 + }, + { + "epoch": 0.8135130523801399, + "grad_norm": 1.82418489935302, + "learning_rate": 8.846581331572528e-06, + "loss": 0.2163, + "step": 9536 + }, + { + "epoch": 0.813598362054257, + "grad_norm": 1.5964362749283876, + "learning_rate": 8.838736625778476e-06, + "loss": 0.1574, + "step": 9537 + }, + { + "epoch": 0.813683671728374, + "grad_norm": 1.6897672328123852, + "learning_rate": 8.830895062428163e-06, + "loss": 0.1509, + "step": 9538 + }, + { + "epoch": 0.813768981402491, + "grad_norm": 1.448228770404176, + "learning_rate": 8.823056642120236e-06, + "loss": 0.1879, + "step": 9539 + }, + { + "epoch": 0.8138542910766081, + "grad_norm": 1.6340002093367518, + "learning_rate": 8.81522136545312e-06, + "loss": 0.1627, + "step": 9540 + }, + { + "epoch": 0.8139396007507251, + "grad_norm": 2.047286130660162, + "learning_rate": 8.807389233025e-06, + "loss": 0.1882, + "step": 9541 + }, + { + "epoch": 0.8140249104248422, + "grad_norm": 1.5689949847421425, + "learning_rate": 8.799560245433814e-06, + "loss": 0.1572, + "step": 9542 + }, + { + "epoch": 0.8141102200989592, + "grad_norm": 2.088621206188004, + "learning_rate": 8.791734403277262e-06, + "loss": 0.2116, + "step": 9543 + }, + { + "epoch": 0.8141955297730763, + "grad_norm": 2.3048064637596326, + "learning_rate": 8.783911707152797e-06, + "loss": 0.1587, + "step": 9544 + }, + { + "epoch": 0.8142808394471933, + "grad_norm": 1.80134133588499, + "learning_rate": 8.776092157657679e-06, + "loss": 0.1708, + "step": 9545 + }, + { + "epoch": 0.8143661491213103, + "grad_norm": 1.8494909101532513, + "learning_rate": 8.768275755388833e-06, + "loss": 0.2194, + "step": 9546 + }, + { + "epoch": 0.8144514587954274, + "grad_norm": 2.216290682501567, + "learning_rate": 8.76046250094304e-06, + "loss": 0.2314, + "step": 9547 + }, + { + "epoch": 0.8145367684695445, + "grad_norm": 2.3399863987714857, + "learning_rate": 8.752652394916788e-06, + "loss": 0.2216, + "step": 9548 + }, + { + "epoch": 0.8146220781436615, + "grad_norm": 1.9449089275112257, + "learning_rate": 8.74484543790634e-06, + "loss": 0.2268, + "step": 9549 + }, + { + "epoch": 0.8147073878177785, + "grad_norm": 1.9034763857411572, + "learning_rate": 8.737041630507697e-06, + "loss": 0.185, + "step": 9550 + }, + { + "epoch": 0.8147926974918955, + "grad_norm": 2.000860625110951, + "learning_rate": 8.729240973316671e-06, + "loss": 0.1851, + "step": 9551 + }, + { + "epoch": 0.8148780071660127, + "grad_norm": 2.0865261661690595, + "learning_rate": 8.721443466928786e-06, + "loss": 0.1978, + "step": 9552 + }, + { + "epoch": 0.8149633168401297, + "grad_norm": 1.6327175861132317, + "learning_rate": 8.713649111939332e-06, + "loss": 0.1743, + "step": 9553 + }, + { + "epoch": 0.8150486265142467, + "grad_norm": 2.801599359136886, + "learning_rate": 8.705857908943376e-06, + "loss": 0.1579, + "step": 9554 + }, + { + "epoch": 0.8151339361883637, + "grad_norm": 1.825966367530904, + "learning_rate": 8.698069858535728e-06, + "loss": 0.1519, + "step": 9555 + }, + { + "epoch": 0.8152192458624808, + "grad_norm": 2.5202124805788944, + "learning_rate": 8.690284961310973e-06, + "loss": 0.1701, + "step": 9556 + }, + { + "epoch": 0.8153045555365979, + "grad_norm": 1.637622665586468, + "learning_rate": 8.68250321786343e-06, + "loss": 0.1136, + "step": 9557 + }, + { + "epoch": 0.8153898652107149, + "grad_norm": 1.6881706176831088, + "learning_rate": 8.674724628787228e-06, + "loss": 0.167, + "step": 9558 + }, + { + "epoch": 0.8154751748848319, + "grad_norm": 1.7567860637072075, + "learning_rate": 8.666949194676171e-06, + "loss": 0.2053, + "step": 9559 + }, + { + "epoch": 0.815560484558949, + "grad_norm": 2.121267294062992, + "learning_rate": 8.659176916123918e-06, + "loss": 0.2048, + "step": 9560 + }, + { + "epoch": 0.815645794233066, + "grad_norm": 1.7425048265431355, + "learning_rate": 8.651407793723815e-06, + "loss": 0.1378, + "step": 9561 + }, + { + "epoch": 0.8157311039071831, + "grad_norm": 1.5923205678512107, + "learning_rate": 8.643641828069005e-06, + "loss": 0.182, + "step": 9562 + }, + { + "epoch": 0.8158164135813001, + "grad_norm": 1.5482606942774393, + "learning_rate": 8.635879019752374e-06, + "loss": 0.1586, + "step": 9563 + }, + { + "epoch": 0.8159017232554172, + "grad_norm": 2.1540868108570965, + "learning_rate": 8.62811936936656e-06, + "loss": 0.1388, + "step": 9564 + }, + { + "epoch": 0.8159870329295342, + "grad_norm": 2.0110336174035353, + "learning_rate": 8.620362877504006e-06, + "loss": 0.1961, + "step": 9565 + }, + { + "epoch": 0.8160723426036512, + "grad_norm": 2.160241013255596, + "learning_rate": 8.612609544756828e-06, + "loss": 0.1596, + "step": 9566 + }, + { + "epoch": 0.8161576522777683, + "grad_norm": 1.6248135826581778, + "learning_rate": 8.604859371716994e-06, + "loss": 0.1788, + "step": 9567 + }, + { + "epoch": 0.8162429619518854, + "grad_norm": 1.5956457494274898, + "learning_rate": 8.597112358976172e-06, + "loss": 0.1407, + "step": 9568 + }, + { + "epoch": 0.8163282716260024, + "grad_norm": 1.572897837275628, + "learning_rate": 8.589368507125805e-06, + "loss": 0.1601, + "step": 9569 + }, + { + "epoch": 0.8164135813001194, + "grad_norm": 1.722374717906727, + "learning_rate": 8.581627816757088e-06, + "loss": 0.1502, + "step": 9570 + }, + { + "epoch": 0.8164988909742364, + "grad_norm": 1.9884308944177567, + "learning_rate": 8.573890288461011e-06, + "loss": 0.1559, + "step": 9571 + }, + { + "epoch": 0.8165842006483536, + "grad_norm": 2.3773282334206494, + "learning_rate": 8.56615592282825e-06, + "loss": 0.2472, + "step": 9572 + }, + { + "epoch": 0.8166695103224706, + "grad_norm": 1.8678133885172632, + "learning_rate": 8.558424720449321e-06, + "loss": 0.2052, + "step": 9573 + }, + { + "epoch": 0.8167548199965876, + "grad_norm": 1.5452133656053333, + "learning_rate": 8.550696681914438e-06, + "loss": 0.1618, + "step": 9574 + }, + { + "epoch": 0.8168401296707046, + "grad_norm": 2.1371103656636117, + "learning_rate": 8.542971807813604e-06, + "loss": 0.2495, + "step": 9575 + }, + { + "epoch": 0.8169254393448218, + "grad_norm": 2.3857419876304666, + "learning_rate": 8.535250098736575e-06, + "loss": 0.16, + "step": 9576 + }, + { + "epoch": 0.8170107490189388, + "grad_norm": 1.7992180847882306, + "learning_rate": 8.527531555272849e-06, + "loss": 0.1633, + "step": 9577 + }, + { + "epoch": 0.8170960586930558, + "grad_norm": 1.8604722832386456, + "learning_rate": 8.519816178011714e-06, + "loss": 0.2043, + "step": 9578 + }, + { + "epoch": 0.8171813683671728, + "grad_norm": 1.8296286113328342, + "learning_rate": 8.512103967542167e-06, + "loss": 0.205, + "step": 9579 + }, + { + "epoch": 0.8172666780412898, + "grad_norm": 1.9184102406843277, + "learning_rate": 8.504394924453029e-06, + "loss": 0.2257, + "step": 9580 + }, + { + "epoch": 0.817351987715407, + "grad_norm": 1.3152529591402178, + "learning_rate": 8.496689049332835e-06, + "loss": 0.1263, + "step": 9581 + }, + { + "epoch": 0.817437297389524, + "grad_norm": 1.6557947648805276, + "learning_rate": 8.488986342769883e-06, + "loss": 0.149, + "step": 9582 + }, + { + "epoch": 0.817522607063641, + "grad_norm": 1.585085484940125, + "learning_rate": 8.481286805352234e-06, + "loss": 0.1805, + "step": 9583 + }, + { + "epoch": 0.817607916737758, + "grad_norm": 2.4168207404470605, + "learning_rate": 8.473590437667706e-06, + "loss": 0.181, + "step": 9584 + }, + { + "epoch": 0.8176932264118751, + "grad_norm": 1.3342099089865107, + "learning_rate": 8.465897240303877e-06, + "loss": 0.1654, + "step": 9585 + }, + { + "epoch": 0.8177785360859922, + "grad_norm": 1.8099030931359195, + "learning_rate": 8.458207213848074e-06, + "loss": 0.1794, + "step": 9586 + }, + { + "epoch": 0.8178638457601092, + "grad_norm": 1.826555474939208, + "learning_rate": 8.450520358887415e-06, + "loss": 0.1479, + "step": 9587 + }, + { + "epoch": 0.8179491554342262, + "grad_norm": 2.029449838962458, + "learning_rate": 8.442836676008715e-06, + "loss": 0.2055, + "step": 9588 + }, + { + "epoch": 0.8180344651083433, + "grad_norm": 1.9069622772758577, + "learning_rate": 8.435156165798608e-06, + "loss": 0.2178, + "step": 9589 + }, + { + "epoch": 0.8181197747824603, + "grad_norm": 1.7829283453492062, + "learning_rate": 8.427478828843455e-06, + "loss": 0.1722, + "step": 9590 + }, + { + "epoch": 0.8182050844565774, + "grad_norm": 1.471147325768844, + "learning_rate": 8.419804665729375e-06, + "loss": 0.208, + "step": 9591 + }, + { + "epoch": 0.8182903941306944, + "grad_norm": 2.8385041867269467, + "learning_rate": 8.41213367704224e-06, + "loss": 0.2002, + "step": 9592 + }, + { + "epoch": 0.8183757038048115, + "grad_norm": 2.3325055788551987, + "learning_rate": 8.404465863367727e-06, + "loss": 0.1975, + "step": 9593 + }, + { + "epoch": 0.8184610134789285, + "grad_norm": 1.6529199483864685, + "learning_rate": 8.396801225291179e-06, + "loss": 0.1888, + "step": 9594 + }, + { + "epoch": 0.8185463231530455, + "grad_norm": 2.0688689183287585, + "learning_rate": 8.389139763397796e-06, + "loss": 0.1926, + "step": 9595 + }, + { + "epoch": 0.8186316328271626, + "grad_norm": 1.4460175315353436, + "learning_rate": 8.381481478272469e-06, + "loss": 0.1732, + "step": 9596 + }, + { + "epoch": 0.8187169425012797, + "grad_norm": 2.8761844682916444, + "learning_rate": 8.37382637049987e-06, + "loss": 0.2097, + "step": 9597 + }, + { + "epoch": 0.8188022521753967, + "grad_norm": 2.38526904941777, + "learning_rate": 8.366174440664425e-06, + "loss": 0.2401, + "step": 9598 + }, + { + "epoch": 0.8188875618495137, + "grad_norm": 1.5741755855880482, + "learning_rate": 8.35852568935031e-06, + "loss": 0.1445, + "step": 9599 + }, + { + "epoch": 0.8189728715236307, + "grad_norm": 1.6635626816860365, + "learning_rate": 8.350880117141503e-06, + "loss": 0.1884, + "step": 9600 + }, + { + "epoch": 0.8190581811977479, + "grad_norm": 2.1268266792648647, + "learning_rate": 8.34323772462165e-06, + "loss": 0.1975, + "step": 9601 + }, + { + "epoch": 0.8191434908718649, + "grad_norm": 1.6821287645137821, + "learning_rate": 8.335598512374243e-06, + "loss": 0.146, + "step": 9602 + }, + { + "epoch": 0.8192288005459819, + "grad_norm": 1.6186270825470381, + "learning_rate": 8.327962480982482e-06, + "loss": 0.227, + "step": 9603 + }, + { + "epoch": 0.8193141102200989, + "grad_norm": 1.8359415257186582, + "learning_rate": 8.320329631029344e-06, + "loss": 0.1707, + "step": 9604 + }, + { + "epoch": 0.819399419894216, + "grad_norm": 1.6333694603226776, + "learning_rate": 8.312699963097554e-06, + "loss": 0.2028, + "step": 9605 + }, + { + "epoch": 0.8194847295683331, + "grad_norm": 1.749458160985968, + "learning_rate": 8.305073477769599e-06, + "loss": 0.2397, + "step": 9606 + }, + { + "epoch": 0.8195700392424501, + "grad_norm": 2.1772265470816063, + "learning_rate": 8.297450175627714e-06, + "loss": 0.2226, + "step": 9607 + }, + { + "epoch": 0.8196553489165671, + "grad_norm": 2.0784434597977812, + "learning_rate": 8.289830057253883e-06, + "loss": 0.1791, + "step": 9608 + }, + { + "epoch": 0.8197406585906842, + "grad_norm": 1.4920719761093528, + "learning_rate": 8.282213123229898e-06, + "loss": 0.1447, + "step": 9609 + }, + { + "epoch": 0.8198259682648013, + "grad_norm": 1.8919534351030272, + "learning_rate": 8.274599374137254e-06, + "loss": 0.1725, + "step": 9610 + }, + { + "epoch": 0.8199112779389183, + "grad_norm": 2.0200584210549697, + "learning_rate": 8.26698881055722e-06, + "loss": 0.1563, + "step": 9611 + }, + { + "epoch": 0.8199965876130353, + "grad_norm": 2.048808924145908, + "learning_rate": 8.259381433070801e-06, + "loss": 0.2407, + "step": 9612 + }, + { + "epoch": 0.8200818972871524, + "grad_norm": 2.327014222879062, + "learning_rate": 8.251777242258834e-06, + "loss": 0.1171, + "step": 9613 + }, + { + "epoch": 0.8201672069612694, + "grad_norm": 1.865564191328828, + "learning_rate": 8.244176238701795e-06, + "loss": 0.2048, + "step": 9614 + }, + { + "epoch": 0.8202525166353865, + "grad_norm": 1.2513627729937888, + "learning_rate": 8.236578422980024e-06, + "loss": 0.1097, + "step": 9615 + }, + { + "epoch": 0.8203378263095035, + "grad_norm": 1.619301708569612, + "learning_rate": 8.228983795673562e-06, + "loss": 0.1779, + "step": 9616 + }, + { + "epoch": 0.8204231359836205, + "grad_norm": 1.7803054446776476, + "learning_rate": 8.221392357362211e-06, + "loss": 0.1331, + "step": 9617 + }, + { + "epoch": 0.8205084456577376, + "grad_norm": 1.7311154915788622, + "learning_rate": 8.213804108625545e-06, + "loss": 0.1311, + "step": 9618 + }, + { + "epoch": 0.8205937553318546, + "grad_norm": 1.3376557749651696, + "learning_rate": 8.206219050042884e-06, + "loss": 0.1469, + "step": 9619 + }, + { + "epoch": 0.8206790650059717, + "grad_norm": 1.3472901662536323, + "learning_rate": 8.198637182193303e-06, + "loss": 0.1639, + "step": 9620 + }, + { + "epoch": 0.8207643746800887, + "grad_norm": 1.6558559596770481, + "learning_rate": 8.191058505655636e-06, + "loss": 0.1319, + "step": 9621 + }, + { + "epoch": 0.8208496843542058, + "grad_norm": 2.0518196310891827, + "learning_rate": 8.183483021008498e-06, + "loss": 0.2335, + "step": 9622 + }, + { + "epoch": 0.8209349940283228, + "grad_norm": 1.489491048590688, + "learning_rate": 8.175910728830194e-06, + "loss": 0.2237, + "step": 9623 + }, + { + "epoch": 0.8210203037024398, + "grad_norm": 1.8383261218351759, + "learning_rate": 8.168341629698867e-06, + "loss": 0.1957, + "step": 9624 + }, + { + "epoch": 0.8211056133765569, + "grad_norm": 1.6564781463682978, + "learning_rate": 8.160775724192365e-06, + "loss": 0.1996, + "step": 9625 + }, + { + "epoch": 0.821190923050674, + "grad_norm": 2.5022196452807175, + "learning_rate": 8.153213012888305e-06, + "loss": 0.1884, + "step": 9626 + }, + { + "epoch": 0.821276232724791, + "grad_norm": 3.0713665344461925, + "learning_rate": 8.145653496364054e-06, + "loss": 0.1644, + "step": 9627 + }, + { + "epoch": 0.821361542398908, + "grad_norm": 1.2797411431141892, + "learning_rate": 8.13809717519673e-06, + "loss": 0.1498, + "step": 9628 + }, + { + "epoch": 0.821446852073025, + "grad_norm": 1.5511309712227639, + "learning_rate": 8.130544049963263e-06, + "loss": 0.1178, + "step": 9629 + }, + { + "epoch": 0.8215321617471422, + "grad_norm": 1.6586036512930145, + "learning_rate": 8.122994121240236e-06, + "loss": 0.1757, + "step": 9630 + }, + { + "epoch": 0.8216174714212592, + "grad_norm": 1.4192136233870407, + "learning_rate": 8.115447389604085e-06, + "loss": 0.1427, + "step": 9631 + }, + { + "epoch": 0.8217027810953762, + "grad_norm": 1.5434354824648466, + "learning_rate": 8.107903855630956e-06, + "loss": 0.1352, + "step": 9632 + }, + { + "epoch": 0.8217880907694932, + "grad_norm": 1.746521899409563, + "learning_rate": 8.100363519896748e-06, + "loss": 0.1373, + "step": 9633 + }, + { + "epoch": 0.8218734004436103, + "grad_norm": 1.3676565183231557, + "learning_rate": 8.09282638297712e-06, + "loss": 0.1956, + "step": 9634 + }, + { + "epoch": 0.8219587101177274, + "grad_norm": 1.3690828537010302, + "learning_rate": 8.085292445447529e-06, + "loss": 0.1797, + "step": 9635 + }, + { + "epoch": 0.8220440197918444, + "grad_norm": 1.9317974956445272, + "learning_rate": 8.077761707883102e-06, + "loss": 0.1884, + "step": 9636 + }, + { + "epoch": 0.8221293294659614, + "grad_norm": 1.5190124410555508, + "learning_rate": 8.070234170858803e-06, + "loss": 0.2198, + "step": 9637 + }, + { + "epoch": 0.8222146391400785, + "grad_norm": 1.5462428641250985, + "learning_rate": 8.062709834949312e-06, + "loss": 0.2119, + "step": 9638 + }, + { + "epoch": 0.8222999488141955, + "grad_norm": 1.5214501353807404, + "learning_rate": 8.055188700729072e-06, + "loss": 0.1448, + "step": 9639 + }, + { + "epoch": 0.8223852584883126, + "grad_norm": 1.5312717202650774, + "learning_rate": 8.04767076877228e-06, + "loss": 0.2187, + "step": 9640 + }, + { + "epoch": 0.8224705681624296, + "grad_norm": 1.3840958410688542, + "learning_rate": 8.040156039652874e-06, + "loss": 0.1891, + "step": 9641 + }, + { + "epoch": 0.8225558778365467, + "grad_norm": 1.6118229886405657, + "learning_rate": 8.032644513944609e-06, + "loss": 0.1585, + "step": 9642 + }, + { + "epoch": 0.8226411875106637, + "grad_norm": 1.5301737214240796, + "learning_rate": 8.025136192220894e-06, + "loss": 0.2561, + "step": 9643 + }, + { + "epoch": 0.8227264971847807, + "grad_norm": 1.746438694277375, + "learning_rate": 8.017631075054994e-06, + "loss": 0.1854, + "step": 9644 + }, + { + "epoch": 0.8228118068588978, + "grad_norm": 1.5454650772725245, + "learning_rate": 8.010129163019864e-06, + "loss": 0.1251, + "step": 9645 + }, + { + "epoch": 0.8228971165330149, + "grad_norm": 1.2296433359671228, + "learning_rate": 8.002630456688238e-06, + "loss": 0.1589, + "step": 9646 + }, + { + "epoch": 0.8229824262071319, + "grad_norm": 1.9834786140167893, + "learning_rate": 7.995134956632599e-06, + "loss": 0.1845, + "step": 9647 + }, + { + "epoch": 0.8230677358812489, + "grad_norm": 1.663072891766581, + "learning_rate": 7.9876426634252e-06, + "loss": 0.1746, + "step": 9648 + }, + { + "epoch": 0.823153045555366, + "grad_norm": 1.7623104615800205, + "learning_rate": 7.980153577638022e-06, + "loss": 0.2428, + "step": 9649 + }, + { + "epoch": 0.8232383552294831, + "grad_norm": 1.3974964235973146, + "learning_rate": 7.972667699842818e-06, + "loss": 0.1574, + "step": 9650 + }, + { + "epoch": 0.8233236649036001, + "grad_norm": 2.0009964606364843, + "learning_rate": 7.965185030611127e-06, + "loss": 0.2131, + "step": 9651 + }, + { + "epoch": 0.8234089745777171, + "grad_norm": 1.4617862785721454, + "learning_rate": 7.957705570514163e-06, + "loss": 0.1372, + "step": 9652 + }, + { + "epoch": 0.8234942842518341, + "grad_norm": 1.8799651209659674, + "learning_rate": 7.95022932012297e-06, + "loss": 0.2173, + "step": 9653 + }, + { + "epoch": 0.8235795939259511, + "grad_norm": 1.454356830189986, + "learning_rate": 7.942756280008324e-06, + "loss": 0.1887, + "step": 9654 + }, + { + "epoch": 0.8236649036000683, + "grad_norm": 1.5341614273509514, + "learning_rate": 7.935286450740743e-06, + "loss": 0.1259, + "step": 9655 + }, + { + "epoch": 0.8237502132741853, + "grad_norm": 1.600781208297111, + "learning_rate": 7.927819832890498e-06, + "loss": 0.1533, + "step": 9656 + }, + { + "epoch": 0.8238355229483023, + "grad_norm": 1.95002066527934, + "learning_rate": 7.920356427027648e-06, + "loss": 0.1801, + "step": 9657 + }, + { + "epoch": 0.8239208326224193, + "grad_norm": 1.529806897312993, + "learning_rate": 7.912896233721973e-06, + "loss": 0.2243, + "step": 9658 + }, + { + "epoch": 0.8240061422965365, + "grad_norm": 1.9274990629992947, + "learning_rate": 7.905439253543023e-06, + "loss": 0.1626, + "step": 9659 + }, + { + "epoch": 0.8240914519706535, + "grad_norm": 1.7428130664656938, + "learning_rate": 7.897985487060094e-06, + "loss": 0.1914, + "step": 9660 + }, + { + "epoch": 0.8241767616447705, + "grad_norm": 1.6875719832149692, + "learning_rate": 7.890534934842242e-06, + "loss": 0.1858, + "step": 9661 + }, + { + "epoch": 0.8242620713188875, + "grad_norm": 1.7967070127827505, + "learning_rate": 7.883087597458278e-06, + "loss": 0.1754, + "step": 9662 + }, + { + "epoch": 0.8243473809930046, + "grad_norm": 2.463866413371856, + "learning_rate": 7.875643475476757e-06, + "loss": 0.1504, + "step": 9663 + }, + { + "epoch": 0.8244326906671217, + "grad_norm": 1.3222775254495867, + "learning_rate": 7.868202569466031e-06, + "loss": 0.2099, + "step": 9664 + }, + { + "epoch": 0.8245180003412387, + "grad_norm": 2.3522826451296464, + "learning_rate": 7.860764879994126e-06, + "loss": 0.2405, + "step": 9665 + }, + { + "epoch": 0.8246033100153557, + "grad_norm": 2.3183554671012767, + "learning_rate": 7.853330407628912e-06, + "loss": 0.2313, + "step": 9666 + }, + { + "epoch": 0.8246886196894728, + "grad_norm": 2.19394584281519, + "learning_rate": 7.845899152937946e-06, + "loss": 0.1576, + "step": 9667 + }, + { + "epoch": 0.8247739293635898, + "grad_norm": 1.8275917775373869, + "learning_rate": 7.838471116488577e-06, + "loss": 0.2026, + "step": 9668 + }, + { + "epoch": 0.8248592390377069, + "grad_norm": 1.5294864386025593, + "learning_rate": 7.831046298847894e-06, + "loss": 0.1636, + "step": 9669 + }, + { + "epoch": 0.8249445487118239, + "grad_norm": 1.5434438239657564, + "learning_rate": 7.823624700582728e-06, + "loss": 0.1647, + "step": 9670 + }, + { + "epoch": 0.825029858385941, + "grad_norm": 1.6609210238711627, + "learning_rate": 7.816206322259712e-06, + "loss": 0.1464, + "step": 9671 + }, + { + "epoch": 0.825115168060058, + "grad_norm": 1.788487104480848, + "learning_rate": 7.808791164445156e-06, + "loss": 0.2269, + "step": 9672 + }, + { + "epoch": 0.825200477734175, + "grad_norm": 2.3144976261544556, + "learning_rate": 7.801379227705203e-06, + "loss": 0.2133, + "step": 9673 + }, + { + "epoch": 0.8252857874082921, + "grad_norm": 1.5679561719024024, + "learning_rate": 7.793970512605703e-06, + "loss": 0.1809, + "step": 9674 + }, + { + "epoch": 0.8253710970824092, + "grad_norm": 1.4580228565868196, + "learning_rate": 7.786565019712271e-06, + "loss": 0.2076, + "step": 9675 + }, + { + "epoch": 0.8254564067565262, + "grad_norm": 1.6957642968424465, + "learning_rate": 7.779162749590268e-06, + "loss": 0.2063, + "step": 9676 + }, + { + "epoch": 0.8255417164306432, + "grad_norm": 1.808689848157332, + "learning_rate": 7.771763702804852e-06, + "loss": 0.1486, + "step": 9677 + }, + { + "epoch": 0.8256270261047602, + "grad_norm": 2.1927040323561093, + "learning_rate": 7.764367879920852e-06, + "loss": 0.1555, + "step": 9678 + }, + { + "epoch": 0.8257123357788774, + "grad_norm": 1.6563124734864072, + "learning_rate": 7.756975281502932e-06, + "loss": 0.114, + "step": 9679 + }, + { + "epoch": 0.8257976454529944, + "grad_norm": 1.9157873361825464, + "learning_rate": 7.749585908115475e-06, + "loss": 0.209, + "step": 9680 + }, + { + "epoch": 0.8258829551271114, + "grad_norm": 1.8999966470789187, + "learning_rate": 7.742199760322616e-06, + "loss": 0.2039, + "step": 9681 + }, + { + "epoch": 0.8259682648012284, + "grad_norm": 1.6892532142155807, + "learning_rate": 7.734816838688248e-06, + "loss": 0.1773, + "step": 9682 + }, + { + "epoch": 0.8260535744753456, + "grad_norm": 1.5208111939930802, + "learning_rate": 7.727437143776012e-06, + "loss": 0.2208, + "step": 9683 + }, + { + "epoch": 0.8261388841494626, + "grad_norm": 1.6644457165388253, + "learning_rate": 7.720060676149315e-06, + "loss": 0.1195, + "step": 9684 + }, + { + "epoch": 0.8262241938235796, + "grad_norm": 1.6918907969159727, + "learning_rate": 7.712687436371302e-06, + "loss": 0.1475, + "step": 9685 + }, + { + "epoch": 0.8263095034976966, + "grad_norm": 2.0810684354015625, + "learning_rate": 7.705317425004899e-06, + "loss": 0.1657, + "step": 9686 + }, + { + "epoch": 0.8263948131718137, + "grad_norm": 1.8812560933670266, + "learning_rate": 7.697950642612756e-06, + "loss": 0.2208, + "step": 9687 + }, + { + "epoch": 0.8264801228459308, + "grad_norm": 2.0331110718159437, + "learning_rate": 7.69058708975729e-06, + "loss": 0.2372, + "step": 9688 + }, + { + "epoch": 0.8265654325200478, + "grad_norm": 1.5858351199924643, + "learning_rate": 7.683226767000667e-06, + "loss": 0.2685, + "step": 9689 + }, + { + "epoch": 0.8266507421941648, + "grad_norm": 1.6968330560772715, + "learning_rate": 7.675869674904806e-06, + "loss": 0.248, + "step": 9690 + }, + { + "epoch": 0.8267360518682819, + "grad_norm": 2.0482112803254795, + "learning_rate": 7.66851581403139e-06, + "loss": 0.1906, + "step": 9691 + }, + { + "epoch": 0.8268213615423989, + "grad_norm": 1.4230721432968414, + "learning_rate": 7.661165184941832e-06, + "loss": 0.2132, + "step": 9692 + }, + { + "epoch": 0.826906671216516, + "grad_norm": 1.9820392709223174, + "learning_rate": 7.653817788197343e-06, + "loss": 0.1924, + "step": 9693 + }, + { + "epoch": 0.826991980890633, + "grad_norm": 2.3359802650441073, + "learning_rate": 7.646473624358819e-06, + "loss": 0.1942, + "step": 9694 + }, + { + "epoch": 0.82707729056475, + "grad_norm": 2.141430000359958, + "learning_rate": 7.639132693986972e-06, + "loss": 0.157, + "step": 9695 + }, + { + "epoch": 0.8271626002388671, + "grad_norm": 1.4063947391197877, + "learning_rate": 7.631794997642245e-06, + "loss": 0.1888, + "step": 9696 + }, + { + "epoch": 0.8272479099129841, + "grad_norm": 1.5935663603654244, + "learning_rate": 7.62446053588482e-06, + "loss": 0.1682, + "step": 9697 + }, + { + "epoch": 0.8273332195871012, + "grad_norm": 1.6569469353116286, + "learning_rate": 7.617129309274645e-06, + "loss": 0.1772, + "step": 9698 + }, + { + "epoch": 0.8274185292612182, + "grad_norm": 2.147226887256884, + "learning_rate": 7.6098013183714454e-06, + "loss": 0.167, + "step": 9699 + }, + { + "epoch": 0.8275038389353353, + "grad_norm": 1.6052036484194676, + "learning_rate": 7.6024765637346286e-06, + "loss": 0.2141, + "step": 9700 + }, + { + "epoch": 0.8275891486094523, + "grad_norm": 1.656335216705138, + "learning_rate": 7.595155045923435e-06, + "loss": 0.1707, + "step": 9701 + }, + { + "epoch": 0.8276744582835693, + "grad_norm": 2.235456725013745, + "learning_rate": 7.587836765496819e-06, + "loss": 0.2022, + "step": 9702 + }, + { + "epoch": 0.8277597679576864, + "grad_norm": 1.638185849593268, + "learning_rate": 7.58052172301349e-06, + "loss": 0.1748, + "step": 9703 + }, + { + "epoch": 0.8278450776318035, + "grad_norm": 1.8939521455046509, + "learning_rate": 7.573209919031909e-06, + "loss": 0.1472, + "step": 9704 + }, + { + "epoch": 0.8279303873059205, + "grad_norm": 1.4853483522319024, + "learning_rate": 7.565901354110283e-06, + "loss": 0.1512, + "step": 9705 + }, + { + "epoch": 0.8280156969800375, + "grad_norm": 1.5586735984598072, + "learning_rate": 7.558596028806614e-06, + "loss": 0.1175, + "step": 9706 + }, + { + "epoch": 0.8281010066541545, + "grad_norm": 1.8573485158398793, + "learning_rate": 7.551293943678583e-06, + "loss": 0.2136, + "step": 9707 + }, + { + "epoch": 0.8281863163282717, + "grad_norm": 1.986996338563031, + "learning_rate": 7.543995099283702e-06, + "loss": 0.2809, + "step": 9708 + }, + { + "epoch": 0.8282716260023887, + "grad_norm": 1.4006368295628748, + "learning_rate": 7.536699496179184e-06, + "loss": 0.1511, + "step": 9709 + }, + { + "epoch": 0.8283569356765057, + "grad_norm": 1.5785891917190356, + "learning_rate": 7.5294071349220065e-06, + "loss": 0.1526, + "step": 9710 + }, + { + "epoch": 0.8284422453506227, + "grad_norm": 1.5606212765332628, + "learning_rate": 7.5221180160689075e-06, + "loss": 0.1784, + "step": 9711 + }, + { + "epoch": 0.8285275550247398, + "grad_norm": 1.4522613091589116, + "learning_rate": 7.51483214017637e-06, + "loss": 0.21, + "step": 9712 + }, + { + "epoch": 0.8286128646988569, + "grad_norm": 1.999708631272404, + "learning_rate": 7.507549507800632e-06, + "loss": 0.1475, + "step": 9713 + }, + { + "epoch": 0.8286981743729739, + "grad_norm": 1.4607873523277448, + "learning_rate": 7.500270119497671e-06, + "loss": 0.1853, + "step": 9714 + }, + { + "epoch": 0.8287834840470909, + "grad_norm": 1.5221287509934336, + "learning_rate": 7.492993975823259e-06, + "loss": 0.1533, + "step": 9715 + }, + { + "epoch": 0.828868793721208, + "grad_norm": 1.7830874686531561, + "learning_rate": 7.485721077332875e-06, + "loss": 0.168, + "step": 9716 + }, + { + "epoch": 0.828954103395325, + "grad_norm": 1.6923899972348213, + "learning_rate": 7.478451424581761e-06, + "loss": 0.1762, + "step": 9717 + }, + { + "epoch": 0.8290394130694421, + "grad_norm": 2.352111144080238, + "learning_rate": 7.4711850181249156e-06, + "loss": 0.2047, + "step": 9718 + }, + { + "epoch": 0.8291247227435591, + "grad_norm": 2.688606744033061, + "learning_rate": 7.46392185851712e-06, + "loss": 0.2855, + "step": 9719 + }, + { + "epoch": 0.8292100324176762, + "grad_norm": 2.1401924754422668, + "learning_rate": 7.456661946312826e-06, + "loss": 0.1646, + "step": 9720 + }, + { + "epoch": 0.8292953420917932, + "grad_norm": 2.084251837115664, + "learning_rate": 7.44940528206633e-06, + "loss": 0.2784, + "step": 9721 + }, + { + "epoch": 0.8293806517659102, + "grad_norm": 1.4440992056881483, + "learning_rate": 7.442151866331631e-06, + "loss": 0.1987, + "step": 9722 + }, + { + "epoch": 0.8294659614400273, + "grad_norm": 1.8175513810945487, + "learning_rate": 7.434901699662477e-06, + "loss": 0.182, + "step": 9723 + }, + { + "epoch": 0.8295512711141444, + "grad_norm": 2.2867339597351943, + "learning_rate": 7.427654782612398e-06, + "loss": 0.1772, + "step": 9724 + }, + { + "epoch": 0.8296365807882614, + "grad_norm": 2.3928041187910307, + "learning_rate": 7.4204111157346375e-06, + "loss": 0.1755, + "step": 9725 + }, + { + "epoch": 0.8297218904623784, + "grad_norm": 1.8490324480129956, + "learning_rate": 7.413170699582228e-06, + "loss": 0.1916, + "step": 9726 + }, + { + "epoch": 0.8298072001364954, + "grad_norm": 2.050082423716364, + "learning_rate": 7.405933534707915e-06, + "loss": 0.2357, + "step": 9727 + }, + { + "epoch": 0.8298925098106126, + "grad_norm": 1.3691431499898703, + "learning_rate": 7.398699621664251e-06, + "loss": 0.1688, + "step": 9728 + }, + { + "epoch": 0.8299778194847296, + "grad_norm": 1.4327010615784943, + "learning_rate": 7.391468961003473e-06, + "loss": 0.1852, + "step": 9729 + }, + { + "epoch": 0.8300631291588466, + "grad_norm": 1.394773959027742, + "learning_rate": 7.384241553277621e-06, + "loss": 0.1693, + "step": 9730 + }, + { + "epoch": 0.8301484388329636, + "grad_norm": 2.1633277499573085, + "learning_rate": 7.377017399038471e-06, + "loss": 0.2021, + "step": 9731 + }, + { + "epoch": 0.8302337485070806, + "grad_norm": 1.8006376594011053, + "learning_rate": 7.3697964988375444e-06, + "loss": 0.239, + "step": 9732 + }, + { + "epoch": 0.8303190581811978, + "grad_norm": 1.6599603514934547, + "learning_rate": 7.362578853226121e-06, + "loss": 0.1305, + "step": 9733 + }, + { + "epoch": 0.8304043678553148, + "grad_norm": 1.6166722310681623, + "learning_rate": 7.3553644627552095e-06, + "loss": 0.1392, + "step": 9734 + }, + { + "epoch": 0.8304896775294318, + "grad_norm": 1.5919821414904323, + "learning_rate": 7.348153327975638e-06, + "loss": 0.1976, + "step": 9735 + }, + { + "epoch": 0.8305749872035488, + "grad_norm": 2.079792930859682, + "learning_rate": 7.340945449437881e-06, + "loss": 0.2415, + "step": 9736 + }, + { + "epoch": 0.830660296877666, + "grad_norm": 1.8929317053532575, + "learning_rate": 7.3337408276922594e-06, + "loss": 0.2662, + "step": 9737 + }, + { + "epoch": 0.830745606551783, + "grad_norm": 2.0516032578776957, + "learning_rate": 7.326539463288801e-06, + "loss": 0.1806, + "step": 9738 + }, + { + "epoch": 0.8308309162259, + "grad_norm": 1.6386337548642669, + "learning_rate": 7.3193413567772815e-06, + "loss": 0.1782, + "step": 9739 + }, + { + "epoch": 0.830916225900017, + "grad_norm": 1.7837023668137786, + "learning_rate": 7.312146508707241e-06, + "loss": 0.1976, + "step": 9740 + }, + { + "epoch": 0.8310015355741341, + "grad_norm": 1.8072099767979473, + "learning_rate": 7.304954919627993e-06, + "loss": 0.2335, + "step": 9741 + }, + { + "epoch": 0.8310868452482512, + "grad_norm": 1.4652196783245548, + "learning_rate": 7.2977665900885285e-06, + "loss": 0.1776, + "step": 9742 + }, + { + "epoch": 0.8311721549223682, + "grad_norm": 1.4722237596963657, + "learning_rate": 7.2905815206376794e-06, + "loss": 0.2107, + "step": 9743 + }, + { + "epoch": 0.8312574645964852, + "grad_norm": 1.5854928281171328, + "learning_rate": 7.283399711823974e-06, + "loss": 0.1418, + "step": 9744 + }, + { + "epoch": 0.8313427742706023, + "grad_norm": 1.5429872922145362, + "learning_rate": 7.276221164195701e-06, + "loss": 0.1965, + "step": 9745 + }, + { + "epoch": 0.8314280839447193, + "grad_norm": 1.8877815112440703, + "learning_rate": 7.269045878300912e-06, + "loss": 0.2241, + "step": 9746 + }, + { + "epoch": 0.8315133936188364, + "grad_norm": 1.9027263405461272, + "learning_rate": 7.26187385468739e-06, + "loss": 0.1864, + "step": 9747 + }, + { + "epoch": 0.8315987032929534, + "grad_norm": 1.6529169193258333, + "learning_rate": 7.254705093902708e-06, + "loss": 0.1892, + "step": 9748 + }, + { + "epoch": 0.8316840129670705, + "grad_norm": 1.6253055872052298, + "learning_rate": 7.247539596494118e-06, + "loss": 0.1932, + "step": 9749 + }, + { + "epoch": 0.8317693226411875, + "grad_norm": 2.0444169755601465, + "learning_rate": 7.24037736300871e-06, + "loss": 0.2215, + "step": 9750 + }, + { + "epoch": 0.8318546323153045, + "grad_norm": 1.5288865670490255, + "learning_rate": 7.233218393993263e-06, + "loss": 0.1732, + "step": 9751 + }, + { + "epoch": 0.8319399419894216, + "grad_norm": 1.7423032713600952, + "learning_rate": 7.226062689994328e-06, + "loss": 0.1962, + "step": 9752 + }, + { + "epoch": 0.8320252516635387, + "grad_norm": 2.3451804310046778, + "learning_rate": 7.218910251558209e-06, + "loss": 0.1876, + "step": 9753 + }, + { + "epoch": 0.8321105613376557, + "grad_norm": 2.3971340352803314, + "learning_rate": 7.2117610792309555e-06, + "loss": 0.1838, + "step": 9754 + }, + { + "epoch": 0.8321958710117727, + "grad_norm": 2.0319737245482874, + "learning_rate": 7.204615173558365e-06, + "loss": 0.172, + "step": 9755 + }, + { + "epoch": 0.8322811806858897, + "grad_norm": 1.5193479765760645, + "learning_rate": 7.197472535085981e-06, + "loss": 0.1994, + "step": 9756 + }, + { + "epoch": 0.8323664903600069, + "grad_norm": 2.128661087338648, + "learning_rate": 7.190333164359137e-06, + "loss": 0.1552, + "step": 9757 + }, + { + "epoch": 0.8324518000341239, + "grad_norm": 2.596114677829289, + "learning_rate": 7.183197061922842e-06, + "loss": 0.1702, + "step": 9758 + }, + { + "epoch": 0.8325371097082409, + "grad_norm": 1.9965290110118945, + "learning_rate": 7.176064228321938e-06, + "loss": 0.2276, + "step": 9759 + }, + { + "epoch": 0.8326224193823579, + "grad_norm": 1.550960732170149, + "learning_rate": 7.1689346641009516e-06, + "loss": 0.131, + "step": 9760 + }, + { + "epoch": 0.832707729056475, + "grad_norm": 1.884303851706035, + "learning_rate": 7.161808369804224e-06, + "loss": 0.1968, + "step": 9761 + }, + { + "epoch": 0.8327930387305921, + "grad_norm": 1.895548502167437, + "learning_rate": 7.154685345975759e-06, + "loss": 0.1874, + "step": 9762 + }, + { + "epoch": 0.8328783484047091, + "grad_norm": 1.5208695115615638, + "learning_rate": 7.1475655931594e-06, + "loss": 0.1942, + "step": 9763 + }, + { + "epoch": 0.8329636580788261, + "grad_norm": 1.5940179786529765, + "learning_rate": 7.1404491118986895e-06, + "loss": 0.2272, + "step": 9764 + }, + { + "epoch": 0.8330489677529432, + "grad_norm": 1.4355505288006223, + "learning_rate": 7.133335902736937e-06, + "loss": 0.1562, + "step": 9765 + }, + { + "epoch": 0.8331342774270603, + "grad_norm": 1.4183153301639937, + "learning_rate": 7.126225966217193e-06, + "loss": 0.1973, + "step": 9766 + }, + { + "epoch": 0.8332195871011773, + "grad_norm": 2.1256268642544107, + "learning_rate": 7.119119302882271e-06, + "loss": 0.2305, + "step": 9767 + }, + { + "epoch": 0.8333048967752943, + "grad_norm": 1.935046703947985, + "learning_rate": 7.112015913274717e-06, + "loss": 0.1607, + "step": 9768 + }, + { + "epoch": 0.8333902064494113, + "grad_norm": 1.8732927815312037, + "learning_rate": 7.1049157979368286e-06, + "loss": 0.1475, + "step": 9769 + }, + { + "epoch": 0.8334755161235284, + "grad_norm": 1.9022986310742878, + "learning_rate": 7.097818957410696e-06, + "loss": 0.1711, + "step": 9770 + }, + { + "epoch": 0.8335608257976455, + "grad_norm": 1.8555941569462242, + "learning_rate": 7.090725392238084e-06, + "loss": 0.1917, + "step": 9771 + }, + { + "epoch": 0.8336461354717625, + "grad_norm": 1.679317557825789, + "learning_rate": 7.083635102960584e-06, + "loss": 0.1492, + "step": 9772 + }, + { + "epoch": 0.8337314451458795, + "grad_norm": 1.626009334199584, + "learning_rate": 7.076548090119484e-06, + "loss": 0.1595, + "step": 9773 + }, + { + "epoch": 0.8338167548199966, + "grad_norm": 1.5354849970540183, + "learning_rate": 7.0694643542558394e-06, + "loss": 0.1297, + "step": 9774 + }, + { + "epoch": 0.8339020644941136, + "grad_norm": 2.4165216161233496, + "learning_rate": 7.062383895910463e-06, + "loss": 0.1436, + "step": 9775 + }, + { + "epoch": 0.8339873741682307, + "grad_norm": 1.8087784279445906, + "learning_rate": 7.055306715623888e-06, + "loss": 0.1365, + "step": 9776 + }, + { + "epoch": 0.8340726838423477, + "grad_norm": 2.024312187461684, + "learning_rate": 7.048232813936467e-06, + "loss": 0.2704, + "step": 9777 + }, + { + "epoch": 0.8341579935164648, + "grad_norm": 1.3297137462925874, + "learning_rate": 7.0411621913882005e-06, + "loss": 0.1116, + "step": 9778 + }, + { + "epoch": 0.8342433031905818, + "grad_norm": 2.2454278904326483, + "learning_rate": 7.034094848518924e-06, + "loss": 0.2319, + "step": 9779 + }, + { + "epoch": 0.8343286128646988, + "grad_norm": 1.6902808835159449, + "learning_rate": 7.0270307858681935e-06, + "loss": 0.2591, + "step": 9780 + }, + { + "epoch": 0.8344139225388159, + "grad_norm": 2.0605813552358008, + "learning_rate": 7.019970003975301e-06, + "loss": 0.1653, + "step": 9781 + }, + { + "epoch": 0.834499232212933, + "grad_norm": 1.669109017260114, + "learning_rate": 7.012912503379287e-06, + "loss": 0.1579, + "step": 9782 + }, + { + "epoch": 0.83458454188705, + "grad_norm": 2.274690682479324, + "learning_rate": 7.005858284618999e-06, + "loss": 0.2369, + "step": 9783 + }, + { + "epoch": 0.834669851561167, + "grad_norm": 2.0729927419635708, + "learning_rate": 6.998807348232933e-06, + "loss": 0.1984, + "step": 9784 + }, + { + "epoch": 0.834755161235284, + "grad_norm": 1.6214021320480045, + "learning_rate": 6.9917596947594364e-06, + "loss": 0.1664, + "step": 9785 + }, + { + "epoch": 0.8348404709094012, + "grad_norm": 1.7022896914326127, + "learning_rate": 6.984715324736535e-06, + "loss": 0.1752, + "step": 9786 + }, + { + "epoch": 0.8349257805835182, + "grad_norm": 1.428134679448487, + "learning_rate": 6.977674238702036e-06, + "loss": 0.1823, + "step": 9787 + }, + { + "epoch": 0.8350110902576352, + "grad_norm": 2.0672748665429888, + "learning_rate": 6.970636437193489e-06, + "loss": 0.2455, + "step": 9788 + }, + { + "epoch": 0.8350963999317522, + "grad_norm": 2.120828910578205, + "learning_rate": 6.963601920748198e-06, + "loss": 0.1167, + "step": 9789 + }, + { + "epoch": 0.8351817096058693, + "grad_norm": 1.4628062001047648, + "learning_rate": 6.956570689903197e-06, + "loss": 0.1828, + "step": 9790 + }, + { + "epoch": 0.8352670192799864, + "grad_norm": 1.7097774387550222, + "learning_rate": 6.9495427451952865e-06, + "loss": 0.2655, + "step": 9791 + }, + { + "epoch": 0.8353523289541034, + "grad_norm": 1.9366575686142036, + "learning_rate": 6.942518087161026e-06, + "loss": 0.1981, + "step": 9792 + }, + { + "epoch": 0.8354376386282204, + "grad_norm": 1.7064787016242804, + "learning_rate": 6.9354967163367035e-06, + "loss": 0.2087, + "step": 9793 + }, + { + "epoch": 0.8355229483023375, + "grad_norm": 1.8882331535833332, + "learning_rate": 6.928478633258357e-06, + "loss": 0.1844, + "step": 9794 + }, + { + "epoch": 0.8356082579764545, + "grad_norm": 1.9430415368030776, + "learning_rate": 6.921463838461789e-06, + "loss": 0.1972, + "step": 9795 + }, + { + "epoch": 0.8356935676505716, + "grad_norm": 2.3273076408379145, + "learning_rate": 6.91445233248254e-06, + "loss": 0.1776, + "step": 9796 + }, + { + "epoch": 0.8357788773246886, + "grad_norm": 1.7222920765946843, + "learning_rate": 6.907444115855899e-06, + "loss": 0.2091, + "step": 9797 + }, + { + "epoch": 0.8358641869988057, + "grad_norm": 2.090594290606152, + "learning_rate": 6.900439189116892e-06, + "loss": 0.1639, + "step": 9798 + }, + { + "epoch": 0.8359494966729227, + "grad_norm": 1.8505360780305067, + "learning_rate": 6.893437552800342e-06, + "loss": 0.1293, + "step": 9799 + }, + { + "epoch": 0.8360348063470397, + "grad_norm": 1.8073389961428818, + "learning_rate": 6.886439207440748e-06, + "loss": 0.1571, + "step": 9800 + }, + { + "epoch": 0.8361201160211568, + "grad_norm": 1.834882067598855, + "learning_rate": 6.879444153572428e-06, + "loss": 0.12, + "step": 9801 + }, + { + "epoch": 0.8362054256952739, + "grad_norm": 1.3911121672347682, + "learning_rate": 6.8724523917294e-06, + "loss": 0.1682, + "step": 9802 + }, + { + "epoch": 0.8362907353693909, + "grad_norm": 1.5100511130467313, + "learning_rate": 6.865463922445459e-06, + "loss": 0.1837, + "step": 9803 + }, + { + "epoch": 0.8363760450435079, + "grad_norm": 1.8841423941253304, + "learning_rate": 6.858478746254115e-06, + "loss": 0.2309, + "step": 9804 + }, + { + "epoch": 0.836461354717625, + "grad_norm": 1.421596122473225, + "learning_rate": 6.851496863688678e-06, + "loss": 0.1409, + "step": 9805 + }, + { + "epoch": 0.8365466643917421, + "grad_norm": 2.1456922713150535, + "learning_rate": 6.844518275282163e-06, + "loss": 0.2358, + "step": 9806 + }, + { + "epoch": 0.8366319740658591, + "grad_norm": 1.8495928754543904, + "learning_rate": 6.837542981567346e-06, + "loss": 0.2009, + "step": 9807 + }, + { + "epoch": 0.8367172837399761, + "grad_norm": 1.30649601203085, + "learning_rate": 6.830570983076761e-06, + "loss": 0.1437, + "step": 9808 + }, + { + "epoch": 0.8368025934140931, + "grad_norm": 1.36998327774089, + "learning_rate": 6.823602280342683e-06, + "loss": 0.2153, + "step": 9809 + }, + { + "epoch": 0.8368879030882101, + "grad_norm": 1.450499100125654, + "learning_rate": 6.816636873897125e-06, + "loss": 0.1478, + "step": 9810 + }, + { + "epoch": 0.8369732127623273, + "grad_norm": 1.4456633270895918, + "learning_rate": 6.809674764271862e-06, + "loss": 0.2003, + "step": 9811 + }, + { + "epoch": 0.8370585224364443, + "grad_norm": 2.5547127912010232, + "learning_rate": 6.802715951998434e-06, + "loss": 0.2643, + "step": 9812 + }, + { + "epoch": 0.8371438321105613, + "grad_norm": 1.9176150615442804, + "learning_rate": 6.795760437608073e-06, + "loss": 0.1609, + "step": 9813 + }, + { + "epoch": 0.8372291417846783, + "grad_norm": 1.782962310675086, + "learning_rate": 6.788808221631826e-06, + "loss": 0.1598, + "step": 9814 + }, + { + "epoch": 0.8373144514587955, + "grad_norm": 1.6195753224950624, + "learning_rate": 6.781859304600446e-06, + "loss": 0.1556, + "step": 9815 + }, + { + "epoch": 0.8373997611329125, + "grad_norm": 2.0948625711600077, + "learning_rate": 6.774913687044448e-06, + "loss": 0.2585, + "step": 9816 + }, + { + "epoch": 0.8374850708070295, + "grad_norm": 1.6734122223641255, + "learning_rate": 6.767971369494097e-06, + "loss": 0.1944, + "step": 9817 + }, + { + "epoch": 0.8375703804811465, + "grad_norm": 2.3494272264509313, + "learning_rate": 6.761032352479391e-06, + "loss": 0.2088, + "step": 9818 + }, + { + "epoch": 0.8376556901552636, + "grad_norm": 1.6334503242218417, + "learning_rate": 6.754096636530094e-06, + "loss": 0.1459, + "step": 9819 + }, + { + "epoch": 0.8377409998293807, + "grad_norm": 2.204462713925137, + "learning_rate": 6.7471642221757005e-06, + "loss": 0.1619, + "step": 9820 + }, + { + "epoch": 0.8378263095034977, + "grad_norm": 2.0287383763984517, + "learning_rate": 6.740235109945487e-06, + "loss": 0.0874, + "step": 9821 + }, + { + "epoch": 0.8379116191776147, + "grad_norm": 2.0737229381448796, + "learning_rate": 6.733309300368435e-06, + "loss": 0.1522, + "step": 9822 + }, + { + "epoch": 0.8379969288517318, + "grad_norm": 1.908679352197963, + "learning_rate": 6.726386793973305e-06, + "loss": 0.2011, + "step": 9823 + }, + { + "epoch": 0.8380822385258488, + "grad_norm": 1.7108030919621238, + "learning_rate": 6.719467591288569e-06, + "loss": 0.1784, + "step": 9824 + }, + { + "epoch": 0.8381675481999659, + "grad_norm": 1.6523971368628663, + "learning_rate": 6.712551692842517e-06, + "loss": 0.2034, + "step": 9825 + }, + { + "epoch": 0.8382528578740829, + "grad_norm": 2.064621037030838, + "learning_rate": 6.705639099163091e-06, + "loss": 0.1941, + "step": 9826 + }, + { + "epoch": 0.8383381675482, + "grad_norm": 2.676451839869598, + "learning_rate": 6.698729810778065e-06, + "loss": 0.1251, + "step": 9827 + }, + { + "epoch": 0.838423477222317, + "grad_norm": 2.039075361773971, + "learning_rate": 6.691823828214916e-06, + "loss": 0.1999, + "step": 9828 + }, + { + "epoch": 0.838508786896434, + "grad_norm": 2.1345769828519736, + "learning_rate": 6.684921152000878e-06, + "loss": 0.2076, + "step": 9829 + }, + { + "epoch": 0.8385940965705511, + "grad_norm": 1.9973975773836468, + "learning_rate": 6.678021782662935e-06, + "loss": 0.166, + "step": 9830 + }, + { + "epoch": 0.8386794062446682, + "grad_norm": 2.92444501649116, + "learning_rate": 6.671125720727817e-06, + "loss": 0.2432, + "step": 9831 + }, + { + "epoch": 0.8387647159187852, + "grad_norm": 1.6590341882522845, + "learning_rate": 6.664232966721995e-06, + "loss": 0.1378, + "step": 9832 + }, + { + "epoch": 0.8388500255929022, + "grad_norm": 1.364270307829096, + "learning_rate": 6.657343521171694e-06, + "loss": 0.1942, + "step": 9833 + }, + { + "epoch": 0.8389353352670192, + "grad_norm": 1.2491260811487928, + "learning_rate": 6.650457384602915e-06, + "loss": 0.1283, + "step": 9834 + }, + { + "epoch": 0.8390206449411364, + "grad_norm": 1.7084958844267877, + "learning_rate": 6.643574557541332e-06, + "loss": 0.2287, + "step": 9835 + }, + { + "epoch": 0.8391059546152534, + "grad_norm": 1.7769988343638052, + "learning_rate": 6.6366950405124415e-06, + "loss": 0.1989, + "step": 9836 + }, + { + "epoch": 0.8391912642893704, + "grad_norm": 1.6100743079955833, + "learning_rate": 6.629818834041457e-06, + "loss": 0.131, + "step": 9837 + }, + { + "epoch": 0.8392765739634874, + "grad_norm": 1.628403254360994, + "learning_rate": 6.622945938653341e-06, + "loss": 0.1774, + "step": 9838 + }, + { + "epoch": 0.8393618836376046, + "grad_norm": 2.5138057029922263, + "learning_rate": 6.616076354872791e-06, + "loss": 0.2023, + "step": 9839 + }, + { + "epoch": 0.8394471933117216, + "grad_norm": 1.4900922195007185, + "learning_rate": 6.60921008322426e-06, + "loss": 0.1729, + "step": 9840 + }, + { + "epoch": 0.8395325029858386, + "grad_norm": 1.7913879872724987, + "learning_rate": 6.6023471242319775e-06, + "loss": 0.1616, + "step": 9841 + }, + { + "epoch": 0.8396178126599556, + "grad_norm": 2.434598320449103, + "learning_rate": 6.595487478419859e-06, + "loss": 0.1849, + "step": 9842 + }, + { + "epoch": 0.8397031223340727, + "grad_norm": 1.479854728167428, + "learning_rate": 6.588631146311635e-06, + "loss": 0.1386, + "step": 9843 + }, + { + "epoch": 0.8397884320081898, + "grad_norm": 1.987963458072689, + "learning_rate": 6.581778128430732e-06, + "loss": 0.2208, + "step": 9844 + }, + { + "epoch": 0.8398737416823068, + "grad_norm": 1.5621708332476816, + "learning_rate": 6.574928425300347e-06, + "loss": 0.1712, + "step": 9845 + }, + { + "epoch": 0.8399590513564238, + "grad_norm": 1.491256903019266, + "learning_rate": 6.568082037443401e-06, + "loss": 0.1875, + "step": 9846 + }, + { + "epoch": 0.8400443610305408, + "grad_norm": 2.107899693862297, + "learning_rate": 6.5612389653826215e-06, + "loss": 0.2178, + "step": 9847 + }, + { + "epoch": 0.8401296707046579, + "grad_norm": 1.3467798894238956, + "learning_rate": 6.5543992096403885e-06, + "loss": 0.203, + "step": 9848 + }, + { + "epoch": 0.840214980378775, + "grad_norm": 1.6702584981440922, + "learning_rate": 6.5475627707389135e-06, + "loss": 0.1936, + "step": 9849 + }, + { + "epoch": 0.840300290052892, + "grad_norm": 2.3797816277345962, + "learning_rate": 6.540729649200123e-06, + "loss": 0.215, + "step": 9850 + }, + { + "epoch": 0.840385599727009, + "grad_norm": 1.6400686138035196, + "learning_rate": 6.533899845545677e-06, + "loss": 0.1937, + "step": 9851 + }, + { + "epoch": 0.8404709094011261, + "grad_norm": 1.463174260225604, + "learning_rate": 6.527073360296998e-06, + "loss": 0.1161, + "step": 9852 + }, + { + "epoch": 0.8405562190752431, + "grad_norm": 1.6709226944286908, + "learning_rate": 6.520250193975242e-06, + "loss": 0.1671, + "step": 9853 + }, + { + "epoch": 0.8406415287493602, + "grad_norm": 1.7310830569425515, + "learning_rate": 6.513430347101357e-06, + "loss": 0.1211, + "step": 9854 + }, + { + "epoch": 0.8407268384234772, + "grad_norm": 1.5810898228511827, + "learning_rate": 6.506613820195956e-06, + "loss": 0.1897, + "step": 9855 + }, + { + "epoch": 0.8408121480975943, + "grad_norm": 1.2117036426224672, + "learning_rate": 6.499800613779472e-06, + "loss": 0.2077, + "step": 9856 + }, + { + "epoch": 0.8408974577717113, + "grad_norm": 1.8441402862748908, + "learning_rate": 6.492990728372056e-06, + "loss": 0.1262, + "step": 9857 + }, + { + "epoch": 0.8409827674458283, + "grad_norm": 1.4980414637899335, + "learning_rate": 6.486184164493603e-06, + "loss": 0.161, + "step": 9858 + }, + { + "epoch": 0.8410680771199454, + "grad_norm": 1.7374066499291458, + "learning_rate": 6.479380922663752e-06, + "loss": 0.1917, + "step": 9859 + }, + { + "epoch": 0.8411533867940625, + "grad_norm": 1.8458501202212494, + "learning_rate": 6.4725810034019064e-06, + "loss": 0.2148, + "step": 9860 + }, + { + "epoch": 0.8412386964681795, + "grad_norm": 1.4917918374734924, + "learning_rate": 6.465784407227194e-06, + "loss": 0.1546, + "step": 9861 + }, + { + "epoch": 0.8413240061422965, + "grad_norm": 1.9208317848359304, + "learning_rate": 6.458991134658487e-06, + "loss": 0.2598, + "step": 9862 + }, + { + "epoch": 0.8414093158164135, + "grad_norm": 1.7318806625378265, + "learning_rate": 6.4522011862144515e-06, + "loss": 0.1532, + "step": 9863 + }, + { + "epoch": 0.8414946254905307, + "grad_norm": 2.099407012413737, + "learning_rate": 6.445414562413427e-06, + "loss": 0.1362, + "step": 9864 + }, + { + "epoch": 0.8415799351646477, + "grad_norm": 1.4785877043918827, + "learning_rate": 6.438631263773559e-06, + "loss": 0.1796, + "step": 9865 + }, + { + "epoch": 0.8416652448387647, + "grad_norm": 1.874439727998619, + "learning_rate": 6.431851290812696e-06, + "loss": 0.1905, + "step": 9866 + }, + { + "epoch": 0.8417505545128817, + "grad_norm": 1.5606454142563284, + "learning_rate": 6.42507464404849e-06, + "loss": 0.1858, + "step": 9867 + }, + { + "epoch": 0.8418358641869989, + "grad_norm": 1.6349910822167266, + "learning_rate": 6.41830132399826e-06, + "loss": 0.1419, + "step": 9868 + }, + { + "epoch": 0.8419211738611159, + "grad_norm": 1.7125768602484097, + "learning_rate": 6.411531331179138e-06, + "loss": 0.1369, + "step": 9869 + }, + { + "epoch": 0.8420064835352329, + "grad_norm": 2.089616367412393, + "learning_rate": 6.404764666107971e-06, + "loss": 0.2085, + "step": 9870 + }, + { + "epoch": 0.8420917932093499, + "grad_norm": 1.8551253151529319, + "learning_rate": 6.398001329301356e-06, + "loss": 0.1753, + "step": 9871 + }, + { + "epoch": 0.842177102883467, + "grad_norm": 2.409138518725062, + "learning_rate": 6.391241321275637e-06, + "loss": 0.2024, + "step": 9872 + }, + { + "epoch": 0.842262412557584, + "grad_norm": 1.9407644765368752, + "learning_rate": 6.384484642546912e-06, + "loss": 0.141, + "step": 9873 + }, + { + "epoch": 0.8423477222317011, + "grad_norm": 1.5837915493664192, + "learning_rate": 6.377731293631006e-06, + "loss": 0.1231, + "step": 9874 + }, + { + "epoch": 0.8424330319058181, + "grad_norm": 1.9673278152563376, + "learning_rate": 6.370981275043497e-06, + "loss": 0.193, + "step": 9875 + }, + { + "epoch": 0.8425183415799352, + "grad_norm": 1.9803921837085565, + "learning_rate": 6.36423458729975e-06, + "loss": 0.1422, + "step": 9876 + }, + { + "epoch": 0.8426036512540522, + "grad_norm": 1.511058337834362, + "learning_rate": 6.357491230914786e-06, + "loss": 0.2077, + "step": 9877 + }, + { + "epoch": 0.8426889609281693, + "grad_norm": 1.9290702543067284, + "learning_rate": 6.350751206403466e-06, + "loss": 0.1947, + "step": 9878 + }, + { + "epoch": 0.8427742706022863, + "grad_norm": 1.5467877604226952, + "learning_rate": 6.344014514280333e-06, + "loss": 0.1319, + "step": 9879 + }, + { + "epoch": 0.8428595802764034, + "grad_norm": 1.9250071364431498, + "learning_rate": 6.33728115505971e-06, + "loss": 0.2177, + "step": 9880 + }, + { + "epoch": 0.8429448899505204, + "grad_norm": 2.4498553992436714, + "learning_rate": 6.3305511292556405e-06, + "loss": 0.2187, + "step": 9881 + }, + { + "epoch": 0.8430301996246374, + "grad_norm": 1.8964799749321382, + "learning_rate": 6.323824437381931e-06, + "loss": 0.1923, + "step": 9882 + }, + { + "epoch": 0.8431155092987545, + "grad_norm": 2.2886091701279514, + "learning_rate": 6.317101079952148e-06, + "loss": 0.1859, + "step": 9883 + }, + { + "epoch": 0.8432008189728715, + "grad_norm": 1.6652564122251658, + "learning_rate": 6.310381057479542e-06, + "loss": 0.1709, + "step": 9884 + }, + { + "epoch": 0.8432861286469886, + "grad_norm": 1.6461299999067562, + "learning_rate": 6.303664370477192e-06, + "loss": 0.1801, + "step": 9885 + }, + { + "epoch": 0.8433714383211056, + "grad_norm": 1.9063834471972847, + "learning_rate": 6.296951019457864e-06, + "loss": 0.1389, + "step": 9886 + }, + { + "epoch": 0.8434567479952226, + "grad_norm": 1.5285452485229956, + "learning_rate": 6.290241004934083e-06, + "loss": 0.1328, + "step": 9887 + }, + { + "epoch": 0.8435420576693397, + "grad_norm": 1.9232474288179444, + "learning_rate": 6.283534327418122e-06, + "loss": 0.1902, + "step": 9888 + }, + { + "epoch": 0.8436273673434568, + "grad_norm": 1.3759910306582765, + "learning_rate": 6.276830987422028e-06, + "loss": 0.1398, + "step": 9889 + }, + { + "epoch": 0.8437126770175738, + "grad_norm": 2.316738677335029, + "learning_rate": 6.270130985457523e-06, + "loss": 0.203, + "step": 9890 + }, + { + "epoch": 0.8437979866916908, + "grad_norm": 2.37603817384411, + "learning_rate": 6.2634343220361436e-06, + "loss": 0.2045, + "step": 9891 + }, + { + "epoch": 0.8438832963658078, + "grad_norm": 1.9925369853723625, + "learning_rate": 6.256740997669142e-06, + "loss": 0.1987, + "step": 9892 + }, + { + "epoch": 0.843968606039925, + "grad_norm": 1.7227424721105657, + "learning_rate": 6.2500510128675085e-06, + "loss": 0.1517, + "step": 9893 + }, + { + "epoch": 0.844053915714042, + "grad_norm": 1.7142342633519079, + "learning_rate": 6.243364368141996e-06, + "loss": 0.1817, + "step": 9894 + }, + { + "epoch": 0.844139225388159, + "grad_norm": 1.5026604582910195, + "learning_rate": 6.2366810640030805e-06, + "loss": 0.2267, + "step": 9895 + }, + { + "epoch": 0.844224535062276, + "grad_norm": 1.7668388042918806, + "learning_rate": 6.230001100961031e-06, + "loss": 0.1825, + "step": 9896 + }, + { + "epoch": 0.8443098447363931, + "grad_norm": 2.447575311330086, + "learning_rate": 6.223324479525778e-06, + "loss": 0.1755, + "step": 9897 + }, + { + "epoch": 0.8443951544105102, + "grad_norm": 1.96906302249419, + "learning_rate": 6.216651200207085e-06, + "loss": 0.207, + "step": 9898 + }, + { + "epoch": 0.8444804640846272, + "grad_norm": 1.7160495698256018, + "learning_rate": 6.209981263514414e-06, + "loss": 0.1626, + "step": 9899 + }, + { + "epoch": 0.8445657737587442, + "grad_norm": 2.031798303400986, + "learning_rate": 6.203314669956967e-06, + "loss": 0.2415, + "step": 9900 + }, + { + "epoch": 0.8446510834328613, + "grad_norm": 2.0017426766292217, + "learning_rate": 6.1966514200437084e-06, + "loss": 0.1783, + "step": 9901 + }, + { + "epoch": 0.8447363931069783, + "grad_norm": 1.3574477378522023, + "learning_rate": 6.18999151428335e-06, + "loss": 0.1229, + "step": 9902 + }, + { + "epoch": 0.8448217027810954, + "grad_norm": 1.6686308571791777, + "learning_rate": 6.183334953184328e-06, + "loss": 0.1463, + "step": 9903 + }, + { + "epoch": 0.8449070124552124, + "grad_norm": 1.835318473183947, + "learning_rate": 6.1766817372548305e-06, + "loss": 0.1464, + "step": 9904 + }, + { + "epoch": 0.8449923221293295, + "grad_norm": 1.680496446850331, + "learning_rate": 6.170031867002829e-06, + "loss": 0.1662, + "step": 9905 + }, + { + "epoch": 0.8450776318034465, + "grad_norm": 1.5782744081985436, + "learning_rate": 6.163385342935963e-06, + "loss": 0.1032, + "step": 9906 + }, + { + "epoch": 0.8451629414775635, + "grad_norm": 1.3237278571361641, + "learning_rate": 6.1567421655616856e-06, + "loss": 0.1555, + "step": 9907 + }, + { + "epoch": 0.8452482511516806, + "grad_norm": 1.8094060074911513, + "learning_rate": 6.150102335387159e-06, + "loss": 0.1093, + "step": 9908 + }, + { + "epoch": 0.8453335608257977, + "grad_norm": 1.960928989103169, + "learning_rate": 6.143465852919306e-06, + "loss": 0.2021, + "step": 9909 + }, + { + "epoch": 0.8454188704999147, + "grad_norm": 1.8135702985080604, + "learning_rate": 6.136832718664765e-06, + "loss": 0.1543, + "step": 9910 + }, + { + "epoch": 0.8455041801740317, + "grad_norm": 1.5067299232594764, + "learning_rate": 6.130202933129975e-06, + "loss": 0.1435, + "step": 9911 + }, + { + "epoch": 0.8455894898481487, + "grad_norm": 1.6853300907766762, + "learning_rate": 6.12357649682106e-06, + "loss": 0.2028, + "step": 9912 + }, + { + "epoch": 0.8456747995222659, + "grad_norm": 1.5828594535950453, + "learning_rate": 6.116953410243925e-06, + "loss": 0.1575, + "step": 9913 + }, + { + "epoch": 0.8457601091963829, + "grad_norm": 1.864385953149134, + "learning_rate": 6.1103336739042e-06, + "loss": 0.194, + "step": 9914 + }, + { + "epoch": 0.8458454188704999, + "grad_norm": 1.6321770558615594, + "learning_rate": 6.103717288307275e-06, + "loss": 0.1534, + "step": 9915 + }, + { + "epoch": 0.8459307285446169, + "grad_norm": 1.747634310492982, + "learning_rate": 6.097104253958263e-06, + "loss": 0.1762, + "step": 9916 + }, + { + "epoch": 0.8460160382187341, + "grad_norm": 1.4878392802317928, + "learning_rate": 6.090494571362037e-06, + "loss": 0.1736, + "step": 9917 + }, + { + "epoch": 0.8461013478928511, + "grad_norm": 1.3120086749389004, + "learning_rate": 6.083888241023234e-06, + "loss": 0.1054, + "step": 9918 + }, + { + "epoch": 0.8461866575669681, + "grad_norm": 1.6909668948668162, + "learning_rate": 6.077285263446175e-06, + "loss": 0.2116, + "step": 9919 + }, + { + "epoch": 0.8462719672410851, + "grad_norm": 1.9233397817696423, + "learning_rate": 6.070685639134988e-06, + "loss": 0.1645, + "step": 9920 + }, + { + "epoch": 0.8463572769152022, + "grad_norm": 1.861539014175367, + "learning_rate": 6.064089368593517e-06, + "loss": 0.1619, + "step": 9921 + }, + { + "epoch": 0.8464425865893193, + "grad_norm": 2.1047295173240124, + "learning_rate": 6.057496452325346e-06, + "loss": 0.1564, + "step": 9922 + }, + { + "epoch": 0.8465278962634363, + "grad_norm": 1.712675979371154, + "learning_rate": 6.0509068908338075e-06, + "loss": 0.1613, + "step": 9923 + }, + { + "epoch": 0.8466132059375533, + "grad_norm": 2.005840238277446, + "learning_rate": 6.044320684621985e-06, + "loss": 0.1387, + "step": 9924 + }, + { + "epoch": 0.8466985156116703, + "grad_norm": 1.7025477979899148, + "learning_rate": 6.037737834192697e-06, + "loss": 0.1592, + "step": 9925 + }, + { + "epoch": 0.8467838252857874, + "grad_norm": 1.6385113861461316, + "learning_rate": 6.031158340048504e-06, + "loss": 0.1926, + "step": 9926 + }, + { + "epoch": 0.8468691349599045, + "grad_norm": 1.6408640369482914, + "learning_rate": 6.0245822026917256e-06, + "loss": 0.179, + "step": 9927 + }, + { + "epoch": 0.8469544446340215, + "grad_norm": 1.3421644570767355, + "learning_rate": 6.018009422624415e-06, + "loss": 0.1204, + "step": 9928 + }, + { + "epoch": 0.8470397543081385, + "grad_norm": 1.8415327803102044, + "learning_rate": 6.011440000348362e-06, + "loss": 0.1803, + "step": 9929 + }, + { + "epoch": 0.8471250639822556, + "grad_norm": 1.7217755823806287, + "learning_rate": 6.004873936365102e-06, + "loss": 0.1386, + "step": 9930 + }, + { + "epoch": 0.8472103736563726, + "grad_norm": 2.3372792756828655, + "learning_rate": 5.998311231175946e-06, + "loss": 0.1815, + "step": 9931 + }, + { + "epoch": 0.8472956833304897, + "grad_norm": 1.675146876485848, + "learning_rate": 5.991751885281882e-06, + "loss": 0.1428, + "step": 9932 + }, + { + "epoch": 0.8473809930046067, + "grad_norm": 1.5640330613878524, + "learning_rate": 5.985195899183715e-06, + "loss": 0.2987, + "step": 9933 + }, + { + "epoch": 0.8474663026787238, + "grad_norm": 1.987948646530711, + "learning_rate": 5.978643273381945e-06, + "loss": 0.1851, + "step": 9934 + }, + { + "epoch": 0.8475516123528408, + "grad_norm": 2.4247706668601126, + "learning_rate": 5.9720940083768325e-06, + "loss": 0.2393, + "step": 9935 + }, + { + "epoch": 0.8476369220269578, + "grad_norm": 1.798336330454924, + "learning_rate": 5.965548104668378e-06, + "loss": 0.1409, + "step": 9936 + }, + { + "epoch": 0.8477222317010749, + "grad_norm": 1.6561968812881587, + "learning_rate": 5.9590055627563256e-06, + "loss": 0.2014, + "step": 9937 + }, + { + "epoch": 0.847807541375192, + "grad_norm": 2.5166784888320364, + "learning_rate": 5.95246638314017e-06, + "loss": 0.1801, + "step": 9938 + }, + { + "epoch": 0.847892851049309, + "grad_norm": 1.6351395228405659, + "learning_rate": 5.945930566319119e-06, + "loss": 0.2038, + "step": 9939 + }, + { + "epoch": 0.847978160723426, + "grad_norm": 1.4910392137739767, + "learning_rate": 5.939398112792183e-06, + "loss": 0.1414, + "step": 9940 + }, + { + "epoch": 0.848063470397543, + "grad_norm": 1.840682889225557, + "learning_rate": 5.932869023058063e-06, + "loss": 0.1481, + "step": 9941 + }, + { + "epoch": 0.8481487800716602, + "grad_norm": 2.182903146465868, + "learning_rate": 5.926343297615216e-06, + "loss": 0.1655, + "step": 9942 + }, + { + "epoch": 0.8482340897457772, + "grad_norm": 1.5596555377220465, + "learning_rate": 5.919820936961856e-06, + "loss": 0.1796, + "step": 9943 + }, + { + "epoch": 0.8483193994198942, + "grad_norm": 1.475967044958645, + "learning_rate": 5.913301941595922e-06, + "loss": 0.1147, + "step": 9944 + }, + { + "epoch": 0.8484047090940112, + "grad_norm": 2.388203510270791, + "learning_rate": 5.906786312015111e-06, + "loss": 0.1817, + "step": 9945 + }, + { + "epoch": 0.8484900187681284, + "grad_norm": 1.6886080177193592, + "learning_rate": 5.900274048716841e-06, + "loss": 0.1923, + "step": 9946 + }, + { + "epoch": 0.8485753284422454, + "grad_norm": 1.4922624339516986, + "learning_rate": 5.893765152198327e-06, + "loss": 0.1768, + "step": 9947 + }, + { + "epoch": 0.8486606381163624, + "grad_norm": 2.1126306346702832, + "learning_rate": 5.887259622956437e-06, + "loss": 0.2657, + "step": 9948 + }, + { + "epoch": 0.8487459477904794, + "grad_norm": 1.4008758427881283, + "learning_rate": 5.8807574614878734e-06, + "loss": 0.1354, + "step": 9949 + }, + { + "epoch": 0.8488312574645965, + "grad_norm": 1.6012824581290055, + "learning_rate": 5.874258668289029e-06, + "loss": 0.1995, + "step": 9950 + }, + { + "epoch": 0.8489165671387136, + "grad_norm": 2.5344542014732157, + "learning_rate": 5.86776324385605e-06, + "loss": 0.216, + "step": 9951 + }, + { + "epoch": 0.8490018768128306, + "grad_norm": 2.109586351897177, + "learning_rate": 5.8612711886848196e-06, + "loss": 0.1783, + "step": 9952 + }, + { + "epoch": 0.8490871864869476, + "grad_norm": 1.7497939260947895, + "learning_rate": 5.8547825032710006e-06, + "loss": 0.1965, + "step": 9953 + }, + { + "epoch": 0.8491724961610647, + "grad_norm": 1.7256515654522788, + "learning_rate": 5.84829718810993e-06, + "loss": 0.2123, + "step": 9954 + }, + { + "epoch": 0.8492578058351817, + "grad_norm": 1.6009551594713511, + "learning_rate": 5.8418152436967575e-06, + "loss": 0.2218, + "step": 9955 + }, + { + "epoch": 0.8493431155092988, + "grad_norm": 2.1301290657448333, + "learning_rate": 5.835336670526331e-06, + "loss": 0.2127, + "step": 9956 + }, + { + "epoch": 0.8494284251834158, + "grad_norm": 1.6441721037001247, + "learning_rate": 5.828861469093266e-06, + "loss": 0.1537, + "step": 9957 + }, + { + "epoch": 0.8495137348575329, + "grad_norm": 1.8006664578803595, + "learning_rate": 5.822389639891895e-06, + "loss": 0.1478, + "step": 9958 + }, + { + "epoch": 0.8495990445316499, + "grad_norm": 1.7546550961743912, + "learning_rate": 5.8159211834163116e-06, + "loss": 0.2498, + "step": 9959 + }, + { + "epoch": 0.8496843542057669, + "grad_norm": 1.924483227413644, + "learning_rate": 5.809456100160371e-06, + "loss": 0.1741, + "step": 9960 + }, + { + "epoch": 0.849769663879884, + "grad_norm": 1.540927280074179, + "learning_rate": 5.802994390617605e-06, + "loss": 0.2267, + "step": 9961 + }, + { + "epoch": 0.849854973554001, + "grad_norm": 2.197605442472814, + "learning_rate": 5.796536055281371e-06, + "loss": 0.2301, + "step": 9962 + }, + { + "epoch": 0.8499402832281181, + "grad_norm": 1.7717276241094715, + "learning_rate": 5.790081094644706e-06, + "loss": 0.2202, + "step": 9963 + }, + { + "epoch": 0.8500255929022351, + "grad_norm": 2.184270600319457, + "learning_rate": 5.783629509200423e-06, + "loss": 0.1614, + "step": 9964 + }, + { + "epoch": 0.8501109025763521, + "grad_norm": 1.6462157404759177, + "learning_rate": 5.777181299441054e-06, + "loss": 0.2439, + "step": 9965 + }, + { + "epoch": 0.8501962122504692, + "grad_norm": 1.63454407497575, + "learning_rate": 5.7707364658589e-06, + "loss": 0.1991, + "step": 9966 + }, + { + "epoch": 0.8502815219245863, + "grad_norm": 1.6087165152051948, + "learning_rate": 5.7642950089459805e-06, + "loss": 0.187, + "step": 9967 + }, + { + "epoch": 0.8503668315987033, + "grad_norm": 1.4271271506309426, + "learning_rate": 5.757856929194061e-06, + "loss": 0.2182, + "step": 9968 + }, + { + "epoch": 0.8504521412728203, + "grad_norm": 1.6044692294985257, + "learning_rate": 5.751422227094677e-06, + "loss": 0.177, + "step": 9969 + }, + { + "epoch": 0.8505374509469373, + "grad_norm": 1.5041859552444676, + "learning_rate": 5.744990903139053e-06, + "loss": 0.196, + "step": 9970 + }, + { + "epoch": 0.8506227606210545, + "grad_norm": 1.7588690401200906, + "learning_rate": 5.738562957818217e-06, + "loss": 0.1868, + "step": 9971 + }, + { + "epoch": 0.8507080702951715, + "grad_norm": 1.7765625966255745, + "learning_rate": 5.7321383916228764e-06, + "loss": 0.1265, + "step": 9972 + }, + { + "epoch": 0.8507933799692885, + "grad_norm": 1.9605513720661276, + "learning_rate": 5.725717205043552e-06, + "loss": 0.1562, + "step": 9973 + }, + { + "epoch": 0.8508786896434055, + "grad_norm": 1.5504469042669762, + "learning_rate": 5.719299398570427e-06, + "loss": 0.1335, + "step": 9974 + }, + { + "epoch": 0.8509639993175226, + "grad_norm": 1.799466462005358, + "learning_rate": 5.7128849726935e-06, + "loss": 0.1603, + "step": 9975 + }, + { + "epoch": 0.8510493089916397, + "grad_norm": 2.071910313554163, + "learning_rate": 5.706473927902456e-06, + "loss": 0.1804, + "step": 9976 + }, + { + "epoch": 0.8511346186657567, + "grad_norm": 1.5039287614995358, + "learning_rate": 5.700066264686759e-06, + "loss": 0.1745, + "step": 9977 + }, + { + "epoch": 0.8512199283398737, + "grad_norm": 1.4473283885365877, + "learning_rate": 5.693661983535587e-06, + "loss": 0.1643, + "step": 9978 + }, + { + "epoch": 0.8513052380139908, + "grad_norm": 1.7863348945107353, + "learning_rate": 5.687261084937884e-06, + "loss": 0.1816, + "step": 9979 + }, + { + "epoch": 0.8513905476881078, + "grad_norm": 2.307457762194208, + "learning_rate": 5.680863569382316e-06, + "loss": 0.2016, + "step": 9980 + }, + { + "epoch": 0.8514758573622249, + "grad_norm": 1.945861738870292, + "learning_rate": 5.674469437357293e-06, + "loss": 0.2569, + "step": 9981 + }, + { + "epoch": 0.8515611670363419, + "grad_norm": 2.8052513809574657, + "learning_rate": 5.668078689351009e-06, + "loss": 0.1654, + "step": 9982 + }, + { + "epoch": 0.851646476710459, + "grad_norm": 1.5331379676142909, + "learning_rate": 5.66169132585131e-06, + "loss": 0.1614, + "step": 9983 + }, + { + "epoch": 0.851731786384576, + "grad_norm": 1.3693663755186196, + "learning_rate": 5.655307347345879e-06, + "loss": 0.1965, + "step": 9984 + }, + { + "epoch": 0.851817096058693, + "grad_norm": 2.509207554381334, + "learning_rate": 5.648926754322081e-06, + "loss": 0.1975, + "step": 9985 + }, + { + "epoch": 0.8519024057328101, + "grad_norm": 1.4866694335455586, + "learning_rate": 5.642549547267045e-06, + "loss": 0.1924, + "step": 9986 + }, + { + "epoch": 0.8519877154069272, + "grad_norm": 1.8499814780055563, + "learning_rate": 5.636175726667636e-06, + "loss": 0.1914, + "step": 9987 + }, + { + "epoch": 0.8520730250810442, + "grad_norm": 2.045885618620342, + "learning_rate": 5.6298052930104536e-06, + "loss": 0.1732, + "step": 9988 + }, + { + "epoch": 0.8521583347551612, + "grad_norm": 1.8724331133601457, + "learning_rate": 5.62343824678187e-06, + "loss": 0.1839, + "step": 9989 + }, + { + "epoch": 0.8522436444292782, + "grad_norm": 2.0916183780197617, + "learning_rate": 5.617074588467941e-06, + "loss": 0.1295, + "step": 9990 + }, + { + "epoch": 0.8523289541033954, + "grad_norm": 1.286694380176783, + "learning_rate": 5.610714318554527e-06, + "loss": 0.1473, + "step": 9991 + }, + { + "epoch": 0.8524142637775124, + "grad_norm": 2.0478056910247515, + "learning_rate": 5.604357437527191e-06, + "loss": 0.2612, + "step": 9992 + }, + { + "epoch": 0.8524995734516294, + "grad_norm": 1.972624221390668, + "learning_rate": 5.5980039458712494e-06, + "loss": 0.1874, + "step": 9993 + }, + { + "epoch": 0.8525848831257464, + "grad_norm": 1.4541796067034656, + "learning_rate": 5.591653844071743e-06, + "loss": 0.1413, + "step": 9994 + }, + { + "epoch": 0.8526701927998636, + "grad_norm": 1.718931986104035, + "learning_rate": 5.585307132613493e-06, + "loss": 0.184, + "step": 9995 + }, + { + "epoch": 0.8527555024739806, + "grad_norm": 2.893425651811962, + "learning_rate": 5.578963811981014e-06, + "loss": 0.1991, + "step": 9996 + }, + { + "epoch": 0.8528408121480976, + "grad_norm": 1.6853802400793425, + "learning_rate": 5.572623882658595e-06, + "loss": 0.1775, + "step": 9997 + }, + { + "epoch": 0.8529261218222146, + "grad_norm": 1.935130978274125, + "learning_rate": 5.566287345130267e-06, + "loss": 0.2265, + "step": 9998 + }, + { + "epoch": 0.8530114314963316, + "grad_norm": 1.8030793923042858, + "learning_rate": 5.559954199879769e-06, + "loss": 0.1644, + "step": 9999 + }, + { + "epoch": 0.8530967411704488, + "grad_norm": 1.8005580143119055, + "learning_rate": 5.553624447390621e-06, + "loss": 0.1726, + "step": 10000 + }, + { + "epoch": 0.8531820508445658, + "grad_norm": 2.1740866179209375, + "learning_rate": 5.547298088146047e-06, + "loss": 0.1773, + "step": 10001 + }, + { + "epoch": 0.8532673605186828, + "grad_norm": 2.4048210954643143, + "learning_rate": 5.540975122629061e-06, + "loss": 0.1539, + "step": 10002 + }, + { + "epoch": 0.8533526701927998, + "grad_norm": 1.6736925887046836, + "learning_rate": 5.5346555513223485e-06, + "loss": 0.1848, + "step": 10003 + }, + { + "epoch": 0.8534379798669169, + "grad_norm": 1.7702291767009188, + "learning_rate": 5.5283393747084075e-06, + "loss": 0.2065, + "step": 10004 + }, + { + "epoch": 0.853523289541034, + "grad_norm": 2.2974858931314865, + "learning_rate": 5.52202659326943e-06, + "loss": 0.2522, + "step": 10005 + }, + { + "epoch": 0.853608599215151, + "grad_norm": 1.3589045708128253, + "learning_rate": 5.51571720748737e-06, + "loss": 0.1767, + "step": 10006 + }, + { + "epoch": 0.853693908889268, + "grad_norm": 1.7817038242377623, + "learning_rate": 5.509411217843913e-06, + "loss": 0.1981, + "step": 10007 + }, + { + "epoch": 0.8537792185633851, + "grad_norm": 1.312122472326899, + "learning_rate": 5.503108624820486e-06, + "loss": 0.1761, + "step": 10008 + }, + { + "epoch": 0.8538645282375021, + "grad_norm": 2.4207823226553065, + "learning_rate": 5.4968094288982585e-06, + "loss": 0.1649, + "step": 10009 + }, + { + "epoch": 0.8539498379116192, + "grad_norm": 1.7038098979224963, + "learning_rate": 5.49051363055813e-06, + "loss": 0.1679, + "step": 10010 + }, + { + "epoch": 0.8540351475857362, + "grad_norm": 1.849868321243088, + "learning_rate": 5.484221230280784e-06, + "loss": 0.1994, + "step": 10011 + }, + { + "epoch": 0.8541204572598533, + "grad_norm": 1.662956694208281, + "learning_rate": 5.477932228546573e-06, + "loss": 0.1997, + "step": 10012 + }, + { + "epoch": 0.8542057669339703, + "grad_norm": 1.7338028385001385, + "learning_rate": 5.4716466258356525e-06, + "loss": 0.2225, + "step": 10013 + }, + { + "epoch": 0.8542910766080873, + "grad_norm": 2.0272531948556023, + "learning_rate": 5.465364422627889e-06, + "loss": 0.2125, + "step": 10014 + }, + { + "epoch": 0.8543763862822044, + "grad_norm": 1.564180309877695, + "learning_rate": 5.459085619402898e-06, + "loss": 0.1229, + "step": 10015 + }, + { + "epoch": 0.8544616959563215, + "grad_norm": 2.180496776568806, + "learning_rate": 5.452810216640014e-06, + "loss": 0.2893, + "step": 10016 + }, + { + "epoch": 0.8545470056304385, + "grad_norm": 1.6503345728277365, + "learning_rate": 5.4465382148183645e-06, + "loss": 0.1788, + "step": 10017 + }, + { + "epoch": 0.8546323153045555, + "grad_norm": 1.4965365479171262, + "learning_rate": 5.44026961441676e-06, + "loss": 0.1752, + "step": 10018 + }, + { + "epoch": 0.8547176249786725, + "grad_norm": 1.7602801147710827, + "learning_rate": 5.4340044159137796e-06, + "loss": 0.173, + "step": 10019 + }, + { + "epoch": 0.8548029346527897, + "grad_norm": 2.0104173913931325, + "learning_rate": 5.42774261978774e-06, + "loss": 0.137, + "step": 10020 + }, + { + "epoch": 0.8548882443269067, + "grad_norm": 1.542733708116516, + "learning_rate": 5.421484226516698e-06, + "loss": 0.1637, + "step": 10021 + }, + { + "epoch": 0.8549735540010237, + "grad_norm": 1.7572885368182114, + "learning_rate": 5.41522923657844e-06, + "loss": 0.2104, + "step": 10022 + }, + { + "epoch": 0.8550588636751407, + "grad_norm": 1.8237728905499533, + "learning_rate": 5.408977650450503e-06, + "loss": 0.1981, + "step": 10023 + }, + { + "epoch": 0.8551441733492579, + "grad_norm": 1.4996429654229955, + "learning_rate": 5.402729468610179e-06, + "loss": 0.1525, + "step": 10024 + }, + { + "epoch": 0.8552294830233749, + "grad_norm": 2.110241521237344, + "learning_rate": 5.396484691534454e-06, + "loss": 0.1883, + "step": 10025 + }, + { + "epoch": 0.8553147926974919, + "grad_norm": 1.802895849730873, + "learning_rate": 5.3902433197001115e-06, + "loss": 0.163, + "step": 10026 + }, + { + "epoch": 0.8554001023716089, + "grad_norm": 1.8463603146499554, + "learning_rate": 5.384005353583632e-06, + "loss": 0.263, + "step": 10027 + }, + { + "epoch": 0.855485412045726, + "grad_norm": 1.1643066662216264, + "learning_rate": 5.377770793661257e-06, + "loss": 0.1913, + "step": 10028 + }, + { + "epoch": 0.855570721719843, + "grad_norm": 1.4130303425321857, + "learning_rate": 5.371539640408957e-06, + "loss": 0.149, + "step": 10029 + }, + { + "epoch": 0.8556560313939601, + "grad_norm": 1.400785935708425, + "learning_rate": 5.3653118943024546e-06, + "loss": 0.1477, + "step": 10030 + }, + { + "epoch": 0.8557413410680771, + "grad_norm": 1.5972417839437247, + "learning_rate": 5.359087555817194e-06, + "loss": 0.1556, + "step": 10031 + }, + { + "epoch": 0.8558266507421942, + "grad_norm": 1.7909472669828677, + "learning_rate": 5.352866625428371e-06, + "loss": 0.1807, + "step": 10032 + }, + { + "epoch": 0.8559119604163112, + "grad_norm": 1.774741949209053, + "learning_rate": 5.34664910361094e-06, + "loss": 0.1693, + "step": 10033 + }, + { + "epoch": 0.8559972700904283, + "grad_norm": 1.4883152114317388, + "learning_rate": 5.34043499083956e-06, + "loss": 0.1662, + "step": 10034 + }, + { + "epoch": 0.8560825797645453, + "grad_norm": 1.727039806276091, + "learning_rate": 5.334224287588646e-06, + "loss": 0.1665, + "step": 10035 + }, + { + "epoch": 0.8561678894386624, + "grad_norm": 1.9539674696698706, + "learning_rate": 5.328016994332341e-06, + "loss": 0.1569, + "step": 10036 + }, + { + "epoch": 0.8562531991127794, + "grad_norm": 1.761718208668154, + "learning_rate": 5.321813111544577e-06, + "loss": 0.2471, + "step": 10037 + }, + { + "epoch": 0.8563385087868964, + "grad_norm": 1.5036507525714269, + "learning_rate": 5.315612639698941e-06, + "loss": 0.1887, + "step": 10038 + }, + { + "epoch": 0.8564238184610135, + "grad_norm": 1.692569323969589, + "learning_rate": 5.309415579268834e-06, + "loss": 0.1642, + "step": 10039 + }, + { + "epoch": 0.8565091281351305, + "grad_norm": 1.6029591495023023, + "learning_rate": 5.303221930727364e-06, + "loss": 0.1548, + "step": 10040 + }, + { + "epoch": 0.8565944378092476, + "grad_norm": 1.7527313715290735, + "learning_rate": 5.297031694547383e-06, + "loss": 0.1896, + "step": 10041 + }, + { + "epoch": 0.8566797474833646, + "grad_norm": 2.192400212208237, + "learning_rate": 5.290844871201484e-06, + "loss": 0.2116, + "step": 10042 + }, + { + "epoch": 0.8567650571574816, + "grad_norm": 1.475167639952743, + "learning_rate": 5.2846614611619885e-06, + "loss": 0.1667, + "step": 10043 + }, + { + "epoch": 0.8568503668315987, + "grad_norm": 1.7669449994821764, + "learning_rate": 5.2784814649009754e-06, + "loss": 0.1479, + "step": 10044 + }, + { + "epoch": 0.8569356765057158, + "grad_norm": 1.5030899012030627, + "learning_rate": 5.272304882890244e-06, + "loss": 0.1508, + "step": 10045 + }, + { + "epoch": 0.8570209861798328, + "grad_norm": 1.9301523119857638, + "learning_rate": 5.266131715601358e-06, + "loss": 0.2133, + "step": 10046 + }, + { + "epoch": 0.8571062958539498, + "grad_norm": 1.7571763985785365, + "learning_rate": 5.259961963505606e-06, + "loss": 0.1736, + "step": 10047 + }, + { + "epoch": 0.8571916055280668, + "grad_norm": 1.7140213160752842, + "learning_rate": 5.253795627074004e-06, + "loss": 0.151, + "step": 10048 + }, + { + "epoch": 0.857276915202184, + "grad_norm": 1.9982972645414283, + "learning_rate": 5.247632706777328e-06, + "loss": 0.213, + "step": 10049 + }, + { + "epoch": 0.857362224876301, + "grad_norm": 1.8438952114013047, + "learning_rate": 5.241473203086084e-06, + "loss": 0.1967, + "step": 10050 + }, + { + "epoch": 0.857447534550418, + "grad_norm": 2.3282226951634, + "learning_rate": 5.235317116470506e-06, + "loss": 0.1771, + "step": 10051 + }, + { + "epoch": 0.857532844224535, + "grad_norm": 1.5054619211419134, + "learning_rate": 5.229164447400587e-06, + "loss": 0.1391, + "step": 10052 + }, + { + "epoch": 0.8576181538986521, + "grad_norm": 1.6698263576026933, + "learning_rate": 5.223015196346065e-06, + "loss": 0.1937, + "step": 10053 + }, + { + "epoch": 0.8577034635727692, + "grad_norm": 1.7647263256927108, + "learning_rate": 5.216869363776367e-06, + "loss": 0.1674, + "step": 10054 + }, + { + "epoch": 0.8577887732468862, + "grad_norm": 2.1641771857655137, + "learning_rate": 5.210726950160727e-06, + "loss": 0.1704, + "step": 10055 + }, + { + "epoch": 0.8578740829210032, + "grad_norm": 1.9179643992201798, + "learning_rate": 5.204587955968076e-06, + "loss": 0.1791, + "step": 10056 + }, + { + "epoch": 0.8579593925951203, + "grad_norm": 1.6197655809597766, + "learning_rate": 5.19845238166709e-06, + "loss": 0.1964, + "step": 10057 + }, + { + "epoch": 0.8580447022692373, + "grad_norm": 2.1364112088719143, + "learning_rate": 5.1923202277261775e-06, + "loss": 0.1595, + "step": 10058 + }, + { + "epoch": 0.8581300119433544, + "grad_norm": 1.880194906114589, + "learning_rate": 5.186191494613529e-06, + "loss": 0.304, + "step": 10059 + }, + { + "epoch": 0.8582153216174714, + "grad_norm": 1.5268617928144936, + "learning_rate": 5.180066182797006e-06, + "loss": 0.1385, + "step": 10060 + }, + { + "epoch": 0.8583006312915885, + "grad_norm": 1.7627310202408188, + "learning_rate": 5.17394429274426e-06, + "loss": 0.1881, + "step": 10061 + }, + { + "epoch": 0.8583859409657055, + "grad_norm": 1.4930621873969518, + "learning_rate": 5.1678258249226615e-06, + "loss": 0.2018, + "step": 10062 + }, + { + "epoch": 0.8584712506398225, + "grad_norm": 1.4803696249436114, + "learning_rate": 5.161710779799328e-06, + "loss": 0.2038, + "step": 10063 + }, + { + "epoch": 0.8585565603139396, + "grad_norm": 2.0689055646508168, + "learning_rate": 5.155599157841101e-06, + "loss": 0.1616, + "step": 10064 + }, + { + "epoch": 0.8586418699880567, + "grad_norm": 2.995573911368518, + "learning_rate": 5.1494909595145695e-06, + "loss": 0.2185, + "step": 10065 + }, + { + "epoch": 0.8587271796621737, + "grad_norm": 2.5911385131937013, + "learning_rate": 5.143386185286086e-06, + "loss": 0.2022, + "step": 10066 + }, + { + "epoch": 0.8588124893362907, + "grad_norm": 1.8250055025618699, + "learning_rate": 5.137284835621681e-06, + "loss": 0.2044, + "step": 10067 + }, + { + "epoch": 0.8588977990104077, + "grad_norm": 2.1135301143756053, + "learning_rate": 5.13118691098719e-06, + "loss": 0.1839, + "step": 10068 + }, + { + "epoch": 0.8589831086845249, + "grad_norm": 2.0483691171366214, + "learning_rate": 5.1250924118481425e-06, + "loss": 0.1536, + "step": 10069 + }, + { + "epoch": 0.8590684183586419, + "grad_norm": 2.064250549944647, + "learning_rate": 5.119001338669827e-06, + "loss": 0.3044, + "step": 10070 + }, + { + "epoch": 0.8591537280327589, + "grad_norm": 1.3510501750520287, + "learning_rate": 5.112913691917259e-06, + "loss": 0.1252, + "step": 10071 + }, + { + "epoch": 0.8592390377068759, + "grad_norm": 2.310808955312858, + "learning_rate": 5.106829472055202e-06, + "loss": 0.2062, + "step": 10072 + }, + { + "epoch": 0.8593243473809931, + "grad_norm": 1.6607672072034108, + "learning_rate": 5.100748679548151e-06, + "loss": 0.1773, + "step": 10073 + }, + { + "epoch": 0.8594096570551101, + "grad_norm": 2.093505389599984, + "learning_rate": 5.094671314860339e-06, + "loss": 0.2441, + "step": 10074 + }, + { + "epoch": 0.8594949667292271, + "grad_norm": 1.9743585776297836, + "learning_rate": 5.088597378455762e-06, + "loss": 0.1986, + "step": 10075 + }, + { + "epoch": 0.8595802764033441, + "grad_norm": 1.922157328887797, + "learning_rate": 5.082526870798093e-06, + "loss": 0.162, + "step": 10076 + }, + { + "epoch": 0.8596655860774611, + "grad_norm": 1.7388068958136078, + "learning_rate": 5.0764597923508235e-06, + "loss": 0.1956, + "step": 10077 + }, + { + "epoch": 0.8597508957515783, + "grad_norm": 1.426112807927998, + "learning_rate": 5.0703961435771105e-06, + "loss": 0.1686, + "step": 10078 + }, + { + "epoch": 0.8598362054256953, + "grad_norm": 1.745179212051798, + "learning_rate": 5.064335924939917e-06, + "loss": 0.1342, + "step": 10079 + }, + { + "epoch": 0.8599215150998123, + "grad_norm": 1.4513295885025188, + "learning_rate": 5.0582791369018665e-06, + "loss": 0.2295, + "step": 10080 + }, + { + "epoch": 0.8600068247739293, + "grad_norm": 2.149374240920589, + "learning_rate": 5.0522257799253955e-06, + "loss": 0.1961, + "step": 10081 + }, + { + "epoch": 0.8600921344480464, + "grad_norm": 1.711746512274506, + "learning_rate": 5.046175854472634e-06, + "loss": 0.1699, + "step": 10082 + }, + { + "epoch": 0.8601774441221635, + "grad_norm": 2.201905149566895, + "learning_rate": 5.040129361005464e-06, + "loss": 0.1396, + "step": 10083 + }, + { + "epoch": 0.8602627537962805, + "grad_norm": 2.1687385360898492, + "learning_rate": 5.034086299985497e-06, + "loss": 0.2116, + "step": 10084 + }, + { + "epoch": 0.8603480634703975, + "grad_norm": 1.67810360061953, + "learning_rate": 5.028046671874093e-06, + "loss": 0.1444, + "step": 10085 + }, + { + "epoch": 0.8604333731445146, + "grad_norm": 2.161959952245537, + "learning_rate": 5.02201047713235e-06, + "loss": 0.2145, + "step": 10086 + }, + { + "epoch": 0.8605186828186316, + "grad_norm": 1.5183956345060279, + "learning_rate": 5.015977716221076e-06, + "loss": 0.1972, + "step": 10087 + }, + { + "epoch": 0.8606039924927487, + "grad_norm": 1.8239874029393797, + "learning_rate": 5.009948389600883e-06, + "loss": 0.2418, + "step": 10088 + }, + { + "epoch": 0.8606893021668657, + "grad_norm": 1.9067122102290945, + "learning_rate": 5.003922497732033e-06, + "loss": 0.1911, + "step": 10089 + }, + { + "epoch": 0.8607746118409828, + "grad_norm": 2.007644586926319, + "learning_rate": 4.997900041074605e-06, + "loss": 0.1919, + "step": 10090 + }, + { + "epoch": 0.8608599215150998, + "grad_norm": 1.3854309979990613, + "learning_rate": 4.991881020088362e-06, + "loss": 0.1925, + "step": 10091 + }, + { + "epoch": 0.8609452311892168, + "grad_norm": 1.7253186332429178, + "learning_rate": 4.985865435232834e-06, + "loss": 0.1212, + "step": 10092 + }, + { + "epoch": 0.8610305408633339, + "grad_norm": 2.0324534445823184, + "learning_rate": 4.979853286967273e-06, + "loss": 0.1933, + "step": 10093 + }, + { + "epoch": 0.861115850537451, + "grad_norm": 1.769753280082645, + "learning_rate": 4.973844575750669e-06, + "loss": 0.1558, + "step": 10094 + }, + { + "epoch": 0.861201160211568, + "grad_norm": 2.2451970441702613, + "learning_rate": 4.967839302041782e-06, + "loss": 0.1389, + "step": 10095 + }, + { + "epoch": 0.861286469885685, + "grad_norm": 1.6456992018241126, + "learning_rate": 4.9618374662990406e-06, + "loss": 0.1688, + "step": 10096 + }, + { + "epoch": 0.861371779559802, + "grad_norm": 1.7583079000263766, + "learning_rate": 4.955839068980689e-06, + "loss": 0.153, + "step": 10097 + }, + { + "epoch": 0.8614570892339192, + "grad_norm": 1.6388913395247304, + "learning_rate": 4.9498441105446606e-06, + "loss": 0.2042, + "step": 10098 + }, + { + "epoch": 0.8615423989080362, + "grad_norm": 2.05405001305865, + "learning_rate": 4.9438525914486385e-06, + "loss": 0.1176, + "step": 10099 + }, + { + "epoch": 0.8616277085821532, + "grad_norm": 1.6223915879718065, + "learning_rate": 4.937864512150031e-06, + "loss": 0.1607, + "step": 10100 + }, + { + "epoch": 0.8617130182562702, + "grad_norm": 2.3951229092590975, + "learning_rate": 4.931879873106027e-06, + "loss": 0.1566, + "step": 10101 + }, + { + "epoch": 0.8617983279303874, + "grad_norm": 1.9043977837847526, + "learning_rate": 4.925898674773488e-06, + "loss": 0.2082, + "step": 10102 + }, + { + "epoch": 0.8618836376045044, + "grad_norm": 1.844970686971723, + "learning_rate": 4.919920917609066e-06, + "loss": 0.1874, + "step": 10103 + }, + { + "epoch": 0.8619689472786214, + "grad_norm": 1.7604193452291783, + "learning_rate": 4.9139466020691305e-06, + "loss": 0.1834, + "step": 10104 + }, + { + "epoch": 0.8620542569527384, + "grad_norm": 1.5030826047125139, + "learning_rate": 4.907975728609782e-06, + "loss": 0.1535, + "step": 10105 + }, + { + "epoch": 0.8621395666268555, + "grad_norm": 1.9202819672804057, + "learning_rate": 4.902008297686872e-06, + "loss": 0.1624, + "step": 10106 + }, + { + "epoch": 0.8622248763009726, + "grad_norm": 1.6524653108219394, + "learning_rate": 4.896044309755965e-06, + "loss": 0.2314, + "step": 10107 + }, + { + "epoch": 0.8623101859750896, + "grad_norm": 1.7505127291920721, + "learning_rate": 4.890083765272413e-06, + "loss": 0.2528, + "step": 10108 + }, + { + "epoch": 0.8623954956492066, + "grad_norm": 1.915916821404449, + "learning_rate": 4.884126664691229e-06, + "loss": 0.1995, + "step": 10109 + }, + { + "epoch": 0.8624808053233237, + "grad_norm": 1.6205765600059814, + "learning_rate": 4.878173008467241e-06, + "loss": 0.2212, + "step": 10110 + }, + { + "epoch": 0.8625661149974407, + "grad_norm": 1.7912210161588793, + "learning_rate": 4.872222797054971e-06, + "loss": 0.2058, + "step": 10111 + }, + { + "epoch": 0.8626514246715578, + "grad_norm": 1.734728665420923, + "learning_rate": 4.866276030908678e-06, + "loss": 0.1228, + "step": 10112 + }, + { + "epoch": 0.8627367343456748, + "grad_norm": 2.1315169539183376, + "learning_rate": 4.8603327104823685e-06, + "loss": 0.1673, + "step": 10113 + }, + { + "epoch": 0.8628220440197918, + "grad_norm": 2.0166658774550394, + "learning_rate": 4.854392836229788e-06, + "loss": 0.1518, + "step": 10114 + }, + { + "epoch": 0.8629073536939089, + "grad_norm": 2.139977795851125, + "learning_rate": 4.848456408604407e-06, + "loss": 0.1894, + "step": 10115 + }, + { + "epoch": 0.8629926633680259, + "grad_norm": 1.77526828189059, + "learning_rate": 4.842523428059437e-06, + "loss": 0.2349, + "step": 10116 + }, + { + "epoch": 0.863077973042143, + "grad_norm": 2.5046627431349204, + "learning_rate": 4.836593895047853e-06, + "loss": 0.1782, + "step": 10117 + }, + { + "epoch": 0.86316328271626, + "grad_norm": 1.6507394636419617, + "learning_rate": 4.83066781002231e-06, + "loss": 0.1485, + "step": 10118 + }, + { + "epoch": 0.8632485923903771, + "grad_norm": 1.6962457013339263, + "learning_rate": 4.824745173435258e-06, + "loss": 0.1575, + "step": 10119 + }, + { + "epoch": 0.8633339020644941, + "grad_norm": 1.915046407869327, + "learning_rate": 4.818825985738856e-06, + "loss": 0.1256, + "step": 10120 + }, + { + "epoch": 0.8634192117386111, + "grad_norm": 1.5143287857567325, + "learning_rate": 4.812910247384994e-06, + "loss": 0.1737, + "step": 10121 + }, + { + "epoch": 0.8635045214127282, + "grad_norm": 1.6407940005497308, + "learning_rate": 4.806997958825299e-06, + "loss": 0.1442, + "step": 10122 + }, + { + "epoch": 0.8635898310868453, + "grad_norm": 1.801944911102025, + "learning_rate": 4.801089120511165e-06, + "loss": 0.1734, + "step": 10123 + }, + { + "epoch": 0.8636751407609623, + "grad_norm": 1.6985448443646907, + "learning_rate": 4.795183732893694e-06, + "loss": 0.2182, + "step": 10124 + }, + { + "epoch": 0.8637604504350793, + "grad_norm": 1.8631062615395075, + "learning_rate": 4.789281796423723e-06, + "loss": 0.1988, + "step": 10125 + }, + { + "epoch": 0.8638457601091963, + "grad_norm": 2.0116592546572454, + "learning_rate": 4.783383311551837e-06, + "loss": 0.1495, + "step": 10126 + }, + { + "epoch": 0.8639310697833135, + "grad_norm": 1.4991030395446463, + "learning_rate": 4.777488278728354e-06, + "loss": 0.1553, + "step": 10127 + }, + { + "epoch": 0.8640163794574305, + "grad_norm": 1.3993102179555572, + "learning_rate": 4.771596698403336e-06, + "loss": 0.1644, + "step": 10128 + }, + { + "epoch": 0.8641016891315475, + "grad_norm": 1.4323655357963647, + "learning_rate": 4.76570857102655e-06, + "loss": 0.1625, + "step": 10129 + }, + { + "epoch": 0.8641869988056645, + "grad_norm": 1.8465649081447029, + "learning_rate": 4.75982389704756e-06, + "loss": 0.1437, + "step": 10130 + }, + { + "epoch": 0.8642723084797816, + "grad_norm": 1.7385120024377856, + "learning_rate": 4.753942676915591e-06, + "loss": 0.1421, + "step": 10131 + }, + { + "epoch": 0.8643576181538987, + "grad_norm": 1.832193395926228, + "learning_rate": 4.748064911079669e-06, + "loss": 0.2708, + "step": 10132 + }, + { + "epoch": 0.8644429278280157, + "grad_norm": 1.929689106187654, + "learning_rate": 4.742190599988522e-06, + "loss": 0.1707, + "step": 10133 + }, + { + "epoch": 0.8645282375021327, + "grad_norm": 1.4665686459009166, + "learning_rate": 4.736319744090628e-06, + "loss": 0.2371, + "step": 10134 + }, + { + "epoch": 0.8646135471762498, + "grad_norm": 1.4940602798301799, + "learning_rate": 4.7304523438341855e-06, + "loss": 0.1401, + "step": 10135 + }, + { + "epoch": 0.8646988568503668, + "grad_norm": 1.77143693248472, + "learning_rate": 4.7245883996671405e-06, + "loss": 0.1764, + "step": 10136 + }, + { + "epoch": 0.8647841665244839, + "grad_norm": 1.9167047441542548, + "learning_rate": 4.7187279120371905e-06, + "loss": 0.1866, + "step": 10137 + }, + { + "epoch": 0.8648694761986009, + "grad_norm": 1.8887972864054763, + "learning_rate": 4.712870881391723e-06, + "loss": 0.1449, + "step": 10138 + }, + { + "epoch": 0.864954785872718, + "grad_norm": 1.2266079353615726, + "learning_rate": 4.707017308177919e-06, + "loss": 0.1863, + "step": 10139 + }, + { + "epoch": 0.865040095546835, + "grad_norm": 1.7956380816154756, + "learning_rate": 4.701167192842659e-06, + "loss": 0.2157, + "step": 10140 + }, + { + "epoch": 0.865125405220952, + "grad_norm": 1.829588345154677, + "learning_rate": 4.695320535832565e-06, + "loss": 0.1639, + "step": 10141 + }, + { + "epoch": 0.8652107148950691, + "grad_norm": 1.5248814708735965, + "learning_rate": 4.6894773375939836e-06, + "loss": 0.1728, + "step": 10142 + }, + { + "epoch": 0.8652960245691862, + "grad_norm": 1.4572778242579465, + "learning_rate": 4.683637598573054e-06, + "loss": 0.1907, + "step": 10143 + }, + { + "epoch": 0.8653813342433032, + "grad_norm": 1.4214445028350184, + "learning_rate": 4.677801319215564e-06, + "loss": 0.1604, + "step": 10144 + }, + { + "epoch": 0.8654666439174202, + "grad_norm": 1.7067287011097851, + "learning_rate": 4.6719684999671055e-06, + "loss": 0.1589, + "step": 10145 + }, + { + "epoch": 0.8655519535915372, + "grad_norm": 1.0729422458204407, + "learning_rate": 4.66613914127298e-06, + "loss": 0.1255, + "step": 10146 + }, + { + "epoch": 0.8656372632656544, + "grad_norm": 1.4041453400453492, + "learning_rate": 4.6603132435782295e-06, + "loss": 0.1692, + "step": 10147 + }, + { + "epoch": 0.8657225729397714, + "grad_norm": 1.5934250444223546, + "learning_rate": 4.6544908073276276e-06, + "loss": 0.1918, + "step": 10148 + }, + { + "epoch": 0.8658078826138884, + "grad_norm": 1.5783295451638757, + "learning_rate": 4.64867183296569e-06, + "loss": 0.166, + "step": 10149 + }, + { + "epoch": 0.8658931922880054, + "grad_norm": 2.0707592752007846, + "learning_rate": 4.642856320936656e-06, + "loss": 0.2075, + "step": 10150 + }, + { + "epoch": 0.8659785019621226, + "grad_norm": 2.306890747049639, + "learning_rate": 4.637044271684504e-06, + "loss": 0.2279, + "step": 10151 + }, + { + "epoch": 0.8660638116362396, + "grad_norm": 1.6860127429593723, + "learning_rate": 4.631235685652979e-06, + "loss": 0.184, + "step": 10152 + }, + { + "epoch": 0.8661491213103566, + "grad_norm": 1.6193594238285953, + "learning_rate": 4.625430563285515e-06, + "loss": 0.1702, + "step": 10153 + }, + { + "epoch": 0.8662344309844736, + "grad_norm": 1.3284824563484312, + "learning_rate": 4.619628905025308e-06, + "loss": 0.1684, + "step": 10154 + }, + { + "epoch": 0.8663197406585906, + "grad_norm": 2.619213629493835, + "learning_rate": 4.613830711315287e-06, + "loss": 0.1717, + "step": 10155 + }, + { + "epoch": 0.8664050503327078, + "grad_norm": 1.8641191753715192, + "learning_rate": 4.608035982598108e-06, + "loss": 0.1815, + "step": 10156 + }, + { + "epoch": 0.8664903600068248, + "grad_norm": 1.426632812371652, + "learning_rate": 4.6022447193161625e-06, + "loss": 0.1747, + "step": 10157 + }, + { + "epoch": 0.8665756696809418, + "grad_norm": 1.8780260463041836, + "learning_rate": 4.596456921911585e-06, + "loss": 0.1557, + "step": 10158 + }, + { + "epoch": 0.8666609793550588, + "grad_norm": 1.6185361978739077, + "learning_rate": 4.590672590826267e-06, + "loss": 0.1608, + "step": 10159 + }, + { + "epoch": 0.8667462890291759, + "grad_norm": 1.868156405172493, + "learning_rate": 4.584891726501772e-06, + "loss": 0.1608, + "step": 10160 + }, + { + "epoch": 0.866831598703293, + "grad_norm": 1.574485131531207, + "learning_rate": 4.579114329379469e-06, + "loss": 0.1834, + "step": 10161 + }, + { + "epoch": 0.86691690837741, + "grad_norm": 1.3843501103835318, + "learning_rate": 4.573340399900417e-06, + "loss": 0.1592, + "step": 10162 + }, + { + "epoch": 0.867002218051527, + "grad_norm": 1.6498040053974854, + "learning_rate": 4.567569938505434e-06, + "loss": 0.204, + "step": 10163 + }, + { + "epoch": 0.8670875277256441, + "grad_norm": 1.3385605520061556, + "learning_rate": 4.561802945635046e-06, + "loss": 0.1783, + "step": 10164 + }, + { + "epoch": 0.8671728373997611, + "grad_norm": 1.9367793188745437, + "learning_rate": 4.556039421729563e-06, + "loss": 0.2275, + "step": 10165 + }, + { + "epoch": 0.8672581470738782, + "grad_norm": 2.114023018501432, + "learning_rate": 4.55027936722896e-06, + "loss": 0.2121, + "step": 10166 + }, + { + "epoch": 0.8673434567479952, + "grad_norm": 1.7816151027277407, + "learning_rate": 4.544522782573019e-06, + "loss": 0.1955, + "step": 10167 + }, + { + "epoch": 0.8674287664221123, + "grad_norm": 1.8019017769381633, + "learning_rate": 4.5387696682012145e-06, + "loss": 0.2275, + "step": 10168 + }, + { + "epoch": 0.8675140760962293, + "grad_norm": 2.3658522059633027, + "learning_rate": 4.533020024552765e-06, + "loss": 0.2069, + "step": 10169 + }, + { + "epoch": 0.8675993857703463, + "grad_norm": 1.4856452726747065, + "learning_rate": 4.527273852066627e-06, + "loss": 0.1695, + "step": 10170 + }, + { + "epoch": 0.8676846954444634, + "grad_norm": 2.0040166575335334, + "learning_rate": 4.521531151181474e-06, + "loss": 0.2214, + "step": 10171 + }, + { + "epoch": 0.8677700051185805, + "grad_norm": 1.5995516923837334, + "learning_rate": 4.515791922335772e-06, + "loss": 0.1585, + "step": 10172 + }, + { + "epoch": 0.8678553147926975, + "grad_norm": 1.6796572660342206, + "learning_rate": 4.51005616596763e-06, + "loss": 0.1512, + "step": 10173 + }, + { + "epoch": 0.8679406244668145, + "grad_norm": 2.658289496639355, + "learning_rate": 4.504323882514977e-06, + "loss": 0.1641, + "step": 10174 + }, + { + "epoch": 0.8680259341409315, + "grad_norm": 1.8984460398791976, + "learning_rate": 4.498595072415429e-06, + "loss": 0.2033, + "step": 10175 + }, + { + "epoch": 0.8681112438150487, + "grad_norm": 1.9657127324403918, + "learning_rate": 4.492869736106354e-06, + "loss": 0.1559, + "step": 10176 + }, + { + "epoch": 0.8681965534891657, + "grad_norm": 2.4608222086709124, + "learning_rate": 4.487147874024855e-06, + "loss": 0.2245, + "step": 10177 + }, + { + "epoch": 0.8682818631632827, + "grad_norm": 1.9531293945263062, + "learning_rate": 4.4814294866077525e-06, + "loss": 0.2152, + "step": 10178 + }, + { + "epoch": 0.8683671728373997, + "grad_norm": 1.6737489982072085, + "learning_rate": 4.475714574291628e-06, + "loss": 0.1736, + "step": 10179 + }, + { + "epoch": 0.8684524825115169, + "grad_norm": 1.5014697662520935, + "learning_rate": 4.470003137512774e-06, + "loss": 0.2082, + "step": 10180 + }, + { + "epoch": 0.8685377921856339, + "grad_norm": 1.6054181093271669, + "learning_rate": 4.4642951767072395e-06, + "loss": 0.1353, + "step": 10181 + }, + { + "epoch": 0.8686231018597509, + "grad_norm": 1.5932788526459574, + "learning_rate": 4.458590692310793e-06, + "loss": 0.1629, + "step": 10182 + }, + { + "epoch": 0.8687084115338679, + "grad_norm": 1.4488882834502153, + "learning_rate": 4.452889684758938e-06, + "loss": 0.1388, + "step": 10183 + }, + { + "epoch": 0.868793721207985, + "grad_norm": 1.4892353994215075, + "learning_rate": 4.447192154486912e-06, + "loss": 0.1598, + "step": 10184 + }, + { + "epoch": 0.868879030882102, + "grad_norm": 1.3745526106123132, + "learning_rate": 4.441498101929714e-06, + "loss": 0.1679, + "step": 10185 + }, + { + "epoch": 0.8689643405562191, + "grad_norm": 2.121683224985256, + "learning_rate": 4.435807527522024e-06, + "loss": 0.1267, + "step": 10186 + }, + { + "epoch": 0.8690496502303361, + "grad_norm": 2.0047286876036066, + "learning_rate": 4.4301204316983035e-06, + "loss": 0.1773, + "step": 10187 + }, + { + "epoch": 0.8691349599044532, + "grad_norm": 2.190496299979109, + "learning_rate": 4.424436814892735e-06, + "loss": 0.2195, + "step": 10188 + }, + { + "epoch": 0.8692202695785702, + "grad_norm": 1.529349880347383, + "learning_rate": 4.4187566775392265e-06, + "loss": 0.2137, + "step": 10189 + }, + { + "epoch": 0.8693055792526873, + "grad_norm": 1.75881508951453, + "learning_rate": 4.413080020071425e-06, + "loss": 0.1541, + "step": 10190 + }, + { + "epoch": 0.8693908889268043, + "grad_norm": 1.9369200176806392, + "learning_rate": 4.4074068429227174e-06, + "loss": 0.2109, + "step": 10191 + }, + { + "epoch": 0.8694761986009213, + "grad_norm": 1.3925359219312867, + "learning_rate": 4.401737146526219e-06, + "loss": 0.1442, + "step": 10192 + }, + { + "epoch": 0.8695615082750384, + "grad_norm": 1.6955771525318417, + "learning_rate": 4.396070931314772e-06, + "loss": 0.1594, + "step": 10193 + }, + { + "epoch": 0.8696468179491554, + "grad_norm": 1.997848127964255, + "learning_rate": 4.3904081977209855e-06, + "loss": 0.1814, + "step": 10194 + }, + { + "epoch": 0.8697321276232725, + "grad_norm": 1.6511610772145537, + "learning_rate": 4.384748946177153e-06, + "loss": 0.2336, + "step": 10195 + }, + { + "epoch": 0.8698174372973895, + "grad_norm": 1.9272185063423282, + "learning_rate": 4.37909317711534e-06, + "loss": 0.1799, + "step": 10196 + }, + { + "epoch": 0.8699027469715066, + "grad_norm": 1.864993350449231, + "learning_rate": 4.37344089096734e-06, + "loss": 0.1896, + "step": 10197 + }, + { + "epoch": 0.8699880566456236, + "grad_norm": 1.3506179331356734, + "learning_rate": 4.367792088164662e-06, + "loss": 0.2418, + "step": 10198 + }, + { + "epoch": 0.8700733663197406, + "grad_norm": 1.7148090611036235, + "learning_rate": 4.362146769138575e-06, + "loss": 0.1592, + "step": 10199 + }, + { + "epoch": 0.8701586759938577, + "grad_norm": 1.7481828519477, + "learning_rate": 4.356504934320049e-06, + "loss": 0.2006, + "step": 10200 + }, + { + "epoch": 0.8702439856679748, + "grad_norm": 2.172175983458245, + "learning_rate": 4.350866584139845e-06, + "loss": 0.2152, + "step": 10201 + }, + { + "epoch": 0.8703292953420918, + "grad_norm": 1.8968620513524335, + "learning_rate": 4.3452317190283755e-06, + "loss": 0.1892, + "step": 10202 + }, + { + "epoch": 0.8704146050162088, + "grad_norm": 1.6380174532695237, + "learning_rate": 4.3396003394158665e-06, + "loss": 0.1224, + "step": 10203 + }, + { + "epoch": 0.8704999146903258, + "grad_norm": 1.6455317917160452, + "learning_rate": 4.333972445732231e-06, + "loss": 0.1865, + "step": 10204 + }, + { + "epoch": 0.870585224364443, + "grad_norm": 1.4657458159991932, + "learning_rate": 4.328348038407132e-06, + "loss": 0.1924, + "step": 10205 + }, + { + "epoch": 0.87067053403856, + "grad_norm": 1.858275914124799, + "learning_rate": 4.322727117869951e-06, + "loss": 0.198, + "step": 10206 + }, + { + "epoch": 0.870755843712677, + "grad_norm": 1.8427241429466026, + "learning_rate": 4.317109684549847e-06, + "loss": 0.2105, + "step": 10207 + }, + { + "epoch": 0.870841153386794, + "grad_norm": 2.0438981868671466, + "learning_rate": 4.311495738875637e-06, + "loss": 0.1589, + "step": 10208 + }, + { + "epoch": 0.8709264630609111, + "grad_norm": 1.750715245811758, + "learning_rate": 4.305885281275951e-06, + "loss": 0.1923, + "step": 10209 + }, + { + "epoch": 0.8710117727350282, + "grad_norm": 1.6532102797204562, + "learning_rate": 4.300278312179107e-06, + "loss": 0.2038, + "step": 10210 + }, + { + "epoch": 0.8710970824091452, + "grad_norm": 1.6670619813669911, + "learning_rate": 4.294674832013163e-06, + "loss": 0.1458, + "step": 10211 + }, + { + "epoch": 0.8711823920832622, + "grad_norm": 1.9043421969766976, + "learning_rate": 4.289074841205914e-06, + "loss": 0.1698, + "step": 10212 + }, + { + "epoch": 0.8712677017573793, + "grad_norm": 2.4108086542298564, + "learning_rate": 4.283478340184893e-06, + "loss": 0.2283, + "step": 10213 + }, + { + "epoch": 0.8713530114314963, + "grad_norm": 1.2929969623173008, + "learning_rate": 4.277885329377373e-06, + "loss": 0.1274, + "step": 10214 + }, + { + "epoch": 0.8714383211056134, + "grad_norm": 1.9505895408137965, + "learning_rate": 4.27229580921033e-06, + "loss": 0.113, + "step": 10215 + }, + { + "epoch": 0.8715236307797304, + "grad_norm": 1.4949645718870765, + "learning_rate": 4.266709780110511e-06, + "loss": 0.1292, + "step": 10216 + }, + { + "epoch": 0.8716089404538475, + "grad_norm": 1.692265880174892, + "learning_rate": 4.261127242504376e-06, + "loss": 0.2185, + "step": 10217 + }, + { + "epoch": 0.8716942501279645, + "grad_norm": 1.756530972740595, + "learning_rate": 4.255548196818115e-06, + "loss": 0.2274, + "step": 10218 + }, + { + "epoch": 0.8717795598020815, + "grad_norm": 1.7216465904096832, + "learning_rate": 4.249972643477668e-06, + "loss": 0.1487, + "step": 10219 + }, + { + "epoch": 0.8718648694761986, + "grad_norm": 1.755705592260196, + "learning_rate": 4.244400582908692e-06, + "loss": 0.1746, + "step": 10220 + }, + { + "epoch": 0.8719501791503157, + "grad_norm": 1.4793727713255056, + "learning_rate": 4.238832015536587e-06, + "loss": 0.1499, + "step": 10221 + }, + { + "epoch": 0.8720354888244327, + "grad_norm": 1.3025070518258826, + "learning_rate": 4.2332669417864735e-06, + "loss": 0.1486, + "step": 10222 + }, + { + "epoch": 0.8721207984985497, + "grad_norm": 2.015079631509803, + "learning_rate": 4.2277053620832455e-06, + "loss": 0.1936, + "step": 10223 + }, + { + "epoch": 0.8722061081726667, + "grad_norm": 1.5840658785433477, + "learning_rate": 4.222147276851457e-06, + "loss": 0.285, + "step": 10224 + }, + { + "epoch": 0.8722914178467839, + "grad_norm": 1.6736359636177156, + "learning_rate": 4.216592686515475e-06, + "loss": 0.1729, + "step": 10225 + }, + { + "epoch": 0.8723767275209009, + "grad_norm": 2.000531364425779, + "learning_rate": 4.2110415914993376e-06, + "loss": 0.1482, + "step": 10226 + }, + { + "epoch": 0.8724620371950179, + "grad_norm": 1.7368584800173428, + "learning_rate": 4.205493992226867e-06, + "loss": 0.2376, + "step": 10227 + }, + { + "epoch": 0.8725473468691349, + "grad_norm": 1.922924158557695, + "learning_rate": 4.199949889121563e-06, + "loss": 0.1515, + "step": 10228 + }, + { + "epoch": 0.872632656543252, + "grad_norm": 1.781403652055765, + "learning_rate": 4.194409282606715e-06, + "loss": 0.2097, + "step": 10229 + }, + { + "epoch": 0.8727179662173691, + "grad_norm": 1.6253331649807308, + "learning_rate": 4.188872173105302e-06, + "loss": 0.1677, + "step": 10230 + }, + { + "epoch": 0.8728032758914861, + "grad_norm": 1.7545014117135482, + "learning_rate": 4.183338561040062e-06, + "loss": 0.1996, + "step": 10231 + }, + { + "epoch": 0.8728885855656031, + "grad_norm": 1.6051685210317497, + "learning_rate": 4.177808446833453e-06, + "loss": 0.1792, + "step": 10232 + }, + { + "epoch": 0.8729738952397201, + "grad_norm": 1.567704649590339, + "learning_rate": 4.1722818309076706e-06, + "loss": 0.2, + "step": 10233 + }, + { + "epoch": 0.8730592049138373, + "grad_norm": 1.3726565591706485, + "learning_rate": 4.166758713684643e-06, + "loss": 0.1289, + "step": 10234 + }, + { + "epoch": 0.8731445145879543, + "grad_norm": 1.6235590194439868, + "learning_rate": 4.161239095586022e-06, + "loss": 0.2111, + "step": 10235 + }, + { + "epoch": 0.8732298242620713, + "grad_norm": 1.6890015456134417, + "learning_rate": 4.155722977033222e-06, + "loss": 0.1607, + "step": 10236 + }, + { + "epoch": 0.8733151339361883, + "grad_norm": 1.4366362920914932, + "learning_rate": 4.150210358447343e-06, + "loss": 0.1875, + "step": 10237 + }, + { + "epoch": 0.8734004436103054, + "grad_norm": 1.2113637358626224, + "learning_rate": 4.1447012402492654e-06, + "loss": 0.1377, + "step": 10238 + }, + { + "epoch": 0.8734857532844225, + "grad_norm": 2.6906542343341395, + "learning_rate": 4.13919562285957e-06, + "loss": 0.2345, + "step": 10239 + }, + { + "epoch": 0.8735710629585395, + "grad_norm": 1.9342526324736071, + "learning_rate": 4.133693506698589e-06, + "loss": 0.1748, + "step": 10240 + }, + { + "epoch": 0.8736563726326565, + "grad_norm": 2.1098027678597826, + "learning_rate": 4.128194892186376e-06, + "loss": 0.1431, + "step": 10241 + }, + { + "epoch": 0.8737416823067736, + "grad_norm": 1.8859961727342172, + "learning_rate": 4.122699779742711e-06, + "loss": 0.1201, + "step": 10242 + }, + { + "epoch": 0.8738269919808906, + "grad_norm": 2.053319903779803, + "learning_rate": 4.11720816978714e-06, + "loss": 0.1911, + "step": 10243 + }, + { + "epoch": 0.8739123016550077, + "grad_norm": 2.7753973255438997, + "learning_rate": 4.111720062738894e-06, + "loss": 0.1569, + "step": 10244 + }, + { + "epoch": 0.8739976113291247, + "grad_norm": 2.2073637315244428, + "learning_rate": 4.106235459016972e-06, + "loss": 0.1882, + "step": 10245 + }, + { + "epoch": 0.8740829210032418, + "grad_norm": 2.107442578204319, + "learning_rate": 4.1007543590400984e-06, + "loss": 0.203, + "step": 10246 + }, + { + "epoch": 0.8741682306773588, + "grad_norm": 1.9552067762913006, + "learning_rate": 4.095276763226719e-06, + "loss": 0.1722, + "step": 10247 + }, + { + "epoch": 0.8742535403514758, + "grad_norm": 1.5693793869412997, + "learning_rate": 4.089802671995013e-06, + "loss": 0.1715, + "step": 10248 + }, + { + "epoch": 0.8743388500255929, + "grad_norm": 1.8143512873819827, + "learning_rate": 4.084332085762926e-06, + "loss": 0.1829, + "step": 10249 + }, + { + "epoch": 0.87442415969971, + "grad_norm": 1.6368135047014285, + "learning_rate": 4.0788650049480684e-06, + "loss": 0.1856, + "step": 10250 + }, + { + "epoch": 0.874509469373827, + "grad_norm": 1.6850177027058222, + "learning_rate": 4.073401429967854e-06, + "loss": 0.1827, + "step": 10251 + }, + { + "epoch": 0.874594779047944, + "grad_norm": 2.1335698274816868, + "learning_rate": 4.067941361239386e-06, + "loss": 0.2177, + "step": 10252 + }, + { + "epoch": 0.874680088722061, + "grad_norm": 2.045397509155368, + "learning_rate": 4.062484799179511e-06, + "loss": 0.1137, + "step": 10253 + }, + { + "epoch": 0.8747653983961782, + "grad_norm": 2.1839255420372834, + "learning_rate": 4.0570317442048086e-06, + "loss": 0.1781, + "step": 10254 + }, + { + "epoch": 0.8748507080702952, + "grad_norm": 1.6719572813517944, + "learning_rate": 4.051582196731596e-06, + "loss": 0.2054, + "step": 10255 + }, + { + "epoch": 0.8749360177444122, + "grad_norm": 1.7902436334619036, + "learning_rate": 4.046136157175912e-06, + "loss": 0.1673, + "step": 10256 + }, + { + "epoch": 0.8750213274185292, + "grad_norm": 1.8471539643311643, + "learning_rate": 4.040693625953523e-06, + "loss": 0.2056, + "step": 10257 + }, + { + "epoch": 0.8751066370926464, + "grad_norm": 1.395462195556474, + "learning_rate": 4.03525460347996e-06, + "loss": 0.1299, + "step": 10258 + }, + { + "epoch": 0.8751919467667634, + "grad_norm": 1.6132719395255704, + "learning_rate": 4.0298190901704505e-06, + "loss": 0.2011, + "step": 10259 + }, + { + "epoch": 0.8752772564408804, + "grad_norm": 1.286789989062764, + "learning_rate": 4.024387086439962e-06, + "loss": 0.1357, + "step": 10260 + }, + { + "epoch": 0.8753625661149974, + "grad_norm": 1.7184396636999215, + "learning_rate": 4.018958592703215e-06, + "loss": 0.1185, + "step": 10261 + }, + { + "epoch": 0.8754478757891145, + "grad_norm": 1.862029032717498, + "learning_rate": 4.013533609374631e-06, + "loss": 0.1966, + "step": 10262 + }, + { + "epoch": 0.8755331854632316, + "grad_norm": 1.5792300443923843, + "learning_rate": 4.008112136868386e-06, + "loss": 0.1719, + "step": 10263 + }, + { + "epoch": 0.8756184951373486, + "grad_norm": 2.264175056308127, + "learning_rate": 4.002694175598371e-06, + "loss": 0.2439, + "step": 10264 + }, + { + "epoch": 0.8757038048114656, + "grad_norm": 1.7646904556684702, + "learning_rate": 3.9972797259782425e-06, + "loss": 0.1471, + "step": 10265 + }, + { + "epoch": 0.8757891144855827, + "grad_norm": 2.027659838075734, + "learning_rate": 3.991868788421327e-06, + "loss": 0.2055, + "step": 10266 + }, + { + "epoch": 0.8758744241596997, + "grad_norm": 1.9014594746799358, + "learning_rate": 3.986461363340754e-06, + "loss": 0.1221, + "step": 10267 + }, + { + "epoch": 0.8759597338338168, + "grad_norm": 1.553385203495528, + "learning_rate": 3.981057451149344e-06, + "loss": 0.0937, + "step": 10268 + }, + { + "epoch": 0.8760450435079338, + "grad_norm": 2.035493496941842, + "learning_rate": 3.9756570522596516e-06, + "loss": 0.268, + "step": 10269 + }, + { + "epoch": 0.8761303531820508, + "grad_norm": 2.400456774477569, + "learning_rate": 3.970260167083961e-06, + "loss": 0.2455, + "step": 10270 + }, + { + "epoch": 0.8762156628561679, + "grad_norm": 1.7565364020376435, + "learning_rate": 3.964866796034311e-06, + "loss": 0.1718, + "step": 10271 + }, + { + "epoch": 0.8763009725302849, + "grad_norm": 1.809404426295893, + "learning_rate": 3.959476939522455e-06, + "loss": 0.1737, + "step": 10272 + }, + { + "epoch": 0.876386282204402, + "grad_norm": 1.5209123077093232, + "learning_rate": 3.95409059795987e-06, + "loss": 0.1809, + "step": 10273 + }, + { + "epoch": 0.876471591878519, + "grad_norm": 1.7183066750033191, + "learning_rate": 3.948707771757781e-06, + "loss": 0.2034, + "step": 10274 + }, + { + "epoch": 0.8765569015526361, + "grad_norm": 1.7417762168246096, + "learning_rate": 3.943328461327145e-06, + "loss": 0.1431, + "step": 10275 + }, + { + "epoch": 0.8766422112267531, + "grad_norm": 1.3329790161152442, + "learning_rate": 3.937952667078626e-06, + "loss": 0.1738, + "step": 10276 + }, + { + "epoch": 0.8767275209008701, + "grad_norm": 1.7514729431832867, + "learning_rate": 3.932580389422647e-06, + "loss": 0.1884, + "step": 10277 + }, + { + "epoch": 0.8768128305749872, + "grad_norm": 1.48420352949271, + "learning_rate": 3.927211628769367e-06, + "loss": 0.1861, + "step": 10278 + }, + { + "epoch": 0.8768981402491043, + "grad_norm": 1.6321380536460965, + "learning_rate": 3.921846385528633e-06, + "loss": 0.1984, + "step": 10279 + }, + { + "epoch": 0.8769834499232213, + "grad_norm": 1.8669604558449986, + "learning_rate": 3.916484660110076e-06, + "loss": 0.1557, + "step": 10280 + }, + { + "epoch": 0.8770687595973383, + "grad_norm": 1.939232697498577, + "learning_rate": 3.911126452923025e-06, + "loss": 0.1466, + "step": 10281 + }, + { + "epoch": 0.8771540692714553, + "grad_norm": 1.5928249292335892, + "learning_rate": 3.905771764376553e-06, + "loss": 0.1626, + "step": 10282 + }, + { + "epoch": 0.8772393789455725, + "grad_norm": 1.8789739457781112, + "learning_rate": 3.900420594879467e-06, + "loss": 0.2047, + "step": 10283 + }, + { + "epoch": 0.8773246886196895, + "grad_norm": 2.0965068374547764, + "learning_rate": 3.895072944840294e-06, + "loss": 0.1808, + "step": 10284 + }, + { + "epoch": 0.8774099982938065, + "grad_norm": 1.6802050835455227, + "learning_rate": 3.889728814667298e-06, + "loss": 0.143, + "step": 10285 + }, + { + "epoch": 0.8774953079679235, + "grad_norm": 2.0091829484015897, + "learning_rate": 3.8843882047684745e-06, + "loss": 0.1816, + "step": 10286 + }, + { + "epoch": 0.8775806176420407, + "grad_norm": 1.477254311906968, + "learning_rate": 3.879051115551557e-06, + "loss": 0.191, + "step": 10287 + }, + { + "epoch": 0.8776659273161577, + "grad_norm": 1.6922251632880836, + "learning_rate": 3.873717547423999e-06, + "loss": 0.2308, + "step": 10288 + }, + { + "epoch": 0.8777512369902747, + "grad_norm": 2.01378932429064, + "learning_rate": 3.868387500792997e-06, + "loss": 0.1574, + "step": 10289 + }, + { + "epoch": 0.8778365466643917, + "grad_norm": 1.719439212289196, + "learning_rate": 3.863060976065452e-06, + "loss": 0.1589, + "step": 10290 + }, + { + "epoch": 0.8779218563385088, + "grad_norm": 1.4409282274181578, + "learning_rate": 3.857737973648051e-06, + "loss": 0.1439, + "step": 10291 + }, + { + "epoch": 0.8780071660126259, + "grad_norm": 1.3002808670930472, + "learning_rate": 3.852418493947135e-06, + "loss": 0.1324, + "step": 10292 + }, + { + "epoch": 0.8780924756867429, + "grad_norm": 3.545237158844408, + "learning_rate": 3.847102537368852e-06, + "loss": 0.2491, + "step": 10293 + }, + { + "epoch": 0.8781777853608599, + "grad_norm": 1.9846961294739498, + "learning_rate": 3.8417901043190315e-06, + "loss": 0.1313, + "step": 10294 + }, + { + "epoch": 0.878263095034977, + "grad_norm": 1.574659489350162, + "learning_rate": 3.836481195203251e-06, + "loss": 0.1841, + "step": 10295 + }, + { + "epoch": 0.878348404709094, + "grad_norm": 1.9559365749646445, + "learning_rate": 3.831175810426823e-06, + "loss": 0.2027, + "step": 10296 + }, + { + "epoch": 0.878433714383211, + "grad_norm": 1.584596941148318, + "learning_rate": 3.825873950394776e-06, + "loss": 0.2084, + "step": 10297 + }, + { + "epoch": 0.8785190240573281, + "grad_norm": 1.612175735871409, + "learning_rate": 3.82057561551189e-06, + "loss": 0.1516, + "step": 10298 + }, + { + "epoch": 0.8786043337314452, + "grad_norm": 1.4840574225963201, + "learning_rate": 3.815280806182647e-06, + "loss": 0.1395, + "step": 10299 + }, + { + "epoch": 0.8786896434055622, + "grad_norm": 1.9596788234698068, + "learning_rate": 3.8099895228113024e-06, + "loss": 0.1691, + "step": 10300 + }, + { + "epoch": 0.8787749530796792, + "grad_norm": 1.8797313757997485, + "learning_rate": 3.8047017658017935e-06, + "loss": 0.1634, + "step": 10301 + }, + { + "epoch": 0.8788602627537963, + "grad_norm": 2.1472664155589096, + "learning_rate": 3.7994175355578256e-06, + "loss": 0.2669, + "step": 10302 + }, + { + "epoch": 0.8789455724279134, + "grad_norm": 2.1908045195895958, + "learning_rate": 3.7941368324828253e-06, + "loss": 0.1907, + "step": 10303 + }, + { + "epoch": 0.8790308821020304, + "grad_norm": 1.6544691624377281, + "learning_rate": 3.7888596569799316e-06, + "loss": 0.1724, + "step": 10304 + }, + { + "epoch": 0.8791161917761474, + "grad_norm": 2.063677365037416, + "learning_rate": 3.7835860094520446e-06, + "loss": 0.1693, + "step": 10305 + }, + { + "epoch": 0.8792015014502644, + "grad_norm": 1.8177245247747331, + "learning_rate": 3.7783158903017645e-06, + "loss": 0.2372, + "step": 10306 + }, + { + "epoch": 0.8792868111243815, + "grad_norm": 1.7041315026918542, + "learning_rate": 3.7730492999314583e-06, + "loss": 0.1228, + "step": 10307 + }, + { + "epoch": 0.8793721207984986, + "grad_norm": 2.2660688294397198, + "learning_rate": 3.7677862387431706e-06, + "loss": 0.174, + "step": 10308 + }, + { + "epoch": 0.8794574304726156, + "grad_norm": 1.9877039103000156, + "learning_rate": 3.762526707138736e-06, + "loss": 0.1437, + "step": 10309 + }, + { + "epoch": 0.8795427401467326, + "grad_norm": 1.6444163520163275, + "learning_rate": 3.7572707055196775e-06, + "loss": 0.121, + "step": 10310 + }, + { + "epoch": 0.8796280498208496, + "grad_norm": 1.8718571071374588, + "learning_rate": 3.7520182342872743e-06, + "loss": 0.155, + "step": 10311 + }, + { + "epoch": 0.8797133594949668, + "grad_norm": 1.7217683125587333, + "learning_rate": 3.7467692938425057e-06, + "loss": 0.1894, + "step": 10312 + }, + { + "epoch": 0.8797986691690838, + "grad_norm": 1.6696588677206208, + "learning_rate": 3.741523884586129e-06, + "loss": 0.2008, + "step": 10313 + }, + { + "epoch": 0.8798839788432008, + "grad_norm": 1.6106834647921617, + "learning_rate": 3.7362820069185677e-06, + "loss": 0.1373, + "step": 10314 + }, + { + "epoch": 0.8799692885173178, + "grad_norm": 1.8916950823068257, + "learning_rate": 3.731043661240036e-06, + "loss": 0.1612, + "step": 10315 + }, + { + "epoch": 0.880054598191435, + "grad_norm": 1.453610820979611, + "learning_rate": 3.7258088479504514e-06, + "loss": 0.184, + "step": 10316 + }, + { + "epoch": 0.880139907865552, + "grad_norm": 1.7436259536647574, + "learning_rate": 3.7205775674494624e-06, + "loss": 0.216, + "step": 10317 + }, + { + "epoch": 0.880225217539669, + "grad_norm": 1.5594662014697886, + "learning_rate": 3.715349820136449e-06, + "loss": 0.1428, + "step": 10318 + }, + { + "epoch": 0.880310527213786, + "grad_norm": 1.4579029901514802, + "learning_rate": 3.7101256064105084e-06, + "loss": 0.1465, + "step": 10319 + }, + { + "epoch": 0.8803958368879031, + "grad_norm": 1.5432782212888392, + "learning_rate": 3.7049049266705106e-06, + "loss": 0.1331, + "step": 10320 + }, + { + "epoch": 0.8804811465620201, + "grad_norm": 2.318999461612746, + "learning_rate": 3.699687781314992e-06, + "loss": 0.2231, + "step": 10321 + }, + { + "epoch": 0.8805664562361372, + "grad_norm": 2.156688369197617, + "learning_rate": 3.694474170742279e-06, + "loss": 0.1955, + "step": 10322 + }, + { + "epoch": 0.8806517659102542, + "grad_norm": 1.5594541235288435, + "learning_rate": 3.6892640953503975e-06, + "loss": 0.1438, + "step": 10323 + }, + { + "epoch": 0.8807370755843713, + "grad_norm": 1.357021618568971, + "learning_rate": 3.684057555537113e-06, + "loss": 0.1335, + "step": 10324 + }, + { + "epoch": 0.8808223852584883, + "grad_norm": 1.5984856054799224, + "learning_rate": 3.6788545516999063e-06, + "loss": 0.2127, + "step": 10325 + }, + { + "epoch": 0.8809076949326053, + "grad_norm": 1.3462664971009384, + "learning_rate": 3.6736550842359997e-06, + "loss": 0.2185, + "step": 10326 + }, + { + "epoch": 0.8809930046067224, + "grad_norm": 2.0124368219351068, + "learning_rate": 3.6684591535423586e-06, + "loss": 0.2004, + "step": 10327 + }, + { + "epoch": 0.8810783142808395, + "grad_norm": 1.8550450533924956, + "learning_rate": 3.6632667600156433e-06, + "loss": 0.1903, + "step": 10328 + }, + { + "epoch": 0.8811636239549565, + "grad_norm": 1.7031321394184504, + "learning_rate": 3.658077904052293e-06, + "loss": 0.1844, + "step": 10329 + }, + { + "epoch": 0.8812489336290735, + "grad_norm": 1.593108552987769, + "learning_rate": 3.6528925860484175e-06, + "loss": 0.1344, + "step": 10330 + }, + { + "epoch": 0.8813342433031905, + "grad_norm": 1.5445894252566752, + "learning_rate": 3.647710806399912e-06, + "loss": 0.1389, + "step": 10331 + }, + { + "epoch": 0.8814195529773077, + "grad_norm": 1.7002783266996697, + "learning_rate": 3.6425325655023656e-06, + "loss": 0.2258, + "step": 10332 + }, + { + "epoch": 0.8815048626514247, + "grad_norm": 2.5299750991018066, + "learning_rate": 3.6373578637511283e-06, + "loss": 0.201, + "step": 10333 + }, + { + "epoch": 0.8815901723255417, + "grad_norm": 2.032646051581116, + "learning_rate": 3.6321867015412346e-06, + "loss": 0.1859, + "step": 10334 + }, + { + "epoch": 0.8816754819996587, + "grad_norm": 1.4571434161909083, + "learning_rate": 3.627019079267491e-06, + "loss": 0.1965, + "step": 10335 + }, + { + "epoch": 0.8817607916737759, + "grad_norm": 1.5678080614714118, + "learning_rate": 3.6218549973244197e-06, + "loss": 0.1684, + "step": 10336 + }, + { + "epoch": 0.8818461013478929, + "grad_norm": 1.8088869059117285, + "learning_rate": 3.6166944561062622e-06, + "loss": 0.1543, + "step": 10337 + }, + { + "epoch": 0.8819314110220099, + "grad_norm": 1.302963305351134, + "learning_rate": 3.611537456007008e-06, + "loss": 0.1353, + "step": 10338 + }, + { + "epoch": 0.8820167206961269, + "grad_norm": 1.669234856164764, + "learning_rate": 3.606383997420354e-06, + "loss": 0.1863, + "step": 10339 + }, + { + "epoch": 0.882102030370244, + "grad_norm": 1.5539996288353122, + "learning_rate": 3.6012340807397515e-06, + "loss": 0.1734, + "step": 10340 + }, + { + "epoch": 0.8821873400443611, + "grad_norm": 1.2554638656587667, + "learning_rate": 3.5960877063583532e-06, + "loss": 0.1498, + "step": 10341 + }, + { + "epoch": 0.8822726497184781, + "grad_norm": 1.7944957073646066, + "learning_rate": 3.590944874669089e-06, + "loss": 0.14, + "step": 10342 + }, + { + "epoch": 0.8823579593925951, + "grad_norm": 1.9214493815226525, + "learning_rate": 3.5858055860645445e-06, + "loss": 0.2142, + "step": 10343 + }, + { + "epoch": 0.8824432690667121, + "grad_norm": 2.323316895998253, + "learning_rate": 3.580669840937112e-06, + "loss": 0.2147, + "step": 10344 + }, + { + "epoch": 0.8825285787408292, + "grad_norm": 1.581644496032093, + "learning_rate": 3.575537639678861e-06, + "loss": 0.1463, + "step": 10345 + }, + { + "epoch": 0.8826138884149463, + "grad_norm": 2.8178707976168402, + "learning_rate": 3.570408982681611e-06, + "loss": 0.1961, + "step": 10346 + }, + { + "epoch": 0.8826991980890633, + "grad_norm": 1.6624758324264746, + "learning_rate": 3.565283870336911e-06, + "loss": 0.1551, + "step": 10347 + }, + { + "epoch": 0.8827845077631803, + "grad_norm": 1.279438226476473, + "learning_rate": 3.5601623030360243e-06, + "loss": 0.1671, + "step": 10348 + }, + { + "epoch": 0.8828698174372974, + "grad_norm": 2.0315004634495177, + "learning_rate": 3.5550442811699837e-06, + "loss": 0.1927, + "step": 10349 + }, + { + "epoch": 0.8829551271114144, + "grad_norm": 2.0555381029550865, + "learning_rate": 3.5499298051294873e-06, + "loss": 0.1928, + "step": 10350 + }, + { + "epoch": 0.8830404367855315, + "grad_norm": 1.9607013089010883, + "learning_rate": 3.5448188753050173e-06, + "loss": 0.1419, + "step": 10351 + }, + { + "epoch": 0.8831257464596485, + "grad_norm": 1.613790584086389, + "learning_rate": 3.5397114920867725e-06, + "loss": 0.17, + "step": 10352 + }, + { + "epoch": 0.8832110561337656, + "grad_norm": 1.5673609081413344, + "learning_rate": 3.534607655864658e-06, + "loss": 0.1569, + "step": 10353 + }, + { + "epoch": 0.8832963658078826, + "grad_norm": 1.6461764191307753, + "learning_rate": 3.5295073670283286e-06, + "loss": 0.1614, + "step": 10354 + }, + { + "epoch": 0.8833816754819996, + "grad_norm": 1.8137601220908626, + "learning_rate": 3.5244106259671837e-06, + "loss": 0.1317, + "step": 10355 + }, + { + "epoch": 0.8834669851561167, + "grad_norm": 1.7146228135277093, + "learning_rate": 3.519317433070296e-06, + "loss": 0.1368, + "step": 10356 + }, + { + "epoch": 0.8835522948302338, + "grad_norm": 1.6947584521370849, + "learning_rate": 3.514227788726537e-06, + "loss": 0.2092, + "step": 10357 + }, + { + "epoch": 0.8836376045043508, + "grad_norm": 1.8883619399343514, + "learning_rate": 3.509141693324458e-06, + "loss": 0.1721, + "step": 10358 + }, + { + "epoch": 0.8837229141784678, + "grad_norm": 1.3974294598950927, + "learning_rate": 3.504059147252359e-06, + "loss": 0.1301, + "step": 10359 + }, + { + "epoch": 0.8838082238525848, + "grad_norm": 1.5989335976674441, + "learning_rate": 3.498980150898268e-06, + "loss": 0.1955, + "step": 10360 + }, + { + "epoch": 0.883893533526702, + "grad_norm": 1.7276096534431034, + "learning_rate": 3.4939047046499252e-06, + "loss": 0.1773, + "step": 10361 + }, + { + "epoch": 0.883978843200819, + "grad_norm": 2.0207328247035106, + "learning_rate": 3.488832808894843e-06, + "loss": 0.1371, + "step": 10362 + }, + { + "epoch": 0.884064152874936, + "grad_norm": 2.206280768074563, + "learning_rate": 3.4837644640202003e-06, + "loss": 0.1636, + "step": 10363 + }, + { + "epoch": 0.884149462549053, + "grad_norm": 2.3366709294554626, + "learning_rate": 3.4786996704129604e-06, + "loss": 0.2203, + "step": 10364 + }, + { + "epoch": 0.8842347722231702, + "grad_norm": 1.7056487396925317, + "learning_rate": 3.4736384284597857e-06, + "loss": 0.1236, + "step": 10365 + }, + { + "epoch": 0.8843200818972872, + "grad_norm": 1.983665880271787, + "learning_rate": 3.468580738547078e-06, + "loss": 0.1844, + "step": 10366 + }, + { + "epoch": 0.8844053915714042, + "grad_norm": 1.802121009506313, + "learning_rate": 3.4635266010609624e-06, + "loss": 0.1345, + "step": 10367 + }, + { + "epoch": 0.8844907012455212, + "grad_norm": 1.5185593976012162, + "learning_rate": 3.4584760163872963e-06, + "loss": 0.1484, + "step": 10368 + }, + { + "epoch": 0.8845760109196383, + "grad_norm": 1.6045422630366735, + "learning_rate": 3.453428984911666e-06, + "loss": 0.1897, + "step": 10369 + }, + { + "epoch": 0.8846613205937554, + "grad_norm": 1.5929737070407792, + "learning_rate": 3.448385507019375e-06, + "loss": 0.1364, + "step": 10370 + }, + { + "epoch": 0.8847466302678724, + "grad_norm": 1.7676798922922612, + "learning_rate": 3.443345583095492e-06, + "loss": 0.1749, + "step": 10371 + }, + { + "epoch": 0.8848319399419894, + "grad_norm": 1.8307788854787048, + "learning_rate": 3.4383092135247543e-06, + "loss": 0.1977, + "step": 10372 + }, + { + "epoch": 0.8849172496161065, + "grad_norm": 1.4702003298384807, + "learning_rate": 3.433276398691687e-06, + "loss": 0.1356, + "step": 10373 + }, + { + "epoch": 0.8850025592902235, + "grad_norm": 1.636326271651642, + "learning_rate": 3.428247138980517e-06, + "loss": 0.1785, + "step": 10374 + }, + { + "epoch": 0.8850878689643406, + "grad_norm": 2.1498825484730566, + "learning_rate": 3.4232214347751924e-06, + "loss": 0.2169, + "step": 10375 + }, + { + "epoch": 0.8851731786384576, + "grad_norm": 1.4770534445998715, + "learning_rate": 3.418199286459395e-06, + "loss": 0.1815, + "step": 10376 + }, + { + "epoch": 0.8852584883125747, + "grad_norm": 1.7369866854739895, + "learning_rate": 3.413180694416551e-06, + "loss": 0.1394, + "step": 10377 + }, + { + "epoch": 0.8853437979866917, + "grad_norm": 2.3933032622436126, + "learning_rate": 3.408165659029805e-06, + "loss": 0.1479, + "step": 10378 + }, + { + "epoch": 0.8854291076608087, + "grad_norm": 1.8850846252873181, + "learning_rate": 3.4031541806820166e-06, + "loss": 0.2194, + "step": 10379 + }, + { + "epoch": 0.8855144173349258, + "grad_norm": 1.6098044850121427, + "learning_rate": 3.398146259755797e-06, + "loss": 0.1396, + "step": 10380 + }, + { + "epoch": 0.8855997270090429, + "grad_norm": 1.505903705978469, + "learning_rate": 3.3931418966334673e-06, + "loss": 0.1991, + "step": 10381 + }, + { + "epoch": 0.8856850366831599, + "grad_norm": 1.6841271212749986, + "learning_rate": 3.388141091697078e-06, + "loss": 0.1357, + "step": 10382 + }, + { + "epoch": 0.8857703463572769, + "grad_norm": 1.947650898093266, + "learning_rate": 3.383143845328424e-06, + "loss": 0.1903, + "step": 10383 + }, + { + "epoch": 0.8858556560313939, + "grad_norm": 2.213557009176302, + "learning_rate": 3.3781501579090214e-06, + "loss": 0.1733, + "step": 10384 + }, + { + "epoch": 0.885940965705511, + "grad_norm": 1.7506042527178405, + "learning_rate": 3.3731600298200993e-06, + "loss": 0.1926, + "step": 10385 + }, + { + "epoch": 0.8860262753796281, + "grad_norm": 1.9725083097372293, + "learning_rate": 3.3681734614426365e-06, + "loss": 0.1752, + "step": 10386 + }, + { + "epoch": 0.8861115850537451, + "grad_norm": 1.4916817172408252, + "learning_rate": 3.3631904531573277e-06, + "loss": 0.1992, + "step": 10387 + }, + { + "epoch": 0.8861968947278621, + "grad_norm": 1.5711080329507556, + "learning_rate": 3.3582110053446025e-06, + "loss": 0.1979, + "step": 10388 + }, + { + "epoch": 0.8862822044019791, + "grad_norm": 2.255295138784479, + "learning_rate": 3.3532351183846123e-06, + "loss": 0.194, + "step": 10389 + }, + { + "epoch": 0.8863675140760963, + "grad_norm": 1.6453337172601046, + "learning_rate": 3.348262792657242e-06, + "loss": 0.1731, + "step": 10390 + }, + { + "epoch": 0.8864528237502133, + "grad_norm": 2.1783561332149604, + "learning_rate": 3.3432940285420987e-06, + "loss": 0.1712, + "step": 10391 + }, + { + "epoch": 0.8865381334243303, + "grad_norm": 1.8321011332703545, + "learning_rate": 3.338328826418513e-06, + "loss": 0.1881, + "step": 10392 + }, + { + "epoch": 0.8866234430984473, + "grad_norm": 1.8629105237112833, + "learning_rate": 3.333367186665576e-06, + "loss": 0.1727, + "step": 10393 + }, + { + "epoch": 0.8867087527725644, + "grad_norm": 2.097467794068782, + "learning_rate": 3.328409109662062e-06, + "loss": 0.2058, + "step": 10394 + }, + { + "epoch": 0.8867940624466815, + "grad_norm": 2.2305662330930334, + "learning_rate": 3.3234545957865016e-06, + "loss": 0.2393, + "step": 10395 + }, + { + "epoch": 0.8868793721207985, + "grad_norm": 1.4802532592799404, + "learning_rate": 3.318503645417137e-06, + "loss": 0.1532, + "step": 10396 + }, + { + "epoch": 0.8869646817949155, + "grad_norm": 1.5075999688298036, + "learning_rate": 3.3135562589319656e-06, + "loss": 0.2153, + "step": 10397 + }, + { + "epoch": 0.8870499914690326, + "grad_norm": 1.441664427092178, + "learning_rate": 3.308612436708669e-06, + "loss": 0.2076, + "step": 10398 + }, + { + "epoch": 0.8871353011431496, + "grad_norm": 2.6621923062903226, + "learning_rate": 3.303672179124706e-06, + "loss": 0.1182, + "step": 10399 + }, + { + "epoch": 0.8872206108172667, + "grad_norm": 1.3766056136308764, + "learning_rate": 3.298735486557225e-06, + "loss": 0.131, + "step": 10400 + }, + { + "epoch": 0.8873059204913837, + "grad_norm": 1.9289936254464242, + "learning_rate": 3.2938023593831193e-06, + "loss": 0.1715, + "step": 10401 + }, + { + "epoch": 0.8873912301655008, + "grad_norm": 1.8067566210650652, + "learning_rate": 3.288872797979009e-06, + "loss": 0.1628, + "step": 10402 + }, + { + "epoch": 0.8874765398396178, + "grad_norm": 1.4920996195694303, + "learning_rate": 3.2839468027212326e-06, + "loss": 0.1775, + "step": 10403 + }, + { + "epoch": 0.8875618495137348, + "grad_norm": 1.9199502502393557, + "learning_rate": 3.2790243739858782e-06, + "loss": 0.1657, + "step": 10404 + }, + { + "epoch": 0.8876471591878519, + "grad_norm": 1.6965129487127844, + "learning_rate": 3.274105512148723e-06, + "loss": 0.1983, + "step": 10405 + }, + { + "epoch": 0.887732468861969, + "grad_norm": 1.3344992765136678, + "learning_rate": 3.2691902175853272e-06, + "loss": 0.1199, + "step": 10406 + }, + { + "epoch": 0.887817778536086, + "grad_norm": 2.3836288304530946, + "learning_rate": 3.264278490670919e-06, + "loss": 0.2344, + "step": 10407 + }, + { + "epoch": 0.887903088210203, + "grad_norm": 1.5363928398354796, + "learning_rate": 3.259370331780498e-06, + "loss": 0.1611, + "step": 10408 + }, + { + "epoch": 0.88798839788432, + "grad_norm": 2.2629817934403773, + "learning_rate": 3.2544657412887756e-06, + "loss": 0.2066, + "step": 10409 + }, + { + "epoch": 0.8880737075584372, + "grad_norm": 2.174620066815709, + "learning_rate": 3.249564719570186e-06, + "loss": 0.2184, + "step": 10410 + }, + { + "epoch": 0.8881590172325542, + "grad_norm": 2.1930543410582586, + "learning_rate": 3.244667266998902e-06, + "loss": 0.1742, + "step": 10411 + }, + { + "epoch": 0.8882443269066712, + "grad_norm": 1.494880682480142, + "learning_rate": 3.239773383948802e-06, + "loss": 0.1491, + "step": 10412 + }, + { + "epoch": 0.8883296365807882, + "grad_norm": 2.7863988471886096, + "learning_rate": 3.2348830707935427e-06, + "loss": 0.1537, + "step": 10413 + }, + { + "epoch": 0.8884149462549054, + "grad_norm": 2.173145828037546, + "learning_rate": 3.2299963279064315e-06, + "loss": 0.2027, + "step": 10414 + }, + { + "epoch": 0.8885002559290224, + "grad_norm": 1.712890625, + "learning_rate": 3.2251131556605695e-06, + "loss": 0.1275, + "step": 10415 + }, + { + "epoch": 0.8885855656031394, + "grad_norm": 1.5249759046808502, + "learning_rate": 3.2202335544287643e-06, + "loss": 0.2008, + "step": 10416 + }, + { + "epoch": 0.8886708752772564, + "grad_norm": 2.4497182723014213, + "learning_rate": 3.2153575245835344e-06, + "loss": 0.1602, + "step": 10417 + }, + { + "epoch": 0.8887561849513735, + "grad_norm": 1.2289014249926387, + "learning_rate": 3.2104850664971374e-06, + "loss": 0.1659, + "step": 10418 + }, + { + "epoch": 0.8888414946254906, + "grad_norm": 2.2578472411970876, + "learning_rate": 3.205616180541582e-06, + "loss": 0.1983, + "step": 10419 + }, + { + "epoch": 0.8889268042996076, + "grad_norm": 1.7062933612386042, + "learning_rate": 3.200750867088553e-06, + "loss": 0.2211, + "step": 10420 + }, + { + "epoch": 0.8890121139737246, + "grad_norm": 1.7283830667303204, + "learning_rate": 3.19588912650951e-06, + "loss": 0.121, + "step": 10421 + }, + { + "epoch": 0.8890974236478416, + "grad_norm": 1.2392894118588764, + "learning_rate": 3.1910309591756172e-06, + "loss": 0.1252, + "step": 10422 + }, + { + "epoch": 0.8891827333219587, + "grad_norm": 2.1329560283951903, + "learning_rate": 3.186176365457766e-06, + "loss": 0.2277, + "step": 10423 + }, + { + "epoch": 0.8892680429960758, + "grad_norm": 1.9449561225433274, + "learning_rate": 3.181325345726582e-06, + "loss": 0.1254, + "step": 10424 + }, + { + "epoch": 0.8893533526701928, + "grad_norm": 2.7001139016321347, + "learning_rate": 3.1764779003524037e-06, + "loss": 0.2355, + "step": 10425 + }, + { + "epoch": 0.8894386623443098, + "grad_norm": 2.2641218789727326, + "learning_rate": 3.1716340297053336e-06, + "loss": 0.1545, + "step": 10426 + }, + { + "epoch": 0.8895239720184269, + "grad_norm": 1.3878511345374773, + "learning_rate": 3.166793734155149e-06, + "loss": 0.1492, + "step": 10427 + }, + { + "epoch": 0.8896092816925439, + "grad_norm": 1.595795310076257, + "learning_rate": 3.1619570140713927e-06, + "loss": 0.1862, + "step": 10428 + }, + { + "epoch": 0.889694591366661, + "grad_norm": 2.1022785767794874, + "learning_rate": 3.1571238698233252e-06, + "loss": 0.2421, + "step": 10429 + }, + { + "epoch": 0.889779901040778, + "grad_norm": 1.4832815057651223, + "learning_rate": 3.1522943017799232e-06, + "loss": 0.1296, + "step": 10430 + }, + { + "epoch": 0.8898652107148951, + "grad_norm": 1.5892275054093372, + "learning_rate": 3.147468310309909e-06, + "loss": 0.1519, + "step": 10431 + }, + { + "epoch": 0.8899505203890121, + "grad_norm": 1.5301734098011572, + "learning_rate": 3.142645895781715e-06, + "loss": 0.1733, + "step": 10432 + }, + { + "epoch": 0.8900358300631291, + "grad_norm": 1.8567424056849473, + "learning_rate": 3.1378270585635026e-06, + "loss": 0.2573, + "step": 10433 + }, + { + "epoch": 0.8901211397372462, + "grad_norm": 2.36287384304722, + "learning_rate": 3.1330117990231613e-06, + "loss": 0.1673, + "step": 10434 + }, + { + "epoch": 0.8902064494113633, + "grad_norm": 1.63332413683787, + "learning_rate": 3.128200117528335e-06, + "loss": 0.2335, + "step": 10435 + }, + { + "epoch": 0.8902917590854803, + "grad_norm": 1.7922524078774398, + "learning_rate": 3.1233920144463415e-06, + "loss": 0.1968, + "step": 10436 + }, + { + "epoch": 0.8903770687595973, + "grad_norm": 1.749378570847378, + "learning_rate": 3.1185874901442703e-06, + "loss": 0.2045, + "step": 10437 + }, + { + "epoch": 0.8904623784337143, + "grad_norm": 1.660779553257392, + "learning_rate": 3.113786544988906e-06, + "loss": 0.1931, + "step": 10438 + }, + { + "epoch": 0.8905476881078315, + "grad_norm": 2.347318450439965, + "learning_rate": 3.108989179346805e-06, + "loss": 0.2725, + "step": 10439 + }, + { + "epoch": 0.8906329977819485, + "grad_norm": 1.6523855217597867, + "learning_rate": 3.104195393584186e-06, + "loss": 0.1356, + "step": 10440 + }, + { + "epoch": 0.8907183074560655, + "grad_norm": 2.0533095696341603, + "learning_rate": 3.0994051880670504e-06, + "loss": 0.1998, + "step": 10441 + }, + { + "epoch": 0.8908036171301825, + "grad_norm": 1.6705343985830896, + "learning_rate": 3.0946185631611002e-06, + "loss": 0.168, + "step": 10442 + }, + { + "epoch": 0.8908889268042997, + "grad_norm": 1.4164267318094759, + "learning_rate": 3.089835519231771e-06, + "loss": 0.1416, + "step": 10443 + }, + { + "epoch": 0.8909742364784167, + "grad_norm": 1.7399030684733865, + "learning_rate": 3.0850560566442145e-06, + "loss": 0.1613, + "step": 10444 + }, + { + "epoch": 0.8910595461525337, + "grad_norm": 1.3733091795673766, + "learning_rate": 3.080280175763328e-06, + "loss": 0.1485, + "step": 10445 + }, + { + "epoch": 0.8911448558266507, + "grad_norm": 1.6664922464020393, + "learning_rate": 3.075507876953715e-06, + "loss": 0.1337, + "step": 10446 + }, + { + "epoch": 0.8912301655007678, + "grad_norm": 2.4467379281929262, + "learning_rate": 3.070739160579711e-06, + "loss": 0.2178, + "step": 10447 + }, + { + "epoch": 0.8913154751748849, + "grad_norm": 1.6329643507440024, + "learning_rate": 3.065974027005408e-06, + "loss": 0.1941, + "step": 10448 + }, + { + "epoch": 0.8914007848490019, + "grad_norm": 1.5735763867325292, + "learning_rate": 3.0612124765945603e-06, + "loss": 0.1721, + "step": 10449 + }, + { + "epoch": 0.8914860945231189, + "grad_norm": 1.6134469822322912, + "learning_rate": 3.056454509710721e-06, + "loss": 0.1773, + "step": 10450 + }, + { + "epoch": 0.891571404197236, + "grad_norm": 1.5323527510876433, + "learning_rate": 3.051700126717122e-06, + "loss": 0.1346, + "step": 10451 + }, + { + "epoch": 0.891656713871353, + "grad_norm": 1.5041089998781352, + "learning_rate": 3.0469493279767335e-06, + "loss": 0.1654, + "step": 10452 + }, + { + "epoch": 0.89174202354547, + "grad_norm": 1.503623083629841, + "learning_rate": 3.042202113852255e-06, + "loss": 0.1614, + "step": 10453 + }, + { + "epoch": 0.8918273332195871, + "grad_norm": 1.576719507994643, + "learning_rate": 3.037458484706102e-06, + "loss": 0.1551, + "step": 10454 + }, + { + "epoch": 0.8919126428937042, + "grad_norm": 1.303282569488527, + "learning_rate": 3.032718440900456e-06, + "loss": 0.1238, + "step": 10455 + }, + { + "epoch": 0.8919979525678212, + "grad_norm": 2.0062909131022915, + "learning_rate": 3.0279819827971513e-06, + "loss": 0.1473, + "step": 10456 + }, + { + "epoch": 0.8920832622419382, + "grad_norm": 2.214527679442634, + "learning_rate": 3.0232491107578253e-06, + "loss": 0.2257, + "step": 10457 + }, + { + "epoch": 0.8921685719160553, + "grad_norm": 1.868266284816012, + "learning_rate": 3.018519825143795e-06, + "loss": 0.2214, + "step": 10458 + }, + { + "epoch": 0.8922538815901723, + "grad_norm": 2.0095586049156604, + "learning_rate": 3.0137941263161164e-06, + "loss": 0.1437, + "step": 10459 + }, + { + "epoch": 0.8923391912642894, + "grad_norm": 1.6162697957088574, + "learning_rate": 3.0090720146355666e-06, + "loss": 0.1799, + "step": 10460 + }, + { + "epoch": 0.8924245009384064, + "grad_norm": 1.679664931034787, + "learning_rate": 3.004353490462669e-06, + "loss": 0.2663, + "step": 10461 + }, + { + "epoch": 0.8925098106125234, + "grad_norm": 2.430323870076652, + "learning_rate": 2.9996385541576353e-06, + "loss": 0.2399, + "step": 10462 + }, + { + "epoch": 0.8925951202866405, + "grad_norm": 1.9673142420364764, + "learning_rate": 2.9949272060804445e-06, + "loss": 0.1909, + "step": 10463 + }, + { + "epoch": 0.8926804299607576, + "grad_norm": 1.4730415143846374, + "learning_rate": 2.9902194465907807e-06, + "loss": 0.1342, + "step": 10464 + }, + { + "epoch": 0.8927657396348746, + "grad_norm": 2.0161896144009526, + "learning_rate": 2.985515276048051e-06, + "loss": 0.1629, + "step": 10465 + }, + { + "epoch": 0.8928510493089916, + "grad_norm": 2.244352246059992, + "learning_rate": 2.9808146948113958e-06, + "loss": 0.217, + "step": 10466 + }, + { + "epoch": 0.8929363589831086, + "grad_norm": 1.6065947581986981, + "learning_rate": 2.976117703239667e-06, + "loss": 0.1448, + "step": 10467 + }, + { + "epoch": 0.8930216686572258, + "grad_norm": 2.01882302355894, + "learning_rate": 2.9714243016914834e-06, + "loss": 0.19, + "step": 10468 + }, + { + "epoch": 0.8931069783313428, + "grad_norm": 2.111786093080852, + "learning_rate": 2.9667344905251302e-06, + "loss": 0.1908, + "step": 10469 + }, + { + "epoch": 0.8931922880054598, + "grad_norm": 1.516348440020543, + "learning_rate": 2.9620482700986774e-06, + "loss": 0.1488, + "step": 10470 + }, + { + "epoch": 0.8932775976795768, + "grad_norm": 1.6777431856397, + "learning_rate": 2.9573656407698713e-06, + "loss": 0.1425, + "step": 10471 + }, + { + "epoch": 0.893362907353694, + "grad_norm": 1.5583931930115256, + "learning_rate": 2.9526866028962206e-06, + "loss": 0.2141, + "step": 10472 + }, + { + "epoch": 0.893448217027811, + "grad_norm": 1.8180842698545303, + "learning_rate": 2.9480111568349346e-06, + "loss": 0.1734, + "step": 10473 + }, + { + "epoch": 0.893533526701928, + "grad_norm": 2.0939973144222552, + "learning_rate": 2.9433393029429657e-06, + "loss": 0.2387, + "step": 10474 + }, + { + "epoch": 0.893618836376045, + "grad_norm": 1.5273201786963904, + "learning_rate": 2.938671041576979e-06, + "loss": 0.1802, + "step": 10475 + }, + { + "epoch": 0.8937041460501621, + "grad_norm": 1.774565820917315, + "learning_rate": 2.9340063730933675e-06, + "loss": 0.1625, + "step": 10476 + }, + { + "epoch": 0.8937894557242791, + "grad_norm": 1.8775182978605747, + "learning_rate": 2.9293452978482793e-06, + "loss": 0.1703, + "step": 10477 + }, + { + "epoch": 0.8938747653983962, + "grad_norm": 2.0663615980894625, + "learning_rate": 2.9246878161975298e-06, + "loss": 0.1979, + "step": 10478 + }, + { + "epoch": 0.8939600750725132, + "grad_norm": 1.8573085297057736, + "learning_rate": 2.9200339284967127e-06, + "loss": 0.1886, + "step": 10479 + }, + { + "epoch": 0.8940453847466303, + "grad_norm": 1.3854936803001126, + "learning_rate": 2.915383635101121e-06, + "loss": 0.1065, + "step": 10480 + }, + { + "epoch": 0.8941306944207473, + "grad_norm": 2.02107743299789, + "learning_rate": 2.910736936365782e-06, + "loss": 0.2082, + "step": 10481 + }, + { + "epoch": 0.8942160040948643, + "grad_norm": 2.003807258763536, + "learning_rate": 2.90609383264544e-06, + "loss": 0.2331, + "step": 10482 + }, + { + "epoch": 0.8943013137689814, + "grad_norm": 2.413153319766929, + "learning_rate": 2.9014543242945837e-06, + "loss": 0.174, + "step": 10483 + }, + { + "epoch": 0.8943866234430985, + "grad_norm": 1.3785938633124581, + "learning_rate": 2.896818411667407e-06, + "loss": 0.1646, + "step": 10484 + }, + { + "epoch": 0.8944719331172155, + "grad_norm": 1.38346692133472, + "learning_rate": 2.8921860951178435e-06, + "loss": 0.13, + "step": 10485 + }, + { + "epoch": 0.8945572427913325, + "grad_norm": 2.854448921744894, + "learning_rate": 2.8875573749995335e-06, + "loss": 0.1676, + "step": 10486 + }, + { + "epoch": 0.8946425524654495, + "grad_norm": 1.7130211809745042, + "learning_rate": 2.882932251665871e-06, + "loss": 0.1559, + "step": 10487 + }, + { + "epoch": 0.8947278621395667, + "grad_norm": 2.032023825333057, + "learning_rate": 2.878310725469946e-06, + "loss": 0.1459, + "step": 10488 + }, + { + "epoch": 0.8948131718136837, + "grad_norm": 1.5012232243344112, + "learning_rate": 2.873692796764582e-06, + "loss": 0.1588, + "step": 10489 + }, + { + "epoch": 0.8948984814878007, + "grad_norm": 1.870215032423232, + "learning_rate": 2.869078465902364e-06, + "loss": 0.2501, + "step": 10490 + }, + { + "epoch": 0.8949837911619177, + "grad_norm": 2.1096939410932216, + "learning_rate": 2.8644677332355374e-06, + "loss": 0.2145, + "step": 10491 + }, + { + "epoch": 0.8950691008360349, + "grad_norm": 1.806758798394396, + "learning_rate": 2.8598605991161264e-06, + "loss": 0.2183, + "step": 10492 + }, + { + "epoch": 0.8951544105101519, + "grad_norm": 2.4077465803367453, + "learning_rate": 2.855257063895861e-06, + "loss": 0.2607, + "step": 10493 + }, + { + "epoch": 0.8952397201842689, + "grad_norm": 1.719185999276056, + "learning_rate": 2.8506571279261874e-06, + "loss": 0.1774, + "step": 10494 + }, + { + "epoch": 0.8953250298583859, + "grad_norm": 2.1273753410064282, + "learning_rate": 2.8460607915582916e-06, + "loss": 0.1504, + "step": 10495 + }, + { + "epoch": 0.895410339532503, + "grad_norm": 2.131442122315891, + "learning_rate": 2.8414680551430762e-06, + "loss": 0.2177, + "step": 10496 + }, + { + "epoch": 0.8954956492066201, + "grad_norm": 1.7430209915901826, + "learning_rate": 2.8368789190311773e-06, + "loss": 0.1876, + "step": 10497 + }, + { + "epoch": 0.8955809588807371, + "grad_norm": 1.950038760362443, + "learning_rate": 2.8322933835729426e-06, + "loss": 0.2045, + "step": 10498 + }, + { + "epoch": 0.8956662685548541, + "grad_norm": 1.6880785338903581, + "learning_rate": 2.8277114491184643e-06, + "loss": 0.1557, + "step": 10499 + }, + { + "epoch": 0.8957515782289711, + "grad_norm": 1.9905600688411276, + "learning_rate": 2.82313311601754e-06, + "loss": 0.1608, + "step": 10500 + }, + { + "epoch": 0.8958368879030882, + "grad_norm": 1.8851094144843785, + "learning_rate": 2.818558384619713e-06, + "loss": 0.168, + "step": 10501 + }, + { + "epoch": 0.8959221975772053, + "grad_norm": 1.785841662087409, + "learning_rate": 2.81398725527422e-06, + "loss": 0.2549, + "step": 10502 + }, + { + "epoch": 0.8960075072513223, + "grad_norm": 2.1300600057755052, + "learning_rate": 2.8094197283300647e-06, + "loss": 0.1734, + "step": 10503 + }, + { + "epoch": 0.8960928169254393, + "grad_norm": 2.183879253577854, + "learning_rate": 2.804855804135931e-06, + "loss": 0.1799, + "step": 10504 + }, + { + "epoch": 0.8961781265995564, + "grad_norm": 1.4967291456292207, + "learning_rate": 2.8002954830402717e-06, + "loss": 0.1479, + "step": 10505 + }, + { + "epoch": 0.8962634362736734, + "grad_norm": 1.7773807731377445, + "learning_rate": 2.7957387653912315e-06, + "loss": 0.2044, + "step": 10506 + }, + { + "epoch": 0.8963487459477905, + "grad_norm": 2.0735830133772555, + "learning_rate": 2.791185651536693e-06, + "loss": 0.1594, + "step": 10507 + }, + { + "epoch": 0.8964340556219075, + "grad_norm": 2.1115970916647084, + "learning_rate": 2.7866361418242616e-06, + "loss": 0.1753, + "step": 10508 + }, + { + "epoch": 0.8965193652960246, + "grad_norm": 1.6617454193084238, + "learning_rate": 2.7820902366012703e-06, + "loss": 0.1841, + "step": 10509 + }, + { + "epoch": 0.8966046749701416, + "grad_norm": 1.6799834234691435, + "learning_rate": 2.777547936214775e-06, + "loss": 0.2259, + "step": 10510 + }, + { + "epoch": 0.8966899846442586, + "grad_norm": 2.1432096373053375, + "learning_rate": 2.7730092410115484e-06, + "loss": 0.1473, + "step": 10511 + }, + { + "epoch": 0.8967752943183757, + "grad_norm": 1.7226980468108275, + "learning_rate": 2.7684741513381074e-06, + "loss": 0.1778, + "step": 10512 + }, + { + "epoch": 0.8968606039924928, + "grad_norm": 2.6704035940419235, + "learning_rate": 2.7639426675406753e-06, + "loss": 0.1506, + "step": 10513 + }, + { + "epoch": 0.8969459136666098, + "grad_norm": 2.264081758215843, + "learning_rate": 2.759414789965209e-06, + "loss": 0.1716, + "step": 10514 + }, + { + "epoch": 0.8970312233407268, + "grad_norm": 1.650777015373695, + "learning_rate": 2.754890518957387e-06, + "loss": 0.1594, + "step": 10515 + }, + { + "epoch": 0.8971165330148438, + "grad_norm": 1.5992930638685905, + "learning_rate": 2.7503698548626167e-06, + "loss": 0.1479, + "step": 10516 + }, + { + "epoch": 0.897201842688961, + "grad_norm": 1.6958277046928754, + "learning_rate": 2.7458527980260216e-06, + "loss": 0.1898, + "step": 10517 + }, + { + "epoch": 0.897287152363078, + "grad_norm": 2.010331410657524, + "learning_rate": 2.7413393487924543e-06, + "loss": 0.1573, + "step": 10518 + }, + { + "epoch": 0.897372462037195, + "grad_norm": 1.5973883593118763, + "learning_rate": 2.736829507506505e-06, + "loss": 0.1972, + "step": 10519 + }, + { + "epoch": 0.897457771711312, + "grad_norm": 1.7921863585836806, + "learning_rate": 2.732323274512455e-06, + "loss": 0.1946, + "step": 10520 + }, + { + "epoch": 0.8975430813854292, + "grad_norm": 1.293145942647463, + "learning_rate": 2.7278206501543448e-06, + "loss": 0.1596, + "step": 10521 + }, + { + "epoch": 0.8976283910595462, + "grad_norm": 1.9004375455891884, + "learning_rate": 2.7233216347759272e-06, + "loss": 0.1842, + "step": 10522 + }, + { + "epoch": 0.8977137007336632, + "grad_norm": 2.1753882664486004, + "learning_rate": 2.7188262287206776e-06, + "loss": 0.1376, + "step": 10523 + }, + { + "epoch": 0.8977990104077802, + "grad_norm": 1.3421684539118581, + "learning_rate": 2.714334432331783e-06, + "loss": 0.1398, + "step": 10524 + }, + { + "epoch": 0.8978843200818973, + "grad_norm": 1.7704862665884258, + "learning_rate": 2.709846245952191e-06, + "loss": 0.1894, + "step": 10525 + }, + { + "epoch": 0.8979696297560144, + "grad_norm": 1.2460427587713037, + "learning_rate": 2.7053616699245277e-06, + "loss": 0.1712, + "step": 10526 + }, + { + "epoch": 0.8980549394301314, + "grad_norm": 1.741190740668234, + "learning_rate": 2.7008807045911855e-06, + "loss": 0.194, + "step": 10527 + }, + { + "epoch": 0.8981402491042484, + "grad_norm": 1.727555899702432, + "learning_rate": 2.6964033502942523e-06, + "loss": 0.1942, + "step": 10528 + }, + { + "epoch": 0.8982255587783655, + "grad_norm": 1.652489333356403, + "learning_rate": 2.691929607375554e-06, + "loss": 0.1285, + "step": 10529 + }, + { + "epoch": 0.8983108684524825, + "grad_norm": 1.5580467851442994, + "learning_rate": 2.687459476176635e-06, + "loss": 0.1712, + "step": 10530 + }, + { + "epoch": 0.8983961781265996, + "grad_norm": 2.0175027306974287, + "learning_rate": 2.6829929570387545e-06, + "loss": 0.182, + "step": 10531 + }, + { + "epoch": 0.8984814878007166, + "grad_norm": 1.7270804616477047, + "learning_rate": 2.6785300503029407e-06, + "loss": 0.2635, + "step": 10532 + }, + { + "epoch": 0.8985667974748337, + "grad_norm": 1.7850329734257127, + "learning_rate": 2.674070756309871e-06, + "loss": 0.2032, + "step": 10533 + }, + { + "epoch": 0.8986521071489507, + "grad_norm": 1.9727984386603434, + "learning_rate": 2.6696150754000227e-06, + "loss": 0.1815, + "step": 10534 + }, + { + "epoch": 0.8987374168230677, + "grad_norm": 1.6653025130654109, + "learning_rate": 2.665163007913546e-06, + "loss": 0.2374, + "step": 10535 + }, + { + "epoch": 0.8988227264971848, + "grad_norm": 2.0836344946305614, + "learning_rate": 2.6607145541903354e-06, + "loss": 0.1745, + "step": 10536 + }, + { + "epoch": 0.8989080361713018, + "grad_norm": 1.4313238399731516, + "learning_rate": 2.656269714570009e-06, + "loss": 0.1839, + "step": 10537 + }, + { + "epoch": 0.8989933458454189, + "grad_norm": 1.5465227747170442, + "learning_rate": 2.6518284893919108e-06, + "loss": 0.148, + "step": 10538 + }, + { + "epoch": 0.8990786555195359, + "grad_norm": 1.3864058880400834, + "learning_rate": 2.647390878995093e-06, + "loss": 0.162, + "step": 10539 + }, + { + "epoch": 0.8991639651936529, + "grad_norm": 1.7834781965201505, + "learning_rate": 2.642956883718345e-06, + "loss": 0.226, + "step": 10540 + }, + { + "epoch": 0.89924927486777, + "grad_norm": 1.5048748909793133, + "learning_rate": 2.6385265039002015e-06, + "loss": 0.1384, + "step": 10541 + }, + { + "epoch": 0.8993345845418871, + "grad_norm": 1.8364711554027917, + "learning_rate": 2.6340997398788593e-06, + "loss": 0.1838, + "step": 10542 + }, + { + "epoch": 0.8994198942160041, + "grad_norm": 2.4610237227578793, + "learning_rate": 2.629676591992314e-06, + "loss": 0.1965, + "step": 10543 + }, + { + "epoch": 0.8995052038901211, + "grad_norm": 1.5320080807425427, + "learning_rate": 2.6252570605782234e-06, + "loss": 0.1158, + "step": 10544 + }, + { + "epoch": 0.8995905135642381, + "grad_norm": 1.6403734286753582, + "learning_rate": 2.6208411459740235e-06, + "loss": 0.1427, + "step": 10545 + }, + { + "epoch": 0.8996758232383553, + "grad_norm": 1.7684483867444307, + "learning_rate": 2.6164288485168164e-06, + "loss": 0.1793, + "step": 10546 + }, + { + "epoch": 0.8997611329124723, + "grad_norm": 1.4423553732791197, + "learning_rate": 2.6120201685434776e-06, + "loss": 0.1994, + "step": 10547 + }, + { + "epoch": 0.8998464425865893, + "grad_norm": 1.9681565131931975, + "learning_rate": 2.6076151063905764e-06, + "loss": 0.2312, + "step": 10548 + }, + { + "epoch": 0.8999317522607063, + "grad_norm": 2.205974062340875, + "learning_rate": 2.6032136623944214e-06, + "loss": 0.1726, + "step": 10549 + }, + { + "epoch": 0.9000170619348234, + "grad_norm": 1.5435340328734017, + "learning_rate": 2.598815836891033e-06, + "loss": 0.1686, + "step": 10550 + }, + { + "epoch": 0.9001023716089405, + "grad_norm": 1.42687143947777, + "learning_rate": 2.5944216302161704e-06, + "loss": 0.1606, + "step": 10551 + }, + { + "epoch": 0.9001876812830575, + "grad_norm": 1.8744873617646194, + "learning_rate": 2.5900310427053044e-06, + "loss": 0.197, + "step": 10552 + }, + { + "epoch": 0.9002729909571745, + "grad_norm": 1.4120500269826781, + "learning_rate": 2.5856440746936216e-06, + "loss": 0.2018, + "step": 10553 + }, + { + "epoch": 0.9003583006312916, + "grad_norm": 1.501288575781354, + "learning_rate": 2.5812607265160716e-06, + "loss": 0.168, + "step": 10554 + }, + { + "epoch": 0.9004436103054086, + "grad_norm": 1.6156271073052777, + "learning_rate": 2.576880998507264e-06, + "loss": 0.1809, + "step": 10555 + }, + { + "epoch": 0.9005289199795257, + "grad_norm": 2.3627376214876596, + "learning_rate": 2.5725048910015924e-06, + "loss": 0.1671, + "step": 10556 + }, + { + "epoch": 0.9006142296536427, + "grad_norm": 2.0578717407875504, + "learning_rate": 2.5681324043331455e-06, + "loss": 0.1742, + "step": 10557 + }, + { + "epoch": 0.9006995393277598, + "grad_norm": 2.2944761452115436, + "learning_rate": 2.5637635388357395e-06, + "loss": 0.1784, + "step": 10558 + }, + { + "epoch": 0.9007848490018768, + "grad_norm": 1.9985248370618334, + "learning_rate": 2.5593982948429074e-06, + "loss": 0.1971, + "step": 10559 + }, + { + "epoch": 0.9008701586759938, + "grad_norm": 2.2466666643650193, + "learning_rate": 2.5550366726879103e-06, + "loss": 0.1328, + "step": 10560 + }, + { + "epoch": 0.9009554683501109, + "grad_norm": 2.107862820600807, + "learning_rate": 2.5506786727037545e-06, + "loss": 0.2625, + "step": 10561 + }, + { + "epoch": 0.901040778024228, + "grad_norm": 1.486819052106636, + "learning_rate": 2.5463242952231235e-06, + "loss": 0.1645, + "step": 10562 + }, + { + "epoch": 0.901126087698345, + "grad_norm": 1.7392643923584128, + "learning_rate": 2.541973540578474e-06, + "loss": 0.1671, + "step": 10563 + }, + { + "epoch": 0.901211397372462, + "grad_norm": 3.5097895362986264, + "learning_rate": 2.5376264091019506e-06, + "loss": 0.2236, + "step": 10564 + }, + { + "epoch": 0.901296707046579, + "grad_norm": 1.7950209630735612, + "learning_rate": 2.5332829011254334e-06, + "loss": 0.2056, + "step": 10565 + }, + { + "epoch": 0.9013820167206962, + "grad_norm": 1.6449071314143493, + "learning_rate": 2.528943016980523e-06, + "loss": 0.2016, + "step": 10566 + }, + { + "epoch": 0.9014673263948132, + "grad_norm": 2.08566785348244, + "learning_rate": 2.524606756998571e-06, + "loss": 0.1654, + "step": 10567 + }, + { + "epoch": 0.9015526360689302, + "grad_norm": 2.0137263382244344, + "learning_rate": 2.520274121510591e-06, + "loss": 0.1729, + "step": 10568 + }, + { + "epoch": 0.9016379457430472, + "grad_norm": 1.3655144618360504, + "learning_rate": 2.5159451108473843e-06, + "loss": 0.16, + "step": 10569 + }, + { + "epoch": 0.9017232554171644, + "grad_norm": 1.4959645028722306, + "learning_rate": 2.511619725339431e-06, + "loss": 0.1039, + "step": 10570 + }, + { + "epoch": 0.9018085650912814, + "grad_norm": 1.36393285006297, + "learning_rate": 2.507297965316968e-06, + "loss": 0.1386, + "step": 10571 + }, + { + "epoch": 0.9018938747653984, + "grad_norm": 1.7942748787164877, + "learning_rate": 2.502979831109925e-06, + "loss": 0.1384, + "step": 10572 + }, + { + "epoch": 0.9019791844395154, + "grad_norm": 1.73686424534815, + "learning_rate": 2.498665323047966e-06, + "loss": 0.1875, + "step": 10573 + }, + { + "epoch": 0.9020644941136324, + "grad_norm": 2.204059051117226, + "learning_rate": 2.4943544414605e-06, + "loss": 0.196, + "step": 10574 + }, + { + "epoch": 0.9021498037877496, + "grad_norm": 1.7937276061251848, + "learning_rate": 2.4900471866766194e-06, + "loss": 0.216, + "step": 10575 + }, + { + "epoch": 0.9022351134618666, + "grad_norm": 2.3830418210603965, + "learning_rate": 2.485743559025172e-06, + "loss": 0.182, + "step": 10576 + }, + { + "epoch": 0.9023204231359836, + "grad_norm": 1.9185712376925055, + "learning_rate": 2.481443558834712e-06, + "loss": 0.1259, + "step": 10577 + }, + { + "epoch": 0.9024057328101006, + "grad_norm": 2.124762016882703, + "learning_rate": 2.477147186433526e-06, + "loss": 0.1483, + "step": 10578 + }, + { + "epoch": 0.9024910424842177, + "grad_norm": 1.9880815866641777, + "learning_rate": 2.4728544421496137e-06, + "loss": 0.1828, + "step": 10579 + }, + { + "epoch": 0.9025763521583348, + "grad_norm": 2.3383634594231664, + "learning_rate": 2.4685653263107067e-06, + "loss": 0.1985, + "step": 10580 + }, + { + "epoch": 0.9026616618324518, + "grad_norm": 1.541455417628065, + "learning_rate": 2.4642798392442547e-06, + "loss": 0.1817, + "step": 10581 + }, + { + "epoch": 0.9027469715065688, + "grad_norm": 2.005275325564332, + "learning_rate": 2.459997981277423e-06, + "loss": 0.1306, + "step": 10582 + }, + { + "epoch": 0.9028322811806859, + "grad_norm": 1.8729805561691883, + "learning_rate": 2.4557197527371344e-06, + "loss": 0.1497, + "step": 10583 + }, + { + "epoch": 0.9029175908548029, + "grad_norm": 1.9235317873369684, + "learning_rate": 2.4514451539499828e-06, + "loss": 0.1401, + "step": 10584 + }, + { + "epoch": 0.90300290052892, + "grad_norm": 1.4631228498192932, + "learning_rate": 2.4471741852423237e-06, + "loss": 0.1457, + "step": 10585 + }, + { + "epoch": 0.903088210203037, + "grad_norm": 1.5457407328108679, + "learning_rate": 2.4429068469402184e-06, + "loss": 0.1969, + "step": 10586 + }, + { + "epoch": 0.9031735198771541, + "grad_norm": 2.2592827784710137, + "learning_rate": 2.438643139369462e-06, + "loss": 0.2441, + "step": 10587 + }, + { + "epoch": 0.9032588295512711, + "grad_norm": 1.5113886194256736, + "learning_rate": 2.4343830628555496e-06, + "loss": 0.1074, + "step": 10588 + }, + { + "epoch": 0.9033441392253881, + "grad_norm": 2.3444981715999997, + "learning_rate": 2.430126617723738e-06, + "loss": 0.139, + "step": 10589 + }, + { + "epoch": 0.9034294488995052, + "grad_norm": 1.7867951664097712, + "learning_rate": 2.4258738042989728e-06, + "loss": 0.1486, + "step": 10590 + }, + { + "epoch": 0.9035147585736223, + "grad_norm": 1.519964867236028, + "learning_rate": 2.4216246229059326e-06, + "loss": 0.1782, + "step": 10591 + }, + { + "epoch": 0.9036000682477393, + "grad_norm": 2.3794073068896964, + "learning_rate": 2.4173790738690195e-06, + "loss": 0.2083, + "step": 10592 + }, + { + "epoch": 0.9036853779218563, + "grad_norm": 2.8158197096417648, + "learning_rate": 2.413137157512363e-06, + "loss": 0.2363, + "step": 10593 + }, + { + "epoch": 0.9037706875959733, + "grad_norm": 1.4726968855778746, + "learning_rate": 2.408898874159804e-06, + "loss": 0.2488, + "step": 10594 + }, + { + "epoch": 0.9038559972700905, + "grad_norm": 1.541473823377075, + "learning_rate": 2.404664224134917e-06, + "loss": 0.1671, + "step": 10595 + }, + { + "epoch": 0.9039413069442075, + "grad_norm": 2.1053748737985862, + "learning_rate": 2.4004332077610047e-06, + "loss": 0.1767, + "step": 10596 + }, + { + "epoch": 0.9040266166183245, + "grad_norm": 1.834124639995287, + "learning_rate": 2.3962058253610587e-06, + "loss": 0.1633, + "step": 10597 + }, + { + "epoch": 0.9041119262924415, + "grad_norm": 2.141628837498739, + "learning_rate": 2.391982077257837e-06, + "loss": 0.169, + "step": 10598 + }, + { + "epoch": 0.9041972359665587, + "grad_norm": 1.751007198952909, + "learning_rate": 2.387761963773799e-06, + "loss": 0.2075, + "step": 10599 + }, + { + "epoch": 0.9042825456406757, + "grad_norm": 1.1925453618856723, + "learning_rate": 2.3835454852311255e-06, + "loss": 0.1589, + "step": 10600 + }, + { + "epoch": 0.9043678553147927, + "grad_norm": 1.5879376956958946, + "learning_rate": 2.3793326419517147e-06, + "loss": 0.1869, + "step": 10601 + }, + { + "epoch": 0.9044531649889097, + "grad_norm": 2.2634726988854657, + "learning_rate": 2.375123434257198e-06, + "loss": 0.2274, + "step": 10602 + }, + { + "epoch": 0.9045384746630268, + "grad_norm": 1.435930639339823, + "learning_rate": 2.370917862468941e-06, + "loss": 0.1486, + "step": 10603 + }, + { + "epoch": 0.9046237843371439, + "grad_norm": 1.9916421181138184, + "learning_rate": 2.366715926907992e-06, + "loss": 0.1433, + "step": 10604 + }, + { + "epoch": 0.9047090940112609, + "grad_norm": 2.134515885580592, + "learning_rate": 2.362517627895167e-06, + "loss": 0.1895, + "step": 10605 + }, + { + "epoch": 0.9047944036853779, + "grad_norm": 2.003473603240591, + "learning_rate": 2.35832296575097e-06, + "loss": 0.1782, + "step": 10606 + }, + { + "epoch": 0.904879713359495, + "grad_norm": 1.7748703063004383, + "learning_rate": 2.354131940795651e-06, + "loss": 0.2, + "step": 10607 + }, + { + "epoch": 0.904965023033612, + "grad_norm": 1.5280859819357275, + "learning_rate": 2.3499445533491646e-06, + "loss": 0.1563, + "step": 10608 + }, + { + "epoch": 0.905050332707729, + "grad_norm": 1.7783134079054934, + "learning_rate": 2.345760803731206e-06, + "loss": 0.1243, + "step": 10609 + }, + { + "epoch": 0.9051356423818461, + "grad_norm": 2.333620689499675, + "learning_rate": 2.3415806922611695e-06, + "loss": 0.184, + "step": 10610 + }, + { + "epoch": 0.9052209520559632, + "grad_norm": 2.2649902506906905, + "learning_rate": 2.3374042192581934e-06, + "loss": 0.1544, + "step": 10611 + }, + { + "epoch": 0.9053062617300802, + "grad_norm": 1.6221706260607487, + "learning_rate": 2.3332313850411236e-06, + "loss": 0.1179, + "step": 10612 + }, + { + "epoch": 0.9053915714041972, + "grad_norm": 1.6463911181700592, + "learning_rate": 2.3290621899285436e-06, + "loss": 0.1725, + "step": 10613 + }, + { + "epoch": 0.9054768810783143, + "grad_norm": 1.4939875904573885, + "learning_rate": 2.3248966342387378e-06, + "loss": 0.1309, + "step": 10614 + }, + { + "epoch": 0.9055621907524313, + "grad_norm": 2.0156234622919884, + "learning_rate": 2.3207347182897298e-06, + "loss": 0.1557, + "step": 10615 + }, + { + "epoch": 0.9056475004265484, + "grad_norm": 2.041257882228567, + "learning_rate": 2.3165764423992543e-06, + "loss": 0.1745, + "step": 10616 + }, + { + "epoch": 0.9057328101006654, + "grad_norm": 1.890602048624249, + "learning_rate": 2.312421806884779e-06, + "loss": 0.1838, + "step": 10617 + }, + { + "epoch": 0.9058181197747824, + "grad_norm": 1.8740193663647806, + "learning_rate": 2.3082708120634898e-06, + "loss": 0.1402, + "step": 10618 + }, + { + "epoch": 0.9059034294488995, + "grad_norm": 1.769450339515936, + "learning_rate": 2.3041234582522886e-06, + "loss": 0.1726, + "step": 10619 + }, + { + "epoch": 0.9059887391230166, + "grad_norm": 1.9190244546736495, + "learning_rate": 2.299979745767811e-06, + "loss": 0.1836, + "step": 10620 + }, + { + "epoch": 0.9060740487971336, + "grad_norm": 1.5010995014104713, + "learning_rate": 2.2958396749263976e-06, + "loss": 0.1955, + "step": 10621 + }, + { + "epoch": 0.9061593584712506, + "grad_norm": 1.6766105680516468, + "learning_rate": 2.29170324604413e-06, + "loss": 0.1312, + "step": 10622 + }, + { + "epoch": 0.9062446681453676, + "grad_norm": 1.7216488753756956, + "learning_rate": 2.287570459436794e-06, + "loss": 0.2343, + "step": 10623 + }, + { + "epoch": 0.9063299778194848, + "grad_norm": 2.156510599946563, + "learning_rate": 2.283441315419904e-06, + "loss": 0.1969, + "step": 10624 + }, + { + "epoch": 0.9064152874936018, + "grad_norm": 1.569080989617024, + "learning_rate": 2.279315814308719e-06, + "loss": 0.1897, + "step": 10625 + }, + { + "epoch": 0.9065005971677188, + "grad_norm": 1.9775121767076682, + "learning_rate": 2.27519395641817e-06, + "loss": 0.1846, + "step": 10626 + }, + { + "epoch": 0.9065859068418358, + "grad_norm": 1.7200352804868233, + "learning_rate": 2.2710757420629558e-06, + "loss": 0.1901, + "step": 10627 + }, + { + "epoch": 0.906671216515953, + "grad_norm": 2.172458268479211, + "learning_rate": 2.26696117155748e-06, + "loss": 0.159, + "step": 10628 + }, + { + "epoch": 0.90675652619007, + "grad_norm": 1.5962757217856063, + "learning_rate": 2.2628502452158695e-06, + "loss": 0.2112, + "step": 10629 + }, + { + "epoch": 0.906841835864187, + "grad_norm": 1.9475472111788479, + "learning_rate": 2.2587429633519563e-06, + "loss": 0.1676, + "step": 10630 + }, + { + "epoch": 0.906927145538304, + "grad_norm": 2.30079700095415, + "learning_rate": 2.2546393262793397e-06, + "loss": 0.2076, + "step": 10631 + }, + { + "epoch": 0.9070124552124211, + "grad_norm": 1.542957933605284, + "learning_rate": 2.2505393343112745e-06, + "loss": 0.1992, + "step": 10632 + }, + { + "epoch": 0.9070977648865381, + "grad_norm": 1.7101736933069978, + "learning_rate": 2.2464429877607995e-06, + "loss": 0.1709, + "step": 10633 + }, + { + "epoch": 0.9071830745606552, + "grad_norm": 1.4812164431605725, + "learning_rate": 2.2423502869406366e-06, + "loss": 0.1706, + "step": 10634 + }, + { + "epoch": 0.9072683842347722, + "grad_norm": 1.4944605266709414, + "learning_rate": 2.2382612321632468e-06, + "loss": 0.14, + "step": 10635 + }, + { + "epoch": 0.9073536939088893, + "grad_norm": 2.4160556897824956, + "learning_rate": 2.2341758237408085e-06, + "loss": 0.1481, + "step": 10636 + }, + { + "epoch": 0.9074390035830063, + "grad_norm": 1.6358617027233726, + "learning_rate": 2.2300940619852107e-06, + "loss": 0.1494, + "step": 10637 + }, + { + "epoch": 0.9075243132571233, + "grad_norm": 1.6436601447431998, + "learning_rate": 2.2260159472080934e-06, + "loss": 0.1763, + "step": 10638 + }, + { + "epoch": 0.9076096229312404, + "grad_norm": 1.3617695731627453, + "learning_rate": 2.2219414797207794e-06, + "loss": 0.1564, + "step": 10639 + }, + { + "epoch": 0.9076949326053575, + "grad_norm": 2.0816317921209673, + "learning_rate": 2.2178706598343422e-06, + "loss": 0.2183, + "step": 10640 + }, + { + "epoch": 0.9077802422794745, + "grad_norm": 1.5530212521201192, + "learning_rate": 2.2138034878595728e-06, + "loss": 0.1907, + "step": 10641 + }, + { + "epoch": 0.9078655519535915, + "grad_norm": 1.9174525543271477, + "learning_rate": 2.209739964106966e-06, + "loss": 0.1948, + "step": 10642 + }, + { + "epoch": 0.9079508616277085, + "grad_norm": 2.020800429662929, + "learning_rate": 2.205680088886758e-06, + "loss": 0.1439, + "step": 10643 + }, + { + "epoch": 0.9080361713018257, + "grad_norm": 1.3559648556838977, + "learning_rate": 2.2016238625088946e-06, + "loss": 0.1219, + "step": 10644 + }, + { + "epoch": 0.9081214809759427, + "grad_norm": 1.605237883886415, + "learning_rate": 2.197571285283051e-06, + "loss": 0.117, + "step": 10645 + }, + { + "epoch": 0.9082067906500597, + "grad_norm": 1.6341944782640456, + "learning_rate": 2.1935223575186124e-06, + "loss": 0.1487, + "step": 10646 + }, + { + "epoch": 0.9082921003241767, + "grad_norm": 2.5672345036982422, + "learning_rate": 2.1894770795247042e-06, + "loss": 0.2073, + "step": 10647 + }, + { + "epoch": 0.9083774099982939, + "grad_norm": 1.7043104333401644, + "learning_rate": 2.185435451610157e-06, + "loss": 0.2204, + "step": 10648 + }, + { + "epoch": 0.9084627196724109, + "grad_norm": 1.7422377916954648, + "learning_rate": 2.181397474083524e-06, + "loss": 0.176, + "step": 10649 + }, + { + "epoch": 0.9085480293465279, + "grad_norm": 1.4242595756735164, + "learning_rate": 2.1773631472530807e-06, + "loss": 0.1435, + "step": 10650 + }, + { + "epoch": 0.9086333390206449, + "grad_norm": 1.524956361740362, + "learning_rate": 2.1733324714268476e-06, + "loss": 0.1636, + "step": 10651 + }, + { + "epoch": 0.9087186486947619, + "grad_norm": 1.671009802437412, + "learning_rate": 2.1693054469125118e-06, + "loss": 0.2274, + "step": 10652 + }, + { + "epoch": 0.9088039583688791, + "grad_norm": 1.9998764953626764, + "learning_rate": 2.165282074017544e-06, + "loss": 0.0943, + "step": 10653 + }, + { + "epoch": 0.9088892680429961, + "grad_norm": 1.780483415517294, + "learning_rate": 2.161262353049093e-06, + "loss": 0.1885, + "step": 10654 + }, + { + "epoch": 0.9089745777171131, + "grad_norm": 1.8932314474379817, + "learning_rate": 2.157246284314046e-06, + "loss": 0.1285, + "step": 10655 + }, + { + "epoch": 0.9090598873912301, + "grad_norm": 1.5483961236470136, + "learning_rate": 2.1532338681190145e-06, + "loss": 0.1265, + "step": 10656 + }, + { + "epoch": 0.9091451970653472, + "grad_norm": 2.1378643863015045, + "learning_rate": 2.149225104770314e-06, + "loss": 0.1879, + "step": 10657 + }, + { + "epoch": 0.9092305067394643, + "grad_norm": 1.4984707984764731, + "learning_rate": 2.145219994573999e-06, + "loss": 0.1531, + "step": 10658 + }, + { + "epoch": 0.9093158164135813, + "grad_norm": 1.7662753367513413, + "learning_rate": 2.141218537835832e-06, + "loss": 0.1597, + "step": 10659 + }, + { + "epoch": 0.9094011260876983, + "grad_norm": 1.6363176751184438, + "learning_rate": 2.1372207348613225e-06, + "loss": 0.1744, + "step": 10660 + }, + { + "epoch": 0.9094864357618154, + "grad_norm": 2.096061114250162, + "learning_rate": 2.1332265859556556e-06, + "loss": 0.195, + "step": 10661 + }, + { + "epoch": 0.9095717454359324, + "grad_norm": 2.060514245520679, + "learning_rate": 2.1292360914237753e-06, + "loss": 0.2446, + "step": 10662 + }, + { + "epoch": 0.9096570551100495, + "grad_norm": 1.879341186178652, + "learning_rate": 2.1252492515703382e-06, + "loss": 0.2225, + "step": 10663 + }, + { + "epoch": 0.9097423647841665, + "grad_norm": 1.8060722818030202, + "learning_rate": 2.1212660666997177e-06, + "loss": 0.2038, + "step": 10664 + }, + { + "epoch": 0.9098276744582836, + "grad_norm": 1.6979844795524202, + "learning_rate": 2.1172865371160035e-06, + "loss": 0.1426, + "step": 10665 + }, + { + "epoch": 0.9099129841324006, + "grad_norm": 2.335407822178459, + "learning_rate": 2.1133106631230027e-06, + "loss": 0.1753, + "step": 10666 + }, + { + "epoch": 0.9099982938065176, + "grad_norm": 1.814539124059415, + "learning_rate": 2.109338445024284e-06, + "loss": 0.1596, + "step": 10667 + }, + { + "epoch": 0.9100836034806347, + "grad_norm": 1.6829807378037216, + "learning_rate": 2.105369883123065e-06, + "loss": 0.1387, + "step": 10668 + }, + { + "epoch": 0.9101689131547518, + "grad_norm": 1.5308641998689994, + "learning_rate": 2.1014049777223544e-06, + "loss": 0.1839, + "step": 10669 + }, + { + "epoch": 0.9102542228288688, + "grad_norm": 1.9653621770785823, + "learning_rate": 2.097443729124837e-06, + "loss": 0.1402, + "step": 10670 + }, + { + "epoch": 0.9103395325029858, + "grad_norm": 1.4857852059509329, + "learning_rate": 2.0934861376329385e-06, + "loss": 0.1567, + "step": 10671 + }, + { + "epoch": 0.9104248421771028, + "grad_norm": 2.1505728912027084, + "learning_rate": 2.089532203548794e-06, + "loss": 0.2604, + "step": 10672 + }, + { + "epoch": 0.91051015185122, + "grad_norm": 1.5787831057885344, + "learning_rate": 2.0855819271742793e-06, + "loss": 0.1914, + "step": 10673 + }, + { + "epoch": 0.910595461525337, + "grad_norm": 1.9542041086325683, + "learning_rate": 2.0816353088109585e-06, + "loss": 0.1932, + "step": 10674 + }, + { + "epoch": 0.910680771199454, + "grad_norm": 1.947839038410429, + "learning_rate": 2.0776923487601462e-06, + "loss": 0.1488, + "step": 10675 + }, + { + "epoch": 0.910766080873571, + "grad_norm": 2.0295294884425816, + "learning_rate": 2.073753047322868e-06, + "loss": 0.1476, + "step": 10676 + }, + { + "epoch": 0.9108513905476882, + "grad_norm": 1.5530285442649654, + "learning_rate": 2.0698174047998618e-06, + "loss": 0.1685, + "step": 10677 + }, + { + "epoch": 0.9109367002218052, + "grad_norm": 2.3656949918564374, + "learning_rate": 2.065885421491598e-06, + "loss": 0.1956, + "step": 10678 + }, + { + "epoch": 0.9110220098959222, + "grad_norm": 1.7575372777162526, + "learning_rate": 2.061957097698253e-06, + "loss": 0.1769, + "step": 10679 + }, + { + "epoch": 0.9111073195700392, + "grad_norm": 1.7274051870999814, + "learning_rate": 2.058032433719759e-06, + "loss": 0.1491, + "step": 10680 + }, + { + "epoch": 0.9111926292441563, + "grad_norm": 1.6963173837004992, + "learning_rate": 2.0541114298557042e-06, + "loss": 0.2321, + "step": 10681 + }, + { + "epoch": 0.9112779389182734, + "grad_norm": 2.1011059346643237, + "learning_rate": 2.0501940864054715e-06, + "loss": 0.2198, + "step": 10682 + }, + { + "epoch": 0.9113632485923904, + "grad_norm": 2.3515426280046983, + "learning_rate": 2.0462804036681103e-06, + "loss": 0.1815, + "step": 10683 + }, + { + "epoch": 0.9114485582665074, + "grad_norm": 1.7519875547498307, + "learning_rate": 2.042370381942416e-06, + "loss": 0.1954, + "step": 10684 + }, + { + "epoch": 0.9115338679406245, + "grad_norm": 1.7804427074286988, + "learning_rate": 2.038464021526898e-06, + "loss": 0.1084, + "step": 10685 + }, + { + "epoch": 0.9116191776147415, + "grad_norm": 1.923807676564867, + "learning_rate": 2.0345613227197803e-06, + "loss": 0.2011, + "step": 10686 + }, + { + "epoch": 0.9117044872888586, + "grad_norm": 1.7468930320286762, + "learning_rate": 2.030662285819024e-06, + "loss": 0.1435, + "step": 10687 + }, + { + "epoch": 0.9117897969629756, + "grad_norm": 1.4473461792802031, + "learning_rate": 2.02676691112228e-06, + "loss": 0.1631, + "step": 10688 + }, + { + "epoch": 0.9118751066370926, + "grad_norm": 1.513587053445002, + "learning_rate": 2.022875198926971e-06, + "loss": 0.1397, + "step": 10689 + }, + { + "epoch": 0.9119604163112097, + "grad_norm": 2.374851322539729, + "learning_rate": 2.0189871495301714e-06, + "loss": 0.2143, + "step": 10690 + }, + { + "epoch": 0.9120457259853267, + "grad_norm": 2.0736430315662533, + "learning_rate": 2.0151027632287433e-06, + "loss": 0.2073, + "step": 10691 + }, + { + "epoch": 0.9121310356594438, + "grad_norm": 1.553220123509606, + "learning_rate": 2.0112220403192215e-06, + "loss": 0.1772, + "step": 10692 + }, + { + "epoch": 0.9122163453335608, + "grad_norm": 2.0195294791532974, + "learning_rate": 2.0073449810978974e-06, + "loss": 0.2253, + "step": 10693 + }, + { + "epoch": 0.9123016550076779, + "grad_norm": 1.47527664952271, + "learning_rate": 2.003471585860739e-06, + "loss": 0.1076, + "step": 10694 + }, + { + "epoch": 0.9123869646817949, + "grad_norm": 1.7549213963482353, + "learning_rate": 1.9996018549034767e-06, + "loss": 0.1661, + "step": 10695 + }, + { + "epoch": 0.9124722743559119, + "grad_norm": 1.4480270050965947, + "learning_rate": 1.995735788521541e-06, + "loss": 0.2398, + "step": 10696 + }, + { + "epoch": 0.912557584030029, + "grad_norm": 2.0984705759766435, + "learning_rate": 1.9918733870100793e-06, + "loss": 0.1649, + "step": 10697 + }, + { + "epoch": 0.9126428937041461, + "grad_norm": 2.0848079294705313, + "learning_rate": 1.988014650663972e-06, + "loss": 0.1386, + "step": 10698 + }, + { + "epoch": 0.9127282033782631, + "grad_norm": 1.5315458050938555, + "learning_rate": 1.9841595797778113e-06, + "loss": 0.1394, + "step": 10699 + }, + { + "epoch": 0.9128135130523801, + "grad_norm": 2.3200952704369295, + "learning_rate": 1.980308174645912e-06, + "loss": 0.1711, + "step": 10700 + }, + { + "epoch": 0.9128988227264971, + "grad_norm": 1.7737349878257895, + "learning_rate": 1.9764604355622996e-06, + "loss": 0.2024, + "step": 10701 + }, + { + "epoch": 0.9129841324006143, + "grad_norm": 2.142774964073777, + "learning_rate": 1.972616362820745e-06, + "loss": 0.2291, + "step": 10702 + }, + { + "epoch": 0.9130694420747313, + "grad_norm": 2.0847233013726623, + "learning_rate": 1.968775956714708e-06, + "loss": 0.1955, + "step": 10703 + }, + { + "epoch": 0.9131547517488483, + "grad_norm": 2.3809767358533356, + "learning_rate": 1.9649392175373927e-06, + "loss": 0.1621, + "step": 10704 + }, + { + "epoch": 0.9132400614229653, + "grad_norm": 1.6525965287586788, + "learning_rate": 1.961106145581709e-06, + "loss": 0.1944, + "step": 10705 + }, + { + "epoch": 0.9133253710970825, + "grad_norm": 1.473544391787944, + "learning_rate": 1.9572767411402904e-06, + "loss": 0.1394, + "step": 10706 + }, + { + "epoch": 0.9134106807711995, + "grad_norm": 2.4470604447482422, + "learning_rate": 1.9534510045054967e-06, + "loss": 0.1782, + "step": 10707 + }, + { + "epoch": 0.9134959904453165, + "grad_norm": 1.6305588496729286, + "learning_rate": 1.949628935969394e-06, + "loss": 0.2074, + "step": 10708 + }, + { + "epoch": 0.9135813001194335, + "grad_norm": 2.608640099016867, + "learning_rate": 1.9458105358237945e-06, + "loss": 0.2022, + "step": 10709 + }, + { + "epoch": 0.9136666097935506, + "grad_norm": 1.6954660653734737, + "learning_rate": 1.941995804360186e-06, + "loss": 0.2042, + "step": 10710 + }, + { + "epoch": 0.9137519194676677, + "grad_norm": 1.4787831234547544, + "learning_rate": 1.9381847418698253e-06, + "loss": 0.181, + "step": 10711 + }, + { + "epoch": 0.9138372291417847, + "grad_norm": 2.066788463182478, + "learning_rate": 1.934377348643662e-06, + "loss": 0.148, + "step": 10712 + }, + { + "epoch": 0.9139225388159017, + "grad_norm": 1.69206623125424, + "learning_rate": 1.930573624972365e-06, + "loss": 0.159, + "step": 10713 + }, + { + "epoch": 0.9140078484900188, + "grad_norm": 1.6270097628806384, + "learning_rate": 1.9267735711463286e-06, + "loss": 0.1261, + "step": 10714 + }, + { + "epoch": 0.9140931581641358, + "grad_norm": 1.7164692397924188, + "learning_rate": 1.9229771874556766e-06, + "loss": 0.1429, + "step": 10715 + }, + { + "epoch": 0.9141784678382529, + "grad_norm": 1.7795840051581704, + "learning_rate": 1.9191844741902275e-06, + "loss": 0.1507, + "step": 10716 + }, + { + "epoch": 0.9142637775123699, + "grad_norm": 2.1018420178890853, + "learning_rate": 1.915395431639544e-06, + "loss": 0.1002, + "step": 10717 + }, + { + "epoch": 0.914349087186487, + "grad_norm": 2.0304736487757733, + "learning_rate": 1.9116100600929057e-06, + "loss": 0.1642, + "step": 10718 + }, + { + "epoch": 0.914434396860604, + "grad_norm": 1.3891601991569633, + "learning_rate": 1.907828359839292e-06, + "loss": 0.1491, + "step": 10719 + }, + { + "epoch": 0.914519706534721, + "grad_norm": 1.976479087547185, + "learning_rate": 1.9040503311674229e-06, + "loss": 0.1505, + "step": 10720 + }, + { + "epoch": 0.914605016208838, + "grad_norm": 2.801851927172769, + "learning_rate": 1.9002759743657284e-06, + "loss": 0.143, + "step": 10721 + }, + { + "epoch": 0.9146903258829552, + "grad_norm": 1.6429586616093919, + "learning_rate": 1.8965052897223611e-06, + "loss": 0.1624, + "step": 10722 + }, + { + "epoch": 0.9147756355570722, + "grad_norm": 1.6029887477981954, + "learning_rate": 1.8927382775251856e-06, + "loss": 0.2007, + "step": 10723 + }, + { + "epoch": 0.9148609452311892, + "grad_norm": 1.6457267276023015, + "learning_rate": 1.8889749380618105e-06, + "loss": 0.1076, + "step": 10724 + }, + { + "epoch": 0.9149462549053062, + "grad_norm": 1.533646401634941, + "learning_rate": 1.8852152716195336e-06, + "loss": 0.1971, + "step": 10725 + }, + { + "epoch": 0.9150315645794234, + "grad_norm": 1.3626270199984598, + "learning_rate": 1.8814592784853924e-06, + "loss": 0.1017, + "step": 10726 + }, + { + "epoch": 0.9151168742535404, + "grad_norm": 1.5947880823588025, + "learning_rate": 1.8777069589461348e-06, + "loss": 0.176, + "step": 10727 + }, + { + "epoch": 0.9152021839276574, + "grad_norm": 1.9949317132644864, + "learning_rate": 1.8739583132882265e-06, + "loss": 0.1176, + "step": 10728 + }, + { + "epoch": 0.9152874936017744, + "grad_norm": 1.4390167030229286, + "learning_rate": 1.8702133417978607e-06, + "loss": 0.1746, + "step": 10729 + }, + { + "epoch": 0.9153728032758914, + "grad_norm": 1.3755822249490668, + "learning_rate": 1.8664720447609363e-06, + "loss": 0.0834, + "step": 10730 + }, + { + "epoch": 0.9154581129500086, + "grad_norm": 2.053469801099968, + "learning_rate": 1.8627344224631082e-06, + "loss": 0.2016, + "step": 10731 + }, + { + "epoch": 0.9155434226241256, + "grad_norm": 2.556543078794666, + "learning_rate": 1.8590004751896871e-06, + "loss": 0.1976, + "step": 10732 + }, + { + "epoch": 0.9156287322982426, + "grad_norm": 1.5353523109782634, + "learning_rate": 1.8552702032257674e-06, + "loss": 0.1916, + "step": 10733 + }, + { + "epoch": 0.9157140419723596, + "grad_norm": 1.4976990854329155, + "learning_rate": 1.8515436068561265e-06, + "loss": 0.1517, + "step": 10734 + }, + { + "epoch": 0.9157993516464767, + "grad_norm": 1.7322719986518378, + "learning_rate": 1.8478206863652702e-06, + "loss": 0.1955, + "step": 10735 + }, + { + "epoch": 0.9158846613205938, + "grad_norm": 1.7914692378127226, + "learning_rate": 1.8441014420374215e-06, + "loss": 0.1811, + "step": 10736 + }, + { + "epoch": 0.9159699709947108, + "grad_norm": 1.6871769207281637, + "learning_rate": 1.8403858741565306e-06, + "loss": 0.1409, + "step": 10737 + }, + { + "epoch": 0.9160552806688278, + "grad_norm": 1.411577390972688, + "learning_rate": 1.8366739830062597e-06, + "loss": 0.1485, + "step": 10738 + }, + { + "epoch": 0.9161405903429449, + "grad_norm": 1.572034053453062, + "learning_rate": 1.8329657688699875e-06, + "loss": 0.1474, + "step": 10739 + }, + { + "epoch": 0.916225900017062, + "grad_norm": 1.921142314407189, + "learning_rate": 1.8292612320308212e-06, + "loss": 0.1788, + "step": 10740 + }, + { + "epoch": 0.916311209691179, + "grad_norm": 1.8213158326827836, + "learning_rate": 1.8255603727715786e-06, + "loss": 0.195, + "step": 10741 + }, + { + "epoch": 0.916396519365296, + "grad_norm": 1.8081526084398172, + "learning_rate": 1.8218631913748062e-06, + "loss": 0.1492, + "step": 10742 + }, + { + "epoch": 0.9164818290394131, + "grad_norm": 2.4042377728062747, + "learning_rate": 1.8181696881227562e-06, + "loss": 0.1662, + "step": 10743 + }, + { + "epoch": 0.9165671387135301, + "grad_norm": 2.160392872438078, + "learning_rate": 1.8144798632974192e-06, + "loss": 0.1589, + "step": 10744 + }, + { + "epoch": 0.9166524483876471, + "grad_norm": 2.7550392796092655, + "learning_rate": 1.8107937171804812e-06, + "loss": 0.2502, + "step": 10745 + }, + { + "epoch": 0.9167377580617642, + "grad_norm": 2.2498089391332234, + "learning_rate": 1.807111250053367e-06, + "loss": 0.2106, + "step": 10746 + }, + { + "epoch": 0.9168230677358813, + "grad_norm": 1.681628148545471, + "learning_rate": 1.8034324621972132e-06, + "loss": 0.1576, + "step": 10747 + }, + { + "epoch": 0.9169083774099983, + "grad_norm": 1.6906991938812397, + "learning_rate": 1.799757353892878e-06, + "loss": 0.1704, + "step": 10748 + }, + { + "epoch": 0.9169936870841153, + "grad_norm": 1.6785005855974047, + "learning_rate": 1.7960859254209262e-06, + "loss": 0.1503, + "step": 10749 + }, + { + "epoch": 0.9170789967582323, + "grad_norm": 1.6371567169749752, + "learning_rate": 1.792418177061661e-06, + "loss": 0.1718, + "step": 10750 + }, + { + "epoch": 0.9171643064323495, + "grad_norm": 1.6463725820217634, + "learning_rate": 1.7887541090950977e-06, + "loss": 0.1559, + "step": 10751 + }, + { + "epoch": 0.9172496161064665, + "grad_norm": 1.7705487490096479, + "learning_rate": 1.7850937218009567e-06, + "loss": 0.1815, + "step": 10752 + }, + { + "epoch": 0.9173349257805835, + "grad_norm": 1.62557063720572, + "learning_rate": 1.781437015458698e-06, + "loss": 0.1559, + "step": 10753 + }, + { + "epoch": 0.9174202354547005, + "grad_norm": 1.4989316632635503, + "learning_rate": 1.7777839903474924e-06, + "loss": 0.1804, + "step": 10754 + }, + { + "epoch": 0.9175055451288177, + "grad_norm": 1.8028789887951813, + "learning_rate": 1.7741346467462284e-06, + "loss": 0.1731, + "step": 10755 + }, + { + "epoch": 0.9175908548029347, + "grad_norm": 2.0975091694391583, + "learning_rate": 1.770488984933505e-06, + "loss": 0.1358, + "step": 10756 + }, + { + "epoch": 0.9176761644770517, + "grad_norm": 1.670361270317432, + "learning_rate": 1.7668470051876662e-06, + "loss": 0.2243, + "step": 10757 + }, + { + "epoch": 0.9177614741511687, + "grad_norm": 2.356219676503112, + "learning_rate": 1.76320870778674e-06, + "loss": 0.1263, + "step": 10758 + }, + { + "epoch": 0.9178467838252858, + "grad_norm": 2.0815934227223027, + "learning_rate": 1.759574093008498e-06, + "loss": 0.1662, + "step": 10759 + }, + { + "epoch": 0.9179320934994029, + "grad_norm": 2.0755284659047106, + "learning_rate": 1.75594316113043e-06, + "loss": 0.0971, + "step": 10760 + }, + { + "epoch": 0.9180174031735199, + "grad_norm": 1.669979554011195, + "learning_rate": 1.7523159124297306e-06, + "loss": 0.2214, + "step": 10761 + }, + { + "epoch": 0.9181027128476369, + "grad_norm": 2.303101902590995, + "learning_rate": 1.7486923471833284e-06, + "loss": 0.223, + "step": 10762 + }, + { + "epoch": 0.918188022521754, + "grad_norm": 1.7475699855777416, + "learning_rate": 1.7450724656678518e-06, + "loss": 0.1562, + "step": 10763 + }, + { + "epoch": 0.918273332195871, + "grad_norm": 1.6857905736614645, + "learning_rate": 1.741456268159669e-06, + "loss": 0.1626, + "step": 10764 + }, + { + "epoch": 0.9183586418699881, + "grad_norm": 1.9254896705368207, + "learning_rate": 1.737843754934848e-06, + "loss": 0.1809, + "step": 10765 + }, + { + "epoch": 0.9184439515441051, + "grad_norm": 1.855549507391407, + "learning_rate": 1.734234926269207e-06, + "loss": 0.1892, + "step": 10766 + }, + { + "epoch": 0.9185292612182221, + "grad_norm": 1.5277539261006075, + "learning_rate": 1.7306297824382312e-06, + "loss": 0.1467, + "step": 10767 + }, + { + "epoch": 0.9186145708923392, + "grad_norm": 1.4597120398204495, + "learning_rate": 1.7270283237171725e-06, + "loss": 0.1317, + "step": 10768 + }, + { + "epoch": 0.9186998805664562, + "grad_norm": 1.7575174719863123, + "learning_rate": 1.7234305503809778e-06, + "loss": 0.1678, + "step": 10769 + }, + { + "epoch": 0.9187851902405733, + "grad_norm": 1.7534404042528733, + "learning_rate": 1.7198364627043218e-06, + "loss": 0.208, + "step": 10770 + }, + { + "epoch": 0.9188704999146903, + "grad_norm": 1.726541096136066, + "learning_rate": 1.7162460609615905e-06, + "loss": 0.1545, + "step": 10771 + }, + { + "epoch": 0.9189558095888074, + "grad_norm": 1.7893450651286409, + "learning_rate": 1.712659345426887e-06, + "loss": 0.16, + "step": 10772 + }, + { + "epoch": 0.9190411192629244, + "grad_norm": 1.580131237518961, + "learning_rate": 1.7090763163740586e-06, + "loss": 0.1966, + "step": 10773 + }, + { + "epoch": 0.9191264289370414, + "grad_norm": 1.9248450055827788, + "learning_rate": 1.7054969740766203e-06, + "loss": 0.1919, + "step": 10774 + }, + { + "epoch": 0.9192117386111585, + "grad_norm": 1.675771148580059, + "learning_rate": 1.7019213188078587e-06, + "loss": 0.1794, + "step": 10775 + }, + { + "epoch": 0.9192970482852756, + "grad_norm": 1.7487859602005977, + "learning_rate": 1.6983493508407443e-06, + "loss": 0.1946, + "step": 10776 + }, + { + "epoch": 0.9193823579593926, + "grad_norm": 1.807632685285232, + "learning_rate": 1.6947810704479873e-06, + "loss": 0.1999, + "step": 10777 + }, + { + "epoch": 0.9194676676335096, + "grad_norm": 1.4101667324532483, + "learning_rate": 1.6912164779019969e-06, + "loss": 0.1594, + "step": 10778 + }, + { + "epoch": 0.9195529773076266, + "grad_norm": 1.9625372573328839, + "learning_rate": 1.687655573474922e-06, + "loss": 0.2643, + "step": 10779 + }, + { + "epoch": 0.9196382869817438, + "grad_norm": 1.615127652204698, + "learning_rate": 1.6840983574386072e-06, + "loss": 0.1793, + "step": 10780 + }, + { + "epoch": 0.9197235966558608, + "grad_norm": 1.6512485779634138, + "learning_rate": 1.6805448300646342e-06, + "loss": 0.1234, + "step": 10781 + }, + { + "epoch": 0.9198089063299778, + "grad_norm": 1.8935796176317876, + "learning_rate": 1.6769949916242977e-06, + "loss": 0.0869, + "step": 10782 + }, + { + "epoch": 0.9198942160040948, + "grad_norm": 1.9977741968978304, + "learning_rate": 1.673448842388603e-06, + "loss": 0.1208, + "step": 10783 + }, + { + "epoch": 0.919979525678212, + "grad_norm": 1.81797568496542, + "learning_rate": 1.669906382628278e-06, + "loss": 0.1526, + "step": 10784 + }, + { + "epoch": 0.920064835352329, + "grad_norm": 1.7854260797894188, + "learning_rate": 1.6663676126137784e-06, + "loss": 0.1472, + "step": 10785 + }, + { + "epoch": 0.920150145026446, + "grad_norm": 1.6557633026754441, + "learning_rate": 1.6628325326152717e-06, + "loss": 0.1206, + "step": 10786 + }, + { + "epoch": 0.920235454700563, + "grad_norm": 1.9737629118503683, + "learning_rate": 1.659301142902625e-06, + "loss": 0.1312, + "step": 10787 + }, + { + "epoch": 0.9203207643746801, + "grad_norm": 1.3463167029160639, + "learning_rate": 1.655773443745462e-06, + "loss": 0.1731, + "step": 10788 + }, + { + "epoch": 0.9204060740487972, + "grad_norm": 1.7423670381144762, + "learning_rate": 1.6522494354131002e-06, + "loss": 0.1822, + "step": 10789 + }, + { + "epoch": 0.9204913837229142, + "grad_norm": 1.6948937808713203, + "learning_rate": 1.648729118174569e-06, + "loss": 0.2115, + "step": 10790 + }, + { + "epoch": 0.9205766933970312, + "grad_norm": 1.8283365241376122, + "learning_rate": 1.6452124922986311e-06, + "loss": 0.2144, + "step": 10791 + }, + { + "epoch": 0.9206620030711483, + "grad_norm": 1.8926508114992577, + "learning_rate": 1.6416995580537664e-06, + "loss": 0.2292, + "step": 10792 + }, + { + "epoch": 0.9207473127452653, + "grad_norm": 1.5067936909279465, + "learning_rate": 1.6381903157081602e-06, + "loss": 0.1747, + "step": 10793 + }, + { + "epoch": 0.9208326224193824, + "grad_norm": 2.1131405598457675, + "learning_rate": 1.634684765529726e-06, + "loss": 0.1666, + "step": 10794 + }, + { + "epoch": 0.9209179320934994, + "grad_norm": 2.0184235772460193, + "learning_rate": 1.6311829077861051e-06, + "loss": 0.2508, + "step": 10795 + }, + { + "epoch": 0.9210032417676165, + "grad_norm": 1.7395869805151192, + "learning_rate": 1.6276847427446284e-06, + "loss": 0.1817, + "step": 10796 + }, + { + "epoch": 0.9210885514417335, + "grad_norm": 1.5056440345381223, + "learning_rate": 1.6241902706723766e-06, + "loss": 0.1861, + "step": 10797 + }, + { + "epoch": 0.9211738611158505, + "grad_norm": 1.6882671449170008, + "learning_rate": 1.620699491836125e-06, + "loss": 0.1729, + "step": 10798 + }, + { + "epoch": 0.9212591707899676, + "grad_norm": 1.772351037183415, + "learning_rate": 1.6172124065023886e-06, + "loss": 0.2059, + "step": 10799 + }, + { + "epoch": 0.9213444804640847, + "grad_norm": 1.5139945787562035, + "learning_rate": 1.6137290149373708e-06, + "loss": 0.1824, + "step": 10800 + }, + { + "epoch": 0.9214297901382017, + "grad_norm": 1.5503278016509998, + "learning_rate": 1.6102493174070255e-06, + "loss": 0.1735, + "step": 10801 + }, + { + "epoch": 0.9215150998123187, + "grad_norm": 2.4126242887273452, + "learning_rate": 1.6067733141769958e-06, + "loss": 0.1557, + "step": 10802 + }, + { + "epoch": 0.9216004094864357, + "grad_norm": 1.9817192749321109, + "learning_rate": 1.603301005512664e-06, + "loss": 0.1668, + "step": 10803 + }, + { + "epoch": 0.9216857191605528, + "grad_norm": 1.9829697326412445, + "learning_rate": 1.5998323916791235e-06, + "loss": 0.2066, + "step": 10804 + }, + { + "epoch": 0.9217710288346699, + "grad_norm": 1.4234223382772555, + "learning_rate": 1.5963674729411792e-06, + "loss": 0.1767, + "step": 10805 + }, + { + "epoch": 0.9218563385087869, + "grad_norm": 1.591862926399925, + "learning_rate": 1.592906249563364e-06, + "loss": 0.2041, + "step": 10806 + }, + { + "epoch": 0.9219416481829039, + "grad_norm": 2.4247028208063095, + "learning_rate": 1.5894487218099164e-06, + "loss": 0.229, + "step": 10807 + }, + { + "epoch": 0.9220269578570209, + "grad_norm": 1.6915395917597444, + "learning_rate": 1.5859948899448141e-06, + "loss": 0.2217, + "step": 10808 + }, + { + "epoch": 0.9221122675311381, + "grad_norm": 1.9997308669206855, + "learning_rate": 1.5825447542317184e-06, + "loss": 0.2123, + "step": 10809 + }, + { + "epoch": 0.9221975772052551, + "grad_norm": 1.5526281164506728, + "learning_rate": 1.5790983149340466e-06, + "loss": 0.2082, + "step": 10810 + }, + { + "epoch": 0.9222828868793721, + "grad_norm": 1.8623332364995133, + "learning_rate": 1.5756555723149103e-06, + "loss": 0.22, + "step": 10811 + }, + { + "epoch": 0.9223681965534891, + "grad_norm": 1.4398725253213962, + "learning_rate": 1.5722165266371492e-06, + "loss": 0.1096, + "step": 10812 + }, + { + "epoch": 0.9224535062276062, + "grad_norm": 1.6312471031660176, + "learning_rate": 1.5687811781633033e-06, + "loss": 0.2266, + "step": 10813 + }, + { + "epoch": 0.9225388159017233, + "grad_norm": 1.735174252667014, + "learning_rate": 1.5653495271556517e-06, + "loss": 0.1875, + "step": 10814 + }, + { + "epoch": 0.9226241255758403, + "grad_norm": 1.5548215261992353, + "learning_rate": 1.5619215738761906e-06, + "loss": 0.1925, + "step": 10815 + }, + { + "epoch": 0.9227094352499573, + "grad_norm": 1.632818194657758, + "learning_rate": 1.5584973185866103e-06, + "loss": 0.1744, + "step": 10816 + }, + { + "epoch": 0.9227947449240744, + "grad_norm": 1.875218187988661, + "learning_rate": 1.5550767615483408e-06, + "loss": 0.1429, + "step": 10817 + }, + { + "epoch": 0.9228800545981914, + "grad_norm": 1.7072019164363326, + "learning_rate": 1.5516599030225288e-06, + "loss": 0.1393, + "step": 10818 + }, + { + "epoch": 0.9229653642723085, + "grad_norm": 1.7613803851542276, + "learning_rate": 1.548246743270032e-06, + "loss": 0.1765, + "step": 10819 + }, + { + "epoch": 0.9230506739464255, + "grad_norm": 1.4604845619835345, + "learning_rate": 1.5448372825514146e-06, + "loss": 0.162, + "step": 10820 + }, + { + "epoch": 0.9231359836205426, + "grad_norm": 1.646100453135453, + "learning_rate": 1.5414315211269902e-06, + "loss": 0.1551, + "step": 10821 + }, + { + "epoch": 0.9232212932946596, + "grad_norm": 2.294691124536186, + "learning_rate": 1.5380294592567513e-06, + "loss": 0.226, + "step": 10822 + }, + { + "epoch": 0.9233066029687766, + "grad_norm": 1.5559356126775747, + "learning_rate": 1.5346310972004397e-06, + "loss": 0.1808, + "step": 10823 + }, + { + "epoch": 0.9233919126428937, + "grad_norm": 1.6120590494193126, + "learning_rate": 1.5312364352175034e-06, + "loss": 0.1629, + "step": 10824 + }, + { + "epoch": 0.9234772223170108, + "grad_norm": 1.6397781366213262, + "learning_rate": 1.5278454735670967e-06, + "loss": 0.154, + "step": 10825 + }, + { + "epoch": 0.9235625319911278, + "grad_norm": 1.900204687637475, + "learning_rate": 1.524458212508112e-06, + "loss": 0.2101, + "step": 10826 + }, + { + "epoch": 0.9236478416652448, + "grad_norm": 2.6070675269813166, + "learning_rate": 1.5210746522991425e-06, + "loss": 0.1441, + "step": 10827 + }, + { + "epoch": 0.9237331513393618, + "grad_norm": 1.676942791785876, + "learning_rate": 1.5176947931985097e-06, + "loss": 0.1812, + "step": 10828 + }, + { + "epoch": 0.923818461013479, + "grad_norm": 1.8260634446210045, + "learning_rate": 1.5143186354642346e-06, + "loss": 0.2281, + "step": 10829 + }, + { + "epoch": 0.923903770687596, + "grad_norm": 1.4663191625701362, + "learning_rate": 1.5109461793540891e-06, + "loss": 0.15, + "step": 10830 + }, + { + "epoch": 0.923989080361713, + "grad_norm": 1.627392328336649, + "learning_rate": 1.507577425125528e-06, + "loss": 0.1768, + "step": 10831 + }, + { + "epoch": 0.92407439003583, + "grad_norm": 1.804620452560167, + "learning_rate": 1.504212373035746e-06, + "loss": 0.0719, + "step": 10832 + }, + { + "epoch": 0.9241596997099472, + "grad_norm": 2.091174919208006, + "learning_rate": 1.5008510233416374e-06, + "loss": 0.175, + "step": 10833 + }, + { + "epoch": 0.9242450093840642, + "grad_norm": 1.6755667595878032, + "learning_rate": 1.4974933762998356e-06, + "loss": 0.1389, + "step": 10834 + }, + { + "epoch": 0.9243303190581812, + "grad_norm": 1.9071890597311283, + "learning_rate": 1.494139432166669e-06, + "loss": 0.2107, + "step": 10835 + }, + { + "epoch": 0.9244156287322982, + "grad_norm": 1.842730482739959, + "learning_rate": 1.4907891911981886e-06, + "loss": 0.231, + "step": 10836 + }, + { + "epoch": 0.9245009384064153, + "grad_norm": 1.994873988060641, + "learning_rate": 1.4874426536501895e-06, + "loss": 0.2207, + "step": 10837 + }, + { + "epoch": 0.9245862480805324, + "grad_norm": 1.328288932107668, + "learning_rate": 1.4840998197781397e-06, + "loss": 0.1828, + "step": 10838 + }, + { + "epoch": 0.9246715577546494, + "grad_norm": 1.7749655438154786, + "learning_rate": 1.4807606898372572e-06, + "loss": 0.2159, + "step": 10839 + }, + { + "epoch": 0.9247568674287664, + "grad_norm": 1.4331171974339052, + "learning_rate": 1.477425264082466e-06, + "loss": 0.1376, + "step": 10840 + }, + { + "epoch": 0.9248421771028834, + "grad_norm": 1.6547660657693808, + "learning_rate": 1.4740935427684066e-06, + "loss": 0.2282, + "step": 10841 + }, + { + "epoch": 0.9249274867770005, + "grad_norm": 1.6749363616696935, + "learning_rate": 1.4707655261494368e-06, + "loss": 0.1256, + "step": 10842 + }, + { + "epoch": 0.9250127964511176, + "grad_norm": 1.3920831643342038, + "learning_rate": 1.4674412144796368e-06, + "loss": 0.1551, + "step": 10843 + }, + { + "epoch": 0.9250981061252346, + "grad_norm": 1.6572628163388672, + "learning_rate": 1.4641206080128034e-06, + "loss": 0.1683, + "step": 10844 + }, + { + "epoch": 0.9251834157993516, + "grad_norm": 1.9613674953804079, + "learning_rate": 1.4608037070024338e-06, + "loss": 0.1961, + "step": 10845 + }, + { + "epoch": 0.9252687254734687, + "grad_norm": 1.6979720529833235, + "learning_rate": 1.45749051170177e-06, + "loss": 0.1833, + "step": 10846 + }, + { + "epoch": 0.9253540351475857, + "grad_norm": 1.3814695485859791, + "learning_rate": 1.4541810223637487e-06, + "loss": 0.1018, + "step": 10847 + }, + { + "epoch": 0.9254393448217028, + "grad_norm": 1.8951340178554483, + "learning_rate": 1.4508752392410396e-06, + "loss": 0.2762, + "step": 10848 + }, + { + "epoch": 0.9255246544958198, + "grad_norm": 2.33319798712785, + "learning_rate": 1.4475731625860023e-06, + "loss": 0.2182, + "step": 10849 + }, + { + "epoch": 0.9256099641699369, + "grad_norm": 1.3228468638750206, + "learning_rate": 1.4442747926507626e-06, + "loss": 0.1444, + "step": 10850 + }, + { + "epoch": 0.9256952738440539, + "grad_norm": 1.5818078154091686, + "learning_rate": 1.4409801296871083e-06, + "loss": 0.1977, + "step": 10851 + }, + { + "epoch": 0.9257805835181709, + "grad_norm": 2.075147747616655, + "learning_rate": 1.4376891739465826e-06, + "loss": 0.2232, + "step": 10852 + }, + { + "epoch": 0.925865893192288, + "grad_norm": 1.487931818753642, + "learning_rate": 1.4344019256804286e-06, + "loss": 0.1373, + "step": 10853 + }, + { + "epoch": 0.9259512028664051, + "grad_norm": 1.400193119353318, + "learning_rate": 1.431118385139607e-06, + "loss": 0.1481, + "step": 10854 + }, + { + "epoch": 0.9260365125405221, + "grad_norm": 1.8147434950818517, + "learning_rate": 1.427838552574806e-06, + "loss": 0.1633, + "step": 10855 + }, + { + "epoch": 0.9261218222146391, + "grad_norm": 1.7499049024628666, + "learning_rate": 1.42456242823642e-06, + "loss": 0.1384, + "step": 10856 + }, + { + "epoch": 0.9262071318887561, + "grad_norm": 1.6397292098506915, + "learning_rate": 1.42129001237456e-06, + "loss": 0.2279, + "step": 10857 + }, + { + "epoch": 0.9262924415628733, + "grad_norm": 1.953083312543393, + "learning_rate": 1.4180213052390534e-06, + "loss": 0.2126, + "step": 10858 + }, + { + "epoch": 0.9263777512369903, + "grad_norm": 1.8869381099835265, + "learning_rate": 1.4147563070794623e-06, + "loss": 0.1232, + "step": 10859 + }, + { + "epoch": 0.9264630609111073, + "grad_norm": 1.7109785379861844, + "learning_rate": 1.4114950181450481e-06, + "loss": 0.1687, + "step": 10860 + }, + { + "epoch": 0.9265483705852243, + "grad_norm": 1.367001330098061, + "learning_rate": 1.4082374386847897e-06, + "loss": 0.1174, + "step": 10861 + }, + { + "epoch": 0.9266336802593415, + "grad_norm": 1.704195998233985, + "learning_rate": 1.4049835689473822e-06, + "loss": 0.1252, + "step": 10862 + }, + { + "epoch": 0.9267189899334585, + "grad_norm": 2.422199153741691, + "learning_rate": 1.4017334091812551e-06, + "loss": 0.1401, + "step": 10863 + }, + { + "epoch": 0.9268042996075755, + "grad_norm": 1.2499114005160585, + "learning_rate": 1.3984869596345207e-06, + "loss": 0.1381, + "step": 10864 + }, + { + "epoch": 0.9268896092816925, + "grad_norm": 1.6032987533906153, + "learning_rate": 1.395244220555042e-06, + "loss": 0.1819, + "step": 10865 + }, + { + "epoch": 0.9269749189558096, + "grad_norm": 1.7377845311636622, + "learning_rate": 1.392005192190382e-06, + "loss": 0.2084, + "step": 10866 + }, + { + "epoch": 0.9270602286299267, + "grad_norm": 2.6562392290682357, + "learning_rate": 1.3887698747878263e-06, + "loss": 0.1268, + "step": 10867 + }, + { + "epoch": 0.9271455383040437, + "grad_norm": 1.3762927913682925, + "learning_rate": 1.3855382685943662e-06, + "loss": 0.1901, + "step": 10868 + }, + { + "epoch": 0.9272308479781607, + "grad_norm": 1.7766370772759996, + "learning_rate": 1.3823103738567267e-06, + "loss": 0.2549, + "step": 10869 + }, + { + "epoch": 0.9273161576522778, + "grad_norm": 1.7674209777778493, + "learning_rate": 1.3790861908213326e-06, + "loss": 0.1569, + "step": 10870 + }, + { + "epoch": 0.9274014673263948, + "grad_norm": 1.5713120500911322, + "learning_rate": 1.3758657197343372e-06, + "loss": 0.1733, + "step": 10871 + }, + { + "epoch": 0.9274867770005119, + "grad_norm": 2.3188447259437424, + "learning_rate": 1.3726489608416104e-06, + "loss": 0.1863, + "step": 10872 + }, + { + "epoch": 0.9275720866746289, + "grad_norm": 1.7632476194974926, + "learning_rate": 1.3694359143887225e-06, + "loss": 0.2163, + "step": 10873 + }, + { + "epoch": 0.927657396348746, + "grad_norm": 2.4014642857936455, + "learning_rate": 1.366226580620983e-06, + "loss": 0.2174, + "step": 10874 + }, + { + "epoch": 0.927742706022863, + "grad_norm": 1.7630178732309514, + "learning_rate": 1.3630209597834009e-06, + "loss": 0.1199, + "step": 10875 + }, + { + "epoch": 0.92782801569698, + "grad_norm": 1.7266603329933878, + "learning_rate": 1.35981905212072e-06, + "loss": 0.1461, + "step": 10876 + }, + { + "epoch": 0.927913325371097, + "grad_norm": 2.9951827791766346, + "learning_rate": 1.3566208578773775e-06, + "loss": 0.196, + "step": 10877 + }, + { + "epoch": 0.9279986350452142, + "grad_norm": 1.938544822278974, + "learning_rate": 1.3534263772975342e-06, + "loss": 0.1186, + "step": 10878 + }, + { + "epoch": 0.9280839447193312, + "grad_norm": 2.4714823221650635, + "learning_rate": 1.3502356106250947e-06, + "loss": 0.1947, + "step": 10879 + }, + { + "epoch": 0.9281692543934482, + "grad_norm": 2.1676047079628957, + "learning_rate": 1.347048558103625e-06, + "loss": 0.1458, + "step": 10880 + }, + { + "epoch": 0.9282545640675652, + "grad_norm": 1.6507429299914473, + "learning_rate": 1.3438652199764646e-06, + "loss": 0.1885, + "step": 10881 + }, + { + "epoch": 0.9283398737416823, + "grad_norm": 1.8960717299369025, + "learning_rate": 1.3406855964866405e-06, + "loss": 0.1615, + "step": 10882 + }, + { + "epoch": 0.9284251834157994, + "grad_norm": 1.3992309024523741, + "learning_rate": 1.3375096878768923e-06, + "loss": 0.1696, + "step": 10883 + }, + { + "epoch": 0.9285104930899164, + "grad_norm": 2.2359644431697445, + "learning_rate": 1.3343374943896815e-06, + "loss": 0.1895, + "step": 10884 + }, + { + "epoch": 0.9285958027640334, + "grad_norm": 1.5625990264030225, + "learning_rate": 1.331169016267203e-06, + "loss": 0.177, + "step": 10885 + }, + { + "epoch": 0.9286811124381504, + "grad_norm": 1.8161450269992863, + "learning_rate": 1.328004253751336e-06, + "loss": 0.1316, + "step": 10886 + }, + { + "epoch": 0.9287664221122676, + "grad_norm": 1.815578214531744, + "learning_rate": 1.3248432070837035e-06, + "loss": 0.1823, + "step": 10887 + }, + { + "epoch": 0.9288517317863846, + "grad_norm": 2.032868899267458, + "learning_rate": 1.321685876505635e-06, + "loss": 0.2967, + "step": 10888 + }, + { + "epoch": 0.9289370414605016, + "grad_norm": 1.7713598777486754, + "learning_rate": 1.3185322622581764e-06, + "loss": 0.1931, + "step": 10889 + }, + { + "epoch": 0.9290223511346186, + "grad_norm": 1.5294616532495957, + "learning_rate": 1.315382364582085e-06, + "loss": 0.1406, + "step": 10890 + }, + { + "epoch": 0.9291076608087357, + "grad_norm": 1.633560520383277, + "learning_rate": 1.31223618371783e-06, + "loss": 0.156, + "step": 10891 + }, + { + "epoch": 0.9291929704828528, + "grad_norm": 2.650668214725884, + "learning_rate": 1.3090937199056352e-06, + "loss": 0.1781, + "step": 10892 + }, + { + "epoch": 0.9292782801569698, + "grad_norm": 1.838745826285972, + "learning_rate": 1.305954973385376e-06, + "loss": 0.2145, + "step": 10893 + }, + { + "epoch": 0.9293635898310868, + "grad_norm": 1.3175880266638023, + "learning_rate": 1.3028199443966994e-06, + "loss": 0.1231, + "step": 10894 + }, + { + "epoch": 0.9294488995052039, + "grad_norm": 2.4667993879267684, + "learning_rate": 1.2996886331789416e-06, + "loss": 0.3009, + "step": 10895 + }, + { + "epoch": 0.929534209179321, + "grad_norm": 1.548239751327598, + "learning_rate": 1.296561039971167e-06, + "loss": 0.1606, + "step": 10896 + }, + { + "epoch": 0.929619518853438, + "grad_norm": 1.570794026956453, + "learning_rate": 1.2934371650121458e-06, + "loss": 0.1287, + "step": 10897 + }, + { + "epoch": 0.929704828527555, + "grad_norm": 1.5826039725764915, + "learning_rate": 1.290317008540376e-06, + "loss": 0.1615, + "step": 10898 + }, + { + "epoch": 0.9297901382016721, + "grad_norm": 2.1935315493420613, + "learning_rate": 1.2872005707940505e-06, + "loss": 0.1903, + "step": 10899 + }, + { + "epoch": 0.9298754478757891, + "grad_norm": 1.4045095906023548, + "learning_rate": 1.284087852011101e-06, + "loss": 0.1765, + "step": 10900 + }, + { + "epoch": 0.9299607575499061, + "grad_norm": 2.1002582345861716, + "learning_rate": 1.2809788524291821e-06, + "loss": 0.1901, + "step": 10901 + }, + { + "epoch": 0.9300460672240232, + "grad_norm": 1.5302201525303347, + "learning_rate": 1.2778735722856205e-06, + "loss": 0.1915, + "step": 10902 + }, + { + "epoch": 0.9301313768981403, + "grad_norm": 2.220042752034703, + "learning_rate": 1.2747720118175099e-06, + "loss": 0.175, + "step": 10903 + }, + { + "epoch": 0.9302166865722573, + "grad_norm": 1.5939642070355742, + "learning_rate": 1.271674171261622e-06, + "loss": 0.1735, + "step": 10904 + }, + { + "epoch": 0.9303019962463743, + "grad_norm": 1.4593897489990955, + "learning_rate": 1.2685800508544786e-06, + "loss": 0.2101, + "step": 10905 + }, + { + "epoch": 0.9303873059204913, + "grad_norm": 2.706402858443277, + "learning_rate": 1.2654896508322856e-06, + "loss": 0.2584, + "step": 10906 + }, + { + "epoch": 0.9304726155946085, + "grad_norm": 1.8983180632519934, + "learning_rate": 1.2624029714309872e-06, + "loss": 0.1748, + "step": 10907 + }, + { + "epoch": 0.9305579252687255, + "grad_norm": 1.5283261625037798, + "learning_rate": 1.2593200128862225e-06, + "loss": 0.2073, + "step": 10908 + }, + { + "epoch": 0.9306432349428425, + "grad_norm": 2.1735767311678726, + "learning_rate": 1.256240775433376e-06, + "loss": 0.1715, + "step": 10909 + }, + { + "epoch": 0.9307285446169595, + "grad_norm": 1.2212728162838242, + "learning_rate": 1.2531652593075204e-06, + "loss": 0.1259, + "step": 10910 + }, + { + "epoch": 0.9308138542910767, + "grad_norm": 1.7815397595752698, + "learning_rate": 1.2500934647434515e-06, + "loss": 0.186, + "step": 10911 + }, + { + "epoch": 0.9308991639651937, + "grad_norm": 2.1014124026106695, + "learning_rate": 1.247025391975698e-06, + "loss": 0.2447, + "step": 10912 + }, + { + "epoch": 0.9309844736393107, + "grad_norm": 1.788043602401106, + "learning_rate": 1.2439610412384727e-06, + "loss": 0.151, + "step": 10913 + }, + { + "epoch": 0.9310697833134277, + "grad_norm": 1.674008087134111, + "learning_rate": 1.2409004127657441e-06, + "loss": 0.1354, + "step": 10914 + }, + { + "epoch": 0.9311550929875448, + "grad_norm": 1.7924533344190432, + "learning_rate": 1.2378435067911532e-06, + "loss": 0.1491, + "step": 10915 + }, + { + "epoch": 0.9312404026616619, + "grad_norm": 1.8125420927224352, + "learning_rate": 1.2347903235480906e-06, + "loss": 0.199, + "step": 10916 + }, + { + "epoch": 0.9313257123357789, + "grad_norm": 2.1335213290271025, + "learning_rate": 1.2317408632696537e-06, + "loss": 0.1712, + "step": 10917 + }, + { + "epoch": 0.9314110220098959, + "grad_norm": 1.9843945239450769, + "learning_rate": 1.2286951261886447e-06, + "loss": 0.1548, + "step": 10918 + }, + { + "epoch": 0.9314963316840129, + "grad_norm": 1.9672733400264089, + "learning_rate": 1.2256531125375948e-06, + "loss": 0.1997, + "step": 10919 + }, + { + "epoch": 0.93158164135813, + "grad_norm": 1.4236146952679596, + "learning_rate": 1.2226148225487343e-06, + "loss": 0.1497, + "step": 10920 + }, + { + "epoch": 0.9316669510322471, + "grad_norm": 1.551092697885617, + "learning_rate": 1.2195802564540392e-06, + "loss": 0.219, + "step": 10921 + }, + { + "epoch": 0.9317522607063641, + "grad_norm": 2.4543478280922177, + "learning_rate": 1.2165494144851686e-06, + "loss": 0.2411, + "step": 10922 + }, + { + "epoch": 0.9318375703804811, + "grad_norm": 2.5416858443073456, + "learning_rate": 1.213522296873515e-06, + "loss": 0.2321, + "step": 10923 + }, + { + "epoch": 0.9319228800545982, + "grad_norm": 1.48876272508386, + "learning_rate": 1.2104989038501825e-06, + "loss": 0.227, + "step": 10924 + }, + { + "epoch": 0.9320081897287152, + "grad_norm": 2.235784766140921, + "learning_rate": 1.2074792356459973e-06, + "loss": 0.1711, + "step": 10925 + }, + { + "epoch": 0.9320934994028323, + "grad_norm": 2.3079442693636, + "learning_rate": 1.2044632924914812e-06, + "loss": 0.1712, + "step": 10926 + }, + { + "epoch": 0.9321788090769493, + "grad_norm": 1.3028191077086981, + "learning_rate": 1.2014510746168994e-06, + "loss": 0.152, + "step": 10927 + }, + { + "epoch": 0.9322641187510664, + "grad_norm": 1.7474593384559438, + "learning_rate": 1.1984425822522072e-06, + "loss": 0.1901, + "step": 10928 + }, + { + "epoch": 0.9323494284251834, + "grad_norm": 2.4923342479203976, + "learning_rate": 1.1954378156270984e-06, + "loss": 0.2675, + "step": 10929 + }, + { + "epoch": 0.9324347380993004, + "grad_norm": 2.1073487899311583, + "learning_rate": 1.192436774970962e-06, + "loss": 0.1899, + "step": 10930 + }, + { + "epoch": 0.9325200477734175, + "grad_norm": 1.5607013267884438, + "learning_rate": 1.18943946051292e-06, + "loss": 0.1383, + "step": 10931 + }, + { + "epoch": 0.9326053574475346, + "grad_norm": 2.482627782078035, + "learning_rate": 1.1864458724817895e-06, + "loss": 0.2476, + "step": 10932 + }, + { + "epoch": 0.9326906671216516, + "grad_norm": 1.4667707865054695, + "learning_rate": 1.1834560111061211e-06, + "loss": 0.1506, + "step": 10933 + }, + { + "epoch": 0.9327759767957686, + "grad_norm": 1.6566912225301587, + "learning_rate": 1.180469876614182e-06, + "loss": 0.1975, + "step": 10934 + }, + { + "epoch": 0.9328612864698856, + "grad_norm": 1.9087893972471697, + "learning_rate": 1.1774874692339343e-06, + "loss": 0.2339, + "step": 10935 + }, + { + "epoch": 0.9329465961440028, + "grad_norm": 1.8245921110244199, + "learning_rate": 1.1745087891930794e-06, + "loss": 0.1439, + "step": 10936 + }, + { + "epoch": 0.9330319058181198, + "grad_norm": 1.9729350582287388, + "learning_rate": 1.1715338367190188e-06, + "loss": 0.2532, + "step": 10937 + }, + { + "epoch": 0.9331172154922368, + "grad_norm": 1.861323577779237, + "learning_rate": 1.1685626120388816e-06, + "loss": 0.143, + "step": 10938 + }, + { + "epoch": 0.9332025251663538, + "grad_norm": 2.701942229183476, + "learning_rate": 1.165595115379492e-06, + "loss": 0.2456, + "step": 10939 + }, + { + "epoch": 0.933287834840471, + "grad_norm": 2.1447260200146157, + "learning_rate": 1.1626313469674134e-06, + "loss": 0.2581, + "step": 10940 + }, + { + "epoch": 0.933373144514588, + "grad_norm": 3.253297160317526, + "learning_rate": 1.159671307028909e-06, + "loss": 0.2108, + "step": 10941 + }, + { + "epoch": 0.933458454188705, + "grad_norm": 1.7795381183577133, + "learning_rate": 1.1567149957899592e-06, + "loss": 0.2155, + "step": 10942 + }, + { + "epoch": 0.933543763862822, + "grad_norm": 1.9197626678489972, + "learning_rate": 1.1537624134762726e-06, + "loss": 0.2365, + "step": 10943 + }, + { + "epoch": 0.9336290735369391, + "grad_norm": 1.7381404519654524, + "learning_rate": 1.150813560313252e-06, + "loss": 0.2029, + "step": 10944 + }, + { + "epoch": 0.9337143832110562, + "grad_norm": 2.5219485965847754, + "learning_rate": 1.1478684365260338e-06, + "loss": 0.1767, + "step": 10945 + }, + { + "epoch": 0.9337996928851732, + "grad_norm": 1.5528077690004845, + "learning_rate": 1.1449270423394608e-06, + "loss": 0.1922, + "step": 10946 + }, + { + "epoch": 0.9338850025592902, + "grad_norm": 1.454304534293556, + "learning_rate": 1.1419893779780922e-06, + "loss": 0.1412, + "step": 10947 + }, + { + "epoch": 0.9339703122334073, + "grad_norm": 2.567056466100129, + "learning_rate": 1.1390554436661983e-06, + "loss": 0.1745, + "step": 10948 + }, + { + "epoch": 0.9340556219075243, + "grad_norm": 2.1078733397258254, + "learning_rate": 1.1361252396277778e-06, + "loss": 0.2142, + "step": 10949 + }, + { + "epoch": 0.9341409315816414, + "grad_norm": 1.7293787293829925, + "learning_rate": 1.133198766086535e-06, + "loss": 0.1826, + "step": 10950 + }, + { + "epoch": 0.9342262412557584, + "grad_norm": 1.7075934629985816, + "learning_rate": 1.1302760232658916e-06, + "loss": 0.1515, + "step": 10951 + }, + { + "epoch": 0.9343115509298755, + "grad_norm": 2.0049983983972863, + "learning_rate": 1.1273570113889798e-06, + "loss": 0.1866, + "step": 10952 + }, + { + "epoch": 0.9343968606039925, + "grad_norm": 1.80036838788407, + "learning_rate": 1.1244417306786493e-06, + "loss": 0.1731, + "step": 10953 + }, + { + "epoch": 0.9344821702781095, + "grad_norm": 1.6164088194334867, + "learning_rate": 1.1215301813574664e-06, + "loss": 0.1682, + "step": 10954 + }, + { + "epoch": 0.9345674799522266, + "grad_norm": 1.9495433468300811, + "learning_rate": 1.1186223636477088e-06, + "loss": 0.2011, + "step": 10955 + }, + { + "epoch": 0.9346527896263436, + "grad_norm": 3.238624324502425, + "learning_rate": 1.1157182777713936e-06, + "loss": 0.2214, + "step": 10956 + }, + { + "epoch": 0.9347380993004607, + "grad_norm": 1.7135274436228993, + "learning_rate": 1.1128179239502046e-06, + "loss": 0.0981, + "step": 10957 + }, + { + "epoch": 0.9348234089745777, + "grad_norm": 1.4580647993352933, + "learning_rate": 1.109921302405581e-06, + "loss": 0.1336, + "step": 10958 + }, + { + "epoch": 0.9349087186486947, + "grad_norm": 1.4608656921690784, + "learning_rate": 1.1070284133586683e-06, + "loss": 0.1664, + "step": 10959 + }, + { + "epoch": 0.9349940283228118, + "grad_norm": 1.3177897261057696, + "learning_rate": 1.1041392570303233e-06, + "loss": 0.1516, + "step": 10960 + }, + { + "epoch": 0.9350793379969289, + "grad_norm": 2.013634933887634, + "learning_rate": 1.1012538336411083e-06, + "loss": 0.1898, + "step": 10961 + }, + { + "epoch": 0.9351646476710459, + "grad_norm": 1.6849259418546894, + "learning_rate": 1.0983721434113192e-06, + "loss": 0.1742, + "step": 10962 + }, + { + "epoch": 0.9352499573451629, + "grad_norm": 1.9869873393346082, + "learning_rate": 1.095494186560947e-06, + "loss": 0.1653, + "step": 10963 + }, + { + "epoch": 0.9353352670192799, + "grad_norm": 1.7813606227856433, + "learning_rate": 1.0926199633097157e-06, + "loss": 0.2204, + "step": 10964 + }, + { + "epoch": 0.9354205766933971, + "grad_norm": 1.7929192164275367, + "learning_rate": 1.0897494738770608e-06, + "loss": 0.1653, + "step": 10965 + }, + { + "epoch": 0.9355058863675141, + "grad_norm": 1.883350849351477, + "learning_rate": 1.0868827184821296e-06, + "loss": 0.2256, + "step": 10966 + }, + { + "epoch": 0.9355911960416311, + "grad_norm": 1.5496179171289013, + "learning_rate": 1.0840196973437744e-06, + "loss": 0.0847, + "step": 10967 + }, + { + "epoch": 0.9356765057157481, + "grad_norm": 1.8249481507343304, + "learning_rate": 1.0811604106805705e-06, + "loss": 0.1729, + "step": 10968 + }, + { + "epoch": 0.9357618153898652, + "grad_norm": 1.946838638186531, + "learning_rate": 1.0783048587108213e-06, + "loss": 0.1839, + "step": 10969 + }, + { + "epoch": 0.9358471250639823, + "grad_norm": 1.7191593029599839, + "learning_rate": 1.0754530416525245e-06, + "loss": 0.2553, + "step": 10970 + }, + { + "epoch": 0.9359324347380993, + "grad_norm": 2.1055597911960984, + "learning_rate": 1.0726049597234055e-06, + "loss": 0.1692, + "step": 10971 + }, + { + "epoch": 0.9360177444122163, + "grad_norm": 2.2775282089881603, + "learning_rate": 1.0697606131408966e-06, + "loss": 0.2178, + "step": 10972 + }, + { + "epoch": 0.9361030540863334, + "grad_norm": 1.3687248959264093, + "learning_rate": 1.066920002122146e-06, + "loss": 0.1459, + "step": 10973 + }, + { + "epoch": 0.9361883637604504, + "grad_norm": 1.3402980556941557, + "learning_rate": 1.0640831268840302e-06, + "loss": 0.167, + "step": 10974 + }, + { + "epoch": 0.9362736734345675, + "grad_norm": 1.4091613726043666, + "learning_rate": 1.0612499876431204e-06, + "loss": 0.1768, + "step": 10975 + }, + { + "epoch": 0.9363589831086845, + "grad_norm": 1.7006105532693225, + "learning_rate": 1.0584205846157102e-06, + "loss": 0.1557, + "step": 10976 + }, + { + "epoch": 0.9364442927828016, + "grad_norm": 1.8272661979563813, + "learning_rate": 1.0555949180178104e-06, + "loss": 0.1599, + "step": 10977 + }, + { + "epoch": 0.9365296024569186, + "grad_norm": 2.021866943720667, + "learning_rate": 1.0527729880651537e-06, + "loss": 0.2238, + "step": 10978 + }, + { + "epoch": 0.9366149121310356, + "grad_norm": 1.459012073422443, + "learning_rate": 1.0499547949731735e-06, + "loss": 0.1864, + "step": 10979 + }, + { + "epoch": 0.9367002218051527, + "grad_norm": 1.6304200088826775, + "learning_rate": 1.0471403389570256e-06, + "loss": 0.1563, + "step": 10980 + }, + { + "epoch": 0.9367855314792698, + "grad_norm": 1.961059688089642, + "learning_rate": 1.0443296202315767e-06, + "loss": 0.1809, + "step": 10981 + }, + { + "epoch": 0.9368708411533868, + "grad_norm": 2.136190680433033, + "learning_rate": 1.0415226390114108e-06, + "loss": 0.2235, + "step": 10982 + }, + { + "epoch": 0.9369561508275038, + "grad_norm": 1.7293288219974432, + "learning_rate": 1.0387193955108287e-06, + "loss": 0.1702, + "step": 10983 + }, + { + "epoch": 0.9370414605016208, + "grad_norm": 1.7017419781709462, + "learning_rate": 1.0359198899438371e-06, + "loss": 0.1534, + "step": 10984 + }, + { + "epoch": 0.937126770175738, + "grad_norm": 2.671322146321917, + "learning_rate": 1.0331241225241706e-06, + "loss": 0.1575, + "step": 10985 + }, + { + "epoch": 0.937212079849855, + "grad_norm": 1.3488271740319826, + "learning_rate": 1.0303320934652637e-06, + "loss": 0.1139, + "step": 10986 + }, + { + "epoch": 0.937297389523972, + "grad_norm": 1.3284064948269232, + "learning_rate": 1.0275438029802797e-06, + "loss": 0.1036, + "step": 10987 + }, + { + "epoch": 0.937382699198089, + "grad_norm": 1.5639136214001097, + "learning_rate": 1.024759251282087e-06, + "loss": 0.1547, + "step": 10988 + }, + { + "epoch": 0.9374680088722062, + "grad_norm": 1.6550794099501591, + "learning_rate": 1.021978438583271e-06, + "loss": 0.0994, + "step": 10989 + }, + { + "epoch": 0.9375533185463232, + "grad_norm": 2.203799313638817, + "learning_rate": 1.0192013650961286e-06, + "loss": 0.2423, + "step": 10990 + }, + { + "epoch": 0.9376386282204402, + "grad_norm": 1.7832588780389504, + "learning_rate": 1.0164280310326845e-06, + "loss": 0.1883, + "step": 10991 + }, + { + "epoch": 0.9377239378945572, + "grad_norm": 1.8059231709923258, + "learning_rate": 1.0136584366046531e-06, + "loss": 0.1781, + "step": 10992 + }, + { + "epoch": 0.9378092475686743, + "grad_norm": 1.7436320384607742, + "learning_rate": 1.0108925820234926e-06, + "loss": 0.1443, + "step": 10993 + }, + { + "epoch": 0.9378945572427914, + "grad_norm": 1.624802063844573, + "learning_rate": 1.0081304675003566e-06, + "loss": 0.1725, + "step": 10994 + }, + { + "epoch": 0.9379798669169084, + "grad_norm": 1.5156378991276158, + "learning_rate": 1.0053720932461152e-06, + "loss": 0.1582, + "step": 10995 + }, + { + "epoch": 0.9380651765910254, + "grad_norm": 1.5239777218499444, + "learning_rate": 1.0026174594713612e-06, + "loss": 0.1481, + "step": 10996 + }, + { + "epoch": 0.9381504862651424, + "grad_norm": 1.4792448838019594, + "learning_rate": 9.99866566386387e-07, + "loss": 0.2255, + "step": 10997 + }, + { + "epoch": 0.9382357959392595, + "grad_norm": 1.9864462308176598, + "learning_rate": 9.971194142012197e-07, + "loss": 0.113, + "step": 10998 + }, + { + "epoch": 0.9383211056133766, + "grad_norm": 1.7823041507083812, + "learning_rate": 9.943760031255744e-07, + "loss": 0.2367, + "step": 10999 + }, + { + "epoch": 0.9384064152874936, + "grad_norm": 2.1890884082031596, + "learning_rate": 9.916363333689116e-07, + "loss": 0.1918, + "step": 11000 + }, + { + "epoch": 0.9384917249616106, + "grad_norm": 1.4703260042095376, + "learning_rate": 9.88900405140386e-07, + "loss": 0.102, + "step": 11001 + }, + { + "epoch": 0.9385770346357277, + "grad_norm": 1.2115296915456526, + "learning_rate": 9.861682186488697e-07, + "loss": 0.1472, + "step": 11002 + }, + { + "epoch": 0.9386623443098447, + "grad_norm": 1.7206632195996994, + "learning_rate": 9.83439774102951e-07, + "loss": 0.1916, + "step": 11003 + }, + { + "epoch": 0.9387476539839618, + "grad_norm": 1.8790503940290642, + "learning_rate": 9.807150717109303e-07, + "loss": 0.1772, + "step": 11004 + }, + { + "epoch": 0.9388329636580788, + "grad_norm": 1.5088309371717405, + "learning_rate": 9.779941116808245e-07, + "loss": 0.1608, + "step": 11005 + }, + { + "epoch": 0.9389182733321959, + "grad_norm": 1.7675709761164136, + "learning_rate": 9.75276894220367e-07, + "loss": 0.2382, + "step": 11006 + }, + { + "epoch": 0.9390035830063129, + "grad_norm": 1.797474835661355, + "learning_rate": 9.725634195370036e-07, + "loss": 0.1729, + "step": 11007 + }, + { + "epoch": 0.9390888926804299, + "grad_norm": 2.0071824565873384, + "learning_rate": 9.69853687837885e-07, + "loss": 0.2642, + "step": 11008 + }, + { + "epoch": 0.939174202354547, + "grad_norm": 1.554757504827994, + "learning_rate": 9.671476993298956e-07, + "loss": 0.1982, + "step": 11009 + }, + { + "epoch": 0.9392595120286641, + "grad_norm": 1.82998862195256, + "learning_rate": 9.644454542196146e-07, + "loss": 0.1765, + "step": 11010 + }, + { + "epoch": 0.9393448217027811, + "grad_norm": 1.840554911860942, + "learning_rate": 9.617469527133605e-07, + "loss": 0.1722, + "step": 11011 + }, + { + "epoch": 0.9394301313768981, + "grad_norm": 1.49459947509103, + "learning_rate": 9.590521950171293e-07, + "loss": 0.1377, + "step": 11012 + }, + { + "epoch": 0.9395154410510151, + "grad_norm": 2.1370359530532785, + "learning_rate": 9.563611813366568e-07, + "loss": 0.1065, + "step": 11013 + }, + { + "epoch": 0.9396007507251323, + "grad_norm": 1.5537948729586752, + "learning_rate": 9.536739118773951e-07, + "loss": 0.1747, + "step": 11014 + }, + { + "epoch": 0.9396860603992493, + "grad_norm": 1.417203502291304, + "learning_rate": 9.50990386844497e-07, + "loss": 0.1813, + "step": 11015 + }, + { + "epoch": 0.9397713700733663, + "grad_norm": 1.529214713080669, + "learning_rate": 9.483106064428426e-07, + "loss": 0.1348, + "step": 11016 + }, + { + "epoch": 0.9398566797474833, + "grad_norm": 2.021573183624394, + "learning_rate": 9.456345708770076e-07, + "loss": 0.1483, + "step": 11017 + }, + { + "epoch": 0.9399419894216005, + "grad_norm": 1.990051081399568, + "learning_rate": 9.429622803512949e-07, + "loss": 0.2049, + "step": 11018 + }, + { + "epoch": 0.9400272990957175, + "grad_norm": 1.3509504222229782, + "learning_rate": 9.40293735069725e-07, + "loss": 0.1586, + "step": 11019 + }, + { + "epoch": 0.9401126087698345, + "grad_norm": 2.5304309791509, + "learning_rate": 9.376289352360346e-07, + "loss": 0.1595, + "step": 11020 + }, + { + "epoch": 0.9401979184439515, + "grad_norm": 1.775418892982637, + "learning_rate": 9.349678810536444e-07, + "loss": 0.1512, + "step": 11021 + }, + { + "epoch": 0.9402832281180686, + "grad_norm": 1.858729017866545, + "learning_rate": 9.323105727257308e-07, + "loss": 0.1877, + "step": 11022 + }, + { + "epoch": 0.9403685377921857, + "grad_norm": 1.5393719821889509, + "learning_rate": 9.29657010455165e-07, + "loss": 0.1622, + "step": 11023 + }, + { + "epoch": 0.9404538474663027, + "grad_norm": 1.556816292187633, + "learning_rate": 9.27007194444518e-07, + "loss": 0.1681, + "step": 11024 + }, + { + "epoch": 0.9405391571404197, + "grad_norm": 1.694703867427199, + "learning_rate": 9.243611248961059e-07, + "loss": 0.2298, + "step": 11025 + }, + { + "epoch": 0.9406244668145368, + "grad_norm": 1.5201761402166314, + "learning_rate": 9.217188020119283e-07, + "loss": 0.187, + "step": 11026 + }, + { + "epoch": 0.9407097764886538, + "grad_norm": 1.7370487258305174, + "learning_rate": 9.190802259937237e-07, + "loss": 0.16, + "step": 11027 + }, + { + "epoch": 0.9407950861627709, + "grad_norm": 1.2749750882875257, + "learning_rate": 9.164453970429199e-07, + "loss": 0.0831, + "step": 11028 + }, + { + "epoch": 0.9408803958368879, + "grad_norm": 1.3380770142747282, + "learning_rate": 9.138143153606893e-07, + "loss": 0.1127, + "step": 11029 + }, + { + "epoch": 0.940965705511005, + "grad_norm": 1.3527044422386079, + "learning_rate": 9.111869811478879e-07, + "loss": 0.1386, + "step": 11030 + }, + { + "epoch": 0.941051015185122, + "grad_norm": 2.3853492116483133, + "learning_rate": 9.085633946050998e-07, + "loss": 0.1954, + "step": 11031 + }, + { + "epoch": 0.941136324859239, + "grad_norm": 2.1819699823428405, + "learning_rate": 9.059435559326257e-07, + "loss": 0.2157, + "step": 11032 + }, + { + "epoch": 0.941221634533356, + "grad_norm": 1.9419506391975971, + "learning_rate": 9.033274653304836e-07, + "loss": 0.234, + "step": 11033 + }, + { + "epoch": 0.9413069442074731, + "grad_norm": 1.9083812868626027, + "learning_rate": 9.007151229983801e-07, + "loss": 0.2045, + "step": 11034 + }, + { + "epoch": 0.9413922538815902, + "grad_norm": 2.258359532597257, + "learning_rate": 8.981065291357671e-07, + "loss": 0.1876, + "step": 11035 + }, + { + "epoch": 0.9414775635557072, + "grad_norm": 1.5649434629715424, + "learning_rate": 8.955016839418018e-07, + "loss": 0.1729, + "step": 11036 + }, + { + "epoch": 0.9415628732298242, + "grad_norm": 1.593707364577924, + "learning_rate": 8.929005876153307e-07, + "loss": 0.1401, + "step": 11037 + }, + { + "epoch": 0.9416481829039413, + "grad_norm": 1.6991010954014505, + "learning_rate": 8.903032403549505e-07, + "loss": 0.166, + "step": 11038 + }, + { + "epoch": 0.9417334925780584, + "grad_norm": 2.8192131457066694, + "learning_rate": 8.877096423589415e-07, + "loss": 0.1803, + "step": 11039 + }, + { + "epoch": 0.9418188022521754, + "grad_norm": 2.5132259515780317, + "learning_rate": 8.851197938253286e-07, + "loss": 0.2198, + "step": 11040 + }, + { + "epoch": 0.9419041119262924, + "grad_norm": 1.3032586502587207, + "learning_rate": 8.825336949518204e-07, + "loss": 0.1407, + "step": 11041 + }, + { + "epoch": 0.9419894216004094, + "grad_norm": 1.766981565853176, + "learning_rate": 8.799513459358533e-07, + "loss": 0.2043, + "step": 11042 + }, + { + "epoch": 0.9420747312745266, + "grad_norm": 1.665114307661043, + "learning_rate": 8.773727469745751e-07, + "loss": 0.2412, + "step": 11043 + }, + { + "epoch": 0.9421600409486436, + "grad_norm": 1.9581489340898952, + "learning_rate": 8.747978982648564e-07, + "loss": 0.1513, + "step": 11044 + }, + { + "epoch": 0.9422453506227606, + "grad_norm": 1.5060981135736016, + "learning_rate": 8.722268000032618e-07, + "loss": 0.1247, + "step": 11045 + }, + { + "epoch": 0.9423306602968776, + "grad_norm": 1.7790115747445128, + "learning_rate": 8.696594523860901e-07, + "loss": 0.1933, + "step": 11046 + }, + { + "epoch": 0.9424159699709947, + "grad_norm": 1.9890089221100231, + "learning_rate": 8.670958556093401e-07, + "loss": 0.2297, + "step": 11047 + }, + { + "epoch": 0.9425012796451118, + "grad_norm": 1.8463681269352419, + "learning_rate": 8.645360098687272e-07, + "loss": 0.1678, + "step": 11048 + }, + { + "epoch": 0.9425865893192288, + "grad_norm": 2.667273502047156, + "learning_rate": 8.619799153596897e-07, + "loss": 0.2418, + "step": 11049 + }, + { + "epoch": 0.9426718989933458, + "grad_norm": 2.409162072100679, + "learning_rate": 8.594275722773548e-07, + "loss": 0.125, + "step": 11050 + }, + { + "epoch": 0.9427572086674629, + "grad_norm": 1.5542098035101395, + "learning_rate": 8.568789808166e-07, + "loss": 0.1329, + "step": 11051 + }, + { + "epoch": 0.94284251834158, + "grad_norm": 2.4069313967111987, + "learning_rate": 8.543341411719918e-07, + "loss": 0.2695, + "step": 11052 + }, + { + "epoch": 0.942927828015697, + "grad_norm": 1.5980629997773317, + "learning_rate": 8.517930535378083e-07, + "loss": 0.1667, + "step": 11053 + }, + { + "epoch": 0.943013137689814, + "grad_norm": 1.3773499694787656, + "learning_rate": 8.492557181080496e-07, + "loss": 0.1371, + "step": 11054 + }, + { + "epoch": 0.9430984473639311, + "grad_norm": 1.789435735025535, + "learning_rate": 8.467221350764332e-07, + "loss": 0.1709, + "step": 11055 + }, + { + "epoch": 0.9431837570380481, + "grad_norm": 1.6852677026782872, + "learning_rate": 8.44192304636382e-07, + "loss": 0.1688, + "step": 11056 + }, + { + "epoch": 0.9432690667121651, + "grad_norm": 1.8761000902849212, + "learning_rate": 8.41666226981036e-07, + "loss": 0.1838, + "step": 11057 + }, + { + "epoch": 0.9433543763862822, + "grad_norm": 1.459869728939203, + "learning_rate": 8.39143902303241e-07, + "loss": 0.1448, + "step": 11058 + }, + { + "epoch": 0.9434396860603993, + "grad_norm": 1.5117977624397458, + "learning_rate": 8.366253307955763e-07, + "loss": 0.2356, + "step": 11059 + }, + { + "epoch": 0.9435249957345163, + "grad_norm": 1.7568180238108388, + "learning_rate": 8.341105126503102e-07, + "loss": 0.1515, + "step": 11060 + }, + { + "epoch": 0.9436103054086333, + "grad_norm": 1.7912492339724215, + "learning_rate": 8.315994480594336e-07, + "loss": 0.1338, + "step": 11061 + }, + { + "epoch": 0.9436956150827503, + "grad_norm": 1.9167618382363112, + "learning_rate": 8.290921372146654e-07, + "loss": 0.2325, + "step": 11062 + }, + { + "epoch": 0.9437809247568675, + "grad_norm": 1.7834346825627714, + "learning_rate": 8.265885803074136e-07, + "loss": 0.1506, + "step": 11063 + }, + { + "epoch": 0.9438662344309845, + "grad_norm": 1.7508207848650106, + "learning_rate": 8.240887775288197e-07, + "loss": 0.1825, + "step": 11064 + }, + { + "epoch": 0.9439515441051015, + "grad_norm": 1.3962023897173246, + "learning_rate": 8.215927290697256e-07, + "loss": 0.1496, + "step": 11065 + }, + { + "epoch": 0.9440368537792185, + "grad_norm": 1.695378877948797, + "learning_rate": 8.191004351206954e-07, + "loss": 0.2223, + "step": 11066 + }, + { + "epoch": 0.9441221634533357, + "grad_norm": 1.7045889346039182, + "learning_rate": 8.166118958719992e-07, + "loss": 0.1208, + "step": 11067 + }, + { + "epoch": 0.9442074731274527, + "grad_norm": 2.05316825368251, + "learning_rate": 8.141271115136184e-07, + "loss": 0.2424, + "step": 11068 + }, + { + "epoch": 0.9442927828015697, + "grad_norm": 1.8627353084334521, + "learning_rate": 8.116460822352734e-07, + "loss": 0.2336, + "step": 11069 + }, + { + "epoch": 0.9443780924756867, + "grad_norm": 1.8683907051817275, + "learning_rate": 8.091688082263515e-07, + "loss": 0.1658, + "step": 11070 + }, + { + "epoch": 0.9444634021498037, + "grad_norm": 1.748012776814779, + "learning_rate": 8.066952896759905e-07, + "loss": 0.2266, + "step": 11071 + }, + { + "epoch": 0.9445487118239209, + "grad_norm": 2.494453954171129, + "learning_rate": 8.042255267730392e-07, + "loss": 0.2372, + "step": 11072 + }, + { + "epoch": 0.9446340214980379, + "grad_norm": 1.9549599534638087, + "learning_rate": 8.017595197060357e-07, + "loss": 0.1546, + "step": 11073 + }, + { + "epoch": 0.9447193311721549, + "grad_norm": 1.627481326744669, + "learning_rate": 7.992972686632571e-07, + "loss": 0.1496, + "step": 11074 + }, + { + "epoch": 0.9448046408462719, + "grad_norm": 1.4325045155825187, + "learning_rate": 7.96838773832681e-07, + "loss": 0.1128, + "step": 11075 + }, + { + "epoch": 0.944889950520389, + "grad_norm": 2.0757004210643193, + "learning_rate": 7.94384035401996e-07, + "loss": 0.1657, + "step": 11076 + }, + { + "epoch": 0.9449752601945061, + "grad_norm": 2.127451660467742, + "learning_rate": 7.919330535586134e-07, + "loss": 0.2197, + "step": 11077 + }, + { + "epoch": 0.9450605698686231, + "grad_norm": 1.5973508958079246, + "learning_rate": 7.8948582848965e-07, + "loss": 0.1195, + "step": 11078 + }, + { + "epoch": 0.9451458795427401, + "grad_norm": 2.309124519278297, + "learning_rate": 7.870423603819399e-07, + "loss": 0.1738, + "step": 11079 + }, + { + "epoch": 0.9452311892168572, + "grad_norm": 1.7350550512545095, + "learning_rate": 7.846026494220282e-07, + "loss": 0.1952, + "step": 11080 + }, + { + "epoch": 0.9453164988909742, + "grad_norm": 2.3097548816314286, + "learning_rate": 7.821666957961771e-07, + "loss": 0.2577, + "step": 11081 + }, + { + "epoch": 0.9454018085650913, + "grad_norm": 2.234101498810769, + "learning_rate": 7.797344996903544e-07, + "loss": 0.2094, + "step": 11082 + }, + { + "epoch": 0.9454871182392083, + "grad_norm": 1.5672507732545835, + "learning_rate": 7.773060612902395e-07, + "loss": 0.1633, + "step": 11083 + }, + { + "epoch": 0.9455724279133254, + "grad_norm": 1.9755638524856145, + "learning_rate": 7.748813807812394e-07, + "loss": 0.1288, + "step": 11084 + }, + { + "epoch": 0.9456577375874424, + "grad_norm": 1.7527793883598224, + "learning_rate": 7.724604583484674e-07, + "loss": 0.1704, + "step": 11085 + }, + { + "epoch": 0.9457430472615594, + "grad_norm": 2.6991035668712797, + "learning_rate": 7.700432941767477e-07, + "loss": 0.1776, + "step": 11086 + }, + { + "epoch": 0.9458283569356765, + "grad_norm": 1.8854977781717162, + "learning_rate": 7.676298884506106e-07, + "loss": 0.169, + "step": 11087 + }, + { + "epoch": 0.9459136666097936, + "grad_norm": 1.6955220317123492, + "learning_rate": 7.652202413543141e-07, + "loss": 0.1341, + "step": 11088 + }, + { + "epoch": 0.9459989762839106, + "grad_norm": 1.5388550594358834, + "learning_rate": 7.628143530718169e-07, + "loss": 0.1746, + "step": 11089 + }, + { + "epoch": 0.9460842859580276, + "grad_norm": 1.7849436159605163, + "learning_rate": 7.604122237867939e-07, + "loss": 0.1782, + "step": 11090 + }, + { + "epoch": 0.9461695956321446, + "grad_norm": 1.819446930952371, + "learning_rate": 7.580138536826431e-07, + "loss": 0.1513, + "step": 11091 + }, + { + "epoch": 0.9462549053062618, + "grad_norm": 1.6328358625720965, + "learning_rate": 7.556192429424569e-07, + "loss": 0.1904, + "step": 11092 + }, + { + "epoch": 0.9463402149803788, + "grad_norm": 1.7544288451408776, + "learning_rate": 7.532283917490668e-07, + "loss": 0.2018, + "step": 11093 + }, + { + "epoch": 0.9464255246544958, + "grad_norm": 1.6583398553665702, + "learning_rate": 7.508413002849879e-07, + "loss": 0.1623, + "step": 11094 + }, + { + "epoch": 0.9465108343286128, + "grad_norm": 1.9290554231519739, + "learning_rate": 7.484579687324633e-07, + "loss": 0.1985, + "step": 11095 + }, + { + "epoch": 0.94659614400273, + "grad_norm": 2.4441591900740303, + "learning_rate": 7.460783972734476e-07, + "loss": 0.2148, + "step": 11096 + }, + { + "epoch": 0.946681453676847, + "grad_norm": 1.8394600696963233, + "learning_rate": 7.43702586089623e-07, + "loss": 0.1609, + "step": 11097 + }, + { + "epoch": 0.946766763350964, + "grad_norm": 1.6441772514862287, + "learning_rate": 7.413305353623445e-07, + "loss": 0.161, + "step": 11098 + }, + { + "epoch": 0.946852073025081, + "grad_norm": 1.3393507514784317, + "learning_rate": 7.389622452727285e-07, + "loss": 0.1951, + "step": 11099 + }, + { + "epoch": 0.9469373826991981, + "grad_norm": 1.9161787517761868, + "learning_rate": 7.365977160015692e-07, + "loss": 0.1663, + "step": 11100 + }, + { + "epoch": 0.9470226923733152, + "grad_norm": 2.1605495763915945, + "learning_rate": 7.342369477293886e-07, + "loss": 0.1751, + "step": 11101 + }, + { + "epoch": 0.9471080020474322, + "grad_norm": 2.4085005107084796, + "learning_rate": 7.318799406364208e-07, + "loss": 0.1991, + "step": 11102 + }, + { + "epoch": 0.9471933117215492, + "grad_norm": 1.8136771095819828, + "learning_rate": 7.295266949026047e-07, + "loss": 0.2126, + "step": 11103 + }, + { + "epoch": 0.9472786213956663, + "grad_norm": 1.4853675334806644, + "learning_rate": 7.271772107076136e-07, + "loss": 0.1489, + "step": 11104 + }, + { + "epoch": 0.9473639310697833, + "grad_norm": 2.137226386153649, + "learning_rate": 7.248314882307928e-07, + "loss": 0.1976, + "step": 11105 + }, + { + "epoch": 0.9474492407439004, + "grad_norm": 1.532145646963206, + "learning_rate": 7.224895276512489e-07, + "loss": 0.1258, + "step": 11106 + }, + { + "epoch": 0.9475345504180174, + "grad_norm": 2.193473942007321, + "learning_rate": 7.201513291477669e-07, + "loss": 0.225, + "step": 11107 + }, + { + "epoch": 0.9476198600921345, + "grad_norm": 2.0221427398241176, + "learning_rate": 7.178168928988593e-07, + "loss": 0.1965, + "step": 11108 + }, + { + "epoch": 0.9477051697662515, + "grad_norm": 1.881830109536253, + "learning_rate": 7.154862190827449e-07, + "loss": 0.1745, + "step": 11109 + }, + { + "epoch": 0.9477904794403685, + "grad_norm": 2.17391608403864, + "learning_rate": 7.13159307877359e-07, + "loss": 0.2046, + "step": 11110 + }, + { + "epoch": 0.9478757891144856, + "grad_norm": 1.8132935463990925, + "learning_rate": 7.108361594603541e-07, + "loss": 0.1421, + "step": 11111 + }, + { + "epoch": 0.9479610987886026, + "grad_norm": 1.5510272160103082, + "learning_rate": 7.085167740090771e-07, + "loss": 0.1351, + "step": 11112 + }, + { + "epoch": 0.9480464084627197, + "grad_norm": 2.0881574726063192, + "learning_rate": 7.062011517006139e-07, + "loss": 0.1746, + "step": 11113 + }, + { + "epoch": 0.9481317181368367, + "grad_norm": 2.243992308387562, + "learning_rate": 7.038892927117513e-07, + "loss": 0.1475, + "step": 11114 + }, + { + "epoch": 0.9482170278109537, + "grad_norm": 2.16484997465188, + "learning_rate": 7.015811972189757e-07, + "loss": 0.2312, + "step": 11115 + }, + { + "epoch": 0.9483023374850708, + "grad_norm": 1.860679937902832, + "learning_rate": 6.992768653985071e-07, + "loss": 0.181, + "step": 11116 + }, + { + "epoch": 0.9483876471591879, + "grad_norm": 1.4609352152597441, + "learning_rate": 6.969762974262717e-07, + "loss": 0.1437, + "step": 11117 + }, + { + "epoch": 0.9484729568333049, + "grad_norm": 2.3285076671733775, + "learning_rate": 6.9467949347789e-07, + "loss": 0.1463, + "step": 11118 + }, + { + "epoch": 0.9485582665074219, + "grad_norm": 2.363275903112495, + "learning_rate": 6.923864537287217e-07, + "loss": 0.1756, + "step": 11119 + }, + { + "epoch": 0.9486435761815389, + "grad_norm": 1.9652198143428925, + "learning_rate": 6.900971783538323e-07, + "loss": 0.1896, + "step": 11120 + }, + { + "epoch": 0.9487288858556561, + "grad_norm": 1.754858493371073, + "learning_rate": 6.878116675279878e-07, + "loss": 0.1982, + "step": 11121 + }, + { + "epoch": 0.9488141955297731, + "grad_norm": 1.907020163122921, + "learning_rate": 6.855299214256817e-07, + "loss": 0.1702, + "step": 11122 + }, + { + "epoch": 0.9488995052038901, + "grad_norm": 1.6689124553341124, + "learning_rate": 6.832519402211079e-07, + "loss": 0.1909, + "step": 11123 + }, + { + "epoch": 0.9489848148780071, + "grad_norm": 2.7255960844938314, + "learning_rate": 6.809777240881776e-07, + "loss": 0.1757, + "step": 11124 + }, + { + "epoch": 0.9490701245521243, + "grad_norm": 1.2664585429837396, + "learning_rate": 6.787072732005129e-07, + "loss": 0.171, + "step": 11125 + }, + { + "epoch": 0.9491554342262413, + "grad_norm": 1.4496903384806463, + "learning_rate": 6.764405877314639e-07, + "loss": 0.1154, + "step": 11126 + }, + { + "epoch": 0.9492407439003583, + "grad_norm": 1.446227242992194, + "learning_rate": 6.741776678540645e-07, + "loss": 0.1731, + "step": 11127 + }, + { + "epoch": 0.9493260535744753, + "grad_norm": 1.6014384896253326, + "learning_rate": 6.719185137410878e-07, + "loss": 0.1956, + "step": 11128 + }, + { + "epoch": 0.9494113632485924, + "grad_norm": 2.127427901986202, + "learning_rate": 6.69663125565001e-07, + "loss": 0.1549, + "step": 11129 + }, + { + "epoch": 0.9494966729227095, + "grad_norm": 1.3739563709367453, + "learning_rate": 6.674115034979945e-07, + "loss": 0.1573, + "step": 11130 + }, + { + "epoch": 0.9495819825968265, + "grad_norm": 2.485041882720392, + "learning_rate": 6.651636477119639e-07, + "loss": 0.2713, + "step": 11131 + }, + { + "epoch": 0.9496672922709435, + "grad_norm": 1.9809291688136377, + "learning_rate": 6.629195583785219e-07, + "loss": 0.1797, + "step": 11132 + }, + { + "epoch": 0.9497526019450606, + "grad_norm": 2.194239995851254, + "learning_rate": 6.606792356690039e-07, + "loss": 0.1488, + "step": 11133 + }, + { + "epoch": 0.9498379116191776, + "grad_norm": 2.068996370978881, + "learning_rate": 6.584426797544286e-07, + "loss": 0.2251, + "step": 11134 + }, + { + "epoch": 0.9499232212932947, + "grad_norm": 1.7401498416530197, + "learning_rate": 6.562098908055536e-07, + "loss": 0.1493, + "step": 11135 + }, + { + "epoch": 0.9500085309674117, + "grad_norm": 1.7677501615821745, + "learning_rate": 6.539808689928484e-07, + "loss": 0.1434, + "step": 11136 + }, + { + "epoch": 0.9500938406415288, + "grad_norm": 1.567500543183593, + "learning_rate": 6.517556144864711e-07, + "loss": 0.2085, + "step": 11137 + }, + { + "epoch": 0.9501791503156458, + "grad_norm": 1.9548461654510345, + "learning_rate": 6.495341274563193e-07, + "loss": 0.1892, + "step": 11138 + }, + { + "epoch": 0.9502644599897628, + "grad_norm": 1.2749115539241476, + "learning_rate": 6.473164080719906e-07, + "loss": 0.1529, + "step": 11139 + }, + { + "epoch": 0.9503497696638799, + "grad_norm": 2.14969135109398, + "learning_rate": 6.451024565027941e-07, + "loss": 0.136, + "step": 11140 + }, + { + "epoch": 0.950435079337997, + "grad_norm": 1.6748149371042054, + "learning_rate": 6.4289227291775e-07, + "loss": 0.1921, + "step": 11141 + }, + { + "epoch": 0.950520389012114, + "grad_norm": 2.0509665559881123, + "learning_rate": 6.406858574856067e-07, + "loss": 0.2177, + "step": 11142 + }, + { + "epoch": 0.950605698686231, + "grad_norm": 2.085379650944695, + "learning_rate": 6.384832103747907e-07, + "loss": 0.1507, + "step": 11143 + }, + { + "epoch": 0.950691008360348, + "grad_norm": 1.9400849480071765, + "learning_rate": 6.362843317534839e-07, + "loss": 0.1702, + "step": 11144 + }, + { + "epoch": 0.9507763180344652, + "grad_norm": 1.6623846200472028, + "learning_rate": 6.340892217895411e-07, + "loss": 0.1845, + "step": 11145 + }, + { + "epoch": 0.9508616277085822, + "grad_norm": 1.7704054670846734, + "learning_rate": 6.318978806505671e-07, + "loss": 0.2263, + "step": 11146 + }, + { + "epoch": 0.9509469373826992, + "grad_norm": 1.8577417846259843, + "learning_rate": 6.297103085038391e-07, + "loss": 0.1892, + "step": 11147 + }, + { + "epoch": 0.9510322470568162, + "grad_norm": 1.5743823096638319, + "learning_rate": 6.275265055163793e-07, + "loss": 0.1707, + "step": 11148 + }, + { + "epoch": 0.9511175567309332, + "grad_norm": 1.761106938872959, + "learning_rate": 6.253464718549096e-07, + "loss": 0.1656, + "step": 11149 + }, + { + "epoch": 0.9512028664050504, + "grad_norm": 2.60243013594165, + "learning_rate": 6.231702076858526e-07, + "loss": 0.2207, + "step": 11150 + }, + { + "epoch": 0.9512881760791674, + "grad_norm": 1.3713264811215882, + "learning_rate": 6.209977131753697e-07, + "loss": 0.0716, + "step": 11151 + }, + { + "epoch": 0.9513734857532844, + "grad_norm": 1.494341588685849, + "learning_rate": 6.188289884893062e-07, + "loss": 0.1135, + "step": 11152 + }, + { + "epoch": 0.9514587954274014, + "grad_norm": 1.3268834762554063, + "learning_rate": 6.166640337932406e-07, + "loss": 0.1345, + "step": 11153 + }, + { + "epoch": 0.9515441051015185, + "grad_norm": 1.5127343209090378, + "learning_rate": 6.145028492524463e-07, + "loss": 0.1753, + "step": 11154 + }, + { + "epoch": 0.9516294147756356, + "grad_norm": 1.6137110250835482, + "learning_rate": 6.123454350319358e-07, + "loss": 0.212, + "step": 11155 + }, + { + "epoch": 0.9517147244497526, + "grad_norm": 1.7017651650033407, + "learning_rate": 6.101917912963995e-07, + "loss": 0.0786, + "step": 11156 + }, + { + "epoch": 0.9518000341238696, + "grad_norm": 2.51400173723135, + "learning_rate": 6.080419182102615e-07, + "loss": 0.2283, + "step": 11157 + }, + { + "epoch": 0.9518853437979867, + "grad_norm": 1.4985511458243963, + "learning_rate": 6.058958159376571e-07, + "loss": 0.1619, + "step": 11158 + }, + { + "epoch": 0.9519706534721037, + "grad_norm": 1.9282537531717898, + "learning_rate": 6.037534846424276e-07, + "loss": 0.1628, + "step": 11159 + }, + { + "epoch": 0.9520559631462208, + "grad_norm": 1.8127955491710814, + "learning_rate": 6.016149244881253e-07, + "loss": 0.1824, + "step": 11160 + }, + { + "epoch": 0.9521412728203378, + "grad_norm": 2.077396924965521, + "learning_rate": 5.994801356380253e-07, + "loss": 0.2285, + "step": 11161 + }, + { + "epoch": 0.9522265824944549, + "grad_norm": 1.6969942110735063, + "learning_rate": 5.973491182551028e-07, + "loss": 0.1927, + "step": 11162 + }, + { + "epoch": 0.9523118921685719, + "grad_norm": 1.969960763801957, + "learning_rate": 5.952218725020442e-07, + "loss": 0.1592, + "step": 11163 + }, + { + "epoch": 0.952397201842689, + "grad_norm": 2.361447264234943, + "learning_rate": 5.930983985412641e-07, + "loss": 0.1868, + "step": 11164 + }, + { + "epoch": 0.952482511516806, + "grad_norm": 1.7289070332797312, + "learning_rate": 5.909786965348718e-07, + "loss": 0.231, + "step": 11165 + }, + { + "epoch": 0.9525678211909231, + "grad_norm": 1.6845738507662116, + "learning_rate": 5.888627666446988e-07, + "loss": 0.1496, + "step": 11166 + }, + { + "epoch": 0.9526531308650401, + "grad_norm": 1.8352508559511447, + "learning_rate": 5.867506090322772e-07, + "loss": 0.1935, + "step": 11167 + }, + { + "epoch": 0.9527384405391571, + "grad_norm": 1.697133084336015, + "learning_rate": 5.846422238588723e-07, + "loss": 0.2013, + "step": 11168 + }, + { + "epoch": 0.9528237502132741, + "grad_norm": 1.6911044303009053, + "learning_rate": 5.82537611285433e-07, + "loss": 0.1656, + "step": 11169 + }, + { + "epoch": 0.9529090598873913, + "grad_norm": 1.7130214593348214, + "learning_rate": 5.804367714726477e-07, + "loss": 0.1433, + "step": 11170 + }, + { + "epoch": 0.9529943695615083, + "grad_norm": 1.7131248670631531, + "learning_rate": 5.783397045808992e-07, + "loss": 0.1408, + "step": 11171 + }, + { + "epoch": 0.9530796792356253, + "grad_norm": 2.3111941800512636, + "learning_rate": 5.76246410770287e-07, + "loss": 0.1655, + "step": 11172 + }, + { + "epoch": 0.9531649889097423, + "grad_norm": 3.297168013582268, + "learning_rate": 5.741568902006277e-07, + "loss": 0.2369, + "step": 11173 + }, + { + "epoch": 0.9532502985838595, + "grad_norm": 1.7411301487672566, + "learning_rate": 5.720711430314329e-07, + "loss": 0.2427, + "step": 11174 + }, + { + "epoch": 0.9533356082579765, + "grad_norm": 1.7431127716981416, + "learning_rate": 5.699891694219584e-07, + "loss": 0.1412, + "step": 11175 + }, + { + "epoch": 0.9534209179320935, + "grad_norm": 1.7214367760608025, + "learning_rate": 5.679109695311269e-07, + "loss": 0.137, + "step": 11176 + }, + { + "epoch": 0.9535062276062105, + "grad_norm": 1.728744784752074, + "learning_rate": 5.658365435176171e-07, + "loss": 0.1381, + "step": 11177 + }, + { + "epoch": 0.9535915372803276, + "grad_norm": 2.3499026095723834, + "learning_rate": 5.63765891539797e-07, + "loss": 0.1989, + "step": 11178 + }, + { + "epoch": 0.9536768469544447, + "grad_norm": 1.676461106333575, + "learning_rate": 5.616990137557454e-07, + "loss": 0.1851, + "step": 11179 + }, + { + "epoch": 0.9537621566285617, + "grad_norm": 1.6338933260449304, + "learning_rate": 5.596359103232529e-07, + "loss": 0.1626, + "step": 11180 + }, + { + "epoch": 0.9538474663026787, + "grad_norm": 1.7264223343929126, + "learning_rate": 5.575765813998435e-07, + "loss": 0.2028, + "step": 11181 + }, + { + "epoch": 0.9539327759767958, + "grad_norm": 2.2576951785113697, + "learning_rate": 5.555210271427192e-07, + "loss": 0.2158, + "step": 11182 + }, + { + "epoch": 0.9540180856509128, + "grad_norm": 1.7667138447250212, + "learning_rate": 5.534692477088155e-07, + "loss": 0.1882, + "step": 11183 + }, + { + "epoch": 0.9541033953250299, + "grad_norm": 1.4926677150839058, + "learning_rate": 5.51421243254785e-07, + "loss": 0.1741, + "step": 11184 + }, + { + "epoch": 0.9541887049991469, + "grad_norm": 2.096663995123507, + "learning_rate": 5.493770139369636e-07, + "loss": 0.2255, + "step": 11185 + }, + { + "epoch": 0.9542740146732639, + "grad_norm": 2.2850505576185403, + "learning_rate": 5.473365599114266e-07, + "loss": 0.1817, + "step": 11186 + }, + { + "epoch": 0.954359324347381, + "grad_norm": 1.5637287648906648, + "learning_rate": 5.452998813339605e-07, + "loss": 0.1298, + "step": 11187 + }, + { + "epoch": 0.954444634021498, + "grad_norm": 1.705030093221135, + "learning_rate": 5.432669783600408e-07, + "loss": 0.1804, + "step": 11188 + }, + { + "epoch": 0.9545299436956151, + "grad_norm": 2.188057310772972, + "learning_rate": 5.412378511448712e-07, + "loss": 0.167, + "step": 11189 + }, + { + "epoch": 0.9546152533697321, + "grad_norm": 1.380589697590695, + "learning_rate": 5.392124998433723e-07, + "loss": 0.1012, + "step": 11190 + }, + { + "epoch": 0.9547005630438492, + "grad_norm": 1.7120275639338256, + "learning_rate": 5.371909246101648e-07, + "loss": 0.162, + "step": 11191 + }, + { + "epoch": 0.9547858727179662, + "grad_norm": 2.550359670667662, + "learning_rate": 5.351731255995862e-07, + "loss": 0.1825, + "step": 11192 + }, + { + "epoch": 0.9548711823920832, + "grad_norm": 1.6736147376197508, + "learning_rate": 5.331591029656802e-07, + "loss": 0.1639, + "step": 11193 + }, + { + "epoch": 0.9549564920662003, + "grad_norm": 2.0805232614960056, + "learning_rate": 5.311488568622125e-07, + "loss": 0.1973, + "step": 11194 + }, + { + "epoch": 0.9550418017403174, + "grad_norm": 1.4353153379392734, + "learning_rate": 5.291423874426548e-07, + "loss": 0.1605, + "step": 11195 + }, + { + "epoch": 0.9551271114144344, + "grad_norm": 1.7800488353845125, + "learning_rate": 5.271396948601792e-07, + "loss": 0.1821, + "step": 11196 + }, + { + "epoch": 0.9552124210885514, + "grad_norm": 2.5033124913018083, + "learning_rate": 5.251407792677021e-07, + "loss": 0.1658, + "step": 11197 + }, + { + "epoch": 0.9552977307626684, + "grad_norm": 1.5950872757331527, + "learning_rate": 5.23145640817807e-07, + "loss": 0.2052, + "step": 11198 + }, + { + "epoch": 0.9553830404367856, + "grad_norm": 1.5200786634217691, + "learning_rate": 5.211542796628277e-07, + "loss": 0.1879, + "step": 11199 + }, + { + "epoch": 0.9554683501109026, + "grad_norm": 1.497371277538691, + "learning_rate": 5.191666959547869e-07, + "loss": 0.1349, + "step": 11200 + }, + { + "epoch": 0.9555536597850196, + "grad_norm": 1.6497383950662485, + "learning_rate": 5.1718288984543e-07, + "loss": 0.1461, + "step": 11201 + }, + { + "epoch": 0.9556389694591366, + "grad_norm": 2.3801270407742123, + "learning_rate": 5.152028614862026e-07, + "loss": 0.2022, + "step": 11202 + }, + { + "epoch": 0.9557242791332538, + "grad_norm": 2.0218072753129315, + "learning_rate": 5.132266110282835e-07, + "loss": 0.1548, + "step": 11203 + }, + { + "epoch": 0.9558095888073708, + "grad_norm": 1.4118302149802189, + "learning_rate": 5.1125413862253e-07, + "loss": 0.1732, + "step": 11204 + }, + { + "epoch": 0.9558948984814878, + "grad_norm": 1.731088359461804, + "learning_rate": 5.092854444195494e-07, + "loss": 0.1757, + "step": 11205 + }, + { + "epoch": 0.9559802081556048, + "grad_norm": 2.726589432285502, + "learning_rate": 5.073205285696269e-07, + "loss": 0.1959, + "step": 11206 + }, + { + "epoch": 0.9560655178297219, + "grad_norm": 1.7607114496155731, + "learning_rate": 5.05359391222776e-07, + "loss": 0.1319, + "step": 11207 + }, + { + "epoch": 0.956150827503839, + "grad_norm": 1.9188785919937112, + "learning_rate": 5.034020325287269e-07, + "loss": 0.2407, + "step": 11208 + }, + { + "epoch": 0.956236137177956, + "grad_norm": 2.803535841281218, + "learning_rate": 5.014484526369046e-07, + "loss": 0.2062, + "step": 11209 + }, + { + "epoch": 0.956321446852073, + "grad_norm": 2.2740676897856518, + "learning_rate": 4.994986516964617e-07, + "loss": 0.1375, + "step": 11210 + }, + { + "epoch": 0.9564067565261901, + "grad_norm": 1.53968349263479, + "learning_rate": 4.97552629856246e-07, + "loss": 0.1549, + "step": 11211 + }, + { + "epoch": 0.9564920662003071, + "grad_norm": 2.051113021898373, + "learning_rate": 4.956103872648333e-07, + "loss": 0.1536, + "step": 11212 + }, + { + "epoch": 0.9565773758744242, + "grad_norm": 1.2229317059260971, + "learning_rate": 4.936719240705045e-07, + "loss": 0.1432, + "step": 11213 + }, + { + "epoch": 0.9566626855485412, + "grad_norm": 1.2639590000553027, + "learning_rate": 4.917372404212417e-07, + "loss": 0.1547, + "step": 11214 + }, + { + "epoch": 0.9567479952226583, + "grad_norm": 1.8723359573687248, + "learning_rate": 4.898063364647598e-07, + "loss": 0.1507, + "step": 11215 + }, + { + "epoch": 0.9568333048967753, + "grad_norm": 1.7869169871683357, + "learning_rate": 4.878792123484688e-07, + "loss": 0.1952, + "step": 11216 + }, + { + "epoch": 0.9569186145708923, + "grad_norm": 2.0011321677997507, + "learning_rate": 4.859558682194898e-07, + "loss": 0.0982, + "step": 11217 + }, + { + "epoch": 0.9570039242450094, + "grad_norm": 1.6079295064837404, + "learning_rate": 4.840363042246554e-07, + "loss": 0.1337, + "step": 11218 + }, + { + "epoch": 0.9570892339191265, + "grad_norm": 1.8285705805887231, + "learning_rate": 4.821205205105317e-07, + "loss": 0.1544, + "step": 11219 + }, + { + "epoch": 0.9571745435932435, + "grad_norm": 2.35484930848144, + "learning_rate": 4.802085172233628e-07, + "loss": 0.1666, + "step": 11220 + }, + { + "epoch": 0.9572598532673605, + "grad_norm": 1.8563062845552907, + "learning_rate": 4.78300294509132e-07, + "loss": 0.1547, + "step": 11221 + }, + { + "epoch": 0.9573451629414775, + "grad_norm": 2.0207721137248282, + "learning_rate": 4.7639585251350593e-07, + "loss": 0.2199, + "step": 11222 + }, + { + "epoch": 0.9574304726155947, + "grad_norm": 2.449993297022778, + "learning_rate": 4.744951913819018e-07, + "loss": 0.1436, + "step": 11223 + }, + { + "epoch": 0.9575157822897117, + "grad_norm": 1.6835355460954904, + "learning_rate": 4.725983112593979e-07, + "loss": 0.1361, + "step": 11224 + }, + { + "epoch": 0.9576010919638287, + "grad_norm": 1.8468755318629602, + "learning_rate": 4.7070521229083396e-07, + "loss": 0.2248, + "step": 11225 + }, + { + "epoch": 0.9576864016379457, + "grad_norm": 2.001501949922106, + "learning_rate": 4.6881589462072773e-07, + "loss": 0.1619, + "step": 11226 + }, + { + "epoch": 0.9577717113120627, + "grad_norm": 1.812555114960449, + "learning_rate": 4.669303583933138e-07, + "loss": 0.1988, + "step": 11227 + }, + { + "epoch": 0.9578570209861799, + "grad_norm": 2.2885079086330413, + "learning_rate": 4.6504860375255475e-07, + "loss": 0.1987, + "step": 11228 + }, + { + "epoch": 0.9579423306602969, + "grad_norm": 2.165672171823461, + "learning_rate": 4.631706308421024e-07, + "loss": 0.2011, + "step": 11229 + }, + { + "epoch": 0.9580276403344139, + "grad_norm": 1.7811318073880178, + "learning_rate": 4.6129643980533657e-07, + "loss": 0.1464, + "step": 11230 + }, + { + "epoch": 0.9581129500085309, + "grad_norm": 2.1405171833591665, + "learning_rate": 4.5942603078533706e-07, + "loss": 0.1812, + "step": 11231 + }, + { + "epoch": 0.958198259682648, + "grad_norm": 1.3975300321951765, + "learning_rate": 4.575594039249065e-07, + "loss": 0.1147, + "step": 11232 + }, + { + "epoch": 0.9582835693567651, + "grad_norm": 1.8900413558108762, + "learning_rate": 4.5569655936654186e-07, + "loss": 0.215, + "step": 11233 + }, + { + "epoch": 0.9583688790308821, + "grad_norm": 1.9123189653470025, + "learning_rate": 4.538374972524684e-07, + "loss": 0.2007, + "step": 11234 + }, + { + "epoch": 0.9584541887049991, + "grad_norm": 1.475993132438293, + "learning_rate": 4.519822177246114e-07, + "loss": 0.1359, + "step": 11235 + }, + { + "epoch": 0.9585394983791162, + "grad_norm": 1.6121264890072484, + "learning_rate": 4.501307209246186e-07, + "loss": 0.1273, + "step": 11236 + }, + { + "epoch": 0.9586248080532332, + "grad_norm": 1.322102028039009, + "learning_rate": 4.4828300699383264e-07, + "loss": 0.1862, + "step": 11237 + }, + { + "epoch": 0.9587101177273503, + "grad_norm": 1.4300745872572587, + "learning_rate": 4.4643907607332394e-07, + "loss": 0.1817, + "step": 11238 + }, + { + "epoch": 0.9587954274014673, + "grad_norm": 1.5802815121340796, + "learning_rate": 4.4459892830386876e-07, + "loss": 0.146, + "step": 11239 + }, + { + "epoch": 0.9588807370755844, + "grad_norm": 2.0013551889065284, + "learning_rate": 4.4276256382594364e-07, + "loss": 0.1334, + "step": 11240 + }, + { + "epoch": 0.9589660467497014, + "grad_norm": 1.8897101971294783, + "learning_rate": 4.409299827797475e-07, + "loss": 0.1547, + "step": 11241 + }, + { + "epoch": 0.9590513564238184, + "grad_norm": 2.269605835626285, + "learning_rate": 4.3910118530519626e-07, + "loss": 0.2226, + "step": 11242 + }, + { + "epoch": 0.9591366660979355, + "grad_norm": 1.9142439152906228, + "learning_rate": 4.372761715419005e-07, + "loss": 0.181, + "step": 11243 + }, + { + "epoch": 0.9592219757720526, + "grad_norm": 1.2629625549504266, + "learning_rate": 4.354549416291931e-07, + "loss": 0.1172, + "step": 11244 + }, + { + "epoch": 0.9593072854461696, + "grad_norm": 1.6551577727380038, + "learning_rate": 4.3363749570611846e-07, + "loss": 0.2064, + "step": 11245 + }, + { + "epoch": 0.9593925951202866, + "grad_norm": 1.6411033296382664, + "learning_rate": 4.318238339114211e-07, + "loss": 0.1875, + "step": 11246 + }, + { + "epoch": 0.9594779047944036, + "grad_norm": 1.694968755653508, + "learning_rate": 4.300139563835681e-07, + "loss": 0.1959, + "step": 11247 + }, + { + "epoch": 0.9595632144685208, + "grad_norm": 1.6754929799223401, + "learning_rate": 4.2820786326074334e-07, + "loss": 0.151, + "step": 11248 + }, + { + "epoch": 0.9596485241426378, + "grad_norm": 2.3952124966065425, + "learning_rate": 4.264055546808143e-07, + "loss": 0.1378, + "step": 11249 + }, + { + "epoch": 0.9597338338167548, + "grad_norm": 1.4645528682544557, + "learning_rate": 4.246070307813932e-07, + "loss": 0.1312, + "step": 11250 + }, + { + "epoch": 0.9598191434908718, + "grad_norm": 1.8633874507044612, + "learning_rate": 4.2281229169977565e-07, + "loss": 0.1324, + "step": 11251 + }, + { + "epoch": 0.959904453164989, + "grad_norm": 1.818935151523885, + "learning_rate": 4.2102133757299103e-07, + "loss": 0.2278, + "step": 11252 + }, + { + "epoch": 0.959989762839106, + "grad_norm": 1.3846397611721952, + "learning_rate": 4.192341685377632e-07, + "loss": 0.1219, + "step": 11253 + }, + { + "epoch": 0.960075072513223, + "grad_norm": 1.6350981853763165, + "learning_rate": 4.1745078473053866e-07, + "loss": 0.1933, + "step": 11254 + }, + { + "epoch": 0.96016038218734, + "grad_norm": 2.1635581850815, + "learning_rate": 4.1567118628746406e-07, + "loss": 0.1957, + "step": 11255 + }, + { + "epoch": 0.9602456918614571, + "grad_norm": 1.770383919927536, + "learning_rate": 4.1389537334440287e-07, + "loss": 0.1166, + "step": 11256 + }, + { + "epoch": 0.9603310015355742, + "grad_norm": 1.5605228885339772, + "learning_rate": 4.1212334603693003e-07, + "loss": 0.1496, + "step": 11257 + }, + { + "epoch": 0.9604163112096912, + "grad_norm": 2.119825795591705, + "learning_rate": 4.103551045003262e-07, + "loss": 0.1356, + "step": 11258 + }, + { + "epoch": 0.9605016208838082, + "grad_norm": 1.8312788925235142, + "learning_rate": 4.085906488695945e-07, + "loss": 0.1307, + "step": 11259 + }, + { + "epoch": 0.9605869305579253, + "grad_norm": 2.2980419490229056, + "learning_rate": 4.068299792794383e-07, + "loss": 0.132, + "step": 11260 + }, + { + "epoch": 0.9606722402320423, + "grad_norm": 1.718948075411795, + "learning_rate": 4.0507309586427787e-07, + "loss": 0.2156, + "step": 11261 + }, + { + "epoch": 0.9607575499061594, + "grad_norm": 1.7683635168109215, + "learning_rate": 4.033199987582337e-07, + "loss": 0.1605, + "step": 11262 + }, + { + "epoch": 0.9608428595802764, + "grad_norm": 1.6128002862998163, + "learning_rate": 4.0157068809515417e-07, + "loss": 0.1552, + "step": 11263 + }, + { + "epoch": 0.9609281692543934, + "grad_norm": 1.9300131580912274, + "learning_rate": 3.998251640085826e-07, + "loss": 0.1945, + "step": 11264 + }, + { + "epoch": 0.9610134789285105, + "grad_norm": 1.7711807209430195, + "learning_rate": 3.980834266317901e-07, + "loss": 0.2419, + "step": 11265 + }, + { + "epoch": 0.9610987886026275, + "grad_norm": 1.739478840992292, + "learning_rate": 3.9634547609774255e-07, + "loss": 0.1884, + "step": 11266 + }, + { + "epoch": 0.9611840982767446, + "grad_norm": 1.917199503045721, + "learning_rate": 3.9461131253912266e-07, + "loss": 0.1938, + "step": 11267 + }, + { + "epoch": 0.9612694079508616, + "grad_norm": 1.926356479244102, + "learning_rate": 3.928809360883301e-07, + "loss": 0.2164, + "step": 11268 + }, + { + "epoch": 0.9613547176249787, + "grad_norm": 1.431454593078201, + "learning_rate": 3.9115434687746477e-07, + "loss": 0.1542, + "step": 11269 + }, + { + "epoch": 0.9614400272990957, + "grad_norm": 1.6036323384081308, + "learning_rate": 3.8943154503834344e-07, + "loss": 0.1722, + "step": 11270 + }, + { + "epoch": 0.9615253369732127, + "grad_norm": 1.7677001236601502, + "learning_rate": 3.8771253070249423e-07, + "loss": 0.1553, + "step": 11271 + }, + { + "epoch": 0.9616106466473298, + "grad_norm": 1.7806182544134606, + "learning_rate": 3.8599730400115107e-07, + "loss": 0.1387, + "step": 11272 + }, + { + "epoch": 0.9616959563214469, + "grad_norm": 1.8657599063320478, + "learning_rate": 3.842858650652648e-07, + "loss": 0.2156, + "step": 11273 + }, + { + "epoch": 0.9617812659955639, + "grad_norm": 1.5599299847956638, + "learning_rate": 3.8257821402549745e-07, + "loss": 0.1469, + "step": 11274 + }, + { + "epoch": 0.9618665756696809, + "grad_norm": 2.371996837999832, + "learning_rate": 3.80874351012217e-07, + "loss": 0.1203, + "step": 11275 + }, + { + "epoch": 0.9619518853437979, + "grad_norm": 1.6164504873455143, + "learning_rate": 3.7917427615550283e-07, + "loss": 0.193, + "step": 11276 + }, + { + "epoch": 0.9620371950179151, + "grad_norm": 2.334654592905826, + "learning_rate": 3.7747798958515103e-07, + "loss": 0.2568, + "step": 11277 + }, + { + "epoch": 0.9621225046920321, + "grad_norm": 1.491823881094132, + "learning_rate": 3.75785491430658e-07, + "loss": 0.1283, + "step": 11278 + }, + { + "epoch": 0.9622078143661491, + "grad_norm": 1.7668262547562081, + "learning_rate": 3.740967818212371e-07, + "loss": 0.1664, + "step": 11279 + }, + { + "epoch": 0.9622931240402661, + "grad_norm": 1.7004723846248082, + "learning_rate": 3.724118608858185e-07, + "loss": 0.1879, + "step": 11280 + }, + { + "epoch": 0.9623784337143833, + "grad_norm": 1.545714280033036, + "learning_rate": 3.7073072875303816e-07, + "loss": 0.1759, + "step": 11281 + }, + { + "epoch": 0.9624637433885003, + "grad_norm": 1.8916449199664465, + "learning_rate": 3.690533855512268e-07, + "loss": 0.1471, + "step": 11282 + }, + { + "epoch": 0.9625490530626173, + "grad_norm": 1.7324774044044127, + "learning_rate": 3.673798314084598e-07, + "loss": 0.1831, + "step": 11283 + }, + { + "epoch": 0.9626343627367343, + "grad_norm": 1.8084645565020132, + "learning_rate": 3.657100664524904e-07, + "loss": 0.1321, + "step": 11284 + }, + { + "epoch": 0.9627196724108514, + "grad_norm": 1.6989494724225822, + "learning_rate": 3.6404409081080004e-07, + "loss": 0.19, + "step": 11285 + }, + { + "epoch": 0.9628049820849685, + "grad_norm": 2.0718197500189546, + "learning_rate": 3.623819046105814e-07, + "loss": 0.1531, + "step": 11286 + }, + { + "epoch": 0.9628902917590855, + "grad_norm": 1.592419386759159, + "learning_rate": 3.60723507978733e-07, + "loss": 0.1093, + "step": 11287 + }, + { + "epoch": 0.9629756014332025, + "grad_norm": 1.7636278723616896, + "learning_rate": 3.590689010418535e-07, + "loss": 0.1898, + "step": 11288 + }, + { + "epoch": 0.9630609111073196, + "grad_norm": 1.8230865689981712, + "learning_rate": 3.5741808392628083e-07, + "loss": 0.25, + "step": 11289 + }, + { + "epoch": 0.9631462207814366, + "grad_norm": 2.2686076558334096, + "learning_rate": 3.5577105675803634e-07, + "loss": 0.196, + "step": 11290 + }, + { + "epoch": 0.9632315304555537, + "grad_norm": 2.2110216104528804, + "learning_rate": 3.541278196628528e-07, + "loss": 0.1634, + "step": 11291 + }, + { + "epoch": 0.9633168401296707, + "grad_norm": 1.6393110554099122, + "learning_rate": 3.5248837276620205e-07, + "loss": 0.1365, + "step": 11292 + }, + { + "epoch": 0.9634021498037878, + "grad_norm": 2.407942337803183, + "learning_rate": 3.50852716193234e-07, + "loss": 0.1958, + "step": 11293 + }, + { + "epoch": 0.9634874594779048, + "grad_norm": 1.5738362872380023, + "learning_rate": 3.492208500688265e-07, + "loss": 0.1705, + "step": 11294 + }, + { + "epoch": 0.9635727691520218, + "grad_norm": 1.72506980754747, + "learning_rate": 3.475927745175578e-07, + "loss": 0.1557, + "step": 11295 + }, + { + "epoch": 0.9636580788261389, + "grad_norm": 1.6463226202702275, + "learning_rate": 3.4596848966373384e-07, + "loss": 0.1445, + "step": 11296 + }, + { + "epoch": 0.963743388500256, + "grad_norm": 1.4354588488105182, + "learning_rate": 3.4434799563135556e-07, + "loss": 0.1745, + "step": 11297 + }, + { + "epoch": 0.963828698174373, + "grad_norm": 1.9198069416776258, + "learning_rate": 3.4273129254413505e-07, + "loss": 0.1442, + "step": 11298 + }, + { + "epoch": 0.96391400784849, + "grad_norm": 1.5957610961394404, + "learning_rate": 3.411183805255014e-07, + "loss": 0.169, + "step": 11299 + }, + { + "epoch": 0.963999317522607, + "grad_norm": 1.3136590653588127, + "learning_rate": 3.3950925969859494e-07, + "loss": 0.1199, + "step": 11300 + }, + { + "epoch": 0.964084627196724, + "grad_norm": 1.632464869239503, + "learning_rate": 3.379039301862619e-07, + "loss": 0.2173, + "step": 11301 + }, + { + "epoch": 0.9641699368708412, + "grad_norm": 1.8140048652844443, + "learning_rate": 3.363023921110542e-07, + "loss": 0.1612, + "step": 11302 + }, + { + "epoch": 0.9642552465449582, + "grad_norm": 1.509305382933698, + "learning_rate": 3.3470464559525184e-07, + "loss": 0.1552, + "step": 11303 + }, + { + "epoch": 0.9643405562190752, + "grad_norm": 2.0777207748676556, + "learning_rate": 3.331106907608239e-07, + "loss": 0.1532, + "step": 11304 + }, + { + "epoch": 0.9644258658931922, + "grad_norm": 1.7704653937365455, + "learning_rate": 3.315205277294675e-07, + "loss": 0.1935, + "step": 11305 + }, + { + "epoch": 0.9645111755673094, + "grad_norm": 1.7540771808890003, + "learning_rate": 3.2993415662258555e-07, + "loss": 0.2271, + "step": 11306 + }, + { + "epoch": 0.9645964852414264, + "grad_norm": 2.3377551913570414, + "learning_rate": 3.283515775612811e-07, + "loss": 0.1881, + "step": 11307 + }, + { + "epoch": 0.9646817949155434, + "grad_norm": 1.7160566554731522, + "learning_rate": 3.2677279066637423e-07, + "loss": 0.1298, + "step": 11308 + }, + { + "epoch": 0.9647671045896604, + "grad_norm": 1.4986992600404854, + "learning_rate": 3.2519779605840184e-07, + "loss": 0.186, + "step": 11309 + }, + { + "epoch": 0.9648524142637775, + "grad_norm": 1.2968791364121253, + "learning_rate": 3.236265938576122e-07, + "loss": 0.122, + "step": 11310 + }, + { + "epoch": 0.9649377239378946, + "grad_norm": 1.3934148171583334, + "learning_rate": 3.220591841839482e-07, + "loss": 0.1424, + "step": 11311 + }, + { + "epoch": 0.9650230336120116, + "grad_norm": 1.3060645870782241, + "learning_rate": 3.2049556715708083e-07, + "loss": 0.1152, + "step": 11312 + }, + { + "epoch": 0.9651083432861286, + "grad_norm": 2.2698325188648365, + "learning_rate": 3.189357428963757e-07, + "loss": 0.2063, + "step": 11313 + }, + { + "epoch": 0.9651936529602457, + "grad_norm": 1.8818986503156365, + "learning_rate": 3.173797115209265e-07, + "loss": 0.2084, + "step": 11314 + }, + { + "epoch": 0.9652789626343627, + "grad_norm": 1.9970399886909285, + "learning_rate": 3.158274731495159e-07, + "loss": 0.2209, + "step": 11315 + }, + { + "epoch": 0.9653642723084798, + "grad_norm": 1.8369639023342303, + "learning_rate": 3.1427902790066575e-07, + "loss": 0.1659, + "step": 11316 + }, + { + "epoch": 0.9654495819825968, + "grad_norm": 2.2269111226524867, + "learning_rate": 3.127343758925705e-07, + "loss": 0.2031, + "step": 11317 + }, + { + "epoch": 0.9655348916567139, + "grad_norm": 1.8650031300945453, + "learning_rate": 3.111935172431746e-07, + "loss": 0.1542, + "step": 11318 + }, + { + "epoch": 0.9656202013308309, + "grad_norm": 1.4620199018655415, + "learning_rate": 3.0965645207011175e-07, + "loss": 0.1241, + "step": 11319 + }, + { + "epoch": 0.965705511004948, + "grad_norm": 1.469415635585919, + "learning_rate": 3.0812318049071586e-07, + "loss": 0.1725, + "step": 11320 + }, + { + "epoch": 0.965790820679065, + "grad_norm": 2.089749970515616, + "learning_rate": 3.0659370262206e-07, + "loss": 0.2035, + "step": 11321 + }, + { + "epoch": 0.9658761303531821, + "grad_norm": 2.0216320334054885, + "learning_rate": 3.0506801858090073e-07, + "loss": 0.1317, + "step": 11322 + }, + { + "epoch": 0.9659614400272991, + "grad_norm": 2.095206366232704, + "learning_rate": 3.0354612848372265e-07, + "loss": 0.1444, + "step": 11323 + }, + { + "epoch": 0.9660467497014161, + "grad_norm": 1.7894044907378484, + "learning_rate": 3.020280324467051e-07, + "loss": 0.1302, + "step": 11324 + }, + { + "epoch": 0.9661320593755331, + "grad_norm": 1.7388712021182455, + "learning_rate": 3.0051373058576083e-07, + "loss": 0.1709, + "step": 11325 + }, + { + "epoch": 0.9662173690496503, + "grad_norm": 2.2331644260055414, + "learning_rate": 2.990032230164863e-07, + "loss": 0.1417, + "step": 11326 + }, + { + "epoch": 0.9663026787237673, + "grad_norm": 1.7653647036809599, + "learning_rate": 2.9749650985420043e-07, + "loss": 0.1566, + "step": 11327 + }, + { + "epoch": 0.9663879883978843, + "grad_norm": 2.249819854306352, + "learning_rate": 2.9599359121393887e-07, + "loss": 0.1571, + "step": 11328 + }, + { + "epoch": 0.9664732980720013, + "grad_norm": 2.9311244848256504, + "learning_rate": 2.944944672104488e-07, + "loss": 0.1685, + "step": 11329 + }, + { + "epoch": 0.9665586077461185, + "grad_norm": 1.631568544351414, + "learning_rate": 2.929991379581609e-07, + "loss": 0.1515, + "step": 11330 + }, + { + "epoch": 0.9666439174202355, + "grad_norm": 1.6909576596419142, + "learning_rate": 2.915076035712505e-07, + "loss": 0.194, + "step": 11331 + }, + { + "epoch": 0.9667292270943525, + "grad_norm": 1.5944995987258554, + "learning_rate": 2.900198641635876e-07, + "loss": 0.1756, + "step": 11332 + }, + { + "epoch": 0.9668145367684695, + "grad_norm": 1.4587858133896572, + "learning_rate": 2.885359198487425e-07, + "loss": 0.198, + "step": 11333 + }, + { + "epoch": 0.9668998464425866, + "grad_norm": 1.5187096421644821, + "learning_rate": 2.87055770740019e-07, + "loss": 0.1987, + "step": 11334 + }, + { + "epoch": 0.9669851561167037, + "grad_norm": 1.5235090190043343, + "learning_rate": 2.8557941695041003e-07, + "loss": 0.203, + "step": 11335 + }, + { + "epoch": 0.9670704657908207, + "grad_norm": 2.136194028707058, + "learning_rate": 2.84106858592631e-07, + "loss": 0.1741, + "step": 11336 + }, + { + "epoch": 0.9671557754649377, + "grad_norm": 2.0866540256803914, + "learning_rate": 2.826380957790975e-07, + "loss": 0.1899, + "step": 11337 + }, + { + "epoch": 0.9672410851390548, + "grad_norm": 2.2283741858462633, + "learning_rate": 2.8117312862195876e-07, + "loss": 0.1339, + "step": 11338 + }, + { + "epoch": 0.9673263948131718, + "grad_norm": 2.193630891267166, + "learning_rate": 2.7971195723303646e-07, + "loss": 0.1557, + "step": 11339 + }, + { + "epoch": 0.9674117044872889, + "grad_norm": 1.6029061240528857, + "learning_rate": 2.7825458172389684e-07, + "loss": 0.1312, + "step": 11340 + }, + { + "epoch": 0.9674970141614059, + "grad_norm": 1.1727335010474738, + "learning_rate": 2.7680100220580097e-07, + "loss": 0.1101, + "step": 11341 + }, + { + "epoch": 0.9675823238355229, + "grad_norm": 1.8105861161638819, + "learning_rate": 2.753512187897211e-07, + "loss": 0.2422, + "step": 11342 + }, + { + "epoch": 0.96766763350964, + "grad_norm": 1.5330602883057, + "learning_rate": 2.7390523158633554e-07, + "loss": 0.2332, + "step": 11343 + }, + { + "epoch": 0.967752943183757, + "grad_norm": 1.760142769658211, + "learning_rate": 2.7246304070603913e-07, + "loss": 0.2003, + "step": 11344 + }, + { + "epoch": 0.9678382528578741, + "grad_norm": 1.8620043844013652, + "learning_rate": 2.7102464625894387e-07, + "loss": 0.1953, + "step": 11345 + }, + { + "epoch": 0.9679235625319911, + "grad_norm": 1.797489625054608, + "learning_rate": 2.695900483548508e-07, + "loss": 0.2237, + "step": 11346 + }, + { + "epoch": 0.9680088722061082, + "grad_norm": 1.4432242488972324, + "learning_rate": 2.6815924710329456e-07, + "loss": 0.1386, + "step": 11347 + }, + { + "epoch": 0.9680941818802252, + "grad_norm": 1.8918511694222309, + "learning_rate": 2.6673224261350436e-07, + "loss": 0.1623, + "step": 11348 + }, + { + "epoch": 0.9681794915543422, + "grad_norm": 1.8555845204425756, + "learning_rate": 2.653090349944265e-07, + "loss": 0.205, + "step": 11349 + }, + { + "epoch": 0.9682648012284593, + "grad_norm": 2.3708222436450006, + "learning_rate": 2.638896243547073e-07, + "loss": 0.1914, + "step": 11350 + }, + { + "epoch": 0.9683501109025764, + "grad_norm": 1.5176276121576562, + "learning_rate": 2.624740108027268e-07, + "loss": 0.1212, + "step": 11351 + }, + { + "epoch": 0.9684354205766934, + "grad_norm": 1.4779144233892363, + "learning_rate": 2.61062194446543e-07, + "loss": 0.1484, + "step": 11352 + }, + { + "epoch": 0.9685207302508104, + "grad_norm": 2.0577073333213574, + "learning_rate": 2.59654175393953e-07, + "loss": 0.2263, + "step": 11353 + }, + { + "epoch": 0.9686060399249274, + "grad_norm": 2.600914676872748, + "learning_rate": 2.5824995375244855e-07, + "loss": 0.2035, + "step": 11354 + }, + { + "epoch": 0.9686913495990446, + "grad_norm": 1.7742201636714838, + "learning_rate": 2.568495296292273e-07, + "loss": 0.1485, + "step": 11355 + }, + { + "epoch": 0.9687766592731616, + "grad_norm": 2.050521049341241, + "learning_rate": 2.5545290313121474e-07, + "loss": 0.1364, + "step": 11356 + }, + { + "epoch": 0.9688619689472786, + "grad_norm": 1.518268442927277, + "learning_rate": 2.5406007436502566e-07, + "loss": 0.0959, + "step": 11357 + }, + { + "epoch": 0.9689472786213956, + "grad_norm": 1.6441120691735698, + "learning_rate": 2.526710434370083e-07, + "loss": 0.1395, + "step": 11358 + }, + { + "epoch": 0.9690325882955128, + "grad_norm": 1.8291348007774832, + "learning_rate": 2.51285810453189e-07, + "loss": 0.1575, + "step": 11359 + }, + { + "epoch": 0.9691178979696298, + "grad_norm": 1.2847582539955589, + "learning_rate": 2.4990437551933863e-07, + "loss": 0.1298, + "step": 11360 + }, + { + "epoch": 0.9692032076437468, + "grad_norm": 2.020326085351051, + "learning_rate": 2.485267387409229e-07, + "loss": 0.2509, + "step": 11361 + }, + { + "epoch": 0.9692885173178638, + "grad_norm": 1.6453501639986579, + "learning_rate": 2.471529002231021e-07, + "loss": 0.1027, + "step": 11362 + }, + { + "epoch": 0.9693738269919809, + "grad_norm": 2.1478069697974487, + "learning_rate": 2.457828600707812e-07, + "loss": 0.1189, + "step": 11363 + }, + { + "epoch": 0.969459136666098, + "grad_norm": 3.6382480442070957, + "learning_rate": 2.444166183885377e-07, + "loss": 0.3038, + "step": 11364 + }, + { + "epoch": 0.969544446340215, + "grad_norm": 1.7493520627114327, + "learning_rate": 2.430541752806881e-07, + "loss": 0.2084, + "step": 11365 + }, + { + "epoch": 0.969629756014332, + "grad_norm": 2.571522436623746, + "learning_rate": 2.416955308512381e-07, + "loss": 0.163, + "step": 11366 + }, + { + "epoch": 0.9697150656884491, + "grad_norm": 1.5300191486631398, + "learning_rate": 2.4034068520392693e-07, + "loss": 0.1594, + "step": 11367 + }, + { + "epoch": 0.9698003753625661, + "grad_norm": 1.8022470859224105, + "learning_rate": 2.3898963844217746e-07, + "loss": 0.1444, + "step": 11368 + }, + { + "epoch": 0.9698856850366832, + "grad_norm": 1.9812691480030202, + "learning_rate": 2.376423906691405e-07, + "loss": 0.1816, + "step": 11369 + }, + { + "epoch": 0.9699709947108002, + "grad_norm": 1.9426441199059659, + "learning_rate": 2.362989419876671e-07, + "loss": 0.2431, + "step": 11370 + }, + { + "epoch": 0.9700563043849173, + "grad_norm": 1.5545089849222813, + "learning_rate": 2.3495929250033078e-07, + "loss": 0.1796, + "step": 11371 + }, + { + "epoch": 0.9701416140590343, + "grad_norm": 2.9225316253334572, + "learning_rate": 2.336234423093997e-07, + "loss": 0.1831, + "step": 11372 + }, + { + "epoch": 0.9702269237331513, + "grad_norm": 1.93288252880192, + "learning_rate": 2.3229139151685896e-07, + "loss": 0.179, + "step": 11373 + }, + { + "epoch": 0.9703122334072684, + "grad_norm": 1.5457155139919223, + "learning_rate": 2.3096314022440503e-07, + "loss": 0.2155, + "step": 11374 + }, + { + "epoch": 0.9703975430813855, + "grad_norm": 1.740128947445678, + "learning_rate": 2.2963868853344562e-07, + "loss": 0.1169, + "step": 11375 + }, + { + "epoch": 0.9704828527555025, + "grad_norm": 2.009047785052539, + "learning_rate": 2.2831803654508877e-07, + "loss": 0.1708, + "step": 11376 + }, + { + "epoch": 0.9705681624296195, + "grad_norm": 2.0126551550533542, + "learning_rate": 2.2700118436016494e-07, + "loss": 0.1995, + "step": 11377 + }, + { + "epoch": 0.9706534721037365, + "grad_norm": 1.7963631605846884, + "learning_rate": 2.2568813207921037e-07, + "loss": 0.1931, + "step": 11378 + }, + { + "epoch": 0.9707387817778536, + "grad_norm": 1.6304083103297573, + "learning_rate": 2.2437887980246153e-07, + "loss": 0.2242, + "step": 11379 + }, + { + "epoch": 0.9708240914519707, + "grad_norm": 1.9979648368154532, + "learning_rate": 2.2307342762988294e-07, + "loss": 0.1614, + "step": 11380 + }, + { + "epoch": 0.9709094011260877, + "grad_norm": 1.7279697396754756, + "learning_rate": 2.217717756611337e-07, + "loss": 0.2402, + "step": 11381 + }, + { + "epoch": 0.9709947108002047, + "grad_norm": 2.051359200555122, + "learning_rate": 2.2047392399558443e-07, + "loss": 0.1824, + "step": 11382 + }, + { + "epoch": 0.9710800204743217, + "grad_norm": 1.6463733785011765, + "learning_rate": 2.1917987273232245e-07, + "loss": 0.1513, + "step": 11383 + }, + { + "epoch": 0.9711653301484389, + "grad_norm": 1.6231069542223553, + "learning_rate": 2.1788962197014652e-07, + "loss": 0.1657, + "step": 11384 + }, + { + "epoch": 0.9712506398225559, + "grad_norm": 1.4152848666663178, + "learning_rate": 2.1660317180755564e-07, + "loss": 0.1564, + "step": 11385 + }, + { + "epoch": 0.9713359494966729, + "grad_norm": 1.8620922845656747, + "learning_rate": 2.1532052234276013e-07, + "loss": 0.186, + "step": 11386 + }, + { + "epoch": 0.9714212591707899, + "grad_norm": 1.5981771277562915, + "learning_rate": 2.1404167367368721e-07, + "loss": 0.1178, + "step": 11387 + }, + { + "epoch": 0.971506568844907, + "grad_norm": 2.2431746911480888, + "learning_rate": 2.1276662589797547e-07, + "loss": 0.1653, + "step": 11388 + }, + { + "epoch": 0.9715918785190241, + "grad_norm": 1.7591398525799644, + "learning_rate": 2.1149537911295814e-07, + "loss": 0.2155, + "step": 11389 + }, + { + "epoch": 0.9716771881931411, + "grad_norm": 2.780462775099386, + "learning_rate": 2.102279334156909e-07, + "loss": 0.1881, + "step": 11390 + }, + { + "epoch": 0.9717624978672581, + "grad_norm": 1.711117251634245, + "learning_rate": 2.089642889029464e-07, + "loss": 0.1881, + "step": 11391 + }, + { + "epoch": 0.9718478075413752, + "grad_norm": 1.8852284866196152, + "learning_rate": 2.0770444567118075e-07, + "loss": 0.1207, + "step": 11392 + }, + { + "epoch": 0.9719331172154922, + "grad_norm": 2.277282923414468, + "learning_rate": 2.0644840381658926e-07, + "loss": 0.2087, + "step": 11393 + }, + { + "epoch": 0.9720184268896093, + "grad_norm": 2.022650370436426, + "learning_rate": 2.0519616343505633e-07, + "loss": 0.1859, + "step": 11394 + }, + { + "epoch": 0.9721037365637263, + "grad_norm": 2.06236844654731, + "learning_rate": 2.0394772462218882e-07, + "loss": 0.1613, + "step": 11395 + }, + { + "epoch": 0.9721890462378434, + "grad_norm": 1.6392409526603788, + "learning_rate": 2.0270308747329936e-07, + "loss": 0.1061, + "step": 11396 + }, + { + "epoch": 0.9722743559119604, + "grad_norm": 1.780547823453564, + "learning_rate": 2.014622520834064e-07, + "loss": 0.2326, + "step": 11397 + }, + { + "epoch": 0.9723596655860774, + "grad_norm": 1.4285256088946898, + "learning_rate": 2.0022521854723975e-07, + "loss": 0.2074, + "step": 11398 + }, + { + "epoch": 0.9724449752601945, + "grad_norm": 1.7637285157969473, + "learning_rate": 1.9899198695924048e-07, + "loss": 0.1865, + "step": 11399 + }, + { + "epoch": 0.9725302849343116, + "grad_norm": 1.4911081294885709, + "learning_rate": 1.977625574135611e-07, + "loss": 0.0873, + "step": 11400 + }, + { + "epoch": 0.9726155946084286, + "grad_norm": 1.867127485388155, + "learning_rate": 1.9653693000405982e-07, + "loss": 0.1585, + "step": 11401 + }, + { + "epoch": 0.9727009042825456, + "grad_norm": 1.719432487233947, + "learning_rate": 1.9531510482431182e-07, + "loss": 0.2221, + "step": 11402 + }, + { + "epoch": 0.9727862139566626, + "grad_norm": 1.8743754936337178, + "learning_rate": 1.9409708196759247e-07, + "loss": 0.1572, + "step": 11403 + }, + { + "epoch": 0.9728715236307798, + "grad_norm": 1.4902246957367098, + "learning_rate": 1.9288286152689406e-07, + "loss": 0.1915, + "step": 11404 + }, + { + "epoch": 0.9729568333048968, + "grad_norm": 1.9929807392688363, + "learning_rate": 1.9167244359491467e-07, + "loss": 0.0915, + "step": 11405 + }, + { + "epoch": 0.9730421429790138, + "grad_norm": 2.019770418094364, + "learning_rate": 1.9046582826406368e-07, + "loss": 0.1596, + "step": 11406 + }, + { + "epoch": 0.9731274526531308, + "grad_norm": 1.8482501827463322, + "learning_rate": 1.8926301562645632e-07, + "loss": 0.2454, + "step": 11407 + }, + { + "epoch": 0.973212762327248, + "grad_norm": 1.3659583533795676, + "learning_rate": 1.880640057739247e-07, + "loss": 0.1707, + "step": 11408 + }, + { + "epoch": 0.973298072001365, + "grad_norm": 1.6358506260698096, + "learning_rate": 1.8686879879800667e-07, + "loss": 0.1277, + "step": 11409 + }, + { + "epoch": 0.973383381675482, + "grad_norm": 2.169700247281965, + "learning_rate": 1.8567739478994595e-07, + "loss": 0.2012, + "step": 11410 + }, + { + "epoch": 0.973468691349599, + "grad_norm": 1.4510540715454348, + "learning_rate": 1.8448979384070863e-07, + "loss": 0.1628, + "step": 11411 + }, + { + "epoch": 0.9735540010237161, + "grad_norm": 1.6232683785751005, + "learning_rate": 1.8330599604095e-07, + "loss": 0.1974, + "step": 11412 + }, + { + "epoch": 0.9736393106978332, + "grad_norm": 1.7716181493212342, + "learning_rate": 1.8212600148105884e-07, + "loss": 0.1138, + "step": 11413 + }, + { + "epoch": 0.9737246203719502, + "grad_norm": 1.4498161462441217, + "learning_rate": 1.8094981025110756e-07, + "loss": 0.1401, + "step": 11414 + }, + { + "epoch": 0.9738099300460672, + "grad_norm": 1.7599101795771974, + "learning_rate": 1.7977742244090768e-07, + "loss": 0.1802, + "step": 11415 + }, + { + "epoch": 0.9738952397201842, + "grad_norm": 1.7363643749731164, + "learning_rate": 1.7860883813995976e-07, + "loss": 0.2101, + "step": 11416 + }, + { + "epoch": 0.9739805493943013, + "grad_norm": 1.6365557386846112, + "learning_rate": 1.774440574374703e-07, + "loss": 0.1135, + "step": 11417 + }, + { + "epoch": 0.9740658590684184, + "grad_norm": 1.8500642791097202, + "learning_rate": 1.7628308042236807e-07, + "loss": 0.1999, + "step": 11418 + }, + { + "epoch": 0.9741511687425354, + "grad_norm": 2.1206598277089133, + "learning_rate": 1.7512590718329336e-07, + "loss": 0.162, + "step": 11419 + }, + { + "epoch": 0.9742364784166524, + "grad_norm": 1.6588103459756982, + "learning_rate": 1.7397253780858658e-07, + "loss": 0.1939, + "step": 11420 + }, + { + "epoch": 0.9743217880907695, + "grad_norm": 1.4117934004062362, + "learning_rate": 1.7282297238629953e-07, + "loss": 0.1682, + "step": 11421 + }, + { + "epoch": 0.9744070977648865, + "grad_norm": 1.533196671587732, + "learning_rate": 1.7167721100420087e-07, + "loss": 0.1579, + "step": 11422 + }, + { + "epoch": 0.9744924074390036, + "grad_norm": 1.533437917432151, + "learning_rate": 1.7053525374975953e-07, + "loss": 0.1424, + "step": 11423 + }, + { + "epoch": 0.9745777171131206, + "grad_norm": 1.3991824249204163, + "learning_rate": 1.693971007101558e-07, + "loss": 0.1804, + "step": 11424 + }, + { + "epoch": 0.9746630267872377, + "grad_norm": 1.4324156366508383, + "learning_rate": 1.682627519722868e-07, + "loss": 0.1625, + "step": 11425 + }, + { + "epoch": 0.9747483364613547, + "grad_norm": 1.33933001308748, + "learning_rate": 1.671322076227444e-07, + "loss": 0.1592, + "step": 11426 + }, + { + "epoch": 0.9748336461354717, + "grad_norm": 1.940650716172015, + "learning_rate": 1.6600546774785398e-07, + "loss": 0.1375, + "step": 11427 + }, + { + "epoch": 0.9749189558095888, + "grad_norm": 1.4528674646041186, + "learning_rate": 1.648825324336245e-07, + "loss": 0.1515, + "step": 11428 + }, + { + "epoch": 0.9750042654837059, + "grad_norm": 1.706204631048537, + "learning_rate": 1.6376340176579297e-07, + "loss": 0.146, + "step": 11429 + }, + { + "epoch": 0.9750895751578229, + "grad_norm": 1.898935637248625, + "learning_rate": 1.6264807582979103e-07, + "loss": 0.1971, + "step": 11430 + }, + { + "epoch": 0.9751748848319399, + "grad_norm": 1.7726406374126535, + "learning_rate": 1.6153655471077832e-07, + "loss": 0.1143, + "step": 11431 + }, + { + "epoch": 0.9752601945060569, + "grad_norm": 1.5614884726287888, + "learning_rate": 1.604288384936037e-07, + "loss": 0.1278, + "step": 11432 + }, + { + "epoch": 0.9753455041801741, + "grad_norm": 1.8245959004366246, + "learning_rate": 1.5932492726284386e-07, + "loss": 0.2335, + "step": 11433 + }, + { + "epoch": 0.9754308138542911, + "grad_norm": 1.7316492329595856, + "learning_rate": 1.5822482110277036e-07, + "loss": 0.1481, + "step": 11434 + }, + { + "epoch": 0.9755161235284081, + "grad_norm": 2.2988611055275445, + "learning_rate": 1.5712852009737711e-07, + "loss": 0.2111, + "step": 11435 + }, + { + "epoch": 0.9756014332025251, + "grad_norm": 2.0033060405647176, + "learning_rate": 1.5603602433035269e-07, + "loss": 0.1313, + "step": 11436 + }, + { + "epoch": 0.9756867428766423, + "grad_norm": 1.7201156739217112, + "learning_rate": 1.5494733388510817e-07, + "loss": 0.1673, + "step": 11437 + }, + { + "epoch": 0.9757720525507593, + "grad_norm": 1.817136687390803, + "learning_rate": 1.5386244884476043e-07, + "loss": 0.1541, + "step": 11438 + }, + { + "epoch": 0.9758573622248763, + "grad_norm": 1.5640460185917502, + "learning_rate": 1.527813692921265e-07, + "loss": 0.1165, + "step": 11439 + }, + { + "epoch": 0.9759426718989933, + "grad_norm": 1.751611648673111, + "learning_rate": 1.517040953097515e-07, + "loss": 0.1695, + "step": 11440 + }, + { + "epoch": 0.9760279815731104, + "grad_norm": 1.9408952435776534, + "learning_rate": 1.5063062697987518e-07, + "loss": 0.1743, + "step": 11441 + }, + { + "epoch": 0.9761132912472275, + "grad_norm": 2.147967255887981, + "learning_rate": 1.4956096438445423e-07, + "loss": 0.133, + "step": 11442 + }, + { + "epoch": 0.9761986009213445, + "grad_norm": 1.5609537484637845, + "learning_rate": 1.4849510760513995e-07, + "loss": 0.1601, + "step": 11443 + }, + { + "epoch": 0.9762839105954615, + "grad_norm": 1.5650491900528738, + "learning_rate": 1.4743305672332287e-07, + "loss": 0.1562, + "step": 11444 + }, + { + "epoch": 0.9763692202695786, + "grad_norm": 1.4754596608205248, + "learning_rate": 1.463748118200714e-07, + "loss": 0.1619, + "step": 11445 + }, + { + "epoch": 0.9764545299436956, + "grad_norm": 2.4138254586292196, + "learning_rate": 1.4532037297618205e-07, + "loss": 0.1699, + "step": 11446 + }, + { + "epoch": 0.9765398396178127, + "grad_norm": 1.507408760055267, + "learning_rate": 1.4426974027215713e-07, + "loss": 0.0899, + "step": 11447 + }, + { + "epoch": 0.9766251492919297, + "grad_norm": 2.6729961519224337, + "learning_rate": 1.4322291378819908e-07, + "loss": 0.2042, + "step": 11448 + }, + { + "epoch": 0.9767104589660468, + "grad_norm": 1.3725160356707014, + "learning_rate": 1.4217989360423845e-07, + "loss": 0.1463, + "step": 11449 + }, + { + "epoch": 0.9767957686401638, + "grad_norm": 1.5737492545128375, + "learning_rate": 1.4114067979989488e-07, + "loss": 0.1622, + "step": 11450 + }, + { + "epoch": 0.9768810783142808, + "grad_norm": 2.0715445828954153, + "learning_rate": 1.4010527245451045e-07, + "loss": 0.147, + "step": 11451 + }, + { + "epoch": 0.9769663879883979, + "grad_norm": 1.7639975009563156, + "learning_rate": 1.3907367164713303e-07, + "loss": 0.2406, + "step": 11452 + }, + { + "epoch": 0.977051697662515, + "grad_norm": 1.3049040289188782, + "learning_rate": 1.3804587745652187e-07, + "loss": 0.1332, + "step": 11453 + }, + { + "epoch": 0.977137007336632, + "grad_norm": 2.467341443719584, + "learning_rate": 1.3702188996114196e-07, + "loss": 0.1479, + "step": 11454 + }, + { + "epoch": 0.977222317010749, + "grad_norm": 1.9512790959771333, + "learning_rate": 1.3600170923916966e-07, + "loss": 0.2226, + "step": 11455 + }, + { + "epoch": 0.977307626684866, + "grad_norm": 2.1812245408460615, + "learning_rate": 1.349853353684871e-07, + "loss": 0.1668, + "step": 11456 + }, + { + "epoch": 0.977392936358983, + "grad_norm": 1.4734927768795596, + "learning_rate": 1.3397276842669892e-07, + "loss": 0.1478, + "step": 11457 + }, + { + "epoch": 0.9774782460331002, + "grad_norm": 1.7135625758726392, + "learning_rate": 1.3296400849109324e-07, + "loss": 0.1644, + "step": 11458 + }, + { + "epoch": 0.9775635557072172, + "grad_norm": 2.2728180451038975, + "learning_rate": 1.3195905563869737e-07, + "loss": 0.1811, + "step": 11459 + }, + { + "epoch": 0.9776488653813342, + "grad_norm": 1.6749493150017127, + "learning_rate": 1.309579099462277e-07, + "loss": 0.1464, + "step": 11460 + }, + { + "epoch": 0.9777341750554512, + "grad_norm": 1.3181455085501053, + "learning_rate": 1.2996057149011752e-07, + "loss": 0.1309, + "step": 11461 + }, + { + "epoch": 0.9778194847295684, + "grad_norm": 1.7107577294790073, + "learning_rate": 1.2896704034651152e-07, + "loss": 0.2076, + "step": 11462 + }, + { + "epoch": 0.9779047944036854, + "grad_norm": 1.8573768201110905, + "learning_rate": 1.27977316591249e-07, + "loss": 0.1585, + "step": 11463 + }, + { + "epoch": 0.9779901040778024, + "grad_norm": 1.8965856994527435, + "learning_rate": 1.2699140029990842e-07, + "loss": 0.1509, + "step": 11464 + }, + { + "epoch": 0.9780754137519194, + "grad_norm": 1.8243216050617468, + "learning_rate": 1.2600929154774621e-07, + "loss": 0.1404, + "step": 11465 + }, + { + "epoch": 0.9781607234260365, + "grad_norm": 1.8126419274897347, + "learning_rate": 1.250309904097413e-07, + "loss": 0.1656, + "step": 11466 + }, + { + "epoch": 0.9782460331001536, + "grad_norm": 1.9100822325175553, + "learning_rate": 1.2405649696058953e-07, + "loss": 0.1541, + "step": 11467 + }, + { + "epoch": 0.9783313427742706, + "grad_norm": 2.152035782895874, + "learning_rate": 1.2308581127468132e-07, + "loss": 0.1441, + "step": 11468 + }, + { + "epoch": 0.9784166524483876, + "grad_norm": 1.7346184362078532, + "learning_rate": 1.2211893342612968e-07, + "loss": 0.2017, + "step": 11469 + }, + { + "epoch": 0.9785019621225047, + "grad_norm": 1.5382823225320823, + "learning_rate": 1.211558634887422e-07, + "loss": 0.2284, + "step": 11470 + }, + { + "epoch": 0.9785872717966217, + "grad_norm": 1.538717163380137, + "learning_rate": 1.2019660153604894e-07, + "loss": 0.1715, + "step": 11471 + }, + { + "epoch": 0.9786725814707388, + "grad_norm": 2.1833066802788257, + "learning_rate": 1.192411476412858e-07, + "loss": 0.1614, + "step": 11472 + }, + { + "epoch": 0.9787578911448558, + "grad_norm": 2.046084884851571, + "learning_rate": 1.182895018773944e-07, + "loss": 0.1594, + "step": 11473 + }, + { + "epoch": 0.9788432008189729, + "grad_norm": 1.7624732049298517, + "learning_rate": 1.1734166431702776e-07, + "loss": 0.162, + "step": 11474 + }, + { + "epoch": 0.9789285104930899, + "grad_norm": 2.1251369880951594, + "learning_rate": 1.1639763503255019e-07, + "loss": 0.1842, + "step": 11475 + }, + { + "epoch": 0.979013820167207, + "grad_norm": 2.0776577761308035, + "learning_rate": 1.1545741409603184e-07, + "loss": 0.2055, + "step": 11476 + }, + { + "epoch": 0.979099129841324, + "grad_norm": 1.9634105392370589, + "learning_rate": 1.1452100157925416e-07, + "loss": 0.1259, + "step": 11477 + }, + { + "epoch": 0.9791844395154411, + "grad_norm": 1.905153365582354, + "learning_rate": 1.1358839755370443e-07, + "loss": 0.1898, + "step": 11478 + }, + { + "epoch": 0.9792697491895581, + "grad_norm": 2.1539180142341157, + "learning_rate": 1.1265960209058679e-07, + "loss": 0.1259, + "step": 11479 + }, + { + "epoch": 0.9793550588636751, + "grad_norm": 1.7203987536518306, + "learning_rate": 1.1173461526080565e-07, + "loss": 0.188, + "step": 11480 + }, + { + "epoch": 0.9794403685377921, + "grad_norm": 1.9986968563379381, + "learning_rate": 1.1081343713498227e-07, + "loss": 0.2157, + "step": 11481 + }, + { + "epoch": 0.9795256782119093, + "grad_norm": 1.807247575410775, + "learning_rate": 1.0989606778344375e-07, + "loss": 0.2114, + "step": 11482 + }, + { + "epoch": 0.9796109878860263, + "grad_norm": 2.0419232992176015, + "learning_rate": 1.0898250727622294e-07, + "loss": 0.1816, + "step": 11483 + }, + { + "epoch": 0.9796962975601433, + "grad_norm": 1.7975341914589862, + "learning_rate": 1.0807275568306407e-07, + "loss": 0.1706, + "step": 11484 + }, + { + "epoch": 0.9797816072342603, + "grad_norm": 2.1060852393844316, + "learning_rate": 1.0716681307342825e-07, + "loss": 0.1499, + "step": 11485 + }, + { + "epoch": 0.9798669169083775, + "grad_norm": 1.2437447916813207, + "learning_rate": 1.0626467951647678e-07, + "loss": 0.113, + "step": 11486 + }, + { + "epoch": 0.9799522265824945, + "grad_norm": 2.1938451023626078, + "learning_rate": 1.0536635508107684e-07, + "loss": 0.1387, + "step": 11487 + }, + { + "epoch": 0.9800375362566115, + "grad_norm": 1.7235408942160422, + "learning_rate": 1.0447183983582353e-07, + "loss": 0.114, + "step": 11488 + }, + { + "epoch": 0.9801228459307285, + "grad_norm": 1.9303166525995104, + "learning_rate": 1.0358113384899559e-07, + "loss": 0.1617, + "step": 11489 + }, + { + "epoch": 0.9802081556048456, + "grad_norm": 1.951156784167274, + "learning_rate": 1.0269423718859971e-07, + "loss": 0.1437, + "step": 11490 + }, + { + "epoch": 0.9802934652789627, + "grad_norm": 1.8460521872511648, + "learning_rate": 1.0181114992234287e-07, + "loss": 0.1871, + "step": 11491 + }, + { + "epoch": 0.9803787749530797, + "grad_norm": 1.5359536155561517, + "learning_rate": 1.0093187211764887e-07, + "loss": 0.1702, + "step": 11492 + }, + { + "epoch": 0.9804640846271967, + "grad_norm": 1.8832766645339774, + "learning_rate": 1.0005640384164738e-07, + "loss": 0.2384, + "step": 11493 + }, + { + "epoch": 0.9805493943013137, + "grad_norm": 1.8489566283598726, + "learning_rate": 9.918474516116272e-08, + "loss": 0.1659, + "step": 11494 + }, + { + "epoch": 0.9806347039754308, + "grad_norm": 1.9911462792959735, + "learning_rate": 9.831689614275275e-08, + "loss": 0.1726, + "step": 11495 + }, + { + "epoch": 0.9807200136495479, + "grad_norm": 2.060545139402409, + "learning_rate": 9.745285685267558e-08, + "loss": 0.1768, + "step": 11496 + }, + { + "epoch": 0.9808053233236649, + "grad_norm": 1.7638878167350227, + "learning_rate": 9.659262735688401e-08, + "loss": 0.1722, + "step": 11497 + }, + { + "epoch": 0.9808906329977819, + "grad_norm": 1.6782840987215868, + "learning_rate": 9.573620772106439e-08, + "loss": 0.1676, + "step": 11498 + }, + { + "epoch": 0.980975942671899, + "grad_norm": 1.9893035958311438, + "learning_rate": 9.488359801059222e-08, + "loss": 0.1549, + "step": 11499 + }, + { + "epoch": 0.981061252346016, + "grad_norm": 1.4396642066921006, + "learning_rate": 9.403479829055983e-08, + "loss": 0.1512, + "step": 11500 + }, + { + "epoch": 0.9811465620201331, + "grad_norm": 2.1363922371785242, + "learning_rate": 9.318980862577098e-08, + "loss": 0.1715, + "step": 11501 + }, + { + "epoch": 0.9812318716942501, + "grad_norm": 1.7653373551471763, + "learning_rate": 9.234862908074071e-08, + "loss": 0.2269, + "step": 11502 + }, + { + "epoch": 0.9813171813683672, + "grad_norm": 1.5498805276910756, + "learning_rate": 9.151125971967878e-08, + "loss": 0.1729, + "step": 11503 + }, + { + "epoch": 0.9814024910424842, + "grad_norm": 1.6625945731203058, + "learning_rate": 9.067770060651737e-08, + "loss": 0.2036, + "step": 11504 + }, + { + "epoch": 0.9814878007166012, + "grad_norm": 2.375889661699447, + "learning_rate": 8.984795180490003e-08, + "loss": 0.2257, + "step": 11505 + }, + { + "epoch": 0.9815731103907183, + "grad_norm": 1.9426799564146997, + "learning_rate": 8.902201337816496e-08, + "loss": 0.2286, + "step": 11506 + }, + { + "epoch": 0.9816584200648354, + "grad_norm": 1.6351726212208255, + "learning_rate": 8.81998853893784e-08, + "loss": 0.1715, + "step": 11507 + }, + { + "epoch": 0.9817437297389524, + "grad_norm": 1.9371725236338713, + "learning_rate": 8.73815679012957e-08, + "loss": 0.138, + "step": 11508 + }, + { + "epoch": 0.9818290394130694, + "grad_norm": 1.5722330978535612, + "learning_rate": 8.656706097639467e-08, + "loss": 0.1369, + "step": 11509 + }, + { + "epoch": 0.9819143490871864, + "grad_norm": 1.4580328313848905, + "learning_rate": 8.575636467685888e-08, + "loss": 0.1775, + "step": 11510 + }, + { + "epoch": 0.9819996587613036, + "grad_norm": 1.8453115542390808, + "learning_rate": 8.494947906458328e-08, + "loss": 0.1542, + "step": 11511 + }, + { + "epoch": 0.9820849684354206, + "grad_norm": 1.9906508559339864, + "learning_rate": 8.414640420116305e-08, + "loss": 0.1556, + "step": 11512 + }, + { + "epoch": 0.9821702781095376, + "grad_norm": 1.954559165833913, + "learning_rate": 8.334714014791578e-08, + "loss": 0.16, + "step": 11513 + }, + { + "epoch": 0.9822555877836546, + "grad_norm": 1.4782132402659334, + "learning_rate": 8.25516869658538e-08, + "loss": 0.1741, + "step": 11514 + }, + { + "epoch": 0.9823408974577718, + "grad_norm": 1.5207759201848452, + "learning_rate": 8.176004471571186e-08, + "loss": 0.1709, + "step": 11515 + }, + { + "epoch": 0.9824262071318888, + "grad_norm": 2.0802263233958636, + "learning_rate": 8.097221345792493e-08, + "loss": 0.182, + "step": 11516 + }, + { + "epoch": 0.9825115168060058, + "grad_norm": 1.3774107693331628, + "learning_rate": 8.018819325263937e-08, + "loss": 0.187, + "step": 11517 + }, + { + "epoch": 0.9825968264801228, + "grad_norm": 2.036243111895578, + "learning_rate": 7.940798415971284e-08, + "loss": 0.2021, + "step": 11518 + }, + { + "epoch": 0.9826821361542399, + "grad_norm": 2.0474272703637606, + "learning_rate": 7.86315862387088e-08, + "loss": 0.1822, + "step": 11519 + }, + { + "epoch": 0.982767445828357, + "grad_norm": 2.147328815498483, + "learning_rate": 7.785899954890208e-08, + "loss": 0.2191, + "step": 11520 + }, + { + "epoch": 0.982852755502474, + "grad_norm": 1.72345830882316, + "learning_rate": 7.709022414927325e-08, + "loss": 0.1085, + "step": 11521 + }, + { + "epoch": 0.982938065176591, + "grad_norm": 1.335705028630083, + "learning_rate": 7.632526009851981e-08, + "loss": 0.1919, + "step": 11522 + }, + { + "epoch": 0.9830233748507081, + "grad_norm": 1.5786957416890914, + "learning_rate": 7.556410745503395e-08, + "loss": 0.1273, + "step": 11523 + }, + { + "epoch": 0.9831086845248251, + "grad_norm": 2.130767736378797, + "learning_rate": 7.480676627693029e-08, + "loss": 0.1968, + "step": 11524 + }, + { + "epoch": 0.9831939941989422, + "grad_norm": 2.0036186621654544, + "learning_rate": 7.405323662202924e-08, + "loss": 0.2051, + "step": 11525 + }, + { + "epoch": 0.9832793038730592, + "grad_norm": 1.8393649310381779, + "learning_rate": 7.3303518547857e-08, + "loss": 0.1829, + "step": 11526 + }, + { + "epoch": 0.9833646135471763, + "grad_norm": 1.3685678979283158, + "learning_rate": 7.255761211165113e-08, + "loss": 0.1154, + "step": 11527 + }, + { + "epoch": 0.9834499232212933, + "grad_norm": 1.6785204004224672, + "learning_rate": 7.181551737035497e-08, + "loss": 0.137, + "step": 11528 + }, + { + "epoch": 0.9835352328954103, + "grad_norm": 2.470550074002115, + "learning_rate": 7.107723438062874e-08, + "loss": 0.1534, + "step": 11529 + }, + { + "epoch": 0.9836205425695274, + "grad_norm": 1.9512679159452613, + "learning_rate": 7.034276319883293e-08, + "loss": 0.1846, + "step": 11530 + }, + { + "epoch": 0.9837058522436444, + "grad_norm": 1.6943674573797272, + "learning_rate": 6.961210388104488e-08, + "loss": 0.1253, + "step": 11531 + }, + { + "epoch": 0.9837911619177615, + "grad_norm": 1.7689330582532792, + "learning_rate": 6.888525648303667e-08, + "loss": 0.1866, + "step": 11532 + }, + { + "epoch": 0.9838764715918785, + "grad_norm": 1.5412352276145942, + "learning_rate": 6.816222106030834e-08, + "loss": 0.1522, + "step": 11533 + }, + { + "epoch": 0.9839617812659955, + "grad_norm": 1.7024981543801228, + "learning_rate": 6.744299766806017e-08, + "loss": 0.2162, + "step": 11534 + }, + { + "epoch": 0.9840470909401126, + "grad_norm": 2.108368018647808, + "learning_rate": 6.67275863611927e-08, + "loss": 0.1559, + "step": 11535 + }, + { + "epoch": 0.9841324006142297, + "grad_norm": 1.508027214614209, + "learning_rate": 6.601598719432889e-08, + "loss": 0.1383, + "step": 11536 + }, + { + "epoch": 0.9842177102883467, + "grad_norm": 1.7924656380268718, + "learning_rate": 6.530820022179751e-08, + "loss": 0.2114, + "step": 11537 + }, + { + "epoch": 0.9843030199624637, + "grad_norm": 1.904275465624843, + "learning_rate": 6.460422549763312e-08, + "loss": 0.1755, + "step": 11538 + }, + { + "epoch": 0.9843883296365807, + "grad_norm": 2.2839072441333266, + "learning_rate": 6.390406307558161e-08, + "loss": 0.1199, + "step": 11539 + }, + { + "epoch": 0.9844736393106979, + "grad_norm": 1.3404311513468938, + "learning_rate": 6.320771300908912e-08, + "loss": 0.1178, + "step": 11540 + }, + { + "epoch": 0.9845589489848149, + "grad_norm": 1.400832783420749, + "learning_rate": 6.251517535132979e-08, + "loss": 0.1217, + "step": 11541 + }, + { + "epoch": 0.9846442586589319, + "grad_norm": 1.7392305332381337, + "learning_rate": 6.182645015516131e-08, + "loss": 0.0919, + "step": 11542 + }, + { + "epoch": 0.9847295683330489, + "grad_norm": 1.8364057877052418, + "learning_rate": 6.114153747318052e-08, + "loss": 0.1569, + "step": 11543 + }, + { + "epoch": 0.984814878007166, + "grad_norm": 2.084180685019324, + "learning_rate": 6.046043735766783e-08, + "loss": 0.2082, + "step": 11544 + }, + { + "epoch": 0.9849001876812831, + "grad_norm": 1.7784479421182693, + "learning_rate": 5.978314986061495e-08, + "loss": 0.16, + "step": 11545 + }, + { + "epoch": 0.9849854973554001, + "grad_norm": 2.055895664098481, + "learning_rate": 5.9109675033741654e-08, + "loss": 0.2287, + "step": 11546 + }, + { + "epoch": 0.9850708070295171, + "grad_norm": 1.6621899882221047, + "learning_rate": 5.844001292846235e-08, + "loss": 0.1697, + "step": 11547 + }, + { + "epoch": 0.9851561167036342, + "grad_norm": 1.9205778132661802, + "learning_rate": 5.7774163595891716e-08, + "loss": 0.199, + "step": 11548 + }, + { + "epoch": 0.9852414263777513, + "grad_norm": 1.7410237483947097, + "learning_rate": 5.7112127086877965e-08, + "loss": 0.1661, + "step": 11549 + }, + { + "epoch": 0.9853267360518683, + "grad_norm": 1.9405715345306118, + "learning_rate": 5.6453903451952894e-08, + "loss": 0.1259, + "step": 11550 + }, + { + "epoch": 0.9854120457259853, + "grad_norm": 1.6956504098272769, + "learning_rate": 5.579949274137075e-08, + "loss": 0.1369, + "step": 11551 + }, + { + "epoch": 0.9854973554001024, + "grad_norm": 1.7658409054161608, + "learning_rate": 5.514889500509712e-08, + "loss": 0.2026, + "step": 11552 + }, + { + "epoch": 0.9855826650742194, + "grad_norm": 2.3971132480826394, + "learning_rate": 5.450211029279784e-08, + "loss": 0.208, + "step": 11553 + }, + { + "epoch": 0.9856679747483365, + "grad_norm": 2.070128465066224, + "learning_rate": 5.385913865385561e-08, + "loss": 0.1716, + "step": 11554 + }, + { + "epoch": 0.9857532844224535, + "grad_norm": 1.7487297897861598, + "learning_rate": 5.321998013735341e-08, + "loss": 0.1331, + "step": 11555 + }, + { + "epoch": 0.9858385940965706, + "grad_norm": 1.6739790324284627, + "learning_rate": 5.258463479208553e-08, + "loss": 0.1536, + "step": 11556 + }, + { + "epoch": 0.9859239037706876, + "grad_norm": 1.8657188864746985, + "learning_rate": 5.195310266656317e-08, + "loss": 0.1946, + "step": 11557 + }, + { + "epoch": 0.9860092134448046, + "grad_norm": 1.957188643714489, + "learning_rate": 5.1325383808997764e-08, + "loss": 0.1602, + "step": 11558 + }, + { + "epoch": 0.9860945231189217, + "grad_norm": 3.460981863869866, + "learning_rate": 5.070147826731209e-08, + "loss": 0.2907, + "step": 11559 + }, + { + "epoch": 0.9861798327930388, + "grad_norm": 1.7066398538529672, + "learning_rate": 5.008138608913471e-08, + "loss": 0.1909, + "step": 11560 + }, + { + "epoch": 0.9862651424671558, + "grad_norm": 1.4811860211554648, + "learning_rate": 4.946510732181664e-08, + "loss": 0.1584, + "step": 11561 + }, + { + "epoch": 0.9863504521412728, + "grad_norm": 1.7321404160719738, + "learning_rate": 4.885264201239248e-08, + "loss": 0.1433, + "step": 11562 + }, + { + "epoch": 0.9864357618153898, + "grad_norm": 1.9863654539188444, + "learning_rate": 4.824399020763593e-08, + "loss": 0.1491, + "step": 11563 + }, + { + "epoch": 0.986521071489507, + "grad_norm": 1.5541587965751518, + "learning_rate": 4.7639151954004254e-08, + "loss": 0.1806, + "step": 11564 + }, + { + "epoch": 0.986606381163624, + "grad_norm": 1.8626827662356318, + "learning_rate": 4.70381272976772e-08, + "loss": 0.1384, + "step": 11565 + }, + { + "epoch": 0.986691690837741, + "grad_norm": 1.6169468060191354, + "learning_rate": 4.644091628454028e-08, + "loss": 0.1872, + "step": 11566 + }, + { + "epoch": 0.986777000511858, + "grad_norm": 1.2300409451314331, + "learning_rate": 4.5847518960184796e-08, + "loss": 0.173, + "step": 11567 + }, + { + "epoch": 0.9868623101859751, + "grad_norm": 2.1645145976877855, + "learning_rate": 4.525793536991896e-08, + "loss": 0.1491, + "step": 11568 + }, + { + "epoch": 0.9869476198600922, + "grad_norm": 1.83972050928303, + "learning_rate": 4.467216555874565e-08, + "loss": 0.1518, + "step": 11569 + }, + { + "epoch": 0.9870329295342092, + "grad_norm": 1.7562361367109505, + "learning_rate": 4.409020957139576e-08, + "loss": 0.2019, + "step": 11570 + }, + { + "epoch": 0.9871182392083262, + "grad_norm": 1.632955079492769, + "learning_rate": 4.351206745228931e-08, + "loss": 0.192, + "step": 11571 + }, + { + "epoch": 0.9872035488824432, + "grad_norm": 1.8693997353996497, + "learning_rate": 4.293773924556321e-08, + "loss": 0.2104, + "step": 11572 + }, + { + "epoch": 0.9872888585565603, + "grad_norm": 2.499808494862422, + "learning_rate": 4.236722499507684e-08, + "loss": 0.2131, + "step": 11573 + }, + { + "epoch": 0.9873741682306774, + "grad_norm": 1.819907278381882, + "learning_rate": 4.180052474437313e-08, + "loss": 0.1204, + "step": 11574 + }, + { + "epoch": 0.9874594779047944, + "grad_norm": 2.103250952657513, + "learning_rate": 4.1237638536728573e-08, + "loss": 0.1088, + "step": 11575 + }, + { + "epoch": 0.9875447875789114, + "grad_norm": 2.0573007189072667, + "learning_rate": 4.0678566415103256e-08, + "loss": 0.1933, + "step": 11576 + }, + { + "epoch": 0.9876300972530285, + "grad_norm": 2.1691081034487474, + "learning_rate": 4.012330842219081e-08, + "loss": 0.1602, + "step": 11577 + }, + { + "epoch": 0.9877154069271455, + "grad_norm": 1.902233457403387, + "learning_rate": 3.957186460037399e-08, + "loss": 0.1485, + "step": 11578 + }, + { + "epoch": 0.9878007166012626, + "grad_norm": 1.1004276788043303, + "learning_rate": 3.9024234991758004e-08, + "loss": 0.1151, + "step": 11579 + }, + { + "epoch": 0.9878860262753796, + "grad_norm": 2.1359085136852207, + "learning_rate": 3.848041963814275e-08, + "loss": 0.1421, + "step": 11580 + }, + { + "epoch": 0.9879713359494967, + "grad_norm": 1.7712285741399882, + "learning_rate": 3.794041858106168e-08, + "loss": 0.1659, + "step": 11581 + }, + { + "epoch": 0.9880566456236137, + "grad_norm": 1.8324347663707297, + "learning_rate": 3.7404231861726255e-08, + "loss": 0.1757, + "step": 11582 + }, + { + "epoch": 0.9881419552977307, + "grad_norm": 2.09747927467919, + "learning_rate": 3.687185952107597e-08, + "loss": 0.1998, + "step": 11583 + }, + { + "epoch": 0.9882272649718478, + "grad_norm": 2.259068440049436, + "learning_rate": 3.6343301599756074e-08, + "loss": 0.1952, + "step": 11584 + }, + { + "epoch": 0.9883125746459649, + "grad_norm": 1.850347385783097, + "learning_rate": 3.5818558138123184e-08, + "loss": 0.1774, + "step": 11585 + }, + { + "epoch": 0.9883978843200819, + "grad_norm": 1.69016084345775, + "learning_rate": 3.5297629176228587e-08, + "loss": 0.2475, + "step": 11586 + }, + { + "epoch": 0.9884831939941989, + "grad_norm": 1.3624037402385591, + "learning_rate": 3.478051475385158e-08, + "loss": 0.1727, + "step": 11587 + }, + { + "epoch": 0.988568503668316, + "grad_norm": 2.1954911454573196, + "learning_rate": 3.426721491046059e-08, + "loss": 0.0997, + "step": 11588 + }, + { + "epoch": 0.9886538133424331, + "grad_norm": 1.6427744998640454, + "learning_rate": 3.375772968525759e-08, + "loss": 0.1538, + "step": 11589 + }, + { + "epoch": 0.9887391230165501, + "grad_norm": 1.7891315296965171, + "learning_rate": 3.325205911712814e-08, + "loss": 0.2257, + "step": 11590 + }, + { + "epoch": 0.9888244326906671, + "grad_norm": 2.3947560638883005, + "learning_rate": 3.275020324468026e-08, + "loss": 0.1813, + "step": 11591 + }, + { + "epoch": 0.9889097423647841, + "grad_norm": 1.6452476408981662, + "learning_rate": 3.225216210623327e-08, + "loss": 0.1342, + "step": 11592 + }, + { + "epoch": 0.9889950520389013, + "grad_norm": 1.9131374069853404, + "learning_rate": 3.175793573980124e-08, + "loss": 0.1227, + "step": 11593 + }, + { + "epoch": 0.9890803617130183, + "grad_norm": 2.2119922076772673, + "learning_rate": 3.126752418312062e-08, + "loss": 0.2218, + "step": 11594 + }, + { + "epoch": 0.9891656713871353, + "grad_norm": 1.6993274873164295, + "learning_rate": 3.07809274736337e-08, + "loss": 0.1592, + "step": 11595 + }, + { + "epoch": 0.9892509810612523, + "grad_norm": 1.5041210467041632, + "learning_rate": 3.029814564848299e-08, + "loss": 0.1184, + "step": 11596 + }, + { + "epoch": 0.9893362907353694, + "grad_norm": 1.3812663167964985, + "learning_rate": 2.981917874453344e-08, + "loss": 0.1367, + "step": 11597 + }, + { + "epoch": 0.9894216004094865, + "grad_norm": 2.008951895442169, + "learning_rate": 2.9344026798344692e-08, + "loss": 0.1598, + "step": 11598 + }, + { + "epoch": 0.9895069100836035, + "grad_norm": 1.6321283394661854, + "learning_rate": 2.887268984619884e-08, + "loss": 0.1803, + "step": 11599 + }, + { + "epoch": 0.9895922197577205, + "grad_norm": 2.3340362670748087, + "learning_rate": 2.840516792407266e-08, + "loss": 0.2107, + "step": 11600 + }, + { + "epoch": 0.9896775294318376, + "grad_norm": 1.8092495942346598, + "learning_rate": 2.7941461067665376e-08, + "loss": 0.1536, + "step": 11601 + }, + { + "epoch": 0.9897628391059546, + "grad_norm": 1.5108341911525294, + "learning_rate": 2.7481569312381995e-08, + "loss": 0.1672, + "step": 11602 + }, + { + "epoch": 0.9898481487800717, + "grad_norm": 2.0720068565976906, + "learning_rate": 2.702549269332222e-08, + "loss": 0.2043, + "step": 11603 + }, + { + "epoch": 0.9899334584541887, + "grad_norm": 1.4955443327112494, + "learning_rate": 2.6573231245308196e-08, + "loss": 0.163, + "step": 11604 + }, + { + "epoch": 0.9900187681283058, + "grad_norm": 2.597211527221564, + "learning_rate": 2.6124785002867857e-08, + "loss": 0.2075, + "step": 11605 + }, + { + "epoch": 0.9901040778024228, + "grad_norm": 2.0749610391658915, + "learning_rate": 2.568015400024604e-08, + "loss": 0.1993, + "step": 11606 + }, + { + "epoch": 0.9901893874765398, + "grad_norm": 1.7538329790948712, + "learning_rate": 2.52393382713767e-08, + "loss": 0.1796, + "step": 11607 + }, + { + "epoch": 0.9902746971506569, + "grad_norm": 1.8056805673343166, + "learning_rate": 2.4802337849921807e-08, + "loss": 0.1573, + "step": 11608 + }, + { + "epoch": 0.9903600068247739, + "grad_norm": 1.9986327981850995, + "learning_rate": 2.4369152769238014e-08, + "loss": 0.2273, + "step": 11609 + }, + { + "epoch": 0.990445316498891, + "grad_norm": 1.9571444236940034, + "learning_rate": 2.3939783062398857e-08, + "loss": 0.224, + "step": 11610 + }, + { + "epoch": 0.990530626173008, + "grad_norm": 1.7863584514607092, + "learning_rate": 2.3514228762183676e-08, + "loss": 0.142, + "step": 11611 + }, + { + "epoch": 0.990615935847125, + "grad_norm": 1.2654439066614835, + "learning_rate": 2.3092489901083148e-08, + "loss": 0.1522, + "step": 11612 + }, + { + "epoch": 0.9907012455212421, + "grad_norm": 1.6006596695475477, + "learning_rate": 2.2674566511293737e-08, + "loss": 0.2552, + "step": 11613 + }, + { + "epoch": 0.9907865551953592, + "grad_norm": 1.9880992154053605, + "learning_rate": 2.2260458624723257e-08, + "loss": 0.211, + "step": 11614 + }, + { + "epoch": 0.9908718648694762, + "grad_norm": 1.4871736344767015, + "learning_rate": 2.1850166272985306e-08, + "loss": 0.1638, + "step": 11615 + }, + { + "epoch": 0.9909571745435932, + "grad_norm": 2.4021611338111706, + "learning_rate": 2.1443689487404827e-08, + "loss": 0.2548, + "step": 11616 + }, + { + "epoch": 0.9910424842177102, + "grad_norm": 2.1834910032498858, + "learning_rate": 2.1041028299012555e-08, + "loss": 0.1762, + "step": 11617 + }, + { + "epoch": 0.9911277938918274, + "grad_norm": 1.7883565917669793, + "learning_rate": 2.0642182738545013e-08, + "loss": 0.1972, + "step": 11618 + }, + { + "epoch": 0.9912131035659444, + "grad_norm": 1.5623726602163388, + "learning_rate": 2.024715283646117e-08, + "loss": 0.2217, + "step": 11619 + }, + { + "epoch": 0.9912984132400614, + "grad_norm": 1.7589754453835216, + "learning_rate": 1.9855938622914683e-08, + "loss": 0.0995, + "step": 11620 + }, + { + "epoch": 0.9913837229141784, + "grad_norm": 2.674551977982194, + "learning_rate": 1.9468540127770552e-08, + "loss": 0.1586, + "step": 11621 + }, + { + "epoch": 0.9914690325882956, + "grad_norm": 1.8114773726454452, + "learning_rate": 1.908495738061067e-08, + "loss": 0.1475, + "step": 11622 + }, + { + "epoch": 0.9915543422624126, + "grad_norm": 2.4733628262340592, + "learning_rate": 1.8705190410717166e-08, + "loss": 0.2581, + "step": 11623 + }, + { + "epoch": 0.9916396519365296, + "grad_norm": 1.6309896268625836, + "learning_rate": 1.8329239247077967e-08, + "loss": 0.128, + "step": 11624 + }, + { + "epoch": 0.9917249616106466, + "grad_norm": 1.519296819259557, + "learning_rate": 1.7957103918397888e-08, + "loss": 0.1884, + "step": 11625 + }, + { + "epoch": 0.9918102712847637, + "grad_norm": 1.7733649688542399, + "learning_rate": 1.7588784453093088e-08, + "loss": 0.1916, + "step": 11626 + }, + { + "epoch": 0.9918955809588808, + "grad_norm": 2.0625478565559385, + "learning_rate": 1.7224280879279964e-08, + "loss": 0.1872, + "step": 11627 + }, + { + "epoch": 0.9919808906329978, + "grad_norm": 1.325056593064235, + "learning_rate": 1.6863593224780704e-08, + "loss": 0.1505, + "step": 11628 + }, + { + "epoch": 0.9920662003071148, + "grad_norm": 1.7166602174705667, + "learning_rate": 1.6506721517134394e-08, + "loss": 0.2001, + "step": 11629 + }, + { + "epoch": 0.9921515099812319, + "grad_norm": 1.6553938470217062, + "learning_rate": 1.6153665783591453e-08, + "loss": 0.1931, + "step": 11630 + }, + { + "epoch": 0.9922368196553489, + "grad_norm": 2.1033394827453065, + "learning_rate": 1.580442605110255e-08, + "loss": 0.1877, + "step": 11631 + }, + { + "epoch": 0.992322129329466, + "grad_norm": 1.3208461788928711, + "learning_rate": 1.5459002346324135e-08, + "loss": 0.1303, + "step": 11632 + }, + { + "epoch": 0.992407439003583, + "grad_norm": 1.8873463726049167, + "learning_rate": 1.5117394695640663e-08, + "loss": 0.1729, + "step": 11633 + }, + { + "epoch": 0.9924927486777001, + "grad_norm": 2.269635354049267, + "learning_rate": 1.4779603125120166e-08, + "loss": 0.1456, + "step": 11634 + }, + { + "epoch": 0.9925780583518171, + "grad_norm": 1.6016914129710673, + "learning_rate": 1.444562766055868e-08, + "loss": 0.1983, + "step": 11635 + }, + { + "epoch": 0.9926633680259341, + "grad_norm": 0.983047074068788, + "learning_rate": 1.4115468327446923e-08, + "loss": 0.1325, + "step": 11636 + }, + { + "epoch": 0.9927486777000512, + "grad_norm": 1.54463796984957, + "learning_rate": 1.3789125150998061e-08, + "loss": 0.1538, + "step": 11637 + }, + { + "epoch": 0.9928339873741683, + "grad_norm": 2.0759338070160283, + "learning_rate": 1.3466598156125498e-08, + "loss": 0.1789, + "step": 11638 + }, + { + "epoch": 0.9929192970482853, + "grad_norm": 1.4314393530445857, + "learning_rate": 1.314788736744288e-08, + "loss": 0.2343, + "step": 11639 + }, + { + "epoch": 0.9930046067224023, + "grad_norm": 1.4358899595341954, + "learning_rate": 1.2832992809291843e-08, + "loss": 0.216, + "step": 11640 + }, + { + "epoch": 0.9930899163965193, + "grad_norm": 2.4578654172600243, + "learning_rate": 1.2521914505714272e-08, + "loss": 0.1395, + "step": 11641 + }, + { + "epoch": 0.9931752260706365, + "grad_norm": 2.253770106813571, + "learning_rate": 1.2214652480452282e-08, + "loss": 0.2145, + "step": 11642 + }, + { + "epoch": 0.9932605357447535, + "grad_norm": 2.3773733627977194, + "learning_rate": 1.1911206756964888e-08, + "loss": 0.1516, + "step": 11643 + }, + { + "epoch": 0.9933458454188705, + "grad_norm": 1.9609383510876515, + "learning_rate": 1.1611577358422442e-08, + "loss": 0.2431, + "step": 11644 + }, + { + "epoch": 0.9934311550929875, + "grad_norm": 2.28405547405691, + "learning_rate": 1.1315764307695542e-08, + "loss": 0.1521, + "step": 11645 + }, + { + "epoch": 0.9935164647671045, + "grad_norm": 1.5206507307541852, + "learning_rate": 1.1023767627377224e-08, + "loss": 0.1114, + "step": 11646 + }, + { + "epoch": 0.9936017744412217, + "grad_norm": 1.6323276511751603, + "learning_rate": 1.0735587339749665e-08, + "loss": 0.1816, + "step": 11647 + }, + { + "epoch": 0.9936870841153387, + "grad_norm": 2.1703448497099713, + "learning_rate": 1.0451223466811933e-08, + "loss": 0.1911, + "step": 11648 + }, + { + "epoch": 0.9937723937894557, + "grad_norm": 1.2818958236513223, + "learning_rate": 1.0170676030285542e-08, + "loss": 0.1656, + "step": 11649 + }, + { + "epoch": 0.9938577034635727, + "grad_norm": 1.963498817468748, + "learning_rate": 9.893945051581143e-09, + "loss": 0.1732, + "step": 11650 + }, + { + "epoch": 0.9939430131376898, + "grad_norm": 1.6978323355796916, + "learning_rate": 9.621030551826282e-09, + "loss": 0.2014, + "step": 11651 + }, + { + "epoch": 0.9940283228118069, + "grad_norm": 2.175492930185784, + "learning_rate": 9.351932551854292e-09, + "loss": 0.1953, + "step": 11652 + }, + { + "epoch": 0.9941136324859239, + "grad_norm": 1.7040671446201947, + "learning_rate": 9.086651072215402e-09, + "loss": 0.166, + "step": 11653 + }, + { + "epoch": 0.9941989421600409, + "grad_norm": 1.3955154578905118, + "learning_rate": 8.825186133160079e-09, + "loss": 0.2272, + "step": 11654 + }, + { + "epoch": 0.994284251834158, + "grad_norm": 3.232219917926278, + "learning_rate": 8.567537754650135e-09, + "loss": 0.1619, + "step": 11655 + }, + { + "epoch": 0.994369561508275, + "grad_norm": 1.349040064551146, + "learning_rate": 8.313705956347618e-09, + "loss": 0.2098, + "step": 11656 + }, + { + "epoch": 0.9944548711823921, + "grad_norm": 2.059851824792283, + "learning_rate": 8.063690757642572e-09, + "loss": 0.1801, + "step": 11657 + }, + { + "epoch": 0.9945401808565091, + "grad_norm": 2.3465425258370076, + "learning_rate": 7.817492177619735e-09, + "loss": 0.1575, + "step": 11658 + }, + { + "epoch": 0.9946254905306262, + "grad_norm": 2.3436098183989578, + "learning_rate": 7.575110235069626e-09, + "loss": 0.1327, + "step": 11659 + }, + { + "epoch": 0.9947108002047432, + "grad_norm": 1.920350190949749, + "learning_rate": 7.33654494850522e-09, + "loss": 0.1829, + "step": 11660 + }, + { + "epoch": 0.9947961098788602, + "grad_norm": 2.452699466413769, + "learning_rate": 7.101796336128619e-09, + "loss": 0.2022, + "step": 11661 + }, + { + "epoch": 0.9948814195529773, + "grad_norm": 2.041436110992375, + "learning_rate": 6.8708644158754775e-09, + "loss": 0.1315, + "step": 11662 + }, + { + "epoch": 0.9949667292270944, + "grad_norm": 1.914618648774791, + "learning_rate": 6.6437492053594844e-09, + "loss": 0.1492, + "step": 11663 + }, + { + "epoch": 0.9950520389012114, + "grad_norm": 1.4422049441785947, + "learning_rate": 6.420450721933424e-09, + "loss": 0.1698, + "step": 11664 + }, + { + "epoch": 0.9951373485753284, + "grad_norm": 1.9709286820598648, + "learning_rate": 6.200968982644773e-09, + "loss": 0.2145, + "step": 11665 + }, + { + "epoch": 0.9952226582494454, + "grad_norm": 1.9790705272681024, + "learning_rate": 5.985304004241243e-09, + "loss": 0.1514, + "step": 11666 + }, + { + "epoch": 0.9953079679235626, + "grad_norm": 2.672314077963239, + "learning_rate": 5.773455803187444e-09, + "loss": 0.2021, + "step": 11667 + }, + { + "epoch": 0.9953932775976796, + "grad_norm": 1.792893285555301, + "learning_rate": 5.565424395670427e-09, + "loss": 0.1752, + "step": 11668 + }, + { + "epoch": 0.9954785872717966, + "grad_norm": 1.481572285933442, + "learning_rate": 5.3612097975552775e-09, + "loss": 0.162, + "step": 11669 + }, + { + "epoch": 0.9955638969459136, + "grad_norm": 1.6005244170509914, + "learning_rate": 5.160812024446182e-09, + "loss": 0.1852, + "step": 11670 + }, + { + "epoch": 0.9956492066200308, + "grad_norm": 1.4178599586472582, + "learning_rate": 4.964231091630911e-09, + "loss": 0.1259, + "step": 11671 + }, + { + "epoch": 0.9957345162941478, + "grad_norm": 1.4213665063456815, + "learning_rate": 4.771467014125231e-09, + "loss": 0.1248, + "step": 11672 + }, + { + "epoch": 0.9958198259682648, + "grad_norm": 1.91365875345541, + "learning_rate": 4.582519806645147e-09, + "loss": 0.1964, + "step": 11673 + }, + { + "epoch": 0.9959051356423818, + "grad_norm": 2.4015077702596916, + "learning_rate": 4.397389483618009e-09, + "loss": 0.1772, + "step": 11674 + }, + { + "epoch": 0.9959904453164989, + "grad_norm": 2.007947390288891, + "learning_rate": 4.2160760591658525e-09, + "loss": 0.2135, + "step": 11675 + }, + { + "epoch": 0.996075754990616, + "grad_norm": 1.212257404244205, + "learning_rate": 4.038579547144261e-09, + "loss": 0.1504, + "step": 11676 + }, + { + "epoch": 0.996161064664733, + "grad_norm": 1.5354507590882212, + "learning_rate": 3.864899961097956e-09, + "loss": 0.1587, + "step": 11677 + }, + { + "epoch": 0.99624637433885, + "grad_norm": 2.0113095950726843, + "learning_rate": 3.695037314288552e-09, + "loss": 0.2101, + "step": 11678 + }, + { + "epoch": 0.9963316840129671, + "grad_norm": 2.4099391054239954, + "learning_rate": 3.528991619683453e-09, + "loss": 0.1972, + "step": 11679 + }, + { + "epoch": 0.9964169936870841, + "grad_norm": 1.4553522248648914, + "learning_rate": 3.3667628899558545e-09, + "loss": 0.1725, + "step": 11680 + }, + { + "epoch": 0.9965023033612012, + "grad_norm": 1.2124550447290372, + "learning_rate": 3.2083511374958465e-09, + "loss": 0.1729, + "step": 11681 + }, + { + "epoch": 0.9965876130353182, + "grad_norm": 1.9204662711497062, + "learning_rate": 3.053756374393757e-09, + "loss": 0.201, + "step": 11682 + }, + { + "epoch": 0.9966729227094353, + "grad_norm": 1.332700499717863, + "learning_rate": 2.902978612456808e-09, + "loss": 0.1604, + "step": 11683 + }, + { + "epoch": 0.9967582323835523, + "grad_norm": 1.5188065004239284, + "learning_rate": 2.7560178631869103e-09, + "loss": 0.1731, + "step": 11684 + }, + { + "epoch": 0.9968435420576693, + "grad_norm": 2.553911563731232, + "learning_rate": 2.61287413781397e-09, + "loss": 0.1451, + "step": 11685 + }, + { + "epoch": 0.9969288517317864, + "grad_norm": 1.6532502269366882, + "learning_rate": 2.4735474472625806e-09, + "loss": 0.1852, + "step": 11686 + }, + { + "epoch": 0.9970141614059034, + "grad_norm": 1.5948971378557855, + "learning_rate": 2.338037802174231e-09, + "loss": 0.1816, + "step": 11687 + }, + { + "epoch": 0.9970994710800205, + "grad_norm": 3.0858148091709947, + "learning_rate": 2.206345212879546e-09, + "loss": 0.2044, + "step": 11688 + }, + { + "epoch": 0.9971847807541375, + "grad_norm": 1.4604174662473106, + "learning_rate": 2.078469689448248e-09, + "loss": 0.1506, + "step": 11689 + }, + { + "epoch": 0.9972700904282545, + "grad_norm": 2.832512923747357, + "learning_rate": 1.954411241639198e-09, + "loss": 0.2085, + "step": 11690 + }, + { + "epoch": 0.9973554001023716, + "grad_norm": 1.5428265856184278, + "learning_rate": 1.834169878917047e-09, + "loss": 0.1626, + "step": 11691 + }, + { + "epoch": 0.9974407097764887, + "grad_norm": 1.6077611294057041, + "learning_rate": 1.7177456104688905e-09, + "loss": 0.1439, + "step": 11692 + }, + { + "epoch": 0.9975260194506057, + "grad_norm": 2.1330420960869945, + "learning_rate": 1.6051384451765128e-09, + "loss": 0.2151, + "step": 11693 + }, + { + "epoch": 0.9976113291247227, + "grad_norm": 2.3823992261358864, + "learning_rate": 1.4963483916441424e-09, + "loss": 0.2375, + "step": 11694 + }, + { + "epoch": 0.9976966387988397, + "grad_norm": 1.7635292510939848, + "learning_rate": 1.3913754581762473e-09, + "loss": 0.1426, + "step": 11695 + }, + { + "epoch": 0.9977819484729569, + "grad_norm": 2.6698932994996056, + "learning_rate": 1.2902196527775356e-09, + "loss": 0.2231, + "step": 11696 + }, + { + "epoch": 0.9978672581470739, + "grad_norm": 1.3860819899320624, + "learning_rate": 1.1928809831807108e-09, + "loss": 0.2046, + "step": 11697 + }, + { + "epoch": 0.9979525678211909, + "grad_norm": 2.3290053373728514, + "learning_rate": 1.099359456818716e-09, + "loss": 0.1004, + "step": 11698 + }, + { + "epoch": 0.9980378774953079, + "grad_norm": 1.7684337589367927, + "learning_rate": 1.0096550808191828e-09, + "loss": 0.1214, + "step": 11699 + }, + { + "epoch": 0.998123187169425, + "grad_norm": 1.829174099373398, + "learning_rate": 9.2376786204329e-10, + "loss": 0.1827, + "step": 11700 + }, + { + "epoch": 0.9982084968435421, + "grad_norm": 1.7715534167310052, + "learning_rate": 8.416978070413529e-10, + "loss": 0.1469, + "step": 11701 + }, + { + "epoch": 0.9982938065176591, + "grad_norm": 1.5960598830459196, + "learning_rate": 7.634449220805806e-10, + "loss": 0.1689, + "step": 11702 + }, + { + "epoch": 0.9983791161917761, + "grad_norm": 1.7408880339788042, + "learning_rate": 6.890092131339732e-10, + "loss": 0.1742, + "step": 11703 + }, + { + "epoch": 0.9984644258658932, + "grad_norm": 1.674430534008567, + "learning_rate": 6.183906858858723e-10, + "loss": 0.1852, + "step": 11704 + }, + { + "epoch": 0.9985497355400103, + "grad_norm": 1.5614855715780989, + "learning_rate": 5.515893457264109e-10, + "loss": 0.1297, + "step": 11705 + }, + { + "epoch": 0.9986350452141273, + "grad_norm": 1.8135635938492884, + "learning_rate": 4.886051977626149e-10, + "loss": 0.1333, + "step": 11706 + }, + { + "epoch": 0.9987203548882443, + "grad_norm": 1.7238130383379164, + "learning_rate": 4.294382467906477e-10, + "loss": 0.1669, + "step": 11707 + }, + { + "epoch": 0.9988056645623614, + "grad_norm": 1.604323226753156, + "learning_rate": 3.7408849733466813e-10, + "loss": 0.1641, + "step": 11708 + }, + { + "epoch": 0.9988909742364784, + "grad_norm": 1.9371573852817334, + "learning_rate": 3.2255595361907475e-10, + "loss": 0.1704, + "step": 11709 + }, + { + "epoch": 0.9989762839105955, + "grad_norm": 1.9118266846461904, + "learning_rate": 2.748406195796083e-10, + "loss": 0.1763, + "step": 11710 + }, + { + "epoch": 0.9990615935847125, + "grad_norm": 1.8683903223625404, + "learning_rate": 2.3094249885780016e-10, + "loss": 0.1331, + "step": 11711 + }, + { + "epoch": 0.9991469032588296, + "grad_norm": 1.865484762112669, + "learning_rate": 1.9086159480097287e-10, + "loss": 0.1502, + "step": 11712 + }, + { + "epoch": 0.9992322129329466, + "grad_norm": 1.4362576963054134, + "learning_rate": 1.5459791047889305e-10, + "loss": 0.1435, + "step": 11713 + }, + { + "epoch": 0.9993175226070636, + "grad_norm": 1.1415880918727812, + "learning_rate": 1.221514486504649e-10, + "loss": 0.1056, + "step": 11714 + }, + { + "epoch": 0.9994028322811807, + "grad_norm": 1.3717853073656952, + "learning_rate": 9.35222117970369e-11, + "loss": 0.1295, + "step": 11715 + }, + { + "epoch": 0.9994881419552978, + "grad_norm": 1.3068893948477815, + "learning_rate": 6.871020210574841e-11, + "loss": 0.1355, + "step": 11716 + }, + { + "epoch": 0.9995734516294148, + "grad_norm": 1.6665547651236217, + "learning_rate": 4.771542146952967e-11, + "loss": 0.2112, + "step": 11717 + }, + { + "epoch": 0.9996587613035318, + "grad_norm": 1.9498945452223289, + "learning_rate": 3.053787148710185e-11, + "loss": 0.1962, + "step": 11718 + }, + { + "epoch": 0.9997440709776488, + "grad_norm": 2.5822525737512154, + "learning_rate": 1.717755347963035e-11, + "loss": 0.2002, + "step": 11719 + }, + { + "epoch": 0.999829380651766, + "grad_norm": 1.6951234773654764, + "learning_rate": 7.634468457418109e-12, + "loss": 0.1743, + "step": 11720 + }, + { + "epoch": 0.999914690325883, + "grad_norm": 2.6265397778310553, + "learning_rate": 1.908617147661218e-12, + "loss": 0.1708, + "step": 11721 + }, + { + "epoch": 1.0, + "grad_norm": 1.9556974026622904, + "learning_rate": 0.0, + "loss": 0.1204, + "step": 11722 + }, + { + "epoch": 1.0, + "step": 11722, + "total_flos": 2.204684751559092e+19, + "train_loss": 0.2725867780617959, + "train_runtime": 52680.0383, + "train_samples_per_second": 3.56, + "train_steps_per_second": 0.223 + } + ], + "logging_steps": 1.0, + "max_steps": 11722, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 2000, + "total_flos": 2.204684751559092e+19, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}