| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 4.0, | |
| "eval_steps": 500, | |
| "global_step": 620, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.0064516129032258064, | |
| "grad_norm": 2.3284332752227783, | |
| "learning_rate": 4.032258064516129e-07, | |
| "loss": 0.1831, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.012903225806451613, | |
| "grad_norm": 3.1032278537750244, | |
| "learning_rate": 8.064516129032258e-07, | |
| "loss": 0.2496, | |
| "step": 2 | |
| }, | |
| { | |
| "epoch": 0.01935483870967742, | |
| "grad_norm": 2.7308666706085205, | |
| "learning_rate": 1.2096774193548388e-06, | |
| "loss": 0.2497, | |
| "step": 3 | |
| }, | |
| { | |
| "epoch": 0.025806451612903226, | |
| "grad_norm": 2.6942598819732666, | |
| "learning_rate": 1.6129032258064516e-06, | |
| "loss": 0.2513, | |
| "step": 4 | |
| }, | |
| { | |
| "epoch": 0.03225806451612903, | |
| "grad_norm": 2.281903028488159, | |
| "learning_rate": 2.0161290322580646e-06, | |
| "loss": 0.2021, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.03870967741935484, | |
| "grad_norm": 2.2116780281066895, | |
| "learning_rate": 2.4193548387096776e-06, | |
| "loss": 0.2472, | |
| "step": 6 | |
| }, | |
| { | |
| "epoch": 0.04516129032258064, | |
| "grad_norm": 2.3709909915924072, | |
| "learning_rate": 2.82258064516129e-06, | |
| "loss": 0.2133, | |
| "step": 7 | |
| }, | |
| { | |
| "epoch": 0.05161290322580645, | |
| "grad_norm": 1.6222234964370728, | |
| "learning_rate": 3.225806451612903e-06, | |
| "loss": 0.2072, | |
| "step": 8 | |
| }, | |
| { | |
| "epoch": 0.05806451612903226, | |
| "grad_norm": 1.6226286888122559, | |
| "learning_rate": 3.6290322580645166e-06, | |
| "loss": 0.1732, | |
| "step": 9 | |
| }, | |
| { | |
| "epoch": 0.06451612903225806, | |
| "grad_norm": 1.514697551727295, | |
| "learning_rate": 4.032258064516129e-06, | |
| "loss": 0.2088, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.07096774193548387, | |
| "grad_norm": 1.6407102346420288, | |
| "learning_rate": 4.435483870967742e-06, | |
| "loss": 0.1733, | |
| "step": 11 | |
| }, | |
| { | |
| "epoch": 0.07741935483870968, | |
| "grad_norm": 1.5498034954071045, | |
| "learning_rate": 4.838709677419355e-06, | |
| "loss": 0.2087, | |
| "step": 12 | |
| }, | |
| { | |
| "epoch": 0.08387096774193549, | |
| "grad_norm": 1.9465513229370117, | |
| "learning_rate": 5.241935483870968e-06, | |
| "loss": 0.2046, | |
| "step": 13 | |
| }, | |
| { | |
| "epoch": 0.09032258064516129, | |
| "grad_norm": 1.3483728170394897, | |
| "learning_rate": 5.64516129032258e-06, | |
| "loss": 0.1783, | |
| "step": 14 | |
| }, | |
| { | |
| "epoch": 0.0967741935483871, | |
| "grad_norm": 1.4068559408187866, | |
| "learning_rate": 6.048387096774194e-06, | |
| "loss": 0.1519, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.1032258064516129, | |
| "grad_norm": 1.3083986043930054, | |
| "learning_rate": 6.451612903225806e-06, | |
| "loss": 0.1586, | |
| "step": 16 | |
| }, | |
| { | |
| "epoch": 0.10967741935483871, | |
| "grad_norm": 1.6559300422668457, | |
| "learning_rate": 6.854838709677419e-06, | |
| "loss": 0.1476, | |
| "step": 17 | |
| }, | |
| { | |
| "epoch": 0.11612903225806452, | |
| "grad_norm": 1.0691255331039429, | |
| "learning_rate": 7.258064516129033e-06, | |
| "loss": 0.1478, | |
| "step": 18 | |
| }, | |
| { | |
| "epoch": 0.12258064516129032, | |
| "grad_norm": 1.1155110597610474, | |
| "learning_rate": 7.661290322580646e-06, | |
| "loss": 0.1382, | |
| "step": 19 | |
| }, | |
| { | |
| "epoch": 0.12903225806451613, | |
| "grad_norm": 0.9816218018531799, | |
| "learning_rate": 8.064516129032258e-06, | |
| "loss": 0.1204, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.13548387096774195, | |
| "grad_norm": 1.2463096380233765, | |
| "learning_rate": 8.46774193548387e-06, | |
| "loss": 0.1503, | |
| "step": 21 | |
| }, | |
| { | |
| "epoch": 0.14193548387096774, | |
| "grad_norm": 1.3447906970977783, | |
| "learning_rate": 8.870967741935484e-06, | |
| "loss": 0.1224, | |
| "step": 22 | |
| }, | |
| { | |
| "epoch": 0.14838709677419354, | |
| "grad_norm": 1.1465381383895874, | |
| "learning_rate": 9.274193548387097e-06, | |
| "loss": 0.1374, | |
| "step": 23 | |
| }, | |
| { | |
| "epoch": 0.15483870967741936, | |
| "grad_norm": 1.0978549718856812, | |
| "learning_rate": 9.67741935483871e-06, | |
| "loss": 0.1303, | |
| "step": 24 | |
| }, | |
| { | |
| "epoch": 0.16129032258064516, | |
| "grad_norm": 1.1053048372268677, | |
| "learning_rate": 1.0080645161290323e-05, | |
| "loss": 0.1423, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.16774193548387098, | |
| "grad_norm": 1.0212026834487915, | |
| "learning_rate": 1.0483870967741936e-05, | |
| "loss": 0.1281, | |
| "step": 26 | |
| }, | |
| { | |
| "epoch": 0.17419354838709677, | |
| "grad_norm": 0.9742250442504883, | |
| "learning_rate": 1.0887096774193549e-05, | |
| "loss": 0.1228, | |
| "step": 27 | |
| }, | |
| { | |
| "epoch": 0.18064516129032257, | |
| "grad_norm": 1.1676782369613647, | |
| "learning_rate": 1.129032258064516e-05, | |
| "loss": 0.1429, | |
| "step": 28 | |
| }, | |
| { | |
| "epoch": 0.1870967741935484, | |
| "grad_norm": 1.088600516319275, | |
| "learning_rate": 1.1693548387096775e-05, | |
| "loss": 0.1069, | |
| "step": 29 | |
| }, | |
| { | |
| "epoch": 0.1935483870967742, | |
| "grad_norm": 1.3947196006774902, | |
| "learning_rate": 1.2096774193548388e-05, | |
| "loss": 0.1316, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "grad_norm": 0.9059141874313354, | |
| "learning_rate": 1.25e-05, | |
| "loss": 0.1121, | |
| "step": 31 | |
| }, | |
| { | |
| "epoch": 0.2064516129032258, | |
| "grad_norm": 1.0918734073638916, | |
| "learning_rate": 1.2903225806451613e-05, | |
| "loss": 0.1521, | |
| "step": 32 | |
| }, | |
| { | |
| "epoch": 0.2129032258064516, | |
| "grad_norm": 1.1550475358963013, | |
| "learning_rate": 1.3306451612903225e-05, | |
| "loss": 0.1251, | |
| "step": 33 | |
| }, | |
| { | |
| "epoch": 0.21935483870967742, | |
| "grad_norm": 1.2110551595687866, | |
| "learning_rate": 1.3709677419354839e-05, | |
| "loss": 0.1325, | |
| "step": 34 | |
| }, | |
| { | |
| "epoch": 0.22580645161290322, | |
| "grad_norm": 1.0340098142623901, | |
| "learning_rate": 1.4112903225806454e-05, | |
| "loss": 0.1115, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.23225806451612904, | |
| "grad_norm": 0.9789180159568787, | |
| "learning_rate": 1.4516129032258066e-05, | |
| "loss": 0.1154, | |
| "step": 36 | |
| }, | |
| { | |
| "epoch": 0.23870967741935484, | |
| "grad_norm": 0.8362810015678406, | |
| "learning_rate": 1.4919354838709679e-05, | |
| "loss": 0.1041, | |
| "step": 37 | |
| }, | |
| { | |
| "epoch": 0.24516129032258063, | |
| "grad_norm": 1.0805575847625732, | |
| "learning_rate": 1.5322580645161292e-05, | |
| "loss": 0.1201, | |
| "step": 38 | |
| }, | |
| { | |
| "epoch": 0.25161290322580643, | |
| "grad_norm": 1.0794912576675415, | |
| "learning_rate": 1.5725806451612903e-05, | |
| "loss": 0.1387, | |
| "step": 39 | |
| }, | |
| { | |
| "epoch": 0.25806451612903225, | |
| "grad_norm": 1.0303066968917847, | |
| "learning_rate": 1.6129032258064517e-05, | |
| "loss": 0.1381, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.2645161290322581, | |
| "grad_norm": 0.8959848284721375, | |
| "learning_rate": 1.653225806451613e-05, | |
| "loss": 0.1251, | |
| "step": 41 | |
| }, | |
| { | |
| "epoch": 0.2709677419354839, | |
| "grad_norm": 1.0856695175170898, | |
| "learning_rate": 1.693548387096774e-05, | |
| "loss": 0.1363, | |
| "step": 42 | |
| }, | |
| { | |
| "epoch": 0.27741935483870966, | |
| "grad_norm": 0.8375802636146545, | |
| "learning_rate": 1.733870967741936e-05, | |
| "loss": 0.1009, | |
| "step": 43 | |
| }, | |
| { | |
| "epoch": 0.2838709677419355, | |
| "grad_norm": 0.9029824733734131, | |
| "learning_rate": 1.774193548387097e-05, | |
| "loss": 0.1006, | |
| "step": 44 | |
| }, | |
| { | |
| "epoch": 0.2903225806451613, | |
| "grad_norm": 0.8736345767974854, | |
| "learning_rate": 1.8145161290322583e-05, | |
| "loss": 0.1212, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.2967741935483871, | |
| "grad_norm": 1.165887713432312, | |
| "learning_rate": 1.8548387096774193e-05, | |
| "loss": 0.1241, | |
| "step": 46 | |
| }, | |
| { | |
| "epoch": 0.3032258064516129, | |
| "grad_norm": 0.8511247634887695, | |
| "learning_rate": 1.8951612903225807e-05, | |
| "loss": 0.0942, | |
| "step": 47 | |
| }, | |
| { | |
| "epoch": 0.3096774193548387, | |
| "grad_norm": 1.0182602405548096, | |
| "learning_rate": 1.935483870967742e-05, | |
| "loss": 0.116, | |
| "step": 48 | |
| }, | |
| { | |
| "epoch": 0.3161290322580645, | |
| "grad_norm": 0.8452662825584412, | |
| "learning_rate": 1.975806451612903e-05, | |
| "loss": 0.1023, | |
| "step": 49 | |
| }, | |
| { | |
| "epoch": 0.3225806451612903, | |
| "grad_norm": 0.9583229422569275, | |
| "learning_rate": 2.0161290322580645e-05, | |
| "loss": 0.0956, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.32903225806451614, | |
| "grad_norm": 0.935484766960144, | |
| "learning_rate": 2.056451612903226e-05, | |
| "loss": 0.1185, | |
| "step": 51 | |
| }, | |
| { | |
| "epoch": 0.33548387096774196, | |
| "grad_norm": 0.9844627380371094, | |
| "learning_rate": 2.0967741935483873e-05, | |
| "loss": 0.0998, | |
| "step": 52 | |
| }, | |
| { | |
| "epoch": 0.3419354838709677, | |
| "grad_norm": 1.0139315128326416, | |
| "learning_rate": 2.1370967741935487e-05, | |
| "loss": 0.0901, | |
| "step": 53 | |
| }, | |
| { | |
| "epoch": 0.34838709677419355, | |
| "grad_norm": 0.844688892364502, | |
| "learning_rate": 2.1774193548387097e-05, | |
| "loss": 0.1158, | |
| "step": 54 | |
| }, | |
| { | |
| "epoch": 0.3548387096774194, | |
| "grad_norm": 0.778408408164978, | |
| "learning_rate": 2.217741935483871e-05, | |
| "loss": 0.0884, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.36129032258064514, | |
| "grad_norm": 0.7307286858558655, | |
| "learning_rate": 2.258064516129032e-05, | |
| "loss": 0.1099, | |
| "step": 56 | |
| }, | |
| { | |
| "epoch": 0.36774193548387096, | |
| "grad_norm": 0.681089460849762, | |
| "learning_rate": 2.2983870967741935e-05, | |
| "loss": 0.0965, | |
| "step": 57 | |
| }, | |
| { | |
| "epoch": 0.3741935483870968, | |
| "grad_norm": 0.7206712365150452, | |
| "learning_rate": 2.338709677419355e-05, | |
| "loss": 0.0898, | |
| "step": 58 | |
| }, | |
| { | |
| "epoch": 0.38064516129032255, | |
| "grad_norm": 0.6326794624328613, | |
| "learning_rate": 2.3790322580645163e-05, | |
| "loss": 0.0896, | |
| "step": 59 | |
| }, | |
| { | |
| "epoch": 0.3870967741935484, | |
| "grad_norm": 0.6684013605117798, | |
| "learning_rate": 2.4193548387096777e-05, | |
| "loss": 0.0859, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.3935483870967742, | |
| "grad_norm": 0.7839128971099854, | |
| "learning_rate": 2.4596774193548387e-05, | |
| "loss": 0.0918, | |
| "step": 61 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 0.7025837302207947, | |
| "learning_rate": 2.5e-05, | |
| "loss": 0.0933, | |
| "step": 62 | |
| }, | |
| { | |
| "epoch": 0.4064516129032258, | |
| "grad_norm": 0.7583072185516357, | |
| "learning_rate": 2.4999801888257584e-05, | |
| "loss": 0.0916, | |
| "step": 63 | |
| }, | |
| { | |
| "epoch": 0.4129032258064516, | |
| "grad_norm": 0.8116795420646667, | |
| "learning_rate": 2.499920755931005e-05, | |
| "loss": 0.0934, | |
| "step": 64 | |
| }, | |
| { | |
| "epoch": 0.41935483870967744, | |
| "grad_norm": 0.9053534865379333, | |
| "learning_rate": 2.4998217031996375e-05, | |
| "loss": 0.1116, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 0.4258064516129032, | |
| "grad_norm": 0.773985743522644, | |
| "learning_rate": 2.4996830337714163e-05, | |
| "loss": 0.0874, | |
| "step": 66 | |
| }, | |
| { | |
| "epoch": 0.432258064516129, | |
| "grad_norm": 0.8468173146247864, | |
| "learning_rate": 2.4995047520418692e-05, | |
| "loss": 0.0954, | |
| "step": 67 | |
| }, | |
| { | |
| "epoch": 0.43870967741935485, | |
| "grad_norm": 0.7126619815826416, | |
| "learning_rate": 2.4992868636621474e-05, | |
| "loss": 0.1017, | |
| "step": 68 | |
| }, | |
| { | |
| "epoch": 0.44516129032258067, | |
| "grad_norm": 0.7975043654441833, | |
| "learning_rate": 2.4990293755388524e-05, | |
| "loss": 0.1086, | |
| "step": 69 | |
| }, | |
| { | |
| "epoch": 0.45161290322580644, | |
| "grad_norm": 0.8055579662322998, | |
| "learning_rate": 2.4987322958338095e-05, | |
| "loss": 0.0836, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.45806451612903226, | |
| "grad_norm": 0.6494209170341492, | |
| "learning_rate": 2.4983956339638158e-05, | |
| "loss": 0.0883, | |
| "step": 71 | |
| }, | |
| { | |
| "epoch": 0.4645161290322581, | |
| "grad_norm": 0.6997829675674438, | |
| "learning_rate": 2.4980194006003392e-05, | |
| "loss": 0.0763, | |
| "step": 72 | |
| }, | |
| { | |
| "epoch": 0.47096774193548385, | |
| "grad_norm": 0.596174418926239, | |
| "learning_rate": 2.4976036076691787e-05, | |
| "loss": 0.0871, | |
| "step": 73 | |
| }, | |
| { | |
| "epoch": 0.4774193548387097, | |
| "grad_norm": 0.6535652279853821, | |
| "learning_rate": 2.4971482683500884e-05, | |
| "loss": 0.0869, | |
| "step": 74 | |
| }, | |
| { | |
| "epoch": 0.4838709677419355, | |
| "grad_norm": 0.8003737926483154, | |
| "learning_rate": 2.4966533970763586e-05, | |
| "loss": 0.1086, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.49032258064516127, | |
| "grad_norm": 0.6992926001548767, | |
| "learning_rate": 2.496119009534359e-05, | |
| "loss": 0.0822, | |
| "step": 76 | |
| }, | |
| { | |
| "epoch": 0.4967741935483871, | |
| "grad_norm": 0.6500689387321472, | |
| "learning_rate": 2.4955451226630412e-05, | |
| "loss": 0.0876, | |
| "step": 77 | |
| }, | |
| { | |
| "epoch": 0.5032258064516129, | |
| "grad_norm": 0.7626132369041443, | |
| "learning_rate": 2.4949317546534018e-05, | |
| "loss": 0.0911, | |
| "step": 78 | |
| }, | |
| { | |
| "epoch": 0.5096774193548387, | |
| "grad_norm": 0.6485949158668518, | |
| "learning_rate": 2.4942789249479054e-05, | |
| "loss": 0.0914, | |
| "step": 79 | |
| }, | |
| { | |
| "epoch": 0.5161290322580645, | |
| "grad_norm": 0.692364513874054, | |
| "learning_rate": 2.493586654239869e-05, | |
| "loss": 0.1074, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.5225806451612903, | |
| "grad_norm": 0.7383131980895996, | |
| "learning_rate": 2.4928549644728057e-05, | |
| "loss": 0.085, | |
| "step": 81 | |
| }, | |
| { | |
| "epoch": 0.5290322580645161, | |
| "grad_norm": 0.6585950255393982, | |
| "learning_rate": 2.492083878839729e-05, | |
| "loss": 0.0795, | |
| "step": 82 | |
| }, | |
| { | |
| "epoch": 0.535483870967742, | |
| "grad_norm": 0.7683681845664978, | |
| "learning_rate": 2.491273421782417e-05, | |
| "loss": 0.073, | |
| "step": 83 | |
| }, | |
| { | |
| "epoch": 0.5419354838709678, | |
| "grad_norm": 0.5386450290679932, | |
| "learning_rate": 2.4904236189906406e-05, | |
| "loss": 0.0814, | |
| "step": 84 | |
| }, | |
| { | |
| "epoch": 0.5483870967741935, | |
| "grad_norm": 0.725712239742279, | |
| "learning_rate": 2.489534497401345e-05, | |
| "loss": 0.0896, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 0.5548387096774193, | |
| "grad_norm": 0.8596577644348145, | |
| "learning_rate": 2.488606085197799e-05, | |
| "loss": 0.0816, | |
| "step": 86 | |
| }, | |
| { | |
| "epoch": 0.5612903225806452, | |
| "grad_norm": 0.7653164863586426, | |
| "learning_rate": 2.4876384118086992e-05, | |
| "loss": 0.1078, | |
| "step": 87 | |
| }, | |
| { | |
| "epoch": 0.567741935483871, | |
| "grad_norm": 0.713628351688385, | |
| "learning_rate": 2.48663150790724e-05, | |
| "loss": 0.0887, | |
| "step": 88 | |
| }, | |
| { | |
| "epoch": 0.5741935483870968, | |
| "grad_norm": 0.5724640488624573, | |
| "learning_rate": 2.4855854054101395e-05, | |
| "loss": 0.0849, | |
| "step": 89 | |
| }, | |
| { | |
| "epoch": 0.5806451612903226, | |
| "grad_norm": 0.6235289573669434, | |
| "learning_rate": 2.484500137476627e-05, | |
| "loss": 0.0875, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.5870967741935483, | |
| "grad_norm": 0.785372793674469, | |
| "learning_rate": 2.483375738507395e-05, | |
| "loss": 0.1225, | |
| "step": 91 | |
| }, | |
| { | |
| "epoch": 0.5935483870967742, | |
| "grad_norm": 0.6431748867034912, | |
| "learning_rate": 2.4822122441435047e-05, | |
| "loss": 0.0913, | |
| "step": 92 | |
| }, | |
| { | |
| "epoch": 0.6, | |
| "grad_norm": 0.8031719923019409, | |
| "learning_rate": 2.4810096912652604e-05, | |
| "loss": 0.102, | |
| "step": 93 | |
| }, | |
| { | |
| "epoch": 0.6064516129032258, | |
| "grad_norm": 0.5750744938850403, | |
| "learning_rate": 2.4797681179910363e-05, | |
| "loss": 0.0754, | |
| "step": 94 | |
| }, | |
| { | |
| "epoch": 0.6129032258064516, | |
| "grad_norm": 0.7892565727233887, | |
| "learning_rate": 2.4784875636760727e-05, | |
| "loss": 0.0825, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 0.6193548387096774, | |
| "grad_norm": 0.7932739853858948, | |
| "learning_rate": 2.4771680689112244e-05, | |
| "loss": 0.1262, | |
| "step": 96 | |
| }, | |
| { | |
| "epoch": 0.6258064516129033, | |
| "grad_norm": 0.7647889852523804, | |
| "learning_rate": 2.4758096755216763e-05, | |
| "loss": 0.1083, | |
| "step": 97 | |
| }, | |
| { | |
| "epoch": 0.632258064516129, | |
| "grad_norm": 0.9550963640213013, | |
| "learning_rate": 2.474412426565618e-05, | |
| "loss": 0.0828, | |
| "step": 98 | |
| }, | |
| { | |
| "epoch": 0.6387096774193548, | |
| "grad_norm": 0.6981013417243958, | |
| "learning_rate": 2.4729763663328774e-05, | |
| "loss": 0.0943, | |
| "step": 99 | |
| }, | |
| { | |
| "epoch": 0.6451612903225806, | |
| "grad_norm": 0.8088532090187073, | |
| "learning_rate": 2.4715015403435176e-05, | |
| "loss": 0.0954, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.6516129032258065, | |
| "grad_norm": 0.9130911231040955, | |
| "learning_rate": 2.4699879953463945e-05, | |
| "loss": 0.0973, | |
| "step": 101 | |
| }, | |
| { | |
| "epoch": 0.6580645161290323, | |
| "grad_norm": 0.791867196559906, | |
| "learning_rate": 2.468435779317673e-05, | |
| "loss": 0.0946, | |
| "step": 102 | |
| }, | |
| { | |
| "epoch": 0.6645161290322581, | |
| "grad_norm": 0.6049063205718994, | |
| "learning_rate": 2.466844941459309e-05, | |
| "loss": 0.0797, | |
| "step": 103 | |
| }, | |
| { | |
| "epoch": 0.6709677419354839, | |
| "grad_norm": 0.6488558053970337, | |
| "learning_rate": 2.4652155321974883e-05, | |
| "loss": 0.1004, | |
| "step": 104 | |
| }, | |
| { | |
| "epoch": 0.6774193548387096, | |
| "grad_norm": 0.7218672633171082, | |
| "learning_rate": 2.4635476031810284e-05, | |
| "loss": 0.0943, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 0.6838709677419355, | |
| "grad_norm": 0.7997153997421265, | |
| "learning_rate": 2.4618412072797407e-05, | |
| "loss": 0.0831, | |
| "step": 106 | |
| }, | |
| { | |
| "epoch": 0.6903225806451613, | |
| "grad_norm": 0.8165119886398315, | |
| "learning_rate": 2.4600963985827555e-05, | |
| "loss": 0.0919, | |
| "step": 107 | |
| }, | |
| { | |
| "epoch": 0.6967741935483871, | |
| "grad_norm": 0.704238772392273, | |
| "learning_rate": 2.458313232396808e-05, | |
| "loss": 0.0778, | |
| "step": 108 | |
| }, | |
| { | |
| "epoch": 0.7032258064516129, | |
| "grad_norm": 0.6857476234436035, | |
| "learning_rate": 2.456491765244483e-05, | |
| "loss": 0.0914, | |
| "step": 109 | |
| }, | |
| { | |
| "epoch": 0.7096774193548387, | |
| "grad_norm": 0.7254015803337097, | |
| "learning_rate": 2.4546320548624264e-05, | |
| "loss": 0.1102, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.7161290322580646, | |
| "grad_norm": 0.6534197330474854, | |
| "learning_rate": 2.4527341601995115e-05, | |
| "loss": 0.0841, | |
| "step": 111 | |
| }, | |
| { | |
| "epoch": 0.7225806451612903, | |
| "grad_norm": 0.6944810152053833, | |
| "learning_rate": 2.450798141414974e-05, | |
| "loss": 0.1067, | |
| "step": 112 | |
| }, | |
| { | |
| "epoch": 0.7290322580645161, | |
| "grad_norm": 0.7583324909210205, | |
| "learning_rate": 2.448824059876503e-05, | |
| "loss": 0.0979, | |
| "step": 113 | |
| }, | |
| { | |
| "epoch": 0.7354838709677419, | |
| "grad_norm": 0.5010597705841064, | |
| "learning_rate": 2.4468119781582948e-05, | |
| "loss": 0.069, | |
| "step": 114 | |
| }, | |
| { | |
| "epoch": 0.7419354838709677, | |
| "grad_norm": 0.5694583058357239, | |
| "learning_rate": 2.444761960039072e-05, | |
| "loss": 0.0687, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 0.7483870967741936, | |
| "grad_norm": 0.803371787071228, | |
| "learning_rate": 2.442674070500061e-05, | |
| "loss": 0.123, | |
| "step": 116 | |
| }, | |
| { | |
| "epoch": 0.7548387096774194, | |
| "grad_norm": 0.6523027420043945, | |
| "learning_rate": 2.4405483757229314e-05, | |
| "loss": 0.0917, | |
| "step": 117 | |
| }, | |
| { | |
| "epoch": 0.7612903225806451, | |
| "grad_norm": 0.6718930006027222, | |
| "learning_rate": 2.438384943087698e-05, | |
| "loss": 0.0854, | |
| "step": 118 | |
| }, | |
| { | |
| "epoch": 0.7677419354838709, | |
| "grad_norm": 0.5987946391105652, | |
| "learning_rate": 2.4361838411705865e-05, | |
| "loss": 0.0941, | |
| "step": 119 | |
| }, | |
| { | |
| "epoch": 0.7741935483870968, | |
| "grad_norm": 0.6336897015571594, | |
| "learning_rate": 2.4339451397418584e-05, | |
| "loss": 0.0885, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.7806451612903226, | |
| "grad_norm": 0.7484766840934753, | |
| "learning_rate": 2.4316689097636008e-05, | |
| "loss": 0.0966, | |
| "step": 121 | |
| }, | |
| { | |
| "epoch": 0.7870967741935484, | |
| "grad_norm": 0.7096850275993347, | |
| "learning_rate": 2.4293552233874754e-05, | |
| "loss": 0.0843, | |
| "step": 122 | |
| }, | |
| { | |
| "epoch": 0.7935483870967742, | |
| "grad_norm": 0.6953093409538269, | |
| "learning_rate": 2.4270041539524322e-05, | |
| "loss": 0.079, | |
| "step": 123 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "grad_norm": 0.6068540215492249, | |
| "learning_rate": 2.4246157759823855e-05, | |
| "loss": 0.0846, | |
| "step": 124 | |
| }, | |
| { | |
| "epoch": 0.8064516129032258, | |
| "grad_norm": 0.5982446670532227, | |
| "learning_rate": 2.4221901651838506e-05, | |
| "loss": 0.0864, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 0.8129032258064516, | |
| "grad_norm": 0.6706437468528748, | |
| "learning_rate": 2.419727398443545e-05, | |
| "loss": 0.0796, | |
| "step": 126 | |
| }, | |
| { | |
| "epoch": 0.8193548387096774, | |
| "grad_norm": 0.6994534730911255, | |
| "learning_rate": 2.417227553825949e-05, | |
| "loss": 0.0775, | |
| "step": 127 | |
| }, | |
| { | |
| "epoch": 0.8258064516129032, | |
| "grad_norm": 0.6935513615608215, | |
| "learning_rate": 2.4146907105708357e-05, | |
| "loss": 0.1003, | |
| "step": 128 | |
| }, | |
| { | |
| "epoch": 0.832258064516129, | |
| "grad_norm": 0.6945312023162842, | |
| "learning_rate": 2.4121169490907544e-05, | |
| "loss": 0.0901, | |
| "step": 129 | |
| }, | |
| { | |
| "epoch": 0.8387096774193549, | |
| "grad_norm": 0.6928992867469788, | |
| "learning_rate": 2.409506350968485e-05, | |
| "loss": 0.0991, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.8451612903225807, | |
| "grad_norm": 0.6358478665351868, | |
| "learning_rate": 2.4068589989544498e-05, | |
| "loss": 0.0877, | |
| "step": 131 | |
| }, | |
| { | |
| "epoch": 0.8516129032258064, | |
| "grad_norm": 0.6835708022117615, | |
| "learning_rate": 2.404174976964092e-05, | |
| "loss": 0.1058, | |
| "step": 132 | |
| }, | |
| { | |
| "epoch": 0.8580645161290322, | |
| "grad_norm": 0.6372717022895813, | |
| "learning_rate": 2.4014543700752156e-05, | |
| "loss": 0.0899, | |
| "step": 133 | |
| }, | |
| { | |
| "epoch": 0.864516129032258, | |
| "grad_norm": 0.671310544013977, | |
| "learning_rate": 2.3986972645252883e-05, | |
| "loss": 0.0744, | |
| "step": 134 | |
| }, | |
| { | |
| "epoch": 0.8709677419354839, | |
| "grad_norm": 0.5800638794898987, | |
| "learning_rate": 2.395903747708707e-05, | |
| "loss": 0.0818, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 0.8774193548387097, | |
| "grad_norm": 0.5142645835876465, | |
| "learning_rate": 2.39307390817403e-05, | |
| "loss": 0.0811, | |
| "step": 136 | |
| }, | |
| { | |
| "epoch": 0.8838709677419355, | |
| "grad_norm": 0.7107434868812561, | |
| "learning_rate": 2.390207835621167e-05, | |
| "loss": 0.0876, | |
| "step": 137 | |
| }, | |
| { | |
| "epoch": 0.8903225806451613, | |
| "grad_norm": 0.6197046637535095, | |
| "learning_rate": 2.3873056208985383e-05, | |
| "loss": 0.0907, | |
| "step": 138 | |
| }, | |
| { | |
| "epoch": 0.896774193548387, | |
| "grad_norm": 0.8946641087532043, | |
| "learning_rate": 2.384367356000195e-05, | |
| "loss": 0.0867, | |
| "step": 139 | |
| }, | |
| { | |
| "epoch": 0.9032258064516129, | |
| "grad_norm": 0.6002138257026672, | |
| "learning_rate": 2.3813931340629018e-05, | |
| "loss": 0.0766, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.9096774193548387, | |
| "grad_norm": 0.4771173298358917, | |
| "learning_rate": 2.378383049363184e-05, | |
| "loss": 0.074, | |
| "step": 141 | |
| }, | |
| { | |
| "epoch": 0.9161290322580645, | |
| "grad_norm": 0.6188220381736755, | |
| "learning_rate": 2.3753371973143433e-05, | |
| "loss": 0.0823, | |
| "step": 142 | |
| }, | |
| { | |
| "epoch": 0.9225806451612903, | |
| "grad_norm": 0.509564995765686, | |
| "learning_rate": 2.3722556744634272e-05, | |
| "loss": 0.069, | |
| "step": 143 | |
| }, | |
| { | |
| "epoch": 0.9290322580645162, | |
| "grad_norm": 0.5153804421424866, | |
| "learning_rate": 2.3691385784881743e-05, | |
| "loss": 0.064, | |
| "step": 144 | |
| }, | |
| { | |
| "epoch": 0.9354838709677419, | |
| "grad_norm": 0.5935696363449097, | |
| "learning_rate": 2.3659860081939146e-05, | |
| "loss": 0.0827, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 0.9419354838709677, | |
| "grad_norm": 0.4910190999507904, | |
| "learning_rate": 2.3627980635104396e-05, | |
| "loss": 0.0804, | |
| "step": 146 | |
| }, | |
| { | |
| "epoch": 0.9483870967741935, | |
| "grad_norm": 0.6524127721786499, | |
| "learning_rate": 2.359574845488833e-05, | |
| "loss": 0.0956, | |
| "step": 147 | |
| }, | |
| { | |
| "epoch": 0.9548387096774194, | |
| "grad_norm": 0.6664571762084961, | |
| "learning_rate": 2.356316456298269e-05, | |
| "loss": 0.0937, | |
| "step": 148 | |
| }, | |
| { | |
| "epoch": 0.9612903225806452, | |
| "grad_norm": 0.579138994216919, | |
| "learning_rate": 2.353022999222774e-05, | |
| "loss": 0.0936, | |
| "step": 149 | |
| }, | |
| { | |
| "epoch": 0.967741935483871, | |
| "grad_norm": 0.6929976940155029, | |
| "learning_rate": 2.3496945786579503e-05, | |
| "loss": 0.1197, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.9741935483870968, | |
| "grad_norm": 0.6236998438835144, | |
| "learning_rate": 2.3463313001076696e-05, | |
| "loss": 0.0958, | |
| "step": 151 | |
| }, | |
| { | |
| "epoch": 0.9806451612903225, | |
| "grad_norm": 0.4676724672317505, | |
| "learning_rate": 2.342933270180728e-05, | |
| "loss": 0.0713, | |
| "step": 152 | |
| }, | |
| { | |
| "epoch": 0.9870967741935484, | |
| "grad_norm": 0.4456840753555298, | |
| "learning_rate": 2.3395005965874657e-05, | |
| "loss": 0.0784, | |
| "step": 153 | |
| }, | |
| { | |
| "epoch": 0.9935483870967742, | |
| "grad_norm": 0.5712344646453857, | |
| "learning_rate": 2.336033388136355e-05, | |
| "loss": 0.0935, | |
| "step": 154 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 0.4726645350456238, | |
| "learning_rate": 2.3325317547305485e-05, | |
| "loss": 0.0712, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 1.0064516129032257, | |
| "grad_norm": 0.48649105429649353, | |
| "learning_rate": 2.3289958073643976e-05, | |
| "loss": 0.0584, | |
| "step": 156 | |
| }, | |
| { | |
| "epoch": 1.0129032258064516, | |
| "grad_norm": 0.5159472823143005, | |
| "learning_rate": 2.3254256581199336e-05, | |
| "loss": 0.0579, | |
| "step": 157 | |
| }, | |
| { | |
| "epoch": 1.0193548387096774, | |
| "grad_norm": 0.5775710344314575, | |
| "learning_rate": 2.3218214201633136e-05, | |
| "loss": 0.0676, | |
| "step": 158 | |
| }, | |
| { | |
| "epoch": 1.0258064516129033, | |
| "grad_norm": 0.5070593357086182, | |
| "learning_rate": 2.318183207741237e-05, | |
| "loss": 0.0794, | |
| "step": 159 | |
| }, | |
| { | |
| "epoch": 1.032258064516129, | |
| "grad_norm": 0.38065212965011597, | |
| "learning_rate": 2.3145111361773186e-05, | |
| "loss": 0.051, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 1.038709677419355, | |
| "grad_norm": 0.562282383441925, | |
| "learning_rate": 2.310805321868439e-05, | |
| "loss": 0.0753, | |
| "step": 161 | |
| }, | |
| { | |
| "epoch": 1.0451612903225806, | |
| "grad_norm": 0.49883219599723816, | |
| "learning_rate": 2.30706588228105e-05, | |
| "loss": 0.0554, | |
| "step": 162 | |
| }, | |
| { | |
| "epoch": 1.0516129032258064, | |
| "grad_norm": 0.5298740863800049, | |
| "learning_rate": 2.303292935947455e-05, | |
| "loss": 0.0602, | |
| "step": 163 | |
| }, | |
| { | |
| "epoch": 1.0580645161290323, | |
| "grad_norm": 0.5768100619316101, | |
| "learning_rate": 2.2994866024620486e-05, | |
| "loss": 0.0585, | |
| "step": 164 | |
| }, | |
| { | |
| "epoch": 1.064516129032258, | |
| "grad_norm": 0.5079744458198547, | |
| "learning_rate": 2.2956470024775294e-05, | |
| "loss": 0.0459, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 1.070967741935484, | |
| "grad_norm": 0.5212790966033936, | |
| "learning_rate": 2.291774257701072e-05, | |
| "loss": 0.0619, | |
| "step": 166 | |
| }, | |
| { | |
| "epoch": 1.0774193548387097, | |
| "grad_norm": 0.5063428282737732, | |
| "learning_rate": 2.2878684908904707e-05, | |
| "loss": 0.0609, | |
| "step": 167 | |
| }, | |
| { | |
| "epoch": 1.0838709677419356, | |
| "grad_norm": 0.6523650288581848, | |
| "learning_rate": 2.2839298258502483e-05, | |
| "loss": 0.067, | |
| "step": 168 | |
| }, | |
| { | |
| "epoch": 1.0903225806451613, | |
| "grad_norm": 0.57984459400177, | |
| "learning_rate": 2.279958387427732e-05, | |
| "loss": 0.0703, | |
| "step": 169 | |
| }, | |
| { | |
| "epoch": 1.096774193548387, | |
| "grad_norm": 0.6002654433250427, | |
| "learning_rate": 2.2759543015090955e-05, | |
| "loss": 0.074, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 1.103225806451613, | |
| "grad_norm": 0.3899862766265869, | |
| "learning_rate": 2.2719176950153688e-05, | |
| "loss": 0.0461, | |
| "step": 171 | |
| }, | |
| { | |
| "epoch": 1.1096774193548387, | |
| "grad_norm": 0.5003259778022766, | |
| "learning_rate": 2.267848695898416e-05, | |
| "loss": 0.0613, | |
| "step": 172 | |
| }, | |
| { | |
| "epoch": 1.1161290322580646, | |
| "grad_norm": 0.558653712272644, | |
| "learning_rate": 2.2637474331368766e-05, | |
| "loss": 0.0658, | |
| "step": 173 | |
| }, | |
| { | |
| "epoch": 1.1225806451612903, | |
| "grad_norm": 0.5032625794410706, | |
| "learning_rate": 2.2596140367320813e-05, | |
| "loss": 0.0564, | |
| "step": 174 | |
| }, | |
| { | |
| "epoch": 1.129032258064516, | |
| "grad_norm": 0.5199857950210571, | |
| "learning_rate": 2.2554486377039282e-05, | |
| "loss": 0.0587, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 1.135483870967742, | |
| "grad_norm": 0.6159687042236328, | |
| "learning_rate": 2.251251368086731e-05, | |
| "loss": 0.0585, | |
| "step": 176 | |
| }, | |
| { | |
| "epoch": 1.1419354838709677, | |
| "grad_norm": 0.5216447114944458, | |
| "learning_rate": 2.2470223609250328e-05, | |
| "loss": 0.0501, | |
| "step": 177 | |
| }, | |
| { | |
| "epoch": 1.1483870967741936, | |
| "grad_norm": 0.49131321907043457, | |
| "learning_rate": 2.24276175026939e-05, | |
| "loss": 0.053, | |
| "step": 178 | |
| }, | |
| { | |
| "epoch": 1.1548387096774193, | |
| "grad_norm": 0.8894760608673096, | |
| "learning_rate": 2.238469671172123e-05, | |
| "loss": 0.0854, | |
| "step": 179 | |
| }, | |
| { | |
| "epoch": 1.1612903225806452, | |
| "grad_norm": 0.6628456711769104, | |
| "learning_rate": 2.2341462596830354e-05, | |
| "loss": 0.064, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 1.167741935483871, | |
| "grad_norm": 0.4577731788158417, | |
| "learning_rate": 2.229791652845099e-05, | |
| "loss": 0.0543, | |
| "step": 181 | |
| }, | |
| { | |
| "epoch": 1.1741935483870969, | |
| "grad_norm": 0.49301421642303467, | |
| "learning_rate": 2.225405988690115e-05, | |
| "loss": 0.0598, | |
| "step": 182 | |
| }, | |
| { | |
| "epoch": 1.1806451612903226, | |
| "grad_norm": 0.523009717464447, | |
| "learning_rate": 2.220989406234333e-05, | |
| "loss": 0.0752, | |
| "step": 183 | |
| }, | |
| { | |
| "epoch": 1.1870967741935483, | |
| "grad_norm": 0.7591210007667542, | |
| "learning_rate": 2.2165420454740494e-05, | |
| "loss": 0.0643, | |
| "step": 184 | |
| }, | |
| { | |
| "epoch": 1.1935483870967742, | |
| "grad_norm": 0.532319962978363, | |
| "learning_rate": 2.2120640473811656e-05, | |
| "loss": 0.0464, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 1.2, | |
| "grad_norm": 0.4334715008735657, | |
| "learning_rate": 2.2075555538987227e-05, | |
| "loss": 0.0669, | |
| "step": 186 | |
| }, | |
| { | |
| "epoch": 1.206451612903226, | |
| "grad_norm": 0.4052492380142212, | |
| "learning_rate": 2.2030167079364007e-05, | |
| "loss": 0.0506, | |
| "step": 187 | |
| }, | |
| { | |
| "epoch": 1.2129032258064516, | |
| "grad_norm": 0.763782799243927, | |
| "learning_rate": 2.1984476533659888e-05, | |
| "loss": 0.0477, | |
| "step": 188 | |
| }, | |
| { | |
| "epoch": 1.2193548387096773, | |
| "grad_norm": 0.46810972690582275, | |
| "learning_rate": 2.1938485350168248e-05, | |
| "loss": 0.055, | |
| "step": 189 | |
| }, | |
| { | |
| "epoch": 1.2258064516129032, | |
| "grad_norm": 0.4722144901752472, | |
| "learning_rate": 2.1892194986712045e-05, | |
| "loss": 0.053, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 1.232258064516129, | |
| "grad_norm": 0.5537333488464355, | |
| "learning_rate": 2.1845606910597616e-05, | |
| "loss": 0.0686, | |
| "step": 191 | |
| }, | |
| { | |
| "epoch": 1.238709677419355, | |
| "grad_norm": 0.5123704671859741, | |
| "learning_rate": 2.179872259856814e-05, | |
| "loss": 0.0627, | |
| "step": 192 | |
| }, | |
| { | |
| "epoch": 1.2451612903225806, | |
| "grad_norm": 0.5691571831703186, | |
| "learning_rate": 2.175154353675686e-05, | |
| "loss": 0.0601, | |
| "step": 193 | |
| }, | |
| { | |
| "epoch": 1.2516129032258063, | |
| "grad_norm": 0.4747653007507324, | |
| "learning_rate": 2.1704071220639965e-05, | |
| "loss": 0.0551, | |
| "step": 194 | |
| }, | |
| { | |
| "epoch": 1.2580645161290323, | |
| "grad_norm": 0.5692989826202393, | |
| "learning_rate": 2.1656307154989174e-05, | |
| "loss": 0.0482, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 1.2645161290322582, | |
| "grad_norm": 0.7472412586212158, | |
| "learning_rate": 2.1608252853824047e-05, | |
| "loss": 0.0609, | |
| "step": 196 | |
| }, | |
| { | |
| "epoch": 1.270967741935484, | |
| "grad_norm": 0.568708062171936, | |
| "learning_rate": 2.1559909840364e-05, | |
| "loss": 0.0572, | |
| "step": 197 | |
| }, | |
| { | |
| "epoch": 1.2774193548387096, | |
| "grad_norm": 0.6601235866546631, | |
| "learning_rate": 2.1511279646980016e-05, | |
| "loss": 0.0777, | |
| "step": 198 | |
| }, | |
| { | |
| "epoch": 1.2838709677419355, | |
| "grad_norm": 0.429850697517395, | |
| "learning_rate": 2.1462363815146065e-05, | |
| "loss": 0.0454, | |
| "step": 199 | |
| }, | |
| { | |
| "epoch": 1.2903225806451613, | |
| "grad_norm": 0.7702894806861877, | |
| "learning_rate": 2.1413163895390254e-05, | |
| "loss": 0.0655, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 1.2967741935483872, | |
| "grad_norm": 0.4497153162956238, | |
| "learning_rate": 2.1363681447245686e-05, | |
| "loss": 0.0512, | |
| "step": 201 | |
| }, | |
| { | |
| "epoch": 1.303225806451613, | |
| "grad_norm": 0.5631290674209595, | |
| "learning_rate": 2.1313918039200995e-05, | |
| "loss": 0.0645, | |
| "step": 202 | |
| }, | |
| { | |
| "epoch": 1.3096774193548386, | |
| "grad_norm": 0.7414901852607727, | |
| "learning_rate": 2.1263875248650662e-05, | |
| "loss": 0.0561, | |
| "step": 203 | |
| }, | |
| { | |
| "epoch": 1.3161290322580645, | |
| "grad_norm": 0.5053102970123291, | |
| "learning_rate": 2.121355466184499e-05, | |
| "loss": 0.0608, | |
| "step": 204 | |
| }, | |
| { | |
| "epoch": 1.3225806451612903, | |
| "grad_norm": 10.00545597076416, | |
| "learning_rate": 2.116295787383985e-05, | |
| "loss": 0.0826, | |
| "step": 205 | |
| }, | |
| { | |
| "epoch": 1.3290322580645162, | |
| "grad_norm": 0.6418637037277222, | |
| "learning_rate": 2.1112086488446085e-05, | |
| "loss": 0.0743, | |
| "step": 206 | |
| }, | |
| { | |
| "epoch": 1.335483870967742, | |
| "grad_norm": 0.4627211391925812, | |
| "learning_rate": 2.1060942118178706e-05, | |
| "loss": 0.0476, | |
| "step": 207 | |
| }, | |
| { | |
| "epoch": 1.3419354838709676, | |
| "grad_norm": 0.5375849604606628, | |
| "learning_rate": 2.1009526384205767e-05, | |
| "loss": 0.048, | |
| "step": 208 | |
| }, | |
| { | |
| "epoch": 1.3483870967741935, | |
| "grad_norm": 0.606073796749115, | |
| "learning_rate": 2.095784091629697e-05, | |
| "loss": 0.0704, | |
| "step": 209 | |
| }, | |
| { | |
| "epoch": 1.3548387096774195, | |
| "grad_norm": 0.44339242577552795, | |
| "learning_rate": 2.0905887352772004e-05, | |
| "loss": 0.0516, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 1.3612903225806452, | |
| "grad_norm": 0.6248610019683838, | |
| "learning_rate": 2.085366734044864e-05, | |
| "loss": 0.066, | |
| "step": 211 | |
| }, | |
| { | |
| "epoch": 1.367741935483871, | |
| "grad_norm": 0.5914815664291382, | |
| "learning_rate": 2.080118253459049e-05, | |
| "loss": 0.0611, | |
| "step": 212 | |
| }, | |
| { | |
| "epoch": 1.3741935483870968, | |
| "grad_norm": 0.45894381403923035, | |
| "learning_rate": 2.0748434598854573e-05, | |
| "loss": 0.0501, | |
| "step": 213 | |
| }, | |
| { | |
| "epoch": 1.3806451612903226, | |
| "grad_norm": 0.49100032448768616, | |
| "learning_rate": 2.0695425205238557e-05, | |
| "loss": 0.0552, | |
| "step": 214 | |
| }, | |
| { | |
| "epoch": 1.3870967741935485, | |
| "grad_norm": 0.528611958026886, | |
| "learning_rate": 2.0642156034027783e-05, | |
| "loss": 0.0639, | |
| "step": 215 | |
| }, | |
| { | |
| "epoch": 1.3935483870967742, | |
| "grad_norm": 0.4487656056880951, | |
| "learning_rate": 2.0588628773741973e-05, | |
| "loss": 0.0435, | |
| "step": 216 | |
| }, | |
| { | |
| "epoch": 1.4, | |
| "grad_norm": 0.547444224357605, | |
| "learning_rate": 2.0534845121081742e-05, | |
| "loss": 0.0547, | |
| "step": 217 | |
| }, | |
| { | |
| "epoch": 1.4064516129032258, | |
| "grad_norm": 0.5207445621490479, | |
| "learning_rate": 2.0480806780874794e-05, | |
| "loss": 0.0574, | |
| "step": 218 | |
| }, | |
| { | |
| "epoch": 1.4129032258064516, | |
| "grad_norm": 0.5784499049186707, | |
| "learning_rate": 2.0426515466021887e-05, | |
| "loss": 0.0608, | |
| "step": 219 | |
| }, | |
| { | |
| "epoch": 1.4193548387096775, | |
| "grad_norm": 0.7198527455329895, | |
| "learning_rate": 2.0371972897442532e-05, | |
| "loss": 0.0639, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 1.4258064516129032, | |
| "grad_norm": 0.4550151228904724, | |
| "learning_rate": 2.031718080402046e-05, | |
| "loss": 0.0547, | |
| "step": 221 | |
| }, | |
| { | |
| "epoch": 1.432258064516129, | |
| "grad_norm": 0.48588842153549194, | |
| "learning_rate": 2.026214092254881e-05, | |
| "loss": 0.0603, | |
| "step": 222 | |
| }, | |
| { | |
| "epoch": 1.4387096774193548, | |
| "grad_norm": 0.5426737666130066, | |
| "learning_rate": 2.0206854997675072e-05, | |
| "loss": 0.0616, | |
| "step": 223 | |
| }, | |
| { | |
| "epoch": 1.4451612903225808, | |
| "grad_norm": 0.5034387707710266, | |
| "learning_rate": 2.0151324781845787e-05, | |
| "loss": 0.0644, | |
| "step": 224 | |
| }, | |
| { | |
| "epoch": 1.4516129032258065, | |
| "grad_norm": 0.5200063586235046, | |
| "learning_rate": 2.0095552035251007e-05, | |
| "loss": 0.0596, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 1.4580645161290322, | |
| "grad_norm": 0.4462428390979767, | |
| "learning_rate": 2.0039538525768496e-05, | |
| "loss": 0.0523, | |
| "step": 226 | |
| }, | |
| { | |
| "epoch": 1.4645161290322581, | |
| "grad_norm": 0.5513397455215454, | |
| "learning_rate": 1.9983286028907687e-05, | |
| "loss": 0.0528, | |
| "step": 227 | |
| }, | |
| { | |
| "epoch": 1.4709677419354839, | |
| "grad_norm": 0.44743800163269043, | |
| "learning_rate": 1.992679632775341e-05, | |
| "loss": 0.0649, | |
| "step": 228 | |
| }, | |
| { | |
| "epoch": 1.4774193548387098, | |
| "grad_norm": 0.4505648910999298, | |
| "learning_rate": 1.9870071212909357e-05, | |
| "loss": 0.0453, | |
| "step": 229 | |
| }, | |
| { | |
| "epoch": 1.4838709677419355, | |
| "grad_norm": 0.48718520998954773, | |
| "learning_rate": 1.9813112482441345e-05, | |
| "loss": 0.0664, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 1.4903225806451612, | |
| "grad_norm": 0.4392196834087372, | |
| "learning_rate": 1.9755921941820314e-05, | |
| "loss": 0.0504, | |
| "step": 231 | |
| }, | |
| { | |
| "epoch": 1.4967741935483871, | |
| "grad_norm": 0.5312716364860535, | |
| "learning_rate": 1.9698501403865083e-05, | |
| "loss": 0.0699, | |
| "step": 232 | |
| }, | |
| { | |
| "epoch": 1.5032258064516129, | |
| "grad_norm": 0.5387852787971497, | |
| "learning_rate": 1.9640852688684904e-05, | |
| "loss": 0.071, | |
| "step": 233 | |
| }, | |
| { | |
| "epoch": 1.5096774193548388, | |
| "grad_norm": 0.4734801650047302, | |
| "learning_rate": 1.9582977623621766e-05, | |
| "loss": 0.0561, | |
| "step": 234 | |
| }, | |
| { | |
| "epoch": 1.5161290322580645, | |
| "grad_norm": 0.4738084375858307, | |
| "learning_rate": 1.9524878043192463e-05, | |
| "loss": 0.0545, | |
| "step": 235 | |
| }, | |
| { | |
| "epoch": 1.5225806451612902, | |
| "grad_norm": 0.5166822671890259, | |
| "learning_rate": 1.9466555789030456e-05, | |
| "loss": 0.0708, | |
| "step": 236 | |
| }, | |
| { | |
| "epoch": 1.5290322580645161, | |
| "grad_norm": 0.5719185471534729, | |
| "learning_rate": 1.9408012709827485e-05, | |
| "loss": 0.073, | |
| "step": 237 | |
| }, | |
| { | |
| "epoch": 1.535483870967742, | |
| "grad_norm": 0.5363075733184814, | |
| "learning_rate": 1.934925066127498e-05, | |
| "loss": 0.0581, | |
| "step": 238 | |
| }, | |
| { | |
| "epoch": 1.5419354838709678, | |
| "grad_norm": 0.551699697971344, | |
| "learning_rate": 1.9290271506005236e-05, | |
| "loss": 0.0598, | |
| "step": 239 | |
| }, | |
| { | |
| "epoch": 1.5483870967741935, | |
| "grad_norm": 0.5568850636482239, | |
| "learning_rate": 1.9231077113532363e-05, | |
| "loss": 0.0471, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 1.5548387096774192, | |
| "grad_norm": 0.5048776268959045, | |
| "learning_rate": 1.917166936019304e-05, | |
| "loss": 0.0613, | |
| "step": 241 | |
| }, | |
| { | |
| "epoch": 1.5612903225806452, | |
| "grad_norm": 0.516986608505249, | |
| "learning_rate": 1.911205012908703e-05, | |
| "loss": 0.0678, | |
| "step": 242 | |
| }, | |
| { | |
| "epoch": 1.567741935483871, | |
| "grad_norm": 0.48142287135124207, | |
| "learning_rate": 1.90522213100175e-05, | |
| "loss": 0.0557, | |
| "step": 243 | |
| }, | |
| { | |
| "epoch": 1.5741935483870968, | |
| "grad_norm": 0.4997798800468445, | |
| "learning_rate": 1.8992184799431095e-05, | |
| "loss": 0.042, | |
| "step": 244 | |
| }, | |
| { | |
| "epoch": 1.5806451612903225, | |
| "grad_norm": 0.5074776411056519, | |
| "learning_rate": 1.893194250035786e-05, | |
| "loss": 0.073, | |
| "step": 245 | |
| }, | |
| { | |
| "epoch": 1.5870967741935482, | |
| "grad_norm": 0.5136696696281433, | |
| "learning_rate": 1.8871496322350883e-05, | |
| "loss": 0.0547, | |
| "step": 246 | |
| }, | |
| { | |
| "epoch": 1.5935483870967742, | |
| "grad_norm": 0.6183574795722961, | |
| "learning_rate": 1.881084818142579e-05, | |
| "loss": 0.0708, | |
| "step": 247 | |
| }, | |
| { | |
| "epoch": 1.6, | |
| "grad_norm": 0.5576770901679993, | |
| "learning_rate": 1.8750000000000002e-05, | |
| "loss": 0.0629, | |
| "step": 248 | |
| }, | |
| { | |
| "epoch": 1.6064516129032258, | |
| "grad_norm": 0.4211249351501465, | |
| "learning_rate": 1.868895370683179e-05, | |
| "loss": 0.0544, | |
| "step": 249 | |
| }, | |
| { | |
| "epoch": 1.6129032258064515, | |
| "grad_norm": 0.43533676862716675, | |
| "learning_rate": 1.862771123695917e-05, | |
| "loss": 0.0568, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 1.6193548387096774, | |
| "grad_norm": 0.48171842098236084, | |
| "learning_rate": 1.8566274531638516e-05, | |
| "loss": 0.0524, | |
| "step": 251 | |
| }, | |
| { | |
| "epoch": 1.6258064516129034, | |
| "grad_norm": 0.459471195936203, | |
| "learning_rate": 1.850464553828307e-05, | |
| "loss": 0.0471, | |
| "step": 252 | |
| }, | |
| { | |
| "epoch": 1.632258064516129, | |
| "grad_norm": 0.5311537384986877, | |
| "learning_rate": 1.844282621040119e-05, | |
| "loss": 0.0766, | |
| "step": 253 | |
| }, | |
| { | |
| "epoch": 1.6387096774193548, | |
| "grad_norm": 0.5022658109664917, | |
| "learning_rate": 1.838081850753445e-05, | |
| "loss": 0.0579, | |
| "step": 254 | |
| }, | |
| { | |
| "epoch": 1.6451612903225805, | |
| "grad_norm": 0.5516560077667236, | |
| "learning_rate": 1.8318624395195483e-05, | |
| "loss": 0.0616, | |
| "step": 255 | |
| }, | |
| { | |
| "epoch": 1.6516129032258065, | |
| "grad_norm": 0.4552045166492462, | |
| "learning_rate": 1.825624584480573e-05, | |
| "loss": 0.0512, | |
| "step": 256 | |
| }, | |
| { | |
| "epoch": 1.6580645161290324, | |
| "grad_norm": 0.5871717929840088, | |
| "learning_rate": 1.8193684833632925e-05, | |
| "loss": 0.0641, | |
| "step": 257 | |
| }, | |
| { | |
| "epoch": 1.664516129032258, | |
| "grad_norm": 0.46038615703582764, | |
| "learning_rate": 1.8130943344728414e-05, | |
| "loss": 0.0459, | |
| "step": 258 | |
| }, | |
| { | |
| "epoch": 1.6709677419354838, | |
| "grad_norm": 0.5282014608383179, | |
| "learning_rate": 1.8068023366864305e-05, | |
| "loss": 0.0569, | |
| "step": 259 | |
| }, | |
| { | |
| "epoch": 1.6774193548387095, | |
| "grad_norm": 0.3797317147254944, | |
| "learning_rate": 1.800492689447043e-05, | |
| "loss": 0.0459, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 1.6838709677419355, | |
| "grad_norm": 0.5863360166549683, | |
| "learning_rate": 1.7941655927571125e-05, | |
| "loss": 0.0695, | |
| "step": 261 | |
| }, | |
| { | |
| "epoch": 1.6903225806451614, | |
| "grad_norm": 0.5562090873718262, | |
| "learning_rate": 1.7878212471721824e-05, | |
| "loss": 0.0554, | |
| "step": 262 | |
| }, | |
| { | |
| "epoch": 1.696774193548387, | |
| "grad_norm": 0.5164937973022461, | |
| "learning_rate": 1.781459853794551e-05, | |
| "loss": 0.0542, | |
| "step": 263 | |
| }, | |
| { | |
| "epoch": 1.7032258064516128, | |
| "grad_norm": 0.5710752010345459, | |
| "learning_rate": 1.7750816142668937e-05, | |
| "loss": 0.0641, | |
| "step": 264 | |
| }, | |
| { | |
| "epoch": 1.7096774193548387, | |
| "grad_norm": 0.43633976578712463, | |
| "learning_rate": 1.7686867307658743e-05, | |
| "loss": 0.0498, | |
| "step": 265 | |
| }, | |
| { | |
| "epoch": 1.7161290322580647, | |
| "grad_norm": 0.5214335322380066, | |
| "learning_rate": 1.7622754059957343e-05, | |
| "loss": 0.054, | |
| "step": 266 | |
| }, | |
| { | |
| "epoch": 1.7225806451612904, | |
| "grad_norm": 0.6009476780891418, | |
| "learning_rate": 1.7558478431818702e-05, | |
| "loss": 0.0538, | |
| "step": 267 | |
| }, | |
| { | |
| "epoch": 1.729032258064516, | |
| "grad_norm": 0.5809276700019836, | |
| "learning_rate": 1.749404246064388e-05, | |
| "loss": 0.0751, | |
| "step": 268 | |
| }, | |
| { | |
| "epoch": 1.7354838709677418, | |
| "grad_norm": 0.5733875632286072, | |
| "learning_rate": 1.7429448188916483e-05, | |
| "loss": 0.0685, | |
| "step": 269 | |
| }, | |
| { | |
| "epoch": 1.7419354838709677, | |
| "grad_norm": 0.3861143887042999, | |
| "learning_rate": 1.7364697664137912e-05, | |
| "loss": 0.044, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 1.7483870967741937, | |
| "grad_norm": 0.8718386292457581, | |
| "learning_rate": 1.7299792938762443e-05, | |
| "loss": 0.0807, | |
| "step": 271 | |
| }, | |
| { | |
| "epoch": 1.7548387096774194, | |
| "grad_norm": 0.6809967160224915, | |
| "learning_rate": 1.72347360701322e-05, | |
| "loss": 0.0698, | |
| "step": 272 | |
| }, | |
| { | |
| "epoch": 1.761290322580645, | |
| "grad_norm": 0.45045140385627747, | |
| "learning_rate": 1.7169529120411922e-05, | |
| "loss": 0.0552, | |
| "step": 273 | |
| }, | |
| { | |
| "epoch": 1.7677419354838708, | |
| "grad_norm": 0.46889108419418335, | |
| "learning_rate": 1.710417415652359e-05, | |
| "loss": 0.0576, | |
| "step": 274 | |
| }, | |
| { | |
| "epoch": 1.7741935483870968, | |
| "grad_norm": 0.5053566098213196, | |
| "learning_rate": 1.7038673250080934e-05, | |
| "loss": 0.0535, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 1.7806451612903227, | |
| "grad_norm": 0.4330599904060364, | |
| "learning_rate": 1.6973028477323742e-05, | |
| "loss": 0.0518, | |
| "step": 276 | |
| }, | |
| { | |
| "epoch": 1.7870967741935484, | |
| "grad_norm": 0.4866834580898285, | |
| "learning_rate": 1.6907241919052068e-05, | |
| "loss": 0.051, | |
| "step": 277 | |
| }, | |
| { | |
| "epoch": 1.793548387096774, | |
| "grad_norm": 0.6048698425292969, | |
| "learning_rate": 1.6841315660560252e-05, | |
| "loss": 0.0683, | |
| "step": 278 | |
| }, | |
| { | |
| "epoch": 1.8, | |
| "grad_norm": 0.39490604400634766, | |
| "learning_rate": 1.677525179157086e-05, | |
| "loss": 0.0515, | |
| "step": 279 | |
| }, | |
| { | |
| "epoch": 1.8064516129032258, | |
| "grad_norm": 0.4544709324836731, | |
| "learning_rate": 1.6709052406168393e-05, | |
| "loss": 0.0624, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 1.8129032258064517, | |
| "grad_norm": 0.5158767700195312, | |
| "learning_rate": 1.664271960273295e-05, | |
| "loss": 0.0575, | |
| "step": 281 | |
| }, | |
| { | |
| "epoch": 1.8193548387096774, | |
| "grad_norm": 0.5172263979911804, | |
| "learning_rate": 1.6576255483873686e-05, | |
| "loss": 0.0578, | |
| "step": 282 | |
| }, | |
| { | |
| "epoch": 1.8258064516129031, | |
| "grad_norm": 0.4233238995075226, | |
| "learning_rate": 1.6509662156362196e-05, | |
| "loss": 0.0547, | |
| "step": 283 | |
| }, | |
| { | |
| "epoch": 1.832258064516129, | |
| "grad_norm": 0.45361143350601196, | |
| "learning_rate": 1.6442941731065697e-05, | |
| "loss": 0.0512, | |
| "step": 284 | |
| }, | |
| { | |
| "epoch": 1.838709677419355, | |
| "grad_norm": 0.5802233219146729, | |
| "learning_rate": 1.637609632288014e-05, | |
| "loss": 0.0596, | |
| "step": 285 | |
| }, | |
| { | |
| "epoch": 1.8451612903225807, | |
| "grad_norm": 0.5369323492050171, | |
| "learning_rate": 1.630912805066317e-05, | |
| "loss": 0.0646, | |
| "step": 286 | |
| }, | |
| { | |
| "epoch": 1.8516129032258064, | |
| "grad_norm": 0.45122525095939636, | |
| "learning_rate": 1.6242039037166977e-05, | |
| "loss": 0.0517, | |
| "step": 287 | |
| }, | |
| { | |
| "epoch": 1.8580645161290321, | |
| "grad_norm": 0.39205196499824524, | |
| "learning_rate": 1.6174831408970964e-05, | |
| "loss": 0.0491, | |
| "step": 288 | |
| }, | |
| { | |
| "epoch": 1.864516129032258, | |
| "grad_norm": 0.4472959637641907, | |
| "learning_rate": 1.6107507296414383e-05, | |
| "loss": 0.049, | |
| "step": 289 | |
| }, | |
| { | |
| "epoch": 1.870967741935484, | |
| "grad_norm": 0.41624531149864197, | |
| "learning_rate": 1.6040068833528797e-05, | |
| "loss": 0.0483, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 1.8774193548387097, | |
| "grad_norm": 0.43875551223754883, | |
| "learning_rate": 1.597251815797044e-05, | |
| "loss": 0.0487, | |
| "step": 291 | |
| }, | |
| { | |
| "epoch": 1.8838709677419354, | |
| "grad_norm": 0.4978736937046051, | |
| "learning_rate": 1.5904857410952417e-05, | |
| "loss": 0.0573, | |
| "step": 292 | |
| }, | |
| { | |
| "epoch": 1.8903225806451613, | |
| "grad_norm": 0.5798497796058655, | |
| "learning_rate": 1.5837088737176896e-05, | |
| "loss": 0.0683, | |
| "step": 293 | |
| }, | |
| { | |
| "epoch": 1.896774193548387, | |
| "grad_norm": 0.7377052903175354, | |
| "learning_rate": 1.5769214284767086e-05, | |
| "loss": 0.0583, | |
| "step": 294 | |
| }, | |
| { | |
| "epoch": 1.903225806451613, | |
| "grad_norm": 0.4153827428817749, | |
| "learning_rate": 1.570123620519915e-05, | |
| "loss": 0.0543, | |
| "step": 295 | |
| }, | |
| { | |
| "epoch": 1.9096774193548387, | |
| "grad_norm": 0.4852810800075531, | |
| "learning_rate": 1.563315665323401e-05, | |
| "loss": 0.0636, | |
| "step": 296 | |
| }, | |
| { | |
| "epoch": 1.9161290322580644, | |
| "grad_norm": 0.5545767545700073, | |
| "learning_rate": 1.5564977786849055e-05, | |
| "loss": 0.062, | |
| "step": 297 | |
| }, | |
| { | |
| "epoch": 1.9225806451612903, | |
| "grad_norm": 0.4363822937011719, | |
| "learning_rate": 1.549670176716973e-05, | |
| "loss": 0.0516, | |
| "step": 298 | |
| }, | |
| { | |
| "epoch": 1.9290322580645163, | |
| "grad_norm": 0.5309383273124695, | |
| "learning_rate": 1.5428330758401027e-05, | |
| "loss": 0.0647, | |
| "step": 299 | |
| }, | |
| { | |
| "epoch": 1.935483870967742, | |
| "grad_norm": 0.6617056131362915, | |
| "learning_rate": 1.53598669277589e-05, | |
| "loss": 0.0641, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 1.9419354838709677, | |
| "grad_norm": 0.49968254566192627, | |
| "learning_rate": 1.529131244540155e-05, | |
| "loss": 0.0585, | |
| "step": 301 | |
| }, | |
| { | |
| "epoch": 1.9483870967741934, | |
| "grad_norm": 0.40158751606941223, | |
| "learning_rate": 1.5222669484360644e-05, | |
| "loss": 0.0537, | |
| "step": 302 | |
| }, | |
| { | |
| "epoch": 1.9548387096774194, | |
| "grad_norm": 0.4537198543548584, | |
| "learning_rate": 1.5153940220472451e-05, | |
| "loss": 0.0511, | |
| "step": 303 | |
| }, | |
| { | |
| "epoch": 1.9612903225806453, | |
| "grad_norm": 0.47163766622543335, | |
| "learning_rate": 1.5085126832308843e-05, | |
| "loss": 0.0532, | |
| "step": 304 | |
| }, | |
| { | |
| "epoch": 1.967741935483871, | |
| "grad_norm": 0.4738634526729584, | |
| "learning_rate": 1.5016231501108253e-05, | |
| "loss": 0.0615, | |
| "step": 305 | |
| }, | |
| { | |
| "epoch": 1.9741935483870967, | |
| "grad_norm": 0.38560569286346436, | |
| "learning_rate": 1.494725641070654e-05, | |
| "loss": 0.0522, | |
| "step": 306 | |
| }, | |
| { | |
| "epoch": 1.9806451612903224, | |
| "grad_norm": 0.5569445490837097, | |
| "learning_rate": 1.4878203747467764e-05, | |
| "loss": 0.0731, | |
| "step": 307 | |
| }, | |
| { | |
| "epoch": 1.9870967741935484, | |
| "grad_norm": 0.38958773016929626, | |
| "learning_rate": 1.480907570021487e-05, | |
| "loss": 0.0461, | |
| "step": 308 | |
| }, | |
| { | |
| "epoch": 1.9935483870967743, | |
| "grad_norm": 0.4473820924758911, | |
| "learning_rate": 1.4739874460160316e-05, | |
| "loss": 0.0555, | |
| "step": 309 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "grad_norm": 0.39891934394836426, | |
| "learning_rate": 1.4670602220836633e-05, | |
| "loss": 0.051, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 2.0064516129032257, | |
| "grad_norm": 0.474127858877182, | |
| "learning_rate": 1.4601261178026854e-05, | |
| "loss": 0.0401, | |
| "step": 311 | |
| }, | |
| { | |
| "epoch": 2.0129032258064514, | |
| "grad_norm": 0.3391839563846588, | |
| "learning_rate": 1.4531853529694956e-05, | |
| "loss": 0.0333, | |
| "step": 312 | |
| }, | |
| { | |
| "epoch": 2.0193548387096776, | |
| "grad_norm": 0.3230273723602295, | |
| "learning_rate": 1.446238147591616e-05, | |
| "loss": 0.0282, | |
| "step": 313 | |
| }, | |
| { | |
| "epoch": 2.0258064516129033, | |
| "grad_norm": 0.3246399462223053, | |
| "learning_rate": 1.439284721880721e-05, | |
| "loss": 0.0345, | |
| "step": 314 | |
| }, | |
| { | |
| "epoch": 2.032258064516129, | |
| "grad_norm": 0.41817039251327515, | |
| "learning_rate": 1.4323252962456554e-05, | |
| "loss": 0.0288, | |
| "step": 315 | |
| }, | |
| { | |
| "epoch": 2.0387096774193547, | |
| "grad_norm": 0.48674166202545166, | |
| "learning_rate": 1.4253600912854497e-05, | |
| "loss": 0.0354, | |
| "step": 316 | |
| }, | |
| { | |
| "epoch": 2.0451612903225804, | |
| "grad_norm": 0.42214757204055786, | |
| "learning_rate": 1.4183893277823265e-05, | |
| "loss": 0.0388, | |
| "step": 317 | |
| }, | |
| { | |
| "epoch": 2.0516129032258066, | |
| "grad_norm": 0.5475701093673706, | |
| "learning_rate": 1.411413226694702e-05, | |
| "loss": 0.0294, | |
| "step": 318 | |
| }, | |
| { | |
| "epoch": 2.0580645161290323, | |
| "grad_norm": 0.5432962775230408, | |
| "learning_rate": 1.4044320091501834e-05, | |
| "loss": 0.0372, | |
| "step": 319 | |
| }, | |
| { | |
| "epoch": 2.064516129032258, | |
| "grad_norm": 0.49539855122566223, | |
| "learning_rate": 1.3974458964385579e-05, | |
| "loss": 0.0425, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 2.0709677419354837, | |
| "grad_norm": 0.340425044298172, | |
| "learning_rate": 1.3904551100047791e-05, | |
| "loss": 0.026, | |
| "step": 321 | |
| }, | |
| { | |
| "epoch": 2.07741935483871, | |
| "grad_norm": 0.4815217852592468, | |
| "learning_rate": 1.3834598714419486e-05, | |
| "loss": 0.0352, | |
| "step": 322 | |
| }, | |
| { | |
| "epoch": 2.0838709677419356, | |
| "grad_norm": 0.4457317888736725, | |
| "learning_rate": 1.3764604024842903e-05, | |
| "loss": 0.028, | |
| "step": 323 | |
| }, | |
| { | |
| "epoch": 2.0903225806451613, | |
| "grad_norm": 0.45776546001434326, | |
| "learning_rate": 1.369456925000123e-05, | |
| "loss": 0.0287, | |
| "step": 324 | |
| }, | |
| { | |
| "epoch": 2.096774193548387, | |
| "grad_norm": 0.3825792968273163, | |
| "learning_rate": 1.362449660984826e-05, | |
| "loss": 0.0257, | |
| "step": 325 | |
| }, | |
| { | |
| "epoch": 2.1032258064516127, | |
| "grad_norm": 0.44209763407707214, | |
| "learning_rate": 1.3554388325538059e-05, | |
| "loss": 0.0274, | |
| "step": 326 | |
| }, | |
| { | |
| "epoch": 2.109677419354839, | |
| "grad_norm": 0.37732046842575073, | |
| "learning_rate": 1.3484246619354524e-05, | |
| "loss": 0.0263, | |
| "step": 327 | |
| }, | |
| { | |
| "epoch": 2.1161290322580646, | |
| "grad_norm": 0.4975365698337555, | |
| "learning_rate": 1.3414073714640951e-05, | |
| "loss": 0.0294, | |
| "step": 328 | |
| }, | |
| { | |
| "epoch": 2.1225806451612903, | |
| "grad_norm": 0.37548768520355225, | |
| "learning_rate": 1.3343871835729565e-05, | |
| "loss": 0.0261, | |
| "step": 329 | |
| }, | |
| { | |
| "epoch": 2.129032258064516, | |
| "grad_norm": 0.722154438495636, | |
| "learning_rate": 1.3273643207871025e-05, | |
| "loss": 0.0296, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 2.135483870967742, | |
| "grad_norm": 0.513611912727356, | |
| "learning_rate": 1.3203390057163855e-05, | |
| "loss": 0.0326, | |
| "step": 331 | |
| }, | |
| { | |
| "epoch": 2.141935483870968, | |
| "grad_norm": 0.43579375743865967, | |
| "learning_rate": 1.3133114610483909e-05, | |
| "loss": 0.035, | |
| "step": 332 | |
| }, | |
| { | |
| "epoch": 2.1483870967741936, | |
| "grad_norm": 0.4927336275577545, | |
| "learning_rate": 1.3062819095413786e-05, | |
| "loss": 0.0358, | |
| "step": 333 | |
| }, | |
| { | |
| "epoch": 2.1548387096774193, | |
| "grad_norm": 0.43542489409446716, | |
| "learning_rate": 1.2992505740172196e-05, | |
| "loss": 0.035, | |
| "step": 334 | |
| }, | |
| { | |
| "epoch": 2.161290322580645, | |
| "grad_norm": 0.34009236097335815, | |
| "learning_rate": 1.2922176773543355e-05, | |
| "loss": 0.0264, | |
| "step": 335 | |
| }, | |
| { | |
| "epoch": 2.167741935483871, | |
| "grad_norm": 0.4710192084312439, | |
| "learning_rate": 1.2851834424806314e-05, | |
| "loss": 0.0403, | |
| "step": 336 | |
| }, | |
| { | |
| "epoch": 2.174193548387097, | |
| "grad_norm": 0.8653304576873779, | |
| "learning_rate": 1.2781480923664326e-05, | |
| "loss": 0.0839, | |
| "step": 337 | |
| }, | |
| { | |
| "epoch": 2.1806451612903226, | |
| "grad_norm": 0.7528795599937439, | |
| "learning_rate": 1.2711118500174138e-05, | |
| "loss": 0.0488, | |
| "step": 338 | |
| }, | |
| { | |
| "epoch": 2.1870967741935483, | |
| "grad_norm": 0.5551451444625854, | |
| "learning_rate": 1.2640749384675324e-05, | |
| "loss": 0.0223, | |
| "step": 339 | |
| }, | |
| { | |
| "epoch": 2.193548387096774, | |
| "grad_norm": 0.42200708389282227, | |
| "learning_rate": 1.2570375807719576e-05, | |
| "loss": 0.0305, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 2.2, | |
| "grad_norm": 0.5258976817131042, | |
| "learning_rate": 1.25e-05, | |
| "loss": 0.0455, | |
| "step": 341 | |
| }, | |
| { | |
| "epoch": 2.206451612903226, | |
| "grad_norm": 0.495807945728302, | |
| "learning_rate": 1.242962419228043e-05, | |
| "loss": 0.0323, | |
| "step": 342 | |
| }, | |
| { | |
| "epoch": 2.2129032258064516, | |
| "grad_norm": 0.5464356541633606, | |
| "learning_rate": 1.2359250615324678e-05, | |
| "loss": 0.0325, | |
| "step": 343 | |
| }, | |
| { | |
| "epoch": 2.2193548387096773, | |
| "grad_norm": 0.5555934906005859, | |
| "learning_rate": 1.2288881499825863e-05, | |
| "loss": 0.0504, | |
| "step": 344 | |
| }, | |
| { | |
| "epoch": 2.225806451612903, | |
| "grad_norm": 0.41927701234817505, | |
| "learning_rate": 1.2218519076335677e-05, | |
| "loss": 0.0288, | |
| "step": 345 | |
| }, | |
| { | |
| "epoch": 2.232258064516129, | |
| "grad_norm": 0.5449569821357727, | |
| "learning_rate": 1.2148165575193685e-05, | |
| "loss": 0.0328, | |
| "step": 346 | |
| }, | |
| { | |
| "epoch": 2.238709677419355, | |
| "grad_norm": 0.4198172688484192, | |
| "learning_rate": 1.2077823226456648e-05, | |
| "loss": 0.0284, | |
| "step": 347 | |
| }, | |
| { | |
| "epoch": 2.2451612903225806, | |
| "grad_norm": 0.5396814346313477, | |
| "learning_rate": 1.2007494259827809e-05, | |
| "loss": 0.0379, | |
| "step": 348 | |
| }, | |
| { | |
| "epoch": 2.2516129032258063, | |
| "grad_norm": 0.4842919409275055, | |
| "learning_rate": 1.1937180904586215e-05, | |
| "loss": 0.0316, | |
| "step": 349 | |
| }, | |
| { | |
| "epoch": 2.258064516129032, | |
| "grad_norm": 0.5152572989463806, | |
| "learning_rate": 1.1866885389516092e-05, | |
| "loss": 0.0321, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 2.264516129032258, | |
| "grad_norm": 0.556614875793457, | |
| "learning_rate": 1.179660994283615e-05, | |
| "loss": 0.0372, | |
| "step": 351 | |
| }, | |
| { | |
| "epoch": 2.270967741935484, | |
| "grad_norm": 0.5159235000610352, | |
| "learning_rate": 1.1726356792128978e-05, | |
| "loss": 0.0328, | |
| "step": 352 | |
| }, | |
| { | |
| "epoch": 2.2774193548387096, | |
| "grad_norm": 0.5564429759979248, | |
| "learning_rate": 1.1656128164270436e-05, | |
| "loss": 0.0304, | |
| "step": 353 | |
| }, | |
| { | |
| "epoch": 2.2838709677419353, | |
| "grad_norm": 0.6227903366088867, | |
| "learning_rate": 1.1585926285359049e-05, | |
| "loss": 0.0321, | |
| "step": 354 | |
| }, | |
| { | |
| "epoch": 2.2903225806451615, | |
| "grad_norm": 0.5218878388404846, | |
| "learning_rate": 1.1515753380645479e-05, | |
| "loss": 0.0358, | |
| "step": 355 | |
| }, | |
| { | |
| "epoch": 2.296774193548387, | |
| "grad_norm": 0.49731266498565674, | |
| "learning_rate": 1.1445611674461942e-05, | |
| "loss": 0.0331, | |
| "step": 356 | |
| }, | |
| { | |
| "epoch": 2.303225806451613, | |
| "grad_norm": 0.5095941424369812, | |
| "learning_rate": 1.1375503390151737e-05, | |
| "loss": 0.0315, | |
| "step": 357 | |
| }, | |
| { | |
| "epoch": 2.3096774193548386, | |
| "grad_norm": 0.4576358199119568, | |
| "learning_rate": 1.1305430749998775e-05, | |
| "loss": 0.0304, | |
| "step": 358 | |
| }, | |
| { | |
| "epoch": 2.3161290322580643, | |
| "grad_norm": 0.5103798508644104, | |
| "learning_rate": 1.12353959751571e-05, | |
| "loss": 0.0281, | |
| "step": 359 | |
| }, | |
| { | |
| "epoch": 2.3225806451612905, | |
| "grad_norm": 0.5072308778762817, | |
| "learning_rate": 1.1165401285580515e-05, | |
| "loss": 0.0267, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 2.329032258064516, | |
| "grad_norm": 0.45558691024780273, | |
| "learning_rate": 1.1095448899952212e-05, | |
| "loss": 0.0302, | |
| "step": 361 | |
| }, | |
| { | |
| "epoch": 2.335483870967742, | |
| "grad_norm": 0.4773171842098236, | |
| "learning_rate": 1.1025541035614427e-05, | |
| "loss": 0.0307, | |
| "step": 362 | |
| }, | |
| { | |
| "epoch": 2.3419354838709676, | |
| "grad_norm": 0.4630301892757416, | |
| "learning_rate": 1.0955679908498171e-05, | |
| "loss": 0.0292, | |
| "step": 363 | |
| }, | |
| { | |
| "epoch": 2.3483870967741938, | |
| "grad_norm": 0.5814460515975952, | |
| "learning_rate": 1.0885867733052985e-05, | |
| "loss": 0.034, | |
| "step": 364 | |
| }, | |
| { | |
| "epoch": 2.3548387096774195, | |
| "grad_norm": 0.3135308623313904, | |
| "learning_rate": 1.0816106722176741e-05, | |
| "loss": 0.0264, | |
| "step": 365 | |
| }, | |
| { | |
| "epoch": 2.361290322580645, | |
| "grad_norm": 0.4219888150691986, | |
| "learning_rate": 1.0746399087145504e-05, | |
| "loss": 0.0304, | |
| "step": 366 | |
| }, | |
| { | |
| "epoch": 2.367741935483871, | |
| "grad_norm": 0.4246158003807068, | |
| "learning_rate": 1.0676747037543447e-05, | |
| "loss": 0.032, | |
| "step": 367 | |
| }, | |
| { | |
| "epoch": 2.3741935483870966, | |
| "grad_norm": 0.4565359950065613, | |
| "learning_rate": 1.0607152781192796e-05, | |
| "loss": 0.0326, | |
| "step": 368 | |
| }, | |
| { | |
| "epoch": 2.3806451612903228, | |
| "grad_norm": 0.4495943486690521, | |
| "learning_rate": 1.053761852408384e-05, | |
| "loss": 0.0307, | |
| "step": 369 | |
| }, | |
| { | |
| "epoch": 2.3870967741935485, | |
| "grad_norm": 0.47505924105644226, | |
| "learning_rate": 1.0468146470305047e-05, | |
| "loss": 0.0366, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 2.393548387096774, | |
| "grad_norm": 0.41802337765693665, | |
| "learning_rate": 1.039873882197315e-05, | |
| "loss": 0.0242, | |
| "step": 371 | |
| }, | |
| { | |
| "epoch": 2.4, | |
| "grad_norm": 0.4308302104473114, | |
| "learning_rate": 1.0329397779163372e-05, | |
| "loss": 0.0303, | |
| "step": 372 | |
| }, | |
| { | |
| "epoch": 2.4064516129032256, | |
| "grad_norm": 0.3776704967021942, | |
| "learning_rate": 1.0260125539839686e-05, | |
| "loss": 0.0224, | |
| "step": 373 | |
| }, | |
| { | |
| "epoch": 2.412903225806452, | |
| "grad_norm": 0.3952430188655853, | |
| "learning_rate": 1.0190924299785138e-05, | |
| "loss": 0.0236, | |
| "step": 374 | |
| }, | |
| { | |
| "epoch": 2.4193548387096775, | |
| "grad_norm": 0.5212628841400146, | |
| "learning_rate": 1.0121796252532237e-05, | |
| "loss": 0.0352, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 2.425806451612903, | |
| "grad_norm": 0.5264010429382324, | |
| "learning_rate": 1.0052743589293463e-05, | |
| "loss": 0.0366, | |
| "step": 376 | |
| }, | |
| { | |
| "epoch": 2.432258064516129, | |
| "grad_norm": 0.42148974537849426, | |
| "learning_rate": 9.983768498891747e-06, | |
| "loss": 0.0281, | |
| "step": 377 | |
| }, | |
| { | |
| "epoch": 2.4387096774193546, | |
| "grad_norm": 0.4387865960597992, | |
| "learning_rate": 9.91487316769116e-06, | |
| "loss": 0.0321, | |
| "step": 378 | |
| }, | |
| { | |
| "epoch": 2.445161290322581, | |
| "grad_norm": 0.4530801475048065, | |
| "learning_rate": 9.846059779527552e-06, | |
| "loss": 0.03, | |
| "step": 379 | |
| }, | |
| { | |
| "epoch": 2.4516129032258065, | |
| "grad_norm": 0.44786474108695984, | |
| "learning_rate": 9.777330515639356e-06, | |
| "loss": 0.0312, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 2.458064516129032, | |
| "grad_norm": 0.42808324098587036, | |
| "learning_rate": 9.708687554598454e-06, | |
| "loss": 0.0321, | |
| "step": 381 | |
| }, | |
| { | |
| "epoch": 2.464516129032258, | |
| "grad_norm": 0.4658293128013611, | |
| "learning_rate": 9.640133072241105e-06, | |
| "loss": 0.0335, | |
| "step": 382 | |
| }, | |
| { | |
| "epoch": 2.4709677419354836, | |
| "grad_norm": 0.45854416489601135, | |
| "learning_rate": 9.571669241598974e-06, | |
| "loss": 0.0306, | |
| "step": 383 | |
| }, | |
| { | |
| "epoch": 2.47741935483871, | |
| "grad_norm": 0.5602400302886963, | |
| "learning_rate": 9.503298232830274e-06, | |
| "loss": 0.0425, | |
| "step": 384 | |
| }, | |
| { | |
| "epoch": 2.4838709677419355, | |
| "grad_norm": 0.4135296940803528, | |
| "learning_rate": 9.43502221315095e-06, | |
| "loss": 0.0317, | |
| "step": 385 | |
| }, | |
| { | |
| "epoch": 2.490322580645161, | |
| "grad_norm": 0.6756112575531006, | |
| "learning_rate": 9.366843346765992e-06, | |
| "loss": 0.0492, | |
| "step": 386 | |
| }, | |
| { | |
| "epoch": 2.496774193548387, | |
| "grad_norm": 0.6048617362976074, | |
| "learning_rate": 9.298763794800856e-06, | |
| "loss": 0.0326, | |
| "step": 387 | |
| }, | |
| { | |
| "epoch": 2.5032258064516126, | |
| "grad_norm": 0.3737858831882477, | |
| "learning_rate": 9.230785715232917e-06, | |
| "loss": 0.0226, | |
| "step": 388 | |
| }, | |
| { | |
| "epoch": 2.509677419354839, | |
| "grad_norm": 0.49958306550979614, | |
| "learning_rate": 9.162911262823104e-06, | |
| "loss": 0.0293, | |
| "step": 389 | |
| }, | |
| { | |
| "epoch": 2.5161290322580645, | |
| "grad_norm": 0.4132345914840698, | |
| "learning_rate": 9.095142589047586e-06, | |
| "loss": 0.0268, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 2.52258064516129, | |
| "grad_norm": 0.5339500308036804, | |
| "learning_rate": 9.027481842029567e-06, | |
| "loss": 0.0308, | |
| "step": 391 | |
| }, | |
| { | |
| "epoch": 2.5290322580645164, | |
| "grad_norm": 0.5680338740348816, | |
| "learning_rate": 8.9599311664712e-06, | |
| "loss": 0.026, | |
| "step": 392 | |
| }, | |
| { | |
| "epoch": 2.535483870967742, | |
| "grad_norm": 0.4945621192455292, | |
| "learning_rate": 8.89249270358562e-06, | |
| "loss": 0.0414, | |
| "step": 393 | |
| }, | |
| { | |
| "epoch": 2.541935483870968, | |
| "grad_norm": 0.478188157081604, | |
| "learning_rate": 8.825168591029042e-06, | |
| "loss": 0.0325, | |
| "step": 394 | |
| }, | |
| { | |
| "epoch": 2.5483870967741935, | |
| "grad_norm": 0.41539856791496277, | |
| "learning_rate": 8.757960962833026e-06, | |
| "loss": 0.0276, | |
| "step": 395 | |
| }, | |
| { | |
| "epoch": 2.554838709677419, | |
| "grad_norm": 0.41548025608062744, | |
| "learning_rate": 8.69087194933683e-06, | |
| "loss": 0.0258, | |
| "step": 396 | |
| }, | |
| { | |
| "epoch": 2.5612903225806454, | |
| "grad_norm": 0.7209835052490234, | |
| "learning_rate": 8.623903677119866e-06, | |
| "loss": 0.0275, | |
| "step": 397 | |
| }, | |
| { | |
| "epoch": 2.567741935483871, | |
| "grad_norm": 0.45113834738731384, | |
| "learning_rate": 8.557058268934306e-06, | |
| "loss": 0.0276, | |
| "step": 398 | |
| }, | |
| { | |
| "epoch": 2.574193548387097, | |
| "grad_norm": 0.4919924736022949, | |
| "learning_rate": 8.490337843637807e-06, | |
| "loss": 0.0352, | |
| "step": 399 | |
| }, | |
| { | |
| "epoch": 2.5806451612903225, | |
| "grad_norm": 0.4441167414188385, | |
| "learning_rate": 8.423744516126313e-06, | |
| "loss": 0.0312, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 2.587096774193548, | |
| "grad_norm": 0.3870048522949219, | |
| "learning_rate": 8.357280397267054e-06, | |
| "loss": 0.0273, | |
| "step": 401 | |
| }, | |
| { | |
| "epoch": 2.5935483870967744, | |
| "grad_norm": 0.4747593104839325, | |
| "learning_rate": 8.29094759383161e-06, | |
| "loss": 0.0428, | |
| "step": 402 | |
| }, | |
| { | |
| "epoch": 2.6, | |
| "grad_norm": 0.3494237959384918, | |
| "learning_rate": 8.224748208429142e-06, | |
| "loss": 0.0249, | |
| "step": 403 | |
| }, | |
| { | |
| "epoch": 2.606451612903226, | |
| "grad_norm": 0.3618505001068115, | |
| "learning_rate": 8.158684339439748e-06, | |
| "loss": 0.0221, | |
| "step": 404 | |
| }, | |
| { | |
| "epoch": 2.6129032258064515, | |
| "grad_norm": 0.45744070410728455, | |
| "learning_rate": 8.092758080947939e-06, | |
| "loss": 0.0303, | |
| "step": 405 | |
| }, | |
| { | |
| "epoch": 2.6193548387096772, | |
| "grad_norm": 0.3921363055706024, | |
| "learning_rate": 8.02697152267626e-06, | |
| "loss": 0.0267, | |
| "step": 406 | |
| }, | |
| { | |
| "epoch": 2.6258064516129034, | |
| "grad_norm": 0.5149343013763428, | |
| "learning_rate": 7.961326749919069e-06, | |
| "loss": 0.0347, | |
| "step": 407 | |
| }, | |
| { | |
| "epoch": 2.632258064516129, | |
| "grad_norm": 0.5246243476867676, | |
| "learning_rate": 7.895825843476412e-06, | |
| "loss": 0.0318, | |
| "step": 408 | |
| }, | |
| { | |
| "epoch": 2.638709677419355, | |
| "grad_norm": 0.5338672995567322, | |
| "learning_rate": 7.83047087958808e-06, | |
| "loss": 0.0331, | |
| "step": 409 | |
| }, | |
| { | |
| "epoch": 2.6451612903225805, | |
| "grad_norm": 0.4028920531272888, | |
| "learning_rate": 7.7652639298678e-06, | |
| "loss": 0.0251, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 2.6516129032258062, | |
| "grad_norm": 0.3391985297203064, | |
| "learning_rate": 7.70020706123756e-06, | |
| "loss": 0.0206, | |
| "step": 411 | |
| }, | |
| { | |
| "epoch": 2.6580645161290324, | |
| "grad_norm": 0.4651046097278595, | |
| "learning_rate": 7.635302335862094e-06, | |
| "loss": 0.0242, | |
| "step": 412 | |
| }, | |
| { | |
| "epoch": 2.664516129032258, | |
| "grad_norm": 0.4581477642059326, | |
| "learning_rate": 7.570551811083521e-06, | |
| "loss": 0.0334, | |
| "step": 413 | |
| }, | |
| { | |
| "epoch": 2.670967741935484, | |
| "grad_norm": 0.629748523235321, | |
| "learning_rate": 7.505957539356126e-06, | |
| "loss": 0.0426, | |
| "step": 414 | |
| }, | |
| { | |
| "epoch": 2.6774193548387095, | |
| "grad_norm": 0.44972798228263855, | |
| "learning_rate": 7.441521568181299e-06, | |
| "loss": 0.0279, | |
| "step": 415 | |
| }, | |
| { | |
| "epoch": 2.6838709677419352, | |
| "grad_norm": 0.49497148394584656, | |
| "learning_rate": 7.37724594004266e-06, | |
| "loss": 0.0331, | |
| "step": 416 | |
| }, | |
| { | |
| "epoch": 2.6903225806451614, | |
| "grad_norm": 0.4186260998249054, | |
| "learning_rate": 7.313132692341263e-06, | |
| "loss": 0.0294, | |
| "step": 417 | |
| }, | |
| { | |
| "epoch": 2.696774193548387, | |
| "grad_norm": 0.4715961813926697, | |
| "learning_rate": 7.249183857331064e-06, | |
| "loss": 0.0293, | |
| "step": 418 | |
| }, | |
| { | |
| "epoch": 2.703225806451613, | |
| "grad_norm": 0.48064178228378296, | |
| "learning_rate": 7.185401462054495e-06, | |
| "loss": 0.0312, | |
| "step": 419 | |
| }, | |
| { | |
| "epoch": 2.709677419354839, | |
| "grad_norm": 0.4826470613479614, | |
| "learning_rate": 7.121787528278177e-06, | |
| "loss": 0.0303, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 2.7161290322580647, | |
| "grad_norm": 0.39333951473236084, | |
| "learning_rate": 7.058344072428877e-06, | |
| "loss": 0.0211, | |
| "step": 421 | |
| }, | |
| { | |
| "epoch": 2.7225806451612904, | |
| "grad_norm": 0.3964556157588959, | |
| "learning_rate": 6.99507310552957e-06, | |
| "loss": 0.031, | |
| "step": 422 | |
| }, | |
| { | |
| "epoch": 2.729032258064516, | |
| "grad_norm": 0.5450259447097778, | |
| "learning_rate": 6.931976633135695e-06, | |
| "loss": 0.0344, | |
| "step": 423 | |
| }, | |
| { | |
| "epoch": 2.735483870967742, | |
| "grad_norm": 0.4331640601158142, | |
| "learning_rate": 6.869056655271588e-06, | |
| "loss": 0.0261, | |
| "step": 424 | |
| }, | |
| { | |
| "epoch": 2.741935483870968, | |
| "grad_norm": 0.46446603536605835, | |
| "learning_rate": 6.806315166367075e-06, | |
| "loss": 0.0311, | |
| "step": 425 | |
| }, | |
| { | |
| "epoch": 2.7483870967741937, | |
| "grad_norm": 0.5200790166854858, | |
| "learning_rate": 6.743754155194268e-06, | |
| "loss": 0.0292, | |
| "step": 426 | |
| }, | |
| { | |
| "epoch": 2.7548387096774194, | |
| "grad_norm": 0.6154363751411438, | |
| "learning_rate": 6.681375604804521e-06, | |
| "loss": 0.0252, | |
| "step": 427 | |
| }, | |
| { | |
| "epoch": 2.761290322580645, | |
| "grad_norm": 0.43054288625717163, | |
| "learning_rate": 6.619181492465557e-06, | |
| "loss": 0.0225, | |
| "step": 428 | |
| }, | |
| { | |
| "epoch": 2.767741935483871, | |
| "grad_norm": 0.5042747259140015, | |
| "learning_rate": 6.55717378959881e-06, | |
| "loss": 0.0266, | |
| "step": 429 | |
| }, | |
| { | |
| "epoch": 2.774193548387097, | |
| "grad_norm": 0.5118414759635925, | |
| "learning_rate": 6.4953544617169376e-06, | |
| "loss": 0.0324, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 2.7806451612903227, | |
| "grad_norm": 0.4071415364742279, | |
| "learning_rate": 6.43372546836149e-06, | |
| "loss": 0.0306, | |
| "step": 431 | |
| }, | |
| { | |
| "epoch": 2.7870967741935484, | |
| "grad_norm": 0.43498843908309937, | |
| "learning_rate": 6.372288763040833e-06, | |
| "loss": 0.0267, | |
| "step": 432 | |
| }, | |
| { | |
| "epoch": 2.793548387096774, | |
| "grad_norm": 0.6407294273376465, | |
| "learning_rate": 6.3110462931682075e-06, | |
| "loss": 0.0529, | |
| "step": 433 | |
| }, | |
| { | |
| "epoch": 2.8, | |
| "grad_norm": 0.4058496356010437, | |
| "learning_rate": 6.250000000000003e-06, | |
| "loss": 0.0289, | |
| "step": 434 | |
| }, | |
| { | |
| "epoch": 2.806451612903226, | |
| "grad_norm": 0.34818795323371887, | |
| "learning_rate": 6.1891518185742116e-06, | |
| "loss": 0.0236, | |
| "step": 435 | |
| }, | |
| { | |
| "epoch": 2.8129032258064517, | |
| "grad_norm": 0.4517665505409241, | |
| "learning_rate": 6.1285036776491165e-06, | |
| "loss": 0.0341, | |
| "step": 436 | |
| }, | |
| { | |
| "epoch": 2.8193548387096774, | |
| "grad_norm": 0.5423181056976318, | |
| "learning_rate": 6.068057499642144e-06, | |
| "loss": 0.0406, | |
| "step": 437 | |
| }, | |
| { | |
| "epoch": 2.825806451612903, | |
| "grad_norm": 0.4574117362499237, | |
| "learning_rate": 6.007815200568906e-06, | |
| "loss": 0.0344, | |
| "step": 438 | |
| }, | |
| { | |
| "epoch": 2.832258064516129, | |
| "grad_norm": 0.4028095006942749, | |
| "learning_rate": 5.9477786899825024e-06, | |
| "loss": 0.026, | |
| "step": 439 | |
| }, | |
| { | |
| "epoch": 2.838709677419355, | |
| "grad_norm": 0.4277281165122986, | |
| "learning_rate": 5.8879498709129735e-06, | |
| "loss": 0.032, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 2.8451612903225807, | |
| "grad_norm": 0.4217607080936432, | |
| "learning_rate": 5.82833063980696e-06, | |
| "loss": 0.0275, | |
| "step": 441 | |
| }, | |
| { | |
| "epoch": 2.8516129032258064, | |
| "grad_norm": 0.4865557849407196, | |
| "learning_rate": 5.7689228864676394e-06, | |
| "loss": 0.0344, | |
| "step": 442 | |
| }, | |
| { | |
| "epoch": 2.858064516129032, | |
| "grad_norm": 0.44111689925193787, | |
| "learning_rate": 5.70972849399477e-06, | |
| "loss": 0.028, | |
| "step": 443 | |
| }, | |
| { | |
| "epoch": 2.864516129032258, | |
| "grad_norm": 0.5612359046936035, | |
| "learning_rate": 5.650749338725019e-06, | |
| "loss": 0.0433, | |
| "step": 444 | |
| }, | |
| { | |
| "epoch": 2.870967741935484, | |
| "grad_norm": 0.42652663588523865, | |
| "learning_rate": 5.591987290172518e-06, | |
| "loss": 0.0266, | |
| "step": 445 | |
| }, | |
| { | |
| "epoch": 2.8774193548387097, | |
| "grad_norm": 0.43139341473579407, | |
| "learning_rate": 5.533444210969546e-06, | |
| "loss": 0.0228, | |
| "step": 446 | |
| }, | |
| { | |
| "epoch": 2.8838709677419354, | |
| "grad_norm": 0.3348155915737152, | |
| "learning_rate": 5.475121956807537e-06, | |
| "loss": 0.0236, | |
| "step": 447 | |
| }, | |
| { | |
| "epoch": 2.8903225806451616, | |
| "grad_norm": 0.41803082823753357, | |
| "learning_rate": 5.417022376378239e-06, | |
| "loss": 0.0275, | |
| "step": 448 | |
| }, | |
| { | |
| "epoch": 2.896774193548387, | |
| "grad_norm": 0.4131038784980774, | |
| "learning_rate": 5.359147311315094e-06, | |
| "loss": 0.0265, | |
| "step": 449 | |
| }, | |
| { | |
| "epoch": 2.903225806451613, | |
| "grad_norm": 0.5227479934692383, | |
| "learning_rate": 5.30149859613492e-06, | |
| "loss": 0.025, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 2.9096774193548387, | |
| "grad_norm": 0.43142953515052795, | |
| "learning_rate": 5.244078058179691e-06, | |
| "loss": 0.0249, | |
| "step": 451 | |
| }, | |
| { | |
| "epoch": 2.9161290322580644, | |
| "grad_norm": 0.4158158600330353, | |
| "learning_rate": 5.186887517558653e-06, | |
| "loss": 0.0297, | |
| "step": 452 | |
| }, | |
| { | |
| "epoch": 2.9225806451612906, | |
| "grad_norm": 0.3552153706550598, | |
| "learning_rate": 5.129928787090646e-06, | |
| "loss": 0.0234, | |
| "step": 453 | |
| }, | |
| { | |
| "epoch": 2.9290322580645163, | |
| "grad_norm": 0.49204781651496887, | |
| "learning_rate": 5.073203672246593e-06, | |
| "loss": 0.0379, | |
| "step": 454 | |
| }, | |
| { | |
| "epoch": 2.935483870967742, | |
| "grad_norm": 0.38140571117401123, | |
| "learning_rate": 5.016713971092311e-06, | |
| "loss": 0.0294, | |
| "step": 455 | |
| }, | |
| { | |
| "epoch": 2.9419354838709677, | |
| "grad_norm": 0.5261517763137817, | |
| "learning_rate": 4.960461474231505e-06, | |
| "loss": 0.0305, | |
| "step": 456 | |
| }, | |
| { | |
| "epoch": 2.9483870967741934, | |
| "grad_norm": 0.6391315460205078, | |
| "learning_rate": 4.904447964748993e-06, | |
| "loss": 0.038, | |
| "step": 457 | |
| }, | |
| { | |
| "epoch": 2.9548387096774196, | |
| "grad_norm": 0.3812016546726227, | |
| "learning_rate": 4.848675218154214e-06, | |
| "loss": 0.0259, | |
| "step": 458 | |
| }, | |
| { | |
| "epoch": 2.9612903225806453, | |
| "grad_norm": 0.4748527407646179, | |
| "learning_rate": 4.793145002324933e-06, | |
| "loss": 0.0329, | |
| "step": 459 | |
| }, | |
| { | |
| "epoch": 2.967741935483871, | |
| "grad_norm": 0.4919755458831787, | |
| "learning_rate": 4.737859077451191e-06, | |
| "loss": 0.0253, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 2.9741935483870967, | |
| "grad_norm": 0.4986102879047394, | |
| "learning_rate": 4.68281919597954e-06, | |
| "loss": 0.0293, | |
| "step": 461 | |
| }, | |
| { | |
| "epoch": 2.9806451612903224, | |
| "grad_norm": 0.48589223623275757, | |
| "learning_rate": 4.6280271025574695e-06, | |
| "loss": 0.0287, | |
| "step": 462 | |
| }, | |
| { | |
| "epoch": 2.9870967741935486, | |
| "grad_norm": 0.4930824935436249, | |
| "learning_rate": 4.573484533978119e-06, | |
| "loss": 0.0258, | |
| "step": 463 | |
| }, | |
| { | |
| "epoch": 2.9935483870967743, | |
| "grad_norm": 0.38358667492866516, | |
| "learning_rate": 4.5191932191252075e-06, | |
| "loss": 0.0235, | |
| "step": 464 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "grad_norm": 0.5300599336624146, | |
| "learning_rate": 4.465154878918258e-06, | |
| "loss": 0.0309, | |
| "step": 465 | |
| }, | |
| { | |
| "epoch": 3.0064516129032257, | |
| "grad_norm": 0.24884271621704102, | |
| "learning_rate": 4.411371226258032e-06, | |
| "loss": 0.0182, | |
| "step": 466 | |
| }, | |
| { | |
| "epoch": 3.0129032258064514, | |
| "grad_norm": 0.3061859607696533, | |
| "learning_rate": 4.3578439659722246e-06, | |
| "loss": 0.0161, | |
| "step": 467 | |
| }, | |
| { | |
| "epoch": 3.0193548387096776, | |
| "grad_norm": 0.3014248311519623, | |
| "learning_rate": 4.304574794761447e-06, | |
| "loss": 0.0135, | |
| "step": 468 | |
| }, | |
| { | |
| "epoch": 3.0258064516129033, | |
| "grad_norm": 0.31640511751174927, | |
| "learning_rate": 4.251565401145432e-06, | |
| "loss": 0.0168, | |
| "step": 469 | |
| }, | |
| { | |
| "epoch": 3.032258064516129, | |
| "grad_norm": 0.25447705388069153, | |
| "learning_rate": 4.1988174654095104e-06, | |
| "loss": 0.0117, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 3.0387096774193547, | |
| "grad_norm": 0.31153520941734314, | |
| "learning_rate": 4.146332659551364e-06, | |
| "loss": 0.0147, | |
| "step": 471 | |
| }, | |
| { | |
| "epoch": 3.0451612903225804, | |
| "grad_norm": 0.32956749200820923, | |
| "learning_rate": 4.094112647227996e-06, | |
| "loss": 0.015, | |
| "step": 472 | |
| }, | |
| { | |
| "epoch": 3.0516129032258066, | |
| "grad_norm": 0.3105918765068054, | |
| "learning_rate": 4.042159083703031e-06, | |
| "loss": 0.0136, | |
| "step": 473 | |
| }, | |
| { | |
| "epoch": 3.0580645161290323, | |
| "grad_norm": 0.3890332281589508, | |
| "learning_rate": 3.9904736157942355e-06, | |
| "loss": 0.0128, | |
| "step": 474 | |
| }, | |
| { | |
| "epoch": 3.064516129032258, | |
| "grad_norm": 0.2500901222229004, | |
| "learning_rate": 3.939057881821295e-06, | |
| "loss": 0.0109, | |
| "step": 475 | |
| }, | |
| { | |
| "epoch": 3.0709677419354837, | |
| "grad_norm": 0.33506497740745544, | |
| "learning_rate": 3.887913511553917e-06, | |
| "loss": 0.0138, | |
| "step": 476 | |
| }, | |
| { | |
| "epoch": 3.07741935483871, | |
| "grad_norm": 0.35200193524360657, | |
| "learning_rate": 3.837042126160157e-06, | |
| "loss": 0.0163, | |
| "step": 477 | |
| }, | |
| { | |
| "epoch": 3.0838709677419356, | |
| "grad_norm": 0.33882763981819153, | |
| "learning_rate": 3.786445338155013e-06, | |
| "loss": 0.0125, | |
| "step": 478 | |
| }, | |
| { | |
| "epoch": 3.0903225806451613, | |
| "grad_norm": 0.4239828586578369, | |
| "learning_rate": 3.736124751349343e-06, | |
| "loss": 0.0146, | |
| "step": 479 | |
| }, | |
| { | |
| "epoch": 3.096774193548387, | |
| "grad_norm": 0.39569729566574097, | |
| "learning_rate": 3.6860819607990108e-06, | |
| "loss": 0.0149, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 3.1032258064516127, | |
| "grad_norm": 0.4424724578857422, | |
| "learning_rate": 3.6363185527543156e-06, | |
| "loss": 0.0147, | |
| "step": 481 | |
| }, | |
| { | |
| "epoch": 3.109677419354839, | |
| "grad_norm": 0.54300457239151, | |
| "learning_rate": 3.5868361046097475e-06, | |
| "loss": 0.0166, | |
| "step": 482 | |
| }, | |
| { | |
| "epoch": 3.1161290322580646, | |
| "grad_norm": 0.41813674569129944, | |
| "learning_rate": 3.537636184853939e-06, | |
| "loss": 0.0131, | |
| "step": 483 | |
| }, | |
| { | |
| "epoch": 3.1225806451612903, | |
| "grad_norm": 0.4122736155986786, | |
| "learning_rate": 3.4887203530199864e-06, | |
| "loss": 0.0141, | |
| "step": 484 | |
| }, | |
| { | |
| "epoch": 3.129032258064516, | |
| "grad_norm": 0.39058443903923035, | |
| "learning_rate": 3.440090159636003e-06, | |
| "loss": 0.0115, | |
| "step": 485 | |
| }, | |
| { | |
| "epoch": 3.135483870967742, | |
| "grad_norm": 0.42365285754203796, | |
| "learning_rate": 3.391747146175954e-06, | |
| "loss": 0.0097, | |
| "step": 486 | |
| }, | |
| { | |
| "epoch": 3.141935483870968, | |
| "grad_norm": 0.5885961055755615, | |
| "learning_rate": 3.3436928450108264e-06, | |
| "loss": 0.0219, | |
| "step": 487 | |
| }, | |
| { | |
| "epoch": 3.1483870967741936, | |
| "grad_norm": 0.39843595027923584, | |
| "learning_rate": 3.2959287793600356e-06, | |
| "loss": 0.0098, | |
| "step": 488 | |
| }, | |
| { | |
| "epoch": 3.1548387096774193, | |
| "grad_norm": 0.27729499340057373, | |
| "learning_rate": 3.2484564632431396e-06, | |
| "loss": 0.0068, | |
| "step": 489 | |
| }, | |
| { | |
| "epoch": 3.161290322580645, | |
| "grad_norm": 0.4415301978588104, | |
| "learning_rate": 3.2012774014318625e-06, | |
| "loss": 0.0151, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 3.167741935483871, | |
| "grad_norm": 0.40353745222091675, | |
| "learning_rate": 3.154393089402391e-06, | |
| "loss": 0.016, | |
| "step": 491 | |
| }, | |
| { | |
| "epoch": 3.174193548387097, | |
| "grad_norm": 0.4263345003128052, | |
| "learning_rate": 3.107805013287958e-06, | |
| "loss": 0.0112, | |
| "step": 492 | |
| }, | |
| { | |
| "epoch": 3.1806451612903226, | |
| "grad_norm": 0.37084028124809265, | |
| "learning_rate": 3.061514649831755e-06, | |
| "loss": 0.014, | |
| "step": 493 | |
| }, | |
| { | |
| "epoch": 3.1870967741935483, | |
| "grad_norm": 0.5308308005332947, | |
| "learning_rate": 3.0155234663401146e-06, | |
| "loss": 0.0146, | |
| "step": 494 | |
| }, | |
| { | |
| "epoch": 3.193548387096774, | |
| "grad_norm": 0.47034651041030884, | |
| "learning_rate": 2.9698329206359925e-06, | |
| "loss": 0.0124, | |
| "step": 495 | |
| }, | |
| { | |
| "epoch": 3.2, | |
| "grad_norm": 0.3541916012763977, | |
| "learning_rate": 2.9244444610127764e-06, | |
| "loss": 0.0121, | |
| "step": 496 | |
| }, | |
| { | |
| "epoch": 3.206451612903226, | |
| "grad_norm": 0.4616714417934418, | |
| "learning_rate": 2.8793595261883465e-06, | |
| "loss": 0.0181, | |
| "step": 497 | |
| }, | |
| { | |
| "epoch": 3.2129032258064516, | |
| "grad_norm": 0.36847707629203796, | |
| "learning_rate": 2.8345795452595095e-06, | |
| "loss": 0.0165, | |
| "step": 498 | |
| }, | |
| { | |
| "epoch": 3.2193548387096773, | |
| "grad_norm": 0.3731675148010254, | |
| "learning_rate": 2.790105937656673e-06, | |
| "loss": 0.013, | |
| "step": 499 | |
| }, | |
| { | |
| "epoch": 3.225806451612903, | |
| "grad_norm": 0.42840346693992615, | |
| "learning_rate": 2.7459401130988534e-06, | |
| "loss": 0.0109, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 3.232258064516129, | |
| "grad_norm": 0.33602991700172424, | |
| "learning_rate": 2.7020834715490093e-06, | |
| "loss": 0.0106, | |
| "step": 501 | |
| }, | |
| { | |
| "epoch": 3.238709677419355, | |
| "grad_norm": 0.47426825761795044, | |
| "learning_rate": 2.6585374031696474e-06, | |
| "loss": 0.0133, | |
| "step": 502 | |
| }, | |
| { | |
| "epoch": 3.2451612903225806, | |
| "grad_norm": 0.41371604800224304, | |
| "learning_rate": 2.61530328827877e-06, | |
| "loss": 0.0094, | |
| "step": 503 | |
| }, | |
| { | |
| "epoch": 3.2516129032258063, | |
| "grad_norm": 0.4533410668373108, | |
| "learning_rate": 2.5723824973061e-06, | |
| "loss": 0.0123, | |
| "step": 504 | |
| }, | |
| { | |
| "epoch": 3.258064516129032, | |
| "grad_norm": 0.2636722922325134, | |
| "learning_rate": 2.5297763907496746e-06, | |
| "loss": 0.0086, | |
| "step": 505 | |
| }, | |
| { | |
| "epoch": 3.264516129032258, | |
| "grad_norm": 0.48444676399230957, | |
| "learning_rate": 2.4874863191326953e-06, | |
| "loss": 0.0169, | |
| "step": 506 | |
| }, | |
| { | |
| "epoch": 3.270967741935484, | |
| "grad_norm": 0.5979859828948975, | |
| "learning_rate": 2.44551362296072e-06, | |
| "loss": 0.0112, | |
| "step": 507 | |
| }, | |
| { | |
| "epoch": 3.2774193548387096, | |
| "grad_norm": 0.43151959776878357, | |
| "learning_rate": 2.4038596326791884e-06, | |
| "loss": 0.0109, | |
| "step": 508 | |
| }, | |
| { | |
| "epoch": 3.2838709677419353, | |
| "grad_norm": 0.4825892746448517, | |
| "learning_rate": 2.362525668631238e-06, | |
| "loss": 0.013, | |
| "step": 509 | |
| }, | |
| { | |
| "epoch": 3.2903225806451615, | |
| "grad_norm": 0.3168151080608368, | |
| "learning_rate": 2.3215130410158424e-06, | |
| "loss": 0.0106, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 3.296774193548387, | |
| "grad_norm": 0.4605632722377777, | |
| "learning_rate": 2.2808230498463116e-06, | |
| "loss": 0.0189, | |
| "step": 511 | |
| }, | |
| { | |
| "epoch": 3.303225806451613, | |
| "grad_norm": 0.47640544176101685, | |
| "learning_rate": 2.240456984909049e-06, | |
| "loss": 0.015, | |
| "step": 512 | |
| }, | |
| { | |
| "epoch": 3.3096774193548386, | |
| "grad_norm": 0.5328596234321594, | |
| "learning_rate": 2.2004161257226805e-06, | |
| "loss": 0.0201, | |
| "step": 513 | |
| }, | |
| { | |
| "epoch": 3.3161290322580643, | |
| "grad_norm": 0.5342445969581604, | |
| "learning_rate": 2.16070174149752e-06, | |
| "loss": 0.0109, | |
| "step": 514 | |
| }, | |
| { | |
| "epoch": 3.3225806451612905, | |
| "grad_norm": 0.5308839082717896, | |
| "learning_rate": 2.121315091095297e-06, | |
| "loss": 0.014, | |
| "step": 515 | |
| }, | |
| { | |
| "epoch": 3.329032258064516, | |
| "grad_norm": 0.4669474959373474, | |
| "learning_rate": 2.082257422989281e-06, | |
| "loss": 0.0105, | |
| "step": 516 | |
| }, | |
| { | |
| "epoch": 3.335483870967742, | |
| "grad_norm": 0.37382492423057556, | |
| "learning_rate": 2.0435299752247077e-06, | |
| "loss": 0.0132, | |
| "step": 517 | |
| }, | |
| { | |
| "epoch": 3.3419354838709676, | |
| "grad_norm": 0.4566926062107086, | |
| "learning_rate": 2.0051339753795125e-06, | |
| "loss": 0.0159, | |
| "step": 518 | |
| }, | |
| { | |
| "epoch": 3.3483870967741938, | |
| "grad_norm": 0.4399929642677307, | |
| "learning_rate": 1.9670706405254548e-06, | |
| "loss": 0.0149, | |
| "step": 519 | |
| }, | |
| { | |
| "epoch": 3.3548387096774195, | |
| "grad_norm": 0.3071390986442566, | |
| "learning_rate": 1.929341177189506e-06, | |
| "loss": 0.0085, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 3.361290322580645, | |
| "grad_norm": 0.39541929960250854, | |
| "learning_rate": 1.8919467813156121e-06, | |
| "loss": 0.0088, | |
| "step": 521 | |
| }, | |
| { | |
| "epoch": 3.367741935483871, | |
| "grad_norm": 0.49959710240364075, | |
| "learning_rate": 1.854888638226815e-06, | |
| "loss": 0.0147, | |
| "step": 522 | |
| }, | |
| { | |
| "epoch": 3.3741935483870966, | |
| "grad_norm": 0.3740963339805603, | |
| "learning_rate": 1.8181679225876324e-06, | |
| "loss": 0.0099, | |
| "step": 523 | |
| }, | |
| { | |
| "epoch": 3.3806451612903228, | |
| "grad_norm": 0.27066710591316223, | |
| "learning_rate": 1.7817857983668612e-06, | |
| "loss": 0.0071, | |
| "step": 524 | |
| }, | |
| { | |
| "epoch": 3.3870967741935485, | |
| "grad_norm": 0.47990116477012634, | |
| "learning_rate": 1.745743418800669e-06, | |
| "loss": 0.012, | |
| "step": 525 | |
| }, | |
| { | |
| "epoch": 3.393548387096774, | |
| "grad_norm": 0.40311211347579956, | |
| "learning_rate": 1.7100419263560263e-06, | |
| "loss": 0.0102, | |
| "step": 526 | |
| }, | |
| { | |
| "epoch": 3.4, | |
| "grad_norm": 0.4134123623371124, | |
| "learning_rate": 1.6746824526945163e-06, | |
| "loss": 0.0123, | |
| "step": 527 | |
| }, | |
| { | |
| "epoch": 3.4064516129032256, | |
| "grad_norm": 0.44432902336120605, | |
| "learning_rate": 1.6396661186364543e-06, | |
| "loss": 0.0106, | |
| "step": 528 | |
| }, | |
| { | |
| "epoch": 3.412903225806452, | |
| "grad_norm": 0.4438421428203583, | |
| "learning_rate": 1.6049940341253442e-06, | |
| "loss": 0.0172, | |
| "step": 529 | |
| }, | |
| { | |
| "epoch": 3.4193548387096775, | |
| "grad_norm": 0.5001305341720581, | |
| "learning_rate": 1.570667298192724e-06, | |
| "loss": 0.0115, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 3.425806451612903, | |
| "grad_norm": 0.36009618639945984, | |
| "learning_rate": 1.5366869989233062e-06, | |
| "loss": 0.0111, | |
| "step": 531 | |
| }, | |
| { | |
| "epoch": 3.432258064516129, | |
| "grad_norm": 0.35556045174598694, | |
| "learning_rate": 1.5030542134205003e-06, | |
| "loss": 0.0129, | |
| "step": 532 | |
| }, | |
| { | |
| "epoch": 3.4387096774193546, | |
| "grad_norm": 0.38800477981567383, | |
| "learning_rate": 1.4697700077722616e-06, | |
| "loss": 0.0111, | |
| "step": 533 | |
| }, | |
| { | |
| "epoch": 3.445161290322581, | |
| "grad_norm": 0.35132497549057007, | |
| "learning_rate": 1.4368354370173073e-06, | |
| "loss": 0.0133, | |
| "step": 534 | |
| }, | |
| { | |
| "epoch": 3.4516129032258065, | |
| "grad_norm": 0.49624019861221313, | |
| "learning_rate": 1.404251545111672e-06, | |
| "loss": 0.0152, | |
| "step": 535 | |
| }, | |
| { | |
| "epoch": 3.458064516129032, | |
| "grad_norm": 0.34581199288368225, | |
| "learning_rate": 1.3720193648956062e-06, | |
| "loss": 0.0093, | |
| "step": 536 | |
| }, | |
| { | |
| "epoch": 3.464516129032258, | |
| "grad_norm": 0.4548514783382416, | |
| "learning_rate": 1.3401399180608551e-06, | |
| "loss": 0.0174, | |
| "step": 537 | |
| }, | |
| { | |
| "epoch": 3.4709677419354836, | |
| "grad_norm": 0.33973830938339233, | |
| "learning_rate": 1.3086142151182605e-06, | |
| "loss": 0.0143, | |
| "step": 538 | |
| }, | |
| { | |
| "epoch": 3.47741935483871, | |
| "grad_norm": 0.3562283515930176, | |
| "learning_rate": 1.2774432553657303e-06, | |
| "loss": 0.0129, | |
| "step": 539 | |
| }, | |
| { | |
| "epoch": 3.4838709677419355, | |
| "grad_norm": 0.42894405126571655, | |
| "learning_rate": 1.2466280268565708e-06, | |
| "loss": 0.0136, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 3.490322580645161, | |
| "grad_norm": 0.36266642808914185, | |
| "learning_rate": 1.2161695063681589e-06, | |
| "loss": 0.0152, | |
| "step": 541 | |
| }, | |
| { | |
| "epoch": 3.496774193548387, | |
| "grad_norm": 0.41463732719421387, | |
| "learning_rate": 1.186068659370984e-06, | |
| "loss": 0.0126, | |
| "step": 542 | |
| }, | |
| { | |
| "epoch": 3.5032258064516126, | |
| "grad_norm": 0.3517482876777649, | |
| "learning_rate": 1.1563264399980512e-06, | |
| "loss": 0.0106, | |
| "step": 543 | |
| }, | |
| { | |
| "epoch": 3.509677419354839, | |
| "grad_norm": 0.3592299520969391, | |
| "learning_rate": 1.1269437910146173e-06, | |
| "loss": 0.01, | |
| "step": 544 | |
| }, | |
| { | |
| "epoch": 3.5161290322580645, | |
| "grad_norm": 0.3486897051334381, | |
| "learning_rate": 1.0979216437883327e-06, | |
| "loss": 0.0132, | |
| "step": 545 | |
| }, | |
| { | |
| "epoch": 3.52258064516129, | |
| "grad_norm": 0.3669939935207367, | |
| "learning_rate": 1.069260918259704e-06, | |
| "loss": 0.0108, | |
| "step": 546 | |
| }, | |
| { | |
| "epoch": 3.5290322580645164, | |
| "grad_norm": 0.345688134431839, | |
| "learning_rate": 1.0409625229129292e-06, | |
| "loss": 0.0112, | |
| "step": 547 | |
| }, | |
| { | |
| "epoch": 3.535483870967742, | |
| "grad_norm": 0.40567103028297424, | |
| "learning_rate": 1.0130273547471176e-06, | |
| "loss": 0.017, | |
| "step": 548 | |
| }, | |
| { | |
| "epoch": 3.541935483870968, | |
| "grad_norm": 0.45447733998298645, | |
| "learning_rate": 9.854562992478445e-07, | |
| "loss": 0.0293, | |
| "step": 549 | |
| }, | |
| { | |
| "epoch": 3.5483870967741935, | |
| "grad_norm": 0.40127208828926086, | |
| "learning_rate": 9.582502303590798e-07, | |
| "loss": 0.0151, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 3.554838709677419, | |
| "grad_norm": 0.32802486419677734, | |
| "learning_rate": 9.314100104555066e-07, | |
| "loss": 0.0101, | |
| "step": 551 | |
| }, | |
| { | |
| "epoch": 3.5612903225806454, | |
| "grad_norm": 0.35557428002357483, | |
| "learning_rate": 9.049364903151558e-07, | |
| "loss": 0.0108, | |
| "step": 552 | |
| }, | |
| { | |
| "epoch": 3.567741935483871, | |
| "grad_norm": 0.5105459094047546, | |
| "learning_rate": 8.788305090924556e-07, | |
| "loss": 0.0167, | |
| "step": 553 | |
| }, | |
| { | |
| "epoch": 3.574193548387097, | |
| "grad_norm": 0.3577045202255249, | |
| "learning_rate": 8.530928942916447e-07, | |
| "loss": 0.0076, | |
| "step": 554 | |
| }, | |
| { | |
| "epoch": 3.5806451612903225, | |
| "grad_norm": 0.2892685532569885, | |
| "learning_rate": 8.277244617405102e-07, | |
| "loss": 0.0077, | |
| "step": 555 | |
| }, | |
| { | |
| "epoch": 3.587096774193548, | |
| "grad_norm": 0.47886940836906433, | |
| "learning_rate": 8.027260155645546e-07, | |
| "loss": 0.0109, | |
| "step": 556 | |
| }, | |
| { | |
| "epoch": 3.5935483870967744, | |
| "grad_norm": 0.3236874043941498, | |
| "learning_rate": 7.780983481614962e-07, | |
| "loss": 0.0074, | |
| "step": 557 | |
| }, | |
| { | |
| "epoch": 3.6, | |
| "grad_norm": 0.46806618571281433, | |
| "learning_rate": 7.538422401761461e-07, | |
| "loss": 0.0174, | |
| "step": 558 | |
| }, | |
| { | |
| "epoch": 3.606451612903226, | |
| "grad_norm": 0.4186045229434967, | |
| "learning_rate": 7.299584604756784e-07, | |
| "loss": 0.0111, | |
| "step": 559 | |
| }, | |
| { | |
| "epoch": 3.6129032258064515, | |
| "grad_norm": 0.4132605791091919, | |
| "learning_rate": 7.064477661252483e-07, | |
| "loss": 0.0132, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 3.6193548387096772, | |
| "grad_norm": 0.5827385783195496, | |
| "learning_rate": 6.833109023639928e-07, | |
| "loss": 0.017, | |
| "step": 561 | |
| }, | |
| { | |
| "epoch": 3.6258064516129034, | |
| "grad_norm": 0.3105774521827698, | |
| "learning_rate": 6.605486025814164e-07, | |
| "loss": 0.0091, | |
| "step": 562 | |
| }, | |
| { | |
| "epoch": 3.632258064516129, | |
| "grad_norm": 0.34796178340911865, | |
| "learning_rate": 6.381615882941366e-07, | |
| "loss": 0.0083, | |
| "step": 563 | |
| }, | |
| { | |
| "epoch": 3.638709677419355, | |
| "grad_norm": 0.3462621867656708, | |
| "learning_rate": 6.16150569123021e-07, | |
| "loss": 0.0143, | |
| "step": 564 | |
| }, | |
| { | |
| "epoch": 3.6451612903225805, | |
| "grad_norm": 0.4699903428554535, | |
| "learning_rate": 5.945162427706888e-07, | |
| "loss": 0.0145, | |
| "step": 565 | |
| }, | |
| { | |
| "epoch": 3.6516129032258062, | |
| "grad_norm": 0.42084646224975586, | |
| "learning_rate": 5.732592949993898e-07, | |
| "loss": 0.015, | |
| "step": 566 | |
| }, | |
| { | |
| "epoch": 3.6580645161290324, | |
| "grad_norm": 0.4539680778980255, | |
| "learning_rate": 5.5238039960928e-07, | |
| "loss": 0.0154, | |
| "step": 567 | |
| }, | |
| { | |
| "epoch": 3.664516129032258, | |
| "grad_norm": 0.3853324353694916, | |
| "learning_rate": 5.318802184170565e-07, | |
| "loss": 0.0126, | |
| "step": 568 | |
| }, | |
| { | |
| "epoch": 3.670967741935484, | |
| "grad_norm": 0.409679651260376, | |
| "learning_rate": 5.117594012349735e-07, | |
| "loss": 0.0143, | |
| "step": 569 | |
| }, | |
| { | |
| "epoch": 3.6774193548387095, | |
| "grad_norm": 0.42005378007888794, | |
| "learning_rate": 4.920185858502596e-07, | |
| "loss": 0.0129, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 3.6838709677419352, | |
| "grad_norm": 0.34078460931777954, | |
| "learning_rate": 4.7265839800488543e-07, | |
| "loss": 0.0132, | |
| "step": 571 | |
| }, | |
| { | |
| "epoch": 3.6903225806451614, | |
| "grad_norm": 0.5289260149002075, | |
| "learning_rate": 4.5367945137573946e-07, | |
| "loss": 0.0114, | |
| "step": 572 | |
| }, | |
| { | |
| "epoch": 3.696774193548387, | |
| "grad_norm": 0.43742361664772034, | |
| "learning_rate": 4.350823475551713e-07, | |
| "loss": 0.0099, | |
| "step": 573 | |
| }, | |
| { | |
| "epoch": 3.703225806451613, | |
| "grad_norm": 0.3581911623477936, | |
| "learning_rate": 4.1686767603192344e-07, | |
| "loss": 0.0153, | |
| "step": 574 | |
| }, | |
| { | |
| "epoch": 3.709677419354839, | |
| "grad_norm": 0.3517850637435913, | |
| "learning_rate": 3.990360141724478e-07, | |
| "loss": 0.0123, | |
| "step": 575 | |
| }, | |
| { | |
| "epoch": 3.7161290322580647, | |
| "grad_norm": 0.44253072142601013, | |
| "learning_rate": 3.815879272025966e-07, | |
| "loss": 0.0068, | |
| "step": 576 | |
| }, | |
| { | |
| "epoch": 3.7225806451612904, | |
| "grad_norm": 0.3429562449455261, | |
| "learning_rate": 3.6452396818971863e-07, | |
| "loss": 0.0121, | |
| "step": 577 | |
| }, | |
| { | |
| "epoch": 3.729032258064516, | |
| "grad_norm": 0.7084751129150391, | |
| "learning_rate": 3.4784467802511797e-07, | |
| "loss": 0.0145, | |
| "step": 578 | |
| }, | |
| { | |
| "epoch": 3.735483870967742, | |
| "grad_norm": 0.388698011636734, | |
| "learning_rate": 3.3155058540691037e-07, | |
| "loss": 0.0107, | |
| "step": 579 | |
| }, | |
| { | |
| "epoch": 3.741935483870968, | |
| "grad_norm": 0.41982078552246094, | |
| "learning_rate": 3.1564220682327314e-07, | |
| "loss": 0.014, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 3.7483870967741937, | |
| "grad_norm": 0.645720899105072, | |
| "learning_rate": 3.001200465360593e-07, | |
| "loss": 0.015, | |
| "step": 581 | |
| }, | |
| { | |
| "epoch": 3.7548387096774194, | |
| "grad_norm": 0.5690763592720032, | |
| "learning_rate": 2.8498459656482317e-07, | |
| "loss": 0.0189, | |
| "step": 582 | |
| }, | |
| { | |
| "epoch": 3.761290322580645, | |
| "grad_norm": 0.4491289556026459, | |
| "learning_rate": 2.702363366712257e-07, | |
| "loss": 0.0127, | |
| "step": 583 | |
| }, | |
| { | |
| "epoch": 3.767741935483871, | |
| "grad_norm": 0.4925324618816376, | |
| "learning_rate": 2.5587573434381895e-07, | |
| "loss": 0.0138, | |
| "step": 584 | |
| }, | |
| { | |
| "epoch": 3.774193548387097, | |
| "grad_norm": 0.6388445496559143, | |
| "learning_rate": 2.41903244783237e-07, | |
| "loss": 0.0145, | |
| "step": 585 | |
| }, | |
| { | |
| "epoch": 3.7806451612903227, | |
| "grad_norm": 0.31376415491104126, | |
| "learning_rate": 2.2831931088775904e-07, | |
| "loss": 0.0135, | |
| "step": 586 | |
| }, | |
| { | |
| "epoch": 3.7870967741935484, | |
| "grad_norm": 0.3549552261829376, | |
| "learning_rate": 2.1512436323927604e-07, | |
| "loss": 0.014, | |
| "step": 587 | |
| }, | |
| { | |
| "epoch": 3.793548387096774, | |
| "grad_norm": 0.446304053068161, | |
| "learning_rate": 2.0231882008963783e-07, | |
| "loss": 0.0144, | |
| "step": 588 | |
| }, | |
| { | |
| "epoch": 3.8, | |
| "grad_norm": 0.45615440607070923, | |
| "learning_rate": 1.8990308734739976e-07, | |
| "loss": 0.0143, | |
| "step": 589 | |
| }, | |
| { | |
| "epoch": 3.806451612903226, | |
| "grad_norm": 0.3915248513221741, | |
| "learning_rate": 1.7787755856495254e-07, | |
| "loss": 0.0131, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 3.8129032258064517, | |
| "grad_norm": 0.2556948959827423, | |
| "learning_rate": 1.6624261492605153e-07, | |
| "loss": 0.0061, | |
| "step": 591 | |
| }, | |
| { | |
| "epoch": 3.8193548387096774, | |
| "grad_norm": 0.5648970603942871, | |
| "learning_rate": 1.5499862523372933e-07, | |
| "loss": 0.011, | |
| "step": 592 | |
| }, | |
| { | |
| "epoch": 3.825806451612903, | |
| "grad_norm": 0.30211833119392395, | |
| "learning_rate": 1.4414594589860774e-07, | |
| "loss": 0.0087, | |
| "step": 593 | |
| }, | |
| { | |
| "epoch": 3.832258064516129, | |
| "grad_norm": 0.21295692026615143, | |
| "learning_rate": 1.3368492092760142e-07, | |
| "loss": 0.0059, | |
| "step": 594 | |
| }, | |
| { | |
| "epoch": 3.838709677419355, | |
| "grad_norm": 0.49749764800071716, | |
| "learning_rate": 1.2361588191300983e-07, | |
| "loss": 0.0112, | |
| "step": 595 | |
| }, | |
| { | |
| "epoch": 3.8451612903225807, | |
| "grad_norm": 0.3051888048648834, | |
| "learning_rate": 1.139391480220145e-07, | |
| "loss": 0.0077, | |
| "step": 596 | |
| }, | |
| { | |
| "epoch": 3.8516129032258064, | |
| "grad_norm": 0.3583107590675354, | |
| "learning_rate": 1.0465502598655114e-07, | |
| "loss": 0.0115, | |
| "step": 597 | |
| }, | |
| { | |
| "epoch": 3.858064516129032, | |
| "grad_norm": 0.4116378426551819, | |
| "learning_rate": 9.576381009359508e-08, | |
| "loss": 0.0127, | |
| "step": 598 | |
| }, | |
| { | |
| "epoch": 3.864516129032258, | |
| "grad_norm": 0.3633911907672882, | |
| "learning_rate": 8.726578217582993e-08, | |
| "loss": 0.0109, | |
| "step": 599 | |
| }, | |
| { | |
| "epoch": 3.870967741935484, | |
| "grad_norm": 0.35112428665161133, | |
| "learning_rate": 7.916121160271572e-08, | |
| "loss": 0.0112, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 3.8774193548387097, | |
| "grad_norm": 0.4247336685657501, | |
| "learning_rate": 7.145035527194588e-08, | |
| "loss": 0.0103, | |
| "step": 601 | |
| }, | |
| { | |
| "epoch": 3.8838709677419354, | |
| "grad_norm": 0.36888250708580017, | |
| "learning_rate": 6.413345760131057e-08, | |
| "loss": 0.0145, | |
| "step": 602 | |
| }, | |
| { | |
| "epoch": 3.8903225806451616, | |
| "grad_norm": 0.3833377957344055, | |
| "learning_rate": 5.721075052094599e-08, | |
| "loss": 0.0094, | |
| "step": 603 | |
| }, | |
| { | |
| "epoch": 3.896774193548387, | |
| "grad_norm": 0.45575153827667236, | |
| "learning_rate": 5.068245346598332e-08, | |
| "loss": 0.0111, | |
| "step": 604 | |
| }, | |
| { | |
| "epoch": 3.903225806451613, | |
| "grad_norm": 0.3035842776298523, | |
| "learning_rate": 4.454877336958763e-08, | |
| "loss": 0.0071, | |
| "step": 605 | |
| }, | |
| { | |
| "epoch": 3.9096774193548387, | |
| "grad_norm": 0.3646605908870697, | |
| "learning_rate": 3.8809904656410264e-08, | |
| "loss": 0.0111, | |
| "step": 606 | |
| }, | |
| { | |
| "epoch": 3.9161290322580644, | |
| "grad_norm": 0.3893778920173645, | |
| "learning_rate": 3.346602923641473e-08, | |
| "loss": 0.0084, | |
| "step": 607 | |
| }, | |
| { | |
| "epoch": 3.9225806451612906, | |
| "grad_norm": 0.29794201254844666, | |
| "learning_rate": 2.8517316499115932e-08, | |
| "loss": 0.0069, | |
| "step": 608 | |
| }, | |
| { | |
| "epoch": 3.9290322580645163, | |
| "grad_norm": 0.38667142391204834, | |
| "learning_rate": 2.3963923308212288e-08, | |
| "loss": 0.0144, | |
| "step": 609 | |
| }, | |
| { | |
| "epoch": 3.935483870967742, | |
| "grad_norm": 0.39387401938438416, | |
| "learning_rate": 1.9805993996606376e-08, | |
| "loss": 0.0087, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 3.9419354838709677, | |
| "grad_norm": 0.3552229106426239, | |
| "learning_rate": 1.604366036184052e-08, | |
| "loss": 0.0093, | |
| "step": 611 | |
| }, | |
| { | |
| "epoch": 3.9483870967741934, | |
| "grad_norm": 0.5617074966430664, | |
| "learning_rate": 1.2677041661907085e-08, | |
| "loss": 0.008, | |
| "step": 612 | |
| }, | |
| { | |
| "epoch": 3.9548387096774196, | |
| "grad_norm": 0.3588564693927765, | |
| "learning_rate": 9.706244611480674e-09, | |
| "loss": 0.016, | |
| "step": 613 | |
| }, | |
| { | |
| "epoch": 3.9612903225806453, | |
| "grad_norm": 0.36114564538002014, | |
| "learning_rate": 7.131363378524991e-09, | |
| "loss": 0.0113, | |
| "step": 614 | |
| }, | |
| { | |
| "epoch": 3.967741935483871, | |
| "grad_norm": 0.5072866678237915, | |
| "learning_rate": 4.952479581311897e-09, | |
| "loss": 0.0128, | |
| "step": 615 | |
| }, | |
| { | |
| "epoch": 3.9741935483870967, | |
| "grad_norm": 0.3469507396221161, | |
| "learning_rate": 3.1696622858373716e-09, | |
| "loss": 0.01, | |
| "step": 616 | |
| }, | |
| { | |
| "epoch": 3.9806451612903224, | |
| "grad_norm": 0.6267412900924683, | |
| "learning_rate": 1.7829680036274276e-09, | |
| "loss": 0.0201, | |
| "step": 617 | |
| }, | |
| { | |
| "epoch": 3.9870967741935486, | |
| "grad_norm": 0.4204852879047394, | |
| "learning_rate": 7.924406899492698e-10, | |
| "loss": 0.0108, | |
| "step": 618 | |
| }, | |
| { | |
| "epoch": 3.9935483870967743, | |
| "grad_norm": 0.2933880090713501, | |
| "learning_rate": 1.9811174241796127e-10, | |
| "loss": 0.0113, | |
| "step": 619 | |
| }, | |
| { | |
| "epoch": 4.0, | |
| "grad_norm": 0.3794356882572174, | |
| "learning_rate": 0.0, | |
| "loss": 0.0109, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 4.0, | |
| "step": 620, | |
| "total_flos": 2.582134138035241e+17, | |
| "train_loss": 0.053207253262911355, | |
| "train_runtime": 778.335, | |
| "train_samples_per_second": 25.449, | |
| "train_steps_per_second": 0.797 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 620, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 4, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 2.582134138035241e+17, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |
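
A minimal sketch of how this log can be consumed, assuming the JSON above is saved under its usual name `trainer_state.json` (the filename and path are assumptions, not part of the log itself). It loads the file, keeps only the entries in `log_history` that carry a per-step `loss` (the final summary entry reports `train_loss` instead), and prints a short summary of the loss trajectory.

```python
import json

# Hypothetical path: the standard name Transformers uses for this file,
# adjust to wherever the JSON above is stored.
with open("trainer_state.json") as f:
    state = json.load(f)

# Per-step entries have a "loss" key; the trailing summary entry does not.
step_entries = [e for e in state["log_history"] if "loss" in e]
steps  = [e["step"] for e in step_entries]
losses = [e["loss"] for e in step_entries]

print(f"logged steps: {len(steps)} (max_steps={state['max_steps']})")
print(f"loss at step {steps[0]}: {losses[0]:.4f}")
print(f"loss at step {steps[-1]}: {losses[-1]:.4f}")
```

The same list can be fed to any plotting library to visualize the loss curve, or filtered on `"learning_rate"` to inspect the schedule recorded above.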